Child pages
  • Reading a WARC file
Skip to end of metadata
Go to start of metadata

Out-of-date API

Note that this example uses an out-of-date API and will not compile against newer JWAT releases (for example, JWAT 0.9.1). See the JWAT-Tools source code for more up-to-date examples.

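A minimal example of reading a WARC file with JWAT: it iterates over every record in the file, prints each record's header fields and validation errors, and finishes with the total number of records and errors.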

Not final!

TestWarc.java
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Iterator;

import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
import org.jwat.warc.WarcValidationError;

public class TestWarc {

	static String warcFile = "/home/nicl/Downloads/IAH-20080430204825-00000-blackbook.warc";
	//static String warcFile = "/home/nicl/Downloads/MYWARC.warc";

	public static void main(String[] args) {
		File file = new File( warcFile );
		try {
			InputStream in = new FileInputStream( file );

			int records = 0;
			int errors = 0;

			WarcReader reader = WarcReaderFactory.getReader( in );
			WarcRecord record;

			while ( (record = reader.getNextRecord()) != null ) {
				printRecord(record);
				printRecordErrors(record);

				++records;

				if (record.hasErrors()) {
					errors += record.getValidationErrors().size();
				}
			}

			System.out.println("--------------");
			System.out.println("       Records: " + records);
			System.out.println("        Errors: " + errors);
			reader.close();
			in.close();
		}
		catch (FileNotFoundException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}

	public static void printRecord(WarcRecord record) {
		System.out.println("--------------");
		System.out.println("       Version: " + record.bMagicIdentified + " " + record.bVersionParsed + " " + record.major + "." + record.minor);
		System.out.println("       TypeIdx: " + record.warcTypeIdx);
		System.out.println("          Type: " + record.warcTypeStr);
		System.out.println("      Filename: " + record.warcFilename);
		System.out.println("     Record-ID: " + record.warcRecordIdUri);
		System.out.println("          Date: " + record.warcDate);
		System.out.println("Content-Length: " + record.contentLength);
		System.out.println("  Content-Type: " + record.contentType);
		System.out.println("     Truncated: " + record.warcTruncatedStr);
		System.out.println("   InetAddress: " + record.warcInetAddress);
		System.out.println("  ConcurrentTo: " + record.warcConcurrentToUriList);
		System.out.println("      RefersTo: " + record.warcRefersToUri);
		System.out.println("     TargetUri: " + record.warcTargetUriUri);
		System.out.println("   WarcInfo-Id: " + record.warcWarcInfoIdUri);
		System.out.println("   BlockDigest: " + record.warcBlockDigest);
		System.out.println(" PayloadDigest: " + record.warcPayloadDigest);
		System.out.println("IdentPloadType: " + record.warcIdentifiedPayloadType);
		System.out.println("       Profile: " + record.warcProfileStr);
		System.out.println("      Segment#: " + record.warcSegmentNumber);
		System.out.println(" SegmentOrg-Id: " + record.warcSegmentOriginIdUrl);
		System.out.println("SegmentTLength: " + record.warcSegmentTotalLength);
	}

	public static void printRecordErrors(WarcRecord record) {
		if (record.hasErrors()) {
			Collection<WarcValidationError> errorCol = record.getValidationErrors();
			if (errorCol != null && errorCol.size() > 0) {
				Iterator<WarcValidationError> iter = errorCol.iterator();
				while (iter.hasNext()) {
					WarcValidationError error = iter.next();
					System.out.println( error.error );
					System.out.println( error.field );
					System.out.println( error.value );
				}
			}
		}
	}

}

  • No labels