package dk.netarkivet.harvester.harvesting;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlMetadata;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;

/**
 * Custom NAS WARCWriterProcessor adding NetarchiveSuite metadata to the WARCInfo records written
 * by Heritrix, simply by extending org.archive.modules.writer.WARCWriterProcessor.
 * This was not possible in H1.
 *
 * @author svc
 */
public class NasWARCProcessor extends WARCWriterProcessor {

    /** Logger for problems met while assembling the warcinfo payload. */
    private static final Logger log = Logger.getLogger(NasWARCProcessor.class.getName());

    // Constants for the contents of the WarcInfo record
    private static final String HARVESTINFO_VERSION = "harvestInfo.version";
    private static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
    private static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";
    private static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
    private static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
    private static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
    private static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.orderXMLName";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
    private static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
    private static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
    private static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
    private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
    private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";

    public NasWARCProcessor() {
        super();
    }

    /**
     * Result of the first {@link #getMetadata()} call; reused by later calls.
     * Cleared whenever {@link #setMetadataItems(Map)} is invoked.
     */
    List<String> cachedMetadata;

    /**
     * Metadata items.
     * Add to the WARCProcessor bean as
     * <pre>{@code
     * <property name="metadataItems">
     *   <map>
     *     <entry key="harvestInfo.version" value="0.5"/>
     *     <entry key="harvestInfo.jobId" value="23"/>
     *     <entry key="harvestInfo.channel" value="FOCUSED"/>
     *     ...
     *   </map>
     * </property>
     * }</pre>
     */
    protected Map<String, String> metadataMap = new HashMap<String, String>();

    /**
     * @return the map of metadata items currently held by this processor
     *         (the same map instance that was handed to {@link #setMetadataItems(Map)}).
     */
    public Map<String, String> getFormItems() {
        return this.metadataMap;
    }

    /**
     * Replaces the metadata items written to the NetarchiveSuite part of the warcinfo record.
     *
     * @param metadataItems the new items, keyed by the {@code harvestInfo.*} labels;
     *                      {@code null} is treated as "no items"
     */
    public void setMetadataItems(Map<String, String> metadataItems) {
        this.metadataMap = (metadataItems != null) ? metadataItems : new HashMap<String, String>();
        // A previously built record would no longer reflect the configured items.
        this.cachedMetadata = null;
    }

    /**
     * Builds the warcinfo payload: the standard Heritrix fields followed by a
     * "#added by NetarchiveSuite" comment line and the {@code harvestInfo.*} fields
     * taken from the metadata items. The result is cached after the first call.
     *
     * @return a single-element list holding the complete payload text
     */
    @Override
    public List<String> getMetadata() {
        if (cachedMetadata != null) {
            return cachedMetadata;
        }

        ANVLRecord record = new ANVLRecord();
        record.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getCanonicalHostName());
        } catch (UnknownHostException e) {
            // Best effort: the record is still written, just without ip/hostname.
            log.log(Level.WARNING, "unable to obtain local crawl engine host", e);
        }

        // conforms to ISO 28500:2009 as of May 2009
        // as described at http://bibnum.bnf.fr/WARC/
        // latest draft as of November 2008
        record.addLabelValue("format", "WARC File Format 1.0");
        record.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

        // Get other values from metadata provider
        CrawlMetadata provider = getMetadataProvider();

        addIfNotBlank(record, "operator", provider.getOperator());
        addIfNotBlank(record, "publisher", provider.getOrganization());
        addIfNotBlank(record, "audience", provider.getAudience());
        addIfNotBlank(record, "isPartOf", provider.getJobName());
        // TODO: make date match 'job creation date' as in Heritrix 1.x
        // until then, leave out the "created" field derived from provider.getBeginDate()
        // (plenty of dates already in WARC records)
        addIfNotBlank(record, "description", provider.getDescription());

        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String robotsPolicy = provider.getRobotsPolicyName();
        addIfNotBlank(record, "robots", robotsPolicy == null ? null : robotsPolicy.toLowerCase(Locale.ROOT));

        addIfNotBlank(record, "http-header-user-agent", provider.getUserAgent());
        addIfNotBlank(record, "http-header-from", provider.getOperatorFrom());

        String netarchiveSuiteComment = "#added by NetarchiveSuite "
                + dk.netarkivet.common.Constants.getVersionString();
        ANVLRecord recordNAS = new ANVLRecord(); // Previously new ANVLRecord(7);

        // metadataMap is protected and non-final, so guard against a subclass nulling it.
        Map<String, String> items = (metadataMap != null)
                ? metadataMap
                : Collections.<String, String>emptyMap();

        // Add the data from the metadataMap to the WarcInfoRecord (field order is significant
        // for readers comparing records, so it is kept exactly as before).
        addHarvestInfo(recordNAS, items, HARVESTINFO_VERSION, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_JOBID, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_CHANNEL, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_HARVESTNUM, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_ORIGHARVESTDEFINITIONID, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_MAXBYTESPERDOMAIN, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_MAXOBJECTSPERDOMAIN, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_ORDERXMLNAME, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_ORIGHARVESTDEFINITIONNAME, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_SCHEDULENAME, false);
        addHarvestInfo(recordNAS, items, HARVESTINFO_HARVESTFILENAMEPREFIX, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_JOBSUBMITDATE, true);
        addHarvestInfo(recordNAS, items, HARVESTINFO_PERFORMER, false);
        addHarvestInfo(recordNAS, items, HARVESTINFO_AUDIENCE, false);

        // really ugly to return as List<String>, but changing would require
        // larger refactoring
        cachedMetadata = Collections.singletonList(record.toString()
                + netarchiveSuiteComment + "\n" + recordNAS.toString());
        return cachedMetadata;
    }

    /**
     * Copies one metadata item into the given record, skipping items that have no value so
     * that a single missing entry cannot abort generation of the whole record.
     *
     * @param target    the record to add the label/value pair to
     * @param items     the configured metadata items
     * @param key       the item key, also used as the label in the record
     * @param mandatory whether a missing value is worth a warning in the log
     */
    private static void addHarvestInfo(ANVLRecord target, Map<String, String> items, String key,
            boolean mandatory) {
        String value = items.get(key);
        if (value != null) {
            target.addLabelValue(key, value);
        } else if (mandatory) {
            log.warning("Mandatory metadata item '" + key
                    + "' is missing; it is left out of the warcinfo record");
        }
    }

}