package dk.netarkivet.harvester.harvesting;

import static org.archive.format.warc.WARCConstants.TYPE;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Custom NAS WARCWriterProcessor adding NetarchiveSuite metadata to the WARCInfo records written
 * by Heritrix by just extending the org.archive.modules.writer.WARCWriterProcessor;
 * This was not possible in H1.
 *
 * @author svc
 */
public class NasWARCProcessor extends WARCWriterProcessor {

    /** Logger instance. */
    private static final Logger logger = LoggerFactory.getLogger(NasWARCProcessor.class);

    // Constants for the contents of the WarcInfo record.
    // Each constant is both the key looked up in metadataMap and the label written
    // to the warcinfo record, so the names must stay byte-identical.
    private static final String HARVESTINFO_VERSION = "harvestInfo.version";
    private static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
    private static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";
    private static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
    private static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
    private static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
    private static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.templateName";
    private static final String HARVESTINFO_ORDERXMLUPDATEDATE = "harvestInfo.templateLastUpdateDate";
    private static final String HARVESTINFO_ORDERXMLDESCRIPTION = "harvestInfo.templateDescription";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS = "harvestInfo.origHarvestDefinitionComments";
    private static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
    private static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
    private static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
    private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
    private static final String HARVESTINFO_OPERATOR = "harvestInfo.operator";
    private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";

    /**
     * Bean getter for the "writeMetadataOutlinks" property.
     * @return true if outlinks should be written to each URI's WARC metadata record
     */
    public boolean getWriteMetadataOutlinks() {
        return (Boolean) kp.get("writeMetadataOutlinks");
    }

    /**
     * Bean setter for the "writeMetadataOutlinks" property.
     * @param writeMetadataOutlinks true to include outlinks in WARC metadata records
     */
    public void setWriteMetadataOutlinks(boolean writeMetadataOutlinks) {
        kp.put("writeMetadataOutlinks", writeMetadataOutlinks);
    }

    /** Warcinfo metadata, built lazily by {@link #getMetadata()} and then reused. */
    List<String> cachedMetadata;

    public NasWARCProcessor() {
        super();
    }

    /**
     * metadata items.
     * Add to bean WARCProcessor bean as as
     * &lt;property name="metadataItems"&gt;
     * &lt;map&gt;
     * &lt;entry key="harvestInfo.version" value="0.6"/&gt;
     * &lt;entry key="harvestInfo.jobId" value="23"/&gt;
     * &lt;entry key="harvestInfo.channel" value="FOCUSED"/&gt;
     * ...
     * &lt;/map&gt;
     */
    protected Map<String, String> metadataMap = new HashMap<String, String>();

    /**
     * @return the harvestInfo metadata items injected via {@link #setMetadataItems(Map)}
     */
    public Map<String, String> getFormItems() {
        return this.metadataMap;
    }

    /**
     * Bean setter for the "metadataItems" property.
     * @param metadataItems map of harvestInfo.* keys to their values
     */
    public void setMetadataItems(Map<String, String> metadataItems) {
        this.metadataMap = metadataItems;
    }

    /**
     * Builds (once) the warcinfo body: the standard Heritrix fields followed by a
     * NetarchiveSuite comment line and the harvestInfo.* fields from {@link #metadataMap}.
     * @return a single-element list holding the complete warcinfo record body
     */
    @Override
    public List<String> getMetadata() {
        if (cachedMetadata != null) {
            return cachedMetadata;
        }
        ANVLRecord record = new ANVLRecord();
        record.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getCanonicalHostName());
        } catch (UnknownHostException e) {
            // Best-effort only: the warcinfo record is still useful without ip/hostname.
            logger.warn("Unable to obtain local crawl engine host", e);
        }

        // conforms to ISO 28500:2009 as of May 2009
        // as described at http://bibnum.bnf.fr/WARC/
        // latest draft as of November 2008
        record.addLabelValue("format", "WARC File Format 1.0");
        record.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

        // Get other values from metadata provider
        CrawlMetadata provider = getMetadataProvider();

        addIfNotBlank(record, "operator", provider.getOperator());
        addIfNotBlank(record, "publisher", provider.getOrganization());
        addIfNotBlank(record, "audience", provider.getAudience());
        addIfNotBlank(record, "isPartOf", provider.getJobName());
        // TODO: make date match 'job creation date' as in Heritrix 1.x
        // until then, leave out (plenty of dates already in WARC records)
        addIfNotBlank(record, "description", provider.getDescription());
        // Locale.ROOT keeps the lowercasing stable regardless of the JVM's default locale.
        addIfNotBlank(record, "robots", provider.getRobotsPolicyName().toLowerCase(Locale.ROOT));

        addIfNotBlank(record, "http-header-user-agent", provider.getUserAgent());
        addIfNotBlank(record, "http-header-from", provider.getOperatorFrom());

        String netarchiveSuiteComment = "#added by NetarchiveSuite "
                + dk.netarkivet.common.Constants.getVersionString(false);

        ANVLRecord recordNAS = buildNasRecord();

        // really ugly to return as List<String>, but changing would require
        // larger refactoring
        cachedMetadata = Collections.singletonList(record.toString()
                + netarchiveSuiteComment + "\n" + recordNAS.toString());
        return cachedMetadata;
    }

    /**
     * Builds the NetarchiveSuite part of the warcinfo record from {@link #metadataMap}.
     * Mandatory fields are always written (possibly with a null value if the template
     * omitted them); optional fields are written only when present in the map.
     * @return an ANVLRecord with the harvestInfo.* fields; empty if no map is available
     */
    private ANVLRecord buildNasRecord() {
        ANVLRecord recordNAS = new ANVLRecord(); // Previously new ANVLRecord(7);
        if (metadataMap == null) {
            logger.warn("No NetarchiveSuite harvestInfo data available in the template");
            return recordNAS;
        }
        try {
            // Mandatory fields, written unconditionally.
            recordNAS.addLabelValue(HARVESTINFO_VERSION, (String) metadataMap.get(HARVESTINFO_VERSION));
            recordNAS.addLabelValue(HARVESTINFO_JOBID, (String) metadataMap.get(HARVESTINFO_JOBID));
            recordNAS.addLabelValue(HARVESTINFO_CHANNEL, (String) metadataMap.get(HARVESTINFO_CHANNEL));
            recordNAS.addLabelValue(HARVESTINFO_HARVESTNUM, (String) metadataMap.get(HARVESTINFO_HARVESTNUM));
            recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONID,
                    (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONID));
            recordNAS.addLabelValue(HARVESTINFO_MAXBYTESPERDOMAIN,
                    (String) metadataMap.get(HARVESTINFO_MAXBYTESPERDOMAIN));
            recordNAS.addLabelValue(HARVESTINFO_MAXOBJECTSPERDOMAIN,
                    (String) metadataMap.get(HARVESTINFO_MAXOBJECTSPERDOMAIN));
            recordNAS.addLabelValue(HARVESTINFO_ORDERXMLNAME,
                    (String) metadataMap.get(HARVESTINFO_ORDERXMLNAME));

            // Optional fields, written only when the template supplied them.
            if (metadataMap.containsKey(HARVESTINFO_ORDERXMLUPDATEDATE)) {
                recordNAS.addLabelValue(HARVESTINFO_ORDERXMLUPDATEDATE,
                        (String) metadataMap.get(HARVESTINFO_ORDERXMLUPDATEDATE));
            }
            if (metadataMap.containsKey(HARVESTINFO_ORDERXMLDESCRIPTION)) {
                recordNAS.addLabelValue(HARVESTINFO_ORDERXMLDESCRIPTION,
                        (String) metadataMap.get(HARVESTINFO_ORDERXMLDESCRIPTION));
            }
            recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONNAME,
                    (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONNAME));

            if (metadataMap.containsKey(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS)) {
                recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS,
                        (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS));
            }
            if (metadataMap.containsKey(HARVESTINFO_SCHEDULENAME)) {
                recordNAS.addLabelValue(HARVESTINFO_SCHEDULENAME,
                        (String) metadataMap.get(HARVESTINFO_SCHEDULENAME));
            }
            recordNAS.addLabelValue(HARVESTINFO_HARVESTFILENAMEPREFIX,
                    (String) metadataMap.get(HARVESTINFO_HARVESTFILENAMEPREFIX));
            recordNAS.addLabelValue(HARVESTINFO_JOBSUBMITDATE,
                    (String) metadataMap.get(HARVESTINFO_JOBSUBMITDATE));

            if (metadataMap.containsKey(HARVESTINFO_PERFORMER)) {
                recordNAS.addLabelValue(HARVESTINFO_PERFORMER,
                        (String) metadataMap.get(HARVESTINFO_PERFORMER));
            }
            if (metadataMap.containsKey(HARVESTINFO_OPERATOR)) {
                recordNAS.addLabelValue(HARVESTINFO_OPERATOR,
                        (String) metadataMap.get(HARVESTINFO_OPERATOR));
            }
            if (metadataMap.containsKey(HARVESTINFO_AUDIENCE)) {
                recordNAS.addLabelValue(HARVESTINFO_AUDIENCE,
                        (String) metadataMap.get(HARVESTINFO_AUDIENCE));
            }
        } catch (Exception e) {
            logger.warn("Error processing harvest info", e);
        }
        return recordNAS;
    }

    /**
     * modify default writeMetadata method to handle the write of outlinks
     * in metadata or not
     *
     * @param w the WARCWriter to write the metadata record with
     * @param timestamp 14-digit record date
     * @param baseid base record id the metadata record id is qualified from
     * @param curi the crawl URI the metadata describes
     * @param namedFields extra WARC headers for the record
     * @return the record id of the written metadata record
     * @throws IOException if writing the record fails
     */
    @Override
    protected URI writeMetadata(final WARCWriter w,
            final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
            throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.metadata);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(ANVLRecord.MIMETYPE);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setEnforceLength(true);

        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

        // Get some metadata from the curi.
        // TODO: Get all curi metadata.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
        // RFC822 (commons-httpclient?).
        ANVLRecord r = new ANVLRecord();
        if (curi.isSeed()) {
            r.addLabel("seed");
        } else {
            if (curi.forceFetch()) {
                r.addLabel("force-fetch");
            }
            // Hoisted so flattenVia() is computed only once.
            String via = flattenVia(curi);
            if (StringUtils.isNotBlank(via)) {
                r.addLabelValue("via", via);
            }
            if (StringUtils.isNotBlank(curi.getPathFromSeed())) {
                r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            }
            if (curi.containsDataKey(A_SOURCE_TAG)) {
                r.addLabelValue("sourceTag", (String) curi.getData().get(A_SOURCE_TAG));
            }
        }
        long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
        if (duration > -1) {
            r.addLabelValue("fetchTimeMs", Long.toString(duration));
        }

        if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
            r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
        }

        if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
            r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
        }

        for (String annotation : curi.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
                String[] kv = annotation.split(":", 2);
                r.addLabelValue(kv[0], kv[1]);
            }
        }

        // only if parameter is true, add the outlinks
        if (getWriteMetadataOutlinks()) {
            // Add outlinks though they are effectively useless without anchor text.
            Collection<CrawlURI> links = curi.getOutLinks();
            if (links != null && links.size() > 0) {
                for (CrawlURI link : links) {
                    r.addLabelValue("outlink",
                            link.getURI() + " " + link.getLastHop() + " " + link.getViaContext());
                }
            }
        }

        // TODO: Other curi fields to write to metadata.
        //
        // Credentials
        //
        // fetch-began-time: 1154569278774
        // fetch-completed-time: 1154569281816
        //
        // Annotations.

        byte[] b = r.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);

        w.writeRecord(recordInfo);

        return recordInfo.getRecordId();
    }

}