001package dk.netarkivet.harvester.harvesting;
002
003import java.net.InetAddress;
004import java.net.UnknownHostException;
005import java.util.Collections;
006import java.util.HashMap;
007import java.util.List;
008import java.util.Map;
009
010import org.archive.modules.CrawlMetadata;
011import org.archive.modules.writer.WARCWriterProcessor;
012import org.archive.util.ArchiveUtils;
013import org.archive.util.anvl.ANVLRecord;
014
015/**
016 * Custom NAS WARCWriterProcessor addding NetarchiveSuite metadata to the WARCInfo records written
017 * by Heritrix by just extending the org.archive.modules.writer.WARCWriterProcessor;
018 * This was not possible in H1.
019 * @author svc 
020 * 
021 */
022public class NasWARCProcessor extends WARCWriterProcessor {
023
024        // Constants for the contents of the WarcInfo record
025        private static final String HARVESTINFO_VERSION = "harvestInfo.version";
026        private static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
027        private static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";
028        private static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
029        private static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
030        private static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
031        private static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
032        private static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.orderXMLName";
033        private static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
034        private static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
035        private static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
036        private static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
037        private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
038        private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";
039
040        public NasWARCProcessor() {
041                super();
042        }
043        
044        List<String> cachedMetadata;
045        
046         /**
047     * metadata items.
048     * Add to bean WARCProcessor bean as as
049     * <property name="metadataItems"> 
050     * <map>
051     *  <entry key="harvestInfo.version" value="0.5"/>
052         *      <entry key="harvestInfo.jobId" value="23"/>
053         *  <entry key="harvestInfo.channel" value="FOCUSED"/>
054         * ...  
055     * </map>
056
057     */
058    protected Map<String,String> metadataMap = new HashMap<String,String>();
059
060    public Map<String,String> getFormItems() {
061        return this.metadataMap;
062    }
063    public void setMetadataItems(Map<String,String> metadataItems) {
064        this.metadataMap = metadataItems;
065    }
066
067        
068        @Override
069        public List<String> getMetadata() {
070        if (cachedMetadata != null) {
071            return cachedMetadata;
072        }
073        ANVLRecord record = new ANVLRecord();
074        record.addLabelValue("software", "Heritrix/" +
075                ArchiveUtils.VERSION + " http://crawler.archive.org");
076        try {
077            InetAddress host = InetAddress.getLocalHost();
078            record.addLabelValue("ip", host.getHostAddress());
079            record.addLabelValue("hostname", host.getCanonicalHostName());
080        } catch (UnknownHostException e) {
081            //logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
082        }
083        
084        // conforms to ISO 28500:2009 as of May 2009
085        // as described at http://bibnum.bnf.fr/WARC/ 
086        // latest draft as of November 2008
087        record.addLabelValue("format","WARC File Format 1.0"); 
088        record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
089        
090        // Get other values from metadata provider
091
092        CrawlMetadata provider = getMetadataProvider();
093
094        addIfNotBlank(record,"operator", provider.getOperator());
095        addIfNotBlank(record,"publisher", provider.getOrganization());
096        addIfNotBlank(record,"audience", provider.getAudience());
097        addIfNotBlank(record,"isPartOf", provider.getJobName());
098        // TODO: make date match 'job creation date' as in Heritrix 1.x
099        // until then, leave out (plenty of dates already in WARC 
100        // records
101//            String rawDate = provider.getBeginDate();
102//            if(StringUtils.isNotBlank(rawDate)) {
103//                Date date;
104//                try {
105//                    date = ArchiveUtils.parse14DigitDate(rawDate);
106//                    addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
107//                } catch (ParseException e) {
108//                    logger.log(Level.WARNING,"obtaining warc created date",e);
109//                }
110//            }
111        addIfNotBlank(record,"description", provider.getDescription());
112        addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase());
113
114        addIfNotBlank(record,"http-header-user-agent",
115                provider.getUserAgent());
116        addIfNotBlank(record,"http-header-from",
117                provider.getOperatorFrom());
118        
119        
120        String netarchiveSuiteComment = "#added by NetarchiveSuite "
121                + dk.netarkivet.common.Constants.getVersionString();
122        ANVLRecord recordNAS = new ANVLRecord(); // Previously new ANVLRecord(7); 
123
124        // Add the data from the metadataMap to the WarcInfoRecord.
125        recordNAS.addLabelValue(HARVESTINFO_VERSION, (String) metadataMap.get(HARVESTINFO_VERSION));
126        recordNAS.addLabelValue(HARVESTINFO_JOBID, (String) metadataMap.get(HARVESTINFO_JOBID));
127        recordNAS.addLabelValue(HARVESTINFO_CHANNEL, (String) metadataMap.get(HARVESTINFO_CHANNEL));
128        recordNAS.addLabelValue(HARVESTINFO_HARVESTNUM, (String) metadataMap.get(HARVESTINFO_HARVESTNUM));
129        recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONID, 
130                (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONID));
131        recordNAS.addLabelValue(HARVESTINFO_MAXBYTESPERDOMAIN, 
132                (String) metadataMap.get(HARVESTINFO_MAXBYTESPERDOMAIN));
133
134        recordNAS.addLabelValue(HARVESTINFO_MAXOBJECTSPERDOMAIN, 
135                (String) metadataMap.get(HARVESTINFO_MAXOBJECTSPERDOMAIN));
136        recordNAS.addLabelValue(HARVESTINFO_ORDERXMLNAME, 
137                (String) metadataMap.get(HARVESTINFO_ORDERXMLNAME));
138        recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONNAME,
139                (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONNAME));
140
141        if (metadataMap.containsKey(HARVESTINFO_SCHEDULENAME)) {
142            recordNAS.addLabelValue(HARVESTINFO_SCHEDULENAME, 
143                (String) metadataMap.get(HARVESTINFO_SCHEDULENAME));
144        }
145        recordNAS.addLabelValue(HARVESTINFO_HARVESTFILENAMEPREFIX,
146                (String) metadataMap.get(HARVESTINFO_HARVESTFILENAMEPREFIX));
147 
148        recordNAS.addLabelValue(HARVESTINFO_JOBSUBMITDATE, 
149                (String) metadataMap.get(HARVESTINFO_JOBSUBMITDATE));
150        
151        if (metadataMap.containsKey(HARVESTINFO_PERFORMER)) {
152                recordNAS.addLabelValue(HARVESTINFO_PERFORMER, 
153                (String) metadataMap.get(HARVESTINFO_PERFORMER));
154        }
155
156        if (metadataMap.containsKey(HARVESTINFO_AUDIENCE)) { 
157            recordNAS.addLabelValue(HARVESTINFO_AUDIENCE, 
158                (String) metadataMap.get(HARVESTINFO_AUDIENCE));
159        }
160        
161        // really ugly to return as List<String>, but changing would require 
162        // larger refactoring
163        cachedMetadata = Collections.singletonList(record.toString() 
164                        + netarchiveSuiteComment + "\n" + recordNAS.toString());
165        return cachedMetadata;
166    }
167        
168}