001package dk.netarkivet.harvester.harvesting;
002
003import static org.archive.format.warc.WARCConstants.TYPE;
004import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;
005import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
006
007import java.io.ByteArrayInputStream;
008import java.io.IOException;
009import java.net.InetAddress;
010import java.net.URI;
011import java.net.UnknownHostException;
012import java.util.Collection;
013import java.util.Collections;
014import java.util.HashMap;
015import java.util.List;
016import java.util.Map;
017
018import org.apache.commons.lang.StringUtils;
019import org.archive.format.warc.WARCConstants.WARCRecordType;
020import org.archive.io.warc.WARCRecordInfo;
021import org.archive.io.warc.WARCWriter;
022import org.archive.modules.CrawlMetadata;
023import org.archive.modules.CrawlURI;
024import org.archive.modules.writer.WARCWriterProcessor;
025import org.archive.util.ArchiveUtils;
026import org.archive.util.anvl.ANVLRecord;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030/**
031 * Custom NAS WARCWriterProcessor addding NetarchiveSuite metadata to the WARCInfo records written
032 * by Heritrix by just extending the org.archive.modules.writer.WARCWriterProcessor;
033 * This was not possible in H1.
034 * @author svc 
035 * 
036 */
037public class NasWARCProcessor extends WARCWriterProcessor {
038
039    /** Logger instance. */
040    private static final Logger logger = LoggerFactory.getLogger(NasWARCProcessor.class);
041
042
043    // Constants for the contents of the WarcInfo record
044        private static final String HARVESTINFO_VERSION = "harvestInfo.version";
045        private static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
046        private static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";
047        private static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
048        private static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
049        private static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
050        private static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
051        private static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.templateName";
052        private static final String HARVESTINFO_ORDERXMLUPDATEDATE = "harvestInfo.templateLastUpdateDate";
053        private static final String HARVESTINFO_ORDERXMLDESCRIPTION = "harvestInfo.templateDescription";
054        private static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
055        private static final String HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS = "harvestInfo.origHarvestDefinitionComments";
056        private static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
057        private static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
058        private static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
059        private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
060        private static final String HARVESTINFO_OPERATOR = "harvestInfo.operator";
061        private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";
062
063        public boolean getWriteMetadataOutlinks() {
064        return (Boolean) kp.get("writeMetadataOutlinks");
065    }
066    public void setWriteMetadataOutlinks(boolean writeMetadataOutlinks) {
067        kp.put("writeMetadataOutlinks",writeMetadataOutlinks);
068    }
069
070    List<String> cachedMetadata;
071
072        public NasWARCProcessor() {
073                super();
074        }
075        
076         /**
077     * metadata items.
078     * Add to bean WARCProcessor bean as as
079     * <property name="metadataItems"> 
080     * <map>
081     *  <entry key="harvestInfo.version" value="0.6"/>
082         *      <entry key="harvestInfo.jobId" value="23"/>
083         *  <entry key="harvestInfo.channel" value="FOCUSED"/>
084         * ...  
085     * </map>
086
087     */
088    protected Map<String,String> metadataMap = new HashMap<String,String>();
089
090    public Map<String,String> getFormItems() {
091        return this.metadataMap;
092    }
093    public void setMetadataItems(Map<String,String> metadataItems) {
094        this.metadataMap = metadataItems;
095    }
096
097        
098        @Override
099        public List<String> getMetadata() {
100        if (cachedMetadata != null) {
101            return cachedMetadata;
102        }
103        ANVLRecord record = new ANVLRecord();
104        record.addLabelValue("software", "Heritrix/" +
105                ArchiveUtils.VERSION + " http://crawler.archive.org");
106        try {
107            InetAddress host = InetAddress.getLocalHost();
108            record.addLabelValue("ip", host.getHostAddress());
109            record.addLabelValue("hostname", host.getCanonicalHostName());
110        } catch (UnknownHostException e) {
111            //logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
112        }
113        
114        // conforms to ISO 28500:2009 as of May 2009
115        // as described at http://bibnum.bnf.fr/WARC/ 
116        // latest draft as of November 2008
117        record.addLabelValue("format","WARC File Format 1.0"); 
118        record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
119        
120        // Get other values from metadata provider
121
122        CrawlMetadata provider = getMetadataProvider();
123
124        addIfNotBlank(record,"operator", provider.getOperator());
125        addIfNotBlank(record,"publisher", provider.getOrganization());
126        addIfNotBlank(record,"audience", provider.getAudience());
127        addIfNotBlank(record,"isPartOf", provider.getJobName());
128        // TODO: make date match 'job creation date' as in Heritrix 1.x
129        // until then, leave out (plenty of dates already in WARC 
130        // records
131//            String rawDate = provider.getBeginDate();
132//            if(StringUtils.isNotBlank(rawDate)) {
133//                Date date;
134//                try {
135//                    date = ArchiveUtils.parse14DigitDate(rawDate);
136//                    addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
137//                } catch (ParseException e) {
138//                    logger.log(Level.WARNING,"obtaining warc created date",e);
139//                }
140//            }
141        addIfNotBlank(record,"description", provider.getDescription());
142        addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase());
143
144        addIfNotBlank(record,"http-header-user-agent",
145                provider.getUserAgent());
146        addIfNotBlank(record,"http-header-from",
147                provider.getOperatorFrom());
148        
149        
150        String netarchiveSuiteComment = "#added by NetarchiveSuite "
151                + dk.netarkivet.common.Constants.getVersionString(false);
152        ANVLRecord recordNAS = new ANVLRecord(); // Previously new ANVLRecord(7); 
153        
154        // Add the data from the metadataMap to the WarcInfoRecord  if it exists
155        if (metadataMap == null) {
156                logger.warn("No NetarchiveSuite harvestInfo data available in the template");
157        } else {
158                try {
159    
160                        recordNAS.addLabelValue(HARVESTINFO_VERSION, (String) metadataMap.get(HARVESTINFO_VERSION));
161                        recordNAS.addLabelValue(HARVESTINFO_JOBID, (String) metadataMap.get(HARVESTINFO_JOBID));
162                        recordNAS.addLabelValue(HARVESTINFO_CHANNEL, (String) metadataMap.get(HARVESTINFO_CHANNEL));
163                        recordNAS.addLabelValue(HARVESTINFO_HARVESTNUM, (String) metadataMap.get(HARVESTINFO_HARVESTNUM));
164                        recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONID,
165                                        (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONID));
166                        recordNAS.addLabelValue(HARVESTINFO_MAXBYTESPERDOMAIN,
167                                        (String) metadataMap.get(HARVESTINFO_MAXBYTESPERDOMAIN));
168
169                        recordNAS.addLabelValue(HARVESTINFO_MAXOBJECTSPERDOMAIN,
170                                        (String) metadataMap.get(HARVESTINFO_MAXOBJECTSPERDOMAIN));
171                        recordNAS.addLabelValue(HARVESTINFO_ORDERXMLNAME,
172                                        (String) metadataMap.get(HARVESTINFO_ORDERXMLNAME));
173                        if (metadataMap.containsKey(HARVESTINFO_ORDERXMLUPDATEDATE)) {
174                                recordNAS.addLabelValue(HARVESTINFO_ORDERXMLUPDATEDATE, (String) metadataMap.get(HARVESTINFO_ORDERXMLUPDATEDATE));
175                        }
176                        if (metadataMap.containsKey(HARVESTINFO_ORDERXMLDESCRIPTION)) {
177                                recordNAS.addLabelValue(HARVESTINFO_ORDERXMLDESCRIPTION, (String) metadataMap.get(HARVESTINFO_ORDERXMLDESCRIPTION));
178                        }
179                        recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONNAME,
180                                        (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONNAME));
181
182                        if (metadataMap.containsKey(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS)) {
183                                recordNAS.addLabelValue(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS,
184                                                (String) metadataMap.get(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS));
185                        }
186                        
187                        if (metadataMap.containsKey(HARVESTINFO_SCHEDULENAME)) {
188                                recordNAS.addLabelValue(HARVESTINFO_SCHEDULENAME,
189                                                (String) metadataMap.get(HARVESTINFO_SCHEDULENAME));
190                        }
191                        recordNAS.addLabelValue(HARVESTINFO_HARVESTFILENAMEPREFIX,
192                                        (String) metadataMap.get(HARVESTINFO_HARVESTFILENAMEPREFIX));
193
194                        recordNAS.addLabelValue(HARVESTINFO_JOBSUBMITDATE,
195                                        (String) metadataMap.get(HARVESTINFO_JOBSUBMITDATE));
196
197                        if (metadataMap.containsKey(HARVESTINFO_PERFORMER)) {
198                                recordNAS.addLabelValue(HARVESTINFO_PERFORMER,
199                                                (String) metadataMap.get(HARVESTINFO_PERFORMER));
200                        }
201                        if (metadataMap.containsKey(HARVESTINFO_OPERATOR)) {
202                                recordNAS.addLabelValue(HARVESTINFO_OPERATOR, (String) metadataMap.get(HARVESTINFO_OPERATOR));
203                        }
204
205                        if (metadataMap.containsKey(HARVESTINFO_AUDIENCE)) {
206                                recordNAS.addLabelValue(HARVESTINFO_AUDIENCE,
207                                                (String) metadataMap.get(HARVESTINFO_AUDIENCE));
208                        }
209                } catch (Exception e) {
210                        logger.warn("Error processing harvest info" , e);
211                }
212        }
213
214        // really ugly to return as List<String>, but changing would require 
215        // larger refactoring
216        cachedMetadata = Collections.singletonList(record.toString() 
217                        + netarchiveSuiteComment + "\n" + recordNAS.toString());
218        return cachedMetadata;
219    }
220        
221        /**
222         * modify default writeMetadata method to handle the write of outlinks
223         * in metadata or not
224         */
225        @Override
226        protected URI writeMetadata(final WARCWriter w,
227            final String timestamp,
228            final URI baseid, final CrawlURI curi,
229            final ANVLRecord namedFields) 
230    throws IOException {
231            WARCRecordInfo recordInfo = new WARCRecordInfo();
232        recordInfo.setType(WARCRecordType.metadata);
233        recordInfo.setUrl(curi.toString());
234        recordInfo.setCreate14DigitDate(timestamp);
235        recordInfo.setMimetype(ANVLRecord.MIMETYPE);
236        recordInfo.setExtraHeaders(namedFields);
237        recordInfo.setEnforceLength(true);
238            
239        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));
240
241        // Get some metadata from the curi.
242        // TODO: Get all curi metadata.
243        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
244        // RFC822 (commons-httpclient?).
245        ANVLRecord r = new ANVLRecord();
246        if (curi.isSeed()) {
247            r.addLabel("seed");
248        } else {
249                if (curi.forceFetch()) {
250                        r.addLabel("force-fetch");
251                }
252            if(StringUtils.isNotBlank(flattenVia(curi))) {
253                r.addLabelValue("via", flattenVia(curi));
254            }
255            if(StringUtils.isNotBlank(curi.getPathFromSeed())) {
256                r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
257            }
258            if (curi.containsDataKey(A_SOURCE_TAG)) {
259                r.addLabelValue("sourceTag", 
260                        (String)curi.getData().get(A_SOURCE_TAG));
261            }
262        }
263        long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
264        if (duration > -1) {
265            r.addLabelValue("fetchTimeMs", Long.toString(duration));
266        }
267        
268        if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
269            r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
270        }
271
272        if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
273            r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
274        }
275        
276        for (String annotation: curi.getAnnotations()) {
277            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
278                String[] kv = annotation.split(":", 2);
279                r.addLabelValue(kv[0], kv[1]);
280            }
281        }
282
283        //only if parameter is true, add the outlinks
284        if (getWriteMetadataOutlinks() == true) {
285                // Add outlinks though they are effectively useless without anchor text.
286            Collection<CrawlURI> links = curi.getOutLinks();
287            if (links != null && links.size() > 0) {
288                for (CrawlURI link: links) {
289                    r.addLabelValue("outlink", link.getURI()+" "+link.getLastHop()+" "+link.getViaContext());
290                }
291            }
292        }
293
294        // TODO: Other curi fields to write to metadata.
295        // 
296        // Credentials
297        // 
298        // fetch-began-time: 1154569278774
299        // fetch-completed-time: 1154569281816
300        //
301        // Annotations.
302        
303        byte [] b = r.getUTF8Bytes();
304        recordInfo.setContentStream(new ByteArrayInputStream(b));
305        recordInfo.setContentLength((long) b.length);
306        
307        w.writeRecord(recordInfo);
308        
309        return recordInfo.getRecordId();
310    }
311        
312}