Source code

001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 *
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.wayback.batch;
025
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.archive.io.arc.ARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
import org.archive.wayback.resourcestore.indexer.ARCRecordToSearchResultAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.utils.arc.ARCBatchJob;
import dk.netarkivet.common.utils.batch.ARCBatchFilter;
039
040/**
041 * Returns a cdx file using the appropriate format for wayback, including canonicalisation of urls. The returned files
042 * are unsorted.
043 */
044@SuppressWarnings({"deprecation", "serial"})
045public class WaybackCDXExtractionARCBatchJob extends ARCBatchJob {
046
047    /** Logger for this class. */
048    private static final Logger log = LoggerFactory.getLogger(WaybackCDXExtractionARCBatchJob.class);
049
050    /** Utility for converting an ArcRecord to a CaptureSearchResult (wayback's representation of a CDX record). */
051    private ARCRecordToSearchResultAdapter aToSAdapter;
052
053    /** Utility for converting a wayback CaptureSearchResult to a String representing a line in a CDX file. */
054    private SearchResultToCDXLineAdapter srToCDXAdapter;
055
056    /**
057     * Constructor which set timeout to one day.
058     */
059    public WaybackCDXExtractionARCBatchJob() {
060        batchJobTimeout = Constants.ONE_DAY_IN_MILLIES;
061    }
062
063    /**
064     * Constructor.
065     *
066     * @param timeout specific timeout period
067     */
068    public WaybackCDXExtractionARCBatchJob(long timeout) {
069        batchJobTimeout = timeout;
070    }
071
072    /**
073     * Initializes the private fields of this class. Some of these are relatively heavy objects, so it is important that
074     * they are only initialised once.
075     *
076     * @param os unused argument
077     */
078    @Override
079    public void initialize(OutputStream os) {
080        log.info("Starting a {}", this.getClass().getName());
081        aToSAdapter = new ARCRecordToSearchResultAdapter();
082        UrlCanonicalizer uc = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
083        aToSAdapter.setCanonicalizer(uc);
084        srToCDXAdapter = new SearchResultToCDXLineAdapter();
085    }
086
087    /**
088     * Does nothing except log the end of the job.
089     *
090     * @param os unused argument.
091     */
092    public void finish(OutputStream os) {
093        log.info("Finishing the {}", this.getClass().getName());
094        // No cleanup required
095    }
096
097    @Override
098    public ARCBatchFilter getFilter() {
099        return ARCBatchFilter.EXCLUDE_FILE_HEADERS;
100    }
101
102    /**
103     * For each ARCRecord writes one CDX line (including newline) to the output. If an arcrecord cannot be converted to
104     * a CDX record for any reason then any resulting exception is caught and logged.
105     *
106     * @param record the ARCRecord to be indexed.
107     * @param os the OutputStream to which output is written.
108     */
109    @Override
110    public void processRecord(ARCRecord record, OutputStream os) {
111        CaptureSearchResult csr = null;
112        log.debug("Entered {} for '{}'", this.getClass().getName(), record.getHeaderString());
113        try {
114            log.debug("Adapting Record '{}'", record.getHeader());
115            csr = aToSAdapter.adapt(record);
116            log.debug("Adapted Record '{}' to '{}'", record.getHeader(), csr);
117        } catch (Exception e) {
118            log.error("Exception processing ARC record:", e);
119            return;
120        }
121        try {
122            if (csr != null) {
123                log.debug("Adapting Search Result'{}'", csr);
124                String cdx = srToCDXAdapter.adapt(csr);
125                os.write(cdx.getBytes());
126                os.write("\n".getBytes());
127                log.debug("Adapted Search Result '{}' + to '{}'", csr, cdx);
128            } else {
129                log.info("Could not parse '{}'", record.getHeaderString());
130            }
131        } catch (Exception e) {
132            log.error("Exception processing ARC record:", e);
133        }
134    }
135
136}