001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.wayback.batch; 025 026import java.io.OutputStream; 027 028import org.archive.io.arc.ARCRecord; 029import org.archive.wayback.UrlCanonicalizer; 030import org.archive.wayback.core.CaptureSearchResult; 031import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; 032import org.archive.wayback.resourcestore.indexer.ARCRecordToSearchResultAdapter; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import dk.netarkivet.common.Constants; 037import dk.netarkivet.common.utils.arc.ARCBatchJob; 038import dk.netarkivet.common.utils.batch.ARCBatchFilter; 039 040/** 041 * Returns a cdx file using the appropriate format for wayback, including canonicalisation of urls. The returned files 042 * are unsorted. 043 */ 044@SuppressWarnings({"deprecation", "serial"}) 045public class WaybackCDXExtractionARCBatchJob extends ARCBatchJob { 046 047 /** Logger for this class. */ 048 private static final Logger log = LoggerFactory.getLogger(WaybackCDXExtractionARCBatchJob.class); 049 050 /** Utility for converting an ArcRecord to a CaptureSearchResult (wayback's representation of a CDX record). */ 051 private ARCRecordToSearchResultAdapter aToSAdapter; 052 053 /** Utility for converting a wayback CaptureSearchResult to a String representing a line in a CDX file. */ 054 private SearchResultToCDXLineAdapter srToCDXAdapter; 055 056 /** 057 * Constructor which set timeout to one day. 058 */ 059 public WaybackCDXExtractionARCBatchJob() { 060 batchJobTimeout = Constants.ONE_DAY_IN_MILLIES; 061 } 062 063 /** 064 * Constructor. 065 * 066 * @param timeout specific timeout period 067 */ 068 public WaybackCDXExtractionARCBatchJob(long timeout) { 069 batchJobTimeout = timeout; 070 } 071 072 /** 073 * Initializes the private fields of this class. Some of these are relatively heavy objects, so it is important that 074 * they are only initialised once. 075 * 076 * @param os unused argument 077 */ 078 @Override 079 public void initialize(OutputStream os) { 080 log.info("Starting a {}", this.getClass().getName()); 081 aToSAdapter = new ARCRecordToSearchResultAdapter(); 082 UrlCanonicalizer uc = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer(); 083 aToSAdapter.setCanonicalizer(uc); 084 srToCDXAdapter = new SearchResultToCDXLineAdapter(); 085 } 086 087 /** 088 * Does nothing except log the end of the job. 089 * 090 * @param os unused argument. 091 */ 092 public void finish(OutputStream os) { 093 log.info("Finishing the {}", this.getClass().getName()); 094 // No cleanup required 095 } 096 097 @Override 098 public ARCBatchFilter getFilter() { 099 return ARCBatchFilter.EXCLUDE_FILE_HEADERS; 100 } 101 102 /** 103 * For each ARCRecord writes one CDX line (including newline) to the output. If an arcrecord cannot be converted to 104 * a CDX record for any reason then any resulting exception is caught and logged. 105 * 106 * @param record the ARCRecord to be indexed. 107 * @param os the OutputStream to which output is written. 108 */ 109 @Override 110 public void processRecord(ARCRecord record, OutputStream os) { 111 CaptureSearchResult csr = null; 112 log.debug("Entered {} for '{}'", this.getClass().getName(), record.getHeaderString()); 113 try { 114 log.debug("Adapting Record '{}'", record.getHeader()); 115 csr = aToSAdapter.adapt(record); 116 log.debug("Adapted Record '{}' to '{}'", record.getHeader(), csr); 117 } catch (Exception e) { 118 log.error("Exception processing ARC record:", e); 119 return; 120 } 121 try { 122 if (csr != null) { 123 log.debug("Adapting Search Result'{}'", csr); 124 String cdx = srToCDXAdapter.adapt(csr); 125 os.write(cdx.getBytes()); 126 os.write("\n".getBytes()); 127 log.debug("Adapted Search Result '{}' + to '{}'", csr, cdx); 128 } else { 129 log.info("Could not parse '{}'", record.getHeaderString()); 130 } 131 } catch (Exception e) { 132 log.error("Exception processing ARC record:", e); 133 } 134 } 135 136}