Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.heritrix3;
025
026import java.io.BufferedReader;
027import java.io.File;
028import java.io.FileReader;
029import java.io.IOException;
030import java.util.ArrayList;
031import java.util.List;
032
033import org.apache.commons.io.IOUtils;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036
037import dk.netarkivet.common.CommonSettings;
038import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
039import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
040import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
041import dk.netarkivet.common.exceptions.ArgumentNotValid;
042import dk.netarkivet.common.exceptions.IOFailure;
043import dk.netarkivet.common.utils.FileUtils;
044import dk.netarkivet.common.utils.Settings;
045import dk.netarkivet.common.utils.batch.FileBatchJob;
046import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
047import dk.netarkivet.common.utils.cdx.CDXRecord;
048import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
049import dk.netarkivet.harvester.datamodel.Job;
050import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
051
052/**
053 * This class handles all the things in a single harvest that are not directly related either to launching
054 * Heritrix3 or to handling JMS messages.
055 * 
056 * This is currently only code related to retrieving the recover log from the metadata file and inserting
057 * it into Heritrix3. Netarchivesuite has not fully implemented this feature yet. 
058 */
059public class HarvestControllerUtils {
060
061    /** The instance logger. */
062    private static final Logger log = LoggerFactory.getLogger(HarvestJob.class);
063
064    /**
065     * This method attempts to retrieve the Heritrix recover log from the job which this job tries to continue. If
066     * successful, the Heritrix template is updated accordingly.
067     *
068     * @param job The harvest Job object containing various harvest setup data.
069     * @param files Heritrix files related to this harvestjob.
070     */
071    private static void tryToRetrieveRecoverLog(Job job, Heritrix3Files files) {
072        Long previousJob = job.getContinuationOf();
073        List<CDXRecord> metaCDXes = null;
074        try {
075            metaCDXes = getMetadataCDXRecordsForJob(previousJob);
076        } catch (IOFailure e) {
077            log.debug("Failed to retrive CDX of metatadata records. "
078                    + "Maybe the metadata arcfile for job {} does not exist in repository", previousJob, e);
079        }
080
081        CDXRecord recoverlogCDX = null;
082        if (metaCDXes != null) {
083            for (CDXRecord cdx : metaCDXes) {
084                if (cdx.getURL().matches(MetadataFile.RECOVER_LOG_PATTERN)) {
085                    recoverlogCDX = cdx;
086                }
087            }
088            if (recoverlogCDX == null) {
089                log.debug("A recover.gz log file was not found in metadata-arcfile");
090            } else {
091                log.debug("recover.gz log found in metadata-arcfile");
092            }
093        }
094
095        BitarchiveRecord br = null;
096        if (recoverlogCDX != null) { // Retrieve recover.gz from metadata.arc file
097            br = ArcRepositoryClientFactory.getViewerInstance().get(recoverlogCDX.getArcfile(),
098                    recoverlogCDX.getOffset());
099            if (br != null) {
100                log.debug("recover.gz log retrieved from metadata-arcfile");
101                if (files.writeRecoverBackupfile(br.getData())) {
102                    // modify order.xml, so Heritrix recover-path points
103                    // to the retrieved recoverlog
104                    insertHeritrixRecoverPathInOrderXML(job, files);
105                } else {
106                    log.warn("Failed to retrieve and write recoverlog to disk.");
107                }
108            } else {
109                log.debug("recover.gz log not retrieved from metadata-arcfile");
110            }
111        }
112    }
113
114    /**
115     * Insert the correct recoverpath in the order.xml for the given harvestjob.
116     *
117     * @param job A harvestjob
118     * @param files Heritrix files related to this harvestjob.
119     */
120    private static void insertHeritrixRecoverPathInOrderXML(Job job, Heritrix3Files files) {            
121        HeritrixTemplate temp = job.getOrderXMLdoc(); 
122        temp.setRecoverlogNode(files.getRecoverBackupGzFile());
123        job.setOrderXMLDoc(temp); // Update template associated with job
124    }
125
126    /**
127     * Submit a batch job to generate cdx for all metadata files for a job, and report result in a list.
128     *
129     * @param jobid The job to get cdx for.
130     * @return A list of cdx records.
131     * @throws ArgumentNotValid If jobid is 0 or negative.
132     * @throws IOFailure On trouble generating the cdx
133     */
134    private static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
135        ArgumentNotValid.checkPositive(jobid, "jobid");
136        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false);
137        cdxJob.processOnlyFilesMatching(jobid + "-metadata-[0-9]+\\.(w)?arc(\\.gz)?");
138        File f;
139        try {
140            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
141        } catch (IOException e) {
142            throw new IOFailure("Could not create temporary file", e);
143        }
144        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
145                Settings.get(CommonSettings.USE_REPLICA_ID));
146        status.getResultFile().copyTo(f);
147        List<CDXRecord> records;
148        BufferedReader reader = null;
149        try {
150            reader = new BufferedReader(new FileReader(f));
151            records = new ArrayList<CDXRecord>();
152            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
153                String[] parts = line.split("\\s+");
154                CDXRecord record = new CDXRecord(parts);
155                records.add(record);
156            }
157        } catch (IOException e) {
158            throw new IOFailure("Unable to read results from file '" + f + "'", e);
159        } finally {
160            IOUtils.closeQuietly(reader);
161            FileUtils.remove(f);
162        }
163        return records;
164    }
165
166}