001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.heritrix3; 025 026import java.io.BufferedReader; 027import java.io.File; 028import java.io.FileReader; 029import java.io.IOException; 030import java.util.ArrayList; 031import java.util.List; 032 033import org.apache.commons.io.IOUtils; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037import dk.netarkivet.common.CommonSettings; 038import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 039import dk.netarkivet.common.distribute.arcrepository.BatchStatus; 040import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord; 041import dk.netarkivet.common.exceptions.ArgumentNotValid; 042import dk.netarkivet.common.exceptions.IOFailure; 043import dk.netarkivet.common.utils.FileUtils; 044import dk.netarkivet.common.utils.Settings; 045import dk.netarkivet.common.utils.batch.FileBatchJob; 046import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob; 047import dk.netarkivet.common.utils.cdx.CDXRecord; 048import dk.netarkivet.harvester.datamodel.HeritrixTemplate; 049import dk.netarkivet.harvester.datamodel.Job; 050import dk.netarkivet.harvester.harvesting.metadata.MetadataFile; 051 052/** 053 * This class handles all the things in a single harvest that are not related directly related either to launching 054 * Heritrix3 or to handling JMS messages. 055 * 056 * This is currently code only related to retriving the recoverlog from metadata-file and inserting 057 * it into Heritrix3. Netarchivesuite has not fully implemented this feature yet. 058 */ 059public class HarvestControllerUtils { 060 061 /** The instance logger. */ 062 private static final Logger log = LoggerFactory.getLogger(HarvestJob.class); 063 064 /** 065 * This method attempts to retrieve the Heritrix recover log from the job which this job tries to continue. If 066 * successful, the Heritrix template is updated accordingly. 067 * 068 * @param job The harvest Job object containing various harvest setup data. 069 * @param files Heritrix files related to this harvestjob. 070 */ 071 private static void tryToRetrieveRecoverLog(Job job, Heritrix3Files files) { 072 Long previousJob = job.getContinuationOf(); 073 List<CDXRecord> metaCDXes = null; 074 try { 075 metaCDXes = getMetadataCDXRecordsForJob(previousJob); 076 } catch (IOFailure e) { 077 log.debug("Failed to retrive CDX of metatadata records. " 078 + "Maybe the metadata arcfile for job {} does not exist in repository", previousJob, e); 079 } 080 081 CDXRecord recoverlogCDX = null; 082 if (metaCDXes != null) { 083 for (CDXRecord cdx : metaCDXes) { 084 if (cdx.getURL().matches(MetadataFile.RECOVER_LOG_PATTERN)) { 085 recoverlogCDX = cdx; 086 } 087 } 088 if (recoverlogCDX == null) { 089 log.debug("A recover.gz log file was not found in metadata-arcfile"); 090 } else { 091 log.debug("recover.gz log found in metadata-arcfile"); 092 } 093 } 094 095 BitarchiveRecord br = null; 096 if (recoverlogCDX != null) { // Retrieve recover.gz from metadata.arc file 097 br = ArcRepositoryClientFactory.getViewerInstance().get(recoverlogCDX.getArcfile(), 098 recoverlogCDX.getOffset()); 099 if (br != null) { 100 log.debug("recover.gz log retrieved from metadata-arcfile"); 101 if (files.writeRecoverBackupfile(br.getData())) { 102 // modify order.xml, so Heritrix recover-path points 103 // to the retrieved recoverlog 104 insertHeritrixRecoverPathInOrderXML(job, files); 105 } else { 106 log.warn("Failed to retrieve and write recoverlog to disk."); 107 } 108 } else { 109 log.debug("recover.gz log not retrieved from metadata-arcfile"); 110 } 111 } 112 } 113 114 /** 115 * Insert the correct recoverpath in the order.xml for the given harvestjob. 116 * 117 * @param job A harvestjob 118 * @param files Heritrix files related to this harvestjob. 119 */ 120 private static void insertHeritrixRecoverPathInOrderXML(Job job, Heritrix3Files files) { 121 HeritrixTemplate temp = job.getOrderXMLdoc(); 122 temp.setRecoverlogNode(files.getRecoverBackupGzFile()); 123 job.setOrderXMLDoc(temp); // Update template associated with job 124 } 125 126 /** 127 * Submit a batch job to generate cdx for all metadata files for a job, and report result in a list. 128 * 129 * @param jobid The job to get cdx for. 130 * @return A list of cdx records. 131 * @throws ArgumentNotValid If jobid is 0 or negative. 132 * @throws IOFailure On trouble generating the cdx 133 */ 134 private static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) { 135 ArgumentNotValid.checkPositive(jobid, "jobid"); 136 FileBatchJob cdxJob = new ArchiveExtractCDXJob(false); 137 cdxJob.processOnlyFilesMatching(jobid + "-metadata-[0-9]+\\.(w)?arc(\\.gz)?"); 138 File f; 139 try { 140 f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir()); 141 } catch (IOException e) { 142 throw new IOFailure("Could not create temporary file", e); 143 } 144 BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob, 145 Settings.get(CommonSettings.USE_REPLICA_ID)); 146 status.getResultFile().copyTo(f); 147 List<CDXRecord> records; 148 BufferedReader reader = null; 149 try { 150 reader = new BufferedReader(new FileReader(f)); 151 records = new ArrayList<CDXRecord>(); 152 for (String line = reader.readLine(); line != null; line = reader.readLine()) { 153 String[] parts = line.split("\\s+"); 154 CDXRecord record = new CDXRecord(parts); 155 records.add(record); 156 } 157 } catch (IOException e) { 158 throw new IOFailure("Unable to read results from file '" + f + "'", e); 159 } finally { 160 IOUtils.closeQuietly(reader); 161 FileUtils.remove(f); 162 } 163 return records; 164 } 165 166}