001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3;
024
025import java.io.File;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import dk.netarkivet.common.exceptions.ArgumentNotValid;
031import dk.netarkivet.common.exceptions.IOFailure;
032import dk.netarkivet.common.exceptions.UnknownID;
033import dk.netarkivet.common.utils.FileUtils;
034import dk.netarkivet.common.utils.Settings;
035import dk.netarkivet.harvester.HarvesterSettings;
036import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
037import dk.netarkivet.harvester.datamodel.Job;
038import dk.netarkivet.harvester.harvesting.PersistentJobData;
039
040/**
041 * This class encapsulates the information generated by Heritrix3 or delivered to Heritrix3 before a crawl.
042 */
043public class Heritrix3Files {
044
045        /** The logger for this class. */
046    private static final Logger LOG = LoggerFactory.getLogger(Heritrix3Files.class);
047
048
049        private static final String HERITRIX_UNPACKDIR = "heritrix3/";
050        
051        private File crawlDir;
052        private Long harvestID;
053        private Long jobID;
054        private File orderXML;
055        private File indexDir;
056        private String archiveFilePrefix;
057        private File h3ZipBall;
058        private File h3CerticateFile;
059        private File h3BaseDir;
060        private File h3JobDir;
061        private String jobName;
062        private File h3LogDir;
063
064        private File seedsFile;
065
066        private File orderFile;
067        
068        public static Heritrix3Files getH3HeritrixFiles(File crawldir, PersistentJobData harvestInfo) {
069                Heritrix3Files files = new Heritrix3Files();
070                files.setCrawldir(crawldir);
071                files.setJobId(harvestInfo.getJobID());
072                files.setHarvestID(harvestInfo.getOrigHarvestDefinitionID());
073                files.setArchivePrefix(harvestInfo.getHarvestFilenamePrefix());
074                files.setHeritrixZip();
075                files.setCertificateFile();
076                files.setHeritrixBaseDir();
077                files.setHeritrixJobDir();
078                return files;
079        }
080
081        public static Heritrix3Files getH3HeritrixFiles(File crawldir, Job job) { 
082                Heritrix3Files files = new Heritrix3Files();
083                files.setCrawldir(crawldir);
084                files.setJobId(job.getJobID());
085                files.setHarvestID(job.getOrigHarvestDefinitionID());
086                files.setArchivePrefix(job.getHarvestFilenamePrefix());
087                files.setHeritrixZip();
088                files.setCertificateFile();
089                files.setHeritrixBaseDir();
090                files.setHeritrixJobDir();
091                return files;
092        }
093        
094        private void setHarvestID(Long origHarvestDefinitionID) {
095                this.harvestID = origHarvestDefinitionID;
096        }
097
098        private void setHeritrixJobDir() {
099                jobName = crawlDir.getName();
100                h3JobDir = new File(h3BaseDir, "jobs/" + jobName);
101                h3LogDir = new File(h3JobDir, "logs");
102        }
103
104        private void setHeritrixBaseDir() {
105                h3BaseDir = new File(crawlDir, HERITRIX_UNPACKDIR);
106        }
107
108        private void setHeritrixZip() {
109                h3ZipBall = Settings.getFile(HarvesterSettings.HERITRIX3_BUNDLE);
110                if (!h3ZipBall.isFile()) {
111                        throw new IOFailure("The path to the heritrix3 zipfile '" 
112                                        +  h3ZipBall.getAbsolutePath() + "' does not represent a proper file");
113                }
114        }
115
116        private void setArchivePrefix(String harvestFilenamePrefix) {
117                this.archiveFilePrefix = harvestFilenamePrefix;
118                
119        }
120
121        private void setJobId(Long jobID) {
122                this.jobID = jobID;
123        }
124        
125        private void setCrawldir(File crawldir) {
126                this.crawlDir = crawldir;
127                this.seedsFile = new File(crawldir, "seeds.txt");
128                this.orderFile = new File(crawldir, "crawler-beans.cxml");
129        }
130
131        private Heritrix3Files(){
132        }
133
134        public File getCrawlDir() {
135                return this.crawlDir;
136        }
137
138        public void writeSeedsTxt(String seedListAsString) {
139                ArgumentNotValid.checkNotNullOrEmpty(seedListAsString, "String seedListAsString");
140                LOG.debug("Writing seeds to disk as file: " + seedsFile.getAbsolutePath());
141                FileUtils.writeBinaryFile(seedsFile, seedListAsString.getBytes());
142        }
143        
144        public File getSeedsFile() {            
145                return this.seedsFile;
146        }
147        
148        public File getOrderFile() {            
149                return this.orderFile;
150        }
151
152
153        public void setIndexDir(File indexDir) {
154                ArgumentNotValid.checkExistsDirectory(indexDir, "File indexDir");
155                this.indexDir = indexDir;;
156                
157        }
158        public void writeOrderXml(HeritrixTemplate orderXMLdoc) { 
159                File destination = this.orderFile;
160                
161                orderXMLdoc.writeToFile(destination);
162                this.orderXML = destination;    
163        }
164
165        public File getProgressStatisticsLog() {
166                return new File(h3LogDir, "progress-statistics.log");
167        }
168
169        public Long getJobID() {
170                return this.jobID;
171        }
172
173        public File getOrderXmlFile() {
174                return this.orderXML; 
175        }
176        public File getSeedsTxtFile() {
177                return new File(h3JobDir, "seeds.txt"); 
178        }
179
180        public Long getHarvestID() {
181                return this.harvestID;
182        }
183
184        public String getArchiveFilePrefix() {
185                return this.archiveFilePrefix;
186        }       
187
188        public File getIndexDir() {
189                return this.indexDir;
190        }
191
192        public File getCrawlLog() {
193                return new File(h3LogDir, "crawl.log");
194        }
195        
196        public File getHeritrixZip() {
197                return this.h3ZipBall;
198        }
199
200        public File getCertificateFile() {
201                return h3CerticateFile;
202        }       
203
204        private void setCertificateFile() {
205                try {
206                        h3CerticateFile = Settings.getFile(HarvesterSettings.HERITRIX3_CERTIFICATE);
207                } catch (UnknownID unknownID) {
208                        LOG.debug("No heritrix3 certificate defined in settings, using default");
209                        return;
210                }
211                if (h3CerticateFile != null && !h3CerticateFile.isFile()) {
212                        throw new IOFailure("The path to the heritrix3 certificate '" 
213                                        +  h3CerticateFile.getAbsolutePath() + "' does not represent a proper file");
214                }
215        }
216        
217        public File getHeritrixOutput() {
218                return new File(crawlDir, "heritrix_out.log");
219        }
220        
221        public File getHeritrixStderrLog() {
222                return new File(crawlDir, "heritrix3_err.log");
223        }
224        
225        public File getHeritrixStdoutLog() {
226                return new File(crawlDir, "heritrix3_out.log");
227        }
228
229        public File getHeritrixJobDir() {
230                return h3JobDir; 
231        }
232        
233
234        public File getHeritrixBaseDir() {
235                return h3BaseDir;
236        }
237
238        public String getJobname() {
239                return this.jobName;
240        }
241
242        public void deleteFinalLogs() {
243                try {
244                        FileUtils.remove(getCrawlLog());
245                } catch (IOFailure e) {
246                        // Log harmless trouble
247                        LOG.debug("Couldn't delete crawl log file.", e);
248                }
249                try {
250                        FileUtils.remove(getProgressStatisticsLog());
251                } catch (IOFailure e) {
252                        // Log harmless trouble
253                        LOG.debug("Couldn't delete progress statistics log file.", e);
254                }
255        }
256
257        public void cleanUpAfterHarvest(File oldJobsDir) {
258                 // delete disposable files
259        for (File disposable : getDisposableFiles()) {
260            if (disposable.exists()) {
261                try {
262                    FileUtils.removeRecursively(disposable);
263                } catch (IOFailure e) {
264                    // Log harmless trouble
265                    LOG.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e);
266                }
267            }
268        }
269        // move the rest to oldjobs
270        FileUtils.createDir(oldJobsDir);
271        File destDir = new File(oldJobsDir, crawlDir.getName());
272        boolean success = crawlDir.renameTo(destDir);
273        if (!success) {
274            LOG.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir);
275        }
276        }
277
278        public File[] getDisposableFiles() {
279        return new File[] {new File(h3JobDir, "state"), new File(crawlDir, "checkpoints"), new File(h3JobDir, "scratch")};
280    }
281}