001package dk.netarkivet.harvester.heritrix3;
002
003import java.io.File;
004import java.io.InputStream;
005
006import org.slf4j.Logger;
007import org.slf4j.LoggerFactory;
008
009import dk.netarkivet.common.exceptions.ArgumentNotValid;
010import dk.netarkivet.common.exceptions.IOFailure;
011import dk.netarkivet.common.exceptions.UnknownID;
012import dk.netarkivet.common.utils.FileUtils;
013import dk.netarkivet.common.utils.Settings;
014import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
015import dk.netarkivet.harvester.datamodel.Job;
016import dk.netarkivet.harvester.harvesting.PersistentJobData;
017
018/**
019 * This class encapsulates the information generated by Heritrix3 or delivered to Heritrix3 before a crawl.
020 * @author svc
021 * 
022 * TODO implementing recoverlog handling
023 */
024public class Heritrix3Files {
025
026        /** The logger for this class. */
027    //private static final Log LOG = LogFactory.getLog(Heritrix3Files.class);
028    private static final Logger LOG = LoggerFactory.getLogger(Heritrix3Files.class);
029
030
031        private static final String HERITRIX_UNPACKDIR = "heritrix3/";
032        
033        private File crawlDir;
034        private Long harvestID;
035        private Long jobID;
036        private File orderXML;
037        private File indexDir;
038        private String archiveFilePrefix;
039        private File h3ZipBall;
040        private File h3CerticateFile;
041        private File h3BaseDir;
042        private File h3JobDir;
043        private String jobName;
044        private File h3LogDir;
045
046        private File seedsFile;
047
048        private File orderFile;
049        
050        public static Heritrix3Files getH3HeritrixFiles(File crawldir, PersistentJobData harvestInfo) {
051                Heritrix3Files files = new Heritrix3Files();
052                files.setCrawldir(crawldir);
053                files.setJobId(harvestInfo.getJobID());
054                files.setHarvestID(harvestInfo.getOrigHarvestDefinitionID());
055                files.setArchivePrefix(harvestInfo.getHarvestFilenamePrefix());
056                files.setHeritrixZip();
057                files.setCertificateFile();
058                files.setHeritrixBaseDir();
059                files.setHeritrixJobDir();
060                return files;
061        }
062
063        public static Heritrix3Files getH3HeritrixFiles(File crawldir, Job job) { 
064                Heritrix3Files files = new Heritrix3Files();
065                files.setCrawldir(crawldir);
066                files.setJobId(job.getJobID());
067                files.setHarvestID(job.getOrigHarvestDefinitionID());
068                files.setArchivePrefix(job.getHarvestFilenamePrefix());
069                files.setHeritrixZip();
070                files.setCertificateFile();
071                files.setHeritrixBaseDir();
072                files.setHeritrixJobDir();
073                return files;
074        }
075        
076        private void setHarvestID(Long origHarvestDefinitionID) {
077                this.harvestID = origHarvestDefinitionID;
078        }
079
080        private void setHeritrixJobDir() {
081                jobName = crawlDir.getName();
082                h3JobDir = new File(h3BaseDir, "jobs/" + jobName);
083                h3LogDir = new File(h3JobDir, "logs");
084        }
085
086        private void setHeritrixBaseDir() {
087                h3BaseDir = new File(crawlDir, HERITRIX_UNPACKDIR);
088        }
089
090        private void setHeritrixZip() {
091                h3ZipBall = Settings.getFile(Heritrix3Settings.HERITRIX3_BUNDLE);
092                if (!h3ZipBall.isFile()) {
093                        throw new IOFailure("The path to the heritrix3 zipfile '" 
094                                        +  h3ZipBall.getAbsolutePath() + "' does not represent a proper file");
095                }
096        }
097
098        private void setArchivePrefix(String harvestFilenamePrefix) {
099                this.archiveFilePrefix = harvestFilenamePrefix;
100                
101        }
102
103        private void setJobId(Long jobID) {
104                this.jobID = jobID;
105        }
106        
107        private void setCrawldir(File crawldir) {
108                this.crawlDir = crawldir;
109                this.seedsFile = new File(crawldir, "seeds.txt");
110                this.orderFile = new File(crawldir, "crawler-beans.cxml");
111        }
112
113        private Heritrix3Files(){
114        }
115
116        public File getCrawlDir() {
117                return this.crawlDir;
118        }
119
120        public void writeSeedsTxt(String seedListAsString) {
121                ArgumentNotValid.checkNotNullOrEmpty(seedListAsString, "String seedListAsString");
122                LOG.debug("Writing seeds to disk as file: " + seedsFile.getAbsolutePath());
123                FileUtils.writeBinaryFile(seedsFile, seedListAsString.getBytes());
124        }
125        
126        public File getSeedsFile() {            
127                return this.seedsFile;
128        }
129        
130        public File getOrderFile() {            
131                return this.orderFile;
132        }
133
134
135        public void setIndexDir(File indexDir) {
136                ArgumentNotValid.checkExistsDirectory(indexDir, "File indexDir");
137                this.indexDir = indexDir;;
138                
139        }
140        public void writeOrderXml(HeritrixTemplate orderXMLdoc) { 
141                File destination = this.orderFile;
142                
143                orderXMLdoc.writeToFile(destination);
144                this.orderXML = destination;    
145        }
146
147        public File getProgressStatisticsLog() {
148                return new File(h3LogDir, "progress-statistics.log");
149        }
150
151        public Long getJobID() {
152                return this.jobID;
153        }
154
155        public File getOrderXmlFile() {
156                return this.orderXML; 
157        }
158        public File getSeedsTxtFile() {
159                return new File(h3JobDir, "seeds.txt"); 
160        }
161
162        public Long getHarvestID() {
163                return this.harvestID;
164        }
165
166        public String getArchiveFilePrefix() {
167                return this.archiveFilePrefix;
168        }       
169
170        public File getIndexDir() {
171                return this.indexDir;
172        }
173
174        public File getCrawlLog() {
175                return new File(h3LogDir, "crawl.log");
176        }
177        
178        public File getHeritrixZip() {
179                return this.h3ZipBall;
180        }
181
182        public File getCertificateFile() {
183                return h3CerticateFile;
184        }       
185
186        private void setCertificateFile() {
187                try {
188                        h3CerticateFile = Settings.getFile(Heritrix3Settings.HERITRIX3_CERTIFICATE);
189                } catch (UnknownID unknownID) {
190                        LOG.debug("No heritrix3 certificate defined in settings, using default");
191                        return;
192                }
193                if (h3CerticateFile != null && !h3CerticateFile.isFile()) {
194                        throw new IOFailure("The path to the heritrix3 certificate '" 
195                                        +  h3CerticateFile.getAbsolutePath() + "' does not represent a proper file");
196                }
197        }
198        
199        public File getHeritrixOutput() {
200                return new File(crawlDir, "heritrix_out.log");
201        }
202        
203        public File getHeritrixStderrLog() {
204                return new File(crawlDir, "heritrix3_err.log");
205        }
206        
207        public File getHeritrixStdoutLog() {
208                return new File(crawlDir, "heritrix3_out.log");
209        }
210
211        public File getHeritrixJobDir() {
212                return h3JobDir; 
213        }
214        
215
216        public File getHeritrixBaseDir() {
217                return h3BaseDir;
218        }
219
220        public String getJobname() {
221                return this.jobName;
222        }
223
224        public void deleteFinalLogs() {
225                try {
226                        FileUtils.remove(getCrawlLog());
227                } catch (IOFailure e) {
228                        // Log harmless trouble
229                        LOG.debug("Couldn't delete crawl log file.", e);
230                }
231                try {
232                        FileUtils.remove(getProgressStatisticsLog());
233                } catch (IOFailure e) {
234                        // Log harmless trouble
235                        LOG.debug("Couldn't delete progress statistics log file.", e);
236                }
237        }
238
239        public void cleanUpAfterHarvest(File oldJobsDir) {
240                 // delete disposable files
241        for (File disposable : getDisposableFiles()) {
242            if (disposable.exists()) {
243                try {
244                    FileUtils.removeRecursively(disposable);
245                } catch (IOFailure e) {
246                    // Log harmless trouble
247                    LOG.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e);
248                }
249            }
250        }
251        // move the rest to oldjobs
252        FileUtils.createDir(oldJobsDir);
253        File destDir = new File(oldJobsDir, crawlDir.getName());
254        boolean success = crawlDir.renameTo(destDir);
255        if (!success) {
256            LOG.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir);
257        }
258        }
259
260        public File[] getDisposableFiles() {
261        return new File[] {new File(h3JobDir, "state"), new File(crawlDir, "checkpoints"), new File(h3JobDir, "scratch")};
262    }
263        
264        ////////////////////// UNIMPLEMENTED METHODS ///////////////////////////////
265        
266        // FIXME Handling of the recoverLog is postponed
267
268        public boolean writeRecoverBackupfile(InputStream data) {
269                // TODO Auto-generated method stub
270                return false;
271        }
272
273        public File getRecoverBackupGzFile() {
274                // TODO Auto-generated method stub
275                return null;
276        }
277}