/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.metadata;

import java.io.File;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * Wraps information for an Heritrix file that should be stored in the metadata ARC.
 * <p>
 * Defines a natural order to sort them: first by metadata type (in the declaration order of
 * {@link MetadataType}), then by metadata URL.
 */
public class MetadataFile implements Comparable<MetadataFile> {

    /**
     * The available type of metadata records. The declaration order of the constants is significant: it is the
     * primary sort key used by {@link MetadataFile#compareTo(MetadataFile)}.
     *
     * @author ngiraud
     */
    private enum MetadataType {
        /** Metadata records for the setup of Heritrix. */
        setup,
        /** Metadata records for the reports generated by Heritrix. */
        reports,
        /** Metadata records for the logs generated by Heritrix. */
        logs,
        /** Metadata records for the index. */
        index
    }

    /**
     * A string format that is used to build metadata URLs. Parameters are, in order :
     * <ol>
     * <li>The organization</li>
     * <li>the file type, see {@link MetadataType}</li>
     * <li>the file name</li>
     * <li>the Heritrix version</li>
     * <li>the harvest id</li>
     * <li>the job id</li>
     * </ol>
     */
    private static final String URL_FORMAT = "metadata://%s/crawl/%s/%s?heritrixVersion=%s&harvestid=%s&jobid=%s";

    /**
     * A pattern identifying a CDX metadata entry.
     *
     * @see dk.netarkivet.harvester.indexserver.CDXDataCache#CDXDataCache()
     */
    public static final String CDX_PATTERN = "metadata://[^/]*/crawl/index/cdx.*";

    /**
     * A pattern identifying the crawl log metadata entry.
     *
     * @see dk.netarkivet.harvester.indexserver.CrawlLogDataCache#CrawlLogDataCache()
     */
    public static final String CRAWL_LOG_PATTERN = "metadata://[^/]*/crawl/logs/crawl\\.log.*";

    /**
     * A pattern identifying the recover.gz log metadata entry.
     */
    public static final String RECOVER_LOG_PATTERN = "metadata://[^/]*/crawl/logs/recover\\.gz.*";

    /**
     * The pattern controlling which files in the crawl directory root should be stored in the metadata ARC.
     */
    public static final String HERITRIX_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_HERITRIX_FILE_PATTERN);

    /**
     * The pattern controlling which files in the crawl directory root should be stored in the metadata ARC as reports.
     */
    public static final String REPORT_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_REPORT_FILE_PATTERN);

    /**
     * The pattern controlling which files in the logs subdirectory of the crawl directory root should be stored in the
     * metadata ARC as log files.
     */
    public static final String LOG_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_LOG_FILE_PATTERN);

    /**
     * The name of a domain-specific Heritrix settings file (a.k.a. override).
     */
    public static final String DOMAIN_SETTINGS_FILE = "settings.xml";

    /**
     * The url representing the record in metadata-arc-file for this metadata file. Not final because the
     * domain-specific constructor appends a query parameter after delegating to the general constructor.
     */
    private String url;
    /** The Heritrix metadata file. */
    private final File heritrixFile;
    /** The type of Heritrix metadata file. */
    private final MetadataType type;

    /**
     * Creates a metadata file and finds which metadata type it belongs to. First the name of the heritrix file is
     * tested against the report file pattern, then against the log file pattern. If the name matches neither of
     * these, it is considered a setup file.
     *
     * @param heritrixFile a given heritrix metadata file.
     * @param harvestId The harvestID that the job generating this file is part of.
     * @param jobId The Id of the job generating this file
     * @param heritrixVersion the version of Heritrix generating the file
     * @throws NullPointerException if heritrixFile, harvestId or jobId is null
     */
    public MetadataFile(File heritrixFile, Long harvestId, Long jobId, String heritrixVersion) {
        // Fail fast with a descriptive message; these nulls would otherwise surface as an anonymous
        // NullPointerException from a dereference or an unboxing further down.
        if (heritrixFile == null) {
            throw new NullPointerException("heritrixFile must not be null");
        }
        if (harvestId == null) {
            throw new NullPointerException("harvestId must not be null");
        }
        if (jobId == null) {
            throw new NullPointerException("jobId must not be null");
        }

        this.heritrixFile = heritrixFile;
        String name = heritrixFile.getName();
        this.type = resolveType(name);
        this.url = makeMetadataURL(this.type, name, harvestId, jobId, heritrixVersion);
    }

    /**
     * Creates a metadata file for a domain-specific override file. The URL is the one built by the general
     * constructor with <code>&amp;domain=&lt;domain&gt;</code> appended verbatim.
     *
     * @param heritrixFile a given heritrix metadata file.
     * @param harvestId The harvestID that the job generating this file is part of.
     * @param jobId The Id of the job generating this file
     * @param heritrixVersion the version of Heritrix generating the file
     * @param domain The name of the domain, this metadata belongs to
     * @throws NullPointerException if heritrixFile, harvestId or jobId is null
     */
    public MetadataFile(File heritrixFile, Long harvestId, Long jobId, String heritrixVersion, String domain) {
        this(heritrixFile, harvestId, jobId, heritrixVersion);
        url += "&domain=" + domain;
    }

    /**
     * @return the metadata URL associated to this file.
     */
    public String getUrl() {
        return url;
    }

    /**
     * Returns the actual file.
     *
     * @return the actual file.
     */
    public File getHeritrixFile() {
        return heritrixFile;
    }

    /**
     * First we compare the metadata types (in enum declaration order), then the URLs.
     * <p>
     * Only the sign of the returned value is meaningful.
     *
     * @param other the metadata file to compare with.
     * @return a negative integer, zero, or a positive integer as this file sorts before, equal to, or after the
     * other one.
     */
    @Override
    public int compareTo(MetadataFile other) {
        // Enum#compareTo orders by declaration position, i.e. by ordinal.
        int typeCompare = this.type.compareTo(other.type);
        if (typeCompare != 0) {
            return typeCompare;
        }
        return this.url.compareTo(other.url);
    }

    /**
     * Determines the metadata type of a file from its name. The report pattern takes precedence over the log
     * pattern; a name matching neither is a setup file.
     *
     * @param name The name of the file.
     * @return the metadata type the name maps to.
     */
    private static MetadataType resolveType(String name) {
        if (name.matches(REPORT_FILE_PATTERN)) {
            return MetadataType.reports;
        }
        if (name.matches(LOG_FILE_PATTERN)) {
            return MetadataType.logs;
        }
        return MetadataType.setup;
    }

    /**
     * Creates a metadata URL for this file. Metadata URLs are used to retrieve records in the metadata ARC file.
     *
     * @param theType The type of metadata for this file.
     * @param name The name of the file.
     * @param harvestID The harvestID that the job is part of.
     * @param jobID The jobID that this file belongs to.
     * @param heritrixVersion The version of Heritrix generating the file.
     * @return the metadata URL for this file
     */
    private static String makeMetadataURL(MetadataType theType, String name, long harvestID, long jobID,
            String heritrixVersion) {
        return String.format(URL_FORMAT, Settings.get(CommonSettings.ORGANIZATION), theType.name(), name,
                heritrixVersion, Long.toString(harvestID), Long.toString(jobID));
    }

}