001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.metadata;
024
025import java.io.File;
026
027import dk.netarkivet.common.CommonSettings;
028import dk.netarkivet.common.utils.Settings;
029import dk.netarkivet.harvester.HarvesterSettings;
030
031/**
032 * Wraps information for an Heritrix file that should be stored in the metadata ARC.
033 * <p>
034 * Defines a natural order to sort them.
035 */
036public class MetadataFile implements Comparable<MetadataFile> {
037
038    /**
039     * The available type of metadata records.
040     *
041     * @author ngiraud
042     */
043    private enum MetadataType {
044        /** Metadata records for the setup of Heritrix. */
045        setup,
046        /** Metadata records for the reports generated by Heritrix. */
047        reports,
048        /** Metadata records for the logs generated by Heritrix. */
049        logs,
050        /** Metadata records for the index. */
051        index
052    }
053
054    /**
055     * A string format that is used to build metadata URLs. Parameters are, in order :
056     * <ol>
057     * <li>The organization</li>
058     * <li>the file type @see {@link MetadataType}</li>
059     * <li>the file name</li>
060     * <li>the Heritrix version</li>
061     * <li>the harvest id</li>
062     * <li>the job id</li>
063     * </ol>
064     */
065    private static final String URL_FORMAT = "metadata://%s/crawl/%s/%s?heritrixVersion=%s&harvestid=%s&jobid=%s";
066    /**
067     * A pattern identifying a CDX metadata entry.
068     *
069     * @see dk.netarkivet.harvester.indexserver.CDXDataCache#CDXDataCache()
070     */
071    public static final String CDX_PATTERN = "metadata://[^/]*/crawl/index/cdx.*";
072
073    /**
074     * A pattern identifying the crawl log metadata entry.
075     *
076     * @see dk.netarkivet.harvester.indexserver.CrawlLogDataCache #CrawlLogDataCache()
077     */
078    public static final String CRAWL_LOG_PATTERN = "metadata://[^/]*/crawl/logs/crawl\\.log.*";
079
080    /**
081     * A pattern identifying the recover.gz log metadata entry.
082     */
083    public static final String RECOVER_LOG_PATTERN = "metadata://[^/]*/crawl/logs/recover\\.gz.*";
084
085    /**
086     * The pattern controlling which files in the crawl directory root should be stored in the metadata ARC.
087     */
088    public static final String HERITRIX_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_HERITRIX_FILE_PATTERN);
089
090    /**
091     * The pattern controlling which files in the crawl directory root should be stored in the metadata ARC as reports.
092     */
093    public static final String REPORT_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_REPORT_FILE_PATTERN);
094
095    /**
096     * The pattern controlling which files in the logs subdirectory of the crawl directory root should be stored in the
097     * metadata ARC as log files.
098     */
099    public static final String LOG_FILE_PATTERN = Settings.get(HarvesterSettings.METADATA_LOG_FILE_PATTERN);
100
101    /**
102     * The name of a domain-specific Heritrix settings file (a.k.a. override).
103     */
104    public static final String DOMAIN_SETTINGS_FILE = "settings.xml";
105
106    /**
107     * The url representing the record in metadata-arc-file for this metadata file.
108     */
109    private String url;
110    /** The Heritrix metadata file. */
111    private File heritrixFile;
112    /** The type of Heritrix metadata file. */
113    private MetadataType type;
114
115    /**
116     * Creates a metadata file and finds which metadata type it belongs to. First the name of a heritrixfile is tested
117     * against the reportfile pattern, then again the logfile pattern. If the name matches neither of these, it is
118     * considered a setup file.
119     */
120    public MetadataFile(File heritrixFile, Long harvestId, Long jobId, String heritrixVersion) {
121        this.heritrixFile = heritrixFile;
122
123        type = MetadataType.setup;
124        String name = heritrixFile.getName();
125        if (name.matches(REPORT_FILE_PATTERN)) {
126            type = MetadataType.reports;
127            url = makeMetadataURL(MetadataType.reports, heritrixFile.getName(), harvestId, jobId, heritrixVersion);
128        } else if (name.matches(LOG_FILE_PATTERN)) {
129            type = MetadataType.logs;
130            url = makeMetadataURL(MetadataType.logs, heritrixFile.getName(), harvestId, jobId, heritrixVersion);
131        } else {
132            url = makeMetadataURL(MetadataType.setup, heritrixFile.getName(), harvestId, jobId, heritrixVersion);
133        }
134    }
135
136    /**
137     * Creates a metadata file for a domain-specific override file.
138     *
139     * @param heritrixFile a given heritrix metadata file.
140     * @param harvestId The harvestID that the job generating this file is part of.
141     * @param jobId The Id of the job generating this file
142     * @param heritrixVersion the version of Heritrix generating the file
143     * @param domain The name of the domain, this metadata belongs to
144     */
145    public MetadataFile(File heritrixFile, Long harvestId, Long jobId, String heritrixVersion, String domain) {
146        this(heritrixFile, harvestId, jobId, heritrixVersion);
147        url += "&domain=" + domain;
148    }
149
150    /**
151     * @return the metadata URL associated to this file.
152     */
153    public String getUrl() {
154        return url;
155    }
156
157    /**
158     * Returns the actual file.
159     *
160     * @return the actual file.
161     */
162    public File getHeritrixFile() {
163        return heritrixFile;
164    }
165
166    /** First we compare the type ordinals, then the URLs. */
167    public int compareTo(MetadataFile other) {
168        Integer thisOrdinal = this.type.ordinal();
169        Integer otherOrdinal = other.type.ordinal();
170
171        int ordinalCompare = thisOrdinal.compareTo(otherOrdinal);
172        if (ordinalCompare != 0) {
173            return ordinalCompare;
174        }
175        return this.url.compareTo(other.url);
176    }
177
178    /**
179     * Creates a metadata URL for this file. Metadata URLs are used to retrieve records in the metadata ARC file.
180     *
181     * @param theType The type of metadata for this file.
182     * @param name The name of the file.
183     * @param harvestID The harvestID that the job is part of.
184     * @param jobID The jobID that this file belongs to.
185     * @param heritrixVersion The version of Heritrix generating the file.
186     * @return the metadata URL for this file
187     */
188    private String makeMetadataURL(MetadataType theType, String name, long harvestID, long jobID, String heritrixVersion) {
189        return String.format(URL_FORMAT, Settings.get(CommonSettings.ORGANIZATION), theType.name(), name,
190                heritrixVersion, Long.toString(harvestID), Long.toString(jobID));
191    }
192
193}