/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.Replica;
import dk.netarkivet.common.distribute.arcrepository.ReplicaType;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.GetMetadataArchiveBatchJob;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * This is an implementation of the RawDataCache specialized for data out of metadata files. It uses regular
 * expressions for matching URL and mime-type of ARC entries for the kind of metadata we want.
 */
public class RawMetadataCache extends FileBasedCache<Long> implements RawDataCache {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(RawMetadataCache.class);

    /** A regular expression object that matches everything. */
    public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");

    /** The prefix (cache name) that this cache uses. */
    private final String prefix;

    /**
     * The arc repository interface. This does not need to be closed, it is a singleton.
     */
    private final ViewerArcRepositoryClient arcrep = ArcRepositoryClientFactory.getViewerInstance();

    /** The job that we use to dig through metadata files. */
    private final ArchiveBatchJob job;

    /**
     * Create a new RawMetadataCache. For a given job ID, this will fetch and cache selected content from metadata
     * files (&lt;ID&gt;-metadata-[0-9]+.arc). Any entry in a metadata file that matches both patterns will be
     * returned. The returned data does not directly indicate which file they were from, though parts intrinsic to
     * the particular format might.
     *
     * @param prefix A prefix that will be used to distinguish this cache's files from other caches'. It will be used
     * for creating a directory, so it must not contain characters not legal in directory names.
     * @param urlMatcher A pattern for matching URLs of the desired entries. If null, a .* pattern will be used.
     * @param mimeMatcher A pattern for matching mime-types of the desired entries. If null, a .* pattern will be
     * used.
     */
    public RawMetadataCache(String prefix, Pattern urlMatcher, Pattern mimeMatcher) {
        super(prefix);
        this.prefix = prefix;
        // Null patterns mean "match everything"; normalize before building the batch job.
        final Pattern urlPattern = (urlMatcher != null) ? urlMatcher : MATCH_ALL_PATTERN;
        final Pattern mimePattern = (mimeMatcher != null) ? mimeMatcher : MATCH_ALL_PATTERN;
        // SLF4J stringifies the pattern arguments itself; no explicit toString() needed.
        log.info("Metadata cache for '{}' is fetching metadata with urls matching '{}' and mimetype matching '{}'",
                prefix, urlPattern, mimePattern);
        job = new GetMetadataArchiveBatchJob(urlPattern, mimePattern);
    }

    /**
     * Get the file potentially containing (cached) data for a single job.
     *
     * @param id The job to find data for.
     * @return The file where cache data for the job can be stored.
     * @see FileBasedCache#getCacheFile(Object)
     */
    @Override
    public File getCacheFile(Long id) {
        ArgumentNotValid.checkNotNull(id, "job ID");
        ArgumentNotValid.checkNotNegative(id, "job ID");
        return new File(getCacheDir(), prefix + "-" + id + "-cache");
    }

    /**
     * Actually cache data for the given ID. Runs the metadata batch job against the configured replica first; if
     * that yields no data and the look-in-other-replicas option is enabled, each other known bitarchive replica is
     * tried in turn.
     *
     * @param id A job ID to cache data for.
     * @return The given ID if data was found and written to getCacheFile(id); null if no data could be found in any
     * of the consulted replicas. (Note: the cached data may be empty, but at least one file was successfully
     * processed.)
     * @see FileBasedCache#cacheData(Object)
     */
    @Override
    protected Long cacheData(Long id) {
        final String replicaUsed = Settings.get(CommonSettings.USE_REPLICA_ID);
        final String metadataFilePatternSuffix = Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX);
        log.debug("Extract using a batchjob of type '{}' cachedata from files matching '{}{}' on replica '{}'", job
                .getClass().getName(), id, metadataFilePatternSuffix, replicaUsed);
        job.processOnlyFilesMatching(".*" + id + ".*" + metadataFilePatternSuffix);
        BatchStatus b = arcrep.batch(job, replicaUsed);

        // This check ensures that we got data from at least one file.
        // Mind you, the data may be empty, but at least one file was
        // successfully processed.
        if (b.hasResultFile() && b.getNoOfFilesProcessed() > b.getFilesFailed().size()) {
            File cacheFileName = getCacheFile(id);
            b.copyResults(cacheFileName);
            log.debug("Cached data for job '{}' for '{}'", id, prefix);
            return id;
        } else {
            // Look for data in other bitarchive replicas, if this option is enabled
            if (!Settings.getBoolean(HarvesterSettings.INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS)) {
                log.info("No data found for job '{}' for '{}' in local bitarchive '{}'. ", id, prefix, replicaUsed);
                return null;
            } else {
                log.info("No data found for job '{}' for '{}' in local bitarchive '{}'. Trying other replicas.", id,
                        prefix, replicaUsed);
                for (Replica rep : Replica.getKnown()) {
                    // Only use different bitarchive replicas than replicaUsed
                    if (rep.getType().equals(ReplicaType.BITARCHIVE) && !rep.getId().equals(replicaUsed)) {
                        log.debug("Trying to retrieve index data for job '{}' from '{}'.", id, rep.getId());
                        b = arcrep.batch(job, rep.getId());

                        // Perform same check as for the batchresults from
                        // the default replica.
                        if (b.hasResultFile() && (b.getNoOfFilesProcessed() > b.getFilesFailed().size())) {
                            File cacheFileName = getCacheFile(id);
                            b.copyResults(cacheFileName);
                            log.info("Cached data for job '{}' for '{}' from '{}' instead of '{}'", id, prefix, rep,
                                    replicaUsed);
                            return id;
                        } else {
                            log.trace("No data found for job '{}' for '{}' in bitarchive '{}'. ", id, prefix, rep);
                        }
                    }
                }
                log.info("No data found for job '{}' for '{}' in all bitarchive replicas", id, prefix);
                return null;
            }
        }
    }

}