/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.Replica;
import dk.netarkivet.common.distribute.arcrepository.ReplicaType;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.GetMetadataArchiveBatchJob;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * This is an implementation of the RawDataCache specialized for data out of metadata files. It uses regular
 * expressions for matching URL and mime-type of ARC entries for the kind of metadata we want.
 */
public class RawMetadataCache extends FileBasedCache<Long> implements RawDataCache {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(RawMetadataCache.class);

    /** A regular expression object that matches everything. */
    public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");

    /** The prefix (cache name) that this cache uses. */
    private final String prefix;

    /**
     * The arc repository interface. This does not need to be closed, it is a singleton.
     */
    private final ViewerArcRepositoryClient arcrep = ArcRepositoryClientFactory.getViewerInstance();

    /** The job that we use to dig through metadata files. */
    private final ArchiveBatchJob job;

    /**
     * Create a new RawMetadataCache. For a given job ID, this will fetch and cache selected content from metadata
     * files (&lt;ID&gt;-metadata-[0-9]+.arc). Any entry in a metadata file that matches both patterns will be
     * returned. The returned data does not directly indicate which file they were from, though parts intrinsic to
     * the particular format might.
     *
     * @param prefix A prefix that will be used to distinguish this cache's files from other caches'. It will be used
     * for creating a directory, so it must not contain characters not legal in directory names.
     * @param urlMatcher A pattern for matching URLs of the desired entries. If null, a .* pattern will be used.
     * @param mimeMatcher A pattern for matching mime-types of the desired entries. If null, a .* pattern will be
     * used.
     */
    public RawMetadataCache(String prefix, Pattern urlMatcher, Pattern mimeMatcher) {
        super(prefix);
        this.prefix = prefix;
        // Null patterns mean "match everything"; normalize before building the batch job.
        final Pattern urlPattern = (urlMatcher != null) ? urlMatcher : MATCH_ALL_PATTERN;
        final Pattern mimePattern = (mimeMatcher != null) ? mimeMatcher : MATCH_ALL_PATTERN;
        // SLF4J stringifies the pattern arguments itself; no explicit toString() needed.
        log.info("Metadata cache for '{}' is fetching metadata with urls matching '{}' and mimetype matching '{}'",
                prefix, urlPattern, mimePattern);
        job = new GetMetadataArchiveBatchJob(urlPattern, mimePattern);
    }

    /**
     * Get the file potentially containing (cached) data for a single job.
     *
     * @param id The job to find data for.
     * @return The file where cache data for the job can be stored.
     * @see FileBasedCache#getCacheFile(Object)
     */
    @Override
    public File getCacheFile(Long id) {
        ArgumentNotValid.checkNotNull(id, "job ID");
        ArgumentNotValid.checkNotNegative(id, "job ID");
        return new File(getCacheDir(), prefix + "-" + id + "-cache");
    }

    /**
     * Actually cache data for the given ID. Runs the metadata batch job against the configured replica first; if
     * that yields no data and the look-in-other-replicas option is enabled, each other known bitarchive replica is
     * tried in turn.
     *
     * @param id A job ID to cache data for.
     * @return The given ID if data was found and written to getCacheFile(id); null if no data could be found in any
     * of the consulted replicas. (Note: the cached data may be empty, but at least one file was successfully
     * processed.)
     * @see FileBasedCache#cacheData(Object)
     */
    @Override
    protected Long cacheData(Long id) {
        final String replicaUsed = Settings.get(CommonSettings.USE_REPLICA_ID);
        final String metadataFilePatternSuffix = Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX);
        log.debug("Extract using a batchjob of type '{}' cachedata from files matching '{}{}' on replica '{}'", job
                .getClass().getName(), id, metadataFilePatternSuffix, replicaUsed);
        job.processOnlyFilesMatching(".*" + id + ".*" + metadataFilePatternSuffix);
        BatchStatus b = arcrep.batch(job, replicaUsed);

        // This check ensures that we got data from at least one file.
        // Mind you, the data may be empty, but at least one file was
        // successfully processed.
        if (b.hasResultFile() && b.getNoOfFilesProcessed() > b.getFilesFailed().size()) {
            File cacheFileName = getCacheFile(id);
            b.copyResults(cacheFileName);
            log.debug("Cached data for job '{}' for '{}'", id, prefix);
            return id;
        } else {
            // Look for data in other bitarchive replicas, if this option is enabled
            if (!Settings.getBoolean(HarvesterSettings.INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS)) {
                log.info("No data found for job '{}' for '{}' in local bitarchive '{}'. ", id, prefix, replicaUsed);
                return null;
            } else {
                log.info("No data found for job '{}' for '{}' in local bitarchive '{}'. Trying other replicas.", id,
                        prefix, replicaUsed);
                for (Replica rep : Replica.getKnown()) {
                    // Only use different bitarchive replicas than replicaUsed
                    if (rep.getType().equals(ReplicaType.BITARCHIVE) && !rep.getId().equals(replicaUsed)) {
                        log.debug("Trying to retrieve index data for job '{}' from '{}'.", id, rep.getId());
                        b = arcrep.batch(job, rep.getId());

                        // Perform same check as for the batchresults from
                        // the default replica.
                        if (b.hasResultFile() && (b.getNoOfFilesProcessed() > b.getFilesFailed().size())) {
                            File cacheFileName = getCacheFile(id);
                            b.copyResults(cacheFileName);
                            log.info("Cached data for job '{}' for '{}' from '{}' instead of '{}'", id, prefix, rep,
                                    replicaUsed);
                            return id;
                        } else {
                            log.trace("No data found for job '{}' for '{}' in bitarchive '{}'. ", id, prefix, rep);
                        }
                    }
                }
                log.info("No data found for job '{}' for '{}' in all bitarchive replicas", id, prefix);
                return null;
            }
        }
    }

}