/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.cdx;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.arc.ARCKey;

/**
 * Reads CDX files and looks up entries in them. Lookups can additionally be restricted by a set of named
 * {@link CDXRecordFilter}s; a record is only returned if every registered filter accepts it.
 */
public class CDXReader {

    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(CDXReader.class);

    /** The CDX files that are searched, in the order they were added. */
    private final List<File> files = new ArrayList<File>();

    /**
     * The filters applied to candidate records, keyed by filter name. Not final: removing all filters installs a
     * fresh map instead of clearing this one.
     */
    private Map<String, CDXRecordFilter> cdxrecordfilters = new HashMap<String, CDXRecordFilter>();

    /** The regular expression that defines separation between fields. */
    static final String SEPARATOR_REGEX = "\\s+";

    /** Compiled form of {@link #SEPARATOR_REGEX}, so the expression is not recompiled for every CDX line. */
    private static final Pattern SEPARATOR_PATTERN = Pattern.compile(SEPARATOR_REGEX);

    /**
     * Create a new CDXReader that reads the given file.
     *
     * @param cdxFile A CDX file to read.
     * @throws IOFailure If the file cannot be found or read.
     */
    public CDXReader(File cdxFile) {
        addCDXFile(cdxFile);
    }

    /** Create a new CDXReader with no file. */
    public CDXReader() {
    }

    /**
     * Add another CDX file to those being searched.
     *
     * @param cdxFile A CDX file to search. Is passed through {@code ArgumentNotValid.checkNotNull}.
     * @throws IOFailure If the file does not exist or cannot be read.
     */
    public void addCDXFile(File cdxFile) {
        ArgumentNotValid.checkNotNull(cdxFile, "cdxFile");
        if (!cdxFile.exists() || !cdxFile.canRead()) {
            final String message = "Can't find CDX file '" + cdxFile.getAbsolutePath() + "'";
            log.debug(message);
            throw new IOFailure(message);
        }
        files.add(cdxFile);
    }

    /**
     * Forget about all CDX files.
     */
    public void clearCDXFiles() {
        files.clear();
    }

    /**
     * Add another CDXRecordFilter to the filters used when searching. Filters are registered under their filter
     * name, which must be unique within this reader.
     *
     * @param cdxrecfilter A CDXRecordFilter to use when searching.
     * @throws ArgumentNotValid If the filter is invalid or another filter exists with the same name.
     */
    public void addCDXRecordFilter(CDXRecordFilter cdxrecfilter) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(cdxrecfilter, "cdxrecfilter");
        final String filterName = cdxrecfilter.getFilterName();
        ArgumentNotValid.checkNotNullOrEmpty(filterName, "cdxrecfilter.getFilterName()");

        if (cdxrecordfilters.containsKey(filterName)) {
            throw new ArgumentNotValid("The Filtername '" + filterName + "' is already in use !");
        }
        cdxrecordfilters.put(filterName, cdxrecfilter);
    }

    /**
     * Remove all CDXRecordFilters.
     * <p>
     * The internal filter map is replaced by a new empty map rather than cleared, so a map obtained from
     * {@link #getFilters()} before this call keeps showing the filters that were registered at that time.
     */
    public void removeAllCDXRecordFilters() {
        cdxrecordfilters = new HashMap<String, CDXRecordFilter>();
    }

    /**
     * Get a table of all filters.
     *
     * @return an unmodifiable view of the current filter map, keyed by filter name.
     */
    public Map<String, CDXRecordFilter> getFilters() {
        return Collections.unmodifiableMap(cdxrecordfilters);
    }

    /**
     * Get a specific filter by the name of the filter - if not found return null.
     *
     * @param filtername The given filtername.
     * @return the CDXRecordFilter registered under that name, or null if there is none.
     */
    public CDXRecordFilter getCDXRecordFilter(String filtername) {
        return cdxrecordfilters.get(filtername);
    }

    /**
     * Remove a specific filter by the name of the filter.
     *
     * @param filtername The given filtername.
     * @throws UnknownID if there is no filter of that name.
     */
    public void removeCDXRecordFilter(String filtername) {
        if (!cdxrecordfilters.containsKey(filtername)) {
            throw new UnknownID("No filter found named " + filtername);
        }
        cdxrecordfilters.remove(filtername);
    }

    /**
     * Look up an entry in CDX files. Notice that only full match search is allowed, not prefix search.
     * <p>
     * The files are searched in the order they were added. For each file, the candidate lines delivered by
     * {@code BinSearch.getLinesInFile} are split on whitespace and parsed as CDX records; the first record whose
     * URL equals the given URI (according to {@code CDXRecord.URLsEqual}) and that is accepted by every registered
     * filter determines the result. Lines that cannot be parsed are skipped, and a single warning per file reports
     * how many such lines were seen together with the first of them.
     *
     * @param uri A URI to find in the CDX files.
     * @return A key indicating the place where the entry can be found, or null if no such entry was found;
     */
    public ARCKey getKey(String uri) {
        for (File f : files) {
            // Broken-line bookkeeping is per file, so each file gets its own warning.
            String firstBrokenLine = null;
            long numBrokenLines = 0;
            try {
                for (String line : BinSearch.getLinesInFile(f, uri)) {
                    CDXRecord cdxrec;
                    try {
                        cdxrec = new CDXRecord(SEPARATOR_PATTERN.split(line));
                    } catch (RuntimeException e) {
                        // Skip lines with wrong format, but remember them for the warning below.
                        numBrokenLines++;
                        if (firstBrokenLine == null) {
                            firstBrokenLine = line;
                        }
                        continue;
                    }
                    if (CDXRecord.URLsEqual(uri, cdxrec.getURL()) && passesAllFilters(cdxrec)) {
                        return new ARCKey(cdxrec.getArcfile(), cdxrec.getOffset());
                    }
                }
            } finally {
                // In a finally block so the warning is also emitted when a match returns early
                // or an exception leaves the loop.
                if (numBrokenLines > 0) {
                    log.warn("CDX file '{}' contains {} invalid CDX lines, first one is\n{}", f, numBrokenLines,
                            firstBrokenLine);
                }
            }
        }
        return null;
    }

    /**
     * Check a record against every registered filter, stopping at the first filter that rejects it.
     *
     * @param cdxrec The record to check.
     * @return true if no registered filter rejects the record (in particular when there are no filters).
     */
    private boolean passesAllFilters(CDXRecord cdxrec) {
        for (CDXRecordFilter filter : cdxrecordfilters.values()) {
            if (!filter.process(cdxrec)) {
                return false;
            }
        }
        return true;
    }

}