001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.cdx;
024
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.arc.ARCKey;
039
040/**
041 * This class handles reading CDX files and finding entries in them. Furthermore it implements the possibility to do
042 * filtering of searchresults
043 */
044public class CDXReader {
045
046    /** The instance logger. */
047    private static final Logger log = LoggerFactory.getLogger(CDXReader.class);
048
049    /** The CDX files that we want to iterate over. */
050    private List<File> files = new ArrayList<File>();
051
052    /** Any filters we want to apply. */
053    private Map<String, CDXRecordFilter> cdxrecordfilters = new HashMap<String, CDXRecordFilter>();
054
055    /** The regular expression that defines separation between fields. */
056    static final String SEPARATOR_REGEX = "\\s+";
057
058    /**
059     * Create a new CDXReader that reads the given file.
060     *
061     * @param cdxFile A CDX file to read.
062     * @throws IOFailure If the file cannot be found.
063     */
064    public CDXReader(File cdxFile) {
065        addCDXFile(cdxFile);
066    }
067
068    /** Create a new CDXReader with no file. */
069    public CDXReader() {
070    }
071
072    /**
073     * Add another CDX file to those being searched.
074     *
075     * @param cdxFile A CDX file to search.
076     * @throws IOFailure If the file cannot be found or read
077     */
078    public void addCDXFile(File cdxFile) {
079        ArgumentNotValid.checkNotNull(cdxFile, "cdxFile");
080        if (!cdxFile.exists() || !cdxFile.canRead()) {
081            final String message = "Can't find CDX file '" + cdxFile.getAbsolutePath() + "'";
082            log.debug(message);
083            throw new IOFailure(message);
084        }
085        files.add(cdxFile);
086    }
087
088    /**
089     * Forget about all CDX files.
090     */
091    public void clearCDXFiles() {
092        files.clear();
093    }
094
095    /**
096     * Add another CDXRecordFilter to the list of filters to use when searching.
097     *
098     * @param cdxrecfilter A CDXRecordFilter to use when searching.
099     * @throws ArgumentNotValid If the filter is invalid or another filter exists with the same name.
100     */
101    public void addCDXRecordFilter(CDXRecordFilter cdxrecfilter) throws ArgumentNotValid {
102        ArgumentNotValid.checkNotNull(cdxrecfilter, "cdxrecfilter");
103        ArgumentNotValid.checkNotNullOrEmpty(cdxrecfilter.getFilterName(), "cdxrecfilter.getFilterName()");
104
105        if (cdxrecordfilters.containsKey(cdxrecfilter.getFilterName())) {
106            throw new ArgumentNotValid("The Filtername '" + cdxrecfilter.getFilterName() + "' is already in use !");
107        }
108        cdxrecordfilters.put(cdxrecfilter.getFilterName(), cdxrecfilter);
109    }
110
111    /**
112     * Remove all CDXRecordFilters.
113     */
114    public void removeAllCDXRecordFilters() {
115        cdxrecordfilters = new HashMap<String, CDXRecordFilter>();
116    }
117
118    /**
119     * Get a table of all filters.
120     *
121     * @return a Hashtable with all the filters.
122     */
123    public Map<String, CDXRecordFilter> getFilters() {
124        return Collections.unmodifiableMap(cdxrecordfilters);
125    }
126
127    /**
128     * Get a specific filter by the name of the filter - if not found return null.
129     *
130     * @param filtername The given filtername.
131     * @return the CDXRecordFilter
132     */
133    public CDXRecordFilter getCDXRecordFilter(String filtername) {
134        return cdxrecordfilters.get(filtername);
135    }
136
137    /**
138     * Remove a specific filter by the name of the filter.
139     *
140     * @param filtername The given filtername.
141     * @throws UnknownID if there is no filter of that name.
142     */
143    public void removeCDXRecordFilter(String filtername) {
144        if (!cdxrecordfilters.containsKey(filtername)) {
145            throw new UnknownID("No filter found named " + filtername);
146        }
147        cdxrecordfilters.remove(filtername);
148    }
149
150    /**
151     * Look up an entry in CDX files. Notice that only full match search is allowed, not prefix search.
152     *
153     * @param uri A URI to find in the CDX files.
154     * @return A key indicating the place where the entry can be found, or null if no such entry was found;
155     */
156    public ARCKey getKey(String uri) {
157        for (File f : files) {
158            String firstBrokenLine = null;
159            long numBrokenLines = 0;
160            try {
161                CDXLINES: for (String s : BinSearch.getLinesInFile(f, uri)) {
162                    String[] fieldParts = s.split(SEPARATOR_REGEX);
163                    CDXRecord cdxrec;
164                    try {
165                        cdxrec = new CDXRecord(fieldParts);
166                    } catch (RuntimeException e) {
167                        // Skip lines with wrong format
168                        numBrokenLines++;
169                        if (firstBrokenLine == null) {
170                            firstBrokenLine = s;
171                        }
172                        continue CDXLINES;
173                    }
174                    String cdxuri = cdxrec.getURL();
175                    if (CDXRecord.URLsEqual(uri, cdxuri)) {
176                        for (CDXRecordFilter cdxrecf : cdxrecordfilters.values()) {
177                            if (!cdxrecf.process(cdxrec)) {
178                                continue CDXLINES;
179                            }
180                        }
181                        return new ARCKey(cdxrec.getArcfile(), cdxrec.getOffset());
182                    }
183                }
184            } finally {
185                if (numBrokenLines > 0) {
186                    log.warn("CDX file '{}' contains {} invalid CDX lines, first one is\n{}", f, numBrokenLines,
187                            firstBrokenLine);
188                }
189            }
190        }
191        return null;
192    }
193
194}