001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.utils.cdx;
025
026import java.io.File;
027import java.io.IOException;
028import java.io.RandomAccessFile;
029import java.util.Collections;
030import java.util.Iterator;
031import java.util.NoSuchElementException;
032
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import dk.netarkivet.common.exceptions.ArgumentNotValid;
037import dk.netarkivet.common.exceptions.IOFailure;
038
039/**
040 * Performs a binary search through .cdx files for a given prefix string. Currently only handles a single .cdx file.
041 */
042public class BinSearch {
043
044    /** The logger. */
045    private static final Logger log = LoggerFactory.getLogger(BinSearch.class);
046
047    /**
048     * Our own comparison function. Right now just does prefix match.
049     *
050     * @param line A line to find the prefix of
051     * @param pattern The prefix to find.
052     * @return A result equivalent to String.compareTo, but only for a prefix.
053     */
054    private static int compare(String line, String pattern) {
055        String start = line.substring(0, Math.min(pattern.length(), line.length()));
056        int cmp = start.compareTo(pattern);
057        return cmp;
058    }
059
060    /**
061     * Given a file in sorted order and a prefix to search for, return a an iterable that will return the lines in the
062     * files that start with the prefix, in order. They will be read lazily from the file.
063     * <p>
064     * If no matches are found, it will still return an iterable with no entries.
065     *
066     * @param file A CDX file to search in.
067     * @param prefix The line prefix to search for.
068     * @return An Iterable object that will return the lines matching the prefix in the file.
069     */
070    public static Iterable<String> getLinesInFile(File file, String prefix) {
071        try {
072            RandomAccessFile in = null;
073            try {
074                in = new RandomAccessFile(file, "r");
075                long matchingline = binSearch(in, prefix);
076                if (matchingline == -1) {
077                    // Simple empty Iterable
078                    return Collections.emptyList();
079                }
080                long firstMatching = findFirstLine(in, prefix, matchingline);
081                return new PrefixIterable(file, firstMatching, prefix);
082            } finally {
083                if (in != null) {
084                    in.close();
085                }
086            }
087        } catch (IOException e) {
088            String message = "IOException reading file '" + file + "'";
089            log.warn(message, e);
090            throw new IOFailure(message, e);
091        }
092    }
093
094    /**
095     * An implementation of Iterable that returns lines matching a given prefix, starting at an offset. We use compare()
096     * to determine if the prefix matches.
097     */
098    private static class PrefixIterable implements Iterable<String> {
099        /** File we're reading from. */
100        private final File file;
101        /** The prefix of all lines we return. */
102        private final String prefix;
103        /** Where to start reading - seek to this without reading it. */
104        private final long offset;
105
106        /**
107         * Construct an Iterable from the given file, offset and prefix.
108         *
109         * @param file This file will be read when the iterator() is made. The lines in this file must be sorted
110         * alphabetically.
111         * @param offset The place where reading will start from.
112         * @param prefix The prefix of all lines that will be read.
113         */
114        public PrefixIterable(File file, long offset, String prefix) {
115            this.file = file;
116            this.offset = offset;
117            this.prefix = prefix;
118        }
119
120        /**
121         * Return a new iterator that stops (not skips) when the line read no longer matches the prefix.
122         *
123         * @return an iterator that stops (not skips) when the line read no longer matches the prefix. When the iterator
124         * ends, the underlying file is closed.
125         */
126        public Iterator<String> iterator() {
127            final RandomAccessFile infile;
128            try {
129                infile = new RandomAccessFile(file, "r");
130                infile.seek(offset);
131            } catch (IOException e) {
132                String message = "IOException reading file '" + file + "'";
133                log.warn(message, e);
134                throw new IOFailure(message, e);
135            }
136            return new Iterator<String>() {
137                String nextLine = null;
138                boolean finished = false;
139
140                /**
141                 * Check whether there is a next element. Implementation note: This method has the sideeffect of reading
142                 * a line into its own buffer, if none is already read.
143                 *
144                 * @return True if there is a next element to be had.
145                 * @throws IOFailure if there is an error reading the file.
146                 */
147                public boolean hasNext() {
148                    if (nextLine != null) {
149                        return true;
150                    }
151                    if (finished) {
152                        return false;
153                    }
154                    String line;
155                    try {
156                        line = infile.readLine();
157                    } catch (IOException e) {
158                        String message = "IOException reading file '" + file + "'";
159                        log.warn(message, e);
160                        throw new IOFailure(message, e);
161                    }
162                    if (line == null || compare(line, prefix) != 0) {
163                        finished = true;
164                        cleanUp();
165                        return false;
166                    }
167                    nextLine = line;
168                    return true;
169                }
170
171                /**
172                 * Close the input file and mark us to be done reading, so we don't try to read from a closed file.
173                 */
174                private void cleanUp() {
175                    try {
176                        infile.close();
177                    } catch (IOException e) {
178                        String message = "IOException closing file '" + file + "'";
179                        log.warn(message, e);
180                        throw new IOFailure(message, e);
181                    }
182                }
183
184                /**
185                 * Return the next element, if any.
186                 *
187                 * @return Next element.
188                 * @throws IOFailure if reading the underlying file causes errors.
189                 */
190                public String next() {
191                    if (nextLine == null && !hasNext()) {
192                        throw new NoSuchElementException();
193                    }
194                    String line = nextLine;
195                    nextLine = null;
196                    return line;
197                }
198
199                /**
200                 * This iterator doesn't support remove.
201                 * 
202                 * @throws UnsupportedOperationException
203                 */
204                public void remove() {
205                    throw new UnsupportedOperationException();
206                }
207
208                /**
209                 * Ensures that the file pointer is really closed.
210                 *
211                 */
212                public void finalize() {
213                    cleanUp();
214                }
215            };
216        }
217    }
218
219    /**
220     * Return the index of the first line in the file to match 'find'. If the lines in the file are roughly equal
221     * length, it reads O(sqrt(n)) lines, where n is the distance from matchingline to the first line.
222     *
223     * @param in The file to search in
224     * @param find The string to match against the first line
225     * @param matchingline The index to start searching from. This index must be at the start of a line that matches
226     * 'find'
227     * @return The offset into the file of the first line matching 'find'. Guaranteed to be <= matchingline.
228     * @throws IOException If the matchingLine < 0 or some I/O error occurs.
229     */
230    private static long findFirstLine(RandomAccessFile in, String find, long matchingline) throws IOException {
231        in.seek(matchingline);
232        String line = in.readLine();
233        if (line == null || compare(line, find) != 0) {
234            final String msg = "Internal: Called findFirstLine without a matching line in '" + in + "' byte "
235                    + matchingline;
236            log.warn(msg);
237            throw new ArgumentNotValid(msg);
238        }
239        // Skip backwards in quadratically increasing steps.
240        int linelength = line.length();
241        long offset = linelength;
242        for (int i = 1; matchingline - offset > 0; i++, offset = i * i * linelength) {
243            skipToLine(in, matchingline - offset);
244            line = in.readLine();
245            if (line == null || compare(line, find) != 0) {
246                break;
247            }
248        }
249        // Either found start-of-file or a non-matching line
250        long pos;
251        if (matchingline - offset <= 0) {
252            pos = 0;
253            in.seek(0);
254        } else {
255            pos = in.getFilePointer();
256        }
257        // Seek forwards line by line until the first matching line.
258        // This takes no more than sqrt(n) steps since we know there is
259        // a matching line that far away by the way we skipped to here.
260        while ((line = in.readLine()) != null) {
261            if (compare(line, find) == 0) {
262                return pos;
263            }
264            pos = in.getFilePointer();
265        }
266
267        return -1;
268    }
269
270    /**
271     * Skip to the next line after the given position by reading a line. Note that if the position is at the start of a
272     * line, it will go to the next line.
273     *
274     * @param in A file to read from
275     * @param pos The position to start at.
276     * @return A new position in the file. The file's pointer (as given by getFilePointer()) is updated to match.
277     * @throws IOException If some I/O error occurs
278     */
279    private static long skipToLine(RandomAccessFile in, long pos) throws IOException {
280        in.seek(pos);
281        in.readLine();
282        return in.getFilePointer();
283    }
284
285    /**
286     * Perform a binary search for a string in a file. Returns the position of a line that begins with 'find'. Note that
287     * this may not be the first line, if there be duplicates.
288     *
289     * @param in the RandomAccessFile
290     * @param find The String to look for in the above file
291     * @return The index of a line matching find, or -1 if none found.
292     * @throws IOException If some I/O error occurs
293     */
294    private static long binSearch(RandomAccessFile in, String find) throws IOException {
295        // The starting position for the binary search. Always
296        // at the start of a line that's < the wanted line.
297        long startpos = 0;
298        // Ensure that startpos isn't a match.
299        in.seek(startpos);
300        String line = in.readLine();
301        if (line == null) {
302            return -1;
303        }
304        if (compare(line, find) == 0) {
305            return startpos;
306        }
307        // The ending position for the binary search. Always
308        // *after* a line that >= the wanted line (which also means
309        // at the start of a line that's > the wanted line, or at EOF
310        long endpos = in.length();
311
312        // Set file pos to first line after middle.
313        findMiddleLine(in, startpos, endpos);
314
315        // When done searching, midpos points to a matching line, if any
316        // Until the search is done, both endpos and startpos point
317        // at non-matching lines (or EOF), and startpos < prevpos < endpos
318        long prevpos = in.getFilePointer();
319        do {
320            line = in.readLine();
321            if (line == null) {
322                log.debug("Internal: Ran past end of file in '{}' at {}", in, endpos);
323                return -1;
324            }
325            int cmp = compare(line, find);
326            if (cmp > 0) {
327                endpos = prevpos;
328            } else if (cmp < 0) {
329                startpos = prevpos;
330            } else {
331                return prevpos;
332            }
333            if (startpos == endpos) {
334                return -1;
335            }
336            prevpos = findMiddleLine(in, startpos, endpos);
337            if (prevpos == -1) {
338                return -1;
339            }
340        } while (true);
341    }
342
343    /**
344     * Returns the position of a line between startpos and endpos. If no line other than the one starting at startpos
345     * can be found, returns -1. Also sets the file pointer to the start of the line.
346     *
347     * @param in The file to read from
348     * @param startpos The lower bound for the position. Must be the start of a line.
349     * @param endpos The upper bound for the position. Must be the start of a line or EOF.
350     * @return The position of a line s.t. startpos < returnval < endpos, or -1 if no such line can be found.
351     * @throws IOException If some I/O error occurs
352     */
353    private static long findMiddleLine(RandomAccessFile in, long startpos, long endpos) throws IOException {
354        // First check that there is a middle line at all.
355        // If there is a line after startpos, but before endpos,
356        // we remember it and as soon as we hit that line.
357        long firstmiddleline = skipToLine(in, startpos);
358        if (firstmiddleline == endpos) {
359            return -1;
360        }
361        long newmidpos = endpos;
362        int div = 1;
363        while (newmidpos == endpos) {
364            // Drat, newmidpos is not far enough back.
365            // Divide back until we find a previous line.
366            div *= 2;
367            // Find an earlier point, half as far from startpos as the previous
368            newmidpos = startpos + (endpos - startpos) / div;
369            // If we get beyond the found line after the start line, just
370            // return that.
371            if (newmidpos < firstmiddleline) {
372                newmidpos = firstmiddleline;
373                in.seek(newmidpos);
374                break;
375            }
376            // Now find the first line after the new middle
377            newmidpos = skipToLine(in, newmidpos);
378        }
379        // Now the midpos should be != startpos && != endpos
380        assert newmidpos != startpos : "Invariant violated: Newmidpos > startpos";
381        return newmidpos;
382    }
383
384}