001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.common.utils.cdx; 025 026import java.io.File; 027import java.io.IOException; 028import java.io.RandomAccessFile; 029import java.util.Collections; 030import java.util.Iterator; 031import java.util.NoSuchElementException; 032 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import dk.netarkivet.common.exceptions.ArgumentNotValid; 037import dk.netarkivet.common.exceptions.IOFailure; 038 039/** 040 * Performs a binary search through .cdx files for a given prefix string. Currently only handles a single .cdx file. 041 */ 042public class BinSearch { 043 044 /** The logger. */ 045 private static final Logger log = LoggerFactory.getLogger(BinSearch.class); 046 047 /** 048 * Our own comparison function. Right now just does prefix match. 049 * 050 * @param line A line to find the prefix of 051 * @param pattern The prefix to find. 052 * @return A result equivalent to String.compareTo, but only for a prefix. 053 */ 054 private static int compare(String line, String pattern) { 055 String start = line.substring(0, Math.min(pattern.length(), line.length())); 056 int cmp = start.compareTo(pattern); 057 return cmp; 058 } 059 060 /** 061 * Given a file in sorted order and a prefix to search for, return a an iterable that will return the lines in the 062 * files that start with the prefix, in order. They will be read lazily from the file. 063 * <p> 064 * If no matches are found, it will still return an iterable with no entries. 065 * 066 * @param file A CDX file to search in. 067 * @param prefix The line prefix to search for. 068 * @return An Iterable object that will return the lines matching the prefix in the file. 069 */ 070 public static Iterable<String> getLinesInFile(File file, String prefix) { 071 try { 072 RandomAccessFile in = null; 073 try { 074 in = new RandomAccessFile(file, "r"); 075 long matchingline = binSearch(in, prefix); 076 if (matchingline == -1) { 077 // Simple empty Iterable 078 return Collections.emptyList(); 079 } 080 long firstMatching = findFirstLine(in, prefix, matchingline); 081 return new PrefixIterable(file, firstMatching, prefix); 082 } finally { 083 if (in != null) { 084 in.close(); 085 } 086 } 087 } catch (IOException e) { 088 String message = "IOException reading file '" + file + "'"; 089 log.warn(message, e); 090 throw new IOFailure(message, e); 091 } 092 } 093 094 /** 095 * An implementation of Iterable that returns lines matching a given prefix, starting at an offset. We use compare() 096 * to determine if the prefix matches. 097 */ 098 private static class PrefixIterable implements Iterable<String> { 099 /** File we're reading from. */ 100 private final File file; 101 /** The prefix of all lines we return. */ 102 private final String prefix; 103 /** Where to start reading - seek to this without reading it. */ 104 private final long offset; 105 106 /** 107 * Construct an Iterable from the given file, offset and prefix. 108 * 109 * @param file This file will be read when the iterator() is made. The lines in this file must be sorted 110 * alphabetically. 111 * @param offset The place where reading will start from. 112 * @param prefix The prefix of all lines that will be read. 113 */ 114 public PrefixIterable(File file, long offset, String prefix) { 115 this.file = file; 116 this.offset = offset; 117 this.prefix = prefix; 118 } 119 120 /** 121 * Return a new iterator that stops (not skips) when the line read no longer matches the prefix. 122 * 123 * @return an iterator that stops (not skips) when the line read no longer matches the prefix. When the iterator 124 * ends, the underlying file is closed. 125 */ 126 public Iterator<String> iterator() { 127 final RandomAccessFile infile; 128 try { 129 infile = new RandomAccessFile(file, "r"); 130 infile.seek(offset); 131 } catch (IOException e) { 132 String message = "IOException reading file '" + file + "'"; 133 log.warn(message, e); 134 throw new IOFailure(message, e); 135 } 136 return new Iterator<String>() { 137 String nextLine = null; 138 boolean finished = false; 139 140 /** 141 * Check whether there is a next element. Implementation note: This method has the sideeffect of reading 142 * a line into its own buffer, if none is already read. 143 * 144 * @return True if there is a next element to be had. 145 * @throws IOFailure if there is an error reading the file. 146 */ 147 public boolean hasNext() { 148 if (nextLine != null) { 149 return true; 150 } 151 if (finished) { 152 return false; 153 } 154 String line; 155 try { 156 line = infile.readLine(); 157 } catch (IOException e) { 158 String message = "IOException reading file '" + file + "'"; 159 log.warn(message, e); 160 throw new IOFailure(message, e); 161 } 162 if (line == null || compare(line, prefix) != 0) { 163 finished = true; 164 cleanUp(); 165 return false; 166 } 167 nextLine = line; 168 return true; 169 } 170 171 /** 172 * Close the input file and mark us to be done reading, so we don't try to read from a closed file. 173 */ 174 private void cleanUp() { 175 try { 176 infile.close(); 177 } catch (IOException e) { 178 String message = "IOException closing file '" + file + "'"; 179 log.warn(message, e); 180 throw new IOFailure(message, e); 181 } 182 } 183 184 /** 185 * Return the next element, if any. 186 * 187 * @return Next element. 188 * @throws IOFailure if reading the underlying file causes errors. 189 */ 190 public String next() { 191 if (nextLine == null && !hasNext()) { 192 throw new NoSuchElementException(); 193 } 194 String line = nextLine; 195 nextLine = null; 196 return line; 197 } 198 199 /** 200 * This iterator doesn't support remove. 201 * 202 * @throws UnsupportedOperationException 203 */ 204 public void remove() { 205 throw new UnsupportedOperationException(); 206 } 207 208 /** 209 * Ensures that the file pointer is really closed. 210 * 211 */ 212 public void finalize() { 213 cleanUp(); 214 } 215 }; 216 } 217 } 218 219 /** 220 * Return the index of the first line in the file to match 'find'. If the lines in the file are roughly equal 221 * length, it reads O(sqrt(n)) lines, where n is the distance from matchingline to the first line. 222 * 223 * @param in The file to search in 224 * @param find The string to match against the first line 225 * @param matchingline The index to start searching from. This index must be at the start of a line that matches 226 * 'find' 227 * @return The offset into the file of the first line matching 'find'. Guaranteed to be <= matchingline. 228 * @throws IOException If the matchingLine < 0 or some I/O error occurs. 229 */ 230 private static long findFirstLine(RandomAccessFile in, String find, long matchingline) throws IOException { 231 in.seek(matchingline); 232 String line = in.readLine(); 233 if (line == null || compare(line, find) != 0) { 234 final String msg = "Internal: Called findFirstLine without a matching line in '" + in + "' byte " 235 + matchingline; 236 log.warn(msg); 237 throw new ArgumentNotValid(msg); 238 } 239 // Skip backwards in quadratically increasing steps. 240 int linelength = line.length(); 241 long offset = linelength; 242 for (int i = 1; matchingline - offset > 0; i++, offset = i * i * linelength) { 243 skipToLine(in, matchingline - offset); 244 line = in.readLine(); 245 if (line == null || compare(line, find) != 0) { 246 break; 247 } 248 } 249 // Either found start-of-file or a non-matching line 250 long pos; 251 if (matchingline - offset <= 0) { 252 pos = 0; 253 in.seek(0); 254 } else { 255 pos = in.getFilePointer(); 256 } 257 // Seek forwards line by line until the first matching line. 258 // This takes no more than sqrt(n) steps since we know there is 259 // a matching line that far away by the way we skipped to here. 260 while ((line = in.readLine()) != null) { 261 if (compare(line, find) == 0) { 262 return pos; 263 } 264 pos = in.getFilePointer(); 265 } 266 267 return -1; 268 } 269 270 /** 271 * Skip to the next line after the given position by reading a line. Note that if the position is at the start of a 272 * line, it will go to the next line. 273 * 274 * @param in A file to read from 275 * @param pos The position to start at. 276 * @return A new position in the file. The file's pointer (as given by getFilePointer()) is updated to match. 277 * @throws IOException If some I/O error occurs 278 */ 279 private static long skipToLine(RandomAccessFile in, long pos) throws IOException { 280 in.seek(pos); 281 in.readLine(); 282 return in.getFilePointer(); 283 } 284 285 /** 286 * Perform a binary search for a string in a file. Returns the position of a line that begins with 'find'. Note that 287 * this may not be the first line, if there be duplicates. 288 * 289 * @param in the RandomAccessFile 290 * @param find The String to look for in the above file 291 * @return The index of a line matching find, or -1 if none found. 292 * @throws IOException If some I/O error occurs 293 */ 294 private static long binSearch(RandomAccessFile in, String find) throws IOException { 295 // The starting position for the binary search. Always 296 // at the start of a line that's < the wanted line. 297 long startpos = 0; 298 // Ensure that startpos isn't a match. 299 in.seek(startpos); 300 String line = in.readLine(); 301 if (line == null) { 302 return -1; 303 } 304 if (compare(line, find) == 0) { 305 return startpos; 306 } 307 // The ending position for the binary search. Always 308 // *after* a line that >= the wanted line (which also means 309 // at the start of a line that's > the wanted line, or at EOF 310 long endpos = in.length(); 311 312 // Set file pos to first line after middle. 313 findMiddleLine(in, startpos, endpos); 314 315 // When done searching, midpos points to a matching line, if any 316 // Until the search is done, both endpos and startpos point 317 // at non-matching lines (or EOF), and startpos < prevpos < endpos 318 long prevpos = in.getFilePointer(); 319 do { 320 line = in.readLine(); 321 if (line == null) { 322 log.debug("Internal: Ran past end of file in '{}' at {}", in, endpos); 323 return -1; 324 } 325 int cmp = compare(line, find); 326 if (cmp > 0) { 327 endpos = prevpos; 328 } else if (cmp < 0) { 329 startpos = prevpos; 330 } else { 331 return prevpos; 332 } 333 if (startpos == endpos) { 334 return -1; 335 } 336 prevpos = findMiddleLine(in, startpos, endpos); 337 if (prevpos == -1) { 338 return -1; 339 } 340 } while (true); 341 } 342 343 /** 344 * Returns the position of a line between startpos and endpos. If no line other than the one starting at startpos 345 * can be found, returns -1. Also sets the file pointer to the start of the line. 346 * 347 * @param in The file to read from 348 * @param startpos The lower bound for the position. Must be the start of a line. 349 * @param endpos The upper bound for the position. Must be the start of a line or EOF. 350 * @return The position of a line s.t. startpos < returnval < endpos, or -1 if no such line can be found. 351 * @throws IOException If some I/O error occurs 352 */ 353 private static long findMiddleLine(RandomAccessFile in, long startpos, long endpos) throws IOException { 354 // First check that there is a middle line at all. 355 // If there is a line after startpos, but before endpos, 356 // we remember it and as soon as we hit that line. 357 long firstmiddleline = skipToLine(in, startpos); 358 if (firstmiddleline == endpos) { 359 return -1; 360 } 361 long newmidpos = endpos; 362 int div = 1; 363 while (newmidpos == endpos) { 364 // Drat, newmidpos is not far enough back. 365 // Divide back until we find a previous line. 366 div *= 2; 367 // Find an earlier point, half as far from startpos as the previous 368 newmidpos = startpos + (endpos - startpos) / div; 369 // If we get beyond the found line after the start line, just 370 // return that. 371 if (newmidpos < firstmiddleline) { 372 newmidpos = firstmiddleline; 373 in.seek(newmidpos); 374 break; 375 } 376 // Now find the first line after the new middle 377 newmidpos = skipToLine(in, newmidpos); 378 } 379 // Now the midpos should be != startpos && != endpos 380 assert newmidpos != startpos : "Invariant violated: Newmidpos > startpos"; 381 return newmidpos; 382 } 383 384}