001/* CrawlDataIterator 002 * 003 * Created on 10.04.2006 004 * 005 * Copyright (C) 2006 National and University Library of Iceland 006 * 007 * This file is part of the DeDuplicator (Heritrix add-on module). 008 * 009 * DeDuplicator is free software; you can redistribute it and/or modify 010 * it under the terms of the GNU Lesser Public License as published by 011 * the Free Software Foundation; either version 2.1 of the License, or 012 * any later version. 013 * 014 * DeDuplicator is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 017 * GNU Lesser Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser Public License 020 * along with DeDuplicator; if not, write to the Free Software 021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 022 */ 023package is.hi.bok.deduplicator; 024 025import java.io.IOException; 026 027/** 028 * An abstract base class for implementations of iterators that iterate over different sets of crawl data (i.e. 029 * crawl.log, ARC, WARC etc.) 030 * 031 * @author Kristinn Sigurðsson 032 */ 033public abstract class CrawlDataIterator { 034 035 String source; 036 037 /** 038 * Constructor. 039 * 040 * @param source The location of the crawl data. The meaning of this value may vary based on the implementation of 041 * concrete subclasses. Typically it will refer to a directory or a file. 042 */ 043 public CrawlDataIterator(String source) { 044 this.source = source; 045 } 046 047 /** 048 * Are there more elements? 049 * 050 * @return true if there are more elements, false otherwise 051 * @throws IOException If an error occurs accessing the crawl data. 052 */ 053 public abstract boolean hasNext() throws IOException; 054 055 /** 056 * Get the next {@link CrawlDataItem}. 057 * 058 * @return the next CrawlDataItem. If there are no further elements then null will be returned. 059 * @throws IOException If an error occurs accessing the crawl data. 060 */ 061 public abstract CrawlDataItem next() throws IOException; 062 063 /** 064 * Close any resources held open to read the crawl data. 065 * 066 * @throws IOException If an error occurs closing access to crawl data. 067 */ 068 public abstract void close() throws IOException; 069 070 /** 071 * A short, human readable, string about what source this iterator uses. I.e. 072 * "Iterator for Heritrix style crawl.log" etc. 073 * 074 * @return A short, human readable, string about what source this iterator uses. 075 */ 076 public abstract String getSourceType(); 077}