001/* CrawlDataIterator
002 * 
003 * Created on 10.04.2006
004 *
005 * Copyright (C) 2006 National and University Library of Iceland
006 * 
007 * This file is part of the DeDuplicator (Heritrix add-on module).
008 * 
009 * DeDuplicator is free software; you can redistribute it and/or modify
010 * it under the terms of the GNU Lesser Public License as published by
011 * the Free Software Foundation; either version 2.1 of the License, or
012 * any later version.
013 * 
014 * DeDuplicator is distributed in the hope that it will be useful, 
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017 * GNU Lesser Public License for more details.
018 * 
019 * You should have received a copy of the GNU Lesser Public License
020 * along with DeDuplicator; if not, write to the Free Software
021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022 */
023package is.hi.bok.deduplicator;
024
025import java.io.IOException;
026
027/**
028 * An abstract base class for implementations of iterators that iterate over different sets of crawl data (i.e.
029 * crawl.log, ARC, WARC etc.)
030 *
031 * @author Kristinn Sigurðsson
032 */
033public abstract class CrawlDataIterator {
034
035    String source;
036
037    /**
038     * Constructor.
039     *
040     * @param source The location of the crawl data. The meaning of this value may vary based on the implementation of
041     * concrete subclasses. Typically it will refer to a directory or a file.
042     */
043    public CrawlDataIterator(String source) {
044        this.source = source;
045    }
046
047    /**
048     * Are there more elements?
049     *
050     * @return true if there are more elements, false otherwise
051     * @throws IOException If an error occurs accessing the crawl data.
052     */
053    public abstract boolean hasNext() throws IOException;
054
055    /**
056     * Get the next {@link CrawlDataItem}.
057     *
058     * @return the next CrawlDataItem. If there are no further elements then null will be returned.
059     * @throws IOException If an error occurs accessing the crawl data.
060     */
061    public abstract CrawlDataItem next() throws IOException;
062
063    /**
064     * Close any resources held open to read the crawl data.
065     *
066     * @throws IOException If an error occurs closing access to crawl data.
067     */
068    public abstract void close() throws IOException;
069
070    /**
071     * A short, human readable, string about what source this iterator uses. I.e.
072     * "Iterator for Heritrix style crawl.log" etc.
073     *
074     * @return A short, human readable, string about what source this iterator uses.
075     */
076    public abstract String getSourceType();
077}