001/* CrawlDataItem
002 * 
003 * Created on 10.04.2006
004 *
005 * Copyright (C) 2006 National and University Library of Iceland
006 * 
007 * This file is part of the DeDuplicator (Heritrix add-on module).
008 * 
009 * DeDuplicator is free software; you can redistribute it and/or modify
010 * it under the terms of the GNU Lesser Public License as published by
011 * the Free Software Foundation; either version 2.1 of the License, or
012 * any later version.
013 * 
014 * DeDuplicator is distributed in the hope that it will be useful, 
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017 * GNU Lesser Public License for more details.
018 * 
019 * You should have received a copy of the GNU Lesser Public License
020 * along with DeDuplicator; if not, write to the Free Software
021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022 */
023package is.hi.bok.deduplicator;
024
025/**
026 * A base class for individual items of crawl data that should be added to the index.
027 *
028 * @author Kristinn Sigurðsson
029 */
public class CrawlDataItem {

    /**
     * Pattern describing the layout of the timestamp strings handled by
     * {@link #setTimestamp(String)} and {@link #getTimestamp()}.
     */
    public static final String dateFormat = "yyyyMMddHHmmssSSS";

    /** URL this item describes. */
    protected String URL;
    /** Content digest of the document found at the URL. */
    protected String contentDigest;
    /** Timestamp string; expected to follow {@link #dateFormat}, but not validated here. */
    protected String timestamp;
    /** Etag of the document, or null when none is available. */
    protected String etag;
    /** MIME type of the document. */
    protected String mimetype;
    /** Free-form origin value; its meaning is defined by the users of this class. */
    protected String origin;
    /** Whether this item has been flagged as a duplicate. */
    protected boolean duplicate;

    /**
     * Creates an empty item: every String property is null and the duplicate flag is false.
     */
    public CrawlDataItem() {
        // Delegating keeps the field initialisation in a single place.
        this(null, null, null, null, null, null, false);
    }

    /**
     * Creates an item with every property supplied by the caller. The values are stored as given; no validation or
     * normalisation is performed.
     *
     * @param URL The URL for this CrawlDataItem
     * @param contentDigest A content digest of the document found at the URL
     * @param timestamp Date of when the content digest was valid for that URL. Format: yyyyMMddHHmmssSSS
     * @param etag Etag for the URL
     * @param mimetype MIME type of the document found at the URL
     * @param origin The origin of the CrawlDataItem (what an origin means is left to the caller; any String value
     * is accepted)
     * @param duplicate True if this CrawlDataItem was marked as duplicate
     */
    public CrawlDataItem(String URL, String contentDigest, String timestamp, String etag, String mimetype,
            String origin, boolean duplicate) {
        this.URL = URL;
        this.contentDigest = contentDigest;
        this.timestamp = timestamp;
        this.etag = etag;
        this.mimetype = mimetype;
        this.origin = origin;
        this.duplicate = duplicate;
    }

    /**
     * Gets the URL of this item.
     *
     * @return the stored URL (null if it was never set)
     */
    public String getURL() {
        return this.URL;
    }

    /**
     * Replaces the URL of this item.
     *
     * @param URL the new URL
     */
    public void setURL(String URL) {
        this.URL = URL;
    }

    /**
     * Gets the content digest of the document.
     *
     * @return the stored content digest (null if it was never set)
     */
    public String getContentDigest() {
        return this.contentDigest;
    }

    /**
     * Replaces the content digest.
     *
     * @param contentDigest The new value of the content digest
     */
    public void setContentDigest(String contentDigest) {
        this.contentDigest = contentDigest;
    }

    /**
     * Gets the timestamp of when the URL was fetched. The expected layout is yyyyMMddHHmmssSSS, but the value is
     * returned exactly as it was stored.
     *
     * @return the stored timestamp string (null if it was never set)
     */
    public String getTimestamp() {
        return this.timestamp;
    }

    /**
     * Replaces the timestamp. The value is stored without being checked against the expected format.
     *
     * @param timestamp The new timestamp. It should be in the format: yyyyMMddHHmmssSSS
     */
    public void setTimestamp(String timestamp) {
        this.timestamp = timestamp;
    }

    /**
     * Gets the etag associated with the document.
     * <p>
     * Null is returned when no etag is available.
     *
     * @return the stored etag, possibly null
     */
    public String getEtag() {
        return this.etag;
    }

    /**
     * Replaces the etag.
     *
     * @param etag The new etag
     */
    public void setEtag(String etag) {
        this.etag = etag;
    }

    /**
     * Gets the MIME type associated with the document.
     *
     * @return the stored MIME type (null if it was never set)
     */
    public String getMimeType() {
        return this.mimetype;
    }

    /**
     * Replaces the MIME type.
     *
     * @param mimetype The new MIME type
     */
    public void setMimeType(String mimetype) {
        this.mimetype = mimetype;
    }

    /**
     * Gets the "origin" associated with the document.
     *
     * @return the stored origin (null if none was provided for the document)
     */
    public String getOrigin() {
        return this.origin;
    }

    /**
     * Replaces the origin.
     *
     * @param origin A new origin.
     */
    public void setOrigin(String origin) {
        this.origin = origin;
    }

    /**
     * Tells whether this item has been marked as a duplicate.
     *
     * @return true if duplicate, false otherwise
     */
    public boolean isDuplicate() {
        return this.duplicate;
    }

    /**
     * Marks or unmarks this item as a duplicate.
     *
     * @param duplicate true if duplicate, false otherwise
     */
    public void setDuplicate(boolean duplicate) {
        this.duplicate = duplicate;
    }

}