001/* CrawlDataItem 002 * 003 * Created on 10.04.2006 004 * 005 * Copyright (C) 2006 National and University Library of Iceland 006 * 007 * This file is part of the DeDuplicator (Heritrix add-on module). 008 * 009 * DeDuplicator is free software; you can redistribute it and/or modify 010 * it under the terms of the GNU Lesser Public License as published by 011 * the Free Software Foundation; either version 2.1 of the License, or 012 * any later version. 013 * 014 * DeDuplicator is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 017 * GNU Lesser Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser Public License 020 * along with DeDuplicator; if not, write to the Free Software 021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 022 */ 023package is.hi.bok.deduplicator; 024 025/** 026 * A base class for individual items of crawl data that should be added to the index. 027 * 028 * @author Kristinn Sigurðsson 029 */ 030public class CrawlDataItem { 031 032 /** 033 * The proper formating of {@link #setURL(String)} and {@link #getURL()} 034 */ 035 public static final String dateFormat = "yyyyMMddHHmmssSSS"; 036 037 protected String URL; 038 protected String contentDigest; 039 protected String timestamp; 040 protected String etag; 041 protected String mimetype; 042 protected String origin; 043 protected boolean duplicate; 044 045 /** 046 * Constructor. Creates a new CrawlDataItem with all its data initialized to null. 047 */ 048 public CrawlDataItem() { 049 URL = null; 050 contentDigest = null; 051 timestamp = null; 052 etag = null; 053 mimetype = null; 054 origin = null; 055 duplicate = false; 056 } 057 058 /** 059 * Constructor. Creates a new CrawlDataItem with all its data initialized via the constructor. 060 * 061 * @param URL The URL for this CrawlDataItem 062 * @param contentDigest A content digest of the document found at the URL 063 * @param timestamp Date of when the content digest was valid for that URL. Format: yyyyMMddHHmmssSSS 064 * @param etag Etag for the URL 065 * @param mimetype MIME type of the document found at the URL 066 * @param origin The origin of the CrawlDataItem (the exact meaning of the origin is outside the scope of this class 067 * and it may be any String value) 068 * @param duplicate True if this CrawlDataItem was marked as duplicate 069 */ 070 public CrawlDataItem(String URL, String contentDigest, String timestamp, String etag, String mimetype, 071 String origin, boolean duplicate) { 072 this.URL = URL; 073 this.contentDigest = contentDigest; 074 this.timestamp = timestamp; 075 this.etag = etag; 076 this.mimetype = mimetype; 077 this.origin = origin; 078 this.duplicate = duplicate; 079 } 080 081 /** 082 * Returns the URL 083 * 084 * @return the URL 085 */ 086 public String getURL() { 087 return URL; 088 } 089 090 /** 091 * Set the URL 092 * 093 * @param URL the new URL 094 */ 095 public void setURL(String URL) { 096 this.URL = URL; 097 } 098 099 /** 100 * Returns the documents content digest 101 * 102 * @return the documents content digest 103 */ 104 public String getContentDigest() { 105 return contentDigest; 106 } 107 108 /** 109 * Set the content digest 110 * 111 * @param contentDigest The new value of the content digest 112 */ 113 public void setContentDigest(String contentDigest) { 114 this.contentDigest = contentDigest; 115 } 116 117 /** 118 * Returns a timestamp for when the URL was fetched in the format: yyyyMMddHHmmssSSS 119 * 120 * @return the time of the URLs fetching 121 */ 122 public String getTimestamp() { 123 return timestamp; 124 } 125 126 /** 127 * Set a new timestamp. 128 * 129 * @param timestamp The new timestamp. It should be in the format: yyyyMMddHHmmssSSS 130 */ 131 public void setTimestamp(String timestamp) { 132 this.timestamp = timestamp; 133 } 134 135 /** 136 * Returns the etag that was associated with the document. 137 * <p> 138 * If etag is unavailable null will be returned. 139 * 140 * @return the etag. 141 */ 142 public String getEtag() { 143 return etag; 144 } 145 146 /** 147 * Set a new Etag 148 * 149 * @param etag The new etag 150 */ 151 public void setEtag(String etag) { 152 this.etag = etag; 153 } 154 155 /** 156 * Returns the mimetype that was associated with the document. 157 * 158 * @return the mimetype. 159 */ 160 public String getMimeType() { 161 return mimetype; 162 } 163 164 /** 165 * Set new MIME type. 166 * 167 * @param mimetype The new MIME type 168 */ 169 public void setMimeType(String mimetype) { 170 this.mimetype = mimetype; 171 } 172 173 /** 174 * Returns the "origin" that was associated with the document. 175 * 176 * @return the origin (may be null if none was provided for the document) 177 */ 178 public String getOrigin() { 179 return origin; 180 } 181 182 /** 183 * Set new origin 184 * 185 * @param origin A new origin. 186 */ 187 public void setOrigin(String origin) { 188 this.origin = origin; 189 } 190 191 /** 192 * Returns whether the CrawlDataItem was marked as duplicate. 193 * 194 * @return true if duplicate, false otherwise 195 */ 196 public boolean isDuplicate() { 197 return duplicate; 198 } 199 200 /** 201 * Set whether duplicate or not. 202 * 203 * @param duplicate true if duplicate, false otherwise 204 */ 205 public void setDuplicate(boolean duplicate) { 206 this.duplicate = duplicate; 207 } 208 209}