001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.datamodel; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.util.HashSet; 031import java.util.List; 032import java.util.Set; 033import java.util.regex.Pattern; 034import java.util.regex.PatternSyntaxException; 035 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import dk.netarkivet.common.exceptions.ArgumentNotValid; 040import dk.netarkivet.common.exceptions.IOFailure; 041import dk.netarkivet.common.utils.StringUtils; 042import dk.netarkivet.harvester.utils.CrawlertrapsUtils; 043 044/** 045 * Class representing one or more global crawler traps, modeled as a set of regular expressions. 046 */ 047public class GlobalCrawlerTrapList { 048 049 /** The class logger. */ 050 private static final Logger log = LoggerFactory.getLogger(GlobalCrawlerTrapList.class); 051 052 /** The unique id of this collection of crawler traps. */ 053 private int id; 054 055 /** 056 * The set of traps. Each item is a regular expression matching url's to be avoided. In the database, (id, trap) is 057 * a primary key for the table global_crawler_trap_expressions so we model the traps as a Set to avoid possible 058 * duplicates. 059 */ 060 private Set<String> traps; 061 062 /** A unique name by which this list is identified. */ 063 private String name; 064 065 /** A free-text description of the traps in this collection. */ 066 private String description; 067 068 /** Whether or not this set of traps is active (in use). */ 069 private boolean isActive; 070 071 /** 072 * Protected constructor used by the DAO to create instances of this class. 073 * 074 * @param id the id of this list. 075 * @param name a name by which this list is known. 076 * @param traps the set of trap expressions. 077 * @param description A textual description of this list (may be null). 078 * @param isActive flag indicating whether this list is isActive. 079 * @throws ArgumentNotValid if the name is empty or null 080 */ 081 protected GlobalCrawlerTrapList(int id, List<String> traps, String name, String description, boolean isActive) 082 throws ArgumentNotValid { 083 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 084 ArgumentNotValid.checkNotNull(traps, "traps"); 085 this.id = id; 086 this.traps = new HashSet<String>(traps.size()); 087 this.traps.addAll(traps); 088 this.description = description; 089 this.isActive = isActive; 090 this.name = name; 091 log.debug("Constructed the list {} with traps {}", name, traps.size()); 092 } 093 094 /** 095 * Construct a new GlobalCrawlerTrapList from an input stream consisting of newline-separated regular expressions. 096 * 097 * @param is an input stream from which the list of trap expressions can be read. 098 * @param name a name by which this list is known. 099 * @param description A textual description of this list. 100 * @param isActive flag indicating whether this list is isActive. 101 * @throws IOFailure if the input stream cannot be found or read. 102 * @throws ArgumentNotValid if the input stream is null, the name is null or empty, or the list contains invalid expressions 103 */ 104 public GlobalCrawlerTrapList(InputStream is, String name, String description, boolean isActive) throws IOFailure, 105 ArgumentNotValid { 106 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 107 ArgumentNotValid.checkNotNull(is, "is"); 108 this.traps = new HashSet<String>(); 109 this.isActive = isActive; 110 this.name = name; 111 if (description == null) { 112 this.description = ""; 113 } else { 114 this.description = description; 115 } 116 setTrapsFromInputStream(is, name); 117 } 118 119 /** 120 * A utility method to read the list of traps from an InputStream, line-by-line. 121 * 122 * @param is The input stream from which to read. 123 * @param listName the name of the list being constructed 124 * @throws IOFailure if the input stream cannot be read. 125 * @throws ArgumentNotValid if the input stream is null or if any of the specified traps are not valid regular 126 * expressions and valid XML 127 */ 128 public void setTrapsFromInputStream(InputStream is, String listName) throws ArgumentNotValid { 129 ArgumentNotValid.checkNotNull(is, "is"); 130 traps.clear(); 131 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 132 String line; 133 int trapsAdded=0; 134 int skippedEmptyLines=0; 135 Set<String> errors = new HashSet<String>(); 136 try { 137 while ((line = reader.readLine()) != null) { 138 final String trap = line.trim(); 139 if (trap.isEmpty()) { 140 log.debug("Skipping empty line in input for list '{}'", listName); 141 skippedEmptyLines++; 142 continue; 143 } else { 144 try { 145 Pattern.compile(trap); 146 if (!CrawlertrapsUtils.isCrawlertrapsWellformedXML(trap)) { 147 errors.add("The trap '" + trap + "' is not wellformed XML."); 148 } 149 } catch (PatternSyntaxException e) { 150 errors.add("The trap '" + trap + "' is not a valid Java regular expression: " + e + " ."); 151 } 152 traps.add(trap); 153 trapsAdded++; 154 log.trace("Added trap #{}: '{}'", trapsAdded, trap); 155 } 156 } 157 } catch (IOException e) { 158 throw new IOFailure("Could not read crawler traps", e); 159 } 160 // See if any errors have been found 161 if (errors.size() > 0) { 162 throw new ArgumentNotValid("The traplist '" + listName + "' contains invalid expressions: " + StringUtils.conjoin( 163 ",", errors)); 164 } 165 166 167 log.info("GlobalCrawlertraps list '{}' with {} unique traps (non-unique={}, skipped emptyLines={})", listName, traps.size(), trapsAdded, skippedEmptyLines); 168 } 169 170 /** 171 * Get the id of this list. 172 * 173 * @return the id. 174 */ 175 public int getId() { 176 return id; 177 } 178 179 /** 180 * Set the id of this list. 181 * 182 * @param id the id. 183 */ 184 protected void setId(int id) { 185 this.id = id; 186 } 187 188 /** 189 * Get the name of the list. 190 * 191 * @return the name. 192 */ 193 public String getName() { 194 return name; 195 } 196 197 /** 198 * Set the name of the list. 199 * 200 * @param name the name. 201 */ 202 public void setName(String name) { 203 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 204 this.name = name; 205 } 206 207 /** 208 * Get the trap expressions for this list. 209 * 210 * @return the trap expressions. 211 */ 212 public Set<String> getTraps() { 213 return traps; 214 } 215 216 /** 217 * Set the trap expressions for this list. 218 * 219 * @param traps the trap expressions. 220 */ 221 public void setTraps(Set<String> traps) { 222 ArgumentNotValid.checkNotNull(traps, "traps"); 223 this.traps = traps; 224 } 225 226 /** 227 * Get the description of this list. 228 * 229 * @return the description. 230 */ 231 public String getDescription() { 232 return description; 233 } 234 235 /** 236 * Set the description of this list. 237 * 238 * @param description the description. 239 */ 240 public void setDescription(String description) { 241 ArgumentNotValid.checkNotNull(description, "description"); 242 this.description = description; 243 } 244 245 /** 246 * Returns true if this list is active. 247 * 248 * @return the activity state of the list. 249 */ 250 public boolean isActive() { 251 return isActive; 252 } 253 254 /** 255 * Set the activity state of the list. 256 * 257 * @param active the activity state. 258 */ 259 public void setActive(boolean active) { 260 isActive = active; 261 } 262 263 @Override 264 public boolean equals(Object o) { 265 if (this == o) { 266 return true; 267 } 268 if (o == null || getClass() != o.getClass()) { 269 return false; 270 } 271 272 GlobalCrawlerTrapList that = (GlobalCrawlerTrapList) o; 273 274 if (id != that.id) { 275 return false; 276 } 277 if (isActive != that.isActive) { 278 return false; 279 } 280 if (description != null ? !description.equals(that.description) : that.description != null) { 281 return false; 282 } 283 if (name != null ? !name.equals(that.name) : that.name != null) { 284 return false; 285 } 286 if (traps != null ? !traps.equals(that.traps) : that.traps != null) { 287 return false; 288 } 289 290 return true; 291 } 292 293 @Override 294 public int hashCode() { 295 int result = id; 296 result = 31 * result + (traps != null ? traps.hashCode() : 0); 297 result = 31 * result + (name != null ? name.hashCode() : 0); 298 result = 31 * result + (description != null ? description.hashCode() : 0); 299 result = 31 * result + (isActive ? 1 : 0); 300 return result; 301 } 302 303}