001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.datamodel;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.util.HashSet;
031import java.util.List;
032import java.util.Set;
033import java.util.regex.Pattern;
034import java.util.regex.PatternSyntaxException;
035
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import dk.netarkivet.common.exceptions.ArgumentNotValid;
040import dk.netarkivet.common.exceptions.IOFailure;
041import dk.netarkivet.common.utils.StringUtils;
042import dk.netarkivet.harvester.utils.CrawlertrapsUtils;
043
044/**
045 * Class representing one or more global crawler traps, modeled as a set of regular expressions.
046 */
047public class GlobalCrawlerTrapList {
048
049    /** The class logger. */
050    private static final Logger log = LoggerFactory.getLogger(GlobalCrawlerTrapList.class);
051    
052    /** The unique id of this collection of crawler traps. */
053    private int id;
054
055    /**
056     * The set of traps. Each item is a regular expression matching url's to be avoided. In the database, (id, trap) is
057     * a primary key for the table global_crawler_trap_expressions so we model the traps as a Set to avoid possible
058     * duplicates.
059     */
060    private Set<String> traps;
061
062    /** A unique name by which this list is identified. */
063    private String name;
064
065    /** A free-text description of the traps in this collection. */
066    private String description;
067
068    /** Whether or not this set of traps is active (in use). */
069    private boolean isActive;
070
071    /**
072     * Protected constructor used by the DAO to create instances of this class.
073     *
074     * @param id the id of this list.
075     * @param name a name by which this list is known.
076     * @param traps the set of trap expressions.
077     * @param description A textual description of this list (may be null).
078     * @param isActive flag indicating whether this list is isActive.
079     * @throws ArgumentNotValid if the name is empty or null
080     */
081    protected GlobalCrawlerTrapList(int id, List<String> traps, String name, String description, boolean isActive)
082            throws ArgumentNotValid {
083        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
084        ArgumentNotValid.checkNotNull(traps, "traps");
085        this.id = id;
086        this.traps = new HashSet<String>(traps.size());
087        this.traps.addAll(traps);
088        this.description = description;
089        this.isActive = isActive;
090        this.name = name;
091        log.debug("Constructed the list {} with traps {}", name, traps.size());
092    }
093
094    /**
095     * Construct a new GlobalCrawlerTrapList from an input stream consisting of newline-separated regular expressions.
096     *
097     * @param is an input stream from which the list of trap expressions can be read.
098     * @param name a name by which this list is known.
099     * @param description A textual description of this list.
100     * @param isActive flag indicating whether this list is isActive.
101     * @throws IOFailure if the input stream cannot be found or read.
102     * @throws ArgumentNotValid if the input stream is null, the name is null or empty, or the list contains invalid expressions
103     */
104    public GlobalCrawlerTrapList(InputStream is, String name, String description, boolean isActive) throws IOFailure,
105            ArgumentNotValid {
106        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
107        ArgumentNotValid.checkNotNull(is, "is");
108        this.traps = new HashSet<String>();
109        this.isActive = isActive;
110        this.name = name;
111        if (description == null) {
112            this.description = "";
113        } else {
114            this.description = description;
115        }
116        setTrapsFromInputStream(is, name);
117    }
118
119    /**
120     * A utility method to read the list of traps from an InputStream, line-by-line.
121     *
122     * @param is The input stream from which to read.
123     * @param listName the name of the list being constructed
124     * @throws IOFailure if the input stream cannot be read.
125     * @throws ArgumentNotValid if the input stream is null or if any of the specified traps are not valid regular
126     * expressions and valid XML
127     */
128    public void setTrapsFromInputStream(InputStream is, String listName) throws ArgumentNotValid {
129        ArgumentNotValid.checkNotNull(is, "is");
130        traps.clear();
131        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
132        String line;
133        int trapsAdded=0;
134        int skippedEmptyLines=0;
135        Set<String> errors = new HashSet<String>();
136        try {
137            while ((line = reader.readLine()) != null) {
138                final String trap = line.trim();
139                if (trap.isEmpty()) {
140                    log.debug("Skipping empty line in input for list '{}'", listName);
141                    skippedEmptyLines++;
142                    continue; 
143                } else {
144                    try {
145                        Pattern.compile(trap);
146                        if (!CrawlertrapsUtils.isCrawlertrapsWellformedXML(trap)) {
147                                errors.add("The trap '" + trap + "' is not wellformed XML.");
148                        }
149                    } catch (PatternSyntaxException e) {
150                        errors.add("The trap '" + trap + "' is not a valid Java regular expression: " + e + " .");
151                    }
152                    traps.add(trap);
153                    trapsAdded++;
154                    log.trace("Added trap #{}: '{}'", trapsAdded, trap);
155                }
156            }
157        } catch (IOException e) {
158            throw new IOFailure("Could not read crawler traps", e);
159        }
160        // See if any errors have been found 
161        if (errors.size() > 0) {
162                throw new ArgumentNotValid("The traplist '" + listName + "' contains invalid expressions: " + StringUtils.conjoin(
163                                ",", errors));
164        }
165        
166        
167        log.info("GlobalCrawlertraps list '{}' with {} unique traps (non-unique={}, skipped emptyLines={})", listName, traps.size(), trapsAdded, skippedEmptyLines);
168    }
169
170    /**
171     * Get the id of this list.
172     *
173     * @return the id.
174     */
175    public int getId() {
176        return id;
177    }
178
179    /**
180     * Set the id of this list.
181     *
182     * @param id the id.
183     */
184    protected void setId(int id) {
185        this.id = id;
186    }
187
188    /**
189     * Get the name of the list.
190     *
191     * @return the name.
192     */
193    public String getName() {
194        return name;
195    }
196
197    /**
198     * Set the name of the list.
199     *
200     * @param name the name.
201     */
202    public void setName(String name) {
203        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
204        this.name = name;
205    }
206
207    /**
208     * Get the trap expressions for this list.
209     *
210     * @return the trap expressions.
211     */
212    public Set<String> getTraps() {
213        return traps;
214    }
215
216    /**
217     * Set the trap expressions for this list.
218     *
219     * @param traps the trap expressions.
220     */
221    public void setTraps(Set<String> traps) {
222        ArgumentNotValid.checkNotNull(traps, "traps");
223        this.traps = traps;
224    }
225
226    /**
227     * Get the description of this list.
228     *
229     * @return the description.
230     */
231    public String getDescription() {
232        return description;
233    }
234
235    /**
236     * Set the description of this list.
237     *
238     * @param description the description.
239     */
240    public void setDescription(String description) {
241        ArgumentNotValid.checkNotNull(description, "description");
242        this.description = description;
243    }
244
245    /**
246     * Returns true if this list is active.
247     *
248     * @return the activity state of the list.
249     */
250    public boolean isActive() {
251        return isActive;
252    }
253
254    /**
255     * Set the activity state of the list.
256     *
257     * @param active the activity state.
258     */
259    public void setActive(boolean active) {
260        isActive = active;
261    }
262
263    @Override
264    public boolean equals(Object o) {
265        if (this == o) {
266            return true;
267        }
268        if (o == null || getClass() != o.getClass()) {
269            return false;
270        }
271
272        GlobalCrawlerTrapList that = (GlobalCrawlerTrapList) o;
273
274        if (id != that.id) {
275            return false;
276        }
277        if (isActive != that.isActive) {
278            return false;
279        }
280        if (description != null ? !description.equals(that.description) : that.description != null) {
281            return false;
282        }
283        if (name != null ? !name.equals(that.name) : that.name != null) {
284            return false;
285        }
286        if (traps != null ? !traps.equals(that.traps) : that.traps != null) {
287            return false;
288        }
289
290        return true;
291    }
292
293    @Override
294    public int hashCode() {
295        int result = id;
296        result = 31 * result + (traps != null ? traps.hashCode() : 0);
297        result = 31 * result + (name != null ? name.hashCode() : 0);
298        result = 31 * result + (description != null ? description.hashCode() : 0);
299        result = 31 * result + (isActive ? 1 : 0);
300        return result;
301    }
302
303}