001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.accesscontrol;
024
025import java.io.File;
026import java.io.IOException;
027import java.util.ArrayList;
028import java.util.Collection;
029import java.util.regex.Pattern;
030import java.util.regex.PatternSyntaxException;
031
032import org.apache.commons.logging.Log;
033import org.apache.commons.logging.LogFactory;
034import org.archive.util.iterator.CloseableIterator;
035import org.archive.wayback.accesscontrol.ExclusionFilterFactory;
036import org.archive.wayback.resourceindex.filters.ExclusionFilter;
037import org.archive.wayback.util.flatfile.FlatFile;
038
039/**
040 * This class allows one to specify a file containing a list of regular expressions specifying url's to be blocked from
041 * access via wayback.
042 * <p>
043 * The class is intended to be instantiated as a Spring bean in a wayback access point, for example by adding something
044 * like
045 * <p>
046 * 
047 * <pre>
048 * {@code
049 *   <property name="exclusionFactory">
050 *       <bean class="dk.netarkivet.wayback.accesscontrol.RegExpExclusionFilterFactory" init-method="init">
051 *           <property name="file" value="/home/test/wayback_regexps.txt" />
052 *       </bean>
053 *   </property>
054 * }
055 * </pre>
056 * <p>
057 * to an access-point definition in wayback.xml.
058 */
059public class RegExpExclusionFilterFactory implements ExclusionFilterFactory {
060
061    /**
062     * Use apache commons logging for easy integration with wayback.
063     */
064    private static final Log log = LogFactory.getLog(RegExpExclusionFilterFactory.class);
065
066    /**
067     * Spring bean property specifying a flat file from which the regular expressions are to be read.
068     */
069    private File file;
070
071    /**
072     * The collection of regular expressions to be checked
073     */
074    Collection<Pattern> patterns;
075
076    /**
077     * Initialiser to be called from Spring framework.
078     *
079     * @throws IOException if the file specifying the exclusions cannot be read.
080     * @throws PatternSyntaxException if one or more of the patterns in the configuration file is an invalid java
081     * regular expression.
082     */
083    public void init() throws IOException, PatternSyntaxException {
084        loadFile();
085    }
086
087    /**
088     * Reads the file containing the regular expressions to be used as a filter, ignoring any blank lines or leading and
089     * trailing whitespace.
090     *
091     * @throws IOException if the file cannot be read.
092     * @throws PatternSyntaxException if one or more of the patterns in the configuration file is an invalid java
093     * regular expression.
094     */
095    private void loadFile() throws IOException, PatternSyntaxException {
096        Collection<Pattern> regexps = new ArrayList<Pattern>();
097        final String absolutePath = file.getAbsolutePath();
098        log.info("Loading exclusions from " + absolutePath);
099        FlatFile ff = new FlatFile(absolutePath);
100        CloseableIterator<String> itr = ff.getSequentialIterator();
101        while (itr.hasNext()) {
102            String line = (String) itr.next();
103            line = line.trim();
104            if (line.length() == 0 || line.startsWith("##")) {
105                continue;
106            }
107            log.info("Adding exclusion regular expression: '" + line + "'");
108            regexps.add(Pattern.compile(line));
109        }
110        this.patterns = regexps;
111        log.info("Finished adding exclusion regular expressions.");
112    }
113
114    /**
115     * Get the file from which regexps are read.
116     *
117     * @return the file.
118     */
119    public File getFile() {
120        return file;
121    }
122
123    /**
124     * Set the file from which regexps are read.
125     *
126     * @param file thefile.
127     */
128    public void setFile(File file) {
129        this.file = file;
130    }
131
132    @Override
133    public ExclusionFilter get() {
134        return new RegExpExclusionFilter(patterns);
135    }
136
137    @Override
138    public void shutdown() {
139        // Nothing to do
140    }
141}