001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.accesscontrol;
024
025import java.util.Collection;
026import java.util.regex.Pattern;
027
028import org.archive.wayback.core.CaptureSearchResult;
029import org.archive.wayback.resourceindex.filters.ExclusionFilter;
030import org.archive.wayback.util.ObjectFilter;
031
032public class RegExpExclusionFilter extends ExclusionFilter {
033
034    /**
035     * The regexps to be used as the exclusion filter.
036     */
037    Collection<Pattern> regexps;
038
039    /**
040     * Creates an exclusion filter which will filter out any search result for which the original url matches any of the
041     * specified regular expression.
042     *
043     * @param regexps The regular expressions to match.
044     */
045    public RegExpExclusionFilter(Collection<Pattern> regexps) {
046        this.regexps = regexps;
047    }
048
049    @Override
050    public int filterObject(CaptureSearchResult captureSearchResult) {
051        // Note that the behaviour of the two calls to methods of the class
052        // ExclusionCaptureFilterGroup is not well documented. Omitting them
053        // results in the excluded objects being marked as not in the archive.
054        // With these calls, they are correctly identified as blocked.
055        filterGroup.setSawAdministrative();
056        for (Pattern regexp : regexps) {
057            if (regexp.matcher(captureSearchResult.getOriginalUrl()).matches()) {
058                return ObjectFilter.FILTER_EXCLUDE;
059            }
060        }
061        filterGroup.setPassedAdministrative();
062        return ObjectFilter.FILTER_INCLUDE;
063    }
064}