001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.wayback.accesscontrol; 024 025import java.util.Collection; 026import java.util.regex.Pattern; 027 028import org.archive.wayback.core.CaptureSearchResult; 029import org.archive.wayback.resourceindex.filters.ExclusionFilter; 030import org.archive.wayback.util.ObjectFilter; 031 032public class RegExpExclusionFilter extends ExclusionFilter { 033 034 /** 035 * The regexps to be used as the exclusion filter. 036 */ 037 Collection<Pattern> regexps; 038 039 /** 040 * Creates an exclusion filter which will filter out any search result for which the original url matches any of the 041 * specified regular expression. 042 * 043 * @param regexps The regular expressions to match. 044 */ 045 public RegExpExclusionFilter(Collection<Pattern> regexps) { 046 this.regexps = regexps; 047 } 048 049 @Override 050 public int filterObject(CaptureSearchResult captureSearchResult) { 051 // Note that the behaviour of the two calls to methods of the class 052 // ExclusionCaptureFilterGroup is not well documented. Omitting them 053 // results in the excluded objects being marked as not in the archive. 054 // With these calls, they are correctly identified as blocked. 055 filterGroup.setSawAdministrative(); 056 for (Pattern regexp : regexps) { 057 if (regexp.matcher(captureSearchResult.getOriginalUrl()).matches()) { 058 return ObjectFilter.FILTER_EXCLUDE; 059 } 060 } 061 filterGroup.setPassedAdministrative(); 062 return ObjectFilter.FILTER_INCLUDE; 063 } 064}