/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback.accesscontrol;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.util.iterator.CloseableIterator;
import org.archive.wayback.accesscontrol.ExclusionFilterFactory;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.flatfile.FlatFile;

/**
 * This class allows one to specify a file containing a list of regular expressions specifying URLs to be blocked from
 * access via wayback.
 * <p>
 * The class is intended to be instantiated as a Spring bean in a wayback access point, for example by adding something
 * like
 * <p>
 *
 * <pre>
 * {@code
 * <property name="exclusionFactory">
 *   <bean class="dk.netarkivet.wayback.accesscontrol.RegExpExclusionFilterFactory" init-method="init">
 *     <property name="file" value="/home/test/wayback_regexps.txt" />
 *   </bean>
 * </property>
 * }
 * </pre>
 * <p>
 * to an access-point definition in wayback.xml.
 */
public class RegExpExclusionFilterFactory implements ExclusionFilterFactory {

    /**
     * Use apache commons logging for easy integration with wayback.
     */
    private static final Log log = LogFactory.getLog(RegExpExclusionFilterFactory.class);

    /**
     * Spring bean property specifying a flat file from which the regular expressions are to be read.
     */
    private File file;

    /**
     * The collection of regular expressions to be checked. Remains {@code null} until {@link #init()} has completed
     * successfully.
     */
    Collection<Pattern> patterns;

    /**
     * Initialiser to be called from Spring framework.
     *
     * @throws IOException if the file specifying the exclusions cannot be read.
     * @throws PatternSyntaxException if one or more of the patterns in the configuration file is an invalid java
     * regular expression.
     */
    public void init() throws IOException, PatternSyntaxException {
        loadFile();
    }

    /**
     * Reads the file containing the regular expressions to be used as a filter. Each line is trimmed of leading and
     * trailing whitespace; blank lines and lines starting with {@code ##} are skipped. The compiled patterns replace
     * the current collection only after the whole file has been processed, so a failure part-way through leaves the
     * previous collection untouched.
     *
     * @throws IOException if the file cannot be read.
     * @throws PatternSyntaxException if one or more of the patterns in the configuration file is an invalid java
     * regular expression.
     */
    private void loadFile() throws IOException, PatternSyntaxException {
        Collection<Pattern> regexps = new ArrayList<Pattern>();
        final String absolutePath = file.getAbsolutePath();
        log.info("Loading exclusions from " + absolutePath);
        FlatFile ff = new FlatFile(absolutePath);
        CloseableIterator<String> itr = ff.getSequentialIterator();
        try {
            while (itr.hasNext()) {
                String line = itr.next().trim();
                if (line.length() == 0 || line.startsWith("##")) {
                    continue;
                }
                log.info("Adding exclusion regular expression: '" + line + "'");
                regexps.add(Pattern.compile(line));
            }
        } finally {
            // Always release the iterator, including when a pattern fails to compile.
            itr.close();
        }
        this.patterns = regexps;
        log.info("Finished adding exclusion regular expressions.");
    }

    /**
     * Get the file from which regexps are read.
     *
     * @return the file.
     */
    public File getFile() {
        return file;
    }

    /**
     * Set the file from which regexps are read.
     *
     * @param file the file.
     */
    public void setFile(File file) {
        this.file = file;
    }

    /**
     * Creates a new filter over the currently loaded patterns.
     *
     * @return a new {@code RegExpExclusionFilter} constructed from the current pattern collection.
     */
    @Override
    public ExclusionFilter get() {
        return new RegExpExclusionFilter(patterns);
    }

    @Override
    public void shutdown() {
        // Nothing to do
    }
}