/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.batch;

import java.awt.datatransfer.MimeTypeParseException;
import java.io.Serializable;
import java.util.regex.Pattern;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;

/**
 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data
 * first.
 * <p>
 * accept() is given an ArchiveRecord to avoid unnecessary reading and copying of data of records not accepted by
 * filter.
 * <p>
 * The class is {@link Serializable}. No explicit serialVersionUID is declared (the warning is suppressed), so take
 * care when adding non-private members: doing so may change the default serial version of this class.
 */
@SuppressWarnings({"serial"})
public abstract class ArchiveBatchFilter implements Serializable {

    /** The name of the BatchFilter. */
    protected String name;

    /**
     * Create a new filter with the given name.
     *
     * @param name The name of this filter, for debugging mostly. It is validated with
     * {@link ArgumentNotValid#checkNotNullOrEmpty(String, String)} before being stored.
     */
    protected ArchiveBatchFilter(String name) {
        ArgumentNotValid.checkNotNullOrEmpty(name, "String name");
        this.name = name;
    }

    /**
     * Get the name of the filter.
     *
     * @return the name of the filter.
     */
    protected String getName() {
        return this.name;
    }

    /**
     * Check if a given record is accepted (not filtered out) by this filter.
     *
     * @param record a given archive record
     * @return true, if the given archive record is accepted by this filter
     */
    public abstract boolean accept(ArchiveRecordBase record);

    /** A default filter: Accepts everything. */
    public static final ArchiveBatchFilter NO_FILTER = new ArchiveBatchFilter("NO_FILTER") {
        @Override
        public boolean accept(ArchiveRecordBase record) {
            return true;
        }
    };

    /**
     * The ARCRecord url for the filedesc record (the header record of every ARC File).
     */
    private static final String ARC_FILE_FILEDESC_HEADER_PREFIX = "filedesc";

    /** The name of the filter that filters out the ARC filedesc record and all non-response WARC records. */
    private static final String EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME = "EXCLUDE_NON_RESPONSE_RECORDS";

    /** The name of the filter that filters out the ARC filedesc record and the WARC warcinfo records. */
    private static final String EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS_FILTER_NAME = "EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS";

    /**
     * A default filter: Accepts only response records.
     * <ul>
     * <li>ARC records are accepted unless their url starts with "filedesc".</li>
     * <li>WARC records are accepted only if their WARC-Type header equals "response" (ignoring case).</li>
     * <li>Records that are flagged as neither ARC nor WARC are rejected.</li>
     * </ul>
     */
    public static final ArchiveBatchFilter EXCLUDE_NON_RESPONSE_RECORDS = new ArchiveBatchFilter(
            EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME) {
        @Override
        public boolean accept(ArchiveRecordBase record) {
            if (record.bIsArc) {
                return !record.getHeader().getUrl().startsWith(ARC_FILE_FILEDESC_HEADER_PREFIX);
            }
            if (record.bIsWarc) {
                String warcType = record.getHeader().getHeaderStringValue("WARC-Type");
                // Constant-first comparison: a missing WARC-Type header simply yields false.
                return "response".equalsIgnoreCase(warcType);
            }
            return false;
        }
    };

    /**
     * A default filter: Accepts everything except the archive-level header records.
     * <p>
     * Note that, in spite of the name of this constant, it is the warcinfo records that are <em>excluded</em> (the
     * filter is named "EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS"):
     * <ul>
     * <li>ARC records are accepted unless their url starts with "filedesc".</li>
     * <li>WARC records are accepted unless their WARC-Type header equals "warcinfo" (ignoring case).</li>
     * <li>Records that are flagged as neither ARC nor WARC are rejected.</li>
     * </ul>
     */
    public static final ArchiveBatchFilter EXCLUDE_NON_WARCINFO_RECORDS = new ArchiveBatchFilter(
            EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS_FILTER_NAME) {
        @Override
        public boolean accept(ArchiveRecordBase record) {
            if (record.bIsArc) {
                return !record.getHeader().getUrl().startsWith(ARC_FILE_FILEDESC_HEADER_PREFIX);
            }
            if (record.bIsWarc) {
                String warcType = record.getHeader().getHeaderStringValue("WARC-Type");
                // A WARC record without a WARC-Type header is not a warcinfo record, so it is accepted.
                return !"warcinfo".equalsIgnoreCase(warcType);
            }
            return false;
        }
    };

    /** Prefix for the url in HTTP records. */
    private static final String EXCLUDE_HTTP_ENTRIES_HTTP_PREFIX = "http:";
    /** The name of the filter accepting only HTTP entries. */
    private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES";

    /**
     * Filter that only accepts records where the url starts with "http:". Urls with any other scheme, including
     * "https:", are rejected, as are records without a url.
     */
    public static final ArchiveBatchFilter ONLY_HTTP_ENTRIES = new ArchiveBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) {
        @Override
        public boolean accept(ArchiveRecordBase record) {
            final String url = record.getHeader().getUrl();
            // NOTE(review): presumably the url can be absent, e.g. for WARC records without a WARC-Target-URI
            // header - confirm against the header implementation. Such a record is not an HTTP entry, so it is
            // rejected instead of failing with a NullPointerException.
            return url != null && url.startsWith(EXCLUDE_HTTP_ENTRIES_HTTP_PREFIX);
        }
    };

    /** Prefix for the names of the filters created by {@link #getMimetypeBatchFilter(String)}. */
    private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-";

    /**
     * Create a filter accepting only the records whose mimetype starts with the given mimetype. As the comparison is
     * a prefix match, a filter for "text/html" also accepts a record with the mimetype "text/html; charset=UTF-8".
     * Records without a mimetype are rejected.
     * <p>
     * Note that the mimetype of the WARC responserecord is not (necessarily) the same as its payload.
     *
     * @param mimetype String denoting the mimetype this filter represents
     * @return a BatchFilter that filters out all records, that does not have this mimetype
     * @throws java.awt.datatransfer.MimeTypeParseException (if mimetype is invalid, which includes null)
     */
    public static ArchiveBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException {
        if (!mimetypeIsOk(mimetype)) {
            throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid");
        }
        return new ArchiveBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) {
            @Override
            public boolean accept(ArchiveRecordBase record) {
                final String recordMimetype = record.getHeader().getMimetype();
                // A record without a mimetype cannot match the requested one; reject it rather than
                // failing with a NullPointerException.
                return recordMimetype != null && recordMimetype.startsWith(mimetype);
            }
        };
    }

    /**
     * Regexp for mimetypes.
     * NOTE(review): \w only covers letters, digits and underscore, so mimetypes containing '-', '+' or '.'
     * (e.g. "application/xhtml+xml") are considered invalid - confirm that this restriction is intended.
     */
    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
    /** Pattern for mimetypes. */
    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);

    /**
     * Check, if a certain mimetype is valid.
     *
     * @param mimetype the mimetype to check; may be null
     * @return boolean true, if mimetype matches word/word, otherwise false (null is never valid)
     */
    public static boolean mimetypeIsOk(String mimetype) {
        // Pattern.matcher(null) would throw a NullPointerException; a missing mimetype is simply invalid.
        return mimetype != null && MIMETYPE_PATTERN.matcher(mimetype).matches();
    }

}