001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.utils.batch; 024 025import java.awt.datatransfer.MimeTypeParseException; 026import java.io.Serializable; 027import java.util.regex.Pattern; 028 029import org.archive.io.warc.WARCRecord; 030 031import dk.netarkivet.common.exceptions.ArgumentNotValid; 032import dk.netarkivet.common.utils.archive.HeritrixArchiveRecordWrapper; 033 034/** 035 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data 036 * first. The class in itself is abstract but contains implementation of several filters. 037 */ 038@SuppressWarnings({"serial"}) 039public abstract class WARCBatchFilter implements Serializable { 040 041 /** The name of the BatchFilter. */ 042 private String name; 043 044 /** A default filter: Accepts everything. */ 045 public static final WARCBatchFilter NO_FILTER = new WARCBatchFilter("NO_FILTER") { 046 public boolean accept(WARCRecord record) { 047 return true; 048 } 049 }; 050 051 /** The name of the filter that filters out non response records. */ 052 private static final String EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME = "EXCLUDE_NON_RESPONSE_RECORDS"; 053 054 /** A default filter: Accepts on response records. */ 055 public static final WARCBatchFilter EXCLUDE_NON_RESPONSE_RECORDS = new WARCBatchFilter( 056 EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME) { 057 public boolean accept(WARCRecord record) { 058 HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record); 059 String warcType = recordWrapper.getHeader().getHeaderStringValue("WARC-Type"); 060 return "response".equalsIgnoreCase(warcType); 061 } 062 }; 063 064 /** Prefix for the url in HTTP records. */ 065 private static final String HTTP_ENTRIES_HTTP_PREFIX = "http:"; 066 /** The name of the filter accepting only HTTP entries. */ 067 private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES"; 068 069 /** 070 * Filter that only accepts records where the url starts with http. 071 */ 072 public static final WARCBatchFilter ONLY_HTTP_ENTRIES = new WARCBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) { 073 public boolean accept(WARCRecord record) { 074 HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record); 075 return recordWrapper.getHeader().getUrl().startsWith(HTTP_ENTRIES_HTTP_PREFIX); 076 } 077 }; 078 079 /** 080 * Create a new filter with the given name. 081 * 082 * @param name The name of this filter, for debugging mostly. 083 */ 084 protected WARCBatchFilter(String name) { 085 ArgumentNotValid.checkNotNullOrEmpty(name, "String name"); 086 this.name = name; 087 } 088 089 /** 090 * Get the name of the filter. 091 * 092 * @return the name of the filter. 093 */ 094 protected String getName() { 095 return this.name; 096 } 097 098 /** 099 * Note that the mimetype of the WARC responserecord is not (necessarily) the same as its payload. 100 * 101 * @param mimetype String denoting the mimetype this filter represents 102 * @return a BatchFilter that filters out all WARCRecords, that does not have this mimetype 103 * @throws MimeTypeParseException If mimetype is invalid 104 */ 105 public static WARCBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException { 106 ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype"); 107 if (!mimetypeIsOk(mimetype)) { 108 throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid"); 109 } 110 return new WARCBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) { 111 public boolean accept(WARCRecord record) { 112 HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record); 113 return recordWrapper.getHeader().getMimetype().startsWith(mimetype); 114 } 115 }; 116 } 117 118 /** The name-prefix for mimetype filters. */ 119 private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-"; 120 /** Regexp for mimetypes. */ 121 private static final String MIMETYPE_REGEXP = "\\w+/\\w+"; 122 /** Pattern for mimetypes. */ 123 private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP); 124 125 /** 126 * Check, if a certain mimetype is valid. 127 * 128 * @param mimetype a given mimetype 129 * @return boolean true, if mimetype matches word/word, otherwise false 130 */ 131 public static boolean mimetypeIsOk(String mimetype) { 132 ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype"); 133 return MIMETYPE_PATTERN.matcher(mimetype).matches(); 134 } 135 136 /** 137 * Check if a given record is accepted (not filtered out) by this filter. 138 * 139 * @param record a given WARCRecord 140 * @return true, if the given record is accepted by this filter 141 */ 142 public abstract boolean accept(WARCRecord record); 143 144}