001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.utils.batch; 024 025import java.awt.datatransfer.MimeTypeParseException; 026import java.io.Serializable; 027import java.util.regex.Pattern; 028 029import org.archive.io.arc.ARCRecord; 030 031import dk.netarkivet.common.exceptions.ArgumentNotValid; 032 033/** 034 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data 035 * first. The class in itself is abstract but contains implementation of several filters. 036 */ 037@SuppressWarnings({"serial"}) 038public abstract class ARCBatchFilter implements Serializable { 039 040 /** The name of the BatchFilter. */ 041 private String name; 042 043 /** A default filter: Accepts everything. */ 044 public static final ARCBatchFilter NO_FILTER = new ARCBatchFilter("NO_FILTER") { 045 public boolean accept(ARCRecord record) { 046 return true; 047 } 048 }; 049 050 /** 051 * The ARCRecord url for the filedesc record (the header record of every ARC File). 052 */ 053 private static final String FILE_HEADERS_FILEDESC_PREFIX = "filedesc"; 054 /** The name of the filter that filters out the filedesc record. */ 055 private static final String EXCLUDE_FILE_HEADERS_FILTER_NAME = "EXCLUDE_FILE_HEADERS"; 056 /** A default filter: Accepts all but the first file. */ 057 public static final ARCBatchFilter EXCLUDE_FILE_HEADERS = new ARCBatchFilter(EXCLUDE_FILE_HEADERS_FILTER_NAME) { 058 public boolean accept(ARCRecord record) { 059 return !record.getMetaData().getUrl().startsWith(FILE_HEADERS_FILEDESC_PREFIX); 060 } 061 }; 062 063 /** Prefix for the url in HTTP records. */ 064 private static final String HTTP_ENTRIES_HTTP_PREFIX = "http:"; 065 /** The name of th filter accepting only HTTP entries. */ 066 private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES"; 067 068 /** 069 * Filter that only accepts records where the url starts with http. 070 */ 071 public static final ARCBatchFilter ONLY_HTTP_ENTRIES = new ARCBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) { 072 public boolean accept(ARCRecord record) { 073 return record.getMetaData().getUrl().startsWith(HTTP_ENTRIES_HTTP_PREFIX); 074 } 075 }; 076 077 /** The name-prefix for mimetype filters. */ 078 private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-"; 079 /** Regexp for mimetypes. */ 080 private static final String MIMETYPE_REGEXP = "\\w+/\\w+"; 081 /** Pattern for mimetypes. */ 082 private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP); 083 084 /** 085 * Create a new filter with the given name. 086 * 087 * @param name The name of this filter, for debugging mostly. 088 */ 089 protected ARCBatchFilter(String name) { 090 ArgumentNotValid.checkNotNullOrEmpty(name, "String name"); 091 this.name = name; 092 } 093 094 /** 095 * Get the name of the filter. 096 * 097 * @return the name of the filter. 098 */ 099 protected String getName() { 100 return this.name; 101 } 102 103 /** 104 * @param mimetype String denoting the mimetype this filter represents 105 * @return a BatchFilter that filters out all ARCRecords, that does not have this mimetype 106 * @throws MimeTypeParseException If mimetype is invalid 107 */ 108 public static ARCBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException { 109 ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype"); 110 if (!mimetypeIsOk(mimetype)) { 111 throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid"); 112 } 113 114 return new ARCBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) { 115 public boolean accept(ARCRecord record) { 116 return record.getMetaData().getMimetype().startsWith(mimetype); 117 } 118 }; 119 } 120 121 /** 122 * Check, if a certain mimetype is valid. 123 * 124 * @param mimetype a given mimetype 125 * @return boolean true, if mimetype matches word/word, otherwise false 126 */ 127 public static boolean mimetypeIsOk(String mimetype) { 128 ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype"); 129 return MIMETYPE_PATTERN.matcher(mimetype).matches(); 130 } 131 132 /** 133 * Check if a given record is accepted (not filtered out) by this filter. 134 * 135 * @param record a given ARCRecord 136 * @return true, if the given record is accepted by this filter 137 */ 138 public abstract boolean accept(ARCRecord record); 139 140}