001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.batch;
024
025import java.awt.datatransfer.MimeTypeParseException;
026import java.io.Serializable;
027import java.util.regex.Pattern;
028
029import org.archive.io.arc.ARCRecord;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032
033/**
034 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data
035 * first. The class in itself is abstract but contains implementation of several filters.
036 */
037@SuppressWarnings({"serial"})
038public abstract class ARCBatchFilter implements Serializable {
039
040    /** The name of the BatchFilter. */
041    private String name;
042
043    /** A default filter: Accepts everything. */
044    public static final ARCBatchFilter NO_FILTER = new ARCBatchFilter("NO_FILTER") {
045        public boolean accept(ARCRecord record) {
046            return true;
047        }
048    };
049
050    /**
051     * The ARCRecord url for the filedesc record (the header record of every ARC File).
052     */
053    private static final String FILE_HEADERS_FILEDESC_PREFIX = "filedesc";
054    /** The name of the filter that filters out the filedesc record. */
055    private static final String EXCLUDE_FILE_HEADERS_FILTER_NAME = "EXCLUDE_FILE_HEADERS";
056    /** A default filter: Accepts all but the first file. */
057    public static final ARCBatchFilter EXCLUDE_FILE_HEADERS = new ARCBatchFilter(EXCLUDE_FILE_HEADERS_FILTER_NAME) {
058        public boolean accept(ARCRecord record) {
059            return !record.getMetaData().getUrl().startsWith(FILE_HEADERS_FILEDESC_PREFIX);
060        }
061    };
062
063    /** Prefix for the url in HTTP records. */
064    private static final String HTTP_ENTRIES_HTTP_PREFIX = "http:";
065    /** The name of th filter accepting only HTTP entries. */
066    private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES";
067
068    /**
069     * Filter that only accepts records where the url starts with http.
070     */
071    public static final ARCBatchFilter ONLY_HTTP_ENTRIES = new ARCBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) {
072        public boolean accept(ARCRecord record) {
073            return record.getMetaData().getUrl().startsWith(HTTP_ENTRIES_HTTP_PREFIX);
074        }
075    };
076
077    /** The name-prefix for mimetype filters. */
078    private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-";
079    /** Regexp for mimetypes. */
080    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
081    /** Pattern for mimetypes. */
082    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);
083
084    /**
085     * Create a new filter with the given name.
086     *
087     * @param name The name of this filter, for debugging mostly.
088     */
089    protected ARCBatchFilter(String name) {
090        ArgumentNotValid.checkNotNullOrEmpty(name, "String name");
091        this.name = name;
092    }
093
094    /**
095     * Get the name of the filter.
096     *
097     * @return the name of the filter.
098     */
099    protected String getName() {
100        return this.name;
101    }
102
103    /**
104     * @param mimetype String denoting the mimetype this filter represents
105     * @return a BatchFilter that filters out all ARCRecords, that does not have this mimetype
106     * @throws MimeTypeParseException If mimetype is invalid
107     */
108    public static ARCBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException {
109        ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype");
110        if (!mimetypeIsOk(mimetype)) {
111            throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid");
112        }
113
114        return new ARCBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) {
115            public boolean accept(ARCRecord record) {
116                return record.getMetaData().getMimetype().startsWith(mimetype);
117            }
118        };
119    }
120
121    /**
122     * Check, if a certain mimetype is valid.
123     *
124     * @param mimetype a given mimetype
125     * @return boolean true, if mimetype matches word/word, otherwise false
126     */
127    public static boolean mimetypeIsOk(String mimetype) {
128        ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype");
129        return MIMETYPE_PATTERN.matcher(mimetype).matches();
130    }
131
132    /**
133     * Check if a given record is accepted (not filtered out) by this filter.
134     *
135     * @param record a given ARCRecord
136     * @return true, if the given record is accepted by this filter
137     */
138    public abstract boolean accept(ARCRecord record);
139
140}