001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.batch;
024
025import java.awt.datatransfer.MimeTypeParseException;
026import java.io.Serializable;
027import java.util.regex.Pattern;
028
029import org.archive.io.warc.WARCRecord;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032import dk.netarkivet.common.utils.archive.HeritrixArchiveRecordWrapper;
033
034/**
035 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data
036 * first. The class in itself is abstract but contains implementation of several filters.
037 */
038@SuppressWarnings({"serial"})
039public abstract class WARCBatchFilter implements Serializable {
040
041    /** The name of the BatchFilter. */
042    private String name;
043
044    /** A default filter: Accepts everything. */
045    public static final WARCBatchFilter NO_FILTER = new WARCBatchFilter("NO_FILTER") {
046        public boolean accept(WARCRecord record) {
047            return true;
048        }
049    };
050
051    /** The name of the filter that filters out non response records. */
052    private static final String EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME = "EXCLUDE_NON_RESPONSE_RECORDS";
053
054    /** A default filter: Accepts on response records. */
055    public static final WARCBatchFilter EXCLUDE_NON_RESPONSE_RECORDS = new WARCBatchFilter(
056            EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME) {
057        public boolean accept(WARCRecord record) {
058            HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record);
059            String warcType = recordWrapper.getHeader().getHeaderStringValue("WARC-Type");
060            return "response".equalsIgnoreCase(warcType);
061        }
062    };
063
064    /** Prefix for the url in HTTP records. */
065    private static final String HTTP_ENTRIES_HTTP_PREFIX = "http:";
066    /** The name of the filter accepting only HTTP entries. */
067    private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES";
068
069    /**
070     * Filter that only accepts records where the url starts with http.
071     */
072    public static final WARCBatchFilter ONLY_HTTP_ENTRIES = new WARCBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) {
073        public boolean accept(WARCRecord record) {
074            HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record);
075            return recordWrapper.getHeader().getUrl().startsWith(HTTP_ENTRIES_HTTP_PREFIX);
076        }
077    };
078
079    /**
080     * Create a new filter with the given name.
081     *
082     * @param name The name of this filter, for debugging mostly.
083     */
084    protected WARCBatchFilter(String name) {
085        ArgumentNotValid.checkNotNullOrEmpty(name, "String name");
086        this.name = name;
087    }
088
089    /**
090     * Get the name of the filter.
091     *
092     * @return the name of the filter.
093     */
094    protected String getName() {
095        return this.name;
096    }
097
098    /**
099     * Note that the mimetype of the WARC responserecord is not (necessarily) the same as its payload.
100     *
101     * @param mimetype String denoting the mimetype this filter represents
102     * @return a BatchFilter that filters out all WARCRecords, that does not have this mimetype
103     * @throws MimeTypeParseException If mimetype is invalid
104     */
105    public static WARCBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException {
106        ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype");
107        if (!mimetypeIsOk(mimetype)) {
108            throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid");
109        }
110        return new WARCBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) {
111            public boolean accept(WARCRecord record) {
112                HeritrixArchiveRecordWrapper recordWrapper = new HeritrixArchiveRecordWrapper(record);
113                return recordWrapper.getHeader().getMimetype().startsWith(mimetype);
114            }
115        };
116    }
117
118    /** The name-prefix for mimetype filters. */
119    private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-";
120    /** Regexp for mimetypes. */
121    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
122    /** Pattern for mimetypes. */
123    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);
124
125    /**
126     * Check, if a certain mimetype is valid.
127     *
128     * @param mimetype a given mimetype
129     * @return boolean true, if mimetype matches word/word, otherwise false
130     */
131    public static boolean mimetypeIsOk(String mimetype) {
132        ArgumentNotValid.checkNotNullOrEmpty(mimetype, "String mimetype");
133        return MIMETYPE_PATTERN.matcher(mimetype).matches();
134    }
135
136    /**
137     * Check if a given record is accepted (not filtered out) by this filter.
138     *
139     * @param record a given WARCRecord
140     * @return true, if the given record is accepted by this filter
141     */
142    public abstract boolean accept(WARCRecord record);
143
144}