001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.batch;
024
025import java.awt.datatransfer.MimeTypeParseException;
026import java.io.Serializable;
027import java.util.regex.Pattern;
028
029import dk.netarkivet.common.exceptions.ArgumentNotValid;
030import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
031
032/**
033 * A filter class for batch entries. Allows testing whether or not to process an entry without loading the entry data
034 * first.
035 * <p>
036 * accept() is given an ArchiveRecord to avoid unnecessary reading and copying of data of records not accepted by
037 * filter.
038 */
039@SuppressWarnings({"serial"})
040public abstract class ArchiveBatchFilter implements Serializable {
041
042    /** The name of the BatchFilter. */
043    protected String name;
044
045    /**
046     * Create a new filter with the given name.
047     *
048     * @param name The name of this filter, for debugging mostly.
049     */
050    protected ArchiveBatchFilter(String name) {
051        ArgumentNotValid.checkNotNullOrEmpty(name, "String name");
052        this.name = name;
053    }
054
055    /**
056     * Get the name of the filter.
057     *
058     * @return the name of the filter.
059     */
060    protected String getName() {
061        return this.name;
062    }
063
064    /**
065     * Check if a given record is accepted (not filtered out) by this filter.
066     *
067     * @param record a given archive record
068     * @return true, if the given archive record is accepted by this filter
069     */
070    public abstract boolean accept(ArchiveRecordBase record);
071
072    /** A default filter: Accepts everything. */
073    public static final ArchiveBatchFilter NO_FILTER = new ArchiveBatchFilter("NO_FILTER") {
074        @Override
075        public boolean accept(ArchiveRecordBase record) {
076            return true;
077        }
078    };
079
080    /**
081     * The ARCRecord url for the filedesc record (the header record of every ARC File).
082     */
083    private static final String ARC_FILE_FILEDESC_HEADER_PREFIX = "filedesc";
084
085    /** The name of the filter that filters out the filedesc record and/or non-response records. */
086    private static final String EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME = "EXCLUDE_NON_RESPONSE_RECORDS";
087
088    /** The name of the filter that filters out the filedesc record and/or non-warcinfo records */
089    private static final String EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS_FILTER_NAME = "EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS";
090
091    /** A default filter: Accepts only response records. */
092    public static final ArchiveBatchFilter EXCLUDE_NON_RESPONSE_RECORDS = new ArchiveBatchFilter(
093            EXCLUDE_NON_RESPONSE_RECORDS_FILTER_NAME) {
094        @Override
095        public boolean accept(ArchiveRecordBase record) {
096            if (record.bIsArc) {
097                return !record.getHeader().getUrl().startsWith(ARC_FILE_FILEDESC_HEADER_PREFIX);
098            }
099            if (record.bIsWarc) {
100                String warcType = record.getHeader().getHeaderStringValue("WARC-Type");
101                return "response".equalsIgnoreCase(warcType);
102            }
103            return false;
104        }
105    };
106
107    /** A default filter: Accepts only response records. */
108    public static final ArchiveBatchFilter EXCLUDE_NON_WARCINFO_RECORDS = new ArchiveBatchFilter(
109            EXCLUDE_WARCINFO_AND_FILEDESC_RECORDS_FILTER_NAME) {
110        @Override
111        public boolean accept(ArchiveRecordBase record) {
112            if (record.bIsArc) {
113                return !record.getHeader().getUrl().startsWith(ARC_FILE_FILEDESC_HEADER_PREFIX);
114            }
115            if (record.bIsWarc) {
116                String warcType = record.getHeader().getHeaderStringValue("WARC-Type");
117                return !"warcinfo".equalsIgnoreCase(warcType);
118            }
119            return false;
120        }
121    };
122
123    /** Prefix for the url in HTTP records. */
124    private static final String EXCLUDE_HTTP_ENTRIES_HTTP_PREFIX = "http:";
125    /** The name of the filter accepting only HTTP entries. */
126    private static final String ONLY_HTTP_ENTRIES_FILTER_NAME = "ONLY_HTTP_ENTRIES";
127
128    /**
129     * Filter that only accepts records where the url starts with http.
130     */
131    public static final ArchiveBatchFilter ONLY_HTTP_ENTRIES = new ArchiveBatchFilter(ONLY_HTTP_ENTRIES_FILTER_NAME) {
132        @Override
133        public boolean accept(ArchiveRecordBase record) {
134            return record.getHeader().getUrl().startsWith(EXCLUDE_HTTP_ENTRIES_HTTP_PREFIX);
135        }
136    };
137
138    private static final String MIMETYPE_BATCH_FILTER_NAME_PREFIX = "MimetypeBatchFilter-";
139
140    /**
141     * Note that the mimetype of the WARC responserecord is not (necessarily) the same as its payload.
142     *
143     * @param mimetype String denoting the mimetype this filter represents
144     * @return a BatchFilter that filters out all ARCRecords, that does not have this mimetype
145     * @throws java.awt.datatransfer.MimeTypeParseException (if mimetype is invalid)
146     */
147    public static ArchiveBatchFilter getMimetypeBatchFilter(final String mimetype) throws MimeTypeParseException {
148        if (!mimetypeIsOk(mimetype)) {
149            throw new MimeTypeParseException("Mimetype argument '" + mimetype + "' is invalid");
150        }
151        return new ArchiveBatchFilter(MIMETYPE_BATCH_FILTER_NAME_PREFIX + mimetype) {
152            @Override
153            public boolean accept(ArchiveRecordBase record) {
154                return record.getHeader().getMimetype().startsWith(mimetype);
155            }
156        };
157    }
158
159    /** Regexp for mimetypes. */
160    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
161    /** Pattern for mimetypes. */
162    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);
163
164    /**
165     * Check, if a certain mimetype is valid
166     *
167     * @param mimetype
168     * @return boolean true, if mimetype matches word/word, otherwise false
169     */
170    public static boolean mimetypeIsOk(String mimetype) {
171        return MIMETYPE_PATTERN.matcher(mimetype).matches();
172    }
173
174}