/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.archive;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.IOFailure;

/**
 * A batch job that extracts metadata.
 * <p>
 * For every record that has a URL, the record body is copied unchanged to the job's output stream when both the
 * record's URL and its mime-type match the patterns given at construction time.
 */
@SuppressWarnings({"serial"})
public class GetMetadataArchiveBatchJob extends ArchiveBatchJob {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(GetMetadataArchiveBatchJob.class);

    /** Regular expression used in place of a null pattern; matches any string. */
    private static final String MATCH_ALL_PATTERN = ".*";

    /** The pattern for matching the urls. Never null. */
    private final Pattern urlMatcher;
    /** The pattern for the mimetype matcher. Never null. */
    private final Pattern mimeMatcher;

    /**
     * Constructor.
     * <p>
     * The batchJobTimeout is set to one day.
     *
     * @param urlMatcher A pattern for matching URLs of the desired entries. If null, a .* pattern will be used.
     * @param mimeMatcher A pattern for matching mime-types of the desired entries. If null, a .* pattern will be
     * used.
     */
    public GetMetadataArchiveBatchJob(Pattern urlMatcher, Pattern mimeMatcher) {
        // Honour the documented contract: a null pattern means "match everything".
        this.urlMatcher = (urlMatcher != null) ? urlMatcher : Pattern.compile(MATCH_ALL_PATTERN);
        this.mimeMatcher = (mimeMatcher != null) ? mimeMatcher : Pattern.compile(MATCH_ALL_PATTERN);

        batchJobTimeout = Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Initialize method. Run before the arc-records are being processed. Currently does nothing.
     *
     * @param os The output stream to print any pre-processing data.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * The method for processing the arc-records.
     * <p>
     * Records without a URL are skipped. For all other records the URL and mime-type are logged, and the record
     * body is copied to the output stream if both match their respective patterns. The record's input stream is
     * closed on every path, including the skip path and the failure path.
     *
     * @param record The arc-record to process.
     * @param os The output stream to write the results of the processing.
     * @throws IOFailure If an IOException is caught during handling of the arc record, or while closing its
     * input stream.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) throws IOFailure {
        ArchiveHeaderBase header = record.getHeader();
        InputStream in = record.getInputStream();

        // Tracks whether the body below finished without throwing, so that a
        // failure while closing never hides the exception that caused it.
        boolean processedCleanly = false;
        try {
            String url = header.getUrl();
            if (url != null) {
                String mimetype = header.getMimetype();
                log.info("{} - {}", url, mimetype);
                // The mime-type is only tested when the URL already matched.
                if (urlMatcher.matcher(url).matches() && mimeMatcher.matcher(mimetype).matches()) {
                    copyBody(in, os, header);
                }
            }
            processedCleanly = true;
        } finally {
            closeRecordStream(in, processedCleanly);
        }
    }

    /**
     * Copies everything that can be read from the record stream to the output stream.
     *
     * @param in The input stream of the record being processed.
     * @param os The output stream receiving the record body.
     * @param header The header of the record; only used to build the error message.
     * @throws IOFailure If reading from the record or writing to the output stream fails.
     */
    private static void copyBody(InputStream in, OutputStream os, ArchiveHeaderBase header) throws IOFailure {
        try {
            byte[] buf = new byte[Constants.IO_BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = in.read(buf)) != -1) {
                os.write(buf, 0, bytesRead);
            }
        } catch (IOException e) {
            // TODO is getOffset() correct using the IA archiveReader?
            String message = "Error writing body of Archive entry '" + header.getArchiveFile() + "' offset '"
                    + header.getOffset() + "'";
            throw new IOFailure(message, e);
        }
    }

    /**
     * Closes the record stream.
     *
     * @param in The input stream of the record being processed.
     * @param processedCleanly True if the record was handled without an exception. In that case a failure to
     * close is reported as an IOFailure; otherwise it is only logged, so the exception already propagating is
     * not replaced.
     * @throws IOFailure If closing fails and no earlier failure is being propagated.
     */
    private static void closeRecordStream(InputStream in, boolean processedCleanly) throws IOFailure {
        try {
            in.close();
        } catch (IOException e) {
            String message = "Error closing Archive input stream";
            if (processedCleanly) {
                throw new IOFailure(message, e);
            }
            log.warn(message, e);
        }
    }

    /**
     * Method for post-processing the data. Currently does nothing.
     *
     * @param os The output stream to write the results of the post-processing data.
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Humanly readable description of this instance.
     *
     * @return The human readable description of this instance.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: URLMatcher = " + urlMatcher + ", mimeMatcher = " + mimeMatcher;
    }

}