/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.common.utils.cdx;

import java.io.IOException;
import java.io.OutputStream;
import java.util.regex.Pattern;

import org.archive.io.arc.ARCRecord;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.arc.ARCBatchJob;

/**
 * Batch job that extracts CDX records from metadata files.
 * <p>
 * Every ARC record whose URL and MIME type both match the configured patterns has its content copied
 * unchanged to the job's output stream.
 * <p>
 * NOTE(review): the "serial" suppression implies this class is Serializable through its supertype and
 * it declares no explicit serialVersionUID, so the private field names below are intentionally left
 * untouched (renaming them would presumably change the default serialVersionUID) - confirm before
 * renaming.
 */
@SuppressWarnings({"serial"})
public class GetCDXRecordsBatchJob extends ARCBatchJob {

    /** Pattern a record's URL must match for the record to be copied. */
    private final Pattern URLMatcher;
    /** Pattern a record's MIME type must match for the record to be copied. */
    private final Pattern mimeMatcher;

    /**
     * Creates the job, compiling the URL pattern from {@code Constants.ALL_PATTERN} and the MIME pattern
     * from {@code Constants.CDX_MIME_PATTERN}, and setting the batch job timeout to seven times
     * {@code Constants.ONE_DAY_IN_MILLIES}.
     */
    public GetCDXRecordsBatchJob() {
        URLMatcher = Pattern.compile(Constants.ALL_PATTERN);
        mimeMatcher = Pattern.compile(Constants.CDX_MIME_PATTERN);
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Initializes the job. This implementation has nothing to set up.
     *
     * @param os The output stream (ignored by this implementation)
     */
    public void initialize(OutputStream os) {
        // Intentionally empty.
    }

    /**
     * Copies the content of one ARC record to the output stream, provided the record's URL and MIME type
     * both match this job's patterns. Records that do not match are skipped without writing anything.
     *
     * @param sar The record to examine and, if it matches, copy
     * @param os The output stream receiving the record content
     * @throws IOFailure if reading the record or writing to the output stream fails
     */
    public void processRecord(ARCRecord sar, OutputStream os) {
        // The URL is tested first; the MIME type is only inspected when the URL matched.
        if (!URLMatcher.matcher(sar.getMetaData().getUrl()).matches()
                || !mimeMatcher.matcher(sar.getMetaData().getMimetype()).matches()) {
            return;
        }
        try {
            copyContent(sar, os);
            // TODO Should we close ARCRecord here??? The record is currently left open.
        } catch (IOException e) {
            String arcEntry = "Error writing body of ARC entry '" + sar.getMetaData().getArcFile();
            String message = arcEntry + "' offset '" + sar.getMetaData().getOffset() + "'";
            throw new IOFailure(message, e);
        }
    }

    /**
     * Finishes the job. This implementation has nothing to clean up.
     *
     * @param os The output stream (ignored by this implementation)
     */
    public void finish(OutputStream os) {
        // Intentionally empty.
    }

    /**
     * Pumps everything that can still be read from the record into the output stream.
     *
     * @param source The record to read from
     * @param sink The stream to write to
     * @throws IOException if reading or writing fails
     */
    private static void copyContent(ARCRecord source, OutputStream sink) throws IOException {
        byte[] chunk = new byte[Constants.IO_BUFFER_SIZE];
        // A read result of -1 signals that the record has no more data.
        for (int count = source.read(chunk); count != -1; count = source.read(chunk)) {
            sink.write(chunk, 0, count);
        }
    }

}