001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.distribute.arcrepository; 024 025import java.io.ByteArrayInputStream; 026import java.io.File; 027import java.io.FileOutputStream; 028import java.io.FilterInputStream; 029import java.io.IOException; 030import java.io.InputStream; 031import java.io.OutputStream; 032import java.io.Serializable; 033 034import org.archive.io.ArchiveReader; 035import org.archive.io.ArchiveReaderFactory; 036import org.archive.io.ArchiveRecord; 037import org.archive.io.arc.ARCRecord; 038import org.archive.io.warc.WARCRecord; 039import org.slf4j.Logger; 040import org.slf4j.LoggerFactory; 041 042import dk.netarkivet.common.CommonSettings; 043import dk.netarkivet.common.distribute.RemoteFile; 044import dk.netarkivet.common.distribute.RemoteFileFactory; 045import dk.netarkivet.common.exceptions.ArgumentNotValid; 046import dk.netarkivet.common.exceptions.IOFailure; 047import dk.netarkivet.common.exceptions.IllegalState; 048import dk.netarkivet.common.utils.FileUtils; 049import dk.netarkivet.common.utils.Settings; 050import dk.netarkivet.common.utils.arc.ARCUtils; 051import dk.netarkivet.common.utils.warc.WARCUtils; 052 053/** 054 * Class to hold the result of a lookup operation in the bitarchive: The metadata information associated with the record 055 * The actual byte content The name of the file the data were retrieved from If length of record exceeds value of 056 * Settings.BITARCHIVE_LIMIT_FOR_RECORD_DATATRANSFER_IN_FILE The record is stored in a RemoteFile. 057 */ 058@SuppressWarnings({"serial"}) 059public class BitarchiveRecord implements Serializable { 060 061 /** the log. */ 062 private static final transient Logger log = LoggerFactory.getLogger(BitarchiveRecord.class); 063 064 /** The file the data were retrieved from. */ 065 private String fileName; 066 067 /** The actual data. */ 068 private byte[] objectBuffer; 069 070 /** The offset of the ArchiveRecord contained. */ 071 private long offset; 072 073 /** The length of the ArchiveRecord contained. */ 074 private long length; 075 076 /** The actual data as a remote file. */ 077 private RemoteFile objectAsRemoteFile; 078 079 /** Is the data stored in a RemoteFile. */ 080 private boolean isStoredAsRemoteFile = false; 081 082 /** Set after deleting RemoteFile. */ 083 private boolean hasRemoteFileBeenDeleted = false; 084 085 /** How large the ARCRecord can before saving as RemoteFile. */ 086 private final long LIMIT_FOR_SAVING_DATA_IN_OBJECT_BUFFER = Settings 087 .getLong(CommonSettings.BITARCHIVE_LIMIT_FOR_RECORD_DATATRANSFER_IN_FILE); 088 089 /** 090 * Creates a BitarchiveRecord from the a ArchiveRecord, which can be either a ARCRecord or WARCRecord. Note that 091 * record metadata is not included with the BitarchiveRecord, only the payload of the record. 092 * <p> 093 * If the length of the record is higher than Settings .BITARCHIVE_LIMIT_FOR_RECORD_DATATRANSFER_IN_FILE the data is 094 * stored in a RemoteFile, otherwise the data is stored in a byte array. 095 * 096 * @param record the ArchiveRecord that the data should come from. We do not close the ArchiveRecord. 097 * @param filename The filename of the ArchiveFile 098 */ 099 public BitarchiveRecord(ArchiveRecord record, String filename) { 100 ArgumentNotValid.checkNotNull(record, "ArchiveRecord record"); 101 ArgumentNotValid.checkNotNull(filename, "String filename"); 102 this.fileName = filename; 103 this.offset = record.getHeader().getOffset(); 104 if (record instanceof ARCRecord) { 105 length = record.getHeader().getLength(); 106 } else if (record instanceof WARCRecord) { 107 // The length of the payload of the warc-record is not getLength(), 108 // but getLength minus getContentBegin(), which is the number of 109 // bytes used for the record-header! 110 length = record.getHeader().getLength() - record.getHeader().getContentBegin(); 111 } else { 112 throw new ArgumentNotValid("Unknown type of ArchiveRecord"); 113 } 114 if (length > LIMIT_FOR_SAVING_DATA_IN_OBJECT_BUFFER) { 115 // copy arc-data to local file and create a RemoteFile based on this 116 log.info("Record exceeds limit of {} bytes. Length is {} bytes, Storing as instance of {}", 117 LIMIT_FOR_SAVING_DATA_IN_OBJECT_BUFFER, length, Settings.get(CommonSettings.REMOTE_FILE_CLASS)); 118 if (RemoteFileFactory.isExtendedRemoteFile()) { 119 objectAsRemoteFile = RemoteFileFactory.getExtendedInstance(record); 120 isStoredAsRemoteFile = true; 121 } else { 122 File localTmpFile = null; 123 try { 124 localTmpFile = File.createTempFile("BitarchiveRecord-" + fileName, ".tmp", FileUtils.getTempDir()); 125 record.dump(new FileOutputStream(localTmpFile)); 126 objectAsRemoteFile = RemoteFileFactory.getMovefileInstance(localTmpFile); 127 isStoredAsRemoteFile = true; 128 } catch (IOException e) { 129 throw new IOFailure("Unable to store record(" + fileName + "," + offset + ") as remotefile", e); 130 } 131 } 132 } else { // Store data in objectbuffer 133 try { 134 if (record instanceof ARCRecord) { 135 objectBuffer = ARCUtils.readARCRecord((ARCRecord) record); 136 } else if (record instanceof WARCRecord) { 137 objectBuffer = WARCUtils.readWARCRecord((WARCRecord) record); 138 } 139 log.debug("Bytes stored in objectBuffer: {}", objectBuffer.length); 140 } catch (IOException e) { 141 throw new ExceptionInInitializerError(e); 142 } 143 } 144 } 145 146 /** 147 * Returns the file that this information was loaded from. 148 * 149 * @return the file that this ARC record comes from. 150 */ 151 public String getFile() { 152 return fileName; 153 } 154 155 /** 156 * Returns the length of the ARCRecord contained. 157 * 158 * @return the length of the ARCRecord contained 159 */ 160 public long getLength() { 161 return length; 162 } 163 164 /** 165 * Retrieve the data in the record. If data is in RemoteFile, this operation deletes the RemoteFile. 166 * 167 * @return the data from the ARCRecord as an InputStream. 168 * @throws IllegalState if remotefile already deleted 169 */ 170 public InputStream getData() { 171 InputStream result = null; 172 if (isStoredAsRemoteFile) { 173 if (hasRemoteFileBeenDeleted) { 174 throw new IllegalState("RemoteFile has already been deleted"); 175 } 176 log.info("Reading {} bytes from RemoteFile", length); 177 InputStream rfInputStream = objectAsRemoteFile.getInputStream(); 178 result = new FilterInputStream(rfInputStream) { 179 public void close() throws IOException { 180 super.close(); 181 objectAsRemoteFile.cleanup(); 182 hasRemoteFileBeenDeleted = true; 183 } 184 }; 185 } else { 186 log.debug("Reading {} bytes from objectBuffer", length); 187 result = new ByteArrayInputStream(objectBuffer); 188 } 189 return result; 190 } 191 192 /** 193 * Deliver the data in the record to a given OutputStream. If data is in RemoteFile, this operation deletes the 194 * RemoteFile 195 * 196 * @param out deliver the data to this outputstream 197 * @throws IOFailure if any IOException occurs reading or writing the data 198 * @throws IllegalState if remotefile already deleted 199 */ 200 public void getData(OutputStream out) { 201 ArgumentNotValid.checkNotNull(out, "OutputStream out"); 202 if (isStoredAsRemoteFile) { 203 if (hasRemoteFileBeenDeleted) { 204 throw new IllegalState("RemoteFile has already been deleted"); 205 } 206 try { 207 log.debug("Reading {} bytes from RemoteFile", length); 208 objectAsRemoteFile.appendTo(out); 209 } finally { 210 log.trace("Deleting the RemoteFile '{}'.", objectAsRemoteFile.getName()); 211 objectAsRemoteFile.cleanup(); 212 hasRemoteFileBeenDeleted = true; 213 } 214 } else { 215 try { 216 log.debug("Reading {} bytes from objectBuffer", length); 217 out.write(objectBuffer, 0, objectBuffer.length); 218 } catch (IOException e) { 219 throw new IOFailure("Unable to write data from " + "objectBuffer to the outputstream", e); 220 } 221 } 222 } 223 224 225 public static BitarchiveRecord getBitarchiveRecord(String filename, File f, long index) { 226 try (ArchiveReader reader = ArchiveReaderFactory.get(f, index); ArchiveRecord record = reader.get();){ 227 return new BitarchiveRecord(record, filename); 228 } catch (IOException e) { 229 throw new IOFailure("Error reading record from '" + filename + "' in file '"+f.getAbsolutePath()+"' offset " + index, e); 230 } 231 } 232 233}