/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.common.utils.arc;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.format.arc.ARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.WriterPoolSettings;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.arc.ARCWriter;
import org.archive.io.arc.WriterPoolSettingsData;
import org.archive.util.ArchiveUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.InputStreamUtils;
import dk.netarkivet.common.utils.SystemUtils;

/**
 * Various utilities that do stuff that ARCWriter does not provide. Also includes method for converting an ARCRecord to
 * a byte array.
 */
public final class ARCUtils {

    /** The log. */
    private static final Logger log = LoggerFactory.getLogger(ARCUtils.class);

    /** Private constructor: this class only has static members and is not meant to be instantiated. */
    private ARCUtils() {
    }

    /**
     * Matches an HTTP status line such as {@code HTTP/1.1 404 Page has gone south}. Group 1 is the numeric status
     * code ({@code 404}); group 2 is the rest of the line ({@code Page has gone south}).
     */
    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");

    /** Key under which {@link #getHeadersFromARCFile(InputStream, Long)} stores the full HTTP status line. */
    public static final String RESPONSETEXT = "RESPONSETEXT";

    /**
     * Insert the contents of an ARC file (skipping the initial filedesc: header) in another ARC file.
     * <p>
     * The reader opened on {@code arcFile} is always closed before this method returns, whether or not the copy
     * succeeded.
     *
     * @param arcFile An ARC file to read.
     * @param aw A place to write the arc records
     * @throws ArgumentNotValid if either argument is null.
     * @throws IOFailure if there are problems reading the file or writing a record.
     */
    public static void insertARCFile(File arcFile, ARCWriter aw) {
        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
        ArgumentNotValid.checkNotNull(arcFile, "File arcFile");

        ARCReader reader;
        try {
            reader = ARCReaderFactory.get(arcFile);
        } catch (IOException e) {
            String message = "Error while copying ARC records from " + arcFile;
            log.warn(message, e);
            throw new IOFailure(message, e);
        }

        try {
            Iterator<ArchiveRecord> it = reader.iterator();
            // Skip ARC file header.
            // ARCReaderFactory guarantees the first record exists and is a
            // filedesc, or it would throw exception
            it.next();
            while (it.hasNext()) {
                copySingleRecord(aw, (ARCRecord) it.next());
            }
        } finally {
            // Release the underlying file handle. A failure to close is only logged so that it
            // cannot hide an exception thrown while copying.
            try {
                reader.close();
            } catch (IOException e) {
                log.warn("Unable to close ARC reader for " + arcFile, e);
            }
        }
    }

    /**
     * Writes the given ARCRecord on the given ARCWriter.
     * <p>
     * Note that the ARCWriter.write method takes the metadata fields as separate arguments instead of accepting an
     * ARCRecordMetaData object. It uses the ArchiveUtils.getDate method to convert an ARCstyle datestring to a Date
     * object.
     *
     * @param aw The ARCWriter to output the record on.
     * @param record The record to output
     * @throws IOFailure if anything goes wrong while reading the metadata or writing the record.
     * @see ArchiveUtils#getDate(java.lang.String)
     */
    private static void copySingleRecord(ARCWriter aw, ARCRecord record) {
        try {
            // Prepare metadata...
            ARCRecordMetaData meta = record.getMetaData();
            String uri = meta.getUrl();
            String mime = meta.getMimetype();
            String ip = meta.getIp();
            // Note the ArchiveUtils.getDate() converts an ARC-style datestring
            // to a Date object
            long timeStamp = ArchiveUtils.getDate(meta.getDate()).getTime();
            // ...and write the record's content into the writer.
            // The record itself is passed as the content stream.
            aw.write(uri, mime, ip, timeStamp, meta.getLength(), record);
        } catch (Exception e) {
            throw new IOFailure("Error occurred while writing an ARC record" + record, e);
        }
    }

    /**
     * Create new ARCWriter, writing to arcfile newFile.
     * <p>
     * If the writer cannot be created, the stream opened on {@code newFile} is closed again before the failure is
     * reported.
     *
     * @param newFile the ARCfile, that the ARCWriter writes to.
     * @return new ARCWriter, writing to arcfile newFile.
     * @throws IOFailure if the file cannot be opened or the ARCWriter cannot be created.
     */
    public static ARCWriter createARCWriter(File newFile) {
        ARCWriter aw;
        PrintStream ps = null;
        try {
            ps = new PrintStream(new FileOutputStream(newFile));
            // NOTE(review): the 'false' argument presumably disables compression, as in the
            // ARCWriter constructor call this replaced - confirm against WriterPoolSettingsData.
            WriterPoolSettings settings = new WriterPoolSettingsData(
                    ARCConstants.ARC_FILE_EXTENSION, null, ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, false, null, null);
            // This name is used for the first (file metadata) record
            aw = new ARCWriter(new AtomicInteger(), ps, newFile, settings);
        } catch (IOException e) {
            if (ps != null) {
                ps.close();
            }
            String message = "Could not create ARCWriter to file '" + newFile + "'.\n";
            // Log the cause as well, like the other methods of this class do.
            log.warn(message, e);
            throw new IOFailure(message, e);
        }
        return aw;
    }

    /**
     * Write a file to an ARC file. The writing is done by an existing ARCWriter. An ARCRecord will be added, which
     * contains a header and the contents of the file. The date of the record written will be set to the lastModified
     * value of the file being written.
     *
     * @param aw The ARCWriter doing the writing
     * @param file The file we want to write to the ARC file
     * @param uri The uri for the ARCRecord being written
     * @param mime The mimetype for the ARCRecord being written
     * @throws ArgumentNotValid if aw or file is null, or if uri or mime is null or empty.
     * @throws IOFailure if the file cannot be read or the record cannot be written.
     */
    public static void writeFileToARC(ARCWriter aw, File file, String uri, String mime) {
        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNullOrEmpty(uri, "String uri");
        ArgumentNotValid.checkNotNullOrEmpty(mime, "String mime");

        InputStream is = null;
        try {
            try {
                // Prepare metadata...
                String ip = SystemUtils.getLocalIP();
                long timeStamp = file.lastModified();
                long length = file.length();
                // ...and write the file's content into the writer
                is = new FileInputStream(file);
                aw.write(uri, mime, ip, timeStamp, length, is);
            } finally {
                // Always release the file handle, also when the write fails.
                if (is != null) {
                    is.close();
                }
            }
        } catch (IOException e) {
            String msg = "Error writing '" + file + "' to " + aw + " as " + uri;
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        }
    }

    /**
     * Return an ARCWriter suitable for the tools ArcMerge and ArcWrap.
     *
     * @param stream the given PrintStream.
     * @param destinationArcfile the given destination ARC file.
     * @return ARCWriter to be used by tools ArcMerge and ArcWrap
     * @throws IOException passed on from the ARCWriter constructor
     */
    public static ARCWriter getToolsARCWriter(PrintStream stream, File destinationArcfile) throws IOException {
        // Same writer settings as used by createARCWriter(File).
        WriterPoolSettings settings = new WriterPoolSettingsData(
                ARCConstants.ARC_FILE_EXTENSION, null, ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, false, null, null);
        return new ARCWriter(new AtomicInteger(), stream, destinationArcfile, settings);
    }

    /**
     * Read the contents of an ARC record into a byte array.
     *
     * @param in An ARC record to read from. After reading, the ARC Record will no longer have its own data available
     * for reading.
     * @return A byte array containing the contents of the ARC record. Note that the size of this may be different from
     * the size given in the ARC record metadata.
     * @throws ArgumentNotValid if in is null.
     * @throws IOFailure if the record is longer than Integer.MAX_VALUE (since we can't make bigger arrays).
     * @throws IOException If there is an error reading the data.
     */
    public static byte[] readARCRecord(ARCRecord in) throws IOException {
        ArgumentNotValid.checkNotNull(in, "ARCRecord in");
        if (in.getMetaData().getLength() > Integer.MAX_VALUE) {
            throw new IOFailure("ARC Record too long to fit in array: " + in.getMetaData().getLength() + " > "
                    + Integer.MAX_VALUE);
        }
        // read from stream
        // The arcreader has a number of "features" that complicates the read
        // 1) the record at offset 0, returns too large a length
        // 2) readfully does not work
        // 3) ARCRecord.read(buf, offset, length) is broken.
        // TODO verify if these "features" are still around: See bugs #903, #904,
        // #905
        int dataLength = (int) in.getMetaData().getLength();
        byte[] tmpbuffer = new byte[dataLength];
        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
        int bytesRead;
        int totalBytes = 0;
        // Read chunk by chunk until the announced length has been reached or the record is exhausted.
        for (; (totalBytes < dataLength) && ((bytesRead = in.read(buffer)) != -1); totalBytes += bytesRead) {
            System.arraycopy(buffer, 0, tmpbuffer, totalBytes, bytesRead);
        }
        // Check if the number of bytes read (= totalBytes) matches the
        // size of the buffer.
        if (tmpbuffer.length != totalBytes) {
            // make sure we only return an array with bytes we actually read
            byte[] truncateBuffer = new byte[totalBytes];
            System.arraycopy(tmpbuffer, 0, truncateBuffer, 0, totalBytes);
            return truncateBuffer;
        } else {
            return tmpbuffer;
        }
    }

    /**
     * Read the headers of a single ARC record from a stream into a map.
     * <p>
     * The first line must consist of exactly five space-separated fields, which are stored under the URL, IP, date,
     * mimetype and length keys of {@link ARCRecordMetaData}. If the second line is an HTTP status line, its status
     * code is stored under {@link ARCRecordMetaData#STATUSCODE_FIELD_KEY} and the whole line under
     * {@link #RESPONSETEXT}. Subsequent non-empty lines starting with {@code <} are split at the first colon into a
     * key and a value; reading stops at the first line that does not qualify (that line is consumed from the stream).
     * <p>
     * TODO write unit test.
     *
     * @param in pointing at start of ARC record.
     * @param offset into ARC file; stored as-is under {@link ARCRecordMetaData#ABSOLUTE_OFFSET_KEY}.
     * @return pairwise headers.
     * @throws IOException if fails to read ARC files or ARC files isn't valid.
     */
    public static Map<String, Object> getHeadersFromARCFile(InputStream in, Long offset) throws IOException {
        Map<String, Object> headers = new HashMap<String, Object>();
        // extra needed headers.
        headers.put(ARCRecordMetaData.VERSION_FIELD_KEY, "");
        headers.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, offset);

        String line = InputStreamUtils.readLine(in);
        if (line == null) {
            // readLine signals end of stream with null; report it as an invalid record instead of a NullPointerException.
            throw new IOException("Unexpected end of stream while reading ARC record header");
        }
        String[] tmp = line.split(" ");

        // decode header.
        if (tmp.length == 5) {
            headers.put(ARCRecordMetaData.URL_FIELD_KEY, tmp[0]);
            headers.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, tmp[1]);
            headers.put(ARCRecordMetaData.DATE_FIELD_KEY, tmp[2]);
            headers.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, tmp[3]);
            headers.put(ARCRecordMetaData.LENGTH_FIELD_KEY, tmp[4]);
        } else {
            throw new IOException("Does not include required metadata to be a valid ARC header: " + line);
        }

        // Matches rest of header lines.
        line = InputStreamUtils.readLine(in);
        if (line == null) {
            // The stream ended right after the ARC header line, so there is nothing more to parse.
            return headers;
        }
        Matcher m = HTTP_HEADER_PATTERN.matcher(line);
        if (m.matches()) {
            headers.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, m.group(1));
            // not valid META DATA
            headers.put(RESPONSETEXT, line);
        }

        /* arc/warc header */
        while ((line = InputStreamUtils.readLine(in)) != null && line.length() > 0 && line.startsWith("<")) {
            int index = line.indexOf(':');
            if (index == -1) {
                throw new IOException("Inputstream doesn't not point to valid ARC record");
            }
            // Everything after the colon is the value. Drop one separating whitespace character
            // only when it is actually there, so "key:value" keeps its first character and a line
            // ending in ':' yields an empty value instead of a StringIndexOutOfBoundsException.
            String value = line.substring(index + 1);
            if (value.length() > 0 && Character.isWhitespace(value.charAt(0))) {
                value = value.substring(1);
            }
            headers.put(line.substring(0, index), value);
        }

        return headers;
    }

    /**
     * Check if the filename belongs to an ARC file.
     *
     * @param filename a given filename
     * @return true, if the filename converted to lowercase ends with .arc or .arc.gz
     * @throws ArgumentNotValid if filename is null or empty.
     */
    public static boolean isARC(String filename) {
        ArgumentNotValid.checkNotNullOrEmpty(filename, "filename");
        String filenameLowercase = filename.toLowerCase();
        return (filenameLowercase.endsWith(".arc") || filenameLowercase.endsWith(".arc.gz"));
    }

}