001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.utils.archive; 024 025import java.io.File; 026import java.text.DateFormat; 027import java.text.ParseException; 028import java.util.Collections; 029import java.util.Date; 030import java.util.HashMap; 031import java.util.Iterator; 032import java.util.Map; 033import java.util.Set; 034 035import org.archive.io.ArchiveRecord; 036import org.archive.io.ArchiveRecordHeader; 037import org.archive.io.arc.ARCRecord; 038import org.archive.io.warc.WARCRecord; 039import org.slf4j.Logger; 040import org.slf4j.LoggerFactory; 041 042import dk.netarkivet.common.exceptions.ArgumentNotValid; 043 044/** 045 * Heritrix wrapper implementation of the abstract archive header interface. 046 */ 047@SuppressWarnings({"unchecked"}) 048public class HeritrixArchiveHeaderWrapper extends ArchiveHeaderBase { 049 050 /** The logger for this class. */ 051 private static final Logger log = LoggerFactory.getLogger(HeritrixArchiveHeaderWrapper.class); 052 053 /** Reuse the sme WARC <code>DateFormat</code> object. */ 054 protected DateFormat warcDateFormat = ArchiveDateConverter.getWarcDateFormat(); 055 056 /** Reuse the same ARC <code>DateFormat</code> object. */ 057 protected DateFormat arcDateFormat = ArchiveDateConverter.getArcDateFormat(); 058 059 /** Wrapper Heritrix header. */ 060 protected HeritrixArchiveRecordWrapper recordWrapper; 061 062 /** Original Heritrix header object. */ 063 protected ArchiveRecordHeader header; 064 065 /** 066 * Map of header fields extracted from the Heritrix header. Only difference is that the keys are normalized to lower 067 * case. 068 */ 069 protected Map<String, Object> headerFields = new HashMap<String, Object>(); 070 071 /** 072 * Construct a Heritrix record header wrapper object. 073 * 074 * @param recordWrapper wrapped Heritrix header 075 * @param record original Heritrix record 076 * @return wrapped Heritrix record header 077 */ 078 public static HeritrixArchiveHeaderWrapper wrapArchiveHeader(HeritrixArchiveRecordWrapper recordWrapper, 079 ArchiveRecord record) { 080 // ArgumentNotValid.checkNotNull(recordWrapper, "recordWrapper"); 081 ArgumentNotValid.checkNotNull(record, "record"); 082 HeritrixArchiveHeaderWrapper headerWrapper = new HeritrixArchiveHeaderWrapper(); 083 headerWrapper.recordWrapper = recordWrapper; 084 headerWrapper.header = record.getHeader(); 085 Map<String, Object> heritrixHeaderFields = (Map<String, Object>) headerWrapper.header.getHeaderFields(); 086 Iterator<Map.Entry<String, Object>> iter = heritrixHeaderFields.entrySet().iterator(); 087 Map.Entry<String, Object> entry; 088 while (iter.hasNext()) { 089 entry = iter.next(); 090 headerWrapper.headerFields.put(entry.getKey().toLowerCase(), entry.getValue()); 091 } 092 if (record instanceof ARCRecord) { 093 headerWrapper.bIsArc = true; 094 } else if (record instanceof WARCRecord) { 095 headerWrapper.bIsWarc = true; 096 } else { 097 throw new ArgumentNotValid("Unsupported ArchiveRecord type: " + record.getClass().getName()); 098 } 099 return headerWrapper; 100 } 101 102 @Override 103 public Object getHeaderValue(String key) { 104 return headerFields.get(key.toLowerCase()); 105 } 106 107 @Override 108 public String getHeaderStringValue(String key) { 109 Object tmpObj = headerFields.get(key.toLowerCase()); 110 String str; 111 if (tmpObj != null) { 112 str = tmpObj.toString(); 113 } else { 114 str = null; 115 } 116 return str; 117 } 118 119 @Override 120 public Set<String> getHeaderFieldKeys() { 121 return Collections.unmodifiableSet(headerFields.keySet()); 122 } 123 124 @Override 125 public Map<String, Object> getHeaderFields() { 126 return Collections.unmodifiableMap(headerFields); 127 } 128 129 /* 130 * The following fields do not need converting. 131 */ 132 133 @Override 134 public String getVersion() { 135 return header.getVersion(); 136 } 137 138 @Override 139 public String getReaderIdentifier() { 140 return header.getReaderIdentifier(); 141 } 142 143 @Override 144 public String getRecordIdentifier() { 145 return header.getRecordIdentifier(); 146 } 147 148 @Override 149 public String getUrl() { 150 return header.getUrl(); 151 } 152 153 @Override 154 public String getIp() { 155 Object tmpObj = getHeaderValue("WARC-IP-Address"); 156 String ip; 157 if (tmpObj != null) { 158 ip = tmpObj.toString(); 159 } else { 160 ip = null; 161 } 162 return ip; 163 } 164 165 @Override 166 public long getOffset() { 167 return header.getOffset(); 168 } 169 170 @Override 171 public long getLength() { 172 return header.getLength(); 173 } 174 175 /* 176 * Conversion required. 177 */ 178 179 @Override 180 public Date getDate() { 181 String dateStr = header.getDate(); 182 Date date = null; 183 try { 184 if (bIsArc) { 185 date = arcDateFormat.parse(dateStr); 186 } else if (bIsWarc) { 187 date = warcDateFormat.parse(dateStr); 188 } 189 } catch (ParseException e) { 190 log.info("Archive date could not be parsed: '{}'.", dateStr); 191 } 192 return date; 193 } 194 195 @Override 196 public String getArcDateStr() { 197 String dateStr = header.getDate(); 198 if (bIsWarc) { 199 try { 200 Date warcDate = warcDateFormat.parse(dateStr); 201 dateStr = arcDateFormat.format(warcDate); 202 return dateStr; 203 } catch (Exception e) { 204 log.info("Archive date could not be parsed: {}.", dateStr); 205 } 206 } 207 return dateStr; 208 } 209 210 @Override 211 public String getMimetype() { 212 return header.getMimetype(); 213 } 214 215 @Override 216 public File getArchiveFile() { 217 return new File(header.getReaderIdentifier()); 218 } 219 220}