/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.cdx;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.StringUtils;

/**
 * Represents a line in a CDX-file. A CDX-file is an index over arcfiles, with fields for uri, ip, date, mimetype,
 * length, arcfile, and offset in the file.
 * <p>
 * Instances are immutable once constructed.
 */
public class CDXRecord {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(CDXRecord.class);

    /** The minimum number of fields a CDX line must contain to be parsed into a record. */
    private static final int MIN_FIELD_COUNT = 7;

    /** The uri information in a CDX entry. */
    private final String url;
    /** The ip information in a CDX entry. */
    private final String ip;
    /** The date information in a CDX entry. */
    private final String date;
    /** The mimetype information in a CDX entry. */
    private final String mimetype;
    /** The length information in a CDX entry. */
    private final long length;
    /** The arcfile information in a CDX entry. */
    private final String arcfile;
    /** The offset information in a CDX entry. */
    private final long offset;

    /**
     * Helper method to avoid exceptions in URL decoding.
     * <p>
     * If the string contains a malformed percent-escape (for instance <code>%zz</code> or a trailing
     * <code>%</code>), it cannot be decoded; in that case the string is returned unchanged so that callers can still
     * compare it literally.
     *
     * @param s The string to unescape.
     * @return the unescaped string, or <code>s</code> itself if it is not validly escaped.
     */
    private static String unescape(String s) {
        try {
            return URLDecoder.decode(s, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new ArgumentNotValid("UTF-8 is an unknown encoding. This should never happen!");
        } catch (IllegalArgumentException e) {
            // URLDecoder rejects incomplete or non-hexadecimal escapes; fall back to the raw text
            // rather than letting a comparison of two URLs blow up.
            log.debug("Could not URL-decode '" + s + "'; using it undecoded.", e);
            return s;
        }
    }

    /**
     * Compare two URLs for equality; first URL-unescaping (in UTF-8) all arguments in any query part.
     * <p>
     * Two URLs are equal if they are identical strings, or if both contain a query part, everything up to and
     * including the first '?' is identical, and the remainders are identical after URL-decoding. A remainder that
     * cannot be decoded is compared undecoded.
     *
     * @param url1 The first URL
     * @param url2 The second URL
     * @return A boolean indicating whether the URLs are equal
     * @throws ArgumentNotValid if either argument is null.
     */
    public static boolean URLsEqual(String url1, String url2) {
        ArgumentNotValid.checkNotNull(url1, "String uri1");
        ArgumentNotValid.checkNotNull(url2, "String uri2");
        if (url1.equals(url2)) {
            return true;
        }
        int queryStart1 = url1.indexOf('?');
        int queryStart2 = url2.indexOf('?');
        if (queryStart1 < 0 || queryStart2 < 0) {
            // Without a query part in both URLs only literal equality counts, and that already failed.
            return false;
        }
        // Compare the prefixes, including the '?' itself.
        String prefix1 = url1.substring(0, queryStart1 + 1);
        String prefix2 = url2.substring(0, queryStart2 + 1);
        if (!prefix1.equals(prefix2)) {
            return false;
        }
        String query1 = unescape(url1.substring(queryStart1 + 1));
        String query2 = unescape(url2.substring(queryStart2 + 1));
        return query1.equals(query2);
    }

    /**
     * Splits a CDX line into its fields, validating the argument first.
     *
     * @param line a CDX line.
     * @return the fields of the line.
     * @throws ArgumentNotValid if the line is null.
     */
    private static String[] splitLine(String line) {
        ArgumentNotValid.checkNotNull(line, "String line");
        return line.split(CDXReader.SEPARATOR_REGEX);
    }

    /**
     * Constructor for class CDXRecord.
     * <p>
     * The fields are read in the order url, ip, date, mimetype, length, arcfile, offset. Any fields beyond the
     * seventh are ignored.
     *
     * @param fields the given fields of a line in CDX-format.
     * @throws ArgumentNotValid if argument is null or number of fields is less than 7 or if length or offset does not
     * contain long values.
     */
    public CDXRecord(String[] fields) {
        ArgumentNotValid.checkNotNull(fields, "String[] fields");
        if (fields.length < MIN_FIELD_COUNT) {
            String message = "Could not make CDXRecord - out of " + fields.length + " fields: "
                    + StringUtils.conjoin(",", fields);
            log.debug(message);
            throw new ArgumentNotValid(message);
        }

        // Parse the numeric fields first so that no state is assigned unless the whole line is valid.
        long parsedLength;
        long parsedOffset;
        try {
            parsedLength = Long.parseLong(fields[4]);
            parsedOffset = Long.parseLong(fields[6]);
        } catch (NumberFormatException e) {
            String message = "Could not make CDXRecord - out of fields " + StringUtils.conjoin(",", fields)
                    + ". Length or offset was not a parsable long value.";
            // Keep the parse failure in the log; the thrown exception carries only the message.
            log.debug(message, e);
            throw new ArgumentNotValid(message);
        }

        this.url = fields[0];
        this.ip = fields[1];
        this.date = fields[2];
        this.mimetype = fields[3];
        this.length = parsedLength;
        this.arcfile = fields[5];
        this.offset = parsedOffset;
    }

    /**
     * Constructor, which tries to parse the given string as a CDXRecord.
     *
     * @param line a CDXline
     * @throws ArgumentNotValid if the line is null, has fewer than 7 fields, or if length or offset does not contain
     * long values.
     */
    public CDXRecord(String line) {
        this(splitLine(line));
    }

    /**
     * Get the given URL.
     *
     * @return the URL
     */
    public String getURL() {
        return url;
    }

    /**
     * Get the given IP.
     *
     * @return the IP
     */
    public String getIP() {
        return ip;
    }

    /**
     * Get the given date.
     *
     * @return the date
     */
    public String getDate() {
        return date;
    }

    /**
     * Get the given mimetype.
     *
     * @return The given mimetype
     */
    public String getMimetype() {
        return mimetype;
    }

    /**
     * Get the given length.
     *
     * @return The given length
     */
    public long getLength() {
        return length;
    }

    /**
     * Get the given arcfile.
     *
     * @return The given arcfile
     */
    public String getArcfile() {
        return arcfile;
    }

    /**
     * Get the given offset.
     *
     * @return The given offset
     */
    public long getOffset() {
        return offset;
    }

}