001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.cdx;
024
025import java.io.UnsupportedEncodingException;
026import java.net.URLDecoder;
027
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032import dk.netarkivet.common.utils.StringUtils;
033
034/**
035 * Represents a line i a CDX-file. A CDX-file is an index over arcfiles, with fields for uri, ip, date, mimetype,
036 * length, arcfile, and offset in the file.
037 */
038public class CDXRecord {
039
040    /** The logger for this class. */
041    private static final Logger log = LoggerFactory.getLogger(CDXRecord.class);
042
043    /** The uri information in a CDX entry. */
044    private String url;
045    /** The ip information in a CDX entry. */
046    private String ip;
047    /** The date information in a CDX entry. */
048    private String date;
049    /** The mimetype information in a CDX entry. */
050    private String mimetype;
051    /** The length information in a CDX entry. */
052    private long length;
053    /** The arcfile information in a CDX entry. */
054    private String arcfile;
055    /** The offset information in a CDX entry. */
056    private long offset;
057
058    /**
059     * Helper method to avoid exception in URL decoding.
060     *
061     * @param s The string to unescape.
062     * @return the unescaped string.
063     */
064    private static String unescape(String s) {
065        try {
066            return URLDecoder.decode(s, "UTF-8");
067        } catch (UnsupportedEncodingException e) {
068            throw new ArgumentNotValid("UTF-8 is an unknown encoding. This should never happen!");
069        }
070    }
071
072    /**
073     * Compare two URLs for equality; first URL-unescaping (in UTF-8) all arguments in any query part.
074     *
075     * @param url1 The first URL
076     * @param url2 The second URL
077     * @return A boolean indicating whether the URLs are equal
078     */
079    public static boolean URLsEqual(String url1, String url2) {
080        ArgumentNotValid.checkNotNull(url1, "String uri1");
081        ArgumentNotValid.checkNotNull(url2, "String uri2");
082        boolean result = url1.equals(url2);
083        if (!result && url1.contains("?") && url2.contains("?")) {
084            // split at ? and compare prefix
085            String pre1 = url1.substring(0, url1.indexOf('?') + 1);
086            String post1 = url1.substring(url1.indexOf('?') + 1);
087            String pre2 = url2.substring(0, url2.indexOf('?') + 1);
088            String post2 = url2.substring(url2.indexOf('?') + 1);
089            if (pre1.equals(pre2)) {
090                String postdecode1 = unescape(post1);
091                String postdecode2 = unescape(post2);
092                result = (post1.equals(post2) || postdecode1.equals(postdecode2));
093            }
094        }
095        return result;
096    }
097
098    /**
099     * Constructor for class CDXRecord.
100     *
101     * @param fields the given fields of a line i CDX-format.
102     * @throws ArgumentNotValid if argument is null or number of fields is less than 7 or if length or offset does not
103     * contain long values.
104     */
105    public CDXRecord(String[] fields) {
106        ArgumentNotValid.checkNotNull(fields, "String[] fields");
107        if (fields.length >= 7) {
108            try {
109                this.url = fields[0];
110                this.ip = fields[1];
111                this.date = fields[2];
112                this.mimetype = fields[3];
113                this.length = Long.parseLong(fields[4]);
114                this.arcfile = fields[5];
115                this.offset = Long.parseLong(fields[6]);
116            } catch (NumberFormatException e) {
117                String message = "Could not make CDXRecord - out of fields " + StringUtils.conjoin(",", fields)
118                        + ". Length or offset was not a parsable long value.";
119                log.debug(message);
120                throw new ArgumentNotValid(message);
121            }
122        } else {
123            String message = "Could not make CDXRecord - out of " + fields.length + " fields: "
124                    + StringUtils.conjoin(",", fields);
125            log.debug(message);
126            throw new ArgumentNotValid(message);
127        }
128    }
129
130    /**
131     * Constructor, which tries to parse the given string as a CDXRecord.
132     *
133     * @param line a CDXline
134     */
135    public CDXRecord(String line) {
136        this(line.split(CDXReader.SEPARATOR_REGEX));
137    }
138
139    /**
140     * Get the given URL.
141     *
142     * @return the URL
143     */
144    public String getURL() {
145        return url;
146    }
147
148    /**
149     * Get the given IP.
150     *
151     * @return the IP
152     */
153    public String getIP() {
154        return ip;
155    }
156
157    /**
158     * Get the given date.
159     *
160     * @return the date
161     */
162    public String getDate() {
163        return date;
164    }
165
166    /**
167     * Get the given mimetype.
168     *
169     * @return The given mimetype
170     */
171    public String getMimetype() {
172        return mimetype;
173    }
174
175    /**
176     * Get the given length.
177     *
178     * @return The given length
179     */
180    public long getLength() {
181        return length;
182    }
183
184    /**
185     * Get the given arcfile.
186     *
187     * @return The given arcfile
188     */
189    public String getArcfile() {
190        return arcfile;
191    }
192
193    /**
194     * Get the given offset.
195     *
196     * @return The given offset
197     */
198    public long getOffset() {
199        return offset;
200    }
201
202}