/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.tools;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.jwat.common.ANVLRecord;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.tools.SimpleCmdlineTool;
import dk.netarkivet.common.tools.ToolRunnerBase;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This tool creates a CDX metadata file for a given job's jobID and harvestnamePrefix by running a batch job on the
 * bitarchive and processing the results into a metadata file. Use option -w to select WARC output and -a to select
 * ARC output; if neither option is given, WARC mode is used.
 * <p>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestnamePrefix 2-1
 * <p>
 * The CDX records are slightly different from those produced normally: as we are unable to extract the timestamp and
 * harvestID from the (W)ARC filenames, this information is not part of the CDX URI.
 */
public class CreateCDXMetadataFile extends ToolRunnerBase {

    public static final String ARCMODE = "arc";
    public static final String WARCMODE = "warc";
    public static final String usageString = "[-a|-w] --jobID X --harvestnamePrefix somePrefix";

    /**
     * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a
     * metadata file for a job.
     *
     * @param argv Arguments to the tool: [-a|-w] --jobID X --harvestnamePrefix somePrefix
     */
    public static void main(String[] argv) {
        new CreateCDXMetadataFile().runTheTool(argv);
    }

    /**
     * Create the tool instance.
     *
     * @return A new tool object.
     */
    protected SimpleCmdlineTool makeMyTool() {
        return new CreateCDXMetadataFileTool();
    }

    /**
     * The actual tool object that creates CDX files.
     */
    private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool {
        /** Output mode: true for WARC, false for ARC. */
        private boolean isWarcOutputMode;
        /** Which jobID to process. */
        private long jobId;
        /** The harvestnamePrefix used to locate the files for the job. */
        private String harvestnamePrefix;

        /** The connection to the arc repository. */
        private ViewerArcRepositoryClient arcrep;

        /**
         * The file pattern that matches the remainder of an ARC or WARC file name after the prefix. Combined with a
         * harvestnamePrefix, it matches all filenames that begin with that prefix.
         */
        private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*";

        /**
         * Checks that valid arguments were given, including a well-formed jobID. This does not check whether a job
         * actually exists for that ID.
         *
         * @param args The args given on the command line.
         * @return True if the args are legal.
         */
        public boolean checkArgs(String... args) {
            final String ARC_OPTION_KEY = "a";
            final String WARC_OPTION_KEY = "w";
            final String JOBID_OPTION_KEY = "jobID";
            final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix";

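            // Each required option is wrapped in its own required OptionGroup below; this is one way of making
            // commons-cli enforce the presence of a single option.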
            OptionGroup metadataGroup = new OptionGroup();
            Option arcOption = new Option(ARC_OPTION_KEY, false, "write a metadata ARC file");
            Option warcOption = new Option(WARC_OPTION_KEY, false, "write a metadata WARC file");
            metadataGroup.addOption(arcOption);
            metadataGroup.addOption(warcOption);
            metadataGroup.setRequired(false);
            OptionGroup jobIDGroup = new OptionGroup();
            Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID");
            jobIDGroup.addOption(jobIdOption);
            jobIDGroup.setRequired(true);

            Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix");
            OptionGroup harvestnamePrefixGroup = new OptionGroup();
            harvestnamePrefixGroup.addOption(harvestprefixOption);
            harvestnamePrefixGroup.setRequired(true);
            Options options = new Options();
            options.addOptionGroup(metadataGroup);
            options.addOptionGroup(jobIDGroup);
            options.addOptionGroup(harvestnamePrefixGroup);
            String jobIdString = null;

            CommandLineParser parser = new PosixParser();
            CommandLine cli = null;
            try {
                cli = parser.parse(options, args);
            } catch (ParseException e) {
                // Also covers MissingArgumentException, which is a subclass of ParseException.
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            }

            isWarcOutputMode = true; // the default
            // Only need to check for the ARC option, as the WARC option cannot be set at the same time.
            // It is either one or none of them.
            if (cli.hasOption(ARC_OPTION_KEY)) {
                isWarcOutputMode = false;
            }
            jobIdString = cli.getOptionValue(JOBID_OPTION_KEY);
            this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY);

            try {
                this.jobId = Long.parseLong(jobIdString);
                if (jobId < 1) {
                    System.err.println("'" + jobIdString + "' is not a valid job ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + jobIdString + "' is not a valid job ID");
                return false;
            }
            return true;
        }

        /**
         * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released
         * in tearDown, which is guaranteed to be run.
         *
         * @param args The arguments that were given on the command line (not used here)
         */
        public void setUp(String... args) {
            arcrep = ArcRepositoryClientFactory.getViewerInstance();
        }

        /**
         * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at
         * shutdown.
         */
        public void tearDown() {
            if (arcrep != null) {
                arcrep.close();
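                // The JMS-based repository client relies on a shared JMS connection; clean it up explicitly here.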
                if (arcrep.getClass().getName()
                        .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) {
                    JMSConnectionFactory.getInstance().cleanup();
                }
            }
        }

        /**
         * The workhorse method of this tool: runs the batch job, copies the result, then turns the result into a
         * proper metadata file. This method assumes that the args have already been read by the checkArgs method, so
         * that jobId has been parsed and isWarcOutputMode has been established.
         *
         * @param args Arguments given on the command line.
         */
        public void run(String... args) {
            final long jobID = this.jobId;
            final String harvestPrefix = this.harvestnamePrefix;
            FileBatchJob job = new ArchiveExtractCDXJob();
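            // Force the metadata format setting so that MetadataFileWriter.createWriter() further down produces a
            // writer for the requested format (WARC or ARC).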
            Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc");
            final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN;

            System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile")
                    + " from files matching pattern '" + filePattern + "'.");
            job.processOnlyFilesMatching(filePattern);

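            // Run the CDX extraction job on the replica configured in the settings.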
            BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));
            if (status.hasResultFile()) {
                System.out.println("Got results from archive. Processing data");
                File resultFile = null;
                try {
                    resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir());
                    resultFile.deleteOnExit();
                    status.copyResults(resultFile);
                    arcifyResultFile(resultFile, jobID);
                } catch (IOException e) {
                    throw new IOFailure("Error getting results for job " + jobID, e);
                } finally {
                    if (resultFile != null) {
                        FileUtils.remove(resultFile);
                    }
                }
            } else {
                System.err.println("Got no results from archive. Program ending now");
            }
        }

        /**
         * Turns a raw CDX file for the given jobID into a metadata file containing the CDX lines in one archive
         * record for each ARC or WARC file indexed. The output is put into a file called
         * &lt;jobID&gt;-metadata-1.arc (or .warc, depending on the output mode).
         *
         * @param resultFile The CDX file returned by an ArchiveExtractCDXJob for the given jobID.
         * @param jobID The jobID we work on.
         * @throws IOException If an I/O error occurs, or the resultFile does not exist
         */
        private void arcifyResultFile(File resultFile, long jobID) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(resultFile));

            File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID)));
            System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'.");
            try {
                MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile);
                if (writer instanceof MetadataFileWriterWarc) {
                    insertWarcInfo((MetadataFileWriterWarc) writer, jobID);
                }
                try {
                    String line;
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    String lastFilename = null;
                    String newFilename = null;

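                    // The batch output groups CDX lines by (W)ARC file, so collect lines in baos until the
                    // filename changes, then flush the collected block as a single archive record.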
                    while ((line = reader.readLine()) != null) {
                        // parse filename out of line
                        newFilename = parseLine(line, harvestnamePrefix);
                        if (newFilename == null) { // Bad line, try the next
                            continue;
                        }
                        if (lastFilename != null && !newFilename.equals(lastFilename)) {
                            // When we reach the end of a block of lines from
                            // one ARC/WARC file, we write those as a single entry
                            // under the filename they were collected for.
                            writeCDXEntry(writer, lastFilename, baos.toByteArray());
                            baos.reset();
                        }
                        baos.write(line.getBytes());
                        baos.write("\n".getBytes());
                        lastFilename = newFilename;
                    }
                    if (lastFilename != null) {
                        writeCDXEntry(writer, lastFilename, baos.toByteArray());
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

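        /**
         * Writes a warcinfo record describing this tool and the job to the given WARC writer.
         *
         * @param writer The WARC metadata writer to insert the info record into.
         * @param jobID The jobID the metadata file is part of.
         */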
        private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) {
            ANVLRecord infoPayload = new ANVLRecord();
            infoPayload.addLabelValue("software",
                    "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                            + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
            infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
            infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
            infoPayload.addLabelValue("conformsTo",
                    "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
            infoPayload.addLabelValue("isPartOf", "" + jobID);
            writer.insertInfoRecord(infoPayload);
        }

        /**
         * Utility method to parse out the parts of a CDX line. If the filename in the CDX line does not start with
         * the harvestnamePrefix we were given, or the CDX line is unparsable, we print an error message and return
         * null, expecting processing to continue.
         *
         * @param line The line to parse.
         * @param harvestnamePrefix The prefix that filenames in the CDX lines are expected to start with.
         * @return The filename of the (W)ARC file mentioned in the given CDX line, or null if the filename didn't
         * match the job we're working on.
         */
        private String parseLine(String line, String harvestnamePrefix) {
            try {
                String filename = new CDXRecord(line).getArcfile();
                if (!filename.startsWith(harvestnamePrefix)) {
                    System.err.println("Found CDX entry with unexpected filename '" + filename
                            + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line);
                    return null;
                }
                return filename;
            } catch (NetarkivetException e) {
                System.err.println("Error parsing CDX line '" + line + "': " + e);
                return null;
            }
        }

        /**
         * Writes the collected block of CDX lines for one ARC/WARC file as a single entry through the given writer.
         *
         * @param writer The writer we're currently writing to.
         * @param filename The filename of all the entries stored in baos. This is used to generate the URI for the
         * entry.
         * @param bytes The bytes of the CDX records to be written under this entry.
         * @throws IOFailure if the write fails for any reason
         */
        private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure {
            try {
                writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(),
                        Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes);
            } catch (IOException e) {
                throw new IOFailure("Failed to write ARC/WARC entry with CDX lines for " + filename, e);
            }
        }

        /**
         * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool.
         *
         * @return String with description of parameters.
         */
        public String listParameters() {
            return usageString;
        }

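        /**
         * Prints usage examples for this tool to System.err.
         */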
        private static void printUsage() {
            System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -w --jobID 2 --harvestnamePrefix 2-1");
            System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -a --jobID 2 --harvestnamePrefix 2-1");
            System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " --jobID 2 --harvestnamePrefix 2-1");
        }
    }
}