001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.tools; 025 026import java.io.BufferedReader; 027import java.io.ByteArrayOutputStream; 028import java.io.File; 029import java.io.FileReader; 030import java.io.IOException; 031 032import org.apache.commons.cli.CommandLine; 033import org.apache.commons.cli.CommandLineParser; 034import org.apache.commons.cli.MissingArgumentException; 035import org.apache.commons.cli.Option; 036import org.apache.commons.cli.OptionGroup; 037import org.apache.commons.cli.Options; 038import org.apache.commons.cli.ParseException; 039import org.apache.commons.cli.PosixParser; 040import org.jwat.common.ANVLRecord; 041 042import dk.netarkivet.common.CommonSettings; 043import dk.netarkivet.common.Constants; 044import dk.netarkivet.common.distribute.JMSConnectionFactory; 045import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 046import dk.netarkivet.common.distribute.arcrepository.BatchStatus; 047import 
dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.tools.SimpleCmdlineTool;
import dk.netarkivet.common.tools.ToolRunnerBase;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This tool creates a CDX metadata file for a given job's jobID and harvestPrefix by running a batch job on the
 * bitarchive and processing the results to give a metadata file. Use option -w to select WARC output, and -a to select
 * ARC output. If neither option is given, WARC mode is selected.
 * <p>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * <p>
 * The CDX records are slightly different from the ones produced normally. As we are not able to extract the timestamp
 * and harvestID from the (W)ARC filenames, this information is not part of the CDX URI.
 */
public class CreateCDXMetadataFile extends ToolRunnerBase {

    /** Output mode name for ARC. */
    public static final String ARCMODE = "arc";
    /** Output mode name for WARC. */
    public static final String WARCMODE = "warc";
    /** Short usage synopsis listing all options accepted by this tool. */
    public static final String usageString = "[-a|w] --jobID X --harvestID Y --harvestnamePrefix somePrefix";

    /**
     * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a
     * metadata file for a job.
     *
     * @param argv Arguments to the tool: [-a|-w] --jobID X --harvestID Y --harvestnamePrefix somePrefix
     */
    public static void main(String[] argv) {
        new CreateCDXMetadataFile().runTheTool(argv);
    }

    /**
     * Create the tool instance.
     *
     * @return A new tool object.
     */
    protected SimpleCmdlineTool makeMyTool() {
        return new CreateCDXMetadataFileTool();
    }

    /**
     * The actual tool object that creates CDX files.
     */
    private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool {
        /** Write output mode. True for WARC (the default), false for ARC. */
        private boolean isWarcOutputMode;
        /** Which jobId to process. */
        private long jobId;
        /** The harvestID of the job to process. */
        private long harvestId;
        /** HarvestnamePrefix used to locate the files for the job. */
        private String harvestnamePrefix;

        /** The connection to the arc repository. Opened in setUp(), released in tearDown(). */
        private ViewerArcRepositoryClient arcrep;

        /**
         * The file pattern that matches an ARC or WARC file name without the jobID. If combined with a
         * harvestnameprefix, this will match filenames that begin with the given harvestname prefix.
         */
        private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*";

        /**
         * Checks that valid jobID and harvestID values were given. This does not check whether jobs actually exist
         * for those IDs.
         *
         * @param args The args given on the command line.
         * @return True if the args are legal.
         */
        public boolean checkArgs(String... args) {
            final String ARC_OPTION_KEY = "a";
            final String WARC_OPTION_KEY = "w";
            final String JOBID_OPTION_KEY = "jobID";
            final String HARVESTID_OPTION_KEY = "harvestID";
            final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix";

            // -a and -w are mutually exclusive; neither is required (WARC is the default).
            OptionGroup metadataGroup = new OptionGroup();
            Option arcOption = new Option(ARC_OPTION_KEY, false, "write a metadata ARC file");
            Option warcOption = new Option(WARC_OPTION_KEY, false, "write a metadata WARC file");
            metadataGroup.addOption(arcOption);
            metadataGroup.addOption(warcOption);
            metadataGroup.setRequired(false);

            // Each of the remaining options is wrapped in its own required group, so commons-cli
            // reports an error if any of them is missing.
            OptionGroup jobIDGroup = new OptionGroup();
            Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID");
            jobIDGroup.addOption(jobIdOption);
            jobIDGroup.setRequired(true);

            OptionGroup harvestIDGroup = new OptionGroup();
            Option harvestIdOption = new Option(HARVESTID_OPTION_KEY, true, "The HarvestID");
            harvestIDGroup.addOption(harvestIdOption);
            harvestIDGroup.setRequired(true);

            Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix");
            OptionGroup harvestnamePrefixGroup = new OptionGroup();
            harvestnamePrefixGroup.addOption(harvestprefixOption);
            harvestnamePrefixGroup.setRequired(true);

            Options options = new Options();
            options.addOptionGroup(metadataGroup);
            options.addOptionGroup(jobIDGroup);
            options.addOptionGroup(harvestIDGroup);
            options.addOptionGroup(harvestnamePrefixGroup);
            String jobIdString = null;
            String harvestIdString = null;

            CommandLineParser parser = new PosixParser();
            CommandLine cli = null;
            try {
                cli = parser.parse(options, args);
            } catch (MissingArgumentException e) {
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            } catch (ParseException e) {
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            }

            isWarcOutputMode = true; // the default
            // Only need to check for the ARC option, as the WARC option cannot be set at the same time
            // It is either one or none of them.
            if (cli.hasOption(ARC_OPTION_KEY)) {
                isWarcOutputMode = false;
            }
            jobIdString = cli.getOptionValue(JOBID_OPTION_KEY);
            harvestIdString = cli.getOptionValue(HARVESTID_OPTION_KEY);
            this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY);

            try {
                this.jobId = Long.parseLong(jobIdString);
                if (jobId < 1) {
                    System.err.println("'" + jobIdString + "' is not a valid job ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + jobIdString + "' is not a valid job ID");
                return false;
            }

            try {
                this.harvestId = Long.parseLong(harvestIdString);
                if (harvestId < 1) {
                    System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                return false;
            }
            return true;
        }

        /**
         * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released
         * in tearDown, which is guaranteed to be run.
         *
         * @param args The arguments that were given on the command line (not used here)
         */
        public void setUp(String... args) {
            arcrep = ArcRepositoryClientFactory.getViewerInstance();
        }

        /**
         * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at
         * shutdown.
         */
        public void tearDown() {
            if (arcrep != null) {
                arcrep.close();
                // Only the JMS-based client needs the extra JMS connection cleanup; the class is
                // matched by name to avoid a compile-time dependency on the archive module.
                if (arcrep.getClass().getName()
                        .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) {
                    JMSConnectionFactory.getInstance().cleanup();
                }
            }
        }

        /**
         * The workhorse method of this tool: Runs the batch job, copies the result, then turns the result into a proper
         * metadata file. This method assumes that the args have already been read by the checkArgs method, and thus
         * jobId has been parsed, and the isWarcOutputMode established
         *
         * @param args Arguments given on the command line.
         */
        public void run(String... args) {
            final long jobID = this.jobId;
            final long harvestID = this.harvestId;
            final String harvestPrefix = this.harvestnamePrefix;
            FileBatchJob job = new ArchiveExtractCDXJob();
            // Make MetadataFileWriter.createWriter() below produce the requested archive format.
            Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc");
            final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN;

            System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile")
                    + " from file matching pattern '" + filePattern + "'.");
            job.processOnlyFilesMatching(filePattern);

            BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));
            if (status.hasResultFile()) {
                System.out.println("Got results from archive. Processing data");
                File resultFile = null;
                try {
                    resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir());
                    resultFile.deleteOnExit();
                    status.copyResults(resultFile);
                    arcifyResultFile(resultFile, jobID, harvestID);
                } catch (IOException e) {
                    throw new IOFailure("Error getting results for job " + jobID, e);
                } finally {
                    if (resultFile != null) {
                        FileUtils.remove(resultFile);
                    }
                }
            } else {
                // BUGFIX: this branch means the batch job produced NO result file; the message
                // previously said "Got new results from archive", which was misleading.
                System.err.println("Got no results from archive. Program ending now");
            }
        }

        /**
         * Turns a raw CDX file for the given jobID into a metadatafile containing the CDX lines in one archive record
         * per each ARC or WARC file indexed. The output is put into a file called &lt;jobID&gt;-metadata-1.arc (or
         * .warc, depending on the output mode).
         * <p>
         * NOTE(review): the grouping below assumes the CDX lines for each archive file appear consecutively in the
         * result file; lines for a file that re-appears later would produce a second record for it.
         *
         * @param resultFile The CDX file returned by a ExtractCDXJob for the given jobID.
         * @param jobID The jobID we work on.
         * @param harvestID The harvestID we work on; used to name the output metadata file.
         * @throws IOException If an I/O error occurs, or the resultFile does not exist
         */
        private void arcifyResultFile(File resultFile, long jobID, long harvestID) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(resultFile));

            File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID), harvestID));
            System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'.");
            try {
                MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile);
                if (writer instanceof MetadataFileWriterWarc) {
                    insertWarcInfo((MetadataFileWriterWarc) writer, jobID);
                }
                try {
                    String line;
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    String lastFilename = null;
                    String newFilename = null;

                    while ((line = reader.readLine()) != null) {
                        // parse filename out of line
                        newFilename = parseLine(line, harvestnamePrefix);
                        if (newFilename == null) { // Bad line, try the next
                            continue;
                        }
                        if (lastFilename != null && !newFilename.equals(lastFilename)) {
                            // When we reach the end of a block of lines from one ARC/WARC file,
                            // we write those as a single entry.
                            // BUGFIX: the buffered lines belong to the PREVIOUS file, so the entry
                            // must be written under lastFilename, not newFilename.
                            writeCDXEntry(writer, lastFilename, baos.toByteArray());
                            baos.reset();
                        }
                        // NOTE(review): line.getBytes() uses the platform default charset -- confirm
                        // the CDX output is ASCII-safe or switch to an explicit charset.
                        baos.write(line.getBytes());
                        baos.write("\n".getBytes());
                        lastFilename = newFilename;
                    }
                    // BUGFIX: flush the final entry keyed on lastFilename. Checking newFilename
                    // here could be null (and drop the entry) if the last line failed to parse.
                    if (lastFilename != null) {
                        writeCDXEntry(writer, lastFilename, baos.toByteArray());
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        /**
         * Writes the initial warcinfo record describing this tool and the job the metadata belongs to.
         *
         * @param writer The WARC writer to insert the info record into; must not yet have an info record.
         * @param jobID The jobID recorded in the "isPartOf" field.
         */
        private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) {
            ANVLRecord infoPayload = new ANVLRecord();
            infoPayload.addLabelValue("software",
                    "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                            + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
            infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
            infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
            infoPayload
                    .addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
            infoPayload.addLabelValue("isPartOf", "" + jobID);
            writer.insertInfoRecord(infoPayload);
        }

        /**
         * Utility method to parse out the parts of a CDX line. If a different harvestnamePrefix is found in the CDX
         * line than we're given, or the CDX line is unparsable, we print an error message and return null, expecting
         * processing to continue.
         *
         * @param line The line to parse.
         * @param harvestnamePrefix The prefix the filename in the line is expected to start with.
         * @return The filename of the ARC/WARC file as mentioned in the given CDX line, or null if the filename
         *         didn't match the job we're working on or the line could not be parsed.
         */
        private String parseLine(String line, String harvestnamePrefix) {
            try {
                String filename = new CDXRecord(line).getArcfile();
                if (!filename.startsWith(harvestnamePrefix)) {
                    System.err.println("Found CDX-entry with unexpected filename '" + filename
                            + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line);
                    return null;
                }
                return filename;
            } catch (NetarkivetException e) {
                System.err.println("Error parsing CDX line '" + line + "': " + e);
                return null;
            }
        }

        /**
         * Writes a full entry of CDX files to the ARCWriter.
         *
         * @param writer The writer we're currently writing to.
         * @param filename The filename of all the entries stored in baos. This is used to generate the URI for the
         * entry.
         * @param bytes The bytes of the CDX records to be written under this entry.
         * @throws IOFailure if the write fails for any reason
         */
        private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure {
            try {
                writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(),
                        Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes);
            } catch (IOException e) {
                throw new IOFailure("Failed to write ARC/WARC entry with CDX lines for " + filename, e);
            }
        }

        /**
         * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool.
         *
         * @return String with description of parameters.
         */
        public String listParameters() {
            return usageString;
        }

        /**
         * Prints the usage examples to stderr. Kept consistent with the required options enforced by
         * checkArgs (jobID, harvestID and harvestnamePrefix are all mandatory).
         */
        private static void printUsage() {
            System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
        }
    }
}