/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.tools;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.jwat.common.ANVLRecord;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.tools.SimpleCmdlineTool;
import dk.netarkivet.common.tools.ToolRunnerBase;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This tool creates a CDX metadata file for a given job's jobID and harvestPrefix by running a batch job on the
 * bitarchive and processing the results to give a metadata file. Use option -w to select WARC output, and -a to select
 * ARC output: If no option available, then warc mode is selected.
 * <p>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * <p>
 * The CDX records are slightly different from the ones produced normally. As we are not able to extract the timestamp
 * and harvestID from the (W)ARC filenames, this information is not part of the CDX URI.
 */
public class CreateCDXMetadataFile extends ToolRunnerBase {

    /** Output-mode constant for ARC metadata files. */
    public static final String ARCMODE = "arc";
    /** Output-mode constant for WARC metadata files (the default). */
    public static final String WARCMODE = "warc";
    /** One-line usage summary shown by listParameters(). */
    public static final String usageString = "[-a|w] --jobID X --harvestID Y --harvestnamePrefix somePrefix";

    /**
     * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a
     * metadata file for a job.
     *
     * @param argv Arguments to the tool: jobID harvestnamePrefix
     */
    public static void main(String[] argv) {
        new CreateCDXMetadataFile().runTheTool(argv);
    }

    /**
     * Create the tool instance.
     *
     * @return A new tool object.
     */
    protected SimpleCmdlineTool makeMyTool() {
        return new CreateCDXMetadataFileTool();
    }

    /**
     * The actual tool object that creates CDX files.
     */
    private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool {
        /** Write output mode. Is it arc or warc mode. */
        private boolean isWarcOutputMode;
        /** Which jobId to process. */
        private long jobId;
        /** Which harvestId to process. */
        private long harvestId;
        /** HarvestnamePrefix used to locate the files for the job. */
        private String harvestnamePrefix;

        /** The connection to the arc repository. */
        private ViewerArcRepositoryClient arcrep;

        /**
         * The file pattern that matches an ARC or WARC file name without the jobID. If combined with a
         * harvestnameprefix, this will match filenames that begin with the given harvestname prefix.
         */
        private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*";

        /**
         * Checks that a valid jobID were given. This does not check whether jobs actually exist for that ID.
         *
         * @param args The args given on the command line.
         * @return True if the args are legal.
         */
        public boolean checkArgs(String... args) {
            final String ARC_OPTION_KEY = "a";
            final String WARC_OPTION_KEY = "w";
            final String JOBID_OPTION_KEY = "jobID";
            final String HARVESTID_OPTION_KEY = "harvestID";
            final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix";

            // -a and -w are mutually exclusive; neither is required (WARC is the default).
            OptionGroup metadataGroup = new OptionGroup();
            Option arcOption = new Option(ARC_OPTION_KEY, false, "write an metadata ARC file");
            Option warcOption = new Option(WARC_OPTION_KEY, false, "write an metadata WARC file");
            metadataGroup.addOption(arcOption);
            metadataGroup.addOption(warcOption);
            metadataGroup.setRequired(false);

            OptionGroup jobIDGroup = new OptionGroup();
            Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID");
            jobIDGroup.addOption(jobIdOption);
            jobIDGroup.setRequired(true);

            OptionGroup harvestIDGroup = new OptionGroup();
            Option harvestIdOption = new Option(HARVESTID_OPTION_KEY, true, "The HarvestID");
            harvestIDGroup.addOption(harvestIdOption);
            harvestIDGroup.setRequired(true);

            Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix");
            OptionGroup harvestnamePrefixGroup = new OptionGroup();
            harvestnamePrefixGroup.addOption(harvestprefixOption);
            harvestnamePrefixGroup.setRequired(true);

            Options options = new Options();
            options.addOptionGroup(metadataGroup);
            options.addOptionGroup(jobIDGroup);
            options.addOptionGroup(harvestIDGroup);
            options.addOptionGroup(harvestnamePrefixGroup);
            String jobIdString = null;
            String harvestIdString = null;

            CommandLineParser parser = new PosixParser();
            CommandLine cli = null;
            try {
                cli = parser.parse(options, args);
            } catch (ParseException e) {
                // Covers MissingArgumentException/MissingOptionException as well (both are subclasses).
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            }

            isWarcOutputMode = true; // the default
            // Only need to check for the ARC option, as the WARC option cannot be set at the same time
            // It is either one or none of them.
            if (cli.hasOption(ARC_OPTION_KEY)) {
                isWarcOutputMode = false;
            }
            jobIdString = cli.getOptionValue(JOBID_OPTION_KEY);
            harvestIdString = cli.getOptionValue(HARVESTID_OPTION_KEY);
            this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY);

            try {
                this.jobId = Long.parseLong(jobIdString);
                if (jobId < 1) {
                    System.err.println("'" + jobIdString + "' is not a valid job ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + jobIdString + "' is not a valid job ID");
                return false;
            }

            try {
                this.harvestId = Long.parseLong(harvestIdString);
                if (harvestId < 1) {
                    System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                return false;
            }

            return true;
        }

        /**
         * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released
         * in tearDown, which is guaranteed to be run.
         *
         * @param args The arguments that were given on the command line (not used here)
         */
        public void setUp(String... args) {
            arcrep = ArcRepositoryClientFactory.getViewerInstance();
        }

        /**
         * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at
         * shutdown.
         */
        public void tearDown() {
            if (arcrep != null) {
                arcrep.close();
                // The JMS-backed client additionally holds a JMS connection that must be cleaned up.
                if (arcrep.getClass().getName()
                        .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) {
                    JMSConnectionFactory.getInstance().cleanup();
                }
            }
        }

        /**
         * The workhorse method of this tool: Runs the batch job, copies the result, then turns the result into a proper
         * metadata file. This method assumes that the args have already been read by the checkArgs method, and thus
         * jobId has been parsed, and the isWarcOutputMode established
         *
         * @param args Arguments given on the command line.
         */
        public void run(String... args) {
            final long jobID = this.jobId;
            final long harvestId = this.harvestId;
            final String harvestPrefix = this.harvestnamePrefix;
            FileBatchJob job = new ArchiveExtractCDXJob();
            Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc");
            final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN;

            System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile")
                    + " from file matching pattern '" + filePattern + "'.");
            job.processOnlyFilesMatching(filePattern);

            BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));
            if (status.hasResultFile()) {
                System.out.println("Got results from archive. Processing data");
                File resultFile = null;
                try {
                    resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir());
                    resultFile.deleteOnExit();
                    status.copyResults(resultFile);
                    arcifyResultFile(resultFile, jobID, harvestId);
                } catch (IOException e) {
                    throw new IOFailure("Error getting results for job " + jobID, e);
                } finally {
                    if (resultFile != null) {
                        FileUtils.remove(resultFile);
                    }
                }
            } else {
                // BUGFIX: message previously read "Got new results", inverting its meaning.
                System.err.println("Got no results from archive. Program ending now");
            }
        }

        /**
         * Turns a raw CDX file for the given jobID into a metadatafile containing the CDX lines in one archive record
         * per each ARC or WARC file indexed. The output is put into a file called &lt;jobID&gt;-metadata-1.arc.
         *
         * @param resultFile The CDX file returned by a ExtractCDXJob for the given jobID.
         * @param jobID The jobID we work on.
         * @param harvestID The harvestID used to derive the output metadata file name.
         * @throws IOException If an I/O error occurs, or the resultFile does not exist
         */
        private void arcifyResultFile(File resultFile, long jobID, long harvestID) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(resultFile));

            File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID), harvestID));
            System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'.");
            try {
                MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile);
                // All writer usage is inside this try so the writer is closed even if
                // insertWarcInfo or an early write fails.
                try {
                    if (writer instanceof MetadataFileWriterWarc) {
                        insertWarcInfo((MetadataFileWriterWarc) writer, jobID);
                    }
                    String line;
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    String lastFilename = null;
                    String newFilename = null;

                    while ((line = reader.readLine()) != null) {
                        // parse filename out of line
                        newFilename = parseLine(line, harvestnamePrefix);
                        if (newFilename == null) { // Bad line, try the next
                            continue;
                        }
                        if (lastFilename != null && !newFilename.equals(lastFilename)) {
                            // When we reach the end of a block of lines from
                            // one ARC/WARC file, we write those as a single entry.
                            // BUGFIX: the buffered lines belong to lastFilename, not the
                            // file whose first line we just read.
                            writeCDXEntry(writer, lastFilename, baos.toByteArray());
                            baos.reset();
                        }
                        // Pin UTF-8 so the output does not depend on the platform default charset.
                        baos.write(line.getBytes(StandardCharsets.UTF_8));
                        baos.write('\n');
                        lastFilename = newFilename;
                    }
                    // Flush the final block (lastFilename == newFilename at this point).
                    if (newFilename != null) {
                        writeCDXEntry(writer, newFilename, baos.toByteArray());
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        /**
         * Writes the warcinfo record at the start of the metadata WARC file, identifying the producing software,
         * host and job.
         *
         * @param writer The WARC writer to insert the info record into.
         * @param jobID The jobID this metadata file belongs to.
         */
        private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) {
            ANVLRecord infoPayload = new ANVLRecord();
            infoPayload.addLabelValue("software",
                    "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString(false) + "/"
                            + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
            infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
            infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
            infoPayload
                    .addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
            infoPayload.addLabelValue("isPartOf", "" + jobID);
            writer.insertInfoRecord(infoPayload);
        }

        /**
         * Utility method to parse out the parts of a CDX line. If a different harvestnamePrefix is found in the CDX
         * line than we're given, or the CDX line is unparsable, we print an error message and return null, expecting
         * processing to continue.
         *
         * @param line The line to parse.
         * @param harvestnamePrefix The prefix the archive filename in the line must start with.
         * @return The filename of the ARC/WARC file as mentioned in the given CDX line, or null if the filename
         *         didn't match the job we're working on.
         */
        private String parseLine(String line, String harvestnamePrefix) {
            try {
                String filename = new CDXRecord(line).getArcfile();
                if (!filename.startsWith(harvestnamePrefix)) {
                    System.err.println("Found CDX-entry with unexpected filename '" + filename
                            + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line);
                    return null;
                }
                return filename;
            } catch (NetarkivetException e) {
                System.err.println("Error parsing CDX line '" + line + "': " + e);
                return null;
            }
        }

        /**
         * Writes a full entry of CDX files to the ARCWriter.
         *
         * @param writer The writer we're currently writing to.
         * @param filename The filename of all the entries stored. This is used to generate the URI for the
         * entry.
         * @param bytes The bytes of the CDX records to be written under this entry.
         * @throws IOFailure if the write fails for any reason
         */
        private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure {
            try {
                writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(),
                        Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes);
            } catch (IOException e) {
                throw new IOFailure("Failed to write ARC/WARC entry with CDX lines for " + filename, e);
            }
        }

        /**
         * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool.
         *
         * @return String with description of parameters.
         */
        public String listParameters() {
            return usageString;
        }

        /**
         * Prints the three supported invocation forms to stderr. All forms require --jobID, --harvestID and
         * --harvestnamePrefix (checkArgs marks each of them required).
         */
        private static void printUsage() {
            // BUGFIX: the required --harvestID option was missing from these examples.
            System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
        }
    }
}