/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.tools;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.jwat.common.ANVLRecord;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.tools.SimpleCmdlineTool;
import dk.netarkivet.common.tools.ToolRunnerBase;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This tool creates a CDX metadata file for a given job's jobID and harvestPrefix by running a batch job on the
 * bitarchive and processing the results to give a metadata file. Use option -w to select WARC output, and -a to select
 * ARC output: If no option available, then warc mode is selected.
 * <p>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1<br>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * <p>
 * The CDX records are slightly different from the ones produced normally. As we are not able to extract the timestamp
 * and harvestID from the (W)ARC filenames, this information is not part of the CDX URI.
 */
public class CreateCDXMetadataFile extends ToolRunnerBase {

    /** Output-mode constant for ARC metadata files. */
    public static final String ARCMODE = "arc";
    /** Output-mode constant for WARC metadata files (the default). */
    public static final String WARCMODE = "warc";
    /** One-line usage summary shown by listParameters(). */
    public static final String usageString = "[-a|w] --jobID X --harvestID Y --harvestnamePrefix somePrefix";

    /**
     * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a
     * metadata file for a job.
     *
     * @param argv Arguments to the tool: jobID harvestnamePrefix
     */
    public static void main(String[] argv) {
        new CreateCDXMetadataFile().runTheTool(argv);
    }

    /**
     * Create the tool instance.
     *
     * @return A new tool object.
     */
    protected SimpleCmdlineTool makeMyTool() {
        return new CreateCDXMetadataFileTool();
    }

    /**
     * The actual tool object that creates CDX files.
     */
    private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool {
        /** Write output mode. Is it arc or warc mode. */
        private boolean isWarcOutputMode;
        /** Which jobId to process. */
        private long jobId;
        /** Which harvestId to process. */
        private long harvestId;
        /** HarvestnamePrefix used to locate the files for the job. */
        private String harvestnamePrefix;

        /** The connection to the arc repository. */
        private ViewerArcRepositoryClient arcrep;

        /**
         * The file pattern that matches an ARC or WARC file name without the jobID. If combined with a
         * harvestnameprefix, this will match filenames that begin with the given harvestname prefix.
         */
        private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*";

        /**
         * Checks that a valid jobID were given. This does not check whether jobs actually exist for that ID.
         *
         * @param args The args given on the command line.
         * @return True if the args are legal.
         */
        public boolean checkArgs(String... args) {
            final String ARC_OPTION_KEY = "a";
            final String WARC_OPTION_KEY = "w";
            final String JOBID_OPTION_KEY = "jobID";
            final String HARVESTID_OPTION_KEY = "harvestID";
            final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix";

            // -a and -w are mutually exclusive; neither is required (WARC is the default).
            OptionGroup metadataGroup = new OptionGroup();
            Option arcOption = new Option(ARC_OPTION_KEY, false, "write an metadata ARC file");
            Option warcOption = new Option(WARC_OPTION_KEY, false, "write an metadata WARC file");
            metadataGroup.addOption(arcOption);
            metadataGroup.addOption(warcOption);
            metadataGroup.setRequired(false);

            OptionGroup jobIDGroup = new OptionGroup();
            Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID");
            jobIDGroup.addOption(jobIdOption);
            jobIDGroup.setRequired(true);

            OptionGroup harvestIDGroup = new OptionGroup();
            Option harvestIdOption = new Option(HARVESTID_OPTION_KEY, true, "The HarvestID");
            harvestIDGroup.addOption(harvestIdOption);
            harvestIDGroup.setRequired(true);

            Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix");
            OptionGroup harvestnamePrefixGroup = new OptionGroup();
            harvestnamePrefixGroup.addOption(harvestprefixOption);
            harvestnamePrefixGroup.setRequired(true);

            Options options = new Options();
            options.addOptionGroup(metadataGroup);
            options.addOptionGroup(jobIDGroup);
            options.addOptionGroup(harvestIDGroup);
            options.addOptionGroup(harvestnamePrefixGroup);
            String jobIdString = null;
            String harvestIdString = null;

            CommandLineParser parser = new PosixParser();
            CommandLine cli = null;
            try {
                cli = parser.parse(options, args);
            } catch (ParseException e) {
                // Covers MissingArgumentException/MissingOptionException as well (both are subclasses).
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            }

            isWarcOutputMode = true; // the default
            // Only need to check for the ARC option, as the WARC option cannot be set at the same time
            // It is either one or none of them.
            if (cli.hasOption(ARC_OPTION_KEY)) {
                isWarcOutputMode = false;
            }
            jobIdString = cli.getOptionValue(JOBID_OPTION_KEY);
            harvestIdString = cli.getOptionValue(HARVESTID_OPTION_KEY);
            this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY);

            try {
                this.jobId = Long.parseLong(jobIdString);
                if (jobId < 1) {
                    System.err.println("'" + jobIdString + "' is not a valid job ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + jobIdString + "' is not a valid job ID");
                return false;
            }

            try {
                this.harvestId = Long.parseLong(harvestIdString);
                if (harvestId < 1) {
                    System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                return false;
            }

            return true;
        }

        /**
         * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released
         * in tearDown, which is guaranteed to be run.
         *
         * @param args The arguments that were given on the command line (not used here)
         */
        public void setUp(String... args) {
            arcrep = ArcRepositoryClientFactory.getViewerInstance();
        }

        /**
         * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at
         * shutdown.
         */
        public void tearDown() {
            if (arcrep != null) {
                arcrep.close();
                // The JMS-backed client additionally holds a JMS connection that must be cleaned up.
                if (arcrep.getClass().getName()
                        .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) {
                    JMSConnectionFactory.getInstance().cleanup();
                }
            }
        }

        /**
         * The workhorse method of this tool: Runs the batch job, copies the result, then turns the result into a proper
         * metadata file. This method assumes that the args have already been read by the checkArgs method, and thus
         * jobId has been parsed, and the isWarcOutputMode established
         *
         * @param args Arguments given on the command line.
         */
        public void run(String... args) {
            final long jobID = this.jobId;
            final long harvestId = this.harvestId;
            final String harvestPrefix = this.harvestnamePrefix;
            FileBatchJob job = new ArchiveExtractCDXJob();
            Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc");
            final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN;

            System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile")
                    + " from file matching pattern '" + filePattern + "'.");
            job.processOnlyFilesMatching(filePattern);

            BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));
            if (status.hasResultFile()) {
                System.out.println("Got results from archive. Processing data");
                File resultFile = null;
                try {
                    resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir());
                    resultFile.deleteOnExit();
                    status.copyResults(resultFile);
                    arcifyResultFile(resultFile, jobID, harvestId);
                } catch (IOException e) {
                    throw new IOFailure("Error getting results for job " + jobID, e);
                } finally {
                    if (resultFile != null) {
                        FileUtils.remove(resultFile);
                    }
                }
            } else {
                // BUGFIX: message previously read "Got new results", inverting its meaning.
                System.err.println("Got no results from archive. Program ending now");
            }
        }

        /**
         * Turns a raw CDX file for the given jobID into a metadatafile containing the CDX lines in one archive record
         * per each ARC or WARC file indexed. The output is put into a file called &lt;jobID&gt;-metadata-1.arc.
         *
         * @param resultFile The CDX file returned by a ExtractCDXJob for the given jobID.
         * @param jobID The jobID we work on.
         * @param harvestID The harvestID used to derive the output metadata file name.
         * @throws IOException If an I/O error occurs, or the resultFile does not exist
         */
        private void arcifyResultFile(File resultFile, long jobID, long harvestID) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(resultFile));

            File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID), harvestID));
            System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'.");
            try {
                MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile);
                // All writer usage is inside this try so the writer is closed even if
                // insertWarcInfo or an early write fails.
                try {
                    if (writer instanceof MetadataFileWriterWarc) {
                        insertWarcInfo((MetadataFileWriterWarc) writer, jobID);
                    }
                    String line;
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    String lastFilename = null;
                    String newFilename = null;

                    while ((line = reader.readLine()) != null) {
                        // parse filename out of line
                        newFilename = parseLine(line, harvestnamePrefix);
                        if (newFilename == null) { // Bad line, try the next
                            continue;
                        }
                        if (lastFilename != null && !newFilename.equals(lastFilename)) {
                            // When we reach the end of a block of lines from
                            // one ARC/WARC file, we write those as a single entry.
                            // BUGFIX: the buffered lines belong to lastFilename, not the
                            // file whose first line we just read.
                            writeCDXEntry(writer, lastFilename, baos.toByteArray());
                            baos.reset();
                        }
                        // Pin UTF-8 so the output does not depend on the platform default charset.
                        baos.write(line.getBytes(StandardCharsets.UTF_8));
                        baos.write('\n');
                        lastFilename = newFilename;
                    }
                    // Flush the final block (lastFilename == newFilename at this point).
                    if (newFilename != null) {
                        writeCDXEntry(writer, newFilename, baos.toByteArray());
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        /**
         * Writes the warcinfo record at the start of the metadata WARC file, identifying the producing software,
         * host and job.
         *
         * @param writer The WARC writer to insert the info record into.
         * @param jobID The jobID this metadata file belongs to.
         */
        private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) {
            ANVLRecord infoPayload = new ANVLRecord();
            infoPayload.addLabelValue("software",
                    "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString(false) + "/"
                            + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
            infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
            infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
            infoPayload
                    .addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
            infoPayload.addLabelValue("isPartOf", "" + jobID);
            writer.insertInfoRecord(infoPayload);
        }

        /**
         * Utility method to parse out the parts of a CDX line. If a different harvestnamePrefix is found in the CDX
         * line than we're given, or the CDX line is unparsable, we print an error message and return null, expecting
         * processing to continue.
         *
         * @param line The line to parse.
         * @param harvestnamePrefix The prefix the archive filename in the line must start with.
         * @return The filename of the ARC/WARC file as mentioned in the given CDX line, or null if the filename
         *         didn't match the job we're working on.
         */
        private String parseLine(String line, String harvestnamePrefix) {
            try {
                String filename = new CDXRecord(line).getArcfile();
                if (!filename.startsWith(harvestnamePrefix)) {
                    System.err.println("Found CDX-entry with unexpected filename '" + filename
                            + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line);
                    return null;
                }
                return filename;
            } catch (NetarkivetException e) {
                System.err.println("Error parsing CDX line '" + line + "': " + e);
                return null;
            }
        }

        /**
         * Writes a full entry of CDX files to the ARCWriter.
         *
         * @param writer The writer we're currently writing to.
         * @param filename The filename of all the entries stored. This is used to generate the URI for the
         * entry.
         * @param bytes The bytes of the CDX records to be written under this entry.
         * @throws IOFailure if the write fails for any reason
         */
        private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure {
            try {
                writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(),
                        Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes);
            } catch (IOException e) {
                throw new IOFailure("Failed to write ARC/WARC entry with CDX lines for " + filename, e);
            }
        }

        /**
         * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool.
         *
         * @return String with description of parameters.
         */
        public String listParameters() {
            return usageString;
        }

        /**
         * Prints the three supported invocation forms to stderr. All forms require --jobID, --harvestID and
         * --harvestnamePrefix (checkArgs marks each of them required).
         */
        private static void printUsage() {
            // BUGFIX: the required --harvestID option was missing from these examples.
            System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
        }
    }
}