001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.tools; 025 026import java.io.BufferedReader; 027import java.io.ByteArrayOutputStream; 028import java.io.File; 029import java.io.FileReader; 030import java.io.IOException; 031 032import org.apache.commons.cli.CommandLine; 033import org.apache.commons.cli.CommandLineParser; 034import org.apache.commons.cli.MissingArgumentException; 035import org.apache.commons.cli.Option; 036import org.apache.commons.cli.OptionGroup; 037import org.apache.commons.cli.Options; 038import org.apache.commons.cli.ParseException; 039import org.apache.commons.cli.PosixParser; 040import org.jwat.common.ANVLRecord; 041 042import dk.netarkivet.common.CommonSettings; 043import dk.netarkivet.common.Constants; 044import dk.netarkivet.common.distribute.JMSConnectionFactory; 045import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 046import dk.netarkivet.common.distribute.arcrepository.BatchStatus; 047import 
dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.tools.SimpleCmdlineTool;
import dk.netarkivet.common.tools.ToolRunnerBase;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This tool creates a CDX metadata file for a given job's jobID and harvestPrefix by running a batch job on the
 * bitarchive and processing the results to give a metadata file. Use option -w to select WARC output, and -a to select
 * ARC output. If neither option is given, WARC mode is selected.
 * <p>
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestID 5 --harvestnamePrefix 2-1
 * <p>
 * The CDX records are slightly different from the ones produced normally. As we are not able to extract the timestamp
 * and harvestID from the (W)ARC filenames, this information is not part of the CDX URI.
 */
public class CreateCDXMetadataFile extends ToolRunnerBase {

    /** Output mode name for ARC. */
    public static final String ARCMODE = "arc";
    /** Output mode name for WARC. */
    public static final String WARCMODE = "warc";
    /** Short usage synopsis listing all options accepted by this tool. */
    public static final String usageString = "[-a|w] --jobID X --harvestID Y --harvestnamePrefix somePrefix";

    /**
     * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a
     * metadata file for a job.
     *
     * @param argv Arguments to the tool: [-a|-w] --jobID X --harvestID Y --harvestnamePrefix somePrefix
     */
    public static void main(String[] argv) {
        new CreateCDXMetadataFile().runTheTool(argv);
    }

    /**
     * Create the tool instance.
     *
     * @return A new tool object.
     */
    protected SimpleCmdlineTool makeMyTool() {
        return new CreateCDXMetadataFileTool();
    }

    /**
     * The actual tool object that creates CDX files.
     */
    private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool {
        /** Write output mode. True for WARC (the default), false for ARC. */
        private boolean isWarcOutputMode;
        /** Which jobId to process. */
        private long jobId;
        /** The harvestID of the job to process. */
        private long harvestId;
        /** HarvestnamePrefix used to locate the files for the job. */
        private String harvestnamePrefix;

        /** The connection to the arc repository. Opened in setUp(), released in tearDown(). */
        private ViewerArcRepositoryClient arcrep;

        /**
         * The file pattern that matches an ARC or WARC file name without the jobID. If combined with a
         * harvestnameprefix, this will match filenames that begin with the given harvestname prefix.
         */
        private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*";

        /**
         * Checks that valid jobID and harvestID values were given. This does not check whether jobs actually exist
         * for those IDs.
         *
         * @param args The args given on the command line.
         * @return True if the args are legal.
         */
        public boolean checkArgs(String... args) {
            final String ARC_OPTION_KEY = "a";
            final String WARC_OPTION_KEY = "w";
            final String JOBID_OPTION_KEY = "jobID";
            final String HARVESTID_OPTION_KEY = "harvestID";
            final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix";

            // -a and -w are mutually exclusive; neither is required (WARC is the default).
            OptionGroup metadataGroup = new OptionGroup();
            Option arcOption = new Option(ARC_OPTION_KEY, false, "write a metadata ARC file");
            Option warcOption = new Option(WARC_OPTION_KEY, false, "write a metadata WARC file");
            metadataGroup.addOption(arcOption);
            metadataGroup.addOption(warcOption);
            metadataGroup.setRequired(false);

            // Each of the remaining options is wrapped in its own required group, so commons-cli
            // reports an error if any of them is missing.
            OptionGroup jobIDGroup = new OptionGroup();
            Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID");
            jobIDGroup.addOption(jobIdOption);
            jobIDGroup.setRequired(true);

            OptionGroup harvestIDGroup = new OptionGroup();
            Option harvestIdOption = new Option(HARVESTID_OPTION_KEY, true, "The HarvestID");
            harvestIDGroup.addOption(harvestIdOption);
            harvestIDGroup.setRequired(true);

            Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix");
            OptionGroup harvestnamePrefixGroup = new OptionGroup();
            harvestnamePrefixGroup.addOption(harvestprefixOption);
            harvestnamePrefixGroup.setRequired(true);

            Options options = new Options();
            options.addOptionGroup(metadataGroup);
            options.addOptionGroup(jobIDGroup);
            options.addOptionGroup(harvestIDGroup);
            options.addOptionGroup(harvestnamePrefixGroup);
            String jobIdString = null;
            String harvestIdString = null;

            CommandLineParser parser = new PosixParser();
            CommandLine cli = null;
            try {
                cli = parser.parse(options, args);
            } catch (MissingArgumentException e) {
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            } catch (ParseException e) {
                System.err.println("Missing or wrong arguments given");
                printUsage();
                return false;
            }

            isWarcOutputMode = true; // the default
            // Only need to check for the ARC option, as the WARC option cannot be set at the same time
            // It is either one or none of them.
            if (cli.hasOption(ARC_OPTION_KEY)) {
                isWarcOutputMode = false;
            }
            jobIdString = cli.getOptionValue(JOBID_OPTION_KEY);
            harvestIdString = cli.getOptionValue(HARVESTID_OPTION_KEY);
            this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY);

            try {
                this.jobId = Long.parseLong(jobIdString);
                if (jobId < 1) {
                    System.err.println("'" + jobIdString + "' is not a valid job ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + jobIdString + "' is not a valid job ID");
                return false;
            }

            try {
                this.harvestId = Long.parseLong(harvestIdString);
                if (harvestId < 1) {
                    System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                    return false;
                }
            } catch (NumberFormatException e) {
                System.err.println("'" + harvestIdString + "' is not a valid harvest ID");
                return false;
            }
            return true;
        }

        /**
         * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released
         * in tearDown, which is guaranteed to be run.
         *
         * @param args The arguments that were given on the command line (not used here)
         */
        public void setUp(String... args) {
            arcrep = ArcRepositoryClientFactory.getViewerInstance();
        }

        /**
         * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at
         * shutdown.
         */
        public void tearDown() {
            if (arcrep != null) {
                arcrep.close();
                // Only the JMS-based client needs the extra JMS connection cleanup; the class is
                // matched by name to avoid a compile-time dependency on the archive module.
                if (arcrep.getClass().getName()
                        .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) {
                    JMSConnectionFactory.getInstance().cleanup();
                }
            }
        }

        /**
         * The workhorse method of this tool: Runs the batch job, copies the result, then turns the result into a proper
         * metadata file. This method assumes that the args have already been read by the checkArgs method, and thus
         * jobId has been parsed, and the isWarcOutputMode established
         *
         * @param args Arguments given on the command line.
         */
        public void run(String... args) {
            final long jobID = this.jobId;
            final long harvestID = this.harvestId;
            final String harvestPrefix = this.harvestnamePrefix;
            FileBatchJob job = new ArchiveExtractCDXJob();
            // Make MetadataFileWriter.createWriter() below produce the requested archive format.
            Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc");
            final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN;

            System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile")
                    + " from file matching pattern '" + filePattern + "'.");
            job.processOnlyFilesMatching(filePattern);

            BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));
            if (status.hasResultFile()) {
                System.out.println("Got results from archive. Processing data");
                File resultFile = null;
                try {
                    resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir());
                    resultFile.deleteOnExit();
                    status.copyResults(resultFile);
                    arcifyResultFile(resultFile, jobID, harvestID);
                } catch (IOException e) {
                    throw new IOFailure("Error getting results for job " + jobID, e);
                } finally {
                    if (resultFile != null) {
                        FileUtils.remove(resultFile);
                    }
                }
            } else {
                // BUGFIX: this branch means the batch job produced NO result file; the message
                // previously said "Got new results from archive", which was misleading.
                System.err.println("Got no results from archive. Program ending now");
            }
        }

        /**
         * Turns a raw CDX file for the given jobID into a metadatafile containing the CDX lines in one archive record
         * per each ARC or WARC file indexed. The output is put into a file called &lt;jobID&gt;-metadata-1.arc (or
         * .warc, depending on the output mode).
         * <p>
         * NOTE(review): the grouping below assumes the CDX lines for each archive file appear consecutively in the
         * result file; lines for a file that re-appears later would produce a second record for it.
         *
         * @param resultFile The CDX file returned by a ExtractCDXJob for the given jobID.
         * @param jobID The jobID we work on.
         * @param harvestID The harvestID we work on; used to name the output metadata file.
         * @throws IOException If an I/O error occurs, or the resultFile does not exist
         */
        private void arcifyResultFile(File resultFile, long jobID, long harvestID) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(resultFile));

            File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID), harvestID));
            System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'.");
            try {
                MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile);
                if (writer instanceof MetadataFileWriterWarc) {
                    insertWarcInfo((MetadataFileWriterWarc) writer, jobID);
                }
                try {
                    String line;
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    String lastFilename = null;
                    String newFilename = null;

                    while ((line = reader.readLine()) != null) {
                        // parse filename out of line
                        newFilename = parseLine(line, harvestnamePrefix);
                        if (newFilename == null) { // Bad line, try the next
                            continue;
                        }
                        if (lastFilename != null && !newFilename.equals(lastFilename)) {
                            // When we reach the end of a block of lines from one ARC/WARC file,
                            // we write those as a single entry.
                            // BUGFIX: the buffered lines belong to the PREVIOUS file, so the entry
                            // must be written under lastFilename, not newFilename.
                            writeCDXEntry(writer, lastFilename, baos.toByteArray());
                            baos.reset();
                        }
                        // NOTE(review): line.getBytes() uses the platform default charset -- confirm
                        // the CDX output is ASCII-safe or switch to an explicit charset.
                        baos.write(line.getBytes());
                        baos.write("\n".getBytes());
                        lastFilename = newFilename;
                    }
                    // BUGFIX: flush the final entry keyed on lastFilename. Checking newFilename
                    // here could be null (and drop the entry) if the last line failed to parse.
                    if (lastFilename != null) {
                        writeCDXEntry(writer, lastFilename, baos.toByteArray());
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        /**
         * Writes the initial warcinfo record describing this tool and the job the metadata belongs to.
         *
         * @param writer The WARC writer to insert the info record into; must not yet have an info record.
         * @param jobID The jobID recorded in the "isPartOf" field.
         */
        private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) {
            ANVLRecord infoPayload = new ANVLRecord();
            infoPayload.addLabelValue("software",
                    "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                            + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
            infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
            infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
            infoPayload
                    .addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
            infoPayload.addLabelValue("isPartOf", "" + jobID);
            writer.insertInfoRecord(infoPayload);
        }

        /**
         * Utility method to parse out the parts of a CDX line. If a different harvestnamePrefix is found in the CDX
         * line than we're given, or the CDX line is unparsable, we print an error message and return null, expecting
         * processing to continue.
         *
         * @param line The line to parse.
         * @param harvestnamePrefix The prefix the filename in the line is expected to start with.
         * @return The filename of the ARC/WARC file as mentioned in the given CDX line, or null if the filename
         *         didn't match the job we're working on or the line could not be parsed.
         */
        private String parseLine(String line, String harvestnamePrefix) {
            try {
                String filename = new CDXRecord(line).getArcfile();
                if (!filename.startsWith(harvestnamePrefix)) {
                    System.err.println("Found CDX-entry with unexpected filename '" + filename
                            + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line);
                    return null;
                }
                return filename;
            } catch (NetarkivetException e) {
                System.err.println("Error parsing CDX line '" + line + "': " + e);
                return null;
            }
        }

        /**
         * Writes a full entry of CDX files to the ARCWriter.
         *
         * @param writer The writer we're currently writing to.
         * @param filename The filename of all the entries stored in baos. This is used to generate the URI for the
         * entry.
         * @param bytes The bytes of the CDX records to be written under this entry.
         * @throws IOFailure if the write fails for any reason
         */
        private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure {
            try {
                writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(),
                        Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes);
            } catch (IOException e) {
                throw new IOFailure("Failed to write ARC/WARC entry with CDX lines for " + filename, e);
            }
        }

        /**
         * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool.
         *
         * @return String with description of parameters.
         */
        public String listParameters() {
            return usageString;
        }

        /**
         * Prints the usage examples to stderr. Kept consistent with the required options enforced by
         * checkArgs (jobID, harvestID and harvestnamePrefix are all mandatory).
         */
        private static void printUsage() {
            System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -w --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " -a --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
            System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile"
                    + " --jobID 2 --harvestID 5 --harvestnamePrefix 2-1");
        }
    }
}