001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.tools; 025 026import java.io.BufferedReader; 027import java.io.ByteArrayOutputStream; 028import java.io.File; 029import java.io.FileReader; 030import java.io.IOException; 031 032import org.apache.commons.cli.CommandLine; 033import org.apache.commons.cli.CommandLineParser; 034import org.apache.commons.cli.MissingArgumentException; 035import org.apache.commons.cli.Option; 036import org.apache.commons.cli.OptionGroup; 037import org.apache.commons.cli.Options; 038import org.apache.commons.cli.ParseException; 039import org.apache.commons.cli.PosixParser; 040import org.jwat.common.ANVLRecord; 041 042import dk.netarkivet.common.CommonSettings; 043import dk.netarkivet.common.Constants; 044import dk.netarkivet.common.distribute.JMSConnectionFactory; 045import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 046import dk.netarkivet.common.distribute.arcrepository.BatchStatus; 047import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient; 048import dk.netarkivet.common.exceptions.IOFailure; 049import dk.netarkivet.common.exceptions.NetarkivetException; 050import dk.netarkivet.common.tools.SimpleCmdlineTool; 051import dk.netarkivet.common.tools.ToolRunnerBase; 052import dk.netarkivet.common.utils.FileUtils; 053import dk.netarkivet.common.utils.Settings; 054import dk.netarkivet.common.utils.SystemUtils; 055import dk.netarkivet.common.utils.batch.FileBatchJob; 056import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob; 057import dk.netarkivet.common.utils.cdx.CDXRecord; 058import dk.netarkivet.harvester.HarvesterSettings; 059import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter; 060import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc; 061 062/** 063 * This tool creates a CDX metadata file for a given job's jobID and harvestPrefix by running a batch job on the 064 * bitarchive and processing the results to give a metadata file. Use option -w to select WARC output, and -a to select 065 * ARC output: If no option available, then warc mode is selected 066 * <p> 067 * Usage: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile -w --jobID 2 --harvestnamePrefix 2-1 Usage: java 068 * dk.netarkivet.harvester.tools.CreateCDXMetadataFile -a --jobID 2 --harvestnamePrefix 2-1 Usage: java 069 * dk.netarkivet.harvester.tools.CreateCDXMetadataFile --jobID 2 --harvestnamePrefix 2-1 070 * <p> 071 * The CDX records is slightly different from the one produced normally. As we are not able to extract the timestamp, 072 * and harvestID from the (W) arcfilenames, this information is not part of the CXDURI. 073 */ 074public class CreateCDXMetadataFile extends ToolRunnerBase { 075 076 public static final String ARCMODE = "arc"; 077 public static final String WARCMODE = "warc"; 078 public static final String usageString = "[-a|w] --jobID X --harvestnamePrefix somePrefix"; 079 080 /** 081 * Main method. Creates and runs the tool object responsible for batching over the bitarchive and creating a 082 * metadata file for a job. 083 * 084 * @param argv Arguments to the tool: jobID harvestnamePrefix 085 */ 086 public static void main(String[] argv) { 087 new CreateCDXMetadataFile().runTheTool(argv); 088 } 089 090 /** 091 * Create the tool instance. 092 * 093 * @return A new tool object. 094 */ 095 protected SimpleCmdlineTool makeMyTool() { 096 return new CreateCDXMetadataFileTool(); 097 } 098 099 /** 100 * The actual tool object that creates CDX files. 101 */ 102 private static class CreateCDXMetadataFileTool implements SimpleCmdlineTool { 103 /** Write output mode. Is it arc or warc mode. */ 104 private boolean isWarcOutputMode; 105 /** Which jobId to process. */ 106 private long jobId; 107 /** HarvestnamePrefix used to locate the files for the job. */ 108 private String harvestnamePrefix; 109 110 /** The connection to the arc repository. */ 111 private ViewerArcRepositoryClient arcrep; 112 113 /** 114 * The file pattern that matches an ARC or WARC file name without the jobID. If combined with a 115 * harvestnameprefix, this will match filenames that begin with the given harvestname prefix. 116 */ 117 private static final String REMAINING_ARCHIVE_FILE_PATTERN = ".*"; 118 119 /** 120 * Checks that a valid jobID were given. This does not check whether jobs actually exist for that ID. 121 * 122 * @param args The args given on the command line. 123 * @return True if the args are legal. 124 */ 125 public boolean checkArgs(String... args) { 126 final String ARC_OPTION_KEY = "a"; 127 final String WARC_OPTION_KEY = "w"; 128 final String JOBID_OPTION_KEY = "jobID"; 129 final String HARVESTNAMEPREFIX_OPTION_KEY = "harvestnamePrefix"; 130 131 OptionGroup metadataGroup = new OptionGroup(); 132 Option arcOption = new Option(ARC_OPTION_KEY, false, "write an metadata ARC file"); 133 Option warcOption = new Option(WARC_OPTION_KEY, false, "write an metadata WARC file"); 134 metadataGroup.addOption(arcOption); 135 metadataGroup.addOption(warcOption); 136 metadataGroup.setRequired(false); 137 OptionGroup jobIDGroup = new OptionGroup(); 138 Option jobIdOption = new Option(JOBID_OPTION_KEY, true, "The JobID"); 139 jobIDGroup.addOption(jobIdOption); 140 jobIDGroup.setRequired(true); 141 142 Option harvestprefixOption = new Option(HARVESTNAMEPREFIX_OPTION_KEY, true, "The harvestnamePrefix"); 143 OptionGroup harvestnamePrefixGroup = new OptionGroup(); 144 harvestnamePrefixGroup.addOption(harvestprefixOption); 145 harvestnamePrefixGroup.setRequired(true); 146 Options options = new Options(); 147 options.addOptionGroup(metadataGroup); 148 options.addOptionGroup(jobIDGroup); 149 options.addOptionGroup(harvestnamePrefixGroup); 150 String jobIdString = null; 151 152 CommandLineParser parser = new PosixParser(); 153 CommandLine cli = null; 154 try { 155 cli = parser.parse(options, args); 156 } catch (MissingArgumentException e) { 157 System.err.println("Missing or wrong arguments given"); 158 printUsage(); 159 return false; 160 } catch (ParseException e) { 161 System.err.println("Missing or wrong arguments given"); 162 printUsage(); 163 return false; 164 } 165 166 isWarcOutputMode = true; // the default 167 // Only need to check for the ARC option, as the WARC option cannot be set at the same time 168 // It is either one or none of them. 169 if (cli.hasOption(ARC_OPTION_KEY)) { 170 isWarcOutputMode = false; 171 } 172 jobIdString = cli.getOptionValue(JOBID_OPTION_KEY); 173 this.harvestnamePrefix = cli.getOptionValue(HARVESTNAMEPREFIX_OPTION_KEY); 174 175 try { 176 this.jobId = Long.parseLong(jobIdString); 177 if (jobId < 1) { 178 System.err.println("'" + jobIdString + "' is not a valid job ID"); 179 return false; 180 } 181 } catch (NumberFormatException e) { 182 System.err.println("'" + jobIdString + "' is not a valid job ID"); 183 return false; 184 } 185 return true; 186 } 187 188 /** 189 * Create required resources here (the ArcRepositoryClient instance). Resources created here should be released 190 * in tearDown, which is guaranteed to be run. 191 * 192 * @param args The arguments that were given on the command line (not used here) 193 */ 194 public void setUp(String... args) { 195 arcrep = ArcRepositoryClientFactory.getViewerInstance(); 196 } 197 198 /** 199 * Closes all resources we are using, which is only the ArcRepositoryClient. This is guaranteed to be called at 200 * shutdown. 201 */ 202 public void tearDown() { 203 if (arcrep != null) { 204 arcrep.close(); 205 if (arcrep.getClass().getName() 206 .equals("dk.netarkivet.archive.arcrepository.distribute.JMSArcRepositoryClient")) { 207 JMSConnectionFactory.getInstance().cleanup(); 208 } 209 } 210 } 211 212 /** 213 * The workhorse method of this tool: Runs the batch job, copies the result, then turns the result into a proper 214 * metadata file. This method assumes that the args have already been read by the checkArgs method, and thus 215 * jobId has been parsed, and the isWarcOutputMode established 216 * 217 * @param args Arguments given on the command line. 218 */ 219 public void run(String... args) { 220 final long jobID = this.jobId; 221 final String harvestPrefix = this.harvestnamePrefix; 222 FileBatchJob job = new ArchiveExtractCDXJob(); 223 Settings.set(HarvesterSettings.METADATA_FORMAT, (isWarcOutputMode) ? "warc" : "arc"); 224 final String filePattern = harvestPrefix + REMAINING_ARCHIVE_FILE_PATTERN; 225 226 System.out.println("Creating cdx-" + ((isWarcOutputMode) ? "warcfile" : "arcfile") 227 + " from file matching pattern '" + filePattern + "'."); 228 job.processOnlyFilesMatching(filePattern); 229 230 BatchStatus status = arcrep.batch(job, Settings.get(CommonSettings.USE_REPLICA_ID)); 231 if (status.hasResultFile()) { 232 System.out.println("Got results from archive. Processing data"); 233 File resultFile = null; 234 try { 235 resultFile = File.createTempFile("extract-batch", ".cdx", FileUtils.getTempDir()); 236 resultFile.deleteOnExit(); 237 status.copyResults(resultFile); 238 arcifyResultFile(resultFile, jobID); 239 } catch (IOException e) { 240 throw new IOFailure("Error getting results for job " + jobID, e); 241 } finally { 242 if (resultFile != null) { 243 FileUtils.remove(resultFile); 244 } 245 } 246 } else { 247 System.err.println("Got new results from archive. Program ending now"); 248 } 249 } 250 251 /** 252 * Turns a raw CDX file for the given jobID into a metadatafile containing the CDX lines in one archive record 253 * per each ARC or WARC file indexed. The output is put into a file called <jobID>-metadata-1.arc. 254 * 255 * @param resultFile The CDX file returned by a ExtractCDXJob for the given jobID. 256 * @param jobID The jobID we work on. 257 * @throws IOException If an I/O error occurs, or the resultFile does not exist 258 */ 259 private void arcifyResultFile(File resultFile, long jobID) throws IOException { 260 BufferedReader reader = new BufferedReader(new FileReader(resultFile)); 261 262 File outputFile = new File(MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobID))); 263 System.out.println("Writing cdx to file '" + outputFile.getAbsolutePath() + "'."); 264 try { 265 MetadataFileWriter writer = MetadataFileWriter.createWriter(outputFile); 266 if (writer instanceof MetadataFileWriterWarc) { 267 insertWarcInfo((MetadataFileWriterWarc) writer, jobID); 268 } 269 try { 270 String line; 271 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 272 String lastFilename = null; 273 String newFilename = null; 274 275 while ((line = reader.readLine()) != null) { 276 // parse filename out of line 277 newFilename = parseLine(line, harvestnamePrefix); 278 if (newFilename == null) { // Bad line, try the next 279 continue; 280 } 281 if (lastFilename != null && !newFilename.equals(lastFilename)) { 282 // When we reach the end of a block of lines from 283 // one ARC/WARC file, we write those as a single entry. 284 writeCDXEntry(writer, newFilename, baos.toByteArray()); 285 baos.reset(); 286 } 287 baos.write(line.getBytes()); 288 baos.write("\n".getBytes()); 289 lastFilename = newFilename; 290 } 291 if (newFilename != null) { 292 writeCDXEntry(writer, newFilename, baos.toByteArray()); 293 } 294 } finally { 295 writer.close(); 296 } 297 } finally { 298 reader.close(); 299 } 300 } 301 302 private void insertWarcInfo(MetadataFileWriterWarc writer, Long jobID) { 303 ANVLRecord infoPayload = new ANVLRecord(); 304 infoPayload.addLabelValue("software", 305 "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/" 306 + dk.netarkivet.common.Constants.PROJECT_WEBSITE); 307 infoPayload.addLabelValue("ip", SystemUtils.getLocalIP()); 308 infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName()); 309 infoPayload 310 .addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); 311 infoPayload.addLabelValue("isPartOf", "" + jobID); 312 writer.insertInfoRecord(infoPayload); 313 } 314 315 /** 316 * Utility method to parse out the parts of a CDX line. If a different jobID is found in the CDX line than we're 317 * given, or the CDX line is unparsable, we print an error message and return null, expecting processing to 318 * continue. 319 * 320 * @param line The line to parse. 321 * @param harvestnamePrefix . 322 * @return An object containing the salient parts of the filename of the ARC file as mentioned in the given CDX 323 * line, or null if the filename didn't match the job we're working on. 324 */ 325 private String parseLine(String line, String harvestnamePrefix) { 326 try { 327 String filename = new CDXRecord(line).getArcfile(); 328 if (!filename.startsWith(harvestnamePrefix)) { 329 System.err.println("Found CXD-entry with unexpected filename '" + filename 330 + "': does not match harvestnamePrefix '" + harvestnamePrefix + "' in " + line); 331 return null; 332 } 333 return filename; 334 } catch (NetarkivetException e) { 335 System.err.println("Error parsing CDX line '" + line + "': " + e); 336 return null; 337 } 338 } 339 340 /** 341 * Writes a full entry of CDX files to the ARCWriter. 342 * 343 * @param writer The writer we're currently writing to. 344 * @param filename The filename of all the entries stored in baos. This is used to generate the URI for the 345 * entry. 346 * @param bytes The bytes of the CDX records to be written under this entry. 347 * @throws IOFailure if the write fails for any reason 348 */ 349 private void writeCDXEntry(MetadataFileWriter writer, String filename, byte[] bytes) throws IOFailure { 350 try { 351 writer.write(MetadataFileWriter.getAlternateCDXURI(this.jobId, filename).toString(), 352 Constants.CDX_MIME_TYPE, SystemUtils.getLocalIP(), System.currentTimeMillis(), bytes); 353 } catch (IOException e) { 354 throw new IOFailure("Failed to write ARC/WARC entry with CDX lines " + "for " + filename, e); 355 } 356 } 357 358 /** 359 * Return a string describing the parameters accepted by the CreateCDXMetadataFile tool. 360 * 361 * @return String with description of parameters. 362 */ 363 public String listParameters() { 364 return usageString; 365 } 366 367 private static void printUsage() { 368 System.err.println("Usage 1: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile" 369 + " -w --jobID 2 --harvestnamePrefix 2-1"); 370 System.err.println("Usage 2: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile" 371 + " -a --jobID 2 --harvestnamePrefix 2-1"); 372 System.err.println("Usage 3: java dk.netarkivet.harvester.tools.CreateCDXMetadataFile" 373 + " --jobID 2 --harvestnamePrefix 2-1"); 374 } 375 } 376}