package dk.netarkivet.harvester.tools;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import dk.netarkivet.common.utils.batch.BatchLocalFiles;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.viewerproxy.webinterface.CrawlLogLinesMatchingRegexp;

/**
 * Command-line tool that finds the relevant crawllog lines for a specific domain in a specific metadata file.
 * <p>
 * Arguments: {@code domain metadatafile}
 * <p>
 * Note: currently the regexp is embedded in the jsp page harvester/qa-gui/src/main/webapp/QA-searchcrawllog.jsp
 * but should probably be moved to the Reporting class
 * ./harvester/harvester-core/src/main/java/dk/netarkivet/viewerproxy/webinterface/Reporting.java
 */
public class FindRelevantCrawllogLines {

    /**
     * Builds the regular expression used to select the crawllog lines belonging to a domain.
     * New regexp to fix NARK-1212 / NAS-2690.
     * <p>
     * The expression accepts an {@code http(s)://}, {@code dns:} or {@code ftp://} prefix, up to three optional
     * sub-domain labels, then the domain itself followed by end-of-input, a slash, a word character or whitespace.
     * Only the dots of {@code domain} are escaped; any other regexp metacharacter in it is inserted as-is.
     *
     * @param domain the domain name to look for, e.g. {@code netarkivet.dk}; must not be null
     * @return the regular expression as a string
     */
    public static String getRegexpToFindDomainLines(String domain) {
        // Literal replacement: turns every "." into "\." without going through the regexp engine.
        String escapedDomain = domain.replace(".", "\\.");
        return ".*(https?:\\/\\/(www\\.)?|dns:|ftp:\\/\\/)([\\w_-]+\\.)?([\\w_-]+\\.)?([\\w_-]+\\.)?"
                + escapedDomain
                + "($|\\/|\\w|\\s).*";
    }

    /**
     * Entry point. Expects exactly two arguments: the domain and the path of the metadata file.
     * The matching lines are written to a fresh temporary file whose path is printed on standard output,
     * together with the number of matching lines. Exits with status 1 on bad arguments, 0 on success.
     *
     * @param args {@code args[0]} is the domain, {@code args[1]} the path of the metadata file
     * @throws IOException if the temporary result file cannot be created, written or read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Too few or too many arguments. Two needed. You gave me " + args.length);
            System.exit(1);
        }
        String domain = args[0];
        File metadatafile = new File(args[1]);
        if (!metadatafile.isFile()) {
            System.err.println("The file given as argument does not exist or is a directory: "
                    + metadatafile.getAbsolutePath());
            System.exit(1);
        }

        // Use the JVM's default temporary directory (java.io.tmpdir) instead of a hard-coded "/tmp",
        // so the tool also works on platforms without a /tmp directory.
        File resultFile = File.createTempFile("FindRelevant", "matchingLines");

        String regexp = getRegexpToFindDomainLines(domain);
        List<String> lines = findLines(metadatafile, regexp, resultFile);
        System.out.println("Found " + lines.size() + " matching lines for domain '" + domain + "' in file '"
                + metadatafile.getAbsolutePath() + "'");
        System.out.println("Resultfile is " + resultFile.getAbsolutePath());
        // Explicit exit kept from the original implementation.
        // NOTE(review): presumably guards against lingering non-daemon threads from the batch code - confirm.
        System.exit(0);
    }

    /**
     * Runs a {@link CrawlLogLinesMatchingRegexp} batch job locally on the given metadata file, with the job
     * output directed to {@code resultFile}, and returns the content of that file as a list of lines.
     *
     * @param metadatafile the metadata file to search
     * @param regexp the regular expression handed to the batch job
     * @param resultFile the file receiving the output of the batch job
     * @return the lines read back from {@code resultFile}
     * @throws IOException if the result file cannot be written or read
     */
    private static List<String> findLines(File metadatafile, String regexp, File resultFile) throws IOException {
        FileBatchJob job = new CrawlLogLinesMatchingRegexp(regexp);
        BatchLocalFiles batch = new BatchLocalFiles(new File[]{metadatafile});

        // try-with-resources guarantees the stream is closed (and flushed) even if the batch run throws,
        // and always before the result file is read back.
        try (OutputStream os = new FileOutputStream(resultFile)) {
            batch.run(job, os);
        }
        return org.apache.commons.io.FileUtils.readLines(resultFile);
    }
}