001package dk.netarkivet.harvester.tools;
002
003import java.io.File;
004import java.io.FileOutputStream;
005import java.io.IOException;
006import java.io.OutputStream;
007import java.util.List;
008
009import dk.netarkivet.common.utils.batch.BatchLocalFiles;
010import dk.netarkivet.common.utils.batch.FileBatchJob;
011import dk.netarkivet.viewerproxy.webinterface.CrawlLogLinesMatchingRegexp;
012
013/**
014 * Find relevant crawllog lines for a specific domain in a specific metadata file 
015 * args: domain metadatafile
016 * 
017 * Note: currently the regexp is embedded in the jsp page harvester/qa-gui/src/main/webapp/QA-searchcrawllog.jsp
018 * but should probably be removed to the Reporting class ./harvester/harvester-core/src/main/java/dk/netarkivet/viewerproxy/webinterface/Reporting.java
019 * 
020 */
021public class FindRelevantCrawllogLines {
022
023        /** New regexp to fix NARK-1212 /NAS-2690 */
024        public static String getRegexpToFindDomainLines(String domain) {
025                return ".*(https?:\\/\\/(www\\.)?|dns:|ftp:\\/\\/)([\\w_-]+\\.)?([\\w_-]+\\.)?([\\w_-]+\\.)?" + domain.replaceAll("\\.", "\\\\.") +  "($|\\/|\\w|\\s).*"; 
026        }
027        
028        public static void main(String[] args) throws IOException {
029                if (args.length != 2) {
030                        System.err.println("Too few or too many arguments. Two needed. You gave me " + args.length);
031                        System.exit(1);
032                }
033                String domain = args[0];
034                File metadatafile = new File(args[1]);
035                if (!metadatafile.isFile()) {
036                        System.err.println("The file given as argument does not exist or is a directory: " 
037                                        + metadatafile.getAbsolutePath());
038                        System.exit(1);
039                }
040                File resultFile1 = File.createTempFile("FindRelevant", "matchingLines", new File("/tmp"));
041                
042                String regexp = getRegexpToFindDomainLines(domain);
043                File resultFile = resultFile1;  
044                List<String> lines = findLines(metadatafile, regexp, resultFile);
045                System.out.println("Found " + lines.size() + " matching lines for domain '" + domain + "' in file '" + metadatafile.getAbsolutePath() + "'");
046                System.out.println("Resultfile is " + resultFile.getAbsolutePath());
047                lines.clear();
048                System.exit(0);
049        }
050        
051        private static List<String> findLines(File metadatafile, String regexp, File resultFile) throws IOException {
052                FileBatchJob job = new CrawlLogLinesMatchingRegexp(regexp);
053                BatchLocalFiles batch = new BatchLocalFiles(new File[]{metadatafile});
054                
055                OutputStream os = new FileOutputStream(resultFile);
056                batch.run(job, os);
057                os.close();
058                return org.apache.commons.io.FileUtils.readLines(resultFile);
059        }
060}