/*
 * #%L
 * Netarchivesuite - heritrix 3 monitor
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.heritrix3.monitor;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Stream;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;

import com.antiaction.common.templateengine.TemplateMaster;
import com.antiaction.common.templateengine.login.LoginTemplateHandler;
import com.antiaction.common.templateengine.storage.TemplateFileStorageManager;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.I18n;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.Constants;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.heritrix3.monitor.resources.H3JobResource;

/**
 * Shared environment for the Heritrix 3 monitor servlet. Holds the template engine,
 * login handler, temporary directory, Heritrix 3 admin credentials and the background
 * job monitor thread.
 */
public class NASEnvironment {

    /** Classpath location of the Groovy script executed on the Heritrix 3 instances. */
    private static final String NAS_GROOVY_RESOURCE_PATH = "dk/netarkivet/heritrix3/monitor/nas.groovy";

    /** Content of the Groovy script, loaded from the classpath on construction. */
    public String NAS_GROOVY_SCRIPT;

    /** The servlet config supplied by the servlet container. */
    protected ServletConfig servletConfig = null;

    public TemplateMaster templateMaster = null;

    protected String login_template_name = null;

    protected LoginTemplateHandler<NASUser> loginHandler = null;

    /** Directory used for cached crawl logs and the monitor's diagnostics log. */
    public File tempPath;

    public String h3AdminName;

    public String h3AdminPassword;

    public Heritrix3JobMonitorThread h3JobMonitorThread;

    public static String contextPath;

    public static String servicePath;

    public HttpLocaleHandler httpLocaleUtils;

    /** A regex string together with its compiled pattern and a reusable matcher. */
    public static class StringMatcher {
        public String str;
        public Pattern p;
        public Matcher m;
    }

    /** Regexes for the Heritrix 3 "hostname:port" combinations the monitor may access. */
    public List<StringMatcher> h3HostPortAllowRegexList = new ArrayList<StringMatcher>();

    public final I18n I18N = new I18n(Constants.TRANSLATIONS_BUNDLE);
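    /**
     * Read a resource from the classpath and return its content as a UTF-8 string.
     *
     * <p>A minimal usage sketch, with {@code environment} an assumed instance of this
     * class; the resource path shown is this class' own Groovy script constant:</p>
     * <pre>{@code
     * String script = environment.getResourceAsString("dk/netarkivet/heritrix3/monitor/nas.groovy");
     * }</pre>
     *
     * @param resource classpath-relative path of the resource to read
     * @return the resource content decoded as UTF-8
     * @throws IOException if the resource is missing or cannot be read
     */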
    public String getResourceAsString(String resource) throws IOException {
        InputStream in = H3JobResource.class.getClassLoader().getResourceAsStream(resource);
        if (in == null) {
            throw new IOException("Resource not found on classpath: " + resource);
        }
        try {
            ByteArrayOutputStream bOut = new ByteArrayOutputStream();
            byte[] tmpArr = new byte[8192];
            int read;
            while ((read = in.read(tmpArr)) != -1) {
                bOut.write(tmpArr, 0, read);
            }
            return new String(bOut.toByteArray(), StandardCharsets.UTF_8);
        } finally {
            in.close();
        }
    }

    public NASEnvironment(ServletContext servletContext, ServletConfig theServletConfig) throws ServletException {
        httpLocaleUtils = HttpLocaleHandler.getInstance();

        try {
            NAS_GROOVY_SCRIPT = getResourceAsString(NAS_GROOVY_RESOURCE_PATH);
        } catch (IOException e) {
            throw new ServletException("Resource missing: " + NAS_GROOVY_RESOURCE_PATH, e);
        }

        login_template_name = "login.html";

        templateMaster = TemplateMaster.getInstance("default");
        templateMaster.addTemplateStorage(TemplateFileStorageManager.getInstance(
                servletContext.getRealPath("/"), "UTF-8"));

        loginHandler = new LoginTemplateHandler<NASUser>();
        loginHandler.templateMaster = templateMaster;
        loginHandler.templateName = login_template_name;
        loginHandler.title = "Webdanica - Login";
        loginHandler.adminPath = "/";

        try {
            tempPath = Settings.getFile(HarvesterSettings.HERITRIX3_MONITOR_TEMP_PATH);
            writeDiagnostics("Trying to use tempPath '" + tempPath.getAbsolutePath() + "' as read from setting: "
                    + HarvesterSettings.HERITRIX3_MONITOR_TEMP_PATH);
            if (!tempPath.isDirectory()) {
                // Try to create tempPath if it does not exist already.
                tempPath.mkdirs();
            }
        } catch (Exception e) {
            // This is normal if the tempPath setting is unset; fall back to the system temp directory.
            tempPath = new File(System.getProperty("java.io.tmpdir"));
        }
        // Fall back to the system temp directory if the configured path could not be created.
        if (tempPath == null || !tempPath.isDirectory()) {
            tempPath = new File(System.getProperty("java.io.tmpdir"));
        }
        writeDiagnostics("Using dir '" + tempPath.getAbsolutePath() + "' as tempPath");
        h3AdminName = Settings.get(HarvesterSettings.HERITRIX_ADMIN_NAME);
        h3AdminPassword = Settings.get(HarvesterSettings.HERITRIX_ADMIN_PASSWORD);

        this.servletConfig = theServletConfig;
        h3JobMonitorThread = new Heritrix3JobMonitorThread(this);
        writeDiagnostics("Initialized " + this.getClass().getName());
    }

    /**
     * Start the background thread that monitors the running Heritrix 3 jobs.
     */
    public void start() {
        try {
            h3JobMonitorThread.init();
            h3JobMonitorThread.start();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

    /**
     * Append a timestamped entry to the monitor's diagnostics log in tempPath.
     *
     * @param logEntry the entry to append
     */
    private synchronized void writeDiagnostics(String logEntry) {
        File logFile = new File(tempPath, "h3monitor.log");
        String dateStamp = "[" + new Date() + "] ";
        try (FileWriter logFileWriter = new FileWriter(logFile, true)) {
            logFileWriter.write(dateStamp);
            logFileWriter.write(logEntry);
            logFileWriter.write(System.lineSeparator());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Do some cleanup. This waits for the different workflow threads to stop running.
     */
    public void cleanup() {
        servletConfig = null;
    }
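    /**
     * Atomically replace the list of regexes deciding which Heritrix 3 "hostname:port"
     * combinations the monitor is allowed to access. Patterns are compiled
     * case-insensitively; patterns that do not compile are reported back through
     * {@code invalidPatternsList} instead of aborting the replacement.
     *
     * <p>A minimal usage sketch; the pattern strings below are illustrative
     * assumptions, not values shipped with NetarchiveSuite:</p>
     * <pre>{@code
     * List<String> patterns = Arrays.asList("localhost:8443", "crawler\\d+\\.example\\.org:\\d+", "[broken");
     * List<String> invalid = new ArrayList<String>();
     * environment.replaceH3HostnamePortRegexList(patterns, invalid);
     * // invalid now contains "[broken", the only pattern that failed to compile.
     * }</pre>
     *
     * @param h3HostnamePortRegexList regexes matching the allowed "hostname:port" strings
     * @param invalidPatternsList receives the regexes that failed to compile
     */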
    public void replaceH3HostnamePortRegexList(List<String> h3HostnamePortRegexList, List<String> invalidPatternsList) {
        String regex;
        StringMatcher stringMatcher;
        synchronized (h3HostPortAllowRegexList) {
            h3HostPortAllowRegexList.clear();
            for (int i = 0; i < h3HostnamePortRegexList.size(); ++i) {
                regex = h3HostnamePortRegexList.get(i);
                try {
                    stringMatcher = new StringMatcher();
                    stringMatcher.str = regex;
                    stringMatcher.p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
                    // Prime the reusable matcher with a dummy input; it is reset before each real match.
                    stringMatcher.m = stringMatcher.p.matcher("42");
                    h3HostPortAllowRegexList.add(stringMatcher);
                } catch (PatternSyntaxException e) {
                    invalidPatternsList.add(regex);
                }
            }
        }
    }

    /**
     * Check whether the given "hostname:port" string matches one of the configured
     * allow regexes.
     *
     * @param h3HostnamePort the "hostname:port" string to check
     * @return true if the string matches at least one allow regex
     */
    public boolean isH3HostnamePortEnabled(String h3HostnamePort) {
        boolean bAllowed = false;
        synchronized (h3HostPortAllowRegexList) {
            StringMatcher stringMatcher;
            int idx = 0;
            while (!bAllowed && idx < h3HostPortAllowRegexList.size()) {
                stringMatcher = h3HostPortAllowRegexList.get(idx++);
                stringMatcher.m.reset(h3HostnamePort);
                bAllowed = stringMatcher.m.matches();
            }
        }
        return bAllowed;
    }

    /**
     * Determine whether the URL in the given crawl log line was attempted harvested.
     *
     * @param crawllogLine line from the crawl log under consideration
     * @return whether the given crawl log line contains a URL that was attempted harvested
     */
    private boolean urlInLineIsAttemptedHarvested(String crawllogLine) {
        String[] columns = crawllogLine.split("\\s+");
        if (columns.length < 4) {
            return false;
        }
        String fetchStatusCode = columns[1];
        String harvestedUrl = columns[3];

        // Exclude URLs with a negative fetch status code, as they were never actually fetched.
        try {
            if (Integer.parseInt(fetchStatusCode) < 0) {
                return false;
            }
        } catch (NumberFormatException e) {
            // The status column is not a number, so this is not a regular crawl log line.
            return false;
        }

        // Exclude DNS look-ups from the crawl log.
        if (harvestedUrl.startsWith("dns:")) {
            return false;
        }

        return true;
    }

    /**
     * Get the (attempted) crawled URLs of the crawl log for the running job with the given job id.
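     *
     * <p>A minimal sketch of consuming the returned stream, with {@code environment}
     * an assumed instance of this class and the job id an illustrative assumption;
     * the stream is closed with try-with-resources because it is backed by an open file:</p>
     * <pre>{@code
     * try (Stream<String> urls = environment.getCrawledUrls(42L, null)) {
     *     long count = urls.filter(url -> url.startsWith("https://")).count();
     * }
     * }</pre>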
     *
     * @param jobId id of the running job
     * @param h3Job Heritrix3JobMonitor from which to get the job for the given jobId; looked up if null
     * @return the (attempted) crawled URLs of the crawl log for the given job
     */
    public Stream<String> getCrawledUrls(long jobId, Heritrix3JobMonitor h3Job) {
        if (h3Job == null) {
            h3Job = h3JobMonitorThread.getRunningH3Job(jobId);
            if (h3Job == null) {
                // There is no running job with the given id.
                return Stream.empty();
            }
        }
        String crawlLogPath = h3Job.logFile.getAbsolutePath();
        writeDiagnostics("Trying to getCrawledUrls from job " + jobId + " using cached crawllog '" + crawlLogPath + "'");
        long cachedLines = h3Job.totalCachedLines;

        if (cachedLines == 0) {
            writeDiagnostics("No cached crawllog-lines for job " + jobId);
            return Stream.empty();
        } else {
            writeDiagnostics("Number of cached crawllog-lines for job " + jobId + ": " + cachedLines);
        }
        try {
            // Check that the crawl log exists; if not, return an empty stream.
            if (!Paths.get(crawlLogPath).toFile().isFile()) {
                writeDiagnostics("The file '" + crawlLogPath + "' doesn't correspond to a file. Returning an empty stream");
                return Stream.empty();
            }
            Stream<String> attemptedHarvestedUrlsFromCrawllog = Files.lines(Paths.get(crawlLogPath),
                    StandardCharsets.UTF_8)
                    .filter(line -> urlInLineIsAttemptedHarvested(line))
                    .map(line -> line.split("\\s+")[3]);

            return attemptedHarvestedUrlsFromCrawllog;
        } catch (IOException e) {
            throw new IOFailure("Could not open crawllog file", e);
        }
    }

    /**
     * Normalizes the input URL so that only the domain part remains.
     *
     * @param url URL intended to be stripped to its domain part
     * @return the domain part of the input URL, or the empty string if the URL is malformed
     * @throws ArgumentNotValid if the host part of the URL is not a valid domain name
     */
    private String normalizeDomainUrl(String url) {
        if (!url.toLowerCase().matches("^\\w+://.*")) {
            // The URL has no protocol part, so add one.
            url = "http://" + url;
        }
        URL domainUrl;
        try {
            domainUrl = new URL(url);
        } catch (MalformedURLException e) {
            return "";
        }
        String domainHost = domainUrl.getHost();
        String normalizedDomainUrl = DomainUtils.domainNameFromHostname(domainHost);
        if (normalizedDomainUrl == null) {
            // Invalid domain.
            throw new ArgumentNotValid(url + " is not a valid domain name.");
        }
        return normalizedDomainUrl;
    }

    /**
     * Find out whether the given job harvests the given domain.
     *
     * @param jobId the job
     * @param domainName the domain
     * @param h3Job Heritrix3JobMonitor for the job; looked up from the job id if null
     * @return whether the given job harvests the given domain
     */
    public boolean jobHarvestsDomain(long jobId, String domainName, Heritrix3JobMonitor h3Job) {
        // Normalize the searched domain.
        String searchedDomain = normalizeDomainUrl(domainName);

        // Return whether the crawled URLs contain the searched domain.
        return getCrawledUrls(jobId, h3Job)
                .map(url -> normalizeDomainUrl(url))
                .anyMatch(url -> searchedDomain.equalsIgnoreCase(url));
    }

}