/*
 * #%L
 * Netarchivesuite - heritrix 3 monitor
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.heritrix3.monitor;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Stream;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;

import com.antiaction.common.templateengine.TemplateMaster;
import com.antiaction.common.templateengine.login.LoginTemplateHandler;
import com.antiaction.common.templateengine.storage.TemplateFileStorageManager;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.I18n;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.Constants;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.heritrix3.monitor.resources.H3JobResource;

/**
 * Shared environment for the Heritrix 3 monitor servlet. Holds the template engine,
 * login handler, temporary directory, Heritrix 3 admin credentials and the background
 * job monitor thread.
 */
public class NASEnvironment {

    /** Classpath location of the Groovy script executed on the Heritrix 3 instances. */
    private static final String NAS_GROOVY_RESOURCE_PATH = "dk/netarkivet/heritrix3/monitor/nas.groovy";

    /** Content of the Groovy script, loaded from the classpath on construction. */
    public String NAS_GROOVY_SCRIPT;

    /** The servlet config supplied by the servlet container. */
    protected ServletConfig servletConfig = null;

    public TemplateMaster templateMaster = null;

    protected String login_template_name = null;

    protected LoginTemplateHandler<NASUser> loginHandler = null;

    /** Directory used for cached crawl logs and the monitor's diagnostics log. */
    public File tempPath;

    public String h3AdminName;

    public String h3AdminPassword;

    public Heritrix3JobMonitorThread h3JobMonitorThread;

    public static String contextPath;

    public static String servicePath;

    public HttpLocaleHandler httpLocaleUtils;

    /** A regex string together with its compiled pattern and a reusable matcher. */
    public static class StringMatcher {
        public String str;
        public Pattern p;
        public Matcher m;
    }

    /** Regexes for the Heritrix 3 "hostname:port" combinations the monitor may access. */
    public List<StringMatcher> h3HostPortAllowRegexList = new ArrayList<StringMatcher>();

    public final I18n I18N = new I18n(Constants.TRANSLATIONS_BUNDLE);
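    /**
     * Read a resource from the classpath and return its content as a UTF-8 string.
     *
     * <p>A minimal usage sketch, with {@code environment} an assumed instance of this
     * class; the resource path shown is this class' own Groovy script constant:</p>
     * <pre>{@code
     * String script = environment.getResourceAsString("dk/netarkivet/heritrix3/monitor/nas.groovy");
     * }</pre>
     *
     * @param resource classpath-relative path of the resource to read
     * @return the resource content decoded as UTF-8
     * @throws IOException if the resource is missing or cannot be read
     */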
    public String getResourceAsString(String resource) throws IOException {
        InputStream in = H3JobResource.class.getClassLoader().getResourceAsStream(resource);
        if (in == null) {
            throw new IOException("Resource not found on classpath: " + resource);
        }
        try {
            ByteArrayOutputStream bOut = new ByteArrayOutputStream();
            byte[] tmpArr = new byte[8192];
            int read;
            while ((read = in.read(tmpArr)) != -1) {
                bOut.write(tmpArr, 0, read);
            }
            return new String(bOut.toByteArray(), StandardCharsets.UTF_8);
        } finally {
            in.close();
        }
    }

    public NASEnvironment(ServletContext servletContext, ServletConfig theServletConfig) throws ServletException {
        httpLocaleUtils = HttpLocaleHandler.getInstance();

        try {
            NAS_GROOVY_SCRIPT = getResourceAsString(NAS_GROOVY_RESOURCE_PATH);
        } catch (IOException e) {
            throw new ServletException("Resource missing: " + NAS_GROOVY_RESOURCE_PATH, e);
        }

        login_template_name = "login.html";

        templateMaster = TemplateMaster.getInstance("default");
        templateMaster.addTemplateStorage(TemplateFileStorageManager.getInstance(
                servletContext.getRealPath("/"), "UTF-8"));

        loginHandler = new LoginTemplateHandler<NASUser>();
        loginHandler.templateMaster = templateMaster;
        loginHandler.templateName = login_template_name;
        loginHandler.title = "Webdanica - Login";
        loginHandler.adminPath = "/";

        try {
            tempPath = Settings.getFile(HarvesterSettings.HERITRIX3_MONITOR_TEMP_PATH);
            writeDiagnostics("Trying to use tempPath '" + tempPath.getAbsolutePath() + "' as read from setting: "
                    + HarvesterSettings.HERITRIX3_MONITOR_TEMP_PATH);
            if (!tempPath.isDirectory()) {
                // Try to create tempPath if it does not exist already.
                tempPath.mkdirs();
            }
        } catch (Exception e) {
            // This is normal if the tempPath setting is unset; fall back to the system temp directory.
            tempPath = new File(System.getProperty("java.io.tmpdir"));
        }
        // Fall back to the system temp directory if the configured path could not be created.
        if (tempPath == null || !tempPath.isDirectory()) {
            tempPath = new File(System.getProperty("java.io.tmpdir"));
        }
        writeDiagnostics("Using dir '" + tempPath.getAbsolutePath() + "' as tempPath");
        h3AdminName = Settings.get(HarvesterSettings.HERITRIX_ADMIN_NAME);
        h3AdminPassword = Settings.get(HarvesterSettings.HERITRIX_ADMIN_PASSWORD);

        this.servletConfig = theServletConfig;
        h3JobMonitorThread = new Heritrix3JobMonitorThread(this);
        writeDiagnostics("Initialized " + this.getClass().getName());
    }

    /**
     * Start the background thread that monitors the running Heritrix 3 jobs.
     */
    public void start() {
        try {
            h3JobMonitorThread.init();
            h3JobMonitorThread.start();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

    /**
     * Append a timestamped entry to the monitor's diagnostics log in tempPath.
     *
     * @param logEntry the entry to append
     */
    private synchronized void writeDiagnostics(String logEntry) {
        File logFile = new File(tempPath, "h3monitor.log");
        String dateStamp = "[" + new Date() + "] ";
        try (FileWriter logFileWriter = new FileWriter(logFile, true)) {
            logFileWriter.write(dateStamp);
            logFileWriter.write(logEntry);
            logFileWriter.write(System.lineSeparator());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Do some cleanup. This waits for the different workflow threads to stop running.
     */
    public void cleanup() {
        servletConfig = null;
    }
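    /**
     * Atomically replace the list of regexes deciding which Heritrix 3 "hostname:port"
     * combinations the monitor is allowed to access. Patterns are compiled
     * case-insensitively; patterns that do not compile are reported back through
     * {@code invalidPatternsList} instead of aborting the replacement.
     *
     * <p>A minimal usage sketch; the pattern strings below are illustrative
     * assumptions, not values shipped with NetarchiveSuite:</p>
     * <pre>{@code
     * List<String> patterns = Arrays.asList("localhost:8443", "crawler\\d+\\.example\\.org:\\d+", "[broken");
     * List<String> invalid = new ArrayList<String>();
     * environment.replaceH3HostnamePortRegexList(patterns, invalid);
     * // invalid now contains "[broken", the only pattern that failed to compile.
     * }</pre>
     *
     * @param h3HostnamePortRegexList regexes matching the allowed "hostname:port" strings
     * @param invalidPatternsList receives the regexes that failed to compile
     */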
    public void replaceH3HostnamePortRegexList(List<String> h3HostnamePortRegexList, List<String> invalidPatternsList) {
        String regex;
        StringMatcher stringMatcher;
        synchronized (h3HostPortAllowRegexList) {
            h3HostPortAllowRegexList.clear();
            for (int i = 0; i < h3HostnamePortRegexList.size(); ++i) {
                regex = h3HostnamePortRegexList.get(i);
                try {
                    stringMatcher = new StringMatcher();
                    stringMatcher.str = regex;
                    stringMatcher.p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
                    // Prime the reusable matcher with a dummy input; it is reset before each real match.
                    stringMatcher.m = stringMatcher.p.matcher("42");
                    h3HostPortAllowRegexList.add(stringMatcher);
                } catch (PatternSyntaxException e) {
                    invalidPatternsList.add(regex);
                }
            }
        }
    }

    /**
     * Check whether the given "hostname:port" string matches one of the configured
     * allow regexes.
     *
     * @param h3HostnamePort the "hostname:port" string to check
     * @return true if the string matches at least one allow regex
     */
    public boolean isH3HostnamePortEnabled(String h3HostnamePort) {
        boolean bAllowed = false;
        synchronized (h3HostPortAllowRegexList) {
            StringMatcher stringMatcher;
            int idx = 0;
            while (!bAllowed && idx < h3HostPortAllowRegexList.size()) {
                stringMatcher = h3HostPortAllowRegexList.get(idx++);
                stringMatcher.m.reset(h3HostnamePort);
                bAllowed = stringMatcher.m.matches();
            }
        }
        return bAllowed;
    }

    /**
     * Determine whether the URL in the given crawl log line was attempted harvested.
     *
     * @param crawllogLine line from the crawl log under consideration
     * @return whether the given crawl log line contains a URL that was attempted harvested
     */
    private boolean urlInLineIsAttemptedHarvested(String crawllogLine) {
        String[] columns = crawllogLine.split("\\s+");
        if (columns.length < 4) {
            return false;
        }
        String fetchStatusCode = columns[1];
        String harvestedUrl = columns[3];

        // Exclude URLs with a negative fetch status code, as they were never actually fetched.
        try {
            if (Integer.parseInt(fetchStatusCode) < 0) {
                return false;
            }
        } catch (NumberFormatException e) {
            // The status column is not a number, so this is not a regular crawl log line.
            return false;
        }

        // Exclude DNS look-ups from the crawl log.
        if (harvestedUrl.startsWith("dns:")) {
            return false;
        }

        return true;
    }

    /**
     * Get the (attempted) crawled URLs of the crawl log for the running job with the given job id.
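     *
     * <p>A minimal sketch of consuming the returned stream, with {@code environment}
     * an assumed instance of this class and the job id an illustrative assumption;
     * the stream is closed with try-with-resources because it is backed by an open file:</p>
     * <pre>{@code
     * try (Stream<String> urls = environment.getCrawledUrls(42L, null)) {
     *     long count = urls.filter(url -> url.startsWith("https://")).count();
     * }
     * }</pre>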
     *
     * @param jobId id of the running job
     * @param h3Job Heritrix3JobMonitor from which to get the job for the given jobId; looked up if null
     * @return the (attempted) crawled URLs of the crawl log for the given job
     */
    public Stream<String> getCrawledUrls(long jobId, Heritrix3JobMonitor h3Job) {
        if (h3Job == null) {
            h3Job = h3JobMonitorThread.getRunningH3Job(jobId);
            if (h3Job == null) {
                // There is no running job with the given id.
                return Stream.empty();
            }
        }
        String crawlLogPath = h3Job.logFile.getAbsolutePath();
        writeDiagnostics("Trying to getCrawledUrls from job " + jobId + " using cached crawllog '" + crawlLogPath + "'");
        long cachedLines = h3Job.totalCachedLines;

        if (cachedLines == 0) {
            writeDiagnostics("No cached crawllog-lines for job " + jobId);
            return Stream.empty();
        } else {
            writeDiagnostics("Number of cached crawllog-lines for job " + jobId + ": " + cachedLines);
        }
        try {
            // Check that the crawl log exists; if not, return an empty stream.
            if (!Paths.get(crawlLogPath).toFile().isFile()) {
                writeDiagnostics("The file '" + crawlLogPath + "' doesn't correspond to a file. Returning an empty stream");
                return Stream.empty();
            }
            Stream<String> attemptedHarvestedUrlsFromCrawllog = Files.lines(Paths.get(crawlLogPath),
                    StandardCharsets.UTF_8)
                    .filter(line -> urlInLineIsAttemptedHarvested(line))
                    .map(line -> line.split("\\s+")[3]);

            return attemptedHarvestedUrlsFromCrawllog;
        } catch (IOException e) {
            throw new IOFailure("Could not open crawllog file", e);
        }
    }

    /**
     * Normalizes the input URL so that only the domain part remains.
     *
     * @param url URL intended to be stripped to its domain part
     * @return the domain part of the input URL, or the empty string if the URL is malformed
     * @throws ArgumentNotValid if the host part of the URL is not a valid domain name
     */
    private String normalizeDomainUrl(String url) {
        if (!url.toLowerCase().matches("^\\w+://.*")) {
            // The URL has no protocol part, so add one.
            url = "http://" + url;
        }
        URL domainUrl;
        try {
            domainUrl = new URL(url);
        } catch (MalformedURLException e) {
            return "";
        }
        String domainHost = domainUrl.getHost();
        String normalizedDomainUrl = DomainUtils.domainNameFromHostname(domainHost);
        if (normalizedDomainUrl == null) {
            // Invalid domain.
            throw new ArgumentNotValid(url + " is not a valid domain name.");
        }
        return normalizedDomainUrl;
    }

    /**
     * Find out whether the given job harvests the given domain.
     *
     * @param jobId the job
     * @param domainName the domain
     * @param h3Job Heritrix3JobMonitor for the job; looked up from the job id if null
     * @return whether the given job harvests the given domain
     */
    public boolean jobHarvestsDomain(long jobId, String domainName, Heritrix3JobMonitor h3Job) {
        // Normalize the searched domain.
        String searchedDomain = normalizeDomainUrl(domainName);

        // Return whether the crawled URLs contain the searched domain.
        return getCrawledUrls(jobId, h3Job)
                .map(url -> normalizeDomainUrl(url))
                .anyMatch(url -> searchedDomain.equalsIgnoreCase(url));
    }

}