001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3; 024 025import dk.netarkivet.common.utils.Settings; 026 027/** 028 * Settings specific to the heritrix3 harvester module of NetarchiveSuite. 029 */ 030public class Heritrix3Settings { 031 032 /** The default place in classpath where the settings file can be found. */ 033 private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/heritrix3/settings.xml"; 034 035 /* 036 * The static initialiser is called when the class is loaded. It will add default values for all settings defined in 037 * this class, by loading them from a settings.xml file in classpath. 038 */ 039 static { 040 Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH); 041 } 042 043 // NOTE: The constants defining setting names below are left non-final on 044 // purpose! Otherwise, the static initialiser that loads default values 045 // will not run. 046 047// /** 048// * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br> 049// * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds. 050// * 051// * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 052// * settings.harvester.harvesting.controller. 053// * 054// */ 055 public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime"; 056 057 // 058// /** 059// * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br> 060// * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600 061// * seconds (10 min). 062// */ 063 public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime"; 064// 065// /** 066// * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report. 067// * the default class: {@link TopTotalEnqueuesFilter} 068// */ 069// public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class"; 070// 071// /** 072// * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments 073// * should be separated by semicolons. 074// */ 075// public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args"; 076// 077// /** 078// * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br> 079// * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is 080// * lost. If set to true it will only log a warning, leaving the crawl operator shutting down harvester manually. 081// * Default value is true. 082// * 083// * @see BnfHeritrixController 084// */ 085// public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost"; 086// 087// /** 088// * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br> 089// * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over. 090// */ 091// public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout"; 092// 093// /** 094// * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br> 095// * The name used to access the Heritrix GUI. 096// */ 097 public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName"; 098// 099// /** 100// * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br> 101// * The password used to access the Heritrix GUI. 102// */ 103 public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword"; 104// 105// /** 106// * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br> 107// * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine. 108// * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage. 109// */ 110 public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort"; 111 112// /** 113// * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br> 114// * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in 115// * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc. 116// */ 117 public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize"; 118// 119// /** 120// * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br> 121// * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option. 122// */ 123 public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts"; 124// 125// /** 126// * <b>settings.harvester.harvesting.heritrixControllerClass</b>:<br/> 127// * The implementation of the HeritrixController interface to be used. 128// */ 129 public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class"; 130// 131// /** 132// * <b>settings.harvester.harvesting.heritrixLauncherClass</b>:<br/> 133// * The implementation of the HeritrixLauncher abstract class to be used. 134// */ 135 public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class"; 136// 137// /** 138// * <b>settings.harvester.harvesting.harvestReport</b>:<br/> 139// * The implementation of {@link HarvestReport} interface to be used. 140// */ 141 public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class"; 142// 143// /** 144// * <b>settings.harvester.harvesting.harvestReport.disregardSeedsURLInfo</b>:<br/> 145// * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL 146// * instead of the seed url domain? The default is false; 147// */ 148 public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo"; 149 150 /** 151 * <b>settings.harvester.harvesting.metadata.generateArchiveFilesReport</b> This setting is a boolean flag that 152 * enables/disables the generation of an ARC/WARC files report. Default value is 'true'. 153 * 154 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 155 */ 156 public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate"; 157 158 /** 159 * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If 160 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file. 161 * Default value is 'archivefiles-report.txt'. 162 * 163 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 164 */ 165 public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName"; 166 167 /** 168 * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If 169 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file. 170 * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Closed] [Size]'. 171 * 172 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 173 */ 174 public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader"; 175}