001package dk.netarkivet.harvester.heritrix3; 002 003import java.util.regex.Pattern; 004 005import dk.netarkivet.common.utils.Settings; 006import dk.netarkivet.harvester.harvesting.distribute.HarvesterReadyMessage; 007import dk.netarkivet.harvester.harvesting.frontier.TopTotalEnqueuesFilter; 008import dk.netarkivet.harvester.harvesting.report.HarvestReport; 009 010/** 011 * Settings specific to the heritrix3 harvester module of NetarchiveSuite. 012 */ 013public class Heritrix3Settings { 014 015 /** The default place in classpath where the settings file can be found. */ 016 private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/heritrix3/settings.xml"; 017 018 /* 019 * The static initialiser is called when the class is loaded. It will add default values for all settings defined in 020 * this class, by loading them from a settings.xml file in classpath. 021 */ 022 static { 023 Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH); 024 } 025 026 // NOTE: The constants defining setting names below are left non-final on 027 // purpose! Otherwise, the static initialiser that loads default values 028 // will not run. 029 030 /** 031 * <b>settings.harvester.datamodel.domain.defaultSeedlist</b>: <br> 032 * Default name of the seedlist to use when new domains are created. 033 */ 034 public static String DEFAULT_SEEDLIST = "settings.harvester.datamodel.domain.defaultSeedlist"; 035 036 /** 037 * <b>settings.harvester.datamodel.domain.validSeedRegex</b>: <br> 038 * Regular expression used to validate a seed within a seedlist. 039 * <p> 040 * Default value accepts all non-empty strings. 041 */ 042 public static String VALID_SEED_REGEX = "settings.harvester.datamodel.domain.validSeedRegex"; 043 044 /** 045 * <b>settings.harvester.datamodel.domain.defaultConfig</b>: <br> 046 * The name of a configuration that is created by default and which is initially used for snapshot harvests. 047 */ 048 public static String DOMAIN_DEFAULT_CONFIG = "settings.harvester.datamodel.domain.defaultConfig"; 049 050 /** 051 * <b>settings.harvester.datamodel.domain.defaultOrderxml</b>: <br> 052 * Name of order xml template used for domains if nothing else is specified. The newly created configurations use 053 * this. This template must exist before harvesting can commence 054 */ 055 public static String DOMAIN_DEFAULT_ORDERXML = "settings.harvester.datamodel.domain.defaultOrderxml"; 056 057 /** 058 * <b>settings.harvester.datamodel.domain.defaultMaxrate</b>: <br> 059 * Default download rate for domain configuration. Not currently enforced. 060 */ 061 public static String DOMAIN_CONFIG_MAXRATE = "settings.harvester.datamodel.domain.defaultMaxrate"; 062 063 /** 064 * <b>settings.harvester.datamodel.domain.defaultMaxbytes</b>: <br> 065 * Default byte limit for domain configuration. 066 */ 067 public static String DOMAIN_CONFIG_MAXBYTES = "settings.harvester.datamodel.domain.defaultMaxbytes"; 068 069 /** 070 * <b>settings.harvester.datamodel.domain.defaultMaxobjects</b>: <br> 071 * Default object limit for domain configuration. 072 */ 073 public static String DOMAIN_CONFIG_MAXOBJECTS = "settings.harvester.datamodel.domain.defaultMaxobjects"; 074 075 /** 076 * <b>settings.harvester.scheduler.jobGen.config.errorFactorPrevResult</b>: <br> 077 * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines 078 * how great a possible factor we will permit a harvest to be larger then the expectation, when basing the 079 * expectation on a previous completed job. 080 */ 081 public static String ERRORFACTOR_PERMITTED_PREVRESULT = "settings.harvester.scheduler.jobGen.config.errorFactorPrevResult"; 082 083 /** 084 * <b>settings.harvester.scheduler.jobGen.config.errorFactorBestGuess</b>: <br> 085 * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines 086 * how great a possible factor we will permit a harvest to be larger then the expectation, when basing the 087 * expectation on previous uncompleted harvests or no harvest data at all. 088 */ 089 public static String ERRORFACTOR_PERMITTED_BESTGUESS = "settings.harvester.scheduler.jobGen.config.errorFactorBestGuess"; 090 091 /** 092 * <b>settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject</b>: <br> 093 * How many bytes the average object is expected to be on domains where we don't know any better. This number should 094 * grow over time, as of end of 2005 empirical data shows 38000. 095 */ 096 public static String EXPECTED_AVERAGE_BYTES_PER_OBJECT = "settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject"; 097 098 /** 099 * <b>settings.harvester.scheduler.jobGen.config.maxDomainSize</b>: <br> 100 * The initial guess of the domain size (number of objects) of an unknown domain. 101 */ 102 public static String MAX_DOMAIN_SIZE = "settings.harvester.scheduler.jobGen.config.maxDomainSize"; 103 104 /** 105 * <b>settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference</b>: <br> 106 * The maximum allowed relative difference in expected number of objects retrieved in a single job definition. To 107 * avoid job splitting, set the value as Long.MAX_VALUE. 108 */ 109 public static String JOBS_MAX_RELATIVE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference"; 110 111 /** 112 * <b>settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference</b>: <br> 113 * Size differences for jobs below this threshold are ignored, regardless of the limits for the relative size 114 * difference. To avoid job splitting, set the value as Long.MAX_VALUE. 115 */ 116 public static String JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference"; 117 118 /** 119 * <b>settings.harvester.scheduler.jobGen.config.maxTotalSize</b>: <br> 120 * When this limit is exceeded no more configurations may be added to a job. To avoid job splitting, set the value 121 * as Long.MAX_VALUE. 122 */ 123 public static String JOBS_MAX_TOTAL_JOBSIZE = "settings.harvester.scheduler.jobGen.config.maxTotalSize"; 124 125 /** 126 * <b>settings.harvester.scheduler.jobGen.maxTimeToCompleteJob</b>: <br> 127 * The limit on how many seconds Heritrix should continue on each job. O means no limit. 128 */ 129 public static String JOBS_MAX_TIME_TO_COMPLETE = "settings.harvester.scheduler.jobGen.maxTimeToCompleteJob"; 130 131 /** 132 * <b>settings.harvester.scheduler.jobGen.domainConfigSubsetSize</b>: <br> 133 * How many domain configurations we will process in one go before making jobs out of them. This amount of domains 134 * will be stored in memory at the same time. To avoid job splitting, set this value as Long.MAX_VALUE. 135 */ 136 public static String JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE = "settings.harvester.scheduler.jobGen.domainConfigSubsetSize"; 137 138 /** 139 * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused</b>: <br> 140 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the 141 * maximum number of domain configurations in a partial harvest job. 142 */ 143 public static String JOBGEN_FIXED_CONFIG_COUNT_FOCUSED = "settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused"; 144 145 /** 146 * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot</b>: <br> 147 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the 148 * maximum number of domain configurations in a full harvest job. 149 */ 150 public static String JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT = "settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot"; 151 152 /** 153 * <b>settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget</b>: <br> 154 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter toggles whether or 155 * not domain configurations with a budget of zero (byte or objects) should be excluded from jobs. The default value 156 * is 'false'. 157 */ 158 public static String JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET = "settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget"; 159 160 /** 161 * <b>settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel</b>: <br> 162 * If this property is true, then the job generator will postpone job generation for harvest definitions that are 163 * mapped to a harvest channel not registered to at least one harvester. The default value is 'true'. 164 */ 165 public static String JOBGEN_POSTPONE_UNREGISTERED_HARVEST_CHANNEL = "settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel"; 166 167 /** 168 * <b>settings.harvester.scheduler.jobGen.class</b>: <br> 169 * The fully qualified class name of the chosen job generator implementation, currently either 170 * {@link DefaultJobGenerator} or {@link FixedDomainConfigurationCountJobGenerator}. The default is 171 * {@link DefaultJobGenerator}. 172 */ 173 public static String JOBGEN_CLASS = "settings.harvester.scheduler.jobGen.class"; 174 175 /** 176 * <b>settings.harvester.scheduler.jobGen.config.splitByObjectLimit</b>: <br> 177 * By default the byte limit is used as the base criterion for how many domain configurations are put into one 178 * harvest job. However if this parameter is set to "true", then the object limit is used instead as the base 179 * criterion. 180 */ 181 public static String SPLIT_BY_OBJECTLIMIT = "settings.harvester.scheduler.jobGen.config.splitByObjectLimit"; 182 183 /** 184 * <b>settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer</b>: <br> 185 * Controls whether the domain configuration object limit should be set in Heritrix's crawl order through the 186 * QuotaEnforcer configuration (parameter set to true) or through the frontier parameter 'queue-total-budget' ( 187 * parameter set to false). 188 * <p> 189 * Default value is true, as legacy implementation was to use only the QuotaEnforcer. 190 */ 191 public static String OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER = "settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer"; 192 193 /** 194 * <b>settings.harvester.scheduler.jobtimeouttime</b>:<br /> 195 * Time before a STARTED job times out and change status to FAILED. In seconds. 196 */ 197 public static String JOB_TIMEOUT_TIME = "settings.harvester.scheduler.jobtimeouttime"; 198 199 /** 200 * <b>settings.harvester.scheduler.jobgenerationperiode</b>: <br> 201 * The period between checking if new jobs should be generated, in seconds. This is one minute because that's the 202 * finest we can define in a harvest definition. 203 */ 204 public static String GENERATE_JOBS_PERIOD = "settings.harvester.scheduler.jobgenerationperiode"; 205 206 /** 207 * <b>settings.harvester.harvesting.serverDir</b>: <br> 208 * Each job gets a subdir of this dir. Job data is written and Heritrix writes to that subdir. 209 */ 210 public static String HARVEST_CONTROLLER_SERVERDIR = "settings.harvester.harvesting.serverDir"; 211 212 /** 213 * <b>settings.harvester.harvesting.minSpaceLeft</b>: <br> 214 * The minimum amount of free bytes in the serverDir required before accepting any harvest-jobs. 215 */ 216 public static String HARVEST_SERVERDIR_MINSPACE = "settings.harvester.harvesting.minSpaceLeft"; 217 218 /** 219 * <b>settings.harvester.harvesting.oldjobsDir</b>: <br> 220 * The directory in which data from old jobs is kept after uploading. Each directory from serverDir will be moved to 221 * here if any data remains, either due to failed uploads or because it wasn't attempted uploaded. 222 */ 223 public static String HARVEST_CONTROLLER_OLDJOBSDIR = "settings.harvester.harvesting.oldjobsDir"; 224 225 /** 226 * <b>settings.harvester.harvesting.channel</b>: <br> 227 * Harvest channel to take jobs from. This is the default channel assigned to the harvest controller. 228 * 229 * @see dk.netarkivet.harvester.datamodel.HarvestChannel <p> 230 * NOTE: this one is also used in SingleMBeanObject parsing information to System state 231 */ 232 public static String HARVEST_CONTROLLER_CHANNEL = "settings.harvester.harvesting.channel"; 233 234 /** 235 * <b>settings.harvester.harvesting.heritrix.inactivityTimeout</b>: <br> 236 * The timeout setting for aborting a crawl based on crawler-inactivity. If the crawler is inactive for this amount 237 * of seconds the crawl will be aborted. The inactivity is measured on the crawlController.activeToeCount(). 238 */ 239 public static String INACTIVITY_TIMEOUT_IN_SECS = "settings.harvester.harvesting.heritrix.inactivityTimeout"; 240 241 /** 242 * <b>settings.harvester.harvesting.heritrix.noresponseTimeout</b>: <br> 243 * The timeout value (in seconds) used in HeritrixLauncher for aborting crawl when no bytes are being received from 244 * web servers. 245 */ 246 public static String CRAWLER_TIMEOUT_NON_RESPONDING = "settings.harvester.harvesting.heritrix.noresponseTimeout"; 247 /** 248 * <b>settings.harvester.monitor.refreshInterval</b>:<br> 249 * Time interval in seconds after which the harvest monitor pages will be automatically refreshed. 250 */ 251 public static String HARVEST_MONITOR_REFRESH_INTERVAL = "settings.harvester.monitor.refreshInterval"; 252 253 /** 254 * <b>settings.harvester.monitor.historySampleRate</b>:<br> 255 * Time interval in seconds between historical records stores in the DB. Default value is 5 minutes. 256 */ 257 public static String HARVEST_MONITOR_HISTORY_SAMPLE_RATE = "settings.harvester.monitor.historySampleRate"; 258 259 /** 260 * <b>settings.harvester.monitor.historyChartGenIntervall</b>:<br> 261 * Time interval in seconds between regenerating the chart of historical data for a running job. Default value is 5 262 * minutes. 263 */ 264 public static String HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL = "settings.harvester.monitor.historyChartGenInterval"; 265 266 /** 267 * <b>settings.harvester.monitor.displayedHistorySize</b>:<br> 268 * Maximum number of most recent history records displayed on the running job details page. 269 */ 270 public static String HARVEST_MONITOR_DISPLAYED_HISTORY_SIZE = "settings.harvester.monitor.displayedHistorySize"; 271 272 /** 273 * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br> 274 * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds. 275 * 276 * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 277 * settings.harvester.harvesting.controller. 278 * 279 */ 280 public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime"; 281 282 /** 283 * <b>settings.harvester.harvesting.sendReadyInterval</b>:<br> 284 * Time interval in seconds to wait before transmitting a {@link HarvesterReadyMessage} to the {@link JobDispatcher} 285 * . 286 * <p> 287 * <p> 288 * Lower values will make the JobDispatcher detect ready harvester faster, but will make it more likely that the 289 * harvester may send two ready messages before a job is received, causing the JobDispatcher to dispatch two jobs. 290 * <p> 291 * Default value is 30 second. 292 */ 293 public static String SEND_READY_INTERVAL = "settings.harvester.harvesting.sendReadyInterval"; 294 295 /** 296 * <b>settings.harvester.harvesting.sendReadyDelay</b>:<br> 297 * Time in milliseconds to wait from starting to listen on the job queue to a potential ready message is sent to the 298 * HarvestJobManager. This small delay is used to retrieve any left over jobs on the queue before sending the ready 299 * message to the harvester. Default value is 1000 millisecond. 300 */ 301 public static String SEND_READY_DELAY = "settings.harvester.harvesting.sendReadyDelay"; 302 303 /** 304 * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br> 305 * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600 306 * seconds (10 min). 307 */ 308 public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime"; 309 310 /** 311 * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report. 312 * the default class: {@link TopTotalEnqueuesFilter} 313 */ 314 public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class"; 315 316 /** 317 * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments 318 * should be separated by semicolons. 319 */ 320 public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args"; 321 322 /** 323 * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br> 324 * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is 325 * lost. If set to true it will only log a warning, leaving the crawl operator shutting down harvester manually. 326 * Default value is true. 327 * 328 * @see BnfHeritrixController 329 */ 330 public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost"; 331 332 /** 333 * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br> 334 * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over. 335 */ 336 public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout"; 337 338 /** 339 * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br> 340 * The name used to access the Heritrix GUI. 341 */ 342 public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName"; 343 344 /** 345 * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br> 346 * The password used to access the Heritrix GUI. 347 */ 348 public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword"; 349 350 /** 351 * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br> 352 * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine. 353 * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage. 354 */ 355 public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort"; 356 357 /** 358 * <b>settings.harvester.harvesting.heritrix.jmxPort</b>: <br> 359 * The port that Heritrix 1.14.4 uses to expose its JMX interface. This port must not be used by anything else on the 360 * machine, but does not need to be accessible from other machines unless you want to be able to use jconsole to 361 * access Heritrix directly. Note that apart from pausing a job, modifications done directly on Heritrix may cause 362 * unexpected breakage. Irrelevant for Heritrix 3+ 363 */ 364 public static String HERITRIX_JMX_PORT = "settings.harvester.harvesting.heritrix.jmxPort"; 365 366 /** 367 * <b>settings.harvester.harvesting.heritrix.jmxUsername</b>: <br> 368 * The username used to connect to Heritrix 1.14.4 JMX interface The username must correspond to the value stored in the 369 * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile). 370 * Irrelevant for Heritrix 3+ 371 */ 372 public static String HERITRIX_JMX_USERNAME = "settings.harvester.harvesting.heritrix.jmxUsername"; 373 374 /** 375 * <b>settings.harvester.harvesting.heritrix.jmxPassword</b>: <br> 376 * The password used to connect to Heritrix JMX interface The password must correspond to the value stored in the 377 * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile). 378 * Irrelevant for Heritrix 3+ 379 */ 380 public static String HERITRIX_JMX_PASSWORD = "settings.harvester.harvesting.heritrix.jmxPassword"; 381 382 /** 383 * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br> 384 * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in 385 * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc. 386 */ 387 public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize"; 388 389 /** 390 * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br> 391 * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option. 392 */ 393 public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts"; 394 395 /** 396 * <b>settings.harvester.harvesting.heritrixControllerClass</b>:<br/> 397 * The implementation of the HeritrixController interface to be used. 398 */ 399 public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class"; 400 401 /** 402 * <b>settings.harvester.harvesting.heritrixLauncherClass</b>:<br/> 403 * The implementation of the HeritrixLauncher abstract class to be used. 404 */ 405 public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class"; 406 407 /** 408 * <b>settings.harvester.harvesting.harvestReport</b>:<br/> 409 * The implementation of {@link HarvestReport} interface to be used. 410 */ 411 public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class"; 412 413 /** 414 * <b>settings.harvester.harvesting.harvestReport.disregardSeedsURLInfo</b>:<br/> 415 * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL 416 * instead of the seed url domain? The default is false; 417 */ 418 public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo"; 419 420 /** 421 * <b>settings.harvester.harvesting.deduplication.enabled</b>:<br/> 422 * This setting tells the system whether or not to use deduplication. This setting is true by default. 423 */ 424 public static String DEDUPLICATION_ENABLED = "settings.harvester.harvesting.deduplication.enabled"; 425 426 /** 427 * <b>settings.harvester.harvesting.metadata.heritrixFilePattern</b> This setting allows to filter which Heritrix 428 * files should be stored in the metadata (W)ARC file.. 429 * 430 * @see Pattern 431 */ 432 public static String METADATA_HERITRIX_FILE_PATTERN = "settings.harvester.harvesting.metadata.heritrixFilePattern"; 433 434 /** 435 * <b>settings.harvester.harvesting.metadata.reportFilePattern</b> This setting allows to filter which Heritrix 436 * files that should be stored in the metadata (W)ARC file are to be classified as a report. 437 * 438 * @see Pattern 439 */ 440 public static String METADATA_REPORT_FILE_PATTERN = "settings.harvester.harvesting.metadata.reportFilePattern"; 441 442 /** 443 * <b>settings.harvester.harvesting.metadata.logFilePattern</b> This setting allows to filter which Heritrix log 444 * files should be stored in the metadata (W)ARC file. 445 * 446 * @see Pattern 447 */ 448 public static String METADATA_LOG_FILE_PATTERN = "settings.harvester.harvesting.metadata.logFilePattern"; 449 450 /** 451 * <b>settings.harvester.harvesting.metadata.generateArchiveFilesReport</b> This setting is a boolean flag that 452 * enables/disables the generation of an ARC/WARC files report. Default value is 'true'. 453 * 454 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 455 */ 456 public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate"; 457 458 /** 459 * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If 460 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file. 461 * Default value is 'archivefiles-report.txt'. 462 * 463 * FIXME: not easily portable to H3, as it depends on information in heritrix_out.log no longer available. 464 * 465 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 466 */ 467 public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName"; 468 469 /** 470 * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If 471 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file. 472 * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Opened] [Closed] [Size]'. 473 * 474 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 475 */ 476 public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader"; 477 478 /** 479 * <b>settings.harvester.aliases.timeout</b> The amount of time in seconds before an alias times out, and needs to 480 * be re-evaluated. The default value is one year, i.e 31536000 seconds. 481 */ 482 public static String ALIAS_TIMEOUT = "settings.harvester.aliases.timeout"; 483 484 /** 485 * <b>settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled</b>:</br> Setting for whether or not a 486 * restarted job should try fetching the recoverlog of the previous failed job, and ask Heritrix to continue from 487 * this log. The default is false. 488 */ 489 public static String RECOVERlOG_CONTINUATION_ENABLED = "settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled"; 490 491 /** 492 * <b>settings.harvester.harvesting.metadata.metadataFormat</b> The dataformat used by Netarchivesuite to write the 493 * metadata associated with a given harvest job. default: arc (alternative: warc) 494 */ 495 public static String METADATA_FORMAT = "settings.harvester.harvesting.metadata.metadataFormat"; 496 497 /** 498 * <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the 499 * harvested data. default: warc (alternative: arc) 500 */ 501 public static String HERITRIX_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix.archiveFormat"; 502 /** 503 * <b>settings.harvester.harvesting.heritrix.archiveNaming.class</b> The class implementing the chosen way of naming 504 * your archive-files default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix" 505 * property of the org.archive.crawler.writer.ARCWriterProcessor and/or 506 * org.archive.crawler.writer.WARCWriterProcessor. 507 */ 508 public static String HERITRIX_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix.archiveNaming.class"; 509 510 /** 511 * <b>settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests</b> Represents the 'skip-identical-digests' 512 * setting in the Heritrix WARCWriterProcessor. The default is false. 513 */ 514 public static String HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests"; 515 /** 516 * <b>settings.harvester.harvesting.heritrix.warc.writeRequests</b> Represents the 'write-requests' setting in the 517 * Heritrix WARCWriterProcessor. The default is true 518 */ 519 public static String HERITRIX_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix.warc.writeRequests"; 520 /** 521 * <b>settings.harvester.harvesting.heritrix.warc.writeMetadata</b> Represents the 'write-metadata' setting in the 522 * Heritrix WARCWriterProcessor. The default is false. 523 */ 524 public static String HERITRIX_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix.warc.writeMetadata"; 525 /** 526 * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests</b> Represents the 527 * 'write-revisit-for-identical-digests' setting in the Heritrix WARCWriterProcessor. The default is false. 528 */ 529 public static String HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests"; 530 /** 531 * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified</b> Represents the 532 * 'write-revisit-for-not-modified' setting in the Heritrix WARCWriterProcessor. The default is true. 533 */ 534 public static String HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED = "settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified"; 535 536 /** 537 * <b>settings.harvester.harvesting.heritrix.version</b> Represents the version of Heritrix used by Netarchivesuite 538 * The default is h3. The optional value is h1. 539 * 540 * If h1 is chosen, we assume that our templates is h1, as well. 541 * If h3 is chosen, we assume that our templates is h3, as well. 542 * There is no attempt at migration from one to the other. This must be done by an commandline-tool. 543 */ 544 public static String HERITRIX_VERSION = "settings.harvester.harvesting.heritrix.version"; 545 /** 546 * <b>settings.harvester.harvesting.heritrix.bundle</b>Points to the Heritrix3 zipfile bundled with 547 * netarchiveSuite classes. Currently no default value 548 */ 549 public static String HERITRIX3_BUNDLE = "settings.harvester.harvesting.heritrix.bundle"; 550 551 public static String HERITRIX3_CERTIFICATE = "settings.harvester.harvesting.heritrix.certificate"; 552 553 public static String HERITRIX3_CERTIFICATE_PASSWORD = "settings.harvester.harvesting.heritrix.certificatePassword"; 554 555 /** 556 * <b>settings.harvester.performer</b>: <br> 557 * The agent performing these harvests. The default is: "" 558 */ 559 public static String PERFORMER = "settings.harvester.performer"; 560 561 /***************************/ 562 /* Indexserver - settings. */ 563 /***************************/ 564 565 /** 566 * <b>settings.harvester.indexserver.requestdir</b>: <br> 567 * Setting for where the requests of the indexserver are stored. 568 */ 569 public static String INDEXSERVER_INDEXING_REQUESTDIR = "settings.harvester.indexserver.requestdir"; 570 571 /** 572 * <b>settings.harvester.indexserver.maxclients</b>: <br> 573 * Setting for the max number of clients the indexserver can handle simultaneously. 574 */ 575 public static String INDEXSERVER_INDEXING_MAXCLIENTS = "settings.harvester.indexserver.maxclients"; 576 577 /** 578 * <b>settings.harvester.indexserver.maxthreads</b>: <br> 579 * Setting for the max number of threads the deduplication indexer shall use. 580 */ 581 public static String INDEXSERVER_INDEXING_MAXTHREADS = "settings.harvester.indexserver.maxthreads"; 582 /** 583 * <b>settings.harvester.indexserver.checkinterval</b>: <br> 584 * Setting for the time in milliseconds between each check of the state of sub-indexing. Default: 30 seconds (30000 585 * milliseconds). 586 */ 587 public static String INDEXSERVER_INDEXING_CHECKINTERVAL = "settings.harvester.indexserver.checkinterval"; 588 589 /** 590 * <b>settings.harvester.indexserver.indexingtimeout</b>: <br> 591 * Setting for the indexing timeout in milliseconds. The default is 259200000 (3 days). 592 */ 593 public static String INDEXSERVER_INDEXING_TIMEOUT = "settings.harvester.indexserver.indexingtimeout"; 594 595 /** 596 * <b>settings.harvester.indexserver.maxsegments</b>: <br> 597 * Setting for how many segments we will accept in our lucene indices. The default is 15. 598 */ 599 public static String INDEXSERVER_INDEXING_MAX_SEGMENTS = "settings.harvester.indexserver.maxsegments"; 600 601 /** 602 * <b>settings.harvester.indexserver.listeningcheckinterval</b>: <br> 603 * Setting for the interval between each listening check in milliseconds. The default is 30000 (5 minutes). 604 */ 605 public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval"; 606 /** 607 * <b>settings.archive.indexserver.satisfactorythresholdpercentage</b>: <br> 608 * Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent 609 */ 610 public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage"; 611 612 /** 613 * <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br> 614 * Setting for which type of indexrequestserver to use. The default is: 615 * {@link dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer} 616 */ 617 public static String INDEXREQUEST_SERVER_CLASS = "settings.harvester.indexserver.indexrequestserver.class"; 618 619 /** 620 * b>settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas</b>: <br> 621 * Setting for whether or not data not found in the default bitarchive replica shall be looked for in other 622 * bitarchive replicas. The default is false. 623 */ 624 public static String INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS = "settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas"; 625 626 /***************************/ 627 /* Viewerproxy - settings. */ 628 /***************************/ 629 630 /** 631 * <b>settings.viewerproxy.baseDir</b>: <br> 632 * The main directory for the ViewerProxy, used for storing the Lucene index for the jobs being viewed. This 633 * directory can be used by multiple ViewerProxy applications running on the same machine. 634 */ 635 public static String VIEWERPROXY_DIR = "settings.harvester.viewerproxy.baseDir"; 636 637 /** 638 * <b>settings.viewerproxy.tryLookupUriAsFtp</b>: <br> 639 * If we fail to lookup an URI, we will try changing the protocol to ftp, if this setting is set to true. The 640 * default is false. 641 */ 642 public static String TRY_LOOKUP_URI_AS_FTP = "settings.harvester.viewerproxy.tryLookupUriAsFtp"; 643 644 /** 645 * <b>settings.viewerproxy.maxSizeInBrowser</b> The size (in bytes) of the largest object to be returned for viewing 646 * in the browser window. Larger objects will be returned with the appropriate http header for saving them to a 647 * file. 648 */ 649 public static String MAXIMUM_OBJECT_IN_BROWSER = "settings.harvester.viewerproxy.maxSizeInBrowser"; 650 651 /** 652 * The maximum length (in lines) of crawllog to be displayed in a browser window. 653 */ 654 public static String MAX_CRAWLLOG_IN_BROWSER = "settings.harvester.webinterface.maxCrawlLogInBrowser"; 655 656}