001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester; 024 025import java.util.regex.Pattern; 026 027import dk.netarkivet.common.utils.Settings; 028import dk.netarkivet.harvester.harvesting.distribute.HarvesterReadyMessage; 029import dk.netarkivet.harvester.harvesting.report.HarvestReport; 030 031/** Settings specific to the harvester module of NetarchiveSuite. */ 032public class HarvesterSettings { 033 034 /** The default place in classpath where the settings file can be found. */ 035 private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/settings.xml"; 036 037 /* 038 * The static initialiser is called when the class is loaded. It will add default values for all settings defined in 039 * this class, by loading them from a settings.xml file in classpath. 040 */ 041 static { 042 Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH); 043 } 044 045 // NOTE: The constants defining setting names below are left non-final on 046 // purpose! 
Otherwise, the static initialiser that loads default values 047 // will not run. 048 049 /** 050 * <b>settings.harvester.datamodel.domain.defaultSeedlist</b>: <br> 051 * Default name of the seedlist to use when new domains are created. 052 */ 053 public static String DEFAULT_SEEDLIST = "settings.harvester.datamodel.domain.defaultSeedlist"; 054 055 /** 056 * <b>settings.harvester.datamodel.domain.validSeedRegex</b>: <br> 057 * Regular expression used to validate a seed within a seedlist. 058 * <p> 059 * Default value accepts all non-empty strings. 060 */ 061 public static String VALID_SEED_REGEX = "settings.harvester.datamodel.domain.validSeedRegex"; 062 063 /** 064 * <b>settings.harvester.datamodel.domain.defaultConfig</b>: <br> 065 * The name of a configuration that is created by default and which is initially used for snapshot harvests. 066 */ 067 public static String DOMAIN_DEFAULT_CONFIG = "settings.harvester.datamodel.domain.defaultConfig"; 068 069 /** 070 * <b>settings.harvester.datamodel.domain.defaultOrderxml</b>: <br> 071 * Name of order xml template used for domains if nothing else is specified. The newly created configurations use 072 * this. This template must exist before harvesting can commence 073 */ 074 public static String DOMAIN_DEFAULT_ORDERXML = "settings.harvester.datamodel.domain.defaultOrderxml"; 075 076 /** 077 * <b>settings.harvester.datamodel.domain.defaultMaxrate</b>: <br> 078 * Default download rate for domain configuration. Not currently enforced. 079 */ 080 public static String DOMAIN_CONFIG_MAXRATE = "settings.harvester.datamodel.domain.defaultMaxrate"; 081 082 /** 083 * <b>settings.harvester.datamodel.domain.defaultMaxbytes</b>: <br> 084 * Default byte limit for domain configuration. 085 */ 086 public static String DOMAIN_CONFIG_MAXBYTES = "settings.harvester.datamodel.domain.defaultMaxbytes"; 087 088 /** 089 * <b>settings.harvester.datamodel.domain.defaultMaxobjects</b>: <br> 090 * Default object limit for domain configuration. 
091 */ 092 public static String DOMAIN_CONFIG_MAXOBJECTS = "settings.harvester.datamodel.domain.defaultMaxobjects"; 093 094 /** 095 * <b>settings.harvester.datamodel.domain.defaultSchedule</b>: <br> 096 * Default schedule for selective harvesting. There is no default value. 097 */ 098 public static String DOMAIN_CONFIG_SCHEDULE = "settings.harvester.datamodel.domain.defaultSchedule"; 099 100 /** 101 * <b>settings.harvester.scheduler.jobGen.config.errorFactorPrevResult</b>: <br> 102 * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines 103 * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the 104 * expectation on a previously completed job. 105 */ 106 public static String ERRORFACTOR_PERMITTED_PREVRESULT = "settings.harvester.scheduler.jobGen.config.errorFactorPrevResult"; 107 108 /** 109 * <b>settings.harvester.scheduler.jobGen.config.errorFactorBestGuess</b>: <br> 110 * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines 111 * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the 112 * expectation on previous uncompleted harvests or no harvest data at all. 113 */ 114 public static String ERRORFACTOR_PERMITTED_BESTGUESS = "settings.harvester.scheduler.jobGen.config.errorFactorBestGuess"; 115 116 /** 117 * <b>settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject</b>: <br> 118 * How many bytes the average object is expected to be on domains where we don't know any better. This number should 119 * grow over time, as of end of 2005 empirical data shows 38000. 
120 */ 121 public static String EXPECTED_AVERAGE_BYTES_PER_OBJECT = "settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject"; 122 123 /** 124 * <b>settings.harvester.scheduler.jobGen.config.maxDomainSize</b>: <br> 125 * The initial guess of the domain size (number of objects) of an unknown domain. 126 */ 127 public static String MAX_DOMAIN_SIZE = "settings.harvester.scheduler.jobGen.config.maxDomainSize"; 128 129 /** 130 * <b>settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference</b>: <br> 131 * The maximum allowed relative difference in expected number of objects retrieved in a single job definition. To 132 * avoid job splitting, set the value as Long.MAX_VALUE. 133 */ 134 public static String JOBS_MAX_RELATIVE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference"; 135 136 /** 137 * <b>settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference</b>: <br> 138 * Size differences for jobs below this threshold are ignored, regardless of the limits for the relative size 139 * difference. To avoid job splitting, set the value as Long.MAX_VALUE. 140 */ 141 public static String JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference"; 142 143 /** 144 * <b>settings.harvester.scheduler.jobGen.config.maxTotalSize</b>: <br> 145 * When this limit is exceeded no more configurations may be added to a job. To avoid job splitting, set the value 146 * as Long.MAX_VALUE. 147 */ 148 public static String JOBS_MAX_TOTAL_JOBSIZE = "settings.harvester.scheduler.jobGen.config.maxTotalSize"; 149 150 /** 151 * <b>settings.harvester.scheduler.jobGen.maxTimeToCompleteJob</b>: <br> 152 * The limit on how many seconds Heritrix should continue on each job. 0 means no limit. 
153 */ 154 public static String JOBS_MAX_TIME_TO_COMPLETE = "settings.harvester.scheduler.jobGen.maxTimeToCompleteJob"; 155 156 /** 157 * <b>settings.harvester.scheduler.jobGen.domainConfigSubsetSize</b>: <br> 158 * How many domain configurations we will process in one go before making jobs out of them. This number of domains 159 * will be stored in memory at the same time. To avoid job splitting, set this value as Long.MAX_VALUE. 160 */ 161 public static String JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE = "settings.harvester.scheduler.jobGen.domainConfigSubsetSize"; 162 163 /** 164 * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused</b>: <br> 165 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the 166 * maximum number of domain configurations in a partial harvest job. 167 */ 168 public static String JOBGEN_FIXED_CONFIG_COUNT_FOCUSED = "settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused"; 169 170 /** 171 * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot</b>: <br> 172 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the 173 * maximum number of domain configurations in a full harvest job. 174 */ 175 public static String JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT = "settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot"; 176 177 /** 178 * <b>settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget</b>: <br> 179 * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter toggles whether or 180 * not domain configurations with a budget of zero (byte or objects) should be excluded from jobs. The default value 181 * is 'false'. 
182 */ 183 public static String JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET = "settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget"; 184 185 /** 186 * <b>settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel</b>: <br> 187 * If this property is true, then the job generator will postpone job generation for harvest definitions that are 188 * mapped to a harvest channel not registered to at least one harvester. The default value is 'true'. 189 */ 190 public static String JOBGEN_POSTPONE_UNREGISTERED_HARVEST_CHANNEL = "settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel"; 191 192 /** 193 * <b>settings.harvester.scheduler.jobGen.class</b>: <br> 194 * The fully qualified class name of the chosen job generator implementation, currently either 195 * {@link DefaultJobGenerator} or {@link FixedDomainConfigurationCountJobGenerator}. The default is 196 * {@link DefaultJobGenerator}. 197 */ 198 public static String JOBGEN_CLASS = "settings.harvester.scheduler.jobGen.class"; 199 200 /** 201 * <b>settings.harvester.scheduler.jobGen.config.splitByObjectLimit</b>: <br> 202 * By default the byte limit is used as the base criterion for how many domain configurations are put into one 203 * harvest job. However if this parameter is set to "true", then the object limit is used instead as the base 204 * criterion. 205 */ 206 public static String SPLIT_BY_OBJECTLIMIT = "settings.harvester.scheduler.jobGen.config.splitByObjectLimit"; 207 208 /** 209 * <b>settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer</b>: <br> 210 * Controls whether the domain configuration object limit should be set in Heritrix's crawl order through the 211 * QuotaEnforcer configuration (parameter set to true) or through the frontier parameter 'queue-total-budget' ( 212 * parameter set to false). 213 * <p> 214 * Default value is true, as legacy implementation was to use only the QuotaEnforcer. 
215 */ 216 public static String OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER = "settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer"; 217 218 /** 219 * <b>settings.harvester.scheduler.jobtimeouttime</b>:<br /> 220 * Time before a STARTED job times out and change status to FAILED. In seconds. 221 */ 222 public static String JOB_TIMEOUT_TIME = "settings.harvester.scheduler.jobtimeouttime"; 223 224 /** 225 * <b>settings.harvester.scheduler.jobgenerationperiode</b>: <br> 226 * The period between checking if new jobs should be generated, in seconds. This is one minute because that's the 227 * finest we can define in a harvest definition. 228 */ 229 public static String GENERATE_JOBS_PERIOD = "settings.harvester.scheduler.jobgenerationperiode"; 230 231 /** 232 * <b>settings.harvester.harvesting.serverDir</b>: <br> 233 * Each job gets a subdir of this dir. Job data is written and Heritrix writes to that subdir. 234 */ 235 public static String HARVEST_CONTROLLER_SERVERDIR = "settings.harvester.harvesting.serverDir"; 236 237 /** 238 * <b>settings.harvester.harvesting.minSpaceLeft</b>: <br> 239 * The minimum amount of free bytes in the serverDir required before accepting any harvest-jobs. 240 */ 241 public static String HARVEST_SERVERDIR_MINSPACE = "settings.harvester.harvesting.minSpaceLeft"; 242 243 /** 244 * <b>settings.harvester.harvesting.oldjobsDir</b>: <br> 245 * The directory in which data from old jobs is kept after uploading. Each directory from serverDir will be moved to 246 * here if any data remains, either due to failed uploads or because it wasn't attempted uploaded. 247 */ 248 public static String HARVEST_CONTROLLER_OLDJOBSDIR = "settings.harvester.harvesting.oldjobsDir"; 249 250 /** 251 * <b>settings.harvester.harvesting.channel</b>: <br> 252 * Harvest channel to take jobs from. This is the default channel assigned to the harvest controller. 
253 * 254 * @see dk.netarkivet.harvester.datamodel.HarvestChannel <p> 255 * NOTE: this one is also used in SingleMBeanObject parsing information to System state 256 */ 257 public static String HARVEST_CONTROLLER_CHANNEL = "settings.harvester.harvesting.channel"; 258 259 /** 260 * <b>settings.harvester.harvesting.heritrix.inactivityTimeout</b>: <br> 261 * The timeout setting for aborting a crawl based on crawler-inactivity. If the crawler is inactive for this number 262 * of seconds the crawl will be aborted. The inactivity is measured on the crawlController.activeToeCount(). 263 */ 264 public static String INACTIVITY_TIMEOUT_IN_SECS = "settings.harvester.harvesting.heritrix.inactivityTimeout"; 265 266 /** 267 * <b>settings.harvester.harvesting.heritrix.noresponseTimeout</b>: <br> 268 * The timeout value (in seconds) used in HeritrixLauncher for aborting a crawl when no bytes are being received from 269 * web servers. 270 */ 271 public static String CRAWLER_TIMEOUT_NON_RESPONDING = "settings.harvester.harvesting.heritrix.noresponseTimeout"; 272 /** 273 * <b>settings.harvester.monitor.refreshInterval</b>:<br> 274 * Time interval in seconds after which the harvest monitor pages will be automatically refreshed. 275 */ 276 public static String HARVEST_MONITOR_REFRESH_INTERVAL = "settings.harvester.monitor.refreshInterval"; 277 278 /** 279 * <b>settings.harvester.monitor.historySampleRate</b>:<br> 280 * Time interval in seconds between historical records being stored in the DB. Default value is 5 minutes. 281 */ 282 public static String HARVEST_MONITOR_HISTORY_SAMPLE_RATE = "settings.harvester.monitor.historySampleRate"; 283 284 /** 285 * <b>settings.harvester.monitor.historyChartGenInterval</b>:<br> 286 * Time interval in seconds between regenerating the chart of historical data for a running job. Default value is 5 287 * minutes. 
288 */ 289 public static String HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL = "settings.harvester.monitor.historyChartGenInterval"; 290 291 /** 292 * <b>settings.harvester.monitor.displayedHistorySize</b>:<br> 293 * Maximum number of most recent history records displayed on the running job details page. 294 */ 295 public static String HARVEST_MONITOR_DISPLAYED_HISTORY_SIZE = "settings.harvester.monitor.displayedHistorySize"; 296 297 /** 298 * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br> 299 * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds. 300 * 301 * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 302 * settings.harvester.harvesting.controller. 303 */ 304 public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime"; 305 306 /** 307 * <b>settings.harvester.harvesting.sendReadyInterval</b>:<br> 308 * Time interval in seconds to wait before transmitting a {@link HarvesterReadyMessage} to the {@link JobDispatcher}. 309 * 310 * 311 * <p> 312 * Lower values will make the JobDispatcher detect ready harvesters faster, but will make it more likely that the 313 * harvester may send two ready messages before a job is received, causing the JobDispatcher to dispatch two jobs. 314 * <p> 315 * Default value is 30 seconds. 316 */ 317 public static String SEND_READY_INTERVAL = "settings.harvester.harvesting.sendReadyInterval"; 318 319 /** 320 * <b>settings.harvester.harvesting.sendReadyDelay</b>:<br> 321 * Time in milliseconds to wait from starting to listen on the job queue to a potential ready message is sent to the 322 * HarvestJobManager. This small delay is used to retrieve any left over jobs on the queue before sending the ready 323 * message to the harvester. Default value is 1000 millisecond. 
324 */ 325 public static String SEND_READY_DELAY = "settings.harvester.harvesting.sendReadyDelay"; 326 327 /** 328 * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br> 329 * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600 330 * seconds (10 min). 331 */ 332 public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime"; 333 334 /** 335 * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report. 336 * The default class is {@link TopTotalEnqueuesFilter}. 337 */ 338 public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class"; 339 340 /** 341 * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments 342 * should be separated by semicolons. 343 */ 344 public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args"; 345 346 /** 347 * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br> 348 * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is 349 * lost. If set to false it will only log a warning, leaving the crawl operator to shut down the harvester manually. 350 * Default value is true. 351 * 352 * @see BnfHeritrixController 353 */ 354 public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost"; 355 356 /** 357 * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br> 358 * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over. 
359 */ 360 public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout"; 361 362 /** 363 * <b>settings.harvester.harvesting.heritrix</b>: <br> 364 * The path to the Heritrix SETTINGS. 365 */ 366 public static String HERITRIX = "settings.harvester.harvesting.heritrix"; 367 368 /** 369 * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br> 370 * The name used to access the Heritrix GUI. 371 */ 372 public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName"; 373 374 /** 375 * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br> 376 * The password used to access the Heritrix GUI. 377 */ 378 public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword"; 379 380 /** 381 * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br> 382 * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine. 383 * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage. 384 */ 385 public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort"; 386 387 /** 388 * <b>settings.harvester.harvesting.heritrix.jmxPort</b>: <br> 389 * The port that Heritrix 1.14.4 uses to expose its JMX interface. This port must not be used by anything else on the 390 * machine, but does not need to be accessible from other machines unless you want to be able to use jconsole to 391 * access Heritrix directly. Note that apart from pausing a job, modifications done directly on Heritrix may cause 392 * unexpected breakage. 
Irrelevant for Heritrix 3+ 393 */ 394 public static String HERITRIX_JMX_PORT = "settings.harvester.harvesting.heritrix.jmxPort"; 395 396 /** 397 * <b>settings.harvester.harvesting.heritrix.jmxUsername</b>: <br> 398 * The username used to connect to Heritrix 1.14.4 JMX interface. The username must correspond to the value stored in the 399 * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile). 400 * Irrelevant for Heritrix 3+. 401 */ 402 public static String HERITRIX_JMX_USERNAME = "settings.harvester.harvesting.heritrix.jmxUsername"; 403 404 /** 405 * <b>settings.harvester.harvesting.heritrix.jmxPassword</b>: <br> 406 * The password used to connect to Heritrix JMX interface. The password must correspond to the value stored in the 407 * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile). 408 * Irrelevant for Heritrix 3+. 409 */ 410 public static String HERITRIX_JMX_PASSWORD = "settings.harvester.harvesting.heritrix.jmxPassword"; 411 412 /** 413 * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br> 414 * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in 415 * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc. 416 */ 417 public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize"; 418 419 /** 420 * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br> 421 * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option. 422 */ 423 public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts"; 424 425 /** 426 * <b>settings.harvester.harvesting.heritrixController.class</b>:<br/> 427 * The implementation of the HeritrixController interface to be used. 
428 */ 429 public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class"; 430 431 /** 432 * <b>settings.harvester.harvesting.heritrixLauncher.class</b>:<br/> 433 * The implementation of the HeritrixLauncher abstract class to be used. 434 */ 435 public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class"; 436 437 /** 438 * <b>settings.harvester.harvesting.harvestReport.class</b>:<br/> 439 * The implementation of the {@link HarvestReport} interface to be used. 440 */ 441 public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class"; 442 443 /** 444 * <b>settings.harvester.harvesting.harvestReport.disregardSeedURLInfo</b>:<br/> 445 * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL 446 * instead of the seed url domain? The default is false. 447 */ 448 public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo"; 449 450 /** 451 * <b>settings.harvester.harvesting.deduplication.enabled</b>:<br/> 452 * This setting tells the system whether or not to use deduplication. This setting is true by default. 453 */ 454 public static String DEDUPLICATION_ENABLED = "settings.harvester.harvesting.deduplication.enabled"; 455 456 /** 457 * <b>settings.harvester.harvesting.metadata.heritrixFilePattern</b> This setting allows to filter which Heritrix 458 * files should be stored in the metadata (W)ARC file. 459 * 460 * @see Pattern 461 */ 462 public static String METADATA_HERITRIX_FILE_PATTERN = "settings.harvester.harvesting.metadata.heritrixFilePattern"; 463 464 /** 465 * <b>settings.harvester.harvesting.metadata.reportFilePattern</b> This setting allows to filter which Heritrix 466 * files that should be stored in the metadata (W)ARC file are to be classified as a report. 
467 * 468 * @see Pattern 469 */ 470 public static String METADATA_REPORT_FILE_PATTERN = "settings.harvester.harvesting.metadata.reportFilePattern"; 471 472 /** 473 * <b>settings.harvester.harvesting.metadata.logFilePattern</b> This setting allows to filter which Heritrix log 474 * files should be stored in the metadata (W)ARC file. 475 * 476 * @see Pattern 477 */ 478 public static String METADATA_LOG_FILE_PATTERN = "settings.harvester.harvesting.metadata.logFilePattern"; 479 480 /** 481 * <b>settings.harvester.harvesting.metadata.archiveFilesReport.generate</b> This setting is a boolean flag that 482 * enables/disables the generation of an ARC/WARC files report. Default value is 'true'. 483 * 484 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 485 */ 486 public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate"; 487 488 /** 489 * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileName</b> If 490 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file. 491 * Default value is 'archivefiles-report.txt'. 492 * 493 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 494 */ 495 public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName"; 496 497 /** 498 * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader</b> If 499 * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file. 500 * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Closed] [Size]'. 
501 * 502 * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles) 503 */ 504 public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader"; 505 506 /** 507 * <b>settings.harvester.harvesting.metadata.filename.versionnumber</b>: <br> The version number which goes in metadata file names like 12345-metadata-&lt;version number&gt;.warc.gz 508 */ 509 public static String METADATA_FILE_VERSION_NUMBER = "settings.harvester.harvesting.metadata.filename.versionnumber"; 510 511 /** 512 * <b>settings.harvester.aliases.timeout</b> The amount of time in seconds before an alias times out, and needs to 513 * be re-evaluated. The default value is one year, i.e. 31536000 seconds. 514 */ 515 public static String ALIAS_TIMEOUT = "settings.harvester.aliases.timeout"; 516 517 /** 518 * <b>settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled</b>:<br> Setting for whether or not a 519 * restarted job should try fetching the recoverlog of the previous failed job, and ask Heritrix to continue from 520 * this log. The default is false. 521 */ 522 public static String RECOVERlOG_CONTINUATION_ENABLED = "settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled"; 523 524 /** 525 * <b>settings.harvester.harvesting.metadata.metadataFormat</b> The dataformat used by Netarchivesuite to write the 526 * metadata associated with a given harvest job. 
default: arc (alternative: warc) 527 */ 528 public static String METADATA_FORMAT = "settings.harvester.harvesting.metadata.metadataFormat"; 529 530 /** 531 * <b>settings.harvester.harvesting.metadata.metadataFileNameFormat</b> The format of the name of the metadata file : 532 * By default, it will be jobID-metadata.1.extension for example 3161-metadata-1.warc 533 * If the value is "prefix", it will be named like a warc file : Prefix-61-3161-metadata-1.warc 534 * default value : default (alternative: prefix) 535 */ 536 public static String METADATA_FILENAME_FORMAT = "settings.harvester.harvesting.metadata.metadataFileNameFormat"; 537 538 /** 539 * <b>settings.harvester.harvesting.metadata.compression</b> Do we compress the 540 * metadata associated with a given harvest job. 541 * default: false 542 */ 543 public static String METADATA_COMPRESSION = "settings.harvester.harvesting.metadata.compression"; 544 545 /** 546 * <b>settings.harvester.harvesting.heritrix.archiveNaming.collectionName</b> 547 * prefix for archive file 548 * if METADATA_FILENAME_FORMAT is "prefix", then check of a collection name to prefix metadata filename 549 */ 550 public static String HERITRIX_PREFIX_COLLECTION_NAME = "settings.harvester.harvesting.heritrix.archiveNaming.collectionName"; 551 552 /** 553 * <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the 554 * harvested data. default: warc (alternative: arc) 555 */ 556 public static String HERITRIX_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix.archiveFormat"; 557 /** 558 * <b>settings.harvester.harvesting.heritrix.archiveNaming.class</b> The class implementing the chosen way of naming 559 * your archive-files default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix" 560 * property of the org.archive.crawler.writer.ARCWriterProcessor and/or 561 * org.archive.crawler.writer.WARCWriterProcessor. 
562 */ 563 public static String HERITRIX_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix.archiveNaming.class"; 564 565 /** 566 * <b>settings.harvester.harvesting.heritrix.warc.warcParametersOverride</b> This parameter defines NAS behaviour 567 * regarding warc parameters (write request, write metadata, etc.): if this parameter is true, the warc parameters 568 * defined in harvester templates are not considered. The default is true. 569 */ 570 public static String HERITRIX_WARC_PARAMETERS_OVERRIDE = "settings.harvester.harvesting.heritrix.warc.warcParametersOverride"; 571 572 /** 573 * <b>settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests</b> Represents the 'skip-identical-digests' 574 * setting in the Heritrix WARCWriterProcessor. The default is false. 575 */ 576 public static String HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests"; 577 /** 578 * <b>settings.harvester.harvesting.heritrix.warc.writeRequests</b> Represents the 'write-requests' setting in the 579 * Heritrix WARCWriterProcessor. The default is true. 580 */ 581 public static String HERITRIX_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix.warc.writeRequests"; 582 /** 583 * <b>settings.harvester.harvesting.heritrix.warc.writeMetadata</b> Represents the 'write-metadata' setting in the 584 * Heritrix WARCWriterProcessor. The default is true. 585 */ 586 public static String HERITRIX_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix.warc.writeMetadata"; 587 /** 588 * <b>settings.harvester.harvesting.heritrix.warc.writeMetadataOutlinks</b> Represents the 'write-metadata-outlinks' setting in the Heritrix 589 * WARCWriterProcessor. 
The default is false. 590 */ 591 public static String HERITRIX_WARC_WRITE_METADATA_OUTLINKS = "settings.harvester.harvesting.heritrix.warc.writeMetadataOutlinks"; 592 /** 593 * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests</b> Represents the 594 * 'write-revisit-for-identical-digests' setting in the Heritrix WARCWriterProcessor. The default is true. 595 */ 596 public static String HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests"; 597 /** 598 * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified</b> Represents the 599 * 'write-revisit-for-not-modified' setting in the Heritrix WARCWriterProcessor. The default is true. 600 */ 601 public static String HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED = "settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified"; 602 603 /** 604 * <b>settings.harvester.harvesting.heritrix.warc.startNewFilesOnCheckpoint</b> Represents the 605 * 'startNewFilesOnCheckpoint' setting in the Heritrix WARCWriterProcessor. Only available with H3. The default is true. 606 */ 607 public static String HERITRIX_WARC_START_NEW_FILES_ON_CHECKPOINT 608 = "settings.harvester.harvesting.heritrix.warc.startNewFilesOnCheckpoint"; 609 610 /** 611 * Currently UNUSED. 612 * <b>settings.harvester.harvesting.heritrix.version</b> Represents the version of Heritrix used by Netarchivesuite. 613 * The default is h3. The optional value is h1. 614 * 615 * 616 * If h1 is chosen, we assume that our templates are h1, as well. 617 * If h3 is chosen, we assume that our templates are h3, as well. 618 * There is no attempt at migration from one to the other. This must be done by a commandline-tool. 619 */ 620 public static String HERITRIX_VERSION = "settings.harvester.harvesting.heritrix.version"; 621 622 623 /** 624 * <b>settings.harvester.performer</b>: <br> 625 * The agent performing these harvests. 
The default is: "" 626 */ 627 public static String PERFORMER = "settings.harvester.performer"; 628 629 /***************************/ 630 /* Indexserver - settings. */ 631 /***************************/ 632 633 /** 634 * <b>settings.harvester.indexserver.requestdir</b>: <br> 635 * Setting for the directory where the requests of the indexserver are stored. 636 */ 637 public static String INDEXSERVER_INDEXING_REQUESTDIR = "settings.harvester.indexserver.requestdir"; 638 639 /** 640 * <b>settings.harvester.indexserver.maxclients</b>: <br> 641 * Setting for the maximum number of clients the indexserver can handle simultaneously. 642 */ 643 public static String INDEXSERVER_INDEXING_MAXCLIENTS = "settings.harvester.indexserver.maxclients"; 644 645 /** 646 * <b>settings.harvester.indexserver.maxthreads</b>: <br> 647 * Setting for the maximum number of threads the deduplication indexer shall use. 648 */ 649 public static String INDEXSERVER_INDEXING_MAXTHREADS = "settings.harvester.indexserver.maxthreads"; 650 /** 651 * <b>settings.harvester.indexserver.checkinterval</b>: <br> 652 * Setting for the time in milliseconds between each check of the state of sub-indexing. Default: 30 seconds (30000 653 * milliseconds). 654 */ 655 public static String INDEXSERVER_INDEXING_CHECKINTERVAL = "settings.harvester.indexserver.checkinterval"; 656 657 /** 658 * <b>settings.harvester.indexserver.indexingtimeout</b>: <br> 659 * Setting for the indexing timeout in milliseconds. The default is 259200000 (3 days). 660 */ 661 public static String INDEXSERVER_INDEXING_TIMEOUT = "settings.harvester.indexserver.indexingtimeout"; 662 663 /** 664 * <b>settings.harvester.indexserver.maxsegments</b>: <br> 665 * Setting for how many segments we will accept in our Lucene indices. The default is 15.
666 */ 667 public static String INDEXSERVER_INDEXING_MAX_SEGMENTS = "settings.harvester.indexserver.maxsegments"; 668 669 /** 670 * <b>settings.harvester.indexserver.listeningcheckinterval</b>: <br> 671 * Setting for the interval between each listening check in milliseconds. The default is documented as 30000 (5 minutes). NOTE(review): 30000 ms is 30 seconds and 5 minutes is 300000 ms; confirm the actual default in settings.xml. 672 */ 673 public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval"; 674 /** 675 * <b>settings.harvester.indexserver.satisfactorythresholdpercentage</b>: <br> 676 * Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent. 677 */ 678 public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage"; 679 680 /** 681 * <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br> 682 * Setting for which type of indexrequestserver to use. The default is: 683 * {@link dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer} 684 */ 685 public static String INDEXREQUEST_SERVER_CLASS = "settings.harvester.indexserver.indexrequestserver.class"; 686 687 /** 688 * <b>settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas</b>: <br> 689 * Setting for whether or not data not found in the default bitarchive replica shall be looked for in other 690 * bitarchive replicas. The default is false. 691 */ 692 public static String INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS = "settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas"; 693 694 /***************************/ 695 /* Viewerproxy - settings. */ 696 /***************************/ 697 698 /** 699 * <b>settings.harvester.viewerproxy.baseDir</b>: <br> 700 * The main directory for the ViewerProxy, used for storing the Lucene index for the jobs being viewed. This 701 * directory can be used by multiple ViewerProxy applications running on the same machine.
702 */ 703 public static String VIEWERPROXY_DIR = "settings.harvester.viewerproxy.baseDir"; 704 705 /** 706 * <b>settings.harvester.viewerproxy.tryLookupUriAsFtp</b>: <br> 707 * If we fail to look up a URI, we will try changing the protocol to ftp, if this setting is set to true. The 708 * default is false. 709 */ 710 public static String TRY_LOOKUP_URI_AS_FTP = "settings.harvester.viewerproxy.tryLookupUriAsFtp"; 711 712 /** 713 * <b>settings.harvester.viewerproxy.maxSizeInBrowser</b>: <br> The size (in bytes) of the largest object to be returned for viewing 714 * in the browser window. Larger objects will be returned with the appropriate http header for saving them to a 715 * file. 716 */ 717 public static String MAXIMUM_OBJECT_IN_BROWSER = "settings.harvester.viewerproxy.maxSizeInBrowser"; 718 719 /** 720 * <b>settings.harvester.viewerproxy.allowFileDownloads</b>: <br> If set to false, there will be no links to 721 * allow download of warcfiles via the Viewerproxy GUI. 722 */ 723 public static String ALLOW_FILE_DOWNLOADS = "settings.harvester.viewerproxy.allowFileDownloads"; 724 725 /** 726 * <b>settings.harvester.webinterface.maxCrawlLogInBrowser</b>: <br> The maximum length (in lines) of crawllog to be displayed in a browser window. 727 */ 728 public static String MAX_CRAWLLOG_IN_BROWSER = "settings.harvester.webinterface.maxCrawlLogInBrowser"; 729 730 /** 731 * <b>settings.harvester.harvesting.heritrix3</b>: <br> 732 * The path to the Heritrix3 SETTINGS.
733 */ 734 public static String HERITRIX3 = "settings.harvester.harvesting.heritrix3"; 735 736 /* Heritrix3 ArcWriter settings. */ 737 738 /** <b>settings.harvester.harvesting.heritrix3.arc.compression</b>: <br> Name of the Heritrix3 ArcWriter 'compression' setting. */ public static String HERITRIX3_ARC_COMPRESSION = "settings.harvester.harvesting.heritrix3.arc.compression"; 739 740 /** <b>settings.harvester.harvesting.heritrix3.arc.suffix</b>: <br> Name of the Heritrix3 ArcWriter 'suffix' setting. */ public static String HERITRIX3_ARC_SUFFIX = "settings.harvester.harvesting.heritrix3.arc.suffix"; 741 742 /** <b>settings.harvester.harvesting.heritrix3.arc.maxFileSizeBytes</b>: <br> Name of the Heritrix3 ArcWriter 'maxFileSizeBytes' setting. */ public static String HERITRIX3_ARC_MAXSIZE = "settings.harvester.harvesting.heritrix3.arc.maxFileSizeBytes"; 743 744 /** <b>settings.harvester.harvesting.heritrix3.arc.poolMaxActive</b>: <br> Name of the Heritrix3 ArcWriter 'poolMaxActive' setting. */ public static String HERITRIX3_ARC_POOL_MAXACTIVE = "settings.harvester.harvesting.heritrix3.arc.poolMaxActive"; 745 746 /** <b>settings.harvester.harvesting.heritrix3.arc.skipIdenticalDigests</b>: <br> Name of the Heritrix3 ArcWriter 'skipIdenticalDigests' setting. */ public static String HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix3.arc.skipIdenticalDigests"; 747 748 /** 749 * <b>settings.harvester.harvesting.heritrix3.warc.template</b>: <br> 750 * The template for warcfiles created by Heritrix. 751 * Default value in NAS: ${prefix}-${timestamp17}-${serialno}-${heritrix.hostname} 752 * Default value in H3: ${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port} 753 */ 754 public static String HERITRIX3_WARC_TEMPLATE = "settings.harvester.harvesting.heritrix3.warc.template"; 755 756 /** <b>settings.harvester.harvesting.heritrix3.warc.compression</b>: <br> Name of the Heritrix3 WARC 'compression' setting. */ public static String HERITRIX3_WARC_COMPRESSION = "settings.harvester.harvesting.heritrix3.warc.compression"; 757 758 /** <b>settings.harvester.harvesting.heritrix3.warc.poolMaxActive</b>: <br> Name of the Heritrix3 WARC 'poolMaxActive' setting. */ public static String HERITRIX3_WARC_POOL_MAXACTIVE = "settings.harvester.harvesting.heritrix3.warc.poolMaxActive"; 759 760 /** <b>settings.harvester.harvesting.heritrix3.warc.maxFileSizeBytes</b>: <br> Name of the Heritrix3 WARC 'maxFileSizeBytes' setting. */ public static String HERITRIX3_WARC_MAXSIZE = "settings.harvester.harvesting.heritrix3.warc.maxFileSizeBytes"; 761 762 /** <b>settings.harvester.harvesting.heritrix3.warc.writeRequests</b>: <br> Presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_REQUESTS}; TODO confirm. */ public static String HERITRIX3_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix3.warc.writeRequests"; 763 764 /** <b>settings.harvester.harvesting.heritrix3.warc.writeMetadata</b>: <br> Presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_METADATA}; TODO confirm. */ public static String HERITRIX3_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix3.warc.writeMetadata"; 765 766 /** <b>settings.harvester.harvesting.heritrix3.warc.writeMetadataOutlinks</b>: <br> Presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_METADATA_OUTLINKS}; TODO confirm. */ public static String HERITRIX3_WARC_WRITE_METADATA_OUTLINKS = "settings.harvester.harvesting.heritrix3.warc.writeMetadataOutlinks"; 767 768 /** <b>settings.harvester.harvesting.heritrix3.warc.skipIdenticalDigests</b>: <br> Presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS}; TODO confirm. */ public static String HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS
= "settings.harvester.harvesting.heritrix3.warc.skipIdenticalDigests"; 769 770 /** <b>settings.harvester.harvesting.heritrix3.warc.startNewFilesOnCheckpoint</b>: <br> Presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_START_NEW_FILES_ON_CHECKPOINT}; TODO confirm. */ public static String HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT = "settings.harvester.harvesting.heritrix3.warc.startNewFilesOnCheckpoint"; 771 772 /** 773 * <b>settings.harvester.harvesting.heritrix3.archiveFormat</b>: <br> The data format used by Heritrix to write the 774 * harvested data. Default: warc (alternative: arc). 775 */ 776 public static String HERITRIX3_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix3.archiveFormat"; 777 /** 778 * <b>settings.harvester.harvesting.heritrix3.archiveNaming.class</b>: <br> The class implementing the chosen way of naming 779 * your archive-files. Default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix" 780 * property of the org.archive.crawler.writer.ARCWriterProcessor and/or 781 * org.archive.crawler.writer.WARCWriterProcessor. 782 */ 783 public static String HERITRIX3_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix3.archiveNaming.class"; 784 785 /** 786 * <b>settings.harvester.harvesting.heritrix3.warc.warcParametersOverride</b>: <br> This parameter defines NAS behaviour 787 * regarding warc parameters (write request, write metadata, etc.): if this parameter is true, the warc parameters 788 * defined in harvester templates are not considered. The default is true. 789 */ 790 public static String HERITRIX3_WARC_PARAMETERS_OVERRIDE = "settings.harvester.harvesting.heritrix3.warc.warcParametersOverride"; 791 792 /** 793 * <b>settings.harvester.harvesting.heritrix3.bundle</b>: <br> Points to the Heritrix3 zipfile bundled with 794 * NetarchiveSuite classes. Currently no default value. 795 */ 796 public static String HERITRIX3_BUNDLE = "settings.harvester.harvesting.heritrix3.bundle"; 797 798 /** 799 * <b>settings.harvester.harvesting.heritrix3.certificate</b>: <br> Points to the jks keystore to use for connection to the 800 * Heritrix3 rest api. If undefined the keystore provided with the heritrix3 bundler is used.
801 */ 802 public static String HERITRIX3_CERTIFICATE = "settings.harvester.harvesting.heritrix3.certificate"; 803 /** 804 * <b>settings.harvester.harvesting.heritrix3.certificatePassword</b>: <br> The password to use for connection to the 805 * Heritrix3 rest api. 806 */ 807 public static String HERITRIX3_CERTIFICATE_PASSWORD = "settings.harvester.harvesting.heritrix3.certificatePassword"; 808 809 /** <b>settings.harvester.harvesting.monitor.tempPath</b>: <br> Presumably a temporary path used by the Heritrix3 monitor; TODO confirm. NOTE(review): despite the HERITRIX3_ prefix of the constant, this key is not located under settings.harvester.harvesting.heritrix3. */ public static String HERITRIX3_MONITOR_TEMP_PATH = "settings.harvester.harvesting.monitor.tempPath"; 810 811}