Source code

001package dk.netarkivet.harvester.heritrix3;
002
003import java.util.regex.Pattern;
004
005import dk.netarkivet.common.utils.Settings;
006import dk.netarkivet.harvester.harvesting.distribute.HarvesterReadyMessage;
007import dk.netarkivet.harvester.harvesting.frontier.TopTotalEnqueuesFilter;
008import dk.netarkivet.harvester.harvesting.report.HarvestReport;
009
010/**
011 * Settings specific to the heritrix3 harvester module of NetarchiveSuite.
012 */
013public class Heritrix3Settings {
014
015        /** The default place in classpath where the settings file can be found. */
016    private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/heritrix3/settings.xml";
017
018    /*
019     * The static initialiser is called when the class is loaded. It will add default values for all settings defined in
020     * this class, by loading them from a settings.xml file in classpath.
021     */
022    static {
023        Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH);
024    }
025
026    // NOTE: The constants defining setting names below are left non-final on
027    // purpose! Otherwise, the static initialiser that loads default values
028    // will not run.
029
030    /**
031     * <b>settings.harvester.datamodel.domain.defaultSeedlist</b>: <br>
032     * Default name of the seedlist to use when new domains are created.
033     */
034    public static String DEFAULT_SEEDLIST = "settings.harvester.datamodel.domain.defaultSeedlist";
035
036    /**
037     * <b>settings.harvester.datamodel.domain.validSeedRegex</b>: <br>
038     * Regular expression used to validate a seed within a seedlist.
039     * <p>
040     * Default value accepts all non-empty strings.
041     */
042    public static String VALID_SEED_REGEX = "settings.harvester.datamodel.domain.validSeedRegex";
043
044    /**
045     * <b>settings.harvester.datamodel.domain.defaultConfig</b>: <br>
046     * The name of a configuration that is created by default and which is initially used for snapshot harvests.
047     */
048    public static String DOMAIN_DEFAULT_CONFIG = "settings.harvester.datamodel.domain.defaultConfig";
049
050    /**
051     * <b>settings.harvester.datamodel.domain.defaultOrderxml</b>: <br>
052     * Name of order xml template used for domains if nothing else is specified. The newly created configurations use
053     * this. This template must exist before harvesting can commence
054     */
055    public static String DOMAIN_DEFAULT_ORDERXML = "settings.harvester.datamodel.domain.defaultOrderxml";
056
057    /**
058     * <b>settings.harvester.datamodel.domain.defaultMaxrate</b>: <br>
059     * Default download rate for domain configuration. Not currently enforced.
060     */
061    public static String DOMAIN_CONFIG_MAXRATE = "settings.harvester.datamodel.domain.defaultMaxrate";
062
063    /**
064     * <b>settings.harvester.datamodel.domain.defaultMaxbytes</b>: <br>
065     * Default byte limit for domain configuration.
066     */
067    public static String DOMAIN_CONFIG_MAXBYTES = "settings.harvester.datamodel.domain.defaultMaxbytes";
068
069    /**
070     * <b>settings.harvester.datamodel.domain.defaultMaxobjects</b>: <br>
071     * Default object limit for domain configuration.
072     */
073    public static String DOMAIN_CONFIG_MAXOBJECTS = "settings.harvester.datamodel.domain.defaultMaxobjects";
074
075    /**
076     * <b>settings.harvester.scheduler.jobGen.config.errorFactorPrevResult</b>: <br>
077     * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines
078     * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the
079     * expectation on a previous completed job.
080     */
081    public static String ERRORFACTOR_PERMITTED_PREVRESULT = "settings.harvester.scheduler.jobGen.config.errorFactorPrevResult";
082
083    /**
084     * <b>settings.harvester.scheduler.jobGen.config.errorFactorBestGuess</b>: <br>
085     * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines
086     * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the
087     * expectation on previous uncompleted harvests or no harvest data at all.
088     */
089    public static String ERRORFACTOR_PERMITTED_BESTGUESS = "settings.harvester.scheduler.jobGen.config.errorFactorBestGuess";
090
091    /**
092     * <b>settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject</b>: <br>
093     * How many bytes the average object is expected to be on domains where we don't know any better. This number should
094     * grow over time, as of end of 2005 empirical data shows 38000.
095     */
096    public static String EXPECTED_AVERAGE_BYTES_PER_OBJECT = "settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject";
097
098    /**
099     * <b>settings.harvester.scheduler.jobGen.config.maxDomainSize</b>: <br>
100     * The initial guess of the domain size (number of objects) of an unknown domain.
101     */
102    public static String MAX_DOMAIN_SIZE = "settings.harvester.scheduler.jobGen.config.maxDomainSize";
103
104    /**
105     * <b>settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference</b>: <br>
106     * The maximum allowed relative difference in expected number of objects retrieved in a single job definition. To
107     * avoid job splitting, set the value as Long.MAX_VALUE.
108     */
109    public static String JOBS_MAX_RELATIVE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference";
110
111    /**
112     * <b>settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference</b>: <br>
113     * Size differences for jobs below this threshold are ignored, regardless of the limits for the relative size
114     * difference. To avoid job splitting, set the value as Long.MAX_VALUE.
115     */
116    public static String JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference";
117
118    /**
119     * <b>settings.harvester.scheduler.jobGen.config.maxTotalSize</b>: <br>
120     * When this limit is exceeded no more configurations may be added to a job. To avoid job splitting, set the value
121     * as Long.MAX_VALUE.
122     */
123    public static String JOBS_MAX_TOTAL_JOBSIZE = "settings.harvester.scheduler.jobGen.config.maxTotalSize";
124
125    /**
126     * <b>settings.harvester.scheduler.jobGen.maxTimeToCompleteJob</b>: <br>
127     * The limit on how many seconds Heritrix should continue on each job. 0 means no limit.
128     */
129    public static String JOBS_MAX_TIME_TO_COMPLETE = "settings.harvester.scheduler.jobGen.maxTimeToCompleteJob";
130
131    /**
132     * <b>settings.harvester.scheduler.jobGen.domainConfigSubsetSize</b>: <br>
133     * How many domain configurations we will process in one go before making jobs out of them. This amount of domains
134     * will be stored in memory at the same time. To avoid job splitting, set this value as Long.MAX_VALUE.
135     */
136    public static String JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE = "settings.harvester.scheduler.jobGen.domainConfigSubsetSize";
137
138    /**
139     * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused</b>: <br>
140     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the
141     * maximum number of domain configurations in a partial harvest job.
142     */
143    public static String JOBGEN_FIXED_CONFIG_COUNT_FOCUSED = "settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused";
144
145    /**
146     * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot</b>: <br>
147     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the
148     * maximum number of domain configurations in a full harvest job.
149     */
150    public static String JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT = "settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot";
151
152    /**
153     * <b>settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget</b>: <br>
154     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter toggles whether or
155     * not domain configurations with a budget of zero (byte or objects) should be excluded from jobs. The default value
156     * is 'false'.
157     */
158    public static String JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET = "settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget";
159
160    /**
161     * <b>settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel</b>: <br>
162     * If this property is true, then the job generator will postpone job generation for harvest definitions that are
163     * mapped to a harvest channel not registered to at least one harvester. The default value is 'true'.
164     */
165    public static String JOBGEN_POSTPONE_UNREGISTERED_HARVEST_CHANNEL = "settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel";
166
167    /**
168     * <b>settings.harvester.scheduler.jobGen.class</b>: <br>
169     * The fully qualified class name of the chosen job generator implementation, currently either
170     * {@link DefaultJobGenerator} or {@link FixedDomainConfigurationCountJobGenerator}. The default is
171     * {@link DefaultJobGenerator}.
172     */
173    public static String JOBGEN_CLASS = "settings.harvester.scheduler.jobGen.class";
174
175    /**
176     * <b>settings.harvester.scheduler.jobGen.config.splitByObjectLimit</b>: <br>
177     * By default the byte limit is used as the base criterion for how many domain configurations are put into one
178     * harvest job. However if this parameter is set to "true", then the object limit is used instead as the base
179     * criterion.
180     */
181    public static String SPLIT_BY_OBJECTLIMIT = "settings.harvester.scheduler.jobGen.config.splitByObjectLimit";
182
183    /**
184     * <b>settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer</b>: <br>
185     * Controls whether the domain configuration object limit should be set in Heritrix's crawl order through the
186     * QuotaEnforcer configuration (parameter set to true) or through the frontier parameter 'queue-total-budget' (
187     * parameter set to false).
188     * <p>
189     * Default value is true, as legacy implementation was to use only the QuotaEnforcer.
190     */
191    public static String OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER = "settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer";
192
193    /**
194     * <b>settings.harvester.scheduler.jobtimeouttime</b>:<br />
195     * Time before a STARTED job times out and change status to FAILED. In seconds.
196     */
197    public static String JOB_TIMEOUT_TIME = "settings.harvester.scheduler.jobtimeouttime";
198
199    /**
200     * <b>settings.harvester.scheduler.jobgenerationperiode</b>: <br>
201     * The period between checking if new jobs should be generated, in seconds. This is one minute because that's the
202     * finest we can define in a harvest definition.
203     */
204    public static String GENERATE_JOBS_PERIOD = "settings.harvester.scheduler.jobgenerationperiode";
205
206    /**
207     * <b>settings.harvester.harvesting.serverDir</b>: <br>
208     * Each job gets a subdir of this dir. Job data is written and Heritrix writes to that subdir.
209     */
210    public static String HARVEST_CONTROLLER_SERVERDIR = "settings.harvester.harvesting.serverDir";
211
212    /**
213     * <b>settings.harvester.harvesting.minSpaceLeft</b>: <br>
214     * The minimum amount of free bytes in the serverDir required before accepting any harvest-jobs.
215     */
216    public static String HARVEST_SERVERDIR_MINSPACE = "settings.harvester.harvesting.minSpaceLeft";
217
218    /**
219     * <b>settings.harvester.harvesting.oldjobsDir</b>: <br>
220     * The directory in which data from old jobs is kept after uploading. Each directory from serverDir will be moved to
221     * here if any data remains, either due to failed uploads or because it wasn't attempted uploaded.
222     */
223    public static String HARVEST_CONTROLLER_OLDJOBSDIR = "settings.harvester.harvesting.oldjobsDir";
224
225    /**
226     * <b>settings.harvester.harvesting.channel</b>: <br>
227     * Harvest channel to take jobs from. This is the default channel assigned to the harvest controller.
228     *
229     * @see dk.netarkivet.harvester.datamodel.HarvestChannel <p>
230     * NOTE: this one is also used in SingleMBeanObject parsing information to System state
231     */
232    public static String HARVEST_CONTROLLER_CHANNEL = "settings.harvester.harvesting.channel";
233
234    /**
235     * <b>settings.harvester.harvesting.heritrix.inactivityTimeout</b>: <br>
236     * The timeout setting for aborting a crawl based on crawler-inactivity. If the crawler is inactive for this amount
237     * of seconds the crawl will be aborted. The inactivity is measured on the crawlController.activeToeCount().
238     */
239    public static String INACTIVITY_TIMEOUT_IN_SECS = "settings.harvester.harvesting.heritrix.inactivityTimeout";
240
241    /**
242     * <b>settings.harvester.harvesting.heritrix.noresponseTimeout</b>: <br>
243     * The timeout value (in seconds) used in HeritrixLauncher for aborting crawl when no bytes are being received from
244     * web servers.
245     */
246    public static String CRAWLER_TIMEOUT_NON_RESPONDING = "settings.harvester.harvesting.heritrix.noresponseTimeout";
247    /**
248     * <b>settings.harvester.monitor.refreshInterval</b>:<br>
249     * Time interval in seconds after which the harvest monitor pages will be automatically refreshed.
250     */
251    public static String HARVEST_MONITOR_REFRESH_INTERVAL = "settings.harvester.monitor.refreshInterval";
252
253    /**
254     * <b>settings.harvester.monitor.historySampleRate</b>:<br>
255     * Time interval in seconds between historical records stored in the DB. Default value is 5 minutes.
256     */
257    public static String HARVEST_MONITOR_HISTORY_SAMPLE_RATE = "settings.harvester.monitor.historySampleRate";
258
259    /**
260     * <b>settings.harvester.monitor.historyChartGenInterval</b>:<br>
261     * Time interval in seconds between regenerating the chart of historical data for a running job. Default value is 5
262     * minutes.
263     */
264    public static String HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL = "settings.harvester.monitor.historyChartGenInterval";
265
266    /**
267     * <b>settings.harvester.monitor.displayedHistorySize</b>:<br>
268     * Maximum number of most recent history records displayed on the running job details page.
269     */
270    public static String HARVEST_MONITOR_DISPLAYED_HISTORY_SIZE = "settings.harvester.monitor.displayedHistorySize";
271
272    /**
273     * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br>
274     * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds.
275     * 
276     * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 
277     * settings.harvester.harvesting.controller.  
278     * 
279     */
280    public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime";
281    
282    /**
283     * <b>settings.harvester.harvesting.sendReadyInterval</b>:<br>
284     * Time interval in seconds to wait before transmitting a {@link HarvesterReadyMessage} to the {@link JobDispatcher}
285     * .
286     * <p>
287     * <p>
288     * Lower values will make the JobDispatcher detect ready harvester faster, but will make it more likely that the
289     * harvester may send two ready messages before a job is received, causing the JobDispatcher to dispatch two jobs.
290     * <p>
291     * Default value is 30 seconds.
292     */
293    public static String SEND_READY_INTERVAL = "settings.harvester.harvesting.sendReadyInterval";
294
295    /**
296     * <b>settings.harvester.harvesting.sendReadyDelay</b>:<br>
297     * Time in milliseconds to wait from starting to listen on the job queue to a potential ready message is sent to the
298     * HarvestJobManager. This small delay is used to retrieve any left over jobs on the queue before sending the ready
299     * message to the harvester. Default value is 1000 milliseconds.
300     */
301    public static String SEND_READY_DELAY = "settings.harvester.harvesting.sendReadyDelay";
302
303    /**
304     * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br>
305     * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600
306     * seconds (10 min).
307     */
308    public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime";
309
310    /**
311     * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report.
312     * the default class: {@link TopTotalEnqueuesFilter}
313     */
314    public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class";
315
316    /**
317     * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments
318     * should be separated by semicolons.
319     */
320    public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args";
321
322    /**
323     * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br>
324     * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is
325     * lost. If set to false it will only log a warning, leaving the crawl operator to shut down the harvester manually.
326     * Default value is true.
327     *
328     * @see BnfHeritrixController
329     */
330    public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost";
331
332    /**
333     * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br>
334     * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over.
335     */
336    public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout";
337
338    /**
339     * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br>
340     * The name used to access the Heritrix GUI.
341     */
342    public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName";
343
344    /**
345     * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br>
346     * The password used to access the Heritrix GUI.
347     */
348    public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword";
349
350    /**
351     * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br>
352     * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine.
353     * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage.
354     */
355    public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort";
356    
357    /**
358     * <b>settings.harvester.harvesting.heritrix.jmxPort</b>: <br>
359     * The port that Heritrix 1.14.4 uses to expose its JMX interface. This port must not be used by anything else on the
360     * machine, but does not need to be accessible from other machines unless you want to be able to use jconsole to
361     * access Heritrix directly. Note that apart from pausing a job, modifications done directly on Heritrix may cause
362     * unexpected breakage. Irrelevant for Heritrix 3+
363     */
364    public static String HERITRIX_JMX_PORT = "settings.harvester.harvesting.heritrix.jmxPort";
365
366    /**
367     * <b>settings.harvester.harvesting.heritrix.jmxUsername</b>: <br>
368     * The username used to connect to Heritrix 1.14.4 JMX interface. The username must correspond to the value stored in the
369     * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile).
370     * Irrelevant for Heritrix 3+
371     */
372    public static String HERITRIX_JMX_USERNAME = "settings.harvester.harvesting.heritrix.jmxUsername";
373
374    /**
375     * <b>settings.harvester.harvesting.heritrix.jmxPassword</b>: <br>
376     * The password used to connect to Heritrix JMX interface. The password must correspond to the value stored in the
377     * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile).
378     * Irrelevant for Heritrix 3+
379     */
380    public static String HERITRIX_JMX_PASSWORD = "settings.harvester.harvesting.heritrix.jmxPassword";
381
382    /**
383     * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br>
384     * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in
385     * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc.
386     */
387    public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize";
388
389    /**
390     * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br>
391     * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option.
392     */
393    public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts";
394
395    /**
396     * <b>settings.harvester.harvesting.heritrixController.class</b>:<br/>
397     * The implementation of the HeritrixController interface to be used.
398     */
399    public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class";
400
401    /**
402     * <b>settings.harvester.harvesting.heritrixLauncher.class</b>:<br/>
403     * The implementation of the HeritrixLauncher abstract class to be used.
404     */
405    public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class";
406
407    /**
408     * <b>settings.harvester.harvesting.harvestReport.class</b>:<br/>
409     * The implementation of {@link HarvestReport} interface to be used.
410     */
411    public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class";
412
413    /**
414     * <b>settings.harvester.harvesting.harvestReport.disregardSeedURLInfo</b>:<br/>
415     * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL
416     * instead of the seed url domain? The default is false;
417     */
418    public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo";
419
420    /**
421     * <b>settings.harvester.harvesting.deduplication.enabled</b>:<br/>
422     * This setting tells the system whether or not to use deduplication. This setting is true by default.
423     */
424    public static String DEDUPLICATION_ENABLED = "settings.harvester.harvesting.deduplication.enabled";
425
426    /**
427     * <b>settings.harvester.harvesting.metadata.heritrixFilePattern</b> This setting allows to filter which Heritrix
428     * files should be stored in the metadata (W)ARC file.
429     *
430     * @see Pattern
431     */
432    public static String METADATA_HERITRIX_FILE_PATTERN = "settings.harvester.harvesting.metadata.heritrixFilePattern";
433
434    /**
435     * <b>settings.harvester.harvesting.metadata.reportFilePattern</b> This setting allows to filter which Heritrix
436     * files that should be stored in the metadata (W)ARC file are to be classified as a report.
437     *
438     * @see Pattern
439     */
440    public static String METADATA_REPORT_FILE_PATTERN = "settings.harvester.harvesting.metadata.reportFilePattern";
441
442    /**
443     * <b>settings.harvester.harvesting.metadata.logFilePattern</b> This setting allows to filter which Heritrix log
444     * files should be stored in the metadata (W)ARC file.
445     *
446     * @see Pattern
447     */
448    public static String METADATA_LOG_FILE_PATTERN = "settings.harvester.harvesting.metadata.logFilePattern";
449
450    /**
451     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.generate</b> This setting is a boolean flag that
452     * enables/disables the generation of an ARC/WARC files report. Default value is 'true'.
453     *
454     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
455     */
456    public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate";
457
458    /**
459     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileName</b> If
460     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file.
461     * Default value is 'archivefiles-report.txt'.
462     * 
463     * FIXME: not easily portable to H3, as it depends on information in heritrix_out.log no longer available.
464     *
465     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
466     */
467    public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName";
468
469    /**
470     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader</b> If
471     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file.
472     * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Opened] [Closed] [Size]'.
473     *
474     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
475     */
476    public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader";
477
478    /**
479     * <b>settings.harvester.aliases.timeout</b> The amount of time in seconds before an alias times out, and needs to
480     * be re-evaluated. The default value is one year, i.e 31536000 seconds.
481     */
482    public static String ALIAS_TIMEOUT = "settings.harvester.aliases.timeout";
483
484    /**
485     * <b>settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled</b>:<br> Setting for whether or not a
486     * restarted job should try fetching the recoverlog of the previous failed job, and ask Heritrix to continue from
487     * this log. The default is false.
488     */
489    public static String RECOVERlOG_CONTINUATION_ENABLED = "settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled";
490
491    /**
492     * <b>settings.harvester.harvesting.metadata.metadataFormat</b> The dataformat used by Netarchivesuite to write the
493     * metadata associated with a given harvest job. default: arc (alternative: warc)
494     */
495    public static String METADATA_FORMAT = "settings.harvester.harvesting.metadata.metadataFormat";
496
497    /**
498     * <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the
499     * harvested data. default: warc (alternative: arc)
500     */
501    public static String HERITRIX_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix.archiveFormat";
502    /**
503     * <b>settings.harvester.harvesting.heritrix.archiveNaming.class</b> The class implementing the chosen way of naming
504     * your archive-files default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix"
505     * property of the org.archive.crawler.writer.ARCWriterProcessor and/or
506     * org.archive.crawler.writer.WARCWriterProcessor.
507     */
508    public static String HERITRIX_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix.archiveNaming.class";
509
510    /**
511     * <b>settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests</b> Represents the 'skip-identical-digests'
512     * setting in the Heritrix WARCWriterProcessor. The default is false.
513     */
514    public static String HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests";
515    /**
516     * <b>settings.harvester.harvesting.heritrix.warc.writeRequests</b> Represents the 'write-requests' setting in the
517     * Heritrix WARCWriterProcessor. The default is true
518     */
519    public static String HERITRIX_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix.warc.writeRequests";
520    /**
521     * <b>settings.harvester.harvesting.heritrix.warc.writeMetadata</b> Represents the 'write-metadata' setting in the
522     * Heritrix WARCWriterProcessor. The default is false.
523     */
524    public static String HERITRIX_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix.warc.writeMetadata";
525    /**
526     * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests</b> Represents the
527     * 'write-revisit-for-identical-digests' setting in the Heritrix WARCWriterProcessor. The default is false.
528     */
529    public static String HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests";
530    /**
531     * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified</b> Represents the
532     * 'write-revisit-for-not-modified' setting in the Heritrix WARCWriterProcessor. The default is true.
533     */
534    public static String HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED = "settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified";
535
536    /**
537     * <b>settings.harvester.harvesting.heritrix.version</b> Represents the version of Heritrix used by Netarchivesuite 
538     * The default is h3. The optional value is h1.
539     * 
540     * If h1 is chosen, we assume that our templates are h1, as well.
541     * If h3 is chosen, we assume that our templates are h3, as well.
542     * There is no attempt at migration from one to the other. This must be done by a commandline-tool.
543     */
544    public static String HERITRIX_VERSION = "settings.harvester.harvesting.heritrix.version";
545    /**
546     * <b>settings.harvester.harvesting.heritrix.bundle</b> Points to the Heritrix3 zipfile bundled with
547     * netarchiveSuite classes. Currently no default value
548     */         
549    public static String HERITRIX3_BUNDLE = "settings.harvester.harvesting.heritrix.bundle";
550    /** <b>settings.harvester.harvesting.heritrix.certificate</b>: NOTE(review): presumably identifies the certificate/keystore used with Heritrix3; the meaning is not documented here - TODO confirm. */
551    public static String HERITRIX3_CERTIFICATE = "settings.harvester.harvesting.heritrix.certificate";
552    /** <b>settings.harvester.harvesting.heritrix.certificatePassword</b>: NOTE(review): presumably the password protecting that certificate/keystore - TODO confirm. */
553    public static String HERITRIX3_CERTIFICATE_PASSWORD = "settings.harvester.harvesting.heritrix.certificatePassword";
554    
555    /**
556     * <b>settings.harvester.performer</b>: <br>
557     * The agent performing these harvests. The default is: ""
558     */
559    public static String PERFORMER = "settings.harvester.performer";
560
561    /***************************/
562    /* Indexserver - settings. */
563    /***************************/
564
565    /**
566     * <b>settings.harvester.indexserver.requestdir</b>: <br>
567     * Setting for where the requests of the indexserver are stored.
568     */
569    public static String INDEXSERVER_INDEXING_REQUESTDIR = "settings.harvester.indexserver.requestdir";
570
571    /**
572     * <b>settings.harvester.indexserver.maxclients</b>: <br>
573     * Setting for the max number of clients the indexserver can handle simultaneously.
574     */
575    public static String INDEXSERVER_INDEXING_MAXCLIENTS = "settings.harvester.indexserver.maxclients";
576
577    /**
578     * <b>settings.harvester.indexserver.maxthreads</b>: <br>
579     * Setting for the maximum number of threads the deduplication indexer shall use.
580     */
581    public static String INDEXSERVER_INDEXING_MAXTHREADS = "settings.harvester.indexserver.maxthreads";
582    /**
583     * <b>settings.harvester.indexserver.checkinterval</b>: <br>
584     * Setting for the time in milliseconds between each check of the state of sub-indexing. The default is 30000
585     * (30 seconds).
586     */
587    public static String INDEXSERVER_INDEXING_CHECKINTERVAL = "settings.harvester.indexserver.checkinterval";
588
589    /**
590     * <b>settings.harvester.indexserver.indexingtimeout</b>: <br>
591     * Setting for the indexing timeout in milliseconds. The default is 259200000 (3 days, i.e. 3 * 24 * 60 * 60 * 1000).
592     */
593    public static String INDEXSERVER_INDEXING_TIMEOUT = "settings.harvester.indexserver.indexingtimeout";
594
595    /**
596     * <b>settings.harvester.indexserver.maxsegments</b>: <br>
597     * Setting for how many segments we will accept in our Lucene indices. The default is 15.
598     */
599    public static String INDEXSERVER_INDEXING_MAX_SEGMENTS = "settings.harvester.indexserver.maxsegments";
600
601    /**
602     * <b>settings.harvester.indexserver.listeningcheckinterval</b>: <br>
603     * Setting for the interval between each listening check in milliseconds. The default is documented as 30000, which is 30 seconds, not 5 minutes (300000); TODO confirm against settings.xml.
604     */
605    public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval";
606    /**
607     * <b>settings.harvester.indexserver.satisfactorythresholdpercentage</b>: <br>
608     * Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent.
609     */
610    public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage";
611
612    /**
613     * <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br>
614     * Setting for which indexrequestserver class to use. The default is:
615     * {@link dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer}
616     */
617    public static String INDEXREQUEST_SERVER_CLASS = "settings.harvester.indexserver.indexrequestserver.class";
618
619    /**
620     * <b>settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas</b>: <br>
621     * Setting for whether or not data not found in the default bitarchive replica shall be looked for in other
622     * bitarchive replicas. The default is false.
623     */
624    public static String INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS = "settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas";
625
626    /***************************/
627    /* Viewerproxy - settings. */
628    /***************************/
629
630    /**
631     * <b>settings.harvester.viewerproxy.baseDir</b>: <br>
632     * The main directory for the ViewerProxy, used for storing the Lucene index for the jobs being viewed. This
633     * directory can be used by multiple ViewerProxy applications running on the same machine.
634     */
635    public static String VIEWERPROXY_DIR = "settings.harvester.viewerproxy.baseDir";
636
637    /**
638     * <b>settings.harvester.viewerproxy.tryLookupUriAsFtp</b>: <br>
639     * If we fail to look up a URI, we will try changing the protocol to ftp, if this setting is set to true. The
640     * default is false.
641     */
642    public static String TRY_LOOKUP_URI_AS_FTP = "settings.harvester.viewerproxy.tryLookupUriAsFtp";
643
644    /**
645     * <b>settings.harvester.viewerproxy.maxSizeInBrowser</b>: <br>
646     * The size (in bytes) of the largest object to be returned for viewing in the browser window. Larger objects will
647     * be returned with the appropriate HTTP header for saving them to a file.
648     */
649    public static String MAXIMUM_OBJECT_IN_BROWSER = "settings.harvester.viewerproxy.maxSizeInBrowser";
650
651    /**
652     * <b>settings.harvester.webinterface.maxCrawlLogInBrowser</b>: <br> The maximum length (in lines) of crawllog to be displayed in a browser window.
653     */
654    public static String MAX_CRAWLLOG_IN_BROWSER = "settings.harvester.webinterface.maxCrawlLogInBrowser";
655
656}