Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester;
024
025import java.util.regex.Pattern;
026
027import dk.netarkivet.common.utils.Settings;
028import dk.netarkivet.harvester.harvesting.distribute.HarvesterReadyMessage;
029import dk.netarkivet.harvester.harvesting.report.HarvestReport;
030
031/** Settings specific to the harvester module of NetarchiveSuite. */
032public class HarvesterSettings {
033
034    /** The default place in classpath where the settings file can be found. */
035    private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/settings.xml";
036
037    /*
038     * The static initialiser is called when the class is loaded. It will add default values for all settings defined in
039     * this class, by loading them from a settings.xml file in classpath.
040     */
041    static {
042        Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH);
043    }
044
045    // NOTE: The constants defining setting names below are left non-final on
046    // purpose! Otherwise, the static initialiser that loads default values
047    // will not run.
048
049    /**
050     * <b>settings.harvester.datamodel.domain.defaultSeedlist</b>: <br>
051     * Default name of the seedlist to use when new domains are created.
052     */
053    public static String DEFAULT_SEEDLIST = "settings.harvester.datamodel.domain.defaultSeedlist";
054    
055    /**
056     * <b>settings.harvester.datamodel.domain.validSeedRegex</b>: <br>
057     * Regular expression used to validate a seed within a seedlist.
058     * <p>
059     * Default value accepts all non-empty strings.
060     */
061    public static String VALID_SEED_REGEX = "settings.harvester.datamodel.domain.validSeedRegex";
062
063    /**
064     * <b>settings.harvester.datamodel.domain.defaultConfig</b>: <br>
065     * The name of a configuration that is created by default and which is initially used for snapshot harvests.
066     */
067    public static String DOMAIN_DEFAULT_CONFIG = "settings.harvester.datamodel.domain.defaultConfig";
068
069    /**
070     * <b>settings.harvester.datamodel.domain.defaultOrderxml</b>: <br>
071     * Name of order xml template used for domains if nothing else is specified. The newly created configurations use
072     * this. This template must exist before harvesting can commence
073     */
074    public static String DOMAIN_DEFAULT_ORDERXML = "settings.harvester.datamodel.domain.defaultOrderxml";
075
076    /**
077     * <b>settings.harvester.datamodel.domain.defaultMaxrate</b>: <br>
078     * Default download rate for domain configuration. Not currently enforced.
079     */
080    public static String DOMAIN_CONFIG_MAXRATE = "settings.harvester.datamodel.domain.defaultMaxrate";
081
082    /**
083     * <b>settings.harvester.datamodel.domain.defaultMaxbytes</b>: <br>
084     * Default byte limit for domain configuration.
085     */
086    public static String DOMAIN_CONFIG_MAXBYTES = "settings.harvester.datamodel.domain.defaultMaxbytes";
087
088    /**
089     * <b>settings.harvester.datamodel.domain.defaultMaxobjects</b>: <br>
090     * Default object limit for domain configuration.
091     */
092    public static String DOMAIN_CONFIG_MAXOBJECTS = "settings.harvester.datamodel.domain.defaultMaxobjects";
093    
094    /**
095     * <b>settings.harvester.datamodel.domain.defaultSchedule</b>: <br>
096     * Default schedule for selective harvesting. No schedule is set by default.
097     */
098    public static String DOMAIN_CONFIG_SCHEDULE = "settings.harvester.datamodel.domain.defaultSchedule";
099    
100    /**
101     * <b>settings.harvester.scheduler.jobGen.config.errorFactorPrevResult</b>: <br>
102     * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines
103     * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the
104     * expectation on a previous completed job.
105     */
106    public static String ERRORFACTOR_PERMITTED_PREVRESULT = "settings.harvester.scheduler.jobGen.config.errorFactorPrevResult";
107
108    /**
109     * <b>settings.harvester.scheduler.jobGen.config.errorFactorBestGuess</b>: <br>
110     * Used when calculating expected size of a harvest of some configuration during job-creation process. This defines
111     * how great a possible factor we will permit a harvest to be larger than the expectation, when basing the
112     * expectation on previous uncompleted harvests or no harvest data at all.
113     */
114    public static String ERRORFACTOR_PERMITTED_BESTGUESS = "settings.harvester.scheduler.jobGen.config.errorFactorBestGuess";
115
116    /**
117     * <b>settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject</b>: <br>
118     * How many bytes the average object is expected to be on domains where we don't know any better. This number should
119     * grow over time, as of end of 2005 empirical data shows 38000.
120     */
121    public static String EXPECTED_AVERAGE_BYTES_PER_OBJECT = "settings.harvester.scheduler.jobGen.config.expectedAverageBytesPerObject";
122
123    /**
124     * <b>settings.harvester.scheduler.jobGen.config.maxDomainSize</b>: <br>
125     * The initial guess of the domain size (number of objects) of an unknown domain.
126     */
127    public static String MAX_DOMAIN_SIZE = "settings.harvester.scheduler.jobGen.config.maxDomainSize";
128
129    /**
130     * <b>settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference</b>: <br>
131     * The maximum allowed relative difference in expected number of objects retrieved in a single job definition. To
132     * avoid job splitting, set the value as Long.MAX_VALUE.
133     */
134    public static String JOBS_MAX_RELATIVE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.maxRelativeSizeDifference";
135
136    /**
137     * <b>settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference</b>: <br>
138     * Size differences for jobs below this threshold are ignored, regardless of the limits for the relative size
139     * difference. To avoid job splitting, set the value as Long.MAX_VALUE.
140     */
141    public static String JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE = "settings.harvester.scheduler.jobGen.config.minAbsoluteSizeDifference";
142
143    /**
144     * <b>settings.harvester.scheduler.jobGen.config.maxTotalSize</b>: <br>
145     * When this limit is exceeded no more configurations may be added to a job. To avoid job splitting, set the value
146     * as Long.MAX_VALUE.
147     */
148    public static String JOBS_MAX_TOTAL_JOBSIZE = "settings.harvester.scheduler.jobGen.config.maxTotalSize";
149
150    /**
151     * <b>settings.harvester.scheduler.jobGen.maxTimeToCompleteJob</b>: <br>
152     * The limit on how many seconds Heritrix should continue on each job. 0 means no limit.
153     */
154    public static String JOBS_MAX_TIME_TO_COMPLETE = "settings.harvester.scheduler.jobGen.maxTimeToCompleteJob";
155
156    /**
157     * <b>settings.harvester.scheduler.jobGen.domainConfigSubsetSize</b>: <br>
158     * How many domain configurations we will process in one go before making jobs out of them. This amount of domains
159     * will be stored in memory at the same time. To avoid job splitting, set this value as Long.MAX_VALUE.
160     */
161    public static String JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE = "settings.harvester.scheduler.jobGen.domainConfigSubsetSize";
162
163    /**
164     * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused</b>: <br>
165     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the
166     * maximum number of domain configurations in a partial harvest job.
167     */
168    public static String JOBGEN_FIXED_CONFIG_COUNT_FOCUSED = "settings.harvester.scheduler.jobGen.config.fixedDomainCountFocused";
169
170    /**
171     * <b>settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot</b>: <br>
172     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter represents the
173     * maximum number of domain configurations in a full harvest job.
174     */
175    public static String JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT = "settings.harvester.scheduler.jobGen.config.fixedDomainCountSnapshot";
176
177    /**
178     * <b>settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget</b>: <br>
179     * If the job generator is {@link FixedDomainConfigurationCountJobGenerator}, then this parameter toggles whether or
180     * not domain configurations with a budget of zero (byte or objects) should be excluded from jobs. The default value
181     * is 'false'.
182     */
183    public static String JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET = "settings.harvester.scheduler.jobGen.config.excludeDomainsWithZeroBudget";
184
185    /**
186     * <b>settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel</b>: <br>
187     * If this property is true, then the job generator will postpone job generation for harvest definitions that are
188     * mapped to a harvest channel not registered to at least one harvester. The default value is 'true'.
189     */
190    public static String JOBGEN_POSTPONE_UNREGISTERED_HARVEST_CHANNEL = "settings.harvester.scheduler.jobGen.config.postponeUnregisteredChannel";
191
192    /**
193     * <b>settings.harvester.scheduler.jobGen.class</b>: <br>
194     * The fully qualified class name of the chosen job generator implementation, currently either
195     * {@link DefaultJobGenerator} or {@link FixedDomainConfigurationCountJobGenerator}. The default is
196     * {@link DefaultJobGenerator}.
197     */
198    public static String JOBGEN_CLASS = "settings.harvester.scheduler.jobGen.class";
199
200    /**
201     * <b>settings.harvester.scheduler.jobGen.config.splitByObjectLimit</b>: <br>
202     * By default the byte limit is used as the base criterion for how many domain configurations are put into one
203     * harvest job. However if this parameter is set to "true", then the object limit is used instead as the base
204     * criterion.
205     */
206    public static String SPLIT_BY_OBJECTLIMIT = "settings.harvester.scheduler.jobGen.config.splitByObjectLimit";
207
208    /**
209     * <b>settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer</b>: <br>
210     * Controls whether the domain configuration object limit should be set in Heritrix's crawl order through the
211     * QuotaEnforcer configuration (parameter set to true) or through the frontier parameter 'queue-total-budget' (
212     * parameter set to false).
213     * <p>
214     * Default value is true, as legacy implementation was to use only the QuotaEnforcer.
215     */
216    public static String OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER = "settings.harvester.scheduler.jobGen.objectLimitIsSetByQuotaEnforcer";
217
218    /**
219     * <b>settings.harvester.scheduler.jobGen.useAlternateSnapshotJobgenerationMethod</b>: <br>
220     * If the value is true, we use an alternate method for job generation of a snapshot harvest continuing a previous harvest.
221     * Default value is false.
222     */
223    public static String USE_ALTERNATE_SNAPSHOT_JOBGENERATION_METHOD = "settings.harvester.scheduler.jobGen.useAlternateSnapshotJobgenerationMethod";
224    
225    /**
226     * <b>settings.harvester.scheduler.jobtimeouttime</b>:<br />
227     * Time in seconds before a STARTED job times out and changes status to FAILED.
228     */
229    public static String JOB_TIMEOUT_TIME = "settings.harvester.scheduler.jobtimeouttime";
230
231    /**
232     * <b>settings.harvester.scheduler.jobgenerationperiod</b>: <br>
233     * The period between checking if new jobs should be generated, in seconds. This is one minute because that's the
234     * finest we can define in a harvest definition.
235     */
236    public static String GENERATE_JOBS_PERIOD = "settings.harvester.scheduler.jobgenerationperiod";
237
238    /**
239     * <b>settings.harvester.harvesting.serverDir</b>: <br>
240     * Each job gets a subdir of this dir. Job data is written and Heritrix writes to that subdir. 
241     */
242    public static String HARVEST_CONTROLLER_SERVERDIR = "settings.harvester.harvesting.serverDir";
243
244    /**
245     * <b>settings.harvester.harvesting.minSpaceLeft</b>: <br>
246     * The minimum amount of free bytes in the serverDir required before accepting any harvest-jobs.
247     */
248    public static String HARVEST_SERVERDIR_MINSPACE = "settings.harvester.harvesting.minSpaceLeft";
249
250    /**
251     * <b>settings.harvester.harvesting.oldjobsDir</b>: <br>
252     * The directory in which data from old jobs is kept after uploading. Each directory from serverDir will be moved to
253     * here if any data remains, either due to failed uploads or because no upload was attempted.
254     */
255    public static String HARVEST_CONTROLLER_OLDJOBSDIR = "settings.harvester.harvesting.oldjobsDir";
256
257    /**
258     * <b>settings.harvester.harvesting.channel</b>: <br>
259     * Harvest channel to take jobs from. This is the default channel assigned to the harvest controller.
260     *
261     * @see dk.netarkivet.harvester.datamodel.HarvestChannel <p>
262     * NOTE: this one is also used in SingleMBeanObject parsing information to System state
263     */
264    public static String HARVEST_CONTROLLER_CHANNEL = "settings.harvester.harvesting.channel";
265
266    /**
267     * <b>settings.harvester.harvesting.heritrix.inactivityTimeout</b>: <br>
268     * The timeout setting for aborting a crawl based on crawler-inactivity. If the crawler is inactive for this amount
269     * of seconds the crawl will be aborted. The inactivity is measured on the crawlController.activeToeCount().
270     */
271    public static String INACTIVITY_TIMEOUT_IN_SECS = "settings.harvester.harvesting.heritrix.inactivityTimeout";
272
273    /**
274     * <b>settings.harvester.harvesting.heritrix.noresponseTimeout</b>: <br>
275     * The timeout value (in seconds) used in HeritrixLauncher for aborting crawl when no bytes are being received from
276     * web servers.
277     */
278    public static String CRAWLER_TIMEOUT_NON_RESPONDING = "settings.harvester.harvesting.heritrix.noresponseTimeout";
279    /**
280     * <b>settings.harvester.monitor.refreshInterval</b>:<br>
281     * Time interval in seconds after which the harvest monitor pages will be automatically refreshed.
282     */
283    public static String HARVEST_MONITOR_REFRESH_INTERVAL = "settings.harvester.monitor.refreshInterval";
284
285    /**
286     * <b>settings.harvester.monitor.historySampleRate</b>:<br>
287     * Time interval in seconds between historical records stored in the DB. Default value is 5 minutes.
288     */
289    public static String HARVEST_MONITOR_HISTORY_SAMPLE_RATE = "settings.harvester.monitor.historySampleRate";
290
291    /**
292     * <b>settings.harvester.monitor.historyChartGenInterval</b>:<br>
293     * Time interval in seconds between regenerating the chart of historical data for a running job. Default value is 5
294     * minutes.
295     */
296    public static String HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL = "settings.harvester.monitor.historyChartGenInterval";
297
298    /**
299     * <b>settings.harvester.monitor.displayedHistorySize</b>:<br>
300     * Maximum number of most recent history records displayed on the running job details page.
301     */
302    public static String HARVEST_MONITOR_DISPLAYED_HISTORY_SIZE = "settings.harvester.monitor.displayedHistorySize";
303    
304    /**
305     * <b>settings.harvester.monitor.displayedFrontierQueuesSize</b>:<br>
306     * Maximum number of frontier queues displayed on the running job details page.
307     */
308    public static String HARVEST_MONITOR_DISPLAYED_FRONTIER_QUEUE_SIZE = "settings.harvester.monitor.displayedFrontierQueuesSize";
309
310    /**
311     * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br>
312     * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds.
313     * 
314     * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 
315     * settings.harvester.harvesting.controller.  
316     */
317    public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime";
318    
319    /**
320     * <b>settings.harvester.harvesting.sendReadyInterval</b>:<br>
321     * Time interval in seconds to wait before transmitting a {@link HarvesterReadyMessage} to the {@link JobDispatcher}
322     * .
323     * <p>
324     * <p>
325     * Lower values will make the JobDispatcher detect ready harvester faster, but will make it more likely that the
326     * harvester may send two ready messages before a job is received, causing the JobDispatcher to dispatch two jobs.
327     * <p>
328     * Default value is 30 seconds.
329     */
330    public static String SEND_READY_INTERVAL = "settings.harvester.harvesting.sendReadyInterval";
331
332    /**
333     * <b>settings.harvester.harvesting.sendReadyDelay</b>:<br>
334     * Time in milliseconds to wait from starting to listen on the job queue until a potential ready message is sent to
335     * the HarvestJobManager. This small delay is used to retrieve any leftover jobs on the queue before sending the
336     * ready message to the harvester. Default value is 1000 milliseconds.
337     */
338    public static String SEND_READY_DELAY = "settings.harvester.harvesting.sendReadyDelay";
339
340    /** 
341     * Support for limiting the number of submitted messages in each harvestchannel using a javax.jms.QueueBrowser.
342     * Default value is: false
343     */
344        public static String SCHEDULER_LIMIT_SUBMITTED_JOBS_IN_QUEUE = "settings.harvester.scheduler.limitSubmittedJobsInQueue";
345        
346        /** 
347     * The limit for submitted messages in each harvestchannel. Not enabled if SCHEDULER_LIMIT_SUBMITTED_JOBS_IN_QUEUE is false 
348     * Default value is: 1
349     */
350        public static String SCHEDULER_SUBMITTED_JOBS_IN_QUEUE_LIMIT = "settings.harvester.scheduler.submittedJobsInQueueLimit";
351        
352    /**
353     * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br>
354     * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600
355     * seconds (10 min).
356     */
357    public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime";
358
359    /**
360     * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report.
361     * the default class: {@link TopTotalEnqueuesFilter}
362     */
363    public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class";
364
365    /**
366     * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments
367     * should be separated by semicolons.
368     */
369    public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args";
370
371    /**
372     * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br>
373     * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is
374     * lost. If set to false it will only log a warning, leaving the crawl operator to shut down the harvester manually.
375     * Default value is true.
376     *
377     * @see BnfHeritrixController
378     */
379    public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost";
380
381    /**
382     * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br>
383     * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over.
384     */
385    public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout";
386
387    /**
388     * <b>settings.harvester.harvesting.heritrix</b>: <br>
389     * The path to the Heritrix SETTINGS.
390     */
391    public static String HERITRIX = "settings.harvester.harvesting.heritrix";
392
393    /**
394     * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br>
395     * The name used to access the Heritrix GUI.
396     */
397    public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName";
398
399    /**
400     * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br>
401     * The password used to access the Heritrix GUI.
402     */
403    public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword";
404
405    /**
406     * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br>
407     * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine.
408     * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage.
409     */
410    public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort";
411    
412    /**
413     * <b>settings.harvester.harvesting.heritrix.jmxPort</b>: <br>
414     * The port that Heritrix 1.14.4 uses to expose its JMX interface. This port must not be used by anything else on the
415     * machine, but does not need to be accessible from other machines unless you want to be able to use jconsole to
416     * access Heritrix directly. Note that apart from pausing a job, modifications done directly on Heritrix may cause
417     * unexpected breakage. Irrelevant for Heritrix 3+
418     */
419    public static String HERITRIX_JMX_PORT = "settings.harvester.harvesting.heritrix.jmxPort";
420
421    /**
422     * <b>settings.harvester.harvesting.heritrix.jmxUsername</b>: <br>
423     * The username used to connect to the Heritrix 1.14.4 JMX interface. The username must correspond to the value stored in the
424     * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile).
425     * Irrelevant for Heritrix 3+
426     */
427    public static String HERITRIX_JMX_USERNAME = "settings.harvester.harvesting.heritrix.jmxUsername";
428
429    /**
430     * <b>settings.harvester.harvesting.heritrix.jmxPassword</b>: <br>
431     * The password used to connect to the Heritrix JMX interface. The password must correspond to the value stored in the
432     * jmxremote.password file (name defined in setting settings.common.jmx.passwordFile).
433     * Irrelevant for Heritrix 3+
434     */
435    public static String HERITRIX_JMX_PASSWORD = "settings.harvester.harvesting.heritrix.jmxPassword";
436
437    /**
438     * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br>
439     * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in
440     * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc.
441     */
442    public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize";
443
444    /**
445     * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br>
446     * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option.
447     */
448    public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts";
449
450    /**
451     * <b>settings.harvester.harvesting.heritrixController.class</b>:<br/>
452     * The implementation of the HeritrixController interface to be used.
453     */
454    public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class";
455
456    /**
457     * <b>settings.harvester.harvesting.heritrixLauncher.class</b>:<br/>
458     * The implementation of the HeritrixLauncher abstract class to be used.
459     */
460    public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class";
461
462    /**
463     * <b>settings.harvester.harvesting.harvestReport.class</b>:<br/>
464     * The implementation of {@link HarvestReport} interface to be used.
465     */
466    public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class";
467
468    /**
469     * <b>settings.harvester.harvesting.harvestReport.disregardSeedURLInfo</b>:<br/>
470     * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL
471     * instead of the seed url domain? The default is false;
472     */
473    public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo";
474
475    /**
476     * <b>settings.harvester.harvesting.deduplication.enabled</b>:<br/>
477     * This setting tells the system whether or not to use deduplication. This setting is true by default.
478     */
479    public static String DEDUPLICATION_ENABLED = "settings.harvester.harvesting.deduplication.enabled";
480
481    /**
482     * <b>settings.harvester.harvesting.metadata.heritrixFilePattern</b> This setting allows to filter which Heritrix
483     * files should be stored in the metadata (W)ARC file.
484     *
485     * @see Pattern
486     */
487    public static String METADATA_HERITRIX_FILE_PATTERN = "settings.harvester.harvesting.metadata.heritrixFilePattern";
488
489    /**
490     * <b>settings.harvester.harvesting.metadata.reportFilePattern</b> This setting allows to filter which Heritrix
491     * files that should be stored in the metadata (W)ARC file are to be classified as a report.
492     *
493     * @see Pattern
494     */
495    public static String METADATA_REPORT_FILE_PATTERN = "settings.harvester.harvesting.metadata.reportFilePattern";
496
497    /**
498     * <b>settings.harvester.harvesting.metadata.logFilePattern</b> This setting allows to filter which Heritrix log
499     * files should be stored in the metadata (W)ARC file.
500     *
501     * @see Pattern
502     */
503    public static String METADATA_LOG_FILE_PATTERN = "settings.harvester.harvesting.metadata.logFilePattern";
504
505    /**
506     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.generate</b> This setting is a boolean flag that
507     * enables/disables the generation of an ARC/WARC files report. Default value is 'true'.
508     *
509     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
510     */
511    public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate";
512
513    /**
514     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileName</b> If
515     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file.
516     * Default value is 'archivefiles-report.txt'.
517     *
518     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
519     */
520    public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName";
521
522    /**
523     * <b>settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader</b> If
524     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file.
525     * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Closed] [Size]'.
526     *
527     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
528     */
529    public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader";
530
531    /**
532     * The version number which goes in metadata file names like 12345-metadata-&lt;version number&gt;.warc.gz
533     */
534    public static String METADATA_FILE_VERSION_NUMBER = "settings.harvester.harvesting.metadata.filename.versionnumber";
535
536    /**
537     * <b>settings.harvester.aliases.timeout</b> The amount of time in seconds before an alias times out, and needs to
538     * be re-evaluated. The default value is one year, i.e 31536000 seconds.
539     */
540    public static String ALIAS_TIMEOUT = "settings.harvester.aliases.timeout";
541
542    /**
543     * <b>settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled</b>:</br> Setting for whether or not a
544     * restarted job should try fetching the recoverlog of the previous failed job, and ask Heritrix to continue from
545     * this log. The default is false.
546     */
547    public static String RECOVERlOG_CONTINUATION_ENABLED = "settings.harvester.harvesting.continuationFromHeritrixRecoverlogEnabled";
548
549    /**
550     * <b>settings.harvester.harvesting.metadata.metadataFormat</b> The dataformat used by Netarchivesuite to write the
551     * metadata associated with a given harvest job. default: arc (alternative: warc)
552     */
553    public static String METADATA_FORMAT = "settings.harvester.harvesting.metadata.metadataFormat";
554
555    /**
556     * <b>settings.harvester.harvesting.metadata.metadataFileNameFormat</b> The format of the name of the metadata file :
557     * By default, it will be jobID-metadata-1.extension, for example 3161-metadata-1.warc.
558     * If the value is "prefix", it will be named like a warc file : Prefix-61-3161-metadata-1.warc
559     * default value : default (alternative: prefix) 
560     */
561    public static String METADATA_FILENAME_FORMAT = "settings.harvester.harvesting.metadata.metadataFileNameFormat";
562
563    /**
564     * <b>settings.harvester.harvesting.metadata.compression</b> Do we compress the
565     * metadata associated with a given harvest job. 
566     * default: false 
567     */
568    public static String METADATA_COMPRESSION = "settings.harvester.harvesting.metadata.compression";
569    
570    /**
571     * <b>settings.harvester.harvesting.heritrix.archiveNaming.collectionName</b>
572     * prefix for archive file
573     * if METADATA_FILENAME_FORMAT is "prefix", then check of a collection name to prefix metadata filename
574     */
575     public static String HERITRIX_PREFIX_COLLECTION_NAME = "settings.harvester.harvesting.heritrix.archiveNaming.collectionName";
576
577    /**
578     * <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the
579     * harvested data. default: warc (alternative: arc)
580     */
581    public static String HERITRIX_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix.archiveFormat";
582    /**
583     * <b>settings.harvester.harvesting.heritrix.archiveNaming.class</b> The class implementing the chosen way of naming
584     * your archive files. default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix"
585     * property of the org.archive.crawler.writer.ARCWriterProcessor and/or
586     * org.archive.crawler.writer.WARCWriterProcessor.
587     */
588    public static String HERITRIX_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix.archiveNaming.class";
589
590    /**
591     * <b>settings.harvester.harvesting.heritrix.warc.warcParametersOverride</b> This parameter defines NAS behaviour
592     * regarding warc parameters (write request, write metadata, etc.): if this parameter is true, the warc parameters
593     * defined in harvester templates are not considered. The default is true.
594     */
595    public static String HERITRIX_WARC_PARAMETERS_OVERRIDE = "settings.harvester.harvesting.heritrix.warc.warcParametersOverride";
596
597    /**
598     * <b>settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests</b> Represents the
599     * {@code skip-identical-digests} setting in the Heritrix WARCWriterProcessor. The default is false.
600     */
601    public static String HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.skipIdenticalDigests";
602    /**
603     * <b>settings.harvester.harvesting.heritrix.warc.writeRequests</b> Represents the {@code write-requests} setting in
604     * the Heritrix WARCWriterProcessor. The default is true.
605     */
606    public static String HERITRIX_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix.warc.writeRequests";
607    /**
608     * <b>settings.harvester.harvesting.heritrix.warc.writeMetadata</b> Represents the {@code write-metadata} setting in
609     * the Heritrix WARCWriterProcessor. The default is true.
610     */
611    public static String HERITRIX_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix.warc.writeMetadata";
612    /**
613     * <b>settings.harvester.harvesting.heritrix.warc.writeMetadataOutlinks</b> Represents the
614     * {@code write-metadata-outlinks} setting in the Heritrix WARCWriterProcessor. The default is false.
615     */
616    public static String HERITRIX_WARC_WRITE_METADATA_OUTLINKS = "settings.harvester.harvesting.heritrix.warc.writeMetadataOutlinks";
617    /**
618     * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests</b> Represents the
619     * {@code write-revisit-for-identical-digests} setting in the Heritrix WARCWriterProcessor. The default is true.
620     */
621    public static String HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix.warc.writeRevisitForIdenticalDigests";
622    /**
623     * <b>settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified</b> Represents the
624     * {@code write-revisit-for-not-modified} setting in the Heritrix WARCWriterProcessor. The default is true.
625     */
626    public static String HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED = "settings.harvester.harvesting.heritrix.warc.writeRevisitForNotModified";
627
628    /**
629     * <b>settings.harvester.harvesting.heritrix.warc.startNewFilesOnCheckpoint</b> Represents the
630     * {@code startNewFilesOnCheckpoint} setting in the Heritrix WARCWriterProcessor. Only available with H3. The default is true.
631     */
632    public static String HERITRIX_WARC_START_NEW_FILES_ON_CHECKPOINT 
633        = "settings.harvester.harvesting.heritrix.warc.startNewFilesOnCheckpoint";
634    
635    /**
636     * Currently UNUSED.
637     * <b>settings.harvester.harvesting.heritrix.version</b> Represents the version of Heritrix used by Netarchivesuite.
638     * The default is h3. The optional value is h1.
639     *
640     * <p>
641     * If h1 is chosen, we assume that our templates are h1, as well.
642     * If h3 is chosen, we assume that our templates are h3, as well.
643     * There is no attempt at migration from one to the other. This must be done by a command-line tool.
644     */
645    public static String HERITRIX_VERSION = "settings.harvester.harvesting.heritrix.version";
646    
647    
648    /**
649     * <b>settings.harvester.performer</b>: <br>
650     * The agent performing these harvests. The default is the empty string ("").
651     */
652    public static String PERFORMER = "settings.harvester.performer";
653
654    /***************************/
655    /* Indexserver - settings. */
656    /***************************/
657
658    /**
659     * <b>settings.harvester.indexserver.requestdir</b>: <br>
660     * Setting for the location where the requests of the index server are stored.
661     */
662    public static String INDEXSERVER_INDEXING_REQUESTDIR = "settings.harvester.indexserver.requestdir";
663
664    /**
665     * <b>settings.harvester.indexserver.maxclients</b>: <br>
666     * Setting for the maximum number of clients the index server can handle simultaneously.
667     */
668    public static String INDEXSERVER_INDEXING_MAXCLIENTS = "settings.harvester.indexserver.maxclients";
669
670    /**
671     * <b>settings.harvester.indexserver.maxthreads</b>: <br>
672     * Setting for the maximum number of threads the deduplication indexer shall use.
673     */
674    public static String INDEXSERVER_INDEXING_MAXTHREADS = "settings.harvester.indexserver.maxthreads";
675    /**
676     * <b>settings.harvester.indexserver.checkinterval</b>: <br>
677     * Setting for the time in milliseconds between each check of the state of sub-indexing. The default is 30000
678     * (30 seconds).
679     */
680    public static String INDEXSERVER_INDEXING_CHECKINTERVAL = "settings.harvester.indexserver.checkinterval";
681
682    /**
683     * <b>settings.harvester.indexserver.indexingtimeout</b>: <br>
684     * Setting for the indexing timeout in milliseconds. The default is 259200000 (3 days, i.e. 3 * 24 * 3600 * 1000).
685     */
686    public static String INDEXSERVER_INDEXING_TIMEOUT = "settings.harvester.indexserver.indexingtimeout";
687
688    /**
689     * <b>settings.harvester.indexserver.maxsegments</b>: <br>
690     * Setting for how many segments we will accept in our Lucene indices. The default is 15.
691     */
692    public static String INDEXSERVER_INDEXING_MAX_SEGMENTS = "settings.harvester.indexserver.maxsegments";
693
694    /**
695     * <b>settings.harvester.indexserver.listeningcheckinterval</b>: <br>
696     * Setting for the interval between each listening check in milliseconds. NOTE(review): the documented default "30000 (5 minutes)" is inconsistent (30000 ms is 30 seconds) - confirm the actual default.
697     */
698    public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval";
699    /**
700     * <b>settings.harvester.indexserver.satisfactorythresholdpercentage</b>: <br>
701     * Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent.
702     */
703    public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage";
704
705    /**
706     * <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br>
707     * Setting for which type of index request server to use. The default is:
708     * {@link dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer}
709     */
710    public static String INDEXREQUEST_SERVER_CLASS = "settings.harvester.indexserver.indexrequestserver.class";
711
712    /**
713     * <b>settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas</b>: <br>
714     * Setting for whether or not data not found in the default bitarchive replica shall be looked for in other
715     * bitarchive replicas. The default is false.
716     */
717    public static String INDEXSERVER_INDEXING_LOOKFORDATAINOTHERBITARCHIVEREPLICAS = "settings.harvester.indexserver.lookfordataInAllBitarchiveReplicas";
718
719    /***************************/
720    /* Viewerproxy - settings. */
721    /***************************/
722
723    /**
724     * <b>settings.harvester.viewerproxy.baseDir</b>: <br>
725     * The main directory for the ViewerProxy, used for storing the Lucene index for the jobs being viewed. This
726     * directory can be used by multiple ViewerProxy applications running on the same machine.
727     */
728    public static String VIEWERPROXY_DIR = "settings.harvester.viewerproxy.baseDir";
729
730    /**
731     * <b>settings.harvester.viewerproxy.tryLookupUriAsFtp</b>: <br>
732     * If we fail to look up a URI, we will try changing the protocol to ftp, if this setting is set to true. The
733     * default is false.
734     */
735    public static String TRY_LOOKUP_URI_AS_FTP = "settings.harvester.viewerproxy.tryLookupUriAsFtp";
736
737    /**
738     * <b>settings.harvester.viewerproxy.maxSizeInBrowser</b> The size (in bytes) of the largest object to be returned
739     * for viewing in the browser window. Larger objects will be returned with the appropriate http header for saving
740     * them to a file.
741     */
742    public static String MAXIMUM_OBJECT_IN_BROWSER = "settings.harvester.viewerproxy.maxSizeInBrowser";
743
744    /**
745     * <b>settings.harvester.viewerproxy.allowFileDownloads</b> If set to false, there will be no links to
746     * allow download of warc files via the Viewerproxy GUI.
747     */
748    public static String ALLOW_FILE_DOWNLOADS = "settings.harvester.viewerproxy.allowFileDownloads";
749
750    /**
751     * <b>settings.harvester.webinterface.maxCrawlLogInBrowser</b>: The maximum length (in lines) of the
752     * crawllog to be displayed in a browser window.
753     * default value: 1000
754     */
755    public static String MAX_CRAWLLOG_IN_BROWSER = "settings.harvester.webinterface.maxCrawlLogInBrowser";
756
757    /**
758     * <b>settings.harvester.webinterface.runningjobsFilteringMethod</b>: The filtering method used on the running jobs page.
759     * There are two available methods: searching in the cached crawllogs (cachedLogs) or in the harvest database (database).
760     * default: database
761     */
762    public static String RUNNINGJOBS_FILTERING_METHOD = "settings.harvester.webinterface.runningjobsFilteringMethod";
763
764   /**
765     * <b>settings.harvester.harvesting.heritrix3</b>: <br>
766     * The path to the Heritrix3 settings.
767     */
768    public static String HERITRIX3 = "settings.harvester.harvesting.heritrix3";
769
770    /* Heritrix3 ArcWriter settings. */
771    /** <b>settings.harvester.harvesting.heritrix3.arc.compression</b>: presumably whether arc files written by Heritrix3 are compressed - TODO confirm. */
772    public static String HERITRIX3_ARC_COMPRESSION = "settings.harvester.harvesting.heritrix3.arc.compression";
773    /** <b>settings.harvester.harvesting.heritrix3.arc.suffix</b>: presumably a suffix used in the names of arc files written by Heritrix3 - TODO confirm. */
774    public static String HERITRIX3_ARC_SUFFIX = "settings.harvester.harvesting.heritrix3.arc.suffix";
775    /** <b>settings.harvester.harvesting.heritrix3.arc.maxFileSizeBytes</b>: presumably the maximum size in bytes of an arc file - TODO confirm. */
776    public static String HERITRIX3_ARC_MAXSIZE = "settings.harvester.harvesting.heritrix3.arc.maxFileSizeBytes";
777    /** <b>settings.harvester.harvesting.heritrix3.arc.poolMaxActive</b>: presumably the maximum number of active writers in the arc writer pool - TODO confirm. */
778    public static String HERITRIX3_ARC_POOL_MAXACTIVE = "settings.harvester.harvesting.heritrix3.arc.poolMaxActive";
779    /** <b>settings.harvester.harvesting.heritrix3.arc.skipIdenticalDigests</b>: presumably the arc counterpart of {@link #HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS} - TODO confirm. */
780    public static String HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix3.arc.skipIdenticalDigests";
781    
782    /**
783     * <b>settings.harvester.harvesting.heritrix3.warc.template</b>: <br>
784     * The template for warc files created by Heritrix. <br>
785     * Default value in NAS: {@code ${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}} <br>
786     * Default value in H3: {@code ${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}}
787     */
788    public static String HERITRIX3_WARC_TEMPLATE = "settings.harvester.harvesting.heritrix3.warc.template";
789    /** <b>settings.harvester.harvesting.heritrix3.warc.compression</b>: presumably whether warc files written by Heritrix3 are compressed - TODO confirm. */
790    public static String HERITRIX3_WARC_COMPRESSION = "settings.harvester.harvesting.heritrix3.warc.compression";
791    /** <b>settings.harvester.harvesting.heritrix3.warc.poolMaxActive</b>: presumably the maximum number of active writers in the warc writer pool - TODO confirm. */
792    public static String HERITRIX3_WARC_POOL_MAXACTIVE = "settings.harvester.harvesting.heritrix3.warc.poolMaxActive";
793    /** <b>settings.harvester.harvesting.heritrix3.warc.maxFileSizeBytes</b>: presumably the maximum size in bytes of a warc file - TODO confirm. */
794    public static String HERITRIX3_WARC_MAXSIZE = "settings.harvester.harvesting.heritrix3.warc.maxFileSizeBytes";
795    /** <b>settings.harvester.harvesting.heritrix3.warc.writeRequests</b>: presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_REQUESTS} - TODO confirm. */
796    public static String HERITRIX3_WARC_WRITE_REQUESTS = "settings.harvester.harvesting.heritrix3.warc.writeRequests";
797    /** <b>settings.harvester.harvesting.heritrix3.warc.writeMetadata</b>: presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_METADATA} - TODO confirm. */
798    public static String HERITRIX3_WARC_WRITE_METADATA = "settings.harvester.harvesting.heritrix3.warc.writeMetadata";
799    /** <b>settings.harvester.harvesting.heritrix3.warc.writeMetadataOutlinks</b>: presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_WRITE_METADATA_OUTLINKS} - TODO confirm. */
800    public static String HERITRIX3_WARC_WRITE_METADATA_OUTLINKS = "settings.harvester.harvesting.heritrix3.warc.writeMetadataOutlinks";
801    /** <b>settings.harvester.harvesting.heritrix3.warc.skipIdenticalDigests</b>: presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS} - TODO confirm. */
802    public static String HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix3.warc.skipIdenticalDigests";
803    /** <b>settings.harvester.harvesting.heritrix3.warc.startNewFilesOnCheckpoint</b>: presumably the Heritrix3 counterpart of {@link #HERITRIX_WARC_START_NEW_FILES_ON_CHECKPOINT} - TODO confirm. */
804    public static String HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT = "settings.harvester.harvesting.heritrix3.warc.startNewFilesOnCheckpoint";
805
806    /**
807     * <b>settings.harvester.harvesting.heritrix3.archiveFormat</b> The data format used by Heritrix to write the
808     * harvested data. default: warc (alternative: arc)
809     */
810    public static String HERITRIX3_ARCHIVE_FORMAT = "settings.harvester.harvesting.heritrix3.archiveFormat";
811    /**
812     * <b>settings.harvester.harvesting.heritrix3.archiveNaming.class</b> The class implementing the chosen way of naming
813     * your archive files. default: LegacyNamingConvention. This class decides what to put into the Heritrix "prefix"
814     * property of the org.archive.crawler.writer.ARCWriterProcessor and/or
815     * org.archive.crawler.writer.WARCWriterProcessor.
816     */
817    public static String HERITRIX3_ARCHIVE_NAMING_CLASS = "settings.harvester.harvesting.heritrix3.archiveNaming.class";
818 
819     /**
820     * <b>settings.harvester.harvesting.heritrix3.warc.warcParametersOverride</b> This parameter defines NAS behaviour
821     * regarding warc parameters (write request, write metadata, etc.): if this parameter is true, the warc parameters
822     * defined in harvester templates are not considered. The default is true.
823     */
824    public static String HERITRIX3_WARC_PARAMETERS_OVERRIDE = "settings.harvester.harvesting.heritrix3.warc.warcParametersOverride";
825   
826    /**
827     * <b>settings.harvester.harvesting.heritrix3.bundle</b> Points to the Heritrix3 zipfile bundled with
828     * the NetarchiveSuite classes. Currently no default value.
829     */     
830    public static String HERITRIX3_BUNDLE = "settings.harvester.harvesting.heritrix3.bundle";
831
832    /**
833     * <b>settings.harvester.harvesting.heritrix3.certificate</b> Points to the jks keystore to use for connection to the
834     * Heritrix3 rest api. If undefined, the keystore provided with the heritrix3 bundler is used.
835     */
836    public static String HERITRIX3_CERTIFICATE = "settings.harvester.harvesting.heritrix3.certificate";
837    /**
838     * <b>settings.harvester.harvesting.heritrix3.certificatePassword</b> Points to the password to use for connection
839     * to the Heritrix3 rest api.
840     */
841    public static String HERITRIX3_CERTIFICATE_PASSWORD = "settings.harvester.harvesting.heritrix3.certificatePassword";
842
843    /**
844     * <b>settings.harvester.harvesting.monitor.tempPath</b>: The directory used to cache the H3 crawllogs.
845     * default value: cached_crawllogs
846     */
847    public static String HERITRIX3_MONITOR_TEMP_PATH = "settings.harvester.harvesting.monitor.tempPath";
848
849}