001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3;
024
025import dk.netarkivet.common.utils.Settings;
026
027/**
028 * Settings specific to the heritrix3 harvester module of NetarchiveSuite.
029 */
030public class Heritrix3Settings {
031
032        /** The default place in classpath where the settings file can be found. */
033    private static final String DEFAULT_SETTINGS_CLASSPATH = "dk/netarkivet/harvester/heritrix3/settings.xml";
034
035    /*
036     * The static initialiser is called when the class is loaded. It will add default values for all settings defined in
037     * this class, by loading them from a settings.xml file in classpath.
038     */
039    static {
040        Settings.addDefaultClasspathSettings(DEFAULT_SETTINGS_CLASSPATH);
041    }
042
043    // NOTE: The constants defining setting names below are left non-final on
044    // purpose! Otherwise, the static initialiser that loads default values
045    // will not run.
046
047//    /**
048//     * <b>settings.harvester.harvesting.heritrix.crawlLoopWaitTime</b>:<br>
049//     * Time interval in seconds to wait during a crawl loop in the harvest controller. Default value is 20 seconds.
050//     * 
051//     * TODO Maybe move this from the heritrix settings (settings.harvester.harvesting.heritrix) to 
052//     * settings.harvester.harvesting.controller.  
053//     * 
054//     */
055    public static String CRAWL_LOOP_WAIT_TIME = "settings.harvester.harvesting.heritrix.crawlLoopWaitTime";
056
057    //
058//    /**
059//     * <b>settings.harvester.harvesting.frontier.frontierReportWaitTime</b>:<br>
060//     * Time interval in seconds to wait between two requests to generate a full frontier report. Default value is 600
061//     * seconds (10 min).
062//     */
063    public static String FRONTIER_REPORT_WAIT_TIME = "settings.harvester.harvesting.frontier.frontierReportWaitTime";
064//
065//    /**
066//     * <b>settings.harvester.harvesting.frontier.filter.class</b> Defines a filter to apply to the full frontier report.
067//     * the default class: {@link TopTotalEnqueuesFilter}
068//     */
069//    public static String FRONTIER_REPORT_FILTER_CLASS = "settings.harvester.harvesting.frontier.filter.class";
070//
071//    /**
072//     * <b>settings.harvester.harvesting.frontier.filter.args</b> Defines a frontier report filter's arguments. Arguments
073//     * should be separated by semicolons.
074//     */
075//    public static String FRONTIER_REPORT_FILTER_ARGS = "settings.harvester.harvesting.frontier.filter.args";
076//
077//    /**
078//     * <b>settings.harvester.harvesting.heritrix.abortIfConnectionLost</b>:<br>
079//     * Boolean flag. If set to true, the harvest controller will abort the current crawl when the JMX connection is
080//     * lost. If set to true it will only log a warning, leaving the crawl operator shutting down harvester manually.
081//     * Default value is true.
082//     *
083//     * @see BnfHeritrixController
084//     */
085//    public static String ABORT_IF_CONNECTION_LOST = "settings.harvester.harvesting.heritrix.abortIfConnectionLost";
086//
087//    /**
088//     * <b>settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout</b>:<br>
089//     * Maximum time in seconds to wait for Heritrix to generate report files once crawling is over.
090//     */
091//    public static String WAIT_FOR_REPORT_GENERATION_TIMEOUT = "settings.harvester.harvesting.heritrix.waitForReportGenerationTimeout";
092//
093//    /**
094//     * <b>settings.harvester.harvesting.heritrix.adminName</b>: <br>
095//     * The name used to access the Heritrix GUI.
096//     */
097   public static String HERITRIX_ADMIN_NAME = "settings.harvester.harvesting.heritrix.adminName";
098//
099//    /**
100//     * <b>settings.harvester.harvesting.heritrix.adminPassword</b>: <br>
101//     * The password used to access the Heritrix GUI.
102//     */
103    public static String HERITRIX_ADMIN_PASSWORD = "settings.harvester.harvesting.heritrix.adminPassword";
104//
105//    /**
106//     * <b>settings.harvester.harvesting.heritrix.guiPort</b>: <br>
107//     * Port used to access the Heritrix web user interface. This port must not be used by anything else on the machine.
108//     * Note that apart from pausing a job, modifications done directly on Heritrix may cause unexpected breakage.
109//     */
110   public static String HERITRIX_GUI_PORT = "settings.harvester.harvesting.heritrix.guiPort";
111
112//    /**
113//     * <b>settings.harvester.harvesting.heritrix.heapSize</b>: <br>
114//     * The heap size to use for the Heritrix sub-process. This should probably be fairly large. It can be specified in
115//     * the same way as for the -Xmx argument to Java, e.g. 512M, 2G etc.
116//     */
117    public static String HERITRIX_HEAP_SIZE = "settings.harvester.harvesting.heritrix.heapSize";
118//
119//    /**
120//     * <b>settings.harvester.harvesting.heritrix.javaOpts</b>: <br>
121//     * Additional JVM options for the Heritrix sub-process. By default there is no additional JVM option.
122//     */
123    public static String HERITRIX_JVM_OPTS = "settings.harvester.harvesting.heritrix.javaOpts";
124//
125//    /**
126//     * <b>settings.harvester.harvesting.heritrixControllerClass</b>:<br/>
127//     * The implementation of the HeritrixController interface to be used.
128//     */
129    public static String HERITRIX_CONTROLLER_CLASS = "settings.harvester.harvesting.heritrixController.class";
130//
131//    /**
132//     * <b>settings.harvester.harvesting.heritrixLauncherClass</b>:<br/>
133//     * The implementation of the HeritrixLauncher abstract class to be used.
134//     */
135    public static String HERITRIX_LAUNCHER_CLASS = "settings.harvester.harvesting.heritrixLauncher.class";
136//
137//    /**
138//     * <b>settings.harvester.harvesting.harvestReport</b>:<br/>
139//     * The implementation of {@link HarvestReport} interface to be used.
140//     */
141    public static String HARVEST_REPORT_CLASS = "settings.harvester.harvesting.harvestReport.class";
142//
143//    /**
144//     * <b>settings.harvester.harvesting.harvestReport.disregardSeedsURLInfo</b>:<br/>
145//     * Should we disregard seedURL-information and thus assign the harvested bytes to the domain of the harvested URL
146//     * instead of the seed url domain? The default is false;
147//     */
148    public static String DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG = "settings.harvester.harvesting.harvestReport.disregardSeedURLInfo";
149
150    /**
151     * <b>settings.harvester.harvesting.metadata.generateArchiveFilesReport</b> This setting is a boolean flag that
152     * enables/disables the generation of an ARC/WARC files report. Default value is 'true'.
153     *
154     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
155     */
156    public static String METADATA_GENERATE_ARCHIVE_FILES_REPORT = "settings.harvester.harvesting.metadata.archiveFilesReport.generate";
157
158    /**
159     * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If
160     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the name of the generated report file.
161     * Default value is 'archivefiles-report.txt'.
162     *
163     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
164     */
165    public static String METADATA_ARCHIVE_FILES_REPORT_NAME = "settings.harvester.harvesting.metadata.archiveFilesReport.fileName";
166
167    /**
168     * <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If
169     * {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file.
170     * This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Closed] [Size]'.
171     *
172     * @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
173     */
174    public static String METADATA_ARCHIVE_FILES_REPORT_HEADER = "settings.harvester.harvesting.metadata.archiveFilesReport.fileHeader";
175}