Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.scheduler.jobgen;
024
025import java.util.Comparator;
026import java.util.Iterator;
027import java.util.Map;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import dk.netarkivet.common.exceptions.ArgumentNotValid;
033import dk.netarkivet.common.exceptions.UnknownID;
034import dk.netarkivet.common.utils.Settings;
035import dk.netarkivet.harvester.HarvesterSettings;
036import dk.netarkivet.harvester.datamodel.DomainConfiguration;
037import dk.netarkivet.harvester.datamodel.HarvestDefinition;
038import dk.netarkivet.harvester.datamodel.Job;
039import dk.netarkivet.harvester.datamodel.JobDAO;
040import dk.netarkivet.harvester.datamodel.NumberUtils;
041import dk.netarkivet.harvester.datamodel.eav.EAV;
042
043/**
044 * The legacy job generator implementation. Aims at generating jobs that execute in a predictable time by taking
045 * advantage of previous crawls statistics.
046 */
047public class DefaultJobGenerator extends AbstractJobGenerator {
048
049    /** Logger for this class. */
050    private static final Logger log = LoggerFactory.getLogger(DefaultJobGenerator.class);
051
052    private static Long CONFIG_COUNT_SNAPSHOT = null;
053
054    public DefaultJobGenerator() {
055        try {
056            CONFIG_COUNT_SNAPSHOT = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
057            if (CONFIG_COUNT_SNAPSHOT <= 0) {
058                log.info("The parameter {} has the value {} and is therefore ignored during job splitting for "
059                        + "snapshot jobs.", HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT, CONFIG_COUNT_SNAPSHOT);
060            } else {
061                log.info("Snapshot jobs will be split at an absolute maximum of {} configurations ({}).",
062                        CONFIG_COUNT_SNAPSHOT, HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
063            }
064        } catch (UnknownID u) {
065            log.info("The parameter {} is not set so there is no absolute limit to the number of configurations per "
066                    + "snapshot job.", HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
067        }
068    }
069
070    /**
071     * Compare two configurations using the following order: 1) Harvest template 2) Byte limit 3) expected number of
072     * object a harvest of the configuration will produce. The comparison will put the largest configuration first (with
073     * respect to 2) and 3))
074     */
075    public static class CompareConfigsDesc implements Comparator<DomainConfiguration> {
076
077        private long objectLimit;
078        private long byteLimit;
079
080        public CompareConfigsDesc(long objectLimit, long byteLimit) {
081            this.objectLimit = objectLimit;
082            this.byteLimit = byteLimit;
083        }
084
085        public int compare(DomainConfiguration cfg1, DomainConfiguration cfg2) {
086            log.trace("Comparing " + cfg1 + " " + cfg2);
087            // Compare order xml names
088            int cmp = cfg1.getOrderXmlName().compareTo(cfg2.getOrderXmlName());
089            if (cmp != 0) {
090                return cmp;
091            }
092            log.trace("Comparing EAV attributes now");
093            int result = EAV.compare(cfg1.getAttributesAndTypes(), cfg2.getAttributesAndTypes());
094            log.trace("Comparison of EAV attributes gave result " + result);
095            if (result != 0) {
096                return result;
097            }
098            // Compare byte limits
099            long bytelimit1 = NumberUtils.minInf(cfg1.getMaxBytes(), byteLimit);
100            long bytelimit2 = NumberUtils.minInf(cfg2.getMaxBytes(), byteLimit);
101            cmp = NumberUtils.compareInf(bytelimit2, bytelimit1);
102            if (cmp != 0) {
103                return cmp;
104            }
105            // Compare expected sizes
106            long expectedsize1 = cfg1.getExpectedNumberOfObjects(objectLimit, byteLimit);
107            long expectedsize2 = cfg2.getExpectedNumberOfObjects(objectLimit, byteLimit);
108            long res = expectedsize2 - expectedsize1;
109            if (res != 0L) {
110                return res < 0L ? -1 : 1;
111            }
112            return 0;
113        }
114    }
115
116    /**
117     * Job limits read from settings during construction.
118     */
119    private final long LIM_MAX_REL_SIZE = Long.parseLong(Settings
120            .get(HarvesterSettings.JOBS_MAX_RELATIVE_SIZE_DIFFERENCE));
121    private final long LIM_MIN_ABS_SIZE = Long.parseLong(Settings
122            .get(HarvesterSettings.JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE));
123    private final long LIM_MAX_TOTAL_SIZE = Long.parseLong(Settings.get(HarvesterSettings.JOBS_MAX_TOTAL_JOBSIZE));
124    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
125    private final boolean EXCLUDE_ZERO_BUDGET = Settings
126            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);
127
128    /** Singleton instance. */
129    private static DefaultJobGenerator instance;
130
131    /**
132     * @return the singleton instance, builds it if necessary.
133     */
134    public static synchronized DefaultJobGenerator getInstance() {
135        if (instance == null) {
136            instance = new DefaultJobGenerator();
137        }
138        return instance;
139    }
140
141    @Override
142    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
143        return new CompareConfigsDesc(harvest.getMaxCountObjects(), harvest.getMaxBytes());
144    }
145
146    /**
147     * Create new jobs from a collection of configurations. All configurations must use the same order.xml file.Jobs
148     *
149     * @param harvest the {@link HarvestDefinition} being processed.
150     * @param domainConfSubset the configurations to use to create the jobs
151     * @return The number of jobs created
152     * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain any configurations
153     */
154    @Override
155    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
156            Iterator<DomainConfiguration> domainConfSubset) {
157        int jobsMade = 0;
158        Job job = null;
159        log.debug("Adding domainconfigs with the same order.xml for harvest #{}", harvest.getOid());
160        JobDAO dao = JobDAO.getInstance();
161        DomainConfiguration previousDomainConf = null;
162        while (domainConfSubset.hasNext()) {
163            DomainConfiguration cfg = domainConfSubset.next();
164            log.trace("Processing " + DomainConfiguration.cfgToString(cfg));
165            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
166                log.info("Config '{}' for '{}'" + " excluded (0{})", cfg.getName(), cfg.getDomainName(),
167                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
168                continue;
169            }
170            // excluding configs with no active seeds
171            if (ignoreConfiguration(cfg)) {
172                log.info("Ignoring config '{}' for domain '{}' - no active seeds !");
173                continue;
174            }
175            
176            if ((job == null) || (!canAccept(job, cfg, previousDomainConf))) {
177                if (job != null) {
178                    // If we're done with a job, write it out
179                    ++jobsMade;
180                    dao.create(job);
181                }
182                job = getNewJob(harvest, cfg);
183                log.trace("Created new job for harvest #{} to add configuration {} for domain {}", harvest.getOid(),
184                        cfg.getName(), cfg.getDomainName());
185
186            } else {
187                job.addConfiguration(cfg);
188                log.trace("Added job configuration {} for domain {} to current job for harvest #{}", cfg.getName(),
189                        cfg.getDomainName(), harvest.getOid());
190            }
191            previousDomainConf = cfg;
192        }
193        if (job != null) {
194            ++jobsMade;
195            editJobOrderXml(job);
196            dao.create(job);
197            if (log.isTraceEnabled()) {
198                log.trace("Generated job: '{}'", job.toString());
199                StringBuilder logMsg = new StringBuilder("Job configurationsDomain:");
200                for (Map.Entry<String, String> config : job.getDomainConfigurationMap().entrySet()) {
201                    logMsg.append("\n ").append(config.getKey()).append(":").append(config.getValue());
202                }
203                log.trace(logMsg.toString());
204            }
205            log.debug("Created {} jobs for harvest #{}", jobsMade, harvest.getOid());
206        }
207        return jobsMade;
208    }
209
210
211    @Override
212    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
213        if (job.isSnapshot()
214                && CONFIG_COUNT_SNAPSHOT != null
215                && CONFIG_COUNT_SNAPSHOT > 0
216                && job.getDomainConfigurationMap().size() >= CONFIG_COUNT_SNAPSHOT
217                ) {
218            log.debug("Job for HD #{} has now reached the CONFIG_COUNT_SNAPSHOT limit {}", job.getOrigHarvestDefinitionID(), CONFIG_COUNT_SNAPSHOT);
219            return false;
220        }
221
222        // By default byte limit is used as base criterion for splitting a
223        // harvest in config chunks, however the configuration can override
224        // this and instead use object limit.
225        boolean splitByObjectLimit = Settings.getBoolean(HarvesterSettings.SPLIT_BY_OBJECTLIMIT);
226        long forceMaxObjectsPerDomain = job.getForceMaxObjectsPerDomain();
227        long forceMaxBytesPerDomain = job.getForceMaxBytesPerDomain();
228        if (splitByObjectLimit) {
229            if (NumberUtils.compareInf(cfg.getMaxObjects(), forceMaxObjectsPerDomain) < 0
230                    || (job.isConfigurationSetsObjectLimit() && NumberUtils.compareInf(cfg.getMaxObjects(),
231                            forceMaxObjectsPerDomain) != 0)) {
232                log.debug("Job for HD #{} OBJECT_LIMIT of config (domain,config={},{}) incompatible with current job", 
233                        job.getOrigHarvestDefinitionID(), cfg.getDomainName(), cfg.getName());
234                return false;
235            }
236        } else {
237            if (NumberUtils.compareInf(cfg.getMaxBytes(), forceMaxBytesPerDomain) < 0
238                    || (job.isConfigurationSetsByteLimit() && NumberUtils.compareInf(cfg.getMaxBytes(),
239                            forceMaxBytesPerDomain) != 0)) {
240                log.debug("Job for HD #{} BYTE_LIMIT of config (domain,config={},{}) incompatible with current job", 
241                        job.getOrigHarvestDefinitionID(), cfg.getDomainName(), cfg.getName());
242                return false;
243            }
244        }
245
246        long maxCountObjects = job.getMaxCountObjects();
247        long minCountObjects = job.getMinCountObjects();
248
249        assert (maxCountObjects >= minCountObjects) : "basic invariant";
250
251        // The expected number of objects retrieved by this job from
252        // the configuration based on historical harvest results.
253        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
254
255        // Check if total count is exceeded
256        long totalCountObjects = job.getTotalCountObjects();
257        if ((totalCountObjects > 0) && ((expectation + totalCountObjects) > LIM_MAX_TOTAL_SIZE)) {
258            log.debug("Job for HD #{} will exceed LIM_MAX_TOTAL_SIZE({}), if config(domain,config={},{}) with expected object count {} is added", 
259                    job.getOrigHarvestDefinitionID(), LIM_MAX_TOTAL_SIZE, cfg.getDomainName(), cfg.getName(), expectation);
260            return false;
261        }
262
263        // total count OK
264        // Check if size within existing limits
265        if ((expectation <= maxCountObjects) && (expectation >= minCountObjects)) {
266            // total count ok and within current max and min
267            return true;
268        }
269
270        // Outside current range we need to check the relative difference
271        long absDiff;
272        long xmaxCountObjects = maxCountObjects;
273        long yminCountObjects = minCountObjects;
274
275        // New max or new min ?
276        if (expectation > maxCountObjects) {
277            xmaxCountObjects = expectation;
278        } else {
279            assert (expectation < minCountObjects) : "New minimum expected";
280            yminCountObjects = expectation;
281        }
282
283        absDiff = (xmaxCountObjects - yminCountObjects);
284
285        if ((absDiff == 0) || (absDiff <= LIM_MIN_ABS_SIZE)) {
286            return true; // difference too small to matter
287        }
288
289        if (yminCountObjects == 0) {
290            yminCountObjects = 1; // make sure division succeeds
291        }
292
293        float relDiff = (float) xmaxCountObjects / (float) yminCountObjects;
294        if (relDiff > LIM_MAX_REL_SIZE) {
295            log.debug("Job for HD #{} will be incompatible with LIM_MAX_REL_SIZE({}), if config(domain,config={},{}) with relDiff {} is added", 
296                    job.getOrigHarvestDefinitionID(), LIM_MAX_REL_SIZE, cfg.getDomainName(), cfg.getName(), relDiff);
297            return false;
298        }
299
300        // all tests passed
301        return true;
302    }
303
304    /** Only to be used by unittests. */
305    public static void reset() {
306        instance = null;
307    }
308
309}