001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.scheduler.jobgen;
024
025import java.util.Comparator;
026import java.util.Iterator;
027import java.util.Map;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import dk.netarkivet.common.exceptions.ArgumentNotValid;
033import dk.netarkivet.common.utils.Settings;
034import dk.netarkivet.harvester.HarvesterSettings;
035import dk.netarkivet.harvester.datamodel.DomainConfiguration;
036import dk.netarkivet.harvester.datamodel.HarvestDefinition;
037import dk.netarkivet.harvester.datamodel.Job;
038import dk.netarkivet.harvester.datamodel.JobDAO;
039import dk.netarkivet.harvester.datamodel.NumberUtils;
040
041/**
042 * The legacy job generator implementation. Aims at generating jobs that execute in a predictable time by taking
043 * advantage of previous crawls statistics.
044 */
045public class DefaultJobGenerator extends AbstractJobGenerator {
046
047    /** Logger for this class. */
048    private static final Logger log = LoggerFactory.getLogger(DefaultJobGenerator.class);
049
050    /**
051     * Compare two configurations using the following order: 1) Harvest template 2) Byte limit 3) expected number of
052     * object a harvest of the configuration will produce. The comparison will put the largest configuration first (with
053     * respect to 2) and 3))
054     */
055    public static class CompareConfigsDesc implements Comparator<DomainConfiguration> {
056
057        private long objectLimit;
058        private long byteLimit;
059
060        public CompareConfigsDesc(long objectLimit, long byteLimit) {
061            this.objectLimit = objectLimit;
062            this.byteLimit = byteLimit;
063        }
064
065        public int compare(DomainConfiguration cfg1, DomainConfiguration cfg2) {
066            // Compare order xml names
067            int cmp = cfg1.getOrderXmlName().compareTo(cfg2.getOrderXmlName());
068            if (cmp != 0) {
069                return cmp;
070            }
071
072            // Compare byte limits
073            long bytelimit1 = NumberUtils.minInf(cfg1.getMaxBytes(), byteLimit);
074            long bytelimit2 = NumberUtils.minInf(cfg2.getMaxBytes(), byteLimit);
075            cmp = NumberUtils.compareInf(bytelimit2, bytelimit1);
076            if (cmp != 0) {
077                return cmp;
078            }
079
080            // Compare expected sizes
081            long expectedsize1 = cfg1.getExpectedNumberOfObjects(objectLimit, byteLimit);
082            long expectedsize2 = cfg2.getExpectedNumberOfObjects(objectLimit, byteLimit);
083            long res = expectedsize2 - expectedsize1;
084            if (res != 0L) {
085                return res < 0L ? -1 : 1;
086            }
087
088            return 0;
089        }
090    }
091
092    /**
093     * Job limits read from settings during construction.
094     */
095    private final long LIM_MAX_REL_SIZE = Long.parseLong(Settings
096            .get(HarvesterSettings.JOBS_MAX_RELATIVE_SIZE_DIFFERENCE));
097    private final long LIM_MIN_ABS_SIZE = Long.parseLong(Settings
098            .get(HarvesterSettings.JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE));
099    private final long LIM_MAX_TOTAL_SIZE = Long.parseLong(Settings.get(HarvesterSettings.JOBS_MAX_TOTAL_JOBSIZE));
100    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
101    private final boolean EXCLUDE_ZERO_BUDGET = Settings
102            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);
103
104    /** Singleton instance. */
105    private static DefaultJobGenerator instance;
106
107    /**
108     * @return the singleton instance, builds it if necessary.
109     */
110    public static synchronized DefaultJobGenerator getInstance() {
111        if (instance == null) {
112            instance = new DefaultJobGenerator();
113        }
114        return instance;
115    }
116
117    @Override
118    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
119        return new CompareConfigsDesc(harvest.getMaxCountObjects(), harvest.getMaxBytes());
120    }
121
122    /**
123     * Create new jobs from a collection of configurations. All configurations must use the same order.xml file.Jobs
124     *
125     * @param harvest the {@link HarvestDefinition} being processed.
126     * @param domainConfSubset the configurations to use to create the jobs
127     * @return The number of jobs created
128     * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain any configurations
129     */
130    @Override
131    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
132            Iterator<DomainConfiguration> domainConfSubset) {
133        int jobsMade = 0;
134        Job job = null;
135        log.debug("Adding domainconfigs with the same order.xml for harvest #{}", harvest.getOid());
136        JobDAO dao = JobDAO.getInstance();
137        while (domainConfSubset.hasNext()) {
138            DomainConfiguration cfg = domainConfSubset.next();
139            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
140                log.info("Config '{}' for '{}'" + " excluded (0{})", cfg.getName(), cfg.getDomainName(),
141                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
142                continue;
143            }
144            // Do we need to create a new Job or is the current job ok
145            if ((job == null) || (!canAccept(job, cfg))) {
146                if (job != null) {
147                    // If we're done with a job, write it out
148                    ++jobsMade;
149                    dao.create(job);
150                }
151                job = getNewJob(harvest, cfg);
152                log.trace("Created new job for harvest #{} to add configuration {} for domain {}", harvest.getOid(),
153                        cfg.getName(), cfg.getDomainName());
154
155            } else {
156                job.addConfiguration(cfg);
157                log.trace("Added job configuration {} for domain {} to current job for harvest #{}", cfg.getName(),
158                        cfg.getDomainName(), harvest.getOid());
159            }
160        }
161        if (job != null) {
162            ++jobsMade;
163            editJobOrderXml(job);
164            dao.create(job);
165            if (log.isTraceEnabled()) {
166                log.trace("Generated job: '{}'", job.toString());
167                StringBuilder logMsg = new StringBuilder("Job configurationsDomain:");
168                for (Map.Entry<String, String> config : job.getDomainConfigurationMap().entrySet()) {
169                    logMsg.append("\n ").append(config.getKey()).append(":").append(config.getValue());
170                }
171                log.trace(logMsg.toString());
172            }
173            log.debug("Created {} jobs for harvest #{}", jobsMade, harvest.getOid());
174        }
175        return jobsMade;
176    }
177
178    @Override
179    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
180        // By default byte limit is used as base criterion for splitting a
181        // harvest in config chunks, however the configuration can override
182        // this and instead use object limit.
183        boolean splitByObjectLimit = Settings.getBoolean(HarvesterSettings.SPLIT_BY_OBJECTLIMIT);
184        long forceMaxObjectsPerDomain = job.getForceMaxObjectsPerDomain();
185        long forceMaxBytesPerDomain = job.getForceMaxBytesPerDomain();
186        if (splitByObjectLimit) {
187            if (NumberUtils.compareInf(cfg.getMaxObjects(), forceMaxObjectsPerDomain) < 0
188                    || (job.isConfigurationSetsObjectLimit() && NumberUtils.compareInf(cfg.getMaxObjects(),
189                            forceMaxObjectsPerDomain) != 0)) {
190                return false;
191            }
192        } else {
193            if (NumberUtils.compareInf(cfg.getMaxBytes(), forceMaxBytesPerDomain) < 0
194                    || (job.isConfigurationSetsByteLimit() && NumberUtils.compareInf(cfg.getMaxBytes(),
195                            forceMaxBytesPerDomain) != 0)) {
196                return false;
197            }
198        }
199
200        long maxCountObjects = job.getMaxCountObjects();
201        long minCountObjects = job.getMinCountObjects();
202
203        assert (maxCountObjects >= minCountObjects) : "basic invariant";
204
205        // The expected number of objects retrieved by this job from
206        // the configuration based on historical harvest results.
207        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
208
209        // Check if total count is exceeded
210        long totalCountObjects = job.getTotalCountObjects();
211        if ((totalCountObjects > 0) && ((expectation + totalCountObjects) > LIM_MAX_TOTAL_SIZE)) {
212            return false;
213        }
214
215        // total count OK
216        // Check if size within existing limits
217        if ((expectation <= maxCountObjects) && (expectation >= minCountObjects)) {
218            // total count ok and within current max and min
219            return true;
220        }
221
222        // Outside current range we need to check the relative difference
223        long absDiff;
224        long xmaxCountObjects = maxCountObjects;
225        long yminCountObjects = minCountObjects;
226
227        // New max or new min ?
228        if (expectation > maxCountObjects) {
229            xmaxCountObjects = expectation;
230        } else {
231            assert (expectation < minCountObjects) : "New minimum expected";
232            yminCountObjects = expectation;
233        }
234
235        absDiff = (xmaxCountObjects - yminCountObjects);
236
237        if ((absDiff == 0) || (absDiff <= LIM_MIN_ABS_SIZE)) {
238            return true; // difference too small to matter
239        }
240
241        if (yminCountObjects == 0) {
242            yminCountObjects = 1; // make sure division succeeds
243        }
244
245        float relDiff = (float) xmaxCountObjects / (float) yminCountObjects;
246        if (relDiff > LIM_MAX_REL_SIZE) {
247            return false;
248        }
249
250        // all tests passed
251        return true;
252    }
253
254    /** Only to be used by unittests. */
255    public static void reset() {
256        instance = null;
257    }
258
259}