/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.scheduler.jobgen;

import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.HarvestDefinition;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.JobDAO;
import dk.netarkivet.harvester.datamodel.NumberUtils;

/**
 * The legacy job generator implementation. Aims at generating jobs that execute in a predictable time by taking
 * advantage of previous crawls statistics.
 */
public class DefaultJobGenerator extends AbstractJobGenerator {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(DefaultJobGenerator.class);

    /**
     * Compare two configurations using the following order: 1) Harvest template 2) Byte limit 3) expected number of
     * objects a harvest of the configuration will produce. The comparison will put the largest configuration first
     * (with respect to 2) and 3)).
     */
    public static class CompareConfigsDesc implements Comparator<DomainConfiguration> {

        /** Object limit handed to the expected-size estimate of each configuration. */
        private final long objectLimit;
        /** Byte limit used to cap each configuration's own byte limit and for the expected-size estimate. */
        private final long byteLimit;

        /**
         * @param objectLimit the object limit to apply when comparing configurations
         * @param byteLimit the byte limit to apply when comparing configurations
         */
        public CompareConfigsDesc(long objectLimit, long byteLimit) {
            this.objectLimit = objectLimit;
            this.byteLimit = byteLimit;
        }

        /**
         * @param cfg1 the first configuration
         * @param cfg2 the second configuration
         * @return a negative value, zero or a positive value as cfg1 sorts before, equal to or after cfg2
         */
        @Override
        public int compare(DomainConfiguration cfg1, DomainConfiguration cfg2) {
            // 1) Order xml names, ascending.
            int cmp = cfg1.getOrderXmlName().compareTo(cfg2.getOrderXmlName());
            if (cmp != 0) {
                return cmp;
            }

            // 2) Effective byte limits, descending (note the swapped arguments).
            long bytelimit1 = NumberUtils.minInf(cfg1.getMaxBytes(), byteLimit);
            long bytelimit2 = NumberUtils.minInf(cfg2.getMaxBytes(), byteLimit);
            cmp = NumberUtils.compareInf(bytelimit2, bytelimit1);
            if (cmp != 0) {
                return cmp;
            }

            // 3) Expected sizes, descending. Compared directly rather than by
            // subtraction so that the result cannot be corrupted by overflow.
            long expectedsize1 = cfg1.getExpectedNumberOfObjects(objectLimit, byteLimit);
            long expectedsize2 = cfg2.getExpectedNumberOfObjects(objectLimit, byteLimit);
            if (expectedsize2 < expectedsize1) {
                return -1;
            }
            if (expectedsize2 > expectedsize1) {
                return 1;
            }
            return 0;
        }
    }

    /**
     * Job limit read from settings during construction: the largest allowed ratio between the biggest and the
     * smallest expected object count of the configurations in one job.
     */
    private final long LIM_MAX_REL_SIZE = Long.parseLong(Settings
            .get(HarvesterSettings.JOBS_MAX_RELATIVE_SIZE_DIFFERENCE));
    /**
     * Job limit read from settings during construction: absolute differences in expected object count up to this
     * value are always accepted, whatever the ratio.
     */
    private final long LIM_MIN_ABS_SIZE = Long.parseLong(Settings
            .get(HarvesterSettings.JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE));
    /**
     * Job limit read from settings during construction: upper bound on the summed expected object count of a
     * non-empty job.
     */
    private final long LIM_MAX_TOTAL_SIZE = Long.parseLong(Settings.get(HarvesterSettings.JOBS_MAX_TOTAL_JOBSIZE));
    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
    private final boolean EXCLUDE_ZERO_BUDGET = Settings
            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);

    /** Singleton instance. */
    private static DefaultJobGenerator instance;

    /**
     * @return the singleton instance, builds it if necessary.
     */
    public static synchronized DefaultJobGenerator getInstance() {
        if (instance == null) {
            instance = new DefaultJobGenerator();
        }
        return instance;
    }

    @Override
    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
        return new CompareConfigsDesc(harvest.getMaxCountObjects(), harvest.getMaxBytes());
    }

    /**
     * Create new jobs from a collection of configurations. All configurations must use the same order.xml file.
     * Configurations are added to the current job for as long as it can accept them; when it cannot, the job is
     * stored and a new one is started. An empty collection creates no jobs.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param domainConfSubset the configurations to use to create the jobs
     * @return The number of jobs created
     * @throws ArgumentNotValid if any of the parameters is null
     */
    @Override
    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
            Iterator<DomainConfiguration> domainConfSubset) {
        ArgumentNotValid.checkNotNull(harvest, "HarvestDefinition harvest");
        ArgumentNotValid.checkNotNull(domainConfSubset, "Iterator<DomainConfiguration> domainConfSubset");

        int jobsMade = 0;
        Job job = null;
        log.debug("Adding domainconfigs with the same order.xml for harvest #{}", harvest.getOid());
        JobDAO dao = JobDAO.getInstance();
        while (domainConfSubset.hasNext()) {
            DomainConfiguration cfg = domainConfSubset.next();
            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
                log.info("Config '{}' for '{}'" + " excluded (0{})", cfg.getName(), cfg.getDomainName(),
                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
                continue;
            }

            // The current job is still ok: just add the configuration to it.
            if ((job != null) && canAccept(job, cfg)) {
                job.addConfiguration(cfg);
                log.trace("Added job configuration {} for domain {} to current job for harvest #{}", cfg.getName(),
                        cfg.getDomainName(), harvest.getOid());
                continue;
            }

            // We need a new job. If we're done with a previous one, write it out first.
            if (job != null) {
                ++jobsMade;
                storeJob(dao, job);
            }
            job = getNewJob(harvest, cfg);
            log.trace("Created new job for harvest #{} to add configuration {} for domain {}", harvest.getOid(),
                    cfg.getName(), cfg.getDomainName());
        }
        if (job != null) {
            ++jobsMade;
            storeJob(dao, job);
            log.debug("Created {} jobs for harvest #{}", jobsMade, harvest.getOid());
        }
        return jobsMade;
    }

    /**
     * Finish a job and write it out. Every job goes through here, so the order.xml editing is applied to all
     * generated jobs and not only to the last one of a subset.
     *
     * @param dao the DAO used to persist the job
     * @param job the completed job
     */
    private void storeJob(JobDAO dao, Job job) {
        editJobOrderXml(job);
        dao.create(job);
        if (log.isTraceEnabled()) {
            log.trace("Generated job: '{}'", job.toString());
            StringBuilder logMsg = new StringBuilder("Job configurationsDomain:");
            for (Map.Entry<String, String> config : job.getDomainConfigurationMap().entrySet()) {
                logMsg.append("\n   ").append(config.getKey()).append(":").append(config.getValue());
            }
            log.trace(logMsg.toString());
        }
    }

    /**
     * Decide whether a configuration fits into a job, based on the job's per-domain limits and on how the
     * configuration's expected object count compares with those already in the job.
     *
     * @param job the job under construction
     * @param cfg the configuration that is a candidate for the job
     * @return true if the configuration may be added to the job, false otherwise
     */
    @Override
    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
        // By default byte limit is used as base criterion for splitting a
        // harvest in config chunks, however the configuration can override
        // this and instead use object limit.
        boolean splitByObjectLimit = Settings.getBoolean(HarvesterSettings.SPLIT_BY_OBJECTLIMIT);
        long forceMaxObjectsPerDomain = job.getForceMaxObjectsPerDomain();
        long forceMaxBytesPerDomain = job.getForceMaxBytesPerDomain();
        if (splitByObjectLimit) {
            if (NumberUtils.compareInf(cfg.getMaxObjects(), forceMaxObjectsPerDomain) < 0
                    || (job.isConfigurationSetsObjectLimit() && NumberUtils.compareInf(cfg.getMaxObjects(),
                            forceMaxObjectsPerDomain) != 0)) {
                return false;
            }
        } else {
            if (NumberUtils.compareInf(cfg.getMaxBytes(), forceMaxBytesPerDomain) < 0
                    || (job.isConfigurationSetsByteLimit() && NumberUtils.compareInf(cfg.getMaxBytes(),
                            forceMaxBytesPerDomain) != 0)) {
                return false;
            }
        }

        long maxCountObjects = job.getMaxCountObjects();
        long minCountObjects = job.getMinCountObjects();

        assert (maxCountObjects >= minCountObjects) : "basic invariant";

        // The expected number of objects retrieved by this job from
        // the configuration based on historical harvest results.
        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);

        // Check if total count is exceeded
        long totalCountObjects = job.getTotalCountObjects();
        if ((totalCountObjects > 0) && ((expectation + totalCountObjects) > LIM_MAX_TOTAL_SIZE)) {
            return false;
        }

        // total count OK
        // Check if size within existing limits
        if ((expectation <= maxCountObjects) && (expectation >= minCountObjects)) {
            // total count ok and within current max and min
            return true;
        }

        // Outside current range we need to check the relative difference
        long xmaxCountObjects = maxCountObjects;
        long yminCountObjects = minCountObjects;

        // New max or new min ?
        if (expectation > maxCountObjects) {
            xmaxCountObjects = expectation;
        } else {
            assert (expectation < minCountObjects) : "New minimum expected";
            yminCountObjects = expectation;
        }

        long absDiff = xmaxCountObjects - yminCountObjects;

        if ((absDiff == 0) || (absDiff <= LIM_MIN_ABS_SIZE)) {
            return true; // difference too small to matter
        }

        if (yminCountObjects == 0) {
            yminCountObjects = 1; // make sure division succeeds
        }

        // Computed in double: a float has only 24 bits of mantissa, so large
        // object counts would be rounded and a ratio slightly above the limit
        // could slip through.
        double relDiff = (double) xmaxCountObjects / (double) yminCountObjects;
        if (relDiff > LIM_MAX_REL_SIZE) {
            return false;
        }

        // all tests passed
        return true;
    }

    /** Only to be used by unittests. Synchronized to match {@link #getInstance()}. */
    public static synchronized void reset() {
        instance = null;
    }

}