/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.scheduler.jobgen;

import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.HarvestDefinition;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.JobDAO;
import dk.netarkivet.harvester.datamodel.NumberUtils;
import dk.netarkivet.harvester.datamodel.eav.EAV;

/**
 * The legacy job generator implementation. Aims at generating jobs that execute in a predictable time by taking
 * advantage of previous crawls statistics.
 */
public class DefaultJobGenerator extends AbstractJobGenerator {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(DefaultJobGenerator.class);

    /**
     * Absolute maximum number of configurations in a snapshot job, or {@code null} when the setting
     * {@link HarvesterSettings#JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT} is not defined. Values &lt;= 0 are ignored by
     * {@link #checkSpecificAcceptConditions(Job, DomainConfiguration)}. (Re)loaded by every constructor call.
     */
    private static Long CONFIG_COUNT_SNAPSHOT = null;

    /**
     * Creates a generator and (re)reads the snapshot configuration count limit from the settings.
     * A missing setting means "no absolute limit" and clears any value read by an earlier instantiation.
     */
    public DefaultJobGenerator() {
        try {
            CONFIG_COUNT_SNAPSHOT = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
            if (CONFIG_COUNT_SNAPSHOT <= 0) {
                log.info("The parameter {} has the value {} and is therefore ignored during job splitting for "
                        + "snapshot jobs.", HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT, CONFIG_COUNT_SNAPSHOT);
            } else {
                log.info("Snapshot jobs will be split at an absolute maximum of {} configurations ({}).",
                        CONFIG_COUNT_SNAPSHOT, HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
            }
        } catch (UnknownID u) {
            // The field is static: without this reset a limit read by a previous instance (e.g. before
            // reset() in unit tests) would silently stay in force although the setting is now absent.
            CONFIG_COUNT_SNAPSHOT = null;
            log.info("The parameter {} is not set so there is no absolute limit to the number of configurations per "
                    + "snapshot job.", HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
        }
    }

    /**
     * Compare two configurations using the following order: 1) harvest template (order.xml name), 2) EAV
     * attributes, 3) byte limit, 4) expected number of objects a harvest of the configuration will produce. For 3)
     * and 4) the operands are compared in reverse, so the largest configuration sorts first.
     */
    public static class CompareConfigsDesc implements Comparator<DomainConfiguration> {

        /** Harvest-wide object limit, passed on when computing the expected number of objects. */
        private final long objectLimit;
        /** Harvest-wide byte limit, used to cap each configuration's own byte limit. */
        private final long byteLimit;

        /**
         * @param objectLimit the object limit of the harvest the configurations belong to
         * @param byteLimit the byte limit of the harvest the configurations belong to
         */
        public CompareConfigsDesc(long objectLimit, long byteLimit) {
            this.objectLimit = objectLimit;
            this.byteLimit = byteLimit;
        }

        /**
         * @param cfg1 the first configuration
         * @param cfg2 the second configuration
         * @return a negative number, zero or a positive number as cfg1 sorts before, equal to or after cfg2
         */
        @Override
        public int compare(DomainConfiguration cfg1, DomainConfiguration cfg2) {
            // Parameterized so the configurations are only rendered when trace logging is enabled.
            log.trace("Comparing {} {}", cfg1, cfg2);
            // Compare order xml names
            int cmp = cfg1.getOrderXmlName().compareTo(cfg2.getOrderXmlName());
            if (cmp != 0) {
                return cmp;
            }
            log.trace("Comparing EAV attributes now");
            int result = EAV.compare(cfg1.getAttributesAndTypes(), cfg2.getAttributesAndTypes());
            log.trace("Comparison of EAV attributes gave result {}", result);
            if (result != 0) {
                return result;
            }
            // Compare byte limits (arguments swapped: larger limit first)
            long bytelimit1 = NumberUtils.minInf(cfg1.getMaxBytes(), byteLimit);
            long bytelimit2 = NumberUtils.minInf(cfg2.getMaxBytes(), byteLimit);
            cmp = NumberUtils.compareInf(bytelimit2, bytelimit1);
            if (cmp != 0) {
                return cmp;
            }
            // Compare expected sizes (arguments swapped: larger expectation first).
            // Long.compare avoids the overflow a plain subtraction could produce.
            long expectedsize1 = cfg1.getExpectedNumberOfObjects(objectLimit, byteLimit);
            long expectedsize2 = cfg2.getExpectedNumberOfObjects(objectLimit, byteLimit);
            return Long.compare(expectedsize2, expectedsize1);
        }
    }

    /**
     * Job limits read from settings during construction.
     */
    private final long LIM_MAX_REL_SIZE = Long.parseLong(Settings
            .get(HarvesterSettings.JOBS_MAX_RELATIVE_SIZE_DIFFERENCE));
    private final long LIM_MIN_ABS_SIZE = Long.parseLong(Settings
            .get(HarvesterSettings.JOBS_MIN_ABSOLUTE_SIZE_DIFFERENCE));
    private final long LIM_MAX_TOTAL_SIZE = Long.parseLong(Settings.get(HarvesterSettings.JOBS_MAX_TOTAL_JOBSIZE));
    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
    private final boolean EXCLUDE_ZERO_BUDGET = Settings
            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);

    /** Singleton instance. */
    private static DefaultJobGenerator instance;

    /**
     * @return the singleton instance, builds it if necessary.
     */
    public static synchronized DefaultJobGenerator getInstance() {
        if (instance == null) {
            instance = new DefaultJobGenerator();
        }
        return instance;
    }

    /**
     * @param harvest the {@link HarvestDefinition} whose object and byte limits parameterize the comparator
     * @return a {@link CompareConfigsDesc} built from the limits of the given harvest
     */
    @Override
    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
        return new CompareConfigsDesc(harvest.getMaxCountObjects(), harvest.getMaxBytes());
    }

    /**
     * Create new jobs from a collection of configurations. All configurations must use the same order.xml file.
     * Configurations are added to the current job as long as {@code canAccept} allows it; otherwise the current job
     * is written to the {@link JobDAO} and a new job is started.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param domainConfSubset the configurations to use to create the jobs
     * @return The number of jobs created
     * @throws ArgumentNotValid presumably raised by the called helpers on invalid arguments; this method performs
     * no explicit validation itself (TODO confirm)
     */
    @Override
    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
            Iterator<DomainConfiguration> domainConfSubset) {
        int jobsMade = 0;
        Job job = null;
        log.debug("Adding domainconfigs with the same order.xml for harvest #{}", harvest.getOid());
        JobDAO dao = JobDAO.getInstance();
        DomainConfiguration previousDomainConf = null;
        while (domainConfSubset.hasNext()) {
            DomainConfiguration cfg = domainConfSubset.next();
            if (log.isTraceEnabled()) {
                // Guarded: rendering the configuration is wasted work unless trace is on.
                log.trace("Processing {}", DomainConfiguration.cfgToString(cfg));
            }
            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
                log.info("Config '{}' for '{}' excluded (0{})", cfg.getName(), cfg.getDomainName(),
                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
                continue;
            }
            // excluding configs with no active seeds
            if (ignoreConfiguration(cfg)) {
                log.info("Ignoring config '{}' for domain '{}' - no active seeds !", cfg.getName(),
                        cfg.getDomainName());
                continue;
            }

            if ((job == null) || (!canAccept(job, cfg, previousDomainConf))) {
                if (job != null) {
                    // If we're done with a job, write it out
                    // NOTE(review): unlike the final job below, jobs closed here are persisted without a
                    // call to editJobOrderXml(job) - confirm whether that asymmetry is intended.
                    ++jobsMade;
                    dao.create(job);
                }
                job = getNewJob(harvest, cfg);
                log.trace("Created new job for harvest #{} to add configuration {} for domain {}", harvest.getOid(),
                        cfg.getName(), cfg.getDomainName());

            } else {
                job.addConfiguration(cfg);
                log.trace("Added job configuration {} for domain {} to current job for harvest #{}", cfg.getName(),
                        cfg.getDomainName(), harvest.getOid());
            }
            previousDomainConf = cfg;
        }
        if (job != null) {
            // Write out the last (still open) job.
            ++jobsMade;
            editJobOrderXml(job);
            dao.create(job);
            if (log.isTraceEnabled()) {
                log.trace("Generated job: '{}'", job.toString());
                StringBuilder logMsg = new StringBuilder("Job configurationsDomain:");
                for (Map.Entry<String, String> config : job.getDomainConfigurationMap().entrySet()) {
                    logMsg.append("\n ").append(config.getKey()).append(":").append(config.getValue());
                }
                log.trace(logMsg.toString());
            }
            log.debug("Created {} jobs for harvest #{}", jobsMade, harvest.getOid());
        }
        return jobsMade;
    }

    /**
     * Decides whether the given configuration may be added to the given job. The checks are, in order:
     * <ol>
     * <li>a snapshot job that already holds the configured maximum number of configurations rejects;</li>
     * <li>the configuration's object limit (if splitting by object limit is enabled in the settings) or byte
     * limit (otherwise) must be compatible with the limit forced by the job;</li>
     * <li>for a non-empty job, the expected object count added to the job's total must not exceed the maximum
     * total job size;</li>
     * <li>if the expected object count falls outside the job's current min/max range, the widened range must
     * either differ by no more than the minimum absolute size difference, or have a max/min ratio no larger than
     * the maximum relative size difference.</li>
     * </ol>
     *
     * @param job the job under construction
     * @param cfg the configuration that is a candidate for the job
     * @return true if the configuration can be added to the job, false if a new job is required
     */
    @Override
    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
        if (job.isSnapshot()
                && CONFIG_COUNT_SNAPSHOT != null
                && CONFIG_COUNT_SNAPSHOT > 0
                && job.getDomainConfigurationMap().size() >= CONFIG_COUNT_SNAPSHOT) {
            log.debug("Job for HD #{} has now reached the CONFIG_COUNT_SNAPSHOT limit {}",
                    job.getOrigHarvestDefinitionID(), CONFIG_COUNT_SNAPSHOT);
            return false;
        }

        // By default byte limit is used as base criterion for splitting a
        // harvest in config chunks, however the configuration can override
        // this and instead use object limit.
        boolean splitByObjectLimit = Settings.getBoolean(HarvesterSettings.SPLIT_BY_OBJECTLIMIT);
        long forceMaxObjectsPerDomain = job.getForceMaxObjectsPerDomain();
        long forceMaxBytesPerDomain = job.getForceMaxBytesPerDomain();
        if (splitByObjectLimit) {
            if (NumberUtils.compareInf(cfg.getMaxObjects(), forceMaxObjectsPerDomain) < 0
                    || (job.isConfigurationSetsObjectLimit() && NumberUtils.compareInf(cfg.getMaxObjects(),
                            forceMaxObjectsPerDomain) != 0)) {
                log.debug("Job for HD #{} OBJECT_LIMIT of config (domain,config={},{}) incompatible with current job",
                        job.getOrigHarvestDefinitionID(), cfg.getDomainName(), cfg.getName());
                return false;
            }
        } else {
            if (NumberUtils.compareInf(cfg.getMaxBytes(), forceMaxBytesPerDomain) < 0
                    || (job.isConfigurationSetsByteLimit() && NumberUtils.compareInf(cfg.getMaxBytes(),
                            forceMaxBytesPerDomain) != 0)) {
                log.debug("Job for HD #{} BYTE_LIMIT of config (domain,config={},{}) incompatible with current job",
                        job.getOrigHarvestDefinitionID(), cfg.getDomainName(), cfg.getName());
                return false;
            }
        }

        long maxCountObjects = job.getMaxCountObjects();
        long minCountObjects = job.getMinCountObjects();

        assert (maxCountObjects >= minCountObjects) : "basic invariant";

        // The expected number of objects retrieved by this job from
        // the configuration based on historical harvest results.
        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);

        // Check if total count is exceeded
        long totalCountObjects = job.getTotalCountObjects();
        if ((totalCountObjects > 0) && ((expectation + totalCountObjects) > LIM_MAX_TOTAL_SIZE)) {
            log.debug("Job for HD #{} will exceed LIM_MAX_TOTAL_SIZE({}), if config(domain,config={},{}) with expected object count {} is added",
                    job.getOrigHarvestDefinitionID(), LIM_MAX_TOTAL_SIZE, cfg.getDomainName(), cfg.getName(),
                    expectation);
            return false;
        }

        // total count OK
        // Check if size within existing limits
        if ((expectation <= maxCountObjects) && (expectation >= minCountObjects)) {
            // total count ok and within current max and min
            return true;
        }

        // Outside current range we need to check the relative difference
        long absDiff;
        long xmaxCountObjects = maxCountObjects;
        long yminCountObjects = minCountObjects;

        // New max or new min ?
        if (expectation > maxCountObjects) {
            xmaxCountObjects = expectation;
        } else {
            assert (expectation < minCountObjects) : "New minimum expected";
            yminCountObjects = expectation;
        }

        absDiff = (xmaxCountObjects - yminCountObjects);

        if ((absDiff == 0) || (absDiff <= LIM_MIN_ABS_SIZE)) {
            return true; // difference too small to matter
        }

        if (yminCountObjects == 0) {
            yminCountObjects = 1; // make sure division succeeds
        }

        float relDiff = (float) xmaxCountObjects / (float) yminCountObjects;
        if (relDiff > LIM_MAX_REL_SIZE) {
            log.debug("Job for HD #{} will be incompatible with LIM_MAX_REL_SIZE({}), if config(domain,config={},{}) with relDiff {} is added",
                    job.getOrigHarvestDefinitionID(), LIM_MAX_REL_SIZE, cfg.getDomainName(), cfg.getName(), relDiff);
            return false;
        }

        // all tests passed
        return true;
    }

    /** Only to be used by unittests. Drops the singleton so the next {@link #getInstance()} builds a new one. */
    public static void reset() {
        instance = null;
    }

}