/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.scheduler.jobgen;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.HarvestDefinition;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.JobDAO;

/**
 * Job generator implementation. Generates jobs with a fixed number of domain configurations. Configuration allows to
 * choose a different count for partial and full harvests. The last job generated may have fewer configurations in it,
 * as job generation happens on a per-harvest basis.
 *
 * @see HarvesterSettings#JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT
 * @see HarvesterSettings#JOBGEN_FIXED_CONFIG_COUNT_FOCUSED
 */
public class FixedDomainConfigurationCountJobGenerator extends AbstractJobGenerator {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(FixedDomainConfigurationCountJobGenerator.class);

    /**
     * A compound key used to split domain configurations in jobs. Two configurations share a key (and hence may share
     * a job) when they have the same crawl order template name and the same budgets; every configuration with a zero
     * budget (bytes or objects) is normalized to the budget pair (0, 0).
     */
    private static final class DomainConfigurationKey {

        /** The name of the Heritrix crawl order template. */
        private final String orderXmlName;
        /** The crawl budget in URI. */
        private final long maxObjects;
        /** The crawl budget in bytes. */
        private final long maxBytes;

        /**
         * Constructor from a domain configuration.
         *
         * @param cfg the related {@link DomainConfiguration}
         */
        DomainConfigurationKey(DomainConfiguration cfg) {
            this.orderXmlName = cfg.getOrderXmlName();
            long cMaxBytes = cfg.getMaxBytes();
            long cMaxObjects = cfg.getMaxObjects();
            if (cMaxBytes == 0 || cMaxObjects == 0) {
                // All domain configurations with a zero budget (either size or URI count)
                // end up in the same group
                this.maxBytes = 0;
                this.maxObjects = 0;
            } else {
                this.maxBytes = cMaxBytes;
                this.maxObjects = cMaxObjects;
            }
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + (int) (maxBytes ^ (maxBytes >>> 32));
            result = prime * result + (int) (maxObjects ^ (maxObjects >>> 32));
            result = prime * result + ((orderXmlName == null) ? 0 : orderXmlName.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || !DomainConfigurationKey.class.equals(obj.getClass())) {
                return false;
            }
            DomainConfigurationKey dc = (DomainConfigurationKey) obj;
            // Null-safe comparison of the template name, so that equals() tolerates the same
            // null value that hashCode() already accepts.
            return Objects.equals(orderXmlName, dc.orderXmlName) && maxBytes == dc.maxBytes
                    && maxObjects == dc.maxObjects;
        }

        @Override
        public String toString() {
            return orderXmlName + ":" + maxObjects + ":" + maxBytes;
        }
    }

    /**
     * Simple marker class to improve code readability.
     * <p>
     * Maps jobs currently being filled, for a given harvest definition, with domain configurations by
     * {@link DomainConfigurationKey} (harvest template name and budgets). These jobs keep getting new configurations
     * until no more configurations are left to process or the configured size has been reached.
     */
    @SuppressWarnings("serial")
    private static class HarvestJobGenerationState extends HashMap<DomainConfigurationKey, Job> {
    }

    /**
     * Compare two configurations in alphabetical order of their name.
     */
    private static class ConfigNamesComparator implements Comparator<DomainConfiguration> {

        @Override
        public int compare(DomainConfiguration dc1, DomainConfiguration dc2) {
            return dc1.getName().compareTo(dc2.getName());
        }

    }

    /** Constant : how many {@link DomainConfiguration}s we want in a focused harvest job. */
    private static long CONFIG_COUNT_FOCUSED = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_FOCUSED);

    /** Constant : how many {@link DomainConfiguration}s we want in a snapshot harvest job. */
    private static long CONFIG_COUNT_SNAPSHOT = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);

    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
    private static boolean EXCLUDE_ZERO_BUDGET = Settings
            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);

    /** The singleton instance. */
    public static FixedDomainConfigurationCountJobGenerator instance;

    /**
     * Job generation state per harvest definition, keyed by harvest ID. An entry exists only while
     * {@link #generateJobs(HarvestDefinition)} is running for that harvest.
     */
    private final Map<Long, HarvestJobGenerationState> state;

    /** The job DAO instance (singleton). */
    private JobDAO dao = JobDAO.getInstance();

    /** Private constructor, use {@link #getInstance()}. */
    private FixedDomainConfigurationCountJobGenerator() {
        this.state = new HashMap<Long, HarvestJobGenerationState>();
    }

    /**
     * @return the singleton instance, builds it if necessary.
     */
    public static synchronized FixedDomainConfigurationCountJobGenerator getInstance() {
        if (instance == null) {
            instance = new FixedDomainConfigurationCountJobGenerator();
        }
        return instance;
    }

    @Override
    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
        return new ConfigNamesComparator();
    }

    /**
     * Accepts a configuration as long as the job holds fewer configurations than the limit configured for its harvest
     * type (snapshot or focused).
     */
    @Override
    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
        long limit = job.isSnapshot() ? CONFIG_COUNT_SNAPSHOT : CONFIG_COUNT_FOCUSED;
        return job.getDomainConfigurationMap().size() < limit;
    }

    /**
     * Generates the jobs for the given harvest. Jobs that reach their configuration limit are stored while the
     * configurations are processed; jobs still under construction once processing is over are stored afterwards. The
     * generation state of the harvest is always dropped on exit.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @return the number of jobs stored.
     */
    @Override
    public int generateJobs(HarvestDefinition harvest) {
        HarvestJobGenerationState jobsUnderConstruction = getStateForHarvest(harvest);

        try {
            int jobsComplete = super.generateJobs(harvest);

            // Jobs that have not reached their limit are nevertheless complete,
            // as we have finished processing the harvest
            for (Job job : jobsUnderConstruction.values()) {
                finishAndStoreJob(job);
                ++jobsComplete;
            }

            return jobsComplete;
        } finally {
            dropStateForHarvest(harvest);
        }
    }

    /**
     * Dispatches the given configurations into the jobs under construction for the harvest. A job that cannot accept
     * a further configuration is stored and replaced by a new job started with that configuration.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param domainConfSubset the configurations to dispatch.
     * @return the number of jobs stored during this call.
     * @throws NoSuchElementException if no generation state exists for the harvest.
     */
    @Override
    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
            Iterator<DomainConfiguration> domainConfSubset) {
        HarvestJobGenerationState jobsUnderConstruction = getExistingStateForHarvest(harvest);
        int jobsComplete = 0;
        while (domainConfSubset.hasNext()) {
            DomainConfiguration cfg = domainConfSubset.next();

            // Should we exclude a configuration with a budget of zero?
            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
                log.info("[JobGen] Config '{}' for '{}' excluded (0{})", cfg.getName(), cfg.getDomainName(),
                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
                continue;
            }

            Job match = jobsUnderConstruction.get(new DomainConfigurationKey(cfg));
            if (match != null && canAccept(match, cfg, null)) {
                match.addConfiguration(cfg);
                continue;
            }

            if (match != null) {
                // The matching job cannot take this configuration: it is ready
                finishAndStoreJob(match);
                ++jobsComplete;
            }

            // Start construction of a new job (replaces any previous job registered under the same key)
            initNewJob(harvest, cfg);
        }
        return jobsComplete;
    }

    /**
     * Post-processes a job that is ready and stores it in DB.
     *
     * @param job the {@link Job} to store.
     */
    private void finishAndStoreJob(Job job) {
        editJobOrderXml(job);
        dao.create(job);
    }

    /**
     * Initializes a new job and registers it in the generation state of the harvest, under the key of the given
     * configuration.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param cfg the first {@link DomainConfiguration} for this job.
     * @return the {@link Job} instance
     */
    private Job initNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
        HarvestJobGenerationState jobsUnderConstruction = getExistingStateForHarvest(harvest);
        Job job = getNewJob(harvest, cfg);
        jobsUnderConstruction.put(new DomainConfigurationKey(cfg), job);
        return job;
    }

    /**
     * Looks up the generation state of a harvest.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param failIfNotExists if true, a missing state is an error; if false, a missing state is created and
     *            registered.
     * @return the generation state of the harvest.
     * @throws NoSuchElementException if the state is missing and failIfNotExists is true.
     */
    private synchronized HarvestJobGenerationState getStateForHarvest(final HarvestDefinition harvest,
            final boolean failIfNotExists) {

        long harvestId = harvest.getOid();
        HarvestJobGenerationState harvestState = this.state.get(harvestId);
        if (harvestState == null) {
            if (failIfNotExists) {
                throw new NoSuchElementException("No job generation state for harvest " + harvestId);
            }
            harvestState = new HarvestJobGenerationState();
            this.state.put(harvestId, harvestState);
        }

        return harvestState;
    }

    /**
     * @param harvest the {@link HarvestDefinition} being processed.
     * @return the generation state of the harvest, created if necessary.
     */
    private HarvestJobGenerationState getStateForHarvest(final HarvestDefinition harvest) {
        return getStateForHarvest(harvest, false);
    }

    /**
     * @param harvest the {@link HarvestDefinition} being processed.
     * @return the generation state of the harvest.
     * @throws NoSuchElementException if no state exists for the harvest.
     */
    private HarvestJobGenerationState getExistingStateForHarvest(final HarvestDefinition harvest) {
        return getStateForHarvest(harvest, true);
    }

    /**
     * Removes the generation state of a harvest.
     *
     * @param harvest the {@link HarvestDefinition} whose state is dropped.
     * @throws NoSuchElementException if no state exists for the harvest.
     */
    private synchronized void dropStateForHarvest(final HarvestDefinition harvest) {
        long harvestId = harvest.getOid();
        HarvestJobGenerationState harvestState = this.state.remove(harvestId);
        if (harvestState == null) {
            throw new NoSuchElementException("No job generation state for harvest " + harvestId);
        }
    }

}