Job generation workflow: Level 1: HarvestJobGenerator#JobGeneratorTask.generateJobs JobGenerator jobGen = JobGeneratorFactory.getInstance(); int jobsMade = jobGen.generateJobs(harvestDefinition); [SEE LEVEL 2] // After job generation, generateJobs has also set the next date of this harvest (harvestDefinition), and updated // numEvents log.info("Created " + jobsMade + " jobs for harvest definition (" + harvestDefinition.getName() + ")"); // Commit the changes made to the harvestDefinition to the database haDefinitionDAO.update(harvestDefinition); Level 2: AbstractJobGenerator.generateJobs(harvestDefinition) final Iterator domainConfigurations = harvest.getDomainConfigurations(); [SEE LEVEL 3] while (domainConfigurations.hasNext()) { List subset = new ArrayList(); while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) { subset.add(domainConfigurations.next()); } if (log.isTraceEnabled()) { log.trace("Found " + subset.size() + " domainconfigs to process for harvest # " + harvest.getOid()); } Collections.sort( subset, getDomainConfigurationSubsetComparator(harvest)); if (log.isTraceEnabled()) { log.trace(subset.size() + " domainconfigs now sorted and ready to processing"); } jobsMade += processDomainConfigurationSubset(harvest, subset.iterator()); } // Next follows an update of the next date and numEvents for the harvest Level 3: The most time-consuming part: Finding the domainConfigurations final Iterator domainConfigurations = harvest.getDomainConfigurations(); expands into either: PartialHarvest.getDomainConfigurations (which is an already existing iterator at call-time) FullHarvest.getDomainConfigurations() //which expands into this for a snapshot harvest that does not continue // previous harvest: if (previousHarvestDefinitionOid == null) { //The first snapshot harvest HarvestDefinitionDAO hdDao = HarvestDefinitionDAO.getInstance(); return hdDao.getSnapShotConfigurations(); [SEE LEVEL 4] } else: //An iterative snapshot harvest final DomainDAO dao 
= DomainDAO.getInstance(); //Get what has been harvested Iterator i = dao.getHarvestInfoBasedOnPreviousHarvestDefinition( getPreviousHarvestDefinition()); // return new FilterIterator(i) { protected DomainConfiguration filter(HarvestInfo harvestInfo) { if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) { // Don't include the ones that finished or died // in an unclean fashion return null; } DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao); if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) { // Check if MaxBytes limit for DomainConfiguration have // been raised since previous harvest. // If this is the case, return the configuration int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved()); if (compare < 1) { return null; } else { return config; } } if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) { // Check if MaxObjects limit for DomainConfiguration have // been raised since previous harvest. // If this is the case, return the configuration int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved()); if (compare < 1) { return null; } else { return config; } } Domain d = dao.read(config.getDomainName()); if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) { //Don't include aliases return null; } else { return config; } } }; Level 4: /** * Gets default configurations for all domains that are not aliases. * * This method currently gives an iterator that reads in all domains, * although only on demand, that is: when calling "hasNext". 
* * @return Iterator containing the default DomainConfiguration for all * domains that are not aliases */ @Override public synchronized Iterator getSnapShotConfigurations() { return new FilterIterator(DomainDAO .getInstance().getAllDomainsInSnapshotHarvestOrder()) { [SEE LEVEL 5] public DomainConfiguration filter(Domain domain) { if (domain.getAliasInfo() == null || domain.getAliasInfo().isExpired()) { return domain.getDefaultConfiguration(); } else { return null; } } }; } Level 5: DomainDBDAO.getAllDomainsInSnapshotHarvestOrder(); @Override public Iterator getAllDomainsInSnapshotHarvestOrder() { Connection c = HarvestDBConnection.get(); try { // Note: maxbytes are ordered with largest first for symmetry // with HarvestDefinition.CompareConfigDesc List domainNames = DBUtils.selectStringList( c, "SELECT domains.name" + " FROM domains, configurations, ordertemplates" + " WHERE domains.defaultconfig=configurations.config_id" + " AND configurations.template_id" + "=ordertemplates.template_id" + " ORDER BY" + " ordertemplates.name," + " configurations.maxbytes DESC," + " domains.name"); return new FilterIterator(domainNames.iterator()) { public Domain filter(String s) { return readKnown(s); } }; } finally { HarvestDBConnection.release(c); } }