001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.scheduler.jobgen;
024
025import java.util.Comparator;
026import java.util.HashMap;
027import java.util.Iterator;
028import java.util.Map;
029import java.util.NoSuchElementException;
030
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import dk.netarkivet.common.utils.Settings;
035import dk.netarkivet.harvester.HarvesterSettings;
036import dk.netarkivet.harvester.datamodel.DomainConfiguration;
037import dk.netarkivet.harvester.datamodel.HarvestDefinition;
038import dk.netarkivet.harvester.datamodel.Job;
039import dk.netarkivet.harvester.datamodel.JobDAO;
040
041/**
 * Job generator implementation. Generates jobs with a fixed number of domain configurations. The configuration allows
 * choosing a different count for partial and full harvests. The last job generated may have fewer configurations in
 * it, as job generation happens on a per-harvest basis.
045 *
046 * @see HarvesterSettings#JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT
047 * @see HarvesterSettings#JOBGEN_FIXED_CONFIG_COUNT_FOCUSED
048 */
049public class FixedDomainConfigurationCountJobGenerator extends AbstractJobGenerator {
050
051    /** Logger for this class. */
052    private static final Logger log = LoggerFactory.getLogger(FixedDomainConfigurationCountJobGenerator.class);
053
054    /**
055     * A compound key used to split domain configurations in jobs.
056     */
057    private class DomainConfigurationKey {
058
059        /** The name of the Heritrix crawl order template. */
060        private final String orderXmlName;
061        /** The crawl budget in URI. */
062        private final long maxObjects;
063        /** The crawl budget in bytes. */
064        private final long maxBytes;
065
066        /**
067         * Constructor from a domain configuration.
068         *
069         * @param cfg the related {@link DomainConfiguration}
070         */
071        DomainConfigurationKey(DomainConfiguration cfg) {
072            this.orderXmlName = cfg.getOrderXmlName();
073            long cMaxBytes = cfg.getMaxBytes();
074            long cMaxObjects = cfg.getMaxObjects();
075            if (cMaxBytes == 0 || cMaxObjects == 0) {
076                // All domain configurations with a zero budget (either size or URI count
077                // end up in the same group
078                this.maxBytes = 0;
079                this.maxObjects = 0;
080            } else {
081                this.maxBytes = cMaxBytes;
082                this.maxObjects = cMaxObjects;
083            }
084        }
085
086        @Override
087        public int hashCode() {
088            final int prime = 31;
089            int result = 1;
090            result = prime * result + (int) (maxBytes ^ (maxBytes >>> 32));
091            result = prime * result + (int) (maxObjects ^ (maxObjects >>> 32));
092            result = prime * result + ((orderXmlName == null) ? 0 : orderXmlName.hashCode());
093            return result;
094        }
095
096        @Override
097        public boolean equals(Object obj) {
098            if (obj == null || !DomainConfigurationKey.class.equals(obj.getClass())) {
099                return false;
100            }
101            DomainConfigurationKey dc = (DomainConfigurationKey) obj;
102            return orderXmlName.equals(dc.orderXmlName) && maxBytes == dc.maxBytes && maxObjects == dc.maxObjects;
103        }
104
105        @Override
106        public String toString() {
107            return orderXmlName + ":" + maxObjects + ":" + maxBytes;
108        }
109    }
110
111    /**
112     * Simple marker class to improve code readability.
113     * <p>
114     * Maps jobs currently being filled, for a given harvest definition, with domain configurations by harvest template
115     * name. These jobs keep getting new configurations until no more configurations are left to process or the
116     * configured size has been reached.
117     */
118    @SuppressWarnings("serial")
119    private class HarvestJobGenerationState extends HashMap<DomainConfigurationKey, Job> {
120    }
121
122    /**
123     * Compare two configurations in alphabetical order of their name.
124     */
125    private static class ConfigNamesComparator implements Comparator<DomainConfiguration> {
126
127        @Override
128        public int compare(DomainConfiguration dc1, DomainConfiguration dc2) {
129            return dc1.getName().compareTo(dc2.getName());
130        }
131
132    }
133
134    /** Constant : how many {@link DomainConfiguration}s we want in a focused harvest job. */
135    private static long CONFIG_COUNT_FOCUSED = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_FOCUSED);
136
137    /** Constant : how many {@link DomainConfiguration}s we want in a snapshot harvest job. */
138    private static long CONFIG_COUNT_SNAPSHOT = Settings.getLong(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_SNAPSHOT);
139
140    /** Constant : exclude {@link DomainConfiguration}s with a budget of zero (bytes or objects). */
141    private static boolean EXCLUDE_ZERO_BUDGET = Settings
142            .getBoolean(HarvesterSettings.JOBGEN_FIXED_CONFIG_COUNT_EXCLUDE_ZERO_BUDGET);
143
144    /** The singleton instance. */
145    public static FixedDomainConfigurationCountJobGenerator instance;
146
147    /**
148     * Maps jobs currently being filled with domain configurations by harvest template name. These jobs keep getting new
149     * configurations until no more configurations are left to process or the configured size has been reached.
150     */
151    private Map<Long, HarvestJobGenerationState> state;
152
153    /** The job DAO instance (singleton). */
154    private JobDAO dao = JobDAO.getInstance();
155
156    private FixedDomainConfigurationCountJobGenerator() {
157        this.state = new HashMap<Long, HarvestJobGenerationState>();
158    }
159
160    /**
161     * @return the singleton instance, builds it if necessary.
162     */
163    public synchronized static FixedDomainConfigurationCountJobGenerator getInstance() {
164        if (instance == null) {
165            instance = new FixedDomainConfigurationCountJobGenerator();
166        }
167        return instance;
168    }
169
170    @Override
171    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(HarvestDefinition harvest) {
172        return new ConfigNamesComparator();
173    }
174
175    @Override
176    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
177        return job.getDomainConfigurationMap().size() < (job.isSnapshot() ? CONFIG_COUNT_SNAPSHOT
178                : CONFIG_COUNT_FOCUSED);
179    }
180
181    @Override
182    public int generateJobs(HarvestDefinition harvest) {
183        HarvestJobGenerationState jobsUnderConstruction = getStateForHarvest(harvest);
184
185        try {
186            int jobsComplete = super.generateJobs(harvest);
187
188            // Look if we have jobs that have not reached their limit, but are complete
189            // as we have finished processing the harvest
190            if (!jobsUnderConstruction.isEmpty()) {
191                for (Job job : jobsUnderConstruction.values()) {
192                    // The job is ready, post-process and store it in DB
193                    editJobOrderXml(job);
194                    dao.create(job);
195
196                    // Increment counter
197                    ++jobsComplete;
198                }
199            }
200
201            return jobsComplete;
202        } finally {
203            dropStateForHarvest(harvest);
204        }
205    }
206
207    @Override
208    protected int processDomainConfigurationSubset(HarvestDefinition harvest,
209            Iterator<DomainConfiguration> domainConfSubset) {
210        HarvestJobGenerationState jobsUnderConstruction = getExistingStateForHarvest(harvest);
211        int jobsComplete = 0;
212        while (domainConfSubset.hasNext()) {
213            DomainConfiguration cfg = domainConfSubset.next();
214
215            // Should we exclude a configuration with a budget of zero?
216            if (EXCLUDE_ZERO_BUDGET && (0 == cfg.getMaxBytes() || 0 == cfg.getMaxObjects())) {
217                log.info("[JobGen] Config '{}' for '{}' excluded (0{})", cfg.getName(), cfg.getDomainName(),
218                        (cfg.getMaxBytes() == 0 ? " bytes" : " objects"));
219                continue;
220            }
221
222            DomainConfigurationKey domainConfigKey = new DomainConfigurationKey(cfg);
223            Job match = jobsUnderConstruction.get(domainConfigKey);
224            if (match == null) {
225                match = initNewJob(harvest, cfg);
226            } else {
227                if (canAccept(match, cfg, null)) {
228                    match.addConfiguration(cfg);
229                } else {
230                    // The job is ready, post-process and store it in DB
231                    editJobOrderXml(match);
232                    dao.create(match);
233
234                    // Increment counter
235                    ++jobsComplete;
236
237                    // Start construction of a new job
238                    initNewJob(harvest, cfg);
239                }
240            }
241        }
242        return jobsComplete;
243    }
244
245    /**
246     * Initializes a new job.
247     *
248     * @param harvest the {@link HarvestDefinition} being processed.
249     * @param cfg the first {@link DomainConfiguration} for this job.
250     * @return the {@link Job} instance
251     */
252    private Job initNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
253        HarvestJobGenerationState jobsUnderConstruction = getExistingStateForHarvest(harvest);
254        Job job = getNewJob(harvest, cfg);
255        jobsUnderConstruction.put(new DomainConfigurationKey(cfg), job);
256        return job;
257    }
258
259    private synchronized HarvestJobGenerationState getStateForHarvest(final HarvestDefinition harvest,
260            final boolean failIfNotExists) {
261
262        long harvestId = harvest.getOid();
263        HarvestJobGenerationState harvestState = this.state.get(harvestId);
264        if (harvestState == null) {
265            if (failIfNotExists) {
266                throw new NoSuchElementException("No job generation state for harvest " + harvestId);
267            }
268            harvestState = new HarvestJobGenerationState();
269            this.state.put(harvestId, harvestState);
270        }
271
272        return harvestState;
273    }
274
275    private HarvestJobGenerationState getStateForHarvest(final HarvestDefinition harvest) {
276        return getStateForHarvest(harvest, false);
277    }
278
279    private HarvestJobGenerationState getExistingStateForHarvest(final HarvestDefinition harvest) {
280        return getStateForHarvest(harvest, true);
281    }
282
283    private synchronized void dropStateForHarvest(final HarvestDefinition harvest) {
284        long harvestId = harvest.getOid();
285        HarvestJobGenerationState harvestState = this.state.remove(harvestId);
286        if (harvestState == null) {
287            throw new NoSuchElementException("No job generation state for harvest " + harvestId);
288        }
289    }
290
291}