001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
039
040import javax.servlet.jsp.PageContext;
041
042import org.apache.commons.io.LineIterator;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046import dk.netarkivet.common.exceptions.ArgumentNotValid;
047import dk.netarkivet.common.exceptions.IOFailure;
048import dk.netarkivet.common.exceptions.UnknownID;
049import dk.netarkivet.common.utils.DomainUtils;
050import dk.netarkivet.common.utils.I18n;
051import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory;
052import dk.netarkivet.harvester.webinterface.EventHarvestUtil;
053
054/**
055 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest
056 * definitions. I.e. this class models definitions of event and selective harvests.
057 */
058public class PartialHarvest extends HarvestDefinition {
059
060    private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class);
061
    /**
     * The domain configurations being harvested by this harvest, keyed by their SparseDomainConfiguration. Entries
     * in this map are unique on configuration name + domain name.
     */
066    private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>();
067
068    /** The schedule used by this PartialHarvest. */
069    private Schedule schedule;
070
071    /**
072     * The next date this harvest definition should run, null if never again.
073     */
074    private Date nextDate;
075
076    /**
077     * Create new instance of a PartialHavest configured according to the properties of the supplied
078     * DomainConfiguration.
079     *
080     * @param domainConfigurations a list of domain configurations
081     * @param schedule the harvest definition schedule
082     * @param harvestDefName the name of the harvest definition
083     * @param comments comments
084     * @param audience The intended audience for this harvest (could be null)
085     */
086    public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName,
087            String comments, String audience) {
088        super(DAOProviderFactory.getExtendedFieldDAOProvider());
089        ArgumentNotValid.checkNotNull(schedule, "schedule");
090        ScheduleDAO.getInstance().read(schedule.getName());
091
092        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
093        ArgumentNotValid.checkNotNull(comments, "comments");
094        ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations");
095
096        this.numEvents = 0;
097        addConfigurations(domainConfigurations);
098        this.schedule = schedule;
099        this.harvestDefName = harvestDefName;
100        this.comments = comments;
101        this.nextDate = schedule.getFirstEvent(new Date());
102        this.audience = audience;
103    }
104
105    /**
106     * Returns the schedule defined for this harvest definition.
107     *
108     * @return schedule
109     */
110    public Schedule getSchedule() {
111        return schedule;
112    }
113
114    /**
115     * Set the schedule to be used for this harvestdefinition.
116     *
117     * @param schedule A schedule for when to try harvesting.
118     */
119    public void setSchedule(Schedule schedule) {
120        ArgumentNotValid.checkNotNull(schedule, "schedule");
121        this.schedule = schedule;
122        if (nextDate != null) {
123            setNextDate(schedule.getFirstEvent(nextDate));
124        }
125    }
126
127    /**
128     * Get the next date this harvest definition should be run.
129     *
130     * @return The next date the harvest definition should be run or null, if the harvest definition should never run
131     * again.
132     */
133    public Date getNextDate() {
134        return nextDate;
135    }
136
137    /**
138     * Set the next date this harvest definition should be run.
139     *
140     * @param nextDate The next date the harvest definition should be run. May be null, meaning never again.
141     */
142    public void setNextDate(Date nextDate) {
143        this.nextDate = nextDate;
144    }
145
146    /**
147     * Remove domainconfiguration from this partialHarvest.
148     *
149     * @param dcKey domainConfiguration key
150     */
151    public void removeDomainConfiguration(SparseDomainConfiguration dcKey) {
152        ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey");
153        if (domainConfigurations.remove(dcKey) == null) {
154            log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this);
155        }
156    }
157
158    /**
159     * Add a new domainconfiguration to this PartialHarvest.
160     *
161     * @param newConfiguration A new DomainConfiguration
162     */
163    public void addDomainConfiguration(DomainConfiguration newConfiguration) {
164        ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration");
165        SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration);
166        if (domainConfigurations.containsKey(key)) {
167            log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration,
168                    this);
169        } else {
170            domainConfigurations.put(key, newConfiguration);
171        }
172    }
173
174    /**
175     * Returns a List of domain configurations for this harvest definition.
176     *
177     * @return List containing information about the domain configurations
178     */
179    public Iterator<DomainConfiguration> getDomainConfigurations() {
180        return domainConfigurations.values().iterator();
181    }
182
183    /**
184     * @return the domainconfigurations as a list
185     */
186    public Collection<DomainConfiguration> getDomainConfigurationsAsList() {
187        return domainConfigurations.values();
188    }
189
190    /**
191     * Set the list of configurations that this PartialHarvest uses.
192     *
193     * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use.
194     */
195    public void setDomainConfigurations(List<DomainConfiguration> configs) {
196        ArgumentNotValid.checkNotNull(configs, "configs");
197
198        domainConfigurations.clear();
199        addConfigurations(configs);
200    }
201
202    /**
203     * Add the list of configurations to the configuration associated with this PartialHarvest.
204     *
205     * @param configs a List of configurations
206     */
207    private void addConfigurations(List<DomainConfiguration> configs) {
208        for (DomainConfiguration dc : configs) {
209            addConfiguration(dc);
210        }
211    }
212
213    /**
214     * Add a configuration to this PartialHarvest.
215     *
216     * @param dc the given configuration
217     */
218    private void addConfiguration(DomainConfiguration dc) {
219        domainConfigurations.put(new SparseDomainConfiguration(dc), dc);
220    }
221
222    /**
223     * Reset the harvest definition to no harvests and next date being the first possible for the schedule.
224     */
225    public void reset() {
226        numEvents = 0;
227        nextDate = schedule.getFirstEvent(new Date());
228    }
229
230    /**
231     * Check if this harvest definition should be run, given the time now.
232     *
233     * @param now The current time
234     * @return true if harvest definition should be run
235     */
236    public boolean runNow(Date now) {
237        ArgumentNotValid.checkNotNull(now, "now");
238        if (!getActive()) {
239            return false; // inactive definitions are never run
240        }
241        return nextDate != null && now.compareTo(nextDate) >= 0;
242    }
243
244    /**
245     * Returns whether this HarvestDefinition represents a snapshot harvest.
246     *
247     * @return false (always)
248     */
249    public boolean isSnapShot() {
250        return false;
251    }
252
253    /**
254     * Always returns no limit.
255     *
256     * @return 0, meaning no limit.
257     */
258    public long getMaxCountObjects() {
259        return Constants.HERITRIX_MAXOBJECTS_INFINITY;
260    }
261
262    /**
263     * Always returns no limit.
264     *
265     * @return -1, meaning no limit.
266     */
267    public long getMaxBytes() {
268        return Constants.HERITRIX_MAXBYTES_INFINITY;
269    }
270
271    /**
272     * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested
273     * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue
274     * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br>
275     * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br>
276     * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive).
277     *
278     * @param seeds a list of the seeds to be added
279     * @param templateName the name of the template to be used
280     * @param maxBytes Maximum number of bytes to harvest per domain
281     * @param maxObjects Maximum number of objects to harvest per domain
282     * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details
283     */
284    public void addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects) {
285        ArgumentNotValid.checkNotNull(seeds, "seeds");
286        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
287        if (!TemplateDAO.getInstance().exists(templateName)) {
288            throw new UnknownID("No such template: " + templateName);
289        }
290
291        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
292        StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n"
293                + "The following seeds are invalid:\n");
294        boolean valid = true;
295        // validate:
296
297        for (String seed : seeds) {
298            boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds);
299            if (!seedValid) {
300                valid = false;
301            }
302        }
303
304        if (!valid) {
305            throw new ArgumentNotValid(invalidMessage.toString());
306        }
307
308        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds);
309    }
310
311    /**
312     * This method is a duplicate of the addSeeds method but for seedsFile parameter
313     *
314     * @param seedsFile a newline-separated File containing the seeds to be added
315     * @param templateName the name of the template to be used
316     * @param maxBytes Maximum number of bytes to harvest per domain
317     * @param maxObjects Maximum number of objects to harvest per domain
318     */
319    public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects) {
320        ArgumentNotValid.checkNotNull(seedsFile, "seeds");
321        ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist");
322        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
323        if (!TemplateDAO.getInstance().exists(templateName)) {
324            throw new UnknownID("No such template: " + templateName);
325        }
326
327        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
328        StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n"
329                + "The following seeds are invalid:\n");
330        boolean valid = true;
331
332        // validate all the seeds in the file
333        // those accepted are entered into the acceptedSeeds datastructure
334
335        // Iterate through the contents of the file
336        LineIterator seedIterator = null;
337        try {
338            seedIterator = new LineIterator(new FileReader(seedsFile));
339            while (seedIterator.hasNext()) {
340                String seed = seedIterator.next();
341                boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds);
342                if (!seedValid) {
343                    valid = false;
344                }
345            }
346        } catch (IOException e) {
347            throw new IOFailure("Unable to process seedsfile ", e);
348        } finally {
349            LineIterator.closeQuietly(seedIterator);
350        }
351
352        if (!valid) {
353            throw new ArgumentNotValid(invalidMessage.toString());
354        }
355
356        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds);
357    }
358
359    /**
360     * Process each seed.
361     *
362     * @param seed The given seed.
363     * @param invalidMessage The message builder where the invalid seeds are added.
364     * @param acceptedSeeds The set of accepted seeds
365     * @return true, if the processed seed is valid or empty.
366     */
367    private boolean processSeed(String seed, StringBuilder invalidMessage, Map<String, Set<String>> acceptedSeeds) {
368        seed = seed.trim();
369        if (seed.length() != 0) {
370            if (!(seed.startsWith("http://") || seed.startsWith("https://"))) {
371                seed = "http://" + seed;
372            }
373            URL url = null;
374            try {
375                url = new URL(seed);
376            } catch (MalformedURLException e) {
377                invalidMessage.append(seed);
378                invalidMessage.append('\n');
379                return false;
380            }
381            String host = url.getHost();
382            String domainName = DomainUtils.domainNameFromHostname(host);
383            if (domainName == null) {
384                invalidMessage.append(seed);
385                invalidMessage.append('\n');
386                return false;
387            }
388
389            Set<String> seedsForDomain = acceptedSeeds.get(domainName);
390            if (seedsForDomain == null) {
391                seedsForDomain = new HashSet<String>();
392                acceptedSeeds.put(domainName, seedsForDomain);
393            }
394            seedsForDomain.add(seed);
395        }
396        return true;
397    }
398
399    /**
400     * Generate domain configurations for the accepted seeds.
401     *
402     * @param templateName The Heritrix template to be used.
403     * @param maxBytes The number of max bytes allowed
404     * @param maxObjects The number of max objected allowed
405     * @param acceptedSeeds The set of accepted seeds
406     */
407    private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects,
408            Map<String, Set<String>> acceptedSeeds) {
409        // Generate components for the name for the configuration and seedlist
410        final String maxbytesSuffix = "Bytes";
411        String maxBytesS = "Unlimited" + maxbytesSuffix;
412        if (maxBytes >= 0) {
413            maxBytesS = Long.toString(maxBytes);
414            maxBytesS = maxBytesS + maxbytesSuffix;
415        }
416
417        final String maxobjectsSuffix = "Objects";
418        String maxObjectsS = "Unlimited" + maxobjectsSuffix;
419        if (maxObjects >= 0) {
420            maxObjectsS = Long.toString(maxObjects);
421            maxObjectsS = maxObjectsS + maxobjectsSuffix;
422        }
423
424        String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS;
425
426        Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>();
427        for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) {
428            String domainName = entry.getKey();
429            Domain domain;
430
431            // Need a seedlist to include in the configuration when we
432            // create it. This will be replaced later.
433            SeedList seedlist = new SeedList(name, "");
434            List<SeedList> seedListList = new ArrayList<SeedList>();
435            seedListList.add(seedlist);
436
437            // Find or create the domain
438            if (DomainDAO.getInstance().exists(domainName)) {
439                domain = DomainDAO.getInstance().read(domainName);
440                if (!domain.hasSeedList(name)) {
441                    domain.addSeedList(seedlist);
442                }
443            } else {
444                domain = Domain.getDefaultDomain(domainName);
445                domain.addSeedList(seedlist);
446                DomainDAO.getInstance().create(domain);
447            }
448            // Find or create the DomainConfiguration
449            DomainConfiguration dc = null;
450            if (domain.hasConfiguration(name)) {
451                dc = domain.getConfiguration(name);
452            } else {
453                dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>());
454                dc.setOrderXmlName(templateName);
455
456                dc.setMaxBytes(maxBytes);
457                dc.setMaxObjects(maxObjects);
458                domain.addConfiguration(dc);
459            }
460
461            // Find the SeedList and add this seed to it
462            seedlist = domain.getSeedList(name);
463            List<String> currentSeeds = seedlist.getSeeds();
464            entry.getValue().addAll(currentSeeds);
465
466            List<String> allSeeds = new ArrayList<String>();
467
468            allSeeds.addAll(entry.getValue());
469            domain.updateSeedList(new SeedList(name, allSeeds));
470
471            // Add the configuration to the list of new configs for
472            // this harvest.
473            newDcs.add(dc);
474            DomainDAO.getInstance().update(domain);
475        }
476
477        boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName);
478        if (thisInDAO) {
479            HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance();
480            for (DomainConfiguration dc : newDcs) {
481                addConfiguration(dc);
482                hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc));
483            }
484            hddao.update(this);
485        } else {
486            for (DomainConfiguration dc : newDcs) {
487                addConfiguration(dc);
488            }
489            HarvestDefinitionDAO.getInstance().create(this);
490        }
491
492    }
493
494}