001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.io.File;
026import java.io.FileReader;
027import java.io.IOException;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.sql.SQLException;
031import java.util.ArrayList;
032import java.util.Collection;
033import java.util.Date;
034import java.util.HashMap;
035import java.util.HashSet;
036import java.util.Iterator;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040
041import javax.servlet.jsp.PageContext;
042
043import org.apache.commons.io.LineIterator;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046
047import com.antiaction.raptor.dao.AttributeBase;
048import com.antiaction.raptor.dao.AttributeTypeBase;
049
050import dk.netarkivet.common.exceptions.ArgumentNotValid;
051import dk.netarkivet.common.exceptions.IOFailure;
052import dk.netarkivet.common.exceptions.UnknownID;
053import dk.netarkivet.common.utils.DomainUtils;
054import dk.netarkivet.common.utils.I18n;
055import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory;
056import dk.netarkivet.harvester.datamodel.eav.EAV;
057import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
058import dk.netarkivet.harvester.webinterface.EventHarvestUtil;
059
060/**
061 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest
062 * definitions. I.e. this class models definitions of event and selective harvests.
063 */
064public class PartialHarvest extends HarvestDefinition {
065
066    private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class);
067
068    /**
069     * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration
070     * name + domain name.
071     */
072    private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>();
073
074    /** The schedule used by this PartialHarvest. */
075    private Schedule schedule;
076
077    /**
078     * The next date this harvest definition should run, null if never again.
079     */
080    private Date nextDate;
081
082    /**
083     * Create new instance of a PartialHavest configured according to the properties of the supplied
084     * DomainConfiguration.
085     *
086     * @param domainConfigurations a list of domain configurations
087     * @param schedule the harvest definition schedule
088     * @param harvestDefName the name of the harvest definition
089     * @param comments comments
090     * @param audience The intended audience for this harvest (could be null)
091     */
092    public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName,
093            String comments, String audience) {
094        super(DAOProviderFactory.getExtendedFieldDAOProvider());
095        ArgumentNotValid.checkNotNull(schedule, "schedule");
096        ScheduleDAO.getInstance().read(schedule.getName());
097
098        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
099        ArgumentNotValid.checkNotNull(comments, "comments");
100        ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations");
101
102        this.numEvents = 0;
103        addConfigurations(domainConfigurations);
104        this.schedule = schedule;
105        this.harvestDefName = harvestDefName;
106        this.comments = comments;
107        this.nextDate = schedule.getFirstEvent(new Date());
108        this.audience = audience;
109    }
110
111    /**
112     * Returns the schedule defined for this harvest definition.
113     *
114     * @return schedule
115     */
116    public Schedule getSchedule() {
117        return schedule;
118    }
119
120    /**
121     * Set the schedule to be used for this harvestdefinition.
122     *
123     * @param schedule A schedule for when to try harvesting.
124     */
125    public void setSchedule(Schedule schedule) {
126        ArgumentNotValid.checkNotNull(schedule, "schedule");
127        this.schedule = schedule;
128        if (nextDate != null) {
129            setNextDate(schedule.getFirstEvent(nextDate));
130        }
131    }
132
133    /**
134     * Get the next date this harvest definition should be run.
135     *
136     * @return The next date the harvest definition should be run or null, if the harvest definition should never run
137     * again.
138     */
139    public Date getNextDate() {
140        return nextDate;
141    }
142
143    /**
144     * Set the next date this harvest definition should be run.
145     *
146     * @param nextDate The next date the harvest definition should be run. May be null, meaning never again.
147     */
148    public void setNextDate(Date nextDate) {
149        this.nextDate = nextDate;
150    }
151
152    /**
153     * Remove domainconfiguration from this partialHarvest.
154     *
155     * @param dcKey domainConfiguration key
156     */
157    public void removeDomainConfiguration(SparseDomainConfiguration dcKey) {
158        ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey");
159        if (domainConfigurations.remove(dcKey) == null) {
160            log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this);
161        }
162    }
163
164    /**
165     * Add a new domainconfiguration to this PartialHarvest.
166     *
167     * @param newConfiguration A new DomainConfiguration
168     */
169    public void addDomainConfiguration(DomainConfiguration newConfiguration) {
170        ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration");
171        SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration);
172        if (domainConfigurations.containsKey(key)) {
173            log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration,
174                    this);
175        } else {
176            domainConfigurations.put(key, newConfiguration);
177        }
178    }
179
180    /**
181     * Returns a List of domain configurations for this harvest definition.
182     *
183     * @return List containing information about the domain configurations
184     */
185    public Iterator<DomainConfiguration> getDomainConfigurations() {
186        return domainConfigurations.values().iterator();
187    }
188
189    /**
190     * @return the domainconfigurations as a list
191     */
192    public Collection<DomainConfiguration> getDomainConfigurationsAsList() {
193        return domainConfigurations.values();
194    }
195
196    /**
197     * Set the list of configurations that this PartialHarvest uses.
198     *
199     * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use.
200     */
201    public void setDomainConfigurations(List<DomainConfiguration> configs) {
202        ArgumentNotValid.checkNotNull(configs, "configs");
203
204        domainConfigurations.clear();
205        addConfigurations(configs);
206    }
207
208    /**
209     * Add the list of configurations to the configuration associated with this PartialHarvest.
210     *
211     * @param configs a List of configurations
212     */
213    private void addConfigurations(List<DomainConfiguration> configs) {
214        for (DomainConfiguration dc : configs) {
215            addConfiguration(dc);
216        }
217    }
218
219    /**
220     * Add a configuration to this PartialHarvest.
221     *
222     * @param dc the given configuration
223     */
224    private void addConfiguration(DomainConfiguration dc) {
225        domainConfigurations.put(new SparseDomainConfiguration(dc), dc);
226    }
227
228    /**
229     * Reset the harvest definition to no harvests and next date being the first possible for the schedule.
230     */
231    public void reset() {
232        numEvents = 0;
233        nextDate = schedule.getFirstEvent(new Date());
234    }
235
236    /**
237     * Check if this harvest definition should be run, given the time now.
238     *
239     * @param now The current time
240     * @return true if harvest definition should be run
241     */
242    public boolean runNow(Date now) {
243        ArgumentNotValid.checkNotNull(now, "now");
244        if (!getActive()) {
245            return false; // inactive definitions are never run
246        }
247        return nextDate != null && now.compareTo(nextDate) >= 0;
248    }
249
250    /**
251     * Returns whether this HarvestDefinition represents a snapshot harvest.
252     *
253     * @return false (always)
254     */
255    public boolean isSnapShot() {
256        return false;
257    }
258
259    /**
260     * Always returns no limit.
261     *
262     * @return 0, meaning no limit.
263     */
264    public long getMaxCountObjects() {
265        return Constants.HERITRIX_MAXOBJECTS_INFINITY;
266    }
267
268    /**
269     * Always returns no limit.
270     *
271     * @return -1, meaning no limit.
272     */
273    public long getMaxBytes() {
274        return Constants.HERITRIX_MAXBYTES_INFINITY;
275    }
276
277    /**
278     * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested
279     * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue
280     * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br>
281     * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br>
282     * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive).
283     *
284     * @param seeds a list of the seeds to be added
285     * @param templateName the name of the template to be used
286     * @param maxBytes Maximum number of bytes to harvest per domain
287     * @param maxObjects Maximum number of objects to harvest per domain
288     * @param attributeValues  Attributes read from webpage
289     * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details
290     */
291    public void addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) {
292        ArgumentNotValid.checkNotNull(seeds, "seeds");
293        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
294        if (!TemplateDAO.getInstance().exists(templateName)) {
295            throw new UnknownID("No such template: " + templateName);
296        }
297
298        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
299        StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n"
300                + "The following seeds are invalid:\n");
301        boolean valid = true;
302        // validate:
303
304        for (String seed : seeds) {
305            boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds);
306            if (!seedValid) {
307                valid = false;
308            }
309        }
310
311        if (!valid) {
312            throw new ArgumentNotValid(invalidMessage.toString());
313        }
314
315        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues);
316    }
317
318    /**
319     * This method is a duplicate of the addSeeds method but for seedsFile parameter
320     *
321     * @param seedsFile a newline-separated File containing the seeds to be added
322     * @param templateName the name of the template to be used
323     * @param maxBytes Maximum number of bytes to harvest per domain
324     * @param maxObjects Maximum number of objects to harvest per domain
325     */
326    public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String,String> attributeValues) {
327        ArgumentNotValid.checkNotNull(seedsFile, "seeds");
328        ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist");
329        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
330        if (!TemplateDAO.getInstance().exists(templateName)) {
331            throw new UnknownID("No such template: " + templateName);
332        }
333
334        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
335        StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n"
336                + "The following seeds are invalid:\n");
337        boolean valid = true;
338
339        // validate all the seeds in the file
340        // those accepted are entered into the acceptedSeeds datastructure
341
342        // Iterate through the contents of the file
343        LineIterator seedIterator = null;
344        try {
345            seedIterator = new LineIterator(new FileReader(seedsFile));
346            while (seedIterator.hasNext()) {
347                String seed = seedIterator.next();
348                boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds);
349                if (!seedValid) {
350                    valid = false;
351                }
352            }
353        } catch (IOException e) {
354            throw new IOFailure("Unable to process seedsfile ", e);
355        } finally {
356            LineIterator.closeQuietly(seedIterator);
357        }
358
359        if (!valid) {
360            throw new ArgumentNotValid(invalidMessage.toString());
361        }
362
363        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues);
364    }
365
366    /**
367     * Process each seed.
368     *
369     * @param seed The given seed.
370     * @param invalidMessage The message builder where the invalid seeds are added.
371     * @param acceptedSeeds The set of accepted seeds
372     * @return true, if the processed seed is valid or empty.
373     */
374    private boolean processSeed(String seed, StringBuilder invalidMessage, Map<String, Set<String>> acceptedSeeds) {
375        seed = seed.trim();
376        if (seed.length() != 0 && !seed.startsWith("#") && !seed.startsWith("//")) { // ignore empty lines and comments
377            
378            if (!(seed.toLowerCase().startsWith("http://") || seed.toLowerCase().startsWith("https://"))) {
379                seed = "http://" + seed;
380            }
381            URL url = null;
382            try {
383                url = new URL(seed);
384            } catch (MalformedURLException e) {
385                invalidMessage.append(seed);
386                invalidMessage.append('\n');
387                return false;
388            }
389            String host = url.getHost();
390            String domainName = DomainUtils.domainNameFromHostname(host);
391            if (domainName == null) {
392                invalidMessage.append(seed);
393                invalidMessage.append('\n');
394                return false;
395            }
396
397            Set<String> seedsForDomain = acceptedSeeds.get(domainName);
398            if (seedsForDomain == null) {
399                seedsForDomain = new HashSet<String>();
400                acceptedSeeds.put(domainName, seedsForDomain);
401            }
402            seedsForDomain.add(seed);
403        }
404        return true;
405    }
406
407    /**
408     * Generate domain configurations for the accepted seeds.
409     *
410     * @param templateName The Heritrix template to be used.
411     * @param maxBytes The number of max bytes allowed
412     * @param maxObjects The number of max objected allowed
413     * @param acceptedSeeds The set of accepted seeds
414     */
415    private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects,
416            Map<String, Set<String>> acceptedSeeds, Map<String, String> attributeValues) {
417        // Generate components for the name for the configuration and seedlist
418        final String maxbytesSuffix = "Bytes";
419        String maxBytesS = "Unlimited" + maxbytesSuffix;
420        if (maxBytes >= 0) {
421            maxBytesS = Long.toString(maxBytes);
422            maxBytesS = maxBytesS + maxbytesSuffix;
423        }
424
425        final String maxobjectsSuffix = "Objects";
426        String maxObjectsS = "Unlimited" + maxobjectsSuffix;
427        if (maxObjects >= 0) {
428            maxObjectsS = Long.toString(maxObjects);
429            maxObjectsS = maxObjectsS + maxobjectsSuffix;
430        }
431
432        String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS;
433
434        Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>();
435        for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) {
436            String domainName = entry.getKey();
437            Domain domain;
438            List<SeedList> seedListList = new ArrayList<SeedList>();
439            SeedList seedlist;
440            // Find or create the domain
441            if (DomainDAO.getInstance().exists(domainName)) {
442                domain = DomainDAO.getInstance().read(domainName);
443                
444                // If a config with this name exists already for the dommain, add a "_" + timestamp to the end of the name to be make it unique.
445                // This will probably happen rarely.
446                // This name is used for both the configuration and corresponding seed
447                if (domain.hasConfiguration(name)) {
448                    String oldName = name;
449                    name = name + "_" + System.currentTimeMillis();
450                    log.info("configuration '{}' for domain '{}' already exists. Change name for config and corresponding seed to ", 
451                            oldName, name, domain.getName());
452                }
453                seedlist =  new SeedList(name, ""); // Assure that the seedname is the same as the configname.
454                seedListList.add(seedlist);
455                domain.addSeedList(seedlist);
456                                
457            } else {
458                seedlist =  new SeedList(name, ""); // Assure that the seedname is the same as the configname.
459                seedListList.add(seedlist);
460                log.info("Creating domain {} in DomainDAO", domainName);
461                domain = Domain.getDefaultDomain(domainName);
462                domain.addSeedList(seedlist);
463                DomainDAO.getInstance().create(domain);
464            }
465            
466            DomainConfiguration dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>());
467            dc.setOrderXmlName(templateName);
468            dc.setMaxBytes(maxBytes);
469            dc.setMaxObjects(maxObjects);
470            domain.addConfiguration(dc);
471            log.info("Adding seeds til new configuration '{}' (id={}) for domain '{}' ", name, dc.getID(), domain.getName());
472
473
474            // Find the SeedList and add this seed to it
475            seedlist = domain.getSeedList(name);
476            List<String> currentSeeds = seedlist.getSeeds();
477            entry.getValue().addAll(currentSeeds);
478
479            List<String> allSeeds = new ArrayList<String>();
480
481            allSeeds.addAll(entry.getValue());
482            domain.updateSeedList(new SeedList(name, allSeeds));
483
484            // Add the configuration to the list of new configs for
485            // this harvest.
486            newDcs.add(dc);
487            DomainDAO.getInstance().update(domain);
488            log.info("Created configuration '{}' for domain {} with ID {}", dc.getName(), dc.getDomainName(), dc.getID());
489            saveAttributes(dc, attributeValues);
490        }
491
492        boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName);
493        if (thisInDAO) { // We have previously created this harvestdefinition in the HarvestDefinitionDAO.
494            HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance();
495            for (DomainConfiguration dc : newDcs) {
496                addConfiguration(dc);
497                hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc));
498            }
499            hddao.update(this);
500        } else { // not yet created in the HarvestDefinitionDAO
501            for (DomainConfiguration dc : newDcs) {
502                addConfiguration(dc);
503            }
504            HarvestDefinitionDAO.getInstance().create(this);
505        }
506    }
507
508    private void saveAttributes(DomainConfiguration dc, Map<String, String> attributeValues) {
509        if (dc.getID() == null) {
510             log.warn("Attributes not saved to database. Id of domainConfiguration not yet available");
511             return;
512        }
513        // EAV
514        try {
515            long entity_id = dc.getID();
516            log.info("Saving attributes for domain config id {} and name {} and domain {}", entity_id, dc.getName(), dc.getDomainName());
517            EAV eav = EAV.getInstance();
518            List<AttributeAndType> attributeTypes = eav.getAttributesAndTypes(EAV.DOMAIN_TREE_ID, (int)entity_id);
519            log.debug("3 attributes available for entity {}", entity_id);
520            AttributeAndType attributeAndType;
521            AttributeTypeBase attributeType;
522            AttributeBase attribute;
523            for (int i=0; i<attributeTypes.size(); ++i) {
524                attributeAndType = attributeTypes.get(i);
525                attributeType = attributeAndType.attributeType;
526                log.debug("Examining attribute {}",attributeType.name);
527                attribute = attributeAndType.attribute;
528                if (attribute == null) {
529                    attribute = attributeType.instanceOf();
530                    attribute.entity_id = (int)entity_id;
531                }
532                switch (attributeType.viewtype) {
533                case 1:
534                    String paramValue = attributeValues.get(attributeType.name);
535                    int intValue;
536                    if (paramValue != null) {
537                      intValue = Integer.decode(paramValue);
538                    } else {
539                      intValue = attributeType.def_int;
540                    }
541                    log.info("Setting attribute {} to value {}", attributeType.name, intValue);
542                    attribute.setInteger(intValue);
543                    break;
544                case 5:
545                case 6:
546                    paramValue = attributeValues.get(attributeType.name);
547                    int intVal = 0;
548                    if (paramValue != null && !"0".equals(paramValue)) {
549                        intVal = 1;
550                    } 
551                    log.debug("Set intVal = 1 for attribute {} when receiving paramValue={}", attributeType.name, paramValue);
552                    attribute.setInteger(intVal);
553                    break;
554                }
555                eav.saveAttribute(attribute);
556            }
557        } catch (SQLException e) {
558            throw new RuntimeException("Unable to store EAV data!", e);
559        }
560    }
561}