001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.io.File;
026import java.io.FileReader;
027import java.io.IOException;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.sql.SQLException;
031import java.util.ArrayList;
032import java.util.Collection;
033import java.util.Date;
034import java.util.HashMap;
035import java.util.HashSet;
036import java.util.Iterator;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040
041import javax.servlet.jsp.PageContext;
042
043import org.apache.commons.io.LineIterator;
044import org.apache.commons.lang.StringUtils;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048import com.antiaction.raptor.dao.AttributeBase;
049import com.antiaction.raptor.dao.AttributeTypeBase;
050
051import dk.netarkivet.common.exceptions.ArgumentNotValid;
052import dk.netarkivet.common.exceptions.IOFailure;
053import dk.netarkivet.common.exceptions.UnknownID;
054import dk.netarkivet.common.utils.DomainUtils;
055import dk.netarkivet.common.utils.I18n;
056import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory;
057import dk.netarkivet.harvester.datamodel.eav.EAV;
058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
059import dk.netarkivet.harvester.webinterface.EventHarvestUtil;
060
061/**
062 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest
063 * definitions. I.e. this class models definitions of event and selective harvests.
064 */
065public class PartialHarvest extends HarvestDefinition {
066
067    private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class);
068
069    /**
070     * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration
071     * name + domain name.
072     */
073    private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>();
074
075    /** The schedule used by this PartialHarvest. */
076    private Schedule schedule;
077
078    /**
079     * The next date this harvest definition should run, null if never again.
080     */
081    private Date nextDate;
082
083    /**
084     * Create new instance of a PartialHavest configured according to the properties of the supplied
085     * DomainConfiguration.
086     *
087     * @param domainConfigurations a list of domain configurations
088     * @param schedule the harvest definition schedule
089     * @param harvestDefName the name of the harvest definition
090     * @param comments comments
091     * @param audience The intended audience for this harvest (could be null)
092     */
093    public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName,
094            String comments, String audience) {
095        super(DAOProviderFactory.getExtendedFieldDAOProvider());
096        ArgumentNotValid.checkNotNull(schedule, "schedule");
097        ScheduleDAO.getInstance().read(schedule.getName());
098
099        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
100        ArgumentNotValid.checkNotNull(comments, "comments");
101        ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations");
102
103        this.numEvents = 0;
104        addConfigurations(domainConfigurations);
105        this.schedule = schedule;
106        this.harvestDefName = harvestDefName;
107        this.comments = comments;
108        this.nextDate = schedule.getFirstEvent(new Date());
109        this.audience = audience;
110    }
111
112    /**
113     * Returns the schedule defined for this harvest definition.
114     *
115     * @return schedule
116     */
117    public Schedule getSchedule() {
118        return schedule;
119    }
120
121    /**
122     * Set the schedule to be used for this harvestdefinition.
123     *
124     * @param schedule A schedule for when to try harvesting.
125     */
126    public void setSchedule(Schedule schedule) {
127        ArgumentNotValid.checkNotNull(schedule, "schedule");
128        this.schedule = schedule;
129        if (nextDate != null) {
130            setNextDate(schedule.getFirstEvent(nextDate));
131        }
132    }
133
134    /**
135     * Get the next date this harvest definition should be run.
136     *
137     * @return The next date the harvest definition should be run or null, if the harvest definition should never run
138     * again.
139     */
140    public Date getNextDate() {
141        return nextDate;
142    }
143
144    /**
145     * Set the next date this harvest definition should be run.
146     *
147     * @param nextDate The next date the harvest definition should be run. May be null, meaning never again.
148     */
149    public void setNextDate(Date nextDate) {
150        this.nextDate = nextDate;
151    }
152
153    /**
154     * Remove domainconfiguration from this partialHarvest.
155     *
156     * @param dcKey domainConfiguration key
157     */
158    public void removeDomainConfiguration(SparseDomainConfiguration dcKey) {
159        ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey");
160        if (domainConfigurations.remove(dcKey) == null) {
161            log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this);
162        }
163    }
164
165    /**
166     * Add a new domainconfiguration to this PartialHarvest.
167     *
168     * @param newConfiguration A new DomainConfiguration
169     */
170    public void addDomainConfiguration(DomainConfiguration newConfiguration) {
171        ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration");
172        SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration);
173        if (domainConfigurations.containsKey(key)) {
174            log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration,
175                    this);
176        } else {
177            domainConfigurations.put(key, newConfiguration);
178        }
179    }
180
181    /**
182     * Returns a List of domain configurations for this harvest definition.
183     *
184     * @return List containing information about the domain configurations
185     */
186    public Iterator<DomainConfiguration> getDomainConfigurations() {
187        return domainConfigurations.values().iterator();
188    }
189
190    /**
191     * @return the domainconfigurations as a list
192     */
193    public Collection<DomainConfiguration> getDomainConfigurationsAsList() {
194        return domainConfigurations.values();
195    }
196
197    /**
198     * Set the list of configurations that this PartialHarvest uses.
199     *
200     * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use.
201     */
202    public void setDomainConfigurations(List<DomainConfiguration> configs) {
203        ArgumentNotValid.checkNotNull(configs, "configs");
204
205        domainConfigurations.clear();
206        addConfigurations(configs);
207    }
208
209    /**
210     * Add the list of configurations to the configuration associated with this PartialHarvest.
211     *
212     * @param configs a List of configurations
213     */
214    private void addConfigurations(List<DomainConfiguration> configs) {
215        for (DomainConfiguration dc : configs) {
216            addConfiguration(dc);
217        }
218    }
219
220    /**
221     * Add a configuration to this PartialHarvest.
222     *
223     * @param dc the given configuration
224     */
225    private void addConfiguration(DomainConfiguration dc) {
226        domainConfigurations.put(new SparseDomainConfiguration(dc), dc);
227    }
228
229    /**
230     * Reset the harvest definition to no harvests and next date being the first possible for the schedule.
231     */
232    public void reset() {
233        numEvents = 0;
234        nextDate = schedule.getFirstEvent(new Date());
235    }
236
237    /**
238     * Check if this harvest definition should be run, given the time now.
239     *
240     * @param now The current time
241     * @return true if harvest definition should be run
242     */
243    public boolean runNow(Date now) {
244        ArgumentNotValid.checkNotNull(now, "now");
245        if (!getActive()) {
246            return false; // inactive definitions are never run
247        }
248        return nextDate != null && now.compareTo(nextDate) >= 0;
249    }
250
251    /**
252     * Returns whether this HarvestDefinition represents a snapshot harvest.
253     *
254     * @return false (always)
255     */
256    public boolean isSnapShot() {
257        return false;
258    }
259
260    /**
261     * Always returns no limit.
262     *
263     * @return 0, meaning no limit.
264     */
265    public long getMaxCountObjects() {
266        return Constants.HERITRIX_MAXOBJECTS_INFINITY;
267    }
268
269    /**
270     * Always returns no limit.
271     *
272     * @return -1, meaning no limit.
273     */
274    public long getMaxBytes() {
275        return Constants.HERITRIX_MAXBYTES_INFINITY;
276    }
277
278    /**
279     * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested
280     * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue
281     * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br>
282     * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br>
283     * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive).
284     *
285     * @param seeds a list of the seeds to be added
286     * @param templateName the name of the template to be used
287     * @param maxBytes Maximum number of bytes to harvest per domain
288     * @param maxObjects Maximum number of objects to harvest per domain
289     * @param attributeValues  Attributes read from webpage
290     * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details
291     * @return the list of invalid seeds found during this process.
292     */
293    public Set<String> addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) {
294        ArgumentNotValid.checkNotNull(seeds, "seeds");
295        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
296        if (!TemplateDAO.getInstance().exists(templateName)) {
297            throw new UnknownID("No such template: " + templateName);
298        }
299        Set<String> invalidSeeds = new HashSet<String>();
300        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
301        
302        for (String seed : seeds) {
303            boolean seedValid = processSeed(seed, acceptedSeeds);
304            if (!seedValid) {
305                invalidSeeds.add(seed);
306            }
307        }
308
309        if (invalidSeeds.size() > 0) {
310            log.warn("Found the following invalid seeds:" + StringUtils.join(invalidSeeds, ","));
311        }
312
313        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues);
314        return invalidSeeds;
315    }
316
317    /**
318     * This method is a duplicate of the addSeeds method but for seedsFile parameter
319     *
320     * @param seedsFile a newline-separated File containing the seeds to be added
321     * @param templateName the name of the template to be used
322     * @param maxBytes Maximum number of bytes to harvest per domain
323     * @param maxObjects Maximum number of objects to harvest per domain
324     */
325    public Set<String> addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String,String> attributeValues) {
326        ArgumentNotValid.checkNotNull(seedsFile, "seeds");
327        ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist");
328        ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
329        if (!TemplateDAO.getInstance().exists(templateName)) {
330            throw new UnknownID("No such template: " + templateName);
331        }
332        Set<String> invalidSeeds = new HashSet<String>();
333        Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
334        
335        // validate all the seeds in the file
336        // those accepted are entered into the acceptedSeeds datastructure
337
338        // Iterate through the contents of the file
339        LineIterator seedIterator = null;
340        try {
341            seedIterator = new LineIterator(new FileReader(seedsFile));
342            while (seedIterator.hasNext()) {
343                String seed = seedIterator.next();
344                boolean seedValid = processSeed(seed, acceptedSeeds);
345                if (!seedValid) {
346                    invalidSeeds.add(seed);
347                }
348            }
349        } catch (IOException e) {
350            throw new IOFailure("Unable to process seedsfile ", e);
351        } finally {
352            LineIterator.closeQuietly(seedIterator);
353        }
354        
355        if (invalidSeeds.size() > 0) {
356            log.warn("Found the following invalid seeds:" + StringUtils.join(invalidSeeds, ","));
357        }
358        
359        addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues);
360        return invalidSeeds;
361    }
362
363    /**
364     * Process each seed.
365     *
366     * @param seed The given seed.
367     * @param acceptedSeeds The set of accepted seeds
368     * @return true, if the processed seed is valid or empty.
369     */
370    private boolean processSeed(String seed, Map<String, Set<String>> acceptedSeeds) {
371        seed = seed.trim();
372        if (seed.length() != 0 && !seed.startsWith("#") && !seed.startsWith("//")) { // ignore empty lines and comments
373            
374            if (!(seed.toLowerCase().startsWith("http://") || seed.toLowerCase().startsWith("https://"))) {
375                seed = "http://" + seed;
376            }
377            URL url = null;
378            try {
379                url = new URL(seed);
380            } catch (MalformedURLException e) {
381                return false;
382            }
383            String host = url.getHost();
384            String domainName = DomainUtils.domainNameFromHostname(host);
385            if (domainName == null) {
386                return false;
387            }
388
389            Set<String> seedsForDomain = acceptedSeeds.get(domainName);
390            if (seedsForDomain == null) {
391                seedsForDomain = new HashSet<String>();
392                acceptedSeeds.put(domainName, seedsForDomain);
393            }
394            seedsForDomain.add(seed);
395        }
396        return true;
397    }
398
399    /**
400     * Generate domain configurations for the accepted seeds.
401     *
402     * @param templateName The Heritrix template to be used.
403     * @param maxBytes The number of max bytes allowed
404     * @param maxObjects The number of max objected allowed
405     * @param acceptedSeeds The set of accepted seeds
406     */
407    private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects,
408            Map<String, Set<String>> acceptedSeeds, Map<String, String> attributeValues) {
409        // Generate components for the name for the configuration and seedlist
410        final String maxbytesSuffix = "Bytes";
411        String maxBytesS = "Unlimited" + maxbytesSuffix;
412        if (maxBytes >= 0) {
413            maxBytesS = Long.toString(maxBytes);
414            maxBytesS = maxBytesS + maxbytesSuffix;
415        }
416
417        final String maxobjectsSuffix = "Objects";
418        String maxObjectsS = "Unlimited" + maxobjectsSuffix;
419        if (maxObjects >= 0) {
420            maxObjectsS = Long.toString(maxObjects);
421            maxObjectsS = maxObjectsS + maxobjectsSuffix;
422        }
423
424        String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS;
425
426        Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>();
427        for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) {
428            String domainName = entry.getKey();
429            Domain domain;
430            List<SeedList> seedListList = new ArrayList<SeedList>();
431            SeedList seedlist;
432            // Find or create the domain
433            if (DomainDAO.getInstance().exists(domainName)) {
434                domain = DomainDAO.getInstance().read(domainName);
435                
436                // If a config with this name exists already for the dommain, add a "_" + timestamp to the end of the name to be make it unique.
437                // This will probably happen rarely.
438                // This name is used for both the configuration and corresponding seed
439                if (domain.hasConfiguration(name)) {
440                    String oldName = name;
441                    name = name + "_" + System.currentTimeMillis();
442                    log.info("configuration '{}' for domain '{}' already exists. Change name for config and corresponding seed to ", 
443                            oldName, name, domain.getName());
444                }
445                seedlist =  new SeedList(name, ""); // Assure that the seedname is the same as the configname.
446                seedListList.add(seedlist);
447                domain.addSeedList(seedlist);
448                                
449            } else {
450                seedlist =  new SeedList(name, ""); // Assure that the seedname is the same as the configname.
451                seedListList.add(seedlist);
452                log.info("Creating domain {} in DomainDAO", domainName);
453                domain = Domain.getDefaultDomain(domainName);
454                domain.addSeedList(seedlist);
455                DomainDAO.getInstance().create(domain);
456            }
457            
458            DomainConfiguration dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>());
459            dc.setOrderXmlName(templateName);
460            dc.setMaxBytes(maxBytes);
461            dc.setMaxObjects(maxObjects);
462            domain.addConfiguration(dc);
463            log.info("Adding seeds til new configuration '{}' (id={}) for domain '{}' ", name, dc.getID(), domain.getName());
464
465
466            // Find the SeedList and add this seed to it
467            seedlist = domain.getSeedList(name);
468            List<String> currentSeeds = seedlist.getSeeds();
469            entry.getValue().addAll(currentSeeds);
470
471            List<String> allSeeds = new ArrayList<String>();
472
473            allSeeds.addAll(entry.getValue());
474            domain.updateSeedList(new SeedList(name, allSeeds));
475
476            // Add the configuration to the list of new configs for
477            // this harvest.
478            newDcs.add(dc);
479            DomainDAO.getInstance().update(domain);
480            log.info("Created configuration '{}' for domain {} with ID {}", dc.getName(), dc.getDomainName(), dc.getID());
481            saveAttributes(dc, attributeValues);
482        }
483
484        boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName);
485        if (thisInDAO) { // We have previously created this harvestdefinition in the HarvestDefinitionDAO.
486            HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance();
487            for (DomainConfiguration dc : newDcs) {
488                addConfiguration(dc);
489                hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc));
490            }
491            hddao.update(this);
492        } else { // not yet created in the HarvestDefinitionDAO
493            for (DomainConfiguration dc : newDcs) {
494                addConfiguration(dc);
495            }
496            HarvestDefinitionDAO.getInstance().create(this);
497        }
498    }
499
500    private void saveAttributes(DomainConfiguration dc, Map<String, String> attributeValues) {
501        if (dc.getID() == null) {
502             log.warn("Attributes not saved to database. Id of domainConfiguration not yet available");
503             return;
504        }
505        // EAV
506        try {
507            long entity_id = dc.getID();
508            log.info("Saving attributes for domain config id {} and name {} and domain {}", entity_id, dc.getName(), dc.getDomainName());
509            EAV eav = EAV.getInstance();
510            List<AttributeAndType> attributeTypes = eav.getAttributesAndTypes(EAV.DOMAIN_TREE_ID, (int)entity_id);
511            log.debug("3 attributes available for entity {}", entity_id);
512            AttributeAndType attributeAndType;
513            AttributeTypeBase attributeType;
514            AttributeBase attribute;
515            for (int i=0; i<attributeTypes.size(); ++i) {
516                attributeAndType = attributeTypes.get(i);
517                attributeType = attributeAndType.attributeType;
518                log.debug("Examining attribute {}",attributeType.name);
519                attribute = attributeAndType.attribute;
520                if (attribute == null) {
521                    attribute = attributeType.instanceOf();
522                    attribute.entity_id = (int)entity_id;
523                }
524                switch (attributeType.viewtype) {
525                case 1:
526                    String paramValue = attributeValues.get(attributeType.name);
527                    int intValue;
528                    if (paramValue != null) {
529                      intValue = Integer.decode(paramValue);
530                    } else {
531                      intValue = attributeType.def_int;
532                    }
533                    log.info("Setting attribute {} to value {}", attributeType.name, intValue);
534                    attribute.setInteger(intValue);
535                    break;
536                case 5:
537                case 6:
538                    paramValue = attributeValues.get(attributeType.name);
539                    int intVal = 0;
540                    if (paramValue != null && !"0".equals(paramValue)) {
541                        intVal = 1;
542                    } 
543                    log.debug("Set intVal = 1 for attribute {} when receiving paramValue={}", attributeType.name, paramValue);
544                    attribute.setInteger(intVal);
545                    break;
546                }
547                eav.saveAttribute(attribute);
548            }
549        } catch (SQLException e) {
550            throw new RuntimeException("Unable to store EAV data!", e);
551        }
552    }
553}