001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.File; 026import java.io.FileReader; 027import java.io.IOException; 028import java.net.MalformedURLException; 029import java.net.URL; 030import java.sql.SQLException; 031import java.util.ArrayList; 032import java.util.Collection; 033import java.util.Date; 034import java.util.HashMap; 035import java.util.HashSet; 036import java.util.Iterator; 037import java.util.List; 038import java.util.Map; 039import java.util.Set; 040 041import javax.servlet.jsp.PageContext; 042 043import org.apache.commons.io.LineIterator; 044import org.apache.commons.lang.StringUtils; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047 048import com.antiaction.raptor.dao.AttributeBase; 049import com.antiaction.raptor.dao.AttributeTypeBase; 050 051import dk.netarkivet.common.exceptions.ArgumentNotValid; 052import dk.netarkivet.common.exceptions.IOFailure; 053import dk.netarkivet.common.exceptions.UnknownID; 054import dk.netarkivet.common.utils.DomainUtils; 055import dk.netarkivet.common.utils.I18n; 056import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; 057import dk.netarkivet.harvester.datamodel.eav.EAV; 058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 059import dk.netarkivet.harvester.webinterface.EventHarvestUtil; 060 061/** 062 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest 063 * definitions. I.e. this class models definitions of event and selective harvests. 064 */ 065public class PartialHarvest extends HarvestDefinition { 066 067 private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class); 068 069 /** 070 * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration 071 * name + domain name. 072 */ 073 private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>(); 074 075 /** The schedule used by this PartialHarvest. */ 076 private Schedule schedule; 077 078 /** 079 * The next date this harvest definition should run, null if never again. 080 */ 081 private Date nextDate; 082 083 /** 084 * Create new instance of a PartialHavest configured according to the properties of the supplied 085 * DomainConfiguration. 086 * 087 * @param domainConfigurations a list of domain configurations 088 * @param schedule the harvest definition schedule 089 * @param harvestDefName the name of the harvest definition 090 * @param comments comments 091 * @param audience The intended audience for this harvest (could be null) 092 */ 093 public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName, 094 String comments, String audience) { 095 super(DAOProviderFactory.getExtendedFieldDAOProvider()); 096 ArgumentNotValid.checkNotNull(schedule, "schedule"); 097 ScheduleDAO.getInstance().read(schedule.getName()); 098 099 ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName"); 100 ArgumentNotValid.checkNotNull(comments, "comments"); 101 ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations"); 102 103 this.numEvents = 0; 104 addConfigurations(domainConfigurations); 105 this.schedule = schedule; 106 this.harvestDefName = harvestDefName; 107 this.comments = comments; 108 this.nextDate = schedule.getFirstEvent(new Date()); 109 this.audience = audience; 110 } 111 112 /** 113 * Returns the schedule defined for this harvest definition. 114 * 115 * @return schedule 116 */ 117 public Schedule getSchedule() { 118 return schedule; 119 } 120 121 /** 122 * Set the schedule to be used for this harvestdefinition. 123 * 124 * @param schedule A schedule for when to try harvesting. 125 */ 126 public void setSchedule(Schedule schedule) { 127 ArgumentNotValid.checkNotNull(schedule, "schedule"); 128 this.schedule = schedule; 129 if (nextDate != null) { 130 setNextDate(schedule.getFirstEvent(nextDate)); 131 } 132 } 133 134 /** 135 * Get the next date this harvest definition should be run. 136 * 137 * @return The next date the harvest definition should be run or null, if the harvest definition should never run 138 * again. 139 */ 140 public Date getNextDate() { 141 return nextDate; 142 } 143 144 /** 145 * Set the next date this harvest definition should be run. 146 * 147 * @param nextDate The next date the harvest definition should be run. May be null, meaning never again. 148 */ 149 public void setNextDate(Date nextDate) { 150 this.nextDate = nextDate; 151 } 152 153 /** 154 * Remove domainconfiguration from this partialHarvest. 155 * 156 * @param dcKey domainConfiguration key 157 */ 158 public void removeDomainConfiguration(SparseDomainConfiguration dcKey) { 159 ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey"); 160 if (domainConfigurations.remove(dcKey) == null) { 161 log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this); 162 } 163 } 164 165 /** 166 * Add a new domainconfiguration to this PartialHarvest. 167 * 168 * @param newConfiguration A new DomainConfiguration 169 */ 170 public void addDomainConfiguration(DomainConfiguration newConfiguration) { 171 ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration"); 172 SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration); 173 if (domainConfigurations.containsKey(key)) { 174 log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration, 175 this); 176 } else { 177 domainConfigurations.put(key, newConfiguration); 178 } 179 } 180 181 /** 182 * Returns a List of domain configurations for this harvest definition. 183 * 184 * @return List containing information about the domain configurations 185 */ 186 public Iterator<DomainConfiguration> getDomainConfigurations() { 187 return domainConfigurations.values().iterator(); 188 } 189 190 /** 191 * @return the domainconfigurations as a list 192 */ 193 public Collection<DomainConfiguration> getDomainConfigurationsAsList() { 194 return domainConfigurations.values(); 195 } 196 197 /** 198 * Set the list of configurations that this PartialHarvest uses. 199 * 200 * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use. 201 */ 202 public void setDomainConfigurations(List<DomainConfiguration> configs) { 203 ArgumentNotValid.checkNotNull(configs, "configs"); 204 205 domainConfigurations.clear(); 206 addConfigurations(configs); 207 } 208 209 /** 210 * Add the list of configurations to the configuration associated with this PartialHarvest. 211 * 212 * @param configs a List of configurations 213 */ 214 private void addConfigurations(List<DomainConfiguration> configs) { 215 for (DomainConfiguration dc : configs) { 216 addConfiguration(dc); 217 } 218 } 219 220 /** 221 * Add a configuration to this PartialHarvest. 222 * 223 * @param dc the given configuration 224 */ 225 private void addConfiguration(DomainConfiguration dc) { 226 domainConfigurations.put(new SparseDomainConfiguration(dc), dc); 227 } 228 229 /** 230 * Reset the harvest definition to no harvests and next date being the first possible for the schedule. 231 */ 232 public void reset() { 233 numEvents = 0; 234 nextDate = schedule.getFirstEvent(new Date()); 235 } 236 237 /** 238 * Check if this harvest definition should be run, given the time now. 239 * 240 * @param now The current time 241 * @return true if harvest definition should be run 242 */ 243 public boolean runNow(Date now) { 244 ArgumentNotValid.checkNotNull(now, "now"); 245 if (!getActive()) { 246 return false; // inactive definitions are never run 247 } 248 return nextDate != null && now.compareTo(nextDate) >= 0; 249 } 250 251 /** 252 * Returns whether this HarvestDefinition represents a snapshot harvest. 253 * 254 * @return false (always) 255 */ 256 public boolean isSnapShot() { 257 return false; 258 } 259 260 /** 261 * Always returns no limit. 262 * 263 * @return 0, meaning no limit. 264 */ 265 public long getMaxCountObjects() { 266 return Constants.HERITRIX_MAXOBJECTS_INFINITY; 267 } 268 269 /** 270 * Always returns no limit. 271 * 272 * @return -1, meaning no limit. 273 */ 274 public long getMaxBytes() { 275 return Constants.HERITRIX_MAXBYTES_INFINITY; 276 } 277 278 /** 279 * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested 280 * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue 281 * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br> 282 * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br> 283 * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive). 284 * 285 * @param seeds a list of the seeds to be added 286 * @param templateName the name of the template to be used 287 * @param maxBytes Maximum number of bytes to harvest per domain 288 * @param maxObjects Maximum number of objects to harvest per domain 289 * @param attributeValues Attributes read from webpage 290 * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details 291 * @return the list of invalid seeds found during this process. 292 */ 293 public Set<String> addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { 294 ArgumentNotValid.checkNotNull(seeds, "seeds"); 295 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 296 if (!TemplateDAO.getInstance().exists(templateName)) { 297 throw new UnknownID("No such template: " + templateName); 298 } 299 Set<String> invalidSeeds = new HashSet<String>(); 300 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 301 302 for (String seed : seeds) { 303 boolean seedValid = processSeed(seed, acceptedSeeds); 304 if (!seedValid) { 305 invalidSeeds.add(seed); 306 } 307 } 308 309 if (invalidSeeds.size() > 0) { 310 log.warn("Found the following invalid seeds:" + StringUtils.join(invalidSeeds, ",")); 311 } 312 313 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); 314 return invalidSeeds; 315 } 316 317 /** 318 * This method is a duplicate of the addSeeds method but for seedsFile parameter 319 * 320 * @param seedsFile a newline-separated File containing the seeds to be added 321 * @param templateName the name of the template to be used 322 * @param maxBytes Maximum number of bytes to harvest per domain 323 * @param maxObjects Maximum number of objects to harvest per domain 324 */ 325 public Set<String> addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String,String> attributeValues) { 326 ArgumentNotValid.checkNotNull(seedsFile, "seeds"); 327 ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); 328 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 329 if (!TemplateDAO.getInstance().exists(templateName)) { 330 throw new UnknownID("No such template: " + templateName); 331 } 332 Set<String> invalidSeeds = new HashSet<String>(); 333 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 334 335 // validate all the seeds in the file 336 // those accepted are entered into the acceptedSeeds datastructure 337 338 // Iterate through the contents of the file 339 LineIterator seedIterator = null; 340 try { 341 seedIterator = new LineIterator(new FileReader(seedsFile)); 342 while (seedIterator.hasNext()) { 343 String seed = seedIterator.next(); 344 boolean seedValid = processSeed(seed, acceptedSeeds); 345 if (!seedValid) { 346 invalidSeeds.add(seed); 347 } 348 } 349 } catch (IOException e) { 350 throw new IOFailure("Unable to process seedsfile ", e); 351 } finally { 352 LineIterator.closeQuietly(seedIterator); 353 } 354 355 if (invalidSeeds.size() > 0) { 356 log.warn("Found the following invalid seeds:" + StringUtils.join(invalidSeeds, ",")); 357 } 358 359 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); 360 return invalidSeeds; 361 } 362 363 /** 364 * Process each seed. 365 * 366 * @param seed The given seed. 367 * @param acceptedSeeds The set of accepted seeds 368 * @return true, if the processed seed is valid or empty. 369 */ 370 private boolean processSeed(String seed, Map<String, Set<String>> acceptedSeeds) { 371 seed = seed.trim(); 372 if (seed.length() != 0 && !seed.startsWith("#") && !seed.startsWith("//")) { // ignore empty lines and comments 373 374 if (!(seed.toLowerCase().startsWith("http://") || seed.toLowerCase().startsWith("https://"))) { 375 seed = "http://" + seed; 376 } 377 URL url = null; 378 try { 379 url = new URL(seed); 380 } catch (MalformedURLException e) { 381 return false; 382 } 383 String host = url.getHost(); 384 String domainName = DomainUtils.domainNameFromHostname(host); 385 if (domainName == null) { 386 return false; 387 } 388 389 Set<String> seedsForDomain = acceptedSeeds.get(domainName); 390 if (seedsForDomain == null) { 391 seedsForDomain = new HashSet<String>(); 392 acceptedSeeds.put(domainName, seedsForDomain); 393 } 394 seedsForDomain.add(seed); 395 } 396 return true; 397 } 398 399 /** 400 * Generate domain configurations for the accepted seeds. 401 * 402 * @param templateName The Heritrix template to be used. 403 * @param maxBytes The number of max bytes allowed 404 * @param maxObjects The number of max objected allowed 405 * @param acceptedSeeds The set of accepted seeds 406 */ 407 private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects, 408 Map<String, Set<String>> acceptedSeeds, Map<String, String> attributeValues) { 409 // Generate components for the name for the configuration and seedlist 410 final String maxbytesSuffix = "Bytes"; 411 String maxBytesS = "Unlimited" + maxbytesSuffix; 412 if (maxBytes >= 0) { 413 maxBytesS = Long.toString(maxBytes); 414 maxBytesS = maxBytesS + maxbytesSuffix; 415 } 416 417 final String maxobjectsSuffix = "Objects"; 418 String maxObjectsS = "Unlimited" + maxobjectsSuffix; 419 if (maxObjects >= 0) { 420 maxObjectsS = Long.toString(maxObjects); 421 maxObjectsS = maxObjectsS + maxobjectsSuffix; 422 } 423 424 String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS; 425 426 Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>(); 427 for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) { 428 String domainName = entry.getKey(); 429 Domain domain; 430 List<SeedList> seedListList = new ArrayList<SeedList>(); 431 SeedList seedlist; 432 // Find or create the domain 433 if (DomainDAO.getInstance().exists(domainName)) { 434 domain = DomainDAO.getInstance().read(domainName); 435 436 // If a config with this name exists already for the dommain, add a "_" + timestamp to the end of the name to be make it unique. 437 // This will probably happen rarely. 438 // This name is used for both the configuration and corresponding seed 439 if (domain.hasConfiguration(name)) { 440 String oldName = name; 441 name = name + "_" + System.currentTimeMillis(); 442 log.info("configuration '{}' for domain '{}' already exists. Change name for config and corresponding seed to ", 443 oldName, name, domain.getName()); 444 } 445 seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. 446 seedListList.add(seedlist); 447 domain.addSeedList(seedlist); 448 449 } else { 450 seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. 451 seedListList.add(seedlist); 452 log.info("Creating domain {} in DomainDAO", domainName); 453 domain = Domain.getDefaultDomain(domainName); 454 domain.addSeedList(seedlist); 455 DomainDAO.getInstance().create(domain); 456 } 457 458 DomainConfiguration dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>()); 459 dc.setOrderXmlName(templateName); 460 dc.setMaxBytes(maxBytes); 461 dc.setMaxObjects(maxObjects); 462 domain.addConfiguration(dc); 463 log.info("Adding seeds til new configuration '{}' (id={}) for domain '{}' ", name, dc.getID(), domain.getName()); 464 465 466 // Find the SeedList and add this seed to it 467 seedlist = domain.getSeedList(name); 468 List<String> currentSeeds = seedlist.getSeeds(); 469 entry.getValue().addAll(currentSeeds); 470 471 List<String> allSeeds = new ArrayList<String>(); 472 473 allSeeds.addAll(entry.getValue()); 474 domain.updateSeedList(new SeedList(name, allSeeds)); 475 476 // Add the configuration to the list of new configs for 477 // this harvest. 478 newDcs.add(dc); 479 DomainDAO.getInstance().update(domain); 480 log.info("Created configuration '{}' for domain {} with ID {}", dc.getName(), dc.getDomainName(), dc.getID()); 481 saveAttributes(dc, attributeValues); 482 } 483 484 boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName); 485 if (thisInDAO) { // We have previously created this harvestdefinition in the HarvestDefinitionDAO. 486 HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); 487 for (DomainConfiguration dc : newDcs) { 488 addConfiguration(dc); 489 hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc)); 490 } 491 hddao.update(this); 492 } else { // not yet created in the HarvestDefinitionDAO 493 for (DomainConfiguration dc : newDcs) { 494 addConfiguration(dc); 495 } 496 HarvestDefinitionDAO.getInstance().create(this); 497 } 498 } 499 500 private void saveAttributes(DomainConfiguration dc, Map<String, String> attributeValues) { 501 if (dc.getID() == null) { 502 log.warn("Attributes not saved to database. Id of domainConfiguration not yet available"); 503 return; 504 } 505 // EAV 506 try { 507 long entity_id = dc.getID(); 508 log.info("Saving attributes for domain config id {} and name {} and domain {}", entity_id, dc.getName(), dc.getDomainName()); 509 EAV eav = EAV.getInstance(); 510 List<AttributeAndType> attributeTypes = eav.getAttributesAndTypes(EAV.DOMAIN_TREE_ID, (int)entity_id); 511 log.debug("3 attributes available for entity {}", entity_id); 512 AttributeAndType attributeAndType; 513 AttributeTypeBase attributeType; 514 AttributeBase attribute; 515 for (int i=0; i<attributeTypes.size(); ++i) { 516 attributeAndType = attributeTypes.get(i); 517 attributeType = attributeAndType.attributeType; 518 log.debug("Examining attribute {}",attributeType.name); 519 attribute = attributeAndType.attribute; 520 if (attribute == null) { 521 attribute = attributeType.instanceOf(); 522 attribute.entity_id = (int)entity_id; 523 } 524 switch (attributeType.viewtype) { 525 case 1: 526 String paramValue = attributeValues.get(attributeType.name); 527 int intValue; 528 if (paramValue != null) { 529 intValue = Integer.decode(paramValue); 530 } else { 531 intValue = attributeType.def_int; 532 } 533 log.info("Setting attribute {} to value {}", attributeType.name, intValue); 534 attribute.setInteger(intValue); 535 break; 536 case 5: 537 case 6: 538 paramValue = attributeValues.get(attributeType.name); 539 int intVal = 0; 540 if (paramValue != null && !"0".equals(paramValue)) { 541 intVal = 1; 542 } 543 log.debug("Set intVal = 1 for attribute {} when receiving paramValue={}", attributeType.name, paramValue); 544 attribute.setInteger(intVal); 545 break; 546 } 547 eav.saveAttribute(attribute); 548 } 549 } catch (SQLException e) { 550 throw new RuntimeException("Unable to store EAV data!", e); 551 } 552 } 553}