001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.File; 026import java.io.FileReader; 027import java.io.IOException; 028import java.net.MalformedURLException; 029import java.net.URL; 030import java.sql.SQLException; 031import java.util.ArrayList; 032import java.util.Collection; 033import java.util.Date; 034import java.util.HashMap; 035import java.util.HashSet; 036import java.util.Iterator; 037import java.util.List; 038import java.util.Map; 039import java.util.Set; 040 041import javax.servlet.jsp.PageContext; 042 043import org.apache.commons.io.LineIterator; 044import org.slf4j.Logger; 045import org.slf4j.LoggerFactory; 046 047import com.antiaction.raptor.dao.AttributeBase; 048import com.antiaction.raptor.dao.AttributeTypeBase; 049 050import dk.netarkivet.common.exceptions.ArgumentNotValid; 051import dk.netarkivet.common.exceptions.IOFailure; 052import dk.netarkivet.common.exceptions.UnknownID; 053import dk.netarkivet.common.utils.DomainUtils; 054import dk.netarkivet.common.utils.I18n; 055import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; 056import dk.netarkivet.harvester.datamodel.eav.EAV; 057import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 058import dk.netarkivet.harvester.webinterface.EventHarvestUtil; 059 060/** 061 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest 062 * definitions. I.e. this class models definitions of event and selective harvests. 063 */ 064public class PartialHarvest extends HarvestDefinition { 065 066 private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class); 067 068 /** 069 * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration 070 * name + domain name. 071 */ 072 private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>(); 073 074 /** The schedule used by this PartialHarvest. */ 075 private Schedule schedule; 076 077 /** 078 * The next date this harvest definition should run, null if never again. 079 */ 080 private Date nextDate; 081 082 /** 083 * Create new instance of a PartialHavest configured according to the properties of the supplied 084 * DomainConfiguration. 085 * 086 * @param domainConfigurations a list of domain configurations 087 * @param schedule the harvest definition schedule 088 * @param harvestDefName the name of the harvest definition 089 * @param comments comments 090 * @param audience The intended audience for this harvest (could be null) 091 */ 092 public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName, 093 String comments, String audience) { 094 super(DAOProviderFactory.getExtendedFieldDAOProvider()); 095 ArgumentNotValid.checkNotNull(schedule, "schedule"); 096 ScheduleDAO.getInstance().read(schedule.getName()); 097 098 ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName"); 099 ArgumentNotValid.checkNotNull(comments, "comments"); 100 ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations"); 101 102 this.numEvents = 0; 103 addConfigurations(domainConfigurations); 104 this.schedule = schedule; 105 this.harvestDefName = harvestDefName; 106 this.comments = comments; 107 this.nextDate = schedule.getFirstEvent(new Date()); 108 this.audience = audience; 109 } 110 111 /** 112 * Returns the schedule defined for this harvest definition. 113 * 114 * @return schedule 115 */ 116 public Schedule getSchedule() { 117 return schedule; 118 } 119 120 /** 121 * Set the schedule to be used for this harvestdefinition. 122 * 123 * @param schedule A schedule for when to try harvesting. 124 */ 125 public void setSchedule(Schedule schedule) { 126 ArgumentNotValid.checkNotNull(schedule, "schedule"); 127 this.schedule = schedule; 128 if (nextDate != null) { 129 setNextDate(schedule.getFirstEvent(nextDate)); 130 } 131 } 132 133 /** 134 * Get the next date this harvest definition should be run. 135 * 136 * @return The next date the harvest definition should be run or null, if the harvest definition should never run 137 * again. 138 */ 139 public Date getNextDate() { 140 return nextDate; 141 } 142 143 /** 144 * Set the next date this harvest definition should be run. 145 * 146 * @param nextDate The next date the harvest definition should be run. May be null, meaning never again. 147 */ 148 public void setNextDate(Date nextDate) { 149 this.nextDate = nextDate; 150 } 151 152 /** 153 * Remove domainconfiguration from this partialHarvest. 154 * 155 * @param dcKey domainConfiguration key 156 */ 157 public void removeDomainConfiguration(SparseDomainConfiguration dcKey) { 158 ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey"); 159 if (domainConfigurations.remove(dcKey) == null) { 160 log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this); 161 } 162 } 163 164 /** 165 * Add a new domainconfiguration to this PartialHarvest. 166 * 167 * @param newConfiguration A new DomainConfiguration 168 */ 169 public void addDomainConfiguration(DomainConfiguration newConfiguration) { 170 ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration"); 171 SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration); 172 if (domainConfigurations.containsKey(key)) { 173 log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration, 174 this); 175 } else { 176 domainConfigurations.put(key, newConfiguration); 177 } 178 } 179 180 /** 181 * Returns a List of domain configurations for this harvest definition. 182 * 183 * @return List containing information about the domain configurations 184 */ 185 public Iterator<DomainConfiguration> getDomainConfigurations() { 186 return domainConfigurations.values().iterator(); 187 } 188 189 /** 190 * @return the domainconfigurations as a list 191 */ 192 public Collection<DomainConfiguration> getDomainConfigurationsAsList() { 193 return domainConfigurations.values(); 194 } 195 196 /** 197 * Set the list of configurations that this PartialHarvest uses. 198 * 199 * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use. 200 */ 201 public void setDomainConfigurations(List<DomainConfiguration> configs) { 202 ArgumentNotValid.checkNotNull(configs, "configs"); 203 204 domainConfigurations.clear(); 205 addConfigurations(configs); 206 } 207 208 /** 209 * Add the list of configurations to the configuration associated with this PartialHarvest. 210 * 211 * @param configs a List of configurations 212 */ 213 private void addConfigurations(List<DomainConfiguration> configs) { 214 for (DomainConfiguration dc : configs) { 215 addConfiguration(dc); 216 } 217 } 218 219 /** 220 * Add a configuration to this PartialHarvest. 221 * 222 * @param dc the given configuration 223 */ 224 private void addConfiguration(DomainConfiguration dc) { 225 domainConfigurations.put(new SparseDomainConfiguration(dc), dc); 226 } 227 228 /** 229 * Reset the harvest definition to no harvests and next date being the first possible for the schedule. 230 */ 231 public void reset() { 232 numEvents = 0; 233 nextDate = schedule.getFirstEvent(new Date()); 234 } 235 236 /** 237 * Check if this harvest definition should be run, given the time now. 238 * 239 * @param now The current time 240 * @return true if harvest definition should be run 241 */ 242 public boolean runNow(Date now) { 243 ArgumentNotValid.checkNotNull(now, "now"); 244 if (!getActive()) { 245 return false; // inactive definitions are never run 246 } 247 return nextDate != null && now.compareTo(nextDate) >= 0; 248 } 249 250 /** 251 * Returns whether this HarvestDefinition represents a snapshot harvest. 252 * 253 * @return false (always) 254 */ 255 public boolean isSnapShot() { 256 return false; 257 } 258 259 /** 260 * Always returns no limit. 261 * 262 * @return 0, meaning no limit. 263 */ 264 public long getMaxCountObjects() { 265 return Constants.HERITRIX_MAXOBJECTS_INFINITY; 266 } 267 268 /** 269 * Always returns no limit. 270 * 271 * @return -1, meaning no limit. 272 */ 273 public long getMaxBytes() { 274 return Constants.HERITRIX_MAXBYTES_INFINITY; 275 } 276 277 /** 278 * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested 279 * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue 280 * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br> 281 * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br> 282 * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive). 283 * 284 * @param seeds a list of the seeds to be added 285 * @param templateName the name of the template to be used 286 * @param maxBytes Maximum number of bytes to harvest per domain 287 * @param maxObjects Maximum number of objects to harvest per domain 288 * @param attributeValues Attributes read from webpage 289 * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details 290 */ 291 public void addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { 292 ArgumentNotValid.checkNotNull(seeds, "seeds"); 293 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 294 if (!TemplateDAO.getInstance().exists(templateName)) { 295 throw new UnknownID("No such template: " + templateName); 296 } 297 298 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 299 StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n" 300 + "The following seeds are invalid:\n"); 301 boolean valid = true; 302 // validate: 303 304 for (String seed : seeds) { 305 boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); 306 if (!seedValid) { 307 valid = false; 308 } 309 } 310 311 if (!valid) { 312 throw new ArgumentNotValid(invalidMessage.toString()); 313 } 314 315 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); 316 } 317 318 /** 319 * This method is a duplicate of the addSeeds method but for seedsFile parameter 320 * 321 * @param seedsFile a newline-separated File containing the seeds to be added 322 * @param templateName the name of the template to be used 323 * @param maxBytes Maximum number of bytes to harvest per domain 324 * @param maxObjects Maximum number of objects to harvest per domain 325 */ 326 public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String,String> attributeValues) { 327 ArgumentNotValid.checkNotNull(seedsFile, "seeds"); 328 ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); 329 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 330 if (!TemplateDAO.getInstance().exists(templateName)) { 331 throw new UnknownID("No such template: " + templateName); 332 } 333 334 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 335 StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n" 336 + "The following seeds are invalid:\n"); 337 boolean valid = true; 338 339 // validate all the seeds in the file 340 // those accepted are entered into the acceptedSeeds datastructure 341 342 // Iterate through the contents of the file 343 LineIterator seedIterator = null; 344 try { 345 seedIterator = new LineIterator(new FileReader(seedsFile)); 346 while (seedIterator.hasNext()) { 347 String seed = seedIterator.next(); 348 boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); 349 if (!seedValid) { 350 valid = false; 351 } 352 } 353 } catch (IOException e) { 354 throw new IOFailure("Unable to process seedsfile ", e); 355 } finally { 356 LineIterator.closeQuietly(seedIterator); 357 } 358 359 if (!valid) { 360 throw new ArgumentNotValid(invalidMessage.toString()); 361 } 362 363 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); 364 } 365 366 /** 367 * Process each seed. 368 * 369 * @param seed The given seed. 370 * @param invalidMessage The message builder where the invalid seeds are added. 371 * @param acceptedSeeds The set of accepted seeds 372 * @return true, if the processed seed is valid or empty. 373 */ 374 private boolean processSeed(String seed, StringBuilder invalidMessage, Map<String, Set<String>> acceptedSeeds) { 375 seed = seed.trim(); 376 if (seed.length() != 0 && !seed.startsWith("#") && !seed.startsWith("//")) { // ignore empty lines and comments 377 378 if (!(seed.toLowerCase().startsWith("http://") || seed.toLowerCase().startsWith("https://"))) { 379 seed = "http://" + seed; 380 } 381 URL url = null; 382 try { 383 url = new URL(seed); 384 } catch (MalformedURLException e) { 385 invalidMessage.append(seed); 386 invalidMessage.append('\n'); 387 return false; 388 } 389 String host = url.getHost(); 390 String domainName = DomainUtils.domainNameFromHostname(host); 391 if (domainName == null) { 392 invalidMessage.append(seed); 393 invalidMessage.append('\n'); 394 return false; 395 } 396 397 Set<String> seedsForDomain = acceptedSeeds.get(domainName); 398 if (seedsForDomain == null) { 399 seedsForDomain = new HashSet<String>(); 400 acceptedSeeds.put(domainName, seedsForDomain); 401 } 402 seedsForDomain.add(seed); 403 } 404 return true; 405 } 406 407 /** 408 * Generate domain configurations for the accepted seeds. 409 * 410 * @param templateName The Heritrix template to be used. 411 * @param maxBytes The number of max bytes allowed 412 * @param maxObjects The number of max objected allowed 413 * @param acceptedSeeds The set of accepted seeds 414 */ 415 private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects, 416 Map<String, Set<String>> acceptedSeeds, Map<String, String> attributeValues) { 417 // Generate components for the name for the configuration and seedlist 418 final String maxbytesSuffix = "Bytes"; 419 String maxBytesS = "Unlimited" + maxbytesSuffix; 420 if (maxBytes >= 0) { 421 maxBytesS = Long.toString(maxBytes); 422 maxBytesS = maxBytesS + maxbytesSuffix; 423 } 424 425 final String maxobjectsSuffix = "Objects"; 426 String maxObjectsS = "Unlimited" + maxobjectsSuffix; 427 if (maxObjects >= 0) { 428 maxObjectsS = Long.toString(maxObjects); 429 maxObjectsS = maxObjectsS + maxobjectsSuffix; 430 } 431 432 String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS; 433 434 Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>(); 435 for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) { 436 String domainName = entry.getKey(); 437 Domain domain; 438 List<SeedList> seedListList = new ArrayList<SeedList>(); 439 SeedList seedlist; 440 // Find or create the domain 441 if (DomainDAO.getInstance().exists(domainName)) { 442 domain = DomainDAO.getInstance().read(domainName); 443 444 // If a config with this name exists already for the dommain, add a "_" + timestamp to the end of the name to be make it unique. 445 // This will probably happen rarely. 446 // This name is used for both the configuration and corresponding seed 447 if (domain.hasConfiguration(name)) { 448 String oldName = name; 449 name = name + "_" + System.currentTimeMillis(); 450 log.info("configuration '{}' for domain '{}' already exists. Change name for config and corresponding seed to ", 451 oldName, name, domain.getName()); 452 } 453 seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. 454 seedListList.add(seedlist); 455 domain.addSeedList(seedlist); 456 457 } else { 458 seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. 459 seedListList.add(seedlist); 460 log.info("Creating domain {} in DomainDAO", domainName); 461 domain = Domain.getDefaultDomain(domainName); 462 domain.addSeedList(seedlist); 463 DomainDAO.getInstance().create(domain); 464 } 465 466 DomainConfiguration dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>()); 467 dc.setOrderXmlName(templateName); 468 dc.setMaxBytes(maxBytes); 469 dc.setMaxObjects(maxObjects); 470 domain.addConfiguration(dc); 471 log.info("Adding seeds til new configuration '{}' (id={}) for domain '{}' ", name, dc.getID(), domain.getName()); 472 473 474 // Find the SeedList and add this seed to it 475 seedlist = domain.getSeedList(name); 476 List<String> currentSeeds = seedlist.getSeeds(); 477 entry.getValue().addAll(currentSeeds); 478 479 List<String> allSeeds = new ArrayList<String>(); 480 481 allSeeds.addAll(entry.getValue()); 482 domain.updateSeedList(new SeedList(name, allSeeds)); 483 484 // Add the configuration to the list of new configs for 485 // this harvest. 486 newDcs.add(dc); 487 DomainDAO.getInstance().update(domain); 488 log.info("Created configuration '{}' for domain {} with ID {}", dc.getName(), dc.getDomainName(), dc.getID()); 489 saveAttributes(dc, attributeValues); 490 } 491 492 boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName); 493 if (thisInDAO) { // We have previously created this harvestdefinition in the HarvestDefinitionDAO. 494 HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); 495 for (DomainConfiguration dc : newDcs) { 496 addConfiguration(dc); 497 hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc)); 498 } 499 hddao.update(this); 500 } else { // not yet created in the HarvestDefinitionDAO 501 for (DomainConfiguration dc : newDcs) { 502 addConfiguration(dc); 503 } 504 HarvestDefinitionDAO.getInstance().create(this); 505 } 506 } 507 508 private void saveAttributes(DomainConfiguration dc, Map<String, String> attributeValues) { 509 if (dc.getID() == null) { 510 log.warn("Attributes not saved to database. Id of domainConfiguration not yet available"); 511 return; 512 } 513 // EAV 514 try { 515 long entity_id = dc.getID(); 516 log.info("Saving attributes for domain config id {} and name {} and domain {}", entity_id, dc.getName(), dc.getDomainName()); 517 EAV eav = EAV.getInstance(); 518 List<AttributeAndType> attributeTypes = eav.getAttributesAndTypes(EAV.DOMAIN_TREE_ID, (int)entity_id); 519 log.debug("3 attributes available for entity {}", entity_id); 520 AttributeAndType attributeAndType; 521 AttributeTypeBase attributeType; 522 AttributeBase attribute; 523 for (int i=0; i<attributeTypes.size(); ++i) { 524 attributeAndType = attributeTypes.get(i); 525 attributeType = attributeAndType.attributeType; 526 log.debug("Examining attribute {}",attributeType.name); 527 attribute = attributeAndType.attribute; 528 if (attribute == null) { 529 attribute = attributeType.instanceOf(); 530 attribute.entity_id = (int)entity_id; 531 } 532 switch (attributeType.viewtype) { 533 case 1: 534 String paramValue = attributeValues.get(attributeType.name); 535 int intValue; 536 if (paramValue != null) { 537 intValue = Integer.decode(paramValue); 538 } else { 539 intValue = attributeType.def_int; 540 } 541 log.info("Setting attribute {} to value {}", attributeType.name, intValue); 542 attribute.setInteger(intValue); 543 break; 544 case 5: 545 case 6: 546 paramValue = attributeValues.get(attributeType.name); 547 int intVal = 0; 548 if (paramValue != null && !"0".equals(paramValue)) { 549 intVal = 1; 550 } 551 log.debug("Set intVal = 1 for attribute {} when receiving paramValue={}", attributeType.name, paramValue); 552 attribute.setInteger(intVal); 553 break; 554 } 555 eav.saveAttribute(attribute); 556 } 557 } catch (SQLException e) { 558 throw new RuntimeException("Unable to store EAV data!", e); 559 } 560 } 561}