001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.File; 026import java.io.FileReader; 027import java.io.IOException; 028import java.net.MalformedURLException; 029import java.net.URL; 030import java.util.ArrayList; 031import java.util.Collection; 032import java.util.Date; 033import java.util.HashMap; 034import java.util.HashSet; 035import java.util.Iterator; 036import java.util.List; 037import java.util.Map; 038import java.util.Set; 039 040import javax.servlet.jsp.PageContext; 041 042import org.apache.commons.io.LineIterator; 043import org.slf4j.Logger; 044import org.slf4j.LoggerFactory; 045 046import dk.netarkivet.common.exceptions.ArgumentNotValid; 047import dk.netarkivet.common.exceptions.IOFailure; 048import dk.netarkivet.common.exceptions.UnknownID; 049import dk.netarkivet.common.utils.DomainUtils; 050import dk.netarkivet.common.utils.I18n; 051import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; 052import dk.netarkivet.harvester.webinterface.EventHarvestUtil; 053 054/** 055 * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest 056 * definitions. I.e. this class models definitions of event and selective harvests. 057 */ 058public class PartialHarvest extends HarvestDefinition { 059 060 private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class); 061 062 /** 063 * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration 064 * name + domain name. 065 */ 066 private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>(); 067 068 /** The schedule used by this PartialHarvest. */ 069 private Schedule schedule; 070 071 /** 072 * The next date this harvest definition should run, null if never again. 073 */ 074 private Date nextDate; 075 076 /** 077 * Create new instance of a PartialHavest configured according to the properties of the supplied 078 * DomainConfiguration. 079 * 080 * @param domainConfigurations a list of domain configurations 081 * @param schedule the harvest definition schedule 082 * @param harvestDefName the name of the harvest definition 083 * @param comments comments 084 * @param audience The intended audience for this harvest (could be null) 085 */ 086 public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName, 087 String comments, String audience) { 088 super(DAOProviderFactory.getExtendedFieldDAOProvider()); 089 ArgumentNotValid.checkNotNull(schedule, "schedule"); 090 ScheduleDAO.getInstance().read(schedule.getName()); 091 092 ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName"); 093 ArgumentNotValid.checkNotNull(comments, "comments"); 094 ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations"); 095 096 this.numEvents = 0; 097 addConfigurations(domainConfigurations); 098 this.schedule = schedule; 099 this.harvestDefName = harvestDefName; 100 this.comments = comments; 101 this.nextDate = schedule.getFirstEvent(new Date()); 102 this.audience = audience; 103 } 104 105 /** 106 * Returns the schedule defined for this harvest definition. 107 * 108 * @return schedule 109 */ 110 public Schedule getSchedule() { 111 return schedule; 112 } 113 114 /** 115 * Set the schedule to be used for this harvestdefinition. 116 * 117 * @param schedule A schedule for when to try harvesting. 118 */ 119 public void setSchedule(Schedule schedule) { 120 ArgumentNotValid.checkNotNull(schedule, "schedule"); 121 this.schedule = schedule; 122 if (nextDate != null) { 123 setNextDate(schedule.getFirstEvent(nextDate)); 124 } 125 } 126 127 /** 128 * Get the next date this harvest definition should be run. 129 * 130 * @return The next date the harvest definition should be run or null, if the harvest definition should never run 131 * again. 132 */ 133 public Date getNextDate() { 134 return nextDate; 135 } 136 137 /** 138 * Set the next date this harvest definition should be run. 139 * 140 * @param nextDate The next date the harvest definition should be run. May be null, meaning never again. 141 */ 142 public void setNextDate(Date nextDate) { 143 this.nextDate = nextDate; 144 } 145 146 /** 147 * Remove domainconfiguration from this partialHarvest. 148 * 149 * @param dcKey domainConfiguration key 150 */ 151 public void removeDomainConfiguration(SparseDomainConfiguration dcKey) { 152 ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey"); 153 if (domainConfigurations.remove(dcKey) == null) { 154 log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this); 155 } 156 } 157 158 /** 159 * Add a new domainconfiguration to this PartialHarvest. 160 * 161 * @param newConfiguration A new DomainConfiguration 162 */ 163 public void addDomainConfiguration(DomainConfiguration newConfiguration) { 164 ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration"); 165 SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration); 166 if (domainConfigurations.containsKey(key)) { 167 log.warn("Unable to add domainConfiguration '{}' from {}. Reason: does already exist.", newConfiguration, 168 this); 169 } else { 170 domainConfigurations.put(key, newConfiguration); 171 } 172 } 173 174 /** 175 * Returns a List of domain configurations for this harvest definition. 176 * 177 * @return List containing information about the domain configurations 178 */ 179 public Iterator<DomainConfiguration> getDomainConfigurations() { 180 return domainConfigurations.values().iterator(); 181 } 182 183 /** 184 * @return the domainconfigurations as a list 185 */ 186 public Collection<DomainConfiguration> getDomainConfigurationsAsList() { 187 return domainConfigurations.values(); 188 } 189 190 /** 191 * Set the list of configurations that this PartialHarvest uses. 192 * 193 * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use. 194 */ 195 public void setDomainConfigurations(List<DomainConfiguration> configs) { 196 ArgumentNotValid.checkNotNull(configs, "configs"); 197 198 domainConfigurations.clear(); 199 addConfigurations(configs); 200 } 201 202 /** 203 * Add the list of configurations to the configuration associated with this PartialHarvest. 204 * 205 * @param configs a List of configurations 206 */ 207 private void addConfigurations(List<DomainConfiguration> configs) { 208 for (DomainConfiguration dc : configs) { 209 addConfiguration(dc); 210 } 211 } 212 213 /** 214 * Add a configuration to this PartialHarvest. 215 * 216 * @param dc the given configuration 217 */ 218 private void addConfiguration(DomainConfiguration dc) { 219 domainConfigurations.put(new SparseDomainConfiguration(dc), dc); 220 } 221 222 /** 223 * Reset the harvest definition to no harvests and next date being the first possible for the schedule. 224 */ 225 public void reset() { 226 numEvents = 0; 227 nextDate = schedule.getFirstEvent(new Date()); 228 } 229 230 /** 231 * Check if this harvest definition should be run, given the time now. 232 * 233 * @param now The current time 234 * @return true if harvest definition should be run 235 */ 236 public boolean runNow(Date now) { 237 ArgumentNotValid.checkNotNull(now, "now"); 238 if (!getActive()) { 239 return false; // inactive definitions are never run 240 } 241 return nextDate != null && now.compareTo(nextDate) >= 0; 242 } 243 244 /** 245 * Returns whether this HarvestDefinition represents a snapshot harvest. 246 * 247 * @return false (always) 248 */ 249 public boolean isSnapShot() { 250 return false; 251 } 252 253 /** 254 * Always returns no limit. 255 * 256 * @return 0, meaning no limit. 257 */ 258 public long getMaxCountObjects() { 259 return Constants.HERITRIX_MAXOBJECTS_INFINITY; 260 } 261 262 /** 263 * Always returns no limit. 264 * 265 * @return -1, meaning no limit. 266 */ 267 public long getMaxBytes() { 268 return Constants.HERITRIX_MAXBYTES_INFINITY; 269 } 270 271 /** 272 * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested 273 * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue 274 * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br> 275 * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br> 276 * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive). 277 * 278 * @param seeds a list of the seeds to be added 279 * @param templateName the name of the template to be used 280 * @param maxBytes Maximum number of bytes to harvest per domain 281 * @param maxObjects Maximum number of objects to harvest per domain 282 * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details 283 */ 284 public void addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects) { 285 ArgumentNotValid.checkNotNull(seeds, "seeds"); 286 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 287 if (!TemplateDAO.getInstance().exists(templateName)) { 288 throw new UnknownID("No such template: " + templateName); 289 } 290 291 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 292 StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n" 293 + "The following seeds are invalid:\n"); 294 boolean valid = true; 295 // validate: 296 297 for (String seed : seeds) { 298 boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); 299 if (!seedValid) { 300 valid = false; 301 } 302 } 303 304 if (!valid) { 305 throw new ArgumentNotValid(invalidMessage.toString()); 306 } 307 308 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds); 309 } 310 311 /** 312 * This method is a duplicate of the addSeeds method but for seedsFile parameter 313 * 314 * @param seedsFile a newline-separated File containing the seeds to be added 315 * @param templateName the name of the template to be used 316 * @param maxBytes Maximum number of bytes to harvest per domain 317 * @param maxObjects Maximum number of objects to harvest per domain 318 */ 319 public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects) { 320 ArgumentNotValid.checkNotNull(seedsFile, "seeds"); 321 ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); 322 ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); 323 if (!TemplateDAO.getInstance().exists(templateName)) { 324 throw new UnknownID("No such template: " + templateName); 325 } 326 327 Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); 328 StringBuilder invalidMessage = new StringBuilder("Unable to create an event harvest.\n" 329 + "The following seeds are invalid:\n"); 330 boolean valid = true; 331 332 // validate all the seeds in the file 333 // those accepted are entered into the acceptedSeeds datastructure 334 335 // Iterate through the contents of the file 336 LineIterator seedIterator = null; 337 try { 338 seedIterator = new LineIterator(new FileReader(seedsFile)); 339 while (seedIterator.hasNext()) { 340 String seed = seedIterator.next(); 341 boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); 342 if (!seedValid) { 343 valid = false; 344 } 345 } 346 } catch (IOException e) { 347 throw new IOFailure("Unable to process seedsfile ", e); 348 } finally { 349 LineIterator.closeQuietly(seedIterator); 350 } 351 352 if (!valid) { 353 throw new ArgumentNotValid(invalidMessage.toString()); 354 } 355 356 addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds); 357 } 358 359 /** 360 * Process each seed. 361 * 362 * @param seed The given seed. 363 * @param invalidMessage The message builder where the invalid seeds are added. 364 * @param acceptedSeeds The set of accepted seeds 365 * @return true, if the processed seed is valid or empty. 366 */ 367 private boolean processSeed(String seed, StringBuilder invalidMessage, Map<String, Set<String>> acceptedSeeds) { 368 seed = seed.trim(); 369 if (seed.length() != 0) { 370 if (!(seed.startsWith("http://") || seed.startsWith("https://"))) { 371 seed = "http://" + seed; 372 } 373 URL url = null; 374 try { 375 url = new URL(seed); 376 } catch (MalformedURLException e) { 377 invalidMessage.append(seed); 378 invalidMessage.append('\n'); 379 return false; 380 } 381 String host = url.getHost(); 382 String domainName = DomainUtils.domainNameFromHostname(host); 383 if (domainName == null) { 384 invalidMessage.append(seed); 385 invalidMessage.append('\n'); 386 return false; 387 } 388 389 Set<String> seedsForDomain = acceptedSeeds.get(domainName); 390 if (seedsForDomain == null) { 391 seedsForDomain = new HashSet<String>(); 392 acceptedSeeds.put(domainName, seedsForDomain); 393 } 394 seedsForDomain.add(seed); 395 } 396 return true; 397 } 398 399 /** 400 * Generate domain configurations for the accepted seeds. 401 * 402 * @param templateName The Heritrix template to be used. 403 * @param maxBytes The number of max bytes allowed 404 * @param maxObjects The number of max objected allowed 405 * @param acceptedSeeds The set of accepted seeds 406 */ 407 private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects, 408 Map<String, Set<String>> acceptedSeeds) { 409 // Generate components for the name for the configuration and seedlist 410 final String maxbytesSuffix = "Bytes"; 411 String maxBytesS = "Unlimited" + maxbytesSuffix; 412 if (maxBytes >= 0) { 413 maxBytesS = Long.toString(maxBytes); 414 maxBytesS = maxBytesS + maxbytesSuffix; 415 } 416 417 final String maxobjectsSuffix = "Objects"; 418 String maxObjectsS = "Unlimited" + maxobjectsSuffix; 419 if (maxObjects >= 0) { 420 maxObjectsS = Long.toString(maxObjects); 421 maxObjectsS = maxObjectsS + maxobjectsSuffix; 422 } 423 424 String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS; 425 426 Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>(); 427 for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) { 428 String domainName = entry.getKey(); 429 Domain domain; 430 431 // Need a seedlist to include in the configuration when we 432 // create it. This will be replaced later. 433 SeedList seedlist = new SeedList(name, ""); 434 List<SeedList> seedListList = new ArrayList<SeedList>(); 435 seedListList.add(seedlist); 436 437 // Find or create the domain 438 if (DomainDAO.getInstance().exists(domainName)) { 439 domain = DomainDAO.getInstance().read(domainName); 440 if (!domain.hasSeedList(name)) { 441 domain.addSeedList(seedlist); 442 } 443 } else { 444 domain = Domain.getDefaultDomain(domainName); 445 domain.addSeedList(seedlist); 446 DomainDAO.getInstance().create(domain); 447 } 448 // Find or create the DomainConfiguration 449 DomainConfiguration dc = null; 450 if (domain.hasConfiguration(name)) { 451 dc = domain.getConfiguration(name); 452 } else { 453 dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>()); 454 dc.setOrderXmlName(templateName); 455 456 dc.setMaxBytes(maxBytes); 457 dc.setMaxObjects(maxObjects); 458 domain.addConfiguration(dc); 459 } 460 461 // Find the SeedList and add this seed to it 462 seedlist = domain.getSeedList(name); 463 List<String> currentSeeds = seedlist.getSeeds(); 464 entry.getValue().addAll(currentSeeds); 465 466 List<String> allSeeds = new ArrayList<String>(); 467 468 allSeeds.addAll(entry.getValue()); 469 domain.updateSeedList(new SeedList(name, allSeeds)); 470 471 // Add the configuration to the list of new configs for 472 // this harvest. 473 newDcs.add(dc); 474 DomainDAO.getInstance().update(domain); 475 } 476 477 boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName); 478 if (thisInDAO) { 479 HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); 480 for (DomainConfiguration dc : newDcs) { 481 addConfiguration(dc); 482 hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc)); 483 } 484 hddao.update(this); 485 } else { 486 for (DomainConfiguration dc : newDcs) { 487 addConfiguration(dc); 488 } 489 HarvestDefinitionDAO.getInstance().create(this); 490 } 491 492 } 493 494}