001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileInputStream; 028import java.io.FileNotFoundException; 029import java.io.IOException; 030import java.io.InputStreamReader; 031import java.util.ArrayList; 032import java.util.Date; 033import java.util.List; 034import java.util.Locale; 035 036import javax.servlet.jsp.JspWriter; 037 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import dk.netarkivet.common.exceptions.ArgumentNotValid; 042import dk.netarkivet.common.exceptions.IOFailure; 043import dk.netarkivet.common.utils.DomainUtils; 044import dk.netarkivet.common.utils.I18n; 045import dk.netarkivet.common.utils.StringUtils; 046import dk.netarkivet.harvester.Constants; 047 048/** 049 * Utility class for ingesting new domains into the database. 050 */ 051public class IngestDomainList { 052 053 /** The logger. */ 054 protected static final Logger log = LoggerFactory.getLogger(IngestDomainList.class); 055 056 /** I18n bundle used by this class. */ 057 private static final I18n I18N = new dk.netarkivet.common.utils.I18n(Constants.TRANSLATIONS_BUNDLE); 058 059 /** How often to log progress. */ 060 private static final int PRINT_INTERVAL = 10000; 061 062 /** Connection to the persistent store of Domains. */ 063 private DomainDAO dao; 064 065 /** 066 * Constructor for the IngestDomainList class. It makes a connection to the Domains store. 067 */ 068 public IngestDomainList() { 069 dao = DomainDAO.getInstance(); 070 } 071 072 /** 073 * Adds all new domains from a newline-separated file of domain names. The file is assumed to be in the UTF-8 074 * format. For large files, a line is printed to the log, and to the out variable (if not set to null), every 075 * PRINT_INTERVAL lines. 076 * 077 * @param domainList the file containing the domain names. 078 * @param out a stream to which output can be sent. May be null. 079 * @param theLocale the given Locale 080 */ 081 public void updateDomainInfo(File domainList, JspWriter out, Locale theLocale) { 082 ArgumentNotValid.checkNotNull(domainList, "File domainList"); 083 ArgumentNotValid.checkNotNull(theLocale, "Locale theLocale"); 084 Domain myDomain; 085 String domainName; 086 BufferedReader in = null; 087 int countDomains = 0; 088 List<String> invalidDomains = new ArrayList<String>(); 089 int countCreatedDomains = 0; 090 boolean print = (out != null); 091 try { 092 in = new BufferedReader(new InputStreamReader(new FileInputStream(domainList), "UTF-8")); 093 094 while ((domainName = in.readLine()) != null) { 095 try { 096 countDomains++; 097 if ((countDomains % PRINT_INTERVAL) == 0) { 098 Date d = new Date(); 099 String msg = "Domain #" + countDomains + ": " + domainName + " added at " + d; 100 log.info(msg); 101 if (print) { 102 out.print(I18N.getString(theLocale, "domain.number.0.1.added.at.2", countDomains, 103 domainName, d)); 104 out.print("<br/>"); 105 out.flush(); 106 } 107 } 108 109 if (DomainUtils.isValidDomainName(domainName)) { 110 if (!dao.exists(domainName)) { 111 myDomain = Domain.getDefaultDomain(domainName); 112 dao.create(myDomain); 113 countCreatedDomains++; 114 } 115 } else { 116 log.debug("domain '{}' is not a valid domain Name", domainName); 117 invalidDomains.add(domainName); 118 if (print) { 119 out.print(I18N.getString(theLocale, "errormsg;domain.0.is.not.a.valid" + ".domainname", 120 domainName)); 121 out.print("<br/>"); 122 out.flush(); 123 } 124 } 125 } catch (Exception e) { 126 log.debug("Could not create domain '{}'", domainName, e); 127 if (print) { 128 out.print(I18N.getString(theLocale, "errormsg;unable.to.create" + ".domain.0.due.to.error.1", 129 domainName, e.getMessage())); 130 out.print("<br/>\n"); 131 out.flush(); 132 } 133 } 134 } 135 log.info("Looked at {} domains, created {} new domains and found {} invalid domains", countDomains, countCreatedDomains, invalidDomains.size()); 136 if (!invalidDomains.isEmpty()) { 137 log.warn("Found the following {} invalid domains during ingest", invalidDomains.size(), StringUtils.conjoin(",", invalidDomains)); 138 } 139 } catch (FileNotFoundException e) { 140 String msg = "File '" + domainList.getAbsolutePath() + "' not found"; 141 log.debug(msg); 142 throw new IOFailure(msg, e); 143 } catch (IOException e) { 144 String msg = " Can't read the domain-file '" + domainList.getAbsolutePath() + "'."; 145 log.debug(msg); 146 throw new IOFailure(msg, e); 147 } finally { 148 try { 149 if (in != null) { 150 in.close(); 151 } 152 } catch (IOException e) { 153 throw new IOFailure("Problem closing input stream", e); 154 } 155 } 156 } 157 158}