001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.wayback.indexer; 024 025import static dk.netarkivet.common.distribute.bitrepository.BitmagUtils.BITREPOSITORY_USEPILLAR; 026 027import java.io.BufferedReader; 028import java.io.IOException; 029import java.io.InputStream; 030import java.io.InputStreamReader; 031import java.util.Date; 032import java.util.Set; 033 034import org.apache.commons.io.IOUtils; 035import org.bitrepository.access.getfileids.GetFileIDsClient; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import dk.netarkivet.common.CommonSettings; 040import dk.netarkivet.common.distribute.RemoteFile; 041import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 042import dk.netarkivet.common.distribute.arcrepository.BatchStatus; 043import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient; 044import dk.netarkivet.common.distribute.bitrepository.BitmagUtils; 045import dk.netarkivet.common.distribute.bitrepository.action.getfileids.GetFileIDsAction; 046import dk.netarkivet.common.exceptions.IOFailure; 047import dk.netarkivet.common.utils.Settings; 048import dk.netarkivet.common.utils.batch.DatedFileListJob; 049import dk.netarkivet.common.utils.batch.FileListJob; 050import dk.netarkivet.wayback.WaybackSettings; 051 052public class FileNameHarvester { 053 054 /** Logger for this class. */ 055 private static final Logger log = LoggerFactory.getLogger(FileNameHarvester.class); 056 057 058 /** 059 * This method harvests a list of all the files currently in the arcrepository and appends any new ones found to the 060 * ArchiveFile object store. 061 */ 062 public static synchronized void harvestAllFilenames() { 063 ArchiveFileDAO dao = new ArchiveFileDAO(); 064 if (Settings.getBoolean(CommonSettings.USE_BITMAG_HADOOP_BACKEND)) { 065 Set<String> fileNames = getFilesFromBitmagSince(new Date(0)); 066 log.info("Harvested {} file(s) from bitmag", fileNames.size()); 067 createFilesInDB(fileNames, dao); 068 } else { 069 PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance(); 070 BatchStatus status = client.batch(new FileListJob(), Settings.get(WaybackSettings.WAYBACK_REPLICA)); 071 getResultFileAndCreateInDB(status, dao); 072 } 073 } 074 075 /** 076 * This method harvests a list of all the recently added files in the archive. 077 */ 078 public static synchronized void harvestRecentFilenames() { 079 ArchiveFileDAO dao = new ArchiveFileDAO(); 080 long timeAgo = Settings.getLong(WaybackSettings.WAYBACK_INDEXER_RECENT_PRODUCER_SINCE); 081 Date sinceDate = new Date(System.currentTimeMillis() - timeAgo); 082 083 if (Settings.getBoolean(CommonSettings.USE_BITMAG_HADOOP_BACKEND)) { 084 Set<String> fileNames = getFilesFromBitmagSince(sinceDate); 085 log.info("Harvested {} recent file(s) from bitmag", fileNames.size()); 086 createFilesInDB(fileNames, dao); 087 } else { 088 PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance(); 089 BatchStatus status = client.batch( 090 new DatedFileListJob(sinceDate), Settings.get(WaybackSettings.WAYBACK_REPLICA)); 091 getResultFileAndCreateInDB(status, dao); 092 } 093 } 094 095 /** 096 * Creates the given filenames in the database if they don't already exist. 097 * If the given set is empty it just logs that there were no files to add to the database. 098 * @param fileNames Files to create 099 * @param dao The DAO through which the database is accessed 100 */ 101 private static void createFilesInDB(Set<String> fileNames, ArchiveFileDAO dao) { 102 if (!fileNames.isEmpty()) { 103 for (String fileName : fileNames) { 104 if (!dao.exists(fileName)) { 105 createArchiveFileInDB(fileName, dao); 106 } 107 } 108 } else { 109 String collectionID = BitmagUtils.getDefaultCollectionID(); 110 log.info("No new files to add in database after harvest of collection '{}'", collectionID); 111 } 112 } 113 114 /** 115 * Performs a get-file-ids action on the used bitmag instance and returns the results in a set. 116 * @param sinceDate A date specifying how far back to fetch files from 117 * @return The resulting set of filenames from the get-file-ids action 118 */ 119 private static Set<String> getFilesFromBitmagSince(Date sinceDate) { 120 String collectionID = BitmagUtils.getDefaultCollectionID(); 121 String usePillar = Settings.get(BITREPOSITORY_USEPILLAR); 122 GetFileIDsClient client = BitmagUtils.getFileIDsClient(); 123 GetFileIDsAction action = new GetFileIDsAction(client, collectionID, usePillar, sinceDate); 124 action.performAction(); 125 return action.getActionResult(); 126 } 127 128 /** 129 * Helper method for handling results from BatchStatus and putting it in the database. 130 * @param status The BatchStatus with results from batch() 131 * @param dao The DAO through which the database is accessed. 132 */ 133 private static void getResultFileAndCreateInDB(BatchStatus status, ArchiveFileDAO dao) { 134 RemoteFile results = status.getResultFile(); 135 InputStream is = results.getInputStream(); 136 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 137 String line; 138 try { 139 while ((line = reader.readLine()) != null) { 140 if (!dao.exists(line.trim())) { 141 createArchiveFileInDB(line, dao); 142 } // If the file is already known in the persistent store, no 143 // action needs to be taken. 144 } 145 } catch (IOException e) { 146 throw new IOFailure("Error reading remote file", e); 147 } finally { 148 IOUtils.closeQuietly(reader); 149 } 150 } 151 152 /** 153 * Helper method to create an ArchiveFile from a given filename and put it in the database. 154 * @param fileName The filename to create. 155 * @param dao The DAO through which the database is accessed. 156 */ 157 private static void createArchiveFileInDB(String fileName, ArchiveFileDAO dao) { 158 ArchiveFile file = new ArchiveFile(); 159 file.setFilename(fileName.trim()); 160 file.setIndexed(false); 161 log.info("Creating object store entry for '{}'", file.getFilename()); 162 dao.create(file); 163 } 164}