001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.indexer;
024
025import static dk.netarkivet.common.distribute.bitrepository.BitmagUtils.BITREPOSITORY_USEPILLAR;
026
027import java.io.BufferedReader;
028import java.io.IOException;
029import java.io.InputStream;
030import java.io.InputStreamReader;
031import java.util.Date;
032import java.util.Set;
033
034import org.apache.commons.io.IOUtils;
035import org.bitrepository.access.getfileids.GetFileIDsClient;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import dk.netarkivet.common.CommonSettings;
040import dk.netarkivet.common.distribute.RemoteFile;
041import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
042import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
043import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient;
044import dk.netarkivet.common.distribute.bitrepository.BitmagUtils;
045import dk.netarkivet.common.distribute.bitrepository.action.getfileids.GetFileIDsAction;
046import dk.netarkivet.common.exceptions.IOFailure;
047import dk.netarkivet.common.utils.Settings;
048import dk.netarkivet.common.utils.batch.DatedFileListJob;
049import dk.netarkivet.common.utils.batch.FileListJob;
050import dk.netarkivet.wayback.WaybackSettings;
051
052public class FileNameHarvester {
053
054    /** Logger for this class. */
055    private static final Logger log = LoggerFactory.getLogger(FileNameHarvester.class);
056
057
058    /**
059     * This method harvests a list of all the files currently in the arcrepository and appends any new ones found to the
060     * ArchiveFile object store.
061     */
062    public static synchronized void harvestAllFilenames() {
063        ArchiveFileDAO dao = new ArchiveFileDAO();
064        if (Settings.getBoolean(CommonSettings.USE_BITMAG_HADOOP_BACKEND)) {
065            Set<String> fileNames = getFilesFromBitmagSince(new Date(0));
066            log.info("Harvested {} file(s) from bitmag", fileNames.size());
067            createFilesInDB(fileNames, dao);
068        } else {
069            PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance();
070            BatchStatus status = client.batch(new FileListJob(), Settings.get(WaybackSettings.WAYBACK_REPLICA));
071            getResultFileAndCreateInDB(status, dao);
072        }
073    }
074
075    /**
076     * This method harvests a list of all the recently added files in the archive.
077     */
078    public static synchronized void harvestRecentFilenames() {
079        ArchiveFileDAO dao = new ArchiveFileDAO();
080        long timeAgo = Settings.getLong(WaybackSettings.WAYBACK_INDEXER_RECENT_PRODUCER_SINCE);
081        Date sinceDate = new Date(System.currentTimeMillis() - timeAgo);
082
083        if (Settings.getBoolean(CommonSettings.USE_BITMAG_HADOOP_BACKEND)) {
084            Set<String> fileNames = getFilesFromBitmagSince(sinceDate);
085            log.info("Harvested {} recent file(s) from bitmag", fileNames.size());
086            createFilesInDB(fileNames, dao);
087        } else {
088            PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance();
089            BatchStatus status = client.batch(
090                    new DatedFileListJob(sinceDate), Settings.get(WaybackSettings.WAYBACK_REPLICA));
091            getResultFileAndCreateInDB(status, dao);
092        }
093    }
094
095    /**
096     * Creates the given filenames in the database if they don't already exist.
097     * If the given set is empty it just logs that there were no files to add to the database.
098     * @param fileNames Files to create
099     * @param dao The DAO through which the database is accessed
100     */
101    private static void createFilesInDB(Set<String> fileNames, ArchiveFileDAO dao) {
102        if (!fileNames.isEmpty()) {
103            for (String fileName : fileNames) {
104                if (!dao.exists(fileName)) {
105                    createArchiveFileInDB(fileName, dao);
106                }
107            }
108        } else {
109            String collectionID = BitmagUtils.getDefaultCollectionID();
110            log.info("No new files to add in database after harvest of collection '{}'", collectionID);
111        }
112    }
113
114    /**
115     * Performs a get-file-ids action on the used bitmag instance and returns the results in a set.
116     * @param sinceDate A date specifying how far back to fetch files from
117     * @return The resulting set of filenames from the get-file-ids action
118     */
119    private static Set<String> getFilesFromBitmagSince(Date sinceDate) {
120        String collectionID = BitmagUtils.getDefaultCollectionID();
121        String usePillar = Settings.get(BITREPOSITORY_USEPILLAR);
122        GetFileIDsClient client = BitmagUtils.getFileIDsClient();
123        GetFileIDsAction action = new GetFileIDsAction(client, collectionID, usePillar, sinceDate);
124        action.performAction();
125        return action.getActionResult();
126    }
127
128    /**
129     * Helper method for handling results from BatchStatus and putting it in the database.
130     * @param status The BatchStatus with results from batch()
131     * @param dao The DAO through which the database is accessed.
132     */
133    private static void getResultFileAndCreateInDB(BatchStatus status, ArchiveFileDAO dao) {
134        RemoteFile results = status.getResultFile();
135        InputStream is = results.getInputStream();
136        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
137        String line;
138        try {
139            while ((line = reader.readLine()) != null) {
140                if (!dao.exists(line.trim())) {
141                    createArchiveFileInDB(line, dao);
142                } // If the file is already known in the persistent store, no
143                // action needs to be taken.
144            }
145        } catch (IOException e) {
146            throw new IOFailure("Error reading remote file", e);
147        } finally {
148            IOUtils.closeQuietly(reader);
149        }
150    }
151
152    /**
153     * Helper method to create an ArchiveFile from a given filename and put it in the database.
154     * @param fileName The filename to create.
155     * @param dao The DAO through which the database is accessed.
156     */
157    private static void createArchiveFileInDB(String fileName, ArchiveFileDAO dao) {
158        ArchiveFile file = new ArchiveFile();
159        file.setFilename(fileName.trim());
160        file.setIndexed(false);
161        log.info("Creating object store entry for '{}'", file.getFilename());
162        dao.create(file);
163    }
164}