001package dk.netarkivet.wayback.hadoop;
002
003import java.io.ByteArrayOutputStream;
004import java.io.File;
005import java.io.IOException;
006import java.util.Arrays;
007import java.util.List;
008
009import dk.netarkivet.common.utils.batch.FileBatchJob;
010import dk.netarkivet.wayback.batch.DeduplicationCDXExtractionBatchJob;
011
012public class DedupIndexer implements Indexer {
013
014    @Override public List<String> indexFile(File file) throws IOException {
015        FileBatchJob fileBatchJob = new DeduplicationCDXExtractionBatchJob();
016        ByteArrayOutputStream baos = new ByteArrayOutputStream();
017        fileBatchJob.initialize(baos);
018        fileBatchJob.processFile(file, baos);
019        fileBatchJob.finish(baos);
020        baos.flush();
021        return Arrays.asList(baos.toString().split("\\n"));
022    }
023
024}