package dk.netarkivet.wayback.hadoop;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.wayback.batch.DeduplicationCDXExtractionBatchJob;

/**
 * {@link Indexer} implementation that runs a
 * {@link DeduplicationCDXExtractionBatchJob} over a single file and returns
 * the job's output split into lines.
 */
public class DedupIndexer implements Indexer {

    /**
     * Runs a fresh {@link DeduplicationCDXExtractionBatchJob} on the given file,
     * capturing everything the job writes in an in-memory buffer, and returns
     * that output as a list of lines.
     *
     * @param file the file to hand to the batch job
     * @return the lines written by the batch job, split on {@code '\n'};
     *         an empty list when the job wrote nothing
     * @throws IOException if flushing the in-memory buffer fails
     */
    @Override
    public List<String> indexFile(File file) throws IOException {
        FileBatchJob batchJob = new DeduplicationCDXExtractionBatchJob();
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();

        // Drive the complete batch-job lifecycle against the in-memory buffer.
        batchJob.initialize(buffer);
        batchJob.processFile(file, buffer);
        batchJob.finish(buffer);
        buffer.flush();

        // NOTE(review): decodes with the platform default charset; confirm this
        // matches the encoding the batch job uses when writing its output.
        String output = buffer.toString();

        // "".split(...) yields a single empty string, which would surface as a
        // bogus blank line; report "no output" as an empty list instead.
        if (output.isEmpty()) {
            return Arrays.<String>asList();
        }
        return Arrays.asList(output.split("\\n"));
    }

}