001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.harvesting; 025 026import org.archive.crawler.datamodel.CrawlURI; 027import org.archive.crawler.framework.Processor; 028 029/** 030 * A post processor that adds an annotation content-size:<bytes> for each successfully harvested URI. 031 */ 032@SuppressWarnings({"serial"}) 033public class ContentSizeAnnotationPostProcessor extends Processor { 034 035 /** Prefix associated with annotations made by this processor. */ 036 public static final String CONTENT_SIZE_ANNOTATION_PREFIX = "content-size:"; 037 038 /** 039 * Constructor. 040 * 041 * @param name the name of the processor. 042 * @see Processor 043 */ 044 public ContentSizeAnnotationPostProcessor(String name) { 045 super(name, "A post processor that adds an annotation content-size:<bytes> for each successfully harvested" 046 + " URI."); 047 } 048 049 /** 050 * For each URI with a successful status code (status code > 0), add annotation with content size. 051 * 052 * @param crawlURI URI to add annotation for if successful. 053 * @throws ArgumentNotValid if crawlURI is null. 054 * @throws InterruptedException never. 055 * @see Processor#innerProcess(org.archive.crawler.datamodel.CrawlURI) 056 */ 057 protected void innerProcess(CrawlURI crawlURI) throws InterruptedException { 058 if (crawlURI == null) { 059 throw new IllegalArgumentException("The value of the variable 'CrawlURI crawlURI' must not be null."); 060 } 061 if (crawlURI.getFetchStatus() > 0) { 062 crawlURI.addAnnotation(CONTENT_SIZE_ANNOTATION_PREFIX + crawlURI.getContentSize()); 063 } 064 } 065 066}