# This Properties map is specified in the Java 'property list' text format # http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 ### ### some of these overrides are actually just the default values, so they can be skipped ### metadata.jobName=default_orderxml metadata.description=Default Profile metadata.operator=Admin metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@) ## Edit the following two lines to match your setup. metadata.operatorContactUrl=http://netarkivet.dk/webcrawler/ metadata.operatorFrom=info@netarkivet.dk ## Replace YOUR_ORGANIZATION with the name of your organization metadata.organization=YOUR_ORGANIZATION ## This field is not available in the CrawlMetadata class bundled with heritrix ## So we extended the class to add this field. metadata.date=20080118111217 ## Select robots policy here (the default seems to be obey) metadata.robotsPolicyName=%{HONOR_ROBOTS_DOT_TXT} crawlLimiter.maxBytesDownload=0 crawlLimiter.maxDocumentsDownload=0 ## maxTimeSeconds inserted by NetarchiveSuite (delete this line if the behaviour is unwanted) crawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER} crawlController.maxToeThreads=50 crawlController.recorderOutBufferBytes=4096 crawlController.recorderInBufferBytes=65536 crawlController.pauseAtStart=false crawlController.runWhileEmpty=false crawlController.scratchDir=scratch ## org.archive.bdb.BdbModule overrides bdb.dir=state bdb.cachePercent=40 ## seeds properties ## no source-report.txt if this is false seeds.sourceTagSeeds=true ## Override properties for org.archive.modules.deciderules.TooManyHopsDecideRule scope.rules[2].maxHops=2 ## Override properties for org.archive.modules.deciderules.TransclusionDecideRule scope.rules[3].maxTransHops=5 scope.rules[3].maxSpeculativeHops=1 ## Override properties for org.archive.modules.deciderules.PathologicalPathDecideRule scope.rules[6].maxRepetitions=3 ## Politeness overrides disposition.delayFactor=1.0 
disposition.maxDelayMs=1000 disposition.minDelayMs=300 disposition.maxPerHostBandwidthUsageKbSec=500 preparer.preferenceEmbedHops=1 preparer.preferenceDepthHops=-1 ## Frontier settings frontier.maxRetries=3 frontier.retryDelaySeconds=300 frontier.recoveryLogEnabled=false frontier.balanceReplenishAmount=3000 frontier.errorPenaltyAmount=100 frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER} frontier.snoozeLongMs=300000 frontier.extract404s=false frontier.extractIndependently=false preselector.enabled=true preselector.logToFile=false preselector.recheckScope=true preselector.blockAll=false preconditions.enabled=true preconditions.ipValidityDurationSeconds=21600 preconditions.robotsValidityDurationSeconds=86400 preconditions.calculateRobotsOnly=false fetchDns.enabled=true fetchDns.acceptNonDnsResolves=false fetchDns.digestContent=true fetchDns.digestAlgorithm=sha1 fetchHttp.enabled=true fetchHttp.timeoutSeconds=1200 #fetchHttp.soTimeoutMs=20000 fetchHttp.soTimeoutMs=120000 fetchHttp.maxFetchKBSec=0 fetchHttp.maxLengthBytes=0 fetchHttp.ignoreCookies=false fetchHttp.sslTrustLevel=OPEN #fetchHttp.defaultEncoding=ISO-8859-1 fetchHttp.defaultEncoding=UTF-8 fetchHttp.digestContent=true fetchHttp.digestAlgorithm=sha1 fetchHttp.sendIfModifiedSince=true fetchHttp.sendIfNoneMatch=true fetchHttp.sendConnectionClose=true fetchHttp.sendReferer=true fetchHttp.sendRange=false ## Accept headers for HTTP fetching fetchHttp.acceptHeaders[0]=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 extractorHttp.enabled=true extractorHtml.enabled=true extractorHtml.extractJavascript=%{EXTRACT_JAVASCRIPT} extractorHtml.treatFramesAsEmbedLinks=false extractorHtml.ignoreFormActionUrls=true extractorHtml.extractValueAttributes=false extractorHtml.ignoreUnexpectedHtml=true extractorCss.enabled=true extractorJs.enabled=true extractorSwf.enabled=true # allow redirected seeds to be accepted as seeds # In H1, this property belonged to the LinkScoper object, in H3, 
it is part of the CandidatesProcessor object candidates.seedsRedirectNewSeeds=true statisticsTracker.intervalSeconds=20 ## Quotaenforcing quotaenforcer.groupMaxFetchSuccesses=%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER} quotaenforcer.groupMaxAllKb=%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER} ## sample overrides of the warcwriter warcWriter.template=${prefix}-${timestamp17}-${serialno}-ciblee_2015_${heritrix.hostname} warcWriter.writeRequests=false warcWriter.writeMetadata=false warcWriter.poolMaxActive=3 loggerModule.path=logs .*core\.UserAdmin.*core\.UserLogin.* .*core\.UserAdmin.*register\.UserSelfRegistration.* .*\/w\/index\.php\?title=Speci[ae]l:Recentchanges.* .*act=calendar&cal_id=.* .*advCalendar_pi.* .*cal\.asp\?date=.* .*cal\.asp\?view=monthly&date=.* .*cal\.asp\?view=weekly&date=.* .*cal\.asp\?view=yearly&date=.* .*cal\.asp\?view=yearly&year=.* .*cal\/cal_day\.php\?op=day&date=.* .*cal\/cal_week\.php\?op=week&date=.* .*cal\/calendar\.php\?op=cal&month=.* .*cal\/yearcal\.php\?op=yearcal&ycyear=.* .*calendar\.asp\?calmonth=.* .*calendar\.asp\?qMonth=.* .*calendar\.php\?sid=.* .*calendar\.php\?start=.* .*calendar\.php\?Y=.* .*calendar\/\?CLmDemo_horizontal=.* .*calendar_menu\/calendar\.php\?.* .*calendar_scheduler\.php\?d=.* .*calendar_year\.asp\?qYear=.* .*calendarix\/calendar\.php\?op=.* .*calendarix\/yearcal\.php\?op=.* .*calender\/default\.asp\?month=.* .*Default\.asp\?month=.* .*events\.asp\?cat=0&mDate=.* .*events\.asp\?cat=1&mDate=.* .*events\.asp\?MONTH=.* .*events\.asp\?month=.* .*index\.php\?iDate=.* .*index\.php\?module=PostCalendar&func=view.* .*index\.php\?option=com_events&task=view.* .*index\.php\?option=com_events&task=view_day&year=.* .*index\.php\?option=com_events&task=view_detail&year=.* .*index\.php\?option=com_events&task=view_month&year=.* .*index\.php\?option=com_events&task=view_week&year=.* .*index\.php\?option=com_events&task=view_year&year=.* .*index\.php\?option=com_extcalendar&Itemid.* 
.*modules\.php\?name=Calendar&op=modload&file=index.* .*modules\.php\?name=vwar&file=calendar&action=list&month=.* .*modules\.php\?name=vwar&file=calendar.* .*modules\.php\?name=vWar&mod=calendar.* .*modules\/piCal\/index\.php\?caldate=.* .*modules\/piCal\/index\.php\?cid=.* .*option,com_events\/task,view_day\/year.* .*option,com_events\/task,view_month\/year.* .*option,com_extcalendar\/Itemid.* .*task,view_month\/year.* .*shopping_cart\.php.* .*action.add_product.* .*action.remove_product.* .*action.buy_now.* .*checkout_payment\.php.* .*login.*login.*login.*login.* .*homepage_calendar\.asp.* .*MediaWiki.*Movearticle.* .*index\.php.*action=edit.* .*comcast\.net.*othastar.* .*Login.*Login.*Login.* .*redir.*redir.*redir.* .*bookingsystemtime\.asp\?dato=.* .*bookingsystem\.asp\?date=.* .*cart\.asp\?mode=add.* .*\/photo.*\/photo.*\/photo.* .*\/skins.*\/skins.*\/skins.* .*\/scripts.*\/scripts.*\/scripts.* .*\/styles.*\/styles.*\/styles.* .*\/coppermine\/login\.php\?referer=.* .*\/images.*\/images.*\/images.* .*\/stories.*\/stories.*\/stories.* %{CRAWLERTRAPS_PLACEHOLDER} %{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER} %{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}