# This Properties map is specified in the Java 'property list' text format
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
###
### Some of these overrides are actually just the default values, so they can be skipped
###
metadata.jobName=default_orderxml
metadata.description=Default Profile
metadata.operator=Admin
metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@)
## Edit the two following lines to match your setup.
metadata.operatorContactUrl=http://netarkivet.dk/webcrawler/
metadata.operatorFrom=info@netarkivet.dk
## Replace YOUR_ORGANIZATION with the name of your organization
metadata.organization=YOUR_ORGANIZATION
## This field is not available in the CrawlMetadata class bundled with Heritrix,
## so we extended the class to add this field.
metadata.date=20080118111217
## Select the robots policy here (one of the supported policy names; the default seems to be obey)
metadata.robotsPolicyName=%{HONOR_ROBOTS_DOT_TXT}
crawlLimiter.maxBytesDownload=0
crawlLimiter.maxDocumentsDownload=0
## maxTimeSeconds is inserted by NetarchiveSuite (delete the line if this behaviour is unwanted)
crawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER}
crawlController.maxToeThreads=50
crawlController.recorderOutBufferBytes=4096
crawlController.recorderInBufferBytes=65536
crawlController.pauseAtStart=false
crawlController.runWhileEmpty=false
crawlController.scratchDir=scratch
## org.archive.bdb.BdbModule overrides
bdb.dir=state
bdb.cachePercent=40
## seeds properties
## No source-report.txt is written if this is false
seeds.sourceTagSeeds=true
## Override properties for org.archive.modules.deciderules.TooManyHopsDecideRule
scope.rules[2].maxHops=%{MAX_HOPS}
## Override properties for org.archive.modules.deciderules.TransclusionDecideRule
scope.rules[3].maxTransHops=5
scope.rules[3].maxSpeculativeHops=1
## Override properties for org.archive.modules.deciderules.PathologicalPathDecideRule
scope.rules[6].maxRepetitions=3
## Politeness overrides
disposition.delayFactor=1.0
disposition.maxDelayMs=1000
disposition.minDelayMs=300
disposition.maxPerHostBandwidthUsageKbSec=500
preparer.preferenceEmbedHops=1
preparer.preferenceDepthHops=-1
## Frontier settings
frontier.maxRetries=3
frontier.retryDelaySeconds=300
frontier.recoveryLogEnabled=false
frontier.balanceReplenishAmount=3000
frontier.errorPenaltyAmount=100
frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}
frontier.snoozeLongMs=300000
frontier.extract404s=false
frontier.extractIndependently=false
preselector.enabled=true
preselector.logToFile=false
preselector.recheckScope=true
preselector.blockAll=false
preconditions.enabled=true
preconditions.ipValidityDurationSeconds=21600
preconditions.robotsValidityDurationSeconds=86400
preconditions.calculateRobotsOnly=false
fetchDns.enabled=true
fetchDns.acceptNonDnsResolves=false
fetchDns.digestContent=true
fetchDns.digestAlgorithm=sha1
fetchHttp.enabled=true
fetchHttp.timeoutSeconds=1200
#fetchHttp.soTimeoutMs=20000
fetchHttp.soTimeoutMs=120000
fetchHttp.maxFetchKBSec=0
fetchHttp.maxLengthBytes=0
fetchHttp.ignoreCookies=false
fetchHttp.sslTrustLevel=OPEN
#fetchHttp.defaultEncoding=ISO-8859-1
fetchHttp.defaultEncoding=UTF-8
fetchHttp.digestContent=true
fetchHttp.digestAlgorithm=sha1
fetchHttp.sendIfModifiedSince=true
fetchHttp.sendIfNoneMatch=true
fetchHttp.sendConnectionClose=true
fetchHttp.sendReferer=true
fetchHttp.sendRange=false
## Accept headers for HTTP fetching
fetchHttp.acceptHeaders[0]=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
extractorHttp.enabled=true
extractorHtml.enabled=true
extractorHtml.extractJavascript=%{EXTRACT_JAVASCRIPT}
extractorHtml.treatFramesAsEmbedLinks=false
extractorHtml.ignoreFormActionUrls=true
extractorHtml.extractValueAttributes=false
extractorHtml.ignoreUnexpectedHtml=true
extractorCss.enabled=true
extractorJs.enabled=true
extractorSwf.enabled=true
# allow redirected seeds to be accepted as seeds
# In H1, this property belonged to the LinkScoper object; in H3, it is part of the CandidatesProcessor object
candidates.seedsRedirectNewSeeds=true
statisticsTracker.intervalSeconds=20
## Quota enforcement
quotaenforcer.groupMaxFetchSuccesses=%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}
quotaenforcer.groupMaxAllKb=%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}
## sample overrides of the warcwriter
warcWriter.template=${prefix}-${timestamp17}-${serialno}-ciblee_2015_${heritrix.hostname}
warcWriter.writeRequests=false
warcWriter.writeMetadata=false
warcWriter.poolMaxActive=3
loggerModule.path=logs
.*core\.UserAdmin.*core\.UserLogin.*
.*core\.UserAdmin.*register\.UserSelfRegistration.*
.*\/w\/index\.php\?title=Speci[ae]l:Recentchanges.*
.*act=calendar&cal_id=.*
.*advCalendar_pi.*
.*cal\.asp\?date=.*
.*cal\.asp\?view=monthly&date=.*
.*cal\.asp\?view=weekly&date=.*
.*cal\.asp\?view=yearly&date=.*
.*cal\.asp\?view=yearly&year=.*
.*cal\/cal_day\.php\?op=day&date=.*
.*cal\/cal_week\.php\?op=week&date=.*
.*cal\/calendar\.php\?op=cal&month=.*
.*cal\/yearcal\.php\?op=yearcal&ycyear=.*
.*calendar\.asp\?calmonth=.*
.*calendar\.asp\?qMonth=.*
.*calendar\.php\?sid=.*
.*calendar\.php\?start=.*
.*calendar\.php\?Y=.*
.*calendar\/\?CLmDemo_horizontal=.*
.*calendar_menu\/calendar\.php\?.*
.*calendar_scheduler\.php\?d=.*
.*calendar_year\.asp\?qYear=.*
.*calendarix\/calendar\.php\?op=.*
.*calendarix\/yearcal\.php\?op=.*
.*calender\/default\.asp\?month=.*
.*Default\.asp\?month=.*
.*events\.asp\?cat=0&mDate=.*
.*events\.asp\?cat=1&mDate=.*
.*events\.asp\?MONTH=.*
.*events\.asp\?month=.*
.*index\.php\?iDate=.*
.*index\.php\?module=PostCalendar&func=view.*
.*index\.php\?option=com_events&task=view.*
.*index\.php\?option=com_events&task=view_day&year=.*
.*index\.php\?option=com_events&task=view_detail&year=.*
.*index\.php\?option=com_events&task=view_month&year=.*
.*index\.php\?option=com_events&task=view_week&year=.*
.*index\.php\?option=com_events&task=view_year&year=.*
.*index\.php\?option=com_extcalendar&Itemid.*
.*modules\.php\?name=Calendar&op=modload&file=index.*
.*modules\.php\?name=vwar&file=calendar&action=list&month=.*
.*modules\.php\?name=vwar&file=calendar.*
.*modules\.php\?name=vWar&mod=calendar.*
.*modules\/piCal\/index\.php\?caldate=.*
.*modules\/piCal\/index\.php\?cid=.*
.*option,com_events\/task,view_day\/year.*
.*option,com_events\/task,view_month\/year.*
.*option,com_extcalendar\/Itemid.*
.*task,view_month\/year.*
.*shopping_cart\.php.*
.*action.add_product.*
.*action.remove_product.*
.*action.buy_now.*
.*checkout_payment\.php.*
.*login.*login.*login.*login.*
.*homepage_calendar\.asp.*
.*MediaWiki.*Movearticle.*
.*index\.php.*action=edit.*
.*comcast\.net.*othastar.*
.*Login.*Login.*Login.*
.*redir.*redir.*redir.*
.*bookingsystemtime\.asp\?dato=.*
.*bookingsystem\.asp\?date=.*
.*cart\.asp\?mode=add.*
.*\/photo.*\/photo.*\/photo.*
.*\/skins.*\/skins.*\/skins.*
.*\/scripts.*\/scripts.*\/scripts.*
.*\/styles.*\/styles.*\/styles.*
.*\/coppermine\/login\.php\?referer=.*
.*\/images.*\/images.*\/images.*
.*\/stories.*\/stories.*\/stories.*
%{CRAWLERTRAPS_PLACEHOLDER}
%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}
%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}