Heritrix 3.3.0 源码阅读:crawler-beans.cxml 中处理器链的配置
2015-10-14 14:36
741 次查看
<!-- PROCESSING CHAINS
     Much of the crawler's work is specified by the sequential
     application of swappable Processor modules. These Processors are
     collected into three 'chains'. The CandidateChain is applied to
     URIs being considered for inclusion, before a URI is enqueued for
     collection. The FetchChain is applied to URIs when their turn for
     collection comes up. The DispositionChain is applied after a URI
     is fetched and analyzed/link-extracted. -->

<!-- CANDIDATE CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
</bean>
<bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
  <!-- <property name="preferenceDepthHops" value="-1" /> -->
  <!-- <property name="preferenceEmbedHops" value="1" /> -->
  <!-- <property name="canonicalizationPolicy">
         <ref bean="canonicalizationPolicy" />
       </property> -->
  <!-- <property name="queueAssignmentPolicy">
         <ref bean="queueAssignmentPolicy" />
       </property> -->
  <!-- <property name="uriPrecedencePolicy">
         <ref bean="uriPrecedencePolicy" />
       </property> -->
  <!-- <property name="costAssignmentPolicy">
         <ref bean="costAssignmentPolicy" />
       </property> -->
</bean>
<!-- now, processors are assembled into ordered CandidateChain bean -->
<bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
  <property name="processors">
    <list>
      <!-- apply scoping rules to each individual candidate URI... -->
      <ref bean="candidateScoper"/>
      <!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
      <ref bean="preparer"/>
    </list>
  </property>
</bean>

<!-- FETCH CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
  <!-- <property name="recheckScope" value="false" /> -->
  <!-- <property name="blockAll" value="false" /> -->
  <!-- <property name="blockByRegex" value="" /> -->
  <!-- <property name="allowByRegex" value="" /> -->
</bean>
<bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
  <!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
  <!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
  <!-- <property name="calculateRobotsOnly" value="false" /> -->
</bean>
<bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
  <!-- <property name="acceptNonDnsResolves" value="false" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
</bean>
<!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
       <property name="specialQueryTemplates">
         <map>
           <entry key="whois.verisign-grs.com" value="domain %s" />
           <entry key="whois.arin.net" value="z + %s" />
           <entry key="whois.denic.de" value="-T dn %s" />
         </map>
       </property>
     </bean> -->
<bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
  <!-- <property name="useHTTP11" value="false" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="timeoutSeconds" value="1200" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
  <!-- <property name="shouldFetchBodyRule">
         <bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
       </property> -->
  <!-- <property name="soTimeoutMs" value="20000" /> -->
  <!-- <property name="sendIfModifiedSince" value="true" /> -->
  <!-- <property name="sendIfNoneMatch" value="true" /> -->
  <!-- <property name="sendConnectionClose" value="true" /> -->
  <!-- <property name="sendReferer" value="true" /> -->
  <!-- <property name="sendRange" value="false" /> -->
  <!-- <property name="ignoreCookies" value="false" /> -->
  <!-- <property name="sslTrustLevel" value="OPEN" /> -->
  <!-- <property name="acceptHeaders">
         <list>
           <value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
         </list>
       </property> -->
  <!-- <property name="httpBindAddress" value="" /> -->
  <!-- <property name="httpProxyHost" value="" /> -->
  <!-- <property name="httpProxyPort" value="0" /> -->
  <!-- <property name="httpProxyUser" value="" /> -->
  <!-- <property name="httpProxyPassword" value="" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
</bean>
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
</bean>
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
  <!-- <property name="extractJavascript" value="true" /> -->
  <!-- <property name="extractValueAttributes" value="true" /> -->
  <!-- <property name="ignoreFormActionUrls" value="false" /> -->
  <!-- <property name="extractOnlyFormGets" value="true" /> -->
  <!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
  <!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
  <!-- <property name="maxElementLength" value="1024" /> -->
  <!-- <property name="maxAttributeNameLength" value="1024" /> -->
  <!-- <property name="maxAttributeValueLength" value="16384" /> -->
</bean>
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
</bean>
<bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
</bean>
<bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
</bean>
<!-- now, processors are assembled into ordered FetchChain bean -->
<bean id="fetchProcessors" class="org.archive.modules.FetchChain">
  <property name="processors">
    <list>
      <!-- re-check scope, if so enabled... -->
      <ref bean="preselector"/>
      <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
      <ref bean="preconditions"/>
      <!-- ...fetch if DNS URI... -->
      <ref bean="fetchDns"/>
      <!-- <ref bean="fetchWhois"/> -->
      <!-- ...fetch if HTTP URI... -->
      <ref bean="fetchHttp"/>
      <!-- ...extract outlinks from HTTP headers... -->
      <ref bean="extractorHttp"/>
      <!-- ...extract outlinks from HTML content... -->
      <ref bean="extractorHtml"/>
      <!-- ...extract outlinks from CSS content... -->
      <ref bean="extractorCss"/>
      <!-- ...extract outlinks from Javascript content... -->
      <ref bean="extractorJs"/>
      <!-- ...extract outlinks from Flash content... -->
      <ref bean="extractorSwf"/>
    </list>
  </property>
</bean>

<!-- DISPOSITION CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
  <!-- <property name="compress" value="true" /> -->
  <!-- <property name="prefix" value="IAH" /> -->
  <!-- <property name="suffix" value="${HOSTNAME}" /> -->
  <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
  <!-- <property name="poolMaxActive" value="1" /> -->
  <!-- <property name="MaxWaitForIdleMs" value="500" /> -->
  <!-- <property name="skipIdenticalDigests" value="false" /> -->
  <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
  <!-- <property name="directory" value="${launchId}" /> -->
  <!-- <property name="storePaths">
         <list>
           <value>warcs</value>
         </list>
       </property> -->
  <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
  <!-- <property name="writeRequests" value="true" /> -->
  <!-- <property name="writeMetadata" value="true" /> -->
  <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
  <!-- <property name="writeRevisitForNotModified" value="true" /> -->
  <!-- <property name="startNewFilesOnCheckpoint" value="true" /> -->
</bean>
<bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
  <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
  <!-- <property name="processErrorOutlinks" value="false" /> -->
</bean>
<bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
  <!-- <property name="delayFactor" value="5.0" /> -->
  <!-- <property name="minDelayMs" value="3000" /> -->
  <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
  <!-- <property name="maxDelayMs" value="30000" /> -->
  <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
</bean>
<!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
       <property name="rescheduleDelaySeconds" value="-1" />
     </bean> -->
<!-- now, processors are assembled into ordered DispositionChain bean -->
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
  <property name="processors">
    <list>
      <!-- write to aggregate archival files... -->
      <ref bean="warcWriter"/>
      <!-- ...send each outlink candidate URI to CandidateChain,
           and enqueue those ACCEPTed to the frontier... -->
      <ref bean="candidates"/>
      <!-- ...then update stats, shared-structures, frontier decisions -->
      <ref bean="disposition"/>
      <!-- <ref bean="rescheduler" /> -->
    </list>
  </property>
</bean>
相关文章推荐
- 从源码安装Mysql/Percona 5.5
- 我投了份简历,接到了十八个骚扰电话
- 浅析Ruby的源代码布局及其编程风格
- asp.net 抓取网页源码三种实现方法
- JS小游戏之仙剑翻牌源码详解
- JS小游戏之宇宙战机源码详解
- jQuery源码分析之jQuery中的循环技巧详解
- 本人自用的global.js库源码分享
- java中原码、反码与补码的问题分析
- PHP网页游戏学习之Xnova(ogame)源码解读(六)
- C#获取网页HTML源码实例
- PHP网页游戏学习之Xnova(ogame)源码解读(八)
- PHP网页游戏学习之Xnova(ogame)源码解读(四)
- JS小游戏之极速快跑源码详解
- JS小游戏之象棋暗棋源码详解
- android源码探索之定制android关机界面的方法
- 基于Android设计模式之--SDK源码之策略模式的详解
- Android游戏源码分享之2048
- C语言借助EasyX实现的生命游戏源码
- C实现的非阻塞方式命令行端口扫描器源码