yacy_search_server/htroot/CrawlProfileEditor_p.xml
orbiter f6eebb6f99 replaced auto-dom filter with easy-to-understand Site Link-List crawler option
- nobody understands the auto-dom filter without a lengthy introduction about the function of a crawler
- nobody ever used the auto-dom filter other than with a crawl depth of 1
- the auto-dom filter was buggy: the filter did not survive a restart, and afterwards the search index contained waste
- the function of the auto-dom filter was in fact to just load a link list from the given start url and then start separate crawls for all these urls restricted by their domain
- the new Site Link-List option shows the target urls in real time during input of the start url (like the robots check) and gives transparent feedback about what it does before it can be used
- the new option also fits into the easy site-crawl start menu

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-09-30 12:50:34 +00:00

26 lines
995 B
XML

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!--
  Crawl profile listing: a crawlProfiles root holding one crawlProfile
  element per profile. The hash-delimited markers are template placeholders
  (presumably expanded on the server before delivery; confirm against the
  servlet that fills this file).
  status renders as "terminated" or "active"; withQuery, storeCache,
  indexText, indexMedia and remoteIndexing render as "no" or "yes".
  crawlingDomFilterContent wraps zero or more item elements.
-->
<crawlProfiles>
  #{crawlProfiles}#
  <crawlProfile>
    <name>#[name]#</name>
    <status>#(status)#terminated::active#(/status)#</status>
    <starturl>#[startURL]#</starturl>
    <depth>#[depth]#</depth>
    <mustmatch>#[mustmatch]#</mustmatch>
    <mustnotmatch>#[mustnotmatch]#</mustnotmatch>
    <crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
    <crawlingDomFilterContent>
      #{crawlingDomFilterContent}#
      <item>#[item]#</item>
      #{/crawlingDomFilterContent}#
    </crawlingDomFilterContent>
    <crawlingDomMaxPages>#[crawlingDomMaxPages]#</crawlingDomMaxPages>
    <withQuery>#(withQuery)#no::yes#(/withQuery)#</withQuery>
    <storeCache>#(storeCache)#no::yes#(/storeCache)#</storeCache>
    <indexText>#(indexText)#no::yes#(/indexText)#</indexText>
    <indexMedia>#(indexMedia)#no::yes#(/indexMedia)#</indexMedia>
    <remoteIndexing>#(remoteIndexing)#no::yes#(/remoteIndexing)#</remoteIndexing>
  </crawlProfile>
  #{/crawlProfiles}#
</crawlProfiles>