mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
f6eebb6f99
- nobody understands the auto-dom filter without a lengthy introduction about the function of a crawler - nobody ever used the auto-dom filter other than with a crawl depth of 1 - the auto-dom filter was buggy since the filter did not survive a restart and then a search index contained waste - the function of the auto-dom filter was in fact to just load a link list from the given start url and then start separate crawls for all these urls restricted by their domain - the new Site Link-List option shows the target urls in real-time during input of the start url (like the robots check) and gives transparent feedback on what it does before it can be used - the new option also fits into the easy site-crawl start menu git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542
26 lines
995 B
XML
26 lines
995 B
XML
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- YaCy servlet template: exports the list of crawl profiles as XML.
     #[key]#                       is replaced by a scalar template value,
     #(key)#a::b#(/key)#           selects alternative "a" or "b" by a numeric switch
                                   (here 0 -> first value, 1 -> second value),
     #{key}# ... #{/key}#          repeats the enclosed fragment once per list entry.
     The stray "|" gutter lines from the scraped copy have been removed: character
     data before/after the root element made the document non-well-formed XML. -->
<crawlProfiles>
#{crawlProfiles}#
  <crawlProfile>
    <!-- human-readable profile name and whether the crawl is still running -->
    <name>#[name]#</name>
    <status>#(status)#terminated::active#(/status)#</status>
    <!-- crawl start point and link-following depth limit -->
    <starturl>#[startURL]#</starturl>
    <depth>#[depth]#</depth>
    <!-- regular expressions constraining which URLs are crawled -->
    <mustmatch>#[mustmatch]#</mustmatch>
    <mustnotmatch>#[mustnotmatch]#</mustnotmatch>
    <!-- re-crawl age threshold; presumably a time value — confirm unit against the servlet -->
    <crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
    <!-- domain filter list: one <item> per filtered domain entry -->
    <crawlingDomFilterContent>
#{crawlingDomFilterContent}#
      <item>#[item]#</item>
#{/crawlingDomFilterContent}#
    </crawlingDomFilterContent>
    <crawlingDomMaxPages>#[crawlingDomMaxPages]#</crawlingDomMaxPages>
    <!-- boolean switches: 0 -> no, 1 -> yes -->
    <withQuery>#(withQuery)#no::yes#(/withQuery)#</withQuery>
    <storeCache>#(storeCache)#no::yes#(/storeCache)#</storeCache>
    <indexText>#(indexText)#no::yes#(/indexText)#</indexText>
    <indexMedia>#(indexMedia)#no::yes#(/indexMedia)#</indexMedia>
    <remoteIndexing>#(remoteIndexing)#no::yes#(/remoteIndexing)#</remoteIndexing>
  </crawlProfile>
#{/crawlProfiles}#
</crawlProfiles>