Initial (experimental) implementation of index update/re-crawl job

added to IndexReIndexMonitor_p.html
Selects existing documents from index and feeds it to the crawler.
Currently only the field fresh_date_dt is used to determine documents for recrawl (fresh_date_dt:[* TO NOW-1DAY]).
Documents are added in small chunks (200) to the crawler, only if no other crawl is running.
This commit is contained in:
reger 2015-05-16 01:23:08 +02:00
parent 141cd80456
commit ace71a8877
3 changed files with 242 additions and 11 deletions

View File

@ -17,7 +17,7 @@
<tr>
<td>Documents in current queue</td>
<td>#[querysize]#</td>
<td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-primary"/>#(/reindexjobrunning)#</td>
<td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-success"/>#(/reindexjobrunning)#</td>
</tr>
<tr>
<td>Documents processed</td>
@ -37,7 +37,7 @@
</table>
#(reindexjobrunning)#
<input type="submit" name="reindexnow" value="start reindex job now" class="btn btn-primary"/>
::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-primary"/>
::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-danger"/>
#(/reindexjobrunning)#
<p class="info">#[infomessage]#</p>
</fieldset>
@ -57,6 +57,24 @@
#(/reindexjobrunning)#
</td></tr></table>
</form>
<h2>Re-Crawl Index Documents</h2>
<p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
#(recrawljobrunning)#
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
<table>
<tr>
<td>Documents to process</td> <td>#[docCount]#</td> <td> with fresh_date_dt before today</td>
</tr>
</table>
#(/recrawljobrunning)#
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

View File

@ -21,6 +21,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread;
@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p {
prop.put("docsprocessed", "0");
prop.put("currentselectquery","");
BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
if (bt == null) {
BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
if (reidxbt == null) {
if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) {
migration.reindexToschema(sb);
prop.put("querysize", "0");
prop.put("infomessage","reindex job started");
bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
}
}
if (bt != null) {
if (reidxbt != null) {
prop.put("reindexjobrunning", 1);
prop.put("querysize", bt.getJobCount());
prop.put("querysize", reidxbt.getJobCount());
if (bt instanceof ReindexSolrBusyThread) {
prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed());
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery());
if (reidxbt instanceof ReindexSolrBusyThread) {
prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed());
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery());
// prepare list of fields in queue
final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) bt).getQueryList();
final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList();
if (querylist != null) {
int i = 0;
for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *])
@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p {
prop.putHTML("infomessage", "! reindex works only with embedded Solr index !");
}
}
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
if (recrawlbt == null) {
if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
}
if (recrawlbt != null) {
if (post != null && post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning",0);
} else {
prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
}
} else {
prop.put("recrawljobrunning", 0);
}
// return rewrite properties
return prop;
}

View File

@ -0,0 +1,184 @@
/**
* RecrawlBusyThread.java
* Copyright 2015 by Burkhard Buelte
* First released 15.05.2015 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
/**
* Selects documents by a query from the local index
* and feeds the found urls to the crawler to recrawl the documents.
* This is intended to keep the index up-to-date
* Currently the doucments are selected by expired fresh_date_dt field
* an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is runnin.
*/
public class RecrawlBusyThread extends AbstractBusyThread {
public final static String THREAD_NAME = "recrawlindex";
public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private int chunkstart = 0;
private int chunksize = 200;
final Switchboard sb;
private Set<DigestURL> urlstack; // buffer of urls to recrawl
public long urlsfound = 0;
public RecrawlBusyThread(Switchboard xsb) {
super(3000, 1000); // set lower limits of cycle delay
this.setIdleSleep(10*60000); // set actual cycle delays
this.setBusySleep(2*60000);
this.sb = xsb;
urlstack = new HashSet<DigestURL>();
}
/**
* feed urls to the local crawler
*
* @return true if urls were added/accepted to the crawler
*/
private boolean feedToCrawler() {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
for (DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) {
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
}
if (acceptedError != null) {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
continue;
}
final String s;
s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
if (s != null) {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else {
added++;
}
}
this.urlstack.clear();
}
if (added > 0) {
return true;
}
return false;
}
/**
* Process query and hand over urls to the crawler
*
* @return true if something processed
*/
@Override
public boolean job() {
if (sb.crawlQueues.coreCrawlJobSize() > 0) {
return false;
}
if (this.urlstack.isEmpty()) {
processSingleQuery();
return true;
} else {
return feedToCrawler();
}
}
/**
* Selects documents to recrawl the urls
*/
private void processSingleQuery() {
if (!this.urlstack.isEmpty()) {
return;
}
SolrDocumentList docList = null;
SolrQuery solrQuery = new SolrQuery();
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc");
solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName());
solrQuery.set(CommonParams.ROWS, this.chunksize);
solrQuery.set(CommonParams.START, this.chunkstart);
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {
try {
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
docList = rsp.getResults();
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {
}
}
if (docList != null) {
for (SolrDocument doc : docList) {
try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) {
}
}
this.chunkstart = this.chunkstart + urlstack.size();
if (docList.getNumFound() <= this.chunkstart) {
this.chunkstart = 0;
}
}
}
@Override
public int getJobCount() {
return this.urlstack.size();
}
@Override
public void freemem() {
this.urlstack.clear();
}
}