Initial (experimental) implementation of index update/re-crawl job

added to IndexReIndexMonitor_p.html
Selects existing documents from index and feeds it to the crawler.
Currently only the field fresh_date_dt is used to determine documents for recrawl (fresh_date_dt:[* TO NOW-1DAY]).
Documents are added in small chunks (200) to the crawler, only if no other crawl is running.
This commit is contained in:
reger 2015-05-16 01:23:08 +02:00
parent 141cd80456
commit ace71a8877
3 changed files with 242 additions and 11 deletions

View File

@ -17,7 +17,7 @@
<tr>
<td>Documents in current queue</td>
<td>#[querysize]#</td>
<td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-primary"/>#(/reindexjobrunning)#</td>
<td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-success"/>#(/reindexjobrunning)#</td>
</tr>
<tr>
<td>Documents processed</td>
@ -37,7 +37,7 @@
</table>
#(reindexjobrunning)#
<input type="submit" name="reindexnow" value="start reindex job now" class="btn btn-primary"/>
::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-primary"/>
::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-danger"/>
#(/reindexjobrunning)#
<p class="info">#[infomessage]#</p>
</fieldset>
@ -57,6 +57,24 @@
#(/reindexjobrunning)#
</td></tr></table>
</form>
<h2>Re-Crawl Index Documents</h2>
<p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
#(recrawljobrunning)#
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
<table>
<tr>
<td>Documents to process</td> <td>#[docCount]#</td> <td> with fresh_date_dt before today</td>
</tr>
</table>
#(/recrawljobrunning)#
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

View File

@ -21,6 +21,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread;
@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p {
prop.put("docsprocessed", "0");
prop.put("currentselectquery","");
BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
if (bt == null) {
BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
if (reidxbt == null) {
if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) {
migration.reindexToschema(sb);
prop.put("querysize", "0");
prop.put("infomessage","reindex job started");
bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
}
}
if (bt != null) {
if (reidxbt != null) {
prop.put("reindexjobrunning", 1);
prop.put("querysize", bt.getJobCount());
prop.put("querysize", reidxbt.getJobCount());
if (bt instanceof ReindexSolrBusyThread) {
prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed());
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery());
if (reidxbt instanceof ReindexSolrBusyThread) {
prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed());
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery());
// prepare list of fields in queue
final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) bt).getQueryList();
final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList();
if (querylist != null) {
int i = 0;
for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *])
@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p {
prop.putHTML("infomessage", "! reindex works only with embedded Solr index !");
}
}
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
if (recrawlbt == null) {
if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
}
if (recrawlbt != null) {
if (post != null && post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning",0);
} else {
prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
}
} else {
prop.put("recrawljobrunning", 0);
}
// return rewrite properties
return prop;
}

View File

@ -0,0 +1,184 @@
/**
* RecrawlBusyThread.java
* Copyright 2015 by Burkhard Buelte
* First released 15.05.2015 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
/**
* Selects documents by a query from the local index
* and feeds the found urls to the crawler to recrawl the documents.
* This is intended to keep the index up-to-date
* Currently the doucments are selected by expired fresh_date_dt field
* an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is runnin.
*/
public class RecrawlBusyThread extends AbstractBusyThread {
public final static String THREAD_NAME = "recrawlindex";
public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private int chunkstart = 0;
private int chunksize = 200;
final Switchboard sb;
private Set<DigestURL> urlstack; // buffer of urls to recrawl
public long urlsfound = 0;
public RecrawlBusyThread(Switchboard xsb) {
super(3000, 1000); // set lower limits of cycle delay
this.setIdleSleep(10*60000); // set actual cycle delays
this.setBusySleep(2*60000);
this.sb = xsb;
urlstack = new HashSet<DigestURL>();
}
/**
* feed urls to the local crawler
*
* @return true if urls were added/accepted to the crawler
*/
private boolean feedToCrawler() {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
for (DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) {
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
}
if (acceptedError != null) {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
continue;
}
final String s;
s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
if (s != null) {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else {
added++;
}
}
this.urlstack.clear();
}
if (added > 0) {
return true;
}
return false;
}
/**
* Process query and hand over urls to the crawler
*
* @return true if something processed
*/
@Override
public boolean job() {
if (sb.crawlQueues.coreCrawlJobSize() > 0) {
return false;
}
if (this.urlstack.isEmpty()) {
processSingleQuery();
return true;
} else {
return feedToCrawler();
}
}
/**
* Selects documents to recrawl the urls
*/
private void processSingleQuery() {
if (!this.urlstack.isEmpty()) {
return;
}
SolrDocumentList docList = null;
SolrQuery solrQuery = new SolrQuery();
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc");
solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName());
solrQuery.set(CommonParams.ROWS, this.chunksize);
solrQuery.set(CommonParams.START, this.chunkstart);
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {
try {
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
docList = rsp.getResults();
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {
}
}
if (docList != null) {
for (SolrDocument doc : docList) {
try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) {
}
}
this.chunkstart = this.chunkstart + urlstack.size();
if (docList.getNumFound() <= this.chunkstart) {
this.chunkstart = 0;
}
}
}
@Override
public int getJobCount() {
return this.urlstack.size();
}
@Override
public void freemem() {
this.urlstack.clear();
}
}