Merge pull request #650 from zutto/master

Fix autocrawler crashing
This commit is contained in:
Michael Christen 2024-07-10 16:35:39 +02:00 committed by GitHub
commit 2f5f3f8853
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 10 additions and 3 deletions

View File

@@ -20,7 +20,7 @@
#(changed)#::<dt></dt><dd><span class="error">You need to restart for some settings to be applied</span></dd>#(/changed)#
<dt>Enable Autocrawler:</dt>
<dd><input id="autocrawlEnable" name="autocrawlEnable" type="checkbox" #(autocrawlEnable)#::checked="checked"#(/autocrawlEnable)# /></dd>
<dt>Deep crawl every:</dt>
<dt>Deep crawl every Nth document:</dt>
<dd>
<input id="autocrawlRatio" name="autocrawlRatio" type="number" min="1" max="500" step="1" size="2" maxlength="2" value="#[autocrawlRatio]#" />
Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.
@@ -47,4 +47,4 @@
</dl>
</form>
</fieldset>
</body>
</body>

View File

@@ -211,7 +211,7 @@
<source>Enable Autocrawler:</source>
</trans-unit>
<trans-unit id="66a1bd2c" xml:space="preserve" approved="no" translate="yes">
<source>Deep crawl every:</source>
<source>Deep crawl every Nth document:</source>
</trans-unit>
<trans-unit id="2291c65d" xml:space="preserve" approved="no" translate="yes">
<source>Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.</source>

View File

@@ -608,12 +608,19 @@ public class CrawlQueues {
int i = 0;
int deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50"));
for (SolrDocument doc: resp.getResults()) {
if (doc == null) {
continue;
}
boolean deep = false;
i++;
if( i % deepRatio == 0 ){
deep = true;
}
DigestURL url;
if (doc.getFieldValue("url_protocol_s") == null || doc.getFieldValue("host_s") == null) {
//Skip this document if either of these values is null.
continue;
}
final String u = doc.getFieldValue("url_protocol_s").toString() + "://" + doc.getFieldValue("host_s").toString();
try {
url = new DigestURL(u);