mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
merge rc1/master
This commit is contained in:
parent
082c9a98c1
commit
1437c45383
|
@ -461,19 +461,21 @@
|
|||
and old cache.
|
||||
-->
|
||||
<filterCache class="solr.FastLRUCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="0"/>
|
||||
size="64"
|
||||
initialSize="64"
|
||||
autowarmCount="4"
|
||||
cleanupThread="true"/>
|
||||
|
||||
<!-- Query Result Cache
|
||||
|
||||
Caches results of searches - ordered lists of document ids
|
||||
(DocList) based on a query, a sort, and the range of documents requested.
|
||||
-->
|
||||
<queryResultCache class="solr.LRUCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="0"/>
|
||||
<queryResultCache class="solr.FastLRUCache"
|
||||
size="64"
|
||||
initialSize="64"
|
||||
autowarmCount="4"
|
||||
cleanupThread="true"/>
|
||||
|
||||
<!-- Document Cache
|
||||
|
||||
|
@ -481,10 +483,11 @@
|
|||
document). Since Lucene internal document ids are transient,
|
||||
this cache will not be autowarmed.
|
||||
-->
|
||||
<documentCache class="solr.LRUCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="0"/>
|
||||
<documentCache class="solr.FastLRUCache"
|
||||
size="64"
|
||||
initialSize="64"
|
||||
autowarmCount="4"
|
||||
cleanupThread="true"/>
|
||||
|
||||
<!-- Field Value Cache
|
||||
|
||||
|
@ -494,9 +497,10 @@
|
|||
-->
|
||||
<!--
|
||||
<fieldValueCache class="solr.FastLRUCache"
|
||||
size="512"
|
||||
size="64"
|
||||
autowarmCount="128"
|
||||
showItems="32" />
|
||||
showItems="32"
|
||||
cleanupThread="true"/>
|
||||
-->
|
||||
|
||||
<!-- Custom Cache
|
||||
|
@ -510,11 +514,12 @@
|
|||
-->
|
||||
<!--
|
||||
<cache name="myUserCache"
|
||||
class="solr.LRUCache"
|
||||
size="4096"
|
||||
initialSize="1024"
|
||||
autowarmCount="1024"
|
||||
class="solr.FastLRUCache"
|
||||
size="64"
|
||||
initialSize="64"
|
||||
autowarmCount="64"
|
||||
regenerator="com.mycompany.MyRegenerator"
|
||||
cleanupThread="true"
|
||||
/>
|
||||
-->
|
||||
|
||||
|
|
|
@ -797,11 +797,6 @@ search.excludehosth=
|
|||
# the cases of nocache, iffresh and ifexist causes an index deletion
|
||||
search.verify.delete = true
|
||||
|
||||
# images may be treated either as documents that are shown in search results or as objects
|
||||
# that are only visible in special search environments, like image search
|
||||
search.excludeintext.image = true
|
||||
crawler.load.image = true
|
||||
|
||||
# remote search details
|
||||
remotesearch.maxcount = 10
|
||||
remotesearch.maxtime = 3000
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
<dt><label for="HTCachePath">The path where the cache is stored</label></dt>
|
||||
<dd><input name="HTCachePath" id="HTCachePath" type="text" size="20" maxlength="300" value="#[HTCachePath]#" /></dd>
|
||||
<dt><label for="actualCacheSize">The current size of the cache</label></dt>
|
||||
<dd><span id="actualCacheSize">#[actualCacheSize]# MB</span></dd>
|
||||
<dd><span id="actualCacheSize">#[actualCacheSize]# MB for #[actualCacheDocCount]# files, #[docSizeAverage]# KB / file in average </span></dd>
|
||||
<dt><label for="maxCacheSize">The maximum size of the cache</label></dt>
|
||||
<dd><input name="maxCacheSize" id="maxCacheSize" type="text" size="8" maxlength="24" value="#[maxCacheSize]#" /> MB</dd>
|
||||
<dt> </dt>
|
||||
|
|
|
@ -77,7 +77,9 @@ public class ConfigHTCache_p {
|
|||
}
|
||||
|
||||
prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT));
|
||||
prop.put("actualCacheSize", (Cache.getActualCacheSize() / 1024 / 1024));
|
||||
prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024);
|
||||
prop.put("actualCacheDocCount", Cache.getActualCacheDocCount());
|
||||
prop.put("docSizeAverage", Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
|
||||
prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64));
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
|
|
|
@ -34,7 +34,7 @@ public class ContentAnalysis_p {
|
|||
|
||||
// clean up all search events
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
|
||||
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
|
||||
|
||||
if (post != null && post.containsKey("EnterDoublecheck")) {
|
||||
Ranking.setMinTokenLen(post.getInt("minTokenLen", 3));
|
||||
|
|
|
@ -553,7 +553,6 @@ public class HostBrowser {
|
|||
}
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
|
||||
}
|
||||
this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
|
||||
this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
|
||||
|
@ -562,7 +561,7 @@ public class HostBrowser {
|
|||
StringBuilder sbi = new StringBuilder();
|
||||
int c = 0;
|
||||
for (String s: references_internal_urls) {
|
||||
sbi.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
|
||||
sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
|
||||
c++;
|
||||
if (c % 80 == 0) sbi.append("<br/>");
|
||||
}
|
||||
|
@ -570,7 +569,7 @@ public class HostBrowser {
|
|||
StringBuilder sbe = new StringBuilder();
|
||||
c = 0;
|
||||
for (String s: references_external_urls) {
|
||||
sbe.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
|
||||
sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
|
||||
c++;
|
||||
if (c % 80 == 0) sbe.append("<br/>");
|
||||
}
|
||||
|
|
|
@ -193,6 +193,9 @@ function updatepage(str) {
|
|||
<dt class="TableCellDark">URL Filter</dt>
|
||||
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
|
||||
</dd>
|
||||
<dt class="TableCellDark">query</dt>
|
||||
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
|
||||
</dd>
|
||||
<dt class="TableCellDark">Export Format</dt>
|
||||
<dd>Only Domain:
|
||||
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)
|
||||
|
|
|
@ -261,7 +261,8 @@ public class IndexControlURLs_p {
|
|||
final File f = new File(s);
|
||||
f.getParentFile().mkdirs();
|
||||
final String filter = post.get("exportfilter", ".*");
|
||||
final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);
|
||||
final String query = post.get("exportquery", "*:*");
|
||||
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom);
|
||||
|
||||
prop.put("lurlexport_exportfile", s);
|
||||
prop.put("lurlexport_urlcount", running.count());
|
||||
|
|
|
@ -38,7 +38,7 @@ public class RankingSolr_p {
|
|||
|
||||
// clean up all search events
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
|
||||
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
|
||||
|
||||
int profileNr = 0;
|
||||
if (post != null) profileNr = post.getInt("profileNr", profileNr);
|
||||
|
|
|
@ -360,7 +360,7 @@ public class yacysearch {
|
|||
|
||||
// check available memory and clean up if necessary
|
||||
if ( !MemoryControl.request(8000000L, false) ) {
|
||||
indexSegment.clearCache();
|
||||
indexSegment.clearCaches();
|
||||
SearchEventCache.cleanupEvents(false);
|
||||
}
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
|
|||
import net.yacy.cora.protocol.ftp.FTPClient;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.util.CommonPattern;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
|
||||
/**
|
||||
* MultiProtocolURL provides a URL object for multiple protocols like http, https, ftp, smb and file
|
||||
|
@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
|
||||
public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators
|
||||
|
||||
private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&"));
|
||||
private static final long serialVersionUID = -1173233022912141884L;
|
||||
private static final long SMB_TIMEOUT = 5000;
|
||||
|
||||
|
@ -636,7 +636,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
} else {
|
||||
this.searchpart = this.path.substring(r + 1);
|
||||
// strip &
|
||||
Matcher matcher = ampPattern.matcher(this.searchpart);
|
||||
Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
|
||||
while (matcher.find()) {
|
||||
this.searchpart = matcher.replaceAll("&");
|
||||
matcher.reset(this.searchpart);
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
package net.yacy.cora.federate.solr.connector;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -235,7 +234,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
|
|||
* @return a collection of a subset of the ids which exist in the index
|
||||
* @throws IOException
|
||||
*/
|
||||
public Set<String> existsByIds(Collection<String> ids) throws IOException {
|
||||
public Set<String> existsByIds(Set<String> ids) throws IOException {
|
||||
if (ids == null || ids.size() == 0) return new HashSet<String>();
|
||||
// construct raw query
|
||||
final SolrQuery params = new SolrQuery();
|
||||
|
|
|
@ -61,7 +61,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
this.missCache = new ConcurrentARC<String, Object>(missCacheMax, partitions);
|
||||
}
|
||||
|
||||
public void clearCache() {
|
||||
public void clearCaches() {
|
||||
this.hitCache.clear();
|
||||
this.missCache.clear();
|
||||
this.documentCache.clear();
|
||||
|
@ -70,9 +70,9 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
|
||||
@Override
|
||||
public synchronized void close() {
|
||||
this.clearCaches();
|
||||
if (this.solr != null) this.solr.close();
|
||||
this.solr = null;
|
||||
this.clearCache();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -81,7 +81,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
*/
|
||||
@Override
|
||||
public void clear() throws IOException {
|
||||
this.clearCache();
|
||||
this.clearCaches();
|
||||
if (this.solr != null) this.solr.clear();
|
||||
}
|
||||
|
||||
|
@ -119,7 +119,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
|
||||
@Override
|
||||
public void deleteByQuery(final String querystring) throws IOException {
|
||||
this.clearCache();
|
||||
this.clearCaches();
|
||||
this.solr.deleteByQuery(querystring);
|
||||
}
|
||||
|
||||
|
@ -261,7 +261,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
}
|
||||
|
||||
private void addToCache(SolrDocumentList list, boolean doccache) {
|
||||
if (MemoryControl.shortStatus()) clearCache();
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
for (final SolrDocument solrdoc: list) {
|
||||
addToCache(solrdoc, doccache);
|
||||
}
|
||||
|
|
|
@ -118,6 +118,12 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
|
|||
ensureAliveUpdateHandler();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clearCaches() {
|
||||
this.connector.clearCaches();
|
||||
this.idCache.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* used for debugging
|
||||
*/
|
||||
|
@ -326,10 +332,11 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Set<String> existsByIds(Collection<String> ids) throws IOException {
|
||||
public Set<String> existsByIds(Set<String> ids) throws IOException {
|
||||
HashSet<String> e = new HashSet<String>();
|
||||
if (ids == null || ids.size() == 0) return e;
|
||||
Collection<String> idsC = new HashSet<String>();
|
||||
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : e;
|
||||
Set<String> idsC = new HashSet<String>();
|
||||
for (String id: ids) {
|
||||
if (this.idCache.has(ASCII.getBytes(id))) {cacheSuccessSign(); e.add(id); continue;}
|
||||
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); continue;}
|
||||
|
|
|
@ -22,7 +22,6 @@
|
|||
package net.yacy.cora.federate.solr.connector;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
@ -35,6 +34,7 @@ import net.yacy.search.schema.CollectionSchema;
|
|||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
|
@ -48,10 +48,14 @@ import org.apache.solr.core.SolrCore;
|
|||
import org.apache.solr.handler.component.SearchHandler;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryRequestBase;
|
||||
import org.apache.solr.request.UnInvertedField;
|
||||
import org.apache.solr.response.ResultContext;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.DocSet;
|
||||
import org.apache.solr.search.QueryResultKey;
|
||||
import org.apache.solr.search.SolrCache;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.RefCounted;
|
||||
|
||||
|
@ -89,6 +93,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
|||
super.init(this.instance.getServer(coreName));
|
||||
}
|
||||
|
||||
public void clearCaches() {
|
||||
SolrConfig solrConfig = this.core.getSolrConfig();
|
||||
@SuppressWarnings("unchecked")
|
||||
SolrCache<String, UnInvertedField> fieldValueCache = solrConfig.fieldValueCacheConfig == null ? null : solrConfig.fieldValueCacheConfig.newInstance();
|
||||
if (fieldValueCache != null) fieldValueCache.clear();
|
||||
@SuppressWarnings("unchecked")
|
||||
SolrCache<Query, DocSet> filterCache= solrConfig.filterCacheConfig == null ? null : solrConfig.filterCacheConfig.newInstance();
|
||||
if (filterCache != null) filterCache.clear();
|
||||
@SuppressWarnings("unchecked")
|
||||
SolrCache<QueryResultKey, DocList> queryResultCache = solrConfig.queryResultCacheConfig == null ? null : solrConfig.queryResultCacheConfig.newInstance();
|
||||
if (queryResultCache != null) queryResultCache.clear();
|
||||
@SuppressWarnings("unchecked")
|
||||
SolrCache<Integer, Document> documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance();
|
||||
if (documentCache != null) documentCache.clear();
|
||||
}
|
||||
|
||||
public SolrInstance getInstance() {
|
||||
return this.instance;
|
||||
}
|
||||
|
@ -224,9 +244,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
|||
}
|
||||
|
||||
@Override
|
||||
public Set<String> existsByIds(Collection<String> ids) {
|
||||
public Set<String> existsByIds(Set<String> ids) {
|
||||
if (ids == null || ids.size() == 0) return new HashSet<String>();
|
||||
if (ids.size() == 1 && ids instanceof Set) return existsById(ids.iterator().next()) ? (Set<String>) ids : new HashSet<String>();
|
||||
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
|
||||
StringBuilder sb = new StringBuilder(); // construct something like "({!raw f=id}Ij7B63g-gSHA) OR ({!raw f=id}PBcGI3g-gSHA)"
|
||||
for (String id: ids) {
|
||||
sb.append("({!raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id).append(") OR ");
|
||||
|
|
|
@ -53,6 +53,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
this.solr0 = solr0;
|
||||
this.solr1 = solr1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clearCaches() {
|
||||
if (this.solr0 != null) this.solr0.clearCaches();
|
||||
if (this.solr1 != null) this.solr1.clearCaches();
|
||||
}
|
||||
|
||||
public boolean isConnected0() {
|
||||
return this.solr0 != null;
|
||||
|
@ -347,7 +353,9 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
|
|||
}
|
||||
|
||||
@Override
|
||||
public Set<String> existsByIds(Collection<String> ids) throws IOException {
|
||||
public Set<String> existsByIds(Set<String> ids) throws IOException {
|
||||
if (ids == null || ids.size() == 0) return new HashSet<String>();
|
||||
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
|
||||
if (this.solr0 != null && this.solr1 == null) return this.solr0.existsByIds(ids);
|
||||
if (this.solr0 == null && this.solr1 != null) return this.solr1.existsByIds(ids);
|
||||
Set<String> s = new HashSet<String>();
|
||||
|
|
|
@ -71,6 +71,11 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
|
|||
super.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clearCaches() {
|
||||
// we do not have a direct access to the caches here, thus we simply do nothing.
|
||||
}
|
||||
|
||||
@Override
|
||||
public QueryResponse getResponseByParams(ModifiableSolrParams params) throws IOException {
|
||||
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
|
||||
|
@ -134,4 +139,5 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
|
|||
}
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -36,7 +36,12 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
|
||||
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
|
||||
|
||||
|
||||
/**
|
||||
* clear all caches: inside solr and outside solr within the implementations of this interface
|
||||
*/
|
||||
public void clearCaches();
|
||||
|
||||
/**
|
||||
* get the size of the index
|
||||
* @return number of results if solr is queried with a catch-all pattern
|
||||
|
@ -106,7 +111,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
|||
* @return a collection of a subset of the ids which exist in the index
|
||||
* @throws IOException
|
||||
*/
|
||||
public Set<String> existsByIds(Collection<String> ids) throws IOException;
|
||||
public Set<String> existsByIds(Set<String> ids) throws IOException;
|
||||
|
||||
/**
|
||||
* check if a given document exists in solr
|
||||
|
|
|
@ -64,7 +64,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
|||
public SolrServer getServer() {
|
||||
return this.server;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void commit(final boolean softCommit) {
|
||||
synchronized (this.server) {
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.util.Collection;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.federate.solr.connector.CachedSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.ConcurrentUpdateSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
|
||||
|
@ -161,9 +160,9 @@ public class InstanceMirror {
|
|||
return msc;
|
||||
}
|
||||
|
||||
public void clearCache() {
|
||||
public void clearCaches() {
|
||||
for (SolrConnector csc: this.connectorCache.values()) {
|
||||
if (csc instanceof CachedSolrConnector) ((CachedSolrConnector) csc).clearCache();
|
||||
csc.clearCaches();
|
||||
}
|
||||
for (EmbeddedSolrConnector ssc: this.embeddedCache.values()) ssc.commit(true);
|
||||
}
|
||||
|
|
|
@ -1,195 +1,193 @@
|
|||
/**
|
||||
* HTMLResponseWriter
|
||||
* Copyright 2013 by Michael Peter Christen
|
||||
* First released 09.06.2013 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.federate.solr.responsewriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.federate.solr.SolrType;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.XML;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.QueryResponseWriter;
|
||||
import org.apache.solr.response.ResultContext;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.TextField;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
|
||||
public class HTMLResponseWriter implements QueryResponseWriter {
|
||||
|
||||
private static final Set<String> DEFAULT_FIELD_LIST = null;
|
||||
private static final Pattern dqp = Pattern.compile("\"");
|
||||
|
||||
public HTMLResponseWriter() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
|
||||
return "text/html";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(@SuppressWarnings("rawtypes") NamedList n) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
|
||||
NamedList<?> values = rsp.getValues();
|
||||
assert values.get("responseHeader") != null;
|
||||
assert values.get("response") != null;
|
||||
|
||||
writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
|
||||
//writer.write("<!--\n");
|
||||
//writer.write("this is a XHTML+RDFa file. It contains RDF annotations with dublin core properties\n");
|
||||
//writer.write("you can validate it with http://validator.w3.org/\n");
|
||||
//writer.write("-->\n");
|
||||
writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\"\n");
|
||||
writer.write(" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n");
|
||||
writer.write(" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n");
|
||||
writer.write(" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\">\n");
|
||||
writer.write("<head profile=\"http://www.w3.org/2003/g/data-view\">\n");
|
||||
//writer.write("<link rel=\"transformation\" href=\"http://www-sop.inria.fr/acacia/soft/RDFa2RDFXML.xsl\"/>\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
|
||||
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
|
||||
paramsList.remove("wt");
|
||||
String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
|
||||
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
|
||||
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</div>\n");
|
||||
|
||||
DocList response = ((ResultContext) values.get("response")).docs;
|
||||
final int sz = response.size();
|
||||
if (sz > 0) {
|
||||
SolrIndexSearcher searcher = request.getSearcher();
|
||||
DocIterator iterator = response.iterator();
|
||||
IndexSchema schema = request.getSchema();
|
||||
|
||||
int id = iterator.nextDoc();
|
||||
Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
|
||||
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
|
||||
|
||||
String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
|
||||
if (sz == 1) {
|
||||
writer.write("<title>" + title + "</title>\n</head><body>\n");
|
||||
} else {
|
||||
writer.write("<title>Document List</title>\n</head><body>\n");
|
||||
}
|
||||
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
|
||||
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
|
||||
|
||||
writeDoc(writer, tdoc, title);
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
id = iterator.nextDoc();
|
||||
doc = searcher.doc(id, DEFAULT_FIELD_LIST);
|
||||
tdoc = translateDoc(schema, doc);
|
||||
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
|
||||
writeDoc(writer, tdoc, title);
|
||||
}
|
||||
} else {
|
||||
writer.write("<title>No Document Found</title>\n</head><body>\n");
|
||||
}
|
||||
|
||||
writer.write("</body></html>\n");
|
||||
}
|
||||
|
||||
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
|
||||
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
|
||||
writer.write("<fieldset>\n");
|
||||
writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");
|
||||
writer.write("<dl>\n");
|
||||
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
|
||||
writer.write("<dt>");
|
||||
writer.write(entry.getKey());
|
||||
writer.write("</dt><dd>");
|
||||
XML.escapeAttributeValue(entry.getValue(), writer);
|
||||
writer.write("</dd>\n");
|
||||
}
|
||||
writer.write("</dl>\n");
|
||||
writer.write("</fieldset>\n");
|
||||
writer.write("</form>\n");
|
||||
}
|
||||
|
||||
static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
|
||||
List<IndexableField> fields = doc.getFields();
|
||||
int sz = fields.size();
|
||||
int fidx1 = 0, fidx2 = 0;
|
||||
LinkedHashMap<String, String> kv = new LinkedHashMap<String, String>();
|
||||
while (fidx1 < sz) {
|
||||
IndexableField value = fields.get(fidx1);
|
||||
String fieldName = value.name();
|
||||
fidx2 = fidx1 + 1;
|
||||
while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
|
||||
fidx2++;
|
||||
}
|
||||
SchemaField sf = schema.getFieldOrNull(fieldName);
|
||||
if (sf == null) sf = new SchemaField(fieldName, new TextField());
|
||||
FieldType type = sf.getType();
|
||||
|
||||
if (fidx1 + 1 == fidx2) {
|
||||
if (sf.multiValued()) {
|
||||
String sv = value.stringValue();
|
||||
kv.put(fieldName, field2string(type, sv));
|
||||
} else {
|
||||
kv.put(fieldName, field2string(type, value.stringValue()));
|
||||
}
|
||||
} else {
|
||||
for (int i = fidx1; i < fidx2; i++) {
|
||||
String sv = fields.get(i).stringValue();
|
||||
kv.put(fieldName + "_" + i, field2string(type, sv));
|
||||
}
|
||||
}
|
||||
|
||||
fidx1 = fidx2;
|
||||
}
|
||||
return kv;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
private static String field2string(final FieldType type, final String value) {
|
||||
String typeName = type.getTypeName();
|
||||
if (typeName.equals(SolrType.bool.printName())) {
|
||||
return "F".equals(value) ? "false" : "true";
|
||||
} else if (typeName.equals(SolrType.date.printName())) {
|
||||
return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// XML.escapeCharData(val, writer);
|
||||
}
|
||||
/**
|
||||
* HTMLResponseWriter
|
||||
* Copyright 2013 by Michael Peter Christen
|
||||
* First released 09.06.2013 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.federate.solr.responsewriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.federate.solr.SolrType;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.XML;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.QueryResponseWriter;
|
||||
import org.apache.solr.response.ResultContext;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.TextField;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
|
||||
public class HTMLResponseWriter implements QueryResponseWriter {
|
||||
|
||||
private static final Set<String> DEFAULT_FIELD_LIST = null;
|
||||
private static final Pattern dqp = Pattern.compile("\"");
|
||||
|
||||
public HTMLResponseWriter() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
|
||||
return "text/html";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(@SuppressWarnings("rawtypes") NamedList n) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
|
||||
NamedList<?> values = rsp.getValues();
|
||||
assert values.get("responseHeader") != null;
|
||||
assert values.get("response") != null;
|
||||
|
||||
writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
|
||||
//writer.write("<!--\n");
|
||||
//writer.write("this is a XHTML+RDFa file. It contains RDF annotations with dublin core properties\n");
|
||||
//writer.write("you can validate it with http://validator.w3.org/\n");
|
||||
//writer.write("-->\n");
|
||||
writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\"\n");
|
||||
writer.write(" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n");
|
||||
writer.write(" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n");
|
||||
writer.write(" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\">\n");
|
||||
writer.write("<head profile=\"http://www.w3.org/2003/g/data-view\">\n");
|
||||
//writer.write("<link rel=\"transformation\" href=\"http://www-sop.inria.fr/acacia/soft/RDFa2RDFXML.xsl\"/>\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
|
||||
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
|
||||
paramsList.remove("wt");
|
||||
String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
|
||||
|
||||
DocList response = ((ResultContext) values.get("response")).docs;
|
||||
final int sz = response.size();
|
||||
if (sz > 0) {
|
||||
SolrIndexSearcher searcher = request.getSearcher();
|
||||
DocIterator iterator = response.iterator();
|
||||
IndexSchema schema = request.getSchema();
|
||||
|
||||
int id = iterator.nextDoc();
|
||||
Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
|
||||
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
|
||||
|
||||
String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
|
||||
if (sz == 1) {
|
||||
writer.write("<title>" + title + "</title>\n</head><body>\n");
|
||||
} else {
|
||||
writer.write("<title>Document List</title>\n</head><body>\n");
|
||||
}
|
||||
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
|
||||
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
|
||||
|
||||
writeDoc(writer, tdoc, title);
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
id = iterator.nextDoc();
|
||||
doc = searcher.doc(id, DEFAULT_FIELD_LIST);
|
||||
tdoc = translateDoc(schema, doc);
|
||||
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
|
||||
writeDoc(writer, tdoc, title);
|
||||
}
|
||||
} else {
|
||||
writer.write("<title>No Document Found</title>\n</head><body>\n");
|
||||
}
|
||||
|
||||
writer.write("</body></html>\n");
|
||||
}
|
||||
|
||||
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
|
||||
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
|
||||
writer.write("<fieldset>\n");
|
||||
writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");
|
||||
writer.write("<dl>\n");
|
||||
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
|
||||
writer.write("<dt>");
|
||||
writer.write(entry.getKey());
|
||||
writer.write("</dt><dd>");
|
||||
XML.escapeAttributeValue(entry.getValue(), writer);
|
||||
writer.write("</dd>\n");
|
||||
}
|
||||
writer.write("</dl>\n");
|
||||
writer.write("</fieldset>\n");
|
||||
writer.write("</form>\n");
|
||||
}
|
||||
|
||||
static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
|
||||
List<IndexableField> fields = doc.getFields();
|
||||
int sz = fields.size();
|
||||
int fidx1 = 0, fidx2 = 0;
|
||||
LinkedHashMap<String, String> kv = new LinkedHashMap<String, String>();
|
||||
while (fidx1 < sz) {
|
||||
IndexableField value = fields.get(fidx1);
|
||||
String fieldName = value.name();
|
||||
fidx2 = fidx1 + 1;
|
||||
while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
|
||||
fidx2++;
|
||||
}
|
||||
SchemaField sf = schema.getFieldOrNull(fieldName);
|
||||
if (sf == null) sf = new SchemaField(fieldName, new TextField());
|
||||
FieldType type = sf.getType();
|
||||
|
||||
if (fidx1 + 1 == fidx2) {
|
||||
if (sf.multiValued()) {
|
||||
String sv = value.stringValue();
|
||||
kv.put(fieldName, field2string(type, sv));
|
||||
} else {
|
||||
kv.put(fieldName, field2string(type, value.stringValue()));
|
||||
}
|
||||
} else {
|
||||
for (int i = fidx1; i < fidx2; i++) {
|
||||
String sv = fields.get(i).stringValue();
|
||||
kv.put(fieldName + "_" + i, field2string(type, sv));
|
||||
}
|
||||
}
|
||||
|
||||
fidx1 = fidx2;
|
||||
}
|
||||
return kv;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
private static String field2string(final FieldType type, final String value) {
|
||||
String typeName = type.getTypeName();
|
||||
if (typeName.equals(SolrType.bool.printName())) {
|
||||
return "F".equals(value) ? "false" : "true";
|
||||
} else if (typeName.equals(SolrType.date.printName())) {
|
||||
return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// XML.escapeCharData(val, writer);
|
||||
}
|
||||
|
|
|
@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
|
|||
import net.yacy.crawler.retrieval.Request;
|
||||
import net.yacy.crawler.retrieval.SMBLoader;
|
||||
import net.yacy.crawler.robots.RobotsTxt;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.citation.CitationReference;
|
||||
import net.yacy.kelondro.rwi.IndexCell;
|
||||
import net.yacy.kelondro.workflow.WorkflowProcessor;
|
||||
|
@ -347,17 +348,10 @@ public final class CrawlStacker {
|
|||
|
||||
// check availability of parser and maxfilesize
|
||||
String warning = null;
|
||||
boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
|
||||
if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
|
||||
// dammit semicolon
|
||||
// TODO: remove this shit later
|
||||
Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
|
||||
loadImages = true;
|
||||
}
|
||||
ContentDomain contentDomain = entry.url().getContentDomainFromExt();
|
||||
if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
|
||||
contentDomain == ContentDomain.APP ||
|
||||
(!loadImages && contentDomain == ContentDomain.IMAGE) ||
|
||||
(contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
|
||||
contentDomain == ContentDomain.AUDIO ||
|
||||
contentDomain == ContentDomain.VIDEO ||
|
||||
contentDomain == ContentDomain.CTRL) {
|
||||
|
|
|
@ -182,6 +182,14 @@ public final class Cache {
|
|||
public static long getActualCacheSize() {
|
||||
return fileDBunbuffered.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* get the current number of entries in the cache (the size() of the underlying file database)
|
||||
* @return
|
||||
*/
|
||||
public static long getActualCacheDocCount() {
|
||||
return fileDBunbuffered.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* close the databases
|
||||
|
|
|
@ -41,7 +41,10 @@ import net.yacy.cora.document.encoding.UTF8;
|
|||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.order.NaturalOrder;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.kelondro.blob.MapHeap;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.index.RowHandleSet;
|
||||
|
||||
public class BookmarksDB {
|
||||
|
||||
|
@ -147,11 +150,6 @@ public class BookmarksDB {
|
|||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
public String addBookmark(final Bookmark bookmark){
|
||||
saveBookmark(bookmark);
|
||||
return bookmark.getUrlHash();
|
||||
|
||||
}
|
||||
|
||||
public Bookmark getBookmark(final String urlHash) throws IOException {
|
||||
try {
|
||||
|
@ -214,18 +212,13 @@ public class BookmarksDB {
|
|||
final TreeSet<String> set=new TreeSet<String>(new bookmarkComparator(true));
|
||||
final String tagHash=BookmarkHelper.tagHash(tagName);
|
||||
final Tag tag=getTag(tagHash);
|
||||
Set<String> hashes=new HashSet<String>();
|
||||
if (tag != null) {
|
||||
hashes=getTag(tagHash).getUrlHashes();
|
||||
}
|
||||
RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes();
|
||||
if (priv) {
|
||||
set.addAll(hashes);
|
||||
for (byte[] hash: hashes) set.add(ASCII.String(hash));
|
||||
} else {
|
||||
final Iterator<String> it=hashes.iterator();
|
||||
Bookmark bm;
|
||||
while(it.hasNext()){
|
||||
for (byte[] hash: hashes) {
|
||||
try {
|
||||
bm = getBookmark(it.next());
|
||||
Bookmark bm = getBookmark(ASCII.String(hash));
|
||||
if (bm != null && bm.getPublic()) {
|
||||
set.add(bm.getUrlHash());
|
||||
}
|
||||
|
@ -249,7 +242,7 @@ public class BookmarksDB {
|
|||
* retrieve an object of type Tag from the tagCache; returns null if no tag with this hash exists
|
||||
* @param hash an object of type String, containing a tagHash
|
||||
*/
|
||||
public Tag getTag(final String hash){
|
||||
private Tag getTag(final String hash){
|
||||
return this.tags.get(hash); //null if it does not exists
|
||||
}
|
||||
|
||||
|
@ -257,7 +250,7 @@ public class BookmarksDB {
|
|||
* store a Tag in tagsTable or remove an empty tag
|
||||
* @param tag an object of type Tag to be stored/removed
|
||||
*/
|
||||
public void putTag(final Tag tag){
|
||||
private void putTag(final Tag tag){
|
||||
if (tag == null) return;
|
||||
if (tag.isEmpty()) {
|
||||
this.tags.remove(tag.getTagHash());
|
||||
|
@ -266,7 +259,7 @@ public class BookmarksDB {
|
|||
}
|
||||
}
|
||||
|
||||
public void removeTag(final String hash) {
|
||||
private void removeTag(final String hash) {
|
||||
this.tags.remove(hash);
|
||||
}
|
||||
|
||||
|
@ -301,7 +294,7 @@ public class BookmarksDB {
|
|||
return set.iterator();
|
||||
}
|
||||
|
||||
public Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
|
||||
private Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
|
||||
final TreeSet<Tag> set=new TreeSet<Tag>((comp == SORT_SIZE) ? tagSizeComparator : tagComparator);
|
||||
Iterator<String> it=null;
|
||||
final Iterator<String> bit=getBookmarksIterator(tagName, priv);
|
||||
|
@ -347,14 +340,14 @@ public class BookmarksDB {
|
|||
|
||||
final Tag oldTag=getTag(BookmarkHelper.tagHash(oldName));
|
||||
if (oldTag != null) {
|
||||
final Set<String> urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
|
||||
final RowHandleSet urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
|
||||
removeTag(BookmarkHelper.tagHash(oldName)); // remove oldHash from TagsDB
|
||||
|
||||
Bookmark bookmark;
|
||||
Set<String> tagSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
|
||||
for (final String urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
|
||||
for (final byte[] urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
|
||||
try {
|
||||
bookmark = getBookmark(urlHash);
|
||||
bookmark = getBookmark(ASCII.String(urlHash));
|
||||
tagSet = bookmark.getTags();
|
||||
tagSet.remove(oldName);
|
||||
bookmark.setTags(tagSet, true); // might not be needed, but doesn't hurt
|
||||
|
@ -371,9 +364,9 @@ public class BookmarksDB {
|
|||
public void addTag(final String selectTag, final String newTag) {
|
||||
|
||||
Bookmark bookmark;
|
||||
for (final String urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
|
||||
for (final byte[] urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
|
||||
try {
|
||||
bookmark = getBookmark(urlHash);
|
||||
bookmark = getBookmark(ASCII.String(urlHash));
|
||||
bookmark.addTag(newTag);
|
||||
saveBookmark(bookmark);
|
||||
} catch (final IOException e) {
|
||||
|
@ -389,51 +382,24 @@ public class BookmarksDB {
|
|||
* Subclass of bookmarksDB, which provides the Tag object-type
|
||||
*/
|
||||
public class Tag {
|
||||
public static final String URL_HASHES = "urlHashes";
|
||||
public static final String TAG_NAME = "tagName";
|
||||
private final String tagHash;
|
||||
private final Map<String, String> mem;
|
||||
private Set<String> urlHashes;
|
||||
private final String tagName;
|
||||
private RowHandleSet urlHashes;
|
||||
|
||||
public Tag(final String hash, final Map<String, String> map){
|
||||
this.tagHash = hash;
|
||||
this.mem = map;
|
||||
if (this.mem.containsKey(URL_HASHES)) {
|
||||
this.urlHashes = ListManager.string2set(this.mem.get(URL_HASHES));
|
||||
} else {
|
||||
this.urlHashes = new HashSet<String>();
|
||||
}
|
||||
}
|
||||
|
||||
public Tag(final String name, final HashSet<String> entries){
|
||||
private Tag(final String name) {
|
||||
this.tagHash = BookmarkHelper.tagHash(name);
|
||||
this.mem = new HashMap<String, String>();
|
||||
//mem.put(URL_HASHES, listManager.arraylist2string(entries));
|
||||
this.urlHashes = entries;
|
||||
this.mem.put(TAG_NAME, name);
|
||||
}
|
||||
|
||||
public Tag(final String name){
|
||||
this(name, new HashSet<String>());
|
||||
}
|
||||
|
||||
public Map<String, String> getMap(){
|
||||
this.mem.put(URL_HASHES, ListManager.collection2string(this.urlHashes));
|
||||
return this.mem;
|
||||
this.tagName = name;
|
||||
this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the lowercase Tagname
|
||||
*/
|
||||
public String getTagName(){
|
||||
/*if(this.mem.containsKey(TAG_NAME)){
|
||||
return (String) this.mem.get(TAG_NAME);
|
||||
}
|
||||
return "";*/
|
||||
return getFriendlyName().toLowerCase();
|
||||
}
|
||||
|
||||
public String getTagHash(){
|
||||
private String getTagHash(){
|
||||
return this.tagHash;
|
||||
}
|
||||
|
||||
|
@ -441,37 +407,33 @@ public class BookmarksDB {
|
|||
* @return the tag name, with all uppercase chars
|
||||
*/
|
||||
public String getFriendlyName(){
|
||||
/*if(this.mem.containsKey(TAG_FRIENDLY_NAME)){
|
||||
return (String) this.mem.get(TAG_FRIENDLY_NAME);
|
||||
}
|
||||
return getTagName();*/
|
||||
if(this.mem.containsKey(TAG_NAME)){
|
||||
return this.mem.get(TAG_NAME);
|
||||
}
|
||||
return "notagname";
|
||||
return this.tagName;
|
||||
}
|
||||
|
||||
public Set<String> getUrlHashes(){
|
||||
private RowHandleSet getUrlHashes(){
|
||||
return this.urlHashes;
|
||||
}
|
||||
|
||||
public boolean hasPublicItems(){
|
||||
private boolean hasPublicItems(){
|
||||
return getBookmarksIterator(getTagName(), false).hasNext();
|
||||
}
|
||||
|
||||
public void addUrl(final String urlHash){
|
||||
this.urlHashes.add(urlHash);
|
||||
private void addUrl(final String urlHash){
|
||||
try {
|
||||
this.urlHashes.put(ASCII.getBytes(urlHash));
|
||||
} catch (SpaceExceededException e) {
|
||||
}
|
||||
}
|
||||
|
||||
public void delete(final String urlHash){
|
||||
this.urlHashes.remove(urlHash);
|
||||
private void delete(final String urlHash){
|
||||
this.urlHashes.remove(ASCII.getBytes(urlHash));
|
||||
}
|
||||
|
||||
public int size(){
|
||||
return this.urlHashes.size();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
private boolean isEmpty() {
|
||||
return this.urlHashes.isEmpty();
|
||||
}
|
||||
}
|
||||
|
@ -481,27 +443,19 @@ public class BookmarksDB {
|
|||
*/
|
||||
public class Bookmark {
|
||||
|
||||
public static final String BOOKMARK_URL = "bookmarkUrl";
|
||||
private static final String BOOKMARK_URL = "bookmarkUrl";
|
||||
public static final String BOOKMARK_TITLE = "bookmarkTitle";
|
||||
public static final String BOOKMARK_DESCRIPTION = "bookmarkDesc";
|
||||
public static final String BOOKMARK_TAGS = "bookmarkTags";
|
||||
public static final String BOOKMARK_PUBLIC = "bookmarkPublic";
|
||||
public static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
|
||||
public static final String BOOKMARK_OWNER = "bookmarkOwner";
|
||||
public static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
|
||||
private static final String BOOKMARK_TAGS = "bookmarkTags";
|
||||
private static final String BOOKMARK_PUBLIC = "bookmarkPublic";
|
||||
private static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
|
||||
private static final String BOOKMARK_OWNER = "bookmarkOwner";
|
||||
private static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
|
||||
private final String urlHash;
|
||||
private Set<String> tagNames;
|
||||
private long timestamp;
|
||||
private final Map<String, String> entry;
|
||||
|
||||
public Bookmark(final String urlHash, final Map<String, String> map) {
|
||||
this.entry = map;
|
||||
this.urlHash = urlHash;
|
||||
this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
|
||||
if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
|
||||
loadTimestamp();
|
||||
}
|
||||
|
||||
public Bookmark(final DigestURL url) {
|
||||
this.entry = new HashMap<String, String>();
|
||||
this.urlHash = ASCII.String(url.hash());
|
||||
|
@ -529,11 +483,15 @@ public class BookmarksDB {
|
|||
this(new DigestURL((url.indexOf("://") < 0) ? "http://" + url : url));
|
||||
}
|
||||
|
||||
public Bookmark(final Map<String, String> map) throws MalformedURLException {
|
||||
this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map);
|
||||
private Bookmark(final Map<String, String> map) throws MalformedURLException {
|
||||
this.entry = map;
|
||||
this.urlHash = ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash());
|
||||
this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
|
||||
if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
|
||||
loadTimestamp();
|
||||
}
|
||||
|
||||
Map<String, String> toMap() {
|
||||
private Map<String, String> toMap() {
|
||||
this.entry.put(BOOKMARK_TAGS, ListManager.collection2string(this.tagNames));
|
||||
this.entry.put(BOOKMARK_TIMESTAMP, String.valueOf(this.timestamp));
|
||||
return this.entry;
|
||||
|
@ -688,11 +646,11 @@ public class BookmarksDB {
|
|||
/**
|
||||
* Subclass of bookmarksDB, which provides the bookmarkIterator object-type
|
||||
*/
|
||||
public class bookmarkIterator implements Iterator<Bookmark> {
|
||||
private class bookmarkIterator implements Iterator<Bookmark> {
|
||||
|
||||
Iterator<byte[]> bookmarkIter;
|
||||
|
||||
public bookmarkIterator(final boolean up) throws IOException {
|
||||
private bookmarkIterator(final boolean up) throws IOException {
|
||||
//flushBookmarkCache(); //XXX: this will cost performance
|
||||
this.bookmarkIter = BookmarksDB.this.bookmarks.keys(up, false);
|
||||
//this.nextEntry = null;
|
||||
|
@ -722,14 +680,14 @@ public class BookmarksDB {
|
|||
/**
|
||||
* Comparator to sort objects of type Bookmark according to their timestamps
|
||||
*/
|
||||
public class bookmarkComparator implements Comparator<String> {
|
||||
private class bookmarkComparator implements Comparator<String> {
|
||||
|
||||
private final boolean newestFirst;
|
||||
|
||||
/**
|
||||
* @param newestFirst newest first, or oldest first?
|
||||
*/
|
||||
public bookmarkComparator(final boolean newestFirst){
|
||||
private bookmarkComparator(final boolean newestFirst){
|
||||
this.newestFirst = newestFirst;
|
||||
}
|
||||
|
||||
|
@ -752,13 +710,13 @@ public class BookmarksDB {
|
|||
}
|
||||
}
|
||||
|
||||
public static final TagComparator tagComparator = new TagComparator();
|
||||
public static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
|
||||
private static final TagComparator tagComparator = new TagComparator();
|
||||
private static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
|
||||
|
||||
/**
|
||||
* Comparator to sort objects of type Tag according to their names
|
||||
*/
|
||||
public static class TagComparator implements Comparator<Tag>, Serializable {
|
||||
private static class TagComparator implements Comparator<Tag>, Serializable {
|
||||
|
||||
/**
|
||||
* generated serial
|
||||
|
@ -772,7 +730,7 @@ public class BookmarksDB {
|
|||
|
||||
}
|
||||
|
||||
public static class TagSizeComparator implements Comparator<Tag>, Serializable {
|
||||
private static class TagSizeComparator implements Comparator<Tag>, Serializable {
|
||||
|
||||
/**
|
||||
* generated serial
|
||||
|
|
|
@ -26,12 +26,15 @@ package net.yacy.document.parser.html;
|
|||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Contains methods to convert between Unicode and XML/HTML encoding.
|
||||
*/
|
||||
public final class CharacterCoding {
|
||||
|
||||
/** Ampersand pattern */
|
||||
public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&"));
|
||||
/** Ampersand character in unicode encoding. */
|
||||
private static final char AMP_UNICODE = "\u0026".charAt(0);
|
||||
/** Ampersand character in HTML encoding. */
|
||||
|
@ -276,14 +279,15 @@ public final class CharacterCoding {
|
|||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Replaces HTML-encoded characters with unicode representation.
|
||||
* @param text text with character to replace
|
||||
* @return text with replaced characters
|
||||
*/
|
||||
public static String html2unicode(final String text) {
|
||||
public static String html2unicode(String text) {
|
||||
if (text == null) return null;
|
||||
text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary.
|
||||
int p = 0, p1, q;
|
||||
final StringBuilder sb = new StringBuilder(text.length());
|
||||
String s;
|
||||
|
|
|
@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void scrapeText(final char[] newtext, final String insideTag) {
|
||||
public void scrapeText(final char[] newtext0, final String insideTag) {
|
||||
// System.out.println("SCRAPE: " + UTF8.String(newtext));
|
||||
if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
|
||||
int p, pl, q, s = 0;
|
||||
|
||||
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
|
||||
|
||||
// match evaluation pattern
|
||||
this.evaluationScores.match(Element.text, newtext);
|
||||
|
||||
|
@ -466,7 +467,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
|
||||
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
|
||||
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
|
||||
final String href = tagopts.getProperty("href", EMPTY_STRING);
|
||||
String href = tagopts.getProperty("href", EMPTY_STRING);
|
||||
href = CharacterCoding.html2unicode(href);
|
||||
AnchorURL url;
|
||||
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
|
||||
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
|
||||
|
|
|
@ -32,27 +32,15 @@ import java.io.FileInputStream;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.Date;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.exceptions.CryptographyException;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
|
||||
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
|
||||
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDMMType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1CFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.util.PDFTextStripper;
|
||||
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
|
@ -222,25 +210,54 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
false,
|
||||
docDate)};
|
||||
}
|
||||
|
||||
@SuppressWarnings("static-access")
|
||||
|
||||
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
|
||||
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
|
||||
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
|
||||
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
|
||||
PDFont.clearResources();
|
||||
COSName.clearResources();
|
||||
PDType1Font.clearResources();
|
||||
PDTrueTypeFont.clearResources();
|
||||
PDType0Font.clearResources();
|
||||
PDType1AfmPfbFont.clearResources();
|
||||
PDType3Font.clearResources();
|
||||
PDType1CFont.clearResources();
|
||||
PDCIDFont.clearResources();
|
||||
PDCIDFontType0Font.clearResources();
|
||||
PDCIDFontType2Font.clearResources();
|
||||
PDMMType1Font.clearResources();
|
||||
PDSimpleFont.clearResources();
|
||||
ResourceCleaner cl = new ResourceCleaner();
|
||||
cl.clearClassResources("org.apache.pdfbox.cos.COSName");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont");
|
||||
}
|
||||
|
||||
@SuppressWarnings({ "unchecked", "rawtypes" })
|
||||
private static class ResourceCleaner {
|
||||
Method findLoadedClass;
|
||||
private ClassLoader sys;
|
||||
public ResourceCleaner() {
|
||||
try {
|
||||
this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
|
||||
this.findLoadedClass.setAccessible(true);
|
||||
this.sys = ClassLoader.getSystemClassLoader();
|
||||
} catch (Throwable e) {
|
||||
e.printStackTrace();
|
||||
this.findLoadedClass = null;
|
||||
this.sys = null;
|
||||
}
|
||||
}
|
||||
public void clearClassResources(String name) {
|
||||
if (this.findLoadedClass == null) return;
|
||||
try {
|
||||
Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
|
||||
if (pdfparserpainclass != null) {
|
||||
Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
|
||||
if (clearResources != null) clearResources.invoke(null);
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -37,12 +37,12 @@ public class CrashProtectionHandler extends HandlerWrapper implements Handler, H
|
|||
}
|
||||
|
||||
private void writeResponse(HttpServletRequest request, HttpServletResponse response, Exception exc) throws IOException {
|
||||
PrintWriter out = response.getWriter();
|
||||
out.println("Ops!");
|
||||
out.println();
|
||||
out.println("Message: " + exc.getMessage());
|
||||
exc.printStackTrace(out);
|
||||
response.setContentType("text/plain");
|
||||
response.setStatus(500);
|
||||
PrintWriter out = response.getWriter();
|
||||
out.println("Ops!");
|
||||
out.println();
|
||||
out.println("Message: " + exc.getMessage());
|
||||
exc.printStackTrace(out);
|
||||
response.setContentType("text/plain");
|
||||
response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -91,7 +91,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
|
|||
HttpServletResponse response) throws IOException, ServletException {
|
||||
|
||||
RequestHeader proxyHeaders = convertHeaderFromJetty(request);
|
||||
final String httpVer = (String) request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
|
||||
final String httpVer = request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
|
||||
setViaHeader (proxyHeaders, httpVer);
|
||||
proxyHeaders.remove(RequestHeader.KEEP_ALIVE);
|
||||
proxyHeaders.remove(RequestHeader.CONTENT_LENGTH);
|
||||
|
|
|
@ -27,7 +27,6 @@ package net.yacy.http;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
|
||||
import javax.servlet.RequestDispatcher;
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
|
|
@ -97,11 +97,6 @@ public class TemplateHandler extends AbstractHandler implements Handler {
|
|||
htDocsPath = Switchboard.getSwitchboard().htDocsPath.getPath();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doStop() throws Exception {
|
||||
super.doStop();
|
||||
}
|
||||
|
||||
/** Returns a path to the localized or default file according to the parameter localeSelection
|
||||
* @param path relative from htroot
|
||||
* @param localeSelection language of localized file; locale.language from switchboard is used if localeSelection.equals("") */
|
||||
|
|
|
@ -17,13 +17,13 @@ import java.net.SocketException;
|
|||
*/
|
||||
public interface YaCyHttpServer {
|
||||
|
||||
abstract public void startupServer() throws Exception;
|
||||
abstract public void stop() throws Exception;
|
||||
abstract public void setMaxSessionCount(int cnt);
|
||||
abstract public InetSocketAddress generateSocketAddress(String port) throws SocketException;
|
||||
abstract public int getMaxSessionCount();
|
||||
abstract public int getJobCount();
|
||||
abstract public boolean withSSL();
|
||||
abstract public void reconnect(int milsec);
|
||||
abstract public String getVersion();
|
||||
abstract void startupServer() throws Exception;
|
||||
abstract void stop() throws Exception;
|
||||
abstract void setMaxSessionCount(int cnt);
|
||||
abstract InetSocketAddress generateSocketAddress(String port) throws SocketException;
|
||||
abstract int getMaxSessionCount();
|
||||
abstract int getJobCount();
|
||||
abstract boolean withSSL();
|
||||
abstract void reconnect(int milsec);
|
||||
abstract String getVersion();
|
||||
}
|
||||
|
|
|
@ -25,7 +25,6 @@
|
|||
package net.yacy.peers;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -164,7 +163,7 @@ public class Transmission {
|
|||
final ReferenceContainer<WordReference> c = (remaining >= container.size()) ? container : trimContainer(container, remaining);
|
||||
// iterate through the entries in the container and check if the reference is in the repository
|
||||
final List<byte[]> notFoundx = new ArrayList<byte[]>();
|
||||
Collection<String> testids = new HashSet<String>();
|
||||
Set<String> testids = new HashSet<String>();
|
||||
Iterator<WordReference> i = c.entries();
|
||||
while (i.hasNext()) {
|
||||
final WordReference e = i.next();
|
||||
|
|
|
@ -129,7 +129,7 @@ public class ResourceObserver {
|
|||
if(MemoryControl.properState()) return Space.HIGH;
|
||||
|
||||
// clear some caches - @all: are there more of these, we could clear here?
|
||||
this.sb.index.clearCache();
|
||||
this.sb.index.clearCaches();
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
this.sb.trail.clear();
|
||||
Switchboard.urlBlacklist.clearblacklistCache();
|
||||
|
|
|
@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch {
|
|||
* @param ids a collection of url hashes
|
||||
* @return a map from the hash id to: if it exists, the name of the database, otherwise null
|
||||
*/
|
||||
public Map<String, HarvestProcess> urlExists(final Collection<String> ids) {
|
||||
public Map<String, HarvestProcess> urlExists(final Set<String> ids) {
|
||||
Set<String> e = this.index.exists(ids);
|
||||
Map<String, HarvestProcess> m = new HashMap<String, HarvestProcess>();
|
||||
for (String id: ids) {
|
||||
|
@ -2031,7 +2031,7 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
// clear caches if necessary
|
||||
if ( !MemoryControl.request(128000000L, false) ) {
|
||||
this.index.clearCache();
|
||||
this.index.clearCaches();
|
||||
SearchEventCache.cleanupEvents(false);
|
||||
this.trail.clear();
|
||||
GuiHandler.clear();
|
||||
|
@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch {
|
|||
) {
|
||||
// get the hyperlinks
|
||||
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
|
||||
boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
|
||||
if (loadImages) hl.putAll(Document.getImagelinks(documents));
|
||||
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
|
||||
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
|
||||
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
|
||||
if (response.profile().directDocByURL()) {
|
||||
if (!loadImages) hl.putAll(Document.getImagelinks(documents));
|
||||
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
|
||||
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
hl.putAll(Document.getApplinks(documents));
|
||||
hl.putAll(Document.getVideolinks(documents));
|
||||
hl.putAll(Document.getAudiolinks(documents));
|
||||
|
@ -2905,7 +2909,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// stacking may fail because of double occurrences of that url. Therefore
|
||||
// we must wait here until the url has actually disappeared
|
||||
int t = 100;
|
||||
Collection<String> ids = new ArrayList<String>(1); ids.add(ASCII.String(urlhash));
|
||||
Set<String> ids = new HashSet<String>(1); ids.add(ASCII.String(urlhash));
|
||||
while (t-- > 0 && this.index.exists(ids).size() > 0) {
|
||||
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
||||
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
|
||||
|
|
|
@ -323,7 +323,6 @@ public final class SwitchboardConstants {
|
|||
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
|
||||
* <p>Name of the setting for the maximum number of crawler threads that may be active at the same time</p>
|
||||
*/
|
||||
public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image";
|
||||
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
|
||||
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
|
||||
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
|
||||
|
|
|
@ -225,10 +225,10 @@ public final class Fulltext {
|
|||
}
|
||||
}
|
||||
|
||||
public void clearCache() {
|
||||
public void clearCaches() {
|
||||
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
|
||||
if (this.statsDump != null) this.statsDump.clear();
|
||||
this.solrInstances.clearCache();
|
||||
this.solrInstances.clearCaches();
|
||||
this.statsDump = null;
|
||||
}
|
||||
|
||||
|
@ -250,7 +250,7 @@ public final class Fulltext {
|
|||
for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear();
|
||||
}
|
||||
this.commit(false);
|
||||
this.solrInstances.clearCache();
|
||||
this.solrInstances.clearCaches();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -260,7 +260,7 @@ public final class Fulltext {
|
|||
if (instance != null) {
|
||||
for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear();
|
||||
}
|
||||
this.solrInstances.clearCache();
|
||||
this.solrInstances.clearCaches();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -400,7 +400,7 @@ public final class Fulltext {
|
|||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCache();
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
|
||||
|
@ -412,7 +412,7 @@ public final class Fulltext {
|
|||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCache();
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -432,7 +432,7 @@ public final class Fulltext {
|
|||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCache();
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -617,10 +617,11 @@ public final class Fulltext {
|
|||
* @param ids
|
||||
* @return a set of ids which exist in the database
|
||||
*/
|
||||
public Set<String> exists(Collection<String> ids) {
|
||||
public Set<String> exists(Set<String> ids) {
|
||||
HashSet<String> e = new HashSet<String>();
|
||||
if (ids == null || ids.size() == 0) return e;
|
||||
Collection<String> idsC = new HashSet<String>();
|
||||
if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
|
||||
Set<String> idsC = new HashSet<String>();
|
||||
idsC.addAll(ids);
|
||||
if (this.urlIndexFile != null) {
|
||||
Iterator<String> idsi = idsC.iterator();
|
||||
|
@ -751,12 +752,12 @@ public final class Fulltext {
|
|||
}
|
||||
|
||||
// export methods
|
||||
public Export export(final File f, final String filter, final int format, final boolean dom) {
|
||||
public Export export(final File f, final String filter, final String query, final int format, final boolean dom) {
|
||||
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
|
||||
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
|
||||
return this.exportthread;
|
||||
}
|
||||
this.exportthread = new Export(f, filter, format, dom);
|
||||
this.exportthread = new Export(f, filter, query, format, dom);
|
||||
this.exportthread.start();
|
||||
return this.exportthread;
|
||||
}
|
||||
|
@ -769,14 +770,15 @@ public final class Fulltext {
|
|||
private final File f;
|
||||
private final Pattern pattern;
|
||||
private int count;
|
||||
private String failure;
|
||||
private String failure, query;
|
||||
private final int format;
|
||||
private final boolean dom;
|
||||
|
||||
private Export(final File f, final String filter, final int format, boolean dom) {
|
||||
private Export(final File f, final String filter, final String query, final int format, boolean dom) {
|
||||
// format: 0=text, 1=html, 2=rss/xml
|
||||
this.f = f;
|
||||
this.pattern = filter == null ? null : Pattern.compile(filter);
|
||||
this.query = query == null? "*:*" : query;
|
||||
this.count = 0;
|
||||
this.failure = null;
|
||||
this.format = format;
|
||||
|
@ -805,7 +807,7 @@ public final class Fulltext {
|
|||
|
||||
|
||||
if (this.dom) {
|
||||
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
|
||||
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
|
||||
ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
|
||||
for (final String host: stats) {
|
||||
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
|
||||
|
@ -814,21 +816,19 @@ public final class Fulltext {
|
|||
this.count++;
|
||||
}
|
||||
} else {
|
||||
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
|
||||
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
|
||||
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
|
||||
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
|
||||
SolrDocument doc;
|
||||
ArrayList<?> title;
|
||||
String url, author, hash;
|
||||
String[] descriptions;
|
||||
String url, hash, title, author, description;
|
||||
Integer size;
|
||||
Date date;
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
|
||||
url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
|
||||
title = (ArrayList<?>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
|
||||
author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName());
|
||||
descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName());
|
||||
hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
|
||||
url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
|
||||
title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
|
||||
author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
|
||||
description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
|
||||
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
|
||||
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
|
||||
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
|
||||
|
@ -836,16 +836,14 @@ public final class Fulltext {
|
|||
pw.println(url);
|
||||
}
|
||||
if (this.format == 1) {
|
||||
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
|
||||
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
|
||||
}
|
||||
if (this.format == 2) {
|
||||
pw.println("<item>");
|
||||
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
|
||||
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
|
||||
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
|
||||
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
|
||||
if (descriptions != null && descriptions.length > 0) {
|
||||
for (String d: descriptions) pw.println("<description>" + CharacterCoding.unicode2xml(d, true) + "</description>");
|
||||
}
|
||||
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
|
||||
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
|
||||
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
|
||||
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
|
||||
|
@ -883,6 +881,13 @@ public final class Fulltext {
|
|||
public int count() {
|
||||
return this.count;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String getStringFrom(final Object o) {
|
||||
if (o == null) return "";
|
||||
if (o instanceof ArrayList) return ((ArrayList<String>) o).get(0);
|
||||
return (String) o;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -29,7 +29,6 @@ package net.yacy.search.index;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -443,7 +442,7 @@ public class Segment {
|
|||
* @param ids
|
||||
* @return a set of ids which exist in the database
|
||||
*/
|
||||
public Set<String> exists(final Collection<String> ids) {
|
||||
public Set<String> exists(final Set<String> ids) {
|
||||
return this.fulltext.exists(ids);
|
||||
}
|
||||
|
||||
|
@ -504,10 +503,10 @@ public class Segment {
|
|||
}
|
||||
}
|
||||
|
||||
public void clearCache() {
|
||||
public void clearCaches() {
|
||||
if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache();
|
||||
if (this.termIndex != null) this.termIndex.clearCache();
|
||||
this.fulltext.clearCache();
|
||||
this.fulltext.clearCaches();
|
||||
}
|
||||
|
||||
public File getLocation() {
|
||||
|
|
|
@ -242,7 +242,8 @@ public class QueryGoal {
|
|||
// add filter to prevent that results come from failed urls
|
||||
q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND (");
|
||||
q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR ");
|
||||
q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))");
|
||||
q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif) OR");
|
||||
q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))");
|
||||
|
||||
// parse special requests
|
||||
if (isCatchall()) return q;
|
||||
|
|
|
@ -898,17 +898,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
|
||||
hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
|
||||
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
|
||||
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts");
|
||||
int countcheck = 0;
|
||||
for (String host: hostscore.keyList(true)) {
|
||||
// Patch the citation index for links with canonical tags.
|
||||
// This shall fulfill the following requirement:
|
||||
// If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C.
|
||||
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
|
||||
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
|
||||
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]";
|
||||
long patchquerycount = collectionConnector.getCountByQuery(patchquery);
|
||||
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 60000L, 50,
|
||||
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
|
||||
SolrDocument doc_B;
|
||||
int patchquerycountcheck = 0;
|
||||
try {
|
||||
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
// find all documents which link to the canonical doc
|
||||
|
@ -926,10 +928,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
CitationReference doc_A_citation = doc_A_ids_iterator.next();
|
||||
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
|
||||
}
|
||||
patchquerycountcheck++;
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
} catch (SpaceExceededException e) {
|
||||
}
|
||||
if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
|
||||
|
||||
// do the citation rank computation
|
||||
if (hostscore.get(host) <= 0) continue;
|
||||
|
@ -939,12 +943,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
while (convergence_attempts++ < 30) {
|
||||
if (crh.convergenceStep()) break;
|
||||
}
|
||||
ConcurrentLog.info("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps");
|
||||
ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
|
||||
// we have now the cr for all documents of a specific host; we store them for later use
|
||||
Map<byte[], CRV> crn = crh.normalize();
|
||||
//crh.log(crn);
|
||||
ranking.putAll(crn); // accumulate this here for usage in document update later
|
||||
countcheck++;
|
||||
}
|
||||
if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck);
|
||||
} catch (final IOException e2) {
|
||||
hostscore = new ClusteredScoreMap<String>();
|
||||
}
|
||||
|
@ -952,13 +958,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// process all documents at the webgraph for the outgoing links of this document
|
||||
SolrDocument doc;
|
||||
if (webgraphConnector != null) {
|
||||
for (String host: hostscore.keyList(true)) {
|
||||
if (hostscore.get(host) <= 0) continue;
|
||||
// select all webgraph edges and modify their cr value
|
||||
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
|
||||
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
|
||||
0, 10000000, 60000, 50);
|
||||
try {
|
||||
try {
|
||||
for (String host: hostscore.keyList(true)) {
|
||||
if (hostscore.get(host) <= 0) continue;
|
||||
// select all webgraph edges and modify their cr value
|
||||
String query = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"";
|
||||
long count = webgraphConnector.getCountByQuery(query);
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph");
|
||||
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 60000, 50);
|
||||
int countcheck = 0;
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
boolean changed = false;
|
||||
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
|
||||
|
@ -978,21 +986,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
webgraphConnector.add(sid);
|
||||
} catch (SolrException e) {
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
countcheck++;
|
||||
}
|
||||
} catch (final InterruptedException e) {}
|
||||
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + countcheck);
|
||||
}
|
||||
} catch (final IOException e2) {
|
||||
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
|
||||
} catch (final InterruptedException e3) {
|
||||
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
|
||||
}
|
||||
}
|
||||
|
||||
// process all documents in collection
|
||||
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
|
||||
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
|
||||
0, 10000, 60000, 50);
|
||||
String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
|
||||
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
|
||||
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
|
||||
Set<String> uniqueURLs = new HashSet<String>();
|
||||
try {
|
||||
long count = collectionConnector.getCountByQuery(query);
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
|
||||
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000, 60000, 50);
|
||||
int countcheck = 0;
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
// for each to-be-processed entry work on the process tag
|
||||
Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
|
||||
|
@ -1031,8 +1047,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (!hostExtentCache.containsKey(hosthash)) {
|
||||
StringBuilder q = new StringBuilder();
|
||||
q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
|
||||
long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
|
||||
hostExtentCache.put(hosthash, count);
|
||||
long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
|
||||
hostExtentCache.put(hosthash, hostExtentCount);
|
||||
}
|
||||
if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
|
||||
|
||||
|
@ -1047,13 +1063,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
proccount++;
|
||||
} catch (final Throwable e1) {
|
||||
}
|
||||
countcheck++;
|
||||
}
|
||||
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck);
|
||||
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
|
||||
proccount_clickdepthchange + " clickdepth changes, " +
|
||||
proccount_referencechange + " reference-count changes, " +
|
||||
proccount_uniquechange + " unique field changes, " +
|
||||
proccount_citationchange + " citation ranking changes.");
|
||||
} catch (final InterruptedException e) {
|
||||
} catch (final InterruptedException e2) {
|
||||
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
|
||||
} catch (IOException e3) {
|
||||
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
|
||||
}
|
||||
return proccount;
|
||||
}
|
||||
|
@ -1148,8 +1169,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (entry == null || entry.getValue() == null) continue;
|
||||
try {
|
||||
String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName());
|
||||
ConcurrentLog.info("CollectionConfiguration.CRHost", "CR for " + url);
|
||||
ConcurrentLog.info("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString());
|
||||
ConcurrentLog.info("CollectionConfiguration", "CR for " + url);
|
||||
ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString());
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user