mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
YaCy can now use the Solr index to compute text snippets. This makes search result preparation much faster, because no document fetching and parsing is necessary anymore.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7943 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
0819e1d397
commit
85a5487d6d
|
@ -66,6 +66,7 @@ import de.anomic.search.RankingProcess;
|
|||
import de.anomic.search.ReferenceOrder;
|
||||
import de.anomic.search.SearchEventCache;
|
||||
import de.anomic.search.Segment;
|
||||
import de.anomic.search.Segments;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.search.SwitchboardConstants;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -86,7 +87,7 @@ public class IndexControlRWIs_p {
|
|||
prop.put("keyhash", "");
|
||||
prop.put("result", "");
|
||||
prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0);
|
||||
prop.put("cleanup_solr", sb.solrConnector == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
|
||||
prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
|
||||
|
||||
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
|
||||
int i = 0;
|
||||
|
@ -157,7 +158,7 @@ public class IndexControlRWIs_p {
|
|||
segment.clear();
|
||||
}
|
||||
if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
|
||||
sb.solrConnector.clear();
|
||||
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
|
|
@ -33,9 +33,12 @@ import net.yacy.cora.document.UTF8;
|
|||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.services.federated.solr.SolrChardingConnector;
|
||||
import net.yacy.cora.services.federated.solr.SolrChardingSelection;
|
||||
import net.yacy.cora.services.federated.solr.SolrConnector;
|
||||
import net.yacy.cora.services.federated.solr.SolrScheme;
|
||||
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
|
||||
import net.yacy.cora.storage.ConfigurationSet;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import de.anomic.search.Segments;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
@ -75,8 +78,8 @@ public class IndexFederated_p {
|
|||
|
||||
if (solrWasOn) {
|
||||
// switch off
|
||||
sb.solrConnector.close();
|
||||
sb.solrConnector = null;
|
||||
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close();
|
||||
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
|
||||
}
|
||||
|
||||
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
|
||||
|
@ -85,10 +88,10 @@ public class IndexFederated_p {
|
|||
// switch on
|
||||
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
|
||||
try {
|
||||
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
|
||||
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
sb.solrConnector = null;
|
||||
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,12 +113,13 @@ public class IndexFederated_p {
|
|||
}
|
||||
|
||||
// show solr host table
|
||||
if (sb.solrConnector == null) {
|
||||
if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) {
|
||||
prop.put("table", 0);
|
||||
} else {
|
||||
prop.put("table", 1);
|
||||
final long[] size = sb.solrConnector.getSizeList();
|
||||
final String[] urls = sb.solrConnector.getAdminInterfaceList();
|
||||
final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr();
|
||||
final long[] size = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()};
|
||||
final String[] urls = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()};
|
||||
boolean dark = false;
|
||||
for (int i = 0; i < size.length; i++) {
|
||||
prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark;
|
||||
|
@ -126,7 +130,7 @@ public class IndexFederated_p {
|
|||
}
|
||||
|
||||
// write scheme
|
||||
SolrScheme scheme = (sb.solrConnector == null) ? null : sb.solrConnector.getScheme();
|
||||
SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
|
||||
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
|
||||
if (scheme == null) {
|
||||
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
|
||||
|
|
|
@ -62,6 +62,7 @@ public class CrawlQueues {
|
|||
|
||||
private static final String ERROR_DB_FILENAME = "urlError3.db";
|
||||
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
|
||||
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
|
||||
|
||||
protected Switchboard sb;
|
||||
protected Log log;
|
||||
|
@ -81,8 +82,8 @@ public class CrawlQueues {
|
|||
this.log.logConfig("Starting Crawling Management");
|
||||
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
|
||||
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
|
||||
this.errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
|
||||
this.delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
|
||||
this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
|
||||
this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
|
||||
}
|
||||
|
||||
public void relocate(final File newQueuePath) {
|
||||
|
@ -93,8 +94,8 @@ public class CrawlQueues {
|
|||
|
||||
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
|
||||
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
|
||||
this.errorURL = new ZURL(this.sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
|
||||
this.delegatedURL = new ZURL(this.sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
|
||||
this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
|
||||
this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
|
@ -249,7 +250,7 @@ public class CrawlQueues {
|
|||
return true;
|
||||
}
|
||||
try {
|
||||
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null));
|
||||
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
|
||||
Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
|
||||
} catch (final InterruptedException e) {
|
||||
Log.logException(e);
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentLinkedQueue;
|
|||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.services.federated.solr.SolrChardingConnector;
|
||||
import net.yacy.cora.services.federated.solr.SolrConnector;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.Index;
|
||||
|
@ -76,10 +77,10 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
// the class object
|
||||
private Index urlIndex;
|
||||
private final ConcurrentLinkedQueue<byte[]> stack;
|
||||
private final SolrChardingConnector solrConnector;
|
||||
private final SolrConnector solrConnector;
|
||||
|
||||
public ZURL(
|
||||
final SolrChardingConnector solrConnector,
|
||||
final SolrConnector solrConnector,
|
||||
final File cachePath,
|
||||
final String tablename,
|
||||
final boolean startWithEmptyFile,
|
||||
|
|
|
@ -31,11 +31,13 @@ import java.util.Iterator;
|
|||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.ranking.ScoreMap;
|
||||
import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
|
||||
import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement;
|
||||
import net.yacy.cora.services.federated.solr.SolrConnector;
|
||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
|
@ -46,6 +48,10 @@ import net.yacy.kelondro.logging.Log;
|
|||
import net.yacy.kelondro.util.EventTracker;
|
||||
import net.yacy.kelondro.util.MemoryControl;
|
||||
import net.yacy.repository.LoaderDispatcher;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
|
||||
import de.anomic.data.WorkTables;
|
||||
import de.anomic.http.client.Cache;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
@ -322,6 +328,7 @@ public class ResultFetcher {
|
|||
private final int neededResults;
|
||||
private final Pattern snippetPattern;
|
||||
private boolean shallrun;
|
||||
private final SolrConnector solr;
|
||||
|
||||
public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) {
|
||||
this.id = id;
|
||||
|
@ -331,6 +338,7 @@ public class ResultFetcher {
|
|||
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
|
||||
this.neededResults = neededResults;
|
||||
this.shallrun = true;
|
||||
this.solr = ResultFetcher.this.rankingProcess.getQuery().getSegment().getSolr();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -373,8 +381,18 @@ public class ResultFetcher {
|
|||
}
|
||||
if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;
|
||||
|
||||
// in case that we have an attached solr, we load also the solr document
|
||||
String solrContent = null;
|
||||
if (this.solr != null) {
|
||||
SolrDocument sd = null;
|
||||
final SolrDocumentList sdl = this.solr.get("id:" + ASCII.String(page.hash()), 0, 1);
|
||||
if (sdl.size() > 0) sd = sdl.get(0);
|
||||
if (sd != null) solrContent = this.solr.getScheme().solrGetText(sd);
|
||||
}
|
||||
|
||||
|
||||
loops++;
|
||||
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
|
||||
resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
|
||||
if (resultEntry == null) continue; // the entry had some problems, cannot be used
|
||||
rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
|
||||
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
|
||||
|
@ -412,7 +430,7 @@ public class ResultFetcher {
|
|||
}
|
||||
}
|
||||
|
||||
protected ResultEntry fetchSnippet(final URIMetadataRow page, final CacheStrategy cacheStrategy) {
|
||||
protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
|
||||
// Snippet Fetching can has 3 modes:
|
||||
// 0 - do not fetch snippets
|
||||
// 1 - fetch snippets offline only
|
||||
|
@ -429,6 +447,7 @@ public class ResultFetcher {
|
|||
if (cacheStrategy == null) {
|
||||
final TextSnippet snippet = new TextSnippet(
|
||||
null,
|
||||
solrText,
|
||||
metadata,
|
||||
this.snippetFetchWordHashes,
|
||||
null,
|
||||
|
@ -445,6 +464,7 @@ public class ResultFetcher {
|
|||
startTime = System.currentTimeMillis();
|
||||
final TextSnippet snippet = new TextSnippet(
|
||||
this.loader,
|
||||
solrText,
|
||||
metadata,
|
||||
this.snippetFetchWordHashes,
|
||||
cacheStrategy,
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.util.TreeSet;
|
|||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.services.federated.solr.SolrConnector;
|
||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
|
@ -81,6 +82,7 @@ public class Segment {
|
|||
protected final IndexCell<WordReference> termIndex;
|
||||
//private final IndexCell<NavigationReference> authorNavIndex;
|
||||
protected final MetadataRepository urlMetadata;
|
||||
private SolrConnector solr;
|
||||
private final File segmentPath;
|
||||
|
||||
public Segment(
|
||||
|
@ -98,6 +100,7 @@ public class Segment {
|
|||
|
||||
this.log = log;
|
||||
this.segmentPath = segmentPath;
|
||||
this.solr = null;
|
||||
|
||||
this.termIndex = new IndexCell<WordReference>(
|
||||
segmentPath,
|
||||
|
@ -126,6 +129,14 @@ public class Segment {
|
|||
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
|
||||
}
|
||||
|
||||
public void connectSolr(final SolrConnector solr) {
|
||||
this.solr = solr;
|
||||
}
|
||||
|
||||
public SolrConnector getSolr() {
|
||||
return this.solr;
|
||||
}
|
||||
|
||||
public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) {
|
||||
final File oldCellPath = new File(oldSegmentPath, "RICELL");
|
||||
if (!oldCellPath.exists()) return;
|
||||
|
@ -254,6 +265,7 @@ public class Segment {
|
|||
public void close() {
|
||||
this.termIndex.close();
|
||||
this.urlMetadata.close();
|
||||
if (this.solr != null) this.solr.close();
|
||||
}
|
||||
|
||||
public URIMetadataRow storeDocument(
|
||||
|
|
|
@ -38,13 +38,13 @@ import net.yacy.kelondro.rwi.IndexCell;
|
|||
|
||||
|
||||
public class Segments implements Iterable<Segment> {
|
||||
|
||||
|
||||
/**
|
||||
* process enumeration type
|
||||
* defines constants that can be used to assign process-related segment names
|
||||
*/
|
||||
public enum Process {
|
||||
|
||||
|
||||
RECEIPTS,
|
||||
QUERIES,
|
||||
DHTIN,
|
||||
|
@ -59,7 +59,7 @@ public class Segments implements Iterable<Segment> {
|
|||
throw new UnsupportedOperationException("toString not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private final Log log;
|
||||
private final File segmentsPath;
|
||||
private final int entityCacheMaxSize;
|
||||
|
@ -68,7 +68,7 @@ public class Segments implements Iterable<Segment> {
|
|||
private final HashMap<Process, String> process_assignment;
|
||||
private final boolean useTailCache;
|
||||
private final boolean exceed134217727;
|
||||
|
||||
|
||||
public Segments(
|
||||
final Log log,
|
||||
final File segmentsPath,
|
||||
|
@ -96,41 +96,41 @@ public class Segments implements Iterable<Segment> {
|
|||
this.process_assignment.put(Process.PUBLIC, "default");
|
||||
this.process_assignment.put(Process.SURROGATES, "default");
|
||||
}
|
||||
|
||||
public void setSegment(Process process, String segmentName) {
|
||||
|
||||
public void setSegment(final Process process, final String segmentName) {
|
||||
this.process_assignment.put(process, segmentName);
|
||||
}
|
||||
|
||||
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) {
|
||||
|
||||
public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) {
|
||||
if (!oldSingleSegment.exists()) return;
|
||||
File newSegmentPath = new File(newSegmentsPath, newSegmentName);
|
||||
final File newSegmentPath = new File(newSegmentsPath, newSegmentName);
|
||||
if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
|
||||
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
|
||||
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
|
||||
|
||||
String[] oldFiles = oldSingleSegment.list();
|
||||
for (String oldFile: oldFiles) {
|
||||
|
||||
final String[] oldFiles = oldSingleSegment.list();
|
||||
for (final String oldFile: oldFiles) {
|
||||
if (oldFile.startsWith("text.")) {
|
||||
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String[] segmentNames() {
|
||||
return this.segments.keySet().toArray(new String[this.segments.size()]);
|
||||
}
|
||||
|
||||
|
||||
public boolean segmentExist(final String segmentName) {
|
||||
return segments.containsKey(segmentName);
|
||||
return this.segments.containsKey(segmentName);
|
||||
}
|
||||
|
||||
|
||||
public Segment segment(final Process process) {
|
||||
return segment(this.process_assignment.get(process));
|
||||
}
|
||||
|
||||
|
||||
public Segment segment(final String segmentName) {
|
||||
if (segments == null) return null;
|
||||
Segment segment = segments.get(segmentName);
|
||||
if (this.segments == null) return null;
|
||||
Segment segment = this.segments.get(segmentName);
|
||||
if (segment == null) {
|
||||
// generate the segment
|
||||
try {
|
||||
|
@ -141,7 +141,7 @@ public class Segments implements Iterable<Segment> {
|
|||
this.maxFileSize,
|
||||
this.useTailCache,
|
||||
this.exceed134217727);
|
||||
} catch (IOException e) {
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
return null;
|
||||
}
|
||||
|
@ -149,28 +149,28 @@ public class Segments implements Iterable<Segment> {
|
|||
}
|
||||
return segment;
|
||||
}
|
||||
|
||||
|
||||
public long URLCount() {
|
||||
if (this.segments == null) return 0;
|
||||
long c = 0;
|
||||
for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size();
|
||||
for (final Segment s: this.segments.values()) c += s.urlMetadata().size();
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
public long RWICount() {
|
||||
if (this.segments == null) return 0;
|
||||
long c = 0;
|
||||
for (Segment s: this.segments.values()) c += (long) s.termIndex().sizesMax();
|
||||
for (final Segment s: this.segments.values()) c += s.termIndex().sizesMax();
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
public int RWIBufferCount() {
|
||||
if (this.segments == null) return 0;
|
||||
int c = 0;
|
||||
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
|
||||
for (final Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
public MetadataRepository urlMetadata(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).urlMetadata();
|
||||
}
|
||||
|
@ -178,11 +178,11 @@ public class Segments implements Iterable<Segment> {
|
|||
public IndexCell<WordReference> termIndex(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).termIndex();
|
||||
}
|
||||
|
||||
|
||||
public void clear(final Process process) {
|
||||
segment(this.process_assignment.get(process)).clear();
|
||||
}
|
||||
|
||||
|
||||
public File getLocation(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).getLocation();
|
||||
}
|
||||
|
@ -190,16 +190,16 @@ public class Segments implements Iterable<Segment> {
|
|||
public void close(final Process process) {
|
||||
segment(this.process_assignment.get(process)).close();
|
||||
}
|
||||
|
||||
|
||||
public void close() {
|
||||
if (segments != null) for (Segment s: this.segments.values()) s.close();
|
||||
if (this.segments != null) for (final Segment s: this.segments.values()) s.close();
|
||||
this.segments = null;
|
||||
}
|
||||
|
||||
public void finalize() {
|
||||
this.close();
|
||||
}
|
||||
|
||||
|
||||
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) {
|
||||
return segment(segmentName).getReferenceCleaner(startHash);
|
||||
}
|
||||
|
|
|
@ -247,7 +247,6 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
private final Semaphore shutdownSync = new Semaphore(0);
|
||||
private boolean terminate = false;
|
||||
public SolrChardingConnector solrConnector = null;
|
||||
|
||||
//private Object crawlingPausedSync = new Object();
|
||||
//private boolean crawlingIsPaused = false;
|
||||
|
@ -592,10 +591,10 @@ public final class Switchboard extends serverSwitch {
|
|||
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
|
||||
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
|
||||
try {
|
||||
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
|
||||
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
this.solrConnector = null;
|
||||
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
|
||||
}
|
||||
|
||||
// start a loader
|
||||
|
@ -1314,7 +1313,6 @@ public final class Switchboard extends serverSwitch {
|
|||
Cache.close();
|
||||
this.tables.close();
|
||||
Domains.close();
|
||||
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)) this.solrConnector.close();
|
||||
AccessTracker.dumpLog(new File("DATA/LOG/queries.log"));
|
||||
UPnP.deletePortMapping();
|
||||
Tray.removeTray();
|
||||
|
@ -1989,7 +1987,7 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
|
||||
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
|
||||
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
|
||||
if (this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
|
||||
// send the documents to solr
|
||||
for (final Document doc: in.documents) {
|
||||
try {
|
||||
|
@ -2000,7 +1998,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// in case that this happens it appears that the doc id is the right one
|
||||
}
|
||||
try {
|
||||
this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc);
|
||||
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(id, in.queueEntry.getResponseHeader(), doc);
|
||||
} catch (final IOException e) {
|
||||
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package de.anomic.search;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
|
@ -34,6 +35,7 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||
import net.yacy.cora.storage.ARC;
|
||||
import net.yacy.cora.storage.ConcurrentARC;
|
||||
|
@ -140,6 +142,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
|
||||
public TextSnippet(
|
||||
final LoaderDispatcher loader,
|
||||
final String solrText,
|
||||
final URIMetadataRow.Components comp,
|
||||
final HandleSet queryhashes,
|
||||
final CacheStrategy cacheStrategy,
|
||||
|
@ -156,7 +159,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
}
|
||||
|
||||
// try to get snippet from snippetCache
|
||||
ResultClass source = ResultClass.SOURCE_CACHE;
|
||||
final ResultClass source = ResultClass.SOURCE_CACHE;
|
||||
final String wordhashes = yacySearch.set2string(queryhashes);
|
||||
final String urls = ASCII.String(url.hash());
|
||||
String snippetLine = snippetsCache.get(wordhashes, urls);
|
||||
|
@ -165,32 +168,37 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
init(url.hash(), snippetLine, source, null);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// try to get the snippet from a document at the cache (or in the web)
|
||||
// this requires that the document is parsed after loading
|
||||
String textline = null;
|
||||
HandleSet remainingHashes = queryhashes;
|
||||
{ //encapsulate potential expensive sentences
|
||||
final Collection<StringBuilder> sentences;
|
||||
{ //encapsulate potential expensive document
|
||||
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
|
||||
if (document == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
* COMPUTE SNIPPET
|
||||
* =========================================================================== */
|
||||
// we have found a parseable non-empty file: use the lines
|
||||
|
||||
// compute snippet from text
|
||||
sentences = document.getSentences(pre);
|
||||
document.close();
|
||||
} //encapsulate potential expensive document END
|
||||
|
||||
if (sentences == null) {
|
||||
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
|
||||
return;
|
||||
}
|
||||
|
||||
Collection<StringBuilder> sentences = null;
|
||||
|
||||
// try the solr text first
|
||||
if (solrText != null) {
|
||||
// compute sentences from solr query
|
||||
sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
|
||||
}
|
||||
|
||||
// if then no sentences are found, we fail-over to get the content from the re-loaded document
|
||||
if (sentences == null) {
|
||||
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
|
||||
if (document == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
// compute sentences from parsed document
|
||||
sentences = document.getSentences(pre);
|
||||
document.close();
|
||||
|
||||
if (sentences == null) {
|
||||
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
|
||||
textline = tsr.getSnippet();
|
||||
|
@ -227,7 +235,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
// document.close();
|
||||
init(url.hash(), snippetLine, source, null);
|
||||
}
|
||||
|
||||
|
||||
private Document loadDocument(
|
||||
final LoaderDispatcher loader,
|
||||
final URIMetadataRow.Components comp,
|
||||
|
|
|
@ -34,14 +34,13 @@ import net.yacy.cora.protocol.Domains;
|
|||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
|
||||
public class SolrChardingConnector {
|
||||
public class SolrChardingConnector implements SolrConnector {
|
||||
|
||||
private final List<SolrSingleConnector> connectors;
|
||||
private final SolrScheme scheme;
|
||||
|
@ -164,13 +163,7 @@ public class SolrChardingConnector {
|
|||
final long[] size = new long[this.connectors.size()];
|
||||
int i = 0;
|
||||
for (final SolrSingleConnector connector: this.connectors) {
|
||||
try {
|
||||
final SolrDocumentList list = connector.get("*:*", 0, 1);
|
||||
size[i++] = list.getNumFound();
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
size[i++] = 0;
|
||||
}
|
||||
size[i++] = connector.getSize();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
/**
|
||||
* SolrConnector
|
||||
* Copyright 2011 by Michael Peter Christen
|
||||
* First released 13.09.2011 at http://yacy.net
|
||||
*
|
||||
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
|
||||
* $LastChangedRevision: 7654 $
|
||||
* $LastChangedBy: orbiter $
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services.federated.solr;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
|
||||
public interface SolrConnector {
|
||||
|
||||
/**
|
||||
* with a scheme the fields of a SolrDocument can be translated to actual data values
|
||||
* @return the solr scheme that can translate the SolrDocument
|
||||
*/
|
||||
public SolrScheme getScheme();
|
||||
|
||||
public void close();
|
||||
|
||||
/**
|
||||
* delete everything in the solr index
|
||||
* @throws IOException
|
||||
*/
|
||||
public void clear() throws IOException;
|
||||
|
||||
/**
|
||||
* delete an entry from solr
|
||||
* @param id the url hash of the entry
|
||||
* @throws IOException
|
||||
*/
|
||||
public void delete(final String id) throws IOException;
|
||||
|
||||
/**
|
||||
* delete a set of entries from solr; entries are identified by their url hash
|
||||
* @param ids a list of url hashes
|
||||
* @throws IOException
|
||||
*/
|
||||
public void delete(final List<String> ids) throws IOException;
|
||||
|
||||
/**
|
||||
* add a YaCy document. This calls the scheme processor to add the document as solr document
|
||||
* @param id the url hash of the entry
|
||||
* @param header the http response header
|
||||
* @param doc the YaCy document
|
||||
* @throws IOException
|
||||
*/
|
||||
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException;
|
||||
|
||||
/**
|
||||
* register an entry as error document
|
||||
* @param digestURI
|
||||
* @param failReason
|
||||
* @param httpstatus
|
||||
* @throws IOException
|
||||
*/
|
||||
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException;
|
||||
|
||||
|
||||
/**
|
||||
* get a query result from solr
|
||||
* to get all results set the query String to "*:*"
|
||||
* @param querystring
|
||||
* @throws IOException
|
||||
*/
|
||||
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;
|
||||
|
||||
/**
|
||||
* get the size of the index
|
||||
* @return number of results if solr is queries with a catch-all pattern
|
||||
*/
|
||||
public long getSize();
|
||||
|
||||
}
|
|
@ -27,6 +27,8 @@ package net.yacy.cora.services.federated.solr;
|
|||
|
||||
import java.io.File;
|
||||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
|
@ -44,6 +46,7 @@ import net.yacy.document.parser.html.ContentScraper;
|
|||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
public class SolrScheme extends ConfigurationSet {
|
||||
|
@ -349,6 +352,46 @@ public class SolrScheme extends ConfigurationSet {
|
|||
return solrdoc;
|
||||
}
|
||||
|
||||
public String solrGetID(final SolrDocument solr) {
|
||||
return (String) solr.getFieldValue("id");
|
||||
}
|
||||
|
||||
public DigestURI solrGetURL(final SolrDocument solr) {
|
||||
try {
|
||||
return new DigestURI((String) solr.getFieldValue("sku"));
|
||||
} catch (final MalformedURLException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public String solrGetTitle(final SolrDocument solr) {
|
||||
return (String) solr.getFieldValue("title");
|
||||
}
|
||||
|
||||
public String solrGetText(final SolrDocument solr) {
|
||||
return (String) solr.getFieldValue("text_t");
|
||||
}
|
||||
|
||||
public String solrGetAuthor(final SolrDocument solr) {
|
||||
return (String) solr.getFieldValue("author");
|
||||
}
|
||||
|
||||
public String solrGetDescription(final SolrDocument solr) {
|
||||
return (String) solr.getFieldValue("description");
|
||||
}
|
||||
|
||||
public Date solrGetDate(final SolrDocument solr) {
|
||||
return (Date) solr.getFieldValue("last_modified");
|
||||
}
|
||||
|
||||
public Collection<String> solrGetKeywords(final SolrDocument solr) {
|
||||
final Collection<Object> c = solr.getFieldValues("keywords");
|
||||
final ArrayList<String> a = new ArrayList<String>();
|
||||
for (final Object s: c) {
|
||||
a.add((String) s);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
/*
|
||||
* standard solr scheme
|
||||
|
|
|
@ -57,7 +57,7 @@ import org.apache.solr.common.SolrException;
|
|||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
|
||||
public class SolrSingleConnector {
|
||||
public class SolrSingleConnector implements SolrConnector {
|
||||
|
||||
private final String solrurl, host, solrpath, solraccount, solrpw;
|
||||
private final int port;
|
||||
|
@ -178,6 +178,22 @@ public class SolrSingleConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SolrScheme getScheme() {
|
||||
return this.scheme;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSize() {
|
||||
try {
|
||||
final SolrDocumentList list = get("*:*", 0, 1);
|
||||
return list.getNumFound();
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* delete everything in the solr index
|
||||
* @throws IOException
|
||||
|
@ -325,6 +341,16 @@ public class SolrSingleConnector {
|
|||
//return result;
|
||||
}
|
||||
|
||||
|
||||
public String getAdminInterface() {
|
||||
final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
|
||||
final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
|
||||
String u = this.solrurl;
|
||||
int p = u.indexOf("localhost"); if (p < 0) p = u.indexOf("127.0.0.1");
|
||||
if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9);
|
||||
return u + (u.endsWith("/") ? "admin/" : "/admin/");
|
||||
}
|
||||
|
||||
public static void main(final String args[]) {
|
||||
SolrSingleConnector solr;
|
||||
try {
|
||||
|
@ -347,5 +373,4 @@ public class SolrSingleConnector {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -312,8 +312,12 @@ dc_rights
|
|||
}
|
||||
|
||||
public List<StringBuilder> getSentences(final boolean pre) {
|
||||
if (this.text == null) return null;
|
||||
final SentenceReader e = new SentenceReader(getText());
|
||||
return getSentences(pre, getText());
|
||||
}
|
||||
|
||||
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
|
||||
if (text == null) return null;
|
||||
final SentenceReader e = new SentenceReader(text);
|
||||
e.pre(pre);
|
||||
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
|
||||
while (e.hasNext()) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user