From 85a5487d6dc2f48757e66a68fcbae534824f4745 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 13 Sep 2011 14:39:41 +0000 Subject: [PATCH] YaCy can now use the solr index to compute text snippets. This makes search result preparation MUCH faster because no document fetching and parsing is necessary any more. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7943 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 5 +- htroot/IndexFederated_p.java | 20 ++-- source/de/anomic/crawler/CrawlQueues.java | 11 ++- source/de/anomic/crawler/ZURL.java | 5 +- source/de/anomic/search/ResultFetcher.java | 24 ++++- source/de/anomic/search/Segment.java | 12 +++ source/de/anomic/search/Segments.java | 64 ++++++------ source/de/anomic/search/Switchboard.java | 10 +- source/de/anomic/search/TextSnippet.java | 58 ++++++----- .../federated/solr/SolrChardingConnector.java | 11 +-- .../federated/solr/SolrConnector.java | 99 +++++++++++++++++++ .../services/federated/solr/SolrScheme.java | 43 ++++++++ .../federated/solr/SolrSingleConnector.java | 29 +++++- source/net/yacy/document/Document.java | 8 +- 14 files changed, 304 insertions(+), 95 deletions(-) create mode 100644 source/net/yacy/cora/services/federated/solr/SolrConnector.java diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index a8541602c..5339a6775 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -66,6 +66,7 @@ import de.anomic.search.RankingProcess; import de.anomic.search.ReferenceOrder; import de.anomic.search.SearchEventCache; import de.anomic.search.Segment; +import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; @@ -86,7 +87,7 @@ public class IndexControlRWIs_p { prop.put("keyhash", ""); prop.put("result", ""); prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 
1 : 0); - prop.put("cleanup_solr", sb.solrConnector == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); + prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); int i = 0; @@ -157,7 +158,7 @@ public class IndexControlRWIs_p { segment.clear(); } if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try { - sb.solrConnector.clear(); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear(); } catch (final Exception e) { Log.logException(e); } diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index dd798dcd0..9777cf341 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -33,9 +33,12 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingSelection; +import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.solr.SolrScheme; +import net.yacy.cora.services.federated.solr.SolrSingleConnector; import net.yacy.cora.storage.ConfigurationSet; import net.yacy.kelondro.logging.Log; +import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -75,8 +78,8 @@ public class IndexFederated_p { if (solrWasOn) { // switch off - sb.solrConnector.close(); - sb.solrConnector = null; + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close(); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); } final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); 
@@ -85,10 +88,10 @@ public class IndexFederated_p { // switch on final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null); } catch (final IOException e) { Log.logException(e); - sb.solrConnector = null; + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); } } @@ -110,12 +113,13 @@ public class IndexFederated_p { } // show solr host table - if (sb.solrConnector == null) { + if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) { prop.put("table", 0); } else { prop.put("table", 1); - final long[] size = sb.solrConnector.getSizeList(); - final String[] urls = sb.solrConnector.getAdminInterfaceList(); + final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr(); + final long[] size = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()}; + final String[] urls = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()}; boolean dark = false; for (int i = 0; i < size.length; i++) { prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark; @@ -126,7 +130,7 @@ public class IndexFederated_p { } // write scheme - SolrScheme scheme = (sb.solrConnector == null) ? null : sb.solrConnector.getScheme(); + SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? 
null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme(); final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); if (scheme == null) { scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 4d6e9099f..4cc879a69 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -62,6 +62,7 @@ public class CrawlQueues { private static final String ERROR_DB_FILENAME = "urlError3.db"; private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db"; + private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING; protected Switchboard sb; protected Log log; @@ -81,8 +82,8 @@ public class CrawlQueues { this.log.logConfig("Starting Crawling Management"); this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); - this.delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); + this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); + this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); } public void relocate(final File newQueuePath) { @@ -93,8 +94,8 @@ public class CrawlQueues { this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(this.sb.solrConnector, newQueuePath, 
ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); - this.delegatedURL = new ZURL(this.sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); + this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); + this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); } public void close() { @@ -249,7 +250,7 @@ public class CrawlQueues { return true; } try { - this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null)); + this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null)); Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false)); } catch (final InterruptedException e) { Log.logException(e); diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 0b14e5e44..7ebc1b223 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentLinkedQueue; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.solr.SolrChardingConnector; +import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; @@ -76,10 +77,10 @@ public class ZURL implements Iterable { // the class object private Index urlIndex; private final ConcurrentLinkedQueue stack; - private final SolrChardingConnector solrConnector; + private final SolrConnector solrConnector; public ZURL( - final SolrChardingConnector solrConnector, + final 
SolrConnector solrConnector, final File cachePath, final String tablename, final boolean startWithEmptyFile, diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 04767a074..6b2cbd6cc 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -31,11 +31,13 @@ import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.ranking.ScoreMap; import net.yacy.cora.ranking.WeakPriorityBlockingQueue; import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement; +import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -46,6 +48,10 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.MemoryControl; import net.yacy.repository.LoaderDispatcher; + +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + import de.anomic.data.WorkTables; import de.anomic.http.client.Cache; import de.anomic.yacy.yacySeedDB; @@ -322,6 +328,7 @@ public class ResultFetcher { private final int neededResults; private final Pattern snippetPattern; private boolean shallrun; + private final SolrConnector solr; public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) { this.id = id; @@ -331,6 +338,7 @@ public class ResultFetcher { this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.neededResults = neededResults; this.shallrun = true; + this.solr = ResultFetcher.this.rankingProcess.getQuery().getSegment().getSolr(); } @Override @@ -373,8 +381,18 @@ public class 
ResultFetcher { } if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue; + // in case that we have an attached solr, we load also the solr document + String solrContent = null; + if (this.solr != null) { + SolrDocument sd = null; + final SolrDocumentList sdl = this.solr.get("id:" + ASCII.String(page.hash()), 0, 1); + if (sdl.size() > 0) sd = sdl.get(0); + if (sd != null) solrContent = this.solr.getScheme().solrGetText(sd); + } + + loops++; - resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 + resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 if (resultEntry == null) continue; // the entry had some problems, cannot be used rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw(); //System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'"); @@ -412,7 +430,7 @@ public class ResultFetcher { } } - protected ResultEntry fetchSnippet(final URIMetadataRow page, final CacheStrategy cacheStrategy) { + protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) { // Snippet Fetching can has 3 modes: // 0 - do not fetch snippets // 1 - fetch snippets offline only @@ -429,6 +447,7 @@ public class ResultFetcher { if (cacheStrategy == null) { final TextSnippet snippet = new TextSnippet( null, + solrText, metadata, this.snippetFetchWordHashes, null, @@ -445,6 +464,7 @@ public class ResultFetcher { startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet( this.loader, + solrText, metadata, this.snippetFetchWordHashes, cacheStrategy, diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 863264750..fbe5a962c 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java 
@@ -37,6 +37,7 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -81,6 +82,7 @@ public class Segment { protected final IndexCell termIndex; //private final IndexCell authorNavIndex; protected final MetadataRepository urlMetadata; + private SolrConnector solr; private final File segmentPath; public Segment( @@ -98,6 +100,7 @@ public class Segment { this.log = log; this.segmentPath = segmentPath; + this.solr = null; this.termIndex = new IndexCell( segmentPath, @@ -126,6 +129,14 @@ public class Segment { this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); } + public void connectSolr(final SolrConnector solr) { + this.solr = solr; + } + + public SolrConnector getSolr() { + return this.solr; + } + public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) { final File oldCellPath = new File(oldSegmentPath, "RICELL"); if (!oldCellPath.exists()) return; @@ -254,6 +265,7 @@ public class Segment { public void close() { this.termIndex.close(); this.urlMetadata.close(); + if (this.solr != null) this.solr.close(); } public URIMetadataRow storeDocument( diff --git a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java index f80b25ae1..dc4f5e896 100644 --- a/source/de/anomic/search/Segments.java +++ b/source/de/anomic/search/Segments.java @@ -38,13 +38,13 @@ import net.yacy.kelondro.rwi.IndexCell; public class Segments implements Iterable { - + /** * process enumeration type * defines constants that can be used to assign process-related segment names */ public enum Process { - + RECEIPTS, QUERIES, DHTIN, @@ -59,7 +59,7 @@ public class Segments implements Iterable { throw new 
UnsupportedOperationException("toString not allowed"); } } - + private final Log log; private final File segmentsPath; private final int entityCacheMaxSize; @@ -68,7 +68,7 @@ public class Segments implements Iterable { private final HashMap process_assignment; private final boolean useTailCache; private final boolean exceed134217727; - + public Segments( final Log log, final File segmentsPath, @@ -96,41 +96,41 @@ public class Segments implements Iterable { this.process_assignment.put(Process.PUBLIC, "default"); this.process_assignment.put(Process.SURROGATES, "default"); } - - public void setSegment(Process process, String segmentName) { + + public void setSegment(final Process process, final String segmentName) { this.process_assignment.put(process, segmentName); } - - public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) { + + public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) { if (!oldSingleSegment.exists()) return; - File newSegmentPath = new File(newSegmentsPath, newSegmentName); + final File newSegmentPath = new File(newSegmentsPath, newSegmentName); if (!newSegmentPath.exists()) newSegmentPath.mkdirs(); Segment.migrateTextIndex(oldSingleSegment, newSegmentPath); Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath); - - String[] oldFiles = oldSingleSegment.list(); - for (String oldFile: oldFiles) { + + final String[] oldFiles = oldSingleSegment.list(); + for (final String oldFile: oldFiles) { if (oldFile.startsWith("text.")) { new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile)); } } } - + public String[] segmentNames() { return this.segments.keySet().toArray(new String[this.segments.size()]); } - + public boolean segmentExist(final String segmentName) { - return segments.containsKey(segmentName); + return this.segments.containsKey(segmentName); } - + public Segment segment(final Process process) { return 
segment(this.process_assignment.get(process)); } - + public Segment segment(final String segmentName) { - if (segments == null) return null; - Segment segment = segments.get(segmentName); + if (this.segments == null) return null; + Segment segment = this.segments.get(segmentName); if (segment == null) { // generate the segment try { @@ -141,7 +141,7 @@ public class Segments implements Iterable { this.maxFileSize, this.useTailCache, this.exceed134217727); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); return null; } @@ -149,28 +149,28 @@ public class Segments implements Iterable { } return segment; } - + public long URLCount() { if (this.segments == null) return 0; long c = 0; - for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size(); + for (final Segment s: this.segments.values()) c += s.urlMetadata().size(); return c; } - + public long RWICount() { if (this.segments == null) return 0; long c = 0; - for (Segment s: this.segments.values()) c += (long) s.termIndex().sizesMax(); + for (final Segment s: this.segments.values()) c += s.termIndex().sizesMax(); return c; } - + public int RWIBufferCount() { if (this.segments == null) return 0; int c = 0; - for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize(); + for (final Segment s: this.segments.values()) c += s.termIndex().getBufferSize(); return c; } - + public MetadataRepository urlMetadata(final Process process) { return segment(this.process_assignment.get(process)).urlMetadata(); } @@ -178,11 +178,11 @@ public class Segments implements Iterable { public IndexCell termIndex(final Process process) { return segment(this.process_assignment.get(process)).termIndex(); } - + public void clear(final Process process) { segment(this.process_assignment.get(process)).clear(); } - + public File getLocation(final Process process) { return segment(this.process_assignment.get(process)).getLocation(); } @@ -190,16 +190,16 @@ public class Segments implements 
Iterable { public void close(final Process process) { segment(this.process_assignment.get(process)).close(); } - + public void close() { - if (segments != null) for (Segment s: this.segments.values()) s.close(); + if (this.segments != null) for (final Segment s: this.segments.values()) s.close(); this.segments = null; } public void finalize() { this.close(); } - + public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) { return segment(segmentName).getReferenceCleaner(startHash); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 2a0d818e9..d1a6cfaf4 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -247,7 +247,6 @@ public final class Switchboard extends serverSwitch { private final Semaphore shutdownSync = new Semaphore(0); private boolean terminate = false; - public SolrChardingConnector solrConnector = null; //private Object crawlingPausedSync = new Object(); //private boolean crawlingIsPaused = false; @@ -592,10 +591,10 @@ public final class Switchboard extends serverSwitch { final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; + this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? 
new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null); } catch (final IOException e) { Log.logException(e); - this.solrConnector = null; + this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); } // start a loader @@ -1314,7 +1313,6 @@ public final class Switchboard extends serverSwitch { Cache.close(); this.tables.close(); Domains.close(); - if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)) this.solrConnector.close(); AccessTracker.dumpLog(new File("DATA/LOG/queries.log")); UPnP.deletePortMapping(); Tray.removeTray(); @@ -1989,7 +1987,7 @@ public final class Switchboard extends serverSwitch { public indexingQueueEntry condenseDocument(final indexingQueueEntry in) { in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING); - if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) { + if (this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) { // send the documents to solr for (final Document doc: in.documents) { try { @@ -2000,7 +1998,7 @@ public final class Switchboard extends serverSwitch { // in case that this happens it appears that the doc id is the right one } try { - this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc); + this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(id, in.queueEntry.getResponseHeader(), doc); } catch (final IOException e) { Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage()); } diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 77f7a9e77..5ed50cbef 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ 
-24,6 +24,7 @@ package de.anomic.search; +import java.io.ByteArrayInputStream; import java.util.Collection; import java.util.Comparator; import java.util.Iterator; @@ -34,6 +35,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ConcurrentARC; @@ -140,6 +142,7 @@ public class TextSnippet implements Comparable, Comparator, Comparator, Comparator sentences; - { //encapsulate potential expensive document - final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source); - if (document == null) { - return; - } - - /* =========================================================================== - * COMPUTE SNIPPET - * =========================================================================== */ - // we have found a parseable non-empty file: use the lines - - // compute snippet from text - sentences = document.getSentences(pre); - document.close(); - } //encapsulate potential expensive document END - - if (sentences == null) { - init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences"); - return; - } - + Collection sentences = null; + + // try the solr text first + if (solrText != null) { + // compute sentences from solr query + sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText))); + } + + // if then no sentences are found, we fail-over to get the content from the re-loaded document + if (sentences == null) { + final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source); + if (document == null) { + return; + } + + // compute sentences from parsed document + sentences = document.getSentences(pre); + document.close(); + + if (sentences == null) { + init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, 
"parser returned no sentences"); + return; + } + } + try { final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength); textline = tsr.getSnippet(); @@ -227,7 +235,7 @@ public class TextSnippet implements Comparable, Comparator connectors; private final SolrScheme scheme; @@ -164,13 +163,7 @@ public class SolrChardingConnector { final long[] size = new long[this.connectors.size()]; int i = 0; for (final SolrSingleConnector connector: this.connectors) { - try { - final SolrDocumentList list = connector.get("*:*", 0, 1); - size[i++] = list.getNumFound(); - } catch (final Exception e) { - Log.logException(e); - size[i++] = 0; - } + size[i++] = connector.getSize(); } return size; } diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java new file mode 100644 index 000000000..c20693ba2 --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -0,0 +1,99 @@ +/** + * SolrConnector + * Copyright 2011 by Michael Peter Christen + * First released 13.09.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.services.federated.solr; + +import java.io.IOException; +import java.util.List; + +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.document.Document; +import net.yacy.kelondro.data.meta.DigestURI; + +import org.apache.solr.common.SolrDocumentList; + +public interface SolrConnector { + + /** + * with a scheme the fields of a SolrDocument can be translated to actual data values + * @return the solr scheme that can translate the SolrDocument + */ + public SolrScheme getScheme(); + + public void close(); + + /** + * delete everything in the solr index + * @throws IOException + */ + public void clear() throws IOException; + + /** + * delete an entry from solr + * @param id the url hash of the entry + * @throws IOException + */ + public void delete(final String id) throws IOException; + + /** + * delete a set of entries from solr; entries are identified by their url hash + * @param ids a list of url hashes + * @throws IOException + */ + public void delete(final List ids) throws IOException; + + /** + * add a YaCy document. 
This calls the scheme processor to add the document as solr document + * @param id the url hash of the entry + * @param header the http response header + * @param doc the YaCy document + * @throws IOException + */ + public void add(final String id, final ResponseHeader header, final Document doc) throws IOException; + + /** + * register an entry as error document + * @param digestURI + * @param failReason + * @param httpstatus + * @throws IOException + */ + public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException; + + + /** + * get a query result from solr + * to get all results set the query String to "*:*" + * @param querystring + * @throws IOException + */ + public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException; + + /** + * get the size of the index + * @return number of results if solr is queried with a catch-all pattern + */ + public long getSize(); + +} diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index ab8909d0b..b191f24ec 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -27,6 +27,8 @@ package net.yacy.cora.services.federated.solr; import java.io.File; import java.net.InetAddress; +import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.Map; @@ -44,6 +46,7 @@ import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; public class SolrScheme extends ConfigurationSet { @@ -349,6 +352,46 @@ public class SolrScheme extends ConfigurationSet { return solrdoc; } + public String solrGetID(final SolrDocument solr) { + 
return (String) solr.getFieldValue("id"); + } + + public DigestURI solrGetURL(final SolrDocument solr) { + try { + return new DigestURI((String) solr.getFieldValue("sku")); + } catch (final MalformedURLException e) { + return null; + } + } + + public String solrGetTitle(final SolrDocument solr) { + return (String) solr.getFieldValue("title"); + } + + public String solrGetText(final SolrDocument solr) { + return (String) solr.getFieldValue("text_t"); + } + + public String solrGetAuthor(final SolrDocument solr) { + return (String) solr.getFieldValue("author"); + } + + public String solrGetDescription(final SolrDocument solr) { + return (String) solr.getFieldValue("description"); + } + + public Date solrGetDate(final SolrDocument solr) { + return (Date) solr.getFieldValue("last_modified"); + } + + public Collection solrGetKeywords(final SolrDocument solr) { + final Collection c = solr.getFieldValues("keywords"); + final ArrayList a = new ArrayList(); + for (final Object s: c) { + a.add((String) s); + } + return a; + } /* * standard solr scheme diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index e75ee0fd5..798c09322 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -57,7 +57,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; -public class SolrSingleConnector { +public class SolrSingleConnector implements SolrConnector { private final String solrurl, host, solrpath, solraccount, solrpw; private final int port; @@ -178,6 +178,22 @@ public class SolrSingleConnector { } } + @Override + public SolrScheme getScheme() { + return this.scheme; + } + + @Override + public long getSize() { + try { + final SolrDocumentList list = get("*:*", 0, 1); + return list.getNumFound(); + } catch (final Exception e) { + 
Log.logException(e); + return 0; + } + } + /** * delete everything in the solr index * @throws IOException @@ -325,6 +341,16 @@ public class SolrSingleConnector { //return result; } + + public String getAdminInterface() { + final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); + final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress(); + String u = this.solrurl; + int p = u.indexOf("localhost"); if (p < 0) p = u.indexOf("127.0.0.1"); + if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); + return u + (u.endsWith("/") ? "admin/" : "/admin/"); + } + public static void main(final String args[]) { SolrSingleConnector solr; try { @@ -347,5 +373,4 @@ public class SolrSingleConnector { e.printStackTrace(); } } - } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 666d64358..56c93bb56 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -312,8 +312,12 @@ dc_rights } public List getSentences(final boolean pre) { - if (this.text == null) return null; - final SentenceReader e = new SentenceReader(getText()); + return getSentences(pre, getText()); + } + + public static List getSentences(final boolean pre, final InputStream text) { + if (text == null) return null; + final SentenceReader e = new SentenceReader(text); e.pre(pre); final List sentences = new ArrayList(); while (e.hasNext()) {