Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00
Moved the Solr index-add method into the same method where the YaCy index is
written. Also did some code cleanup.
parent 315d83cfa0
commit 6f1ddb2519
@@ -137,7 +137,7 @@ public class IndexFederated_p {
         }
 
         // read index scheme table flags
-        final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.entryIterator();
+        final Iterator<ConfigurationSet.Entry> i = sb.index.getSolrScheme().entryIterator();
         ConfigurationSet.Entry entry;
         boolean modified = false; // flag to remember changes
         while (i.hasNext()) {
@@ -160,7 +160,7 @@ public class IndexFederated_p {
         }
         if (modified) { // save settings to config file if modified
             try {
-                sb.solrScheme.commit();
+                sb.index.getSolrScheme().commit();
                 modified = false;
             } catch (IOException ex) {}
         }
@@ -191,7 +191,7 @@ public class IndexFederated_p {
         // use enum SolrField to keep defined order
         for(SolrField field : SolrField.values()) {
             prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark;
-            prop.put("scheme_" + c + "_checked", sb.solrScheme.contains(field.name()) ? 1 : 0);
+            prop.put("scheme_" + c + "_checked", sb.index.getSolrScheme().contains(field.name()) ? 1 : 0);
             prop.putHTML("scheme_" + c + "_key", field.name());
             prop.putHTML("scheme_" + c + "_solrfieldname",field.name().equalsIgnoreCase(field.getSolrFieldName()) ? "" : field.getSolrFieldName());
             if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment());
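
A minimal sketch of the access-path change above, assuming a running Switchboard instance: the scheme is no longer a Switchboard field but is owned by the index Segment.

    import net.yacy.search.Switchboard;
    import net.yacy.search.index.SolrConfiguration;

    // sketch only: fetch the working Solr scheme through the Segment
    public class SchemeAccessSketch {
        static SolrConfiguration scheme(final Switchboard sb) {
            return sb.index.getSolrScheme(); // replaces the removed sb.solrScheme field
        }
    }
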
@@ -24,6 +24,7 @@
 
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.search.Switchboard;
+import net.yacy.search.index.SolrConfiguration;
 import net.yacy.search.index.SolrField;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -37,8 +38,9 @@ public class schema_p {
 
         // write scheme
         int c = 0;
+        SolrConfiguration solrScheme = sb.index.getSolrScheme();
         for (SolrField field : SolrField.values()) {
-            if (sb.solrScheme.contains(field.name())) {
+            if (solrScheme.contains(field.name())) {
                 prop.put("fields_" + c + "_solrname", field.getSolrFieldName());
                 prop.put("fields_" + c + "_type", field.getType().printName());
                 prop.put("fields_" + c + "_comment", field.getComment());
@@ -81,8 +81,8 @@ public class CrawlQueues {
         this.log.logConfig("Starting Crawling Management");
         this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
-        this.errorURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        this.delegatedURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        this.errorURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        this.delegatedURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void relocate(final File newQueuePath) {
@@ -93,8 +93,8 @@ public class CrawlQueues {
 
         this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
         FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
-        this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
-        this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
+        this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
+        this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
     }
 
     public synchronized void close() {
@@ -159,4 +159,12 @@ public class ResponseHeader extends HeaderFramework {
         }
         return Charset.forName(charSetName);
     }
+
+    public String getXRobotsTag() {
+        String x_robots_tag = this.get(HeaderFramework.X_ROBOTS_TAG, "");
+        if (x_robots_tag.isEmpty()) {
+            x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
+        }
+        return x_robots_tag;
+    }
 }
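
A small usage sketch for the new accessor, assuming 'header' comes from a fetched HTTP response; the method hides the two header variants (X-Robots-Tag and the older X-Robots).

    import net.yacy.cora.protocol.ResponseHeader;

    // sketch only: read robots directives from either header variant
    public class XRobotsSketch {
        static boolean noindex(final ResponseHeader header) {
            return header.getXRobotsTag().indexOf("noindex", 0) >= 0;
        }
    }
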
@@ -786,7 +786,8 @@ public final class Protocol
             // store remote result to local result container
             // insert one container into the search result buffer
             // one is enough, only the references are used, not the word
-            containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, true, time);
+            containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, time);
+            containerCache.addFinalize();
             containerCache.addExpectedRemoteReferences(-count);
 
             // insert the containers to the index
source/net/yacy/search/IndexingQueueEntry.java (new file, 41 lines)
@@ -0,0 +1,41 @@
+/**
+ *  IndexingQueueEntry
+ *  Copyright 2012 by Michael Peter Christen
+ *  First released 24.07.2012 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.search;
+
+import net.yacy.document.Condenser;
+import net.yacy.document.Document;
+import net.yacy.kelondro.workflow.WorkflowJob;
+import de.anomic.crawler.retrieval.Response;
+
+public class IndexingQueueEntry extends WorkflowJob {
+
+    public Response queueEntry;
+    public Document[] documents;
+    public Condenser[] condenser;
+
+    public IndexingQueueEntry(final Response queueEntry, final Document[] documents, final Condenser[] condenser) {
+        super();
+        this.queueEntry = queueEntry;
+        this.documents = documents;
+        this.condenser = condenser;
+    }
+}
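
A hedged usage sketch: an entry typically enters the workflow holding only the crawler Response; the documents and condensers are filled in by the later parsing and condensing stages.

    import de.anomic.crawler.retrieval.Response;
    import net.yacy.search.IndexingQueueEntry;

    // sketch only: a fresh entry before parsing/condensing has run
    public class QueueEntrySketch {
        static IndexingQueueEntry fresh(final Response response) {
            return new IndexingQueueEntry(response, null, null);
        }
    }
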
source/net/yacy/search/Shutdown.java (new file, 47 lines)
@@ -0,0 +1,47 @@
+/**
+ *  Shutdown
+ *  Copyright 2012 by Michael Peter Christen
+ *  First released 24.07.2012 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.search;
+
+import net.yacy.kelondro.logging.Log;
+
+public class Shutdown extends Thread {
+    private final Switchboard sb;
+    private final long delay;
+    private final String reason;
+
+    public Shutdown(final Switchboard sb, final long delay, final String reason) {
+        this.sb = sb;
+        this.delay = delay;
+        this.reason = reason;
+    }
+
+    @Override
+    public void run() {
+        try {
+            Thread.sleep(this.delay);
+        } catch ( final InterruptedException e ) {
+            this.sb.getLog().logInfo("interrupted delayed shutdown");
+        } catch ( final Exception e ) {
+            Log.logException(e);
+        }
+        this.sb.terminate(this.reason);
+    }
+}
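
A usage sketch, assuming 'sb' is the running Switchboard: because Shutdown is a Thread, the caller returns immediately and termination happens after the delay.

    import net.yacy.search.Shutdown;
    import net.yacy.search.Switchboard;

    // sketch only: delayed, non-blocking shutdown
    public class ShutdownSketch {
        static void shutdownIn(final Switchboard sb, final long millis, final String reason) {
            new Shutdown(sb, millis, reason).start();
        }
    }
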
@@ -97,7 +97,6 @@ import net.yacy.cora.protocol.http.ProxySettings;
 import net.yacy.cora.services.federated.solr.ShardSelection;
 import net.yacy.cora.services.federated.solr.ShardSolrConnector;
 import net.yacy.cora.services.federated.solr.SolrConnector;
-import net.yacy.cora.services.federated.solr.SolrDoc;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -251,7 +250,6 @@ public final class Switchboard extends serverSwitch
     public SeedDB peers;
     public WorkTables tables;
     public Tray tray;
-    public SolrConfiguration solrScheme;
 
     public WorkflowProcessor<IndexingQueueEntry> indexingDocumentProcessor;
     public WorkflowProcessor<IndexingQueueEntry> indexingCondensementProcessor;
@@ -376,16 +374,6 @@ public final class Switchboard extends serverSwitch
         this.networkRoot.mkdirs();
         this.queuesRoot.mkdirs();
 
-        // initialize index
-        ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
-        final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
-        this.index = new Segment(this.log, new File(segmentsPath, "default"));
-        final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
-        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
-        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
-        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
-        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs);
-
         // prepare a solr index profile switch list
         final File solrBackupProfile = new File("defaults/solr.keys.list");
         final String schemename = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SCHEMEFILE, "solr.keys.default.list");
@@ -395,11 +383,21 @@ public final class Switchboard extends serverSwitch
         }
         final boolean solrlazy = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_LAZY, true);
         final SolrConfiguration backupScheme = new SolrConfiguration(solrBackupProfile, solrlazy);
-        this.solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy);
-
+        final SolrConfiguration solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy);
         // update the working scheme with the backup scheme. This is necessary to include new features.
         // new features are always activated by default (if activated in input-backupScheme)
-        this.solrScheme.fill(backupScheme, true);
+        solrScheme.fill(backupScheme, true);
+
+        // initialize index
+        ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
+        final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
+        this.index = new Segment(this.log, new File(segmentsPath, "default"), solrScheme);
+        final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
+        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
+        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
+        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
+        if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs);
+
 
         // set up the solr interface
         final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
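
A sketch of the dependency that forced this reordering (file locations here are placeholders): the working scheme must exist before the Segment is constructed, because the Segment now keeps it as a final field.

    import java.io.File;
    import net.yacy.kelondro.logging.Log;
    import net.yacy.search.index.Segment;
    import net.yacy.search.index.SolrConfiguration;

    // sketch only: build the scheme first, then the Segment that owns it
    public class InitOrderSketch {
        static Segment init(final Log log, final File segmentDir, final File workProfile, final File backupProfile) {
            final SolrConfiguration backup = new SolrConfiguration(backupProfile, true);
            final SolrConfiguration scheme = new SolrConfiguration(workProfile, true);
            scheme.fill(backup, true); // pull newly introduced fields from the shipped defaults
            return new Segment(log, segmentDir, scheme);
        }
    }
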
@@ -1133,6 +1131,9 @@ public final class Switchboard extends serverSwitch
         // switch the networks
         synchronized ( this ) {
 
+            // remember the solr scheme
+            SolrConfiguration solrScheme = this.index.getSolrScheme();
+
             // shut down
             this.crawler.close();
             if ( this.dhtDispatcher != null ) {
@@ -1179,7 +1180,7 @@ public final class Switchboard extends serverSwitch
                 partitionExponent,
                 this.useTailCache,
                 this.exceed134217727);
-            this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"));
+            this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"), solrScheme);
             final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
             if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
             if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
@@ -2395,55 +2396,8 @@ public final class Switchboard extends serverSwitch
             return new IndexingQueueEntry(in.queueEntry, in.documents, null);
         }
 
-        boolean localSolr = this.index.connectedLocalSolr();
-        boolean remoteSolr = this.index.connectedRemoteSolr();
-        if (localSolr || remoteSolr) {
-            // send the documents to solr
-            for ( final Document doc : in.documents ) {
-                try {
-                    final String id = UTF8.String(new DigestURI(doc.dc_identifier()).hash());
-                    final String iquh = UTF8.String(in.queueEntry.url().hash());
-                    if ( !id.equals(iquh) ) {
-                        this.log.logWarning("condenseDocument consistency check doc="
-                            + id
-                            + ":"
-                            + doc.dc_identifier()
-                            + ", query="
-                            + iquh
-                            + ":"
-                            + in.queueEntry.url());
-                        // in case that this happens it appears that the doc id is the right one
-                    }
-                    try {
-                        SolrDoc solrDoc = this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc);
-                        this.index.getSolr().add(solrDoc);
-                    } catch ( final IOException e ) {
-                        Log.logWarning(
-                            "SOLR",
-                            "failed to send "
-                                + in.queueEntry.url().toNormalform(true, false)
-                                + " to solr: "
-                                + e.getMessage());
-                    }
-                } catch ( final MalformedURLException e ) {
-                    Log.logException(e);
-                    continue;
-                }
-            }
-        }
-
-        // check if we should accept the document for our index
-        if (!this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) {
-            if ( this.log.isInfo() ) {
-                this.log.logInfo("Not Condensed Resource '"
-                    + in.queueEntry.url().toNormalform(false, true)
-                    + "': indexing not wanted by federated rule for YaCy");
-            }
-            return new IndexingQueueEntry(in.queueEntry, in.documents, null);
-        }
+        final List<Document> doclist = new ArrayList<Document>();
 
         // check which files may take part in the indexing process
-        final List<Document> doclist = new ArrayList<Document>();
         for ( final Document document : in.documents ) {
             if ( document.indexingDenied() ) {
                 if ( this.log.isInfo() ) {
@@ -2569,6 +2523,7 @@ public final class Switchboard extends serverSwitch
             queueEntry.lastModified(),
             new Date(),
             queueEntry.size(),
+            queueEntry.getResponseHeader(),
             document,
             condenser,
             searchEvent,
@@ -73,9 +73,9 @@ public class DocumentIndex extends Segment
 
     static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
 
-    public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
+    public DocumentIndex(final File segmentPath, final File schemePath, final CallbackListener callback, final int cachesize)
             throws IOException {
-        super(new Log("DocumentIndex"), segmentPath);
+        super(new Log("DocumentIndex"), segmentPath, schemePath == null ? null : new SolrConfiguration(schemePath, true));
         super.connectRWI(cachesize, targetFileSize * 4 - 1);
         super.connectCitation(cachesize, targetFileSize * 4 - 1);
         super.connectUrlDb(
@@ -174,6 +174,7 @@ public class DocumentIndex extends Segment
             new Date(url.lastModified()),
             new Date(),
             url.length(),
+            null,
             document,
             condenser,
             null,
@@ -306,7 +307,7 @@ public class DocumentIndex extends Segment
         try {
             if ( args[1].equals("add") ) {
                 final DigestURI f = new DigestURI(args[2]);
-                final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
+                final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000);
                 di.addConcurrent(f);
                 di.close();
             } else {
@@ -315,7 +316,7 @@ public class DocumentIndex extends Segment
                     query += args[i];
                 }
                 query.trim();
-                final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
+                final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000);
                 final ArrayList<DigestURI> results = di.find(query, 100);
                 for ( final DigestURI f : results ) {
                     if ( f != null ) {
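
A sketch of the widened constructor, assuming CallbackListener is the callback type used by the CLI code above: the new second argument is the optional scheme file, and null means "run without a Solr scheme".

    import java.io.File;
    import java.io.IOException;
    import net.yacy.search.index.DocumentIndex;

    // sketch only: open a document index without a Solr scheme
    public class DocumentIndexSketch {
        static DocumentIndex open(final File segmentPath, final DocumentIndex.CallbackListener callback) throws IOException {
            return new DocumentIndex(segmentPath, null /* schemePath */, callback, 100000);
        }
    }
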
@@ -39,7 +39,9 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.order.ByteOrder;
+import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.services.federated.solr.SolrConnector;
+import net.yacy.cora.services.federated.solr.SolrDoc;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -100,15 +102,16 @@ public class Segment {
 
     private final Log log;
     private final File segmentPath;
+    private final SolrConfiguration solrScheme;
     protected final MetadataRepository urlMetadata;
     protected IndexCell<WordReference> termIndex;
     protected IndexCell<CitationReference> urlCitationIndex;
 
-    public Segment(final Log log, final File segmentPath) {
-
+    public Segment(final Log log, final File segmentPath, final SolrConfiguration solrScheme) {
         log.logInfo("Initializing Segment '" + segmentPath + ".");
         this.log = log;
         this.segmentPath = segmentPath;
+        this.solrScheme = solrScheme;
 
         // create LURL-db
         this.urlMetadata = new MetadataRepository(segmentPath);
@@ -197,10 +200,15 @@ public class Segment {
     public void disconnectLocalSolr() {
         this.urlMetadata.disconnectLocalSolr();
     }
 
     public SolrConnector getSolr() {
         return this.urlMetadata.getSolr();
     }
 
+    public SolrConfiguration getSolrScheme() {
+        return this.solrScheme;
+    }
+
     public SolrConnector getRemoteSolr() {
         return this.urlMetadata.getRemoteSolr();
     }
@@ -318,94 +326,6 @@ public class Segment {
         return this.segmentPath;
     }
 
-    /**
-     * this is called by the switchboard to put in a new page into the index
-     * use all the words in one condenser object to simultanous create index entries
-     *
-     * @param url
-     * @param urlModified
-     * @param document
-     * @param condenser
-     * @param language
-     * @param doctype
-     * @param outlinksSame
-     * @param outlinksOther
-     * @return
-     */
-    private int addPageIndex(
-            final DigestURI url,
-            final Date urlModified,
-            final Document document,
-            final Condenser condenser,
-            final String language,
-            final char doctype,
-            final int outlinksSame,
-            final int outlinksOther,
-            final SearchEvent searchEvent,
-            final String sourceName) {
-        final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
-        int wordCount = 0;
-        final int urlLength = url.toNormalform(true, true).length();
-        final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
-
-        // iterate over all words of content text
-        final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
-        Map.Entry<String, Word> wentry;
-        String word;
-        final int len = (document == null) ? urlLength : document.dc_title().length();
-        final WordReferenceRow ientry = new WordReferenceRow(url.hash(),
-                urlLength, urlComps, len,
-                condenser.RESULT_NUMB_WORDS,
-                condenser.RESULT_NUMB_SENTENCES,
-                urlModified.getTime(),
-                System.currentTimeMillis(),
-                UTF8.getBytes(language),
-                doctype,
-                outlinksSame, outlinksOther);
-        Word wprop = null;
-        byte[] wordhash;
-        while (i.hasNext()) {
-            wentry = i.next();
-            word = wentry.getKey();
-            wprop = wentry.getValue();
-            assert (wprop.flags != null);
-            ientry.setWord(wprop);
-            wordhash = Word.word2hash(word);
-            if (this.termIndex != null) try {
-                this.termIndex.add(wordhash, ientry);
-            } catch (final Exception e) {
-                Log.logException(e);
-            }
-            wordCount++;
-
-            // during a search event it is possible that a heuristic is used which aquires index
-            // data during search-time. To transfer indexed data directly to the search process
-            // the following lines push the index data additionally to the search process
-            // this is done only for searched words
-            if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) {
-                // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
-                ReferenceContainer<WordReference> container;
-                try {
-                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
-                    container.add(ientry);
-                    rankingProcess.add(container, true, sourceName, -1, !i.hasNext(), 5000);
-                } catch (final RowSpaceExceededException e) {
-                    continue;
-                }
-            }
-        }
-
-        // assign the catchall word
-        ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
-        if (this.termIndex != null) try {
-            this.termIndex.add(catchallHash, ientry);
-        } catch (final Exception e) {
-            Log.logException(e);
-        }
-
-        return wordCount;
-    }
-
     private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<MultiProtocolURI, Properties> anchors) {
         if (anchors == null) return 0;
         int refCount = 0;
@@ -433,25 +353,12 @@ public class Segment {
         if (this.urlCitationIndex != null) this.urlCitationIndex.close();
     }
 
-    public URIMetadataRow storeDocument(
-            final DigestURI url,
-            final DigestURI referrerURL,
-            Date modDate,
-            final Date loadDate,
-            final long sourcesize,
-            final Document document,
-            final Condenser condenser,
-            final SearchEvent searchEvent,
-            final String sourceName
-            ) throws IOException {
-        final long startTime = System.currentTimeMillis();
-
-        // CREATE INDEX
-
-        // load some document metadata
-        final String dc_title = document.dc_title();
-
-        // do a identification of the language
+    private String votedLanguage(
+            final DigestURI url,
+            final String urlNormalform,
+            final Document document,
+            final Condenser condenser) {
+        // do a identification of the language
         String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes
         final String bymetadata = document.dc_language(); // the languageByMetadata may return null if there was no declaration
         if (language == null) {
@@ -466,7 +373,7 @@ public class Segment {
         else {
             final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
             // see if we have a hint in the url that the statistic was right
-            final String u = url.toNormalform(true, false).toLowerCase();
+            final String u = urlNormalform.toLowerCase();
             if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) {
                 // no confirmation using the url, use the TLD
                 language = url.language();
@@ -491,9 +398,46 @@ public class Segment {
             }
         }
+        return language;
     }
 
-        // create a new loaded URL db entry
-        if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
+    public URIMetadataRow storeDocument(
+            final DigestURI url,
+            final DigestURI referrerURL,
+            Date modDate,
+            final Date loadDate,
+            final long sourcesize,
+            final ResponseHeader responseHeader,
+            final Document document,
+            final Condenser condenser,
+            final SearchEvent searchEvent,
+            final String sourceName
+            ) throws IOException {
+        final long startTime = System.currentTimeMillis();
+
+        // CREATE INDEX
+
+        // load some document metadata
+        final String id = ASCII.String(url.hash());
+        final String dc_title = document.dc_title();
+        final String urlNormalform = url.toNormalform(true, false);
+        final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
+
+        // STORE TO SOLR
+        boolean localSolr = this.connectedLocalSolr();
+        boolean remoteSolr = this.connectedRemoteSolr();
+        if (localSolr || remoteSolr) {
+            try {
+                SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document);
+                this.getSolr().add(solrDoc);
+            } catch ( final IOException e ) {
+                Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
+            }
+        }
+
+        // STORE URL TO LOADED-URL-DB
+        if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
+        char docType = Response.docType(document.dc_format());
         final URIMetadataRow newEntry = new URIMetadataRow(
             url, // URL
             dc_title, // document description
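
A hedged call sketch for the consolidated entry point (variable names are illustrative): a null ResponseHeader is now legal, e.g. for local file imports, and the Solr document is then built with fallback values.

    import java.io.IOException;
    import java.util.Date;
    import net.yacy.document.Condenser;
    import net.yacy.document.Document;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.kelondro.data.meta.URIMetadataRow;
    import net.yacy.search.index.Segment;

    // sketch only: index a document that has no HTTP response header
    public class StoreSketch {
        static URIMetadataRow store(final Segment segment, final DigestURI url,
                final Document document, final Condenser condenser) throws IOException {
            return segment.storeDocument(url, null, new Date(), new Date(), document.getTextLength(),
                null /* no ResponseHeader */, document, condenser, null /* no SearchEvent */, "localImport");
        }
    }
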
@@ -509,7 +453,7 @@ public class Segment {
             new byte[0], // md5
             (int) sourcesize, // size
             condenser.RESULT_NUMB_WORDS, // word count
-            Response.docType(document.dc_format()), // doctype
+            docType, // doctype
             condenser.RESULT_FLAGS, // flags
             UTF8.getBytes(language), // language
             document.inboundLinks().size(), // inbound links
@@ -519,25 +463,72 @@ public class Segment {
             document.getVideolinks().size(), // lvideo
             document.getApplinks().size() // lapp
         );
 
-        // STORE URL TO LOADED-URL-DB
-        this.urlMetadata.store(newEntry); // TODO: should be serialized; integrated in IODispatcher
-
+        this.urlMetadata.store(newEntry);
         final long storageEndTime = System.currentTimeMillis();
 
         // STORE PAGE INDEX INTO WORD INDEX DB
-        final int words = addPageIndex(
-                url, // document url
-                modDate, // document mod date
-                document, // document content
-                condenser, // document condenser
-                language, // document language
-                Response.docType(document.dc_format()), // document type
-                document.inboundLinks().size(), // inbound links
-                document.outboundLinks().size(), // outbound links
-                searchEvent, // a search event that can have results directly
-                sourceName // the name of the source where the index was created
-        );
+        int outlinksSame = document.inboundLinks().size();
+        int outlinksOther = document.outboundLinks().size();
+        final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
+        int wordCount = 0;
+        final int urlLength = urlNormalform.length();
+        final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
+
+        // create a word prototype which is re-used for all entries
+        final int len = (document == null) ? urlLength : document.dc_title().length();
+        final WordReferenceRow ientry = new WordReferenceRow(
+                url.hash(),
+                urlLength, urlComps, len,
+                condenser.RESULT_NUMB_WORDS,
+                condenser.RESULT_NUMB_SENTENCES,
+                modDate.getTime(),
+                System.currentTimeMillis(),
+                UTF8.getBytes(language),
+                docType,
+                outlinksSame, outlinksOther);
+
+        // iterate over all words of content text
+        Word wprop = null;
+        byte[] wordhash;
+        String word;
+        for (Map.Entry<String, Word> wentry: condenser.words().entrySet()) {
+            word = wentry.getKey();
+            wprop = wentry.getValue();
+            assert (wprop.flags != null);
+            ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
+            if (this.termIndex != null) try {
+                this.termIndex.add(wordhash, ientry);
+            } catch (final Exception e) {
+                Log.logException(e);
+            }
+            wordCount++;
+
+            // during a search event it is possible that a heuristic is used which aquires index
+            // data during search-time. To transfer indexed data directly to the search process
+            // the following lines push the index data additionally to the search process
+            // this is done only for searched words
+            if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) {
+                // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                    rankingProcess.add(container, true, sourceName, -1, 5000);
+                } catch (final RowSpaceExceededException e) {
+                    continue;
+                }
+            }
+        }
+        if (rankingProcess != null) rankingProcess.addFinalize();
+
+        // assign the catchall word
+        ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
+        if (this.termIndex != null) try {
+            this.termIndex.add(catchallHash, ientry);
+        } catch (final Exception e) {
+            Log.logException(e);
+        }
 
         // STORE PAGE REFERENCES INTO CITATION INDEX
         final int refs = addCitationIndex(url, modDate, document.getAnchors());
@@ -546,10 +537,8 @@ public class Segment {
         final long indexingEndTime = System.currentTimeMillis();
 
         if (this.log.isInfo()) {
-            // TODO: UTF-8 docDescription seems not to be displayed correctly because
-            // of string concatenation
-            this.log.logInfo("*Indexed " + words + " words in URL " + url +
-                    " [" + ASCII.String(url.hash()) + "]" +
+            this.log.logInfo("*Indexed " + wordCount + " words in URL " + url +
+                    " [" + id + "]" +
                     "\n\tDescription: " + dc_title +
                     "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                     "Size: " + document.getTextLength() + " bytes | " +
@@ -106,7 +106,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
     protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) {
         if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value);
     }
-
+
     protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) {
         if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
     }
@@ -163,7 +163,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
         addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
         addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
         addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
-        addSolr(solrdoc, SolrField.last_modified, header.lastModified());
+        addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified());
         addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
         final String content = yacydoc.getTextString();
         addSolr(solrdoc, SolrField.text_t, content);
@@ -224,10 +224,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
             if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2
             if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
         }
-        String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
-        if (x_robots_tag.isEmpty()) {
-            x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
-        } else {
+        String x_robots_tag = "";
+        if (header != null) {
+            x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
+            if (x_robots_tag.isEmpty()) {
+                x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
+            }
+        }
+        if (!x_robots_tag.isEmpty()) {
             // this tag may have values: noarchive, nosnippet, noindex, unavailable_after
             if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8
             if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9
@@ -398,7 +402,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
         }
 
         // response time
-        addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
+        addSolr(solrdoc, SolrField.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
         }
 
         // list all links
@@ -487,7 +491,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
             addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
             addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
         }
-        addSolr(solrdoc, SolrField.httpstatus_i, header.getStatusCode());
+        addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode());
 
         return solrdoc;
     }
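
A null-safety sketch matching the hedged calls above: with no HTTP header (e.g. a file import), the header-derived Solr fields fall back to defaults (current date, status 200, response time 0).

    import java.io.IOException;
    import net.yacy.cora.services.federated.solr.SolrDoc;
    import net.yacy.document.Document;
    import net.yacy.search.index.SolrConfiguration;

    // sketch only: build a Solr document without a ResponseHeader
    public class NullHeaderSketch {
        static SolrDoc toSolr(final SolrConfiguration scheme, final String id, final Document document) throws IOException {
            return scheme.yacy2solr(id, null /* no HTTP header */, document);
        }
    }
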
@@ -221,7 +221,8 @@ public final class RWIProcess extends Thread
                     System.currentTimeMillis() - timer),
                 false);
             if ( !index.isEmpty() ) {
-                add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true, this.maxtime);
+                add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, this.maxtime);
+                addFinalize();
             }
         } catch ( final Exception e ) {
             Log.logException(e);
@@ -230,12 +231,15 @@ public final class RWIProcess extends Thread
         }
     }
 
+    public void addFinalize() {
+        this.addRunning = false;
+    }
+
     public void add(
         final ReferenceContainer<WordReference> index,
         final boolean local,
         final String resourceName,
         final int fullResource,
-        final boolean finalizeAddAtEnd,
         final long maxtime) {
         // we collect the urlhashes and construct a list with urlEntry objects
         // attention: if minEntries is too high, this method will not terminate within the maxTime
@@ -422,10 +426,6 @@ public final class RWIProcess extends Thread
 
         } catch ( final InterruptedException e ) {
         } catch ( final RowSpaceExceededException e ) {
-        } finally {
-            if ( finalizeAddAtEnd ) {
-                this.addRunning = false;
-            }
         }
 
         //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
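
A sketch of the new feeder contract that replaces the removed finalizeAddAtEnd flag: add(...) may be called any number of times, and addFinalize() is called exactly once when no further containers will follow (as the Protocol and local-search call sites above now do).

    import net.yacy.kelondro.data.word.WordReference;
    import net.yacy.kelondro.rwi.ReferenceContainer;
    import net.yacy.search.query.RWIProcess;

    // sketch only: feed one container, then signal the end of feeding
    public class FeederSketch {
        static void feed(final RWIProcess proc, final ReferenceContainer<WordReference> container, final long maxtime) {
            proc.add(container, true, "example source", -1, maxtime); // fullResource = -1: unknown
            proc.addFinalize(); // no further add() calls will follow
        }
    }
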
@@ -503,7 +503,7 @@ public class SnippetProcess {
                 sd = sdl.get(0);
             }
             if (sd != null) {
-                solrContent = Switchboard.getSwitchboard().solrScheme.solrGetText(sd);
+                solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd);
             }
         }
 
@@ -666,7 +666,7 @@ public final class yacy {
         final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
         if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
 
-        final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
+        final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
         wordIndex.connectRWI(10000, Integer.MAX_VALUE);
         wordIndex.connectUrlDb(false, false);
         final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
@@ -845,7 +845,7 @@ public final class yacy {
         try {
             Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
             if (resource.equals("all")) {
-                WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
+                WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
                 WordIndex.connectRWI(10000, Integer.MAX_VALUE);
                 WordIndex.connectUrlDb(false, false);
                 indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);