Moved solr index-add method to the same method where the YaCy index is written. Also did some code cleanup.
Michael Peter Christen 2012-07-25 01:53:47 +02:00
parent 315d83cfa0
commit 6f1ddb2519
14 changed files with 275 additions and 227 deletions
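The heart of the commit is in Segment.storeDocument (see the Segment diff below): the per-document Solr add that Switchboard.condenseDocument used to perform on its own now runs next to the metadata and word-index writes. A condensed sketch of the new write order, with names taken from the hunks below and bodies abbreviated — this is an orientation aid, not the verbatim method:

// inside Segment.storeDocument(...) after this commit (abbreviated sketch)
final String id = ASCII.String(url.hash());
final String language = votedLanguage(url, urlNormalform, document, condenser); // newly extracted helper

// 1) Solr document, moved here from Switchboard.condenseDocument
if (this.connectedLocalSolr() || this.connectedRemoteSolr()) {
    try {
        final SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document);
        this.getSolr().add(solrDoc);
    } catch (final IOException e) {
        Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
    }
}

// 2) URL metadata into the loaded-URL db
this.urlMetadata.store(newEntry);

// 3) word references (the former addPageIndex, now inlined) and citation references
// ... see the Segment hunks below ...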

IndexFederated_p.java

@@ -137,7 +137,7 @@ public class IndexFederated_p {
}
// read index scheme table flags
final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.entryIterator();
final Iterator<ConfigurationSet.Entry> i = sb.index.getSolrScheme().entryIterator();
ConfigurationSet.Entry entry;
boolean modified = false; // flag to remember changes
while (i.hasNext()) {
@@ -160,7 +160,7 @@ public class IndexFederated_p {
}
if (modified) { // save settings to config file if modified
try {
sb.solrScheme.commit();
sb.index.getSolrScheme().commit();
modified = false;
} catch (IOException ex) {}
}
@@ -191,7 +191,7 @@ public class IndexFederated_p {
// use enum SolrField to keep defined order
for(SolrField field : SolrField.values()) {
prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark;
prop.put("scheme_" + c + "_checked", sb.solrScheme.contains(field.name()) ? 1 : 0);
prop.put("scheme_" + c + "_checked", sb.index.getSolrScheme().contains(field.name()) ? 1 : 0);
prop.putHTML("scheme_" + c + "_key", field.name());
prop.putHTML("scheme_" + c + "_solrfieldname",field.name().equalsIgnoreCase(field.getSolrFieldName()) ? "" : field.getSolrFieldName());
if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment());

schema_p.java

@@ -24,6 +24,7 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.index.SolrConfiguration;
import net.yacy.search.index.SolrField;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -37,8 +38,9 @@ public class schema_p {
// write scheme
int c = 0;
SolrConfiguration solrScheme = sb.index.getSolrScheme();
for (SolrField field : SolrField.values()) {
if (sb.solrScheme.contains(field.name())) {
if (solrScheme.contains(field.name())) {
prop.put("fields_" + c + "_solrname", field.getSolrFieldName());
prop.put("fields_" + c + "_type", field.getType().printName());
prop.put("fields_" + c + "_comment", field.getComment());

CrawlQueues.java

@@ -81,8 +81,8 @@ public class CrawlQueues {
this.log.logConfig("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
this.errorURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
}
public void relocate(final File newQueuePath) {
@@ -93,8 +93,8 @@ public class CrawlQueues {
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
}
public synchronized void close() {

ResponseHeader.java

@@ -159,4 +159,12 @@ public class ResponseHeader extends HeaderFramework {
}
return Charset.forName(charSetName);
}
public String getXRobotsTag() {
String x_robots_tag = this.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
}
return x_robots_tag;
}
}
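The new accessor folds the two header variants (X-Robots-Tag and the older X-Robots) into a single lookup that returns an empty string when neither field is present. A hypothetical call site, assuming a crawler Response named queueEntry as in the Switchboard code further down:

final ResponseHeader header = queueEntry.getResponseHeader();
final String xRobotsTag = header.getXRobotsTag(); // "" if neither header field is set
if (xRobotsTag.indexOf("noindex", 0) >= 0) {
    // the origin server asked us not to index this resource
}

Note that SolrConfiguration.yacy2solr (later in this diff) still carries an inline copy of the same two-field fallback rather than calling this helper.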

Protocol.java

@@ -786,7 +786,8 @@ public final class Protocol
// store remote result to local result container
// insert one container into the search result buffer
// one is enough, only the references are used, not the word
containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, true, time);
containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, time);
containerCache.addFinalize();
containerCache.addExpectedRemoteReferences(-count);
// insert the containers to the index

IndexingQueueEntry.java (new file)

@@ -0,0 +1,41 @@
/**
* IndexingQueueEntry
* Copyright 2012 by Michael Peter Christen
* First released 24.07.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.retrieval.Response;
public class IndexingQueueEntry extends WorkflowJob {
public Response queueEntry;
public Document[] documents;
public Condenser[] condenser;
public IndexingQueueEntry(final Response queueEntry, final Document[] documents, final Condenser[] condenser) {
super();
this.queueEntry = queueEntry;
this.documents = documents;
this.condenser = condenser;
}
}
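IndexingQueueEntry is a plain payload object for the indexing WorkflowProcessor stages declared in Switchboard (visible further down in this diff); it appears to have been promoted to its own file as part of the cleanup. An illustrative construction, mirroring the calls in Switchboard (variable names here are mine):

// condenser is still null at this point; the condensement stage fills it in later
final IndexingQueueEntry entry = new IndexingQueueEntry(response, documents, null);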

Shutdown.java (new file)

@@ -0,0 +1,47 @@
/**
* Shutdown
* Copyright 2012 by Michael Peter Christen
* First released 24.07.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search;
import net.yacy.kelondro.logging.Log;
public class Shutdown extends Thread {
private final Switchboard sb;
private final long delay;
private final String reason;
public Shutdown(final Switchboard sb, final long delay, final String reason) {
this.sb = sb;
this.delay = delay;
this.reason = reason;
}
@Override
public void run() {
try {
Thread.sleep(this.delay);
} catch ( final InterruptedException e ) {
this.sb.getLog().logInfo("interrupted delayed shutdown");
} catch ( final Exception e ) {
Log.logException(e);
}
this.sb.terminate(this.reason);
}
}
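An illustrative use of the new helper, assuming sb is the running Switchboard:

// shut the peer down five seconds from now; the delay runs off the caller's thread
new Shutdown(sb, 5000, "shutdown by user request").start();

Note that an interrupt only cancels the waiting, not the shutdown: terminate() is called after the try/catch, so an interrupted Shutdown thread terminates the peer immediately rather than not at all.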

Switchboard.java

@@ -97,7 +97,6 @@ import net.yacy.cora.protocol.http.ProxySettings;
import net.yacy.cora.services.federated.solr.ShardSelection;
import net.yacy.cora.services.federated.solr.ShardSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@@ -251,7 +250,6 @@ public final class Switchboard extends serverSwitch
public SeedDB peers;
public WorkTables tables;
public Tray tray;
public SolrConfiguration solrScheme;
public WorkflowProcessor<IndexingQueueEntry> indexingDocumentProcessor;
public WorkflowProcessor<IndexingQueueEntry> indexingCondensementProcessor;
@@ -376,16 +374,6 @@ public final class Switchboard extends serverSwitch
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
this.index = new Segment(this.log, new File(segmentsPath, "default"));
final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs);
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final String schemename = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SCHEMEFILE, "solr.keys.default.list");
@@ -395,11 +383,21 @@
}
final boolean solrlazy = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_LAZY, true);
final SolrConfiguration backupScheme = new SolrConfiguration(solrBackupProfile, solrlazy);
this.solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy);
final SolrConfiguration solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy);
// update the working scheme with the backup scheme. This is necessary to include new features.
// new features are always activated by default (if activated in input-backupScheme)
this.solrScheme.fill(backupScheme, true);
solrScheme.fill(backupScheme, true);
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
this.index = new Segment(this.log, new File(segmentsPath, "default"), solrScheme);
final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs);
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
@@ -1133,6 +1131,9 @@ public final class Switchboard extends serverSwitch
// switch the networks
synchronized ( this ) {
// remember the solr scheme
SolrConfiguration solrScheme = this.index.getSolrScheme();
// shut down
this.crawler.close();
if ( this.dhtDispatcher != null ) {
@@ -1179,7 +1180,7 @@
partitionExponent,
this.useTailCache,
this.exceed134217727);
this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"));
this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"), solrScheme);
final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
@@ -2395,55 +2396,8 @@ public final class Switchboard extends serverSwitch
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
boolean localSolr = this.index.connectedLocalSolr();
boolean remoteSolr = this.index.connectedRemoteSolr();
if (localSolr || remoteSolr) {
// send the documents to solr
for ( final Document doc : in.documents ) {
try {
final String id = UTF8.String(new DigestURI(doc.dc_identifier()).hash());
final String iquh = UTF8.String(in.queueEntry.url().hash());
if ( !id.equals(iquh) ) {
this.log.logWarning("condenseDocument consistency check doc="
+ id
+ ":"
+ doc.dc_identifier()
+ ", query="
+ iquh
+ ":"
+ in.queueEntry.url());
// in case that this happens it appears that the doc id is the right one
}
try {
SolrDoc solrDoc = this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc);
this.index.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning(
"SOLR",
"failed to send "
+ in.queueEntry.url().toNormalform(true, false)
+ " to solr: "
+ e.getMessage());
}
} catch ( final MalformedURLException e ) {
Log.logException(e);
continue;
}
}
}
// check if we should accept the document for our index
if (!this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)
+ "': indexing not wanted by federated rule for YaCy");
}
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
final List<Document> doclist = new ArrayList<Document>();
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<Document>();
for ( final Document document : in.documents ) {
if ( document.indexingDenied() ) {
if ( this.log.isInfo() ) {
@@ -2569,6 +2523,7 @@ public final class Switchboard extends serverSwitch
queueEntry.lastModified(),
new Date(),
queueEntry.size(),
queueEntry.getResponseHeader(),
document,
condenser,
searchEvent,

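For orientation: the Segment constructor gains a SolrConfiguration parameter, and the call sites touched by this commit supply it in four different ways (lines collected from the hunks in this diff):

// Switchboard startup: freshly loaded work scheme, merged with the backup defaults
this.index = new Segment(this.log, new File(segmentsPath, "default"), solrScheme);

// Switchboard network switch: the scheme remembered from the previous index
this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"), solrScheme);

// DocumentIndex: an optional scheme file, null when absent
super(new Log("DocumentIndex"), segmentPath, schemePath == null ? null : new SolrConfiguration(schemePath, true));

// yacy.java maintenance code: no Solr scheme at all
final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);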
DocumentIndex.java

@@ -73,9 +73,9 @@ public class DocumentIndex extends Segment
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
public DocumentIndex(final File segmentPath, final File schemePath, final CallbackListener callback, final int cachesize)
throws IOException {
super(new Log("DocumentIndex"), segmentPath);
super(new Log("DocumentIndex"), segmentPath, schemePath == null ? null : new SolrConfiguration(schemePath, true));
super.connectRWI(cachesize, targetFileSize * 4 - 1);
super.connectCitation(cachesize, targetFileSize * 4 - 1);
super.connectUrlDb(
@@ -174,6 +174,7 @@
new Date(url.lastModified()),
new Date(),
url.length(),
null,
document,
condenser,
null,
@@ -306,7 +307,7 @@
try {
if ( args[1].equals("add") ) {
final DigestURI f = new DigestURI(args[2]);
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000);
di.addConcurrent(f);
di.close();
} else {
@@ -315,7 +316,7 @@
query += args[i];
}
query.trim();
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000);
final ArrayList<DigestURI> results = di.find(query, 100);
for ( final DigestURI f : results ) {
if ( f != null ) {

View File

@@ -39,7 +39,9 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@@ -100,15 +102,16 @@ public class Segment {
private final Log log;
private final File segmentPath;
private final SolrConfiguration solrScheme;
protected final MetadataRepository urlMetadata;
protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex;
public Segment(final Log log, final File segmentPath) {
public Segment(final Log log, final File segmentPath, final SolrConfiguration solrScheme) {
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
this.solrScheme = solrScheme;
// create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath);
@@ -197,10 +200,15 @@ public class Segment {
public void disconnectLocalSolr() {
this.urlMetadata.disconnectLocalSolr();
}
public SolrConnector getSolr() {
return this.urlMetadata.getSolr();
}
public SolrConfiguration getSolrScheme() {
return this.solrScheme;
}
public SolrConnector getRemoteSolr() {
return this.urlMetadata.getRemoteSolr();
}
@@ -318,94 +326,6 @@ public class Segment {
return this.segmentPath;
}
/**
* this is called by the switchboard to put in a new page into the index
* use all the words in one condenser object to simultanous create index entries
*
* @param url
* @param urlModified
* @param document
* @param condenser
* @param language
* @param doctype
* @param outlinksSame
* @param outlinksOther
* @return
*/
private int addPageIndex(
final DigestURI url,
final Date urlModified,
final Document document,
final Condenser condenser,
final String language,
final char doctype,
final int outlinksSame,
final int outlinksOther,
final SearchEvent searchEvent,
final String sourceName) {
final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
int wordCount = 0;
final int urlLength = url.toNormalform(true, true).length();
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
// iterate over all words of content text
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
Map.Entry<String, Word> wentry;
String word;
final int len = (document == null) ? urlLength : document.dc_title().length();
final WordReferenceRow ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, len,
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES,
urlModified.getTime(),
System.currentTimeMillis(),
UTF8.getBytes(language),
doctype,
outlinksSame, outlinksOther);
Word wprop = null;
byte[] wordhash;
while (i.hasNext()) {
wentry = i.next();
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry.setWord(wprop);
wordhash = Word.word2hash(word);
if (this.termIndex != null) try {
this.termIndex.add(wordhash, ientry);
} catch (final Exception e) {
Log.logException(e);
}
wordCount++;
// during a search event it is possible that a heuristic is used which aquires index
// data during search-time. To transfer indexed data directly to the search process
// the following lines push the index data additionally to the search process
// this is done only for searched words
if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) {
// if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
ReferenceContainer<WordReference> container;
try {
container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
container.add(ientry);
rankingProcess.add(container, true, sourceName, -1, !i.hasNext(), 5000);
} catch (final RowSpaceExceededException e) {
continue;
}
}
}
// assign the catchall word
ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
if (this.termIndex != null) try {
this.termIndex.add(catchallHash, ientry);
} catch (final Exception e) {
Log.logException(e);
}
return wordCount;
}
private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<MultiProtocolURI, Properties> anchors) {
if (anchors == null) return 0;
int refCount = 0;
@@ -433,25 +353,12 @@ public class Segment {
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
}
public URIMetadataRow storeDocument(
final DigestURI url,
final DigestURI referrerURL,
Date modDate,
final Date loadDate,
final long sourcesize,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
final String sourceName
) throws IOException {
final long startTime = System.currentTimeMillis();
// CREATE INDEX
// load some document metadata
final String dc_title = document.dc_title();
// do a identification of the language
private String votedLanguage(
final DigestURI url,
final String urlNormalform,
final Document document,
final Condenser condenser) {
// do a identification of the language
String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes
final String bymetadata = document.dc_language(); // the languageByMetadata may return null if there was no declaration
if (language == null) {
@@ -466,7 +373,7 @@
else {
final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
// see if we have a hint in the url that the statistic was right
final String u = url.toNormalform(true, false).toLowerCase();
final String u = urlNormalform.toLowerCase();
if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) {
// no confirmation using the url, use the TLD
language = url.language();
@@ -491,9 +398,46 @@
}
}
}
return language;
}
// create a new loaded URL db entry
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
public URIMetadataRow storeDocument(
final DigestURI url,
final DigestURI referrerURL,
Date modDate,
final Date loadDate,
final long sourcesize,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
final String sourceName
) throws IOException {
final long startTime = System.currentTimeMillis();
// CREATE INDEX
// load some document metadata
final String id = ASCII.String(url.hash());
final String dc_title = document.dc_title();
final String urlNormalform = url.toNormalform(true, false);
final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
// STORE TO SOLR
boolean localSolr = this.connectedLocalSolr();
boolean remoteSolr = this.connectedRemoteSolr();
if (localSolr || remoteSolr) {
try {
SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document);
this.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
}
// STORE URL TO LOADED-URL-DB
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
char docType = Response.docType(document.dc_format());
final URIMetadataRow newEntry = new URIMetadataRow(
url, // URL
dc_title, // document description
@@ -509,7 +453,7 @@
new byte[0], // md5
(int) sourcesize, // size
condenser.RESULT_NUMB_WORDS, // word count
Response.docType(document.dc_format()), // doctype
docType, // doctype
condenser.RESULT_FLAGS, // flags
UTF8.getBytes(language), // language
document.inboundLinks().size(), // inbound links
@@ -519,25 +463,72 @@
document.getVideolinks().size(), // lvideo
document.getApplinks().size() // lapp
);
// STORE URL TO LOADED-URL-DB
this.urlMetadata.store(newEntry); // TODO: should be serialized; integrated in IODispatcher
this.urlMetadata.store(newEntry);
final long storageEndTime = System.currentTimeMillis();
// STORE PAGE INDEX INTO WORD INDEX DB
final int words = addPageIndex(
url, // document url
modDate, // document mod date
document, // document content
condenser, // document condenser
language, // document language
Response.docType(document.dc_format()), // document type
document.inboundLinks().size(), // inbound links
document.outboundLinks().size(), // outbound links
searchEvent, // a search event that can have results directly
sourceName // the name of the source where the index was created
);
int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size();
final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
int wordCount = 0;
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
// create a word prototype which is re-used for all entries
final int len = (document == null) ? urlLength : document.dc_title().length();
final WordReferenceRow ientry = new WordReferenceRow(
url.hash(),
urlLength, urlComps, len,
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES,
modDate.getTime(),
System.currentTimeMillis(),
UTF8.getBytes(language),
docType,
outlinksSame, outlinksOther);
// iterate over all words of content text
Word wprop = null;
byte[] wordhash;
String word;
for (Map.Entry<String, Word> wentry: condenser.words().entrySet()) {
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry.setWord(wprop);
wordhash = Word.word2hash(word);
if (this.termIndex != null) try {
this.termIndex.add(wordhash, ientry);
} catch (final Exception e) {
Log.logException(e);
}
wordCount++;
// during a search event it is possible that a heuristic is used which aquires index
// data during search-time. To transfer indexed data directly to the search process
// the following lines push the index data additionally to the search process
// this is done only for searched words
if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) {
// if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
ReferenceContainer<WordReference> container;
try {
container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
container.add(ientry);
rankingProcess.add(container, true, sourceName, -1, 5000);
} catch (final RowSpaceExceededException e) {
continue;
}
}
}
if (rankingProcess != null) rankingProcess.addFinalize();
// assign the catchall word
ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
if (this.termIndex != null) try {
this.termIndex.add(catchallHash, ientry);
} catch (final Exception e) {
Log.logException(e);
}
// STORE PAGE REFERENCES INTO CITATION INDEX
final int refs = addCitationIndex(url, modDate, document.getAnchors());
@@ -546,10 +537,8 @@
final long indexingEndTime = System.currentTimeMillis();
if (this.log.isInfo()) {
// TODO: UTF-8 docDescription seems not to be displayed correctly because
// of string concatenation
this.log.logInfo("*Indexed " + words + " words in URL " + url +
" [" + ASCII.String(url.hash()) + "]" +
this.log.logInfo("*Indexed " + wordCount + " words in URL " + url +
" [" + id + "]" +
"\n\tDescription: " + dc_title +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +

SolrConfiguration.java

@@ -106,7 +106,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
}
@@ -163,7 +163,7 @@
addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
addSolr(solrdoc, SolrField.last_modified, header.lastModified());
addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified());
addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString();
addSolr(solrdoc, SolrField.text_t, content);
@@ -224,10 +224,14 @@
if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2
if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
}
String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
} else {
String x_robots_tag = "";
if (header != null) {
x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
}
}
if (!x_robots_tag.isEmpty()) {
// this tag may have values: noarchive, nosnippet, noindex, unavailable_after
if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8
if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9
@@ -398,7 +402,7 @@
}
// response time
addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
addSolr(solrdoc, SolrField.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
}
// list all links
@@ -487,7 +491,7 @@
addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, SolrField.httpstatus_i, header.getStatusCode());
addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode());
return solrdoc;
}
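Because DocumentIndex can now pass a null ResponseHeader through storeDocument into yacy2solr (see its diff above), every header access in the scheme is guarded; the fallbacks are the current date for last_modified, 0 for responsetime_i, and HTTP 200 for httpstatus_i. A hypothetical call exercising that path:

// indexing a local file via DocumentIndex: no HTTP metadata is available
final SolrDoc solrDoc = solrScheme.yacy2solr(id, null, document);
// header-derived Solr fields receive the defaults listed above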

RWIProcess.java

@@ -221,7 +221,8 @@ public final class RWIProcess extends Thread
System.currentTimeMillis() - timer),
false);
if ( !index.isEmpty() ) {
add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true, this.maxtime);
add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, this.maxtime);
addFinalize();
}
} catch ( final Exception e ) {
Log.logException(e);
@@ -230,12 +231,15 @@
}
}
public void addFinalize() {
this.addRunning = false;
}
public void add(
final ReferenceContainer<WordReference> index,
final boolean local,
final String resourceName,
final int fullResource,
final boolean finalizeAddAtEnd,
final long maxtime) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
@@ -422,10 +426,6 @@
} catch ( final InterruptedException e ) {
} catch ( final RowSpaceExceededException e ) {
} finally {
if ( finalizeAddAtEnd ) {
this.addRunning = false;
}
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
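With the finalizeAddAtEnd flag removed, the end of the add phase is now a separate, explicit call, so a caller can feed several add() batches before declaring the stream complete. All call sites in this commit follow the same two-step pattern:

// as in Protocol, Segment and RWIProcess itself (see the hunks above)
rankingProcess.add(container, true, sourceName, -1, maxtime); // may be invoked repeatedly
rankingProcess.addFinalize();                                 // then: no more adds will come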

SnippetProcess.java

@@ -503,7 +503,7 @@ public class SnippetProcess {
sd = sdl.get(0);
}
if (sd != null) {
solrContent = Switchboard.getSwitchboard().solrScheme.solrGetText(sd);
solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd);
}
}

yacy.java

@@ -666,7 +666,7 @@ public final class yacy {
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
wordIndex.connectRWI(10000, Integer.MAX_VALUE);
wordIndex.connectUrlDb(false, false);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
@@ -845,7 +845,7 @@ public final class yacy {
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
WordIndex.connectRWI(10000, Integer.MAX_VALUE);
WordIndex.connectUrlDb(false, false);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);