mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- refactoring
- a little bit more abstraction - new interfaces for index abstraction git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5783 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
82fb60a720
commit
44e01afa5b
|
@ -130,7 +130,7 @@ public class IndexControlRWIs_p {
|
|||
int i = 0;
|
||||
urlx = new String[index.size()];
|
||||
while (en.hasNext()) {
|
||||
urlx[i++] = en.next().urlHash();
|
||||
urlx[i++] = en.next().metadataHash();
|
||||
}
|
||||
index = null;
|
||||
}
|
||||
|
@ -214,12 +214,12 @@ public class IndexControlRWIs_p {
|
|||
URLMetadataRow lurl;
|
||||
while (urlIter.hasNext()) {
|
||||
iEntry = urlIter.next();
|
||||
lurl = sb.webIndex.metadata().load(iEntry.urlHash(), null, 0);
|
||||
lurl = sb.webIndex.metadata().load(iEntry.metadataHash(), null, 0);
|
||||
if (lurl == null) {
|
||||
unknownURLEntries.add(iEntry.urlHash());
|
||||
unknownURLEntries.add(iEntry.metadataHash());
|
||||
urlIter.remove();
|
||||
} else {
|
||||
knownURLs.put(iEntry.urlHash(), lurl);
|
||||
knownURLs.put(iEntry.metadataHash(), lurl);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -148,7 +148,7 @@ public final class transferRWI {
|
|||
wordHash = estring.substring(0, p);
|
||||
wordhashes[received] = wordHash;
|
||||
iEntry = new WordReferenceRow(estring.substring(p));
|
||||
urlHash = iEntry.urlHash();
|
||||
urlHash = iEntry.metadataHash();
|
||||
|
||||
// block blacklisted entries
|
||||
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(Blacklist.BLACKLIST_DHT, urlHash))) {
|
||||
|
|
|
@ -50,6 +50,7 @@ import de.anomic.kelondro.index.ObjectIndex;
|
|||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.table.SplitTable;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
@ -99,7 +100,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
public synchronized URLMetadataRow load(final String urlHash, final Reference searchedWord, final long ranking) {
|
||||
public synchronized URLMetadataRow load(final String urlHash, final WordReference searchedWord, final long ranking) {
|
||||
// generates a URLMetadataRow (formerly plasmaLURLEntry) using the url hash
|
||||
// if the url cannot be found, this returns null
|
||||
if (urlHash == null) return null;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// Reference.java
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 07.11.2007 on http://www.anomic.de
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 03.04.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
|
@ -26,49 +26,19 @@
|
|||
|
||||
package de.anomic.kelondro.text;
|
||||
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.index.Row.Entry;
|
||||
|
||||
public interface Reference {
|
||||
|
||||
public String toPropertyForm();
|
||||
|
||||
public Entry toKelondroEntry();
|
||||
|
||||
public String urlHash();
|
||||
|
||||
public int virtualAge();
|
||||
public String metadataHash();
|
||||
|
||||
public long lastModified();
|
||||
|
||||
public long freshUntil();
|
||||
|
||||
public int hitcount();
|
||||
|
||||
public int posintext();
|
||||
|
||||
public int posinphrase();
|
||||
|
||||
public int posofphrase();
|
||||
|
||||
public int wordsintext();
|
||||
|
||||
public int phrasesintext();
|
||||
|
||||
public String getLanguage();
|
||||
|
||||
public char getType();
|
||||
|
||||
public int wordsintitle();
|
||||
|
||||
public int llocal();
|
||||
|
||||
public int lother();
|
||||
|
||||
public int urllength();
|
||||
|
||||
public int urlcomps();
|
||||
|
||||
public Bitfield flags();
|
||||
|
||||
public double termFrequency();
|
||||
//public long freshUntil();
|
||||
|
||||
public String toString();
|
||||
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.util.TreeMap;
|
|||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.index.RowSet;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.ByteBuffer;
|
||||
|
@ -102,21 +103,6 @@ public class ReferenceContainer extends RowSet {
|
|||
this.lastTimeWrote = updateTime;
|
||||
}
|
||||
|
||||
public static final ReferenceContainer mergeUnique(final ReferenceContainer a, final boolean aIsClone, final ReferenceContainer b, final boolean bIsClone) {
|
||||
if ((aIsClone) && (bIsClone)) {
|
||||
if (a.size() > b.size()) return (ReferenceContainer) mergeUnique(a, b); else return (ReferenceContainer) mergeUnique(b, a);
|
||||
}
|
||||
if (aIsClone) return (ReferenceContainer) mergeUnique(a, b);
|
||||
if (bIsClone) return (ReferenceContainer) mergeUnique(b, a);
|
||||
if (a.size() > b.size()) return (ReferenceContainer) mergeUnique(a, b); else return (ReferenceContainer) mergeUnique(b, a);
|
||||
}
|
||||
|
||||
public static Object mergeUnique(final Object a, final Object b) {
|
||||
final ReferenceContainer c = (ReferenceContainer) a;
|
||||
c.addAllUnique((ReferenceContainer) b);
|
||||
return c;
|
||||
}
|
||||
|
||||
public ReferenceContainer merge(final ReferenceContainer c) {
|
||||
return new ReferenceContainer(this.termHash, super.merge(c));
|
||||
}
|
||||
|
@ -162,7 +148,7 @@ public class ReferenceContainer extends RowSet {
|
|||
return x;
|
||||
}
|
||||
|
||||
public Reference get(final String urlHash) {
|
||||
public WordReference get(final String urlHash) {
|
||||
final Row.Entry entry = this.get(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new WordReferenceRow(entry);
|
||||
|
@ -215,6 +201,12 @@ public class ReferenceContainer extends RowSet {
|
|||
|
||||
}
|
||||
|
||||
public static Object mergeUnique(final Object a, final Object b) {
|
||||
final ReferenceContainer c = (ReferenceContainer) a;
|
||||
c.addAllUnique((ReferenceContainer) b);
|
||||
return c;
|
||||
}
|
||||
|
||||
public static final Method containerMergeMethod;
|
||||
static {
|
||||
Method meth = null;
|
||||
|
@ -342,13 +334,13 @@ public class ReferenceContainer extends RowSet {
|
|||
final ReferenceContainer conj = new ReferenceContainer(null, small.rowdef, 0); // start with empty search result
|
||||
final Iterator<WordReferenceRow> se = small.entries();
|
||||
WordReferenceVars ie0;
|
||||
Reference ie1;
|
||||
WordReference ie1;
|
||||
while (se.hasNext()) {
|
||||
ie0 = new WordReferenceVars(se.next());
|
||||
ie1 = large.get(ie0.urlHash());
|
||||
ie1 = large.get(ie0.metadataHash());
|
||||
if ((ie0 != null) && (ie1 != null)) {
|
||||
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
|
||||
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
|
||||
assert (ie0.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie0.metadataHash();
|
||||
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
|
||||
// this is a hit. Calculate word distance:
|
||||
ie0.join(ie1);
|
||||
if (ie0.worddistance() <= maxDistance) conj.add(ie0.toRowEntry());
|
||||
|
@ -369,14 +361,14 @@ public class ReferenceContainer extends RowSet {
|
|||
int c;
|
||||
if ((e1.hasNext()) && (e2.hasNext())) {
|
||||
WordReferenceVars ie1;
|
||||
Reference ie2;
|
||||
WordReference ie2;
|
||||
ie1 = new WordReferenceVars(e1.next());
|
||||
ie2 = e2.next();
|
||||
|
||||
while (true) {
|
||||
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
|
||||
assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
|
||||
c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
|
||||
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
|
||||
assert (ie2.metadataHash().length() == keylength) : "ie2.urlHash() = " + ie2.metadataHash();
|
||||
c = i1.rowdef.getOrdering().compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes());
|
||||
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
|
||||
if (c < 0) {
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
|
@ -422,11 +414,11 @@ public class ReferenceContainer extends RowSet {
|
|||
Reference ie0, ie1;
|
||||
while (se.hasNext()) {
|
||||
ie0 = se.next();
|
||||
ie1 = excl.get(ie0.urlHash());
|
||||
ie1 = excl.get(ie0.metadataHash());
|
||||
if ((ie0 != null) && (ie1 != null)) {
|
||||
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
|
||||
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
|
||||
if (iterate_pivot) se.remove(); pivot.remove(ie0.urlHash().getBytes());
|
||||
assert (ie0.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie0.metadataHash();
|
||||
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
|
||||
if (iterate_pivot) se.remove(); pivot.remove(ie0.metadataHash().getBytes());
|
||||
}
|
||||
}
|
||||
return pivot;
|
||||
|
@ -442,14 +434,14 @@ public class ReferenceContainer extends RowSet {
|
|||
int c;
|
||||
if ((e1.hasNext()) && (e2.hasNext())) {
|
||||
WordReferenceVars ie1;
|
||||
Reference ie2;
|
||||
WordReference ie2;
|
||||
ie1 = new WordReferenceVars(e1.next());
|
||||
ie2 = e2.next();
|
||||
|
||||
while (true) {
|
||||
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
|
||||
assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
|
||||
c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
|
||||
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
|
||||
assert (ie2.metadataHash().length() == keylength) : "ie2.urlHash() = " + ie2.metadataHash();
|
||||
c = pivot.rowdef.getOrdering().compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes());
|
||||
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
|
||||
if (c < 0) {
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
|
@ -486,12 +478,12 @@ public class ReferenceContainer extends RowSet {
|
|||
String dom, paths;
|
||||
while (i.hasNext()) {
|
||||
iEntry = i.next();
|
||||
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
|
||||
dom = iEntry.urlHash().substring(6);
|
||||
if ((excludeContainer != null) && (excludeContainer.get(iEntry.metadataHash()) != null)) continue; // do not include urls that are in excludeContainer
|
||||
dom = iEntry.metadataHash().substring(6);
|
||||
if ((paths = doms.get(dom)) == null) {
|
||||
doms.put(dom, iEntry.urlHash().substring(0, 6));
|
||||
doms.put(dom, iEntry.metadataHash().substring(0, 6));
|
||||
} else {
|
||||
doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
|
||||
doms.put(dom, paths + iEntry.metadataHash().substring(0, 6));
|
||||
}
|
||||
if (System.currentTimeMillis() > timeout)
|
||||
break;
|
||||
|
|
|
@ -378,7 +378,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
WordReferenceRow ee;
|
||||
while (e.hasNext()) {
|
||||
ee = e.next();
|
||||
if (urlselection.contains(ee.urlHash())) c1.add(ee);
|
||||
if (urlselection.contains(ee.metadataHash())) c1.add(ee);
|
||||
}
|
||||
return c1;
|
||||
}
|
||||
|
|
36
source/de/anomic/kelondro/text/ReferenceFactory.java
Normal file
36
source/de/anomic/kelondro/text/ReferenceFactory.java
Normal file
|
@ -0,0 +1,36 @@
|
|||
// ReferenceFactory.java
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 06.04.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
|
||||
// $LastChangedRevision: 5777 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
|
||||
import de.anomic.kelondro.index.Row;
|
||||
|
||||
public interface ReferenceFactory<ReferenceType extends Reference> {
|
||||
|
||||
public ReferenceType produce(Row.Entry e);
|
||||
|
||||
public Row.Entry recycle(ReferenceType r);
|
||||
}
|
|
@ -121,8 +121,8 @@ public class ReferenceOrder {
|
|||
final long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
|
||||
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
|
||||
final long r =
|
||||
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
|
||||
+ ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) : 0)
|
||||
((256 - yacyURL.domLengthNormalized(t.metadataHash())) << ranking.coeff_domlength)
|
||||
+ ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.metadataHash()) << 4)) << ranking.coeff_ybr) : 0)
|
||||
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
|
||||
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
|
||||
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
|
||||
|
@ -137,7 +137,7 @@ public class ReferenceOrder {
|
|||
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
|
||||
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
|
||||
+ tf
|
||||
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
|
||||
+ ((ranking.coeff_authority > 12) ? (authority(t.metadataHash()) << ranking.coeff_authority) : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
|
||||
|
@ -150,7 +150,7 @@ public class ReferenceOrder {
|
|||
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
|
||||
+ ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0)
|
||||
+ ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0);
|
||||
+ ((yacyURL.probablyRootURL(t.metadataHash())) ? 15 << ranking.coeff_urllength : 0);
|
||||
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
|
||||
|
||||
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
|
||||
|
@ -195,7 +195,7 @@ public class ReferenceOrder {
|
|||
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
|
||||
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
|
||||
// update domcount
|
||||
dom = iEntry.urlHash().substring(6);
|
||||
dom = iEntry.metadataHash().substring(6);
|
||||
count = doms.get(dom);
|
||||
if (count == null) {
|
||||
doms.put(dom, int1);
|
||||
|
|
|
@ -40,7 +40,7 @@ import de.anomic.kelondro.order.Bitfield;
|
|||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Metadata;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
|
@ -119,7 +119,7 @@ public class URLMetadataRow implements Metadata {
|
|||
|
||||
private final Row.Entry entry;
|
||||
private final String snippet;
|
||||
private Reference word; // this is only used if the url is transported via remote search requests
|
||||
private WordReference word; // this is only used if the url is transported via remote search requests
|
||||
private final long ranking; // during generation of a search result this value is set
|
||||
|
||||
public URLMetadataRow(
|
||||
|
@ -201,7 +201,7 @@ public class URLMetadataRow implements Metadata {
|
|||
}
|
||||
}
|
||||
|
||||
public URLMetadataRow(final Row.Entry entry, final Reference searchedWord, final long ranking) {
|
||||
public URLMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
|
||||
this.entry = entry;
|
||||
this.snippet = null;
|
||||
this.word = searchedWord;
|
||||
|
@ -427,7 +427,7 @@ public class URLMetadataRow implements Metadata {
|
|||
return snippet;
|
||||
}
|
||||
|
||||
public Reference word() {
|
||||
public WordReference word() {
|
||||
return word;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
// CitationReferenceRow.java
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 03.04.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text.referencePrototype;
|
||||
|
||||
import de.anomic.kelondro.index.Column;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.index.Row.Entry;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.MicroDate;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public final class CitationReferenceRow /*implements Reference, Cloneable*/ {
|
||||
|
||||
// this object stores citation attributes to URL references
|
||||
|
||||
public static final Row citationRow = new Row(new Column[]{
|
||||
new Column("h", Column.celltype_string, Column.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
|
||||
new Column("a", Column.celltype_cardinal, Column.encoder_b256, 2, "lastModified"),
|
||||
new Column("a", Column.celltype_cardinal, Column.encoder_b256, 2, "lastAccessed"),
|
||||
new Column("t", Column.celltype_cardinal, Column.encoder_b256, 2, "posintext"),
|
||||
new Column("x", Column.celltype_cardinal, Column.encoder_b256, 1, "llocal"),
|
||||
new Column("y", Column.celltype_cardinal, Column.encoder_b256, 1, "lother"),
|
||||
new Column("m", Column.celltype_cardinal, Column.encoder_b256, 1, "urlLength"),
|
||||
new Column("n", Column.celltype_cardinal, Column.encoder_b256, 1, "urlComps"),
|
||||
new Column("g", Column.celltype_binary, Column.encoder_bytes, 1, "typeofurl"),
|
||||
new Column("k", Column.celltype_cardinal, Column.encoder_b256, 1, "reserve")
|
||||
},
|
||||
Base64Order.enhancedCoder
|
||||
);
|
||||
// available chars: b,e,j,q
|
||||
|
||||
// static properties
|
||||
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
||||
private static final int col_lastModified = 1; // a 2 last-modified time of the document where url appears
|
||||
private static final int col_lastAccessed = 2; // a 2 curent time when the url was seen
|
||||
private static final int col_posintext = 3; // t 2 appearance of url in text; simply counts up the urls
|
||||
private static final int col_llocal = 4; // x 1 outlinks to same domain
|
||||
private static final int col_lother = 5; // y 1 outlinks to other domain
|
||||
private static final int col_urlLength = 6; // m 1 byte-length of complete URL
|
||||
private static final int col_urlComps = 7; // n 1 number of path components
|
||||
private static final int col_typeofurl = 8; // g typeofurl
|
||||
private static final int col_reserve = 9; // k 1 reserve2
|
||||
|
||||
private final Row.Entry entry;
|
||||
|
||||
public CitationReferenceRow(
|
||||
final String urlHash,
|
||||
final long lastmodified, // last-modified time of the document where word appears
|
||||
final long updatetime, // update time
|
||||
final int posintext, // occurrence of url; counts the url
|
||||
final int llocal,
|
||||
final int lother,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final byte typeofurl // outlinks to same domain
|
||||
) {
|
||||
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
||||
this.entry = citationRow.newEntry();
|
||||
final int mddlm = MicroDate.microDateDays(lastmodified);
|
||||
final int mddct = MicroDate.microDateDays(updatetime);
|
||||
this.entry.setCol(col_urlhash, urlHash, null);
|
||||
this.entry.setCol(col_lastModified, mddlm);
|
||||
this.entry.setCol(col_lastAccessed, mddct);
|
||||
this.entry.setCol(col_posintext, posintext);
|
||||
this.entry.setCol(col_llocal, llocal);
|
||||
this.entry.setCol(col_lother, lother);
|
||||
this.entry.setCol(col_urlLength, urlLength);
|
||||
this.entry.setCol(col_urlComps, urlComps);
|
||||
this.entry.setCol(col_typeofurl, new byte[]{(byte) typeofurl});
|
||||
this.entry.setCol(col_reserve, 0);
|
||||
}
|
||||
|
||||
public CitationReferenceRow(final String urlHash, final String code) {
|
||||
// the code is the external form of the row minus the leading urlHash entry
|
||||
this.entry = citationRow.newEntry((urlHash + code).getBytes());
|
||||
}
|
||||
|
||||
public CitationReferenceRow(final String external) {
|
||||
this.entry = citationRow.newEntry(external, true);
|
||||
}
|
||||
|
||||
public CitationReferenceRow(final byte[] row) {
|
||||
this.entry = citationRow.newEntry(row);
|
||||
}
|
||||
|
||||
public CitationReferenceRow(final byte[] row, final int offset, final boolean clone) {
|
||||
this.entry = citationRow.newEntry(row, offset, clone);
|
||||
}
|
||||
|
||||
public CitationReferenceRow(final Row.Entry rentry) {
|
||||
// FIXME: see if cloning is necessary
|
||||
this.entry = rentry;
|
||||
}
|
||||
|
||||
public CitationReferenceRow clone() {
|
||||
final byte[] b = new byte[citationRow.objectsize];
|
||||
System.arraycopy(entry.bytes(), 0, b, 0, citationRow.objectsize);
|
||||
return new CitationReferenceRow(b);
|
||||
}
|
||||
|
||||
public String toPropertyForm() {
|
||||
return entry.toPropertyForm(true, true, false);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
return this.entry;
|
||||
}
|
||||
|
||||
public String urlHash() {
|
||||
return this.entry.getColString(col_urlhash, null);
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
|
||||
}
|
||||
|
||||
public long lastModified() {
|
||||
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
|
||||
}
|
||||
|
||||
public int posintext() {
|
||||
return (int) this.entry.getColLong(col_posintext);
|
||||
}
|
||||
|
||||
public int llocal() {
|
||||
return (int) this.entry.getColLong(col_llocal);
|
||||
}
|
||||
|
||||
public int lother() {
|
||||
return (int) this.entry.getColLong(col_lother);
|
||||
}
|
||||
|
||||
public int urllength() {
|
||||
return (int) this.entry.getColLong(col_urlLength);
|
||||
}
|
||||
|
||||
public int urlcomps() {
|
||||
return (int) this.entry.getColLong(col_urlComps);
|
||||
}
|
||||
|
||||
public double citationFrequency() {
|
||||
return 1.0 / ((double) (llocal() + lother() + 1));
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return toPropertyForm();
|
||||
}
|
||||
|
||||
public boolean isNewer(final Reference other) {
|
||||
if (other == null) return true;
|
||||
if (this.lastModified() > other.lastModified()) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isOlder(final Reference other) {
|
||||
if (other == null) return false;
|
||||
if (this.lastModified() < other.lastModified()) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.urlHash().hashCode();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
// WordReference.java
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 07.11.2007 on http://www.anomic.de
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
|
||||
// $LastChangedRevision: 5777 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text.referencePrototype;
|
||||
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
|
||||
public interface WordReference extends Reference {
|
||||
|
||||
public int virtualAge();
|
||||
|
||||
public int hitcount();
|
||||
|
||||
public int posintext();
|
||||
|
||||
public int posinphrase();
|
||||
|
||||
public int posofphrase();
|
||||
|
||||
public int wordsintext();
|
||||
|
||||
public int phrasesintext();
|
||||
|
||||
public String getLanguage();
|
||||
|
||||
public char getType();
|
||||
|
||||
public int wordsintitle();
|
||||
|
||||
public int llocal();
|
||||
|
||||
public int lother();
|
||||
|
||||
public int urllength();
|
||||
|
||||
public int urlcomps();
|
||||
|
||||
public Bitfield flags();
|
||||
|
||||
public double termFrequency();
|
||||
|
||||
}
|
|
@ -35,7 +35,7 @@ import de.anomic.kelondro.order.MicroDate;
|
|||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public final class WordReferenceRow implements Reference, Cloneable {
|
||||
public final class WordReferenceRow implements WordReference, Cloneable {
|
||||
|
||||
// this object stores attributes to URL references inside RWI collections
|
||||
|
||||
|
@ -182,7 +182,7 @@ public final class WordReferenceRow implements Reference, Cloneable {
|
|||
return this.entry;
|
||||
}
|
||||
|
||||
public String urlHash() {
|
||||
public String metadataHash() {
|
||||
return this.entry.getColString(col_urlhash, null);
|
||||
}
|
||||
|
||||
|
@ -275,6 +275,6 @@ public final class WordReferenceRow implements Reference, Cloneable {
|
|||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.urlHash().hashCode();
|
||||
return this.metadataHash().hashCode();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,14 +26,15 @@
|
|||
|
||||
package de.anomic.kelondro.text.referencePrototype;
|
||||
|
||||
import de.anomic.kelondro.index.Row.Entry;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.MicroDate;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
|
||||
public class WordReferenceVars implements Reference, Cloneable {
|
||||
public class WordReferenceVars implements WordReference, Cloneable {
|
||||
|
||||
public Bitfield flags;
|
||||
public long freshUntil, lastModified;
|
||||
public long lastModified;
|
||||
public String language, urlHash;
|
||||
public char type;
|
||||
public int hitcount, llocal, lother, phrasesintext, posintext,
|
||||
|
@ -54,7 +55,7 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
final int posofphrase, // number of the phrase where word appears
|
||||
final long lastmodified, // last-modified time of the document where word appears
|
||||
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
String language, // (guessed) language of document
|
||||
String language, // (guessed) language of document
|
||||
final char doctype, // type of document
|
||||
final int outlinksSame, // outlinks to same domain
|
||||
final int outlinksOther, // outlinks to other domain
|
||||
|
@ -64,9 +65,9 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
) {
|
||||
if ((language == null) || (language.length() != 2)) language = "uk";
|
||||
final int mddlm = MicroDate.microDateDays(lastmodified);
|
||||
final int mddct = MicroDate.microDateDays(updatetime);
|
||||
//final int mddct = MicroDate.microDateDays(updatetime);
|
||||
this.flags = flags;
|
||||
this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
|
||||
//this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
|
||||
this.lastModified = lastmodified;
|
||||
this.language = language;
|
||||
this.urlHash = urlHash;
|
||||
|
@ -89,10 +90,10 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
|
||||
public WordReferenceVars(final WordReferenceRow e) {
|
||||
this.flags = e.flags();
|
||||
this.freshUntil = e.freshUntil();
|
||||
//this.freshUntil = e.freshUntil();
|
||||
this.lastModified = e.lastModified();
|
||||
this.language = e.getLanguage();
|
||||
this.urlHash = e.urlHash();
|
||||
this.urlHash = e.metadataHash();
|
||||
this.type = e.getType();
|
||||
this.hitcount = e.hitcount();
|
||||
this.llocal = e.llocal();
|
||||
|
@ -134,26 +135,26 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
return c;
|
||||
}
|
||||
|
||||
/**
 * Merges the positional and frequency values of another WordReferenceVars
 * into this one, modifying this object in place.
 * The parameter is spelled "oe" in the pre-change lines and "v" in the
 * post-change lines shown below; the statements are otherwise parallel,
 * except that the post-change distance line calls v.worddistance() where
 * the pre-change line reads the oe.worddistance field.
 *
 * Effect on this object:
 *   worddistance  - sum of both distances plus the absolute difference of
 *                   the two posintext values
 *   posintext     - minimum of both
 *   posinphrase   - minimum of both if both have the same posofphrase,
 *                   otherwise 0
 *   posofphrase   - minimum of both
 *   wordsintext   - sum of both
 *   termFrequency - sum of both
 */
public void join(final WordReferenceVars oe) {
|
||||
public void join(final WordReferenceVars v) {
|
||||
// combine the distance
|
||||
// worddistance uses the still-unmodified posintext, so it must be computed first
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
|
||||
this.posintext = Math.min(this.posintext, oe.posintext);
|
||||
// posinphrase compares the still-unmodified posofphrase values, so it precedes the posofphrase update
this.posinphrase = (this.posofphrase == oe.posofphrase) ? Math.min(this.posinphrase, oe.posinphrase) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
|
||||
this.worddistance = this.worddistance + v.worddistance() + Math.abs(this.posintext - v.posintext);
|
||||
this.posintext = Math.min(this.posintext, v.posintext);
|
||||
this.posinphrase = (this.posofphrase == v.posofphrase) ? Math.min(this.posinphrase, v.posinphrase) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, v.posofphrase);
|
||||

||||
// combine term frequency
|
||||
this.wordsintext = this.wordsintext + oe.wordsintext;
|
||||
this.termFrequency = this.termFrequency + oe.termFrequency;
|
||||
this.wordsintext = this.wordsintext + v.wordsintext;
|
||||
this.termFrequency = this.termFrequency + v.termFrequency;
|
||||
}
|
||||
|
||||
/**
 * Returns the Bitfield stored in the flags field of this reference.
 * The field object itself is returned, not a copy.
 */
public Bitfield flags() {
|
||||
return flags;
|
||||
}
|
||||
|
||||
/*
|
||||
public long freshUntil() {
|
||||
return freshUntil;
|
||||
}
|
||||
|
||||
*/
|
||||
/**
 * Returns the value of the language field of this reference.
 */
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
@ -225,12 +226,16 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
flags // attributes to the url and to the word according the url
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Returns the Entry form of this reference by delegation: the object is
 * first converted with toRowEntry(), and that result's toKelondroEntry()
 * value is returned. A new conversion is performed on every call.
 */
public Entry toKelondroEntry() {
|
||||
return toRowEntry().toKelondroEntry();
|
||||
}
|
||||
|
||||
/**
 * Returns the property-form string of this reference by delegation: the
 * object is converted with toRowEntry() and that result's toPropertyForm()
 * value is returned.
 */
public String toPropertyForm() {
|
||||
return toRowEntry().toPropertyForm();
|
||||
}
|
||||
|
||||
/**
 * Returns the value of the urlHash field. The accessor is named urlHash()
 * in the pre-change line and metadataHash() in the post-change line; the
 * backing field keeps the name urlHash in both.
 */
public String urlHash() {
|
||||
public String metadataHash() {
|
||||
return urlHash;
|
||||
}
|
||||
|
||||
|
@ -278,7 +283,7 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.worddistance > (v = other.worddistance)) this.worddistance = v;
|
||||
if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
|
||||
if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
|
||||
//if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength > (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
|
@ -300,14 +305,14 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.worddistance < (v = other.worddistance)) this.worddistance = v;
|
||||
if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
|
||||
if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
|
||||
//if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength < (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public void join(final Reference oe) {
|
||||
public void join(final WordReference oe) {
|
||||
// joins two entries into one entry
|
||||
|
||||
// combine the distance
|
||||
|
@ -324,4 +329,6 @@ public class WordReferenceVars implements Reference, Cloneable {
|
|||
/**
 * Hash code of this reference: the hash code of the urlHash string, read
 * directly from the field. No other field contributes to the value.
 * Dereferences urlHash without a null check.
 */
public int hashCode() {
|
||||
return this.urlHash.hashCode();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -121,7 +121,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
|
|||
|
||||
// getting next word index entry
|
||||
importWordIdxEntry = importWordIdxEntries.next();
|
||||
final String urlHash = importWordIdxEntry.urlHash();
|
||||
final String urlHash = importWordIdxEntry.metadataHash();
|
||||
entityUrls.add(urlHash);
|
||||
}
|
||||
|
||||
|
|
|
@ -138,7 +138,7 @@ public class plasmaSearchAPI {
|
|||
if (rn == -1) rn = entry.ranking();
|
||||
prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
|
||||
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
|
||||
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash());
|
||||
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().metadataHash());
|
||||
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
|
||||
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
|
||||
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
|
||||
|
@ -173,7 +173,7 @@ public class plasmaSearchAPI {
|
|||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
|
||||
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
|
||||
((yacyURL.probablyRootURL(entry.word().metadataHash())) ? "probably root url" : "")
|
||||
);
|
||||
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) {
|
||||
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1");
|
||||
|
|
|
@ -43,6 +43,7 @@ import de.anomic.kelondro.text.Reference;
|
|||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceOrder;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.SortStack;
|
||||
|
@ -158,7 +159,7 @@ public final class plasmaSearchRankingProcess {
|
|||
Long r;
|
||||
while (i.hasNext()) {
|
||||
iEntry = i.next();
|
||||
assert (iEntry.urlHash().length() == index.row().primaryKeyLength);
|
||||
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
|
||||
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
|
||||
|
||||
// increase flag counts
|
||||
|
@ -182,13 +183,13 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
|
||||
// check tld domain
|
||||
if (!yacyURL.matchesAnyDomDomain(iEntry.urlHash(), this.query.zonecode)) {
|
||||
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
|
||||
// filter out all tld that do not match with wanted tld domain
|
||||
continue;
|
||||
}
|
||||
|
||||
// check site constraints
|
||||
if (query.sitehash != null && !iEntry.urlHash().substring(6).equals(query.sitehash)) {
|
||||
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
|
||||
// filter out all domains that do not match with the site constraint
|
||||
}
|
||||
|
||||
|
@ -198,12 +199,12 @@ public final class plasmaSearchRankingProcess {
|
|||
yacyURL uurl = (uentry == null) ? null : uentry.comp().url();
|
||||
System.out.println("DEBUG domDomain dom=" + ((uurl == null) ? "null" : uurl.getHost()) + ", zone=" + yacyURL.domDomain(iEntry.urlHash()));
|
||||
*/
|
||||
this.domZones[yacyURL.domDomain(iEntry.urlHash())]++;
|
||||
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
|
||||
|
||||
// insert
|
||||
if ((maxentries < 0) || (stack.size() < maxentries)) {
|
||||
// in case that we don't have enough yet, accept any new entry
|
||||
if (urlhashes.containsKey(iEntry.urlHash())) continue;
|
||||
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
|
||||
stack.push(iEntry, r);
|
||||
} else {
|
||||
// if we already have enough entries, insert only such that are necessary to get a better result
|
||||
|
@ -211,7 +212,7 @@ public final class plasmaSearchRankingProcess {
|
|||
continue;
|
||||
}
|
||||
// double-check
|
||||
if (urlhashes.containsKey(iEntry.urlHash())) continue;
|
||||
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
|
||||
stack.push(iEntry, r);
|
||||
}
|
||||
|
||||
|
@ -223,7 +224,7 @@ public final class plasmaSearchRankingProcess {
|
|||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
|
||||
}
|
||||
|
||||
private boolean testFlags(final Reference ientry) {
|
||||
private boolean testFlags(final WordReference ientry) {
|
||||
if (query.constraint == null) return true;
|
||||
// test if ientry matches with filter
|
||||
// if all = true: let only entries pass that has all matching bits
|
||||
|
@ -261,7 +262,7 @@ public final class plasmaSearchRankingProcess {
|
|||
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
|
||||
if (!skipDoubleDom) return rwi;
|
||||
// check doubledom
|
||||
final String domhash = rwi.element.urlHash().substring(6);
|
||||
final String domhash = rwi.element.metadataHash().substring(6);
|
||||
m = this.doubleDomCache.get(domhash);
|
||||
if (m == null) {
|
||||
// first appearance of dom
|
||||
|
@ -292,9 +293,9 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
if (bestEntry == null) return null;
|
||||
// finally remove the best entry from the doubledom cache
|
||||
m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6));
|
||||
m = this.doubleDomCache.get(bestEntry.element.metadataHash().substring(6));
|
||||
o = m.pop();
|
||||
assert o == null || o.element.urlHash().equals(bestEntry.element.urlHash());
|
||||
assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash());
|
||||
return bestEntry;
|
||||
}
|
||||
|
||||
|
@ -304,13 +305,13 @@ public final class plasmaSearchRankingProcess {
|
|||
if (((stack.size() == 0) && (size() == 0))) break;
|
||||
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
|
||||
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
|
||||
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
|
||||
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
|
||||
if (u != null) {
|
||||
final URLMetadataRow.Components metadata = u.metadata();
|
||||
if (metadata.url() != null) this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
|
||||
return u;
|
||||
}
|
||||
misses.add(obrwi.element.urlHash());
|
||||
misses.add(obrwi.element.metadataHash());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -210,7 +210,7 @@ public final class plasmaWordIndex {
|
|||
}
|
||||
}
|
||||
initActiveCrawlProfiles();
|
||||
log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
|
||||
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
|
||||
", " + this.profilesActiveCrawls.size() + " entries" +
|
||||
", " + profilesActiveFile.length()/1024);
|
||||
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
|
||||
|
@ -230,7 +230,7 @@ public final class plasmaWordIndex {
|
|||
this.profilesPassiveCrawls = null;
|
||||
}
|
||||
}
|
||||
log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
|
||||
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
|
||||
", " + this.profilesPassiveCrawls.size() + " entries" +
|
||||
", " + profilesPassiveFile.length()/1024);
|
||||
|
||||
|
@ -665,13 +665,13 @@ public final class plasmaWordIndex {
|
|||
entry = containerIterator.next();
|
||||
// System.out.println("Wordhash: "+wordHash+" UrlHash:
|
||||
// "+entry.getUrlHash());
|
||||
final URLMetadataRow ue = metadata.load(entry.urlHash(), entry, 0);
|
||||
final URLMetadataRow ue = metadata.load(entry.metadataHash(), entry, 0);
|
||||
if (ue == null) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
urlHashs.add(entry.metadataHash());
|
||||
} else {
|
||||
url = ue.metadata().url();
|
||||
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url) == true)) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
urlHashs.add(entry.metadataHash());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -195,7 +195,7 @@ public class Dispatcher {
|
|||
urlHashes.clear();
|
||||
it = c.entries();
|
||||
while (it.hasNext()) {
|
||||
urlHashes.add(it.next().urlHash());
|
||||
urlHashes.add(it.next().metadataHash());
|
||||
}
|
||||
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHash() + "'");
|
||||
if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes);
|
||||
|
@ -234,7 +234,7 @@ public class Dispatcher {
|
|||
while (i.hasNext()) {
|
||||
re = i.next();
|
||||
if (re == null) continue;
|
||||
partitionBuffer[this.seeds.scheme.verticalPosition(re.urlHash())].add(re);
|
||||
partitionBuffer[this.seeds.scheme.verticalPosition(re.metadataHash())].add(re);
|
||||
}
|
||||
|
||||
// add the containers to the result vector
|
||||
|
|
|
@ -127,13 +127,13 @@ public class Transmission {
|
|||
ArrayList<String> notFound = new ArrayList<String>();
|
||||
while (i.hasNext()) {
|
||||
WordReferenceRow e = i.next();
|
||||
if (references.containsKey(e.urlHash()) || badReferences.contains(e.urlHash())) continue;
|
||||
URLMetadataRow r = repository.load(e.urlHash(), null, 0);
|
||||
if (references.containsKey(e.metadataHash()) || badReferences.contains(e.metadataHash())) continue;
|
||||
URLMetadataRow r = repository.load(e.metadataHash(), null, 0);
|
||||
if (r == null) {
|
||||
notFound.add(e.urlHash());
|
||||
badReferences.add(e.urlHash());
|
||||
notFound.add(e.metadataHash());
|
||||
badReferences.add(e.metadataHash());
|
||||
} else {
|
||||
references.put(e.urlHash(), r);
|
||||
references.put(e.metadataHash(), r);
|
||||
}
|
||||
}
|
||||
// now delete all references that were not found
|
||||
|
|
|
@ -561,8 +561,8 @@ public final class yacyClient {
|
|||
|
||||
// the search-result-url transports all the attributes of word indexes
|
||||
entry = urlEntry.word();
|
||||
if (!(entry.urlHash().equals(urlEntry.hash()))) {
|
||||
yacyCore.log.logInfo("remote search (client): url-hash " + urlEntry.hash() + " does not belong to word-attached-hash " + entry.urlHash() + "; url = " + metadata.url() + " from peer " + target.getName());
|
||||
if (!(entry.metadataHash().equals(urlEntry.hash()))) {
|
||||
yacyCore.log.logInfo("remote search (client): url-hash " + urlEntry.hash() + " does not belong to word-attached-hash " + entry.metadataHash() + "; url = " + metadata.url() + " from peer " + target.getName());
|
||||
continue; // spammed
|
||||
}
|
||||
|
||||
|
@ -873,8 +873,8 @@ public final class yacyClient {
|
|||
eenum = ic.entries();
|
||||
while (eenum.hasNext()) {
|
||||
entry = eenum.next();
|
||||
if (urlCache.get(entry.urlHash()) == null) {
|
||||
if (yacyCore.log.isFine()) yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
|
||||
if (urlCache.get(entry.metadataHash()) == null) {
|
||||
if (yacyCore.log.isFine()) yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.metadataHash() + "' is not contained in urlCache");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -693,7 +693,7 @@ public final class yacy {
|
|||
Reference iEntry;
|
||||
while (wordIdxEntries.hasNext()) {
|
||||
iEntry = wordIdxEntries.next();
|
||||
final String urlHash = iEntry.urlHash();
|
||||
final String urlHash = iEntry.metadataHash();
|
||||
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
|
||||
final URLMetadataRow urlEntry = currentUrlDB.load(urlHash, null, 0);
|
||||
urlCounter++;
|
||||
|
|
Loading…
Reference in New Issue
Block a user