- refactoring

- a little bit more abstraction
- new interfaces for index abstraction

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5783 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-07 09:34:41 +00:00
parent 82fb60a720
commit 44e01afa5b
21 changed files with 406 additions and 144 deletions

View File

@ -130,7 +130,7 @@ public class IndexControlRWIs_p {
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = en.next().urlHash();
urlx[i++] = en.next().metadataHash();
}
index = null;
}
@ -214,12 +214,12 @@ public class IndexControlRWIs_p {
URLMetadataRow lurl;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
lurl = sb.webIndex.metadata().load(iEntry.urlHash(), null, 0);
lurl = sb.webIndex.metadata().load(iEntry.metadataHash(), null, 0);
if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash());
unknownURLEntries.add(iEntry.metadataHash());
urlIter.remove();
} else {
knownURLs.put(iEntry.urlHash(), lurl);
knownURLs.put(iEntry.metadataHash(), lurl);
}
}

View File

@ -148,7 +148,7 @@ public final class transferRWI {
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
iEntry = new WordReferenceRow(estring.substring(p));
urlHash = iEntry.urlHash();
urlHash = iEntry.metadataHash();
// block blacklisted entries
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(Blacklist.BLACKLIST_DHT, urlHash))) {

View File

@ -50,6 +50,7 @@ import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.table.SplitTable;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
import de.anomic.yacy.yacyURL;
@ -99,7 +100,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
return 0;
}
public synchronized URLMetadataRow load(final String urlHash, final Reference searchedWord, final long ranking) {
public synchronized URLMetadataRow load(final String urlHash, final WordReference searchedWord, final long ranking) {
// generates an plasmaLURLEntry using the url hash
// if the url cannot be found, this returns null
if (urlHash == null) return null;

View File

@ -1,6 +1,6 @@
// Reference.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://www.anomic.de
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 03.04.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
@ -26,49 +26,19 @@
package de.anomic.kelondro.text;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.index.Row.Entry;
public interface Reference {
public String toPropertyForm();
public String urlHash();
public Entry toKelondroEntry();
public int virtualAge();
public String metadataHash();
public long lastModified();
public long freshUntil();
public int hitcount();
public int posintext();
public int posinphrase();
public int posofphrase();
public int wordsintext();
public int phrasesintext();
public String getLanguage();
public char getType();
public int wordsintitle();
public int llocal();
public int lother();
public int urllength();
public int urlcomps();
public Bitfield flags();
public double termFrequency();
//public long freshUntil();
public String toString();

View File

@ -37,6 +37,7 @@ import java.util.TreeMap;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowSet;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ByteBuffer;
@ -102,21 +103,6 @@ public class ReferenceContainer extends RowSet {
this.lastTimeWrote = updateTime;
}
public static final ReferenceContainer mergeUnique(final ReferenceContainer a, final boolean aIsClone, final ReferenceContainer b, final boolean bIsClone) {
if ((aIsClone) && (bIsClone)) {
if (a.size() > b.size()) return (ReferenceContainer) mergeUnique(a, b); else return (ReferenceContainer) mergeUnique(b, a);
}
if (aIsClone) return (ReferenceContainer) mergeUnique(a, b);
if (bIsClone) return (ReferenceContainer) mergeUnique(b, a);
if (a.size() > b.size()) return (ReferenceContainer) mergeUnique(a, b); else return (ReferenceContainer) mergeUnique(b, a);
}
public static Object mergeUnique(final Object a, final Object b) {
final ReferenceContainer c = (ReferenceContainer) a;
c.addAllUnique((ReferenceContainer) b);
return c;
}
public ReferenceContainer merge(final ReferenceContainer c) {
return new ReferenceContainer(this.termHash, super.merge(c));
}
@ -162,7 +148,7 @@ public class ReferenceContainer extends RowSet {
return x;
}
public Reference get(final String urlHash) {
public WordReference get(final String urlHash) {
final Row.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null;
return new WordReferenceRow(entry);
@ -215,6 +201,12 @@ public class ReferenceContainer extends RowSet {
}
public static Object mergeUnique(final Object a, final Object b) {
final ReferenceContainer c = (ReferenceContainer) a;
c.addAllUnique((ReferenceContainer) b);
return c;
}
public static final Method containerMergeMethod;
static {
Method meth = null;
@ -342,13 +334,13 @@ public class ReferenceContainer extends RowSet {
final ReferenceContainer conj = new ReferenceContainer(null, small.rowdef, 0); // start with empty search result
final Iterator<WordReferenceRow> se = small.entries();
WordReferenceVars ie0;
Reference ie1;
WordReference ie1;
while (se.hasNext()) {
ie0 = new WordReferenceVars(se.next());
ie1 = large.get(ie0.urlHash());
ie1 = large.get(ie0.metadataHash());
if ((ie0 != null) && (ie1 != null)) {
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
assert (ie0.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie0.metadataHash();
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
// this is a hit. Calculate word distance:
ie0.join(ie1);
if (ie0.worddistance() <= maxDistance) conj.add(ie0.toRowEntry());
@ -369,14 +361,14 @@ public class ReferenceContainer extends RowSet {
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
WordReferenceVars ie1;
Reference ie2;
WordReference ie2;
ie1 = new WordReferenceVars(e1.next());
ie2 = e2.next();
while (true) {
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
assert (ie2.metadataHash().length() == keylength) : "ie2.urlHash() = " + ie2.metadataHash();
c = i1.rowdef.getOrdering().compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
@ -422,11 +414,11 @@ public class ReferenceContainer extends RowSet {
Reference ie0, ie1;
while (se.hasNext()) {
ie0 = se.next();
ie1 = excl.get(ie0.urlHash());
ie1 = excl.get(ie0.metadataHash());
if ((ie0 != null) && (ie1 != null)) {
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
if (iterate_pivot) se.remove(); pivot.remove(ie0.urlHash().getBytes());
assert (ie0.metadataHash().length() == keylength) : "ie0.urlHash() = " + ie0.metadataHash();
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
if (iterate_pivot) se.remove(); pivot.remove(ie0.metadataHash().getBytes());
}
}
return pivot;
@ -442,14 +434,14 @@ public class ReferenceContainer extends RowSet {
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
WordReferenceVars ie1;
Reference ie2;
WordReference ie2;
ie1 = new WordReferenceVars(e1.next());
ie2 = e2.next();
while (true) {
assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash();
assert (ie2.metadataHash().length() == keylength) : "ie2.urlHash() = " + ie2.metadataHash();
c = pivot.rowdef.getOrdering().compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
@ -486,12 +478,12 @@ public class ReferenceContainer extends RowSet {
String dom, paths;
while (i.hasNext()) {
iEntry = i.next();
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
dom = iEntry.urlHash().substring(6);
if ((excludeContainer != null) && (excludeContainer.get(iEntry.metadataHash()) != null)) continue; // do not include urls that are in excludeContainer
dom = iEntry.metadataHash().substring(6);
if ((paths = doms.get(dom)) == null) {
doms.put(dom, iEntry.urlHash().substring(0, 6));
doms.put(dom, iEntry.metadataHash().substring(0, 6));
} else {
doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
doms.put(dom, paths + iEntry.metadataHash().substring(0, 6));
}
if (System.currentTimeMillis() > timeout)
break;

View File

@ -378,7 +378,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
WordReferenceRow ee;
while (e.hasNext()) {
ee = e.next();
if (urlselection.contains(ee.urlHash())) c1.add(ee);
if (urlselection.contains(ee.metadataHash())) c1.add(ee);
}
return c1;
}

View File

@ -0,0 +1,36 @@
// ReferenceFactory.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 06.04.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
// $LastChangedRevision: 5777 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import de.anomic.kelondro.index.Row;
/**
 * Factory that produces concrete {@link Reference} implementations from raw
 * kelondro row entries and converts them back again. This lets generic index
 * code handle different reference types (word references, citation
 * references) without knowing the concrete row layout.
 *
 * @param <ReferenceType> the concrete Reference implementation this factory
 *                        produces and recycles
 */
public interface ReferenceFactory<ReferenceType extends Reference> {

    /** decodes/wraps the given row entry into a reference object */
    public ReferenceType produce(Row.Entry e);

    /** returns the row-entry representation of the given reference */
    public Row.Entry recycle(ReferenceType r);
}

View File

@ -121,8 +121,8 @@ public class ReferenceOrder {
final long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
final long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) : 0)
((256 - yacyURL.domLengthNormalized(t.metadataHash())) << ranking.coeff_domlength)
+ ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.metadataHash()) << 4)) << ranking.coeff_ybr) : 0)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
@ -137,7 +137,7 @@ public class ReferenceOrder {
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
+ ((ranking.coeff_authority > 12) ? (authority(t.metadataHash()) << ranking.coeff_authority) : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
@ -150,7 +150,7 @@ public class ReferenceOrder {
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
+ ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0)
+ ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0);
+ ((yacyURL.probablyRootURL(t.metadataHash())) ? 15 << ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
@ -195,7 +195,7 @@ public class ReferenceOrder {
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
// update domcount
dom = iEntry.urlHash().substring(6);
dom = iEntry.metadataHash().substring(6);
count = doms.get(dom);
if (count == null) {
doms.put(dom, int1);

View File

@ -40,7 +40,7 @@ import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Metadata;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
@ -119,7 +119,7 @@ public class URLMetadataRow implements Metadata {
private final Row.Entry entry;
private final String snippet;
private Reference word; // this is only used if the url is transported via remote search requests
private WordReference word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
public URLMetadataRow(
@ -201,7 +201,7 @@ public class URLMetadataRow implements Metadata {
}
}
public URLMetadataRow(final Row.Entry entry, final Reference searchedWord, final long ranking) {
public URLMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
@ -427,7 +427,7 @@ public class URLMetadataRow implements Metadata {
return snippet;
}
public Reference word() {
public WordReference word() {
return word;
}

View File

@ -0,0 +1,189 @@
// CitationReferenceRow.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 03.04.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text.referencePrototype;
import de.anomic.kelondro.index.Column;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.Row.Entry;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.Reference;
import de.anomic.yacy.yacySeedDB;
/**
 * Stores citation attributes of a URL reference (when and in which context a
 * URL was cited) in a fixed-layout kelondro row. The commented-out interface
 * clause is intentional: the class does not (yet) implement
 * Reference/Cloneable.
 */
public final class CitationReferenceRow /*implements Reference, Cloneable*/ {

    // this object stores citation attributes to URL references
    public static final Row citationRow = new Row(new Column[]{
            new Column("h", Column.celltype_string,   Column.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
            new Column("a", Column.celltype_cardinal, Column.encoder_b256,  2, "lastModified"),
            // FIX: this column previously reused the id "a", colliding with
            // lastModified (ids must be unique, e.g. in the property form);
            // "e" was taken from the pool of free chars.
            new Column("e", Column.celltype_cardinal, Column.encoder_b256,  2, "lastAccessed"),
            new Column("t", Column.celltype_cardinal, Column.encoder_b256,  2, "posintext"),
            new Column("x", Column.celltype_cardinal, Column.encoder_b256,  1, "llocal"),
            new Column("y", Column.celltype_cardinal, Column.encoder_b256,  1, "lother"),
            new Column("m", Column.celltype_cardinal, Column.encoder_b256,  1, "urlLength"),
            new Column("n", Column.celltype_cardinal, Column.encoder_b256,  1, "urlComps"),
            new Column("g", Column.celltype_binary,   Column.encoder_bytes, 1, "typeofurl"),
            new Column("k", Column.celltype_cardinal, Column.encoder_b256,  1, "reserve")
        },
        Base64Order.enhancedCoder
    );

    // available chars: b,j,q ("e" is now used by lastAccessed)

    // static properties
    private static final int col_urlhash      = 0; // h 12 the url hash b64-encoded
    private static final int col_lastModified = 1; // a  2 last-modified time of the document where url appears
    private static final int col_lastAccessed = 2; // e  2 current time when the url was seen
    private static final int col_posintext    = 3; // t  2 appearance of url in text; simply counts up the urls
    private static final int col_llocal       = 4; // x  1 outlinks to same domain
    private static final int col_lother      = 5; // y  1 outlinks to other domain
    private static final int col_urlLength   = 6; // m  1 byte-length of complete URL
    private static final int col_urlComps    = 7; // n  1 number of path components
    private static final int col_typeofurl   = 8; // g  1 type of url
    private static final int col_reserve     = 9; // k  1 reserve

    private final Row.Entry entry;

    /**
     * Creates a fresh citation row from individual attribute values.
     * Both time stamps are stored compacted as MicroDate day counts.
     */
    public CitationReferenceRow(
            final String urlHash,
            final long lastmodified,  // last-modified time of the document where word appears
            final long updatetime,    // update time (when the citation was recorded)
            final int posintext,      // occurrence of url; counts the url
            final int llocal,         // outlinks to same domain
            final int lother,         // outlinks to other domain
            final int urlLength,      // byte-length of complete URL
            final int urlComps,       // number of path components
            final byte typeofurl      // type of the url (FIX: comment previously said "outlinks to same domain")
    ) {
        assert (urlHash.length() == 12) : "urlhash = " + urlHash;
        this.entry = citationRow.newEntry();
        final int mddlm = MicroDate.microDateDays(lastmodified);
        final int mddct = MicroDate.microDateDays(updatetime);
        this.entry.setCol(col_urlhash, urlHash, null);
        this.entry.setCol(col_lastModified, mddlm);
        this.entry.setCol(col_lastAccessed, mddct);
        this.entry.setCol(col_posintext, posintext);
        this.entry.setCol(col_llocal, llocal);
        this.entry.setCol(col_lother, lother);
        this.entry.setCol(col_urlLength, urlLength);
        this.entry.setCol(col_urlComps, urlComps);
        this.entry.setCol(col_typeofurl, new byte[]{typeofurl});
        this.entry.setCol(col_reserve, 0);
    }

    /** Re-creates a row from the external form minus the leading urlHash entry. */
    public CitationReferenceRow(final String urlHash, final String code) {
        // the code is the external form of the row minus the leading urlHash entry
        this.entry = citationRow.newEntry((urlHash + code).getBytes());
    }

    /** Re-creates a row from its complete external (property) form. */
    public CitationReferenceRow(final String external) {
        this.entry = citationRow.newEntry(external, true);
    }

    /** Wraps a raw serialized row. */
    public CitationReferenceRow(final byte[] row) {
        this.entry = citationRow.newEntry(row);
    }

    /** Wraps a raw serialized row starting at the given offset. */
    public CitationReferenceRow(final byte[] row, final int offset, final boolean clone) {
        this.entry = citationRow.newEntry(row, offset, clone);
    }

    /** Wraps an existing row entry without copying it. */
    public CitationReferenceRow(final Row.Entry rentry) {
        // FIXME: see if cloning is necessary
        this.entry = rentry;
    }

    /** Deep copy: duplicates the underlying row bytes. */
    public CitationReferenceRow clone() {
        final byte[] b = new byte[citationRow.objectsize];
        System.arraycopy(entry.bytes(), 0, b, 0, citationRow.objectsize);
        return new CitationReferenceRow(b);
    }

    public String toPropertyForm() {
        return entry.toPropertyForm(true, true, false);
    }

    /** exposes the backing row entry for storage in an index */
    public Entry toKelondroEntry() {
        return this.entry;
    }

    public String urlHash() {
        return this.entry.getColString(col_urlhash, null);
    }

    /** age marker of this entry; the raw lastModified value in MicroDateDays format */
    public int virtualAge() {
        return (int) this.entry.getColLong(col_lastModified); // this is the time in MicroDateDays format
    }

    /** last-modified time, expanded back from MicroDateDays to milliseconds */
    public long lastModified() {
        return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
    }

    public int posintext() {
        return (int) this.entry.getColLong(col_posintext);
    }

    public int llocal() {
        return (int) this.entry.getColLong(col_llocal);
    }

    public int lother() {
        return (int) this.entry.getColLong(col_lother);
    }

    public int urllength() {
        return (int) this.entry.getColLong(col_urlLength);
    }

    public int urlcomps() {
        return (int) this.entry.getColLong(col_urlComps);
    }

    /** inverse of the total outlink count; more outlinks -> lower citation weight */
    public double citationFrequency() {
        return 1.0 / ((double) (llocal() + lother() + 1));
    }

    public String toString() {
        return toPropertyForm();
    }

    /** true if this entry's last-modified date is newer than the other's (null counts as older) */
    public boolean isNewer(final Reference other) {
        if (other == null) return true;
        if (this.lastModified() > other.lastModified()) return true;
        return false;
    }

    /** true if this entry's last-modified date is older than the other's (null counts as newer) */
    public boolean isOlder(final Reference other) {
        if (other == null) return false;
        if (this.lastModified() < other.lastModified()) return true;
        return false;
    }

    /**
     * Equality keyed on the url hash, added to keep the equals/hashCode
     * contract consistent (hashCode was already derived from the url hash).
     */
    public boolean equals(final Object obj) {
        if (this == obj) return true;
        if (!(obj instanceof CitationReferenceRow)) return false;
        return this.urlHash().equals(((CitationReferenceRow) obj).urlHash());
    }

    public int hashCode() {
        return this.urlHash().hashCode();
    }
}

View File

@ -0,0 +1,66 @@
// WordReference.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
// $LastChangedRevision: 5777 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text.referencePrototype;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Reference;
/**
 * Attributes of a single word occurrence inside one document, as handled by
 * the reverse word index. Extends the generic {@link Reference} with word-
 * and text-specific properties (implemented by WordReferenceRow and
 * WordReferenceVars).
 */
public interface WordReference extends Reference {

    /** age marker of the entry; stored in MicroDateDays format in the row implementation — confirm against WordReferenceRow */
    public int virtualAge();

    /** number of occurrences of the word in the document (hit count) */
    public int hitcount();

    /** position of the word occurrence in the text */
    public int posintext();

    /** position of the word inside its phrase */
    public int posinphrase();

    /** number of the phrase where the word appears */
    public int posofphrase();

    /** total number of words in the document text */
    public int wordsintext();

    /** total number of phrases in the document text */
    public int phrasesintext();

    /** (guessed) language of the document */
    public String getLanguage();

    /** type of document (single-char doctype flag) */
    public char getType();

    /** number of words in the document title */
    public int wordsintitle();

    /** outlinks to the same domain */
    public int llocal();

    /** outlinks to other domains */
    public int lother();

    /** byte-length of the complete URL */
    public int urllength();

    /** number of URL path components */
    public int urlcomps();

    /** appearance flags (title, description, emphasized, category flags, ...) */
    public Bitfield flags();

    /** term frequency of the word within the document */
    public double termFrequency();
}

View File

@ -35,7 +35,7 @@ import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.Reference;
import de.anomic.yacy.yacySeedDB;
public final class WordReferenceRow implements Reference, Cloneable {
public final class WordReferenceRow implements WordReference, Cloneable {
// this object stores attributes to URL references inside RWI collections
@ -182,7 +182,7 @@ public final class WordReferenceRow implements Reference, Cloneable {
return this.entry;
}
public String urlHash() {
public String metadataHash() {
return this.entry.getColString(col_urlhash, null);
}
@ -275,6 +275,6 @@ public final class WordReferenceRow implements Reference, Cloneable {
}
public int hashCode() {
return this.urlHash().hashCode();
return this.metadataHash().hashCode();
}
}

View File

@ -26,14 +26,15 @@
package de.anomic.kelondro.text.referencePrototype;
import de.anomic.kelondro.index.Row.Entry;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.Reference;
public class WordReferenceVars implements Reference, Cloneable {
public class WordReferenceVars implements WordReference, Cloneable {
public Bitfield flags;
public long freshUntil, lastModified;
public long lastModified;
public String language, urlHash;
public char type;
public int hitcount, llocal, lother, phrasesintext, posintext,
@ -54,7 +55,7 @@ public class WordReferenceVars implements Reference, Cloneable {
final int posofphrase, // number of the phrase where word appears
final long lastmodified, // last-modified time of the document where word appears
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
String language, // (guessed) language of document
String language, // (guessed) language of document
final char doctype, // type of document
final int outlinksSame, // outlinks to same domain
final int outlinksOther, // outlinks to other domain
@ -64,9 +65,9 @@ public class WordReferenceVars implements Reference, Cloneable {
) {
if ((language == null) || (language.length() != 2)) language = "uk";
final int mddlm = MicroDate.microDateDays(lastmodified);
final int mddct = MicroDate.microDateDays(updatetime);
//final int mddct = MicroDate.microDateDays(updatetime);
this.flags = flags;
this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
//this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
this.lastModified = lastmodified;
this.language = language;
this.urlHash = urlHash;
@ -89,10 +90,10 @@ public class WordReferenceVars implements Reference, Cloneable {
public WordReferenceVars(final WordReferenceRow e) {
this.flags = e.flags();
this.freshUntil = e.freshUntil();
//this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified();
this.language = e.getLanguage();
this.urlHash = e.urlHash();
this.urlHash = e.metadataHash();
this.type = e.getType();
this.hitcount = e.hitcount();
this.llocal = e.llocal();
@ -134,26 +135,26 @@ public class WordReferenceVars implements Reference, Cloneable {
return c;
}
public void join(final WordReferenceVars oe) {
public void join(final WordReferenceVars v) {
// combine the distance
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
this.posintext = Math.min(this.posintext, oe.posintext);
this.posinphrase = (this.posofphrase == oe.posofphrase) ? Math.min(this.posinphrase, oe.posinphrase) : 0;
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
this.worddistance = this.worddistance + v.worddistance() + Math.abs(this.posintext - v.posintext);
this.posintext = Math.min(this.posintext, v.posintext);
this.posinphrase = (this.posofphrase == v.posofphrase) ? Math.min(this.posinphrase, v.posinphrase) : 0;
this.posofphrase = Math.min(this.posofphrase, v.posofphrase);
// combine term frequency
this.wordsintext = this.wordsintext + oe.wordsintext;
this.termFrequency = this.termFrequency + oe.termFrequency;
this.wordsintext = this.wordsintext + v.wordsintext;
this.termFrequency = this.termFrequency + v.termFrequency;
}
public Bitfield flags() {
return flags;
}
/*
public long freshUntil() {
return freshUntil;
}
*/
public String getLanguage() {
return language;
}
@ -226,11 +227,15 @@ public class WordReferenceVars implements Reference, Cloneable {
);
}
public Entry toKelondroEntry() {
return toRowEntry().toKelondroEntry();
}
public String toPropertyForm() {
return toRowEntry().toPropertyForm();
}
public String urlHash() {
public String metadataHash() {
return urlHash;
}
@ -278,7 +283,7 @@ public class WordReferenceVars implements Reference, Cloneable {
if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
if (this.worddistance > (v = other.worddistance)) this.worddistance = v;
if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
//if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
if (this.urllength > (v = other.urllength)) this.urllength = v;
if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
@ -300,14 +305,14 @@ public class WordReferenceVars implements Reference, Cloneable {
if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
if (this.worddistance < (v = other.worddistance)) this.worddistance = v;
if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
//if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
if (this.urllength < (v = other.urllength)) this.urllength = v;
if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
}
public void join(final Reference oe) {
public void join(final WordReference oe) {
// joins two entries into one entry
// combine the distance
@ -324,4 +329,6 @@ public class WordReferenceVars implements Reference, Cloneable {
public int hashCode() {
return this.urlHash.hashCode();
}
}

View File

@ -121,7 +121,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
// getting next word index entry
importWordIdxEntry = importWordIdxEntries.next();
final String urlHash = importWordIdxEntry.urlHash();
final String urlHash = importWordIdxEntry.metadataHash();
entityUrls.add(urlHash);
}

View File

@ -138,7 +138,7 @@ public class plasmaSearchAPI {
if (rn == -1) rn = entry.ranking();
prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().metadataHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
@ -173,7 +173,7 @@ public class plasmaSearchAPI {
((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
((yacyURL.probablyRootURL(entry.word().metadataHash())) ? "probably root url" : "")
);
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) {
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1");

View File

@ -43,6 +43,7 @@ import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceOrder;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.SortStack;
@ -158,7 +159,7 @@ public final class plasmaSearchRankingProcess {
Long r;
while (i.hasNext()) {
iEntry = i.next();
assert (iEntry.urlHash().length() == index.row().primaryKeyLength);
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
@ -182,13 +183,13 @@ public final class plasmaSearchRankingProcess {
}
// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.urlHash(), this.query.zonecode)) {
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}
// check site constraints
if (query.sitehash != null && !iEntry.urlHash().substring(6).equals(query.sitehash)) {
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
}
@ -198,12 +199,12 @@ public final class plasmaSearchRankingProcess {
yacyURL uurl = (uentry == null) ? null : uentry.comp().url();
System.out.println("DEBUG domDomain dom=" + ((uurl == null) ? "null" : uurl.getHost()) + ", zone=" + yacyURL.domDomain(iEntry.urlHash()));
*/
this.domZones[yacyURL.domDomain(iEntry.urlHash())]++;
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.urlHash())) continue;
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
@ -211,7 +212,7 @@ public final class plasmaSearchRankingProcess {
continue;
}
// double-check
if (urlhashes.containsKey(iEntry.urlHash())) continue;
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
}
@ -223,7 +224,7 @@ public final class plasmaSearchRankingProcess {
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
}
private boolean testFlags(final Reference ientry) {
private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
@ -261,7 +262,7 @@ public final class plasmaSearchRankingProcess {
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
if (!skipDoubleDom) return rwi;
// check doubledom
final String domhash = rwi.element.urlHash().substring(6);
final String domhash = rwi.element.metadataHash().substring(6);
m = this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
@ -292,9 +293,9 @@ public final class plasmaSearchRankingProcess {
}
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6));
m = this.doubleDomCache.get(bestEntry.element.metadataHash().substring(6));
o = m.pop();
assert o == null || o.element.urlHash().equals(bestEntry.element.urlHash());
assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash());
return bestEntry;
}
@ -304,13 +305,13 @@ public final class plasmaSearchRankingProcess {
if (((stack.size() == 0) && (size() == 0))) break;
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadataRow.Components metadata = u.metadata();
if (metadata.url() != null) this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
misses.add(obrwi.element.urlHash());
misses.add(obrwi.element.metadataHash());
}
return null;
}

View File

@ -210,7 +210,7 @@ public final class plasmaWordIndex {
}
}
initActiveCrawlProfiles();
log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
", " + this.profilesActiveCrawls.size() + " entries" +
", " + profilesActiveFile.length()/1024);
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
@ -230,7 +230,7 @@ public final class plasmaWordIndex {
this.profilesPassiveCrawls = null;
}
}
log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
", " + profilesPassiveFile.length()/1024);
@ -665,13 +665,13 @@ public final class plasmaWordIndex {
entry = containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final URLMetadataRow ue = metadata.load(entry.urlHash(), entry, 0);
final URLMetadataRow ue = metadata.load(entry.metadataHash(), entry, 0);
if (ue == null) {
urlHashs.add(entry.urlHash());
urlHashs.add(entry.metadataHash());
} else {
url = ue.metadata().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
urlHashs.add(entry.metadataHash());
}
}
}

View File

@ -195,7 +195,7 @@ public class Dispatcher {
urlHashes.clear();
it = c.entries();
while (it.hasNext()) {
urlHashes.add(it.next().urlHash());
urlHashes.add(it.next().metadataHash());
}
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHash() + "'");
if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes);
@ -234,7 +234,7 @@ public class Dispatcher {
while (i.hasNext()) {
re = i.next();
if (re == null) continue;
partitionBuffer[this.seeds.scheme.verticalPosition(re.urlHash())].add(re);
partitionBuffer[this.seeds.scheme.verticalPosition(re.metadataHash())].add(re);
}
// add the containers to the result vector

View File

@ -127,13 +127,13 @@ public class Transmission {
ArrayList<String> notFound = new ArrayList<String>();
while (i.hasNext()) {
WordReferenceRow e = i.next();
if (references.containsKey(e.urlHash()) || badReferences.contains(e.urlHash())) continue;
URLMetadataRow r = repository.load(e.urlHash(), null, 0);
if (references.containsKey(e.metadataHash()) || badReferences.contains(e.metadataHash())) continue;
URLMetadataRow r = repository.load(e.metadataHash(), null, 0);
if (r == null) {
notFound.add(e.urlHash());
badReferences.add(e.urlHash());
notFound.add(e.metadataHash());
badReferences.add(e.metadataHash());
} else {
references.put(e.urlHash(), r);
references.put(e.metadataHash(), r);
}
}
// now delete all references that were not found

View File

@ -561,8 +561,8 @@ public final class yacyClient {
// the search-result-url transports all the attributes of word indexes
entry = urlEntry.word();
if (!(entry.urlHash().equals(urlEntry.hash()))) {
yacyCore.log.logInfo("remote search (client): url-hash " + urlEntry.hash() + " does not belong to word-attached-hash " + entry.urlHash() + "; url = " + metadata.url() + " from peer " + target.getName());
if (!(entry.metadataHash().equals(urlEntry.hash()))) {
yacyCore.log.logInfo("remote search (client): url-hash " + urlEntry.hash() + " does not belong to word-attached-hash " + entry.metadataHash() + "; url = " + metadata.url() + " from peer " + target.getName());
continue; // spammed
}
@ -873,8 +873,8 @@ public final class yacyClient {
eenum = ic.entries();
while (eenum.hasNext()) {
entry = eenum.next();
if (urlCache.get(entry.urlHash()) == null) {
if (yacyCore.log.isFine()) yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
if (urlCache.get(entry.metadataHash()) == null) {
if (yacyCore.log.isFine()) yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.metadataHash() + "' is not contained in urlCache");
}
}
}

View File

@ -693,7 +693,7 @@ public final class yacy {
Reference iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = wordIdxEntries.next();
final String urlHash = iEntry.urlHash();
final String urlHash = iEntry.metadataHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URLMetadataRow urlEntry = currentUrlDB.load(urlHash, null, 0);
urlCounter++;