yacy_search_server/source/de/anomic/index/indexRWIEntryOld.java
orbiter 0a050bc043 enhanced ranking
- redesign of data storage in plasmaSearchRankingProfile
- profiles are extended by new ranking parameters
- new RWI ranking parameters are considered during ranking
- appearance attributes (i.e. emphasised text) are now considered
- faster ranking
- some attributes that had been checked during post-ranking can now be
  checked during pre-ranking phase
- removed old ranking parameter on index.html page (will be replaced by profiles in the future)
- ranking can now consider appearances of media content
- snippet-loading for media types now works correctly (fetches only from the wanted media)
- ranking-profiles can be handed over to remote peers and are applied there as well
- re-search of same query with different domain now also re-triggers remote search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3105 6c8d7289-2bf4-0310-a012-ef5d649a1542
2006-12-20 15:44:29 +00:00

333 lines
16 KiB
Java

// indexRWIEntryOld.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 17.11.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// this object stores attributes to URL references inside RWI collections
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "quality"),
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"),
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"),
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"),
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"),
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"),
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"),
new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"),
new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"),
new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"),
new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount")
},
kelondroBase64Order.enhancedCoder,
0);
private static final int col_urlhash = 0;
private static final int col_quality = 1;
private static final int col_lastModified = 2;
private static final int col_hitcount = 3;
private static final int col_language = 4;
private static final int col_doctype = 5;
//private static final int col_localflag = 6;
private static final int col_posintext = 7;
private static final int col_posinphrase = 8;
private static final int col_posofphrase = 9;
private static final int col_worddistance = 10;
private static final int col_wordcount = 11;
private static final int col_phrasecount = 12;
private kelondroRow.Entry entry;
/*
public indexRWIEntryOld(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
int posintext, //*position of word in all words
int posinphrase, //*position of word in its phrase
int posofphrase, //*number of the phrase where word appears
int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
int outlinksOther,// outlinks to other domain
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer
) {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link etc
// - boolean: URL attributes
assert (urlHash != null);
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, 0);
this.entry.setCol(col_lastModified, lastmodified);
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_language, language, null);
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
this.entry.setCol(col_localflag, (byte) ((local) ? plasmaURL.LT_LOCAL : plasmaURL.LT_GLOBAL));
this.entry.setCol(col_posintext, posintext);
this.entry.setCol(col_posinphrase, posinphrase);
this.entry.setCol(col_posofphrase, posofphrase);
this.entry.setCol(col_worddistance, worddistance);
this.entry.setCol(col_wordcount, wordcount);
this.entry.setCol(col_phrasecount, phrasecount);
//System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
}
*/
public indexRWIEntryOld(String urlHash, String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
}
public indexRWIEntryOld(String external) {
this.entry = urlEntryRow.newEntry(external, false);
}
public indexRWIEntryOld(byte[] row) {
this.entry = urlEntryRow.newEntry(row);
}
public indexRWIEntryOld(kelondroRow.Entry rentry) {
// FIXME: see if cloning is necessary
this.entry = rentry;
}
public Object clone() {
byte[] b = new byte[urlEntryRow.objectsize()];
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
return new indexRWIEntryOld(b);
}
public String toPropertyForm() {
return entry.toPropertyForm(true, false, false);
}
public Entry toKelondroEntry() {
return this.entry;
}
public String urlHash() {
return this.entry.getColString(col_urlhash, null);
}
public int quality() {
return (int) this.entry.getColLong(col_quality);
}
public char doctype() {
return (char) this.entry.getColByte(col_doctype);
}
public int virtualAge() {
return plasmaWordIndex.microDateDays(lastModified());
}
public long lastModified() {
return (int) this.entry.getColLong(col_lastModified);
}
public int hitcount() {
return (int) this.entry.getColLong(col_hitcount);
}
public int posintext() {
return (int) this.entry.getColLong(col_posintext);
}
public int posinphrase() {
return (int) this.entry.getColLong(col_posinphrase);
}
public int posofphrase() {
return (int) this.entry.getColLong(col_posofphrase);
}
public int wordsintext() {
return (int) this.entry.getColLong(col_wordcount);
}
public int phrasesintext() {
return (int) this.entry.getColLong(col_phrasecount);
}
public String getLanguage() {
return this.entry.getColString(col_language, null);
}
public char getType() {
return (char) this.entry.getColByte(col_doctype);
}
public kelondroBitfield flags() {
return plasmaSearchQuery.empty_constraint;
}
public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) {
// returns a modified entry of the first argument
ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
ie1.entry.setCol(col_wordcount, (ie1.wordsintext() + ie2.wordsintext()) / 2);
return ie1;
}
public void combineDistance(indexRWIEntry oe) {
combineDistance(this, oe);
}
public int worddistance() {
return (int) this.entry.getColLong(col_worddistance);
}
public static final void min(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext());
if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext());
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.quality() > other.quality()) t.entry.setCol(col_quality, other.quality());
}
public static final void max(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext());
if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext());
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.quality() < other.quality()) t.entry.setCol(col_quality, other.quality());
}
public void min(indexRWIEntry other) {
min(this, other);
}
public void max(indexRWIEntry other) {
max(this, other);
}
static void normalize(indexRWIEntryOld t, indexRWIEntry min, indexRWIEntry max) {
assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash();
assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash();
assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash();
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm());
//System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
//System.out.println("min = " + min.toPropertyForm(true));
//System.out.println("max = " + max.toPropertyForm(true));
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordcount , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext()));
t.entry.setCol(col_phrasecount , (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat.
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality()));
//System.out.println("out = " + t.toPropertyForm(true));
}
public void normalize(indexRWIEntry min, indexRWIEntry max) {
normalize(this, min, max);
}
public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) {
assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash();
indexRWIEntryOld e = (indexRWIEntryOld) this.clone();
e.normalize(min, max);
return e;
}
public boolean isNewer(indexRWIEntry other) {
if (other == null) return true;
if (this.lastModified() > other.lastModified()) return true;
if (this.lastModified() == other.lastModified()) {
if (this.quality() > other.quality()) return true;
}
return false;
}
public boolean isOlder(indexRWIEntry other) {
if (other == null) return false;
if (this.lastModified() < other.lastModified()) return true;
if (this.lastModified() == other.lastModified()) {
if (this.quality() < other.quality()) return true;
}
return false;
}
public int llocal() {
return 0;
}
public int lother() {
return 0;
}
public int urlcomps() {
return 0;
}
public int urllength() {
return 0;
}
public int wordsintitle() {
return 0;
}
}