// indexRWIEntryOld.java // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany // first published 17.11.2006 on http://www.anomic.de // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedRevision: 1986 $ // $LastChangedBy: orbiter $ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.index; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroColumn; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow.Entry; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaWordIndex; import de.anomic.yacy.yacySeedDB; public class indexRWIEntryOld implements Cloneable, indexRWIEntry { // this object stores attributes to URL references inside RWI collections public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"), new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "quality"), new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"), new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"), new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"), new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"), new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"), new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"), new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"), new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount") }, kelondroBase64Order.enhancedCoder, 0); private static final int col_urlhash = 0; private static final int col_quality = 1; private static final int col_lastModified = 2; private static final int col_hitcount = 3; private static final int col_language = 4; private static final int col_doctype = 5; //private static final int col_localflag = 6; private static final int col_posintext = 7; private static final int col_posinphrase = 8; private static final int col_posofphrase = 9; private static final int col_worddistance = 10; private static final int col_wordcount = 11; private static final int col_phrasecount = 12; private kelondroRow.Entry entry; /* public indexRWIEntryOld(String urlHash, int urlLength, // byte-length of complete URL int urlComps, // number of path components int titleLength, // length of description/length (longer are better?) int hitcount, //*how often appears this word in the text int wordcount, //*total number of words int phrasecount, //*total number of phrases int posintext, //*position of word in all words int posinphrase, //*position of word in its phrase int posofphrase, //*number of the phrase where word appears int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search int sizeOfPage, // # of bytes of the page long lastmodified, //*last-modified time of the document where word appears long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short String language, //*(guessed) language of document char doctype, //*type of document int outlinksSame, // outlinks to same domain int outlinksOther,// outlinks to other domain boolean local //*flag shows that this index was generated locally; othervise its from a remote peer ) { // more needed attributes: // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link etc // - boolean: URL attributes assert (urlHash != null); assert (urlHash.length() == 12) : "urlhash = " + urlHash; if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk"; this.entry = urlEntryRow.newEntry(); this.entry.setCol(col_urlhash, urlHash, null); this.entry.setCol(col_quality, 0); this.entry.setCol(col_lastModified, lastmodified); this.entry.setCol(col_hitcount, hitcount); this.entry.setCol(col_language, language, null); this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_localflag, (byte) ((local) ? plasmaURL.LT_LOCAL : plasmaURL.LT_GLOBAL)); this.entry.setCol(col_posintext, posintext); this.entry.setCol(col_posinphrase, posinphrase); this.entry.setCol(col_posofphrase, posofphrase); this.entry.setCol(col_worddistance, worddistance); this.entry.setCol(col_wordcount, wordcount); this.entry.setCol(col_phrasecount, phrasecount); //System.out.println("DEBUG-NEWENTRY " + toPropertyForm()); } */ public indexRWIEntryOld(String urlHash, String code) { // the code is the external form of the row minus the leading urlHash entry this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); } public indexRWIEntryOld(String external) { this.entry = urlEntryRow.newEntry(external, false); } public indexRWIEntryOld(byte[] row) { this.entry = urlEntryRow.newEntry(row); } public indexRWIEntryOld(kelondroRow.Entry rentry) { // FIXME: see if cloning is necessary this.entry = rentry; } public Object clone() { byte[] b = new byte[urlEntryRow.objectsize()]; System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize()); return new indexRWIEntryOld(b); } public String toPropertyForm() { return entry.toPropertyForm(true, false, false); } public Entry toKelondroEntry() { return this.entry; } public String urlHash() { return this.entry.getColString(col_urlhash, null); } public int quality() { return (int) this.entry.getColLong(col_quality); } public char doctype() { return (char) this.entry.getColByte(col_doctype); } public int virtualAge() { return plasmaWordIndex.microDateDays(lastModified()); } public long lastModified() { return (int) this.entry.getColLong(col_lastModified); } public int hitcount() { return (int) this.entry.getColLong(col_hitcount); } public int posintext() { return (int) this.entry.getColLong(col_posintext); } public int posinphrase() { return (int) this.entry.getColLong(col_posinphrase); } public int posofphrase() { return (int) this.entry.getColLong(col_posofphrase); } public int wordsintext() { return (int) this.entry.getColLong(col_wordcount); } public int phrasesintext() { return (int) this.entry.getColLong(col_phrasecount); } public String getLanguage() { return this.entry.getColString(col_language, null); } public char getType() { return (char) this.entry.getColByte(col_doctype); } public kelondroBitfield flags() { return plasmaSearchQuery.empty_constraint; } public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) { // returns a modified entry of the first argument ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext())); ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext())); ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/); ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase())); ie1.entry.setCol(col_wordcount, (ie1.wordsintext() + ie2.wordsintext()) / 2); return ie1; } public void combineDistance(indexRWIEntry oe) { combineDistance(this, oe); } public int worddistance() { return (int) this.entry.getColLong(col_worddistance); } public static final void min(indexRWIEntryOld t, indexRWIEntry other) { if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext()); if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext()); if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext()); if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); if (t.quality() > other.quality()) t.entry.setCol(col_quality, other.quality()); } public static final void max(indexRWIEntryOld t, indexRWIEntry other) { if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext()); if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext()); if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext()); if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); if (t.quality() < other.quality()) t.entry.setCol(col_quality, other.quality()); } public void min(indexRWIEntry other) { min(this, other); } public void max(indexRWIEntry other) { max(this, other); } static void normalize(indexRWIEntryOld t, indexRWIEntry min, indexRWIEntry max) { assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash(); assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash(); assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash(); if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm()); //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true)); //System.out.println("min = " + min.toPropertyForm(true)); //System.out.println("max = " + max.toPropertyForm(true)); t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); t.entry.setCol(col_wordcount , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext())); t.entry.setCol(col_phrasecount , (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext())); t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat. t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality())); //System.out.println("out = " + t.toPropertyForm(true)); } public void normalize(indexRWIEntry min, indexRWIEntry max) { normalize(this, min, max); } public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) { assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash(); indexRWIEntryOld e = (indexRWIEntryOld) this.clone(); e.normalize(min, max); return e; } public boolean isNewer(indexRWIEntry other) { if (other == null) return true; if (this.lastModified() > other.lastModified()) return true; if (this.lastModified() == other.lastModified()) { if (this.quality() > other.quality()) return true; } return false; } public boolean isOlder(indexRWIEntry other) { if (other == null) return false; if (this.lastModified() < other.lastModified()) return true; if (this.lastModified() == other.lastModified()) { if (this.quality() < other.quality()) return true; } return false; } public int llocal() { return 0; } public int lother() { return 0; } public int urlcomps() { return 0; } public int urllength() { return 0; } public int wordsintitle() { return 0; } }