// plasmaSearchRankingProcess.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL;

public final class plasmaSearchRankingProcess {

    public static kelondroBinSearch[] ybrTables = null; // block-rank tables
    private static boolean useYBR = true;

    private TreeMap pageAcc; // key = ranking (Long); value = indexRWIEntry
    private plasmaSearchQuery query;
    private plasmaSearchRankingProfile ranking;
    private int filteredCount;
    private indexRWIEntryOrder order;
    private plasmaSearchProcessing process;
    private int maxentries;
    private int globalcount;
    private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic

    public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        this.pageAcc = new TreeMap();
        this.process = process;
        this.order = null;
        this.query = query;
        this.ranking = ranking;
        this.maxentries = maxentries;
        this.globalcount = 0;
        this.urlhashes = new HashMap();
        this.ref = new kelondroMScoreCluster();
    }

    public void insert(indexContainer container, boolean local) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        assert (container != null);
        if (container.size() == 0) return;

        process.startTimer();
        if (this.order == null) {
            this.order = new indexRWIEntryOrder(ranking);
        }
        this.order.extend(container);
        process.yield("normalizing", container.size());

        /*
        container.setOrdering(o, 0);
        container.sort();
        */

        // normalize entries and get ranking
        process.startTimer();
        Iterator i = container.entries();
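        // accumulation loop (descriptive note): pageAcc is a TreeMap keyed by the ranking
        // cardinal (Long), so iterating its values yields entries with the best (smallest)
        // cardinal first. Key collisions are resolved by incrementing the cardinal until a
        // free slot is found; once maxentries entries are held, a new entry only displaces
        // the currently worst entry if its cardinal is not larger than biggestEntry.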
        this.pageAcc = new TreeMap();
        indexRWIEntry iEntry, l;
        long biggestEntry = 0;
        //long s0 = System.currentTimeMillis();
        Long r;
        while (i.hasNext()) {
            iEntry = (indexRWIEntry) i.next();
            if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue;
            r = new Long(order.cardinal(iEntry));
            if ((pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;

            // check constraints
            if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint

            if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP  ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp  )))) continue;
            }

            if (pageAcc.size() < maxentries) {
                if (urlhashes.containsKey(iEntry.urlHash())) continue;
                while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
                pageAcc.put(r, iEntry);
            } else {
                if (r.longValue() > biggestEntry) {
                    continue;
                } else {
                    if (urlhashes.containsKey(iEntry.urlHash())) continue;
                    l = (indexRWIEntry) pageAcc.remove((Long) pageAcc.lastKey());
                    urlhashes.remove(l.urlHash());
                    while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
                    pageAcc.put(r, iEntry);
                    biggestEntry = order.cardinal((indexRWIEntry) pageAcc.get(pageAcc.lastKey()));
                }
            }
            urlhashes.put(iEntry.urlHash(), r);

            // increase counter for statistics
            if (!local) this.globalcount++;
        }
        this.filteredCount = pageAcc.size();

        //long sc = Math.max(1, System.currentTimeMillis() - s0);
        //System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc);

        if (container.size() > query.neededResults()) remove(true, true);

        process.yield(plasmaSearchProcessing.PRESORT, container.size());
    }

    public int size() {
        assert pageAcc.size() == urlhashes.size();
        return pageAcc.size();
    }

    public int filteredCount() {
        return this.filteredCount;
    }

    public int getGlobalCount() {
        return this.globalcount;
    }

    public indexRWIEntry remove(String urlHash) {
        Long r = (Long) urlhashes.get(urlHash);
        if (r == null) return null;
        assert pageAcc.containsKey(r);
        indexRWIEntry iEntry = (indexRWIEntry) pageAcc.remove(r);
        urlhashes.remove(urlHash);
        return iEntry;
    }

    public Iterator entries() {
        // returns an iterator of indexRWIEntry objects in the ranked order, best entry first
        return this.pageAcc.values().iterator();
    }

    public Set getReferences(int count) {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
        TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER);
        for (int i = 0; i < refs.length; i++) {
            s.add((String) refs[i]);
        }
        return s;
    }

    public void addReferences(String[] words) {
        String word;
        for (int i = 0; i < words.length; i++) {
            word = words[i].toLowerCase();
            if ((word.length() > 2) &&
                ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
                (!(query.queryHashes.contains(plasmaCondenser.word2hash(word)))))
                ref.incScore(word);
        }
    }
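    // illustration of the commonSense reference heuristic (a sketch, not taken from the
    // original code): for a result titled "YaCy peer-to-peer web search", calling
    // addReferences(new String[]{"yacy", "peer-to-peer", "web", "search"}) increments the
    // score of every word that is longer than two characters, is not contained in the
    // stop-word string above, and is not already one of the query words; getReferences(10)
    // then returns the highest-scored words as suggested related search terms.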
    protected void addReferences(plasmaSearchEvent.ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
        String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description

        // add references
        addReferences(urlcomps);
        addReferences(descrcomps);
    }

    public indexRWIEntryOrder getOrder() {
        return this.order;
    }

    private void remove(boolean rootDomExt, boolean doubleDom) {
        // this removes all references to urls that are extended paths of existing 'RootDom'-urls
        if (pageAcc.size() <= query.neededResults()) return;
        HashSet rootDoms = new HashSet();
        HashSet doubleDoms = new HashSet();
        Iterator i = pageAcc.entrySet().iterator();
        Map.Entry entry;
        indexRWIEntry iEntry;
        String hashpart;
        boolean isWordRootURL;
        TreeSet querywords = plasmaSearchQuery.cleanQuery(query.queryString())[0];
        while (i.hasNext()) {
            if (pageAcc.size() <= query.neededResults()) break;
            entry = (Map.Entry) i.next();
            iEntry = (indexRWIEntry) entry.getValue();
            hashpart = iEntry.urlHash().substring(6);
            isWordRootURL = yacyURL.isWordRootURL(iEntry.urlHash(), querywords);
            if (isWordRootURL) {
                rootDoms.add(hashpart);
            } else {
                if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
                    ((doubleDom) && (doubleDoms.contains(hashpart)))) {
                    i.remove();
                }
            }
            doubleDoms.add(hashpart);
        }
    }

    public static void loadYBR(File rankingPath, int count) {
        // load ranking tables
        if (rankingPath.exists()) {
            ybrTables = new kelondroBinSearch[count];
            String ybrName;
            File f;
            try {
                for (int i = 0; i < count; i++) {
                    ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx";
                    f = new File(rankingPath, ybrName);
                    if (f.exists()) {
                        ybrTables[i] = new kelondroBinSearch(serverFileUtils.read(f), 6);
                    } else {
                        ybrTables[i] = null;
                    }
                }
            } catch (IOException e) {
                ybrTables = null;
            }
        } else {
            ybrTables = null;
        }
    }

    public static boolean canUseYBR() {
        return ybrTables != null;
    }

    public static boolean isUsingYBR() {
        return useYBR;
    }

    public static void switchYBR(boolean usage) {
        useYBR = usage;
    }

    public static int ybr(String urlHash) {
        // returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
        if (ybrTables == null) return 15;
        if (!(useYBR)) return 15;
        final String domHash = urlHash.substring(6);
        for (int i = 0; i < ybrTables.length; i++) {
            if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
                //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
                return i;
            }
        }
        //System.out.println("NOT FOUND: " + urlHash);
        return 15;
    }

}
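// minimal usage sketch (assumed caller code, not taken from this file): a caller such as
// the search event would construct one ranking process per query, feed it the RWI
// containers fetched from the local and remote indexes, and then iterate the ranked result:
//
//     plasmaSearchRankingProcess rankedCache =
//         new plasmaSearchRankingProcess(query, process, ranking, maxentries);
//     rankedCache.insert(localContainer, true);    // local entries
//     rankedCache.insert(remoteContainer, false);  // remote entries increase getGlobalCount()
//     Iterator results = rankedCache.entries();    // indexRWIEntry objects, best entry first
//
// here query, process, ranking, maxentries and the two containers are assumed to be
// provided by the surrounding search code.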