mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- faster search: using different data structures that avoid multiple calculations
- no more table copy for error-eco table - optional table copy for lurl-entries - more abstractions (fewer single constant strings) - better logging (using host names instead of IPs) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4459 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
8358652fa9
commit
bd63999801
|
@ -102,7 +102,7 @@ public class BlogComments {
|
|||
}
|
||||
|
||||
String pagename = post.get("page", "blog_default");
|
||||
String ip = post.get("CLIENTIP", "127.0.0.1");
|
||||
String ip = post.get(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1");
|
||||
|
||||
String StrAuthor = post.get("author", "anonymous");
|
||||
|
||||
|
|
|
@ -105,9 +105,9 @@ public class CrawlProfileEditor_p {
|
|||
while (it.hasNext()) {
|
||||
selentry = (entry)it.next();
|
||||
if (selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) /*||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)*/)
|
||||
continue;
|
||||
prop.put("profiles_" + count + "_name", selentry.name());
|
||||
prop.put("profiles_" + count + "_handle", selentry.handle());
|
||||
|
|
|
@ -212,7 +212,7 @@ public final class Settings_p {
|
|||
}
|
||||
|
||||
// clientIP
|
||||
prop.putHTML("clientIP", (String) header.get("CLIENTIP", "<unknown>"), true); // read an artificial header addendum
|
||||
prop.putHTML("clientIP", (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "<unknown>"), true); // read an artificial header addendum
|
||||
|
||||
/*
|
||||
* seed upload settings
|
||||
|
|
|
@ -50,7 +50,7 @@ public class TestApplet {
|
|||
//File templatefile=filehandler.getOverlayedFile((String)post.get("url"));
|
||||
File classfile = httpdFileHandler.getOverlayedClass((String)post.get("url"));
|
||||
httpHeader header2=new httpHeader();
|
||||
header2.put("CLIENTIP", "127.0.0.1");
|
||||
header2.put(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1");
|
||||
header2.put("PATH", post.get("url"));
|
||||
serverObjects tp=null;
|
||||
try {
|
||||
|
|
|
@ -79,7 +79,7 @@ public class User{
|
|||
prop.put("logged-in_identified-by", "2");
|
||||
//try via ip
|
||||
if(entry == null){
|
||||
entry=sb.userDB.ipAuth(((String)header.get("CLIENTIP", "xxxxxx")));
|
||||
entry=sb.userDB.ipAuth(((String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx")));
|
||||
if(entry != null){
|
||||
prop.put("logged-in_identified-by", "0");
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ public class User{
|
|||
//identified via form-login
|
||||
//TODO: this does not work for a static admin, yet.
|
||||
}else if(post != null && post.containsKey("username") && post.containsKey("password")){
|
||||
//entry=sb.userDB.passwordAuth((String)post.get("username"), (String)post.get("password"), (String)header.get("CLIENTIP", "xxxxxx"));
|
||||
//entry=sb.userDB.passwordAuth((String)post.get("username"), (String)post.get("password"), (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx"));
|
||||
String username=(String)post.get("username");
|
||||
String password=(String)post.get("password");
|
||||
|
||||
|
@ -163,7 +163,7 @@ public class User{
|
|||
if(post!=null && post.containsKey("logout")){
|
||||
prop.put("logged-in", "0");
|
||||
if(entry != null){
|
||||
entry.logout(((String)header.get("CLIENTIP", "xxxxxx")), userDB.getLoginToken(header.getHeaderCookies())); //todo: logout cookie
|
||||
entry.logout(((String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx")), userDB.getLoginToken(header.getHeaderCookies())); //todo: logout cookie
|
||||
}else{
|
||||
sb.userDB.adminLogout(userDB.getLoginToken(header.getHeaderCookies()));
|
||||
}
|
||||
|
|
|
@ -72,7 +72,7 @@ public class ViewImage {
|
|||
|
||||
String urlString = post.get("url", "");
|
||||
String urlLicense = post.get("code", "");
|
||||
boolean auth = ((String) header.get("CLIENTIP", "")).equals("localhost") || sb.verifyAuthentication(header, true); // handle access rights
|
||||
boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true); // handle access rights
|
||||
|
||||
yacyURL url = null;
|
||||
if ((urlString.length() > 0) && (auth)) try {
|
||||
|
|
|
@ -88,7 +88,7 @@ public class Wiki {
|
|||
|
||||
String access = switchboard.getConfig("WikiAccess", "admin");
|
||||
String pagename = post.get("page", "start");
|
||||
String ip = post.get("CLIENTIP", "127.0.0.1");
|
||||
String ip = post.get(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1");
|
||||
String author = post.get("author", "anonymous");
|
||||
if (author.equals("anonymous")) {
|
||||
author = wikiBoard.guessAuthor(ip);
|
||||
|
|
|
@ -78,7 +78,7 @@ public class welcome {
|
|||
prop.put("hostip", "Unknown Host Exception");
|
||||
}
|
||||
prop.put("port", serverCore.getPortNr(env.getConfig("port","8080")));
|
||||
prop.put("clientip", (String) header.get("CLIENTIP", ""));
|
||||
prop.put("clientip", (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, ""));
|
||||
|
||||
final String peertype = (yacyCore.seedDB.mySeed() == null) ? yacySeed.PEERTYPE_JUNIOR : yacyCore.seedDB.mySeed().get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN);
|
||||
final boolean senior = (peertype.equals(yacySeed.PEERTYPE_SENIOR)) || (peertype.equals(yacySeed.PEERTYPE_PRINCIPAL));
|
||||
|
|
|
@ -103,7 +103,7 @@ public final class hello {
|
|||
// if ((properTest != null) && (! properTest.substring(0,1).equals("IP"))) { return null; }
|
||||
|
||||
// we easily know the caller's IP:
|
||||
final String clientip = (String) header.get("CLIENTIP", "<unknown>"); // read an artificial header addendum
|
||||
final String clientip = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "<unknown>"); // read an artificial header addendum
|
||||
InetAddress ias = serverDomains.dnsResolve(clientip);
|
||||
if (ias == null) {
|
||||
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
|
||||
|
|
|
@ -282,7 +282,7 @@ public final class search {
|
|||
// prepare search statistics
|
||||
Long trackerHandle = new Long(System.currentTimeMillis());
|
||||
HashMap<String, Object> searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime);
|
||||
String client = (String) header.get("CLIENTIP");
|
||||
String client = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP);
|
||||
searchProfile.put("host", client);
|
||||
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);
|
||||
searchProfile.put("peername", (remotepeer == null) ? "unknown" : remotepeer.getName());
|
||||
|
|
|
@ -89,14 +89,14 @@ public final class transfer {
|
|||
final yacySeed opeer = yacyCore.seedDB.get(ohash);
|
||||
if (opeer == null) {
|
||||
// reject unknown peers: this does not appear fair, but anonymous senders are dangerous
|
||||
sb.getLog().logFine("RankingTransmission: rejected unknown peer '" + ohash + "', current IP " + header.get("CLIENTIP", "unknown"));
|
||||
sb.getLog().logFine("RankingTransmission: rejected unknown peer '" + ohash + "', current IP " + header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown"));
|
||||
return prop;
|
||||
}
|
||||
opeer.setLastSeenUTC();
|
||||
|
||||
if (filename.indexOf("..") >= 0) {
|
||||
// reject paths that contain '..' because they are dangerous
|
||||
sb.getLog().logFine("RankingTransmission: rejected wrong path '" + filename + "' from peer " + opeer.getName() + "/" + opeer.getPublicAddress()+ ", current IP " + header.get("CLIENTIP", "unknown"));
|
||||
sb.getLog().logFine("RankingTransmission: rejected wrong path '" + filename + "' from peer " + opeer.getName() + "/" + opeer.getPublicAddress()+ ", current IP " + header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown"));
|
||||
return prop;
|
||||
}
|
||||
|
||||
|
|
|
@ -257,7 +257,7 @@ public class yacysearch {
|
|||
constraint,
|
||||
true);
|
||||
|
||||
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search
|
||||
String client = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP); // the search client who initiated the search
|
||||
|
||||
// tell all threads to do nothing for a specific time
|
||||
sb.intermissionAllThreads(10000);
|
||||
|
|
|
@ -155,7 +155,7 @@ public final class userDB {
|
|||
return null;
|
||||
}
|
||||
public Entry getUser(httpHeader header){
|
||||
return getUser((String) header.get(httpHeader.AUTHORIZATION), (String)header.get("CLIENTIP"), header.getHeaderCookies());
|
||||
return getUser((String) header.get(httpHeader.AUTHORIZATION), (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP), header.getHeaderCookies());
|
||||
}
|
||||
public Entry getUser(String auth, String ip, String cookies){
|
||||
Entry entry=null;
|
||||
|
|
|
@ -85,7 +85,7 @@ public class httpSSI {
|
|||
conProp.setProperty(httpHeader.CONNECTION_PROP_PATH, path);
|
||||
conProp.setProperty(httpHeader.CONNECTION_PROP_ARGS, args);
|
||||
conProp.setProperty(httpHeader.CONNECTION_PROP_HTTP_VER, httpHeader.HTTP_VERSION_0_9);
|
||||
conProp.setProperty("CLIENTIP", "127.0.0.1");
|
||||
conProp.setProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1");
|
||||
header.put(httpHeader.AUTHORIZATION, authorization);
|
||||
httpdFileHandler.doGet(conProp, header, out);
|
||||
}
|
||||
|
|
|
@ -193,7 +193,7 @@ public final class httpd implements serverHandler {
|
|||
public void initSession(serverCore.Session newsession) throws IOException {
|
||||
this.session = newsession;
|
||||
this.userAddress = session.userAddress; // client InetAddress
|
||||
this.clientIP = this.userAddress.getHostAddress();
|
||||
this.clientIP = this.userAddress.getHostName();
|
||||
if (this.userAddress.isAnyLocalAddress()) this.clientIP = "localhost";
|
||||
if (this.clientIP.equals("0:0:0:0:0:0:0:1")) this.clientIP = "localhost";
|
||||
if (this.clientIP.equals("127.0.0.1")) this.clientIP = "localhost";
|
||||
|
@ -1147,7 +1147,7 @@ public final class httpd implements serverHandler {
|
|||
// tp.put("host", serverCore.publicIP().getHostAddress());
|
||||
// tp.put("port", switchboard.getConfig("port", "8080"));
|
||||
|
||||
String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP,"127.0.0.1");
|
||||
String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1");
|
||||
|
||||
// check if ip is local ip address
|
||||
InetAddress hostAddress = serverDomains.dnsResolve(clientIP);
|
||||
|
|
|
@ -303,13 +303,13 @@ public final class httpdFileHandler {
|
|||
if ((path.substring(0,(pos==-1)?path.length():pos)).endsWith("_p") && (adminAccountBase64MD5.length() != 0)) {
|
||||
//authentication required
|
||||
//userDB
|
||||
if(sb.userDB.hasAdminRight(authorization, conProp.getProperty("CLIENTIP"), requestHeader.getHeaderCookies())){
|
||||
if(sb.userDB.hasAdminRight(authorization, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP), requestHeader.getHeaderCookies())){
|
||||
//Authentication successful. remove brute-force flag
|
||||
serverCore.bfHost.remove(conProp.getProperty("CLIENTIP"));
|
||||
serverCore.bfHost.remove(conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP));
|
||||
//static
|
||||
}else if(authorization != null && httpd.staticAdminAuthenticated(authorization.trim().substring(6), switchboard)==4){
|
||||
//Authentication successful. remove brute-force flag
|
||||
serverCore.bfHost.remove(conProp.getProperty("CLIENTIP"));
|
||||
serverCore.bfHost.remove(conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP));
|
||||
//no auth
|
||||
}else if (authorization == null) {
|
||||
// no authorization given in response. Ask for that
|
||||
|
@ -323,7 +323,7 @@ public final class httpdFileHandler {
|
|||
return;
|
||||
} else {
|
||||
// a wrong authentication was given or the userDB user does not have admin access. Ask again
|
||||
String clientIP = conProp.getProperty("CLIENTIP", "unknown-host");
|
||||
String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown-host");
|
||||
serverLog.logInfo("HTTPD", "Wrong log-in for account 'admin' in http file handler for path '" + path + "' from host '" + clientIP + "'");
|
||||
Integer attempts = (Integer) serverCore.bfHost.get(clientIP);
|
||||
if (attempts == null)
|
||||
|
@ -473,7 +473,7 @@ public final class httpdFileHandler {
|
|||
// call an image-servlet to produce an on-the-fly - generated image
|
||||
Object img = null;
|
||||
try {
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP"));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path);
|
||||
// in case that there are no args given, args = null or empty hashmap
|
||||
img = invokeServlet(targetClass, requestHeader, args);
|
||||
|
@ -527,7 +527,7 @@ public final class httpdFileHandler {
|
|||
}
|
||||
} else if ((targetClass != null) && (path.endsWith(".stream"))) {
|
||||
// call rewrite-class
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP"));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path);
|
||||
//requestHeader.put(httpHeader.CONNECTION_PROP_INPUTSTREAM, body);
|
||||
//requestHeader.put(httpHeader.CONNECTION_PROP_OUTPUTSTREAM, out);
|
||||
|
@ -570,7 +570,7 @@ public final class httpdFileHandler {
|
|||
} else {
|
||||
// CGI-class: call the class to create a property for rewriting
|
||||
try {
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP"));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP));
|
||||
requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path);
|
||||
// in case that there are no args given, args = null or empty hashmap
|
||||
Object tmp = invokeServlet(targetClass, requestHeader, args);
|
||||
|
@ -586,7 +586,7 @@ public final class httpdFileHandler {
|
|||
if (tp.containsKey(servletProperties.ACTION_AUTHENTICATE)) {
|
||||
// handle brute-force protection
|
||||
if (authorization != null) {
|
||||
String clientIP = conProp.getProperty("CLIENTIP", "unknown-host");
|
||||
String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown-host");
|
||||
serverLog.logInfo("HTTPD", "dynamic log-in for account 'admin' in http file handler for path '" + path + "' from host '" + clientIP + "'");
|
||||
Integer attempts = (Integer) serverCore.bfHost.get(clientIP);
|
||||
if (attempts == null)
|
||||
|
|
|
@ -47,8 +47,6 @@ public interface indexRWIEntry {
|
|||
|
||||
public String urlHash();
|
||||
|
||||
public int quality();
|
||||
|
||||
public int virtualAge();
|
||||
|
||||
public long lastModified();
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
|
||||
package de.anomic.index;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
@ -55,12 +56,13 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
this.maxdomcount = 0;
|
||||
}
|
||||
|
||||
public void normalizeWith(indexContainer container) {
|
||||
public ArrayList<indexRWIVarEntry> normalizeWith(indexContainer container) {
|
||||
// normalize ranking: find minimum and maxiumum of separate ranking criteria
|
||||
assert (container != null);
|
||||
ArrayList<indexRWIVarEntry> result = null;
|
||||
|
||||
//long s0 = System.currentTimeMillis();
|
||||
if ((processors > 1) && (container.size() > 10000)) {
|
||||
if ((processors > 1) && (container.size() > 600)) {
|
||||
// run minmax with two threads
|
||||
int middle = container.size() / 2;
|
||||
minmaxfinder mmf0 = new minmaxfinder(container, 0, middle);
|
||||
|
@ -83,6 +85,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
entry = di.next();
|
||||
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
|
||||
}
|
||||
result = mmf0.decodedEntries;
|
||||
result.addAll(mmf1.decodedContainer());
|
||||
//long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
|
||||
//System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
|
||||
} else if (container.size() > 0) {
|
||||
|
@ -97,10 +101,12 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
entry = di.next();
|
||||
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
|
||||
}
|
||||
result = mmf.decodedContainer();
|
||||
//long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
|
||||
//System.out.println("***DEBUG*** indexRWIEntry.Order (ONETHREAD): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
|
||||
}
|
||||
if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
|
||||
return result;
|
||||
}
|
||||
|
||||
public kelondroOrder<indexRWIVarEntry> clone() {
|
||||
|
@ -179,6 +185,7 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
private int start, end;
|
||||
private HashMap<String, Integer> doms;
|
||||
private Integer int1;
|
||||
ArrayList<indexRWIVarEntry> decodedEntries;
|
||||
|
||||
public minmaxfinder(indexContainer container, int start /*including*/, int end /*excluding*/) {
|
||||
this.container = container;
|
||||
|
@ -186,18 +193,20 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
this.end = end;
|
||||
this.doms = new HashMap<String, Integer>();
|
||||
this.int1 = new Integer(1);
|
||||
this.decodedEntries = new ArrayList<indexRWIVarEntry>();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
// find min/max to obtain limits for normalization
|
||||
this.entryMin = null;
|
||||
this.entryMax = null;
|
||||
indexRWIRowEntry iEntry;
|
||||
indexRWIVarEntry iEntry;
|
||||
int p = this.start;
|
||||
String dom;
|
||||
Integer count;
|
||||
while (p < this.end) {
|
||||
iEntry = new indexRWIRowEntry(container.get(p++));
|
||||
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++)));
|
||||
this.decodedEntries.add(iEntry);
|
||||
// find min/max
|
||||
if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry);
|
||||
if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry);
|
||||
|
@ -212,6 +221,10 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
}
|
||||
}
|
||||
|
||||
public ArrayList<indexRWIVarEntry> decodedContainer() {
|
||||
return this.decodedEntries;
|
||||
}
|
||||
|
||||
public HashMap<String, Integer> domcount() {
|
||||
return this.doms;
|
||||
}
|
||||
|
|
|
@ -88,6 +88,8 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
|
||||
private static final int col_reserve = 19; // k 1 reserve
|
||||
|
||||
public double termFrequency;
|
||||
|
||||
private kelondroRow.Entry entry;
|
||||
|
||||
public indexRWIRowEntry(String urlHash,
|
||||
|
@ -101,14 +103,14 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
int posinphrase, // position of word in its phrase
|
||||
int posofphrase, // number of the phrase where word appears
|
||||
int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
|
||||
int sizeOfPage, // # of bytes of the page TODO: not needed any more
|
||||
long lastmodified, // last-modified time of the document where word appears
|
||||
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
String language, // (guessed) language of document
|
||||
char doctype, // type of document
|
||||
int outlinksSame, // outlinks to same domain
|
||||
int outlinksOther, // outlinks to other domain
|
||||
kelondroBitfield flags // attributes to the url and to the word according the url
|
||||
kelondroBitfield flags, // attributes to the url and to the word according the url
|
||||
double termFrequency
|
||||
) {
|
||||
|
||||
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
||||
|
@ -136,6 +138,7 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
this.entry.setCol(col_posofphrase, posofphrase);
|
||||
this.entry.setCol(col_worddistance, worddistance);
|
||||
this.entry.setCol(col_reserve, 0);
|
||||
this.termFrequency = termFrequency;
|
||||
}
|
||||
|
||||
public indexRWIRowEntry(String urlHash, String code) {
|
||||
|
@ -183,10 +186,6 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
return this.entry.getColString(col_urlhash, null);
|
||||
}
|
||||
|
||||
public int quality() {
|
||||
return 0; // not used any more
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
|
||||
}
|
||||
|
@ -256,7 +255,8 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
}
|
||||
|
||||
public double termFrequency() {
|
||||
return (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
|
||||
if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
|
||||
return this.termFrequency;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
@ -288,18 +288,12 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
public boolean isNewer(indexRWIEntry other) {
|
||||
if (other == null) return true;
|
||||
if (this.lastModified() > other.lastModified()) return true;
|
||||
if (this.lastModified() == other.lastModified()) {
|
||||
if (this.quality() > other.quality()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isOlder(indexRWIEntry other) {
|
||||
if (other == null) return false;
|
||||
if (this.lastModified() < other.lastModified()) return true;
|
||||
if (this.lastModified() == other.lastModified()) {
|
||||
if (this.quality() < other.quality()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
public char type;
|
||||
public int hitcount, llocal, lother, phrasesintext, posintext,
|
||||
posinphrase, posofphrase,
|
||||
quality, urlcomps, urllength, virtualAge,
|
||||
urlcomps, urllength, virtualAge,
|
||||
worddistance, wordsintext, wordsintitle;
|
||||
public double termFrequency;
|
||||
|
||||
|
@ -55,7 +55,6 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
this.posintext = e.posintext();
|
||||
this.posinphrase = e.posinphrase();
|
||||
this.posofphrase = e.posofphrase();
|
||||
this.quality = e.quality();
|
||||
this.urlcomps = e.urlcomps();
|
||||
this.urllength = e.urllength();
|
||||
this.virtualAge = e.virtualAge();
|
||||
|
@ -134,8 +133,28 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
return posofphrase;
|
||||
}
|
||||
|
||||
public int quality() {
|
||||
return quality;
|
||||
private indexRWIRowEntry toRowEntry() {
|
||||
return new indexRWIRowEntry(
|
||||
urlHash,
|
||||
urllength, // byte-length of complete URL
|
||||
urlcomps, // number of path components
|
||||
wordsintitle, // length of description/length (longer are better?)
|
||||
hitcount, // how often appears this word in the text
|
||||
wordsintext, // total number of words
|
||||
phrasesintext, // total number of phrases
|
||||
posintext, // position of word in all words
|
||||
posinphrase, // position of word in its phrase
|
||||
posofphrase, // number of the phrase where word appears
|
||||
worddistance, // word distance
|
||||
lastModified, // last-modified time of the document where word appears
|
||||
System.currentTimeMillis(), // update time;
|
||||
language, // (guessed) language of document
|
||||
type, // type of document
|
||||
llocal, // outlinks to same domain
|
||||
lother, // outlinks to other domain
|
||||
flags, // attributes to the url and to the word according the url
|
||||
termFrequency
|
||||
);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
|
@ -144,8 +163,7 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
}
|
||||
|
||||
public String toPropertyForm() {
|
||||
assert false; // should not be used
|
||||
return null;
|
||||
return toRowEntry().toPropertyForm();
|
||||
}
|
||||
|
||||
public String urlHash() {
|
||||
|
@ -177,7 +195,8 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
}
|
||||
|
||||
public double termFrequency() {
|
||||
return termFrequency;
|
||||
if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
|
||||
return this.termFrequency;
|
||||
}
|
||||
|
||||
public static final void min(indexRWIVarEntry t, indexRWIEntry other) {
|
||||
|
@ -187,7 +206,6 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
if (t.hitcount() > (v = other.hitcount())) t.hitcount = v;
|
||||
if (t.llocal() > (v = other.llocal())) t.llocal = v;
|
||||
if (t.lother() > (v = other.lother())) t.lother = v;
|
||||
if (t.quality() > (v = other.quality())) t.quality = v;
|
||||
if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v;
|
||||
if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v;
|
||||
if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v;
|
||||
|
@ -210,7 +228,6 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
if (t.hitcount() < (v = other.hitcount())) t.hitcount = v;
|
||||
if (t.llocal() < (v = other.llocal())) t.llocal = v;
|
||||
if (t.lother() < (v = other.lother())) t.lother = v;
|
||||
if (t.quality() < (v = other.quality())) t.quality = v;
|
||||
if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v;
|
||||
if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v;
|
||||
if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v;
|
||||
|
|
|
@ -115,7 +115,7 @@ public class indexURLEntry {
|
|||
|
||||
private kelondroRow.Entry entry;
|
||||
private String snippet;
|
||||
private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests
|
||||
private indexRWIEntry word; // this is only used if the url is transported via remote search requests
|
||||
private long ranking; // during generation of a search result this value is set
|
||||
|
||||
public indexURLEntry(
|
||||
|
@ -185,7 +185,7 @@ public class indexURLEntry {
|
|||
return s.toString().getBytes();
|
||||
}
|
||||
|
||||
public indexURLEntry(kelondroRow.Entry entry, indexRWIRowEntry searchedWord, long ranking) {
|
||||
public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) {
|
||||
this.entry = entry;
|
||||
this.snippet = null;
|
||||
this.word = searchedWord;
|
||||
|
@ -391,7 +391,7 @@ public class indexURLEntry {
|
|||
return snippet;
|
||||
}
|
||||
|
||||
public indexRWIRowEntry word() {
|
||||
public indexRWIEntry word() {
|
||||
return word;
|
||||
}
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ public class kelondroSplitTable implements kelondroIndex {
|
|||
// this is a kelonodroFlex table
|
||||
table = new kelondroCache(new kelondroFlexTable(path, maxf, preloadTime, rowdef, 0, resetOnFail));
|
||||
} else {
|
||||
table = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
|
||||
table = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
|
||||
}
|
||||
tables.put(date, table);
|
||||
}
|
||||
|
|
|
@ -66,7 +66,7 @@ import java.util.LinkedList;
|
|||
import de.anomic.data.htmlTools;
|
||||
import de.anomic.http.httpc;
|
||||
import de.anomic.http.httpc.response;
|
||||
import de.anomic.index.indexRWIRowEntry;
|
||||
import de.anomic.index.indexRWIEntry;
|
||||
import de.anomic.index.indexURLEntry;
|
||||
import de.anomic.kelondro.kelondroBase64Order;
|
||||
import de.anomic.kelondro.kelondroCache;
|
||||
|
@ -153,7 +153,7 @@ public final class plasmaCrawlLURL {
|
|||
return 0;
|
||||
}
|
||||
|
||||
public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) {
|
||||
public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) {
|
||||
// generates an plasmaLURLEntry using the url hash
|
||||
// to speed up the access, the url-hashes are buffered
|
||||
// in the hash cache.
|
||||
|
|
|
@ -69,7 +69,7 @@ public class plasmaCrawlZURL {
|
|||
if (f.isDirectory()) kelondroFlexTable.delete(cachePath, tablename); else f.delete();
|
||||
}
|
||||
}
|
||||
urlIndex = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
|
||||
urlIndex = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
|
||||
//urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ package de.anomic.plasma;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
@ -40,6 +41,7 @@ import de.anomic.index.indexContainer;
|
|||
import de.anomic.index.indexRWIEntry;
|
||||
import de.anomic.index.indexRWIEntryOrder;
|
||||
import de.anomic.index.indexRWIRowEntry;
|
||||
import de.anomic.index.indexRWIVarEntry;
|
||||
import de.anomic.index.indexURLEntry;
|
||||
import de.anomic.kelondro.kelondroBinSearch;
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
|
@ -52,8 +54,8 @@ public final class plasmaSearchRankingProcess {
|
|||
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
|
||||
private static boolean useYBR = true;
|
||||
|
||||
private TreeMap<Object, indexRWIRowEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
|
||||
private HashMap<String, TreeMap<Object, indexRWIRowEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
|
||||
private TreeMap<Object, indexRWIVarEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
|
||||
private HashMap<String, TreeMap<Object, indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
|
||||
private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
|
||||
private plasmaSearchQuery query;
|
||||
private int sortorder;
|
||||
|
@ -72,8 +74,8 @@ public final class plasmaSearchRankingProcess {
|
|||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
// sortorder: 0 = hash, 1 = url, 2 = ranking
|
||||
this.localSearchContainerMaps = null;
|
||||
this.sortedRWIEntries = new TreeMap<Object, indexRWIRowEntry>();
|
||||
this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIRowEntry>>();
|
||||
this.sortedRWIEntries = new TreeMap<Object, indexRWIVarEntry>();
|
||||
this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIVarEntry>>();
|
||||
this.handover = new HashMap<String, String>();
|
||||
this.order = null;
|
||||
this.query = query;
|
||||
|
@ -132,11 +134,11 @@ public final class plasmaSearchRankingProcess {
|
|||
this.remote_indexCount += index.size();
|
||||
}
|
||||
|
||||
indexRWIRowEntry ientry;
|
||||
indexRWIVarEntry ientry;
|
||||
indexURLEntry uentry;
|
||||
String u;
|
||||
loop: while (en.hasNext()) {
|
||||
ientry = en.next();
|
||||
ientry = new indexRWIVarEntry(en.next());
|
||||
|
||||
// check constraints
|
||||
if (!testFlags(ientry)) continue loop;
|
||||
|
@ -183,13 +185,13 @@ public final class plasmaSearchRankingProcess {
|
|||
if (this.order == null) {
|
||||
this.order = new indexRWIEntryOrder(query.ranking);
|
||||
}
|
||||
this.order.normalizeWith(index);
|
||||
ArrayList<indexRWIVarEntry> decodedEntries = this.order.normalizeWith(index);
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));
|
||||
|
||||
// normalize entries and get ranking
|
||||
timer = System.currentTimeMillis();
|
||||
Iterator<indexRWIRowEntry> i = index.entries();
|
||||
indexRWIRowEntry iEntry, l;
|
||||
Iterator<indexRWIVarEntry> i = decodedEntries.iterator();
|
||||
indexRWIVarEntry iEntry, l;
|
||||
long biggestEntry = 0;
|
||||
//long s0 = System.currentTimeMillis();
|
||||
Long r;
|
||||
|
@ -272,8 +274,8 @@ public final class plasmaSearchRankingProcess {
|
|||
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
|
||||
// returns the best entry from the current RWI list and removes this entry from the list
|
||||
Object bestEntry;
|
||||
TreeMap<Object, indexRWIRowEntry> m;
|
||||
indexRWIRowEntry rwi;
|
||||
TreeMap<Object, indexRWIVarEntry> m;
|
||||
indexRWIVarEntry rwi;
|
||||
while (sortedRWIEntries.size() > 0) {
|
||||
bestEntry = sortedRWIEntries.firstKey();
|
||||
rwi = sortedRWIEntries.remove(bestEntry);
|
||||
|
@ -283,7 +285,7 @@ public final class plasmaSearchRankingProcess {
|
|||
m = this.doubleDomCache.get(domhash);
|
||||
if (m == null) {
|
||||
// first appearance of dom
|
||||
m = new TreeMap<Object, indexRWIRowEntry>();
|
||||
m = new TreeMap<Object, indexRWIVarEntry>();
|
||||
this.doubleDomCache.put(domhash, m);
|
||||
return new Object[]{bestEntry, rwi};
|
||||
}
|
||||
|
@ -292,10 +294,10 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
|
||||
// find best entry from all caches
|
||||
Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
|
||||
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
bestEntry = null;
|
||||
Object o;
|
||||
indexRWIRowEntry bestrwi = null;
|
||||
indexRWIVarEntry bestrwi = null;
|
||||
while (i.hasNext()) {
|
||||
m = i.next();
|
||||
if (m.size() == 0) continue;
|
||||
|
@ -331,7 +333,7 @@ public final class plasmaSearchRankingProcess {
|
|||
while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
|
||||
Object[] obrwi = bestRWI(skipDoubleDom);
|
||||
Object bestEntry = obrwi[0];
|
||||
indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1];
|
||||
indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1];
|
||||
long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
|
||||
indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
|
||||
if (u != null) {
|
||||
|
@ -347,7 +349,7 @@ public final class plasmaSearchRankingProcess {
|
|||
public synchronized int size() {
|
||||
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
|
||||
int c = sortedRWIEntries.size();
|
||||
Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
|
||||
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
while (i.hasNext()) c += i.next().size();
|
||||
return c;
|
||||
}
|
||||
|
|
|
@ -414,7 +414,7 @@ public class plasmaSnippetCache {
|
|||
resInfo = entry.getDocumentInfo();
|
||||
|
||||
// read resource body (if it is there)
|
||||
byte []resourceArray = entry.cacheArray();
|
||||
byte[] resourceArray = entry.cacheArray();
|
||||
if (resourceArray != null) {
|
||||
resContent = new ByteArrayInputStream(resourceArray);
|
||||
resContentLength = resourceArray.length;
|
||||
|
|
|
@ -906,7 +906,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
} catch (MalformedURLException e) {
|
||||
}
|
||||
} else {
|
||||
File networkUnitDefinitionFile = new File(rootPath, networkUnitDefinition);
|
||||
File networkUnitDefinitionFile = (networkUnitDefinition.startsWith("/")) ? new File(networkUnitDefinition) : new File(rootPath, networkUnitDefinition);
|
||||
if (networkUnitDefinitionFile.exists()) {
|
||||
initProps = serverFileUtils.loadHashMap(networkUnitDefinitionFile);
|
||||
this.setConfig(initProps);
|
||||
|
@ -2348,14 +2348,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
wordStat.posInPhrase,
|
||||
wordStat.numOfPhrase,
|
||||
0,
|
||||
newEntry.size(),
|
||||
docDate.getTime(),
|
||||
System.currentTimeMillis(),
|
||||
language,
|
||||
doctype,
|
||||
ioLinks[0].intValue(),
|
||||
ioLinks[1].intValue(),
|
||||
condenser.RESULT_FLAGS
|
||||
condenser.RESULT_FLAGS,
|
||||
0.0
|
||||
);
|
||||
indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1);
|
||||
wordIdxContainer.add(wordIdxEntry);
|
||||
|
@ -2573,10 +2573,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
if (authorization.length() > 256) return 0;
|
||||
|
||||
// authorization by encoded password, only for localhost access
|
||||
if ((((String) header.get("CLIENTIP", "")).equals("localhost")) && (adminAccountBase64MD5.equals(authorization))) return 3; // soft-authenticated for localhost
|
||||
if ((((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost")) && (adminAccountBase64MD5.equals(authorization))) return 3; // soft-authenticated for localhost
|
||||
|
||||
// authorization by hit in userDB
|
||||
if (userDB.hasAdminRight((String) header.get(httpHeader.AUTHORIZATION, "xxxxxx"), ((String) header.get("CLIENTIP", "")), header.getHeaderCookies())) return 4; //return, because 4=max
|
||||
if (userDB.hasAdminRight((String) header.get(httpHeader.AUTHORIZATION, "xxxxxx"), ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")), header.getHeaderCookies())) return 4; //return, because 4=max
|
||||
|
||||
// authorization with admin keyword in configuration
|
||||
return httpd.staticAdminAuthenticated(authorization, this);
|
||||
|
|
|
@ -314,13 +314,13 @@ public final class plasmaWordIndex implements indexRI {
|
|||
wprop.posInPhrase,
|
||||
wprop.numOfPhrase,
|
||||
0,
|
||||
size,
|
||||
urlModified.getTime(),
|
||||
System.currentTimeMillis(),
|
||||
language,
|
||||
doctype,
|
||||
outlinksSame, outlinksOther,
|
||||
wprop.flags);
|
||||
wprop.flags,
|
||||
0.0);
|
||||
addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
|
||||
wordCount++;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user