enhanced root-url detection

This commit is contained in:
Michael Peter Christen 2013-01-03 19:21:21 +01:00
parent 5a0eb1b268
commit 0f5b6f38c1
5 changed files with 35 additions and 24 deletions

View File

@ -541,8 +541,7 @@ public class IndexControlRWIs_p {
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "")
+ ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : ""));
+ ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : ""));
if ( Switchboard.urlBlacklist.isListed(BlacklistType.DHT, url) ) {
prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxChecked", "1");
}

View File

@ -40,6 +40,7 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.util.CommonPattern;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteArray;
@ -278,20 +279,29 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0);
}
private static final char rootURLFlag0 = subdomPortPath("", 80, "");
private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
private static final char rootURLFlag2 = subdomPortPath("", 21, "");
private static final char rootURLFlag3 = subdomPortPath("ftp", 21, "");
public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
public final boolean probablyRootURL() {
return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash);
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
}
public static final boolean probablyRootURL(final byte[] urlHash) {
final char c = (char) urlHash[5];
return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3;
/**
 * Collect the url hashes of all addresses that are considered to be the root
 * document of the host of this url.
 * The candidates are built from the protocol and the host of this url, combined
 * with the empty path, "/" and the index/home/default page names that
 * {@link #rootPattern} also accepts.
 * NOTE(review): the stub is built without a port, so for hosts that are served
 * on a non-default port the candidates possibly do not match - to be confirmed.
 *
 * Collecting is best-effort: if a candidate url cannot be created or cannot be
 * stored, the candidates collected so far are returned.
 *
 * @return a set of url hashes of possible root urls; never null, but possibly incomplete or empty
 */
public RowHandleSet getPossibleRootHashes() {
    final RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
    final String rootStub = this.getProtocol() + "://" + this.getHost();
    // the order of the paths is the order in which the candidates are inserted
    final String[] rootPaths = {
        "", "/",
        "/index.htm", "/index.html", "/index.php",
        "/home.htm", "/home.html", "/home.php",
        "/default.htm", "/default.html", "/default.php"
    };
    try {
        for (final String rootPath : rootPaths) {
            rootCandidates.put(new DigestURI(rootStub + rootPath).hash());
        }
    } catch (final Exception e) {
        // deliberately ignored: an incomplete candidate set is acceptable here.
        // Only Exception is caught; Errors (i.e. OutOfMemoryError) must not be swallowed silently.
    }
    return rootCandidates;
}
private static final String hosthash5(final String protocol, final String host, final int port) {

View File

@ -267,7 +267,7 @@ public final class RowHandleMap implements HandleMap, Iterable<Map.Entry<byte[],
@Override
public final long add(final byte[] key, final long a) throws SpaceExceededException {
assert key != null;
assert a > 0; // it does not make sense to add 0. If this occurres, it is a performance issue
assert a >= 0; // it does not make sense to add 0. If this occurs, it is a performance issue
synchronized (this.index) {
final Row.Entry indexentry = this.index.get(key, true);
if (indexentry == null) {

View File

@ -330,15 +330,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
String docurl = digestURI.toNormalform(true);
add(doc, YaCySchema.sku, docurl);
if (allAttr || contains(YaCySchema.clickdepth_i)) {
boolean fronturl = digestURI.probablyRootURL();
if (fronturl) {
if ((allAttr || contains(YaCySchema.clickdepth_i)) && citations != null) {
if (digestURI.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(doc, YaCySchema.clickdepth_i, 0);
this.lazy = lc;
} else {
// search the citations for references
int clickdepth = -1;
try {
clickdepth = getClickDepth(citations, digestURI.hash());
clickdepth = getClickDepth(citations, digestURI);
} catch (IOException e) {
add(doc, YaCySchema.clickdepth_i, -1);
}
@ -840,8 +841,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
* @throws IOException
*/
private int getClickDepth(final IndexCell<CitationReference> citations, byte[] searchhash) throws IOException {
private static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = url.getPossibleRootHashes();
RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
@ -873,7 +877,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
// check if the url is a root url
if (DigestURI.probablyRootURL(u)) {
if (rootCandidates.has(u)) {
return leveldepth + 1;
}

View File

@ -255,8 +255,7 @@ public class ReferenceOrder {
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0);
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
@ -290,8 +289,7 @@ public class ReferenceOrder {
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0);
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0);
return r; // the higher the number the better the ranking.
}