mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced root-url detection
This commit is contained in:
parent
5a0eb1b268
commit
0f5b6f38c1
|
@ -541,8 +541,7 @@ public class IndexControlRWIs_p {
|
|||
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "")
|
||||
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "")
|
||||
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "")
|
||||
+ ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "")
|
||||
+ ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : ""));
|
||||
+ ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : ""));
|
||||
if ( Switchboard.urlBlacklist.isListed(BlacklistType.DHT, url) ) {
|
||||
prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxChecked", "1");
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@ import net.yacy.cora.order.Base64Order;
|
|||
import net.yacy.cora.order.Digest;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.util.CommonPattern;
|
||||
import net.yacy.kelondro.index.RowHandleSet;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.ByteArray;
|
||||
|
||||
|
@ -278,20 +279,29 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
|
|||
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0);
|
||||
}
|
||||
|
||||
private static final char rootURLFlag0 = subdomPortPath("", 80, "");
|
||||
private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
|
||||
private static final char rootURLFlag2 = subdomPortPath("", 21, "");
|
||||
private static final char rootURLFlag3 = subdomPortPath("ftp", 21, "");
|
||||
|
||||
public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
|
||||
public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
|
||||
|
||||
public final boolean probablyRootURL() {
|
||||
return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash);
|
||||
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
|
||||
}
|
||||
|
||||
public static final boolean probablyRootURL(final byte[] urlHash) {
|
||||
final char c = (char) urlHash[5];
|
||||
return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3;
|
||||
|
||||
public RowHandleSet getPossibleRootHashes() {
|
||||
RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
|
||||
String rootStub = this.getProtocol() + "://" + this.getHost();
|
||||
try {
|
||||
rootCandidates.put(new DigestURI(rootStub).hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/index.htm").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/index.html").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/index.php").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/home.htm").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/home.html").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/home.php").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/default.htm").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/default.html").hash());
|
||||
rootCandidates.put(new DigestURI(rootStub + "/default.php").hash());
|
||||
} catch (Throwable e) {}
|
||||
return rootCandidates;
|
||||
}
|
||||
|
||||
private static final String hosthash5(final String protocol, final String host, final int port) {
|
||||
|
|
|
@ -267,7 +267,7 @@ public final class RowHandleMap implements HandleMap, Iterable<Map.Entry<byte[],
|
|||
@Override
|
||||
public final long add(final byte[] key, final long a) throws SpaceExceededException {
|
||||
assert key != null;
|
||||
assert a > 0; // it does not make sense to add 0. If this occurs, it is a performance issue
|
||||
assert a >= 0; // it does not make sense to add 0. If this occurs, it is a performance issue
|
||||
synchronized (this.index) {
|
||||
final Row.Entry indexentry = this.index.get(key, true);
|
||||
if (indexentry == null) {
|
||||
|
|
|
@ -330,15 +330,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
String docurl = digestURI.toNormalform(true);
|
||||
add(doc, YaCySchema.sku, docurl);
|
||||
|
||||
if (allAttr || contains(YaCySchema.clickdepth_i)) {
|
||||
boolean fronturl = digestURI.probablyRootURL();
|
||||
if (fronturl) {
|
||||
if ((allAttr || contains(YaCySchema.clickdepth_i)) && citations != null) {
|
||||
if (digestURI.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
add(doc, YaCySchema.clickdepth_i, 0);
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
// search the citations for references
|
||||
int clickdepth = -1;
|
||||
try {
|
||||
clickdepth = getClickDepth(citations, digestURI.hash());
|
||||
clickdepth = getClickDepth(citations, digestURI);
|
||||
} catch (IOException e) {
|
||||
add(doc, YaCySchema.clickdepth_i, -1);
|
||||
}
|
||||
|
@ -840,8 +841,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
* @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
|
||||
* @throws IOException
|
||||
*/
|
||||
private int getClickDepth(final IndexCell<CitationReference> citations, byte[] searchhash) throws IOException {
|
||||
private static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
|
||||
|
||||
final byte[] searchhash = url.hash();
|
||||
RowHandleSet rootCandidates = url.getPossibleRootHashes();
|
||||
|
||||
RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
|
||||
RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
|
||||
try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
|
||||
|
@ -873,7 +877,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
|
||||
|
||||
// check if the url is a root url
|
||||
if (DigestURI.probablyRootURL(u)) {
|
||||
if (rootCandidates.has(u)) {
|
||||
return leveldepth + 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -255,8 +255,7 @@ public class ReferenceOrder {
|
|||
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
|
||||
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0)
|
||||
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0);
|
||||
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0);
|
||||
|
||||
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
|
||||
|
||||
|
@ -290,8 +289,7 @@ public class ReferenceOrder {
|
|||
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
|
||||
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0)
|
||||
+ ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0);
|
||||
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0);
|
||||
return r; // the higher the number the better the ranking.
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user