Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

This commit is contained in:
orbiter 2013-09-23 20:59:03 +02:00
commit 214a087cdf
4 changed files with 40 additions and 29 deletions

View File

@ -48,7 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
public class DigestURL extends MultiProtocolURL implements Serializable {
private static final long serialVersionUID = -1173233022912141885L;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
// class variables
private byte[] hash;

View File

@ -619,7 +619,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {

View File

@ -201,20 +201,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* add uri attributes to solr document
* @param doc
* @param allAttr
* @param digestURI
* @param digestURL
* @param doctype
* @return the normalized url
*/
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
String us = digestURI.toNormalform(true);
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
String us = digestURL.toNormalform(true);
add(doc, CollectionSchema.sku, us);
if (allAttr || contains(CollectionSchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
final InetAddress address = digestURL.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
}
String host = null;
if ((host = digestURI.getHost()) != null) {
if ((host = digestURL.getHost()) != null) {
String dnc = Domains.getDNC(host);
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
int p = subdomOrga.lastIndexOf('.');
@ -228,17 +229,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// path elements of link
String filename = digestURI.getFileName();
String filename = digestURL.getFileName();
String extension = MultiProtocolURL.getFileExtension(filename);
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));
Map<String, String> searchpart = digestURI.getSearchpartMap();
Map<String, String> searchpart = digestURL.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
} else {
@ -309,7 +310,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
@ -357,27 +357,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURL digestURI = document.dc_source();
final DigestURL digestURL = document.dc_source();
final String id = ASCII.String(digestURL.hash());
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
add(doc, CollectionSchema.id, id);
String us = digestURI.toNormalform(true);
String us = digestURL.toNormalform(true);
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
if (digestURI.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
if (digestURL.probablyRootURL()) {
clickdepth = 0;
this.lazy = lc;
} else {
clickdepth = 999;
}
@ -712,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (refresh != null && refresh.length() > 0) {
MultiProtocolURL refreshURL;
try {
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath());
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
outboundLinks.remove(refreshURL);
@ -785,7 +783,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
String content = document.getTextString();
String tokens = digestURI.toTokens();
String tokens = digestURL.toTokens();
if (content == null || content.length() == 0) {
content = tokens;
} else {
@ -798,9 +796,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) {
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURI.toTokens(); // remove all other entry but the url tokens
content = digestURL.toTokens(); // remove all other entry but the url tokens
}
// content (must be written after special parser data, since this can influence the content)
@ -824,7 +822,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
}
// list all links
@ -850,7 +848,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
@ -1269,6 +1266,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
configuration.add(doc, CollectionSchema.collection_sxt, cs);
}
// clickdepth, cr and postprocessing
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
}
if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || configuration.contains(CollectionSchema.process_sxt)) {
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
configuration.add(doc, CollectionSchema.process_sxt, p);
}
return doc;
}

View File

@ -391,7 +391,6 @@ public final class TemplateEngine {
// #%
} else if ((bb & 0xFF) == pcChar) { //include
final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
byte[] filename = keyStream.toByteArray();
@ -403,6 +402,7 @@ public final class TemplateEngine {
filename= replacePattern(patternkey, pattern, dflt);
}
if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
final ByteBuffer include = new ByteBuffer();
BufferedReader br = null;
try{
//br = new BufferedReader(new InputStreamReader(new FileInputStream( filename ))); //Simple Include
@ -422,9 +422,9 @@ public final class TemplateEngine {
structure.append(ASCII.getBytes("<fileinclude file=\"")).append(filename).append(close_tagn);
structure.append(writeTemplate(pis2, out, pattern, dflt, new byte[0])); //clear pattern prefix for include
structure.append(ASCII.getBytes("</fileinclude>\n"));
include.close();
}
}
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hashChar);