mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git
This commit is contained in:
commit
214a087cdf
|
@ -48,7 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
|
|||
public class DigestURL extends MultiProtocolURL implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = -1173233022912141885L;
|
||||
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
|
||||
|
||||
// class variables
|
||||
private byte[] hash;
|
||||
|
|
|
@ -619,7 +619,7 @@ public class Segment {
|
|||
char docType = Response.docType(document.dc_format());
|
||||
|
||||
// CREATE SOLR DOCUMENT
|
||||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
|
||||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
|
||||
|
||||
// ENRICH DOCUMENT WITH RANKING INFORMATION
|
||||
if (this.connectedCitation()) {
|
||||
|
|
|
@ -201,20 +201,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
* add uri attributes to solr document
|
||||
* @param doc
|
||||
* @param allAttr
|
||||
* @param digestURI
|
||||
* @param digestURL
|
||||
* @param doctype
|
||||
* @return the normalized url
|
||||
*/
|
||||
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) {
|
||||
add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
|
||||
String us = digestURI.toNormalform(true);
|
||||
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) {
|
||||
add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
|
||||
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
|
||||
String us = digestURL.toNormalform(true);
|
||||
add(doc, CollectionSchema.sku, us);
|
||||
if (allAttr || contains(CollectionSchema.ip_s)) {
|
||||
final InetAddress address = digestURI.getInetAddress();
|
||||
final InetAddress address = digestURL.getInetAddress();
|
||||
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
|
||||
}
|
||||
String host = null;
|
||||
if ((host = digestURI.getHost()) != null) {
|
||||
if ((host = digestURL.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(host);
|
||||
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
|
||||
int p = subdomOrga.lastIndexOf('.');
|
||||
|
@ -228,17 +229,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
|
||||
// path elements of link
|
||||
String filename = digestURI.getFileName();
|
||||
String filename = digestURL.getFileName();
|
||||
String extension = MultiProtocolURL.getFileExtension(filename);
|
||||
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
|
||||
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
|
||||
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
|
||||
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
|
||||
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
|
||||
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
|
||||
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
|
||||
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));
|
||||
|
||||
|
||||
Map<String, String> searchpart = digestURI.getSearchpartMap();
|
||||
Map<String, String> searchpart = digestURL.getSearchpartMap();
|
||||
if (searchpart == null) {
|
||||
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
|
||||
} else {
|
||||
|
@ -309,7 +310,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
|
||||
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
|
||||
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
|
||||
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
|
||||
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
|
||||
if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
|
||||
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
|
||||
|
@ -357,27 +357,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
|
||||
public SolrVector yacy2solr(
|
||||
final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
|
||||
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
|
||||
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
|
||||
final IndexCell<CitationReference> citations,
|
||||
final WebgraphConfiguration webgraph) {
|
||||
// we use the SolrCell design as index schema
|
||||
SolrVector doc = new SolrVector();
|
||||
final DigestURL digestURI = document.dc_source();
|
||||
final DigestURL digestURL = document.dc_source();
|
||||
final String id = ASCII.String(digestURL.hash());
|
||||
boolean allAttr = this.isEmpty();
|
||||
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
|
||||
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
|
||||
add(doc, CollectionSchema.id, id);
|
||||
String us = digestURI.toNormalform(true);
|
||||
String us = digestURL.toNormalform(true);
|
||||
|
||||
int clickdepth = 999;
|
||||
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
|
||||
if (digestURI.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
if (digestURL.probablyRootURL()) {
|
||||
clickdepth = 0;
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
clickdepth = 999;
|
||||
}
|
||||
|
@ -712,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (refresh != null && refresh.length() > 0) {
|
||||
MultiProtocolURL refreshURL;
|
||||
try {
|
||||
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath());
|
||||
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
|
||||
if (refreshURL != null) {
|
||||
inboundLinks.remove(refreshURL);
|
||||
outboundLinks.remove(refreshURL);
|
||||
|
@ -785,7 +783,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
|
||||
String content = document.getTextString();
|
||||
String tokens = digestURI.toTokens();
|
||||
String tokens = digestURL.toTokens();
|
||||
if (content == null || content.length() == 0) {
|
||||
content = tokens;
|
||||
} else {
|
||||
|
@ -798,9 +796,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
}
|
||||
|
||||
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) {
|
||||
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
|
||||
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
|
||||
content = digestURI.toTokens(); // remove all other entry but the url tokens
|
||||
content = digestURL.toTokens(); // remove all other entry but the url tokens
|
||||
}
|
||||
|
||||
// content (must be written after special parser data, since this can influence the content)
|
||||
|
@ -824,7 +822,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// create a subgraph
|
||||
if (!containsCanonical) {
|
||||
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
|
||||
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
|
||||
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
|
||||
}
|
||||
|
||||
// list all links
|
||||
|
@ -850,7 +848,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
|
||||
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
|
||||
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
|
||||
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
|
||||
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
|
||||
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
|
||||
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
|
||||
|
@ -1269,6 +1266,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
configuration.add(doc, CollectionSchema.collection_sxt, cs);
|
||||
}
|
||||
|
||||
// clickdepth, cr and postprocessing
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
|
||||
}
|
||||
if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
|
||||
processTypes.add(ProcessType.CITATION); // postprocessing needed
|
||||
}
|
||||
if (allAttr || configuration.contains(CollectionSchema.process_sxt)) {
|
||||
List<String> p = new ArrayList<String>();
|
||||
for (ProcessType t: processTypes) p.add(t.name());
|
||||
configuration.add(doc, CollectionSchema.process_sxt, p);
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
|
|
@ -391,7 +391,6 @@ public final class TemplateEngine {
|
|||
|
||||
// #%
|
||||
} else if ((bb & 0xFF) == pcChar) { //include
|
||||
final ByteBuffer include = new ByteBuffer();
|
||||
keyStream.reset(); //reset stream
|
||||
if(transferUntil(pis, keyStream, iClose)){
|
||||
byte[] filename = keyStream.toByteArray();
|
||||
|
@ -403,6 +402,7 @@ public final class TemplateEngine {
|
|||
filename= replacePattern(patternkey, pattern, dflt);
|
||||
}
|
||||
if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
|
||||
final ByteBuffer include = new ByteBuffer();
|
||||
BufferedReader br = null;
|
||||
try{
|
||||
//br = new BufferedReader(new InputStreamReader(new FileInputStream( filename ))); //Simple Include
|
||||
|
@ -422,9 +422,9 @@ public final class TemplateEngine {
|
|||
structure.append(ASCII.getBytes("<fileinclude file=\"")).append(filename).append(close_tagn);
|
||||
structure.append(writeTemplate(pis2, out, pattern, dflt, new byte[0])); //clear pattern prefix for include
|
||||
structure.append(ASCII.getBytes("</fileinclude>\n"));
|
||||
include.close();
|
||||
}
|
||||
}
|
||||
|
||||
// # - no special character. This is simply a '#' without meaning
|
||||
} else { //no match, but a single hash (output # + bb)
|
||||
out.write(hashChar);
|
||||
|
|
Loading…
Reference in New Issue
Block a user