more tolerance when creating Solr document

This commit is contained in:
orbiter 2012-07-04 21:15:38 +02:00
parent 78fc3cf8f8
commit d4291ac1f3

View File

@ -41,7 +41,6 @@ import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrDoc;
@ -107,6 +106,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
/**
 * Writes a string-array field into the given Solr document, subject to the
 * field selection of this configuration and to the lazy setting.
 * <p>
 * The field is considered only when this configuration is empty or contains
 * the field's name. When {@code lazy} is set, a {@code null} or zero-length
 * array is not written.
 *
 * @param solrdoc the document receiving the field
 * @param key the field to write
 * @param value the values for the field
 */
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) {
    // a non-empty configuration that does not list this field excludes it
    if (!isEmpty() && !contains(key.name())) {
        return;
    }
    // in lazy mode, fields without any value are left out
    final boolean hasValues = value != null && value.length > 0;
    if (this.lazy && !hasValues) {
        return;
    }
    solrdoc.addSolr(key, value);
}
/**
 * Writes a string-list field into the given Solr document, subject to the
 * field selection of this configuration and to the lazy setting.
 * <p>
 * The field is considered only when this configuration is empty or contains
 * the field's name. When {@code lazy} is set, a {@code null} or empty list
 * is not written.
 *
 * @param solrdoc the document receiving the field
 * @param key the field to write
 * @param value the values for the field
 */
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) {
    // a non-empty configuration that does not list this field excludes it
    if (!isEmpty() && !contains(key.name())) {
        return;
    }
    // in lazy mode, fields without any value are left out
    final boolean hasValues = value != null && value.size() > 0;
    if (this.lazy && !hasValues) {
        return;
    }
    solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || value > 0)) solrdoc.addSolr(key, value);
@ -162,7 +165,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
addSolr(solrdoc, SolrField.last_modified, header.lastModified());
addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes());
final String content = yacydoc.getTextString();
addSolr(solrdoc, SolrField.text_t, content);
if (isEmpty() || contains(SolrField.wordcount_i.name())) {
final int contentwc = content.split(" ").length;
@ -260,22 +263,21 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// images
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] imgtags = new String[imagesc.size()];
final String[] imgprots = new String[imagesc.size()];
final String[] imgstubs = new String[imagesc.size()];
final String[] imgalts = new String[imagesc.size()];
c = 0;
final List<String> imgtags = new ArrayList<String>(imagesc.size());
final List<String> imgprots = new ArrayList<String>(imagesc.size());
final List<String> imgstubs = new ArrayList<String>(imagesc.size());
final List<String> imgalts = new ArrayList<String>(imagesc.size());
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri);
ouboundLinks.remove(uri);
imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
imgalts[c] = ie.alt();
c++;
imgtags.add(ie.toString());
String protocol = uri.getProtocol();
imgprots.add(protocol);
imgstubs.add(uri.toString().substring(protocol.length() + 3));
imgalts.add(ie.alt());
}
addSolr(solrdoc, SolrField.imagescount_i, imgtags.length);
addSolr(solrdoc, SolrField.imagescount_i, imgtags.size());
if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags);
if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs);
@ -403,30 +405,31 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
c = 0;
if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[inboundLinks.size()];
final String[] inboundlinksURLProtocol = new String[inboundLinks.size()];
final String[] inboundlinksURLStub = new String[inboundLinks.size()];
final String[] inboundlinksName = new String[inboundLinks.size()];
final String[] inboundlinksRel = new String[inboundLinks.size()];
final String[] inboundlinksText = new String[inboundLinks.size()];
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksName = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksRel = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksText = new ArrayList<String>(inboundLinks.size());
for (final MultiProtocolURI url: inboundLinks) {
final Properties p = alllinks.get(url);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
inboundlinksURLProtocol.add(urls.substring(0, pr));
inboundlinksURLStub.add(urls.substring(pr + 3));
inboundlinksName.add(name.length() > 0 ? name : "");
inboundlinksRel.add(rel.length() > 0 ? rel : "");
inboundlinksText.add(text.length() > 0 ? text : "");
inboundlinksTag.add(
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
((text.length() > 0) ? text : "") + "</a>");
c++;
}
if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag);
@ -440,30 +443,31 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
c = 0;
if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[ouboundLinks.size()];
final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()];
final String[] outboundlinksURLStub = new String[ouboundLinks.size()];
final String[] outboundlinksName = new String[ouboundLinks.size()];
final String[] outboundlinksRel = new String[ouboundLinks.size()];
final String[] outboundlinksText = new String[ouboundLinks.size()];
final List<String> outboundlinksTag = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLProtocol = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLStub = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksName = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksRel = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksText = new ArrayList<String>(ouboundLinks.size());
for (final MultiProtocolURI url: ouboundLinks) {
final Properties p = alllinks.get(url);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
outboundlinksURLProtocol.add(urls.substring(0, pr));
outboundlinksURLStub.add(urls.substring(pr + 3));
outboundlinksName.add(name.length() > 0 ? name : "");
outboundlinksRel.add(rel.length() > 0 ? rel : "");
outboundlinksText.add(text.length() > 0 ? text : "");
outboundlinksTag.add(
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
((text.length() > 0) ? text : "") + "</a>");
c++;
}
if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag);
@ -474,7 +478,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset());
@ -488,16 +491,18 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
return solrdoc;
}
/**
 * Builds a position-tagged list of all protocols that are not "http".
 * <p>
 * For every index {@code i} whose entry differs from "http", the result
 * receives the string {@code "<i>-<protocol>"}, where the index is left-padded
 * with zeros to at least three digits (for example "001-https"). Entries equal
 * to "http" produce no output, so the result can be shorter than the input.
 * A {@code null} entry is not handled: {@code equals} is invoked on the entry itself.
 * <p>
 * NOTE(review): this span shows an array-based and a List-based variant of the
 * signature, loop header, add call and return statement next to each other;
 * presumably the List-based lines are the current ones - confirm against the file.
 *
 * @param protocol the protocols, in positional order
 * @return the tagged entries for all non-"http" positions, in input order
 */
private static String[] protocolList2indexedList(String[] protocol) {
private static List<String> protocolList2indexedList(List<String> protocol) {
List<String> a = new ArrayList<String>();
for (int i = 0; i < protocol.length; i++) {
if (!protocol[i].equals("http")) {
String p;
for (int i = 0; i < protocol.size(); i++) {
p = protocol.get(i);
// only positions with a protocol other than "http" are recorded
if (!p.equals("http")) {
// zero-pad the position to a minimum width of three characters
String c = Integer.toString(i);
while (c.length() < 3) c = "0" + c;
a.add(c + "-" + protocol[i]);
a.add(c + "-" + p);
}
}
return a.toArray(new String[a.size()]);
return a;
}
/**
@ -507,7 +512,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param rel
* @return binary encoded information about rel
*/
private static int relEval(final String[] rel) {
private static int relEval(final List<String> rel) {
int i = 0;
for (final String s: rel) {
final String s0 = s.toLowerCase().trim();