/** * SolrScheme * Copyright 2011 by Michael Peter Christen * First released 14.04.2011 at http://yacy.net * * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ * $LastChangedRevision: 7654 $ * $LastChangedBy: orbiter $ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . */ package net.yacy.search.index; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.services.federated.solr.SolrDoc; import net.yacy.cora.storage.ConfigurationSet; import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import org.apache.solr.common.SolrDocument; public class SolrConfiguration extends ConfigurationSet implements Serializable { private static final long serialVersionUID=-499100932212840385L; /** * initialize with an empty ConfigurationSet which will cause that all the index * attributes are used */ public SolrConfiguration() { super(); } /** * initialize the scheme with a given configuration file * the configuration file simply contains a list of lines with keywords * or keyword = value lines (while value is a custom Solr field name * @param configurationFile */ public SolrConfiguration(final File configurationFile) { super(configurationFile); // check consistency: compare with YaCyField enum if (this.isEmpty()) return; Iterator it = this.entryIterator(); for (ConfigurationSet.Entry etr = it.next(); it.hasNext(); etr = it.next()) { try { SolrField f = SolrField.valueOf(etr.key()); f.setSolrFieldName(etr.getValue()); } catch (IllegalArgumentException e) { Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'"); it.remove(); } } } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final Date value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final float value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final double value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final boolean value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value, final float boost) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value, boost); } /** * save configuration to file and update enum SolrFields * @throws IOException */ @Override public void commit() throws IOException { try { super.commit(); // make sure the enum SolrField.SolrFieldName is current Iterator it = this.entryIterator(); for (ConfigurationSet.Entry etr = it.next(); it.hasNext(); etr = it.next()) { try { SolrField f = SolrField.valueOf(etr.key()); f.setSolrFieldName(etr.getValue()); } catch (IllegalArgumentException e) { continue; } } } catch (final IOException e) {} } public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { // we use the SolrCell design as index scheme final SolrDoc solrdoc = new SolrDoc(); final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) addSolr(solrdoc, SolrField.id, id); addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); final InetAddress address = digestURI.getInetAddress(); if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); addSolr(solrdoc, SolrField.title, yacydoc.dc_title()); addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); addSolr(solrdoc, SolrField.last_modified, header.lastModified()); addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); final String content = UTF8.String(yacydoc.getTextBytes()); addSolr(solrdoc, SolrField.text_t, content); if (isEmpty() || contains(SolrField.wordcount_i.name())) { final int contentwc = content.split(" ").length; addSolr(solrdoc, SolrField.wordcount_i, contentwc); } // path elements of link final String path = digestURI.getPath(); if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) { final String[] paths = path.split("/"); if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); } // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme Set inboundLinks = yacydoc.inboundLinks(); Set ouboundLinks = yacydoc.outboundLinks(); int c = 0; final Object parser = yacydoc.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; // header tags int h = 0; int f = 1; String[] hs; hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h1_txt, hs); hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h2_txt, hs); hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h3_txt, hs); hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h4_txt, hs); hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h5_txt, hs); hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h6_txt, hs); addSolr(solrdoc, SolrField.htags_i, h); // canonical tag if (html.getCanonical() != null) addSolr(solrdoc, SolrField.canonical_s, html.getCanonical().toNormalform(false, false)); // noindex and nofollow attributes // from HTML (meta-tag in HTML header: robots) // and HTTP header (x-robots property) // coded as binary value: // bit 0: "all" contained in html header meta // bit 1: "index" contained in html header meta // bit 2: "noindex" contained in html header meta // bit 3: "nofollow" contained in html header meta // bit 8: "noarchive" contained in http header properties // bit 9: "nosnippet" contained in http header properties // bit 10: "noindex" contained in http header properties // bit 11: "nofollow" contained in http header properties // bit 12: "unavailable_after" contained in http header properties int b = 0; final String robots_meta = html.getMetas().get("robots"); // this tag may have values: all, index, noindex, nofollow if (robots_meta != null) { if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0 if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1 if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2 if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3 } String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, ""); if (x_robots_tag.length() == 0) x_robots_tag = header.get(HeaderFramework.X_ROBOTS, ""); // this tag may have values: noarchive, nosnippet, noindex, unavailable_after if (x_robots_tag.length() > 0) { if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8 if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9 if (x_robots_tag.indexOf("noindex",0) >= 0) b += 1024; // set bit 10 if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11 if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12 } addSolr(solrdoc, SolrField.robots_i, b); // meta tags: generator final String generator = html.getMetas().get("generator"); if (generator != null) addSolr(solrdoc, SolrField.metagenerator_t, generator); // bold, italic final String[] bold = html.getBold(); addSolr(solrdoc, SolrField.boldcount_i, bold.length); if (bold.length > 0) { addSolr(solrdoc, SolrField.bold_txt, bold); if (isEmpty() || contains(SolrField.bold_val.name())) { addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold)); } } final String[] italic = html.getItalic(); addSolr(solrdoc, SolrField.italiccount_i, italic.length); if (italic.length > 0) { addSolr(solrdoc, SolrField.italic_txt, italic); if (isEmpty() || contains(SolrField.italic_val.name())) { addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic)); } } final String[] li = html.getLi(); addSolr(solrdoc, SolrField.licount_i, li.length); if (li.length > 0) addSolr(solrdoc, SolrField.li_txt, li); // images final Collection imagesc = html.getImages().values(); final String[] imgtags = new String[imagesc.size()]; final String[] imgprots = new String[imagesc.size()]; final String[] imgstubs = new String[imagesc.size()]; final String[] imgalts = new String[imagesc.size()]; c = 0; for (final ImageEntry ie: imagesc) { final MultiProtocolURI uri = ie.url(); inboundLinks.remove(uri); ouboundLinks.remove(uri); imgtags[c] = ie.toString(); imgprots[c] = uri.getProtocol(); imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3); imgalts[c] = ie.alt(); c++; } addSolr(solrdoc, SolrField.imagescount_i, imgtags.length); if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags); if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots)); if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs); if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts); // style sheets if (isEmpty() || contains(SolrField.css_tag_txt.name())) { final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; c = 0; for (final Map.Entry entry: csss.entrySet()) { final String url = entry.getKey().toNormalform(false, false); inboundLinks.remove(url); ouboundLinks.remove(url); css_tag[c] = ""; css_url[c] = url; c++; } addSolr(solrdoc, SolrField.csscount_i, css_tag.length); if (css_tag.length > 0) addSolr(solrdoc, SolrField.css_tag_txt, css_tag); if (css_url.length > 0) addSolr(solrdoc, SolrField.css_url_txt, css_url); } // Scripts if (isEmpty() || contains(SolrField.scripts_txt.name())) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; for (final MultiProtocolURI url: scriptss) { inboundLinks.remove(url); ouboundLinks.remove(url); scripts[c++] = url.toNormalform(false, false); } addSolr(solrdoc, SolrField.scriptscount_i, scripts.length); if (scripts.length > 0) addSolr(solrdoc, SolrField.scripts_txt, scripts); } // Frames if (isEmpty() || contains(SolrField.frames_txt.name())) { final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; for (final MultiProtocolURI url: framess) { inboundLinks.remove(url); ouboundLinks.remove(url); frames[c++] = url.toNormalform(false, false); } addSolr(solrdoc, SolrField.framesscount_i, frames.length); if (frames.length > 0) addSolr(solrdoc, SolrField.frames_txt, frames); } // IFrames if (isEmpty() || contains(SolrField.iframes_txt.name())) { final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; for (final MultiProtocolURI url: iframess) { inboundLinks.remove(url); ouboundLinks.remove(url); iframes[c++] = url.toNormalform(false, false); } addSolr(solrdoc, SolrField.iframesscount_i, iframes.length); if (iframes.length > 0) addSolr(solrdoc, SolrField.iframes_txt, iframes); } // flash embedded addSolr(solrdoc, SolrField.flash_b, html.containsFlash()); // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { if (isEmpty() || contains("ext_" + model + "_txt")) { final String[] scorenames = html.getEvaluationModelScoreNames(model); if (scorenames.length > 0) { addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames); addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames)); } } } // response time addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); } // list all links final Map alllinks = yacydoc.getAnchors(); c = 0; if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size()); if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); final String[] inboundlinksTag = new String[inboundLinks.size()]; final String[] inboundlinksURLProtocol = new String[inboundLinks.size()]; final String[] inboundlinksURLStub = new String[inboundLinks.size()]; final String[] inboundlinksName = new String[inboundLinks.size()]; final String[] inboundlinksRel = new String[inboundLinks.size()]; final String[] inboundlinksText = new String[inboundLinks.size()]; for (final MultiProtocolURI url: inboundLinks) { final Properties p = alllinks.get(url); final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the tag final String urls = url.toNormalform(false, false); final int pr = urls.indexOf("://",0); inboundlinksURLProtocol[c] = urls.substring(0, pr); inboundlinksURLStub[c] = urls.substring(pr + 3); inboundlinksName[c] = name.length() > 0 ? name : ""; inboundlinksRel[c] = rel.length() > 0 ? rel : ""; inboundlinksText[c] = text.length() > 0 ? text : ""; inboundlinksTag[c] = " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""; c++; } if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag); if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub); if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName); if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel); if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel)); if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText); c = 0; if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size()); if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); final String[] outboundlinksTag = new String[ouboundLinks.size()]; final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()]; final String[] outboundlinksURLStub = new String[ouboundLinks.size()]; final String[] outboundlinksName = new String[ouboundLinks.size()]; final String[] outboundlinksRel = new String[ouboundLinks.size()]; final String[] outboundlinksText = new String[ouboundLinks.size()]; for (final MultiProtocolURI url: ouboundLinks) { final Properties p = alllinks.get(url); final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the tag final String urls = url.toNormalform(false, false); final int pr = urls.indexOf("://",0); outboundlinksURLProtocol[c] = urls.substring(0, pr); outboundlinksURLStub[c] = urls.substring(pr + 3); outboundlinksName[c] = name.length() > 0 ? name : ""; outboundlinksRel[c] = rel.length() > 0 ? rel : ""; outboundlinksText[c] = text.length() > 0 ? text : ""; outboundlinksTag[c] = " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""; c++; } if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag); if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub); if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName); if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel); if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel)); if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText); // charset addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset()); // coordinates if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } addSolr(solrdoc, SolrField.httpstatus_i, 200); return solrdoc; } private static String[] protocolList2indexedList(String[] protocol) { List a = new ArrayList(); for (int i = 0; i < protocol.length; i++) { if (!protocol[i].equals("http")) { String c = Integer.toString(i); while (c.length() < 3) c = "0" + c; a.add(c + "-" + protocol[i]); } } return a.toArray(new String[a.size()]); } /** * encode a string containing attributes from anchor rel properties binary: * bit 0: "me" contained in rel * bit 1: "nofollow" contained in rel * @param rel * @return binary encoded information about rel */ private static int relEval(final String[] rel) { int i = 0; for (final String s: rel) { final String s0 = s.toLowerCase().trim(); if ("me".equals(s0)) i += 1; if ("nofollow".equals(s0)) i += 2; } return i; } public String solrGetID(final SolrDocument solr) { return (String) solr.getFieldValue(SolrField.id.getSolrFieldName()); } public DigestURI solrGetURL(final SolrDocument solr) { try { return new DigestURI((String) solr.getFieldValue(SolrField.sku.getSolrFieldName())); } catch (final MalformedURLException e) { return null; } } public String solrGetTitle(final SolrDocument solr) { return (String) solr.getFieldValue(SolrField.title.getSolrFieldName()); } public String solrGetText(final SolrDocument solr) { return (String) solr.getFieldValue(SolrField.text_t.getSolrFieldName()); } public String solrGetAuthor(final SolrDocument solr) { return (String) solr.getFieldValue(SolrField.author.getSolrFieldName()); } public String solrGetDescription(final SolrDocument solr) { return (String) solr.getFieldValue(SolrField.description.getSolrFieldName()); } public Date solrGetDate(final SolrDocument solr) { return (Date) solr.getFieldValue(SolrField.last_modified.getSolrFieldName()); } public Collection solrGetKeywords(final SolrDocument solr) { final Collection c = solr.getFieldValues(SolrField.keywords.getSolrFieldName()); final ArrayList a = new ArrayList(); for (final Object s: c) { a.add((String) s); } return a; } /** * register an entry as error document * @param digestURI * @param failReason * @param httpstatus * @throws IOException */ public SolrDoc err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { final SolrDoc solrdoc = new SolrDoc(); addSolr(solrdoc, SolrField.id, ASCII.String(digestURI.hash())); addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); final InetAddress address = digestURI.getInetAddress(); if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); // path elements of link final String path = digestURI.getPath(); if (path != null) { final String[] paths = path.split("/"); if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); } addSolr(solrdoc, SolrField.failreason_t, failReason); addSolr(solrdoc, SolrField.httpstatus_i, httpstatus); return solrdoc; } /* standard solr schema */ }