Added Solr field 'refresh_s', which stores the refresh URL contained in

the meta-refresh HTML header field.
This commit is contained in:
Michael Peter Christen 2012-06-28 13:27:45 +02:00
parent f3167def64
commit 508a81b86c
4 changed files with 30 additions and 1 deletions

View File

@ -184,6 +184,9 @@ host_s
## url inside the canonical link element, string
canonical_s
## link from the url property inside the refresh link element, string
refresh_s
## all texts in <li> tags, textgen
li_txt

View File

@ -807,7 +807,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final int pos = s.indexOf(';');
if (pos < 0) return EMPTY_STRING;
s = s.substring(pos + 1);
s = s.substring(pos + 1).trim();
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
return EMPTY_STRING;
}

View File

@ -354,6 +354,24 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
}
// meta refresh tag
if (isEmpty() || contains(SolrField.refresh_s.name())) {
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) {
MultiProtocolURI refreshURL;
try {
refreshURL = refresh.startsWith("http") ? new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
ouboundLinks.remove(refreshURL);
addSolr(solrdoc, SolrField.refresh_s, refreshURL.toNormalform(false, false));
}
} catch (MalformedURLException e) {
addSolr(solrdoc, SolrField.refresh_s, refresh);
}
}
}
// flash embedded
if (isEmpty() || contains(SolrField.flash_b.name())) {
MultiProtocolURI[] flashURLs = html.getFlash();

View File

@ -82,6 +82,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
@ -150,6 +151,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
* Returns the YaCy default or (if available) custom field name for Solr
* @return SolrFieldname String
*/
@Override
public final String getSolrFieldName() {
    // a custom field name, when one is available, wins over the enum constant's own name
    if (solrFieldName != null) {
        return solrFieldName;
    }
    return name();
}
@ -167,26 +169,32 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
}
}
/**
 * Returns the Solr data type of this field.
 * @return the SolrType held by this enum constant
 */
@Override
public final SolrType getType() {
    return type;
}
/**
 * Returns the 'indexed' flag of this field.
 * @return the value of the indexed attribute held by this enum constant
 */
@Override
public final boolean isIndexed() {
    return indexed;
}
/**
 * Returns the 'stored' flag of this field.
 * @return the value of the stored attribute held by this enum constant
 */
@Override
public final boolean isStored() {
    return stored;
}
/**
 * Returns the 'multiValued' flag of this field.
 * @return the value of the multiValued attribute held by this enum constant
 */
@Override
public final boolean isMultiValued() {
    return multiValued;
}
/**
 * Returns the 'omitNorms' flag of this field.
 * @return the value of the omitNorms attribute held by this enum constant
 */
@Override
public final boolean isOmitNorms() {
    return omitNorms;
}
/**
 * Returns the descriptive comment of this field.
 * @return the comment text held by this enum constant
 */
@Override
public final String getComment() {
    return comment;
}