mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added Solr field 'refresh_s', which stores the refresh URL contained in
the meta-refresh HTML header field.
This commit is contained in:
parent
f3167def64
commit
508a81b86c
|
@ -184,6 +184,9 @@ host_s
|
|||
## url inside the canonical link element, string
|
||||
canonical_s
|
||||
|
||||
## target url taken from the url= part of the meta refresh tag, string
|
||||
refresh_s
|
||||
|
||||
## all texts in <li> tags, textgen
|
||||
li_txt
|
||||
|
||||
|
|
|
@ -807,7 +807,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
final int pos = s.indexOf(';');
|
||||
if (pos < 0) return EMPTY_STRING;
|
||||
s = s.substring(pos + 1);
|
||||
s = s.substring(pos + 1).trim();
|
||||
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
|
||||
return EMPTY_STRING;
|
||||
}
|
||||
|
|
|
@ -354,6 +354,24 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
}
|
||||
}
|
||||
|
||||
// meta refresh tag
|
||||
if (isEmpty() || contains(SolrField.refresh_s.name())) {
|
||||
String refresh = html.getRefreshPath();
|
||||
if (refresh != null && refresh.length() > 0) {
|
||||
MultiProtocolURI refreshURL;
|
||||
try {
|
||||
refreshURL = refresh.startsWith("http") ? new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath());
|
||||
if (refreshURL != null) {
|
||||
inboundLinks.remove(refreshURL);
|
||||
ouboundLinks.remove(refreshURL);
|
||||
addSolr(solrdoc, SolrField.refresh_s, refreshURL.toNormalform(false, false));
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
addSolr(solrdoc, SolrField.refresh_s, refresh);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// flash embedded
|
||||
if (isEmpty() || contains(SolrField.flash_b.name())) {
|
||||
MultiProtocolURI[] flashURLs = html.getFlash();
|
||||
|
|
|
@ -82,6 +82,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
|
|||
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
|
||||
htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
|
||||
canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
|
||||
refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
|
||||
metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
|
||||
boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
|
||||
bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
|
||||
|
@ -150,6 +151,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
|
|||
* Returns the YaCy default or (if available) custom field name for Solr
|
||||
* @return SolrFieldname String
|
||||
*/
|
||||
@Override
|
||||
public final String getSolrFieldName() {
|
||||
// fall back to the enum constant's own name when no custom Solr field name has been set (solrFieldName is null)
return (this.solrFieldName == null ? this.name() : this.solrFieldName);
|
||||
}
|
||||
|
@ -167,26 +169,32 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
|
|||
}
|
||||
}
|
||||
|
||||
/** Returns the {@code SolrType} stored in this constant's {@code type} field. */
@Override
|
||||
public final SolrType getType() {
|
||||
return this.type;
|
||||
}
|
||||
|
||||
/** Returns this constant's {@code indexed} flag. NOTE(review): presumably mirrors the Solr schema attribute of the same name - confirm against the constructor. */
@Override
|
||||
public final boolean isIndexed() {
|
||||
return this.indexed;
|
||||
}
|
||||
|
||||
/** Returns this constant's {@code stored} flag. NOTE(review): presumably mirrors the Solr schema attribute of the same name - confirm against the constructor. */
@Override
|
||||
public final boolean isStored() {
|
||||
return this.stored;
|
||||
}
|
||||
|
||||
/** Returns this constant's {@code multiValued} flag. NOTE(review): presumably mirrors the Solr schema attribute of the same name - confirm against the constructor. */
@Override
|
||||
public final boolean isMultiValued() {
|
||||
return this.multiValued;
|
||||
}
|
||||
|
||||
/** Returns this constant's {@code omitNorms} flag. NOTE(review): presumably mirrors the Solr schema attribute of the same name - confirm against the constructor. */
@Override
|
||||
public final boolean isOmitNorms() {
|
||||
return this.omitNorms;
|
||||
}
|
||||
|
||||
/** Returns this constant's {@code comment} text. NOTE(review): presumably the description string passed in the enum constant declaration - confirm against the constructor. */
@Override
|
||||
public final String getComment() {
|
||||
return this.comment;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user