Merge branch 'master' of github.com:yacy/yacy_search_server

Michael Peter Christen 2024-05-19 17:35:24 +02:00
commit f1c70dce33
5 changed files with 65 additions and 39 deletions

Dockerfile

@@ -6,12 +6,14 @@
 # run with
 # docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:latest
+## build base
+FROM eclipse-temurin:11-jdk-jammy AS base
+RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
 ## build app
-FROM eclipse-temurin:11-jdk-jammy AS appbuilder
+FROM base AS appbuilder
-RUN apt-get update && apt-get install -yq ant git curl wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -yq ant git curl && rm -rf /var/lib/apt/lists/*
 RUN java -version
 WORKDIR /opt
@@ -20,8 +22,12 @@ COPY . /opt/yacy_search_server/
 RUN ant compile -f /opt/yacy_search_server/build.xml && \
 apt-get purge -yq --auto-remove ant && \
 apt-get clean && \
-rm -rf /var/lib/apt/lists/* && \
-rm -rf /opt/yacy_search_server/.git
+rm -rf /var/lib/apt/lists/*
+WORKDIR /opt/yacy_search_server/
+RUN git rev-parse HEAD > .git/shallow && \
+git tag -l | xargs git tag -d && \
+git gc --prune=now

 # Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex())
 RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \
@@ -31,16 +37,13 @@ RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a09
 ## build dist
-FROM eclipse-temurin:11-jre-jammy
+FROM base
 LABEL maintainer="Michael Peter Christen <mc@yacy.net>"
-RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
+RUN adduser --system --group --no-create-home --disabled-password yacy
 WORKDIR /opt
-COPY . /opt/yacy_search_server/
-COPY --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server
+COPY --chown=yacy:yacy --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server
-RUN adduser --system --group --no-create-home --disabled-password yacy && chown yacy:yacy -R /opt/yacy_search_server
 EXPOSE 8090 8443
 VOLUME ["/opt/yacy_search_server/DATA"]
 USER yacy
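The dist stage now inherits the snapshot tooling (wkhtmltopdf, imagemagick, xvfb, ghostscript) from the shared base stage and copies the compiled tree with --chown instead of a separate recursive chown layer. The seeded admin credential is an "MD5:<hex>" string produced by YaCy's net.yacy.cora.order.Digest.encodeMD5Hex(). A minimal JDK-only sketch of that encoding; the input "admin:yacy" is a hypothetical concatenation, since the exact string YaCy hashes is defined by Digest's callers and not shown in this diff:

// Minimal sketch (not YaCy code) of producing an "MD5:<hex>" credential
// like the one written into defaults/yacy.init above.
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class Md5HexSketch {
    static String encodeMD5Hex(final String key) throws Exception {
        // MD5 over the UTF-8 bytes, rendered as lower-case hex
        final byte[] digest = MessageDigest.getInstance("MD5")
                .digest(key.getBytes(StandardCharsets.UTF_8));
        final StringBuilder hex = new StringBuilder(32);
        for (final byte b : digest) hex.append(String.format("%02x", b));
        return hex.toString();
    }

    public static void main(final String[] args) throws Exception {
        // hypothetical input; YaCy defines the actual concatenation
        System.out.println("MD5:" + encodeMD5Hex("admin:yacy"));
    }
}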

Dockerfile (arm64 variant)

@@ -5,7 +5,7 @@ FROM arm64v8/openjdk:17-buster
 # Install needed packages not in base image
 # (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation)
-RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \
+RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript ca-certificates-java && \
 rm -rf /var/lib/apt/lists/*
 # trace java version

ivy.xml

@@ -72,22 +72,22 @@
 <dependency org="org.codehaus.woodstox" name="woodstox-core-asl" rev="4.4.1">
 <exclude module="stax-api" />
 </dependency>
-<dependency org="org.eclipse.jetty" name="jetty-client" rev="9.4.52.v20230823" />
-<dependency org="org.eclipse.jetty" name="jetty-deploy" rev="9.4.52.v20230823" conf="compile->master" />
-<dependency org="org.eclipse.jetty" name="jetty-jmx" rev="9.4.52.v20230823" conf="compile->master"/>
-<dependency org="org.eclipse.jetty" name="jetty-http" rev="9.4.52.v20230823"/>
-<dependency org="org.eclipse.jetty" name="jetty-proxy" rev="9.4.52.v20230823"/>
-<dependency org="org.eclipse.jetty" name="jetty-security" rev="9.4.52.v20230823"/>
-<dependency org="org.eclipse.jetty" name="jetty-server" rev="9.4.52.v20230823"/>
-<dependency org="org.eclipse.jetty" name="jetty-servlets" rev="9.4.52.v20230823"/>
-<dependency org="org.eclipse.jetty" name="jetty-servlet" rev="9.4.52.v20230823">
+<dependency org="org.eclipse.jetty" name="jetty-client" rev="9.4.54.v20240208" />
+<dependency org="org.eclipse.jetty" name="jetty-deploy" rev="9.4.54.v20240208" conf="compile->master" />
+<dependency org="org.eclipse.jetty" name="jetty-jmx" rev="9.4.54.v20240208" conf="compile->master"/>
+<dependency org="org.eclipse.jetty" name="jetty-http" rev="9.4.54.v20240208"/>
+<dependency org="org.eclipse.jetty" name="jetty-proxy" rev="9.4.54.v20240208"/>
+<dependency org="org.eclipse.jetty" name="jetty-security" rev="9.4.54.v20240208"/>
+<dependency org="org.eclipse.jetty" name="jetty-server" rev="9.4.54.v20240208"/>
+<dependency org="org.eclipse.jetty" name="jetty-servlets" rev="9.4.54.v20240208"/>
+<dependency org="org.eclipse.jetty" name="jetty-servlet" rev="9.4.54.v20240208">
 <exclude module="jetty-util-ajax" />
 </dependency>
-<dependency org="org.eclipse.jetty" name="jetty-util" rev="9.4.52.v20230823" />
-<dependency org="org.eclipse.jetty" name="jetty-webapp" rev="9.4.52.v20230823" />
-<dependency org="org.eclipse.jetty.http2" name="http2-client" rev="9.4.52.v20230823" conf="compile->master"/>
-<dependency org="org.eclipse.jetty.http2" name="http2-common" rev="9.4.52.v20230823" conf="compile->master"/>
-<dependency org="org.eclipse.jetty.http2" name="http2-http-client-transport" rev="9.4.52.v20230823" conf="compile->master"/>
+<dependency org="org.eclipse.jetty" name="jetty-util" rev="9.4.54.v20240208" />
+<dependency org="org.eclipse.jetty" name="jetty-webapp" rev="9.4.54.v20240208" />
+<dependency org="org.eclipse.jetty.http2" name="http2-client" rev="9.4.54.v20240208" conf="compile->master"/>
+<dependency org="org.eclipse.jetty.http2" name="http2-common" rev="9.4.54.v20240208" conf="compile->master"/>
+<dependency org="org.eclipse.jetty.http2" name="http2-http-client-transport" rev="9.4.54.v20240208" conf="compile->master"/>
 <dependency org="org.jsoup" name="jsoup" rev="1.15.3" />
 <dependency org="org.jwat" name="jwat-warc" rev="1.1.3" />
 <dependency org="org.locationtech.spatial4j" name="spatial4j" rev="0.8"/>

JsonListImporter.java

@@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer {
 }
 if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
 final SolrInputDocument surrogate = new SolrInputDocument();
+// set default values which act as constraints for a proper search
+CollectionSchema.httpstatus_i.add(surrogate, 200);
+// get fields for json object
 jsonreader: for (final String key: json.keySet()) {
 final Object o = json.opt(key);
 if (o == null) continue;
@@ -212,10 +217,19 @@
 final String id = ASCII.String(durl.hash());
 surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
 surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
+surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
 surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
 continue jsonreader;
 }
+if (key.equals("description")) {
+// in YaCy descriptions are full-text indexed and also multi-value fields
+final List<Object> descriptions = new ArrayList<>();
+descriptions.add(o.toString());
+CollectionSchema.description_txt.add(surrogate, descriptions);
+continue jsonreader;
+}
 if (key.equals("referrer_url_s")) {
+// same patch as for urls which require re-calculation of id's; in this case we store the id only!
 final DigestURL durl = new DigestURL(o.toString());
 final String id = ASCII.String(durl.hash());
 surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
@@ -236,6 +250,12 @@
 continue jsonreader;
 }
+// check if required fields are still missing and compute them
+if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
+final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+}
+// regular situation, just read content of field
 surrogate.setField(key, o.toString());
 }
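For orientation, a hedged sketch of one line of a .jsonlist/.flatjson dump and the Solr fields the importer above derives from it. The key name "url" is an assumption (the URL-handling branch is cut off in the hunk), and org.json is assumed as the JSON library, matching the json.opt()/json.keySet() calls in the diff:

// Hedged sketch (not YaCy code): what one input line looks like and which
// CollectionSchema fields the importer fills from it.
import org.json.JSONObject;

public class JsonListLineSketch {
    public static void main(final String[] args) throws Exception {
        final String line = "{\"url\":\"https://example.org/page\","
                + "\"description\":\"A sample page\","
                + "\"referrer_url_s\":\"https://example.org/\"}";
        final JSONObject json = new JSONObject(line);
        // The importer would fill, among others:
        //   sku             = normalized URL
        //   id              = DigestURL hash of the URL
        //   host_s          = example.org (backfilled from sku when missing)
        //   host_id_s       = id.substring(6)
        //   description_txt = ["A sample page"] (multi-value, full-text indexed)
        //   referrer_id_s   = DigestURL hash of referrer_url_s (id only, not the URL)
        //   httpstatus_i    = 200 (default constraint set before reading keys)
        for (final String key : json.keySet()) {
            System.out.println(key + " -> " + json.opt(key));
        }
    }
}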

Switchboard.java

@@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch {
 || s.endsWith(".xml.zip")
 || s.endsWith(".warc")
 || s.endsWith(".warc.gz")
+|| s.endsWith(".jsonl")
+|| s.endsWith(".jsonl.gz")
 || s.endsWith(".jsonlist")
+|| s.endsWith(".jsonlist.gz")
 || s.endsWith(".flatjson") ) {
 count++;
 }
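This suffix list gates which files in the surrogate input directory are picked up for import; the change adds the .jsonl, .jsonl.gz and .jsonlist.gz variants. A stand-alone sketch of the same gate; the list is abridged to the extensions visible in this hunk, while the real check in Switchboard is a chain of String.endsWith() calls:

// Stand-alone sketch of the surrogate file-type gate above.
import java.util.List;

public class SurrogateSuffixSketch {
    static final List<String> SUFFIXES = List.of(
            ".xml.zip", ".warc", ".warc.gz",
            ".jsonl", ".jsonl.gz", ".jsonlist", ".jsonlist.gz", ".flatjson");

    static boolean isSurrogate(final String name) {
        return SUFFIXES.stream().anyMatch(name::endsWith);
    }

    public static void main(final String[] args) {
        System.out.println(isSurrogate("crawl-dump.jsonl.gz")); // true (new in this commit)
        System.out.println(isSurrogate("notes.txt"));           // false
    }
}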
@@ -3167,9 +3170,9 @@
 }
 // check mustmatch pattern
-Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
 if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
-String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
 if (this.log.isInfo()) this.log.info(info);
 // create a new errorURL DB entry
 this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3177,9 +3180,9 @@
 }
 // check mustnotmatch
-Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
 if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
-String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
+final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
 if (this.log.isInfo()) this.log.info(info);
 // create a new errorURL DB entry
 this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
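Both URL gates share one shape: the sentinel patterns (CrawlProfile.MATCH_ALL_PATTERN, MATCH_NEVER_PATTERN) are compared by object identity, so profiles that match everything or nothing skip regex evaluation entirely. A reduced sketch with plain java.util.regex; the pattern string is a hypothetical crawl-profile value:

// Reduced sketch of the must-match URL gate above.
import java.util.regex.Pattern;

public class UrlGateSketch {
    // sentinel: a profile holding exactly this object means "match all"
    static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");

    public static void main(final String[] args) {
        final Pattern mustmatchurl = Pattern.compile("https?://example\\.org/docs/.*");
        final String urls = "https://example.org/blog/post";
        // identity check first: the common "match all" case costs nothing
        if (mustmatchurl != MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
            System.out.println("indexing prevented; indexUrlMustMatchPattern = "
                    + mustmatchurl.pattern());
        }
    }
}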
@@ -3192,13 +3195,13 @@
 // check canonical
 if (profile.noindexWhenCanonicalUnequalURL()) {
-AnchorURL canonical = document.getCanonical();
-DigestURL source = document.dc_source();
+final AnchorURL canonical = document.getCanonical();
+final DigestURL source = document.dc_source();
 if (canonical != null && source != null) {
-String canonical_norm = canonical.toNormalform(true);
-String source_norm = source.toNormalform(true);
+final String canonical_norm = canonical.toNormalform(true);
+final String source_norm = source.toNormalform(true);
 if (!canonical_norm.equals(source_norm)) {
-String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
+final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
 if (this.log.isInfo()) this.log.info(info);
 // create a new errorURL DB entry
 this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
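The canonical gate refuses to index a page whose canonical link target, after normalization, differs from the fetched URL, which suppresses duplicate variants such as tracking-parameter URLs. A reduced sketch; java.net.URI.normalize() stands in for YaCy's richer toNormalform(true):

// Reduced sketch of the canonical-vs-source comparison above.
import java.net.URI;

public class CanonicalGateSketch {
    public static void main(final String[] args) {
        // hypothetical page: fetched with a tracking parameter,
        // declaring a clean canonical URL
        final String source_norm =
                URI.create("https://example.org/article?utm=x").normalize().toString();
        final String canonical_norm =
                URI.create("https://example.org/article").normalize().toString();
        if (!canonical_norm.equals(source_norm)) {
            System.out.println("denied, canonical != source; canonical = "
                    + canonical_norm + "; source = " + source_norm);
        }
    }
}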
@@ -3216,9 +3219,9 @@
 }
 // check content pattern must-match
-Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
 if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
-String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
+final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
 if (this.log.isInfo()) this.log.info(info);
 // create a new errorURL DB entry
 this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3226,9 +3229,9 @@
 }
 // check content pattern must-not-match
-Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
+final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
 if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
-String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
+final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
 if (this.log.isInfo()) this.log.info(info);
 // create a new errorURL DB entry
 this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);