0N - added option to generate index export files for a specific number

of minutes in the past and reverted latest change. The export file dump
will now contain four data elements in its file name: f - earliest load
date of the exported index entries, l - latest load date of the exported
entries, n - time at which the dump was created, c - count of documents inside the dump. '0N' denotes a series of
changes which will lead to the opportunity to exchange index data dumps
in a way that is needed to integrate ZeroNet index data. This will be
based on index dump sharing; that causes this commit.
This commit is contained in:
Michael Peter Christen 2016-02-23 18:56:20 +01:00
parent 5b9030180c
commit a6bf0b1649
3 changed files with 83 additions and 24 deletions

View File

@ -18,8 +18,8 @@
<form action="IndexExport_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Loaded URL Export</legend>
<dl>
<dt class="TableCellDark">Export File</dt>
<dd><input type="text" name="exportfile" value="#[exportfile]#" size="120" maxlength="250" />
<dt class="TableCellDark">Export Path</dt>
<dd><input type="text" name="exportfilepath" value="#[exportfilepath]#" size="120" maxlength="250" />
</dd>
<dt class="TableCellDark">URL Filter</dt>
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
@ -27,6 +27,9 @@
<dt class="TableCellDark">query</dt>
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">maximum age (seconds, -1 = unlimited)</dt>
<dd><input type="text" name="exportmaxseconds" value="-1" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd>
<dl>

View File

@ -22,9 +22,9 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
@ -53,9 +53,10 @@ public class IndexExport_p {
List<File> dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
prop.put("dumprestore_optimizemax", 10);
prop.putNum("ucount", ucount);
// show export messages
final Fulltext.Export export = segment.fulltext().export();
Fulltext.Export export = segment.fulltext().export();
if ((export != null) && (export.isAlive())) {
// there is currently a running export
prop.put("lurlexport", 2);
@ -66,7 +67,7 @@ public class IndexExport_p {
prop.put("reload", 1);
} else {
prop.put("lurlexport", 1);
prop.put("lurlexport_exportfile", sb.getDataPath() + "/DATA/EXPORT/yacy_export_" + sb.peers.myID() + "_" + GenericFormatter.SHORT_SECOND_FORMATTER.format());
prop.put("lurlexport_exportfilepath", sb.getDataPath() + "/DATA/EXPORT/");
if (export == null) {
// there has never been an export
prop.put("lurlexportfinished", 0);
@ -87,7 +88,6 @@ public class IndexExport_p {
}
if (post == null || env == null) {
prop.putNum("ucount", ucount);
return prop; // nothing to do
}
@ -102,23 +102,25 @@ public class IndexExport_p {
if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss;
if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr;
// extend export file name
String s = post.get("exportfile", "");
if (s.indexOf('.',0) < 0) {
if (format == Fulltext.ExportFormat.text) s = s + ".txt";
if (format == Fulltext.ExportFormat.html) s = s + ".html";
if (format == Fulltext.ExportFormat.rss ) s = s + "_rss.xml";
if (format == Fulltext.ExportFormat.solr) s = s + "_full.xml";
}
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final String query = post.get("exportquery", "*:*");
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom, text);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
if ((running != null) && (running.failed() == null)) {
final int maxseconds = post.getInt("exportmaxseconds", -1);
final String path = post.get("exportfilepath", "");
// start the export
try {
export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text);
} catch (IOException e) {
prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", "-no export-");
prop.put("lurlexporterror_exportfailmsg", e.getMessage());
return prop;
}
// show result
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count());
if ((export != null) && (export.failed() == null)) {
prop.put("lurlexport", 2);
}
prop.put("reload", 1);
@ -144,4 +146,4 @@ public class IndexExport_p {
return prop;
}
}
}

View File

@ -77,9 +77,11 @@ import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.util.DateFormatUtil;
import org.apache.lucene.util.Version;
public final class Fulltext {
@ -617,9 +619,61 @@ public final class Fulltext {
}
}
}
/**
 * Output formats available for an index export. Each constant carries the
 * file name extension that belongs to the format.
 */
public static enum ExportFormat {
// NOTE(review): the next two lines both declare the same constants; the first
// one looks like the pre-change declaration (without extension argument) that
// the following line replaces - confirm against the actual file.
text, html, rss, solr;
text("txt"), html("html"), rss("rss"), solr("xml");
// file name extension of this format, stored without a leading dot
private final String ext;
private ExportFormat(String ext) {this.ext = ext;}
/** @return the file name extension of this format, without a leading dot */
public String getExt() {return this.ext;}
}
/**
 * Starts an index export into a file whose name is generated from the exported data.
 * <p>
 * The generated name has the form
 * {@code yacy_dump_f<first>_l<last>_n<now>_c<count>.<ext>} where {@code first} and
 * {@code last} are the oldest and newest load date matched by the query, {@code now}
 * is the time of this call and {@code count} is the zero-padded number of matching
 * documents. The extension is taken from {@code format.getExt()}.
 *
 * @param format     export format; also determines the file name extension
 * @param filter     url filter, handed through unchanged to the file-based export
 * @param query      solr query selecting the documents; {@code null} selects all documents
 * @param maxseconds if greater than zero, only documents loaded within the last
 *                   {@code maxseconds} seconds are exported; otherwise no age limit applies
 * @param path       directory into which the export file is written; created if missing
 * @param dom        handed through unchanged to the file-based export
 * @param text       handed through unchanged to the file-based export
 * @return the export object returned by the file-based export method
 * @throws IOException if the query matches no document, if the matched documents carry
 *                     no usable load date, or if querying the index fails
 */
public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException {
    final String dateField = CollectionSchema.load_date_dt.getSolrFieldName();

    // modify query according to maxseconds
    final long now = System.currentTimeMillis();
    if (maxseconds > 0) {
        // 1000L forces long arithmetic so large maxseconds values cannot overflow int
        final long from = now - maxseconds * 1000L;
        final String nowstr = DateFormatUtil.formatExternal(new Date(now));
        final String fromstr = DateFormatUtil.formatExternal(new Date(from));
        final String dateq = dateField + ":[" + fromstr + " TO " + nowstr + "]";
        // NOTE(review): the given query is combined without parentheses; a query containing
        // a top-level OR may bind differently than intended - confirm with callers.
        query = query == null || AbstractSolrConnector.CATCHALL_QUERY.equals(query) ? dateq : query + " AND " + dateq;
    } else {
        query = query == null ? AbstractSolrConnector.CATCHALL_QUERY : query;
    }

    // check the oldest and latest entry in the index for this query
    final SolrDocumentList firstdoclist = this.getDefaultConnector().getDocumentListByQuery(
            query, dateField + " asc", 0, 1, dateField);
    final SolrDocumentList lastdoclist = this.getDefaultConnector().getDocumentListByQuery(
            query, dateField + " desc", 0, 1, dateField);
    if (firstdoclist.size() == 0 || lastdoclist.size() == 0) {
        assert firstdoclist.size() == 0 && lastdoclist.size() == 0;
        throw new IOException("number of exported documents == 0");
    }
    assert firstdoclist.size() == 1 && lastdoclist.size() == 1;
    final long doccount = firstdoclist.getNumFound();

    // read the date boundaries; fail with a readable message instead of a
    // NullPointerException/ClassCastException when a document has no load date
    final Object firstdateobject = firstdoclist.get(0).getFieldValue(dateField);
    final Object lastdateobject = lastdoclist.get(0).getFieldValue(dateField);
    if (!(firstdateobject instanceof Date) || !(lastdateobject instanceof Date)) {
        throw new IOException("cannot create export name: missing or invalid " + dateField + " in index entries");
    }
    final Date firstdate = (Date) firstdateobject;
    final Date lastdate = (Date) lastdateobject;

    // create the export file name; the extension is always appended because the
    // generated name never carries one (testing the absolute path for a '.' would
    // wrongly skip the extension whenever a directory name contains a dot)
    final String name = "yacy_dump_" +
            "f" + GenericFormatter.FORMAT_SHORT_MINUTE.format(firstdate) + "_" +
            "l" + GenericFormatter.FORMAT_SHORT_MINUTE.format(lastdate) + "_" +
            "n" + GenericFormatter.FORMAT_SHORT_MINUTE.format(new Date(now)) + "_" +
            "c" + String.format("%1$012d", doccount) +
            "." + format.getExt();
    final File f = new File(path, name).getAbsoluteFile();
    final File parent = f.getParentFile();
    if (parent != null) parent.mkdirs();

    return export(f, filter, query, format, dom, text);
}
// export methods