mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
added some more logging to domain extraction
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2316 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
79af283f6c
commit
c57b78722b
|
@ -940,6 +940,13 @@ public final class yacy {
|
|||
Iterator eiter = pool.loadedURL.entries(true, false);
|
||||
HashSet doms = new HashSet();
|
||||
plasmaCrawlLURL.Entry entry;
|
||||
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
|
||||
System.out.println("a dump will be written after double-check of all extracted domains.");
|
||||
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
|
||||
System.out.println("java -Xms<megabytes>m -Xmx<megabytes>m -classpath classes yacy -domlist [ -format { text | html } ] [ <path to DATA folder> ]");
|
||||
System.out.println("i.e.");
|
||||
System.out.println("java -Xms900m -Xmx900m -classpath classes yacy -domlist");
|
||||
int c = 0;
|
||||
while (eiter.hasNext()) {
|
||||
try {
|
||||
entry = (plasmaCrawlLURL.Entry) eiter.next();
|
||||
|
@ -948,12 +955,16 @@ public final class yacy {
|
|||
// here a MalformedURLException may occur
|
||||
// just ignore
|
||||
}
|
||||
c++;
|
||||
if (c % 10000 == 0) System.out.println(c + " urls checked, " + doms.size() + " domains collected.");
|
||||
}
|
||||
|
||||
// output file in HTML format
|
||||
|
||||
if (format.equals("html")) {
|
||||
// output file in HTML format
|
||||
File file = new File(root, targetName + ".html");
|
||||
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
|
||||
System.out.println("Started domain list dump to file " + file);
|
||||
Iterator i = doms.iterator();
|
||||
String key;
|
||||
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
|
||||
|
@ -967,11 +978,13 @@ public final class yacy {
|
|||
}
|
||||
bos.write(("</body></html>").getBytes());
|
||||
bos.close();
|
||||
//output file in plain text but compressed with ZIP
|
||||
|
||||
} else if (format.equals("zip")) {
|
||||
// output file in plain text but compressed with ZIP
|
||||
ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
|
||||
File file = new File(root, targetName + ".zip");
|
||||
ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
|
||||
System.out.println("Started domain list dump to file " + file);
|
||||
bos.putNextEntry(zipEntry);
|
||||
Iterator i = doms.iterator();
|
||||
String key;
|
||||
|
@ -981,10 +994,12 @@ public final class yacy {
|
|||
bos.write(serverCore.crlf);
|
||||
}
|
||||
bos.close();
|
||||
//output file in plain text but compressed with GZIP
|
||||
|
||||
} else if (format.equals("gzip")) {
|
||||
// output file in plain text but compressed with GZIP
|
||||
File file = new File(root, targetName + ".txt.gz");
|
||||
GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file));
|
||||
System.out.println("Started domain list dump to file " + file);
|
||||
Iterator i = doms.iterator();
|
||||
String key;
|
||||
while (i.hasNext()) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user