mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added another method to process url lists: extract hosts only
This can be used like: java -Xmx2000m -cp classes de.anomic.data.URLAnalysis -host DATA/EXPORT/20090224213823.txt -- the call to generate statistics has also changed; please now use: java -Xmx2000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/20090224213823.txt git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5650 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
89d8e824ed
commit
cf9b74e6e3
|
@ -35,9 +35,11 @@ import java.io.FileOutputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -121,9 +123,9 @@ public class URLAnalysis {
|
|||
Runtime.getRuntime().gc();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String filename = args[0];
|
||||
String analysis = filename + ".stats";
|
||||
public static void genstat(String urlfile) {
|
||||
|
||||
String analysis = urlfile + ".stats";
|
||||
|
||||
// start threads
|
||||
ArrayBlockingQueue<yacyURL> in = new ArrayBlockingQueue<yacyURL>(1000);
|
||||
|
@ -133,7 +135,7 @@ public class URLAnalysis {
|
|||
spl.start();
|
||||
|
||||
// put urls in queue
|
||||
File infile = new File(filename);
|
||||
File infile = new File(urlfile);
|
||||
File outfile = new File(analysis);
|
||||
BufferedReader reader = null;
|
||||
long time = System.currentTimeMillis();
|
||||
|
@ -226,6 +228,89 @@ public class URLAnalysis {
|
|||
System.out.println("finished");
|
||||
}
|
||||
|
||||
public static void genhost(String urlfile) {
|
||||
|
||||
String host = urlfile + ".host";
|
||||
HashSet<String> hosts = new HashSet<String>();
|
||||
File infile = new File(urlfile);
|
||||
File outfile = new File(host);
|
||||
BufferedReader reader = null;
|
||||
long time = System.currentTimeMillis();
|
||||
long start = time;
|
||||
int count = 0;
|
||||
|
||||
System.out.println("start processing");
|
||||
try {
|
||||
reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile)));
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
line = line.trim();
|
||||
if (line.length() > 0) {
|
||||
yacyURL url = new yacyURL(line, null);
|
||||
hosts.add(url.getHost());
|
||||
}
|
||||
count++;
|
||||
if (System.currentTimeMillis() - time > 1000) {
|
||||
time = System.currentTimeMillis();
|
||||
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
} catch (final IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (reader != null) try { reader.close(); } catch (final Exception e) {}
|
||||
}
|
||||
|
||||
// copy everything into a TreeSet to order it
|
||||
System.out.println("start processing results");
|
||||
TreeSet<String> results = new TreeSet<String>();
|
||||
count = 0;
|
||||
Iterator<String> i = hosts.iterator();
|
||||
while (i.hasNext()) {
|
||||
results.add(i.next());
|
||||
count++;
|
||||
i.remove(); // free memory
|
||||
if (System.currentTimeMillis() - time > 10000) {
|
||||
time = System.currentTimeMillis();
|
||||
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
|
||||
}
|
||||
}
|
||||
|
||||
// write hosts
|
||||
System.out.println("start writing results");
|
||||
try {
|
||||
BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
|
||||
count = 0;
|
||||
for (String h: results) {
|
||||
os.write(h.getBytes());
|
||||
os.write(new byte[]{'\n'});
|
||||
count++;
|
||||
if (System.currentTimeMillis() - time > 10000) {
|
||||
time = System.currentTimeMillis();
|
||||
System.out.println("wrote " + count + " lines.");
|
||||
}
|
||||
}
|
||||
os.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
System.out.println("finished");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
if (args[0].equals("-stat") && args.length == 2) {
|
||||
genstat(args[1]);
|
||||
} else if (args[0].equals("-host") && args.length == 2) {
|
||||
genhost(args[1]);
|
||||
} else {
|
||||
System.out.println("usage:");
|
||||
System.out.println("-stat <file> generate a statistics about common words in file, store to <file>.stat");
|
||||
System.out.println("-host <file> generate a file <file>.host containing only the hosts of the urls");
|
||||
}
|
||||
}
|
||||
|
||||
private static final String num(int i) {
|
||||
String s = Integer.toString(i);
|
||||
while (s.length() < 9) s = "0" + s;
|
||||
|
|
Loading…
Reference in New Issue
Block a user