yacy_search_server/source/de/anomic/data/URLAnalysis.java
orbiter 60078cf322 added next tool for url analysis: check for references, that occur in the URL-DB but not in the RICOLLECTIONS
to use this, you must user the -incollection command before (see SVN 5687) and you need a 
used.dump file that has been produced with that process.

Now you can use that file, to do a URL-hash compare with the urls in the URL-DB. To do that, execute
java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump
or use different names for the dump files or more memory.

As a result, you get the file diffurlcol.dump which contains all the url hashes that occur in the URL database, but not in the collections.
The file has the format
{hash-12}*
that means: 12 byte long hashes are listed without any separation.

The next step could be to process this file and delete all these URLs with the computed hashes, or to export them before deletion.



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5692 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-03-10 13:38:40 +00:00

479 lines
20 KiB
Java

// URLAnalysis.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.02.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-01-02 12:38:20 +0100 (Fr, 02 Jan 2009) $
// $LastChangedRevision: 5432 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import de.anomic.kelondro.index.HandleSet;
import de.anomic.kelondro.index.IntegerHandleIndex;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.yacy.yacyURL;
public class URLAnalysis {
/**
* processes to analyse URL lists
*/
public static yacyURL poison = null;
static {
try {
poison = new yacyURL("http://poison.org/poison", null);
} catch (MalformedURLException e) {
poison = null;
}
}
public static class splitter extends Thread {
ArrayBlockingQueue<yacyURL> in;
ConcurrentHashMap<String, Integer> out;
public splitter(ArrayBlockingQueue<yacyURL> in, ConcurrentHashMap<String, Integer> out) {
this.in = in;
this.out = out;
}
public void run() {
yacyURL url;
Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
while (true) {
try {
url = in.take();
if (url == poison) break;
update(url.getHost().replaceAll("-", "\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
private void update(String[] s) {
Integer c;
for (String t: s) {
if (t.length() == 0) continue;
c = out.get(t);
out.put(t, (c == null) ? 1 : c.intValue() + 1);
}
}
}
public static void cleanup(ConcurrentHashMap<String, Integer> stat) {
Map.Entry<String, Integer> entry;
int c, low = Integer.MAX_VALUE;
Iterator<Map.Entry<String, Integer>> i = stat.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
c = entry.getValue().intValue();
if (c == 1) {
i.remove();
} else {
if (c < low) low = c;
}
}
i = stat.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
c = entry.getValue().intValue();
if (c == low) {
i.remove();
}
}
Runtime.getRuntime().gc();
}
public static void genstat(String urlfile) {
boolean gz = urlfile.endsWith(".gz");
String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats";
long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
// start threads
ArrayBlockingQueue<yacyURL> in = new ArrayBlockingQueue<yacyURL>(1000);
ConcurrentHashMap<String, Integer> out = new ConcurrentHashMap<String, Integer>();
for (int i = 0; i < Runtime.getRuntime().availableProcessors(); i++) new splitter(in, out).start();
splitter spl = new splitter(in, out);
spl.start();
// put urls in queue
File infile = new File(urlfile);
File outfile = new File(analysis);
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
int count = 0;
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
yacyURL url = new yacyURL(line, null);
in.put(url);
} catch (InterruptedException e) {
e.printStackTrace();
} catch (MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
if (MemoryControl.available() < cleanuplimit) {
System.out.println("starting cleanup, " + out.size() + " entries in statistic");
cleanup(out);
System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
}
reader.close();
} catch (final IOException e) {
e.printStackTrace();
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// stop threads
System.out.println("stopping threads");
for (int i = 0; i < Runtime.getRuntime().availableProcessors() + 1; i++) try {
in.put(poison);
} catch (InterruptedException e) {
e.printStackTrace();
}
try {
spl.join();
} catch (InterruptedException e1) {
e1.printStackTrace();
}
// generate statistics
System.out.println("start processing results");
TreeMap<String, Integer> results = new TreeMap<String, Integer>();
count = 0;
Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> i = out.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue());
count++;
i.remove(); // free memory
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write statistics
System.out.println("start writing results");
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
if (gz) os = new GZIPOutputStream(os);
count = 0;
for (Map.Entry<String, Integer> e: results.entrySet()) {
os.write(e.getKey().getBytes());
os.write(new byte[]{'\t'});
os.write(("" + e.getValue()).getBytes());
os.write(new byte[]{'\n'});
count++;
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("wrote " + count + " lines.");
}
}
os.close();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("finished");
}
public static void genhost(String urlfile) {
boolean gz = urlfile.endsWith(".gz");
String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host";
HashSet<String> hosts = new HashSet<String>();
File infile = new File(urlfile);
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
int count = 0;
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
yacyURL url = new yacyURL(line, null);
hosts.add(url.getHost());
} catch (MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
}
}
reader.close();
} catch (final IOException e) {
e.printStackTrace();
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// copy everything into a TreeSet to order it
System.out.println("start processing results");
TreeSet<String> results = new TreeSet<String>();
count = 0;
Iterator<String> i = hosts.iterator();
while (i.hasNext()) {
results.add(i.next());
count++;
i.remove(); // free memory
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write hosts
writeSet(trunk, gz, results);
System.out.println("finished");
}
private static void writeSet(String trunk, boolean gz, Set<String> set) {
// write hosts
System.out.println("start writing results");
File outfile = new File(trunk + ((gz) ? ".gz" : ""));
long time = System.currentTimeMillis();
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
if (gz) os = new GZIPOutputStream(os);
int count = 0;
for (String h: set) {
os.write(h.getBytes());
os.write(new byte[]{'\n'});
count++;
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("wrote " + count + " lines.");
}
}
os.close();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("finished writing results");
}
public static void sortsplit(String urlfile) {
boolean gz = urlfile.endsWith(".gz");
String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort";
File infile = new File(urlfile);
TreeSet<String> urls = new TreeSet<String>();
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
int count = 0;
int filecount = 0;
long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
yacyURL url = new yacyURL(line, null);
urls.add(url.toNormalform(true, true));
} catch (MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
}
if (MemoryControl.available() < cleanuplimit) {
writeSet(trunk + "." + filecount, gz, urls);
filecount++;
urls.clear();
Runtime.getRuntime().gc();
}
}
reader.close();
} catch (final IOException e) {
e.printStackTrace();
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// write hosts
writeSet(trunk + "." + filecount, gz, urls);
System.out.println("finished");
}
public static void incollection(String collectionPath, String statisticPath) {
try {
IntegerHandleIndex idx = IndexCollection.referenceHashes(
new File(collectionPath),
"collection",
12,
Base64Order.enhancedCoder,
ReferenceRow.urlEntryRow);
System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics");
idx.dump(new File(statisticPath));
System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
} catch (IOException e) {
e.printStackTrace();
}
}
public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
System.out.println("COLLECTION INDEX DIFF URL-COL startup");
IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile));
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 100);
System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
byte[] refhash;
Iterator<byte[]> i = mr.iterator();
long start = System.currentTimeMillis();
long update = start - 7000;
int c = 0;
while (i.hasNext()) {
refhash = i.next();
if (idx.get(refhash) == -1) {
// the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
hs.put(refhash);
}
c++;
if (System.currentTimeMillis() - update > 10000) {
System.out.println("COLLECTION INDEX DIFF URL-COL running, checked " + c + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - c) / c) / 60000) + " minutes remaining");
update = System.currentTimeMillis();
}
}
mr.close();
System.out.println("COLLECTION INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
c = hs.dump(new File(diffFile));
System.out.println("COLLECTION INDEX DIFF URL-COL finished dump, wrote " + c + " references that occur in the URL-DB, but not in the collection-dump");
return c;
}
public static void main(String[] args) {
if (args[0].equals("-stat") && args.length >= 2) {
// generate a statistics about common words in file, store to <file>.stat
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
for (int i = 1; i < args.length; i++) genstat(args[i]);
} else if (args[0].equals("-host") && args.length >= 2) {
// generate a file <file>.host containing only the hosts of the urls
for (int i = 1; i < args.length; i++) genhost(args[i]);
} else if (args[0].equals("-sort") && args.length >= 2) {
// generate file <file>.x.sort with sorted lists and split the file in smaller pieces
for (int i = 1; i < args.length; i++) sortsplit(args[i]);
} else if (args[0].equals("-incollection") && args.length >= 2) {
// generate a dump of all referenced URL hashes from a given RICOLLECTION
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump
incollection(args[1], args[2]);
} else if (args[0].equals("-diffurlcol") && args.length >= 3) {
// make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump
try {
diffurlcol(args[1], args[2], args[3]);
} catch (IOException e) {
e.printStackTrace();
}
} else {
System.out.println("usage:");
System.out.println("-stat <file> generate a statistics about common words in file, store to <file>.stat");
System.out.println("-host <file> generate a file <file>.host containing only the hosts of the urls");
System.out.println("-sort <file> generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
System.out.println("-incollection <path-to-RICOLLECTION> <file> generate a dump of all referenced URL hashes");
System.out.println("-diffurlcol <path-to-URL-DB> <dump-from-incollection> <diff-dump> find URLs that occur ");
}
}
private static final String num(int i) {
String s = Integer.toString(i);
while (s.length() < 9) s = "0" + s;
return s;
}
}