mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
memory protection for URLAnalysis
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5649 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
0f6fa804ff
commit
89d8e824ed
|
@ -77,10 +77,10 @@
|
|||
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
|
||||
</dd>
|
||||
<dt class="TableCellDark">Export Format</dt>
|
||||
<dd>Only Domain <i>(superfast)</i>:
|
||||
<dd>Only Domain:
|
||||
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)
|
||||
<input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
|
||||
Full URL List <i>(high IO) </i>:
|
||||
Full URL List:
|
||||
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)
|
||||
<input type="radio" name="format" value="url-html" />HTML (URLs with title)
|
||||
<input type="radio" name="format" value="url-rss" />XML (RSS)
|
||||
|
|
|
@ -50,6 +50,8 @@ public class URLAnalysis {
|
|||
/**
|
||||
* processes to analyse URL lists
|
||||
*/
|
||||
|
||||
private static final long cleanuplimit = 50 * 1024 * 1024;
|
||||
|
||||
public static yacyURL poison = null;
|
||||
static {
|
||||
|
@ -77,7 +79,6 @@ public class URLAnalysis {
|
|||
try {
|
||||
url = in.take();
|
||||
if (url == poison) break;
|
||||
//System.out.println(url);
|
||||
update(url.getHost().replaceAll("-", "\\.").split("\\."));
|
||||
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -96,6 +97,30 @@ public class URLAnalysis {
|
|||
}
|
||||
}
|
||||
|
||||
public static void cleanup(ConcurrentHashMap<String, Integer> stat) {
|
||||
Map.Entry<String, Integer> entry;
|
||||
int c, low = Integer.MAX_VALUE;
|
||||
Iterator<Map.Entry<String, Integer>> i = stat.entrySet().iterator();
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
c = entry.getValue().intValue();
|
||||
if (c == 1) {
|
||||
i.remove();
|
||||
} else {
|
||||
if (c < low) low = c;
|
||||
}
|
||||
}
|
||||
i = stat.entrySet().iterator();
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
c = entry.getValue().intValue();
|
||||
if (c == low) {
|
||||
i.remove();
|
||||
}
|
||||
}
|
||||
Runtime.getRuntime().gc();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String filename = args[0];
|
||||
String analysis = filename + ".stats";
|
||||
|
@ -133,6 +158,11 @@ public class URLAnalysis {
|
|||
if (System.currentTimeMillis() - time > 1000) {
|
||||
time = System.currentTimeMillis();
|
||||
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
|
||||
if (MemoryControl.available() < cleanuplimit) {
|
||||
System.out.println("starting cleanup, " + out.size() + " entries in statistic");
|
||||
cleanup(out);
|
||||
System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
|
||||
}
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
|
|
Loading…
Reference in New Issue
Block a user