yacy_search_server/source/net/yacy/kelondro/util/SetTools.java
Michael Peter Christen cb85b22725 redesign of the image search process (with much better results,
unfortunately the index schema has changed and p2p image search will not
be muchmuch better until many people update)
2013-09-02 18:55:38 +02:00

618 lines
25 KiB
Java

// kelondroMSetTools.java
// -------------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 28.12.2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
public final class SetTools {
//public static Comparator fastStringComparator = fastStringComparator(true);
// ------------------------------------------------------------------------------------------------
// helper methods
/**
 * Counts the number of significant bits of a value.
 * For positive x this is 1 + floor(log2(x)), i.e. the bit length and not the logarithm itself.
 *
 * @param x the value to measure
 * @return the number of bits needed to represent x; 0 if x is zero or negative
 */
public static int log2a(int x) {
    int bits = 0;
    // shift the value to the right until nothing is left, counting the shifts
    for (int rest = x; rest > 0; rest >>>= 1) {
        bits++;
    }
    return bits;
}
// ------------------------------------------------------------------------------------------------
// join
// We distinguish two principal solutions
// - constructive join (generate new data structure)
// - destructive join (remove non-valid elements from given data structure)
// The algorithm to perform the join can also be of two kinds:
// - join by pairwise enumeration
// - join by iterative tests (where we distinguish left-right and right-left tests)
/**
 * Joins all maps of a collection: the result contains only those keys which are present in every map.
 * The maps are processed in order of ascending size, starting with the smallest one, and
 * are combined pairwise using the test-based join.
 *
 * @param <A> key type
 * @param <B> value type
 * @param maps the maps to be joined
 * @param concatStrings if true, String and StringBuilder values of matching keys are concatenated
 * @return the join of all maps. An empty TreeMap is returned if the collection is empty, if any
 *         of the maps is null or empty, or if the join is empty. If the collection contains
 *         exactly one non-empty map, that very map instance is returned.
 */
public static <A, B> SortedMap<A, B> joinConstructive(final Collection<SortedMap<A, B>> maps, final boolean concatStrings) {
    // first order the maps by their size
    final SortedMap<Long, SortedMap<A, B>> orderMap = new TreeMap<Long, SortedMap<A, B>>();
    int count = 0;
    for (final SortedMap<A, B> singleMap : maps) {
        // a single missing or empty map makes the whole conjunction empty
        if (singleMap == null || singleMap.isEmpty()) return new TreeMap<A, B>();
        // The key is the size in the upper 32 bits and the position in the lower 32 bits.
        // This is computed with long arithmetic: it cannot overflow for large maps and
        // two maps can never get the same key, regardless of how many maps are joined.
        orderMap.put(Long.valueOf((((long) singleMap.size()) << 32) | count), singleMap);
        count++;
    }

    // check if there is any map at all
    if (orderMap.isEmpty()) return new TreeMap<A, B>();

    // pairwise build up the conjunction, starting with the smallest map
    SortedMap<A, B> joinResult = orderMap.remove(orderMap.firstKey());
    while (!orderMap.isEmpty() && !joinResult.isEmpty()) {
        // combine the intermediate result with the next smallest map
        final SortedMap<A, B> nextMap = orderMap.remove(orderMap.firstKey());
        joinResult = joinConstructiveByTest(joinResult, nextMap, concatStrings); // TODO: better with enumeration?
    }

    // joinResult now holds the combined result
    if (joinResult.isEmpty()) return new TreeMap<A, B>();
    return joinResult;
}
/**
 * Joins two sorted maps: the result contains the keys that are present in both maps.
 * Depending on a rough step estimation computed from the map sizes, the join is done
 * either by testing the entries of the smaller map against the larger map, or by a
 * pairwise enumeration of both maps.
 *
 * @param <A> key type
 * @param <B> value type
 * @param map1 first map
 * @param map2 second map
 * @param concatStrings if true, String and StringBuilder values of matching keys are concatenated
 * @return the joined map; null if one of the maps is null or if the maps do not share the
 *         identical comparator instance
 */
public static <A, B> SortedMap<A, B> joinConstructive(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
    // both maps must exist and use the very same comparator
    if (map1 == null || map2 == null) return null;
    if (map1.comparator() != map2.comparator()) return null;
    if (map1.isEmpty() || map2.isEmpty()) return new TreeMap<A, B>(map1.comparator());

    // estimate the effort of both methods
    final int size1 = map1.size();
    final int size2 = map2.size();
    final boolean firstIsLarger = size1 > size2;
    final int high = firstIsLarger ? size1 : size2;
    final int low = firstIsLarger ? size2 : size1;
    final int stepsEnum = 10 * (high + low - 1);
    final int stepsTest = 12 * log2a(high) * low;

    // run the method with the lower estimation
    if (stepsEnum <= stepsTest) {
        return joinConstructiveByEnumeration(map1, map2, concatStrings);
    }
    // the smaller map is always the one that is iterated
    return firstIsLarger
            ? joinConstructiveByTest(map2, map1, concatStrings)
            : joinConstructiveByTest(map1, map2, concatStrings);
}
/**
 * Joins two maps by iterating over the small map and looking up each key in the large map.
 * The value of the small map is taken for the result. If concatStrings is set and the value is a
 * String or a StringBuilder, the value of the large map is appended to it. The concatenation is
 * always done on a new object, so the values stored in the given maps are not modified.
 * Keys which map to null in the large map are treated like missing keys.
 *
 * The iteration runs synchronized on the small map and each lookup synchronized on the large map.
 * If a ConcurrentModificationException occurs during the iteration, a warning is logged and the
 * entries collected so far are returned.
 *
 * @param small the map that is iterated
 * @param large the map that is used for lookups; its comparator is used for the result
 * @param concatStrings if true, String and StringBuilder values of matching keys are concatenated
 * @return a new map with all entries whose key is contained in both maps
 */
@SuppressWarnings("unchecked")
private static <A, B> SortedMap<A, B> joinConstructiveByTest(final SortedMap<A, B> small, final SortedMap<A, B> large, final boolean concatStrings) {
    final SortedMap<A, B> result = new TreeMap<A, B>(large.comparator());
    synchronized (small) {
        final Iterator<Map.Entry<A, B>> mi = small.entrySet().iterator();
        while (mi.hasNext()) {
            try {
                final Map.Entry<A, B> entry = mi.next();
                final B value = entry.getValue();
                final B other;
                synchronized (large) {
                    other = large.get(entry.getKey());
                }
                if (other == null) continue; // key not (usefully) present in the large map

                if (concatStrings && value instanceof String) {
                    result.put(entry.getKey(), (B) (((String) value) + (String) other));
                } else if (concatStrings && value instanceof StringBuilder) {
                    // append to a copy; appending to 'value' itself would change the content of 'small'
                    result.put(entry.getKey(), (B) (new StringBuilder((StringBuilder) value).append((StringBuilder) other)));
                } else {
                    result.put(entry.getKey(), value);
                }
            } catch (final ConcurrentModificationException e) {
                // the small map was changed while iterating; stop here and keep what was collected
                ConcurrentLog.warn("SetTools", e.getMessage(), e);
                break;
            }
        }
    }
    return result;
}
/**
 * Joins two sorted maps by a pairwise enumeration: both maps are walked in parallel in key order
 * and every key that occurs in both maps is added to the result.
 * The value of map1 is taken for the result. If concatStrings is set and the value is a
 * String or a StringBuilder, the value of map2 is appended to it. The concatenation is
 * always done on a new object, so the values stored in the given maps are not modified.
 *
 * Both maps must be sorted by the same order. If map1 has no comparator (natural ordering),
 * the keys are compared using their Comparable implementation.
 *
 * @param map1 first map; its comparator is used for the comparison and for the result
 * @param map2 second map
 * @param concatStrings if true, String and StringBuilder values of matching keys are concatenated
 * @return a new map with all entries whose key is contained in both maps
 */
@SuppressWarnings("unchecked")
private static <A, B> SortedMap<A, B> joinConstructiveByEnumeration(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
    // comp is null if the maps use the natural ordering of their keys
    final Comparator<? super A> comp = map1.comparator();
    final Iterator<Map.Entry<A, B>> mi1 = map1.entrySet().iterator();
    final Iterator<Map.Entry<A, B>> mi2 = map2.entrySet().iterator();
    final SortedMap<A, B> result = new TreeMap<A, B>(comp);
    if (!mi1.hasNext() || !mi2.hasNext()) return result;

    Map.Entry<A, B> mentry1 = mi1.next();
    Map.Entry<A, B> mentry2 = mi2.next();
    while (true) {
        final int c = (comp == null)
                ? ((Comparable<? super A>) mentry1.getKey()).compareTo(mentry2.getKey())
                : comp.compare(mentry1.getKey(), mentry2.getKey());
        if (c < 0) {
            // key of map1 is smaller: advance in map1
            if (mi1.hasNext()) mentry1 = mi1.next(); else break;
        } else if (c > 0) {
            // key of map2 is smaller: advance in map2
            if (mi2.hasNext()) mentry2 = mi2.next(); else break;
        } else {
            // match: store the entry and advance in both maps
            final A key = mentry1.getKey();
            final B value = mentry1.getValue();
            if (concatStrings && value instanceof String) {
                result.put(key, (B) (((String) value) + (String) mentry2.getValue()));
            } else if (concatStrings && value instanceof StringBuilder) {
                // append to a copy; appending to 'value' itself would change the content of map1
                result.put(key, (B) (new StringBuilder((StringBuilder) value).append((StringBuilder) mentry2.getValue())));
            } else {
                result.put(key, value);
            }
            if (mi1.hasNext()) mentry1 = mi1.next(); else break;
            if (mi2.hasNext()) mentry2 = mi2.next(); else break;
        }
    }
    return result;
}
// now the same for set-set
/**
 * Computes the intersection of two sorted sets as a new set.
 * Depending on a rough step estimation computed from the set sizes, the join is done
 * either by testing the elements of the smaller set against the larger set, or by a
 * pairwise enumeration of both sets.
 *
 * @param <A> element type
 * @param set1 first set
 * @param set2 second set
 * @return the intersection; null if one of the sets is null or if the sets do not share the
 *         identical comparator instance
 */
public static <A> SortedSet<A> joinConstructive(final SortedSet<A> set1, final SortedSet<A> set2) {
    // both sets must exist and use the very same comparator
    if (set1 == null || set2 == null) return null;
    if (set1.comparator() != set2.comparator()) return null;
    if (set1.isEmpty() || set2.isEmpty()) return new TreeSet<A>(set1.comparator());

    // estimate the effort of both methods
    final int size1 = set1.size();
    final int size2 = set2.size();
    final int high = Math.max(size1, size2);
    final int low = Math.min(size1, size2);
    final int stepsEnum = 10 * (high + low - 1);
    final int stepsTest = 12 * log2a(high) * low;

    // run the method with the lower estimation
    if (stepsEnum <= stepsTest) {
        return joinConstructiveByEnumeration(set1, set2);
    }
    // iterate the smaller set, test against the larger one
    return (size1 < size2)
            ? joinConstructiveByTest(set1, set2)
            : joinConstructiveByTest(set2, set1);
}
/**
 * Computes the intersection of a collection and a sorted set by testing every element
 * of the collection for membership in the set.
 *
 * @param <A> element type
 * @param small the collection that is iterated
 * @param large the set that is used for the membership test; its comparator is used for the result
 * @return a new sorted set with all elements of small which are contained in large
 */
public static <A> SortedSet<A> joinConstructiveByTest(final Collection<A> small, final SortedSet<A> large) {
    final SortedSet<A> common = new TreeSet<A>(large.comparator());
    for (final A candidate : small) {
        if (large.contains(candidate)) {
            common.add(candidate);
        }
    }
    return common;
}
/**
 * Computes the intersection of two sorted sets by a pairwise enumeration: both sets are
 * walked in parallel in sort order and every element that occurs in both is added to the result.
 *
 * Both sets must be sorted by the same order. If set1 has no comparator (natural ordering),
 * the elements are compared using their Comparable implementation.
 *
 * @param set1 first set; its comparator is used for the comparison and for the result
 * @param set2 second set
 * @return a new set with all elements which are contained in both sets
 */
@SuppressWarnings("unchecked")
private static <A> SortedSet<A> joinConstructiveByEnumeration(final SortedSet<A> set1, final SortedSet<A> set2) {
    // comp is null if the sets use the natural ordering of their elements
    final Comparator<? super A> comp = set1.comparator();
    final Iterator<A> mi = set1.iterator();
    final Iterator<A> si = set2.iterator();
    final SortedSet<A> result = new TreeSet<A>(comp);
    if (!mi.hasNext() || !si.hasNext()) return result;

    A mobj = mi.next();
    A sobj = si.next();
    while (true) {
        final int c = (comp == null)
                ? ((Comparable<? super A>) mobj).compareTo(sobj)
                : comp.compare(mobj, sobj);
        if (c < 0) {
            // element of set1 is smaller: advance in set1
            if (mi.hasNext()) mobj = mi.next(); else break;
        } else if (c > 0) {
            // element of set2 is smaller: advance in set2
            if (si.hasNext()) sobj = si.next(); else break;
        } else {
            // match: store the element and advance in both sets
            result.add(mobj);
            if (mi.hasNext()) mobj = mi.next(); else break;
            if (si.hasNext()) sobj = si.next(); else break;
        }
    }
    return result;
}
/**
 * Checks whether every element of one set is also an element of another set.
 * @param <A> element type
 * @param small the set whose elements are tested
 * @param large the set that must contain all elements of the small set
 * @return true if the small set is completely included in the large set; this is
 *         also the case if the small set is empty
 */
public static <A> boolean totalInclusion(final Set<A> small, final Set<A> large) {
    final Iterator<A> elements = small.iterator();
    while (elements.hasNext()) {
        // the first missing element decides the result
        if (!large.contains(elements.next())) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether every handle of one handle set is also contained in another handle set.
 * @param small the set whose handles are tested
 * @param large the set that must contain all handles of the small set
 * @return true if the small set is completely included in the large set; this is
 *         also the case if the small set has no handles
 */
public static boolean totalInclusion(final HandleSet small, final HandleSet large) {
    final Iterator<byte[]> handles = small.iterator();
    while (handles.hasNext()) {
        // the first missing handle decides the result
        if (!large.has(handles.next())) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether two sorted sets have at least one element in common.
 * Depending on a rough step estimation computed from the set sizes, the check is done
 * either by testing the elements of the smaller set against the larger set, or by a
 * pairwise enumeration of both sets.
 * @param <A> element type
 * @param set1 first set
 * @param set2 second set
 * @return true if any element of the first set is part of the second set or vice-versa;
 *         false also if one of the sets is null or empty, or if the sets do not share
 *         the identical comparator instance
 */
public static <A> boolean anymatch(final SortedSet<A> set1, final SortedSet<A> set2) {
    // both sets must exist and use the very same comparator
    if (set1 == null || set2 == null) return false;
    if (set1.comparator() != set2.comparator()) return false;
    if (set1.isEmpty() || set2.isEmpty()) return false;

    // estimate the effort of both methods
    final int size1 = set1.size();
    final int size2 = set2.size();
    final int high = Math.max(size1, size2);
    final int low = Math.min(size1, size2);
    final int stepsEnum = 10 * (high + low - 1);
    final int stepsTest = 12 * log2a(high) * low;

    // run the method with the lower estimation
    if (stepsEnum <= stepsTest) {
        return anymatchByEnumeration(set1, set2);
    }
    // iterate the smaller set, test against the larger one
    return (size1 < size2) ? anymatchByTest(set1, set2) : anymatchByTest(set2, set1);
}
/**
 * Checks whether two handle sets have at least one handle in common.
 * Depending on a rough step estimation computed from the set sizes, the check is done
 * either by testing the handles of the smaller set against the larger set, or by a
 * pairwise enumeration of both sets.
 * @param set1 first set
 * @param set2 second set
 * @return true if any element of the first set is part of the second set or vice-versa;
 *         false also if one of the sets is null or empty, or if the sets do not share
 *         the identical comparator instance
 */
public static boolean anymatch(final HandleSet set1, final HandleSet set2) {
    // both sets must exist and use the very same comparator
    if (set1 == null || set2 == null) return false;
    if (set1.comparator() != set2.comparator()) return false;
    if (set1.isEmpty() || set2.isEmpty()) return false;

    // estimate the effort of both methods
    final int size1 = set1.size();
    final int size2 = set2.size();
    final int high = Math.max(size1, size2);
    final int low = Math.min(size1, size2);
    final int stepsEnum = 10 * (high + low - 1);
    final int stepsTest = 12 * log2a(high) * low;

    // run the method with the lower estimation
    if (stepsEnum <= stepsTest) {
        return anymatchByEnumeration(set1, set2);
    }
    // iterate the smaller set, test against the larger one
    return (size1 < size2) ? anymatchByTest(set1, set2) : anymatchByTest(set2, set1);
}
/**
 * Checks for a common element by testing every element of the small set
 * for membership in the large set.
 *
 * @param small the set that is iterated
 * @param large the set that is used for the membership test
 * @return true as soon as an element of small is found in large, false if there is none
 */
private static <A> boolean anymatchByTest(final SortedSet<A> small, final SortedSet<A> large) {
    for (final A candidate : small) {
        if (large.contains(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks for a common handle by testing every handle of the small set
 * for membership in the large set.
 *
 * @param small the set that is iterated
 * @param large the set that is used for the membership test
 * @return true as soon as a handle of small is found in large, false if there is none
 */
private static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
    for (final Iterator<byte[]> handles = small.iterator(); handles.hasNext();) {
        final byte[] candidate = handles.next();
        if (large.has(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks for a common element by a pairwise enumeration: both sets are walked in parallel
 * in sort order until an element is found in both sets or one of the sets is exhausted.
 *
 * Both sets must be sorted by the same order. If set1 has no comparator (natural ordering),
 * the elements are compared using their Comparable implementation.
 *
 * @param set1 first set; its comparator is used for the comparison
 * @param set2 second set
 * @return true if the sets have at least one element in common
 */
@SuppressWarnings("unchecked")
private static <A> boolean anymatchByEnumeration(final SortedSet<A> set1, final SortedSet<A> set2) {
    // comp is null if the sets use the natural ordering of their elements
    final Comparator<? super A> comp = set1.comparator();
    final Iterator<A> mi = set1.iterator();
    final Iterator<A> si = set2.iterator();
    if (!mi.hasNext() || !si.hasNext()) return false;

    A mobj = mi.next();
    A sobj = si.next();
    while (true) {
        final int c = (comp == null)
                ? ((Comparable<? super A>) mobj).compareTo(sobj)
                : comp.compare(mobj, sobj);
        if (c == 0) return true;
        if (c < 0) {
            // element of set1 is smaller: advance in set1
            if (!mi.hasNext()) return false;
            mobj = mi.next();
        } else {
            // element of set2 is smaller: advance in set2
            if (!si.hasNext()) return false;
            sobj = si.next();
        }
    }
}
/**
 * Checks for a common handle by a pairwise enumeration: both sets are walked in parallel
 * until a handle is found in both sets or one of the sets is exhausted.
 * The handles are compared with the comparator of the first set.
 *
 * @param set1 first set; its comparator is used for the comparison
 * @param set2 second set
 * @return true if the sets have at least one handle in common
 */
private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
    final Comparator<byte[]> order = set1.comparator();
    final Iterator<byte[]> it1 = set1.iterator();
    final Iterator<byte[]> it2 = set2.iterator();
    if (!it1.hasNext() || !it2.hasNext()) {
        return false;
    }

    byte[] left = it1.next();
    byte[] right = it2.next();
    for (;;) {
        final int c = order.compare(left, right);
        if (c == 0) {
            return true;
        }
        if (c < 0) {
            // handle of set1 is smaller: advance in set1
            if (!it1.hasNext()) return false;
            left = it1.next();
        } else {
            // handle of set2 is smaller: advance in set2
            if (!it2.hasNext()) return false;
            right = it2.next();
        }
    }
}
// ------------------------------------------------------------------------------------------------
// exclude
/*
public static <A, B> TreeMap<A, B> excludeConstructive(final TreeMap<A, B> map, final Set<A> set) {
if (map == null) return null;
if (set == null) return map;
if (map.isEmpty() || set.isEmpty()) return map;
assert !(set instanceof TreeSet) || map.comparator() == ((TreeSet<A>) set).comparator();
// if (map.comparator() != set.comparator()) return excludeConstructiveByTestMapInSet(map, set);
return excludeConstructiveByTestMapInSet(map, set);
// return excludeConstructiveByEnumeration(map, set);
}
private static <A, B> TreeMap<A, B> excludeConstructiveByTestMapInSet(final TreeMap<A, B> map, final Set<A> set) {
final TreeMap<A, B> result = new TreeMap<A, B>(map.comparator());
A o;
for (Entry<A, B> entry: map.entrySet()) {
o = entry.getKey();
if (!(set.contains(o))) result.put(o, entry.getValue());
}
return result;
}
*/
/**
 * Removes from the given map all entries whose key is contained in the given set.
 * The map is modified in place. Nothing happens if the map or the set is null or empty.
 * The smaller of both collections is the one that is iterated.
 *
 * @param <A> key type
 * @param <B> value type
 * @param map the map from which entries are removed
 * @param set the keys to be removed
 */
public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
    if (map == null || set == null) return;
    // if both are sorted, their comparators must be equal
    assert !(map instanceof SortedMap<?,?> && set instanceof SortedSet<?>) || ((SortedMap<A, B>) map).comparator() == ((SortedSet<A>) set).comparator();
    if (map.isEmpty() || set.isEmpty()) return;
    if (map.size() >= set.size()) {
        // the set is not larger than the map: remove each element of the set from the map
        excludeDestructiveByTestSetInMap(map, set);
    } else {
        // the map is smaller: test each key of the map against the set
        excludeDestructiveByTestMapInSet(map, set);
    }
}
/**
 * Removes entries from the map by iterating over the keys of the map and
 * removing each key that is contained in the set.
 *
 * @param map the map from which entries are removed
 * @param set the keys to be removed
 */
private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
    for (final Iterator<A> keys = map.keySet().iterator(); keys.hasNext();) {
        final A key = keys.next();
        if (set.contains(key)) {
            // removal through the key iterator also removes the entry from the map
            keys.remove();
        }
    }
}
/**
 * Removes entries from the map by iterating over the set and
 * removing each of its elements from the map.
 *
 * @param map the map from which entries are removed
 * @param set the keys to be removed
 */
private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
    for (final A key : set) {
        map.remove(key);
    }
}
// and the same again with set-set
/**
 * Removes from the first set all elements which are contained in the second set.
 * The first set is modified in place. Nothing happens if one of the sets is null or empty.
 * The smaller of both sets is the one that is iterated.
 *
 * @param <A> element type
 * @param set1 the set from which elements are removed
 * @param set2 the elements to be removed
 */
public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
    if (set1 == null || set2 == null) return;
    // if both are sorted, their comparators must be equal
    assert !(set1 instanceof SortedSet<?> && set2 instanceof SortedSet<?>) || ((SortedSet<A>) set1).comparator() == ((SortedSet<A>) set2).comparator();
    if (set1.isEmpty() || set2.isEmpty()) return;
    if (set1.size() >= set2.size()) {
        // set2 is not larger than set1: remove each element of set2 from set1
        excludeDestructiveByTestLargeInSmall(set1, set2);
    } else {
        // set1 is smaller: test each element of set1 against set2
        excludeDestructiveByTestSmallInLarge(set1, set2);
    }
}
/**
 * Removes from the small collection all elements which are contained in the large set.
 * The small collection is iterated and modified in place; the large set is only used
 * for the membership test.
 *
 * @param <A> element type
 * @param small the collection from which elements are removed
 * @param large the elements to be removed
 */
public static <A> void excludeDestructiveByTestSmallInLarge(final Collection<A> small, final Set<A> large) {
    for (final Iterator<A> elements = small.iterator(); elements.hasNext();) {
        final A element = elements.next();
        if (large.contains(element)) {
            elements.remove();
        }
    }
}
/**
 * Removes from the large set all elements of the small collection.
 * The small collection is iterated; the large set is modified in place.
 *
 * @param <A> element type
 * @param large the set from which elements are removed
 * @param small the elements to be removed
 */
public static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Collection<A> small) {
    for (final A element : small) {
        large.remove(element);
    }
}
// ------------------------------------------------------------------------------------------------
/**
 * Loads a key-value map from a text file. The file is read as UTF-8.
 * Every line is trimmed; empty lines and lines starting with '#' are ignored, as well as
 * lines where the separator is missing or is located at the very beginning of the line.
 * The part in front of the first separator is the key (trimmed, converted to lower case),
 * the part behind it is the value (trimmed). If a key occurs several times, the last value wins.
 *
 * @param filename path of the file to be loaded
 * @param sep the separator between key and value
 * @return the loaded map. If the file cannot be opened or read, the map contains
 *         the entries which had been read until the error occurred (possibly none).
 */
public static SortedMap<String, String> loadMap(final String filename, final String sep) {
    final SortedMap<String, String> map = new TreeMap<String, String>();
    BufferedReader br = null;
    try {
        // decode explicitly as UTF-8; without a charset the platform default would be used,
        // which makes the result for non-ASCII content depend on the machine
        br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty() || line.charAt(0) == '#') continue; // blank or comment line
            final int pos = line.indexOf(sep);
            if (pos <= 0) continue; // no separator or empty key
            map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim());
        }
    } catch (final IOException e) {
        // best effort: a missing or unreadable file results in an empty or partial map
    } finally {
        if (br != null) try { br.close(); } catch (final Exception e) {}
    }
    return map;
}
/**
 * Loads a map from a text file where a key may occur several times. The file is read as UTF-8.
 * Every line is trimmed; empty lines and lines starting with '#' are ignored, as well as
 * lines where the separator is missing or is located at the very beginning of the line.
 * The part in front of the first separator is the key (trimmed, converted to lower case),
 * the part behind it is the value (trimmed). All values of a key are collected in a list,
 * in the order of their appearance in the file.
 *
 * @param filename path of the file to be loaded
 * @param sep the separator between key and value
 * @return the loaded map. If the file cannot be opened or read, the map contains
 *         the entries which had been read until the error occurred (possibly none).
 */
public static SortedMap<String, List<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
    final SortedMap<String, List<String>> map = new TreeMap<String, List<String>>();
    BufferedReader br = null;
    try {
        // decode explicitly as UTF-8; without a charset the platform default would be used,
        // which makes the result for non-ASCII content depend on the machine
        br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty() || line.charAt(0) == '#') continue; // blank or comment line
            final int pos = line.indexOf(sep);
            if (pos <= 0) continue; // no separator or empty key
            final String key = line.substring(0, pos).trim().toLowerCase();
            final String value = line.substring(pos + sep.length()).trim();
            List<String> values = map.get(key);
            if (values == null) {
                // first occurrence of this key
                values = new ArrayList<String>();
                map.put(key, values);
            }
            values.add(value);
        }
    } catch (final IOException e) {
        // best effort: a missing or unreadable file results in an empty or partial map
    } finally {
        if (br != null) try { br.close(); } catch (final Exception e) {}
    }
    return map;
}
/**
 * Loads a list of words from a text file, one word per line. The file is read as UTF-8.
 * Everything from the first '|' of a line on is treated as comment and is cut off
 * (the Solr stopword file syntax allows '#' and '|' as comment markers).
 * The remaining text is trimmed; empty lines and lines starting with '#' are ignored.
 * All words are converted to lower case.
 *
 * @param file the file to be loaded
 * @param c the comparator for the resulting set
 * @return the loaded words. The set is empty if the file does not exist. If the file cannot be
 *         read completely, the set contains the words which had been read until the error occurred.
 */
public static SortedSet<String> loadList(final File file, final Comparator<String> c) {
    final SortedSet<String> list = new TreeSet<String>(c);
    if (!(file.exists())) return list;

    BufferedReader br = null;
    try {
        // decode explicitly as UTF-8; without a charset the platform default would be used,
        // which makes the result for non-ASCII content depend on the machine
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            // cut off the comment: keep exactly the text in front of the '|',
            // this also covers a '|' at the very beginning of the line
            final int i = line.indexOf('|');
            if (i >= 0) line = line.substring(0, i);
            line = line.trim();
            if (!line.isEmpty() && line.charAt(0) != '#') list.add(line.toLowerCase());
        }
    } catch (final IOException e) {
        // best effort: an unreadable file results in an empty or partial list
    } finally {
        if (br != null) try { br.close(); } catch (final Exception e) {}
    }
    return list;
}
/**
 * Concatenates all handles of a handle set to a single string.
 * Each handle is converted with UTF8.String; the handles are divided by the separator.
 *
 * @param set the handles to be concatenated
 * @param separator the character that is placed between two handles
 * @return the concatenated handles; an empty string if the set has no handles
 */
public static String setToString(final HandleSet set, final char separator) {
    final Iterator<byte[]> handles = set.iterator();
    final StringBuilder joined = new StringBuilder(set.size() * 7);
    boolean first = true;
    while (handles.hasNext()) {
        // the separator is written in front of every handle except the first one
        if (first) {
            first = false;
        } else {
            joined.append(separator);
        }
        joined.append(UTF8.String(handles.next()));
    }
    return joined.toString();
}
/**
 * Concatenates all elements of a string set to a single string,
 * in the iteration order of the set and divided by the separator.
 *
 * @param set the strings to be concatenated
 * @param separator the character that is placed between two elements
 * @return the concatenated elements; an empty string if the set is empty
 */
public static String setToString(final Set<String> set, final char separator) {
    final StringBuilder joined = new StringBuilder(set.size() * 7);
    boolean first = true;
    for (final String element : set) {
        // the separator is written in front of every element except the first one
        if (first) {
            first = false;
        } else {
            joined.append(separator);
        }
        joined.append(element);
    }
    return joined.toString();
}
/**
 * Gets the element at a given position of the iteration order of a collection.
 *
 * @param c the collection
 * @param n the position of the wanted element, starting with 0
 * @return the n-th element; null if the collection is null, has no more than n elements,
 *         or if n is negative
 */
public static Object nth(Collection<?> c, int n) {
    if (c == null || n >= c.size()) return null;
    final Iterator<?> elements = c.iterator();
    for (int index = 0; elements.hasNext(); index++) {
        final Object element = elements.next();
        if (index == n) {
            return element;
        }
    }
    return null;
}
// ------------------------------------------------------------------------------------------------
/**
 * Small demonstration of the map join methods: two maps with single-letter entries are
 * created and the results of the different join variants are printed to standard out.
 *
 * @param args not used
 */
public static void main(final String[] args) {
    final SortedMap<String, String> m = new TreeMap<String, String>();
    final SortedMap<String, String> s = new TreeMap<String, String>();

    // every entry maps a key to itself
    final String[] keysOfM = {"a", "x", "f", "h", "w", "7", "t", "k", "y", "z"};
    final String[] keysOfS = {"a", "b", "c", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "x"};
    for (final String key : keysOfM) {
        m.put(key, key);
    }
    for (final String key : keysOfS) {
        s.put(key, key);
    }

    System.out.println("Compare " + m.toString() + " with " + s.toString());
    System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true));
    System.out.println("Join=" + joinConstructiveByTest(m, s, true));
    System.out.println("Join=" + joinConstructiveByTest(m, s, true));
    System.out.println("Join=" + joinConstructive(m, s, true));
    //System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet()));

    /*
    for (int low = 0; low < 10; low++)
    for (int high = 0; high < 100; high=high + 10) {
    int stepsEnum = 10 * high;
    int stepsTest = 12 * log2(high) * low;
    System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest"));
    }
    */
}
}