yacy_search_server/source/de/anomic/plasma/plasmaCondenser.java
orbiter ad1e4aa88e added selection of audio, video, image and application resources
to the search procedure. This function currently cannot be used through the
search interface, but only through remote search.

added accumulation of search attributes to enable the audio, video,
image and application selection.

fixed a problem with external URL representation generation


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3036 6c8d7289-2bf4-0310-a012-ef5d649a1542
2006-12-01 16:21:17 +00:00

815 lines
33 KiB
Java

// plasmaCondenser.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last change: 09.01.2004
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
// execute with java -cp source de.anomic.plasma.plasmaCondenser
package de.anomic.plasma;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
public final class plasmaCondenser {
// this is the page analysis class: it splits a text into words and sentences
// and collects statistics and category flags about them

// category flags that show how the page can be distinguished in different interest groups
// (each constant is a bit index into RESULT_FLAGS)
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_opencontent = 1; // open source, any free stuff
public static final int flag_cat_business = 2; // web shops, marketing, trade
public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
public static final int flag_cat_health = 4; // health
public static final int flag_cat_sport = 5; // any sport, cars etc.
public static final int flag_cat_lifestyle = 6; // travel, lifestyle
public static final int flag_cat_politics = 7; // politics
public static final int flag_cat_news = 8; // blogs, news pages
public static final int flag_cat_children = 9; // toys, childrens education, help for parents
public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems
public static final int flag_cat_p2p = 13; // p2p support, filesharing archives etc.
public static final int flag_cat_sex = 14; // sexual content
public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting
public static final int flag_cat_linux = 16; // pages about linux software
public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
public static final int flag_cat_windows = 18; // pages about windows os and software
public static final int flag_cat_osreserve = 19; // reserve
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
// width (in characters) of the zero-padded handles and counters produced with intString()
private final static int numlength = 5;
//private Properties analysis;
private TreeMap words; // a string (the words) to (wordStatProp) - relation
// sentence key (StringBuffer: termination symbol followed by zero-padded word handles) to (phraseStatProp) - relation
private HashMap sentences;
private int wordminsize; // minimum word length handed to the word sieve
private int wordcut; // maximum number of trailing characters cut off when looking for similar words
//public int RESULT_NUMB_TEXT_BYTES = -1;
// the following results are -1 until createCondensement() has stored them
public int RESULT_NUMB_WORDS = -1; // number of all counted words
public int RESULT_DIFF_WORDS = -1; // number of word handles that were handed out
public int RESULT_SIMI_WORDS = -1; // number of words left after similar words were merged
public int RESULT_WORD_ENTROPHY = -1; // 255 * remaining words / all words; 0 if there are no words
public int RESULT_NUMB_SENTENCES = -1; // number of all counted sentences
public int RESULT_DIFF_SENTENCES = -1; // number of sentence handles that were handed out
public int RESULT_SIMI_SENTENCES = -1; // number of entries in the rebuilt sentence map
// bit field addressed with the flag_cat_* constants; NOTE(review): the argument 4 is presumably a size in bytes - confirm in kelondroBitfield
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
/**
 * Condenses the text of a parsed document and afterwards sets the media flags
 * (image, audio, video, application) for every link collection of the document
 * that is not empty.
 *
 * @param document the parsed document; supplies text, charset and link collections
 * @throws UnsupportedEncodingException if the charset of the document is not supported
 */
public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
    this(document.getText(), document.getCharset());
    if (document.getImages().size() > 0) {
        this.RESULT_FLAGS.set(flag_cat_hasimage, true);
    }
    if (document.getAudiolinks().size() > 0) {
        this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
    }
    if (document.getVideolinks().size() > 0) {
        this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
    }
    if (document.getApplinks().size() > 0) {
        this.RESULT_FLAGS.set(flag_cat_hasapp, true);
    }
}

/**
 * Condenses a text stream with the default settings: minimum word size 3, word cut 2.
 *
 * @param text    the text to analyse
 * @param charset name of the charset of the stream
 * @throws UnsupportedEncodingException if the charset is not supported
 */
public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
    this(text, charset, 3, 2);
}

/**
 * Condenses a text stream. The analysis runs inside the constructor.
 *
 * @param text        the text to analyse
 * @param charset     name of the charset of the stream
 * @param wordminsize minimum word length handed to the word sieve
 * @param wordcut     maximum number of trailing characters cut off when similar words are searched
 * @throws UnsupportedEncodingException if the charset is not supported
 */
public plasmaCondenser(InputStream text, String charset, int wordminsize, int wordcut) throws UnsupportedEncodingException {
    // analysis = new Properties();
    this.words = new TreeMap();
    this.sentences = new HashMap();
    this.wordcut = wordcut;
    this.wordminsize = wordminsize;
    this.createCondensement(text, charset);
}
/**
 * Creates the hash of a word: the word is lower-cased, its raw MD5 is encoded with the
 * enhanced base64 coder and the result is cut to {@code yacySeedDB.commonHashLength} characters.
 *
 * The lower-casing uses a fixed locale. With the default locale the result depends on
 * the machine settings (a Turkish locale maps 'I' to a dotless 'ı'), so the same word
 * could get different hashes on different peers.
 *
 * @param word the word to hash; must not be null
 * @return the word hash
 */
public static final String word2hash(String word) {
    return kelondroBase64Order.enhancedCoder.encode(
            serverCodings.encodeMD5Raw(word.toLowerCase(java.util.Locale.ENGLISH)))
            .substring(0, yacySeedDB.commonHashLength);
}
/**
 * Hashes every word of the array with word2hash and collects the hashes in a sorted set.
 *
 * @param words the words to hash
 * @return a TreeSet with the word hashes
 */
public static final Set words2hashSet(String[] words) {
    final Set result = new TreeSet();
    int idx = 0;
    while (idx < words.length) {
        result.add(word2hash(words[idx]));
        idx++;
    }
    return result;
}
/**
 * Hashes every word of the array with word2hash and concatenates the hashes
 * in array order without a separator.
 *
 * @param words the words to hash
 * @return the concatenated word hashes
 */
public static final String words2hashString(String[] words) {
    final StringBuffer joined = new StringBuffer();
    int idx = 0;
    while (idx < words.length) {
        joined.append(word2hash(words[idx]));
        idx++;
    }
    return joined.toString();
}
/**
 * Hashes every word of the set with word2hash and collects the hashes in a sorted set.
 *
 * @param words a set of String objects
 * @return a TreeSet with the word hashes
 */
public static final Set words2hashes(Set words) {
    final Set result = new TreeSet();
    for (Iterator wordIter = words.iterator(); wordIter.hasNext();) {
        final String word = (String) wordIter.next();
        result.add(word2hash(word));
    }
    return result;
}
/**
 * Replaces the word list by the result of
 * kelondroMSetTools.excludeConstructive(words, stopwords).
 *
 * @param stopwords the words to take out of the word list
 * @return the number of entries by which the word list has shrunk
 */
public int excludeWords(TreeSet stopwords) {
    final int sizeBefore = words.size();
    words = kelondroMSetTools.excludeConstructive(words, stopwords);
    final int sizeAfter = words.size();
    return sizeBefore - sizeAfter;
}
/**
 * Gives access to the word list.
 *
 * @return an iterator over Map.Entry objects; the key is a String (the word),
 *         the value is the wordStatProp object of that word
 */
public Iterator words() {
    final Set entries = words.entrySet();
    return entries.iterator();
}
/**
 * Statistics record for one word.
 */
public static class wordStatProp {

    /** number of occurrences; starts with 1 */
    public int count;
    /** unique handle, is initialized with word position (excluding double occurring words) */
    public int posInText;
    /** the 'pip' value given to the constructor */
    public int posInPhrase;
    /** the 'nop' value given to the constructor */
    public int numOfPhrase;
    /** the decimal strings of all numbers passed to check(int) */
    public HashSet hash;

    /**
     * @param handle stored as posInText
     * @param pip    stored as posInPhrase
     * @param nop    stored as numOfPhrase
     */
    public wordStatProp(int handle, int pip, int nop) {
        this.hash = new HashSet();
        this.numOfPhrase = nop;
        this.posInPhrase = pip;
        this.posInText = handle;
        this.count = 1;
    }

    /** Counts one more occurrence. */
    public void inc() {
        this.count += 1;
    }

    /**
     * Remembers the number i as a decimal string in the hash set.
     *
     * @param i the number to remember
     */
    public void check(int i) {
        final String mark = String.valueOf(i);
        this.hash.add(mark);
    }
}
/**
 * Statistics record for one sentence.
 */
public static class phraseStatProp {

    /** number of occurrences; starts with 1 */
    public int count;
    /** unique handle, is initialized with sentence counter */
    public int handle;
    /** the decimal strings of all numbers passed to check(int) */
    public HashSet hash;

    /**
     * @param handle the handle of the sentence
     */
    public phraseStatProp(int handle) {
        this.hash = new HashSet();
        this.handle = handle;
        this.count = 1;
    }

    /** Counts one more occurrence. */
    public void inc() {
        this.count += 1;
    }

    /**
     * Remembers the number i as a decimal string in the hash set.
     *
     * @param i the number to remember
     */
    public void check(int i) {
        final String mark = String.valueOf(i);
        this.hash.add(mark);
    }
}
/**
 * Formats a number as a decimal string that is left-padded with '0' up to the given length.
 * A string that already has at least that length is returned unchanged.
 *
 * @param number the number to format
 * @param length the minimum length of the result
 * @return the padded decimal string
 */
public String intString(int number, int length) {
    final String digits = Integer.toString(number);
    final int missing = length - digits.length();
    if (missing <= 0) {
        return digits;
    }
    final StringBuffer padded = new StringBuffer(length);
    for (int i = 0; i < missing; i++) {
        padded.append('0');
    }
    return padded.append(digits).toString();
}
/**
 * Reads all words from the stream and fills the word list, the sentence map and the
 * RESULT_* statistics.
 *
 * The work is done in four steps:
 * 1. every word gets a handle; sentences are stored as the termination symbol followed
 *    by the zero-padded handles of their words,
 * 2. the sentences are unpacked into arrays that are ordered by the sentence handle,
 * 3. words that are equal to a shorter word after cutting at most 'wordcut' trailing
 *    characters are merged into the shorter word, and the sentences are repointed,
 * 4. the sentence map is rebuilt from the repointed sentences.
 *
 * @param is      the text to analyse
 * @param charset name of the charset of the stream
 * @throws UnsupportedEncodingException if the charset is not supported
 */
private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {
    words = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
    sentences = new HashMap();
    HashSet currsentwords = new HashSet();
    StringBuffer sentence = new StringBuffer(100);
    String word = "";
    String k;
    int wordlen;
    wordStatProp wsp, wsp1;
    phraseStatProp psp;
    int wordHandle;
    int wordHandleCount = 0;
    int sentenceHandleCount = 0;
    int allwordcounter = 0;
    int allsentencecounter = 0;
    int idx;
    int wordInSentenceCounter = 1;
    Iterator it, it1;
    boolean comb_indexof = false, last_last = false, last_index = false;

    // read source
    sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
    while (wordenum.hasMoreElements()) {
        // lower-case with a fixed locale: the default locale would make the result
        // depend on the machine settings (e.g. 'I' becomes a dotless 'ı' in Turkish)
        word = ((String) wordenum.nextElement()).toLowerCase(java.util.Locale.ENGLISH);
        //System.out.println("PARSED-WORD " + word);

        // distinguish punctuation and words
        wordlen = word.length();
        if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
            // store sentence
            if (sentence.length() > 0) {
                // we store the punctuation symbol as first element of the sentence vector
                allsentencecounter++;
                sentence.insert(0, word); // append at beginning
                // NOTE(review): the key is a StringBuffer, which does not override
                // equals/hashCode, so this lookup only matches the identical object
                if (sentences.containsKey(sentence)) {
                    // sentence already exists
                    psp = (phraseStatProp) sentences.get(sentence);
                    psp.inc();
                    idx = psp.handle;
                    sentences.put(sentence, psp);
                } else {
                    // create new sentence
                    idx = sentenceHandleCount++;
                    sentences.put(sentence, new phraseStatProp(idx));
                }
                // store to the words a link to this sentence
                it = currsentwords.iterator();
                while (it.hasNext()) {
                    k = (String) it.next();
                    wsp = (wordStatProp) words.get(k);
                    wsp.check(idx);
                    words.put(k, wsp);
                }
            }
            sentence = new StringBuffer(100);
            currsentwords.clear();
            wordInSentenceCounter = 1;
        } else {
            // check index.of detection
            if ((last_last) && (comb_indexof) && (word.equals("modified"))) {
                this.RESULT_FLAGS.set(flag_cat_indexof, true);
                wordenum.pre(true); // parse lines as they come with CRLF
            }
            if ((last_index) && (word.equals("of"))) comb_indexof = true;
            last_last = word.equals("last");
            last_index = word.equals("index");

            // store word
            allwordcounter++;
            currsentwords.add(word);
            if (words.containsKey(word)) {
                // word already exists
                wsp = (wordStatProp) words.get(word);
                wordHandle = wsp.posInText;
                wsp.inc();
            } else {
                // word does not yet exist, create new word entry
                wordHandle = wordHandleCount++;
                wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
            }
            words.put(word, wsp);
            // we now have the unique handle of the word, put it into the sentence:
            sentence.append(intString(wordHandle, numlength));
            wordInSentenceCounter++;
        }
    }

    // finish last sentence (text that ends without a termination symbol)
    if (sentence.length() > 0) {
        allsentencecounter++;
        sentence.insert(0, "."); // append at beginning
        if (sentences.containsKey(sentence)) {
            psp = (phraseStatProp) sentences.get(sentence);
            psp.inc();
            idx = psp.handle;
            sentences.put(sentence, psp);
        } else {
            idx = sentenceHandleCount++;
            sentences.put(sentence, new phraseStatProp(idx));
        }
        // the words of the last sentence need the link to their sentence as well;
        // without it the merge of similar words below does not repoint this sentence
        // and it keeps the handle of a word that was removed from the word list
        it = currsentwords.iterator();
        while (it.hasNext()) {
            wsp = (wordStatProp) words.get(it.next());
            wsp.check(idx);
        }
    }

    // -------------------
    // we reconstruct the sentence hashtable
    // and order the entries by the number of the sentence
    // this structure is needed to replace double occurring words in sentences
    // each entry is a String[]: [0] = occurrence counter, [1] = termination symbol, [2..] = word handles
    Object[] orderedSentences = new Object[sentenceHandleCount];
    String[] s;
    int wc;
    Object o;
    it = sentences.keySet().iterator();
    while (it.hasNext()) {
        o = it.next();
        if (o != null) {
            sentence = (StringBuffer) o;
            wc = (sentence.length() - 1) / numlength;
            s = new String[wc + 2];
            psp = (phraseStatProp) sentences.get(sentence);
            s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
            s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
            for (int i = 0; i < wc; i++) {
                k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
                s[i + 2] = k;
            }
            orderedSentences[psp.handle] = s;
        }
    }

    Map.Entry entry;
    // we search for similar words and reorganize the corresponding sentences
    // a word is similar, if a shortened version is equal
    it = words.entrySet().iterator(); // a TreeMap enumerates the keys in ascending order
    wordsearch: while (it.hasNext()) {
        entry = (Map.Entry) it.next();
        word = (String) entry.getKey();
        wordlen = word.length();
        wsp = (wordStatProp) entry.getValue();
        for (int i = wordcut; i > 0; i--) {
            if (wordlen > i) {
                k = word.substring(0, wordlen - i);
                if (words.containsKey(k)) {
                    // we will delete the word 'word' and repoint the
                    // corresponding links
                    // in sentences that use this word
                    wsp1 = (wordStatProp) words.get(k);
                    it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
                    while (it1.hasNext()) {
                        idx = Integer.parseInt((String) it1.next()); // number of a sentence
                        s = (String[]) orderedSentences[idx];
                        for (int j = 2; j < s.length; j++) {
                            if (s[j].equals(intString(wsp.posInText, numlength)))
                                s[j] = intString(wsp1.posInText, numlength);
                        }
                        orderedSentences[idx] = s;
                    }
                    // update word counter
                    wsp1.count = wsp1.count + wsp.count;
                    words.put(k, wsp1);
                    // remove current word
                    it.remove();
                    continue wordsearch;
                }
            }
        }
    }

    // depending on the orderedSentences structure, we rebuild the sentence
    // HashMap to eliminate double occurring sentences
    // NOTE(review): as above, the StringBuffer keys are compared by identity, so the
    // 'already contained' branch is not taken for a freshly created buffer
    sentences = new HashMap();
    int le;
    for (int i = 0; i < orderedSentences.length; i++) {
        le = ((String[]) orderedSentences[i]).length;
        sentence = new StringBuffer(le * 10);
        for (int j = 1; j < le; j++)
            sentence.append(((String[]) orderedSentences[i])[j]);
        if (sentences.containsKey(sentence)) {
            // add sentence counter to counter of found sentence
            psp = (phraseStatProp) sentences.get(sentence);
            psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
            sentences.put(sentence, psp);
            // System.out.println("Found double occurring sentence " + i + "
            // = " + sp.handle);
        } else {
            // create new sentence entry
            psp = new phraseStatProp(i);
            psp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
            sentences.put(sentence, psp);
        }
    }

    // store result
    //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
    this.RESULT_NUMB_WORDS = allwordcounter;
    this.RESULT_DIFF_WORDS = wordHandleCount;
    this.RESULT_SIMI_WORDS = words.size();
    this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
    this.RESULT_NUMB_SENTENCES = allsentencecounter;
    this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
    this.RESULT_SIMI_SENTENCES = sentences.size();
}
/**
 * Prints a reconstruction of the text to System.out: every sentence that is not null
 * is written as "#T", the zero-padded sentence number and the sentence, without a line break.
 */
public void print() {
    final String[] lines = sentences();
    int i = 0;
    while (i < lines.length) {
        final String line = lines[i];
        if (line != null) {
            System.out.print("#T " + intString(i, numlength) + " " + line);
        }
        i++;
    }
}
/**
 * Reconstructs the text from the word list and the sentence map.
 *
 * The words are looked up by their zero-padded handle in a map. The former lookup array
 * of size words.size() + 99 was too small as soon as more than 99 handles belonged to
 * words that were merged or excluded, which ended in an ArrayIndexOutOfBoundsException.
 *
 * @return one string per sentence handle: the words separated by single blanks, followed
 *         by the termination symbol; an empty string for a handle without sentence.
 *         A handle whose word is no longer in the word list is written as "null",
 *         as before.
 */
public String[] sentences() {
    // we reconstruct the word hashtable: zero-padded handle -> word
    // this structure is only needed to reconstruct the text
    final HashMap wordByHandle = new HashMap();
    Map.Entry entry;
    wordStatProp wsp;
    final Iterator it = words.entrySet().iterator();
    while (it.hasNext()) {
        entry = (Map.Entry) it.next();
        wsp = (wordStatProp) entry.getValue();
        // the sentences carry the handles in exactly this format
        wordByHandle.put(intString(wsp.posInText, numlength), entry.getKey());
    }

    final Object[] orderedSentences = makeOrderedSentences();

    // create a reconstruction of the text
    final String[] result = new String[orderedSentences.length];
    String[] tokens;
    StringBuffer text;
    for (int i = 0; i < orderedSentences.length; i++) {
        tokens = (String[]) orderedSentences[i];
        if (tokens == null) {
            result[i] = "";
            continue;
        }
        // a buffer instead of repeated string concatenation
        text = new StringBuffer(tokens.length * 8);
        for (int j = 2; j < tokens.length; j++) {
            // appending a null String writes "null", like the concatenation did
            text.append(' ').append((String) wordByHandle.get(tokens[j]));
        }
        text.append(tokens[1]); // the termination symbol
        // drop the leading blank
        result[i] = (text.length() > 1) ? text.substring(1) : text.toString();
    }
    return result;
}
/**
 * Unpacks the sentence map into an array that is ordered by the sentence handle.
 * This structure is needed to present the sentences in the right order in a printout.
 *
 * @return an array with one String[] per sentence at the index of its handle:
 *         [0] = zero-padded occurrence counter, [1] = termination symbol,
 *         [2..] = zero-padded word handles; positions without sentence are null
 */
private Object[] makeOrderedSentences() {
    // a new array is filled with null, so no explicit initialization is needed
    final Object[] ordered = new Object[sentences.size()];
    for (Iterator entries = sentences.entrySet().iterator(); entries.hasNext();) {
        final Map.Entry entry = (Map.Entry) entries.next();
        final StringBuffer key = (StringBuffer) entry.getKey();
        final int wordCount = (key.length() - 1) / numlength;
        final String[] row = new String[wordCount + 2];
        final phraseStatProp stat = (phraseStatProp) entry.getValue();
        row[0] = intString(stat.count, numlength); // number of occurrences of this sentence
        row[1] = key.substring(0, 1); // the termination symbol of this sentence
        int from = 1;
        for (int w = 0; w < wordCount; w++) {
            row[w + 2] = key.substring(from, from + numlength);
            from += numlength;
        }
        ordered[stat.handle] = row;
    }
    return ordered;
}
/**
 * Writes the word list and the sentences to a file.
 *
 * The file starts with an empty line. Then follows one "#W" line per word with the
 * word handle, the occurrence counter and the word, sorted by counter and handle.
 * Then follows one "#S" line per sentence with the sentence number, the occurrence
 * counter, the termination symbol and the word handles.
 *
 * @param out the file to write
 * @throws IOException if the file cannot be opened or written
 */
public void writeMapToFile(File out) throws IOException {
    Map.Entry entry;
    String k;
    String word;
    Iterator it;
    wordStatProp wsp;
    Object[] orderedSentences = makeOrderedSentences();

    // we reconstruct the word hashtable
    // and sort the entries by the number of occurrences
    // this structure is needed to print out a sorted list of words
    TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
    it = words.entrySet().iterator(); // enumerates the keys in ascending order
    while (it.hasNext()) {
        entry = (Map.Entry) it.next();
        word = (String) entry.getKey();
        wsp = (wordStatProp) entry.getValue();
        // key = counter followed by handle, both zero-padded
        sortedWords.put(intString(wsp.count, numlength) + intString(wsp.posInText, numlength), word);
    }

    // start writing of words and sentences
    FileWriter writer = new FileWriter(out);
    try {
        writer.write("\r\n");
        it = sortedWords.entrySet().iterator(); // enumerates the keys in ascending order
        while (it.hasNext()) {
            entry = (Map.Entry) it.next();
            k = (String) entry.getKey();
            // handle first, then counter, then the word
            writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n");
        }
        for (int i = 0; i < orderedSentences.length; i++) {
            if (orderedSentences[i] != null) {
                writer.write("#S " + intString(i, numlength) + " ");
                for (int j = 0; j < ((String[]) orderedSentences[i]).length; j++) {
                    writer.write(((String[]) orderedSentences[i])[j] + " ");
                }
                writer.write("\r\n");
            }
        }
    } finally {
        // release the file handle also when a write fails
        writer.close();
    }
}
/**
 * Tells whether a character is treated as invisible: every character below the
 * blank, every character above 'z' and every character of the symbol list below.
 *
 * @param c the character to check
 * @return true if the character is treated as invisible
 */
public final static boolean invisible(char c) {
    // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
    if (c < ' ') {
        return true;
    }
    if (c > 'z') {
        return true;
    }
    final String symbols = "$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\";
    return symbols.indexOf(c) != -1;
}
/**
 * Creates an enumeration of the sieved words of a string.
 *
 * The string is turned into bytes with the same charset that is used to read the
 * bytes again. Encoding with the platform charset and decoding with 'charset'
 * garbles every character that the two charsets encode differently.
 *
 * @param s         the text to tokenize
 * @param charset   name of the charset for the round trip; null means the platform charset
 * @param minLength minimum word length handed to the word sieve
 * @return the word enumeration, or null if it cannot be created
 *         (for example because the charset is not supported)
 */
public static Enumeration wordTokenizer(String s, String charset, int minLength) {
    try {
        final byte[] raw = (charset == null) ? s.getBytes() : s.getBytes(charset);
        return new sievedWordsEnum(new ByteArrayInputStream(raw), charset, minLength);
    } catch (Exception e) {
        return null;
    }
}
/**
 * This enumeration removes all words that contain either wrong characters or are too short.
 * Single punctuation characters pass the sieve, and so does the word "of" regardless of
 * the minimum length. The enumeration always holds the next element in a look-ahead buffer.
 */
public static class sievedWordsEnum implements Enumeration {

    Object buffer = null; // the look-ahead element; null when the enumeration is exhausted
    unsievedWordsEnum e;  // the source of the unfiltered words
    int ml;               // minimum word length

    /**
     * @param is        the text to read
     * @param charset   name of the charset of the stream
     * @param minLength words shorter than this are dropped (except "of")
     * @throws UnsupportedEncodingException if the charset is not supported
     */
    public sievedWordsEnum(InputStream is, String charset, int minLength) throws UnsupportedEncodingException {
        // the minimum length must be known before the first look-ahead; when it was
        // assigned afterwards the first word was sieved with a minimum length of 0
        ml = minLength;
        e = new unsievedWordsEnum(is, charset);
        buffer = nextElement0();
    }

    /** Passes the flag on to the underlying enumeration. */
    public void pre(boolean x) {
        e.pre(x);
    }

    /**
     * Fetches the next word that passes the sieve.
     *
     * @return the next word or punctuation character, or null if there is none left
     */
    private Object nextElement0() {
        String s;
        char c;
        loop: while (e.hasMoreElements()) {
            s = (String) e.nextElement();
            // punctuation is always passed on
            if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
            // too short
            if ((s.length() < ml) && (!(s.equals("of")))) continue loop;
            // only the characters a-z, A-Z and 0-9 are accepted
            for (int i = 0; i < s.length(); i++) {
                c = s.charAt(i);
                // TODO: Bugfix needed for UTF-8
                if (((c < 'a') || (c > 'z')) &&
                    ((c < 'A') || (c > 'Z')) &&
                    ((c < '0') || (c > '9')))
                    continue loop; // go to next while loop
            }
            return s;
        }
        return null;
    }

    public boolean hasMoreElements() {
        return buffer != null;
    }

    /**
     * @return the next element; null if the enumeration is exhausted
     */
    public Object nextElement() {
        Object r = buffer;
        buffer = nextElement0();
        return r;
    }
}
/**
 * Enumerates all tokens of a text without filtering: every sentence of the underlying
 * sentence enumeration is split at blanks; punctuation characters become tokens of
 * their own. The enumeration always holds the next element in a look-ahead buffer.
 */
private static class unsievedWordsEnum implements Enumeration {

    Object buffer = null;           // the look-ahead element; null when exhausted
    sentencesFromInputStreamEnum e; // the source of the sentences
    String s;                       // the not yet consumed rest of the current sentence

    /**
     * @param is      the text to read
     * @param charset name of the charset of the stream
     * @throws UnsupportedEncodingException if the charset is not supported
     */
    public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
        this.e = new sentencesFromInputStreamEnum(is, charset);
        this.s = "";
        this.buffer = nextElement0();
    }

    /** Passes the flag on to the underlying enumeration. */
    public void pre(boolean x) {
        e.pre(x);
    }

    /**
     * Replaces invisible characters by blanks and puts blanks around punctuation.
     *
     * @param line a trimmed sentence
     * @return the prepared and trimmed sentence
     */
    private static String spaceOut(String line) {
        final StringBuffer prepared = new StringBuffer(line.length() * 2);
        final int end = line.length();
        for (int pos = 0; pos < end; pos++) {
            final char c = line.charAt(pos);
            if (invisible(c)) {
                prepared.append(' '); // TODO: Bugfix needed for UTF-8
            } else if (htmlFilterContentScraper.punctuation(c)) {
                prepared.append(' ').append(c).append(' ');
            } else {
                prepared.append(c);
            }
        }
        return prepared.toString().trim();
    }

    /**
     * Cuts the next token off the current sentence; fetches new sentences as long as
     * the current one is used up.
     *
     * @return the next token, or null if there is none left
     */
    private Object nextElement0() {
        while (s.length() == 0) {
            if (!e.hasMoreElements()) {
                return null;
            }
            final String raw = (String) e.nextElement();
            if (raw == null) {
                return null;
            }
            s = spaceOut(raw.trim());
            //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
        }
        final int blank = s.indexOf(" ");
        final String token;
        if (blank < 0) {
            // the rest of the sentence is the last token
            token = s;
            s = "";
        } else {
            token = s.substring(0, blank);
            s = s.substring(blank + 1).trim();
        }
        return token;
    }

    public boolean hasMoreElements() {
        return buffer != null;
    }

    /**
     * @return the next element; null if the enumeration is exhausted
     */
    public Object nextElement() {
        final Object current = buffer;
        buffer = nextElement0();
        return current;
    }
}
/**
 * Creates a sentence enumeration for a stream.
 *
 * @param is      the text to read
 * @param charset name of the charset of the stream
 * @return the sentence enumeration, or null if the charset is not supported
 */
public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
    sentencesFromInputStreamEnum result;
    try {
        result = new sentencesFromInputStreamEnum(is, charset);
    } catch (UnsupportedEncodingException e) {
        result = null;
    }
    return result;
}
/**
 * Reads sentences from a given input stream; this enumerates String objects.
 * The enumeration always holds the next sentence in a look-ahead buffer and closes
 * the reader when the end of the stream is reached or reading fails.
 */
public static class sentencesFromInputStreamEnum implements Enumeration {

    Object buffer = null; // the look-ahead sentence; null when exhausted
    BufferedReader raf;   // the reader on the stream
    int counter = 0;      // sum of (length + 1) of all sentences that were handed out
    boolean pre = false;  // passed to readSentence

    /**
     * @param is      the text to read
     * @param charset name of the charset of the stream; null means the platform charset
     * @throws UnsupportedEncodingException if the charset is not supported
     */
    public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
        final InputStreamReader decoder;
        if (charset == null) {
            decoder = new InputStreamReader(is);
        } else {
            decoder = new InputStreamReader(is, charset);
        }
        this.raf = new BufferedReader(decoder);
        this.buffer = nextElement0();
        this.counter = 0;
        this.pre = false;
    }

    /**
     * Sets the flag that is passed to readSentence for all following reads.
     */
    public void pre(boolean x) {
        this.pre = x;
    }

    /**
     * Reads the next sentence.
     *
     * @return the next sentence, or null at the end of the stream or after a read error
     */
    private Object nextElement0() {
        String s;
        try {
            s = readSentence(raf, pre);
            //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
            if (s == null) {
                raf.close();
            }
        } catch (IOException e) {
            try {
                raf.close();
            } catch (Exception ee) {
            }
            return null;
        }
        return s;
    }

    public boolean hasMoreElements() {
        return buffer != null;
    }

    /**
     * @return the next sentence; null if the enumeration is exhausted
     */
    public Object nextElement() {
        if (buffer == null) {
            return null;
        }
        final Object current = buffer;
        counter += ((String) current).length() + 1;
        buffer = nextElement0();
        return current;
    }

    /**
     * @return the sum of (length + 1) of all sentences that were handed out so far
     */
    public int count() {
        return counter;
    }
}
/**
 * Reads one sentence from the reader.
 *
 * The sentence ends after the first line-end character if 'pre' is set, otherwise
 * after the first punctuation character (as decided by
 * htmlFilterContentScraper.punctuation), or at the end of the stream. The terminating
 * character is part of the result. Line ends, tabs and backspace characters are
 * replaced by blanks and runs of blanks are reduced to a single blank.
 *
 * Fixes: the tab character (9) is now replaced as the original comment promised;
 * before, only the backspace (8) was tested. Double blanks are now collapsed; the
 * former loop searched for a single blank and therefore deleted every blank.
 *
 * @param reader the source of the characters
 * @param pre    true to read line by line, false to read up to the next punctuation
 * @return the sentence, or null if the stream is at its end
 * @throws IOException if reading fails
 */
static String readSentence(Reader reader, boolean pre) throws IOException {
    final StringBuffer s = new StringBuffer();
    int nextChar;
    char c;

    // find sentence end
    for (;;) {
        nextChar = reader.read();
        //System.out.print((char) nextChar); // DEBUG
        if (nextChar < 0) {
            if (s.length() == 0) return null;
            break;
        }
        c = (char) nextChar;
        s.append(c);
        if (pre) {
            if ((c == '\n') || (c == '\r')) break;
        } else {
            if (htmlFilterContentScraper.punctuation(c)) break;
        }
    }

    // replace line endings, tabs and backspaces by blanks
    // and reduce every run of blanks to a single blank, in one pass
    final StringBuffer cleaned = new StringBuffer(s.length());
    boolean previousBlank = false;
    for (int i = 0; i < s.length(); i++) {
        c = s.charAt(i);
        if ((c == '\n') || (c == '\r') || (c == '\t') || (c == '\b')) c = ' ';
        if (c == ' ') {
            if (!previousBlank) cleaned.append(c);
            previousBlank = true;
        } else {
            cleaned.append(c);
            previousBlank = false;
        }
    }
    return cleaned.toString();
}
/**
 * Condenses a text stream with the default settings and returns its word list.
 *
 * @param input   the text to analyse; may be null
 * @param charset name of the charset of the stream
 * @return an iterator over the word entries (see words()), or null if input is null
 * @throws UnsupportedEncodingException if the charset is not supported
 */
public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException {
    return (input == null) ? null : new plasmaCondenser(input, charset).words();
}
/**
 * Condenses a text given as byte array with the default settings and returns its word list.
 *
 * @param text    the bytes of the text; may be null
 * @param charset name of the charset of the bytes
 * @return an iterator over the word entries (see words()), or null if text is null
 * @throws UnsupportedEncodingException if the charset is not supported
 */
public static Iterator getWords(byte[] text, String charset) throws UnsupportedEncodingException {
    if (text != null) {
        final ByteArrayInputStream stream = new ByteArrayInputStream(text);
        return getWords(stream, charset);
    }
    return null;
}
/**
 * Reads a property file and converts the properties "keywords0" to "keywords15" into
 * configuration lines: for every property one quoted string with the concatenated word
 * hashes of its comma-separated values. The result is printed to System.out.
 *
 * Fixes: the file stream is closed again, a missing argument prints a usage line
 * instead of an ArrayIndexOutOfBoundsException, a missing property gives an empty
 * string instead of a NullPointerException, and every string gets its closing quote
 * (only the opening quote was written before).
 *
 * @param args args[0] is the path of the property file
 */
public static void main(String[] args) {
    if (args.length < 1) {
        System.err.println("usage: java de.anomic.plasma.plasmaCondenser <property file>");
        return;
    }
    // read a property file and convert them into configuration lines
    try {
        File f = new File(args[0]);
        Properties p = new Properties();
        FileInputStream in = new FileInputStream(f);
        try {
            p.load(in);
        } finally {
            in.close();
        }
        StringBuffer sb = new StringBuffer();
        sb.append("{\n");
        for (int i = 0; i <= 15; i++) {
            sb.append('"');
            String s = p.getProperty("keywords" + i);
            if (s != null) {
                String[] l = s.split(",");
                for (int j = 0; j < l.length; j++) {
                    sb.append(word2hash(l[j]));
                }
            }
            sb.append('"');
            if (i < 15) sb.append(",\n");
        }
        sb.append("}\n");
        System.out.println(sb.toString());
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}