yacy_search_server/source/net/yacy/document/SnippetExtractor.java

/**
 *  SnippetExtractor
 *  Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
 *  First released 22.10.2010 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document;

import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.TreeSet;

import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;

public class SnippetExtractor {

    String snippetString;
    HandleSet remainingHashes;

    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
        if (sentences == null) throw new UnsupportedOperationException("sentence == null");
        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
        TreeMap<byte[], Integer> hs;
        final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
        long uniqCounter = 999L;
        Integer pos;
        TreeSet<Integer> positions;
        int linenumber = 0;
        int fullmatchcounter = 0;
        lookup: for (StringBuilder sentence: sentences) {
            hs = Condenser.hashSentence(sentence.toString());
            positions = new TreeSet<Integer>();
            for (byte[] word: queryhashes) {
                pos = hs.get(word);
                if (pos != null) {
                    positions.add(pos);
                }
            }
            int worddistance = positions.size() > 1 ? positions.last() - positions.first() : 0;
            // sort by
            // - 1st order: number of matching words
            // - 2nd order: word distance
            // - 3th order: line length (not too short and not too long)
            // - 4rd order: line number
            if (positions.size() > 0) {
                order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                if (order.size() > 5) order.remove(order.firstEntry().getKey());
                if (positions.size() == queryhashes.size()) fullmatchcounter++;
                if (fullmatchcounter >= 3) break lookup;
            }
            linenumber++;
        }

        StringBuilder sentence;
        SnippetExtractor tsr;
        while (!order.isEmpty()) {
            sentence = order.remove(order.lastKey()); // sentence with the biggest score
            try {
                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
            } catch (UnsupportedOperationException e) {
                continue;
            }
            snippetString = tsr.snippetString;
            if (snippetString != null && snippetString.length() > 0) {
                remainingHashes = tsr.remainingHashes;
                if (remainingHashes.isEmpty()) {
                    // we have found the snippet
                    return; // finished!
                } else if (remainingHashes.size() < queryhashes.size()) {
                    // the result has not all words in it.
                    // find another sentence that represents the missing other words
                    // and find recursively more sentences
                    maxLength = maxLength - snippetString.length();
                    if (maxLength < 20) maxLength = 20;
                    try {
                        tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
                    } catch (UnsupportedOperationException e) {
                        throw e;
                    }
                    final String nextSnippet = tsr.snippetString;
                    if (nextSnippet == null) return;
                    snippetString = snippetString + (" / " + nextSnippet);
                    remainingHashes = tsr.remainingHashes;
                    return;
                } else {
                    // error
                    //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
                    continue;
                }
            }
        }
        throw new UnsupportedOperationException("no snippet computed");
    }

    private static int linelengthKey(int givenlength, int maxlength) {
        if (givenlength > maxlength) return 1;
        if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
        if (givenlength >= maxlength / 4 && givenlength < maxlength / 2) return 5;
        if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
        return 0;
    }

    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
        try {
            if (sentence == null) throw new UnsupportedOperationException("no sentence given");
            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
            byte[] hash;

            // find all hashes that appear in the sentence
            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
            final Iterator<byte[]> j = queryhashes.iterator();
            Integer pos;
            int p, minpos = sentence.length(), maxpos = -1;
            final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0);
            while (j.hasNext()) {
                hash = j.next();
                pos = hs.get(hash);
                if (pos == null) {
                    try {
                        remainingHashes.put(hash);
                    } catch (RowSpaceExceededException e) {
                        Log.logException(e);
                    }
                } else {
                    p = pos.intValue();
                    if (p > maxpos) maxpos = p;
                    if (p < minpos) minpos = p;
                }
            }
            // check result size
            maxpos = maxpos + 10;
            if (maxpos > sentence.length()) maxpos = sentence.length();
            if (minpos < 0) minpos = 0;
            // we have a result, but is it short enough?
            if (maxpos - minpos + 10 > maxLength) {
                // the string is too long, even if we cut at both ends
                // so cut here in the middle of the string
                final int lenb = sentence.length();
                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                " [..] " +
                sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
                maxpos = maxpos + lenb - sentence.length() + 6;
            }
            if (maxpos > maxLength) {
                // the string is too long, even if we cut it at the end
                // so cut it here at both ends at once
                assert maxpos >= minpos;
                final int newlen = Math.max(10, maxpos - minpos + 10);
                final int around = (maxLength - newlen) / 2;
                assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
                //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                minpos = around;
                maxpos = sentence.length() - around - 5;
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 1st step (cut at right side)
                sentence = sentence.substring(0, maxpos).trim() + " [..]";
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 2nd step (cut at left side)
                sentence = "[..] " + sentence.substring(minpos).trim();
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 3rd step (cut in the middle)
                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
            }
            this.snippetString = sentence;
            this.remainingHashes = remainingHashes;
        } catch (final IndexOutOfBoundsException e) {
            throw new UnsupportedOperationException(e.getMessage());
        }
    }

    public String getSnippet() {
        return this.snippetString;
    }

    public HandleSet getRemainingWords() {
        return this.remainingHashes;
    }
}