added a new facet type based on a probabilistic classifier using

bayesian filters. This can be used to classify documents during
indexing-time using a pre-defined bayesian filter.

New wordings:
- a context is a class where different categories are possible. The
context name is equal to a facet name.
- a category is a facet type within a facet navigation. Each context
must have at least two categories: one or more with custom names (things
you want to discover) and one with the exact name "negative".

To use this, you must do:
- for each context, you must create a directory within
DATA/CLASSIFICATION with the name of the context (the facet name)
- within each context directory, you must create one text file per
category, containing one document per line. One of these files MUST
have the name 'negative.txt'.

Then, each new document is classified to match within one of the given
categories for each context.
This commit is contained in:
Michael Peter Christen 2015-08-10 14:27:44 +02:00
parent dbbad23e12
commit df3314ac1a
12 changed files with 251 additions and 11 deletions

View File

@ -258,6 +258,12 @@ surrogates.out = DATA/SURROGATES/out
# this directory also contains subdirectories for input sources, the did-you-mean function and other
dictionaries = DATA/DICTIONARIES
# a path to the classification directory
# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files
# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'.
# The text files can be created with the Export functionality using the option "Only Text".
classification = DATA/CLASSIFICATION
# storage place for new releases
releases = DATA/RELEASE

View File

@ -52,8 +52,8 @@ function statistics(offset, itemscount, itemsperpage, totalcount, localResourceS
resnav += "\">&laquo;</a></li>";
}
numberofpages = Math.floor(Math.min(10, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
if (!numberofpages) numberofpages = 10;
numberofpages = Math.floor(Math.min(9, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
if (!numberofpages) numberofpages = 9;
for (i = 0; i < numberofpages; i++) {
if (i == thispage) {
resnav += "<li class=\"active\"><a href=\"#\">";

View File

@ -49,6 +49,7 @@ import net.yacy.cora.federate.FederateSearchManager;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.Metatag;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -443,7 +444,12 @@ public class yacysearch {
if (p > 0) {
String k = vocabulary.substring(0, p);
String v = vocabulary.substring(p + 1);
metatags.add(LibraryProvider.autotagging.metatag(k, v));
Metatag mt = LibraryProvider.autotagging.metatag(k, v);
if (mt != null) {
metatags.add(mt);
} else {
}
}
}

View File

@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.geo.Locations;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.ProbabilisticClassifier;
/**
* Autotagging provides a set of tag/print-name properties which can be used to
@ -167,6 +168,12 @@ public class AutotaggingLibrary {
/**
 * Resolve a metatag for a term within a named vocabulary.
 * If the vocabulary name is unknown but matches a probabilistic
 * classification context, an ad-hoc Tagging object is created for it.
 * @param vocName the vocabulary (navigator/context) name
 * @param term the (possibly mask-encoded) term to look up
 * @return the metatag, or null if no vocabulary or context matches
 */
public Tagging.Metatag metatag(String vocName, String term) {
    Tagging tagging = this.vocabularies.get(vocName);
    if (tagging == null && ProbabilisticClassifier.getContextNames().contains(vocName)) {
        // not a predefined vocabulary, but a bayesian classification context
        tagging = new Tagging(vocName);
    }
    return tagging == null ? null : tagging.getMetatagFromTerm(Tagging.decodeMaskname(term));
}

View File

@ -90,7 +90,7 @@ public class Tagging {
}
private Tagging(String name) {
public Tagging(String name) {
this.navigatorName = name;
this.synonym2term = new ConcurrentHashMap<String, String>();
this.term2synonym = new ConcurrentHashMap<String, String>();
@ -544,6 +544,11 @@ public class Tagging {
return term;
}
/**
* The metatag class contains the object value for a Linked Open Data RDF triple.
* The metatag is created in a tagging environment, which already contains the
* subject and the predicate. The metatag is the object of the RDF triple.
*/
public class Metatag {
private final String object;
private Metatag(String object) {

View File

@ -250,12 +250,17 @@ dc_rights
/**
 * Add the given tags to the set of generic facets of this document.
 * These keywords will appear in dc_subject.
 * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
 */
protected void addMetatags(Map<String, Set<Tagging.Metatag>> tags) {
    this.generic_facets.putAll(computeGenericFacets(tags));
}
/**
* compute generic facets
* @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
* @return a map where the key is the navigator name and the value is the set of attributes names
*/
public static Map<String, Set<String>> computeGenericFacets(Map<String, Set<Tagging.Metatag>> tags) {
Map<String, Set<String>> gf = new HashMap<String, Set<String>>();
for (Map.Entry<String, Set<Tagging.Metatag>> e: tags.entrySet()) {

View File

@ -0,0 +1,168 @@
/**
* ProbabilisticClassifier
* Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 06.08.2015 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.bayes.BayesClassifier;
import net.yacy.cora.bayes.Classification;
import net.yacy.cora.util.ConcurrentLog;
public class ProbabilisticClassifier {

    public final static String NONE_CATEGORY_NAME = "NONE";
    public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME);

    /**
     * A Category is a possible outcome of a classification within a context.
     * It wraps the category name (derived from the example file name, or the
     * reserved name "NONE" for the negative examples).
     */
    public static class Category {

        String category_name;

        public Category(String category_name) {
            this.category_name = category_name;
        }

        public String getName() {
            return this.category_name;
        }

        // equals/hashCode on the name: distinct Category instances with the
        // same name (e.g. NONE_CATEGORY vs. a Category created while learning)
        // must be treated as the same category by the classifier.
        @Override
        public boolean equals(Object o) {
            return o instanceof Category && this.category_name.equals(((Category) o).category_name);
        }

        @Override
        public int hashCode() {
            return this.category_name.hashCode();
        }
    }

    /**
     * A Context is one classification problem (it maps to one facet/navigator
     * name). It owns a bayesian classifier which is taught with example lines
     * for each category, plus negative examples under the NONE category.
     */
    public static class Context {

        private final String context_name;
        private final BayesClassifier<String, Category> bayes;

        /**
         * Create a classification context and teach its bayesian classifier.
         * @param context_name the name of the context (equal to the facet name)
         * @param categoryExampleLinesFiles a map from category name to a text file with one example document per line
         * @param negativeExampleLines a text file with one negative example document per line
         * @throws IOException if one of the example files cannot be read
         */
        public Context(String context_name, Map<String, File> categoryExampleLinesFiles, File negativeExampleLines) throws IOException {
            this.context_name = context_name;
            int requiredSize = 0;

            // load all example lines into a buffer; the negative examples are
            // stored under the reserved NONE category name
            Map<String, List<String>> categoryBuffer = new HashMap<>();
            for (Map.Entry<String, File> category: categoryExampleLinesFiles.entrySet()) {
                List<String> list = Files.readAllLines(category.getValue().toPath());
                categoryBuffer.put(category.getKey(), list);
                requiredSize += list.size();
            }
            // read the negative example file only once (it was previously read twice)
            List<String> negativeLines = Files.readAllLines(negativeExampleLines.toPath());
            categoryBuffer.put(NONE_CATEGORY_NAME, negativeLines);
            requiredSize += negativeLines.size();

            this.bayes = new BayesClassifier<>();
            this.bayes.setMemoryCapacity(requiredSize);

            // teach the classifier; one Category object per category name.
            // The NONE category is learned inside this loop like any other;
            // a former extra learn step fed the raw, unnormalized negative
            // lines a second time, which double-weighted and polluted the model.
            for (Map.Entry<String, List<String>> category: categoryBuffer.entrySet()) {
                Category c = new Category(category.getKey());
                for (String line: category.getValue()) {
                    this.bayes.learn(c, normalize(line));
                }
            }
        }

        /**
         * Normalize a phrase into a token list: lower-case, strip non-word
         * characters and drop tokens with less than three characters.
         * @param phrase the raw text line
         * @return the list of normalized tokens
         */
        private static List<String> normalize(String phrase) {
            String cleanphrase = phrase.toLowerCase().replaceAll("\\W", " ");
            String[] rawtokens = cleanphrase.split("\\s");
            List<String> tokens = new ArrayList<>();
            for (String token: rawtokens) if (token.length() > 2) tokens.add(token);
            return tokens;
        }

        public String getName() {
            return this.context_name;
        }

        /**
         * Classify a phrase within this context.
         * @param phrase the text to be classified
         * @return the classification with the most probable category
         */
        public Classification<String, Category> classify(String phrase) {
            List<String> words = normalize(phrase);
            return this.bayes.classify(words);
        }
    }

    // initialized to an empty map so that getContextNames()/getClassification()
    // do not throw a NullPointerException before initialize() has been called
    private static Map<String, Context> contexts = new HashMap<>();

    public static Set<String> getContextNames() {
        return contexts.keySet();
    }

    public static Context getContext(String contextName) {
        return contexts.get(contextName);
    }

    /**
     * Create a new classifier set.
     * @param path_to_context_directory directory containing contexts which are directories containing .txt files. One of them must be named 'negative.txt'
     */
    public static void initialize(File path_to_context_directory) {
        contexts = new HashMap<>();
        String[] context_candidates = path_to_context_directory.list();
        if (context_candidates == null) return; // path does not exist or is not a directory
        for (String context_candidate: context_candidates) {
            File ccf = new File(path_to_context_directory, context_candidate);
            if (!ccf.isDirectory()) continue;
            String[] category_candidates = ccf.list();
            if (category_candidates == null) continue;
            Map<String, File> categoryExampleLinesFiles = new HashMap<>();
            File negativeExampleLines = null;
            for (String category_candidate: category_candidates) {
                if (!category_candidate.endsWith(".txt")) continue;
                File catcf = new File(ccf, category_candidate);
                if (category_candidate.startsWith("negative")) {
                    negativeExampleLines = catcf;
                } else {
                    // strip the ".txt" suffix to get the category name
                    categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf);
                }
            }
            // a context is only valid with a negative file and at least one custom category
            if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) {
                try {
                    Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines);
                    contexts.put(context_candidate, context);
                } catch (IOException e) {
                    ConcurrentLog.logException(e);
                }
            }
        }
    }

    /**
     * Compute the classification of a given text. The result is a map with the most probable categorization for each context.
     * @param text the text to be classified (may be null, in which case an empty map is returned)
     * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category)
     */
    public static Map<String, String> getClassification(String text) {
        Map<String, String> c = new HashMap<>();
        if (text == null) return c;
        for (Context context: contexts.values()) {
            Classification<String, Category> classification = context.classify(text);
            c.put(context.getName(), classification.getCategory().getName());
        }
        return c;
    }
}

View File

@ -153,6 +153,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch {
// storage management
public File htCachePath;
public final File dictionariesPath;
public final File dictionariesPath, classificationPath;
public File listsPath;
public File htDocsPath;
public File workPath;
@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch {
}
this.log.config("Work Path: " + this.workPath.toString());
this.dictionariesPath =
getDataPath(
SwitchboardConstants.DICTIONARY_SOURCE_PATH,
SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT);
this.log.config("Dictionaries Path:" + this.dictionariesPath.toString());
if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs();
this.classificationPath =
getDataPath(
SwitchboardConstants.CLASSIFICATION_SOURCE_PATH,
SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT);
this.log.config("Classification Path:" + this.classificationPath.toString());
if (!this.classificationPath.exists()) this.classificationPath.mkdirs();
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false);
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true);
@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch {
Tagging t = LibraryProvider.autotagging.getVocabulary(o);
if (t != null) t.setFacet(false);
}
Thread.currentThread().setName("ProbabilisticClassification.initialize");
ProbabilisticClassifier.initialize(Switchboard.this.classificationPath);
}
}.start();

View File

@ -413,6 +413,9 @@ public final class SwitchboardConstants {
public static final String DICTIONARY_SOURCE_PATH = "dictionaries";
public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";
public static final String CLASSIFICATION_SOURCE_PATH = "classification";
public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION";
/**
* <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>

View File

@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.LibraryProvider;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -262,6 +263,9 @@ public final class QueryParams {
this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
}
}
for (String context: ProbabilisticClassifier.getContextNames()) {
this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
}
this.cachedQuery = null;
}

View File

@ -36,6 +36,7 @@ import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.document.LargeNumberCache;
import net.yacy.document.LibraryProvider;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.TextParser;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -882,13 +884,16 @@ public final class SearchEvent {
}
// get the vocabulary navigation
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
Set<String> genericFacets = new LinkedHashSet<>();
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName());
genericFacets.addAll(ProbabilisticClassifier.getContextNames());
for (String v: genericFacets) {
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (fcts != null) {
ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
ScoreMap<String> vocNav = this.vocabularyNavigator.get(v);
if (vocNav == null) {
vocNav = new ConcurrentScoreMap<String>();
this.vocabularyNavigator.put(v.getName(), vocNav);
this.vocabularyNavigator.put(v, vocNav);
}
vocNav.inc(fcts);
}
@ -1242,7 +1247,7 @@ public final class SearchEvent {
// check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
// TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
// TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL)
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
tagloop: for (Tagging.Metatag tag : this.query.metatags) {
SolrDocument sdoc = page;

View File

@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
@ -1006,6 +1007,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return doc;
}
/**
* attach additional information to the document to enable navigation features
* @param doc the document to be enriched
* @param synonyms a list of synonyms detected for the text content
* @param genericFacets a map where the key is the navigator name and the value is the set of attributes names
*/
public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
for (SolrInputField sif: doc) {
@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
// add to genericFacets the probabilistic categories
String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
Map<String, String> classification = ProbabilisticClassifier.getClassification(text);
for (Map.Entry<String, String> entry: classification.entrySet()) {
Set<String> facetAttrbutes = new HashSet<>();
facetAttrbutes.add(entry.getValue());
genericFacets.put(entry.getKey(), facetAttrbutes);
}
// compute the document field values
List<String> vocabularies = new ArrayList<>();
for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
String facetName = facet.getKey();