mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
tab to spaces for classes supporting the condenser.
This is a preparation step to make changes in condenser and parser more visible; no functional changes so far.
This commit is contained in:
parent
ce4a2450da
commit
8285fe715a
|
@ -85,24 +85,24 @@ public class AutotaggingLibrary {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Autotagging instance from the provided vocabularies. Can be used
|
||||
* for example for testing purpose.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Create a new Autotagging instance from the provided vocabularies. Can be used
|
||||
* for example for testing purpose.
|
||||
*/
|
||||
protected AutotaggingLibrary(final Map<String, Tagging> vocabularies) {
|
||||
if(vocabularies != null) {
|
||||
this.vocabularies = vocabularies;
|
||||
} else {
|
||||
this.vocabularies = new ConcurrentHashMap<String, Tagging>();
|
||||
}
|
||||
this.allTags = new ConcurrentHashMap<String, Object>();
|
||||
this.autotaggingPath = null;
|
||||
for(final Tagging voc : this.vocabularies.values()) {
|
||||
if(vocabularies != null) {
|
||||
this.vocabularies = vocabularies;
|
||||
} else {
|
||||
this.vocabularies = new ConcurrentHashMap<String, Tagging>();
|
||||
}
|
||||
this.allTags = new ConcurrentHashMap<String, Object>();
|
||||
this.autotaggingPath = null;
|
||||
for(final Tagging voc : this.vocabularies.values()) {
|
||||
for (final String t: voc.tags()) {
|
||||
this.allTags.put(t, PRESENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public File getVocabularyFile(String name) {
|
||||
|
@ -159,11 +159,11 @@ public class AutotaggingLibrary {
|
|||
}
|
||||
|
||||
public int size() {
|
||||
return this.vocabularies.size();
|
||||
return this.vocabularies.size();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return this.vocabularies.isEmpty();
|
||||
return this.vocabularies.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -171,8 +171,8 @@ public class AutotaggingLibrary {
|
|||
* @return
|
||||
*/
|
||||
public int getMaxWordsInTerm() {
|
||||
//TODO: calculate from database
|
||||
return 4;
|
||||
//TODO: calculate from database
|
||||
return 4;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -195,70 +195,70 @@ public class AutotaggingLibrary {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Search in the active vocabularies matching linked data for Metatag entries with objectspace + term
|
||||
* matching the given term URL. Returns at most one Metatag instance per
|
||||
* vocabulary.
|
||||
*
|
||||
* @param termURL
|
||||
* the vocabulary term identifier (an absolute URL) to search
|
||||
* @return a set of matching Metatag instances eventually empty
|
||||
*/
|
||||
public Set<Tagging.Metatag> getTagsFromTermURL(final DigestURL termURL) {
|
||||
final Set<Tagging.Metatag> tags = new HashSet<>();
|
||||
if (termURL == null || this.vocabularies.isEmpty()) {
|
||||
return tags;
|
||||
}
|
||||
final String termURLStr = termURL.toNormalform(false);
|
||||
String termNamespace = null;
|
||||
/**
|
||||
* Search in the active vocabularies matching linked data for Metatag entries with objectspace + term
|
||||
* matching the given term URL. Returns at most one Metatag instance per
|
||||
* vocabulary.
|
||||
*
|
||||
* @param termURL
|
||||
* the vocabulary term identifier (an absolute URL) to search
|
||||
* @return a set of matching Metatag instances, possibly empty
|
||||
*/
|
||||
public Set<Tagging.Metatag> getTagsFromTermURL(final DigestURL termURL) {
|
||||
final Set<Tagging.Metatag> tags = new HashSet<>();
|
||||
if (termURL == null || this.vocabularies.isEmpty()) {
|
||||
return tags;
|
||||
}
|
||||
final String termURLStr = termURL.toNormalform(false);
|
||||
String termNamespace = null;
|
||||
|
||||
/* If the objectLink URL has a fragment, this should be the vocabulary term */
|
||||
String term = termURL.getRef();
|
||||
if (term == null) {
|
||||
/*
|
||||
* No fragment in the URL : the term should then be the last segment of the URL
|
||||
*/
|
||||
term = termURL.getFileName();
|
||||
if (StringUtils.isNotEmpty(term)) {
|
||||
final int lastPathSeparatorPos = termURLStr.lastIndexOf("/");
|
||||
if (lastPathSeparatorPos > 0) {
|
||||
termNamespace = termURLStr.substring(0, lastPathSeparatorPos + 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
final int fragmentPos = termURLStr.indexOf("#");
|
||||
if (fragmentPos > 0) {
|
||||
termNamespace = termURLStr.substring(0, fragmentPos + 1);
|
||||
}
|
||||
}
|
||||
if (StringUtils.isNotEmpty(term) && termNamespace != null) {
|
||||
final String alternativeTermNamespace;
|
||||
/*
|
||||
* http://example.org/ and https://example.org/ are considered equivalent forms
|
||||
* for the namespace URL
|
||||
*/
|
||||
if (termURL.isHTTP()) {
|
||||
alternativeTermNamespace = "https" + termNamespace.substring("http".length());
|
||||
} else if (termURL.isHTTPS()) {
|
||||
alternativeTermNamespace = "http" + termNamespace.substring("https".length());
|
||||
} else {
|
||||
alternativeTermNamespace = null;
|
||||
}
|
||||
/* If the objectLink URL has a fragment, this should be the vocabulary term */
|
||||
String term = termURL.getRef();
|
||||
if (term == null) {
|
||||
/*
|
||||
* No fragment in the URL : the term should then be the last segment of the URL
|
||||
*/
|
||||
term = termURL.getFileName();
|
||||
if (StringUtils.isNotEmpty(term)) {
|
||||
final int lastPathSeparatorPos = termURLStr.lastIndexOf("/");
|
||||
if (lastPathSeparatorPos > 0) {
|
||||
termNamespace = termURLStr.substring(0, lastPathSeparatorPos + 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
final int fragmentPos = termURLStr.indexOf("#");
|
||||
if (fragmentPos > 0) {
|
||||
termNamespace = termURLStr.substring(0, fragmentPos + 1);
|
||||
}
|
||||
}
|
||||
if (StringUtils.isNotEmpty(term) && termNamespace != null) {
|
||||
final String alternativeTermNamespace;
|
||||
/*
|
||||
* http://example.org/ and https://example.org/ are considered equivalent forms
|
||||
* for the namespace URL
|
||||
*/
|
||||
if (termURL.isHTTP()) {
|
||||
alternativeTermNamespace = "https" + termNamespace.substring("http".length());
|
||||
} else if (termURL.isHTTPS()) {
|
||||
alternativeTermNamespace = "http" + termNamespace.substring("https".length());
|
||||
} else {
|
||||
alternativeTermNamespace = null;
|
||||
}
|
||||
|
||||
for (final Tagging vocabulary : this.vocabularies.values()) {
|
||||
if (vocabulary != null && vocabulary.isMatchFromLinkedData()) {
|
||||
if ((termNamespace.equals(vocabulary.getObjectspace())) || (alternativeTermNamespace != null
|
||||
&& alternativeTermNamespace.equals(vocabulary.getObjectspace()))) {
|
||||
final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term);
|
||||
if (tag != null) {
|
||||
tags.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tags;
|
||||
}
|
||||
for (final Tagging vocabulary : this.vocabularies.values()) {
|
||||
if (vocabulary != null && vocabulary.isMatchFromLinkedData()) {
|
||||
if ((termNamespace.equals(vocabulary.getObjectspace())) || (alternativeTermNamespace != null
|
||||
&& alternativeTermNamespace.equals(vocabulary.getObjectspace()))) {
|
||||
final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term);
|
||||
if (tag != null) {
|
||||
tags.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tags;
|
||||
}
|
||||
|
||||
public Tagging.Metatag metatag(String vocName, String term) {
|
||||
Tagging tagging = this.vocabularies.get(vocName);
|
||||
|
|
|
@ -47,27 +47,27 @@ public class Tagging {
|
|||
|
||||
public final static String DEFAULT_NAMESPACE= "http://yacy.net/autotagging#";
|
||||
public final static String DEFAULT_PREFIX = "tags";
|
||||
|
||||
|
||||
/** Default value for the property matchFromLinkedData */
|
||||
public final static boolean DEFAULT_MATCH_FROM_LINKED_DATA = false;
|
||||
|
||||
private final String navigatorName;
|
||||
private final Map<String, String> synonym2term;
|
||||
|
||||
|
||||
/** Terms associated to TaggingEntry instances, each having a synonym and an optional object link */
|
||||
private final Map<String, TaggingEntry> term2entries;
|
||||
|
||||
|
||||
private File propFile;
|
||||
|
||||
|
||||
/** true if the vocabulary shall generate a navigation facet */
|
||||
private boolean isFacet;
|
||||
|
||||
/**
|
||||
* True when this vocabulary terms should only be matched from linked data types
|
||||
* annotations (with microdata, RDFa, microformats...) instead of clear text
|
||||
* words
|
||||
*/
|
||||
private boolean matchFromLinkedData;
|
||||
|
||||
/**
|
||||
* True when this vocabulary terms should only be matched from linked data types
|
||||
* annotations (with microdata, RDFa, microformats...) instead of clear text
|
||||
* words
|
||||
*/
|
||||
private boolean matchFromLinkedData;
|
||||
|
||||
private String predicate, namespace, objectspace;
|
||||
|
||||
|
@ -142,55 +142,55 @@ public class Tagging {
|
|||
|
||||
String term, v;
|
||||
String[] tags;
|
||||
vocloop: for (Map.Entry<String, SOTuple> e: table.entrySet()) {
|
||||
if (e.getValue().getSynonymsCSV() == null || e.getValue().getSynonymsCSV().isEmpty()) {
|
||||
term = normalizeKey(e.getKey());
|
||||
v = normalizeTerm(e.getKey());
|
||||
this.synonym2term.put(v, term);
|
||||
if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(v, e.getValue().getObjectlink()));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(v));
|
||||
}
|
||||
|
||||
continue vocloop;
|
||||
}
|
||||
term = normalizeKey(e.getKey());
|
||||
tags = e.getValue().getSynonymsList();
|
||||
final Set<String> synonyms = new HashSet<String>();
|
||||
synonyms.add(term);
|
||||
tagloop: for (String synonym: tags) {
|
||||
if (synonym.isEmpty()) continue tagloop;
|
||||
synonyms.add(synonym);
|
||||
synonym = normalizeTerm(synonym);
|
||||
if (synonym.isEmpty()) continue tagloop;
|
||||
synonyms.add(synonym);
|
||||
this.synonym2term.put(synonym, term);
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
}
|
||||
final String synonym = normalizeTerm(term);
|
||||
this.synonym2term.put(synonym, term);
|
||||
if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, e.getValue().getObjectlink()));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
vocloop: for (Map.Entry<String, SOTuple> e: table.entrySet()) {
|
||||
if (e.getValue().getSynonymsCSV() == null || e.getValue().getSynonymsCSV().isEmpty()) {
|
||||
term = normalizeKey(e.getKey());
|
||||
v = normalizeTerm(e.getKey());
|
||||
this.synonym2term.put(v, term);
|
||||
if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(v, e.getValue().getObjectlink()));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(v));
|
||||
}
|
||||
|
||||
continue vocloop;
|
||||
}
|
||||
synonyms.add(synonym);
|
||||
}
|
||||
term = normalizeKey(e.getKey());
|
||||
tags = e.getValue().getSynonymsList();
|
||||
final Set<String> synonyms = new HashSet<String>();
|
||||
synonyms.add(term);
|
||||
tagloop: for (String synonym: tags) {
|
||||
if (synonym.isEmpty()) continue tagloop;
|
||||
synonyms.add(synonym);
|
||||
synonym = normalizeTerm(synonym);
|
||||
if (synonym.isEmpty()) continue tagloop;
|
||||
synonyms.add(synonym);
|
||||
this.synonym2term.put(synonym, term);
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
}
|
||||
final String synonym = normalizeTerm(term);
|
||||
this.synonym2term.put(synonym, term);
|
||||
if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, e.getValue().getObjectlink()));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
}
|
||||
synonyms.add(synonym);
|
||||
}
|
||||
} else {
|
||||
try (
|
||||
/* Resources automatically closed by this try-with-resources statement */
|
||||
final FileOutputStream outStream = new FileOutputStream(propFile);
|
||||
final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outStream, StandardCharsets.UTF_8.name()));
|
||||
/* Resources automatically closed by this try-with-resources statement */
|
||||
final FileOutputStream outStream = new FileOutputStream(propFile);
|
||||
final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outStream, StandardCharsets.UTF_8.name()));
|
||||
) {
|
||||
if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
|
||||
for (final Map.Entry<String, SOTuple> e: table.entrySet()) {
|
||||
String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();
|
||||
String o = e.getValue() == null ? "" : e.getValue().getObjectlink();
|
||||
w.write(e.getKey() + (s == null || s.isEmpty() ? "" : ":" + e.getValue().getSynonymsCSV()) + (o == null || o.isEmpty() || o.equals(objectspace + e.getKey()) ? "" : "#" + o) + "\n");
|
||||
}
|
||||
if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
|
||||
for (final Map.Entry<String, SOTuple> e: table.entrySet()) {
|
||||
String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();
|
||||
String o = e.getValue() == null ? "" : e.getValue().getObjectlink();
|
||||
w.write(e.getKey() + (s == null || s.isEmpty() ? "" : ":" + e.getValue().getSynonymsCSV()) + (o == null || o.isEmpty() || o.equals(objectspace + e.getKey()) ? "" : "#" + o) + "\n");
|
||||
}
|
||||
}
|
||||
init();
|
||||
init();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -207,7 +207,7 @@ public class Tagging {
|
|||
g = geo.iterator().next();
|
||||
this.term2entries.put(loc, new LocationTaggingEntry(syn, g));
|
||||
} else {
|
||||
this.term2entries.put(loc, new SynonymTaggingEntry(syn));
|
||||
this.term2entries.put(loc, new SynonymTaggingEntry(syn));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -255,9 +255,9 @@ public class Tagging {
|
|||
v = normalizeTerm(pl[0]);
|
||||
this.synonym2term.put(v, term);
|
||||
if (pl[2] != null && pl[2].length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(v, pl[2]));
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(v, pl[2]));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(v));
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(v));
|
||||
}
|
||||
continue vocloop;
|
||||
}
|
||||
|
@ -278,9 +278,9 @@ public class Tagging {
|
|||
String synonym = normalizeTerm(term);
|
||||
this.synonym2term.put(synonym, term);
|
||||
if (pl[2] != null && pl[2].length() > 0) {
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, pl[2]));
|
||||
this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, pl[2]));
|
||||
} else {
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
this.term2entries.put(term, new SynonymTaggingEntry(synonym));
|
||||
}
|
||||
synonyms.add(synonym);
|
||||
}
|
||||
|
@ -293,30 +293,30 @@ public class Tagging {
|
|||
public boolean isFacet() {
|
||||
return this.isFacet;
|
||||
}
|
||||
|
||||
|
||||
public void setFacet(boolean isFacet) {
|
||||
this.isFacet = isFacet;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true when this vocabulary terms should be matched from linked data
|
||||
* types annotations (with microdata, RDFa, microformats...) instead of
|
||||
* clear text words
|
||||
*/
|
||||
|
||||
/**
|
||||
* @return true when this vocabulary terms should be matched from linked data
|
||||
* types annotations (with microdata, RDFa, microformats...) instead of
|
||||
* clear text words
|
||||
*/
|
||||
public boolean isMatchFromLinkedData() {
|
||||
return this.matchFromLinkedData;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param facetFromLinkedData
|
||||
* true when this vocabulary terms should be matched from linked
|
||||
* data types annotations (with microdata, RDFa, microformats...)
|
||||
* instead of clear text words
|
||||
*/
|
||||
public void setMatchFromLinkedData(final boolean facetFromLinkedData) {
|
||||
this.matchFromLinkedData = facetFromLinkedData;
|
||||
return this.matchFromLinkedData;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param facetFromLinkedData
|
||||
* true when this vocabulary terms should be matched from linked
|
||||
* data types annotations (with microdata, RDFa, microformats...)
|
||||
* instead of clear text words
|
||||
*/
|
||||
public void setMatchFromLinkedData(final boolean facetFromLinkedData) {
|
||||
this.matchFromLinkedData = facetFromLinkedData;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return this.term2entries.size();
|
||||
}
|
||||
|
@ -430,7 +430,7 @@ public class Tagging {
|
|||
r.put(e.getKey(), s);
|
||||
}
|
||||
if (e.getValue() != null && e.getValue().getSynonym() != null && e.getValue().getSynonym().length() != 0) {
|
||||
s.add(e.getValue().getSynonym());
|
||||
s.add(e.getValue().getSynonym());
|
||||
}
|
||||
}
|
||||
for (Map.Entry<String, String> e: this.synonym2term.entrySet()) {
|
||||
|
@ -448,11 +448,11 @@ public class Tagging {
|
|||
Map<String, Set<String>> r = reconstructionSets();
|
||||
Map<String, SOTuple> map = new TreeMap<String, SOTuple>();
|
||||
for (Map.Entry<String, Set<String>> e: r.entrySet()) {
|
||||
TaggingEntry entry = this.term2entries.get(e.getKey());
|
||||
String objectLink = null;
|
||||
if(entry != null) {
|
||||
objectLink = entry.getObjectLink();
|
||||
}
|
||||
TaggingEntry entry = this.term2entries.get(e.getKey());
|
||||
String objectLink = null;
|
||||
if(entry != null) {
|
||||
objectLink = entry.getObjectLink();
|
||||
}
|
||||
map.put(e.getKey(), new SOTuple(e.getValue().toArray(new String[e.getValue().size()]), objectLink == null ? "" : objectLink));
|
||||
}
|
||||
return map;
|
||||
|
@ -461,7 +461,7 @@ public class Tagging {
|
|||
public String getObjectlink(String term) {
|
||||
TaggingEntry entry = this.term2entries.get(term);
|
||||
if(entry != null) {
|
||||
return entry.getObjectLink();
|
||||
return entry.getObjectLink();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -531,11 +531,11 @@ public class Tagging {
|
|||
public String getObjectspace() {
|
||||
return this.objectspace;
|
||||
}
|
||||
|
||||
|
||||
private final static Pattern PATTERN_SPACESLASHPLUS = Pattern.compile(" (/|\\+)");
|
||||
private final static Pattern PATTERN_SLASHPLUS = Pattern.compile("/|\\+");
|
||||
private final static Pattern PATTERN_SPACESPACE = Pattern.compile(" ");
|
||||
|
||||
|
||||
private final String normalizeKey(String k) {
|
||||
k = k.trim();
|
||||
// remove symbols that are bad in a query attribute
|
||||
|
@ -557,37 +557,37 @@ public class Tagging {
|
|||
return this.propFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param word
|
||||
* a synonym to look for
|
||||
* @return a Metatag instance with the matching term, or null when the synonym
|
||||
* is not in this vocabulary.
|
||||
*/
|
||||
/**
|
||||
* @param word
|
||||
* a synonym to look for
|
||||
* @return a Metatag instance with the matching term, or null when the synonym
|
||||
* is not in this vocabulary.
|
||||
*/
|
||||
public Metatag getMetatagFromSynonym(final String word) {
|
||||
String printname = this.synonym2term.get(word);
|
||||
if (printname == null) return null;
|
||||
return new Metatag(printname);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param term
|
||||
* a term to look for
|
||||
* @return a Metatag instance with the matching term, or null when it is not in
|
||||
* this vocabulary.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param term
|
||||
* a term to look for
|
||||
* @return a Metatag instance with the matching term, or null when it is not in
|
||||
* this vocabulary.
|
||||
*/
|
||||
public Metatag getMetatagFromTerm(final String term) {
|
||||
TaggingEntry entry = this.term2entries.get(term);
|
||||
if(entry == null) {
|
||||
return null;
|
||||
return null;
|
||||
}
|
||||
return new Metatag(term);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param word
|
||||
* the object of the Metatag
|
||||
* @return a new Metatag instance related to this vocabulary
|
||||
*/
|
||||
/**
|
||||
* @param word
|
||||
* the object of the Metatag
|
||||
* @return a new Metatag instance related to this vocabulary
|
||||
*/
|
||||
public Metatag buildMetatagFromTerm(final String word) {
|
||||
return new Metatag(word);
|
||||
}
|
||||
|
@ -632,15 +632,15 @@ public class Tagging {
|
|||
* The metatag is created in a tagging environment, which already contains the
|
||||
* subject and the predicate. The metatag is the object of the RDF triple.
|
||||
*/
|
||||
public class Metatag {
|
||||
private final String object;
|
||||
private Metatag(String object) {
|
||||
this.object = object;
|
||||
}
|
||||
public class Metatag {
|
||||
private final String object;
|
||||
private Metatag(String object) {
|
||||
this.object = object;
|
||||
}
|
||||
|
||||
public String getVocabularyName() {
|
||||
return Tagging.this.navigatorName;
|
||||
}
|
||||
public String getVocabularyName() {
|
||||
return Tagging.this.navigatorName;
|
||||
}
|
||||
|
||||
public String getPredicate() {
|
||||
return Tagging.this.predicate;
|
||||
|
@ -650,22 +650,22 @@ public class Tagging {
|
|||
return this.object;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Tagging.this.navigatorName + ":" + encodePrintname(this.object);
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
return Tagging.this.navigatorName + ":" + encodePrintname(this.object);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object m) {
|
||||
Metatag m0 = (Metatag) m;
|
||||
return Tagging.this.navigatorName.equals(m0.getVocabularyName()) && this.object.equals(m0.object);
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object m) {
|
||||
Metatag m0 = (Metatag) m;
|
||||
return Tagging.this.navigatorName.equals(m0.getVocabularyName()) && this.object.equals(m0.object);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Tagging.this.navigatorName.hashCode() + this.object.hashCode();
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Tagging.this.navigatorName.hashCode() + this.object.hashCode();
|
||||
}
|
||||
}
|
||||
|
||||
public static final String encodePrintname(String printname) {
|
||||
return CommonPattern.SPACE.matcher(printname).replaceAll("_");
|
||||
|
|
|
@ -61,10 +61,10 @@ public final class Condenser extends Tokenizer {
|
|||
|
||||
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
|
||||
private String fuzzy_signature_text = null; // signatures for double-check detection
|
||||
|
||||
|
||||
private final Identificator languageIdentificator;
|
||||
public LinkedHashSet<Date> dates_in_content;
|
||||
|
||||
|
||||
public Condenser(
|
||||
final Document document,
|
||||
final VocabularyScraper scraper,
|
||||
|
@ -76,14 +76,14 @@ public final class Condenser extends Tokenizer {
|
|||
final int timezoneOffset
|
||||
) {
|
||||
super(document.dc_source(), indexText ? document.getTextString() : "", meaningLib, doAutotagging, scraper);
|
||||
|
||||
|
||||
final String initialThreadName = Thread.currentThread().getName();
|
||||
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
|
||||
|
||||
|
||||
// if addMedia == true, then all the media links are also parsed and added to the words
|
||||
// added media words are flagged with the appropriate media flag
|
||||
this.dates_in_content = new LinkedHashSet<Date>();
|
||||
|
||||
|
||||
// construct flag set for document
|
||||
ContentDomain contentDomain = document.getContentDomain();
|
||||
if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||
|
@ -196,9 +196,9 @@ public final class Condenser extends Tokenizer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(doAutotagging) {
|
||||
extractAutoTagsFromLinkedDataTypes(document.getLinkedDataTypes(), LibraryProvider.autotagging);
|
||||
extractAutoTagsFromLinkedDataTypes(document.getLinkedDataTypes(), LibraryProvider.autotagging);
|
||||
}
|
||||
|
||||
// extend the tags in the document object with autotagging tags
|
||||
|
@ -224,36 +224,36 @@ public final class Condenser extends Tokenizer {
|
|||
/* Restore the current thread initial name */
|
||||
Thread.currentThread().setName(initialThreadName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Search for tags matching the given linked data types identifiers (absolute
|
||||
* URLs) in the given autotagging library. Then fill this instance "tags" map
|
||||
* with the eventually matching tags found.
|
||||
*
|
||||
* @param linkedDataTypes
|
||||
* a set of linked data typed items identifiers (absolute URLs) to
|
||||
* search
|
||||
* @param tagLibrary
|
||||
* the autotagging library holding vocabularies to search in
|
||||
*/
|
||||
protected void extractAutoTagsFromLinkedDataTypes(final Set<DigestURL> linkedDataTypes,
|
||||
final AutotaggingLibrary tagLibrary) {
|
||||
if (linkedDataTypes == null || tagLibrary == null) {
|
||||
return;
|
||||
}
|
||||
for (final DigestURL linkedDataType : linkedDataTypes) {
|
||||
final Set<Metatag> tags = tagLibrary.getTagsFromTermURL(linkedDataType);
|
||||
for (final Metatag tag : tags) {
|
||||
final String navigatorName = tag.getVocabularyName();
|
||||
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
||||
if (tagset == null) {
|
||||
tagset = new HashSet<Metatag>();
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
tagset.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search for tags matching the given linked data types identifiers (absolute
|
||||
* URLs) in the given autotagging library. Then fill this instance "tags" map
|
||||
* with the eventually matching tags found.
|
||||
*
|
||||
* @param linkedDataTypes
|
||||
* a set of linked data typed items identifiers (absolute URLs) to
|
||||
* search
|
||||
* @param tagLibrary
|
||||
* the autotagging library holding vocabularies to search in
|
||||
*/
|
||||
protected void extractAutoTagsFromLinkedDataTypes(final Set<DigestURL> linkedDataTypes,
|
||||
final AutotaggingLibrary tagLibrary) {
|
||||
if (linkedDataTypes == null || tagLibrary == null) {
|
||||
return;
|
||||
}
|
||||
for (final DigestURL linkedDataType : linkedDataTypes) {
|
||||
final Set<Metatag> tags = tagLibrary.getTagsFromTermURL(linkedDataType);
|
||||
for (final Metatag tag : tags) {
|
||||
final String navigatorName = tag.getVocabularyName();
|
||||
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
||||
if (tagset == null) {
|
||||
tagset = new HashSet<Metatag>();
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
tagset.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void insertTextToWords(
|
||||
final SentenceReader text,
|
||||
|
@ -267,24 +267,24 @@ public final class Condenser extends Tokenizer {
|
|||
Word wprop;
|
||||
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
|
||||
try {
|
||||
int pip = 0;
|
||||
while (wordenum.hasMoreElements()) {
|
||||
word = wordenum.nextElement().toString();
|
||||
if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
|
||||
int pip = 0;
|
||||
while (wordenum.hasMoreElements()) {
|
||||
word = wordenum.nextElement().toString();
|
||||
if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
|
||||
if (word.length() < 2) continue;
|
||||
word = word.toLowerCase(Locale.ENGLISH);
|
||||
wprop = this.words.get(word);
|
||||
if (wprop == null) wprop = new Word(0, pip, phrase);
|
||||
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
|
||||
wprop.flags.set(flagpos, true);
|
||||
this.words.put(word, wprop);
|
||||
pip++;
|
||||
this.RESULT_NUMB_WORDS++;
|
||||
//this.RESULT_DIFF_WORDS++;
|
||||
wprop = this.words.get(word);
|
||||
if (wprop == null) wprop = new Word(0, pip, phrase);
|
||||
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
|
||||
wprop.flags.set(flagpos, true);
|
||||
this.words.put(word, wprop);
|
||||
pip++;
|
||||
this.RESULT_NUMB_WORDS++;
|
||||
//this.RESULT_DIFF_WORDS++;
|
||||
}
|
||||
} finally {
|
||||
wordenum.close();
|
||||
wordenum = null;
|
||||
wordenum.close();
|
||||
wordenum = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -303,11 +303,11 @@ public final class Condenser extends Tokenizer {
|
|||
public String fuzzySignatureText() {
|
||||
return this.fuzzy_signature_text;
|
||||
}
|
||||
|
||||
|
||||
public long exactSignature() {
|
||||
return this.exact_signature;
|
||||
}
|
||||
|
||||
|
||||
public String language() {
|
||||
return this.languageIdentificator.getLanguage();
|
||||
}
|
||||
|
@ -322,7 +322,7 @@ public final class Condenser extends Tokenizer {
|
|||
|
||||
public static void main(final String[] args) {
|
||||
// read a property file and convert them into configuration lines
|
||||
FileInputStream inStream = null;
|
||||
FileInputStream inStream = null;
|
||||
try {
|
||||
final File f = new File(args[0]);
|
||||
final Properties p = new Properties();
|
||||
|
@ -346,13 +346,13 @@ public final class Condenser extends Tokenizer {
|
|||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
} finally {
|
||||
if(inStream != null) {
|
||||
try {
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
if(inStream != null) {
|
||||
try {
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -64,9 +64,9 @@ public class DateDetection {
|
|||
|
||||
private static final TimeZone UTC_TIMEZONE = TimeZone.getTimeZone("UTC");
|
||||
private static final String CONPATT = "uuuu/MM/dd";
|
||||
|
||||
private static final DateTimeFormatter CONFORM = DateTimeFormatter.ofPattern(CONPATT).withLocale(Locale.US)
|
||||
.withZone(ZoneOffset.UTC);
|
||||
|
||||
private static final DateTimeFormatter CONFORM = DateTimeFormatter.ofPattern(CONPATT).withLocale(Locale.US)
|
||||
.withZone(ZoneOffset.UTC);
|
||||
private static final LinkedHashMap<Language, String[]> Weekdays = new LinkedHashMap<>();
|
||||
private static final LinkedHashMap<Language, String[]> Months = new LinkedHashMap<>();
|
||||
private static final int[] MaxDaysInMonth = new int[]{31,29,31,30,31,30,31,31,30,31,30,31};
|
||||
|
@ -75,7 +75,7 @@ public class DateDetection {
|
|||
public static enum Language {
|
||||
GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN, PORTUGUESE;
|
||||
}
|
||||
|
||||
|
||||
static {
|
||||
// all names must be lowercase because compared strings are made to lowercase as well
|
||||
Weekdays.put(Language.GERMAN, new String[]{"montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag" /*oder: "sonnabend"*/, "sonntag"});
|
||||
|
@ -91,7 +91,7 @@ public class DateDetection {
|
|||
Months.put(Language.PORTUGUESE,new String[]{"janeiro", "fevereiro", "março", "abril", "maio", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro"});
|
||||
|
||||
}
|
||||
|
||||
|
||||
// RFC 822 day and month specification as a norm for date formats. This is needed to reconstruct the actual date later
|
||||
public static enum Weekday {
|
||||
Mon(Weekdays, 0),
|
||||
|
@ -101,7 +101,7 @@ public class DateDetection {
|
|||
Fri(Weekdays, 4),
|
||||
Sat(Weekdays, 5),
|
||||
Sun(Weekdays, 6);
|
||||
|
||||
|
||||
private final Map<String, Language> inLanguages; // a map from the word to the language
|
||||
public final int offset; // the day offset in the week, monday = 0
|
||||
private Weekday(final LinkedHashMap<Language, String[]> weekdayMap, final int offset) {
|
||||
|
@ -112,7 +112,7 @@ public class DateDetection {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static enum Month {
|
||||
Jan( 1), Feb( 2), Mar( 3), Apr( 4), May( 5), Jun( 6),
|
||||
Jul( 7), Aug( 8), Sep( 9), Oct(10), Nov(11), Dec(12);
|
||||
|
@ -122,7 +122,7 @@ public class DateDetection {
|
|||
this.count = count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static enum EntityType {
|
||||
YEAR(new LinkedHashMap<Language, String[]>()),
|
||||
MONTH(Months),
|
||||
|
@ -142,7 +142,7 @@ public class DateDetection {
|
|||
private final static String DAYCAPTURE = "(\\d{1,2})";
|
||||
private final static String YEARCAPTURE = "(\\d{2}|\\d{4})";
|
||||
private final static String MONTHCAPTURE = "(\\p{L}{3,}|\\d{1,2})";
|
||||
|
||||
|
||||
public static class HolidayMap extends TreeMap<String, Date[]>{
|
||||
private static final long serialVersionUID = 1L;
|
||||
public HolidayMap() {
|
||||
|
@ -152,69 +152,64 @@ public class DateDetection {
|
|||
|
||||
public static HolidayMap Holidays = new HolidayMap();
|
||||
public static Map<Pattern, Date[]> HolidayPattern = new HashMap<>();
|
||||
|
||||
|
||||
static {
|
||||
Holidays.putAll(getHolidays(CURRENT_YEAR));
|
||||
|
||||
|
||||
Holidays.putAll(getHolidays(CURRENT_YEAR));
|
||||
|
||||
for (Map.Entry<String, Date[]> holiday: Holidays.entrySet()) {
|
||||
HolidayPattern.put(Pattern.compile(BODNCG + holiday.getKey() + EODNCG), holiday.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param currentYear
|
||||
* the current year reference to use
|
||||
* @return a new mapping from holiday names to arrays of
|
||||
* three or four holiday dates starting from currentYear - 1. Each date time is 00:00:00 on UTC+00:00 time zone.
|
||||
*/
|
||||
public static HolidayMap getHolidays(final int currentYear) {
|
||||
final HolidayMap result = new HolidayMap();
|
||||
|
||||
/* Date rules from icu4j library used here (SimpleDateRule and EasterRule) use internally the default time zone and this can not be modified (up to icu4j 60.1) */
|
||||
final TimeZone dateRulesTimeZone = TimeZone.getDefault();
|
||||
/**
|
||||
* @param currentYear
|
||||
* the current year reference to use
|
||||
* @return a new mapping from holiday names to arrays of
|
||||
* three or four holiday dates starting from currentYear - 1. Each date time is 00:00:00 on UTC+00:00 time zone.
|
||||
*/
|
||||
public static HolidayMap getHolidays(final int currentYear) {
|
||||
final HolidayMap result = new HolidayMap();
|
||||
|
||||
/* Date rules from icu4j library used here (SimpleDateRule and EasterRule) use internally the default time zone and this can not be modified (up to icu4j 60.1) */
|
||||
final TimeZone dateRulesTimeZone = TimeZone.getDefault();
|
||||
// German
|
||||
result.put("Neujahr", sameDayEveryYear(Calendar.JANUARY, 1, currentYear));
|
||||
result.put("Heilige Drei Könige", sameDayEveryYear(Calendar.JANUARY, 6, currentYear));
|
||||
result.put("Valentinstag", sameDayEveryYear(Calendar.FEBRUARY, 14, currentYear));
|
||||
|
||||
|
||||
/* Fat Thursday : Thursday (6 days) before Ash Wednesday (52 days before Easter Sunday) */
|
||||
result.put("Weiberfastnacht", holiDayEventRule(new EasterHoliday(-52, "Weiberfastnacht").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/02/27"), CONFORM.parse("2015/02/12"), CONFORM.parse("2016/02/04")});
|
||||
|
||||
result.put("Weiberfasching", result.get("Weiberfastnacht"));
|
||||
|
||||
|
||||
/* Rose Monday : Monday before Ash Wednesday (48 days before Easter Sunday) */
|
||||
result.put("Rosenmontag", holiDayEventRule(new EasterHoliday(-48, "Rosenmontag").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/03/03"), CONFORM.parse("2015/03/16"), CONFORM.parse("2016/02/08")});
|
||||
|
||||
result.put("Faschingsdienstag", holiDayEventRule(EasterHoliday.SHROVE_TUESDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/03/04"), CONFORM.parse("2015/03/17"), CONFORM.parse("2016/02/09")});
|
||||
result.put("Fastnacht", result.get("Faschingsdienstag")); // new Date[]{CONFORM.parse("2014/03/04"), CONFORM.parse("2015/03/17"), CONFORM.parse("2016/02/09")});
|
||||
result.put("Aschermittwoch", holiDayEventRule(EasterHoliday.ASH_WEDNESDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/03/05"), CONFORM.parse("2015/03/18"), CONFORM.parse("2016/02/10")});
|
||||
result.put("Palmsonntag", holiDayEventRule(EasterHoliday.PALM_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/13"), CONFORM.parse("2015/03/29"), CONFORM.parse("2016/04/20")});
|
||||
result.put("Gründonnerstag", holiDayEventRule(EasterHoliday.MAUNDY_THURSDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/17"), CONFORM.parse("2015/04/02"), CONFORM.parse("2016/04/24")});
|
||||
result.put("Karfreitag", holiDayEventRule(EasterHoliday.GOOD_FRIDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/18"), CONFORM.parse("2015/04/03"), CONFORM.parse("2016/04/25")});
|
||||
|
||||
|
||||
/* Holy Saturday (also called Easter Eve, Black Saturday) : one day before Easter Sunday */
|
||||
result.put("Karsamstag", holiDayEventRule(new EasterHoliday(-1, "Karsamstag").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/04/19"), CONFORM.parse("2015/04/04"), CONFORM.parse("2016/04/26")});
|
||||
result.put("Ostersonntag", holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/20"), CONFORM.parse("2015/04/05"), CONFORM.parse("2016/04/27")});
|
||||
result.put("Ostermontag", holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/21"), CONFORM.parse("2015/04/06"), CONFORM.parse("2016/04/28")});
|
||||
|
||||
|
||||
/* Include both Easter Sunday and Monday */
|
||||
result.put("Ostern", getOsternEventRule(currentYear, dateRulesTimeZone));
|
||||
|
||||
result.put("Walpurgisnacht", sameDayEveryYear(Calendar.APRIL, 30, currentYear));
|
||||
result.put("Tag der Arbeit", sameDayEveryYear(Calendar.MAY, 1, currentYear));
|
||||
|
||||
|
||||
/* Mother's Day : second Sunday of May in Germany */
|
||||
final Date[] mothersDays = new Date[3];
|
||||
int year = currentYear - 1;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
final LocalDate firstMay = LocalDate.of(year, java.time.Month.MAY, 1);
|
||||
final LocalDate mothersDay = firstMay.with(TemporalAdjusters.firstInMonth(DayOfWeek.SUNDAY)).with(TemporalAdjusters.next(DayOfWeek.SUNDAY));
|
||||
mothersDays[i] = toMidnightUTCDate(mothersDay);
|
||||
year++;
|
||||
final LocalDate firstMay = LocalDate.of(year, java.time.Month.MAY, 1);
|
||||
final LocalDate mothersDay = firstMay.with(TemporalAdjusters.firstInMonth(DayOfWeek.SUNDAY)).with(TemporalAdjusters.next(DayOfWeek.SUNDAY));
|
||||
mothersDays[i] = toMidnightUTCDate(mothersDay);
|
||||
year++;
|
||||
}
|
||||
result.put("Muttertag", mothersDays);
|
||||
|
||||
result.put("Christi Himmelfahrt", holiDayEventRule(EasterHoliday.ASCENSION.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/05/29"), CONFORM.parse("2015/05/14"), CONFORM.parse("2016/05/05")});
|
||||
result.put("Pfingstsonntag", holiDayEventRule(EasterHoliday.WHIT_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/06/08"), CONFORM.parse("2015/05/24"), CONFORM.parse("2016/05/15")});
|
||||
result.put("Pfingstmontag", holiDayEventRule(EasterHoliday.WHIT_MONDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/06/09"), CONFORM.parse("2015/05/25"), CONFORM.parse("2016/05/16")});
|
||||
|
@ -226,50 +221,48 @@ public class DateDetection {
|
|||
result.put("Allerseelen", sameDayEveryYear(Calendar.NOVEMBER, 2, currentYear));
|
||||
result.put("Martinstag", sameDayEveryYear(Calendar.NOVEMBER, 11, currentYear));
|
||||
result.put("St. Martin", result.get("Martinstag"));
|
||||
|
||||
result.put("Buß- und Bettag", holiDayEventRule(new SimpleDateRule(Calendar.NOVEMBER, 22, Calendar.WEDNESDAY, true), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/11/19"), CONFORM.parse("2015/11/18"), CONFORM.parse("2016/11/16")});
|
||||
|
||||
result.put("Nikolaus", sameDayEveryYear(Calendar.DECEMBER, 6, currentYear));
|
||||
result.put("Heiligabend", sameDayEveryYear(Calendar.DECEMBER, 24, currentYear));
|
||||
result.put("1. Weihnachtsfeiertag", sameDayEveryYear(Calendar.DECEMBER, 25, currentYear));
|
||||
result.put("2. Weihnachtsfeiertag", sameDayEveryYear(Calendar.DECEMBER, 26, currentYear));
|
||||
|
||||
/* Advent : four Sundays before Christmas */
|
||||
final Date[] advents1 = new Date[3], advents2 = new Date[3], advents3 = new Date[3], advents4 = new Date[3],
|
||||
volkstrauertagen = new Date[3], sundaysOfTheDead = new Date[3];
|
||||
|
||||
year = currentYear - 1;
|
||||
final TemporalAdjuster prevSunday = TemporalAdjusters.previous(DayOfWeek.SUNDAY);
|
||||
for (int i = 0; i < 3; i++) {
|
||||
final LocalDate christmas = LocalDate.of(year, java.time.Month.DECEMBER, 25);
|
||||
final LocalDate advent4 = christmas.with(prevSunday);
|
||||
final LocalDate advent3 = advent4.with(prevSunday);
|
||||
final LocalDate advent2 = advent3.with(prevSunday);
|
||||
final LocalDate advent1 = advent2.with(prevSunday);
|
||||
final LocalDate sundayOfTheDead = advent1.with(prevSunday);
|
||||
final LocalDate volkstrauertag = sundayOfTheDead.with(prevSunday);
|
||||
advents4[i] = toMidnightUTCDate(advent4);
|
||||
advents3[i] = toMidnightUTCDate(advent3);
|
||||
advents2[i] = toMidnightUTCDate(advent2);
|
||||
advents1[i] = toMidnightUTCDate(advent1);
|
||||
sundaysOfTheDead[i] = toMidnightUTCDate(sundayOfTheDead);
|
||||
volkstrauertagen[i] = toMidnightUTCDate(volkstrauertag);
|
||||
year++;
|
||||
}
|
||||
|
||||
result.put("1. Advent", advents1);
|
||||
result.put("2. Advent", advents2);
|
||||
result.put("3. Advent", advents3);
|
||||
result.put("4. Advent", advents4);
|
||||
/* Advent : four Sundays before Christmas */
|
||||
final Date[] advents1 = new Date[3], advents2 = new Date[3], advents3 = new Date[3], advents4 = new Date[3],
|
||||
volkstrauertagen = new Date[3], sundaysOfTheDead = new Date[3];
|
||||
|
||||
/* Sunday of the Dead (also called Eternity Sunday) : last Sunday before Advent */
|
||||
year = currentYear - 1;
|
||||
final TemporalAdjuster prevSunday = TemporalAdjusters.previous(DayOfWeek.SUNDAY);
|
||||
for (int i = 0; i < 3; i++) {
|
||||
final LocalDate christmas = LocalDate.of(year, java.time.Month.DECEMBER, 25);
|
||||
final LocalDate advent4 = christmas.with(prevSunday);
|
||||
final LocalDate advent3 = advent4.with(prevSunday);
|
||||
final LocalDate advent2 = advent3.with(prevSunday);
|
||||
final LocalDate advent1 = advent2.with(prevSunday);
|
||||
final LocalDate sundayOfTheDead = advent1.with(prevSunday);
|
||||
final LocalDate volkstrauertag = sundayOfTheDead.with(prevSunday);
|
||||
advents4[i] = toMidnightUTCDate(advent4);
|
||||
advents3[i] = toMidnightUTCDate(advent3);
|
||||
advents2[i] = toMidnightUTCDate(advent2);
|
||||
advents1[i] = toMidnightUTCDate(advent1);
|
||||
sundaysOfTheDead[i] = toMidnightUTCDate(sundayOfTheDead);
|
||||
volkstrauertagen[i] = toMidnightUTCDate(volkstrauertag);
|
||||
year++;
|
||||
}
|
||||
|
||||
result.put("1. Advent", advents1);
|
||||
result.put("2. Advent", advents2);
|
||||
result.put("3. Advent", advents3);
|
||||
result.put("4. Advent", advents4);
|
||||
|
||||
/* Sunday of the Dead (also called Eternity Sunday) : last Sunday before Advent */
|
||||
result.put("Totensonntag", sundaysOfTheDead);
|
||||
|
||||
/* "people's day of mourning" : two Sundays before Advent */
|
||||
result.put("Volkstrauertag", volkstrauertagen);
|
||||
|
||||
result.put("Volkstrauertag", volkstrauertagen);
|
||||
|
||||
result.put("Silvester", sameDayEveryYear(Calendar.DECEMBER, 31, currentYear));
|
||||
|
||||
|
||||
// English
|
||||
result.put("Eastern", result.get("Ostern"));
|
||||
result.put("New Year's Day", result.get("Neujahr"));
|
||||
|
@ -286,23 +279,23 @@ public class DateDetection {
|
|||
result.put("Christmas Day", result.get("1. Weihnachtsfeiertag"));
|
||||
result.put("Boxing Day", result.get("2. Weihnachtsfeiertag"));
|
||||
result.put("New Year's Eve", result.get("Silvester"));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a date to an old style java.util.Date instance with time set at
|
||||
* midnight on UTC time zone.
|
||||
*
|
||||
* @param localDate
|
||||
* a simple date with year month and day without time zone
|
||||
* @return a java.util.Date instance or null when localDate is null
|
||||
*/
|
||||
public static Date toMidnightUTCDate(final LocalDate localDate) {
|
||||
if (localDate == null) {
|
||||
return null;
|
||||
}
|
||||
return Date.from(ZonedDateTime.of(localDate, LocalTime.MIDNIGHT, UTC_TIMEZONE.toZoneId()).toInstant());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a date to an old style java.util.Date instance with time set at
|
||||
* midnight on UTC time zone.
|
||||
*
|
||||
* @param localDate
|
||||
* a simple date with year month and day without time zone
|
||||
* @return a java.util.Date instance or null when localDate is null
|
||||
*/
|
||||
public static Date toMidnightUTCDate(final LocalDate localDate) {
|
||||
if (localDate == null) {
|
||||
return null;
|
||||
}
|
||||
return Date.from(ZonedDateTime.of(localDate, LocalTime.MIDNIGHT, UTC_TIMEZONE.toZoneId()).toInstant());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param month value of month (Calendar.month is 0 based)
|
||||
|
@ -330,40 +323,40 @@ public class DateDetection {
|
|||
* @return 3 years of same holiday starting in last year (currentYear - 1)
|
||||
*/
|
||||
private static Date[] holiDayEventRule(final DateRule holidayrule, final int currentYear, final TimeZone ruleTimeZone) {
|
||||
final Date[] r = new Date[3];
|
||||
final Calendar january1Calendar = new GregorianCalendar(ruleTimeZone);
|
||||
/* Clear all fields to get a 00:00:00:000 time part */
|
||||
january1Calendar.clear();
|
||||
|
||||
/* Calendar using UTC time zone to produce date results */
|
||||
final Calendar utcCalendar = new GregorianCalendar(UTC_TIMEZONE);
|
||||
|
||||
/* Calendar using the same time zone as in the holidayrule to extract year,month, and day fields */
|
||||
final Calendar ruleCalendar = new GregorianCalendar(ruleTimeZone);
|
||||
final Date[] r = new Date[3];
|
||||
final Calendar january1Calendar = new GregorianCalendar(ruleTimeZone);
|
||||
/* Clear all fields to get a 00:00:00:000 time part */
|
||||
january1Calendar.clear();
|
||||
|
||||
int year = currentYear -1; // set previous year as start year
|
||||
for (int y = 0; y < 3; y++) {
|
||||
january1Calendar.set(year, Calendar.JANUARY, 1);
|
||||
Date holiday = holidayrule.firstAfter(january1Calendar.getTime());
|
||||
ruleCalendar.setTime(holiday);
|
||||
utcCalendar.set(ruleCalendar.get(Calendar.YEAR), ruleCalendar.get(Calendar.MONTH),
|
||||
ruleCalendar.get(Calendar.DAY_OF_MONTH));
|
||||
r[y] = utcCalendar.getTime();
|
||||
year++;
|
||||
}
|
||||
return r;
|
||||
/* Calendar using UTC time zone to produce date results */
|
||||
final Calendar utcCalendar = new GregorianCalendar(UTC_TIMEZONE);
|
||||
|
||||
/* Calendar using the same time zone as in the holidayrule to extract year,month, and day fields */
|
||||
final Calendar ruleCalendar = new GregorianCalendar(ruleTimeZone);
|
||||
|
||||
int year = currentYear -1; // set previous year as start year
|
||||
for (int y = 0; y < 3; y++) {
|
||||
january1Calendar.set(year, Calendar.JANUARY, 1);
|
||||
Date holiday = holidayrule.firstAfter(january1Calendar.getTime());
|
||||
ruleCalendar.setTime(holiday);
|
||||
utcCalendar.set(ruleCalendar.get(Calendar.YEAR), ruleCalendar.get(Calendar.MONTH),
|
||||
ruleCalendar.get(Calendar.DAY_OF_MONTH));
|
||||
r[y] = utcCalendar.getTime();
|
||||
year++;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param currentYear the current year reference to use
|
||||
* @param ruleTimeZone the time zone of calendar used in the holiday rule
|
||||
* @return Easter sunday and monday dates on three years starting from last year
|
||||
*/
|
||||
private static Date[] getOsternEventRule(final int currentYear, final TimeZone ruleTimeZone) {
|
||||
ArrayList<Date> osternDates = new ArrayList<>();
|
||||
Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, ruleTimeZone));
|
||||
Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, ruleTimeZone));
|
||||
return osternDates.toArray(new Date[osternDates.size()]);
|
||||
ArrayList<Date> osternDates = new ArrayList<>();
|
||||
Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, ruleTimeZone));
|
||||
Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, ruleTimeZone));
|
||||
return osternDates.toArray(new Date[osternDates.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -371,7 +364,7 @@ public class DateDetection {
|
|||
* It can also be used to identify the language of a text, if that text uses words from a date vocabulary.
|
||||
*/
|
||||
public static class LanguageRecognition {
|
||||
|
||||
|
||||
private final Pattern weekdayMatch, monthMatch;
|
||||
private final Set<Language> usedInLanguages;
|
||||
private final Map<String, Integer> weekdayIndex, monthIndex, monthIndexAbbrev;
|
||||
|
@ -395,7 +388,7 @@ public class DateDetection {
|
|||
weekdayMatchString.append("|(?:").append(BODNCG).append(weekdays[i]).append(SEPARATORNCG).append(EODNCG).append(')');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
String[] months = Months.get(language);
|
||||
if (months != null) {
|
||||
assert months.length == 12;
|
||||
|
@ -413,7 +406,7 @@ public class DateDetection {
|
|||
this.weekdayMatch = Pattern.compile(weekdayMatchString.length() > 0 ? weekdayMatchString.substring(1) : "");
|
||||
this.monthMatch = Pattern.compile(monthMatchString.length() > 0 ? monthMatchString.substring(1) : "");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* this is an expensive check that looks if any of the words from the date expressions (month and weekday expressions)
|
||||
* appear in the text. This should only be used to verify a parse result if the result was ambiguous
|
||||
|
@ -423,7 +416,7 @@ public class DateDetection {
|
|||
public boolean usesLanguageOfNotion(String text) {
|
||||
return this.weekdayMatch.matcher(text).matches() || this.monthMatch.matcher(text).matches();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* parse a part of a date
|
||||
* @param entity
|
||||
|
@ -479,7 +472,7 @@ public class DateDetection {
|
|||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
private final static LanguageRecognition ENGLISH_LANGUAGE = new LanguageRecognition(new Language[]{Language.ENGLISH});
|
||||
|
@ -487,7 +480,7 @@ public class DateDetection {
|
|||
private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH});
|
||||
private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH});
|
||||
private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN, Language.PORTUGUESE});
|
||||
|
||||
|
||||
public static interface StyleParser {
|
||||
/**
|
||||
* get all dates in the text
|
||||
|
@ -496,7 +489,7 @@ public class DateDetection {
|
|||
*/
|
||||
public LinkedHashSet<Date> parse(String text);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Regular expressions for various types of date writings.
|
||||
* Uses terminology and data taken from:
|
||||
|
@ -526,7 +519,7 @@ public class DateDetection {
|
|||
this.pattern = Pattern.compile(patternString);
|
||||
this.languageParser = languageParser;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get all dates in the text
|
||||
* @param text
|
||||
|
@ -552,42 +545,42 @@ public class DateDetection {
|
|||
int month = this.firstEntity == EntityType.MONTH ? i1 : this.secondEntity == EntityType.MONTH ? i2 : i3;
|
||||
if (day > MaxDaysInMonth[month - 1]) continue; // validity check of the day number
|
||||
int year = this.firstEntity == EntityType.YEAR ? i1 : this.secondEntity == EntityType.YEAR ? i2 : i3;
|
||||
final Date parsed = parseDateSafely(
|
||||
year + "/" + (month < 10 ? "0" : "") + month + "/" + (day < 10 ? "0" : "") + day, CONFORM);
|
||||
final Date parsed = parseDateSafely(
|
||||
year + "/" + (month < 10 ? "0" : "") + month + "/" + (day < 10 ? "0" : "") + day, CONFORM);
|
||||
if(parsed != null) {
|
||||
dates.add(parsed);
|
||||
dates.add(parsed);
|
||||
}
|
||||
if (dates.size() > 100) {dates.clear(); break;} // that does not make sense
|
||||
}
|
||||
return dates;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Safely parse the given string to an instant using the given formatter. Return
|
||||
* null when the format can not be applied to the given string or when any
|
||||
* parsing error occurred.
|
||||
*
|
||||
* @param str
|
||||
* the string to parse
|
||||
* @param formatter
|
||||
* the formatter to use
|
||||
* @return an Instant instance or null
|
||||
*/
|
||||
protected static Date parseDateSafely(final String str, final DateTimeFormatter formatter) {
|
||||
Date res = null;
|
||||
if (str != null && !str.isEmpty()) {
|
||||
try {
|
||||
if (formatter != null) {
|
||||
res = Date.from(LocalDate.parse(str, formatter).atStartOfDay().toInstant(ZoneOffset.UTC));
|
||||
}
|
||||
} catch (final RuntimeException ignored) {
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Safely parse the given string to an instant using the given formatter. Return
|
||||
* null when the format can not be applied to the given string or when any
|
||||
* parsing error occurred.
|
||||
*
|
||||
* @param str
|
||||
* the string to parse
|
||||
* @param formatter
|
||||
* the formatter to use
|
||||
* @return an Instant instance or null
|
||||
*/
|
||||
protected static Date parseDateSafely(final String str, final DateTimeFormatter formatter) {
|
||||
Date res = null;
|
||||
if (str != null && !str.isEmpty()) {
|
||||
try {
|
||||
if (formatter != null) {
|
||||
res = Date.from(LocalDate.parse(str, formatter).atStartOfDay().toInstant(ZoneOffset.UTC));
|
||||
}
|
||||
} catch (final RuntimeException ignored) {
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
public static enum ShortStyle implements StyleParser {
|
||||
MD_ENGLISH(EntityType.MONTH, EntityType.DAY, // Big-endian (month, day), e.g. "from october 1st to september 13th"
|
||||
ENGLISH_LANGUAGE,
|
||||
|
@ -647,21 +640,21 @@ public class DateDetection {
|
|||
|
||||
final Date atThisYear = parseDateSafely(thisyear + datestub, CONFORM);
|
||||
if(atThisYear != null) {
|
||||
dates.add(atThisYear);
|
||||
dates.add(atThisYear);
|
||||
}
|
||||
|
||||
|
||||
final Date atNextYear = parseDateSafely(nextyear + datestub, CONFORM);
|
||||
if(atNextYear != null) {
|
||||
dates.add(atNextYear);
|
||||
dates.add(atNextYear);
|
||||
}
|
||||
//dates.add(atThisYear.after(TODAY) ? atThisYear : atNextYear); // we consider these kind of dates as given for the future
|
||||
if (dates.size() > 100) {dates.clear(); break;} // that does not make sense
|
||||
}
|
||||
return dates;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static final HashMap<String, Long> specialDayOffset = new HashMap<>();
|
||||
static {
|
||||
specialDayOffset.put("today", 0L); specialDayOffset.put("heute", 0L);
|
||||
|
@ -669,7 +662,7 @@ public class DateDetection {
|
|||
specialDayOffset.put("dayaftertomorrow", 2 * AbstractFormatter.dayMillis); specialDayOffset.put("uebermorgen", 2 * AbstractFormatter.dayMillis);
|
||||
specialDayOffset.put("yesterday", -AbstractFormatter.dayMillis); specialDayOffset.put("gestern", -AbstractFormatter.dayMillis);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get all dates in the text
|
||||
* @param text
|
||||
|
@ -679,7 +672,7 @@ public class DateDetection {
|
|||
public static LinkedHashSet<Date> parse(String text, int timezoneOffset) {
|
||||
|
||||
LinkedHashSet<Date> dates = parseRawDate(text);
|
||||
|
||||
|
||||
for (Map.Entry<Pattern, Date[]> entry: HolidayPattern.entrySet()) {
|
||||
if (entry.getKey().matcher(text).find()) {
|
||||
for (Date d: entry.getValue()) dates.add(d);
|
||||
|
@ -701,12 +694,12 @@ public class DateDetection {
|
|||
Date d = parseDateSafely(text, CONFORM);
|
||||
//if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use
|
||||
if (d == null) {
|
||||
d = parseDateSafely(text, GenericFormatter.FORMAT_RFC1123_SHORT);
|
||||
d = parseDateSafely(text, GenericFormatter.FORMAT_RFC1123_SHORT);
|
||||
}
|
||||
if (d == null) {
|
||||
d = parseDateSafely(text, GenericFormatter.FORMAT_ANSIC);
|
||||
d = parseDateSafely(text, GenericFormatter.FORMAT_ANSIC);
|
||||
}
|
||||
|
||||
|
||||
if (d == null) {
|
||||
// check other date formats
|
||||
Set<Date> dd = parseRawDate(text);
|
||||
|
@ -734,7 +727,7 @@ public class DateDetection {
|
|||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
private static LinkedHashSet<Date> parseRawDate(String text) {
|
||||
// get parse alternatives for different date styles; we consider that one document uses only one style
|
||||
LinkedHashSet<Date> DMYDates = EndianStyle.DMY.parse(text);
|
||||
|
@ -745,34 +738,34 @@ public class DateDetection {
|
|||
if (DMDates.size() > 0) break;
|
||||
}
|
||||
DMYDates.addAll(DMDates);
|
||||
|
||||
|
||||
LinkedHashSet<Date> MDYDates = DMYDates.size() == 0 ? EndianStyle.MDY.parse(text) : new LinkedHashSet<Date>(0);
|
||||
LinkedHashSet<Date> MDDates = DMYDates.size() == 0 ? ShortStyle.MD_ENGLISH.parse(text) : new LinkedHashSet<Date>(0);
|
||||
MDYDates.addAll(MDDates);
|
||||
|
||||
|
||||
LinkedHashSet<Date> YMDDates = DMYDates.size() == 0 && MDYDates.size() == 0 ? EndianStyle.YMD.parse(text) : new LinkedHashSet<Date>(0);
|
||||
|
||||
|
||||
// if either one of them contains any and the other contain no date, chose that one (we don't want to mix them)
|
||||
if (YMDDates.size() > 0 && DMYDates.size() == 0 && MDYDates.size() == 0) return YMDDates;
|
||||
if (YMDDates.size() == 0 && DMYDates.size() > 0 && MDYDates.size() == 0) return DMYDates;
|
||||
if (YMDDates.size() == 0 && DMYDates.size() == 0 && MDYDates.size() > 0) return MDYDates;
|
||||
|
||||
|
||||
// if we have several sets, check if we can detect the language from month or weekday expressions
|
||||
// we sort out such sets, which do not contain any of these languages
|
||||
boolean usesLanguageOfYMD = YMDDates.size() > 0 ? false : EndianStyle.YMD.languageParser.usesLanguageOfNotion(text);
|
||||
boolean usesLanguageOfDMY = DMYDates.size() > 0 ? false : EndianStyle.DMY.languageParser.usesLanguageOfNotion(text);
|
||||
boolean usesLanguageOfMDY = MDYDates.size() > 0 ? false : EndianStyle.MDY.languageParser.usesLanguageOfNotion(text);
|
||||
|
||||
|
||||
// now check again
|
||||
if (usesLanguageOfYMD && !usesLanguageOfDMY && !usesLanguageOfMDY) return YMDDates;
|
||||
if (!usesLanguageOfYMD && usesLanguageOfDMY && !usesLanguageOfMDY) return DMYDates;
|
||||
if (!usesLanguageOfYMD && !usesLanguageOfDMY && usesLanguageOfMDY) return MDYDates;
|
||||
|
||||
|
||||
// if this fails, we return only the DMY format since that has the most chances to be right (it is mostly used)
|
||||
// we choose DMYDates even if it is empty to avoid false positives.
|
||||
return DMYDates;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
String fill = ""; for (int i = 0; i < 1000; i++) fill += 'x';
|
||||
String[] test = new String[]{
|
||||
|
@ -819,6 +812,6 @@ public class DateDetection {
|
|||
System.out.println();
|
||||
}
|
||||
System.out.println("Runtime: " + (System.currentTimeMillis() - t) + " milliseconds.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,57 +34,57 @@ import java.util.List;
|
|||
*/
|
||||
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
|
||||
|
||||
/** Holds the next element */
|
||||
/** Holds the next element */
|
||||
private StringBuilder buffer;
|
||||
|
||||
|
||||
/** List of already parsed sentences, possibly in addition to those extracted from the main text. */
|
||||
private List<StringBuilder> parsedSentences;
|
||||
|
||||
|
||||
/** Current position in the parsedSentences list. */
|
||||
private int sentencesPos;
|
||||
|
||||
|
||||
/** The main text to parse for sentences */
|
||||
private String text;
|
||||
|
||||
|
||||
/** The current character position in the main text */
|
||||
private int pos;
|
||||
|
||||
|
||||
/** When true sentences can not include line break characters */
|
||||
private boolean pre = false;
|
||||
|
||||
/**
 * Create a reader over the given text, starting without any already parsed
 * sentence and with the pre flag off.
 *
 * @param text the main text to parse for sentences
 */
public SentenceReader(final String text) {
    this(text, false);
}
|
||||
|
||||
/**
 * Create a reader over the given text, starting without any already parsed sentence.
 *
 * @param text the main text to parse for sentences
 * @param pre initial value of the pre flag
 */
public SentenceReader(final String text, final boolean pre) {
    this(new ArrayList<StringBuilder>(), text, pre);
}
|
||||
|
||||
|
||||
/**
 * Create a reader that first hands out the given already parsed sentences.
 *
 * @param parsedSentences already parsed sentences; a null value is replaced by a new empty list
 * @param text the main text to parse for sentences, must not be null
 * @param pre initial value of the pre flag
 */
public SentenceReader(final List<StringBuilder> parsedSentences, final String text, final boolean pre) {
    assert text != null;
    this.parsedSentences = (parsedSentences != null) ? parsedSentences : new ArrayList<>();
    this.sentencesPos = 0;
    this.text = text;
    this.pos = 0;
    this.pre = pre;
    /* must come last: loading the first element reads the fields set above */
    this.buffer = nextElement0();
}
|
||||
|
||||
|
||||
/**
 * Set the pre flag of this reader.
 *
 * @param x the new value stored in the pre field
 */
public void pre(final boolean x) {
|
||||
this.pre = x;
|
||||
}
|
||||
|
||||
private StringBuilder nextElement0() {
|
||||
if(this.sentencesPos < this.parsedSentences.size()) {
|
||||
final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
|
||||
this.sentencesPos++;
|
||||
return element;
|
||||
}
|
||||
|
||||
if(this.sentencesPos < this.parsedSentences.size()) {
|
||||
final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
|
||||
this.sentencesPos++;
|
||||
return element;
|
||||
}
|
||||
|
||||
final StringBuilder s = new StringBuilder(80);
|
||||
int nextChar;
|
||||
char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
|
||||
|
@ -112,10 +112,10 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
|
|||
}
|
||||
|
||||
public final static boolean invisible(final char c) {
|
||||
// first check average simple case
|
||||
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
|
||||
// then check more complex case which applies to all character sets
|
||||
final int type = Character.getType(c);
|
||||
// first check average simple case
|
||||
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
|
||||
// then check more complex case which applies to all character sets
|
||||
final int type = Character.getType(c);
|
||||
return !(type == Character.LOWERCASE_LETTER
|
||||
|| type == Character.DECIMAL_DIGIT_NUMBER
|
||||
|| type == Character.UPPERCASE_LETTER
|
||||
|
@ -153,19 +153,19 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
|
|||
public Iterator<StringBuilder> iterator() {
// The class implements both Iterable and Iterator: the reader hands out itself, no separate iterator object is created.
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reset the iterator position to zero
|
||||
*/
|
||||
public void reset() {
|
||||
/* Reset only the sentences position to reuse already parsed sentences */
|
||||
this.sentencesPos = 0;
|
||||
this.buffer = nextElement0();
|
||||
/* Reset only the sentences position to reuse already parsed sentences */
|
||||
this.sentencesPos = 0;
|
||||
this.buffer = nextElement0();
|
||||
}
|
||||
|
||||
public synchronized void close() {
|
||||
this.text = null;
|
||||
this.parsedSentences = null;
|
||||
this.text = null;
|
||||
this.parsedSentences = null;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
@ -59,7 +59,7 @@ public class Tokenizer {
|
|||
protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
|
||||
private final Set<String> synonyms; // a set of synonyms to the words
|
||||
protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
|
||||
|
||||
|
||||
public int RESULT_NUMB_WORDS = -1;
|
||||
public int RESULT_NUMB_SENTENCES = -1;
|
||||
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
||||
|
@ -70,7 +70,7 @@ public class Tokenizer {
|
|||
assert text != null;
|
||||
final String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
|
||||
for (int i = 0; i < wordcache.length; i++) {
|
||||
wordcache[i] = "";
|
||||
wordcache[i] = "";
|
||||
}
|
||||
String k;
|
||||
int wordlen;
|
||||
|
@ -167,95 +167,95 @@ public class Tokenizer {
|
|||
if (syms != null) this.synonyms.addAll(syms);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// store result
|
||||
this.RESULT_NUMB_WORDS = allwordcounter;
|
||||
// if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
|
||||
this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether a single word or multiple ones match tags
|
||||
* from the given autotagging vocabularies. Then fill this instance "tags" map
|
||||
* with the eventually matching tags found.
|
||||
*
|
||||
* @param wordcache
|
||||
* the words to be checked for matching a tag as a single word or as combination of words
|
||||
* @param word
|
||||
* an additional word to be considered for tag matching
|
||||
* @param vocabularyNames
|
||||
* names of the autotagging vocabularies to check
|
||||
*/
|
||||
protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set<String> vocabularyNames) {
|
||||
Tagging.Metatag tag;
|
||||
if (vocabularyNames.size() > 0) {
|
||||
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
|
||||
// wordc is number of words that are tested
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (wordc == 1) {
|
||||
sb.append(word);
|
||||
} else {
|
||||
for (int w = 0; w < wordc - 1; w++) {
|
||||
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
|
||||
}
|
||||
sb.append(word);
|
||||
}
|
||||
String testterm = sb.toString().trim();
|
||||
tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
|
||||
if (tag != null) {
|
||||
String navigatorName = tag.getVocabularyName();
|
||||
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
||||
if (tagset == null) {
|
||||
tagset = new HashSet<Tagging.Metatag>();
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
tagset.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Check whether a single word or multiple ones match tags
|
||||
* from the given autotagging vocabularies. Then fill this instance "tags" map
|
||||
* with the eventually matching tags found.
|
||||
*
|
||||
* @param wordcache
|
||||
* the words to be checked for matching a tag as a single word or as combination of words
|
||||
* @param word
|
||||
* an additional word to be considered for tag matching
|
||||
* @param vocabularyNames
|
||||
* names of the autotagging vocabularies to check
|
||||
*/
|
||||
protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set<String> vocabularyNames) {
|
||||
Tagging.Metatag tag;
|
||||
if (vocabularyNames.size() > 0) {
|
||||
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
|
||||
// wordc is number of words that are tested
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (wordc == 1) {
|
||||
sb.append(word);
|
||||
} else {
|
||||
for (int w = 0; w < wordc - 1; w++) {
|
||||
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
|
||||
}
|
||||
sb.append(word);
|
||||
}
|
||||
String testterm = sb.toString().trim();
|
||||
tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
|
||||
if (tag != null) {
|
||||
String navigatorName = tag.getVocabularyName();
|
||||
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
||||
if (tagset == null) {
|
||||
tagset = new HashSet<Tagging.Metatag>();
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
tagset.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extend the specified vocabularies, with terms eventually found by the
|
||||
* vocabulary scraper for these vocabularies. The scraper is emptied after
|
||||
* processing, and extended vocabularies names are removed from the
|
||||
* vocabularyNames.
|
||||
*
|
||||
* @param root
|
||||
* the document URL
|
||||
* @param scraper
|
||||
* the vocabulary scraper, eventually containing new terms scraped
|
||||
* for the registered vocabularies
|
||||
* @param vocabularyNames
|
||||
* vocabularies names to be extended
|
||||
*/
|
||||
protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper,
|
||||
final Set<String> vocabularyNames) {
|
||||
Tagging.Metatag tag;
|
||||
Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
|
||||
if (vocMap != null && vocMap.size() > 0) {
|
||||
for (Map.Entry<String, String> entry: vocMap.entrySet()) {
|
||||
String navigatorName = entry.getKey();
|
||||
String term = entry.getValue();
|
||||
vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
|
||||
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
|
||||
if (vocabulary != null) {
|
||||
// extend the vocabulary
|
||||
String obj = vocabulary.getObjectlink(term);
|
||||
if (obj == null) {
|
||||
try {
|
||||
vocabulary.put(term, "", root.toNormalform(true));
|
||||
} catch (IOException e) {} // this makes IO, be careful!
|
||||
}
|
||||
// create annotation
|
||||
tag = vocabulary.getMetatagFromTerm(term);
|
||||
Set<Tagging.Metatag> tagset = new HashSet<>();
|
||||
tagset.add(tag);
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Extend the specified vocabularies, with terms eventually found by the
|
||||
* vocabulary scraper for these vocabularies. The scraper is emptied after
|
||||
* processing, and extended vocabularies names are removed from the
|
||||
* vocabularyNames.
|
||||
*
|
||||
* @param root
|
||||
* the document URL
|
||||
* @param scraper
|
||||
* the vocabulary scraper, eventually containing new terms scraped
|
||||
* for the registered vocabularies
|
||||
* @param vocabularyNames
|
||||
* vocabularies names to be extended
|
||||
*/
|
||||
protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper,
|
||||
final Set<String> vocabularyNames) {
|
||||
Tagging.Metatag tag;
|
||||
Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
|
||||
if (vocMap != null && vocMap.size() > 0) {
|
||||
for (Map.Entry<String, String> entry: vocMap.entrySet()) {
|
||||
String navigatorName = entry.getKey();
|
||||
String term = entry.getValue();
|
||||
vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
|
||||
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
|
||||
if (vocabulary != null) {
|
||||
// extend the vocabulary
|
||||
String obj = vocabulary.getObjectlink(term);
|
||||
if (obj == null) {
|
||||
try {
|
||||
vocabulary.put(term, "", root.toNormalform(true));
|
||||
} catch (IOException e) {} // this makes IO, be careful!
|
||||
}
|
||||
// create annotation
|
||||
tag = vocabulary.getMetatagFromTerm(term);
|
||||
Set<Tagging.Metatag> tagset = new HashSet<>();
|
||||
tagset.add(tag);
|
||||
this.tags.put(navigatorName, tagset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return returns the words as word/indexWord relation map. All words are lowercase.
|
||||
|
@ -264,7 +264,7 @@ public class Tokenizer {
|
|||
// returns the words as word/indexWord relation map
|
||||
return this.words;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
||||
// returns a word/indexWord relation map
|
||||
if (text == null) return null;
|
||||
|
@ -276,7 +276,7 @@ public class Tokenizer {
|
|||
for (String s: this.synonyms) l.add(s);
|
||||
return l;
|
||||
}
|
||||
|
||||
|
||||
public Map<String, Set<Tagging.Metatag>> tags() {
|
||||
return this.tags;
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ import net.yacy.kelondro.data.word.Word;
|
|||
|
||||
|
||||
public class WordTokenizer implements Enumeration<StringBuilder> {
|
||||
// this enumeration removes all words that contain either wrong characters or are too short
|
||||
// this enumeration removes all words that contain either wrong characters or are too short
|
||||
|
||||
private StringBuilder buffer = null;
|
||||
private unsievedWordsEnum e;
|
||||
|
@ -78,9 +78,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
}
|
||||
|
||||
public synchronized void close() {
|
||||
this.e.close();
|
||||
this.e = null;
|
||||
this.buffer = null;
|
||||
this.e.close();
|
||||
this.e = null;
|
||||
this.buffer = null;
|
||||
}
|
||||
|
||||
private class unsievedWordsEnum implements Enumeration<StringBuilder> {
|
||||
|
@ -189,29 +189,29 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
|
||||
WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
|
||||
try {
|
||||
int pos = 0;
|
||||
StringBuilder word;
|
||||
byte[] hash;
|
||||
Integer oldpos;
|
||||
while (words.hasMoreElements() && maxlength-- > 0) {
|
||||
word = words.nextElement();
|
||||
hash = Word.word2hash(word);
|
||||
int pos = 0;
|
||||
StringBuilder word;
|
||||
byte[] hash;
|
||||
Integer oldpos;
|
||||
while (words.hasMoreElements() && maxlength-- > 0) {
|
||||
word = words.nextElement();
|
||||
hash = Word.word2hash(word);
|
||||
|
||||
// don't overwrite old values, that leads to too far word distances
|
||||
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
|
||||
if (oldpos != null) {
|
||||
map.put(hash, oldpos);
|
||||
}
|
||||
// don't overwrite old values, that leads to too far word distances
|
||||
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
|
||||
if (oldpos != null) {
|
||||
map.put(hash, oldpos);
|
||||
}
|
||||
|
||||
pos += word.length() + 1;
|
||||
}
|
||||
return map;
|
||||
pos += word.length() + 1;
|
||||
}
|
||||
return map;
|
||||
} finally {
|
||||
words.close();
|
||||
words = null;
|
||||
words.close();
|
||||
words = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Tokenize the given sentence and generate a word-wordPos mapping
|
||||
* @param sentence the sentence to be tokenized
|
||||
|
@ -221,24 +221,24 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
|
||||
WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
|
||||
try {
|
||||
int pos = 0;
|
||||
String word;
|
||||
Integer oldpos;
|
||||
while (words.hasMoreElements() && maxlength-- > 0) {
|
||||
word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
||||
int pos = 0;
|
||||
String word;
|
||||
Integer oldpos;
|
||||
while (words.hasMoreElements() && maxlength-- > 0) {
|
||||
word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
||||
|
||||
// don't overwrite old values, that leads to too far word distances
|
||||
oldpos = map.put(word, LargeNumberCache.valueOf(pos));
|
||||
if (oldpos != null) {
|
||||
map.put(word, oldpos);
|
||||
}
|
||||
// don't overwrite old values, that leads to too far word distances
|
||||
oldpos = map.put(word, LargeNumberCache.valueOf(pos));
|
||||
if (oldpos != null) {
|
||||
map.put(word, oldpos);
|
||||
}
|
||||
|
||||
pos += word.length() + 1;
|
||||
}
|
||||
return map;
|
||||
pos += word.length() + 1;
|
||||
}
|
||||
return map;
|
||||
} finally {
|
||||
words.close();
|
||||
words = null;
|
||||
words.close();
|
||||
words = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -59,7 +59,7 @@ public final class Identificator {
|
|||
*/
|
||||
public void add(final String word) {
|
||||
if (word == null || this.detector == null) {
|
||||
return;
|
||||
return;
|
||||
}
|
||||
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
|
||||
}
|
||||
|
@ -71,24 +71,24 @@ public final class Identificator {
|
|||
* @return 2 char language code (ISO 639-1)
|
||||
*/
|
||||
public String getLanguage() {
|
||||
if(this.detector != null) {
|
||||
try {
|
||||
ArrayList<Language> probabilities = this.detector.getProbabilities();
|
||||
if(probabilities.isEmpty()) return null;
|
||||
this.language = this.detector.getProbabilities().get(0);
|
||||
} catch (LangDetectException e) {
|
||||
// this contains mostly the message "no features in text"
|
||||
//ConcurrentLog.logException(e);
|
||||
return null;
|
||||
}
|
||||
// Return language only if probability is higher than 30% to account for missing language profiles
|
||||
if (this.language.prob > 0.3) {
|
||||
if (this.language.lang.length() == 2) {
|
||||
return this.language.lang;
|
||||
}
|
||||
return this.language.lang.substring(0,2);
|
||||
}
|
||||
}
|
||||
if(this.detector != null) {
|
||||
try {
|
||||
ArrayList<Language> probabilities = this.detector.getProbabilities();
|
||||
if(probabilities.isEmpty()) return null;
|
||||
this.language = this.detector.getProbabilities().get(0);
|
||||
} catch (LangDetectException e) {
|
||||
// this contains mostly the message "no features in text"
|
||||
//ConcurrentLog.logException(e);
|
||||
return null;
|
||||
}
|
||||
// Return language only if probability is higher than 30% to account for missing language profiles
|
||||
if (this.language.prob > 0.3) {
|
||||
if (this.language.lang.length() == 2) {
|
||||
return this.language.lang;
|
||||
}
|
||||
return this.language.lang.substring(0,2);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
|
|
|
@ -111,17 +111,17 @@ public class Word {
|
|||
|
||||
// create a word hash
|
||||
public static final byte[] word2hash(final String word) {
|
||||
final String wordlc = word.toLowerCase(Locale.ENGLISH);
|
||||
byte[] h = hashCache.get(wordlc);
|
||||
final String wordlc = word.toLowerCase(Locale.ENGLISH);
|
||||
byte[] h = hashCache.get(wordlc);
|
||||
if (h != null) return h;
|
||||
// calculate the hash
|
||||
h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
|
||||
while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) {
|
||||
// ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
|
||||
// statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)
|
||||
System.arraycopy(h, 1, h, 0, commonHashLength - 1);
|
||||
h[commonHashLength - 1] = lowByte;
|
||||
}
|
||||
h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
|
||||
while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) {
|
||||
// ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
|
||||
// statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)
|
||||
System.arraycopy(h, 1, h, 0, commonHashLength - 1);
|
||||
h[commonHashLength - 1] = lowByte;
|
||||
}
|
||||
assert h[2] != '@';
|
||||
if (MemoryControl.shortStatus()) {
|
||||
hashCache.clear();
|
||||
|
|
|
@ -73,11 +73,11 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
// available chars: b,e,j,q
|
||||
|
||||
/**
|
||||
* object for termination of concurrent blocking queue processing
|
||||
*/
|
||||
* object for termination of concurrent blocking queue processing
|
||||
*/
|
||||
protected static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
|
||||
|
||||
// static properties
|
||||
|
||||
// static properties
|
||||
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
||||
private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears
|
||||
private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short
|
||||
|
@ -207,7 +207,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
this.entry.setCol(col_posinphrase, word.posInPhrase);
|
||||
this.entry.setCol(col_posofphrase, word.numOfPhrase);
|
||||
}
|
||||
|
||||
|
||||
public WordReferenceRow(final String external) {
|
||||
this.entry = urlEntryRow.newEntry(external, true);
|
||||
}
|
||||
|
|
|
@ -58,9 +58,9 @@ public final class SetTools {
|
|||
public static int log2a(int x) {
|
||||
// this computes 1 + log2
|
||||
// it is the number of bits in x, not the logarithm by 2
|
||||
int l = 0;
|
||||
while (x > 0) {x = x >>> 1; l++;}
|
||||
return l;
|
||||
int l = 0;
|
||||
while (x > 0) {x = x >>> 1; l++;}
|
||||
return l;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
@ -178,7 +178,7 @@ public final class SetTools {
|
|||
Map.Entry<A, B> mentry1 = mi1.next();
|
||||
Map.Entry<A, B> mentry2 = mi2.next();
|
||||
while (true) {
|
||||
c = comp.compare(mentry1.getKey(), mentry2.getKey());
|
||||
c = comp.compare(mentry1.getKey(), mentry2.getKey());
|
||||
if (c < 0) {
|
||||
if (mi1.hasNext()) mentry1 = mi1.next(); else break;
|
||||
} else if (c > 0) {
|
||||
|
@ -201,7 +201,7 @@ public final class SetTools {
|
|||
|
||||
// now the same for set-set
|
||||
public static <A> SortedSet<A> joinConstructive(final SortedSet<A> set1, final SortedSet<A> set2) {
|
||||
// comparators must be equal
|
||||
// comparators must be equal
|
||||
if ((set1 == null) || (set2 == null)) return null;
|
||||
if (set1.comparator() != set2.comparator()) return null;
|
||||
if (set1.isEmpty() || set2.isEmpty()) return new TreeSet<A>(set1.comparator());
|
||||
|
@ -214,46 +214,46 @@ public final class SetTools {
|
|||
|
||||
// start most efficient method
|
||||
if (stepsEnum > stepsTest) {
|
||||
if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2);
|
||||
return joinConstructiveByTest(set2.iterator(), set1);
|
||||
if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2);
|
||||
return joinConstructiveByTest(set2.iterator(), set1);
|
||||
}
|
||||
return joinConstructiveByEnumeration(set1, set2);
|
||||
}
|
||||
|
||||
public static <A> SortedSet<A> joinConstructiveByTest(final Iterator<A> small, final SortedSet<A> large) {
|
||||
final SortedSet<A> result = new TreeSet<A>(large.comparator());
|
||||
A o;
|
||||
while (small.hasNext()) {
|
||||
o = small.next();
|
||||
if (large.contains(o)) result.add(o);
|
||||
}
|
||||
return result;
|
||||
final SortedSet<A> result = new TreeSet<A>(large.comparator());
|
||||
A o;
|
||||
while (small.hasNext()) {
|
||||
o = small.next();
|
||||
if (large.contains(o)) result.add(o);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static <A> SortedSet<A> joinConstructiveByEnumeration(final SortedSet<A> set1, final SortedSet<A> set2) {
|
||||
// implement pairwise enumeration
|
||||
final Comparator<? super A> comp = set1.comparator();
|
||||
final Iterator<A> mi = set1.iterator();
|
||||
final Iterator<A> si = set2.iterator();
|
||||
final SortedSet<A> result = new TreeSet<A>(set1.comparator());
|
||||
int c;
|
||||
if ((mi.hasNext()) && (si.hasNext())) {
|
||||
A mobj = mi.next();
|
||||
A sobj = si.next();
|
||||
while (true) {
|
||||
c = comp.compare(mobj, sobj);
|
||||
if (c < 0) {
|
||||
if (mi.hasNext()) mobj = mi.next(); else break;
|
||||
} else if (c > 0) {
|
||||
if (si.hasNext()) sobj = si.next(); else break;
|
||||
} else {
|
||||
result.add(mobj);
|
||||
if (mi.hasNext()) mobj = mi.next(); else break;
|
||||
if (si.hasNext()) sobj = si.next(); else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
// implement pairwise enumeration
|
||||
final Comparator<? super A> comp = set1.comparator();
|
||||
final Iterator<A> mi = set1.iterator();
|
||||
final Iterator<A> si = set2.iterator();
|
||||
final SortedSet<A> result = new TreeSet<A>(set1.comparator());
|
||||
int c;
|
||||
if ((mi.hasNext()) && (si.hasNext())) {
|
||||
A mobj = mi.next();
|
||||
A sobj = si.next();
|
||||
while (true) {
|
||||
c = comp.compare(mobj, sobj);
|
||||
if (c < 0) {
|
||||
if (mi.hasNext()) mobj = mi.next(); else break;
|
||||
} else if (c > 0) {
|
||||
if (si.hasNext()) sobj = si.next(); else break;
|
||||
} else {
|
||||
result.add(mobj);
|
||||
if (mi.hasNext()) mobj = mi.next(); else break;
|
||||
if (si.hasNext()) sobj = si.next(); else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -289,23 +289,23 @@ public final class SetTools {
|
|||
* @return true if any element of the first set is part of the second set or vice-versa
|
||||
*/
|
||||
public static <A> boolean anymatch(final SortedSet<A> set1, final SortedSet<A> set2) {
|
||||
// comparators must be equal
|
||||
if ((set1 == null) || (set2 == null)) return false;
|
||||
if (set1.comparator() != set2.comparator()) return false;
|
||||
if (set1.isEmpty() || set2.isEmpty()) return false;
|
||||
// comparators must be equal
|
||||
if ((set1 == null) || (set2 == null)) return false;
|
||||
if (set1.comparator() != set2.comparator()) return false;
|
||||
if (set1.isEmpty() || set2.isEmpty()) return false;
|
||||
|
||||
// decide which method to use
|
||||
final int high = ((set1.size() > set2.size()) ? set1.size() : set2.size());
|
||||
final int low = ((set1.size() > set2.size()) ? set2.size() : set1.size());
|
||||
final int stepsEnum = 10 * (high + low - 1);
|
||||
final int stepsTest = 12 * log2a(high) * low;
|
||||
// decide which method to use
|
||||
final int high = ((set1.size() > set2.size()) ? set1.size() : set2.size());
|
||||
final int low = ((set1.size() > set2.size()) ? set2.size() : set1.size());
|
||||
final int stepsEnum = 10 * (high + low - 1);
|
||||
final int stepsTest = 12 * log2a(high) * low;
|
||||
|
||||
// start most efficient method
|
||||
if (stepsEnum > stepsTest) {
|
||||
return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1);
|
||||
}
|
||||
return anymatchByEnumeration(set1, set2);
|
||||
}
|
||||
// start most efficient method
|
||||
if (stepsEnum > stepsTest) {
|
||||
return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1);
|
||||
}
|
||||
return anymatchByEnumeration(set1, set2);
|
||||
}
|
||||
|
||||
/**
|
||||
* test if the intersection of two sets is not empty
|
||||
|
@ -545,7 +545,7 @@ public final class SetTools {
|
|||
} catch (final IOException e) {
|
||||
} finally {
|
||||
if (br != null) try{br.close();}catch(final Exception e){
|
||||
ConcurrentLog.warn("SetTools", "Could not close input stream on file " + file);
|
||||
ConcurrentLog.warn("SetTools", "Could not close input stream on file " + file);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
|
@ -577,52 +577,52 @@ public final class SetTools {
|
|||
for (Object o: c) if (i++ == n) return o;
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
public static void main(final String[] args) {
|
||||
final SortedMap<String, String> m = new TreeMap<String, String>();
|
||||
final SortedMap<String, String> s = new TreeMap<String, String>();
|
||||
m.put("a", "a");
|
||||
m.put("x", "x");
|
||||
m.put("f", "f");
|
||||
m.put("h", "h");
|
||||
m.put("w", "w");
|
||||
m.put("7", "7");
|
||||
m.put("t", "t");
|
||||
m.put("k", "k");
|
||||
m.put("y", "y");
|
||||
m.put("z", "z");
|
||||
s.put("a", "a");
|
||||
s.put("b", "b");
|
||||
s.put("c", "c");
|
||||
s.put("k", "k");
|
||||
s.put("l", "l");
|
||||
s.put("m", "m");
|
||||
s.put("n", "n");
|
||||
s.put("o", "o");
|
||||
s.put("p", "p");
|
||||
s.put("q", "q");
|
||||
s.put("r", "r");
|
||||
s.put("s", "s");
|
||||
s.put("t", "t");
|
||||
s.put("x", "x");
|
||||
System.out.println("Compare " + m.toString() + " with " + s.toString());
|
||||
System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true));
|
||||
System.out.println("Join=" + joinConstructiveByTest(m, s, true));
|
||||
System.out.println("Join=" + joinConstructiveByTest(m, s, true));
|
||||
System.out.println("Join=" + joinConstructive(m, s, true));
|
||||
//System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet()));
|
||||
final SortedMap<String, String> m = new TreeMap<String, String>();
|
||||
final SortedMap<String, String> s = new TreeMap<String, String>();
|
||||
m.put("a", "a");
|
||||
m.put("x", "x");
|
||||
m.put("f", "f");
|
||||
m.put("h", "h");
|
||||
m.put("w", "w");
|
||||
m.put("7", "7");
|
||||
m.put("t", "t");
|
||||
m.put("k", "k");
|
||||
m.put("y", "y");
|
||||
m.put("z", "z");
|
||||
s.put("a", "a");
|
||||
s.put("b", "b");
|
||||
s.put("c", "c");
|
||||
s.put("k", "k");
|
||||
s.put("l", "l");
|
||||
s.put("m", "m");
|
||||
s.put("n", "n");
|
||||
s.put("o", "o");
|
||||
s.put("p", "p");
|
||||
s.put("q", "q");
|
||||
s.put("r", "r");
|
||||
s.put("s", "s");
|
||||
s.put("t", "t");
|
||||
s.put("x", "x");
|
||||
System.out.println("Compare " + m.toString() + " with " + s.toString());
|
||||
System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true));
|
||||
System.out.println("Join=" + joinConstructiveByTest(m, s, true));
|
||||
System.out.println("Join=" + joinConstructiveByTest(m, s, true));
|
||||
System.out.println("Join=" + joinConstructive(m, s, true));
|
||||
//System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet()));
|
||||
|
||||
/*
|
||||
for (int low = 0; low < 10; low++)
|
||||
for (int high = 0; high < 100; high=high + 10) {
|
||||
int stepsEnum = 10 * high;
|
||||
int stepsTest = 12 * log2(high) * low;
|
||||
System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest"));
|
||||
}
|
||||
*/
|
||||
/*
|
||||
for (int low = 0; low < 10; low++)
|
||||
for (int high = 0; high < 100; high=high + 10) {
|
||||
int stepsEnum = 10 * high;
|
||||
int stepsTest = 12 * log2(high) * low;
|
||||
System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest"));
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user