From 8285fe715a9c48dd077bc31a7c53dccea320dcd4 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 1 Sep 2023 11:00:42 +0200 Subject: [PATCH] tab to spaces for classes supporting the condenser. This is a preparation step to make changes in condenser and parser more visible; no functional changes so far. --- .../language/synonyms/AutotaggingLibrary.java | 160 ++++----- .../net/yacy/cora/lod/vocabulary/Tagging.java | 258 +++++++------- source/net/yacy/document/Condenser.java | 122 +++---- source/net/yacy/document/DateDetection.java | 333 +++++++++--------- source/net/yacy/document/SentenceReader.java | 58 +-- source/net/yacy/document/Tokenizer.java | 172 ++++----- source/net/yacy/document/WordTokenizer.java | 74 ++-- .../yacy/document/language/Identificator.java | 38 +- source/net/yacy/kelondro/data/word/Word.java | 18 +- .../kelondro/data/word/WordReferenceRow.java | 10 +- source/net/yacy/kelondro/util/SetTools.java | 188 +++++----- 11 files changed, 712 insertions(+), 719 deletions(-) diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java index 95a8b3e3b..4ab3ca772 100644 --- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java +++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java @@ -85,24 +85,24 @@ public class AutotaggingLibrary { } } } - - /** - * Create a new Autotagging instance from the provided vocabularies. Can be used - * for example for testing purpose. - */ + + /** + * Create a new Autotagging instance from the provided vocabularies. Can be used + * for example for testing purpose. + */ protected AutotaggingLibrary(final Map vocabularies) { - if(vocabularies != null) { - this.vocabularies = vocabularies; - } else { - this.vocabularies = new ConcurrentHashMap(); - } - this.allTags = new ConcurrentHashMap(); - this.autotaggingPath = null; - for(final Tagging voc : this.vocabularies.values()) { + if(vocabularies != null) { + this.vocabularies = vocabularies; + } else { + this.vocabularies = new ConcurrentHashMap(); + } + this.allTags = new ConcurrentHashMap(); + this.autotaggingPath = null; + for(final Tagging voc : this.vocabularies.values()) { for (final String t: voc.tags()) { this.allTags.put(t, PRESENT); } - } + } } public File getVocabularyFile(String name) { @@ -159,11 +159,11 @@ public class AutotaggingLibrary { } public int size() { - return this.vocabularies.size(); + return this.vocabularies.size(); } public boolean isEmpty() { - return this.vocabularies.isEmpty(); + return this.vocabularies.isEmpty(); } /** @@ -171,8 +171,8 @@ public class AutotaggingLibrary { * @return */ public int getMaxWordsInTerm() { - //TODO: calculate from database - return 4; + //TODO: calculate from database + return 4; } /** @@ -195,70 +195,70 @@ public class AutotaggingLibrary { return null; } - /** - * Search in the active vocabularies matching linked data for Metatag entries with objectspace + term - * matching the given term URL. Returns at most one Metatag instance per - * vocabulary. - * - * @param termURL - * the vocabulary term identifier (an absolute URL) to search - * @return a set of matching Metatag instances eventually empty - */ - public Set getTagsFromTermURL(final DigestURL termURL) { - final Set tags = new HashSet<>(); - if (termURL == null || this.vocabularies.isEmpty()) { - return tags; - } - final String termURLStr = termURL.toNormalform(false); - String termNamespace = null; + /** + * Search in the active vocabularies matching linked data for Metatag entries with objectspace + term + * matching the given term URL. Returns at most one Metatag instance per + * vocabulary. + * + * @param termURL + * the vocabulary term identifier (an absolute URL) to search + * @return a set of matching Metatag instances eventually empty + */ + public Set getTagsFromTermURL(final DigestURL termURL) { + final Set tags = new HashSet<>(); + if (termURL == null || this.vocabularies.isEmpty()) { + return tags; + } + final String termURLStr = termURL.toNormalform(false); + String termNamespace = null; - /* If the objectLink URL has a fragment, this should be the vocabulary term */ - String term = termURL.getRef(); - if (term == null) { - /* - * No fragment in the URL : the term should then be the last segment of the URL - */ - term = termURL.getFileName(); - if (StringUtils.isNotEmpty(term)) { - final int lastPathSeparatorPos = termURLStr.lastIndexOf("/"); - if (lastPathSeparatorPos > 0) { - termNamespace = termURLStr.substring(0, lastPathSeparatorPos + 1); - } - } - } else { - final int fragmentPos = termURLStr.indexOf("#"); - if (fragmentPos > 0) { - termNamespace = termURLStr.substring(0, fragmentPos + 1); - } - } - if (StringUtils.isNotEmpty(term) && termNamespace != null) { - final String alternativeTermNamespace; - /* - * http://example.org/ and https://example.org/ are considered equivalent forms - * for the namespace URL - */ - if (termURL.isHTTP()) { - alternativeTermNamespace = "https" + termNamespace.substring("http".length()); - } else if (termURL.isHTTPS()) { - alternativeTermNamespace = "http" + termNamespace.substring("https".length()); - } else { - alternativeTermNamespace = null; - } + /* If the objectLink URL has a fragment, this should be the vocabulary term */ + String term = termURL.getRef(); + if (term == null) { + /* + * No fragment in the URL : the term should then be the last segment of the URL + */ + term = termURL.getFileName(); + if (StringUtils.isNotEmpty(term)) { + final int lastPathSeparatorPos = termURLStr.lastIndexOf("/"); + if (lastPathSeparatorPos > 0) { + termNamespace = termURLStr.substring(0, lastPathSeparatorPos + 1); + } + } + } else { + final int fragmentPos = termURLStr.indexOf("#"); + if (fragmentPos > 0) { + termNamespace = termURLStr.substring(0, fragmentPos + 1); + } + } + if (StringUtils.isNotEmpty(term) && termNamespace != null) { + final String alternativeTermNamespace; + /* + * http://example.org/ and https://example.org/ are considered equivalent forms + * for the namespace URL + */ + if (termURL.isHTTP()) { + alternativeTermNamespace = "https" + termNamespace.substring("http".length()); + } else if (termURL.isHTTPS()) { + alternativeTermNamespace = "http" + termNamespace.substring("https".length()); + } else { + alternativeTermNamespace = null; + } - for (final Tagging vocabulary : this.vocabularies.values()) { - if (vocabulary != null && vocabulary.isMatchFromLinkedData()) { - if ((termNamespace.equals(vocabulary.getObjectspace())) || (alternativeTermNamespace != null - && alternativeTermNamespace.equals(vocabulary.getObjectspace()))) { - final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term); - if (tag != null) { - tags.add(tag); - } - } - } - } - } - return tags; - } + for (final Tagging vocabulary : this.vocabularies.values()) { + if (vocabulary != null && vocabulary.isMatchFromLinkedData()) { + if ((termNamespace.equals(vocabulary.getObjectspace())) || (alternativeTermNamespace != null + && alternativeTermNamespace.equals(vocabulary.getObjectspace()))) { + final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term); + if (tag != null) { + tags.add(tag); + } + } + } + } + } + return tags; + } public Tagging.Metatag metatag(String vocName, String term) { Tagging tagging = this.vocabularies.get(vocName); diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index 49b8d3aa1..a6ec0c87e 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -47,27 +47,27 @@ public class Tagging { public final static String DEFAULT_NAMESPACE= "http://yacy.net/autotagging#"; public final static String DEFAULT_PREFIX = "tags"; - + /** Default value for the property matchFromLinkedData */ public final static boolean DEFAULT_MATCH_FROM_LINKED_DATA = false; private final String navigatorName; private final Map synonym2term; - + /** Terms associated to TagginEntry instances each having a synonym and an eventual object link */ private final Map term2entries; - + private File propFile; - + /** true if the vocabulary shall generate a navigation facet */ private boolean isFacet; - - /** - * True when this vocabulary terms should only be matched from linked data types - * annotations (with microdata, RDFa, microformats...) instead of clear text - * words - */ - private boolean matchFromLinkedData; + + /** + * True when this vocabulary terms should only be matched from linked data types + * annotations (with microdata, RDFa, microformats...) instead of clear text + * words + */ + private boolean matchFromLinkedData; private String predicate, namespace, objectspace; @@ -142,55 +142,55 @@ public class Tagging { String term, v; String[] tags; - vocloop: for (Map.Entry e: table.entrySet()) { - if (e.getValue().getSynonymsCSV() == null || e.getValue().getSynonymsCSV().isEmpty()) { - term = normalizeKey(e.getKey()); - v = normalizeTerm(e.getKey()); - this.synonym2term.put(v, term); - if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) { - this.term2entries.put(term, new TaggingEntryWithObjectLink(v, e.getValue().getObjectlink())); - } else { - this.term2entries.put(term, new SynonymTaggingEntry(v)); - } - - continue vocloop; - } - term = normalizeKey(e.getKey()); - tags = e.getValue().getSynonymsList(); - final Set synonyms = new HashSet(); - synonyms.add(term); - tagloop: for (String synonym: tags) { - if (synonym.isEmpty()) continue tagloop; - synonyms.add(synonym); - synonym = normalizeTerm(synonym); - if (synonym.isEmpty()) continue tagloop; - synonyms.add(synonym); - this.synonym2term.put(synonym, term); - this.term2entries.put(term, new SynonymTaggingEntry(synonym)); - } - final String synonym = normalizeTerm(term); - this.synonym2term.put(synonym, term); - if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) { - this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, e.getValue().getObjectlink())); - } else { - this.term2entries.put(term, new SynonymTaggingEntry(synonym)); + vocloop: for (Map.Entry e: table.entrySet()) { + if (e.getValue().getSynonymsCSV() == null || e.getValue().getSynonymsCSV().isEmpty()) { + term = normalizeKey(e.getKey()); + v = normalizeTerm(e.getKey()); + this.synonym2term.put(v, term); + if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) { + this.term2entries.put(term, new TaggingEntryWithObjectLink(v, e.getValue().getObjectlink())); + } else { + this.term2entries.put(term, new SynonymTaggingEntry(v)); + } + + continue vocloop; } - synonyms.add(synonym); - } + term = normalizeKey(e.getKey()); + tags = e.getValue().getSynonymsList(); + final Set synonyms = new HashSet(); + synonyms.add(term); + tagloop: for (String synonym: tags) { + if (synonym.isEmpty()) continue tagloop; + synonyms.add(synonym); + synonym = normalizeTerm(synonym); + if (synonym.isEmpty()) continue tagloop; + synonyms.add(synonym); + this.synonym2term.put(synonym, term); + this.term2entries.put(term, new SynonymTaggingEntry(synonym)); + } + final String synonym = normalizeTerm(term); + this.synonym2term.put(synonym, term); + if (e.getValue().getObjectlink() != null && e.getValue().getObjectlink().length() > 0) { + this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, e.getValue().getObjectlink())); + } else { + this.term2entries.put(term, new SynonymTaggingEntry(synonym)); + } + synonyms.add(synonym); + } } else { try ( - /* Resources automatically closed by this try-with-resources statement */ - final FileOutputStream outStream = new FileOutputStream(propFile); - final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outStream, StandardCharsets.UTF_8.name())); + /* Resources automatically closed by this try-with-resources statement */ + final FileOutputStream outStream = new FileOutputStream(propFile); + final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outStream, StandardCharsets.UTF_8.name())); ) { - if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n"); - for (final Map.Entry e: table.entrySet()) { - String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV(); - String o = e.getValue() == null ? "" : e.getValue().getObjectlink(); - w.write(e.getKey() + (s == null || s.isEmpty() ? "" : ":" + e.getValue().getSynonymsCSV()) + (o == null || o.isEmpty() || o.equals(objectspace + e.getKey()) ? "" : "#" + o) + "\n"); - } + if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n"); + for (final Map.Entry e: table.entrySet()) { + String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV(); + String o = e.getValue() == null ? "" : e.getValue().getObjectlink(); + w.write(e.getKey() + (s == null || s.isEmpty() ? "" : ":" + e.getValue().getSynonymsCSV()) + (o == null || o.isEmpty() || o.equals(objectspace + e.getKey()) ? "" : "#" + o) + "\n"); + } } - init(); + init(); } } @@ -207,7 +207,7 @@ public class Tagging { g = geo.iterator().next(); this.term2entries.put(loc, new LocationTaggingEntry(syn, g)); } else { - this.term2entries.put(loc, new SynonymTaggingEntry(syn)); + this.term2entries.put(loc, new SynonymTaggingEntry(syn)); } } } @@ -255,9 +255,9 @@ public class Tagging { v = normalizeTerm(pl[0]); this.synonym2term.put(v, term); if (pl[2] != null && pl[2].length() > 0) { - this.term2entries.put(term, new TaggingEntryWithObjectLink(v, pl[2])); + this.term2entries.put(term, new TaggingEntryWithObjectLink(v, pl[2])); } else { - this.term2entries.put(term, new SynonymTaggingEntry(v)); + this.term2entries.put(term, new SynonymTaggingEntry(v)); } continue vocloop; } @@ -278,9 +278,9 @@ public class Tagging { String synonym = normalizeTerm(term); this.synonym2term.put(synonym, term); if (pl[2] != null && pl[2].length() > 0) { - this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, pl[2])); + this.term2entries.put(term, new TaggingEntryWithObjectLink(synonym, pl[2])); } else { - this.term2entries.put(term, new SynonymTaggingEntry(synonym)); + this.term2entries.put(term, new SynonymTaggingEntry(synonym)); } synonyms.add(synonym); } @@ -293,30 +293,30 @@ public class Tagging { public boolean isFacet() { return this.isFacet; } - + public void setFacet(boolean isFacet) { this.isFacet = isFacet; } - - /** - * @return true when this vocabulary terms should be matched from linked data - * types annotations (with microdata, RDFa, microformats...) instead of - * clear text words - */ + + /** + * @return true when this vocabulary terms should be matched from linked data + * types annotations (with microdata, RDFa, microformats...) instead of + * clear text words + */ public boolean isMatchFromLinkedData() { - return this.matchFromLinkedData; - } - - /** - * @param facetFromLinkedData - * true when this vocabulary terms should be matched from linked - * data types annotations (with microdata, RDFa, microformats...) - * instead of clear text words - */ - public void setMatchFromLinkedData(final boolean facetFromLinkedData) { - this.matchFromLinkedData = facetFromLinkedData; + return this.matchFromLinkedData; } - + + /** + * @param facetFromLinkedData + * true when this vocabulary terms should be matched from linked + * data types annotations (with microdata, RDFa, microformats...) + * instead of clear text words + */ + public void setMatchFromLinkedData(final boolean facetFromLinkedData) { + this.matchFromLinkedData = facetFromLinkedData; + } + public int size() { return this.term2entries.size(); } @@ -430,7 +430,7 @@ public class Tagging { r.put(e.getKey(), s); } if (e.getValue() != null && e.getValue().getSynonym() != null && e.getValue().getSynonym().length() != 0) { - s.add(e.getValue().getSynonym()); + s.add(e.getValue().getSynonym()); } } for (Map.Entry e: this.synonym2term.entrySet()) { @@ -448,11 +448,11 @@ public class Tagging { Map> r = reconstructionSets(); Map map = new TreeMap(); for (Map.Entry> e: r.entrySet()) { - TaggingEntry entry = this.term2entries.get(e.getKey()); - String objectLink = null; - if(entry != null) { - objectLink = entry.getObjectLink(); - } + TaggingEntry entry = this.term2entries.get(e.getKey()); + String objectLink = null; + if(entry != null) { + objectLink = entry.getObjectLink(); + } map.put(e.getKey(), new SOTuple(e.getValue().toArray(new String[e.getValue().size()]), objectLink == null ? "" : objectLink)); } return map; @@ -461,7 +461,7 @@ public class Tagging { public String getObjectlink(String term) { TaggingEntry entry = this.term2entries.get(term); if(entry != null) { - return entry.getObjectLink(); + return entry.getObjectLink(); } return null; } @@ -531,11 +531,11 @@ public class Tagging { public String getObjectspace() { return this.objectspace; } - + private final static Pattern PATTERN_SPACESLASHPLUS = Pattern.compile(" (/|\\+)"); private final static Pattern PATTERN_SLASHPLUS = Pattern.compile("/|\\+"); private final static Pattern PATTERN_SPACESPACE = Pattern.compile(" "); - + private final String normalizeKey(String k) { k = k.trim(); // remove symbols that are bad in a query attribute @@ -557,37 +557,37 @@ public class Tagging { return this.propFile; } - /** - * @param word - * a synonym to look for - * @return a Metatag instance with the matching term, or null when the synonym - * is not in this vocabulary. - */ + /** + * @param word + * a synonym to look for + * @return a Metatag instance with the matching term, or null when the synonym + * is not in this vocabulary. + */ public Metatag getMetatagFromSynonym(final String word) { String printname = this.synonym2term.get(word); if (printname == null) return null; return new Metatag(printname); } - - /** - * @param term - * a term to look for - * @return a Metatag instance with the matching term, or null when it is not in - * this vocabulary. - */ + + /** + * @param term + * a term to look for + * @return a Metatag instance with the matching term, or null when it is not in + * this vocabulary. + */ public Metatag getMetatagFromTerm(final String term) { TaggingEntry entry = this.term2entries.get(term); if(entry == null) { - return null; + return null; } return new Metatag(term); } - /** - * @param word - * the object of the Metatag - * @return a new Metatag instance related to this vocabulary - */ + /** + * @param word + * the object of the Metatag + * @return a new Metatag instance related to this vocabulary + */ public Metatag buildMetatagFromTerm(final String word) { return new Metatag(word); } @@ -632,15 +632,15 @@ public class Tagging { * The metatag is created in a tagging environment, which already contains the * subject and the predicate. The metatag is the object of the RDF triple. */ - public class Metatag { - private final String object; - private Metatag(String object) { - this.object = object; - } + public class Metatag { + private final String object; + private Metatag(String object) { + this.object = object; + } - public String getVocabularyName() { - return Tagging.this.navigatorName; - } + public String getVocabularyName() { + return Tagging.this.navigatorName; + } public String getPredicate() { return Tagging.this.predicate; @@ -650,22 +650,22 @@ public class Tagging { return this.object; } - @Override - public String toString() { - return Tagging.this.navigatorName + ":" + encodePrintname(this.object); - } + @Override + public String toString() { + return Tagging.this.navigatorName + ":" + encodePrintname(this.object); + } - @Override - public boolean equals(Object m) { - Metatag m0 = (Metatag) m; - return Tagging.this.navigatorName.equals(m0.getVocabularyName()) && this.object.equals(m0.object); - } + @Override + public boolean equals(Object m) { + Metatag m0 = (Metatag) m; + return Tagging.this.navigatorName.equals(m0.getVocabularyName()) && this.object.equals(m0.object); + } - @Override - public int hashCode() { - return Tagging.this.navigatorName.hashCode() + this.object.hashCode(); - } - } + @Override + public int hashCode() { + return Tagging.this.navigatorName.hashCode() + this.object.hashCode(); + } + } public static final String encodePrintname(String printname) { return CommonPattern.SPACE.matcher(printname).replaceAll("_"); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 381ffc6da..ef3b2f748 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -61,10 +61,10 @@ public final class Condenser extends Tokenizer { private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection private String fuzzy_signature_text = null; // signatures for double-check detection - + private final Identificator languageIdentificator; public LinkedHashSet dates_in_content; - + public Condenser( final Document document, final VocabularyScraper scraper, @@ -76,14 +76,14 @@ public final class Condenser extends Tokenizer { final int timezoneOffset ) { super(document.dc_source(), indexText ? document.getTextString() : "", meaningLib, doAutotagging, scraper); - + final String initialThreadName = Thread.currentThread().getName(); Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging - + // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag this.dates_in_content = new LinkedHashSet(); - + // construct flag set for document ContentDomain contentDomain = document.getContentDomain(); if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); @@ -196,9 +196,9 @@ public final class Condenser extends Tokenizer { } } } - + if(doAutotagging) { - extractAutoTagsFromLinkedDataTypes(document.getLinkedDataTypes(), LibraryProvider.autotagging); + extractAutoTagsFromLinkedDataTypes(document.getLinkedDataTypes(), LibraryProvider.autotagging); } // extend the tags in the document object with autotagging tags @@ -224,36 +224,36 @@ public final class Condenser extends Tokenizer { /* Restore the current thread initial name */ Thread.currentThread().setName(initialThreadName); } - - /** - * Search for tags matching the given linked data types identifiers (absolute - * URLs) in the given autotagging library. Then fill this instance "tags" map - * with the eventually matching tags found. - * - * @param linkedDataTypes - * a set of linked data typed items identifiers (absolute URLs) to - * search - * @param tagLibrary - * the autotagging library holding vocabularies to search in - */ - protected void extractAutoTagsFromLinkedDataTypes(final Set linkedDataTypes, - final AutotaggingLibrary tagLibrary) { - if (linkedDataTypes == null || tagLibrary == null) { - return; - } - for (final DigestURL linkedDataType : linkedDataTypes) { - final Set tags = tagLibrary.getTagsFromTermURL(linkedDataType); - for (final Metatag tag : tags) { - final String navigatorName = tag.getVocabularyName(); - Set tagset = this.tags.get(navigatorName); - if (tagset == null) { - tagset = new HashSet(); - this.tags.put(navigatorName, tagset); - } - tagset.add(tag); - } - } - } + + /** + * Search for tags matching the given linked data types identifiers (absolute + * URLs) in the given autotagging library. Then fill this instance "tags" map + * with the eventually matching tags found. + * + * @param linkedDataTypes + * a set of linked data typed items identifiers (absolute URLs) to + * search + * @param tagLibrary + * the autotagging library holding vocabularies to search in + */ + protected void extractAutoTagsFromLinkedDataTypes(final Set linkedDataTypes, + final AutotaggingLibrary tagLibrary) { + if (linkedDataTypes == null || tagLibrary == null) { + return; + } + for (final DigestURL linkedDataType : linkedDataTypes) { + final Set tags = tagLibrary.getTagsFromTermURL(linkedDataType); + for (final Metatag tag : tags) { + final String navigatorName = tag.getVocabularyName(); + Set tagset = this.tags.get(navigatorName); + if (tagset == null) { + tagset = new HashSet(); + this.tags.put(navigatorName, tagset); + } + tagset.add(tag); + } + } + } private void insertTextToWords( final SentenceReader text, @@ -267,24 +267,24 @@ public final class Condenser extends Tokenizer { Word wprop; WordTokenizer wordenum = new WordTokenizer(text, meaningLib); try { - int pip = 0; - while (wordenum.hasMoreElements()) { - word = wordenum.nextElement().toString(); - if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive + int pip = 0; + while (wordenum.hasMoreElements()) { + word = wordenum.nextElement().toString(); + if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive if (word.length() < 2) continue; word = word.toLowerCase(Locale.ENGLISH); - wprop = this.words.get(word); - if (wprop == null) wprop = new Word(0, pip, phrase); - if (wprop.flags == null) wprop.flags = flagstemplate.clone(); - wprop.flags.set(flagpos, true); - this.words.put(word, wprop); - pip++; - this.RESULT_NUMB_WORDS++; - //this.RESULT_DIFF_WORDS++; + wprop = this.words.get(word); + if (wprop == null) wprop = new Word(0, pip, phrase); + if (wprop.flags == null) wprop.flags = flagstemplate.clone(); + wprop.flags.set(flagpos, true); + this.words.put(word, wprop); + pip++; + this.RESULT_NUMB_WORDS++; + //this.RESULT_DIFF_WORDS++; } } finally { - wordenum.close(); - wordenum = null; + wordenum.close(); + wordenum = null; } } @@ -303,11 +303,11 @@ public final class Condenser extends Tokenizer { public String fuzzySignatureText() { return this.fuzzy_signature_text; } - + public long exactSignature() { return this.exact_signature; } - + public String language() { return this.languageIdentificator.getLanguage(); } @@ -322,7 +322,7 @@ public final class Condenser extends Tokenizer { public static void main(final String[] args) { // read a property file and convert them into configuration lines - FileInputStream inStream = null; + FileInputStream inStream = null; try { final File f = new File(args[0]); final Properties p = new Properties(); @@ -346,13 +346,13 @@ public final class Condenser extends Tokenizer { } catch (final IOException e) { ConcurrentLog.logException(e); } finally { - if(inStream != null) { - try { - inStream.close(); - } catch (IOException e) { - ConcurrentLog.logException(e); - } - } + if(inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + } } } diff --git a/source/net/yacy/document/DateDetection.java b/source/net/yacy/document/DateDetection.java index 08e7c24e3..0b08edd1d 100644 --- a/source/net/yacy/document/DateDetection.java +++ b/source/net/yacy/document/DateDetection.java @@ -64,9 +64,9 @@ public class DateDetection { private static final TimeZone UTC_TIMEZONE = TimeZone.getTimeZone("UTC"); private static final String CONPATT = "uuuu/MM/dd"; - - private static final DateTimeFormatter CONFORM = DateTimeFormatter.ofPattern(CONPATT).withLocale(Locale.US) - .withZone(ZoneOffset.UTC); + + private static final DateTimeFormatter CONFORM = DateTimeFormatter.ofPattern(CONPATT).withLocale(Locale.US) + .withZone(ZoneOffset.UTC); private static final LinkedHashMap Weekdays = new LinkedHashMap<>(); private static final LinkedHashMap Months = new LinkedHashMap<>(); private static final int[] MaxDaysInMonth = new int[]{31,29,31,30,31,30,31,31,30,31,30,31}; @@ -75,7 +75,7 @@ public class DateDetection { public static enum Language { GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN, PORTUGUESE; } - + static { // all names must be lowercase because compared strings are made to lowercase as well Weekdays.put(Language.GERMAN, new String[]{"montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag" /*oder: "sonnabend"*/, "sonntag"}); @@ -91,7 +91,7 @@ public class DateDetection { Months.put(Language.PORTUGUESE,new String[]{"janeiro", "fevereiro", "março", "abril", "maio", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro"}); } - + // RFC 822 day and month specification as a norm for date formats. This is needed to reconstruct the actual date later public static enum Weekday { Mon(Weekdays, 0), @@ -101,7 +101,7 @@ public class DateDetection { Fri(Weekdays, 4), Sat(Weekdays, 5), Sun(Weekdays, 6); - + private final Map inLanguages; // a map from the word to the language public final int offset; // the day offset in the week, monday = 0 private Weekday(final LinkedHashMap weekdayMap, final int offset) { @@ -112,7 +112,7 @@ public class DateDetection { } } } - + public static enum Month { Jan( 1), Feb( 2), Mar( 3), Apr( 4), May( 5), Jun( 6), Jul( 7), Aug( 8), Sep( 9), Oct(10), Nov(11), Dec(12); @@ -122,7 +122,7 @@ public class DateDetection { this.count = count; } } - + public static enum EntityType { YEAR(new LinkedHashMap()), MONTH(Months), @@ -142,7 +142,7 @@ public class DateDetection { private final static String DAYCAPTURE = "(\\d{1,2})"; private final static String YEARCAPTURE = "(\\d{2}|\\d{4})"; private final static String MONTHCAPTURE = "(\\p{L}{3,}|\\d{1,2})"; - + public static class HolidayMap extends TreeMap{ private static final long serialVersionUID = 1L; public HolidayMap() { @@ -152,69 +152,64 @@ public class DateDetection { public static HolidayMap Holidays = new HolidayMap(); public static Map HolidayPattern = new HashMap<>(); - + static { - Holidays.putAll(getHolidays(CURRENT_YEAR)); - - + Holidays.putAll(getHolidays(CURRENT_YEAR)); + for (Map.Entry holiday: Holidays.entrySet()) { HolidayPattern.put(Pattern.compile(BODNCG + holiday.getKey() + EODNCG), holiday.getValue()); } } - /** - * @param currentYear - * the current year reference to use - * @return a new mapping from holiday names to arrays of - * three or four holiday dates starting from currentYear - 1. Each date time is 00:00:00 on UTC+00:00 time zone. - */ - public static HolidayMap getHolidays(final int currentYear) { - final HolidayMap result = new HolidayMap(); - - /* Date rules from icu4j library used here (SimpleDateRule and EasterRule) use internally the default time zone and this can not be modified (up to icu4j 60.1) */ - final TimeZone dateRulesTimeZone = TimeZone.getDefault(); + /** + * @param currentYear + * the current year reference to use + * @return a new mapping from holiday names to arrays of + * three or four holiday dates starting from currentYear - 1. Each date time is 00:00:00 on UTC+00:00 time zone. + */ + public static HolidayMap getHolidays(final int currentYear) { + final HolidayMap result = new HolidayMap(); + + /* Date rules from icu4j library used here (SimpleDateRule and EasterRule) use internally the default time zone and this can not be modified (up to icu4j 60.1) */ + final TimeZone dateRulesTimeZone = TimeZone.getDefault(); // German result.put("Neujahr", sameDayEveryYear(Calendar.JANUARY, 1, currentYear)); result.put("Heilige Drei Könige", sameDayEveryYear(Calendar.JANUARY, 6, currentYear)); result.put("Valentinstag", sameDayEveryYear(Calendar.FEBRUARY, 14, currentYear)); - + /* Fat Thursday : Thursday (6 days) before Ash Wednesday (52 days before Easter Sunday) */ result.put("Weiberfastnacht", holiDayEventRule(new EasterHoliday(-52, "Weiberfastnacht").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/02/27"), CONFORM.parse("2015/02/12"), CONFORM.parse("2016/02/04")}); - result.put("Weiberfasching", result.get("Weiberfastnacht")); - + /* Rose Monday : Monday before Ash Wednesday (48 days before Easter Sunday) */ result.put("Rosenmontag", holiDayEventRule(new EasterHoliday(-48, "Rosenmontag").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/03/03"), CONFORM.parse("2015/03/16"), CONFORM.parse("2016/02/08")}); - result.put("Faschingsdienstag", holiDayEventRule(EasterHoliday.SHROVE_TUESDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/03/04"), CONFORM.parse("2015/03/17"), CONFORM.parse("2016/02/09")}); result.put("Fastnacht", result.get("Faschingsdienstag")); // new Date[]{CONFORM.parse("2014/03/04"), CONFORM.parse("2015/03/17"), CONFORM.parse("2016/02/09")}); result.put("Aschermittwoch", holiDayEventRule(EasterHoliday.ASH_WEDNESDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/03/05"), CONFORM.parse("2015/03/18"), CONFORM.parse("2016/02/10")}); result.put("Palmsonntag", holiDayEventRule(EasterHoliday.PALM_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/13"), CONFORM.parse("2015/03/29"), CONFORM.parse("2016/04/20")}); result.put("Gründonnerstag", holiDayEventRule(EasterHoliday.MAUNDY_THURSDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/17"), CONFORM.parse("2015/04/02"), CONFORM.parse("2016/04/24")}); result.put("Karfreitag", holiDayEventRule(EasterHoliday.GOOD_FRIDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/18"), CONFORM.parse("2015/04/03"), CONFORM.parse("2016/04/25")}); - + /* Holy Saturday (also called Easter Eve, Black Saturday) : one day before Easter Sunday */ result.put("Karsamstag", holiDayEventRule(new EasterHoliday(-1, "Karsamstag").getRule(), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/04/19"), CONFORM.parse("2015/04/04"), CONFORM.parse("2016/04/26")}); result.put("Ostersonntag", holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/20"), CONFORM.parse("2015/04/05"), CONFORM.parse("2016/04/27")}); result.put("Ostermontag", holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/04/21"), CONFORM.parse("2015/04/06"), CONFORM.parse("2016/04/28")}); - + /* Include both Easter Sunday and Monday */ result.put("Ostern", getOsternEventRule(currentYear, dateRulesTimeZone)); - result.put("Walpurgisnacht", sameDayEveryYear(Calendar.APRIL, 30, currentYear)); result.put("Tag der Arbeit", sameDayEveryYear(Calendar.MAY, 1, currentYear)); - + /* Mother's Day : Second sunday of may in Germany */ final Date[] mothersDays = new Date[3]; int year = currentYear - 1; for (int i = 0; i < 3; i++) { - final LocalDate firstMay = LocalDate.of(year, java.time.Month.MAY, 1); - final LocalDate mothersDay = firstMay.with(TemporalAdjusters.firstInMonth(DayOfWeek.SUNDAY)).with(TemporalAdjusters.next(DayOfWeek.SUNDAY)); - mothersDays[i] = toMidnightUTCDate(mothersDay); - year++; + final LocalDate firstMay = LocalDate.of(year, java.time.Month.MAY, 1); + final LocalDate mothersDay = firstMay.with(TemporalAdjusters.firstInMonth(DayOfWeek.SUNDAY)).with(TemporalAdjusters.next(DayOfWeek.SUNDAY)); + mothersDays[i] = toMidnightUTCDate(mothersDay); + year++; } result.put("Muttertag", mothersDays); - result.put("Christi Himmelfahrt", holiDayEventRule(EasterHoliday.ASCENSION.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/05/29"), CONFORM.parse("2015/05/14"), CONFORM.parse("2016/05/05")}); result.put("Pfingstsonntag", holiDayEventRule(EasterHoliday.WHIT_SUNDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/06/08"), CONFORM.parse("2015/05/24"), CONFORM.parse("2016/05/15")}); result.put("Pfingstmontag", holiDayEventRule(EasterHoliday.WHIT_MONDAY.getRule(), currentYear, dateRulesTimeZone));// new Date[]{CONFORM.parse("2014/06/09"), CONFORM.parse("2015/05/25"), CONFORM.parse("2016/05/16")}); @@ -226,50 +221,48 @@ public class DateDetection { result.put("Allerseelen", sameDayEveryYear(Calendar.NOVEMBER, 2, currentYear)); result.put("Martinstag", sameDayEveryYear(Calendar.NOVEMBER, 11, currentYear)); result.put("St. Martin", result.get("Martinstag")); - result.put("Buß- und Bettag", holiDayEventRule(new SimpleDateRule(Calendar.NOVEMBER, 22, Calendar.WEDNESDAY, true), currentYear, dateRulesTimeZone)); // new Date[]{CONFORM.parse("2014/11/19"), CONFORM.parse("2015/11/18"), CONFORM.parse("2016/11/16")}); - result.put("Nikolaus", sameDayEveryYear(Calendar.DECEMBER, 6, currentYear)); result.put("Heiligabend", sameDayEveryYear(Calendar.DECEMBER, 24, currentYear)); result.put("1. Weihnachtsfeiertag", sameDayEveryYear(Calendar.DECEMBER, 25, currentYear)); result.put("2. Weihnachtsfeiertag", sameDayEveryYear(Calendar.DECEMBER, 26, currentYear)); - - /* Advent : four Sundays before Chritsmas */ - final Date[] advents1 = new Date[3], advents2 = new Date[3], advents3 = new Date[3], advents4 = new Date[3], - volkstrauertagen = new Date[3], sundaysOfTheDead = new Date[3]; - - year = currentYear - 1; - final TemporalAdjuster prevSunday = TemporalAdjusters.previous(DayOfWeek.SUNDAY); - for (int i = 0; i < 3; i++) { - final LocalDate christmas = LocalDate.of(year, java.time.Month.DECEMBER, 25); - final LocalDate advent4 = christmas.with(prevSunday); - final LocalDate advent3 = advent4.with(prevSunday); - final LocalDate advent2 = advent3.with(prevSunday); - final LocalDate advent1 = advent2.with(prevSunday); - final LocalDate sundayOfTheDead = advent1.with(prevSunday); - final LocalDate volkstrauertag = sundayOfTheDead.with(prevSunday); - advents4[i] = toMidnightUTCDate(advent4); - advents3[i] = toMidnightUTCDate(advent3); - advents2[i] = toMidnightUTCDate(advent2); - advents1[i] = toMidnightUTCDate(advent1); - sundaysOfTheDead[i] = toMidnightUTCDate(sundayOfTheDead); - volkstrauertagen[i] = toMidnightUTCDate(volkstrauertag); - year++; - } - result.put("1. Advent", advents1); - result.put("2. Advent", advents2); - result.put("3. Advent", advents3); - result.put("4. Advent", advents4); + /* Advent : four Sundays before Chritsmas */ + final Date[] advents1 = new Date[3], advents2 = new Date[3], advents3 = new Date[3], advents4 = new Date[3], + volkstrauertagen = new Date[3], sundaysOfTheDead = new Date[3]; - /* Sunday of the Dead (also called Eternity Sunday) : last Sunday before Advent */ + year = currentYear - 1; + final TemporalAdjuster prevSunday = TemporalAdjusters.previous(DayOfWeek.SUNDAY); + for (int i = 0; i < 3; i++) { + final LocalDate christmas = LocalDate.of(year, java.time.Month.DECEMBER, 25); + final LocalDate advent4 = christmas.with(prevSunday); + final LocalDate advent3 = advent4.with(prevSunday); + final LocalDate advent2 = advent3.with(prevSunday); + final LocalDate advent1 = advent2.with(prevSunday); + final LocalDate sundayOfTheDead = advent1.with(prevSunday); + final LocalDate volkstrauertag = sundayOfTheDead.with(prevSunday); + advents4[i] = toMidnightUTCDate(advent4); + advents3[i] = toMidnightUTCDate(advent3); + advents2[i] = toMidnightUTCDate(advent2); + advents1[i] = toMidnightUTCDate(advent1); + sundaysOfTheDead[i] = toMidnightUTCDate(sundayOfTheDead); + volkstrauertagen[i] = toMidnightUTCDate(volkstrauertag); + year++; + } + + result.put("1. Advent", advents1); + result.put("2. Advent", advents2); + result.put("3. Advent", advents3); + result.put("4. Advent", advents4); + + /* Sunday of the Dead (also called Eternity Sunday) : last Sunday before Advent */ result.put("Totensonntag", sundaysOfTheDead); /* "people's day of mourning" : two Sundays before Advent */ - result.put("Volkstrauertag", volkstrauertagen); - + result.put("Volkstrauertag", volkstrauertagen); + result.put("Silvester", sameDayEveryYear(Calendar.DECEMBER, 31, currentYear)); - + // English result.put("Eastern", result.get("Ostern")); result.put("New Year's Day", result.get("Neujahr")); @@ -286,23 +279,23 @@ public class DateDetection { result.put("Christmas Day", result.get("1. Weihnachtsfeiertag")); result.put("Boxing Day", result.get("2. Weihnachtsfeiertag")); result.put("New Year's Eve", result.get("Silvester")); - return result; - } - - /** - * Convert a date to an old style java.util.Date instance with time set at - * midnight on UTC time zone. - * - * @param localDate - * a simple date with year month and day without time zone - * @return a java.util.Date instance or null when localDate is null - */ - public static Date toMidnightUTCDate(final LocalDate localDate) { - if (localDate == null) { - return null; - } - return Date.from(ZonedDateTime.of(localDate, LocalTime.MIDNIGHT, UTC_TIMEZONE.toZoneId()).toInstant()); - } + return result; + } + + /** + * Convert a date to an old style java.util.Date instance with time set at + * midnight on UTC time zone. + * + * @param localDate + * a simple date with year month and day without time zone + * @return a java.util.Date instance or null when localDate is null + */ + public static Date toMidnightUTCDate(final LocalDate localDate) { + if (localDate == null) { + return null; + } + return Date.from(ZonedDateTime.of(localDate, LocalTime.MIDNIGHT, UTC_TIMEZONE.toZoneId()).toInstant()); + } /** * @param month value of month (Calendar.month is 0 based) @@ -330,40 +323,40 @@ public class DateDetection { * @return 3 years of same holiday starting in last year (currentYear - 1) */ private static Date[] holiDayEventRule(final DateRule holidayrule, final int currentYear, final TimeZone ruleTimeZone) { - final Date[] r = new Date[3]; - final Calendar january1Calendar = new GregorianCalendar(ruleTimeZone); - /* Clear all fields to get a 00:00:00:000 time part */ - january1Calendar.clear(); - - /* Calendar using UTC time zone to produce date results */ - final Calendar utcCalendar = new GregorianCalendar(UTC_TIMEZONE); - - /* Calendar using the same time zone as in the holidayrule to extract year,month, and day fields */ - final Calendar ruleCalendar = new GregorianCalendar(ruleTimeZone); + final Date[] r = new Date[3]; + final Calendar january1Calendar = new GregorianCalendar(ruleTimeZone); + /* Clear all fields to get a 00:00:00:000 time part */ + january1Calendar.clear(); - int year = currentYear -1; // set previous year as start year - for (int y = 0; y < 3; y++) { - january1Calendar.set(year, Calendar.JANUARY, 1); - Date holiday = holidayrule.firstAfter(january1Calendar.getTime()); - ruleCalendar.setTime(holiday); - utcCalendar.set(ruleCalendar.get(Calendar.YEAR), ruleCalendar.get(Calendar.MONTH), - ruleCalendar.get(Calendar.DAY_OF_MONTH)); - r[y] = utcCalendar.getTime(); - year++; - } - return r; + /* Calendar using UTC time zone to produce date results */ + final Calendar utcCalendar = new GregorianCalendar(UTC_TIMEZONE); + + /* Calendar using the same time zone as in the holidayrule to extract year,month, and day fields */ + final Calendar ruleCalendar = new GregorianCalendar(ruleTimeZone); + + int year = currentYear -1; // set previous year as start year + for (int y = 0; y < 3; y++) { + january1Calendar.set(year, Calendar.JANUARY, 1); + Date holiday = holidayrule.firstAfter(january1Calendar.getTime()); + ruleCalendar.setTime(holiday); + utcCalendar.set(ruleCalendar.get(Calendar.YEAR), ruleCalendar.get(Calendar.MONTH), + ruleCalendar.get(Calendar.DAY_OF_MONTH)); + r[y] = utcCalendar.getTime(); + year++; + } + return r; } - + /** * @param currentYear the current year reference to use * @param ruleTimeZone the time zone of calendar used in the holiday rule * @return Easter sunday and monday dates on three years starting from last year */ private static Date[] getOsternEventRule(final int currentYear, final TimeZone ruleTimeZone) { - ArrayList osternDates = new ArrayList<>(); - Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, ruleTimeZone)); - Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, ruleTimeZone)); - return osternDates.toArray(new Date[osternDates.size()]); + ArrayList osternDates = new ArrayList<>(); + Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_SUNDAY.getRule(), currentYear, ruleTimeZone)); + Collections.addAll(osternDates, holiDayEventRule(EasterHoliday.EASTER_MONDAY.getRule(), currentYear, ruleTimeZone)); + return osternDates.toArray(new Date[osternDates.size()]); } /** @@ -371,7 +364,7 @@ public class DateDetection { * It can also be used to identify the language of a text, if that text uses words from a date vocabulary. */ public static class LanguageRecognition { - + private final Pattern weekdayMatch, monthMatch; private final Set usedInLanguages; private final Map weekdayIndex, monthIndex, monthIndexAbbrev; @@ -395,7 +388,7 @@ public class DateDetection { weekdayMatchString.append("|(?:").append(BODNCG).append(weekdays[i]).append(SEPARATORNCG).append(EODNCG).append(')'); } } - + String[] months = Months.get(language); if (months != null) { assert months.length == 12; @@ -413,7 +406,7 @@ public class DateDetection { this.weekdayMatch = Pattern.compile(weekdayMatchString.length() > 0 ? weekdayMatchString.substring(1) : ""); this.monthMatch = Pattern.compile(monthMatchString.length() > 0 ? monthMatchString.substring(1) : ""); } - + /** * this is an expensive check that looks if any of the words from the date expressions (month and weekday expressions) * appear in the text. This should only be used to verify a parse result if the result was ambiguous @@ -423,7 +416,7 @@ public class DateDetection { public boolean usesLanguageOfNotion(String text) { return this.weekdayMatch.matcher(text).matches() || this.monthMatch.matcher(text).matches(); } - + /** * parse a part of a date * @param entity @@ -479,7 +472,7 @@ public class DateDetection { } return -1; } - + } private final static LanguageRecognition ENGLISH_LANGUAGE = new LanguageRecognition(new Language[]{Language.ENGLISH}); @@ -487,7 +480,7 @@ public class DateDetection { private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH}); private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH}); private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN, Language.PORTUGUESE}); - + public static interface StyleParser { /** * get all dates in the text @@ -496,7 +489,7 @@ public class DateDetection { */ public LinkedHashSet parse(String text); } - + /** * Regular expressions for various types of date writings. * Uses terminology and data taken from: @@ -526,7 +519,7 @@ public class DateDetection { this.pattern = Pattern.compile(patternString); this.languageParser = languageParser; } - + /** * get all dates in the text * @param text @@ -552,42 +545,42 @@ public class DateDetection { int month = this.firstEntity == EntityType.MONTH ? i1 : this.secondEntity == EntityType.MONTH ? i2 : i3; if (day > MaxDaysInMonth[month - 1]) continue; // validity check of the day number int year = this.firstEntity == EntityType.YEAR ? i1 : this.secondEntity == EntityType.YEAR ? i2 : i3; - final Date parsed = parseDateSafely( - year + "/" + (month < 10 ? "0" : "") + month + "/" + (day < 10 ? "0" : "") + day, CONFORM); + final Date parsed = parseDateSafely( + year + "/" + (month < 10 ? "0" : "") + month + "/" + (day < 10 ? "0" : "") + day, CONFORM); if(parsed != null) { - dates.add(parsed); + dates.add(parsed); } if (dates.size() > 100) {dates.clear(); break;} // that does not make sense } return dates; } - + } - - /** - * Safely parse the given string to an instant using the given formatter. Return - * null when the format can not be applied to the given string or when any - * parsing error occurred. - * - * @param str - * the string to parse - * @param formatter - * the formatter to use - * @return an Instant instance or null - */ - protected static Date parseDateSafely(final String str, final DateTimeFormatter formatter) { - Date res = null; - if (str != null && !str.isEmpty()) { - try { - if (formatter != null) { - res = Date.from(LocalDate.parse(str, formatter).atStartOfDay().toInstant(ZoneOffset.UTC)); - } - } catch (final RuntimeException ignored) { - } - } - return res; - } - + + /** + * Safely parse the given string to an instant using the given formatter. Return + * null when the format can not be applied to the given string or when any + * parsing error occurred. + * + * @param str + * the string to parse + * @param formatter + * the formatter to use + * @return an Instant instance or null + */ + protected static Date parseDateSafely(final String str, final DateTimeFormatter formatter) { + Date res = null; + if (str != null && !str.isEmpty()) { + try { + if (formatter != null) { + res = Date.from(LocalDate.parse(str, formatter).atStartOfDay().toInstant(ZoneOffset.UTC)); + } + } catch (final RuntimeException ignored) { + } + } + return res; + } + public static enum ShortStyle implements StyleParser { MD_ENGLISH(EntityType.MONTH, EntityType.DAY, // Big-endian (month, day), e.g. "from october 1st to september 13th" ENGLISH_LANGUAGE, @@ -647,21 +640,21 @@ public class DateDetection { final Date atThisYear = parseDateSafely(thisyear + datestub, CONFORM); if(atThisYear != null) { - dates.add(atThisYear); + dates.add(atThisYear); } - + final Date atNextYear = parseDateSafely(nextyear + datestub, CONFORM); if(atNextYear != null) { - dates.add(atNextYear); + dates.add(atNextYear); } //dates.add(atThisYear.after(TODAY) ? atThisYear : atNextYear); // we consider these kind of dates as given for the future if (dates.size() > 100) {dates.clear(); break;} // that does not make sense } return dates; } - + } - + private static final HashMap specialDayOffset = new HashMap<>(); static { specialDayOffset.put("today", 0L); specialDayOffset.put("heute", 0L); @@ -669,7 +662,7 @@ public class DateDetection { specialDayOffset.put("dayaftertomorrow", 2 * AbstractFormatter.dayMillis); specialDayOffset.put("uebermorgen", 2 * AbstractFormatter.dayMillis); specialDayOffset.put("yesterday", -AbstractFormatter.dayMillis); specialDayOffset.put("gestern", -AbstractFormatter.dayMillis); } - + /** * get all dates in the text * @param text @@ -679,7 +672,7 @@ public class DateDetection { public static LinkedHashSet parse(String text, int timezoneOffset) { LinkedHashSet dates = parseRawDate(text); - + for (Map.Entry entry: HolidayPattern.entrySet()) { if (entry.getKey().matcher(text).find()) { for (Date d: entry.getValue()) dates.add(d); @@ -701,12 +694,12 @@ public class DateDetection { Date d = parseDateSafely(text, CONFORM); //if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use if (d == null) { - d = parseDateSafely(text, GenericFormatter.FORMAT_RFC1123_SHORT); + d = parseDateSafely(text, GenericFormatter.FORMAT_RFC1123_SHORT); } if (d == null) { - d = parseDateSafely(text, GenericFormatter.FORMAT_ANSIC); + d = parseDateSafely(text, GenericFormatter.FORMAT_ANSIC); } - + if (d == null) { // check other date formats Set dd = parseRawDate(text); @@ -734,7 +727,7 @@ public class DateDetection { } return d; } - + private static LinkedHashSet parseRawDate(String text) { // get parse alternatives for different date styles; we consider that one document uses only one style LinkedHashSet DMYDates = EndianStyle.DMY.parse(text); @@ -745,34 +738,34 @@ public class DateDetection { if (DMDates.size() > 0) break; } DMYDates.addAll(DMDates); - + LinkedHashSet MDYDates = DMYDates.size() == 0 ? EndianStyle.MDY.parse(text) : new LinkedHashSet(0); LinkedHashSet MDDates = DMYDates.size() == 0 ? ShortStyle.MD_ENGLISH.parse(text) : new LinkedHashSet(0); MDYDates.addAll(MDDates); - + LinkedHashSet YMDDates = DMYDates.size() == 0 && MDYDates.size() == 0 ? EndianStyle.YMD.parse(text) : new LinkedHashSet(0); - + // if either one of them contains any and the other contain no date, chose that one (we don't want to mix them) if (YMDDates.size() > 0 && DMYDates.size() == 0 && MDYDates.size() == 0) return YMDDates; if (YMDDates.size() == 0 && DMYDates.size() > 0 && MDYDates.size() == 0) return DMYDates; if (YMDDates.size() == 0 && DMYDates.size() == 0 && MDYDates.size() > 0) return MDYDates; - + // if we have several sets, check if we can detect the language from month or weekday expressions // we sort out such sets, which do not contain any of these languages boolean usesLanguageOfYMD = YMDDates.size() > 0 ? false : EndianStyle.YMD.languageParser.usesLanguageOfNotion(text); boolean usesLanguageOfDMY = DMYDates.size() > 0 ? false : EndianStyle.DMY.languageParser.usesLanguageOfNotion(text); boolean usesLanguageOfMDY = MDYDates.size() > 0 ? false : EndianStyle.MDY.languageParser.usesLanguageOfNotion(text); - + // now check again if (usesLanguageOfYMD && !usesLanguageOfDMY && !usesLanguageOfMDY) return YMDDates; if (!usesLanguageOfYMD && usesLanguageOfDMY && !usesLanguageOfMDY) return DMYDates; if (!usesLanguageOfYMD && !usesLanguageOfDMY && usesLanguageOfMDY) return MDYDates; - + // if this fails, we return only the DMY format since that has the most chances to be right (it is mostly used) // we choose DMYDates even if it is empty to avoid false positives. return DMYDates; } - + public static void main(String[] args) { String fill = ""; for (int i = 0; i < 1000; i++) fill += 'x'; String[] test = new String[]{ @@ -819,6 +812,6 @@ public class DateDetection { System.out.println(); } System.out.println("Runtime: " + (System.currentTimeMillis() - t) + " milliseconds."); - } - + } + } diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index a8af87d25..be8d8b5b4 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -34,57 +34,57 @@ import java.util.List; */ public class SentenceReader implements Iterator, Iterable { - /** Holds the next element */ + /** Holds the next element */ private StringBuilder buffer; - + /** List of already parsed sentences, eventually in addition to those extracted from the main text. */ private List parsedSentences; - + /** Current position in the parsedSentences list. */ private int sentencesPos; - + /** The main text to parse for sentences */ private String text; - + /** The current character position in the main text */ private int pos; - + /** When true sentences can not include line break characters */ private boolean pre = false; public SentenceReader(final String text) { - this(new ArrayList<>(), text, false); + this(new ArrayList<>(), text, false); } public SentenceReader(final String text, final boolean pre) { - this(new ArrayList<>(), text, pre); + this(new ArrayList<>(), text, pre); } - + public SentenceReader(final List parsedSentences, final String text, final boolean pre) { - assert text != null; + assert text != null; this.text = text; this.pos = 0; this.pre = pre; if(parsedSentences == null) { - this.parsedSentences = new ArrayList<>(); + this.parsedSentences = new ArrayList<>(); } else { - this.parsedSentences = parsedSentences; + this.parsedSentences = parsedSentences; } this.sentencesPos = 0; this.buffer = nextElement0(); } - + public void pre(final boolean x) { this.pre = x; } private StringBuilder nextElement0() { - if(this.sentencesPos < this.parsedSentences.size()) { - final StringBuilder element = this.parsedSentences.get(this.sentencesPos); - this.sentencesPos++; - return element; - } - + if(this.sentencesPos < this.parsedSentences.size()) { + final StringBuilder element = this.parsedSentences.get(this.sentencesPos); + this.sentencesPos++; + return element; + } + final StringBuilder s = new StringBuilder(80); int nextChar; char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' ' @@ -112,10 +112,10 @@ public class SentenceReader implements Iterator, Iterable= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false; - // then check more complex case which applies to all character sets - final int type = Character.getType(c); + // first check average simple case + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false; + // then check more complex case which applies to all character sets + final int type = Character.getType(c); return !(type == Character.LOWERCASE_LETTER || type == Character.DECIMAL_DIGIT_NUMBER || type == Character.UPPERCASE_LETTER @@ -153,19 +153,19 @@ public class SentenceReader implements Iterator, Iterable iterator() { return this; } - + /** * Reset the iterator position to zero */ public void reset() { - /* Reset only the sentences position to reuse already parsed sentences */ - this.sentencesPos = 0; - this.buffer = nextElement0(); + /* Reset only the sentences position to reuse already parsed sentences */ + this.sentencesPos = 0; + this.buffer = nextElement0(); } public synchronized void close() { - this.text = null; - this.parsedSentences = null; + this.text = null; + this.parsedSentences = null; } public static void main(String[] args) { diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index 275a2215a..8ed219bf4 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -59,7 +59,7 @@ public class Tokenizer { protected final Map words; // a string (the words) to (indexWord) - relation (key: words are lowercase) private final Set synonyms; // a set of synonyms to the words protected final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging - + public int RESULT_NUMB_WORDS = -1; public int RESULT_NUMB_SENTENCES = -1; public Bitfield RESULT_FLAGS = new Bitfield(4); @@ -70,7 +70,7 @@ public class Tokenizer { assert text != null; final String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; for (int i = 0; i < wordcache.length; i++) { - wordcache[i] = ""; + wordcache[i] = ""; } String k; int wordlen; @@ -167,95 +167,95 @@ public class Tokenizer { if (syms != null) this.synonyms.addAll(syms); } } - + // store result this.RESULT_NUMB_WORDS = allwordcounter; // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text. this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0); } - /** - * Check whether a single word or multiple ones match tags - * from the given autotagging vocabularies. Then fill this instance "tags" map - * with the eventually matching tags found. - * - * @param wordcache - * the words to be checked for matching a tag as a single word or as combination of words - * @param word - * an additional word to be considered for tag matching - * @param vocabularyNames - * names of the autotagging vocabularies to check - */ - protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set vocabularyNames) { - Tagging.Metatag tag; - if (vocabularyNames.size() > 0) { - for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { - // wordc is number of words that are tested - StringBuilder sb = new StringBuilder(); - if (wordc == 1) { - sb.append(word); - } else { - for (int w = 0; w < wordc - 1; w++) { - sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); - } - sb.append(word); - } - String testterm = sb.toString().trim(); - tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm); - if (tag != null) { - String navigatorName = tag.getVocabularyName(); - Set tagset = this.tags.get(navigatorName); - if (tagset == null) { - tagset = new HashSet(); - this.tags.put(navigatorName, tagset); - } - tagset.add(tag); - } - } - } - } + /** + * Check whether a single word or multiple ones match tags + * from the given autotagging vocabularies. Then fill this instance "tags" map + * with the eventually matching tags found. + * + * @param wordcache + * the words to be checked for matching a tag as a single word or as combination of words + * @param word + * an additional word to be considered for tag matching + * @param vocabularyNames + * names of the autotagging vocabularies to check + */ + protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set vocabularyNames) { + Tagging.Metatag tag; + if (vocabularyNames.size() > 0) { + for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { + // wordc is number of words that are tested + StringBuilder sb = new StringBuilder(); + if (wordc == 1) { + sb.append(word); + } else { + for (int w = 0; w < wordc - 1; w++) { + sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); + } + sb.append(word); + } + String testterm = sb.toString().trim(); + tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm); + if (tag != null) { + String navigatorName = tag.getVocabularyName(); + Set tagset = this.tags.get(navigatorName); + if (tagset == null) { + tagset = new HashSet(); + this.tags.put(navigatorName, tagset); + } + tagset.add(tag); + } + } + } + } - /** - * Extend the specified vocabularies, with terms eventually found by the - * vocabulary scraper for these vocabularies. The scraper is emptied after - * processing, and extended vocabularies names are removed from the - * vocabularyNames. - * - * @param root - * the document URL - * @param scraper - * the vocabulary scraper, eventually containing new terms scraped - * for the registered vocabularies - * @param vocabularyNames - * vocabularies names to be extended - */ - protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper, - final Set vocabularyNames) { - Tagging.Metatag tag; - Map vocMap = scraper == null ? null : scraper.removeVocMap(root); - if (vocMap != null && vocMap.size() > 0) { - for (Map.Entry entry: vocMap.entrySet()) { - String navigatorName = entry.getKey(); - String term = entry.getValue(); - vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation - Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName); - if (vocabulary != null) { - // extend the vocabulary - String obj = vocabulary.getObjectlink(term); - if (obj == null) { - try { - vocabulary.put(term, "", root.toNormalform(true)); - } catch (IOException e) {} // this makes IO, be careful! - } - // create annotation - tag = vocabulary.getMetatagFromTerm(term); - Set tagset = new HashSet<>(); - tagset.add(tag); - this.tags.put(navigatorName, tagset); - } - } - } - } + /** + * Extend the specified vocabularies, with terms eventually found by the + * vocabulary scraper for these vocabularies. The scraper is emptied after + * processing, and extended vocabularies names are removed from the + * vocabularyNames. + * + * @param root + * the document URL + * @param scraper + * the vocabulary scraper, eventually containing new terms scraped + * for the registered vocabularies + * @param vocabularyNames + * vocabularies names to be extended + */ + protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper, + final Set vocabularyNames) { + Tagging.Metatag tag; + Map vocMap = scraper == null ? null : scraper.removeVocMap(root); + if (vocMap != null && vocMap.size() > 0) { + for (Map.Entry entry: vocMap.entrySet()) { + String navigatorName = entry.getKey(); + String term = entry.getValue(); + vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation + Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName); + if (vocabulary != null) { + // extend the vocabulary + String obj = vocabulary.getObjectlink(term); + if (obj == null) { + try { + vocabulary.put(term, "", root.toNormalform(true)); + } catch (IOException e) {} // this makes IO, be careful! + } + // create annotation + tag = vocabulary.getMetatagFromTerm(term); + Set tagset = new HashSet<>(); + tagset.add(tag); + this.tags.put(navigatorName, tagset); + } + } + } + } /** * @return returns the words as word/indexWord relation map. All words are lowercase. @@ -264,7 +264,7 @@ public class Tokenizer { // returns the words as word/indexWord relation map return this.words; } - + public static Map getWords(final String text, final WordCache meaningLib) { // returns a word/indexWord relation map if (text == null) return null; @@ -276,7 +276,7 @@ public class Tokenizer { for (String s: this.synonyms) l.add(s); return l; } - + public Map> tags() { return this.tags; } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 0ed51479f..f2acc4a39 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -37,7 +37,7 @@ import net.yacy.kelondro.data.word.Word; public class WordTokenizer implements Enumeration { - // this enumeration removes all words that contain either wrong characters or are too short + // this enumeration removes all words that contain either wrong characters or are too short private StringBuilder buffer = null; private unsievedWordsEnum e; @@ -78,9 +78,9 @@ public class WordTokenizer implements Enumeration { } public synchronized void close() { - this.e.close(); - this.e = null; - this.buffer = null; + this.e.close(); + this.e = null; + this.buffer = null; } private class unsievedWordsEnum implements Enumeration { @@ -189,29 +189,29 @@ public class WordTokenizer implements Enumeration { final SortedMap map = new TreeMap(Base64Order.enhancedCoder); WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null); try { - int pos = 0; - StringBuilder word; - byte[] hash; - Integer oldpos; - while (words.hasMoreElements() && maxlength-- > 0) { - word = words.nextElement(); - hash = Word.word2hash(word); + int pos = 0; + StringBuilder word; + byte[] hash; + Integer oldpos; + while (words.hasMoreElements() && maxlength-- > 0) { + word = words.nextElement(); + hash = Word.word2hash(word); - // don't overwrite old values, that leads to too far word distances - oldpos = map.put(hash, LargeNumberCache.valueOf(pos)); - if (oldpos != null) { - map.put(hash, oldpos); - } + // don't overwrite old values, that leads to too far word distances + oldpos = map.put(hash, LargeNumberCache.valueOf(pos)); + if (oldpos != null) { + map.put(hash, oldpos); + } - pos += word.length() + 1; - } - return map; + pos += word.length() + 1; + } + return map; } finally { - words.close(); - words = null; + words.close(); + words = null; } } - + /** * Tokenize the given sentence and generate a word-wordPos mapping * @param sentence the sentence to be tokenized @@ -221,24 +221,24 @@ public class WordTokenizer implements Enumeration { final SortedMap map = new TreeMap(); WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null); try { - int pos = 0; - String word; - Integer oldpos; - while (words.hasMoreElements() && maxlength-- > 0) { - word = words.nextElement().toString().toLowerCase(Locale.ENGLISH); + int pos = 0; + String word; + Integer oldpos; + while (words.hasMoreElements() && maxlength-- > 0) { + word = words.nextElement().toString().toLowerCase(Locale.ENGLISH); - // don't overwrite old values, that leads to too far word distances - oldpos = map.put(word, LargeNumberCache.valueOf(pos)); - if (oldpos != null) { - map.put(word, oldpos); - } + // don't overwrite old values, that leads to too far word distances + oldpos = map.put(word, LargeNumberCache.valueOf(pos)); + if (oldpos != null) { + map.put(word, oldpos); + } - pos += word.length() + 1; - } - return map; + pos += word.length() + 1; + } + return map; } finally { - words.close(); - words = null; + words.close(); + words = null; } } } diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index a9628788e..1ba1b619b 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -59,7 +59,7 @@ public final class Identificator { */ public void add(final String word) { if (word == null || this.detector == null) { - return; + return; } this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars } @@ -71,24 +71,24 @@ public final class Identificator { * @return 2 char language code (ISO 639-1) */ public String getLanguage() { - if(this.detector != null) { - try { - ArrayList probabilities = this.detector.getProbabilities(); - if(probabilities.isEmpty()) return null; - this.language = this.detector.getProbabilities().get(0); - } catch (LangDetectException e) { - // this contains mostly the message "no features in text" - //ConcurrentLog.logException(e); - return null; - } - // Return language only if probability is higher than 30% to account for missing language profiles - if (this.language.prob > 0.3) { - if (this.language.lang.length() == 2) { - return this.language.lang; - } - return this.language.lang.substring(0,2); - } - } + if(this.detector != null) { + try { + ArrayList probabilities = this.detector.getProbabilities(); + if(probabilities.isEmpty()) return null; + this.language = this.detector.getProbabilities().get(0); + } catch (LangDetectException e) { + // this contains mostly the message "no features in text" + //ConcurrentLog.logException(e); + return null; + } + // Return language only if probability is higher than 30% to account for missing language profiles + if (this.language.prob > 0.3) { + if (this.language.lang.length() == 2) { + return this.language.lang; + } + return this.language.lang.substring(0,2); + } + } return null; diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index 77e4010d4..18a406cb5 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -111,17 +111,17 @@ public class Word { // create a word hash public static final byte[] word2hash(final String word) { - final String wordlc = word.toLowerCase(Locale.ENGLISH); - byte[] h = hashCache.get(wordlc); + final String wordlc = word.toLowerCase(Locale.ENGLISH); + byte[] h = hashCache.get(wordlc); if (h != null) return h; // calculate the hash - h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); - while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) { - // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer - // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never) - System.arraycopy(h, 1, h, 0, commonHashLength - 1); - h[commonHashLength - 1] = lowByte; - } + h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); + while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) { + // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer + // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never) + System.arraycopy(h, 1, h, 0, commonHashLength - 1); + h[commonHashLength - 1] = lowByte; + } assert h[2] != '@'; if (MemoryControl.shortStatus()) { hashCache.clear(); diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index ac10466f2..683908d6a 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -73,11 +73,11 @@ public final class WordReferenceRow extends AbstractReference implements WordRef // available chars: b,e,j,q /** - * object for termination of concurrent blocking queue processing - */ + * object for termination of concurrent blocking queue processing + */ protected static final Row.Entry poisonRowEntry = urlEntryRow.newEntry(); - - // static properties + + // static properties private static final int col_urlhash = 0; // h 12 the url hash b64-encoded private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short @@ -207,7 +207,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef this.entry.setCol(col_posinphrase, word.posInPhrase); this.entry.setCol(col_posofphrase, word.numOfPhrase); } - + public WordReferenceRow(final String external) { this.entry = urlEntryRow.newEntry(external, true); } diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index 7f6a1b983..96a38445a 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -58,9 +58,9 @@ public final class SetTools { public static int log2a(int x) { // this computes 1 + log2 // it is the number of bits in x, not the logarithm by 2 - int l = 0; - while (x > 0) {x = x >>> 1; l++;} - return l; + int l = 0; + while (x > 0) {x = x >>> 1; l++;} + return l; } // ------------------------------------------------------------------------------------------------ @@ -178,7 +178,7 @@ public final class SetTools { Map.Entry mentry1 = mi1.next(); Map.Entry mentry2 = mi2.next(); while (true) { - c = comp.compare(mentry1.getKey(), mentry2.getKey()); + c = comp.compare(mentry1.getKey(), mentry2.getKey()); if (c < 0) { if (mi1.hasNext()) mentry1 = mi1.next(); else break; } else if (c > 0) { @@ -201,7 +201,7 @@ public final class SetTools { // now the same for set-set public static SortedSet joinConstructive(final SortedSet set1, final SortedSet set2) { - // comparators must be equal + // comparators must be equal if ((set1 == null) || (set2 == null)) return null; if (set1.comparator() != set2.comparator()) return null; if (set1.isEmpty() || set2.isEmpty()) return new TreeSet(set1.comparator()); @@ -214,46 +214,46 @@ public final class SetTools { // start most efficient method if (stepsEnum > stepsTest) { - if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2); - return joinConstructiveByTest(set2.iterator(), set1); + if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2); + return joinConstructiveByTest(set2.iterator(), set1); } return joinConstructiveByEnumeration(set1, set2); } public static SortedSet joinConstructiveByTest(final Iterator small, final SortedSet large) { - final SortedSet result = new TreeSet(large.comparator()); - A o; - while (small.hasNext()) { - o = small.next(); - if (large.contains(o)) result.add(o); - } - return result; + final SortedSet result = new TreeSet(large.comparator()); + A o; + while (small.hasNext()) { + o = small.next(); + if (large.contains(o)) result.add(o); + } + return result; } private static SortedSet joinConstructiveByEnumeration(final SortedSet set1, final SortedSet set2) { - // implement pairwise enumeration - final Comparator comp = set1.comparator(); - final Iterator mi = set1.iterator(); - final Iterator si = set2.iterator(); - final SortedSet result = new TreeSet(set1.comparator()); - int c; - if ((mi.hasNext()) && (si.hasNext())) { - A mobj = mi.next(); - A sobj = si.next(); - while (true) { - c = comp.compare(mobj, sobj); - if (c < 0) { - if (mi.hasNext()) mobj = mi.next(); else break; - } else if (c > 0) { - if (si.hasNext()) sobj = si.next(); else break; - } else { - result.add(mobj); - if (mi.hasNext()) mobj = mi.next(); else break; - if (si.hasNext()) sobj = si.next(); else break; - } - } - } - return result; + // implement pairwise enumeration + final Comparator comp = set1.comparator(); + final Iterator mi = set1.iterator(); + final Iterator si = set2.iterator(); + final SortedSet result = new TreeSet(set1.comparator()); + int c; + if ((mi.hasNext()) && (si.hasNext())) { + A mobj = mi.next(); + A sobj = si.next(); + while (true) { + c = comp.compare(mobj, sobj); + if (c < 0) { + if (mi.hasNext()) mobj = mi.next(); else break; + } else if (c > 0) { + if (si.hasNext()) sobj = si.next(); else break; + } else { + result.add(mobj); + if (mi.hasNext()) mobj = mi.next(); else break; + if (si.hasNext()) sobj = si.next(); else break; + } + } + } + return result; } /** @@ -289,23 +289,23 @@ public final class SetTools { * @return true if any element of the first set is part of the second set or vice-versa */ public static boolean anymatch(final SortedSet set1, final SortedSet set2) { - // comparators must be equal - if ((set1 == null) || (set2 == null)) return false; - if (set1.comparator() != set2.comparator()) return false; - if (set1.isEmpty() || set2.isEmpty()) return false; + // comparators must be equal + if ((set1 == null) || (set2 == null)) return false; + if (set1.comparator() != set2.comparator()) return false; + if (set1.isEmpty() || set2.isEmpty()) return false; - // decide which method to use - final int high = ((set1.size() > set2.size()) ? set1.size() : set2.size()); - final int low = ((set1.size() > set2.size()) ? set2.size() : set1.size()); - final int stepsEnum = 10 * (high + low - 1); - final int stepsTest = 12 * log2a(high) * low; + // decide which method to use + final int high = ((set1.size() > set2.size()) ? set1.size() : set2.size()); + final int low = ((set1.size() > set2.size()) ? set2.size() : set1.size()); + final int stepsEnum = 10 * (high + low - 1); + final int stepsTest = 12 * log2a(high) * low; - // start most efficient method - if (stepsEnum > stepsTest) { - return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1); - } - return anymatchByEnumeration(set1, set2); - } + // start most efficient method + if (stepsEnum > stepsTest) { + return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1); + } + return anymatchByEnumeration(set1, set2); + } /** * test if the intersection of two sets is not empty @@ -545,7 +545,7 @@ public final class SetTools { } catch (final IOException e) { } finally { if (br != null) try{br.close();}catch(final Exception e){ - ConcurrentLog.warn("SetTools", "Could not close input stream on file " + file); + ConcurrentLog.warn("SetTools", "Could not close input stream on file " + file); } } return list; @@ -577,52 +577,52 @@ public final class SetTools { for (Object o: c) if (i++ == n) return o; return null; } - + // ------------------------------------------------------------------------------------------------ public static void main(final String[] args) { - final SortedMap m = new TreeMap(); - final SortedMap s = new TreeMap(); - m.put("a", "a"); - m.put("x", "x"); - m.put("f", "f"); - m.put("h", "h"); - m.put("w", "w"); - m.put("7", "7"); - m.put("t", "t"); - m.put("k", "k"); - m.put("y", "y"); - m.put("z", "z"); - s.put("a", "a"); - s.put("b", "b"); - s.put("c", "c"); - s.put("k", "k"); - s.put("l", "l"); - s.put("m", "m"); - s.put("n", "n"); - s.put("o", "o"); - s.put("p", "p"); - s.put("q", "q"); - s.put("r", "r"); - s.put("s", "s"); - s.put("t", "t"); - s.put("x", "x"); - System.out.println("Compare " + m.toString() + " with " + s.toString()); - System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true)); - System.out.println("Join=" + joinConstructiveByTest(m, s, true)); - System.out.println("Join=" + joinConstructiveByTest(m, s, true)); - System.out.println("Join=" + joinConstructive(m, s, true)); - //System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet())); + final SortedMap m = new TreeMap(); + final SortedMap s = new TreeMap(); + m.put("a", "a"); + m.put("x", "x"); + m.put("f", "f"); + m.put("h", "h"); + m.put("w", "w"); + m.put("7", "7"); + m.put("t", "t"); + m.put("k", "k"); + m.put("y", "y"); + m.put("z", "z"); + s.put("a", "a"); + s.put("b", "b"); + s.put("c", "c"); + s.put("k", "k"); + s.put("l", "l"); + s.put("m", "m"); + s.put("n", "n"); + s.put("o", "o"); + s.put("p", "p"); + s.put("q", "q"); + s.put("r", "r"); + s.put("s", "s"); + s.put("t", "t"); + s.put("x", "x"); + System.out.println("Compare " + m.toString() + " with " + s.toString()); + System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true)); + System.out.println("Join=" + joinConstructiveByTest(m, s, true)); + System.out.println("Join=" + joinConstructiveByTest(m, s, true)); + System.out.println("Join=" + joinConstructive(m, s, true)); + //System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet())); - /* - for (int low = 0; low < 10; low++) - for (int high = 0; high < 100; high=high + 10) { - int stepsEnum = 10 * high; - int stepsTest = 12 * log2(high) * low; - System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest")); - } - */ + /* + for (int low = 0; low < 10; low++) + for (int high = 0; high < 100; high=high + 10) { + int stepsEnum = 10 * high; + int stepsTest = 12 * log2(high) * low; + System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest")); + } + */ }