2014-12-14 13:43:30 +01:00
/ * *
* DateDetection
* Copyright 2014 by Michael Peter Christen
* First released 12 . 12 . 2014 at http : //yacy.net
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2 . 1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21 . txt
* If not , see < http : //www.gnu.org/licenses/>.
* /
package net.yacy.document ;
import java.text.ParseException ;
import java.text.SimpleDateFormat ;
import java.util.Date ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.LinkedHashMap ;
import java.util.LinkedHashSet ;
import java.util.Locale ;
import java.util.Map ;
import java.util.Set ;
import java.util.TimeZone ;
import java.util.TreeMap ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2015-03-02 13:10:05 +01:00
import net.yacy.cora.date.AbstractFormatter ;
2014-12-16 13:53:12 +01:00
import net.yacy.cora.date.GenericFormatter ;
2014-12-14 13:43:30 +01:00
/ * *
* The purpose of this class exceeds the demands on simple date parsing using a SimpleDateFormat
* because it tries to
* - discover where in a text a date is given
* - recognize human ways of date description and get it into a context , like ' next friday '
* - enrich partially given dates , i . e . when the year is omitted
* - understand different languages
* /
public class DateDetection {
private static final TimeZone TZ_GMT = TimeZone . getTimeZone ( " GMT " ) ;
private static final String CONPATT = " yyyy/MM/dd " ;
private static final SimpleDateFormat CONFORM = new SimpleDateFormat ( CONPATT , Locale . US ) ;
private static final LinkedHashMap < Language , String [ ] > Weekdays = new LinkedHashMap < > ( ) ;
private static final LinkedHashMap < Language , String [ ] > Months = new LinkedHashMap < > ( ) ;
private static final int [ ] MaxDaysInMonth = new int [ ] { 31 , 29 , 31 , 30 , 31 , 30 , 31 , 31 , 30 , 31 , 30 , 31 } ;
// to assign names for days and months, we must know what language is used to express that time
public static enum Language {
GERMAN , ENGLISH , FRENCH , SPANISH , ITALIAN ;
}
static {
CONFORM . setTimeZone ( TZ_GMT ) ;
// all names must be lowercase because compared strings are made to lowercase as well
Weekdays . put ( Language . GERMAN , new String [ ] { " montag " , " dienstag " , " mittwoch " , " donnerstag " , " freitag " , " samstag " /*oder: "sonnabend"*/ , " sonntag " } ) ;
Weekdays . put ( Language . ENGLISH , new String [ ] { " monday " , " tuesday " , " wednesday " , " thursday " , " friday " , " saturday " , " sunday " } ) ;
Weekdays . put ( Language . FRENCH , new String [ ] { " lundi " , " mardi " , " mercredi " , " jeudi " , " vendredi " , " samedi " , " dimanche " } ) ;
Weekdays . put ( Language . SPANISH , new String [ ] { " lunes " , " martes " , " miércoles " , " jueves " , " viernes " , " sábado " , " domingo " } ) ;
Weekdays . put ( Language . ITALIAN , new String [ ] { " lunedì " , " martedì " , " mercoledì " , " giovedì " , " venerdì " , " sabato " , " domenica " } ) ;
Months . put ( Language . GERMAN , new String [ ] { " januar " , " februar " , " märz " , " april " , " mai " , " juni " , " juli " , " august " , " september " , " oktober " , " november " , " dezember " } ) ;
Months . put ( Language . ENGLISH , new String [ ] { " january " , " february " , " march " , " april " , " may " , " june " , " july " , " august " , " september " , " october " , " november " , " december " } ) ;
Months . put ( Language . FRENCH , new String [ ] { " janvier " , " février " , " mars " , " avril " , " mai " , " juin " , " juillet " , " août " , " septembre " , " octobre " , " novembre " , " décembre " } ) ;
Months . put ( Language . SPANISH , new String [ ] { " enero " , " febrero " , " marzo " , " abril " , " mayo " , " junio " , " julio " , " agosto " , " septiembre " , " octubre " , " noviembre " , " diciembre " } ) ;
Months . put ( Language . ITALIAN , new String [ ] { " gennaio " , " febbraio " , " marzo " , " aprile " , " maggio " , " giugno " , " luglio " , " agosto " , " settembre " , " ottobre " , " novembre " , " dicembre " } ) ;
}
// RFC 822 day and month specification as a norm for date formats. This is needed to reconstruct the actual date later
public static enum Weekday {
Mon ( Weekdays , 0 ) ,
Tue ( Weekdays , 1 ) ,
Wed ( Weekdays , 2 ) ,
Thu ( Weekdays , 3 ) ,
Fri ( Weekdays , 4 ) ,
Sat ( Weekdays , 5 ) ,
Sun ( Weekdays , 6 ) ;
private final Map < String , Language > inLanguages ; // a map from the word to the language
public final int offset ; // the day offset in the week, monday = 0
private Weekday ( final LinkedHashMap < Language , String [ ] > weekdayMap , final int offset ) {
this . inLanguages = new HashMap < > ( ) ;
this . offset = offset ;
for ( Map . Entry < Language , String [ ] > entry : weekdayMap . entrySet ( ) ) {
this . inLanguages . put ( entry . getValue ( ) [ offset ] , entry . getKey ( ) ) ;
}
}
}
public static enum Month {
Jan ( 1 ) , Feb ( 2 ) , Mar ( 3 ) , Apr ( 4 ) , May ( 5 ) , Jun ( 6 ) ,
Jul ( 7 ) , Aug ( 8 ) , Sep ( 9 ) , Oct ( 10 ) , Nov ( 11 ) , Dec ( 12 ) ;
//private final Map<String, Language> inLanguages;
private final int count ;
private Month ( final int count ) {
this . count = count ;
}
}
public static enum EntityType {
YEAR ( new LinkedHashMap < Language , String [ ] > ( ) ) ,
MONTH ( Months ) ,
DAY ( new LinkedHashMap < Language , String [ ] > ( ) ) ,
WEEKDAYS ( Weekdays ) ;
LinkedHashMap < Language , String [ ] > languageTerms ;
EntityType ( LinkedHashMap < Language , String [ ] > languageTerms ) {
this . languageTerms = languageTerms ;
}
}
private final static Date TODAY = new Date ( ) ;
private final static int CURRENT_YEAR = Integer . parseInt ( CONFORM . format ( TODAY ) . substring ( 0 , 4 ) ) ; // we need that to parse dates without given years, see the ShortStyle class
private final static int CURRENT_MONTH = Integer . parseInt ( CONFORM . format ( TODAY ) . substring ( 5 , 7 ) ) ; // wee need that to generate recurring dates, see RecurringStyle class
2015-02-25 01:05:46 +01:00
private final static String BODNCG = " (?: \\ b|^) " ; // begin of date non-capturing group
private final static String EODNCG = " (?:[).:;! ]|$) " ; // end of date non-capturing group
2014-12-14 13:43:30 +01:00
private final static String SEPARATORNCG = " (?:/|-| - | \\ . \\ s|, \\ s| \\ .|,| \\ s) " ; // separator non-capturing group
private final static String DAYCAPTURE = " ( \\ d{1,2}) " ;
private final static String YEARCAPTURE = " ( \\ d{2}| \\ d{4}) " ;
private final static String MONTHCAPTURE = " ( \\ p{L}{3,}| \\ d{1,2}) " ;
public static class HolidayMap extends TreeMap < String , Date [ ] > {
private static final long serialVersionUID = 1L ;
public HolidayMap ( ) {
super ( String . CASE_INSENSITIVE_ORDER ) ;
}
}
public static HolidayMap Holidays = new HolidayMap ( ) ;
public static Map < Pattern , Date [ ] > HolidayPattern = new HashMap < > ( ) ;
static {
try {
// German
Holidays . put ( " Neujahr " , sameDayEveryYear ( 1 , 1 ) ) ;
Holidays . put ( " Heilige Drei Könige " , sameDayEveryYear ( 1 , 6 ) ) ;
Holidays . put ( " Valentinstag " , sameDayEveryYear ( 2 , 14 ) ) ;
Holidays . put ( " Weiberfastnacht " , new Date [ ] { CONFORM . parse ( " 2014/02/27 " ) , CONFORM . parse ( " 2015/02/12 " ) , CONFORM . parse ( " 2016/02/04 " ) } ) ;
Holidays . put ( " Weiberfasching " , Holidays . get ( " Weiberfastnacht " ) ) ;
Holidays . put ( " Rosenmontag " , new Date [ ] { CONFORM . parse ( " 2014/03/03 " ) , CONFORM . parse ( " 2015/03/16 " ) , CONFORM . parse ( " 2016/02/08 " ) } ) ;
Holidays . put ( " Faschingsdienstag " , new Date [ ] { CONFORM . parse ( " 2014/03/04 " ) , CONFORM . parse ( " 2015/03/17 " ) , CONFORM . parse ( " 2016/02/09 " ) } ) ;
Holidays . put ( " Fastnacht " , new Date [ ] { CONFORM . parse ( " 2014/03/04 " ) , CONFORM . parse ( " 2015/03/17 " ) , CONFORM . parse ( " 2016/02/09 " ) } ) ;
Holidays . put ( " Aschermittwoch " , new Date [ ] { CONFORM . parse ( " 2014/03/05 " ) , CONFORM . parse ( " 2015/03/18 " ) , CONFORM . parse ( " 2016/02/10 " ) } ) ;
Holidays . put ( " Palmsonntag " , new Date [ ] { CONFORM . parse ( " 2014/04/13 " ) , CONFORM . parse ( " 2015/03/29 " ) , CONFORM . parse ( " 2016/04/20 " ) } ) ;
Holidays . put ( " Gründonnerstag " , new Date [ ] { CONFORM . parse ( " 2014/04/17 " ) , CONFORM . parse ( " 2015/04/02 " ) , CONFORM . parse ( " 2016/04/24 " ) } ) ;
Holidays . put ( " Karfreitag " , new Date [ ] { CONFORM . parse ( " 2014/04/18 " ) , CONFORM . parse ( " 2015/04/03 " ) , CONFORM . parse ( " 2016/04/25 " ) } ) ;
Holidays . put ( " Karsamstag " , new Date [ ] { CONFORM . parse ( " 2014/04/19 " ) , CONFORM . parse ( " 2015/04/04 " ) , CONFORM . parse ( " 2016/04/26 " ) } ) ;
Holidays . put ( " Ostersonntag " , new Date [ ] { CONFORM . parse ( " 2014/04/20 " ) , CONFORM . parse ( " 2015/04/05 " ) , CONFORM . parse ( " 2016/04/27 " ) } ) ;
Holidays . put ( " Ostermontag " , new Date [ ] { CONFORM . parse ( " 2014/04/21 " ) , CONFORM . parse ( " 2015/04/06 " ) , CONFORM . parse ( " 2016/04/28 " ) } ) ;
2015-03-02 13:10:05 +01:00
Holidays . put ( " Ostern " , new Date [ ] { CONFORM . parse ( " 2014/04/20 " ) , CONFORM . parse ( " 2015/04/05 " ) , CONFORM . parse ( " 2016/04/27 " ) ,
CONFORM . parse ( " 2014/04/21 " ) , CONFORM . parse ( " 2015/04/06 " ) , CONFORM . parse ( " 2016/04/28 " ) } ) ;
2014-12-14 13:43:30 +01:00
Holidays . put ( " Walpurgisnacht " , sameDayEveryYear ( 4 , 30 ) ) ;
Holidays . put ( " Tag der Arbeit " , sameDayEveryYear ( 5 , 1 ) ) ;
Holidays . put ( " Muttertag " , new Date [ ] { CONFORM . parse ( " 2014/05/11 " ) , CONFORM . parse ( " 2015/05/10 " ) , CONFORM . parse ( " 2016/05/08 " ) } ) ;
Holidays . put ( " Christi Himmelfahrt " , new Date [ ] { CONFORM . parse ( " 2014/05/29 " ) , CONFORM . parse ( " 2015/05/14 " ) , CONFORM . parse ( " 2016/05/05 " ) } ) ;
Holidays . put ( " Pfingstsonntag " , new Date [ ] { CONFORM . parse ( " 2014/06/08 " ) , CONFORM . parse ( " 2015/05/24 " ) , CONFORM . parse ( " 2016/05/15 " ) } ) ;
Holidays . put ( " Pfingstmontag " , new Date [ ] { CONFORM . parse ( " 2014/06/09 " ) , CONFORM . parse ( " 2015/05/25 " ) , CONFORM . parse ( " 2016/05/16 " ) } ) ;
Holidays . put ( " Fronleichnam " , new Date [ ] { CONFORM . parse ( " 2014/06/19 " ) , CONFORM . parse ( " 2015/06/04 " ) , CONFORM . parse ( " 2016/05/25 " ) } ) ;
Holidays . put ( " Mariä Himmelfahrt " , sameDayEveryYear ( 8 , 15 ) ) ;
Holidays . put ( " Tag der Deutschen Einheit " , sameDayEveryYear ( 10 , 3 ) ) ;
Holidays . put ( " Reformationstag " , sameDayEveryYear ( 10 , 31 ) ) ;
Holidays . put ( " Allerheiligen " , sameDayEveryYear ( 11 , 1 ) ) ;
Holidays . put ( " Allerseelen " , sameDayEveryYear ( 11 , 2 ) ) ;
Holidays . put ( " Martinstag " , sameDayEveryYear ( 11 , 11 ) ) ;
Holidays . put ( " St. Martin " , Holidays . get ( " Martinstag " ) ) ;
Holidays . put ( " Volkstrauertag " , new Date [ ] { CONFORM . parse ( " 2014/11/16 " ) , CONFORM . parse ( " 2015/11/15 " ) , CONFORM . parse ( " 2016/11/13 " ) } ) ;
Holidays . put ( " Buß- und Bettag " , new Date [ ] { CONFORM . parse ( " 2014/11/19 " ) , CONFORM . parse ( " 2015/11/18 " ) , CONFORM . parse ( " 2016/11/16 " ) } ) ;
Holidays . put ( " Totensonntag " , new Date [ ] { CONFORM . parse ( " 2014/11/23 " ) , CONFORM . parse ( " 2015/11/22 " ) , CONFORM . parse ( " 2016/11/20 " ) } ) ;
Holidays . put ( " Nikolaus " , sameDayEveryYear ( 12 , 6 ) ) ;
Holidays . put ( " Heiligabend " , sameDayEveryYear ( 12 , 24 ) ) ;
Holidays . put ( " 1. Weihnachtsfeiertag " , sameDayEveryYear ( 12 , 25 ) ) ;
Holidays . put ( " 2. Weihnachtsfeiertag " , sameDayEveryYear ( 12 , 26 ) ) ;
Holidays . put ( " 1. Advent " , new Date [ ] { CONFORM . parse ( " 2014/11/30 " ) , CONFORM . parse ( " 2015/11/29 " ) , CONFORM . parse ( " 2016/11/27 " ) } ) ;
Holidays . put ( " 2. Advent " , new Date [ ] { CONFORM . parse ( " 2014/12/07 " ) , CONFORM . parse ( " 2015/12/06 " ) , CONFORM . parse ( " 2016/12/04 " ) } ) ;
Holidays . put ( " 3. Advent " , new Date [ ] { CONFORM . parse ( " 2014/12/14 " ) , CONFORM . parse ( " 2015/12/13 " ) , CONFORM . parse ( " 2016/12/11 " ) } ) ;
Holidays . put ( " 4. Advent " , new Date [ ] { CONFORM . parse ( " 2014/12/21 " ) , CONFORM . parse ( " 2015/12/20 " ) , CONFORM . parse ( " 2016/12/18 " ) } ) ;
Holidays . put ( " Silvester " , sameDayEveryYear ( 12 , 26 ) ) ;
// English
2015-03-02 13:10:05 +01:00
Holidays . put ( " Eastern " , Holidays . get ( " Ostern " ) ) ;
2014-12-14 13:43:30 +01:00
Holidays . put ( " New Year's Day " , Holidays . get ( " Neujahr " ) ) ;
Holidays . put ( " Epiphany " , Holidays . get ( " Heilige Drei Könige " ) ) ;
Holidays . put ( " Valentine's Day " , Holidays . get ( " Valentinstag " ) ) ;
Holidays . put ( " Orthodox Christmas " , sameDayEveryYear ( 1 , 7 ) ) ;
Holidays . put ( " St. Patrick's Day " , sameDayEveryYear ( 3 , 17 ) ) ;
Holidays . put ( " April Fools' Day " , sameDayEveryYear ( 4 , 1 ) ) ;
Holidays . put ( " Independence Day " , sameDayEveryYear ( 7 , 4 ) ) ;
Holidays . put ( " Halloween " , Holidays . get ( " Reformationstag " ) ) ;
Holidays . put ( " Immaculate Conception of the Virgin Mary " , sameDayEveryYear ( 12 , 8 ) ) ;
Holidays . put ( " Christmas Eve " , Holidays . get ( " Heiligabend " ) ) ;
Holidays . put ( " Christmas Day " , Holidays . get ( " 1. Weihnachtsfeiertag " ) ) ;
Holidays . put ( " Boxing Day " , Holidays . get ( " 2. Weihnachtsfeiertag " ) ) ;
Holidays . put ( " New Year's Eve " , Holidays . get ( " Silvester " ) ) ;
} catch ( ParseException e ) { }
for ( Map . Entry < String , Date [ ] > holiday : Holidays . entrySet ( ) ) {
HolidayPattern . put ( Pattern . compile ( BODNCG + holiday . getKey ( ) + EODNCG ) , holiday . getValue ( ) ) ;
}
}
private static Date [ ] sameDayEveryYear ( int month , int day ) {
Date [ ] r = new Date [ 4 ] ;
String d = " / " + ( month < 10 ? " 0 " + month : month ) + " / " + ( day < 10 ? " 0 " + day : day ) ;
for ( int y = 0 ; y < 4 ; y + + ) try { r [ y ] = CONFORM . parse ( ( CURRENT_YEAR + y - 1 ) + d ) ; } catch ( ParseException e ) { }
return r ;
}
/ * *
* The language recognition subclass understands date description parts in different languages .
* It can also be used to identify the language of a text , if that text uses words from a date vocabulary .
* /
public static class LanguageRecognition {
private final Pattern weekdayMatch , monthMatch ;
private final Set < Language > usedInLanguages ;
private final Map < String , Integer > weekdayIndex , monthIndex , monthIndexAbbrev ;
public LanguageRecognition ( Language [ ] languages ) {
this . usedInLanguages = new HashSet < Language > ( ) ;
// prepare a month index for the languages that this notion supports
this . weekdayIndex = new HashMap < > ( ) ;
this . monthIndex = new HashMap < > ( ) ;
this . monthIndexAbbrev = new HashMap < > ( ) ;
StringBuilder weekdayMatchString = new StringBuilder ( ) ;
StringBuilder monthMatchString = new StringBuilder ( ) ;
for ( Language language : languages ) {
this . usedInLanguages . add ( language ) ;
String [ ] weekdays = Weekdays . get ( language ) ;
if ( weekdays ! = null ) {
assert weekdays . length = = 7 ;
for ( int i = 0 ; i < 7 ; i + + ) {
this . weekdayIndex . put ( weekdays [ i ] , i ) ;
weekdayMatchString . append ( " |(?: " ) . append ( BODNCG ) . append ( weekdays [ i ] ) . append ( SEPARATORNCG ) . append ( EODNCG ) . append ( ')' ) ;
}
}
String [ ] months = Months . get ( language ) ;
if ( months ! = null ) {
assert months . length = = 12 ;
for ( int i = 0 ; i < 12 ; i + + ) {
monthIndex . put ( months [ i ] , i + 1 ) ;
monthMatchString . append ( " |(?: " ) . append ( BODNCG ) . append ( months [ i ] ) . append ( SEPARATORNCG ) . append ( EODNCG ) . append ( ')' ) ;
String abbrev = months [ i ] . substring ( 0 , 3 ) ;
if ( monthIndexAbbrev . containsKey ( abbrev ) & & monthIndexAbbrev . get ( abbrev ) . intValue ( ) ! = i + 1 )
monthIndexAbbrev . put ( abbrev , - 1 ) ; // ambiguous months get a -1
else
monthIndexAbbrev . put ( abbrev , i + 1 ) ;
}
}
}
this . weekdayMatch = Pattern . compile ( weekdayMatchString . length ( ) > 0 ? weekdayMatchString . substring ( 1 ) : " " ) ;
this . monthMatch = Pattern . compile ( monthMatchString . length ( ) > 0 ? monthMatchString . substring ( 1 ) : " " ) ;
}
/ * *
* this is an expensive check that looks if any of the words from the date expressions ( month and weekday expressions )
* appear in the text . This should only be used to verify a parse result if the result was ambiguous
* @param text
* @return true if one of the month and weekday expressions appear in the text
* /
public boolean usesLanguageOfNotion ( String text ) {
return this . weekdayMatch . matcher ( text ) . matches ( ) | | this . monthMatch . matcher ( text ) . matches ( ) ;
}
/ * *
* parse a part of a date
* @param entity
* @param object
* @return a scalar value associated with this date part
* /
public int parseEntity ( EntityType entity , String object ) {
if ( entity = = EntityType . YEAR ) {
try {
int i = Integer . parseInt ( object ) ;
if ( i < 100 ) i + = 2000 ; // yes that makes it possible to parse the years 0-99 and it will be incorrect in the year 2100 when that is abbreviated with 00
if ( i > CURRENT_YEAR + 10 ) return - 1 ; // there are very rarely dates in the future that far
return i ;
} catch ( NumberFormatException e ) {
return - 1 ;
}
}
if ( entity = = EntityType . MONTH ) {
try {
int i = Integer . parseInt ( object ) ;
if ( i > = 1 & & i < = 12 ) return i ;
return - 1 ; // no reason to try in a different way, its just a wrong number
} catch ( NumberFormatException e ) {
// this may be the name of a month
if ( object . length ( ) = = 3 ) {
// try RFC 822 names
object = object . substring ( 0 , 1 ) . toUpperCase ( ) + object . substring ( 1 ) . toLowerCase ( ) ;
try {
Month m = Month . valueOf ( object ) ;
return m . count ;
} catch ( IllegalArgumentException | NoClassDefFoundError ee ) { } // just ignore this, that was just a try to shorten things..
}
// try the collection of names for each language
object = object . toLowerCase ( ) ; // the stored month names are all lowercase
Integer i = this . monthIndex . get ( object ) ;
if ( i ! = null ) return i . intValue ( ) ;
// try an abbreviation
if ( object . length ( ) = = 3 ) {
i = this . monthIndexAbbrev . get ( object . substring ( 0 , 3 ) ) ;
if ( i ! = null ) return i . intValue ( ) ; // may also be -1!
}
return - 1 ;
}
}
if ( entity = = EntityType . DAY ) {
try {
int i = Integer . parseInt ( object ) ;
if ( i < 1 | | i > 31 ) return - 1 ;
return i ;
} catch ( NumberFormatException e ) {
return - 1 ;
}
}
return - 1 ;
}
}
private final static LanguageRecognition ENGLISH_LANGUAGE = new LanguageRecognition ( new Language [ ] { Language . ENGLISH } ) ;
private final static LanguageRecognition GERMAN_LANGUAGE = new LanguageRecognition ( new Language [ ] { Language . GERMAN } ) ;
private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition ( new Language [ ] { Language . FRENCH } ) ;
private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition ( new Language [ ] { Language . GERMAN , Language . ENGLISH } ) ;
private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition ( new Language [ ] { Language . GERMAN , Language . ENGLISH , Language . FRENCH , Language . SPANISH , Language . ITALIAN } ) ;
public static interface StyleParser {
/ * *
* get all dates in the text
* @param text
* @return a set of dates , ordered by occurrence .
* /
public LinkedHashSet < Date > parse ( String text ) ;
}
/ * *
* Regular expressions for various types of date writings .
* Uses terminology and data taken from :
* http : //en.wikipedia.org/wiki/Date_format_by_country
* /
public static enum EndianStyle implements StyleParser {
YMD ( EntityType . YEAR , EntityType . MONTH , EntityType . DAY , // Big-endian (year, month, day), e.g. 1996-04-22
ENGLISH_GERMAN_LANGUAGE , // GERMAN: 'official standard date format', ENGLISH: used in UK
BODNCG + YEARCAPTURE + SEPARATORNCG + MONTHCAPTURE + SEPARATORNCG + DAYCAPTURE + EODNCG
) ,
DMY ( EntityType . DAY , EntityType . MONTH , EntityType . YEAR , // Little-endian (day, month, year), e.g. 22.04.96 or 22/04/96 or 22 April 1996
ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE , // GERMAN: traditional, ENGLISH: used in UK
BODNCG + DAYCAPTURE + SEPARATORNCG + MONTHCAPTURE + SEPARATORNCG + YEARCAPTURE + EODNCG
) ,
MDY ( EntityType . MONTH , EntityType . DAY , EntityType . YEAR , // Middle-endian (month, day, year), e.g. 04/22/96 or April 22, 1996
ENGLISH_LANGUAGE , // ENGLISH: used in USA
BODNCG + MONTHCAPTURE + SEPARATORNCG + DAYCAPTURE + SEPARATORNCG + YEARCAPTURE + EODNCG
) ;
private final Pattern pattern ;
private final EntityType firstEntity , secondEntity , thirdEntity ;
public final LanguageRecognition languageParser ;
EndianStyle ( EntityType firstEntity , EntityType secondEntity , EntityType thirdEntity , LanguageRecognition languageParser , String patternString ) {
this . firstEntity = firstEntity ;
this . secondEntity = secondEntity ;
this . thirdEntity = thirdEntity ;
this . pattern = Pattern . compile ( patternString ) ;
this . languageParser = languageParser ;
}
/ * *
* get all dates in the text
* @param text
* @return a set of dates , ordered by occurrence .
* /
@Override
public LinkedHashSet < Date > parse ( final String text ) {
LinkedHashSet < Date > dates = new LinkedHashSet < > ( ) ;
Matcher matcher = this . pattern . matcher ( text ) ;
while ( matcher . find ( ) ) {
if ( ! ( matcher . groupCount ( ) = = 3 ) ) continue ;
String entity1 = matcher . group ( 1 ) ; if ( entity1 = = null ) continue ;
String entity2 = matcher . group ( 2 ) ; if ( entity2 = = null ) continue ;
String entity3 = matcher . group ( 3 ) ; if ( entity3 = = null ) continue ;
//System.out.println("FRAGMENTS: entity1=" + entity1 + ", entity2=" + entity2 + ", entity3=" + entity3); // DEBUG
int i1 = languageParser . parseEntity ( this . firstEntity , entity1 ) ;
if ( i1 < 0 ) continue ;
int i2 = languageParser . parseEntity ( this . secondEntity , entity2 ) ;
if ( i2 < 0 ) continue ;
int i3 = languageParser . parseEntity ( this . thirdEntity , entity3 ) ;
if ( i3 < 0 ) continue ;
int day = this . firstEntity = = EntityType . DAY ? i1 : this . secondEntity = = EntityType . DAY ? i2 : i3 ;
int month = this . firstEntity = = EntityType . MONTH ? i1 : this . secondEntity = = EntityType . MONTH ? i2 : i3 ;
if ( day > MaxDaysInMonth [ month - 1 ] ) continue ; // validity check of the day number
int year = this . firstEntity = = EntityType . YEAR ? i1 : this . secondEntity = = EntityType . YEAR ? i2 : i3 ;
synchronized ( CONFORM ) { try {
dates . add ( CONFORM . parse ( year + " / " + ( month < 10 ? " 0 " : " " ) + month + " / " + ( day < 10 ? " 0 " : " " ) + day ) ) ;
} catch ( ParseException e ) {
continue ;
} }
if ( dates . size ( ) > 100 ) { dates . clear ( ) ; break ; } // that does not make sense
}
return dates ;
}
}
public static enum ShortStyle implements StyleParser {
MD_ENGLISH ( EntityType . MONTH , EntityType . DAY , // Big-endian (month, day), e.g. "from october 1st to september 13th"
ENGLISH_LANGUAGE ,
BODNCG + " on " + MONTHCAPTURE + SEPARATORNCG + DAYCAPTURE + EODNCG
) ,
DM_GERMAN ( EntityType . DAY , EntityType . MONTH , // Little-endian (day, month), e.g. "am 1. April"
GERMAN_LANGUAGE ,
BODNCG + " am " + DAYCAPTURE + SEPARATORNCG + MONTHCAPTURE + EODNCG
) ,
DM_FRENCH ( EntityType . DAY , EntityType . MONTH , // Little-endian (day, month), e.g. "le 29 Septembre,"
FRENCH_LANGUAGE ,
BODNCG + " le " + DAYCAPTURE + " " + MONTHCAPTURE + EODNCG
) ,
DM_ITALIAN ( EntityType . DAY , EntityType . MONTH , // Little-endian (day, month), e.g. "il 29 settembre,"
FRENCH_LANGUAGE ,
BODNCG + " il " + DAYCAPTURE + " " + MONTHCAPTURE + EODNCG
) ,
DM_SPANISH ( EntityType . DAY , EntityType . MONTH , // Little-endian (day, month), e.g. "el 29 de septiembre,"
FRENCH_LANGUAGE ,
BODNCG + " el " + DAYCAPTURE + " de " + MONTHCAPTURE + EODNCG
) ;
public final Pattern pattern ;
private final EntityType firstEntity , secondEntity ;
public final LanguageRecognition languageParser ;
ShortStyle ( EntityType firstEntity , EntityType secondEntity , LanguageRecognition languageParser , String patternString ) {
this . firstEntity = firstEntity ;
this . secondEntity = secondEntity ;
this . pattern = Pattern . compile ( patternString ) ;
this . languageParser = languageParser ;
}
/ * *
* get all dates in the text
* @param text
* @return a set of dates , ordered by occurrence .
* /
@Override
public LinkedHashSet < Date > parse ( final String text ) {
LinkedHashSet < Date > dates = new LinkedHashSet < > ( ) ;
Matcher matcher = this . pattern . matcher ( text ) ;
2015-02-09 18:46:06 +01:00
//ConcurrentLog.info("DateDetection", "applying matcher: " + matcher.toString());
2014-12-14 13:43:30 +01:00
while ( matcher . find ( ) ) {
if ( ! ( matcher . groupCount ( ) = = 2 ) ) continue ;
String entity1 = matcher . group ( 1 ) ; if ( entity1 = = null ) continue ;
String entity2 = matcher . group ( 2 ) ; if ( entity2 = = null ) continue ;
//System.out.println("FRAGMENTS: entity1=" + entity1 + ", entity2=" + entity2 + ", entity3=" + entity3); // DEBUG
int i1 = languageParser . parseEntity ( this . firstEntity , entity1 ) ;
if ( i1 < 0 ) continue ;
int i2 = languageParser . parseEntity ( this . secondEntity , entity2 ) ;
if ( i2 < 0 ) continue ;
int day = this . firstEntity = = EntityType . DAY ? i1 : i2 ;
int month = this . firstEntity = = EntityType . MONTH ? i1 : i2 ;
if ( day > MaxDaysInMonth [ month - 1 ] ) continue ; // validity check of the day number
int thisyear = CURRENT_YEAR ;
int nextyear = CURRENT_YEAR + 1 ;
synchronized ( CONFORM ) { try {
String datestub = " / " + ( month < 10 ? " 0 " : " " ) + month + " / " + ( day < 10 ? " 0 " : " " ) + day ;
Date atThisYear = CONFORM . parse ( thisyear + datestub ) ;
Date atNextYear = CONFORM . parse ( nextyear + datestub ) ;
dates . add ( atThisYear ) ;
dates . add ( atNextYear ) ;
//dates.add(atThisYear.after(TODAY) ? atThisYear : atNextYear); // we consider these kind of dates as given for the future
} catch ( ParseException e ) {
continue ;
} }
if ( dates . size ( ) > 100 ) { dates . clear ( ) ; break ; } // that does not make sense
}
return dates ;
}
}
2015-03-02 13:10:05 +01:00
private static final HashMap < String , Long > specialDayOffset = new HashMap < > ( ) ;
static {
specialDayOffset . put ( " today " , 0L ) ; specialDayOffset . put ( " heute " , 0L ) ;
specialDayOffset . put ( " tomorrow " , AbstractFormatter . dayMillis ) ; specialDayOffset . put ( " morgen " , AbstractFormatter . dayMillis ) ;
specialDayOffset . put ( " dayaftertomorrow " , 2 * AbstractFormatter . dayMillis ) ; specialDayOffset . put ( " uebermorgen " , 2 * AbstractFormatter . dayMillis ) ;
specialDayOffset . put ( " yesterday " , - AbstractFormatter . dayMillis ) ; specialDayOffset . put ( " gestern " , - AbstractFormatter . dayMillis ) ;
}
2014-12-14 13:43:30 +01:00
/ * *
* get all dates in the text
* @param text
* @return a set of dates , ordered by time . first date in the ordered set is the oldest time .
* /
public static LinkedHashSet < Date > parse ( String text ) {
2015-03-02 13:10:05 +01:00
Long offset ;
if ( ( offset = specialDayOffset . get ( text ) ) ! = null ) {
LinkedHashSet < Date > dates = new LinkedHashSet < > ( ) ; dates . add ( new Date ( ( System . currentTimeMillis ( ) / AbstractFormatter . dayMillis ) * AbstractFormatter . dayMillis + offset . longValue ( ) ) ) ; return dates ;
}
2014-12-14 13:43:30 +01:00
LinkedHashSet < Date > dates = parseRawDate ( text ) ;
for ( Map . Entry < Pattern , Date [ ] > entry : HolidayPattern . entrySet ( ) ) {
if ( entry . getKey ( ) . matcher ( text ) . matches ( ) ) {
for ( Date d : entry . getValue ( ) ) dates . add ( d ) ;
}
}
return dates ;
}
2014-12-16 13:53:12 +01:00
public static Date parseLine ( String text ) {
Date d = null ;
try { d = CONFORM . parse ( text ) ; } catch ( ParseException e ) { }
2015-03-02 04:30:10 +01:00
//if (d == null) try {d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);} catch (ParseException e) {} // did not work well and fired for wrong formats; do not use
2014-12-16 13:53:12 +01:00
if ( d = = null ) try { d = GenericFormatter . FORMAT_RFC1123_SHORT . parse ( text ) ; } catch ( ParseException e ) { }
if ( d = = null ) try { d = GenericFormatter . FORMAT_ANSIC . parse ( text ) ; } catch ( ParseException e ) { }
if ( d = = null ) {
Set < Date > dd = parse ( text ) ;
if ( dd . size ( ) > = 1 ) d = dd . iterator ( ) . next ( ) ;
}
return d ;
}
2014-12-14 13:43:30 +01:00
private static LinkedHashSet < Date > parseRawDate ( String text ) {
// get parse alternatives for different date styles; we consider that one document uses only one style
LinkedHashSet < Date > DMYDates = EndianStyle . DMY . parse ( text ) ;
2015-02-09 18:46:06 +01:00
ShortStyle [ ] shortStyleCheck = new ShortStyle [ ] { ShortStyle . DM_GERMAN , ShortStyle . DM_FRENCH , ShortStyle . DM_ITALIAN , ShortStyle . DM_SPANISH } ;
LinkedHashSet < Date > DMDates = new LinkedHashSet < > ( ) ;
for ( ShortStyle shortStyle : shortStyleCheck ) {
DMDates . addAll ( shortStyle . parse ( text ) ) ;
if ( DMDates . size ( ) > 0 ) break ;
}
2014-12-14 13:43:30 +01:00
DMYDates . addAll ( DMDates ) ;
LinkedHashSet < Date > MDYDates = DMYDates . size ( ) = = 0 ? EndianStyle . MDY . parse ( text ) : new LinkedHashSet < Date > ( 0 ) ;
LinkedHashSet < Date > MDDates = DMYDates . size ( ) = = 0 ? ShortStyle . MD_ENGLISH . parse ( text ) : new LinkedHashSet < Date > ( 0 ) ;
MDYDates . addAll ( MDDates ) ;
LinkedHashSet < Date > YMDDates = DMYDates . size ( ) = = 0 & & MDYDates . size ( ) = = 0 ? EndianStyle . YMD . parse ( text ) : new LinkedHashSet < Date > ( 0 ) ;
// if either one of them contains any and the other contain no date, chose that one (we don't want to mix them)
if ( YMDDates . size ( ) > 0 & & DMYDates . size ( ) = = 0 & & MDYDates . size ( ) = = 0 ) return YMDDates ;
if ( YMDDates . size ( ) = = 0 & & DMYDates . size ( ) > 0 & & MDYDates . size ( ) = = 0 ) return DMYDates ;
if ( YMDDates . size ( ) = = 0 & & DMYDates . size ( ) = = 0 & & MDYDates . size ( ) > 0 ) return MDYDates ;
// if we have several sets, check if we can detect the language from month or weekday expressions
// we sort out such sets, which do not contain any of these languages
boolean usesLanguageOfYMD = YMDDates . size ( ) > 0 ? false : EndianStyle . YMD . languageParser . usesLanguageOfNotion ( text ) ;
boolean usesLanguageOfDMY = DMYDates . size ( ) > 0 ? false : EndianStyle . DMY . languageParser . usesLanguageOfNotion ( text ) ;
boolean usesLanguageOfMDY = MDYDates . size ( ) > 0 ? false : EndianStyle . MDY . languageParser . usesLanguageOfNotion ( text ) ;
// now check again
if ( usesLanguageOfYMD & & ! usesLanguageOfDMY & & ! usesLanguageOfMDY ) return YMDDates ;
if ( ! usesLanguageOfYMD & & usesLanguageOfDMY & & ! usesLanguageOfMDY ) return DMYDates ;
if ( ! usesLanguageOfYMD & & ! usesLanguageOfDMY & & usesLanguageOfMDY ) return MDYDates ;
// if this fails, we return only the DMY format since that has the most chances to be right (it is mostly used)
// we choose DMYDates even if it is empty to avoid false positives.
return DMYDates ;
}
public static void main ( String [ ] args ) {
2015-02-25 01:05:46 +01:00
String fill = " " ; for ( int i = 0 ; i < 1000 ; i + + ) fill + = 'x' ;
2014-12-14 13:43:30 +01:00
String [ ] test = new String [ ] {
" \ n laden die Stadtwerke \ n X am Rosenmontag und am \ n Faschingsdienstag zur Disko auf die \ n " ,
" kein Datum im Text " ,
" Fastnacht am 4. März noch " ,
" Fastnacht am 4. April noch " ,
" heute 12. Dezember 2014. " ,
" heute 12. Dezember 2014 " ,
" 12. Dezember 2014. " ,
" heute 12. Dezember 2014 " ,
" heute 12. Dezember 2014. " ,
" Donnerstag, 18. Dezember 2014 xyz " ,
" Donnerstag, 18 Dezember 2014 xyz " ,
" Donnerstag, 18.Dezember 2014 xyz " ,
" Montag, 8. Dezember 2014 xyz " ,
" Montag, 8.Dezember 2014 xyz " ,
" Donnerstag, 18.12.2014 xyz " ,
" Montag, 8.12.2014 xyz " ,
" Donnerstag, 18.12.14 xyz " ,
" Montag, 8.12.14 xyz " ,
" Mitglied seit: 13. Januar 2007 xyz " ,
" Im Dezember 2014 xyz " ,
" 11.12.2014 " ,
" 11. September 2001 " ,
" 12.12.2014 08:43 " ,
" immer am 1. Dezember abends " ,
" immer am 31. Dezember abends " ,
" immer am 31. dezember abends " ,
" on october 20 every year " ,
" on october 20 every year " ,
" on September 29, " ,
" am Karfreitag um 15:00 Uhr "
} ;
long t = System . currentTimeMillis ( ) ;
for ( String s : test ) {
2015-02-25 01:05:46 +01:00
String parsed = parse ( fill + " " + s + " " + fill ) . toString ( ) ;
2014-12-14 13:43:30 +01:00
System . out . println ( " SOURCE: " + s ) ;
2015-02-25 01:05:46 +01:00
System . out . println ( " DATE : " + parsed ) ;
2014-12-14 13:43:30 +01:00
System . out . println ( ) ;
}
System . out . println ( " Runtime: " + ( System . currentTimeMillis ( ) - t ) + " milliseconds. " ) ;
}
}