FreeLing  4.0
dates_modules.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU Affero General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    Affero General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU Affero General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _DATES_MOD
00030 #define _DATES_MOD
00031 
00032 #include <map>
00033 
00034 #include "freeling/regexp.h"
00035 #include "freeling/morfo/language.h"
00036 #include "freeling/morfo/automat.h"
00037 
00038 namespace freeling {
00039 
00040   // Date/time regular expressions definitions
00041 
00042   const std::wstring RE_ROMAN=L"^([IVXLCDM]+)$";
00043 
00044   // Default:
00045   const std::wstring RE_DATE_DF=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d)))/)(\\d{1,4}))$";
00046   const std::wstring RE_TIME1_DF=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:min|m)?)?)$";
00047   const std::wstring RE_TIME2_DF=L"^(?:((?:[0-5])?(?:\\d))(?:min\\.?|m\\.?))$";
00048 
00049   // Spanish:
00050   const std::wstring RE_DATE_ES=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)/)(\\d{1,4}))$";
00051   const std::wstring RE_TIME1_ES=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
00052   const std::wstring RE_TIME2_ES=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00053 
00054   // Catalan:
00055   const std::wstring RE_DATE_CA=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|gener|febrer|març|abril|maig|juny|juliol|agost|setembre|octubre|novembre|desembre|gen|feb|mar|abr|mai|jun|jul|ago|set|oct|nov|des)/)(\\d{1,4}))$";
00056   const std::wstring RE_TIME1_CA=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuts|min|m)?)?)$";
00057   const std::wstring RE_TIME2_CA=L"^(?:((?:[0-5])?(?:\\d))(?:minuts|min\\.?|m\\.?))$";
00058 
00059   // English:
00060   const std::wstring RE_DATE_EN=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/)(\\d{1,4}))$";
00061   const std::wstring RE_TIME1_EN=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutes|min|m)?)?)$";
00062   const std::wstring RE_TIME2_EN=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$";
00063 
00064   // Galician:
00065   const std::wstring RE_DATE_GL=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|xaneiro|febreiro|marzo|abril|maio|xuño|xullo|agosto|setembro|outubro|novembro|decembro|xan|feb|mar|abr|mai|xuñ|xul|ago|set|out|nov|dec)/)(\\d{1,4}))$";
00066   const std::wstring RE_TIME1_GL=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
00067   const std::wstring RE_TIME2_GL=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00068 
00069   // Portuguese:
00070   const std::wstring RE_DATE_PT=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro|jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)/)(\\d{1,4}))$";
00071   const std::wstring RE_TIME1_PT=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
00072   const std::wstring RE_TIME2_PT=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00073 
00074   // Russian:
00075   const std::wstring RE_DATE_RU=L"^([0]?[1-9]|[1|2][0-9]|[3][0|1])[./]([0]?[1-9]|[1][0-2])[./]([0-9]{4}|[0-9]{2})$";
00076   const std::wstring RE_TIME_RU=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:ч\\.?|:)(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?)?)?)$";
00077   const std::wstring RE_MINUTES_RU=L"^(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?))$";
00078 
00079   // French:
00080   const std::wstring RE_DATE_FR=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janvier|février|fevrier|mars|avril|mai|juin|juillet|aout|août|septembre|octobre|novembre|décembre|decembre|fév|fev|mar|avr|juil|aou|sep|oct|nov|déc|dec)/)(\\d{1,4}))$";
00081   const std::wstring RE_TIME1_FR=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
00082   const std::wstring RE_TIME2_FR=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$";
00083 
00084 // Welsh:
00085   const std::wstring RE_DATE_CY=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|ionawr|chwefror|[mf]awrth|ebrill|[fm]ai|[fm]ehefin|n?gorffennaf|orffennaf|awst|[mf]edi|hydref|th?achwedd|dachwedd|nhachwedd|dd?u|ion|chwe|maw|ebr|meh|gorff|hyd|tach)/)(\\d{1,4}))$";
00086   const std::wstring RE_TIME1_CY=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:munud|min|m)?)?)$";
00087   const std::wstring RE_TIME2_CY=L"^(?:((?:[0-5])?(?:\\d))(?:munud|mun\\.?|m\\.?))$";
00088 
00089  // German:
00090   const std::wstring RE_DATE_DE=L"^(?:(?:((?:[0-3])?(?:\\d))[/\\.] ?)(?:((?:(?:[0-1])?(?:\\d))|[Jj]anuar|[Ff]ebruar|[Mm]ärz|[Aa]pril|[Mm]ai|[jJ]uni|[jJ]uli|[aA]ugust|[Ss]eptember|[oO]ktober|[nN]ovember|[Dd]ezember|jan|feb|märz|apr|mai|jun|jul|aug|sep|okt|nov|dez)[/\\.] ?)(\\d{1,4}))$";
00091   const std::wstring RE_TIME1_DE=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuten|min|m)?)?)$";
00092   const std::wstring RE_TIME2_DE=L"^(?:((?:[0-5])?(?:\\d))(?:minuten|min\\.?|m\\.?))$";
00093 
00094 
00095   // Value of unspecified fields in normalized date
00096   const std::wstring UNKNOWN_SYMB = L"??";
00097 
00101 
00102   class dates_status : public automat_status {
00103   public:
00105     std::wstring century,year,month,day,weekday,hour,minute,meridian;
00107     int temp;
00108     int sign;  // for Catalan "un quart menys(-1)/i(1) cinc de sis" or 
00109     // for English: a quarter to(-1)/past(1) five.
00110 
00111     int daytemp; // for special state Gbb in English
00112     bool inGbb; 
00113 
00114     std::vector<std::wstring> rem;  // remember results of last matched RegEx
00115   };
00116 
00121 
00122   class dates_module: public automat<dates_status> {
00123 
00124   protected:
00126     std::map<std::wstring,int> nMes;
00128     std::map<std::wstring,std::wstring> nDia;
00130     std::map<std::wstring,int> tok;
00131 
00132     // required regular expressions objects
00133     freeling::regexp RE_Date;
00134     freeling::regexp RE_Time1;
00135     freeling::regexp RE_Time2;
00136     freeling::regexp RE_Roman;
00137 
00138     // to unify notation (01 -> 1), maybe adding an offset
00139     std::wstring normalize(const std::wstring &in, int offs=0) const;
00140 
00141   private:
00142     virtual void ResetActions(dates_status *) const;
00143 
00144   public:
00146     dates_module(const std::wstring &, const std::wstring &, const std::wstring &, const std::wstring &); 
00147     virtual ~dates_module() {}
00148   };
00149 
00150 
00155 
00156   class dates_default : public dates_module {
00157 
00158   private:
00159     int ComputeToken(int, sentence::iterator &, sentence &) const;
00160     void StateActions(int, int, int, sentence::const_iterator, dates_status *) const;
00161     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status *) const;
00162 
00163   public:
00165     dates_default();
00166   };
00167 
00172 
00173   class dates_es : public dates_module {
00174 
00175   private:
00176     int ComputeToken(int, sentence::iterator &, sentence &) const;
00177     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00178     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00179 
00180   public:
00182     dates_es();
00183   };
00184 
00185 
00190 
00191   class dates_ca : public dates_module {
00192 
00193   private:
00194     int ComputeToken(int, sentence::iterator &, sentence &) const;
00195     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00196     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00197 
00198   public:
00200     dates_ca();
00201   };
00202 
00207 
00208   class dates_gl : public dates_module {
00209 
00210   private:
00211     int ComputeToken(int, sentence::iterator &, sentence &) const;
00212     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00213     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00214 
00215   public:
00217     dates_gl();
00218   };
00219 
00224 
00225   class dates_pt : public dates_module {
00226 
00227   private:
00228     int ComputeToken(int, sentence::iterator &, sentence &) const;
00229     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00230     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00231 
00232   public:
00234     dates_pt();
00235   };
00236 
00237 
00242 
00243   class dates_en : public dates_module {
00244 
00245   private:
00246     int ComputeToken(int, sentence::iterator &, sentence &) const;
00247     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00248     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00249 
00251     std::map<std::wstring,int> numDay;
00252 
00253   public:
00255     dates_en();
00256   };
00257 
00262 
00263   class dates_ru : public dates_module {
00264 
00265   private:
00266     int ComputeToken(int, sentence::iterator &, sentence &) const;
00267     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00268     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00269 
00270     int GetPrevStateValue(dates_status *) const;
00271     void SetPrevStateValue(int, dates_status *) const;
00272 
00273   public:
00275     dates_ru();
00276   };
00277 
00282 
00283 #define FRDEBUG
00284   class dates_fr : public dates_module {
00285 
00286   private:
00287 
00288     int ComputeToken(int, sentence::iterator &, sentence &) const;
00289     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00290     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00291    // for tracing
00292 #ifdef FRDEBUG
00293    std::map<int, std::wstring> stateNames;
00294    std::map<int, std::wstring> tokenNames;
00295    std::wstring tokenName(const int token) const;
00296    std::wstring stateName(const int state) const;
00297 #endif
00298   public:
00300     dates_fr();
00301   };
00302 
00303 
00308 
00309 #define DEDEBUG
00310   class dates_de : public dates_module {
00311 
00312   private:
00313 
00314     int ComputeToken(int, sentence::iterator &, sentence &) const;
00315     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00316     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00317 
00319     std::map<std::wstring,int> nNumbers;
00321 
00322     mutable int lastValue;
00323 
00324    // for tracing
00325 #ifdef DEDEBUG
00326    std::map<int, std::wstring> stateNames;
00327    std::map<int, std::wstring> tokenNames;
00328    std::wstring tokenName(const int token) const;
00329    std::wstring stateName(const int state) const;
00330 #endif
00331   public:
00333     dates_de();
00334   };
00335 
00336 
00341 
00342   #define CYDEBUG
00343   class dates_cy : public dates_module {
00344 
00345   private:
00346 
00347     int ComputeToken(int, sentence::iterator &, sentence &) const;
00348     void StateActions(int, int, int, sentence::const_iterator, dates_status*) const;
00349     void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const;
00350    // for tracing
00351 #ifdef CYDEBUG
00352    std::map<int, std::wstring> stateNames;
00353    std::map<int, std::wstring> tokenNames;
00354    std::wstring tokenName(const int token) const;
00355    std::wstring stateName(const int state) const;
00356 #endif
00357   public:
00359     dates_cy();
00360   };
00361 
00362 } // namespace
00363 
00364 #endif
00365