FreeLing
4.0
|
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2014 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU Affero General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // Affero General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU Affero General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. SPAIN 00026 // 00028 00029 #ifndef _DATES_MOD 00030 #define _DATES_MOD 00031 00032 #include <map> 00033 00034 #include "freeling/regexp.h" 00035 #include "freeling/morfo/language.h" 00036 #include "freeling/morfo/automat.h" 00037 00038 namespace freeling { 00039 00040 // Date/time regular expressions definitions 00041 00042 const std::wstring RE_ROMAN=L"^([IVXLCDM]+)$"; 00043 00044 // Default: 00045 const std::wstring RE_DATE_DF=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d)))/)(\\d{1,4}))$"; 00046 const std::wstring RE_TIME1_DF=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:min|m)?)?)$"; 00047 const std::wstring RE_TIME2_DF=L"^(?:((?:[0-5])?(?:\\d))(?:min\\.?|m\\.?))$"; 00048 00049 // Spanish: 00050 const std::wstring RE_DATE_ES=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)/)(\\d{1,4}))$"; 00051 const std::wstring RE_TIME1_ES=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00052 const std::wstring RE_TIME2_ES=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00053 00054 // Catalan: 00055 const std::wstring RE_DATE_CA=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|gener|febrer|març|abril|maig|juny|juliol|agost|setembre|octubre|novembre|desembre|gen|feb|mar|abr|mai|jun|jul|ago|set|oct|nov|des)/)(\\d{1,4}))$"; 00056 const std::wstring RE_TIME1_CA=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuts|min|m)?)?)$"; 00057 const std::wstring RE_TIME2_CA=L"^(?:((?:[0-5])?(?:\\d))(?:minuts|min\\.?|m\\.?))$"; 00058 00059 // English: 00060 const std::wstring RE_DATE_EN=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/)(\\d{1,4}))$"; 00061 const std::wstring RE_TIME1_EN=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutes|min|m)?)?)$"; 00062 const std::wstring RE_TIME2_EN=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$"; 00063 00064 // Galician: 00065 const std::wstring RE_DATE_GL=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|xaneiro|febreiro|marzo|abril|maio|xuño|xullo|agosto|setembro|outubro|novembro|decembro|xan|feb|mar|abr|mai|xuñ|xul|ago|set|out|nov|dec)/)(\\d{1,4}))$"; 00066 const std::wstring RE_TIME1_GL=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00067 const std::wstring RE_TIME2_GL=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00068 00069 // Portuguese: 00070 const std::wstring RE_DATE_PT=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro|jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)/)(\\d{1,4}))$"; 00071 const std::wstring RE_TIME1_PT=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00072 const std::wstring RE_TIME2_PT=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00073 00074 // Russian: 00075 const std::wstring RE_DATE_RU=L"^([0]?[1-9]|[1|2][0-9]|[3][0|1])[./]([0]?[1-9]|[1][0-2])[./]([0-9]{4}|[0-9]{2})$"; 00076 const std::wstring RE_TIME_RU=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:ч\\.?|:)(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?)?)?)$"; 00077 const std::wstring RE_MINUTES_RU=L"^(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?))$"; 00078 00079 // French: 00080 const std::wstring RE_DATE_FR=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janvier|février|fevrier|mars|avril|mai|juin|juillet|aout|août|septembre|octobre|novembre|décembre|decembre|fév|fev|mar|avr|juil|aou|sep|oct|nov|déc|dec)/)(\\d{1,4}))$"; 00081 const std::wstring RE_TIME1_FR=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00082 const std::wstring RE_TIME2_FR=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$"; 00083 00084 // Welsh: 00085 const std::wstring RE_DATE_CY=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|ionawr|chwefror|[mf]awrth|ebrill|[fm]ai|[fm]ehefin|n?gorffennaf|orffennaf|awst|[mf]edi|hydref|th?achwedd|dachwedd|nhachwedd|dd?u|ion|chwe|maw|ebr|meh|gorff|hyd|tach)/)(\\d{1,4}))$"; 00086 const std::wstring RE_TIME1_CY=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:munud|min|m)?)?)$"; 00087 const std::wstring RE_TIME2_CY=L"^(?:((?:[0-5])?(?:\\d))(?:munud|mun\\.?|m\\.?))$"; 00088 00089 // German: 00090 const std::wstring RE_DATE_DE=L"^(?:(?:((?:[0-3])?(?:\\d))[/\\.] ?)(?:((?:(?:[0-1])?(?:\\d))|[Jj]anuar|[Ff]ebruar|[Mm]ärz|[Aa]pril|[Mm]ai|[jJ]uni|[jJ]uli|[aA]ugust|[Ss]eptember|[oO]ktober|[nN]ovember|[Dd]ezember|jan|feb|märz|apr|mai|jun|jul|aug|sep|okt|nov|dez)[/\\.] ?)(\\d{1,4}))$"; 00091 const std::wstring RE_TIME1_DE=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuten|min|m)?)?)$"; 00092 const std::wstring RE_TIME2_DE=L"^(?:((?:[0-5])?(?:\\d))(?:minuten|min\\.?|m\\.?))$"; 00093 00094 00095 // Value of unspecified fields in normalized date 00096 const std::wstring UNKNOWN_SYMB = L"??"; 00097 00101 00102 class dates_status : public automat_status { 00103 public: 00105 std::wstring century,year,month,day,weekday,hour,minute,meridian; 00107 int temp; 00108 int sign; // for Catalan "un quart menys(-1)/i(1) cinc de sis" or 00109 // for English: a quarter to(-1)/past(1) five. 00110 00111 int daytemp; // for special state Gbb in English 00112 bool inGbb; 00113 00114 std::vector<std::wstring> rem; // remember results of last matched RegEx 00115 }; 00116 00121 00122 class dates_module: public automat<dates_status> { 00123 00124 protected: 00126 std::map<std::wstring,int> nMes; 00128 std::map<std::wstring,std::wstring> nDia; 00130 std::map<std::wstring,int> tok; 00131 00132 // required regular expressions objects 00133 freeling::regexp RE_Date; 00134 freeling::regexp RE_Time1; 00135 freeling::regexp RE_Time2; 00136 freeling::regexp RE_Roman; 00137 00138 // to unify notation (01 -> 1), maybe adding an offset 00139 std::wstring normalize(const std::wstring &in, int offs=0) const; 00140 00141 private: 00142 virtual void ResetActions(dates_status *) const; 00143 00144 public: 00146 dates_module(const std::wstring &, const std::wstring &, const std::wstring &, const std::wstring &); 00147 virtual ~dates_module() {} 00148 }; 00149 00150 00155 00156 class dates_default : public dates_module { 00157 00158 private: 00159 int ComputeToken(int, sentence::iterator &, sentence &) const; 00160 void StateActions(int, int, int, sentence::const_iterator, dates_status *) const; 00161 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status *) const; 00162 00163 public: 00165 dates_default(); 00166 }; 00167 00172 00173 class dates_es : public dates_module { 00174 00175 private: 00176 int ComputeToken(int, sentence::iterator &, sentence &) const; 00177 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00178 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00179 00180 public: 00182 dates_es(); 00183 }; 00184 00185 00190 00191 class dates_ca : public dates_module { 00192 00193 private: 00194 int ComputeToken(int, sentence::iterator &, sentence &) const; 00195 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00196 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00197 00198 public: 00200 dates_ca(); 00201 }; 00202 00207 00208 class dates_gl : public dates_module { 00209 00210 private: 00211 int ComputeToken(int, sentence::iterator &, sentence &) const; 00212 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00213 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00214 00215 public: 00217 dates_gl(); 00218 }; 00219 00224 00225 class dates_pt : public dates_module { 00226 00227 private: 00228 int ComputeToken(int, sentence::iterator &, sentence &) const; 00229 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00230 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00231 00232 public: 00234 dates_pt(); 00235 }; 00236 00237 00242 00243 class dates_en : public dates_module { 00244 00245 private: 00246 int ComputeToken(int, sentence::iterator &, sentence &) const; 00247 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00248 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00249 00251 std::map<std::wstring,int> numDay; 00252 00253 public: 00255 dates_en(); 00256 }; 00257 00262 00263 class dates_ru : public dates_module { 00264 00265 private: 00266 int ComputeToken(int, sentence::iterator &, sentence &) const; 00267 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00268 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00269 00270 int GetPrevStateValue(dates_status *) const; 00271 void SetPrevStateValue(int, dates_status *) const; 00272 00273 public: 00275 dates_ru(); 00276 }; 00277 00282 00283 #define FRDEBUG 00284 class dates_fr : public dates_module { 00285 00286 private: 00287 00288 int ComputeToken(int, sentence::iterator &, sentence &) const; 00289 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00290 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00291 // for tracing 00292 #ifdef FRDEBUG 00293 std::map<int, std::wstring> stateNames; 00294 std::map<int, std::wstring> tokenNames; 00295 std::wstring tokenName(const int token) const; 00296 std::wstring stateName(const int state) const; 00297 #endif 00298 public: 00300 dates_fr(); 00301 }; 00302 00303 00308 00309 #define DEDEBUG 00310 class dates_de : public dates_module { 00311 00312 private: 00313 00314 int ComputeToken(int, sentence::iterator &, sentence &) const; 00315 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00316 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00317 00319 std::map<std::wstring,int> nNumbers; 00321 00322 mutable int lastValue; 00323 00324 // for tracing 00325 #ifdef DEDEBUG 00326 std::map<int, std::wstring> stateNames; 00327 std::map<int, std::wstring> tokenNames; 00328 std::wstring tokenName(const int token) const; 00329 std::wstring stateName(const int state) const; 00330 #endif 00331 public: 00333 dates_de(); 00334 }; 00335 00336 00341 00342 #define CYDEBUG 00343 class dates_cy : public dates_module { 00344 00345 private: 00346 00347 int ComputeToken(int, sentence::iterator &, sentence &) const; 00348 void StateActions(int, int, int, sentence::const_iterator, dates_status*) const; 00349 void SetMultiwordAnalysis(sentence::iterator, int, const dates_status*) const; 00350 // for tracing 00351 #ifdef CYDEBUG 00352 std::map<int, std::wstring> stateNames; 00353 std::map<int, std::wstring> tokenNames; 00354 std::wstring tokenName(const int token) const; 00355 std::wstring stateName(const int state) const; 00356 #endif 00357 public: 00359 dates_cy(); 00360 }; 00361 00362 } // namespace 00363 00364 #endif 00365