FreeLing
4.0
|
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2014 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU Affero General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // Affero General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU Affero General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. SPAIN 00026 // 00028 00029 #ifndef _AUTOMAT 00030 #define _AUTOMAT 00031 00032 #include <set> 00033 00034 #include "freeling/windll.h" 00035 #include "freeling/morfo/language.h" 00036 #include "freeling/morfo/processor.h" 00037 #include "freeling/morfo/traces.h" 00038 00039 namespace freeling { 00040 00041 #define MAX_STATES 100 00042 #define MAX_TOKENS 50 00043 00044 #define MOD_TRACENAME L"AUTOMAT" 00045 #define MOD_TRACECODE AUTOMAT_TRACE 00046 00050 00051 class automat_status : public processor_status { 00052 public: 00053 // shift beggining of multiword by N words (in_1982 -> 1982) 00054 int shiftbegin; 00055 }; 00056 00103 00104 template <class T> 00105 class WINDLL automat : public processor { 00106 private: 00109 virtual int ComputeToken(int, sentence::iterator &, sentence &) const =0; 00112 virtual void ResetActions(T *) const =0; 00116 virtual void StateActions(int, int, int, sentence::const_iterator, T *) const =0; 00119 virtual void SetMultiwordAnalysis(sentence::iterator, int, const T *) const =0; 00122 virtual bool ValidMultiWord(const word &w, T *st) const { return(true); } 00123 00125 virtual sentence::iterator BuildMultiword(sentence &se, sentence::iterator start, sentence::iterator end, int fs, bool &built, T *st) const { 00126 sentence::iterator i; 00127 std::list<word> mw; 00128 std::wstring form; 00129 00130 TRACE(3,L"Building multiword"); 00131 00132 // ignore initial tokens, if needed (e.g. in_1982 -> 1982) 00133 for (int i=0; i<((automat_status*)st)->shiftbegin && start!=end; i++) start++; 00134 00135 for (i=start; i!=end; i++){ 00136 mw.push_back(*i); 00137 form += i->get_form()+L"_"; 00138 TRACE(3,L"added next ["+form+L"]"); 00139 } 00140 // don't forget last word 00141 mw.push_back(*i); 00142 form += i->get_form(); 00143 TRACE(3,L"added last ["+form+L"]"); 00144 00145 // build new word with the mw list, and check whether it is acceptable 00146 word w(form,mw); 00147 00148 if (ValidMultiWord(w,st)) { 00149 TRACE(3,L"Valid Multiword. Modifying the sentence"); 00150 00151 // erasing from the sentence the words that composed the multiword 00152 end++; 00153 i=se.erase(start, end); 00154 // insert new multiword it into the sentence 00155 i=se.insert(i,w); 00156 00157 TRACE(3,L"New word inserted"); 00158 // Set morphological info for new MW 00159 SetMultiwordAnalysis(i,fs,st); 00160 built=true; 00161 } 00162 else { 00163 TRACE(3,L"Multiword found, but rejected. Sentence untouched"); 00164 ResetActions(st); 00165 i=start; 00166 built=false; 00167 } 00168 00169 return(i); 00170 } 00171 00172 00173 protected: 00175 int initialState; 00177 int stopState; 00179 int trans[MAX_STATES][MAX_TOKENS]; 00181 std::set<int> Final; 00182 00183 public: 00185 automat<T>() {}; 00187 virtual ~automat<T>() {}; 00188 00190 bool matching(sentence &se, sentence::iterator &i) const { 00191 sentence::iterator j,sMatch,eMatch; 00192 int newstate, state, token, fstate; 00193 bool found=false; 00194 00195 TRACE(3,L"Checking for mw starting at word '"+i->get_form()+L"'"); 00196 00197 T *pst = new T(); 00198 se.set_processing_status((processor_status *)pst); 00199 00200 // reset automaton 00201 state=initialState; 00202 fstate=0; 00203 ResetActions(pst); 00204 ((automat_status *)pst)->shiftbegin=0; 00205 00206 sMatch=i; eMatch=se.end(); 00207 for (j=i;state != stopState && j!=se.end(); j++) { 00208 // request the child class to compute the token 00209 // code for current word in current state 00210 token = ComputeToken(state,j,se); 00211 // do the transition to new state 00212 newstate = trans[state][token]; 00213 // let the child class perform any actions 00214 // for the new state (e.g. computing date value...) 00215 StateActions(state, newstate, token, j, pst); 00216 // change state 00217 state = newstate; 00218 // if the state codes a valid match, remember it 00219 // as the longest match found so long. 00220 if (Final.find(state)!=Final.end()) { 00221 eMatch=j; 00222 fstate=state; 00223 TRACE(3,L"New candidate found"); 00224 } 00225 } 00226 00227 TRACE(3,L"STOP state reached. Check longest match"); 00228 // stop state reached. find longest match (if any) and build a multiword 00229 if (eMatch!=se.end()) { 00230 TRACE(3,L"Match found"); 00231 i = BuildMultiword(se, sMatch, eMatch, fstate, found, pst); 00232 TRACE_SENTENCE(3,se); 00233 } 00234 00235 se.clear_processing_status(); 00236 return(found); 00237 } 00238 00239 00241 void analyze(sentence &se) const { 00242 sentence::iterator i; 00243 bool found=false; 00244 00245 // check whether there is a match starting at each position i 00246 for (i=se.begin(); i!=se.end(); i++) { 00247 if (not i->is_locked()) { 00248 if (matching(se, i)) found=true; 00249 } 00250 else TRACE(3,L"Word '"+i->get_form()+L"' is locked. Skipped."); 00251 } 00252 00253 if (found) se.rebuild_word_index(); 00254 00255 // Printing module results 00256 TRACE_SENTENCE(1,se); 00257 } 00258 00260 using processor::analyze; 00261 }; 00262 00263 #undef MOD_TRACENAME 00264 #undef MOD_TRACECODE 00265 00266 } // namespace 00267 00268 #endif 00269