FreeLing  4.0
automat.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU Affero General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    Affero General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU Affero General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _AUTOMAT
00030 #define _AUTOMAT
00031 
00032 #include <set>
00033 
00034 #include "freeling/windll.h"
00035 #include "freeling/morfo/language.h"
00036 #include "freeling/morfo/processor.h"
00037 #include "freeling/morfo/traces.h"
00038 
00039 namespace freeling {
00040 
// Dimensions of the transition table `trans` declared in class automat:
// the first index is a state number, the second a token code.
00041 #define MAX_STATES 100
00042 #define MAX_TOKENS 50
00043 
// Module name and trace code for this header. Presumably consumed by the
// TRACE/TRACE_SENTENCE macros from freeling/morfo/traces.h (not visible
// here -- confirm). Both are #undef'd again at the end of this header.
00044 #define MOD_TRACENAME L"AUTOMAT"
00045 #define MOD_TRACECODE AUTOMAT_TRACE
00046 
00050 
00051   class automat_status : public processor_status {
00052   public:
00053     // shift beggining of multiword by N words (in_1982 -> 1982)
00054     int shiftbegin; 
00055   };
00056 
00103 
00104   template <class T>
00105     class WINDLL automat : public processor {
00106   private:
00109     virtual int ComputeToken(int, sentence::iterator &, sentence &) const =0;
00112     virtual void ResetActions(T *) const =0;
00116     virtual void StateActions(int, int, int, sentence::const_iterator, T *) const =0;
00119     virtual void SetMultiwordAnalysis(sentence::iterator, int, const T *) const =0;
00122     virtual bool ValidMultiWord(const word &w, T *st) const { return(true); }
00123 
00125     virtual sentence::iterator BuildMultiword(sentence &se, sentence::iterator start, sentence::iterator end, int fs, bool &built, T *st) const {
00126       sentence::iterator i;
00127       std::list<word> mw;
00128       std::wstring form;
00129     
00130       TRACE(3,L"Building multiword");
00131         
00132       // ignore initial tokens, if needed (e.g. in_1982 -> 1982)
00133       for (int i=0; i<((automat_status*)st)->shiftbegin && start!=end; i++) start++;
00134     
00135       for (i=start; i!=end; i++){
00136         mw.push_back(*i);           
00137         form += i->get_form()+L"_";
00138         TRACE(3,L"added next ["+form+L"]");
00139       } 
00140       // don't forget last word
00141       mw.push_back(*i);           
00142       form += i->get_form();
00143       TRACE(3,L"added last ["+form+L"]");
00144     
00145       // build new word with the mw list, and check whether it is acceptable
00146       word w(form,mw);
00147     
00148       if (ValidMultiWord(w,st)) {  
00149         TRACE(3,L"Valid Multiword. Modifying the sentence");
00150       
00151         // erasing from the sentence the words that composed the multiword
00152         end++;
00153         i=se.erase(start, end);
00154         // insert new multiword it into the sentence
00155         i=se.insert(i,w); 
00156       
00157         TRACE(3,L"New word inserted");
00158         // Set morphological info for new MW
00159         SetMultiwordAnalysis(i,fs,st);
00160         built=true;
00161       }
00162       else {
00163         TRACE(3,L"Multiword found, but rejected. Sentence untouched");
00164         ResetActions(st);
00165         i=start;
00166         built=false;
00167       }
00168     
00169       return(i);
00170     }
00171   
00172   
00173   protected:
00175     int initialState;
00177     int stopState;
00179     int trans[MAX_STATES][MAX_TOKENS];
00181     std::set<int> Final;
00182 
00183   public:
00185     automat<T>() {};
00187     virtual ~automat<T>() {};
00188 
00190     bool matching(sentence &se, sentence::iterator &i) const {
00191       sentence::iterator j,sMatch,eMatch; 
00192       int newstate, state, token, fstate;
00193       bool found=false;
00194 
00195       TRACE(3,L"Checking for mw starting at word '"+i->get_form()+L"'");
00196 
00197       T *pst = new T();
00198       se.set_processing_status((processor_status *)pst);  
00199     
00200       // reset automaton
00201       state=initialState;
00202       fstate=0;
00203       ResetActions(pst);
00204       ((automat_status *)pst)->shiftbegin=0;
00205     
00206       sMatch=i; eMatch=se.end();
00207       for (j=i;state != stopState && j!=se.end(); j++) {
00208         // request the child class to compute the token
00209         // code for current word in current state
00210         token = ComputeToken(state,j,se);
00211         // do the transition to new state
00212         newstate = trans[state][token];
00213         // let the child class perform any actions 
00214         // for the new state (e.g. computing date value...)
00215         StateActions(state, newstate, token, j, pst);
00216         // change state
00217         state = newstate;
00218         // if the state codes a valid match, remember it
00219         //  as the longest match found so long.
00220         if (Final.find(state)!=Final.end()) {
00221           eMatch=j;
00222           fstate=state;
00223           TRACE(3,L"New candidate found");
00224         }
00225       }
00226     
00227       TRACE(3,L"STOP state reached. Check longest match");
00228       // stop state reached. find longest match (if any) and build a multiword
00229       if (eMatch!=se.end()) {
00230         TRACE(3,L"Match found");
00231         i = BuildMultiword(se, sMatch, eMatch, fstate, found, pst);
00232         TRACE_SENTENCE(3,se);
00233       }
00234     
00235       se.clear_processing_status();
00236       return(found);
00237     }
00238   
00239 
00241     void analyze(sentence &se) const {
00242       sentence::iterator i;
00243       bool found=false;
00244 
00245       // check whether there is a match starting at each position i
00246       for (i=se.begin(); i!=se.end(); i++) {
00247         if (not i->is_locked()) {
00248           if (matching(se, i)) found=true;
00249         }
00250         else TRACE(3,L"Word '"+i->get_form()+L"' is locked. Skipped.");
00251       }
00252     
00253       if (found) se.rebuild_word_index();
00254     
00255       // Printing module results
00256       TRACE_SENTENCE(1,se);
00257     }
00258 
00260     using processor::analyze;
00261   };
00262 
00263 #undef MOD_TRACENAME
00264 #undef MOD_TRACECODE
00265 
00266 } // namespace
00267 
00268 #endif
00269