FreeLing
4.0
|
00001 00003 // 00004 // FreeLing - Open Source Language Analyzers 00005 // 00006 // Copyright (C) 2014 TALP Research Center 00007 // Universitat Politecnica de Catalunya 00008 // 00009 // This library is free software; you can redistribute it and/or 00010 // modify it under the terms of the GNU Affero General Public 00011 // License as published by the Free Software Foundation; either 00012 // version 3 of the License, or (at your option) any later version. 00013 // 00014 // This library is distributed in the hope that it will be useful, 00015 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00017 // Affero General Public License for more details. 00018 // 00019 // You should have received a copy of the GNU Affero General Public 00020 // License along with this library; if not, write to the Free Software 00021 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00022 // 00023 // contact: Lluis Padro (padro@lsi.upc.es) 00024 // TALP Research Center 00025 // despatx C6.212 - Campus Nord UPC 00026 // 08034 Barcelona. SPAIN 00027 // 00029 00030 #ifndef _ANALYZER 00031 #define _ANALYZER 00032 00033 #include <iostream> 00034 #include <list> 00035 00036 #include "freeling.h" 00037 00038 namespace freeling { 00039 00040 // codes for input-output formats 00041 typedef enum {TEXT,IDENT,TOKEN,SPLITTED,MORFO,TAGGED,SENSES,SHALLOW,PARSED,DEP,COREF,SEMGRAPH} AnalysisLevel; 00042 // codes for tagging algorithms 00043 typedef enum {NO_TAGGER,HMM,RELAX} TaggerAlgorithm; 00044 // codes for dependency parsers 00045 typedef enum {NO_DEP,TXALA,TREELER} DependencyParser; 00046 // codes for sense annotation 00047 typedef enum {NO_WSD,ALL,MFS,UKB} WSDAlgorithm; 00048 // codes for ForceSelect 00049 typedef enum {NO_FORCE,TAGGER,RETOK} ForceSelectStrategy; 00050 00051 00063 00064 class WINDLL analyzer { 00065 00066 private: 00067 00076 00077 class analyzer_config_options { 00078 public: 00080 std::wstring Lang; 00082 std::wstring TOK_TokenizerFile; 00084 std::wstring SPLIT_SplitterFile; 00086 std::wstring MACO_Decimal, MACO_Thousand; 00087 std::wstring MACO_UserMapFile, MACO_LocutionsFile, MACO_QuantitiesFile, 00088 MACO_AffixFile, MACO_ProbabilityFile, MACO_DictionaryFile, 00089 MACO_NPDataFile, MACO_PunctuationFile, MACO_CompoundFile; 00090 double MACO_ProbabilityThreshold; 00092 std::wstring PHON_PhoneticsFile; 00094 std::wstring NEC_NECFile; 00096 std::wstring SENSE_ConfigFile; 00097 std::wstring UKB_ConfigFile; 00099 std::wstring TAGGER_HMMFile; 00100 std::wstring TAGGER_RelaxFile; 00101 int TAGGER_RelaxMaxIter; 00102 double TAGGER_RelaxScaleFactor; 00103 double TAGGER_RelaxEpsilon; 00104 bool TAGGER_Retokenize; 00105 ForceSelectStrategy TAGGER_ForceSelect; 00107 std::wstring PARSER_GrammarFile; 00109 std::wstring DEP_TxalaFile; 00110 std::wstring DEP_TreelerFile; 00112 std::wstring COREF_CorefFile; 00114 std::wstring SEMGRAPH_SemGraphFile; 00115 }; 00116 00126 00127 class analyzer_invoke_options { 00128 public: 00130 AnalysisLevel InputLevel, OutputLevel; 00131 00133 bool MACO_UserMap, MACO_AffixAnalysis, MACO_MultiwordsDetection, 00134 MACO_NumbersDetection, MACO_PunctuationDetection, 00135 MACO_DatesDetection, MACO_QuantitiesDetection, 00136 MACO_DictionarySearch, MACO_ProbabilityAssignment, MACO_CompoundAnalysis, 00137 MACO_NERecognition, MACO_RetokContractions; 00138 00140 bool PHON_Phonetics; 00141 bool NEC_NEClassification; 00142 00144 WSDAlgorithm SENSE_WSD_which; 00145 TaggerAlgorithm TAGGER_which; 00146 DependencyParser DEP_which; 00147 }; 00148 00149 00150 // we use pointers to the analyzers, so we 00151 // can create only those strictly necessary. 00152 tokenizer *tk; 00153 splitter *sp; 00154 maco *morfo; 00155 nec *neclass; 00156 senses *sens; 00157 ukb *dsb; 00158 POS_tagger *hmm; 00159 POS_tagger *relax; 00160 phonetics *phon; 00161 chart_parser *parser; 00162 dep_txala *deptxala; 00163 dep_treeler *deptreeler; 00164 relaxcor *corfc; 00165 semgraph_extract *sge; 00166 00167 // store configuration options 00168 // config *cfg; 00169 analyzer_invoke_options current_invoke_options; 00170 00171 // remember splitter session 00172 splitter::session_id sp_id; 00173 00174 // remember token offsets in plain text input 00175 unsigned long offs; 00176 // number of sentences processed (used to generate sentence id's) 00177 unsigned long nsentence; 00178 // words pending of being splitted in InputMode==CORPUS 00179 std::list<word> tokens; 00180 00182 template<class T> void do_analysis(T &doc) const; 00183 // tokenize and split text. 00184 void tokenize_split(const std::wstring &text, 00185 std::list<sentence> &ls, 00186 unsigned long &offs, 00187 std::list<word> &av, 00188 unsigned long &nsent, 00189 bool flush, 00190 splitter::session_id sp_ses) const; 00191 00192 public: 00193 typedef analyzer_config_options config_options; 00194 typedef analyzer_invoke_options invoke_options; 00195 00196 analyzer(const config_options &cfg); 00197 void set_current_invoke_options(const invoke_options &opt, bool check=true); 00198 const invoke_options& get_current_invoke_options() const; 00199 00200 ~analyzer(); 00202 void analyze(document &doc) const; 00204 void analyze(std::list<sentence> &ls) const; 00206 void analyze(const wstring &text, document &doc, bool parag=false) const; 00209 void analyze(const wstring &text, std::list<sentence> &ls, bool flush=false); 00210 // flush splitter buffer and analyze any pending text. 00211 void flush_buffer(std::list<sentence> &ls); 00212 00213 void reset_offset(); 00214 }; 00215 00216 00217 00218 } // namespace 00219 00220 #endif 00221