FreeLing: analyzer.h Source File

Go to the documentation of this file.
00001 
00003 //
00004 //    FreeLing - Open Source Language Analyzers
00005 //
00006 //    Copyright (C) 2014   TALP Research Center
00007 //                         Universitat Politecnica de Catalunya
00008 //
00009 //    This library is free software; you can redistribute it and/or
00010 //    modify it under the terms of the GNU Affero General Public
00011 //    License as published by the Free Software Foundation; either
00012 //    version 3 of the License, or (at your option) any later version.
00013 //
00014 //    This library is distributed in the hope that it will be useful,
00015 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017 //    Affero General Public License for more details.
00018 //
00019 //    You should have received a copy of the GNU Affero General Public
00020 //    License along with this library; if not, write to the Free Software
00021 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00022 //
00023 //    contact: Lluis Padro (padro@lsi.upc.es)
00024 //             TALP Research Center
00025 //             despatx C6.212 - Campus Nord UPC
00026 //             08034 Barcelona.  SPAIN
00027 //
00029 
00030 #ifndef _ANALYZER
00031 #define _ANALYZER
00032 
00033 #include <iostream> 
00034 #include <list>
00035 
00036 #include "freeling.h"
00037 
00038 namespace freeling {
00039 
// Codes for input-output formats.
// Enumerators are unscoped and take the implicit values 0, 1, 2, ... in the
// order listed, so the order below must not be changed.
enum AnalysisLevel {
  TEXT,
  IDENT,
  TOKEN,
  SPLITTED,
  MORFO,
  TAGGED,
  SENSES,
  SHALLOW,
  PARSED,
  DEP,
  COREF,
  SEMGRAPH
};

// Codes for tagging algorithms.
enum TaggerAlgorithm {
  NO_TAGGER,
  HMM,
  RELAX
};

// Codes for dependency parsers.
enum DependencyParser {
  NO_DEP,
  TXALA,
  TREELER
};

// Codes for sense annotation.
enum WSDAlgorithm {
  NO_WSD,
  ALL,
  MFS,
  UKB
};

// Codes for ForceSelect.
enum ForceSelectStrategy {
  NO_FORCE,
  TAGGER,
  RETOK
};
00050 
00051 
00063 
00064 class WINDLL analyzer {
00065 
00066  private:
00067 
00076 
00077    class analyzer_config_options {
00078      public:
00080        std::wstring Lang;
00082        std::wstring TOK_TokenizerFile;
00084        std::wstring SPLIT_SplitterFile;
00086        std::wstring MACO_Decimal, MACO_Thousand;
00087        std::wstring MACO_UserMapFile, MACO_LocutionsFile,   MACO_QuantitiesFile,
00088          MACO_AffixFile,   MACO_ProbabilityFile, MACO_DictionaryFile, 
00089          MACO_NPDataFile,  MACO_PunctuationFile, MACO_CompoundFile;      
00090        double MACO_ProbabilityThreshold;
00092        std::wstring PHON_PhoneticsFile;
00094        std::wstring NEC_NECFile;
00096        std::wstring SENSE_ConfigFile;
00097        std::wstring UKB_ConfigFile;
00099        std::wstring TAGGER_HMMFile;
00100        std::wstring TAGGER_RelaxFile;
00101        int TAGGER_RelaxMaxIter;
00102        double TAGGER_RelaxScaleFactor;
00103        double TAGGER_RelaxEpsilon;
00104        bool TAGGER_Retokenize;
00105        ForceSelectStrategy TAGGER_ForceSelect;
00107        std::wstring PARSER_GrammarFile;
00109        std::wstring DEP_TxalaFile;   
00110        std::wstring DEP_TreelerFile;   
00112        std::wstring COREF_CorefFile;
00114        std::wstring SEMGRAPH_SemGraphFile;
00115    };
00116 
00126    
00127    class analyzer_invoke_options {
00128      public:
00130        AnalysisLevel InputLevel, OutputLevel;
00131 
00133        bool MACO_UserMap, MACO_AffixAnalysis, MACO_MultiwordsDetection, 
00134          MACO_NumbersDetection, MACO_PunctuationDetection, 
00135          MACO_DatesDetection, MACO_QuantitiesDetection, 
00136          MACO_DictionarySearch, MACO_ProbabilityAssignment, MACO_CompoundAnalysis,
00137          MACO_NERecognition, MACO_RetokContractions;
00138 
00140        bool PHON_Phonetics;
00141        bool NEC_NEClassification;
00142 
00144        WSDAlgorithm SENSE_WSD_which;
00145        TaggerAlgorithm TAGGER_which;
00146        DependencyParser DEP_which;    
00147    };
00148    
00149 
00150    // we use pointers to the analyzers, so we
00151    // can create only those strictly necessary.
00152    tokenizer *tk;
00153    splitter *sp;
00154    maco *morfo;
00155    nec *neclass;
00156    senses *sens;
00157    ukb *dsb;
00158    POS_tagger *hmm;
00159    POS_tagger *relax;
00160    phonetics *phon;
00161    chart_parser *parser;
00162    dep_txala *deptxala;
00163    dep_treeler *deptreeler;
00164    relaxcor *corfc;
00165    semgraph_extract *sge;
00166 
00167    // store configuration options
00168    //   config *cfg;
00169    analyzer_invoke_options current_invoke_options;
00170 
00171    // remember splitter session
00172    splitter::session_id sp_id;
00173 
00174    // remember token offsets in plain text input
00175    unsigned long offs;
00176    // number of sentences processed (used to generate sentence id's)
00177    unsigned long nsentence;
00178    // words pending of being splitted in InputMode==CORPUS
00179    std::list<word> tokens; 
00180   
00182    template<class T> void do_analysis(T &doc) const;
00183    // tokenize and split text.
00184    void tokenize_split(const std::wstring &text, 
00185                        std::list<sentence> &ls, 
00186                        unsigned long &offs, 
00187                        std::list<word> &av, 
00188                        unsigned long &nsent, 
00189                        bool flush, 
00190                        splitter::session_id sp_ses) const;
00191 
00192  public:
00193    typedef analyzer_config_options config_options;
00194    typedef analyzer_invoke_options invoke_options;
00195 
00196    analyzer(const config_options &cfg);
00197    void set_current_invoke_options(const invoke_options &opt, bool check=true);
00198    const invoke_options& get_current_invoke_options() const;
00199 
00200    ~analyzer();
00202    void analyze(document &doc) const;
00204    void analyze(std::list<sentence> &ls) const;
00206    void analyze(const wstring &text, document &doc, bool parag=false) const;
00209    void analyze(const wstring &text, std::list<sentence> &ls, bool flush=false);
00210    // flush splitter buffer and analyze any pending text. 
00211    void flush_buffer(std::list<sentence> &ls);
00212  
00213    void reset_offset();
00214 };
00215 
00216 
00217 
00218 } // namespace
00219 
00220 #endif
00221