FreeLing  4.0
relaxcor_fex.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU Affero General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    Affero General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU Affero General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00030 //
00031 //    Author: Jordi Turmo
00032 //    e-mail: turmo@lsi.upc.edu
00033 //
00034 //    This is an implementation based on the PhD Thesis of Emili Sapena
00035 //
00037 
00038 #ifndef RELAXCOR_FEX_H
00039 #define RELAXCOR_FEX_H
00040 
00041 #include <list>
00042 #include <fstream>
00043 #include <iostream>
00044 #include <string>
00045 #include <vector>
00046 #include <algorithm>
00047 #include <cctype>
00048 
00049 #include "freeling/morfo/language.h"
00050 #include "freeling/morfo/semdb.h"
00051 #include "freeling/morfo/tagset.h"
00052 #include "freeling/morfo/relaxcor_model.h"
00053 
00054 namespace freeling {
00055 
00056   // -- Feature group codes: each group below has its own distinct single bit,
      //    so several codes can be combined into one value with bitwise OR.
      //    NOTE(review): presumably these are the values stored in
      //    relaxcor_fex::_Active_features -- confirm in the implementation.
      //    NOTE(review): these are preprocessor macros, so although they are
      //    written inside namespace freeling they are not scoped to it.
00057 #define RCF_SET_STRUCTURAL      0x00000001
00058 #define RCF_SET_LEXICAL         0x00000002
00059 #define RCF_SET_MORPHO          0x00000004
00060 #define RCF_SET_SYNTACTIC       0x00000008
00061 #define RCF_SET_SEMANTIC        0x00000010
00062 #define RCF_SET_DISCOURSE       0x00000020
00063   // Mask with all 32 bits set, i.e. it includes every group code above.
00064 #define RCF_SET_ALL             0xFFFFFFFF
00065 
00066 
00070 
00071   class relaxcor_fex {
        // Feature extractor working with a relaxcor_model. Its public interface is
        // a constructor, a static print() helper and extract(), which receives a
        // vector of mentions plus an Mfeatures map. Everything else is private
        // state and helper declarations; no function bodies are given in this header.
        // NOTE(review): std::map and std::set are used below, but <map> and <set>
        // are not among the #include lines of this header; presumably they arrive
        // transitively through the freeling headers -- confirm.
00072   public:
00073 
          // Map from a wide-string key to a relaxcor_model::Tfeatures value.
          // NOTE(review): the key presumably identifies a pair of mentions -- confirm.
00074     typedef std::map<std::wstring, relaxcor_model::Tfeatures > Mfeatures;
00075 
          // Constructor. Receives a wide string (presumably a configuration file
          // name -- TODO confirm), a pointer to a relaxcor_model, and an optional
          // `lang` string that defaults to the empty string.
00076     relaxcor_fex(const std::wstring&, relaxcor_model *, const std::wstring &lang=L"");
          // Destructor. NOTE(review): which of the pointer members it releases is
          // not visible from this header.
00077     ~relaxcor_fex();
00078 
          // Static helper receiving an Mfeatures map and an unsigned int whose
          // meaning is not visible here. Presumably dumps the features -- confirm.
00080     static void print(relaxcor_fex::Mfeatures&, unsigned int);
00081 
          // The only public non-static operation: receives the mention vector and
          // an Mfeatures map, both by non-const reference.
          // NOTE(review): presumably fills the map with the features of mention
          // pairs (see extract_pair at the end of the class) -- confirm.
00082     void extract(std::vector<mention>&, Mfeatures &);
00083 
00084   private:
          // Class-wide (static const) regular expressions; their patterns are
          // defined outside this header.
          // NOTE(review): the en_/es_/cat_ prefixes presumably mark English,
          // Spanish and Catalan specific patterns -- confirm.
00085     static const freeling::regexp acronym_re1;
00086     static const freeling::regexp acronym_re2;
00087     static const freeling::regexp en_reflexive_re;
00088     static const freeling::regexp en_demostrative_re;
00089     static const freeling::regexp en_indefinite_re;
00090     static const freeling::regexp initial_letter_re1;
00091     static const freeling::regexp initial_letter_re2;
00092     static const freeling::regexp en_det_singular_re;
00093     static const freeling::regexp en_det_plural_re;
00094     static const freeling::regexp cat_verb_be_re1;
00095     static const freeling::regexp cat_verb_be_re2;
00096     static const freeling::regexp en_verb_be_re; 
00097     static const freeling::regexp es_verb_be_re; 
00098     static const freeling::regexp arg_re; 
00099     static const freeling::regexp role_re; 
00100 
          // Keys of the per-mention values stored as unsigned int in `features`.
00101     typedef enum {IN_QUOTES, HEAD_TERM, IS_ACRONYM, POSSESSIVE, NUMBER, GENDER, SEM_CLASS, THIRD_PERSON, REFLEXIVE, DEF_NP, INDEF_NP, DEM_NP, MAXIMAL_NP, EMBEDDED_NOUN} mentionFeature;
          // Keys of the per-mention values stored as wide-string vectors in `wsfeatures`.
00102     typedef enum {ARGUMENTS, ROLES} mentionWsFeature;
00103 
          // ID(x) expands to model->feature_name_id(x), so it only works where an
          // identifier named `model` is in scope (e.g. inside member functions).
          // NOTE(review): ID and VERY_BIG are preprocessor macros; they are not
          // scoped to this class and are visible to every file including this header.
00104     #define ID(x) model->feature_name_id(x)
00105     #define VERY_BIG 100000
00106 
          // Language string. NOTE(review): presumably taken from the constructor's
          // `lang` argument -- confirm.
00109     std::wstring _Language;
          // Model pointer; the ID() macro above dereferences it. Ownership is not
          // visible from this header.
00111     relaxcor_model *model;
          // Pointer to a semantic database; ownership is not visible from this header.
00113     semanticDB *_Semdb;
          // NOTE(review): presumably a bitmask of RCF_SET_* group codes -- confirm.
00115     unsigned int _Active_features;
          // Pointer to a tagset object; ownership is not visible from this header.
00117     tagset *_POS_tagset;
          // Regular expressions indexed by a wide-string name.
00119     std::map<std::wstring, freeling::regexp> _Labels;
          // Set of wide strings. NOTE(review): presumably determiner word forms -- confirm.
00121     std::set<std::wstring> _Det_words;
          // Two-level map: wide-string key -> (wide-string key -> wide-string value).
          // NOTE(review): presumably pronoun -> (feature name -> feature value) -- confirm.
00123     std::map<std::wstring, std::map<std::wstring, std::wstring> > _Prons_feat;
          // Wide-string key -> pair of (wide string, regular expression).
00125     std::map<std::wstring,std::pair<std::wstring, freeling::regexp> > _Sem_classes;
          // String-to-string tables. _Countries is a multimap, so one key may have
          // several values. NOTE(review): the direction of each mapping and how
          // _GPE_regexps is used are not visible here -- confirm against the
          // read_countries_capitals / read_gpe_regexps implementations.
00127     std::map<std::wstring,std::wstring> _Capitals;
00128     std::map<std::wstring,std::wstring> _Nationalities;
00129     std::multimap<std::wstring,std::wstring> _Countries;
00130     std::vector<freeling::regexp> _GPE_regexps;
          // Wide-string key -> vector of unsigned int. NOTE(review): same value
          // type as the last parameter of read_same_names below; the meaning of
          // the integers is not visible here.
00132     std::map<std::wstring, std::vector<unsigned int> > _Forenames;
          // Same shape as _Forenames.
00134     std::map<std::wstring, std::vector<unsigned int> > _Nicks;
          // String-to-string table; contents are defined by the implementation.
00136     std::map<std::wstring, std::wstring> _Person_Names;
          // String-to-string table; contents are defined by the implementation.
00138     std::map<std::wstring, std::wstring> _Titles;
          // Regular expressions indexed by a wide-string key.
00140     std::map<std::wstring, freeling::regexp> _AcroTerms;
00141 
          // Per-mention feature stores: int key -> (feature key -> value).
          // NOTE(review): the int key is presumably a mention identifier -- confirm.
00143     std::map<int, std::map<mentionFeature, unsigned int> > features;
00144     std::map<int, std::map<mentionWsFeature, std::vector<std::wstring> >> wsfeatures;
00145 
          // Accessors for the two stores above, overloaded on the feature-key type
          // (mentionFeature for `features`, mentionWsFeature for `wsfeatures`).
          // The get_/computed_ variants are const member functions.
          // NOTE(review): computed_feature presumably reports whether a value has
          // already been stored for that key -- confirm.
00147     void set_feature(int, mentionFeature, unsigned int);
00148     void set_feature(int, mentionWsFeature, const std::vector<std::wstring>&);
00149     void clean_features();
00150     unsigned int get_feature(int, mentionFeature) const;
00151     const std::vector<std::wstring>& get_feature(int, mentionWsFeature) const;
00152     bool computed_feature(int, mentionFeature) const;
00153     bool computed_feature(int, mentionWsFeature) const;
          // Builds a wide string from a string vector, two unsigned ints and a
          // wide string. NOTE(review): presumably joins a sub-range of the vector
          // using the last argument as separator -- confirm.
00154     std::wstring subvector2wstring(const std::vector<std::wstring>&, unsigned int, unsigned int, const std::wstring&);
00155 
          // One routine per feature group (the names mirror the RCF_SET_* codes).
          // Each receives two mentions and a Tfeatures by non-const reference;
          // three of them also receive the whole mention vector.
00157     void get_structural(const mention&, const mention&, relaxcor_model::Tfeatures&);
00158     void get_lexical(const mention&, const mention&, relaxcor_model::Tfeatures&);
00159     void get_morphological(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&);
00160     void get_syntactic(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&);
00161     void get_semantic(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&);
00162     void get_discourse(const mention &, const mention&, relaxcor_model::Tfeatures&);
00163 
          // Receives the mention vector and a Tfeatures by non-const reference.
00164     void get_group_features(std::vector<mention>&, relaxcor_model::Tfeatures&);
00165 
          // Individual feature functions over one or two mentions. Several return
          // unsigned int rather than bool.
          // NOTE(review): the encoding of those unsigned results is not visible in
          // this header -- check the implementation before comparing them.
00167     unsigned int dist_in_phrases(const mention&, const mention&); 
00168     unsigned int in_quotes(const mention&);
00169     bool appositive(const mention&, const mention&);
00170     bool nested(const mention&, const mention&);
00171     bool intersected(const mention&, const mention&);
00172     bool string_match(const mention&, const mention&);
00173     bool pronoun_string_match(const mention&, const mention&, bool);
00174     bool proper_noun_string_match(const mention&, const mention&, bool);
00175     bool no_pronoun_string_match(const mention&, const mention&, bool);
00176     unsigned int head_is_term(const mention&);
00177     unsigned int alias(const mention&, const mention&);
00178     unsigned int is_possessive(const mention&);
00179     unsigned int same_number(const mention&, const mention&);
00180     unsigned int same_gender(const mention&, const mention&);
00181     unsigned int is_3rd_person(const mention&);
00182     unsigned int agreement(const mention&, const mention&);
00183     unsigned int closest_agreement(const mention&, const mention&, std::vector<mention>&);
00184     unsigned int is_reflexive(const mention&);
00185     // syntactic
00186     unsigned int is_def_NP(const mention&);
00187     unsigned int is_dem_NP(const mention&);
00188     bool share_maximal_NP(const mention&, const mention&, std::vector<mention>&);
00189     unsigned int is_maximal_NP(const mention&, std::vector<mention>&);
00190     unsigned int is_indef_NP(const mention&);
00191     unsigned int is_embedded_noun(const mention&, std::vector<mention>&);
00192     bool binding_pos(const mention&, const mention&, bool);
00193     bool binding_neg(const mention&, const mention&, bool);
00194     void get_arguments(const mention&, std::wstring&, std::wstring&);
00195     bool same_preds(bool, const std::wstring&, const std::wstring&);
00196     bool same_args(bool, const std::wstring&, const std::wstring&, relaxcor_model::Tfeatures&);
00197     // semantic
00198     bool separated_by_verb_is(const mention&, const mention&, std::vector<mention>&);
00199     bool sem_class_match(const mention&, const mention&);
00200     bool is_semantic_type(const mention&, const std::wstring&);
00201     bool animacy(const mention&, const mention&);
00202     bool incompatible(const mention&, const mention&);
00203     void get_roles(const mention&, std::vector<std::wstring>& );
00204     bool same_roles(const std::vector<std::wstring>&, const std::vector<std::wstring>&);
          // Loaders and auxiliary helpers. The read_* functions take a wide string
          // (presumably a file name -- TODO confirm); read_pairs and
          // read_same_names also take the map to work on by non-const reference.
00206     void read_countries_capitals(const std::wstring&);
00207     void read_gpe_regexps(const std::wstring&);
00208     void read_pairs(const std::wstring&, std::map<std::wstring, std::wstring>&);
00209     void read_same_names(const std::wstring&, std::map<std::wstring, std::vector<unsigned int> >&);
00210     std::wstring drop_det(const mention&);
00211     std::wstring compute_term(const mention&);
00212     unsigned int geo_match(const mention&, const mention&);
00213     std::wstring string_merge(const mention&, bool);
00214     std::vector<std::wstring> split_words(const std::wstring&);
00215     bool is_acronym(const std::wstring&);
00216     unsigned int acronym_of(const std::vector<std::wstring>&, const std::vector<std::wstring>&);
00217     unsigned int initials_match(const std::vector<std::wstring>&, const std::vector<std::wstring>&);
00218     double lex_dist(const std::wstring&, const std::wstring&);
00219     unsigned int nick_name_match(const std::wstring&, const std::wstring&);
00220     unsigned int forenames_match(const std::vector<std::wstring>&, const std::vector<std::wstring>&);
00221     unsigned int first_name_match(const std::wstring&, const std::vector<std::wstring>&);
          // Returns a double for two wide strings. NOTE(review): presumably an
          // edit distance, possibly normalised given the double return -- confirm.
00222     double levenshtein(const std::wstring&, const std::wstring&);
00223     unsigned int get_number(const mention&);
00224     unsigned int get_gender(const mention&);
          // Const member: given a tag and a feature name, returns a wide string.
          // NOTE(review): presumably this replaced the three commented-out
          // extract_* declarations kept below -- confirm.
00225     std::wstring extract_msd_feature(const std::wstring &tag, const std::wstring &feature) const;
00226       //std::wstring extract_number(const std::wstring&);
00227       //std::wstring extract_gender(const std::wstring&);
00228       //std::wstring extract_person(const std::wstring&);
00229     mention::SEMmentionType extract_semclass(const mention&);
          // Receives a wide string and a vector<bool> by non-const reference.
00230     void isa(const std::wstring&, std::vector<bool>&);
          // Returns an int for a mention and the mention vector.
          // NOTE(review): presumably an index or id of the maximal NP, with some
          // sentinel when there is none -- confirm.
00231     int get_maximal_NP(const mention&, std::vector<mention>&);
          // Returns a const reference to a wide string.
          // NOTE(review): callers must make sure the referenced string outlives
          // their use; where it is stored is not visible here.
00232     const std::wstring& get_argument(sentence::predicates::const_iterator, dep_tree::const_iterator, paragraph::const_iterator);
00233     bool verb_is_between(const mention&, const mention&);
00234 
          // Receives two mentions (non-const), a Tfeatures and the mention vector.
          // NOTE(review): presumably computes the features of one mention pair
          // on behalf of extract() -- confirm.
00235     void extract_pair(mention &, mention &, relaxcor_model::Tfeatures &, std::vector<mention>&);
00236     
00237   };
00238 
00239 } // namespace
00240 
00241 #endif