// FreeLing 4.0
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2014 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU Affero General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // Affero General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU Affero General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. 
SPAIN 00026 // 00028 00030 // 00031 // Author: Jordi Turmo 00032 // e-mail: turmo@lsi.upc.edu 00033 // 00034 // This is an implementation based on the PhD Thesis of Emili Sapena 00035 // 00037 00038 #ifndef RELAXCOR_FEX_H 00039 #define RELAXCOR_FEX_H 00040 00041 #include <list> 00042 #include <fstream> 00043 #include <iostream> 00044 #include <string> 00045 #include <vector> 00046 #include <algorithm> 00047 #include <cctype> 00048 00049 #include "freeling/morfo/language.h" 00050 #include "freeling/morfo/semdb.h" 00051 #include "freeling/morfo/tagset.h" 00052 #include "freeling/morfo/relaxcor_model.h" 00053 00054 namespace freeling { 00055 00056 // -- Feature group codes 00057 #define RCF_SET_STRUCTURAL 0x00000001 00058 #define RCF_SET_LEXICAL 0x00000002 00059 #define RCF_SET_MORPHO 0x00000004 00060 #define RCF_SET_SYNTACTIC 0x00000008 00061 #define RCF_SET_SEMANTIC 0x00000010 00062 #define RCF_SET_DISCOURSE 0x00000020 00063 // 00064 #define RCF_SET_ALL 0xFFFFFFFF 00065 00066 00070 00071 class relaxcor_fex { 00072 public: 00073 00074 typedef std::map<std::wstring, relaxcor_model::Tfeatures > Mfeatures; 00075 00076 relaxcor_fex(const std::wstring&, relaxcor_model *, const std::wstring &lang=L""); 00077 ~relaxcor_fex(); 00078 00080 static void print(relaxcor_fex::Mfeatures&, unsigned int); 00081 00082 void extract(std::vector<mention>&, Mfeatures &); 00083 00084 private: 00085 static const freeling::regexp acronym_re1; 00086 static const freeling::regexp acronym_re2; 00087 static const freeling::regexp en_reflexive_re; 00088 static const freeling::regexp en_demostrative_re; 00089 static const freeling::regexp en_indefinite_re; 00090 static const freeling::regexp initial_letter_re1; 00091 static const freeling::regexp initial_letter_re2; 00092 static const freeling::regexp en_det_singular_re; 00093 static const freeling::regexp en_det_plural_re; 00094 static const freeling::regexp cat_verb_be_re1; 00095 static const freeling::regexp cat_verb_be_re2; 00096 static const 
freeling::regexp en_verb_be_re; 00097 static const freeling::regexp es_verb_be_re; 00098 static const freeling::regexp arg_re; 00099 static const freeling::regexp role_re; 00100 00101 typedef enum {IN_QUOTES, HEAD_TERM, IS_ACRONYM, POSSESSIVE, NUMBER, GENDER, SEM_CLASS, THIRD_PERSON, REFLEXIVE, DEF_NP, INDEF_NP, DEM_NP, MAXIMAL_NP, EMBEDDED_NOUN} mentionFeature; 00102 typedef enum {ARGUMENTS, ROLES} mentionWsFeature; 00103 00104 #define ID(x) model->feature_name_id(x) 00105 #define VERY_BIG 100000 00106 00109 std::wstring _Language; 00111 relaxcor_model *model; 00113 semanticDB *_Semdb; 00115 unsigned int _Active_features; 00117 tagset *_POS_tagset; 00119 std::map<std::wstring, freeling::regexp> _Labels; 00121 std::set<std::wstring> _Det_words; 00123 std::map<std::wstring, std::map<std::wstring, std::wstring> > _Prons_feat; 00125 std::map<std::wstring,std::pair<std::wstring, freeling::regexp> > _Sem_classes; 00127 std::map<std::wstring,std::wstring> _Capitals; 00128 std::map<std::wstring,std::wstring> _Nationalities; 00129 std::multimap<std::wstring,std::wstring> _Countries; 00130 std::vector<freeling::regexp> _GPE_regexps; 00132 std::map<std::wstring, std::vector<unsigned int> > _Forenames; 00134 std::map<std::wstring, std::vector<unsigned int> > _Nicks; 00136 std::map<std::wstring, std::wstring> _Person_Names; 00138 std::map<std::wstring, std::wstring> _Titles; 00140 std::map<std::wstring, freeling::regexp> _AcroTerms; 00141 00143 std::map<int, std::map<mentionFeature, unsigned int> > features; 00144 std::map<int, std::map<mentionWsFeature, std::vector<std::wstring> >> wsfeatures; 00145 00147 void set_feature(int, mentionFeature, unsigned int); 00148 void set_feature(int, mentionWsFeature, const std::vector<std::wstring>&); 00149 void clean_features(); 00150 unsigned int get_feature(int, mentionFeature) const; 00151 const std::vector<std::wstring>& get_feature(int, mentionWsFeature) const; 00152 bool computed_feature(int, mentionFeature) const; 00153 bool 
computed_feature(int, mentionWsFeature) const; 00154 std::wstring subvector2wstring(const std::vector<std::wstring>&, unsigned int, unsigned int, const std::wstring&); 00155 00157 void get_structural(const mention&, const mention&, relaxcor_model::Tfeatures&); 00158 void get_lexical(const mention&, const mention&, relaxcor_model::Tfeatures&); 00159 void get_morphological(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&); 00160 void get_syntactic(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&); 00161 void get_semantic(const mention &, const mention&, relaxcor_model::Tfeatures&, std::vector<mention>&); 00162 void get_discourse(const mention &, const mention&, relaxcor_model::Tfeatures&); 00163 00164 void get_group_features(std::vector<mention>&, relaxcor_model::Tfeatures&); 00165 00167 unsigned int dist_in_phrases(const mention&, const mention&); 00168 unsigned int in_quotes(const mention&); 00169 bool appositive(const mention&, const mention&); 00170 bool nested(const mention&, const mention&); 00171 bool intersected(const mention&, const mention&); 00172 bool string_match(const mention&, const mention&); 00173 bool pronoun_string_match(const mention&, const mention&, bool); 00174 bool proper_noun_string_match(const mention&, const mention&, bool); 00175 bool no_pronoun_string_match(const mention&, const mention&, bool); 00176 unsigned int head_is_term(const mention&); 00177 unsigned int alias(const mention&, const mention&); 00178 unsigned int is_possessive(const mention&); 00179 unsigned int same_number(const mention&, const mention&); 00180 unsigned int same_gender(const mention&, const mention&); 00181 unsigned int is_3rd_person(const mention&); 00182 unsigned int agreement(const mention&, const mention&); 00183 unsigned int closest_agreement(const mention&, const mention&, std::vector<mention>&); 00184 unsigned int is_reflexive(const mention&); 00185 // syntactic 00186 unsigned int 
is_def_NP(const mention&); 00187 unsigned int is_dem_NP(const mention&); 00188 bool share_maximal_NP(const mention&, const mention&, std::vector<mention>&); 00189 unsigned int is_maximal_NP(const mention&, std::vector<mention>&); 00190 unsigned int is_indef_NP(const mention&); 00191 unsigned int is_embedded_noun(const mention&, std::vector<mention>&); 00192 bool binding_pos(const mention&, const mention&, bool); 00193 bool binding_neg(const mention&, const mention&, bool); 00194 void get_arguments(const mention&, std::wstring&, std::wstring&); 00195 bool same_preds(bool, const std::wstring&, const std::wstring&); 00196 bool same_args(bool, const std::wstring&, const std::wstring&, relaxcor_model::Tfeatures&); 00197 // semantic 00198 bool separated_by_verb_is(const mention&, const mention&, std::vector<mention>&); 00199 bool sem_class_match(const mention&, const mention&); 00200 bool is_semantic_type(const mention&, const std::wstring&); 00201 bool animacy(const mention&, const mention&); 00202 bool incompatible(const mention&, const mention&); 00203 void get_roles(const mention&, std::vector<std::wstring>& ); 00204 bool same_roles(const std::vector<std::wstring>&, const std::vector<std::wstring>&); 00206 void read_countries_capitals(const std::wstring&); 00207 void read_gpe_regexps(const std::wstring&); 00208 void read_pairs(const std::wstring&, std::map<std::wstring, std::wstring>&); 00209 void read_same_names(const std::wstring&, std::map<std::wstring, std::vector<unsigned int> >&); 00210 std::wstring drop_det(const mention&); 00211 std::wstring compute_term(const mention&); 00212 unsigned int geo_match(const mention&, const mention&); 00213 std::wstring string_merge(const mention&, bool); 00214 std::vector<std::wstring> split_words(const std::wstring&); 00215 bool is_acronym(const std::wstring&); 00216 unsigned int acronym_of(const std::vector<std::wstring>&, const std::vector<std::wstring>&); 00217 unsigned int initials_match(const std::vector<std::wstring>&, 
const std::vector<std::wstring>&); 00218 double lex_dist(const std::wstring&, const std::wstring&); 00219 unsigned int nick_name_match(const std::wstring&, const std::wstring&); 00220 unsigned int forenames_match(const std::vector<std::wstring>&, const std::vector<std::wstring>&); 00221 unsigned int first_name_match(const std::wstring&, const std::vector<std::wstring>&); 00222 double levenshtein(const std::wstring&, const std::wstring&); 00223 unsigned int get_number(const mention&); 00224 unsigned int get_gender(const mention&); 00225 std::wstring extract_msd_feature(const std::wstring &tag, const std::wstring &feature) const; 00226 //std::wstring extract_number(const std::wstring&); 00227 //std::wstring extract_gender(const std::wstring&); 00228 //std::wstring extract_person(const std::wstring&); 00229 mention::SEMmentionType extract_semclass(const mention&); 00230 void isa(const std::wstring&, std::vector<bool>&); 00231 int get_maximal_NP(const mention&, std::vector<mention>&); 00232 const std::wstring& get_argument(sentence::predicates::const_iterator, dep_tree::const_iterator, paragraph::const_iterator); 00233 bool verb_is_between(const mention&, const mention&); 00234 00235 void extract_pair(mention &, mention &, relaxcor_model::Tfeatures &, std::vector<mention>&); 00236 00237 }; 00238 00239 } // namespace 00240 00241 #endif