FreeLing
4.0
|
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2014 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU Affero General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // Affero General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU Affero General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. SPAIN 00026 // 00028 00029 #ifndef _LANGUAGE 00030 #define _LANGUAGE 00031 00032 #include <string> 00033 #include <list> 00034 #include <vector> 00035 #include <set> 00036 #include <map> 00037 00038 #include "freeling/regexp.h" 00039 #include "freeling/windll.h" 00040 #include "freeling/tree.h" 00041 #include "freeling/morfo/semgraph.h" 00042 00043 namespace freeling { 00044 00045 class word; // predeclaration 00046 00051 00052 class WINDLL analysis { 00053 00054 private: 00056 std::wstring lemma; 00058 std::wstring tag; 00060 double prob; 00062 double distance; 00064 std::list<std::pair<std::wstring,double> > senses; 00066 std::list<word> retok; 00067 00068 // store which sequences --among the kbest proposed by 00069 // the tagger-- contain this analysis 00070 std::set<int> selected_kbest; 00071 00072 public: 00074 std::vector<std::wstring> user; 00075 00077 analysis(); 00079 analysis(const std::wstring &, const std::wstring &); 00081 analysis& operator=(const analysis&); 00082 00083 void init(const std::wstring &l, const std::wstring &t); 00084 void set_lemma(const std::wstring &); 00085 void set_tag(const std::wstring &); 00086 void set_prob(double); 00087 void set_distance(double); 00088 void set_retokenizable(const std::list<word> &); 00089 00090 bool has_prob() const; 00091 bool has_distance() const; 00092 const std::wstring& get_lemma() const; 00093 const std::wstring& get_tag() const; 00094 double get_prob() const; 00095 double get_distance() const; 00096 bool is_retokenizable() const; 00097 std::list<word>& get_retokenizable(); 00098 const std::list<word>& get_retokenizable() const; 00099 00100 const std::list<std::pair<std::wstring,double> > & get_senses() const; 00101 std::list<std::pair<std::wstring,double> > & get_senses(); 00102 void set_senses(const std::list<std::pair<std::wstring,double> > &); 00103 // useful for java API 00104 std::wstring get_senses_string() const; 00105 00106 // get the largest kbest sequence index the analysis is selected in. 00107 int max_kbest() const; 00108 // find out whether the analysis is selected in the tagger k-th best sequence 00109 bool is_selected(int k=0) const; 00110 // mark this analysis as selected in k-th best sequence 00111 void mark_selected(int k=0); 00112 // unmark this analysis as selected in k-th best sequence 00113 void unmark_selected(int k=0); 00114 00116 bool operator>(const analysis &) const; 00118 bool operator<(const analysis &) const; 00120 bool operator==(const analysis &) const; 00121 }; 00122 00123 00128 00129 class WINDLL word : public std::list<analysis> { 00130 private: 00132 std::wstring form; 00134 std::wstring lc_form; 00136 std::wstring ph_form; 00138 std::list<word> multiword; 00140 bool ambiguous_mw; 00142 std::list<std::pair<std::wstring,int> > alternatives; 00144 unsigned long start, finish; 00146 bool in_dict; 00148 bool locked; 00150 void clone(const word &); 00152 size_t position; 00153 00155 static const int SELECTED=0; 00156 static const int UNSELECTED=1; 00157 static const int ALL=2; 00158 static const std::wstring NOT_FOUND; 00159 00160 public: 00161 // predeclarations 00162 class iterator; 00163 class const_iterator; 00164 00166 std::vector<std::wstring> user; 00167 00169 word(); 00171 word(const std::wstring &); 00173 word(const std::wstring &, const std::list<word> &); 00175 word(const std::wstring &, const std::list<analysis> &, const std::list<word> &); 00177 word(const word &); 00179 word& operator=(const word&); 00180 00182 void copy_analysis(const word &); 00184 int get_n_selected(int k=0) const; 00186 int get_n_unselected(int k=0) const; 00188 bool is_multiword() const; 00190 bool is_ambiguous_mw() const; 00192 void set_ambiguous_mw(bool); 00194 int get_n_words_mw() const; 00196 const std::list<word>& get_words_mw() const; 00198 const std::wstring& get_form() const; 00200 const std::wstring& get_lc_form() const; 00202 const std::wstring& get_ph_form() const; 00204 word::iterator selected_begin(int k=0); 00206 word::const_iterator selected_begin(int k=0) const; 00208 word::iterator selected_end(int k=0); 00210 word::const_iterator selected_end(int k=0) const; 00212 word::iterator unselected_begin(int k=0); 00214 word::const_iterator unselected_begin(int k=0) const; 00216 word::iterator unselected_end(int k=0); 00218 word::const_iterator unselected_end(int k=0) const; 00220 unsigned int num_kbest() const; 00222 const std::wstring& get_lemma(int k=0) const; 00224 const std::wstring& get_tag(int k=0) const; 00225 00227 const std::list<std::pair<std::wstring,double> >& get_senses(int k=0) const; 00228 std::list<std::pair<std::wstring,double> >& get_senses(int k=0); 00229 // useful for java API 00230 std::wstring get_senses_string(int k=0) const; 00232 void set_senses(const std::list<std::pair<std::wstring,double> > &, int k=0); 00233 00235 unsigned long get_span_start() const; 00236 unsigned long get_span_finish() const; 00237 00239 bool found_in_dict() const; 00241 void set_found_in_dict(bool); 00243 bool has_retokenizable() const; 00245 void lock_analysis(); 00247 bool is_locked() const; 00248 00250 void add_alternative(const std::wstring &, int); 00252 void set_alternatives(const std::list<std::pair<std::wstring,int> > &); 00254 void clear_alternatives(); 00256 bool has_alternatives() const; 00258 const std::list<std::pair<std::wstring,int> >& get_alternatives() const; 00260 std::list<std::pair<std::wstring,int> >& get_alternatives(); 00262 std::list<std::pair<std::wstring,int> >::iterator alternatives_begin(); 00264 std::list<std::pair<std::wstring,int> >::iterator alternatives_end(); 00266 std::list<std::pair<std::wstring,int> >::const_iterator alternatives_begin() const; 00268 std::list<std::pair<std::wstring,int> >::const_iterator alternatives_end() const; 00269 00271 void add_analysis(const analysis &); 00273 void set_analysis(const analysis &); 00275 void set_analysis(const std::list<analysis> &); 00277 void set_form(const std::wstring &); 00279 void set_ph_form(const std::wstring &); 00281 void set_span(unsigned long, unsigned long); 00282 00283 // get/set word position 00284 void set_position(size_t); 00285 size_t get_position() const; 00286 00288 bool find_tag_match(const freeling::regexp &) const; 00289 00291 int get_n_analysis() const; 00293 void unselect_all_analysis(int k=0); 00295 void select_all_analysis(int k=0); 00297 void select_analysis(word::iterator, int k=0); 00299 void unselect_analysis(word::iterator, int k=0); 00301 std::list<analysis> get_analysis() const; 00303 word::iterator analysis_begin(); 00304 word::const_iterator analysis_begin() const; 00306 word::iterator analysis_end(); 00307 word::const_iterator analysis_end() const; 00308 00310 class WINDLL iterator : public std::list<analysis>::iterator { 00311 friend class word::const_iterator; 00312 private: 00314 std::list<analysis>::iterator ibeg; 00316 std::list<analysis>::iterator iend; 00318 int type; 00320 int kbest; 00321 00322 public: 00324 iterator(); 00326 iterator(const word::iterator &); 00328 iterator(const std::list<analysis>::iterator &); 00330 iterator(const std::list<analysis>::iterator &, 00331 const std::list<analysis>::iterator &, 00332 const std::list<analysis>::iterator &, int,int k=0); 00334 iterator& operator++(); 00335 iterator operator++(int); 00336 }; 00337 00339 class WINDLL const_iterator : public std::list<analysis>::const_iterator { 00340 private: 00342 std::list<analysis>::const_iterator ibeg; 00344 std::list<analysis>::const_iterator iend; 00346 int type; 00348 int kbest; 00349 00350 public: 00352 const_iterator(); 00354 const_iterator(const word::const_iterator &); 00356 const_iterator(const word::iterator &); 00358 const_iterator(const std::list<analysis>::const_iterator &); 00360 const_iterator(const std::list<analysis>::iterator &); 00362 const_iterator(const std::list<analysis>::const_iterator &, 00363 const std::list<analysis>::const_iterator &, 00364 const std::list<analysis>::const_iterator &, int, int k=0); 00366 const_iterator& operator++(); 00367 const_iterator operator++(int); 00368 }; 00369 00370 }; 00371 00372 00373 00379 00380 class WINDLL node { 00381 protected: 00383 std::wstring nodeid; 00385 bool head; 00387 int chunk; 00389 std::wstring label; 00391 word * w; 00392 00393 public: 00395 std::vector<std::wstring> user; 00397 node(); 00398 node(const std::wstring &); 00400 const std::wstring& get_node_id() const; 00402 void set_node_id(const std::wstring &); 00404 const std::wstring& get_label() const; 00406 bool has_word() const; 00408 const word & get_word() const; 00410 word & get_word(); 00412 void set_label(const std::wstring &); 00414 void set_word(word &); 00416 bool is_head() const; 00418 void set_head(const bool); 00420 bool is_chunk() const; 00422 void set_chunk(const int); 00424 int get_chunk_ord() const; 00425 00426 }; 00427 00431 00432 class WINDLL parse_tree : public tree<node> { 00433 private: 00434 // access nodes by id 00435 std::map<std::wstring,parse_tree::iterator> node_index; 00436 // acces leaf nodes by word position 00437 std::vector<parse_tree::iterator> word_index; 00438 00439 public: 00440 parse_tree(); 00441 parse_tree(parse_tree::const_iterator p); 00442 parse_tree(const node &); 00443 00445 void build_node_index(const std::wstring &); 00447 void rebuild_node_index(); 00449 parse_tree::const_iterator get_node_by_id(const std::wstring &) const; 00451 parse_tree::const_iterator get_node_by_pos(size_t) const; 00453 parse_tree::iterator get_node_by_id(const std::wstring &); 00455 parse_tree::iterator get_node_by_pos(size_t); 00456 00458 static const word& get_head_word(parse_tree::const_iterator); 00460 static const std::wstring& get_head_label(parse_tree::const_iterator); 00462 static int get_head_position(parse_tree::const_iterator); 00463 00468 static bool C_commands(parse_tree::const_iterator, parse_tree::const_iterator); 00469 00471 static void PrintTree(parse_tree::const_iterator n, int k, int depth); 00472 00474 parse_tree::const_iterator get_subsuming_node(size_t i, size_t j) const; 00475 parse_tree::iterator get_subsuming_node(size_t i, size_t j); 00477 parse_tree::const_iterator get_left_subsuming_node(size_t i, size_t j) const; 00478 parse_tree::iterator get_left_subsuming_node(size_t i, size_t j); 00480 parse_tree::const_iterator get_right_subsuming_node(size_t i, size_t j) const; 00481 parse_tree::iterator get_right_subsuming_node(size_t i, size_t j); 00482 }; 00483 00484 00489 00490 class WINDLL depnode : public node { 00491 00492 private: 00494 parse_tree::iterator link; 00495 00496 public: 00497 depnode(); 00498 depnode(const std::wstring &); 00499 depnode(const node &); 00500 00502 void set_link(const parse_tree::iterator); 00504 parse_tree::iterator get_link(); 00505 parse_tree::const_iterator get_link() const; 00506 }; 00507 00511 00512 class WINDLL dep_tree : public tree<depnode> { 00513 00514 private: 00515 // acces nodes by word position 00516 std::vector<dep_tree::iterator> word_index; 00517 00518 public: 00519 dep_tree(); 00520 dep_tree(const depnode &); 00521 00523 dep_tree::const_iterator get_node_by_pos(size_t) const; 00525 dep_tree::iterator get_node_by_pos(size_t); 00527 void rebuild_node_index(); 00528 00530 static size_t get_first_word(dep_tree::const_iterator); 00531 static size_t get_last_word(dep_tree::const_iterator); 00532 00534 static void PrintDepTree(dep_tree::const_iterator n, int depth); 00535 }; 00536 00537 00538 00544 00545 class processor_status { 00546 public: 00547 processor_status(); 00548 virtual ~processor_status() {}; 00549 }; 00550 00551 00552 00556 00557 class WINDLL argument { 00558 private: 00559 int position; 00560 std::wstring role; 00561 00562 public: 00563 static const std::wstring EMPTY_ROLE; 00564 00565 argument(); 00566 ~argument(); 00567 argument(int p, const std::wstring &r); 00568 00570 int get_position() const; 00571 const std::wstring& get_role() const; 00572 }; 00573 00577 00578 class WINDLL predicate : public std::vector<argument> { 00579 00580 private: 00581 // index to find arguments by word position 00582 std::map<int,int> arg_index; 00583 // position of the predicate head in the sentence 00584 int position; 00585 // propbank sense of the predicate 00586 std::wstring sense; 00587 00588 public: 00589 00590 predicate(); 00591 ~predicate(); 00592 predicate(int p, const std::wstring &s); 00594 predicate(const predicate &); 00596 predicate& operator=(const predicate&); 00597 00599 const std::wstring& get_sense() const; 00601 int get_position() const; 00603 bool has_argument(int p) const; 00605 void add_argument(int p, const std::wstring &r); 00607 const argument & get_argument_by_pos(int p) const; 00608 00609 }; 00610 00611 00617 00618 class WINDLL sentence : public std::list<word> { 00619 00620 public: 00621 typedef std::vector<predicate> predicates; 00622 00623 private: 00624 // sentence identifier, in case user application wants to set it. 00625 std::wstring sent_id; 00626 // vector with pointers to sentence words, for fast access by position 00627 std::vector<word*> wpos; 00628 // remember if it is PoS tagged 00629 bool tagged; 00630 // parse tree (if sentence parsed) 00631 std::map<int,parse_tree> pts; 00632 // dependencey tree (if sentence dep. parsed) 00633 std::map<int,dep_tree> dts; 00634 // clone sentence (used by assignment/copy constructors) 00635 void clone(const sentence &); 00636 // stack processing status for processor currently analyzing the sentence 00637 // (there might be a hierarchy of embeeded processors, thus the stack) 00638 std::list<processor_status*> status; 00639 // store map of predicates by position 00640 predicates preds; 00641 // index to access predicates by word position 00642 std::map<int,int> pred_index; 00643 00644 public: 00645 00646 sentence(); 00647 ~sentence(); 00648 sentence(const std::list<word>&); 00650 sentence(const sentence &); 00652 sentence& operator=(const sentence&); 00654 const word& operator[](size_t) const; 00655 word& operator[](size_t); 00657 unsigned int num_kbest() const; 00659 void push_back(const word &); 00661 void rebuild_word_index(); 00662 00663 void clear(); 00664 00665 void set_sentence_id(const std::wstring &); 00666 const std::wstring& get_sentence_id() const; 00667 00668 void set_is_tagged(bool); 00669 bool is_tagged() const; 00670 00671 void set_parse_tree(const parse_tree &, int k=0); 00672 parse_tree & get_parse_tree(int k=0); 00673 const parse_tree & get_parse_tree(int k=0) const; 00674 bool is_parsed() const; 00675 00676 void set_dep_tree(const dep_tree &, int k=0); 00677 dep_tree & get_dep_tree(int k=0); 00678 const dep_tree & get_dep_tree(int k=0) const; 00679 bool is_dep_parsed() const; 00680 00682 processor_status* get_processing_status(); 00683 const processor_status* get_processing_status() const; 00685 void set_processing_status(processor_status *); 00687 void clear_processing_status(); 00688 00690 std::vector<word> get_words() const; 00692 sentence::iterator words_begin(); 00693 sentence::const_iterator words_begin() const; 00694 sentence::iterator words_end(); 00695 sentence::const_iterator words_end() const; 00696 00697 // obtain iterator to a word given its position 00698 sentence::iterator get_word_iterator(const word &w); 00699 sentence::const_iterator get_word_iterator(const word &w) const; 00700 00701 void add_predicate(const predicate &pr); 00703 bool is_predicate(int p) const; 00705 int get_predicate_number(int p) const; 00707 int get_predicate_position(int n) const; 00709 const predicate& get_predicate_by_pos(int n) const; 00711 const predicate& get_predicate_by_number(int n) const; 00713 const predicates & get_predicates() const; 00714 }; 00715 00720 00721 class WINDLL paragraph : public std::list<sentence> { 00722 private: 00723 std::wstring par_id; 00724 public: 00725 paragraph(); 00726 paragraph(const std::list<sentence> &x); 00727 void set_paragraph_id(const std::wstring &); 00728 const std::wstring & get_paragraph_id() const; 00729 }; 00730 00736 00737 class WINDLL mention { 00738 00739 public: 00740 typedef enum {PROPER_NOUN, PRONOUN, NOUN_PHRASE, COMPOSITE, VERB_PHRASE} mentionType; 00741 // NE type NOTPER is the supertype of ORG, GEO and OTHER 00742 typedef enum {PER, MALE, FEMALE, NOTPER, ORG, GEO, TIME, DATE, MONEY, OTHER} SEMmentionType; 00743 00744 private: 00745 00747 int id; 00749 mentionType mType; 00751 int sent; 00753 paragraph::const_iterator s; 00755 bool initial; 00757 bool single_subsumtion; 00759 parse_tree::const_iterator ptree; 00761 int posBegin; 00763 int posEnd; 00765 sentence::const_iterator itBegin; 00767 sentence::const_iterator itEnd; 00769 sentence::const_iterator itHead; 00771 int chain; 00772 00774 00776 void clone(const mention &); 00779 static void set_tokens(parse_tree::const_iterator, int&, sentence::const_iterator&); 00783 static void set_iterators(sentence::const_iterator, sentence::const_iterator, const parse_tree&, sentence::const_iterator&, parse_tree::const_iterator&); 00784 00785 public: 00786 00788 mention(int, int, paragraph::const_iterator, parse_tree::const_iterator, int, sentence::const_iterator); 00790 mention(int, int, paragraph::const_iterator, sentence::const_iterator, sentence::const_iterator); 00792 mention(const mention &); 00794 mention& operator=(const mention&); 00795 00797 bool operator<(const mention &m) const; 00798 00800 void set_id(int); 00801 void set_type(mentionType); 00802 void set_initial(bool); 00803 void set_group(int); 00804 void subsumed_with_no_verb(bool b=false); 00805 00807 int get_id() const; 00808 int get_n_sentence() const; 00809 paragraph::const_iterator get_sentence() const; 00810 int get_pos_begin() const; 00811 int get_pos_end() const; 00812 sentence::const_iterator get_it_begin() const; 00813 sentence::const_iterator get_it_end() const; 00814 sentence::const_iterator get_it_head() const; 00815 mentionType get_type() const; 00816 int get_group() const; 00817 bool is_type(mentionType) const; 00818 bool is_initial() const; 00819 bool is_subsumed_with_no_verb() const; 00820 parse_tree::const_iterator get_ptree() const; 00821 const word& get_head() const; 00822 std::wstring value(int lc=0) const; 00823 }; 00824 00825 00826 00831 00832 class WINDLL document : public std::list<paragraph> { 00833 00834 private: 00835 static const size_t DIM = 500; 00836 00837 paragraph title; 00838 00839 // potentially coreferring mentions 00840 std::vector<mention> mentions; 00841 // set of mention_ids belonging to the same group 00842 std::multimap<int,int> group2mentions; 00843 // ids (non correlative) for existing groups 00844 std::list<int> groups; 00845 00846 // semantic graph 00847 semgraph::semantic_graph sem_graph; 00848 00849 public: 00850 document(); 00851 ~document(); 00852 00853 bool is_parsed() const; 00854 bool is_dep_parsed() const; 00855 00857 void add_mention(const mention &m); 00858 00865 00866 // count number of words in the document 00867 int get_num_words() const; 00868 00870 int get_num_groups() const; 00872 const std::list<int> & get_groups() const; 00873 00875 std::vector<mention>::const_iterator begin_mentions() const; 00876 std::vector<mention>::iterator begin_mentions(); 00878 std::vector<mention>::const_iterator end_mentions() const; 00879 std::vector<mention>::iterator end_mentions(); 00880 00882 const mention& get_mention(int) const; 00883 00886 00888 std::list<int> get_coref_id_mentions(int) const; 00889 00892 00893 const semgraph::semantic_graph & get_semantic_graph() const; 00894 semgraph::semantic_graph & get_semantic_graph(); 00895 }; 00896 00897 } // namespace 00898 00899 #endif 00900