FreeLing  4.0
summarizer.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU Affero General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    Affero General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU Affero General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _SUMMARIZER
00030 #define _SUMMARIZER
00031 
00032 #include "freeling/windll.h"
00033 #include "freeling/morfo/lexical_chain.h"
00034 #include "freeling/morfo/language.h"
00035 
00036 namespace freeling {
00037 
00042 
00043   class WINDLL summarizer {
00044   public:
00046     summarizer(const std::wstring &datFile);
00047 
00049     ~summarizer();
00050 
00052     std::list<const freeling::sentence*> summarize(const freeling::document &doc,
00053                                                    int num_words) const;
00054 
00055     typedef enum {FIRST_WORD,FIRST_MOST_WEIGHT,WEIGHT_SUM} Heuristics;
00056 
00057   private:
00058 
00060     bool remove_used_lexical_chains;
00062     bool only_strong;
00064     int hypernymy_depth;
00066     double alpha;
00068     std::wstring semdb_path;
00070     std::set<relation*> used_relations;
00072     Heuristics heuristic;
00073 
00075     std::map<std::wstring, std::list<lexical_chain>> build_lexical_chains(const freeling::document &doc) const;
00076 
00078     void remove_one_word_lexical_chains(std::map<std::wstring, std::list<lexical_chain>> &chains) const;
00079 
00081     void remove_weak_lexical_chains(std::map<std::wstring, std::list<lexical_chain>> &chains) const;
00082     
00084     void print_lexical_chains(std::map<std::wstring, std::list<lexical_chain>> &chains) const;
00085     
00087     int count_occurences(const freeling::word &w, const freeling::document &doc) const;
00088     
00090     double average_scores(std::map<std::wstring, std::list<lexical_chain> > &chains_type) const;
00091     
00093     double standard_deviation_scores(std::map<std::wstring, std::list<lexical_chain> > &chains_type,
00094                                      const double avg) const;
00095     
00097     std::list<lexical_chain> map_to_lists(std::map<std::wstring,
00098                                           std::list<lexical_chain> > &chains_type) const;
00099     
00102     void compute_sentence(const std::list<word_pos> &wps, std::list<word_pos> &wp_list,
00103                           std::set<const freeling::sentence*> &sent_set, int &acc_n_words,
00104                           int num_words) const;
00105     
00108     std::list<word_pos> first_word(std::map<std::wstring,
00109                                    std::list<lexical_chain> > &chains_type, int num_words) const;
00110     
00113     std:: list<word_pos> first_most_weighted_word(std::map<std::wstring, 
00114                                                   std::list<lexical_chain> > &chains, 
00115                                                   int num_words) const;
00116     
00119     std::list<word_pos> sum_of_chain_weights(std::map<std::wstring, 
00120                                              std::list<lexical_chain> > &chains, 
00121                                              int num_words) const;
00122 
00123   };
00124 
00125 } // namespace
00126 
00127 #endif