FreeLing 4.0
//////////////////////////////////////////////////////////////////
//
//    FreeLing - Open Source Language Analyzers
//
//    Copyright (C) 2014   TALP Research Center
//                         Universitat Politecnica de Catalunya
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Affero General Public
//    License as published by the Free Software Foundation; either
//    version 3 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Affero General Public License for more details.
//
//    You should have received a copy of the GNU Affero General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//    contact: Lluis Padro (padro@lsi.upc.es)
//             TALP Research Center
//             despatx C6.212 - Campus Nord UPC
//             08034 Barcelona.
SPAIN 00026 // 00028 00029 #ifndef _SMOOTHING_LD 00030 #define _SMOOTHING_LD 00031 00032 #include "freeling/morfo/traces.h" 00033 #include "freeling/morfo/util.h" 00034 #include "freeling/morfo/configfile.h" 00035 00036 #define MOD_TRACENAME L"SMOOTHING" 00037 #define MOD_TRACEMODULE LANGIDENT_TRACE 00038 00039 namespace freeling { 00040 00051 00052 template <class G,class E> 00053 class smoothingLD { 00054 00055 private: 00057 double alpha; 00058 double notalpha; 00060 std::map<G,double> counts; 00061 00062 // probability of unseen unigrams 00063 double pUnseen; 00064 // total number of observations 00065 double nobs; 00066 00068 size_t order; 00069 00071 std::map<std::wstring,E> escapes; 00072 00074 // Get observed counts for given ngram 00075 00076 double count(const G &ngram) const { 00077 typename std::map<G, double>::const_iterator p = counts.find(ngram); 00078 if (p!=counts.end()) return p->second; 00079 else return -1; 00080 } 00081 00082 00083 public: 00084 00087 00088 smoothingLD(const std::wstring &cfgFile, 00089 const std::map<std::wstring,E> &esc=std::map<std::wstring,E>()) : escapes(esc) { 00090 00091 double ntypes=0; 00092 double vsize=0; 00093 nobs=0; 00094 order=0; 00095 00096 enum sections {ORDER,NGRAMS,SMOOTHING}; 00097 config_file cfg(true); 00098 cfg.add_section(L"Order",ORDER,true); 00099 cfg.add_section(L"NGrams",NGRAMS,true); 00100 cfg.add_section(L"Smoothing",SMOOTHING,true); 00101 00102 if (not cfg.open(cfgFile)) 00103 ERROR_CRASH(L"Error opening file "+cfgFile); 00104 00105 std::wstring line; 00106 while (cfg.get_content_line(line)) { 00107 00108 std::wistringstream sin; 00109 sin.str(line); 00110 00111 // process each content line according to the section where it is found 00112 switch (cfg.get_section()) { 00113 00114 case ORDER: {// read order of ngram model 00115 std::wistringstream sin; 00116 sin.str(line); 00117 size_t x; sin>>x; 00118 if (order!=0 and order!=x) 00119 ERROR_CRASH(L"ERROR - Specified model order does not match ngram 
size"); 00120 order = x; 00121 break; 00122 } 00123 00124 case SMOOTHING: { // reading general parameters 00125 std::wstring name; 00126 sin>>name; 00127 if (name==L"LinearDiscountAlpha") { double a; sin>>a; alpha = log(a); notalpha=log(1-a); } 00128 else if (name==L"VocabularySize") sin>>vsize; 00129 else ERROR_CRASH(L"Unexpected smoothing option '"+name+L"'"); 00130 break; 00131 } 00132 00133 case NGRAMS: { // reading ngram counts 00134 // read counts for this ngram to table 00135 double c; sin >> c; 00136 // read ngram components into a G object. 00137 G ngram; 00138 std::wstring w; 00139 while (sin >> w) { 00140 typename std::map<std::wstring,E>::const_iterator p=escapes.find(w); 00141 if (p!=escapes.end()) 00142 ngram.push_back(p->second); 00143 else 00144 ngram.push_back(util::wstring_to<E>(w)); 00145 } 00146 00147 if (order==0) order = ngram.size(); 00148 else if (order != ngram.size()) 00149 ERROR_CRASH(L"ERROR - Mixed order ngrams in input file, or specified model order does not match ngram size"); 00150 00151 // add ngram (and n-i gram) counts to the model 00152 while (ngram.size()>1) { 00153 // insert ngram count, or increase if it already existed 00154 std::pair<typename std::map<G,double>::iterator,bool> x = counts.insert(make_pair(ngram,c)); 00155 if (not x.second) x.first->second += c; 00156 // shorten n gram and loop to insert n-1 gram 00157 ngram.erase(std::prev(ngram.end())); 00158 } 00159 // unigram is left. 
Add it 00160 std::pair<typename std::map<G,double>::iterator,bool> x = counts.insert(make_pair(ngram,c)); 00161 if (x.second) ntypes++; // new unigram inserted, increase type count 00162 else x.first->second += c; // existing unigram, increase count 00163 00164 // update total observation counts 00165 nobs += c; 00166 break; 00167 } 00168 00169 default: break; 00170 } 00171 } 00172 cfg.close(); 00173 00174 // precompute logs needed for logprob 00175 if (vsize<=ntypes) 00176 ERROR_CRASH(L"VocabularySize can not be smaller than number of different observed unigrams."); 00177 pUnseen = -log(vsize-ntypes); // log version of 1/(vsize-ntypes) 00178 nobs = log(nobs); 00179 for (typename std::map<G,double>::iterator c=counts.begin(); c!=counts.end(); c++) 00180 c->second = log(c->second); 00181 } 00182 00185 00186 ~smoothingLD() {} 00187 00191 00192 double Prob(const G &ngram, const E &z) const { 00193 00194 // seq = n+1 gram ( ngram + z ) 00195 G seq = ngram; 00196 seq.push_back(z); 00197 // log count of complete ngram 00198 double c = count(seq); 00199 00200 if (ngram.size()==0) { 00201 // no conditioning history, use unigrams (seq = [z]) 00202 if (c>=0) return notalpha + c - nobs; // log version of (1-alpha)*count(c)/nobs 00203 else return alpha + pUnseen; // log version of alpha * punseen 00204 } 00205 00206 else { 00207 // conditioning history, use LD smoothing 00208 if (c>=0) return notalpha + c - count(ngram); // log version of (1-alpha)*count(c)/count(ngram) 00209 else { 00210 // shorten history and recurse 00211 G sht = ngram; sht.erase(sht.begin()); 00212 return alpha + Prob(sht,z); // log version of alpha * Prob(sht,z) 00213 } 00214 } 00215 } 00216 }; 00217 00218 } // namespace 00219 00220 #undef MOD_TRACENAME 00221 #undef MOD_TRACECODE 00222 00223 #endif 00224