FreeLing  4.0
smoothingLD.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU Affero General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    Affero General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU Affero General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _SMOOTHING_LD
00030 #define _SMOOTHING_LD
00031 
#include <cmath>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <utility>

#include "freeling/morfo/traces.h"
#include "freeling/morfo/util.h"
#include "freeling/morfo/configfile.h"

#define MOD_TRACENAME L"SMOOTHING"
// FIX: the tracing macros in traces.h read MOD_TRACECODE, and this file's
// footer does `#undef MOD_TRACECODE`; the previous name MOD_TRACEMODULE was
// never read and leaked past the #undef.
#define MOD_TRACECODE LANGIDENT_TRACE

00039 namespace freeling {
00040   
00051 
00052   template <class G,class E> 
00053   class smoothingLD {
00054     
00055   private:
00057     double alpha; 
00058     double notalpha;
00060     std::map<G,double> counts;
00061     
00062     // probability of unseen unigrams
00063     double pUnseen;
00064     // total number of observations
00065     double nobs; 
00066 
00068     size_t order;
00069 
00071     std::map<std::wstring,E> escapes;
00072     
00074     // Get observed counts for given ngram
00075     
00076     double count(const G &ngram) const {
00077       typename std::map<G, double>::const_iterator p = counts.find(ngram);
00078       if (p!=counts.end()) return p->second;
00079       else return -1;
00080     }
00081     
00082         
00083   public:
00084     
00087     
00088     smoothingLD(const std::wstring &cfgFile, 
00089                 const std::map<std::wstring,E> &esc=std::map<std::wstring,E>()) : escapes(esc) { 
00090         
00091       double ntypes=0; 
00092       double vsize=0;
00093       nobs=0; 
00094       order=0;
00095       
00096       enum sections {ORDER,NGRAMS,SMOOTHING};
00097       config_file cfg(true);  
00098       cfg.add_section(L"Order",ORDER,true);
00099       cfg.add_section(L"NGrams",NGRAMS,true);
00100       cfg.add_section(L"Smoothing",SMOOTHING,true);
00101       
00102       if (not cfg.open(cfgFile))
00103         ERROR_CRASH(L"Error opening file "+cfgFile);
00104       
00105       std::wstring line; 
00106       while (cfg.get_content_line(line)) {
00107         
00108         std::wistringstream sin;
00109         sin.str(line);
00110         
00111         // process each content line according to the section where it is found
00112         switch (cfg.get_section()) {
00113           
00114         case ORDER: {// read order of ngram model
00115           std::wistringstream sin;
00116           sin.str(line);
00117           size_t x; sin>>x;
00118           if (order!=0 and order!=x)
00119             ERROR_CRASH(L"ERROR - Specified model order does not match ngram size");
00120           order = x;
00121           break;
00122         }
00123 
00124         case SMOOTHING: { // reading general parameters
00125           std::wstring name;
00126           sin>>name;
00127           if (name==L"LinearDiscountAlpha") { double a; sin>>a; alpha = log(a); notalpha=log(1-a); }
00128           else if (name==L"VocabularySize") sin>>vsize;
00129           else ERROR_CRASH(L"Unexpected smoothing option '"+name+L"'");
00130           break;
00131         }
00132           
00133         case NGRAMS: {  // reading ngram counts
00134           // read counts for this ngram to table
00135           double c; sin >> c;
00136           // read ngram components into a G object.
00137           G ngram;
00138           std::wstring w; 
00139           while (sin >> w) {
00140             typename std::map<std::wstring,E>::const_iterator p=escapes.find(w);
00141             if (p!=escapes.end())                
00142               ngram.push_back(p->second);
00143             else
00144               ngram.push_back(util::wstring_to<E>(w));
00145           }
00146 
00147           if (order==0) order = ngram.size();
00148           else if (order != ngram.size()) 
00149             ERROR_CRASH(L"ERROR - Mixed order ngrams in input file, or specified model order does not match ngram size");
00150 
00151           // add ngram (and n-i gram) counts to the model
00152           while (ngram.size()>1) {
00153             // insert ngram count, or increase if it already existed
00154             std::pair<typename std::map<G,double>::iterator,bool> x = counts.insert(make_pair(ngram,c));
00155             if (not x.second) x.first->second += c;            
00156             // shorten n gram and loop to insert n-1 gram
00157             ngram.erase(std::prev(ngram.end()));
00158           }
00159           // unigram is left. Add it
00160           std::pair<typename std::map<G,double>::iterator,bool> x = counts.insert(make_pair(ngram,c));
00161           if (x.second) ntypes++; // new unigram inserted, increase type count
00162           else x.first->second += c;  // existing unigram, increase count
00163 
00164           // update total observation counts
00165           nobs += c;
00166           break;
00167         }
00168           
00169         default: break;
00170         }
00171       }
00172       cfg.close();       
00173 
00174       // precompute logs needed for logprob
00175       if (vsize<=ntypes)
00176         ERROR_CRASH(L"VocabularySize can not be smaller than number of different observed unigrams.");
00177       pUnseen = -log(vsize-ntypes); // log version of 1/(vsize-ntypes)
00178       nobs = log(nobs);
00179       for (typename std::map<G,double>::iterator c=counts.begin(); c!=counts.end(); c++) 
00180         c->second = log(c->second); 
00181     }
00182     
00185     
00186     ~smoothingLD() {}
00187          
00191     
00192     double Prob(const G &ngram, const E &z) const {
00193       
00194       // seq =  n+1 gram ( ngram + z )
00195       G seq = ngram; 
00196       seq.push_back(z);
00197       // log count of complete ngram
00198       double c = count(seq);
00199 
00200       if (ngram.size()==0) {
00201         // no conditioning history, use unigrams (seq = [z])
00202         if (c>=0) return notalpha + c - nobs;  // log version of (1-alpha)*count(c)/nobs
00203         else return alpha + pUnseen;   // log version of alpha * punseen
00204       }
00205       
00206       else {
00207         // conditioning history, use LD smoothing
00208         if (c>=0) return notalpha + c - count(ngram); // log version of (1-alpha)*count(c)/count(ngram)
00209         else {
00210           // shorten history and recurse
00211           G sht = ngram; sht.erase(sht.begin());
00212           return alpha + Prob(sht,z);  // log version of alpha * Prob(sht,z)
00213         }
00214       }
00215     }
00216   };
00217   
00218 } // namespace
00219 
00220 #undef MOD_TRACENAME
00221 #undef MOD_TRACECODE
00222 
00223 #endif
00224