FreeLing  4.0
phd.h
Go to the documentation of this file.
00001 /*
00002  * Phonetic Distance Scorer for the PHAST package
00003  * + See the features.ALL file for input description
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _phd_h
00008 #define _phd_h
00009 
00010 #include <cstdlib>
00011 #include <iostream>
00012 #include <sstream>
00013 #include <fstream>
00014 #include <map>
00015 #include <set>
00016 #include <string>
00017 #include <math.h>
00018 
00019 #include "freeling/morfo/util.h"
00020 
00021 namespace freeling {
00022 
00023 #define ALPHSIZE 128
00024 
00025   template <typename T=int> 
00026     class phd {
00027 
00028   private:
00029   T csub, cexp, cvowel, cskip, cspace;
00030   T distance[ALPHSIZE][ALPHSIZE];
00031   std::set<wchar_t> svowels;      // set of vowel phonemes
00032   std::set<wchar_t> sconsonants;  // set of consonant phonemes
00033   int debug;
00034 
00035   inline T V(wchar_t a){ return svowels.find(a) != svowels.end() ? cvowel : 0; }
00036 
00037   public:
00038 
00039   phd(const std::wstring &fname){
00040 
00041     debug = 0;
00042     std::wstring s;
00043     wchar_t c;
00044     T t;
00045     int i;
00046     std::map<const std::wstring, int> flist;   // set of the features' names with its index
00047     std::map<const std::wstring, T> fweight; // set of the features' saliences
00048     std::map<const std::wstring, T> values;  // set of the numerical values of the multivaluated features
00049     std::set<std::wstring> svfeatures;  // set of attributes for vowel comparison
00050     std::set<std::wstring> scfeatures;  // set of attributes for other comparisons
00051     csub = 0;
00052     cskip = 0;
00053     cexp = 0;
00054     cvowel = 0;
00055 
00056     /**************************************************************
00057      *
00058      * READ INPUT FILES, BUILD MATRIX OF FEATURES
00059      *
00060      **************************************************************/
00061 
00062     T features [ALPHSIZE][ALPHSIZE];
00063 
00064     std::wifstream is;
00065     util::open_utf8_file(is,fname);
00066     if (is.fail()) {
00067       std::wcerr<<L"PHONETIC_DISTANCE: Error opening file "+fname;
00068       exit(1);
00069     }
00070 
00071     while(!is.eof()){
00072 
00073       is >> s;
00074 
00075       if( s[0] == L'#'){ 
00076         getline(is,s);
00077 
00078       } else if( s==L"FON:") {
00079         is >> c;     // this is the phoneme
00080         //cerr << "FONEMA "<< c << endl;
00081         getline(is,s); 
00082         std::wstringstream ss(s,std::stringstream::in);
00083         i = 0;
00084         while(ss>>s){
00085           if(s==L"+"){
00086             features[(int)c][i] = 100;
00087           }else if(s==L"-"){
00088             features[(int)c][i] = 0;
00089           }else{  // is a multivaluated feature
00090             features[(int)c][i] = values[s];
00091           }
00092           //cerr << "Posant " << features[c][i] << " a " << i << " (" << s << ")"<< endl;
00093           i++;
00094         }
00095 
00096       } else if( s==L"VALUE:") {
00097         is >> s >> t; // feature value is i
00098         values[s] = t;
00099         //cerr << "VALUE ADD: " << s << " <-- " << i << endl;
00100 
00101       } else if( s==L"WEIGHT:") {
00102         is >> s >> t; // feature s weights i
00103         fweight[s] = t;
00104 
00105       } else if( s==L"CONSTANT:") {
00106         is >> s >> t; // s takes value i
00107         if (s==L"Cskip")   { cskip = t;}
00108         else if(s==L"Csub"){ csub  = t;}
00109         else if(s==L"Cexp"){  cexp = t;}
00110         else if(s==L"Cvowel"){ cvowel = t;}
00111         else if(s==L"Cspace"){ cspace = t;}
00112         else{ std::wcerr << L"UNEXPECTED CONSTANT DEFINITION" << s << std::endl; }
00113 
00114       } else if( s==L"VOWELS:") {
00115         //create a list with the vocalic phonemes
00116         getline(is,s); 
00117         std::wstringstream ss(s, std::wstringstream::in);
00118         while( ss>>c ){  svowels.insert(c); }
00119 
00120       } else if( s==L"CONSONANTS:") {
00121         //create a set with the consonantic phonemes
00122         getline(is,s); 
00123         std::wstringstream ss(s, std::wstringstream::in);
00124         while( ss>>c ){  sconsonants.insert(c); }
00125 
00126       } else if( s==L"FEATURES:") {
00127         //create a list with the index inside the matrix for each feature
00128         getline(is,s); 
00129         std::wstringstream ss(s, std::wstringstream::in);
00130         i = 0;
00131         while( ss>>s ){ flist[s]=i; i++; }
00132 
00133       } else if( s==L"FVOWELS:") {
00134         //create a set with 
00135         getline(is,s); 
00136         std::wstringstream ss(s, std::wstringstream::in);
00137         while( ss>>s ){ svfeatures.insert(s); }
00138 
00139       } else if( s==L"FOTHER:") {
00140         //create a set with 
00141         getline(is,s); 
00142         std::wstringstream ss(s, std::wstringstream::in);
00143         while( ss>>s ){ scfeatures.insert(s); }
00144 
00145       } else {
00146         //skip
00147       }
00148       
00149     }
00150     
00151     is.close();
00152 
00153 
00154     /**************************************************************
00155      *
00156      * BUILD MATRIX OF DISTANCES
00157      *
00158      **************************************************************/
00159     /*
00160      */
00161     
00162     std::set<wchar_t>::iterator it1;
00163     std::set<wchar_t>::iterator it2;
00164     std::set<std::wstring>::iterator it3;
00165     T d;
00166     int f;
00167 
00168     for(int i=0;i<ALPHSIZE;i++){
00169       for(int j=0;j<ALPHSIZE;j++){
00170         distance[i][j]= i==j ? 0 : (T)8000;
00171       }
00172     }
00173 
00174     //Build vowels vs vowels
00175 
00176     for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00177       for( it2 = svowels.begin(); it2!=it1; ++it2){
00178         //calculate distance between it1 and it2 using features in it3
00179         d=0;
00180         for(it3 = svfeatures.begin(); it3!=svfeatures.end(); ++it3){
00181           f = flist[(*it3)];
00182           d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00183         }
00184         distance[(int)(*it1)][(int)(*it2)] = d;
00185         distance[(int)(*it2)][(int)(*it1)] = d;
00186       }
00187     }
00188 
00189 
00190     //Build vowels vs consonants
00191     for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){
00192       for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00193         //calculate distance between it1 and it2 using features in it3
00194         d=0;
00195         for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){
00196           f = flist[(*it3)];
00197           d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00198         }
00199         distance[(int)(*it1)][(int)(*it2)] = d;
00200         distance[(int)(*it2)][(int)(*it1)] = d;
00201       }
00202     }
00203 
00204 
00205     //Build consonants vs consonants
00206     for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){
00207       for( it2 = sconsonants.begin(); it2!=it1; ++it2){
00208         //calculate distance between it1 and it2 using features in it3
00209         d=0;
00210         for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){
00211           f = flist[(*it3)];
00212           d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00213         }
00214         distance[(int)(*it1)][(int)(*it2)] = d;
00215         distance[(int)(*it2)][(int)(*it1)] = d;
00216       }
00217     }
00218 
00219     if(debug>2){
00220       std::wcerr << L"\t";
00221       for( int i=85; i<ALPHSIZE; i++ ){
00222         std::wcerr << (wchar_t)i << L"\t";
00223       }
00224       std::wcerr << std::endl;
00225 
00226       for( int i=85; i<ALPHSIZE; i++ ){
00227         std::wcerr << (wchar_t)i << L"\t";
00228         for( int j=85; j<ALPHSIZE; j++ ){
00229           std::wcerr << distance[i][j] << L"\t";
00230         }
00231         std::wcerr << std::endl;
00232       }
00233 
00234     }
00235 
00236 
00237   } //constructor
00238 
00239 
00240   void show(std::wostream &o){
00241 
00242     std::set<wchar_t>::iterator it1;
00243     std::set<wchar_t>::iterator it2;
00244     std::set<std::wstring>::iterator it3;
00245 
00246     o << L"Distances between phonemes" << std::endl << L"==========================" << std::endl << std::endl;
00247 
00248     o << L"Read values: cskip:" << cskip << L", csub:" << csub << L", cexp:" << cexp << L", cvowel:" << cvowel << std::endl;
00249 
00250 
00251     o << L"\t";
00252     for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t";
00253     o << std::endl;
00254 
00255     for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00256       o << (*it1) << L"\t";
00257       for( it2 = svowels.begin(); it2!=it1; ++it2){
00258         o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00259       }
00260       o << std::endl;
00261     }
00262 
00263     o << std::endl << L"\t";
00264     for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t";
00265     o << std::endl;
00266 
00267     // vowels vs consonants
00268     for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){
00269       o << (*it2) << L"\t";
00270       for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00271         o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00272       }
00273       o << std::endl;
00274     }
00275 
00276     o << std::endl << L"\t";
00277     for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1) o << (*it1) << L"\t";
00278     o << std::endl;
00279 
00280     // consonants vs consonants
00281     for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){
00282       o << (*it1) << L"\t";
00283       for( it2 = sconsonants.begin(); it2!=it1; ++it2){
00284         o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00285       }
00286       o << std::endl;
00287     }
00288   }
00289 
00290 
00291   T getCskip(){
00292     return cskip;
00293   }
00294 
00295   T dSkip(int c){
00296     return c==L' ' || c==L'_' ? cskip+cspace : cskip;
00297     //return cskip;
00298   }
00299 
00300   T dSub(int const a, int const b){
00301     if( ( (wchar_t)a==L' ' || (wchar_t)a==L'_' ) && ( (wchar_t)b==L' ' || (wchar_t)b==L'_' ) ){ return cspace; }
00302     return (wchar_t)a==L'_' || (wchar_t)a==L' ' || (wchar_t)b==L' ' || (wchar_t)b==L'_' ? -cspace/2 : csub - distance[a][b] - V(a) - V(b);
00303   }
00304 
00305   T dExp(int const a, int const b, int const c){
00306     return cexp - distance[a][b] - distance[a][c] - V(a) - std::max(V(b),V(c));
00307   }
00308   
00309   };
00310 
00311 } // namespace
00312 
00313 #endif