FreeLing
4.0
|
00001 /* 00002 * Phonetic Distance Scorer for the PHAST package 00003 * + See the features.ALL file for input description 00004 * + April 2006 pcomas@lsi.upc.edu 00005 * 00006 */ 00007 #ifndef _phd_h 00008 #define _phd_h 00009 00010 #include <cstdlib> 00011 #include <iostream> 00012 #include <sstream> 00013 #include <fstream> 00014 #include <map> 00015 #include <set> 00016 #include <string> 00017 #include <math.h> 00018 00019 #include "freeling/morfo/util.h" 00020 00021 namespace freeling { 00022 00023 #define ALPHSIZE 128 00024 00025 template <typename T=int> 00026 class phd { 00027 00028 private: 00029 T csub, cexp, cvowel, cskip, cspace; 00030 T distance[ALPHSIZE][ALPHSIZE]; 00031 std::set<wchar_t> svowels; // set of vowel phonemes 00032 std::set<wchar_t> sconsonants; // set of consonant phonemes 00033 int debug; 00034 00035 inline T V(wchar_t a){ return svowels.find(a) != svowels.end() ? cvowel : 0; } 00036 00037 public: 00038 00039 phd(const std::wstring &fname){ 00040 00041 debug = 0; 00042 std::wstring s; 00043 wchar_t c; 00044 T t; 00045 int i; 00046 std::map<const std::wstring, int> flist; // set of the features' names with its index 00047 std::map<const std::wstring, T> fweight; // set of the features' saliences 00048 std::map<const std::wstring, T> values; // set of the numerical values of the multivaluated features 00049 std::set<std::wstring> svfeatures; // set of attributes for vowel comparison 00050 std::set<std::wstring> scfeatures; // set of attributes for other comparisons 00051 csub = 0; 00052 cskip = 0; 00053 cexp = 0; 00054 cvowel = 0; 00055 00056 /************************************************************** 00057 * 00058 * READ INPUT FILES, BUILD MATRIX OF FEATURES 00059 * 00060 **************************************************************/ 00061 00062 T features [ALPHSIZE][ALPHSIZE]; 00063 00064 std::wifstream is; 00065 util::open_utf8_file(is,fname); 00066 if (is.fail()) { 00067 std::wcerr<<L"PHONETIC_DISTANCE: Error opening file "+fname; 00068 exit(1); 00069 } 00070 00071 while(!is.eof()){ 00072 00073 is >> s; 00074 00075 if( s[0] == L'#'){ 00076 getline(is,s); 00077 00078 } else if( s==L"FON:") { 00079 is >> c; // this is the phoneme 00080 //cerr << "FONEMA "<< c << endl; 00081 getline(is,s); 00082 std::wstringstream ss(s,std::stringstream::in); 00083 i = 0; 00084 while(ss>>s){ 00085 if(s==L"+"){ 00086 features[(int)c][i] = 100; 00087 }else if(s==L"-"){ 00088 features[(int)c][i] = 0; 00089 }else{ // is a multivaluated feature 00090 features[(int)c][i] = values[s]; 00091 } 00092 //cerr << "Posant " << features[c][i] << " a " << i << " (" << s << ")"<< endl; 00093 i++; 00094 } 00095 00096 } else if( s==L"VALUE:") { 00097 is >> s >> t; // feature value is i 00098 values[s] = t; 00099 //cerr << "VALUE ADD: " << s << " <-- " << i << endl; 00100 00101 } else if( s==L"WEIGHT:") { 00102 is >> s >> t; // feature s weights i 00103 fweight[s] = t; 00104 00105 } else if( s==L"CONSTANT:") { 00106 is >> s >> t; // s takes value i 00107 if (s==L"Cskip") { cskip = t;} 00108 else if(s==L"Csub"){ csub = t;} 00109 else if(s==L"Cexp"){ cexp = t;} 00110 else if(s==L"Cvowel"){ cvowel = t;} 00111 else if(s==L"Cspace"){ cspace = t;} 00112 else{ std::wcerr << L"UNEXPECTED CONSTANT DEFINITION" << s << std::endl; } 00113 00114 } else if( s==L"VOWELS:") { 00115 //create a list with the vocalic phonemes 00116 getline(is,s); 00117 std::wstringstream ss(s, std::wstringstream::in); 00118 while( ss>>c ){ svowels.insert(c); } 00119 00120 } else if( s==L"CONSONANTS:") { 00121 //create a set with the consonantic phonemes 00122 getline(is,s); 00123 std::wstringstream ss(s, std::wstringstream::in); 00124 while( ss>>c ){ sconsonants.insert(c); } 00125 00126 } else if( s==L"FEATURES:") { 00127 //create a list with the index inside the matrix for each feature 00128 getline(is,s); 00129 std::wstringstream ss(s, std::wstringstream::in); 00130 i = 0; 00131 while( ss>>s ){ flist[s]=i; i++; } 00132 00133 } else if( s==L"FVOWELS:") { 00134 //create a set with 00135 getline(is,s); 00136 std::wstringstream ss(s, std::wstringstream::in); 00137 while( ss>>s ){ svfeatures.insert(s); } 00138 00139 } else if( s==L"FOTHER:") { 00140 //create a set with 00141 getline(is,s); 00142 std::wstringstream ss(s, std::wstringstream::in); 00143 while( ss>>s ){ scfeatures.insert(s); } 00144 00145 } else { 00146 //skip 00147 } 00148 00149 } 00150 00151 is.close(); 00152 00153 00154 /************************************************************** 00155 * 00156 * BUILD MATRIX OF DISTANCES 00157 * 00158 **************************************************************/ 00159 /* 00160 */ 00161 00162 std::set<wchar_t>::iterator it1; 00163 std::set<wchar_t>::iterator it2; 00164 std::set<std::wstring>::iterator it3; 00165 T d; 00166 int f; 00167 00168 for(int i=0;i<ALPHSIZE;i++){ 00169 for(int j=0;j<ALPHSIZE;j++){ 00170 distance[i][j]= i==j ? 0 : (T)8000; 00171 } 00172 } 00173 00174 //Build vowels vs vowels 00175 00176 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00177 for( it2 = svowels.begin(); it2!=it1; ++it2){ 00178 //calculate distance between it1 and it2 using features in it3 00179 d=0; 00180 for(it3 = svfeatures.begin(); it3!=svfeatures.end(); ++it3){ 00181 f = flist[(*it3)]; 00182 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00183 } 00184 distance[(int)(*it1)][(int)(*it2)] = d; 00185 distance[(int)(*it2)][(int)(*it1)] = d; 00186 } 00187 } 00188 00189 00190 //Build vowels vs consonants 00191 for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){ 00192 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00193 //calculate distance between it1 and it2 using features in it3 00194 d=0; 00195 for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){ 00196 f = flist[(*it3)]; 00197 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00198 } 00199 distance[(int)(*it1)][(int)(*it2)] = d; 00200 distance[(int)(*it2)][(int)(*it1)] = d; 00201 } 00202 } 00203 00204 00205 //Build consonants vs consonants 00206 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){ 00207 for( it2 = sconsonants.begin(); it2!=it1; ++it2){ 00208 //calculate distance between it1 and it2 using features in it3 00209 d=0; 00210 for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){ 00211 f = flist[(*it3)]; 00212 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00213 } 00214 distance[(int)(*it1)][(int)(*it2)] = d; 00215 distance[(int)(*it2)][(int)(*it1)] = d; 00216 } 00217 } 00218 00219 if(debug>2){ 00220 std::wcerr << L"\t"; 00221 for( int i=85; i<ALPHSIZE; i++ ){ 00222 std::wcerr << (wchar_t)i << L"\t"; 00223 } 00224 std::wcerr << std::endl; 00225 00226 for( int i=85; i<ALPHSIZE; i++ ){ 00227 std::wcerr << (wchar_t)i << L"\t"; 00228 for( int j=85; j<ALPHSIZE; j++ ){ 00229 std::wcerr << distance[i][j] << L"\t"; 00230 } 00231 std::wcerr << std::endl; 00232 } 00233 00234 } 00235 00236 00237 } //constructor 00238 00239 00240 void show(std::wostream &o){ 00241 00242 std::set<wchar_t>::iterator it1; 00243 std::set<wchar_t>::iterator it2; 00244 std::set<std::wstring>::iterator it3; 00245 00246 o << L"Distances between phonemes" << std::endl << L"==========================" << std::endl << std::endl; 00247 00248 o << L"Read values: cskip:" << cskip << L", csub:" << csub << L", cexp:" << cexp << L", cvowel:" << cvowel << std::endl; 00249 00250 00251 o << L"\t"; 00252 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t"; 00253 o << std::endl; 00254 00255 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00256 o << (*it1) << L"\t"; 00257 for( it2 = svowels.begin(); it2!=it1; ++it2){ 00258 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00259 } 00260 o << std::endl; 00261 } 00262 00263 o << std::endl << L"\t"; 00264 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t"; 00265 o << std::endl; 00266 00267 // vowels vs consonants 00268 for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){ 00269 o << (*it2) << L"\t"; 00270 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00271 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00272 } 00273 o << std::endl; 00274 } 00275 00276 o << std::endl << L"\t"; 00277 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1) o << (*it1) << L"\t"; 00278 o << std::endl; 00279 00280 // consonants vs consonants 00281 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){ 00282 o << (*it1) << L"\t"; 00283 for( it2 = sconsonants.begin(); it2!=it1; ++it2){ 00284 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00285 } 00286 o << std::endl; 00287 } 00288 } 00289 00290 00291 T getCskip(){ 00292 return cskip; 00293 } 00294 00295 T dSkip(int c){ 00296 return c==L' ' || c==L'_' ? cskip+cspace : cskip; 00297 //return cskip; 00298 } 00299 00300 T dSub(int const a, int const b){ 00301 if( ( (wchar_t)a==L' ' || (wchar_t)a==L'_' ) && ( (wchar_t)b==L' ' || (wchar_t)b==L'_' ) ){ return cspace; } 00302 return (wchar_t)a==L'_' || (wchar_t)a==L' ' || (wchar_t)b==L' ' || (wchar_t)b==L'_' ? -cspace/2 : csub - distance[a][b] - V(a) - V(b); 00303 } 00304 00305 T dExp(int const a, int const b, int const c){ 00306 return cexp - distance[a][b] - distance[a][c] - V(a) - std::max(V(b),V(c)); 00307 } 00308 00309 }; 00310 00311 } // namespace 00312 00313 #endif