FreeLing
4.0
|
00001 /* 00002 * String aligner for the Phonetic Distance Package 00003 * + 00004 * + April 2006 pcomas@lsi.upc.edu 00005 * 00006 */ 00007 #ifndef _aligner_h 00008 #define _aligner_h 00009 00010 #include <iostream> 00011 #include <fstream> 00012 #include <sstream> 00013 #include <map> 00014 #include <set> 00015 #include <string> 00016 #include <math.h> 00017 00018 #include "freeling/morfo/phd.h" 00019 00020 #define GLOBAL 1 // The aligner produces local alignments 00021 #define SEMILOCAL 2 // The aligner produces semilocal alignments 00022 #define LOCAL 3 // The aligner produces global alignments 00023 00024 namespace freeling { 00025 00026 template<typename T=int> 00027 class aligner{ 00028 00029 private: 00030 phd<T>* sc; 00031 T score; 00032 int debug; 00033 00034 public: 00035 00036 struct alin { 00037 T score; // score of this alignment 00038 double scoren; // score normalized 00039 double context; 00040 int Psubstitutions; 00041 int Pinserts; 00042 int Pdeletions; 00043 int Wsubstitutions; 00044 int Winserts; 00045 int Wdeletions; 00046 unsigned int kword; //The number of keyword involved in this alignment 00047 int begin; // index in b[] where the alignment starts 00048 int end; // index in b[] where the alignment ends 00049 int beginW; //number of word in b[] where the alignment starts (each ' ' changes the word) 00050 int endW; //number of word in b[] where the alignment ends 00051 wchar_t* seg; // Segment of b[] with the letters 00052 wchar_t* a; // Alignment of a 00053 wchar_t* b; // Alignment of b 00054 bool good; // Marks if this alignment is selected as relevant 00055 00056 ~alin(){ 00057 delete[](a); 00058 delete[](b); 00059 delete[](seg); 00060 } 00061 00062 alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, wchar_t* _a, wchar_t* _b) : 00063 score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {} 00064 }; 00065 00066 00067 aligner(const std::wstring &fname, int const _debug = 0){ 00068 sc = new phd<T>(fname); 00069 debug=_debug; 00070 // if(debug>3){ sc->show(cerr);} 00071 } //constructor 00072 00073 ~aligner(){ 00074 delete(sc); 00075 } 00076 00077 00078 /* 00079 * Alinia la cadena A contra la cadena B. Conceptualment es considera que A és la query 00080 */ 00081 alin* align(const std::wstring &wa, const std::wstring &wb, const int mode = SEMILOCAL){ 00082 // a is the short string with length tj 00083 // b is the long string with length ti 00084 // The algorithm searchs for the best match of a against b in a semi-local or global point of view. 00085 // Usage of global matching is discouraged for the sake of coherence :-) because it doesn't try 00086 // to keep the letters together so 'a' will be meaningless scattered through all 'b' 00087 00088 const wchar_t *a=wa.c_str(); const int tj=wa.size(); 00089 const wchar_t *b=wb.c_str(); const int ti=wb.size(); 00090 00091 /* 00092 // Build the align matrix 00093 */ 00094 int const W = ti+1; 00095 00096 if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); } 00097 int i,j; 00098 int* m = new int[W*(tj+1)+ti+1]; //int m[tj+1][ti+1]; 00099 00100 // Variables for alignment reconstruction 00101 wchar_t* answerA = new wchar_t[tj+ti]; 00102 wchar_t* answerB = new wchar_t[tj+ti]; 00103 int pA = 0; 00104 int pB = 0; 00105 int insertions=0, deletions=0, substitutions=0; 00106 int spacesA=0; 00107 00108 00109 // INITIALIZATION STEP 00110 00111 // Decide the number of word that each character belongs to 00112 int nwords=0; 00113 for(int i=0; i<ti; i++){ 00114 if( b[i] == L' ' || b[i] == L'_' ){ 00115 // word change at _i_ 00116 nwords++; 00117 } 00118 } 00119 00120 int* words = new int[nwords]; 00121 j=0; 00122 for(int i=0; i<ti; i++){ 00123 if( b[i] == L' ' || b[i] == L'_' ){ 00124 words[j++] = i; 00125 } 00126 } 00127 00128 switch(mode){ 00129 case GLOBAL: //Start values are skips 00130 m[0]=0; 00131 for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); } 00132 for(int i=1; i<=ti; i++){ m[i] = m[i-1] + sc->dSkip(b[i-1]); } 00133 break; 00134 default: //Start values are 0 00135 for(int j=0;j<=tj;j++){ m[j*W] = 0; } 00136 for(int i=1;i<=ti;i++){ m[i] = 0; } 00137 break; 00138 } 00139 00140 00141 /* 00142 // Throw the scoring process 00143 */ 00144 int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI; 00145 indexEnd = 0; 00146 indexInit = 0; 00147 int initWord = 0; 00148 int endWord = 0; 00149 score = -100000000; 00150 bestJ = tj; 00151 bestI = ti; 00152 00153 // FILL THE FIRST COLUMN & ROW 00154 for(j=1;j<=tj;j++){ 00155 m[j*W+1] = std::max( m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) ); 00156 if(a[j-1]==L' '|| a[j-1]==L'_' ){spacesA++;} 00157 if(score < m[W*j+1]){ 00158 score = m[W*j+1]; 00159 bestJ = j; 00160 bestI = 0; 00161 } 00162 } 00163 for(i=1;i<=ti;i++){ 00164 m[ W+i ] = std::max( m[ i-1 ]+sc->dSub(a[0],b[i-1]) , m[ W+i-1 ]+sc->dSkip(b[i-1]) ); 00165 if(score < m[W+i]){ 00166 score = m[W+i]; 00167 bestJ = 0; 00168 bestI = i; 00169 } 00170 } 00171 00172 //FILL THE REST OF THE MATRIX 00173 //int step = ti/4; 00174 int lowLimit = 0; 00175 int upperLimit = 0; 00176 int threshold = -10000000; 00177 if( mode == LOCAL ){ threshold = 0; } 00178 00179 for(j=2;j<=tj;j++){ 00180 //if( mode==GLOBAL && tj/ti> 0.75){ 00181 // // If the scoring is GLOBAL we do not fill the entire matrix 00182 // lowLimit = max( 2 , j-step ); 00183 // upperLimit = min( ti , j+step ); 00184 // m[W*j+lowLimit+1] = 0; 00185 // } else { 00186 lowLimit = 2; 00187 upperLimit = ti; 00188 // } 00189 00190 for(i=lowLimit;i<=upperLimit;i++){ 00191 i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]); 00192 i2 = m[W*j+i-1] + sc->dSkip(b[i-1]); 00193 i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]); 00194 i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]); 00195 i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]); 00196 m[W*j+i] = std::max(threshold,std::max(i1,std::max(i1,std::max(i2,std::max(i3,std::max(i4,i5)))))); 00197 00198 if(score < m[W*j+i]){ 00199 score = m[W*j+i]; 00200 bestJ = j; 00201 bestI = i; 00202 } 00203 00204 } 00205 } 00206 00207 //Prints the full score matrix if needed 00208 /*if(debug>5){ 00209 cerr << endl << "\t"; 00210 for(i=0;i<ti;i++){ cerr << "\t" << b[i]; } 00211 cerr << endl << "\t"; 00212 for(j=0;j<=tj;j++){ 00213 if(j>0) cerr << a[j-1] << "\t"; 00214 for(i=0;i<=ti;i++){ 00215 cerr << m[W*j+i] << "\t"; 00216 } 00217 cerr << endl; 00218 } 00219 }*/ 00220 00221 00222 /* 00223 || Reconstruction Step 00224 */ 00225 bool final = true; //Exit condition 00226 int lastScore = score; 00227 int steps = 0; //Number of steps uset to align 'a' with 'b' 00228 int lastChar = 0; 00229 00230 00231 int ni=0; 00232 int mymax=0; 00233 00234 switch(mode){ 00235 case SEMILOCAL: 00236 /* In semi-local alignment, the best-path reconstructions doesn't start from the 00237 * botom-right corner of the matrix but from the best-scoring cell among last 00238 * column and last row scores. 00239 * In this implementation only the last row is taken in account since it is 00240 * the "long" string 00241 */ 00242 for(i=0;i<=ti;i++){ 00243 if( mymax < m[W*tj+i] ){ 00244 mymax = m[W*tj+i]; 00245 ni=i; 00246 } 00247 } 00248 score = mymax; 00249 for(i=ti-1;i>=ni;i--){ // Carry on half of the string for debugging purpouses 00250 answerA[pA++]= L'-'; 00251 answerB[pB++]= b[i]; 00252 insertions++; 00253 } 00254 i=ni; j=tj; 00255 break; 00256 case GLOBAL: // now this is global alignment 00257 i = ti; j = tj; 00258 break; 00259 case LOCAL: 00260 /* In LOCAL alignment, the best-path reconstruction starts with at the best scoring 00261 * position in the whole matrix. 00262 */ 00263 j = bestJ; 00264 i = bestI; 00265 break; 00266 } 00267 00268 00269 while( final ){ 00270 00271 /*if(debug>5){ 00272 cerr << "Looking for " << m[ W*j +i] << " in ("<<j<<","<<i<<")"<<endl; 00273 cerr << "m[j-1][i-1]=" << m[ W*(j-1) +i-1] << ": dSub=" << sc->dSub(a[j-1],b[i-1]) << endl; 00274 cerr << "m[j][i-1]=" << m[ W*j +i-1] << ": dSkipB=" << sc->dSkip(b[i-1]) << endl; 00275 cerr << "m[j-1][i]=" << m[ W*(j-1) +i] << ": dSkipA=" << sc->dSkip(a[j-1]) << endl; 00276 if(j>1){ cerr << "m[j-1][i-2]=" << m[ W*(j-1) +i-2] << ": dExp=" << sc->dExp(a[j-1],b[i-2],b[i-1]) << endl; } 00277 if(i>1){ cerr << "m[j-2][i-1]=" << m[ W*(j-2) +i-1] << ": dExp=" << sc->dExp(b[i-1],a[j-2],a[j-1]) << endl; } 00278 }*/ 00279 00280 00281 if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){ 00282 //Aliniar b[i] amb a[j] 00283 lastScore = sc->dSub(a[j-1],b[i-1]); 00284 i--; j--; 00285 //cerr << "Sub a[" << j << "]=" << a[j] << " b[" << i << "]=" << b[i] << " @ " << pA << endl; 00286 answerA[pA++] = a[j]; 00287 answerB[pB++] = b[i]; 00288 indexInit= i; 00289 indexEnd = std::max(i,indexEnd); 00290 if( a[j]!=b[i] ){ substitutions++; } 00291 steps++; 00292 lastChar = steps; 00293 00294 } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) { 00295 lastScore = sc->dSkip(b[i-1]); 00296 i--; 00297 answerA[pA++] = L'-'; 00298 answerB[pB++] = b[i]; 00299 insertions++; 00300 if(steps==0 && b[i]==L' ' && b[i]==L'_' ){ 00301 score = m[W*j+i]; 00302 } else { 00303 steps++; 00304 } 00305 00306 } else if( j>1 && i>0 && m[W*j+i] == m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){ 00307 lastScore = sc->dExp(b[i-1],a[j-2],a[j-1]); 00308 i--; j-=2; 00309 answerA[pA++] = a[j]; 00310 answerA[pA++] = a[j+1]; 00311 answerB[pB++] = b[i]; 00312 answerB[pB++] = L'+'; 00313 indexInit = i; 00314 indexEnd = std::max(i,indexEnd); 00315 deletions++; 00316 if( a[j]!=b[i] ){ substitutions++; } 00317 steps+=2; 00318 lastChar = steps; 00319 00320 } else if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){ 00321 lastScore = sc->dSkip(a[j-1]); 00322 j--; 00323 answerA[pA++] = a[j]; 00324 answerB[pB++] = L'-'; 00325 indexInit = i; 00326 indexEnd = std::max(i,indexEnd); 00327 deletions++; 00328 steps++; 00329 lastChar = steps; 00330 00331 } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) { 00332 lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]); 00333 j--; i--; 00334 answerA[pA++] = a[j]; 00335 answerA[pA++] = L'+'; 00336 answerB[pB++] = b[i]; 00337 answerB[pB++] = b[i-1]; 00338 indexInit = i; 00339 indexEnd = std::max(i,indexEnd); 00340 insertions++; 00341 if( a[j]!=b[i] ){ substitutions++; } 00342 i--; 00343 steps+=2; 00344 lastChar = steps; 00345 00346 } else if ( j==0 ){ 00347 i--; 00348 answerA[pA++] = L'-'; 00349 answerB[pB++] = b[i]; 00350 insertions++; 00351 if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++; 00352 00353 } else if ( i==0 ){ 00354 j--; 00355 answerA[pA++] = a[j]; 00356 answerB[pB++] = L'-'; 00357 deletions++; 00358 00359 if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++; 00360 lastChar = steps; 00361 00362 } else { 00363 std::wcerr << L"BOINK! Error at "<< j << L"," << i << std::endl; 00364 break; 00365 } 00366 00367 // This is the termination condition 00368 switch(mode){ 00369 case SEMILOCAL: 00370 final = j!=0; 00371 if(!final){ 00372 i--; 00373 while(i>=0){ 00374 answerA[pA++] = L'-'; 00375 answerB[pB++] = b[i--]; 00376 insertions++; 00377 } 00378 } 00379 break; 00380 00381 case GLOBAL: 00382 final = i!=0 || j!=0; 00383 break; 00384 00385 case LOCAL: 00386 final = m[W*j+i]!=0; 00387 break; 00388 } 00389 } 00390 00391 00392 // Computing of the score 00393 // Score= avg.points/phoneme 00394 steps = lastChar; 00395 00396 switch(mode){ 00397 case GLOBAL: 00398 // average of similarity per phonem (without spaces?) 00399 score /= (pA-spacesA); 00400 break; 00401 case SEMILOCAL: 00402 //if(debug>2) cerr << " Score = " << score << " Last Score = " << lastScore << " steps = " << steps << endl; 00403 //score = (score-lastScore) / steps; 00404 score = score / steps; 00405 break; 00406 case LOCAL: 00407 score = score / steps; 00408 break; 00409 } 00410 00411 // Print the alignment 00412 wchar_t* newA = new wchar_t[pA+1]; 00413 wchar_t* newB = new wchar_t[pB+1]; 00414 newA[pA]=0; 00415 newB[pB]=0; 00416 --pA; 00417 --pB; 00418 00419 for(int i=0; pA>=0; ++i, --pA){ 00420 newA[i] = answerA[pA]; 00421 } 00422 for(int i=0; pB>=0; ++i, --pB){ 00423 newB[i] = answerB[pB]; 00424 } 00425 00426 delete[](answerA); 00427 delete[](answerB); 00428 delete[] m; 00429 00430 // Finds in wich word does indexInit and indexEnd fall 00431 for(int i=0; i<nwords; i++){ 00432 if( words[i] == indexInit ){ 00433 initWord = i+1; 00434 break; 00435 } else if( words[i] < indexInit ) { 00436 initWord = i+1; 00437 } else if( words[i] > indexInit ) { 00438 initWord = i; 00439 break; 00440 } 00441 00442 } 00443 00444 for(int i=0; i<nwords; i++){ 00445 if( words[i] == indexEnd ){ 00446 endWord = i; 00447 break; 00448 } else if( words[i] < indexEnd ) { 00449 endWord = i+1; 00450 } else if( words[i] > indexEnd ) { 00451 endWord = i; 00452 break; 00453 } 00454 00455 } 00456 00457 delete[] words; 00458 00459 return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB); 00460 00461 } 00462 00463 }; 00464 00465 } // namespace 00466 00467 #endif