FreeLing  4.0
aligner.h
Go to the documentation of this file.
00001 /*
00002  * String aligner for the Phonetic Distance Package
00003  * + 
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _aligner_h
00008 #define _aligner_h
00009 
00010 #include <iostream>
00011 #include <fstream>
00012 #include <sstream>
00013 #include <map>
00014 #include <set>
00015 #include <string>
00016 #include <math.h>
00017 
00018 #include "freeling/morfo/phd.h"
00019 
00020 #define GLOBAL    1  // The aligner produces local alignments
00021 #define SEMILOCAL 2  // The aligner produces semilocal alignments
00022 #define LOCAL     3  // The aligner produces global alignments
00023 
00024 namespace freeling {
00025 
00026   template<typename T=int> 
00027     class aligner{
00028 
00029   private:
00030   phd<T>* sc;
00031   T score;
00032   int debug;
00033 
00034   public:
00035 
00036   struct alin {
00037     T score;   // score of this alignment
00038     double scoren; // score normalized
00039     double context;
00040     int Psubstitutions;
00041     int Pinserts;
00042     int Pdeletions;
00043     int Wsubstitutions;
00044     int Winserts;
00045     int Wdeletions;
00046     unsigned int kword; //The number of keyword involved in this alignment
00047     int begin; // index in b[] where the alignment starts
00048     int end;   // index in b[] where the alignment ends
00049     int beginW; //number of word in b[] where the alignment starts (each ' ' changes the word)
00050     int endW;   //number of word in b[] where the alignment ends
00051     wchar_t* seg; // Segment of b[] with the letters
00052     wchar_t* a;   // Alignment of a
00053     wchar_t* b;   // Alignment of b
00054     bool good; // Marks if this alignment is selected as relevant
00055 
00056     ~alin(){
00057       delete[](a);
00058       delete[](b);
00059       delete[](seg);
00060     }
00061 
00062   alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, wchar_t* _a, wchar_t* _b) :
00063     score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {}
00064   };
00065 
00066 
00067   aligner(const std::wstring &fname, int const _debug = 0){
00068     sc = new phd<T>(fname);
00069     debug=_debug;
00070     // if(debug>3){ sc->show(cerr);}
00071   } //constructor
00072 
00073   ~aligner(){
00074     delete(sc);
00075   }
00076 
00077 
00078   /*
00079    * Alinia la cadena A contra la cadena B. Conceptualment es considera que A és la query
00080    */
00081   alin* align(const std::wstring &wa, const std::wstring &wb, const int mode = SEMILOCAL){
00082     // a is the short string with length tj
00083     // b is the long string with length ti
00084     // The algorithm searchs for the best match of a against b in a semi-local or global point of view.
00085     // Usage of global matching is discouraged for the sake of coherence :-) because it doesn't try
00086     // to keep the letters together so 'a' will be meaningless scattered through all 'b'
00087 
00088     const wchar_t *a=wa.c_str(); const int tj=wa.size();
00089     const wchar_t *b=wb.c_str(); const int ti=wb.size();
00090 
00091     /*
00092     //  Build the align matrix
00093     */
00094     int const W = ti+1;
00095 
00096     if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); }
00097     int i,j;
00098     int* m = new int[W*(tj+1)+ti+1]; //int m[tj+1][ti+1];
00099 
00100     // Variables for alignment reconstruction
00101     wchar_t* answerA = new wchar_t[tj+ti];
00102     wchar_t* answerB = new wchar_t[tj+ti];
00103     int pA = 0;
00104     int pB = 0;
00105     int insertions=0, deletions=0, substitutions=0;
00106     int spacesA=0;
00107 
00108 
00109     // INITIALIZATION STEP
00110 
00111     // Decide the number of word that each character belongs to
00112     int nwords=0;
00113     for(int i=0; i<ti; i++){ 
00114       if( b[i] == L' ' || b[i] == L'_' ){
00115         // word change at _i_
00116         nwords++;
00117       }
00118     }
00119 
00120     int* words = new int[nwords];
00121     j=0;
00122     for(int i=0; i<ti; i++){ 
00123       if( b[i] == L' ' || b[i] == L'_' ){
00124         words[j++] = i;
00125       }
00126     }
00127 
00128     switch(mode){
00129     case GLOBAL:  //Start values are skips
00130     m[0]=0;
00131     for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); }
00132     for(int i=1; i<=ti; i++){ m[i]   = m[i-1]     + sc->dSkip(b[i-1]); }
00133     break;
00134     default: //Start values are 0
00135     for(int j=0;j<=tj;j++){ m[j*W] = 0; }
00136     for(int i=1;i<=ti;i++){ m[i] = 0; }
00137     break;
00138     }
00139 
00140 
00141     /*
00142     // Throw the scoring process
00143     */
00144     int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI;
00145     indexEnd  = 0;
00146     indexInit = 0;
00147     int initWord  = 0;
00148     int endWord   = 0;
00149     score = -100000000;
00150     bestJ = tj;
00151     bestI = ti;
00152 
00153     // FILL THE FIRST COLUMN & ROW
00154     for(j=1;j<=tj;j++){
00155       m[j*W+1] = std::max(  m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) );
00156       if(a[j-1]==L' '|| a[j-1]==L'_' ){spacesA++;}
00157       if(score < m[W*j+1]){
00158         score = m[W*j+1];
00159         bestJ = j;
00160         bestI = 0;
00161       }
00162     }
00163     for(i=1;i<=ti;i++){
00164       m[ W+i ] = std::max(  m[ i-1 ]+sc->dSub(a[0],b[i-1])  ,  m[ W+i-1 ]+sc->dSkip(b[i-1])  );
00165       if(score < m[W+i]){
00166         score = m[W+i];
00167         bestJ = 0;
00168         bestI = i;
00169       }
00170     }
00171 
00172     //FILL THE REST OF THE MATRIX
00173     //int step = ti/4;
00174     int lowLimit = 0;
00175     int upperLimit = 0;
00176     int threshold = -10000000;
00177     if( mode == LOCAL ){ threshold = 0; }
00178 
00179     for(j=2;j<=tj;j++){
00180       //if( mode==GLOBAL && tj/ti> 0.75){
00181       // // If the scoring is GLOBAL we do not fill the entire matrix
00182       // lowLimit   = max(  2 , j-step );
00183       // upperLimit = min( ti , j+step );
00184       // m[W*j+lowLimit+1] = 0;
00185       //      } else {
00186       lowLimit   = 2;
00187       upperLimit = ti;
00188       //      }
00189 
00190       for(i=lowLimit;i<=upperLimit;i++){
00191         i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]);
00192         i2 = m[W*j+i-1] + sc->dSkip(b[i-1]);
00193         i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]);
00194         i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]);
00195         i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]);
00196         m[W*j+i] = std::max(threshold,std::max(i1,std::max(i1,std::max(i2,std::max(i3,std::max(i4,i5))))));
00197 
00198         if(score < m[W*j+i]){
00199           score = m[W*j+i];
00200           bestJ = j;
00201           bestI = i;
00202         }
00203 
00204       }
00205     }
00206 
00207     //Prints the full score matrix if needed
00208     /*if(debug>5){
00209       cerr << endl << "\t";
00210       for(i=0;i<ti;i++){ cerr << "\t" << b[i]; }
00211       cerr << endl << "\t";
00212       for(j=0;j<=tj;j++){
00213       if(j>0) cerr << a[j-1] << "\t";
00214       for(i=0;i<=ti;i++){
00215       cerr << m[W*j+i] << "\t";
00216       }
00217       cerr << endl;
00218       }
00219       }*/
00220 
00221 
00222     /*
00223       || Reconstruction Step
00224     */
00225     bool final = true;    //Exit condition
00226     int lastScore = score;
00227     int steps = 0; //Number of steps uset to align 'a' with 'b'
00228     int lastChar = 0;
00229 
00230 
00231     int ni=0;
00232     int mymax=0;
00233 
00234     switch(mode){
00235     case SEMILOCAL:
00236       /* In semi-local alignment, the best-path reconstructions doesn't start from the 
00237        * botom-right corner of the matrix but from the best-scoring cell among last
00238        * column and last row scores.
00239        * In this implementation only the last row is taken in account since it is 
00240        * the "long" string
00241        */
00242       for(i=0;i<=ti;i++){
00243         if( mymax < m[W*tj+i] ){
00244           mymax = m[W*tj+i];
00245           ni=i;
00246         }
00247       }
00248       score = mymax;
00249       for(i=ti-1;i>=ni;i--){ // Carry on half of the string for debugging purpouses
00250         answerA[pA++]= L'-';
00251         answerB[pB++]= b[i];
00252         insertions++;
00253       }
00254       i=ni; j=tj;
00255       break;
00256     case GLOBAL: // now this is global alignment
00257       i = ti; j = tj;
00258       break;
00259     case LOCAL: 
00260       /* In LOCAL alignment, the best-path reconstruction starts with at the best scoring 
00261        * position in the whole matrix.
00262        */
00263       j = bestJ;
00264       i = bestI;
00265       break;
00266     }
00267 
00268 
00269     while( final ){
00270 
00271       /*if(debug>5){
00272         cerr << "Looking for " << m[ W*j +i] << " in ("<<j<<","<<i<<")"<<endl;
00273         cerr << "m[j-1][i-1]=" << m[ W*(j-1) +i-1] << ": dSub=" << sc->dSub(a[j-1],b[i-1]) << endl;
00274         cerr << "m[j][i-1]=" <<   m[ W*j +i-1] << ": dSkipB=" << sc->dSkip(b[i-1]) << endl;
00275         cerr << "m[j-1][i]=" <<   m[ W*(j-1) +i] << ": dSkipA=" << sc->dSkip(a[j-1]) << endl;
00276         if(j>1){ cerr << "m[j-1][i-2]=" << m[ W*(j-1) +i-2] << ": dExp=" << sc->dExp(a[j-1],b[i-2],b[i-1]) << endl; }
00277         if(i>1){ cerr << "m[j-2][i-1]=" << m[ W*(j-2) +i-1] << ": dExp=" << sc->dExp(b[i-1],a[j-2],a[j-1]) << endl; }
00278         }*/
00279 
00280 
00281       if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){
00282         //Aliniar b[i] amb a[j]
00283         lastScore = sc->dSub(a[j-1],b[i-1]);
00284         i--; j--;
00285         //cerr << "Sub a[" << j << "]=" << a[j] << " b[" << i << "]=" << b[i] << " @ " << pA << endl;
00286         answerA[pA++] = a[j];
00287         answerB[pB++] = b[i];
00288         indexInit= i;
00289         indexEnd = std::max(i,indexEnd);
00290         if( a[j]!=b[i] ){ substitutions++;  }
00291         steps++;
00292         lastChar = steps;
00293 
00294       } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) {
00295         lastScore = sc->dSkip(b[i-1]);
00296         i--;
00297         answerA[pA++] = L'-';
00298         answerB[pB++] = b[i];
00299         insertions++;
00300         if(steps==0 && b[i]==L' ' && b[i]==L'_' ){
00301           score = m[W*j+i];
00302         } else {
00303           steps++;
00304         }
00305 
00306       } else if( j>1 && i>0 && m[W*j+i] ==  m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){
00307         lastScore =  sc->dExp(b[i-1],a[j-2],a[j-1]);
00308         i--; j-=2;
00309         answerA[pA++] = a[j];
00310         answerA[pA++] = a[j+1];
00311         answerB[pB++] = b[i];
00312         answerB[pB++] = L'+';
00313         indexInit = i;
00314         indexEnd = std::max(i,indexEnd);
00315         deletions++;
00316         if( a[j]!=b[i] ){ substitutions++; }
00317         steps+=2;
00318         lastChar = steps;
00319 
00320       } else if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){
00321         lastScore = sc->dSkip(a[j-1]);
00322         j--;
00323         answerA[pA++] = a[j];
00324         answerB[pB++] = L'-';
00325         indexInit = i;
00326         indexEnd = std::max(i,indexEnd);
00327         deletions++;
00328         steps++;
00329         lastChar = steps;
00330 
00331       } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) {
00332         lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]);
00333         j--; i--;
00334         answerA[pA++] = a[j];
00335         answerA[pA++] = L'+';
00336         answerB[pB++] = b[i];
00337         answerB[pB++] = b[i-1];
00338         indexInit = i;
00339         indexEnd = std::max(i,indexEnd);
00340         insertions++;
00341         if( a[j]!=b[i] ){ substitutions++; }
00342         i--;
00343         steps+=2;
00344         lastChar = steps;
00345 
00346       } else if ( j==0 ){
00347         i--;
00348         answerA[pA++] = L'-';
00349         answerB[pB++] = b[i];
00350         insertions++;
00351         if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++;
00352 
00353       } else if ( i==0 ){
00354         j--;
00355         answerA[pA++] = a[j];
00356         answerB[pB++] = L'-';
00357         deletions++;
00358 
00359         if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++;
00360         lastChar = steps;
00361         
00362       } else {
00363         std::wcerr << L"BOINK! Error at "<< j << L"," << i << std::endl;
00364         break;
00365       }
00366       
00367       // This is the termination condition
00368       switch(mode){
00369       case SEMILOCAL:
00370       final = j!=0;
00371       if(!final){
00372         i--;
00373         while(i>=0){
00374           answerA[pA++] = L'-';
00375           answerB[pB++] = b[i--];
00376           insertions++;
00377         }
00378       }
00379       break;
00380 
00381       case GLOBAL:
00382       final = i!=0 || j!=0;
00383       break;
00384 
00385       case LOCAL:
00386       final = m[W*j+i]!=0;
00387       break;
00388       }
00389     }
00390 
00391 
00392     // Computing of the score
00393     // Score= avg.points/phoneme
00394     steps = lastChar;
00395 
00396     switch(mode){
00397     case GLOBAL:
00398       // average of similarity per phonem (without spaces?)
00399       score /= (pA-spacesA);
00400       break;
00401     case SEMILOCAL:
00402       //if(debug>2) cerr << "     Score = " << score << " Last Score = " << lastScore << " steps = " << steps <<  endl;
00403       //score = (score-lastScore) / steps;
00404       score = score / steps;
00405       break;
00406     case LOCAL:
00407       score = score / steps;
00408       break;
00409     }
00410 
00411     // Print the alignment
00412     wchar_t* newA = new wchar_t[pA+1];
00413     wchar_t* newB = new wchar_t[pB+1];
00414     newA[pA]=0;
00415     newB[pB]=0;
00416     --pA;
00417     --pB;
00418 
00419     for(int i=0; pA>=0; ++i, --pA){
00420       newA[i] = answerA[pA];
00421     }
00422     for(int i=0; pB>=0; ++i, --pB){
00423       newB[i] = answerB[pB];
00424     }
00425 
00426     delete[](answerA);
00427     delete[](answerB);
00428     delete[] m;
00429 
00430     // Finds in wich word does indexInit and indexEnd fall
00431     for(int i=0; i<nwords; i++){ 
00432       if( words[i] == indexInit ){
00433         initWord = i+1;
00434         break;
00435       } else if( words[i] < indexInit ) {
00436         initWord = i+1;
00437       } else if( words[i] > indexInit ) {
00438         initWord = i;
00439         break;
00440       }
00441 
00442     }
00443 
00444     for(int i=0; i<nwords; i++){ 
00445       if( words[i] == indexEnd ){
00446         endWord = i;
00447         break;
00448       } else if( words[i] < indexEnd ) {
00449         endWord = i+1;
00450       } else if( words[i] > indexEnd ) {
00451         endWord = i;
00452         break;
00453       }
00454 
00455     }
00456 
00457     delete[] words;
00458 
00459     return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB);
00460 
00461   }
00462 
00463   };
00464 
00465 } // namespace
00466 
00467 #endif