FreeLing  4.0
util.h
Go to the documentation of this file.
00001 
00002 //
00003 //    Fries - Feature Retriever for Intensional Encoding of Sentences
00004 //
00005 //    Copyright (C) 2014   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This file is part of the Fries library
00009 //
00010 //    The Fries library is free software; you can redistribute it 
00011 //    and/or modify it under the terms of the GNU Affero General Public
00012 //    License as published by the Free Software Foundation; either
00013 //    version 3 of the License, or (at your option) any later version.
00014 //
00015 //    This library is distributed in the hope that it will be useful,
00016 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 //    Affero General Public License for more details.
00019 //
00020 //    You should have received a copy of the GNU Affero General Public
00021 //    License along with this library; if not, write to the Free Software
00022 //    Foundation, Inc., 51 Franklin St, 5th Floor, Boston, MA 02110-1301 USA
00023 //
00024 //    contact: Lluis Padro (padro@lsi.upc.es)
00025 //             TALP Research Center
00026 //             despatx Omega.S112 - Campus Nord UPC
00027 //             08034 Barcelona.  SPAIN
00028 //
00030 
00031 #ifndef _UTIL
00032 #define _UTIL
00033 
00034 #include <cstdio>
00035 #include <list>
00036 #include <string>
00037 #include <vector>
00038 #include <set>
00039 
00040 #include <locale>
00041 #include <iostream>
00042 #include "freeling/utf8/utf8.h"
00043 
00044 #include "freeling/regexp.h"
00045 #include "freeling/windll.h"
00046 #include "freeling/morfo/traces.h"
00047 
// Platform shims for the process id and for temporary-file-name generation.
#ifdef WIN32
#include <windows.h>
// Map the POSIX spellings used in this header onto their Win32 counterparts.
#define getpid() GetCurrentProcessId()
#define pid_t DWORD
// The tmpnam_s result is an errno_t; any nonzero value is treated as failure.
#define err_type errno_t
#define NEW_TMPNAME(buf,sz) tmpnam_s(buf,sz)
// Argument parenthesized so compound expressions keep their meaning.
#define TMPNAME_FAILED(x) (x)
#define TMPNAME_PREFIX L"."
#else
// The tmpnam result is a char*; a null pointer is treated as failure.
// The size argument of NEW_TMPNAME is ignored on this branch.
#define err_type char*
#define NEW_TMPNAME(buf,sz) tmpnam(buf)
// Argument parenthesized so compound expressions (e.g. a ternary) are
// compared against NULL as a whole.
#define TMPNAME_FAILED(x) ((x)==NULL)
#define TMPNAME_PREFIX L""
#endif


// Capitalization patterns
#define UPPER_NONE 0
#define UPPER_1ST 1
#define UPPER_ALL 2
00068  
00069 namespace freeling {
00070 
00071 #define MOD_TRACENAME L"UTIL"
00072 #define MOD_TRACECODE UTIL_TRACE
00073 
00079 
00080   class WINDLL util {
00081 
00082   public:
00084     static regexp RE_has_lowercase;   // wstring contains lowercase chars
00085     static regexp RE_has_alphanum;    // wstring contains alphanum chars
00086     static regexp RE_is_capitalized;  // wstring is capitalized
00087     static regexp RE_all_digits;      // wstring is all digits
00088     static regexp RE_all_caps;        // wstring is uppercase
00089     static regexp RE_initial_dot;     // wstring is an initial plus optional dot
00090     static regexp RE_all_caps_dot;    // wstring is uppercase plus optional dot
00091     static regexp RE_capitalized_dot; // wstring is capitalized plus optional dot
00092     static regexp RE_has_digits;      // wstring contains digits
00093     static regexp RE_lowercase_dot;   // wstring is lowercase plus optional dot
00094 
00095     static regexp RE_win_absolute_path; // to detect absolute paths in windows
00096 
00098     static void init_locale(const std::wstring &s=L"default");
00100     static void open_utf8_file(std::wifstream &, const std::wstring &);
00102     static void open_utf8_file(std::wofstream &, const std::wstring &);
00104     static std::wstring lowercase(const std::wstring &);
00106     static std::wstring uppercase(const std::wstring &);
00108     static bool is_absolute(const std::string &p);
00110     static bool is_absolute(const std::wstring &p);
00112     static std::string get_current_path(); 
00114     static std::string absolute(const std::string &, const std::string &);
00116     static std::wstring absolute(const std::wstring &, const std::wstring &);
00118     static std::string expand_filename(const std::string &);
00120     static std::wstring expand_filename(const std::wstring &);
00122     static std::wstring new_tempfile_name();
00124     static std::wstring remove_chars(const std::wstring &, const std::wstring &);
00126     static void find_and_replace(std::wstring &, const std::wstring &, const std::wstring &);
00127 
00129     static int wstring2int(const std::wstring &);
00130     static double wstring2double(const std::wstring &);
00131     static long double wstring2longdouble(const std::wstring &);
00132 
00133     template<class C> static std::wstring wstring_from(const C&, const std::wstring &);
00134     template<class C> static std::wstring wstring_from(const C&);
00135     template<class C> static std::wstring wstring_from(const C*);
00136     template<class C> static C wstring_to(const std::wstring &, const std::wstring &, bool mcsep=true);
00137     template<class C> static C wstring_to(const std::wstring &);
00138 
00139     template<class P1,class P2> static std::wstring pairlist2wstring(const std::list<std::pair<P1,P2> > &, const std::wstring &, const std::wstring &);
00140     template<class P1,class P2> static std::list<std::pair<P1,P2> > wstring2pairlist(const std::wstring &, const std::wstring &, const std::wstring &);
00141 
00142     static int capitalization(const std::wstring &);
00143     static std::wstring capitalize(const std::wstring &, int, bool);
00144 
00146     template<class T1,class T2> static bool ascending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00147     template<class T1,class T2> static bool ascending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00148     template<class T1,class T2> static bool descending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00149     template<class T1,class T2> static bool descending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00150   };
00151 
00152 
00156 
00157   inline std::wstring util::new_tempfile_name() {
00158     char* tempfile = new char[L_tmpnam+1]; 
00159     err_type err = NEW_TMPNAME(tempfile,L_tmpnam+1);
00160     if (TMPNAME_FAILED(err))
00161       ERROR_CRASH(L"Error occurred creating unique filename.");
00162     std::wstring fname = TMPNAME_PREFIX + wstring_from(tempfile)+L"-FL-"+wstring_from(getpid());
00163     delete[] tempfile;
00164     return fname;
00165   }
00166 
00170 
00171   template<class C>
00172     inline std::wstring util::wstring_from(const C& ls, const std::wstring &sep) {
00173     // if nothing to convert, we are done
00174     if (ls.empty()) return L"";  
00175     // print first element to output
00176     typename C::const_iterator i=ls.begin();
00177     std::wstring sn;  sn=(*i);  
00178     // print all remaining elements, adding separators
00179     while (++i!=ls.end()) sn += sep+(*i);
00180     // return resulting string
00181     return(sn); 
00182   }
00183  
00187 
00188   template<class C>
00189     inline std::wstring util::wstring_from(const C & x) {
00190     std::wostringstream ss;
00191     ss<<std::fixed<<x;
00192     return ss.str();
00193   }
00194 
00198 
00199   template<>
00200     inline std::wstring util::wstring_from(const long double &x) {
00201     std::wostringstream ss;
00202     ss<<std::fixed<<x;
00203     // remove decimal digits if all zeros.
00204     std::wstring s(ss.str());
00205     std::wstring::size_type pos = s.find(L'.');
00206     std::wstring::size_type posLast = s.find_last_not_of(L"0");
00207     if ((pos != s.npos) && (posLast != s.npos) && (posLast >= pos)) {
00208       if (posLast == pos) s.erase(pos);
00209       else s.erase(posLast+1);
00210     }
00211     return s;
00212   }
00213 
00217 
00218   template<>
00219     inline std::wstring util::wstring_from(const std::string &s) {
00220     std::wstring ws;
00221     if (sizeof(std::wstring::value_type)==2) 
00222       utf8::utf8to16(s.begin(), s.end(), back_inserter(ws));
00223     else if (sizeof(std::wstring::value_type)==4) 
00224       utf8::utf8to32(s.begin(), s.end(), back_inserter(ws));
00225     else 
00226       WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type)));
00227     return ws; 
00228   }
00229 
00230 
00234 
00235   template<>
00236     inline std::wstring util::wstring_from(const char *cp) {
00237     return wstring_from<std::string>(std::string(cp));
00238   }
00239 
00240 
00246 
00247   template<class C>
00248     inline C util::wstring_to(const std::wstring &ws, const std::wstring &sep, bool mcsep) {
00249     C ls;
00250     std::wstring::size_type p,q;
00251     // at each occurence of separator "sep" in string "s", cut and insert at the end of the container
00252     p=0; q = (mcsep? ws.find(sep) : ws.find_first_of(sep));
00253     while(q!=std::wstring::npos){
00254       ls.insert(ls.end(),ws.substr(p,q-p));
00255       p = q+sep.size();
00256       q = (mcsep? ws.find(sep,p) : ws.find_first_of(sep,p));
00257     }
00258     // piece remaining after last separator, if any.
00259     if (not ws.empty()) ls.insert(ls.end(),ws.substr(p,ws.size()-p));
00260     return(ls);    
00261   }
00262 
00266 
00267   template<class C>
00268     inline C util::wstring_to(const std::wstring &ws) {
00269     long double x;
00270     std::wistringstream ss; ss.str(ws); 
00271     ss>>x;
00272     // if original wstring hasn't been fully emptied return default value
00273     std::wstring r;
00274     if (ss>>r) x= -99999;
00275     return static_cast<C>(x);
00276   }
00277 
00281 
00282   template<>
00283     inline wchar_t util::wstring_to(const std::wstring &ws) {
00284     if (ws.empty()) return 0;
00285     else return ws[0];
00286   }
00287 
00288 
00292 
00293   template<>
00294     inline std::string util::wstring_to(const std::wstring &ws) {
00295     std::string s;
00296     if (sizeof(std::wstring::value_type)==2) 
00297       utf8::utf16to8(ws.begin(), ws.end(), back_inserter(s));
00298     else if (sizeof(std::wstring::value_type)==4) 
00299       utf8::utf32to8(ws.begin(), ws.end(), back_inserter(s));
00300     else 
00301       WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type)));
00302 
00303     return s;
00304   }
00305 
00306 
00311 
00312   template<class P1,class P2> 
00313     inline std::wstring util::pairlist2wstring(const std::list<std::pair<P1,P2> > &ls, const std::wstring &sep_pair, const std::wstring &sep_list) {
00314     // if nothing to convert, we are done
00315     if (ls.empty()) return L"";  
00316     // print first element to output
00317     typename std::list<std::pair<P1,P2> >::const_iterator i=ls.begin();
00318     std::wstringstream ss;  ss << i->first << sep_pair << i->second;
00319     // concatenate elements in list<pair>
00320     while (++i!=ls.end()) ss << sep_list << i->first << sep_pair << i->second;
00321     // return resulting string
00322     return(ss.str());
00323   }
00324 
00328 
00329   template<class P1,class P2> 
00330     inline std::list<std::pair<P1,P2> > util::wstring2pairlist(const std::wstring &s, const std::wstring &sep_pair, const std::wstring &sep_list) {
00331     // split string at sep_list
00332     std::list<std::wstring> ls = util::wstring_to<std::list<std::wstring> >(s,sep_list);
00333     // split each pair in ls at sep_pair, and store to lps
00334     std::list<std::pair<P1,P2> > lps;
00335     P1 elem1;
00336     P2 elem2;
00337     for (std::list<std::wstring>::const_iterator i=ls.begin(); i!=ls.end(); i++) {
00338       std::wstring::size_type p = i->find(sep_pair);
00339       std::wstringstream ss1(i->substr(0,p)); ss1 >> elem1;
00340       std::wstringstream ss2(i->substr(p+1)); ss2 >> elem2;
00341       lps.push_back(make_pair(elem1,elem2));
00342     }
00343 
00344     return(lps);
00345   }
00346 
00347 
00351 
00352   template<class T1,class T2> inline bool util::ascending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00353     return (p1.first<p2.first or (p1.first==p2.first and p1.second<p2.second));
00354   }
00355 
00359 
00360   template<class T1,class T2> inline bool util::ascending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00361     return (p1.second<p2.second or (p1.second==p2.second and p1.first<p2.first));
00362   }
00363 
00367 
00368   template<class T1,class T2> inline bool util::descending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00369     return (p1.first>p2.first or (p1.first==p2.first and p1.second>p2.second));
00370   }
00371 
00372 
00376 
00377   template<class T1,class T2> inline bool util::descending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00378     return (p1.second>p2.second or (p1.second==p2.second and p1.first>p2.first));
00379   }
00380 
00381 
00384 
// Shorthand names for the conversion templates of class util.
// They expand to an unqualified member name, so they are meant to be written
// after a "util::" qualifier (or inside the class).

// wide string -> container of pieces, split at separator 'sep'
#define wstring2vector(text,sep) wstring_to<std::vector<std::wstring> >(text,sep)
#define wstring2list(text,sep) wstring_to<std::list<std::wstring> >(text,sep)
#define wstring2set(text,sep) wstring_to<std::set<std::wstring> >(text,sep)

// wide string -> single value
#define wstring2string(text) wstring_to<std::string>(text)
#define wstring2int(text) wstring_to<int>(text)
#define wstring2double(text) wstring_to<double>(text)
#define wstring2longdouble(text) wstring_to<long double>(text)

// container -> wide string, elements joined with separator 'sep'
#define vector2wstring(items,sep) wstring_from(items,sep)
#define list2wstring(items,sep) wstring_from(items,sep)
#define set2wstring(items,sep) wstring_from(items,sep)
// single value -> wide string
#define string2wstring(value) wstring_from(value)
#define int2wstring(value) wstring_from(value)
#define double2wstring(value) wstring_from(value)
#define longdouble2wstring(value) wstring_from(value)

// wide string -> list of (wstring,wstring) pairs; the template of the same
// name stays reachable by spelling out its template arguments
#define wstring2pairlist(text,psep,lsep) wstring2pairlist<std::wstring,std::wstring>(text,psep,lsep)


// the trace identity defined at the top of the namespace ends here
#undef MOD_TRACENAME
#undef MOD_TRACECODE
00407 
00408 } //namespace
00409 
00410 #endif