FreeLing
4.0
|
00001 00002 // 00003 // Fries - Feature Retriever for Intensional Encoding of Sentences 00004 // 00005 // Copyright (C) 2014 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This file is part of the Fries library 00009 // 00010 // The Fries library is free software; you can redistribute it 00011 // and/or modify it under the terms of the GNU Affero General Public 00012 // License as published by the Free Software Foundation; either 00013 // version 3 of the License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, 00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Affero General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Affero General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 51 Franklin St, 5th Floor, Boston, MA 02110-1301 USA 00023 // 00024 // contact: Lluis Padro (padro@lsi.upc.es) 00025 // TALP Research Center 00026 // despatx Omega.S112 - Campus Nord UPC 00027 // 08034 Barcelona. SPAIN 00028 // 00030 00031 #ifndef _UTIL 00032 #define _UTIL 00033 00034 #include <cstdio> 00035 #include <list> 00036 #include <string> 00037 #include <vector> 00038 #include <set> 00039 00040 #include <locale> 00041 #include <iostream> 00042 #include "freeling/utf8/utf8.h" 00043 00044 #include "freeling/regexp.h" 00045 #include "freeling/windll.h" 00046 #include "freeling/morfo/traces.h" 00047 00048 #ifdef WIN32 00049 #include <windows.h> 00050 #define getpid() GetCurrentProcessId() 00051 #define pid_t DWORD 00052 #define err_type errno_t 00053 #define NEW_TMPNAME(buf,sz) tmpnam_s(buf,sz) 00054 #define TMPNAME_FAILED(x) x 00055 #define TMPNAME_PREFIX L"." 00056 #else 00057 #define err_type char* 00058 #define NEW_TMPNAME(buf,sz) tmpnam(buf) 00059 #define TMPNAME_FAILED(x) (x==NULL) 00060 #define TMPNAME_PREFIX L"" 00061 #endif 00062 00063 00064 // Capitalization patterns 00065 #define UPPER_NONE 0 00066 #define UPPER_1ST 1 00067 #define UPPER_ALL 2 00068 00069 namespace freeling { 00070 00071 #define MOD_TRACENAME L"UTIL" 00072 #define MOD_TRACECODE UTIL_TRACE 00073 00079 00080 class WINDLL util { 00081 00082 public: 00084 static regexp RE_has_lowercase; // wstring contains lowercase chars 00085 static regexp RE_has_alphanum; // wstring contains alphanum chars 00086 static regexp RE_is_capitalized; // wstring is capitalized 00087 static regexp RE_all_digits; // wstring is all digits 00088 static regexp RE_all_caps; // wstring is uppercase 00089 static regexp RE_initial_dot; // wstring is an initial plus optional dot 00090 static regexp RE_all_caps_dot; // wstring is uppercase plus optional dot 00091 static regexp RE_capitalized_dot; // wstring is capitalized plus optional dot 00092 static regexp RE_has_digits; // wstring contains digits 00093 static regexp RE_lowercase_dot; // wstring is lowercase plus optional dot 00094 00095 static regexp RE_win_absolute_path; // to detect absolute paths in windows 00096 00098 static void init_locale(const std::wstring &s=L"default"); 00100 static void open_utf8_file(std::wifstream &, const std::wstring &); 00102 static void open_utf8_file(std::wofstream &, const std::wstring &); 00104 static std::wstring lowercase(const std::wstring &); 00106 static std::wstring uppercase(const std::wstring &); 00108 static bool is_absolute(const std::string &p); 00110 static bool is_absolute(const std::wstring &p); 00112 static std::string get_current_path(); 00114 static std::string absolute(const std::string &, const std::string &); 00116 static std::wstring absolute(const std::wstring &, const std::wstring &); 00118 static std::string expand_filename(const std::string &); 00120 static std::wstring expand_filename(const std::wstring &); 00122 static std::wstring new_tempfile_name(); 00124 static std::wstring remove_chars(const std::wstring &, const std::wstring &); 00126 static void find_and_replace(std::wstring &, const std::wstring &, const std::wstring &); 00127 00129 static int wstring2int(const std::wstring &); 00130 static double wstring2double(const std::wstring &); 00131 static long double wstring2longdouble(const std::wstring &); 00132 00133 template<class C> static std::wstring wstring_from(const C&, const std::wstring &); 00134 template<class C> static std::wstring wstring_from(const C&); 00135 template<class C> static std::wstring wstring_from(const C*); 00136 template<class C> static C wstring_to(const std::wstring &, const std::wstring &, bool mcsep=true); 00137 template<class C> static C wstring_to(const std::wstring &); 00138 00139 template<class P1,class P2> static std::wstring pairlist2wstring(const std::list<std::pair<P1,P2> > &, const std::wstring &, const std::wstring &); 00140 template<class P1,class P2> static std::list<std::pair<P1,P2> > wstring2pairlist(const std::wstring &, const std::wstring &, const std::wstring &); 00141 00142 static int capitalization(const std::wstring &); 00143 static std::wstring capitalize(const std::wstring &, int, bool); 00144 00146 template<class T1,class T2> static bool ascending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00147 template<class T1,class T2> static bool ascending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00148 template<class T1,class T2> static bool descending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00149 template<class T1,class T2> static bool descending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00150 }; 00151 00152 00156 00157 inline std::wstring util::new_tempfile_name() { 00158 char* tempfile = new char[L_tmpnam+1]; 00159 err_type err = NEW_TMPNAME(tempfile,L_tmpnam+1); 00160 if (TMPNAME_FAILED(err)) 00161 ERROR_CRASH(L"Error occurred creating unique filename."); 00162 std::wstring fname = TMPNAME_PREFIX + wstring_from(tempfile)+L"-FL-"+wstring_from(getpid()); 00163 delete[] tempfile; 00164 return fname; 00165 } 00166 00170 00171 template<class C> 00172 inline std::wstring util::wstring_from(const C& ls, const std::wstring &sep) { 00173 // if nothing to convert, we are done 00174 if (ls.empty()) return L""; 00175 // print first element to output 00176 typename C::const_iterator i=ls.begin(); 00177 std::wstring sn; sn=(*i); 00178 // print all remaining elements, adding separators 00179 while (++i!=ls.end()) sn += sep+(*i); 00180 // return resulting string 00181 return(sn); 00182 } 00183 00187 00188 template<class C> 00189 inline std::wstring util::wstring_from(const C & x) { 00190 std::wostringstream ss; 00191 ss<<std::fixed<<x; 00192 return ss.str(); 00193 } 00194 00198 00199 template<> 00200 inline std::wstring util::wstring_from(const long double &x) { 00201 std::wostringstream ss; 00202 ss<<std::fixed<<x; 00203 // remove decimal digits if all zeros. 00204 std::wstring s(ss.str()); 00205 std::wstring::size_type pos = s.find(L'.'); 00206 std::wstring::size_type posLast = s.find_last_not_of(L"0"); 00207 if ((pos != s.npos) && (posLast != s.npos) && (posLast >= pos)) { 00208 if (posLast == pos) s.erase(pos); 00209 else s.erase(posLast+1); 00210 } 00211 return s; 00212 } 00213 00217 00218 template<> 00219 inline std::wstring util::wstring_from(const std::string &s) { 00220 std::wstring ws; 00221 if (sizeof(std::wstring::value_type)==2) 00222 utf8::utf8to16(s.begin(), s.end(), back_inserter(ws)); 00223 else if (sizeof(std::wstring::value_type)==4) 00224 utf8::utf8to32(s.begin(), s.end(), back_inserter(ws)); 00225 else 00226 WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type))); 00227 return ws; 00228 } 00229 00230 00234 00235 template<> 00236 inline std::wstring util::wstring_from(const char *cp) { 00237 return wstring_from<std::string>(std::string(cp)); 00238 } 00239 00240 00246 00247 template<class C> 00248 inline C util::wstring_to(const std::wstring &ws, const std::wstring &sep, bool mcsep) { 00249 C ls; 00250 std::wstring::size_type p,q; 00251 // at each occurence of separator "sep" in string "s", cut and insert at the end of the container 00252 p=0; q = (mcsep? ws.find(sep) : ws.find_first_of(sep)); 00253 while(q!=std::wstring::npos){ 00254 ls.insert(ls.end(),ws.substr(p,q-p)); 00255 p = q+sep.size(); 00256 q = (mcsep? ws.find(sep,p) : ws.find_first_of(sep,p)); 00257 } 00258 // piece remaining after last separator, if any. 00259 if (not ws.empty()) ls.insert(ls.end(),ws.substr(p,ws.size()-p)); 00260 return(ls); 00261 } 00262 00266 00267 template<class C> 00268 inline C util::wstring_to(const std::wstring &ws) { 00269 long double x; 00270 std::wistringstream ss; ss.str(ws); 00271 ss>>x; 00272 // if original wstring hasn't been fully emptied return default value 00273 std::wstring r; 00274 if (ss>>r) x= -99999; 00275 return static_cast<C>(x); 00276 } 00277 00281 00282 template<> 00283 inline wchar_t util::wstring_to(const std::wstring &ws) { 00284 if (ws.empty()) return 0; 00285 else return ws[0]; 00286 } 00287 00288 00292 00293 template<> 00294 inline std::string util::wstring_to(const std::wstring &ws) { 00295 std::string s; 00296 if (sizeof(std::wstring::value_type)==2) 00297 utf8::utf16to8(ws.begin(), ws.end(), back_inserter(s)); 00298 else if (sizeof(std::wstring::value_type)==4) 00299 utf8::utf32to8(ws.begin(), ws.end(), back_inserter(s)); 00300 else 00301 WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type))); 00302 00303 return s; 00304 } 00305 00306 00311 00312 template<class P1,class P2> 00313 inline std::wstring util::pairlist2wstring(const std::list<std::pair<P1,P2> > &ls, const std::wstring &sep_pair, const std::wstring &sep_list) { 00314 // if nothing to convert, we are done 00315 if (ls.empty()) return L""; 00316 // print first element to output 00317 typename std::list<std::pair<P1,P2> >::const_iterator i=ls.begin(); 00318 std::wstringstream ss; ss << i->first << sep_pair << i->second; 00319 // concatenate elements in list<pair> 00320 while (++i!=ls.end()) ss << sep_list << i->first << sep_pair << i->second; 00321 // return resulting string 00322 return(ss.str()); 00323 } 00324 00328 00329 template<class P1,class P2> 00330 inline std::list<std::pair<P1,P2> > util::wstring2pairlist(const std::wstring &s, const std::wstring &sep_pair, const std::wstring &sep_list) { 00331 // split string at sep_list 00332 std::list<std::wstring> ls = util::wstring_to<std::list<std::wstring> >(s,sep_list); 00333 // split each pair in ls at sep_pair, and store to lps 00334 std::list<std::pair<P1,P2> > lps; 00335 P1 elem1; 00336 P2 elem2; 00337 for (std::list<std::wstring>::const_iterator i=ls.begin(); i!=ls.end(); i++) { 00338 std::wstring::size_type p = i->find(sep_pair); 00339 std::wstringstream ss1(i->substr(0,p)); ss1 >> elem1; 00340 std::wstringstream ss2(i->substr(p+1)); ss2 >> elem2; 00341 lps.push_back(make_pair(elem1,elem2)); 00342 } 00343 00344 return(lps); 00345 } 00346 00347 00351 00352 template<class T1,class T2> inline bool util::ascending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00353 return (p1.first<p2.first or (p1.first==p2.first and p1.second<p2.second)); 00354 } 00355 00359 00360 template<class T1,class T2> inline bool util::ascending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00361 return (p1.second<p2.second or (p1.second==p2.second and p1.first<p2.first)); 00362 } 00363 00367 00368 template<class T1,class T2> inline bool util::descending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00369 return (p1.first>p2.first or (p1.first==p2.first and p1.second>p2.second)); 00370 } 00371 00372 00376 00377 template<class T1,class T2> inline bool util::descending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00378 return (p1.second>p2.second or (p1.second==p2.second and p1.first>p2.first)); 00379 } 00380 00381 00384 00385 #define wstring2vector(x,y) wstring_to<std::vector<std::wstring> >(x,y) 00386 #define wstring2list(x,y) wstring_to<std::list<std::wstring> >(x,y) 00387 #define wstring2set(x,y) wstring_to<std::set<std::wstring> >(x,y) 00388 00389 #define wstring2string(x) wstring_to<std::string>(x) 00390 #define wstring2int(x) wstring_to<int>(x) 00391 #define wstring2double(x) wstring_to<double>(x) 00392 #define wstring2longdouble(x) wstring_to<long double>(x) 00393 00394 #define vector2wstring(x,y) wstring_from(x,y) 00395 #define list2wstring(x,y) wstring_from(x,y) 00396 #define set2wstring(x,y) wstring_from(x,y) 00397 #define string2wstring(x) wstring_from(x) 00398 #define int2wstring(x) wstring_from(x) 00399 #define double2wstring(x) wstring_from(x) 00400 #define longdouble2wstring(x) wstring_from(x) 00401 00402 #define wstring2pairlist(x,y,z) wstring2pairlist<std::wstring,std::wstring>(x,y,z) 00403 00404 00405 #undef MOD_TRACENAME 00406 #undef MOD_TRACECODE 00407 00408 } //namespace 00409 00410 #endif