From 2d304a9714abe337a927257a739efbd116e8a2c9 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Wed, 25 May 2011 21:43:40 +0000 Subject: [PATCH] changed to UTF-8: added support for UTF-16 when wchar_t is 2 bytes long git-svn-id: svn://ttmath.org/publicrep/ezc/trunk@345 e52654a7-88a9-db11-a3e9-0013d4bc506e --- src/utf8.cpp | 215 +++++++++++++++++++++++++++++++++++++++++++++++---- src/utf8.h | 13 ++-- 2 files changed, 205 insertions(+), 23 deletions(-) diff --git a/src/utf8.cpp b/src/utf8.cpp index 0bc8bbb..fb8365a 100755 --- a/src/utf8.cpp +++ b/src/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010, Tomasz Sowa + * Copyright (c) 2010-2011, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -212,6 +212,23 @@ return len; +/* + +*/ +static void IntToWide(int c, std::wstring & res) +{ + if( sizeof(wchar_t)==2 && c>0xffff ) + { + c -= 0x10000; + res += static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800); + res += static_cast<wchar_t>((c & 0x3FF) + 0xDC00); + } + else + { + res += static_cast<wchar_t>(c); + } +} + @@ -254,7 +271,7 @@ bool correct, was_error = false; len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero } - if( !correct || (sizeof(wchar_t)==2 && z>0xffff) ) + if( !correct ) { if( mode == 1 ) res += 0xFFFD; // U+FFFD "replacement character" @@ -263,7 +280,7 @@ bool correct, was_error = false; } else { - res += static_cast<wchar_t>(z); + IntToWide(z, res); } utf8 += len; @@ -346,7 +363,7 @@ bool correct, was_error = false; while( UTF8ToInt(utf8, z, correct) > 0 ) { - if( !correct || (sizeof(wchar_t)==2 && z>0xffff) ) + if( !correct ) { if( mode == 1 ) res += 0xFFFD; // U+FFFD "replacement character" @@ -355,7 +372,7 @@ bool correct, was_error = false; } else { - res += z; + IntToWide(z, res); } } @@ -478,38 +495,192 @@ return len; +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if 
string_len is greater than 0 then the return value is always greater than zero too +*/ +static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) +{ + if( string_len == 0 ) + { + z = 0; + correct = false; + return 0; + } + + z = static_cast<int>(*wide_string); + correct = true; + + if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) ) + { + if( z>=0xD800 && z<=0xDBFF && string_len>1 ) + { + int z2 = *(wide_string+1); + + if( z2>=0xDC00 && z2<=0xDFFF ) + { + z = 0x10000 + ((z & 0x3FF) << 10) | (z2 & 0x3FF); + return 2; + } + else + { + correct = false; + return 2; + } + } + else + { + correct = false; + return 1; + } + } + else + { + return 1; + } +} + + + +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + { + z = 0; + correct = false; + return 0; + } + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideToInt(wide_string, min_str_len, z, correct); +} + + /*! 
an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too */ -static void WideToUTF8(wchar_t z, std::string & utf8, bool & was_error, int mode) +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) { - if( IntToUTF8(z, utf8, false) == 0 ) +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) { if( mode == 1 ) IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" was_error = true; } + +return chars; } + /*! an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too */ -static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mode) +static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) { - if( IntToUTF8(z, utf8) == 0 ) +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! 
+ an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8) != 0; + + if( !correct ) { if( mode == 1 ) IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character" was_error = true; } + +return chars; } + +/*! + an auxiliary function for converting from wide characters to UTF-8 +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + return 0; + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); +} + + + + /*! 
this function converts a wide string into UTF-8 string @@ -528,12 +699,17 @@ static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mod bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode) { bool was_error = false; +size_t chars; if( clear ) utf8.clear(); - for(size_t i=0 ; i<string_len ; ++i) - WideToUTF8(wide_string[i], utf8, was_error, mode); + while( string_len > 0 ) + { + chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } return !was_error; } @@ -561,8 +737,8 @@ bool was_error = false; if( clear ) utf8.clear(); - for( ; *wide_string != 0 ; ++wide_string ) - WideToUTF8(*wide_string, utf8, was_error, mode); + while( *wide_string ) + wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); return !was_error; } @@ -608,9 +784,14 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode) { bool was_error = false; +size_t chars; - for(size_t i=0 ; i<string_len ; ++i) - WideToUTF8(wide_string[i], utf8, was_error, mode); + while( string_len > 0 ) + { + chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } return !was_error; } @@ -634,8 +815,8 @@ bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode) { bool was_error = false; - for( ; *wide_string != 0 ; ++wide_string ) - WideToUTF8(*wide_string, utf8, was_error, mode); + while( *wide_string ) + wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); return !was_error; } diff --git a/src/utf8.h b/src/utf8.h index 1b7e4e7..8615f70 100755 --- a/src/utf8.h +++ b/src/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010, Tomasz Sowa + * Copyright (c) 2010-2011, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -48,6 +48,12 @@ namespace Ezc /*! 
UTF-8, a transformation format of ISO 10646 http://tools.ietf.org/html/rfc3629 + + when wchar_t is 4 bytes length we use UTF-32 + when wchar_t is 2 bytes length we use UTF-16 (with surrogate pairs) + + UTF-16 + http://www.ietf.org/rfc/rfc2781.txt */ @@ -69,11 +75,6 @@ size_t UTF8ToInt(std::istream & utf8, int & res, bool & correct); /*! converting UTF-8 string to a wide string - - warning: current limitation - on MS Windows wide characters consist of two bytes only - and we tread them as UCS-2 (not UTF-16 with surrogate pairs as it should be trated) - so unicode characters above 0xffff value are ignored (depending on 'mode' parameter) */ bool UTF8ToWide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear = true, int mode = 1); bool UTF8ToWide(const char * utf8, std::wstring & res, bool clear = true, int mode = 1);