changed to UTF-8: added support for UTF-16 when wchar_t is 2 bytes long
git-svn-id: svn://ttmath.org/publicrep/ezc/trunk@345 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
parent
861269383e
commit
2d304a9714
215
src/utf8.cpp
215
src/utf8.cpp
|
@ -5,7 +5,7 @@
|
|||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010, Tomasz Sowa
|
||||
* Copyright (c) 2010-2011, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -212,6 +212,23 @@ return len;
|
|||
|
||||
|
||||
|
||||
/*
	an auxiliary function used when converting UTF-8 to a wide string
	appending one value (a unicode code point) to the wide string 'res'

	when wchar_t is 2 bytes long and the value is greater than 0xffff
	the value is stored as a UTF-16 surrogate pair (two wide characters):
	  first:  0xD800 + the higher ten bits of (c - 0x10000)
	  second: 0xDC00 + the lower ten bits of (c - 0x10000)
	in all other cases the value is cast to wchar_t and appended as one wide character

	the function does not validate 'c' in any way
*/
static void IntToWide(int c, std::wstring & res)
{
	const bool need_surrogates = (sizeof(wchar_t) == 2 && c > 0xffff);

	if( !need_surrogates )
	{
		res += static_cast<wchar_t>(c);
		return;
	}

	const int offset = c - 0x10000;
	const int high   = ((offset >> 10) & 0x3FF) + 0xD800;
	const int low    = (offset & 0x3FF) + 0xDC00;

	res += static_cast<wchar_t>(high);
	res += static_cast<wchar_t>(low);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -254,7 +271,7 @@ bool correct, was_error = false;
|
|||
len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero
|
||||
}
|
||||
|
||||
if( !correct || (sizeof(wchar_t)==2 && z>0xffff) )
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
res += 0xFFFD; // U+FFFD "replacement character"
|
||||
|
@ -263,7 +280,7 @@ bool correct, was_error = false;
|
|||
}
|
||||
else
|
||||
{
|
||||
res += static_cast<wchar_t>(z);
|
||||
IntToWide(z, res);
|
||||
}
|
||||
|
||||
utf8 += len;
|
||||
|
@ -346,7 +363,7 @@ bool correct, was_error = false;
|
|||
|
||||
while( UTF8ToInt(utf8, z, correct) > 0 )
|
||||
{
|
||||
if( !correct || (sizeof(wchar_t)==2 && z>0xffff) )
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
res += 0xFFFD; // U+FFFD "replacement character"
|
||||
|
@ -355,7 +372,7 @@ bool correct, was_error = false;
|
|||
}
|
||||
else
|
||||
{
|
||||
res += z;
|
||||
IntToWide(z, res);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -478,38 +495,192 @@ return len;
|
|||
|
||||
|
||||
|
||||
/*
	an auxiliary function for converting from wide characters to UTF-8
	converting one character (one or two wide characters) into one int

	when wchar_t is 2 bytes long the input is treated as UTF-16:
	a high surrogate (0xD800-0xDBFF) directly followed by a low surrogate
	(0xDC00-0xDFFF) is combined into one value from the range 0x10000-0x10FFFF
	when wchar_t has a different size the first wide character is returned as it is

	z       - the decoded value
	          (zero if string_len is zero, the value of the first wide character
	          if that character is an unpaired surrogate)
	correct - false if string_len is zero or if the first wide character
	          is a surrogate which is not a part of a valid pair

	returns how many wide characters were used
	if string_len is greater than 0 then the return value is always greater than zero too

	an unpaired surrogate uses exactly one wide character so the following
	wide character (which can be a valid one) is not skipped
*/
static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
	if( string_len == 0 )
	{
		z = 0;
		correct = false;
		return 0;
	}

	z = static_cast<int>(*wide_string);
	correct = true;

	// an ordinary character (or wchar_t is not a UTF-16 code unit at all)
	if( sizeof(wchar_t) != 2 || z < 0xD800 || z > 0xDFFF )
		return 1;

	// only a high surrogate can start a pair and it needs one more wide character
	if( z <= 0xDBFF && string_len > 1 )
	{
		int z2 = static_cast<int>(*(wide_string+1));

		if( z2 >= 0xDC00 && z2 <= 0xDFFF )
		{
			// explicit parentheses: '+' binds tighter than '|'
			z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
			return 2;
		}
	}

	// a lone low surrogate, a high surrogate at the end of the input
	// or a high surrogate not followed by a low one
	// only the offending wide character is consumed
	correct = false;

	return 1;
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
converting a wide character into one int
|
||||
|
||||
returns how many wide characters were used
|
||||
if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
|
||||
{
|
||||
size_t min_str_len = 1;
|
||||
|
||||
if( *wide_string == 0 )
|
||||
{
|
||||
z = 0;
|
||||
correct = false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if( *(wide_string+1) != 0 )
|
||||
min_str_len = 2;
|
||||
|
||||
return WideToInt(wide_string, min_str_len, z, correct);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
static void WideToUTF8(wchar_t z, std::string & utf8, bool & was_error, int mode)
|
||||
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
if( IntToUTF8(z, utf8, false) == 0 )
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = WideToInt(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = IntToUTF8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mode)
|
||||
static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
if( IntToUTF8(z, utf8) == 0 )
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = WideToInt(wide_string, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = IntToUTF8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = WideToInt(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = IntToUTF8(z, utf8) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
*/
|
||||
static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
|
||||
{
|
||||
size_t min_str_len = 1;
|
||||
|
||||
if( *wide_string == 0 )
|
||||
return 0;
|
||||
|
||||
if( *(wide_string+1) != 0 )
|
||||
min_str_len = 2;
|
||||
|
||||
return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide string into UTF-8 string
|
||||
|
||||
|
@ -528,12 +699,17 @@ static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mod
|
|||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
size_t chars;
|
||||
|
||||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
for(size_t i=0 ; i<string_len ; ++i)
|
||||
WideToUTF8(wide_string[i], utf8, was_error, mode);
|
||||
while( string_len > 0 )
|
||||
{
|
||||
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
||||
wide_string += chars;
|
||||
string_len -= chars;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
@ -561,8 +737,8 @@ bool was_error = false;
|
|||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
for( ; *wide_string != 0 ; ++wide_string )
|
||||
WideToUTF8(*wide_string, utf8, was_error, mode);
|
||||
while( *wide_string )
|
||||
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
@ -608,9 +784,14 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear
|
|||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
size_t chars;
|
||||
|
||||
for(size_t i=0 ; i<string_len ; ++i)
|
||||
WideToUTF8(wide_string[i], utf8, was_error, mode);
|
||||
while( string_len > 0 )
|
||||
{
|
||||
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
||||
wide_string += chars;
|
||||
string_len -= chars;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
@ -634,8 +815,8 @@ bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
|
|||
{
|
||||
bool was_error = false;
|
||||
|
||||
for( ; *wide_string != 0 ; ++wide_string )
|
||||
WideToUTF8(*wide_string, utf8, was_error, mode);
|
||||
while( *wide_string )
|
||||
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
|
13
src/utf8.h
13
src/utf8.h
|
@ -5,7 +5,7 @@
|
|||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010, Tomasz Sowa
|
||||
* Copyright (c) 2010-2011, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -48,6 +48,12 @@ namespace Ezc
|
|||
/*!
|
||||
UTF-8, a transformation format of ISO 10646
|
||||
http://tools.ietf.org/html/rfc3629
|
||||
|
||||
when wchar_t is 4 bytes long we use UTF-32
|
||||
when wchar_t is 2 bytes long we use UTF-16 (with surrogate pairs)
|
||||
|
||||
UTF-16
|
||||
http://www.ietf.org/rfc/rfc2781.txt
|
||||
*/
|
||||
|
||||
|
||||
|
@ -69,11 +75,6 @@ size_t UTF8ToInt(std::istream & utf8, int & res, bool & correct);
|
|||
|
||||
/*!
|
||||
converting UTF-8 string to a wide string
|
||||
|
||||
warning: current limitation
|
||||
on MS Windows wide characters consist of two bytes only
|
||||
and we treat them as UCS-2 (not UTF-16 with surrogate pairs as they should be treated)
|
||||
so unicode characters above 0xffff value are ignored (depending on 'mode' parameter)
|
||||
*/
|
||||
bool UTF8ToWide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear = true, int mode = 1);
|
||||
bool UTF8ToWide(const char * utf8, std::wstring & res, bool clear = true, int mode = 1);
|
||||
|
|
Loading…
Reference in New Issue