reorganization in utf8
- utf8 auxiliary functions moved to the utf8_private.h file - utf8.h now shows only the functions available to consumers - template functions have been moved to utf8_templates.h (utf8.h contains only their declarations); utf8_templates.h is included at the end of utf8.h - functions which take std::ostream changed to templates (the stream is a template argument now)
This commit is contained in:
parent
effe9be0a3
commit
fac3a7eb71
406
utf8/utf8.cpp
406
utf8/utf8.cpp
|
@ -5,7 +5,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010-2018, Tomasz Sowa
|
* Copyright (c) 2010-2021, Tomasz Sowa
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -36,6 +36,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
#include "utf8_private.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,48 +45,6 @@ namespace PT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
	an auxiliary function for converting from UTF-8 string

	decodes the first octet of a UTF-8 sequence

	input:
		uz  - the first octet

	output:
		len - the length of the whole sequence in octets (the number of leading 1 bits
		      of uz, or 1 for a single octet character)
		res - payload bits taken from the first octet

	returns false if uz cannot start a sequence:
		- uz is a continuation octet (10xxxxxx)
		- uz is zero
		- uz is 0xC0 (a two octet sequence with an empty payload)
		- uz announces more than four octets and has an empty payload

	0xE0 and 0xF0 have no payload bits at all but they are correct first octets
	(e.g. U+0800 is E0 A0 80 and U+10000 is F0 90 80 80) so an empty payload
	is accepted for sequences of three and four octets

	overlong forms are not detected here
*/
static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
{
	// count the leading 1 bits, uz is shifted so that the payload ends up in the high bits
	for(len=0 ; (uz & 0x80) != 0 ; ++len)
		uz <<= 1;

	// exactly one leading 1 bit means a continuation octet
	if( len == 1 )
		return false;

	res = uz;

	// move the payload back to the low bits
	if( len > 0 )
		res >>= len;

	// the first octets 0xE0 and 0xF0 legitimately carry no payload bits
	const bool empty_payload_allowed = (len == 3 || len == 4);

	if( res == 0 && !empty_payload_allowed )
		return false;

	if( len == 0 )
		len = 1;

	return true;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
	an auxiliary function for converting from UTF-8 string

	appends the six payload bits of a continuation octet (10xxxxxx) to res,
	returns false and leaves res untouched if uz is not a continuation octet
*/
static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res)
{
	const bool is_continuation = ((uz >> 6) == 0x02);

	if( !is_continuation )
		return false;

	res = (res << 6) | (uz & 0x3F);

	return true;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -126,15 +85,17 @@ size_t i, len;
|
||||||
if( utf8_len == 0 )
|
if( utf8_len == 0 )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
|
if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if( utf8_len < len )
|
if( utf8_len < len )
|
||||||
return utf8_len;
|
return utf8_len;
|
||||||
|
|
||||||
for(i=1 ; i<len ; ++i)
|
for(i=1 ; i<len ; ++i)
|
||||||
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
|
{
|
||||||
|
if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
|
||||||
return i;
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
if( UTF8_CheckRange(res) )
|
if( UTF8_CheckRange(res) )
|
||||||
correct = true;
|
correct = true;
|
||||||
|
@ -168,7 +129,7 @@ size_t i, len;
|
||||||
if( *utf8 == 0 )
|
if( *utf8 == 0 )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
|
if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
for(i=1 ; i<len ; ++i)
|
for(i=1 ; i<len ; ++i)
|
||||||
|
@ -176,7 +137,7 @@ size_t i, len;
|
||||||
if( utf8[i] == 0 )
|
if( utf8[i] == 0 )
|
||||||
return i;
|
return i;
|
||||||
|
|
||||||
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
|
if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -235,7 +196,7 @@ unsigned char uz;
|
||||||
if( !utf8 )
|
if( !utf8 )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if( !UTF8ToInt_FirstOctet(uz, len, res) )
|
if( !private_namespace::UTF8ToInt_FirstOctet(uz, len, res) )
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
for(i=1 ; i<len ; ++i)
|
for(i=1 ; i<len ; ++i)
|
||||||
|
@ -245,7 +206,7 @@ unsigned char uz;
|
||||||
if( !utf8 )
|
if( !utf8 )
|
||||||
return i;
|
return i;
|
||||||
|
|
||||||
if( !UTF8ToInt_AddNextOctet(uz, res) )
|
if( !private_namespace::UTF8ToInt_AddNextOctet(uz, res) )
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -485,268 +446,6 @@ return len;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts one wide character into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
z - wide character
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
the function returns how many characters have been written to the utf8 stream,
|
|
||||||
zero means that 'z' is an incorrect unicode character
|
|
||||||
*/
|
|
||||||
size_t IntToUTF8(int z, std::ostream & utf8)
|
|
||||||
{
|
|
||||||
char buf[10];
|
|
||||||
|
|
||||||
size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char));
|
|
||||||
size_t i;
|
|
||||||
|
|
||||||
for(i=0 ; i<len ; ++i)
|
|
||||||
utf8 << buf[i];
|
|
||||||
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
converting a wide character into one int
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
|
|
||||||
{
|
|
||||||
if( string_len == 0 )
|
|
||||||
{
|
|
||||||
z = 0;
|
|
||||||
correct = false;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
z = static_cast<int>(*wide_string);
|
|
||||||
correct = true;
|
|
||||||
|
|
||||||
if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
|
|
||||||
{
|
|
||||||
if( z>=0xD800 && z<=0xDBFF && string_len>1 )
|
|
||||||
{
|
|
||||||
int z2 = *(wide_string+1);
|
|
||||||
|
|
||||||
if( z2>=0xDC00 && z2<=0xDFFF )
|
|
||||||
{
|
|
||||||
z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
correct = false;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
correct = false;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
correct = UTF8_CheckRange(z);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
converting a wide character into one int
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if wide_string has at least one character then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
|
|
||||||
{
|
|
||||||
size_t min_str_len = 1;
|
|
||||||
|
|
||||||
if( *wide_string == 0 )
|
|
||||||
{
|
|
||||||
z = 0;
|
|
||||||
correct = false;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( *(wide_string+1) != 0 )
|
|
||||||
min_str_len = 2;
|
|
||||||
|
|
||||||
return WideToInt(wide_string, min_str_len, z, correct);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
|
|
||||||
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
|
||||||
a null terminating character)
|
|
||||||
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
|
||||||
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
|
||||||
if this flag is true then utf8_written is equal to zero
|
|
||||||
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
|
||||||
(was_error will not be true if the utf8 buffer is too small)
|
|
||||||
*/
|
|
||||||
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
|
||||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
utf8_written = 0;
|
|
||||||
was_utf8_buf_too_small = false;
|
|
||||||
chars = WideToInt(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
{
|
|
||||||
utf8_written = IntToUTF8(z, utf8, utf8_len);
|
|
||||||
|
|
||||||
if( utf8_written == 0 )
|
|
||||||
was_utf8_buf_too_small = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
{
|
|
||||||
utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
if( utf8_written == 0 )
|
|
||||||
was_utf8_buf_too_small = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = WideToInt(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = IntToUTF8(z, utf8, false) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if wide_string has at least one character then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = WideToInt(wide_string, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = IntToUTF8(z, utf8, false) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = WideToInt(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = IntToUTF8(z, utf8) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
*/
|
|
||||||
static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
size_t min_str_len = 1;
|
|
||||||
|
|
||||||
if( *wide_string == 0 )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
if( *(wide_string+1) != 0 )
|
|
||||||
min_str_len = 2;
|
|
||||||
|
|
||||||
return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
this function converts a wide string into UTF-8 string
|
this function converts a wide string into UTF-8 string
|
||||||
|
|
||||||
|
@ -772,7 +471,7 @@ size_t chars;
|
||||||
|
|
||||||
while( string_len > 0 )
|
while( string_len > 0 )
|
||||||
{
|
{
|
||||||
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
||||||
wide_string += chars;
|
wide_string += chars;
|
||||||
string_len -= chars;
|
string_len -= chars;
|
||||||
}
|
}
|
||||||
|
@ -804,7 +503,7 @@ bool was_error = false;
|
||||||
utf8.clear();
|
utf8.clear();
|
||||||
|
|
||||||
while( *wide_string )
|
while( *wide_string )
|
||||||
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
|
wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode);
|
||||||
|
|
||||||
return !was_error;
|
return !was_error;
|
||||||
}
|
}
|
||||||
|
@ -832,83 +531,6 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts a wide string into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
wide_string - a wide string for converting
|
|
||||||
string_len - size of the string
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
|
|
||||||
{
|
|
||||||
bool was_error = false;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
while( string_len > 0 )
|
|
||||||
{
|
|
||||||
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
|
||||||
wide_string += chars;
|
|
||||||
string_len -= chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts a wide string into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
wide_string - a null terminated wide string for converting
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
|
|
||||||
{
|
|
||||||
bool was_error = false;
|
|
||||||
|
|
||||||
while( *wide_string )
|
|
||||||
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts a wide string (std::wstring) into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
wide_string - a wide string for converting
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode)
|
|
||||||
{
|
|
||||||
return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -942,7 +564,7 @@ size_t chars, utf8_saved;
|
||||||
|
|
||||||
while( string_len > 0 )
|
while( string_len > 0 )
|
||||||
{
|
{
|
||||||
chars = WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
|
chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
|
||||||
|
|
||||||
if( was_buffer_to_small )
|
if( was_buffer_to_small )
|
||||||
{
|
{
|
||||||
|
@ -1089,7 +711,7 @@ size_t len;
|
||||||
while( *wide_string )
|
while( *wide_string )
|
||||||
{
|
{
|
||||||
len = (*(wide_string+1) == 0) ? 1 : 2;
|
len = (*(wide_string+1) == 0) ? 1 : 2;
|
||||||
chars = WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
|
chars = private_namespace::WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
|
||||||
|
|
||||||
if( was_buffer_to_small )
|
if( was_buffer_to_small )
|
||||||
{
|
{
|
||||||
|
|
268
utf8/utf8.h
268
utf8/utf8.h
|
@ -5,7 +5,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010-2018, Tomasz Sowa
|
* Copyright (c) 2010-2021, Tomasz Sowa
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -38,7 +38,6 @@
|
||||||
#ifndef headerfile_picotools_utf8_utf8
|
#ifndef headerfile_picotools_utf8_utf8
|
||||||
#define headerfile_picotools_utf8_utf8
|
#define headerfile_picotools_utf8_utf8
|
||||||
|
|
||||||
#include <fstream>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "textstream/textstream.h"
|
#include "textstream/textstream.h"
|
||||||
|
|
||||||
|
@ -46,8 +45,6 @@
|
||||||
namespace PT
|
namespace PT
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
UTF-8, a transformation format of ISO 10646
|
UTF-8, a transformation format of ISO 10646
|
||||||
http://tools.ietf.org/html/rfc3629
|
http://tools.ietf.org/html/rfc3629
|
||||||
|
@ -68,6 +65,16 @@ bool UTF8_CheckRange(int c);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* conversions from UTF-8
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
converting one character from UTF-8 to an int
|
converting one character from UTF-8 to an int
|
||||||
*/
|
*/
|
||||||
|
@ -85,25 +92,40 @@ bool UTF8ToWide(const char * utf8, std::wstring & res, bool cle
|
||||||
bool UTF8ToWide(const std::string & utf8, std::wstring & res, bool clear = true, int mode = 1);
|
bool UTF8ToWide(const std::string & utf8, std::wstring & res, bool clear = true, int mode = 1);
|
||||||
bool UTF8ToWide(std::istream & utf8, std::wstring & res, bool clear = true, int mode = 1);
|
bool UTF8ToWide(std::istream & utf8, std::wstring & res, bool clear = true, int mode = 1);
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear = true, int mode = 1); // need to be tested
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const char * utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear = true, int mode = 1); // need to be tested
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const std::string & utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear = true, int mode = 1); // need to be tested
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(std::istream & utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear = true, int mode = 1); // need to be tested
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
converting UTF-8 string to a WTextStream stream
|
|
||||||
(need to be tested)
|
|
||||||
*/
|
|
||||||
/*
|
/*
|
||||||
implemented as templates below
|
*
|
||||||
bool UTF8ToWide(const char * utf8, size_t utf8_len, WTextStream & res, bool clear = true, int mode = 1);
|
*
|
||||||
bool UTF8ToWide(const char * utf8, WTextStream & res, bool clear = true, int mode = 1);
|
*
|
||||||
bool UTF8ToWide(const std::string & utf8, WTextStream & res, bool clear = true, int mode = 1);
|
* conversions to UTF-8
|
||||||
bool UTF8ToWide(std::istream & utf8, WTextStream & res, bool clear = true, int mode = 1);
|
*
|
||||||
*/
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
converting one int character to UTF-8
|
converting one int character to UTF-8
|
||||||
*/
|
*/
|
||||||
size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len);
|
size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len);
|
||||||
size_t IntToUTF8(int z, std::string & utf8, bool clear = true );
|
size_t IntToUTF8(int z, std::string & utf8, bool clear = true);
|
||||||
size_t IntToUTF8(int z, std::ostream & utf8);
|
|
||||||
|
template<typename StreamType>
|
||||||
|
size_t IntToUTF8(int z, StreamType & utf8);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -113,216 +135,32 @@ bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & ut
|
||||||
bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1);
|
bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1);
|
||||||
bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1);
|
bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1);
|
||||||
|
|
||||||
// implemented as a template below
|
template<typename StreamType>
|
||||||
//void WideToUTF8(PT::WTextStream & buffer, std::string & utf8, bool clear = true, int mode = 1);// not tested
|
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode = 1);
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool WideToUTF8(const wchar_t * wide_string, StreamType & utf8, int mode = 1);
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool WideToUTF8(const std::wstring & wide_string, StreamType & utf8, int mode = 1);
|
||||||
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode = 1);
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode = 1);
|
|
||||||
bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode = 1);
|
|
||||||
|
|
||||||
// implemented as a template below
|
|
||||||
//void WideToUTF8(PT::WTextStream & buffer, std::ostream & utf8, int mode = 1);// not tested
|
|
||||||
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||||
bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||||
bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||||
// implement void WideToUTF8(PT::WTextStream & buffer, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
// implement template<typename StreamType>
|
||||||
|
|
||||||
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode = 1);
|
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode = 1);
|
||||||
bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode = 1);
|
bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode = 1);
|
||||||
bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1);
|
bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1);
|
||||||
// implement void WideToUTF8(PT::WTextStream & buffer, char * utf8, size_t utf8_len, int mode = 1);
|
// implement template<typename StreamType>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace private_namespace
|
|
||||||
{
|
|
||||||
template<typename function_type>
|
|
||||||
bool UTF8ToWideGeneric(const char * utf8, size_t utf8_len, int mode, function_type convert_function)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
size_t len;
|
|
||||||
bool correct, was_error = false;
|
|
||||||
|
|
||||||
while( utf8_len > 0 )
|
|
||||||
{
|
|
||||||
if( (unsigned char)*utf8 <= 0x7f )
|
|
||||||
{
|
|
||||||
// small optimization
|
|
||||||
len = 1;
|
|
||||||
correct = true;
|
|
||||||
z = static_cast<unsigned char>(*utf8);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero
|
|
||||||
}
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
convert_function(0xFFFD); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
convert_function(z);
|
|
||||||
}
|
|
||||||
|
|
||||||
utf8 += len;
|
|
||||||
utf8_len -= len;
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
void IntToWide(int c, TextStreamBase<char_type, stack_size, heap_block_size> & res)
|
|
||||||
{
|
|
||||||
if( sizeof(wchar_t)==2 && c>0xffff )
|
|
||||||
{
|
|
||||||
// UTF16 surrogate pairs
|
|
||||||
c -= 0x10000;
|
|
||||||
res << static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
|
|
||||||
res << static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
res << static_cast<wchar_t>(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// not tested
|
|
||||||
// FIX ME it is not using surrogate pairs from input stream
|
|
||||||
// and mode parameter
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size, typename function_type>
|
|
||||||
void WideToUTF8Generic(TextStreamBase<char_type, stack_size, heap_block_size> & buffer, int mode, function_type write_function)
|
|
||||||
{
|
|
||||||
char utf8_buffer[256];
|
|
||||||
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
|
||||||
std::size_t utf8_sequence_max_length = 10;
|
|
||||||
std::size_t index = 0;
|
|
||||||
|
|
||||||
typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator i = buffer.begin();
|
|
||||||
|
|
||||||
while( i != buffer.end() )
|
|
||||||
{
|
|
||||||
if( index + utf8_sequence_max_length > buffer_len )
|
|
||||||
{
|
|
||||||
write_function(utf8_buffer, index);
|
|
||||||
index = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
index += PT::IntToUTF8(*i, utf8_buffer + index, buffer_len - index);
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( index > 0 )
|
|
||||||
{
|
|
||||||
write_function(utf8_buffer, index);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// need to be tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase<char_type, stack_size, heap_block_size> & res, bool clear = true, int mode = 1)
|
|
||||||
{
|
|
||||||
if( clear )
|
|
||||||
res.clear();
|
|
||||||
|
|
||||||
bool status = private_namespace::UTF8ToWideGeneric(utf8, utf8_len, mode, [&res](int c) {
|
|
||||||
private_namespace::IntToWide(c, res);
|
|
||||||
});
|
|
||||||
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// need to be tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
bool UTF8ToWide(const char * utf8, TextStreamBase<char_type, stack_size, heap_block_size> & res, bool clear = true, int mode = 1)
|
|
||||||
{
|
|
||||||
size_t utf8_len = 0;
|
|
||||||
|
|
||||||
while( utf8[utf8_len] != 0 )
|
|
||||||
utf8_len += 1;
|
|
||||||
|
|
||||||
return UTF8ToWide(utf8, utf8_len, res, clear, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// need to be tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
bool UTF8ToWide(const std::string & utf8, TextStreamBase<char_type, stack_size, heap_block_size> & res, bool clear = true, int mode = 1)
|
|
||||||
{
|
|
||||||
return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// need to be tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
bool UTF8ToWide(std::istream & utf8, TextStreamBase<char_type, stack_size, heap_block_size> & res, bool clear = true, int mode = 1)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct, was_error = false;
|
|
||||||
|
|
||||||
if( clear )
|
|
||||||
res.clear();
|
|
||||||
|
|
||||||
while( UTF8ToInt(utf8, z, correct) > 0 )
|
|
||||||
{
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
res << 0xFFFD; // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
private_namespace::IntToWide(z, res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// not tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
void WideToUTF8(TextStreamBase<char_type, stack_size, heap_block_size> & buffer, std::string & utf8, bool clear = true, int mode = 1)
|
|
||||||
{
|
|
||||||
if( clear )
|
|
||||||
utf8.clear();
|
|
||||||
|
|
||||||
private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
|
||||||
utf8.append(utf8_buffer, buffer_len);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// not tested
|
|
||||||
template<typename char_type, size_t stack_size, size_t heap_block_size>
|
|
||||||
void WideToUTF8(TextStreamBase<char_type, stack_size, heap_block_size> & buffer, std::ostream & utf8, int mode = 1)
|
|
||||||
{
|
|
||||||
private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
|
||||||
utf8.write(utf8_buffer, buffer_len);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
void WideToUTF8(TextStreamBase<wchar_t, stack_size, heap_block_size> & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
void WideToUTF8(TextStreamBase<wchar_t, stack_size, heap_block_size> & buffer, std::ostream & utf8, int mode = 1); // not tested
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -330,5 +168,7 @@ void WideToUTF8(TextStreamBase<char_type, stack_size, heap_block_size> & buffer,
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
|
||||||
|
#include "utf8/utf8_templates.h"
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,286 @@
|
||||||
|
/*
|
||||||
|
* This file is a part of PikoTools
|
||||||
|
* and is distributed under the (new) BSD licence.
|
||||||
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2021, Tomasz Sowa
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* * Neither the name Tomasz Sowa nor the names of contributors to this
|
||||||
|
* project may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||||
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8_private.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace PT
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace private_namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
/*!
	an auxiliary function for converting from UTF-8 string

	decodes the first (leading) octet of a UTF-8 sequence

	input:
	  uz  - the first octet of the sequence

	output:
	  len - how many octets the whole sequence consists of (1 for a 7-bit character)
	  res - the payload bits stored in the leading octet

	the function returns false if the octet cannot start a sequence:
	  - it is a continuation octet (10xxxxxx)
	  - its payload is zero and the announced length is different from 3 and 4
	    (this covers the zero octet, 0xC0 and octets such as 0xF8, 0xFC, 0xFE, 0xFF)

	a zero payload is legal for 0xE0 and 0xF0: the significant bits of the characters
	U+0800..U+0FFF and U+10000..U+3FFFF are stored entirely in the continuation octets
	(previously such sequences were rejected here)
*/
bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
{
	// counting leading one-bits and shifting them out of the octet
	for(len=0 ; (uz & 0x80) != 0 ; ++len)
		uz <<= 1;

	if( len == 1 )
		return false;

	res = uz;

	// moving the payload bits back to the lowest positions
	if( len > 0 )
		res >>= len;

	if( res == 0 && len != 3 && len != 4 )
		return false;

	if( len == 0 )
		len = 1;

	return true;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	an auxiliary function for converting from UTF-8 string

	appends the six payload bits of a continuation octet (10xxxxxx) to res

	returns false (leaving res untouched) if uz is not a continuation octet
*/
bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res)
{
	const bool is_continuation = ((uz & 0xc0) == 0x80);

	if( is_continuation )
		res = (res << 6) | (uz & 0x3F);

	return is_continuation;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
converting a wide character into one int
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
|
||||||
|
{
|
||||||
|
if( string_len == 0 )
|
||||||
|
{
|
||||||
|
z = 0;
|
||||||
|
correct = false;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
z = static_cast<int>(*wide_string);
|
||||||
|
correct = true;
|
||||||
|
|
||||||
|
if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
|
||||||
|
{
|
||||||
|
if( z>=0xD800 && z<=0xDBFF && string_len>1 )
|
||||||
|
{
|
||||||
|
int z2 = *(wide_string+1);
|
||||||
|
|
||||||
|
if( z2>=0xDC00 && z2<=0xDFFF )
|
||||||
|
{
|
||||||
|
z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
correct = false;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
correct = false;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
correct = UTF8_CheckRange(z);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
converting a wide character into one int
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if wide_string has at least one character then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
|
||||||
|
{
|
||||||
|
size_t min_str_len = 1;
|
||||||
|
|
||||||
|
if( *wide_string == 0 )
|
||||||
|
{
|
||||||
|
z = 0;
|
||||||
|
correct = false;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( *(wide_string+1) != 0 )
|
||||||
|
min_str_len = 2;
|
||||||
|
|
||||||
|
return WideToInt(wide_string, min_str_len, z, correct);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
|
||||||
|
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
||||||
|
a null terminating character)
|
||||||
|
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
||||||
|
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
||||||
|
if this flag is true then utf8_written is equal to zero
|
||||||
|
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
||||||
|
(was_error will not be true if the utf8 buffer is too small)
|
||||||
|
*/
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||||
|
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
utf8_written = 0;
|
||||||
|
was_utf8_buf_too_small = false;
|
||||||
|
chars = WideToInt(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
{
|
||||||
|
utf8_written = IntToUTF8(z, utf8, utf8_len);
|
||||||
|
|
||||||
|
if( utf8_written == 0 )
|
||||||
|
was_utf8_buf_too_small = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
{
|
||||||
|
utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
if( utf8_written == 0 )
|
||||||
|
was_utf8_buf_too_small = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = WideToInt(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = IntToUTF8(z, utf8, false) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if wide_string has at least one character then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = WideToInt(wide_string, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = IntToUTF8(z, utf8, false) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace private_namespace
|
||||||
|
|
||||||
|
} // namespace PT
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,220 @@
|
||||||
|
/*
|
||||||
|
* This file is a part of PikoTools
|
||||||
|
* and is distributed under the (new) BSD licence.
|
||||||
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2021, Tomasz Sowa
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* * Neither the name Tomasz Sowa nor the names of contributors to this
|
||||||
|
* project may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||||
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef headerfile_picotools_utf8_utf8_private
|
||||||
|
#define headerfile_picotools_utf8_utf8_private
|
||||||
|
|
||||||
|
#include "textstream/textstream.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace PT
|
||||||
|
{
|
||||||
|
|
||||||
|
bool UTF8_CheckRange(int c);
|
||||||
|
size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len);
|
||||||
|
size_t IntToUTF8(int z, std::string & utf8, bool clear);
|
||||||
|
size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct);
|
||||||
|
|
||||||
|
|
||||||
|
namespace private_namespace
|
||||||
|
{
|
||||||
|
bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res);
|
||||||
|
bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res);
|
||||||
|
|
||||||
|
size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
|
||||||
|
size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct);
|
||||||
|
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||||
|
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
|
||||||
|
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
|
||||||
|
|
||||||
|
size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = WideToInt(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = IntToUTF8(z, utf8) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	an auxiliary function for converting from wide characters to UTF-8
	converts one character from a null terminated wide_string and puts it to the utf8 stream

	returns how many wide characters were used (zero for an empty string)
*/
template<typename StreamType>
static size_t WideOneToUTF8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
	if( wide_string[0] == 0 )
		return 0;

	// at most two wide characters are needed to decode one character
	const size_t available = (wide_string[1] != 0) ? 2 : 1;

	return WideOneToUTF8(wide_string, available, utf8, was_error, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// declared in utf8.h, defined in utf8.cpp
|
||||||
|
size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename function_type>
|
||||||
|
bool UTF8ToWideGeneric(const char * utf8, size_t utf8_len, int mode, function_type convert_function)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
size_t len;
|
||||||
|
bool correct, was_error = false;
|
||||||
|
|
||||||
|
while( utf8_len > 0 )
|
||||||
|
{
|
||||||
|
if( (unsigned char)*utf8 <= 0x7f )
|
||||||
|
{
|
||||||
|
// small optimization
|
||||||
|
len = 1;
|
||||||
|
correct = true;
|
||||||
|
z = static_cast<unsigned char>(*utf8);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
len = PT::UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero
|
||||||
|
}
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
convert_function(0xFFFD); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
convert_function(z);
|
||||||
|
}
|
||||||
|
|
||||||
|
utf8 += len;
|
||||||
|
utf8_len -= len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
void IntToWide(int c, TextStreamBase<wchar_t, stack_size, heap_block_size> & res)
|
||||||
|
{
|
||||||
|
if( sizeof(wchar_t)==2 && c>0xffff )
|
||||||
|
{
|
||||||
|
// UTF16 surrogate pairs
|
||||||
|
c -= 0x10000;
|
||||||
|
res << static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
|
||||||
|
res << static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
res << static_cast<wchar_t>(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// not tested
|
||||||
|
// FIX ME it is not using surrogate pairs from input stream
|
||||||
|
// and mode parameter
|
||||||
|
template<typename char_type, size_t stack_size, size_t heap_block_size, typename function_type>
|
||||||
|
void WideToUTF8Generic(TextStreamBase<char_type, stack_size, heap_block_size> & buffer, int mode, function_type write_function)
|
||||||
|
{
|
||||||
|
char utf8_buffer[256];
|
||||||
|
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
||||||
|
std::size_t utf8_sequence_max_length = 10;
|
||||||
|
std::size_t index = 0;
|
||||||
|
|
||||||
|
typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator i = buffer.begin();
|
||||||
|
|
||||||
|
while( i != buffer.end() )
|
||||||
|
{
|
||||||
|
if( index + utf8_sequence_max_length > buffer_len )
|
||||||
|
{
|
||||||
|
write_function(utf8_buffer, index);
|
||||||
|
index = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
index += IntToUTF8(*i, utf8_buffer + index, buffer_len - index);
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( index > 0 )
|
||||||
|
{
|
||||||
|
write_function(utf8_buffer, index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace private_namespace
|
||||||
|
|
||||||
|
} // namespace PT
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,271 @@
|
||||||
|
/*
|
||||||
|
* This file is a part of PikoTools
|
||||||
|
* and is distributed under the (new) BSD licence.
|
||||||
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2021, Tomasz Sowa
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* * Neither the name Tomasz Sowa nor the names of contributors to this
|
||||||
|
* project may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||||
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef headerfile_picotools_utf8_utf8_templates
|
||||||
|
#define headerfile_picotools_utf8_utf8_templates
|
||||||
|
|
||||||
|
// this file is included at the end of utf8.h
|
||||||
|
|
||||||
|
#include "utf8_private.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace PT
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
|
||||||
|
(need to be tested)
|
||||||
|
*/
|
||||||
|
// need to be tested
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
if( clear )
|
||||||
|
res.clear();
|
||||||
|
|
||||||
|
bool status = private_namespace::UTF8ToWideGeneric(utf8, utf8_len, mode, [&res](int c) {
|
||||||
|
private_namespace::IntToWide(c, res);
|
||||||
|
});
|
||||||
|
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const char * utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
size_t utf8_len = 0;
|
||||||
|
|
||||||
|
while( utf8[utf8_len] != 0 )
|
||||||
|
utf8_len += 1;
|
||||||
|
|
||||||
|
return UTF8ToWide(utf8, utf8_len, res, clear, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(const std::string & utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// need to be tested
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
bool UTF8ToWide(std::istream & utf8, TextStreamBase<wchar_t, stack_size, heap_block_size> & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct, was_error = false;
|
||||||
|
|
||||||
|
if( clear )
|
||||||
|
res.clear();
|
||||||
|
|
||||||
|
while( UTF8ToInt(utf8, z, correct) > 0 )
|
||||||
|
{
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
res << 0xFFFD; // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
private_namespace::IntToWide(z, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts one wide character into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
z - wide character
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
the function returns how many characters have been written to the utf8 stream,
|
||||||
|
zero means that 'z' is an incorrect unicode character
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
size_t IntToUTF8(int z, StreamType & utf8)
|
||||||
|
{
|
||||||
|
char buf[10];
|
||||||
|
|
||||||
|
size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char));
|
||||||
|
|
||||||
|
if( len > 0 )
|
||||||
|
utf8.write(buf, len);
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts a wide string into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
wide_string - a wide string for converting
|
||||||
|
string_len - size of the string
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
|
||||||
|
{
|
||||||
|
bool was_error = false;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
while( string_len > 0 )
|
||||||
|
{
|
||||||
|
chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
|
||||||
|
wide_string += chars;
|
||||||
|
string_len -= chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts a wide string into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
wide_string - a null terminated wide string for converting
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
bool WideToUTF8(const wchar_t * wide_string, StreamType & utf8, int mode)
|
||||||
|
{
|
||||||
|
bool was_error = false;
|
||||||
|
|
||||||
|
while( *wide_string )
|
||||||
|
wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode);
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	this function converts a wide string (std::wstring) into UTF-8 stream

	input:
	  wide_string - a wide string for converting
	  mode        - what to do with errors when converting
	                0: skip an invalid character
	                1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
	  utf8 - a UTF-8 stream for the output sequence

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool WideToUTF8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
	const wchar_t * input = wide_string.c_str();
	const size_t input_len = wide_string.size();

	return WideToUTF8(input, input_len, utf8, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
void WideToUTF8(TextStreamBase<wchar_t, stack_size, heap_block_size> & buffer, std::string & utf8, bool clear, int mode)
|
||||||
|
{
|
||||||
|
if( clear )
|
||||||
|
utf8.clear();
|
||||||
|
|
||||||
|
private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
||||||
|
utf8.append(utf8_buffer, buffer_len);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// not tested
|
||||||
|
template<size_t stack_size, size_t heap_block_size>
|
||||||
|
void WideToUTF8(TextStreamBase<wchar_t, stack_size, heap_block_size> & buffer, std::ostream & utf8, int mode)
|
||||||
|
{
|
||||||
|
private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
||||||
|
utf8.write(utf8_buffer, buffer_len);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace PT
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue