leave only utf8.h and utf8.cpp
Remove utf8_private.h, utf8_private.cpp and utf8_templates.h and move their methods to utf8.h/utf8.cpp.
This commit is contained in:
@@ -3,32 +3,28 @@
|
|||||||
./convert/inttostr.o: ./convert/inttostr.h
|
./convert/inttostr.o: ./convert/inttostr.h
|
||||||
./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
|
./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
|
||||||
./convert/misc.o: textstream/types.h ./convert/inttostr.h utf8/utf8.h
|
./convert/misc.o: textstream/types.h ./convert/inttostr.h utf8/utf8.h
|
||||||
./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h
|
|
||||||
./convert/text.o: ./convert/text.h ./convert/text_private.h
|
./convert/text.o: ./convert/text.h ./convert/text_private.h
|
||||||
./convert/double.o: ./convert/double.h textstream/textstream.h
|
./convert/double.o: ./convert/double.h textstream/textstream.h
|
||||||
./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h
|
./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||||
./convert/double.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
./convert/double.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||||
./convert/double.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
./convert/double.o: membuffer/membuffer.h textstream/types.h
|
||||||
./convert/double.o: textstream/types.h textstream/stream_private.h
|
./convert/double.o: textstream/stream_private.h
|
||||||
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
|
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
|
||||||
./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||||
./convert/baseparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
./convert/baseparser.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||||
./convert/baseparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
|
||||||
./convert/baseparser.o: textstream/types.h textstream/stream_private.h
|
./convert/baseparser.o: textstream/stream_private.h
|
||||||
./date/date.o: ./date/date.h convert/inttostr.h
|
./date/date.o: ./date/date.h convert/inttostr.h
|
||||||
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
|
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
|
||||||
./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h
|
./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h
|
||||||
./log/filelog.o: textstream/stream.h utf8/utf8_templates.h
|
./log/filelog.o: textstream/stream.h date/date.h membuffer/membuffer.h
|
||||||
./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
|
||||||
./log/filelog.o: textstream/types.h textstream/stream_private.h
|
./log/filelog.o: textstream/types.h textstream/stream_private.h
|
||||||
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
|
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
|
||||||
./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h
|
./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||||
./log/log.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
|
./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h
|
||||||
./log/log.o: membuffer/membuffer.h textstream/types.h
|
|
||||||
./log/log.o: textstream/stream_private.h ./log/filelog.h
|
./log/log.o: textstream/stream_private.h ./log/filelog.h
|
||||||
./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h
|
./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h
|
||||||
./space/space.o: textstream/stream.h utf8/utf8_templates.h
|
./space/space.o: textstream/stream.h convert/convert.h ./convert/inttostr.h
|
||||||
./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h
|
|
||||||
./space/space.o: convert/patternreplacer.h textstream/textstream.h
|
./space/space.o: convert/patternreplacer.h textstream/textstream.h
|
||||||
./space/space.o: textstream/stream.h space/space.h date/date.h
|
./space/space.o: textstream/stream.h space/space.h date/date.h
|
||||||
./space/space.o: membuffer/membuffer.h textstream/types.h
|
./space/space.o: membuffer/membuffer.h textstream/types.h
|
||||||
@@ -37,7 +33,6 @@
|
|||||||
./space/space.o: ./convert/double.h
|
./space/space.o: ./convert/double.h
|
||||||
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
|
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
|
||||||
./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||||
./space/spaceparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
|
||||||
./space/spaceparser.o: convert/baseparser.h textstream/textstream.h
|
./space/spaceparser.o: convert/baseparser.h textstream/textstream.h
|
||||||
./space/spaceparser.o: textstream/stream.h space/space.h date/date.h
|
./space/spaceparser.o: textstream/stream.h space/space.h date/date.h
|
||||||
./space/spaceparser.o: membuffer/membuffer.h textstream/types.h
|
./space/spaceparser.o: membuffer/membuffer.h textstream/types.h
|
||||||
@@ -45,7 +40,6 @@
|
|||||||
./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h
|
./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h
|
||||||
./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h
|
./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h
|
||||||
./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||||
./space/keyvalueparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
|
||||||
./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h
|
./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h
|
||||||
./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h
|
./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h
|
||||||
./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h
|
./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h
|
||||||
@@ -53,29 +47,24 @@
|
|||||||
./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h
|
./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h
|
||||||
./space/keyvalueparser.o: textstream/types.h
|
./space/keyvalueparser.o: textstream/types.h
|
||||||
./textstream/stream_private.o: textstream/stream_private.h
|
./textstream/stream_private.o: textstream/stream_private.h
|
||||||
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h
|
||||||
./utf8/utf8.o: utf8/utf8_private.h
|
|
||||||
./utf8/utf8_private.o: utf8/utf8_private.h
|
|
||||||
./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h
|
./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h
|
||||||
./csv/csvparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
./csv/csvparser.o: utf8/utf8.h textstream/stream.h convert/baseparser.h
|
||||||
./csv/csvparser.o: utf8/utf8_private.h convert/baseparser.h
|
|
||||||
./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h
|
./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h
|
||||||
./csv/csvparser.o: membuffer/membuffer.h textstream/types.h
|
./csv/csvparser.o: membuffer/membuffer.h textstream/types.h
|
||||||
./csv/csvparser.o: textstream/stream_private.h
|
./csv/csvparser.o: textstream/stream_private.h
|
||||||
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
|
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
|
||||||
./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h
|
./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h
|
||||||
./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h
|
./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h
|
||||||
./mainoptions/mainoptionsparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
|
||||||
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
|
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
|
||||||
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
|
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
|
||||||
./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||||
./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||||
./html/bbcodeparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
|
||||||
./html/bbcodeparser.o: textstream/types.h textstream/stream_private.h
|
./html/bbcodeparser.o: textstream/stream_private.h
|
||||||
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
|
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
|
||||||
./html/htmlparser.o: textstream/textstream.h textstream/stream.h
|
./html/htmlparser.o: textstream/textstream.h textstream/stream.h
|
||||||
./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h
|
./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h
|
||||||
./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
|
./html/htmlparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
|
||||||
./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
|
||||||
./html/htmlparser.o: textstream/types.h textstream/stream_private.h
|
./html/htmlparser.o: textstream/types.h textstream/stream_private.h
|
||||||
./html/htmlparser.o: convert/text.h
|
./html/htmlparser.o: convert/text.h
|
||||||
|
@@ -34,29 +34,27 @@
|
|||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
#include "utf8_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace pt
|
namespace pt
|
||||||
{
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
/*!
|
* returns true if 'c' is a correct unicode character
|
||||||
returns true if 'c' is a correct unicode character
|
*/
|
||||||
*/
|
|
||||||
bool utf8_check_range(int c)
|
bool utf8_check_range(int c)
|
||||||
{
|
{
|
||||||
return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF);
|
return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*
|
||||||
returns true if 'c' is a correct unicode character
|
* returns true if 'c' is a correct unicode character
|
||||||
|
*
|
||||||
this method is used when reading from an utf8 string
|
* this method is used when reading from an utf8 string
|
||||||
how_many_bytes - means how many bytes from the utf8 string were read
|
* how_many_bytes - means how many bytes from the utf8 string were read
|
||||||
*/
|
*/
|
||||||
bool utf8_check_range(int c, int how_many_bytes)
|
bool utf8_check_range(int c, int how_many_bytes)
|
||||||
{
|
{
|
||||||
if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )
|
if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )
|
||||||
@@ -126,12 +124,12 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
* an auxiliary function for converting from wide characters to UTF-8
|
||||||
converting a wide character into one int
|
* converting a wide character into one int
|
||||||
|
*
|
||||||
returns how many wide characters were used
|
* returns how many wide characters were used
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
* if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
*/
|
*/
|
||||||
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
|
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
|
||||||
{
|
{
|
||||||
if( string_len == 0 )
|
if( string_len == 0 )
|
||||||
@@ -177,12 +175,12 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
* an auxiliary function for converting from wide characters to UTF-8
|
||||||
converting a wide character into one int
|
* converting a wide character into one int
|
||||||
|
|
||||||
returns how many wide characters were used
|
* returns how many wide characters were used
|
||||||
if wide_string has at least one character then the return value is always greater than zero too
|
* if wide_string has at least one character then the return value is always greater than zero too
|
||||||
*/
|
*/
|
||||||
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
|
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
|
||||||
{
|
{
|
||||||
size_t min_str_len = 1;
|
size_t min_str_len = 1;
|
||||||
@@ -235,10 +233,10 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
converts an int to a wide string
|
* converts an int to a wide string
|
||||||
|
*
|
||||||
returns true if a character was inserted to the string
|
* returns true if a character was inserted to the string
|
||||||
*/
|
*/
|
||||||
bool int_to_wide(int c, std::wstring & res)
|
bool int_to_wide(int c, std::wstring & res)
|
||||||
{
|
{
|
||||||
wchar_t buf[2];
|
wchar_t buf[2];
|
||||||
@@ -281,23 +279,23 @@ bool int_to_stream(int c, pt::Stream & stream)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*
|
||||||
this function converts one UTF-8 character into one wide-character
|
* this function converts one UTF-8 character into one wide-character
|
||||||
|
*
|
||||||
input:
|
* input:
|
||||||
utf8 - an input UTF-8 string
|
* utf8 - an input UTF-8 string
|
||||||
utf8_len - size of the input string,
|
* utf8_len - size of the input string,
|
||||||
the string should be at least 4 bytes length for correctly
|
* the string should be at least 4 bytes length for correctly
|
||||||
recognized the utf-8 sequence
|
* recognized the utf-8 sequence
|
||||||
|
*
|
||||||
output:
|
* output:
|
||||||
res - an output character
|
* res - an output character
|
||||||
correct - true if it is a correct character
|
* correct - true if it is a correct character
|
||||||
|
*
|
||||||
the function returns how many characters have been used from the input string
|
* the function returns how many characters have been used from the input string
|
||||||
(returns zero only if utf8_len is zero)
|
* (returns zero only if utf8_len is zero)
|
||||||
even if there are errors the functions returns a different from zero value
|
* even if there are errors the functions returns a different from zero value
|
||||||
*/
|
*/
|
||||||
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
|
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
|
||||||
{
|
{
|
||||||
size_t i, len;
|
size_t i, len;
|
||||||
@@ -1016,6 +1014,160 @@ return res;
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
namespace private_namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from UTF-8 string
|
||||||
|
*/
|
||||||
|
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
|
||||||
|
{
|
||||||
|
for(len=0 ; (uz & 0x80) != 0 ; ++len)
|
||||||
|
uz <<= 1;
|
||||||
|
|
||||||
|
if( len == 1 || len > 4 )
|
||||||
|
return false;
|
||||||
|
|
||||||
|
res = uz;
|
||||||
|
|
||||||
|
if( len > 0 )
|
||||||
|
res >>= len;
|
||||||
|
|
||||||
|
if( len == 0 )
|
||||||
|
len = 1;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from UTF-8 string
|
||||||
|
*/
|
||||||
|
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
|
||||||
|
{
|
||||||
|
if( (uz & 0xc0) != 0x80 )
|
||||||
|
return false;
|
||||||
|
|
||||||
|
res <<= 6;
|
||||||
|
res |= (uz & 0x3F);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
|
||||||
|
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
||||||
|
a null terminating character)
|
||||||
|
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
||||||
|
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
||||||
|
if this flag is true then utf8_written is equal to zero
|
||||||
|
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
||||||
|
(was_error will not be true if the utf8 buffer is too small)
|
||||||
|
*/
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||||
|
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
utf8_written = 0;
|
||||||
|
was_utf8_buf_too_small = false;
|
||||||
|
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
{
|
||||||
|
utf8_written = int_to_utf8(z, utf8, utf8_len);
|
||||||
|
|
||||||
|
if( utf8_written == 0 )
|
||||||
|
was_utf8_buf_too_small = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
{
|
||||||
|
utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
if( utf8_written == 0 )
|
||||||
|
was_utf8_buf_too_small = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = int_to_utf8(z, utf8, false) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if wide_string has at least one character then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = wide_to_int(wide_string, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = int_to_utf8(z, utf8, false) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace private_namespace
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
826
src/utf8/utf8.h
826
src/utf8/utf8.h
@@ -177,7 +177,7 @@ bool int_to_wide(int c, std::wstring & res);
|
|||||||
call a convert_function for each character from an utf8 string
|
call a convert_function for each character from an utf8 string
|
||||||
*/
|
*/
|
||||||
template<typename OutputFunction>
|
template<typename OutputFunction>
|
||||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode = 1);
|
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode = 1);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -298,10 +298,832 @@ template<typename StreamType>
|
|||||||
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
|
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
namespace private_namespace
|
||||||
|
{
|
||||||
|
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
|
||||||
|
bool utf8_to_int_add_next_octet(unsigned char uz, int & res);
|
||||||
|
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||||
|
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
|
||||||
|
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
|
||||||
|
|
||||||
|
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
|
||||||
|
returns how many wide characters were used
|
||||||
|
if string_len is greater than 0 then the return value is always greater than zero too
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
correct = int_to_utf8(z, utf8) != 0;
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
an auxiliary function for converting from wide characters to UTF-8
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
|
||||||
|
{
|
||||||
|
size_t min_str_len = 1;
|
||||||
|
|
||||||
|
if( *wide_string == 0 )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if( *(wide_string+1) != 0 )
|
||||||
|
min_str_len = 2;
|
||||||
|
|
||||||
|
return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace private_namespace
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool int_to_wide(int c, StreamType & res)
|
||||||
|
{
|
||||||
|
wchar_t buf[2];
|
||||||
|
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
||||||
|
|
||||||
|
if( used == 1 )
|
||||||
|
{
|
||||||
|
res << buf[0];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( used == 2 )
|
||||||
|
{
|
||||||
|
res << buf[0];
|
||||||
|
res << buf[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
return used > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts one UTF-8 character into int
|
||||||
|
|
||||||
|
input:
|
||||||
|
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
|
||||||
|
iterator_end - an end iterator
|
||||||
|
|
||||||
|
output:
|
||||||
|
res - an output character
|
||||||
|
correct - true if it is a correct character
|
||||||
|
|
||||||
|
the function returns how many characters have been used from the input stream
|
||||||
|
*/
|
||||||
|
template<typename StreamIteratorType>
|
||||||
|
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct)
|
||||||
|
{
|
||||||
|
size_t i, len;
|
||||||
|
unsigned char uz;
|
||||||
|
|
||||||
|
res = 0;
|
||||||
|
correct = false;
|
||||||
|
|
||||||
|
if( iterator_in == iterator_end )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
uz = *iterator_in;
|
||||||
|
++iterator_in;
|
||||||
|
|
||||||
|
if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
for(i=1 ; i<len ; ++i)
|
||||||
|
{
|
||||||
|
if( iterator_in == iterator_end )
|
||||||
|
return i;
|
||||||
|
|
||||||
|
uz = *iterator_in;
|
||||||
|
++iterator_in;
|
||||||
|
|
||||||
|
if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( utf8_check_range(res, len) )
|
||||||
|
correct = true;
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
|
||||||
|
(need to be tested)
|
||||||
|
*/
|
||||||
|
// need to be tested
|
||||||
|
template<typename StreamType>
|
||||||
|
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
if( clear )
|
||||||
|
res.clear();
|
||||||
|
|
||||||
|
bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) {
|
||||||
|
int_to_wide(c, res);
|
||||||
|
}, mode);
|
||||||
|
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
size_t utf8_len = 0;
|
||||||
|
|
||||||
|
while( utf8[utf8_len] != 0 )
|
||||||
|
utf8_len += 1;
|
||||||
|
|
||||||
|
return utf8_to_wide(utf8, utf8_len, res, clear, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
bool correct, was_error = false;
|
||||||
|
|
||||||
|
if( clear )
|
||||||
|
res.clear();
|
||||||
|
|
||||||
|
while( utf8_to_int(utf8, z, correct) > 0 )
|
||||||
|
{
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
res << 0xFFFD; // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int_to_wide(z, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename OutputFunction>
|
||||||
|
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode)
|
||||||
|
{
|
||||||
|
int z;
|
||||||
|
size_t len;
|
||||||
|
bool correct, was_error = false;
|
||||||
|
|
||||||
|
while( utf8_len > 0 )
|
||||||
|
{
|
||||||
|
if( (unsigned char)*utf8 <= 0x7f )
|
||||||
|
{
|
||||||
|
// small optimization
|
||||||
|
len = 1;
|
||||||
|
correct = true;
|
||||||
|
z = static_cast<unsigned char>(*utf8);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
|
||||||
|
}
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
output_function(z);
|
||||||
|
}
|
||||||
|
|
||||||
|
utf8 += len;
|
||||||
|
utf8_len -= len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType, typename OutputFunction>
|
||||||
|
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
|
||||||
|
{
|
||||||
|
char utf8_buffer[256];
|
||||||
|
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
||||||
|
std::size_t utf8_sequence_max_length = 10;
|
||||||
|
std::size_t index = 0;
|
||||||
|
bool was_error = false;
|
||||||
|
|
||||||
|
typename StreamType::const_iterator i = buffer.begin();
|
||||||
|
|
||||||
|
while( i != buffer.end() )
|
||||||
|
{
|
||||||
|
if( index + utf8_sequence_max_length > buffer_len )
|
||||||
|
{
|
||||||
|
bool write_status = output_function(utf8_buffer, index);
|
||||||
|
index = 0;
|
||||||
|
|
||||||
|
if( !write_status )
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int c = 0xFFFD; // U+FFFD "replacement character";
|
||||||
|
bool seems_to_be_correct = false;
|
||||||
|
wchar_t w1 = *i;
|
||||||
|
|
||||||
|
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
|
||||||
|
{
|
||||||
|
++i;
|
||||||
|
|
||||||
|
if( i != buffer.end() )
|
||||||
|
{
|
||||||
|
wchar_t w2 = *i;
|
||||||
|
|
||||||
|
if( surrogate_pair_to_int(w1, w2, c) )
|
||||||
|
{
|
||||||
|
seems_to_be_correct = true;
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c = w1;
|
||||||
|
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( seems_to_be_correct || mode == 1 )
|
||||||
|
{
|
||||||
|
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
|
||||||
|
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
|
||||||
|
|
||||||
|
if( seq_len == 0 )
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
|
||||||
|
if( mode == 1 )
|
||||||
|
{
|
||||||
|
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
index += seq_len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( index > 0 )
|
||||||
|
{
|
||||||
|
if( !output_function(utf8_buffer, index) )
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
this function converts a UTF-8 stream into a wide stream or a wide string
|
||||||
|
|
||||||
|
input:
|
||||||
|
stream - a UTF-8 stream for converting
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" instead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
res - a wide stream or a wide string for the output sequence
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename StreamOrStringType>
|
||||||
|
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
|
||||||
|
{
|
||||||
|
if( clear )
|
||||||
|
res.clear();
|
||||||
|
|
||||||
|
return utf8_to_output_function(stream, [&](int z) {
|
||||||
|
int_to_wide(z, res);
|
||||||
|
}, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
this function reads characters from a UTF-8 stream and calls an output_function
|
||||||
|
|
||||||
|
input:
|
||||||
|
stream - a UTF-8 stream for converting
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType
|
||||||
|
and should put the character to the output string/stream, this function should have the signature like this:
|
||||||
|
output_function(int z, StreamOrStringType & res)
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename OutputFunction>
|
||||||
|
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
|
||||||
|
{
|
||||||
|
size_t len;
|
||||||
|
bool correct;
|
||||||
|
int z;
|
||||||
|
size_t index = 0;
|
||||||
|
bool was_error = false;
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
len = utf8_to_int(stream, index, z, correct);
|
||||||
|
|
||||||
|
if( len > 0 )
|
||||||
|
{
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
if( mode == 1 )
|
||||||
|
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||||
|
|
||||||
|
was_error = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
output_function(z);
|
||||||
|
}
|
||||||
|
|
||||||
|
index += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while( len > 0 );
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a UTF-8 sequence given by a pair of iterators into a wide stream or a wide string.

	input:
		iterator_in  - an iterator for reading from, it is passed by reference
		               to utf8_to_output_function(...) which advances it
		iterator_end - an end iterator
		clear_stream - when true out_stream.clear() is called before converting
		mode         - what to do with errors when converting
		               0: skip an invalid character
		               1: put U+FFFD "replacement character" instead of the invalid character

	output:
		out_stream - an output wide stream or wide string
		             (each character is appended through int_to_wide(...))

	This function returns false if there were some errors when converting.
*/
template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode)
{
	if( clear_stream )
	{
		out_stream.clear();
	}

	auto append_wide = [&out_stream](int code_point) {
		int_to_wide(code_point, out_stream);
	};

	return utf8_to_output_function(iterator_in, iterator_end, append_wide, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Reads characters from a UTF-8 sequence given by a pair of iterators
	and calls output_function for each of them.

	input:
		iterator_in  - an iterator for reading from, it is advanced by utf8_to_int(...)
		iterator_end - an end iterator
		mode         - what to do with errors when converting
		               0: skip an invalid character
		               1: pass U+FFFD "replacement character" instead of the invalid character

	output:
		output_function - a callable taking one int argument (the decoded character)

	This function returns false if there were some errors when converting.
*/
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{
	bool all_valid = true;

	while( iterator_in != iterator_end )
	{
		int code_point;
		bool is_valid;

		utf8_to_int(iterator_in, iterator_end, code_point, is_valid);

		if( !is_valid )
		{
			if( mode == 1 )
				output_function(0xFFFD); // U+FFFD "replacement character"

			all_valid = false;
			continue;
		}

		output_function(code_point);
	}

	return all_valid;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts UTF-8 stream into a wide string
|
||||||
|
|
||||||
|
input:
|
||||||
|
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
||||||
|
iterator_end - an end iterator
|
||||||
|
|
||||||
|
output:
|
||||||
|
out_buffer - an output wide string
|
||||||
|
max_buffer_len - how many characters can be write (we write the terminating null character too)
|
||||||
|
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||||
|
*/
|
||||||
|
template<typename StreamIteratorType>
|
||||||
|
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
|
||||||
|
{
|
||||||
|
int res;
|
||||||
|
bool correct;
|
||||||
|
bool was_error = true;
|
||||||
|
bool was_buffer_ok = false;
|
||||||
|
|
||||||
|
if( max_buffer_len > 0 )
|
||||||
|
{
|
||||||
|
max_buffer_len -= 1; // for terminating null character
|
||||||
|
was_error = false;
|
||||||
|
was_buffer_ok = true;
|
||||||
|
|
||||||
|
while( iterator_in != iterator_end )
|
||||||
|
{
|
||||||
|
utf8_to_int(iterator_in, iterator_end, res, correct);
|
||||||
|
|
||||||
|
if( !correct )
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
|
||||||
|
if( mode == 1 )
|
||||||
|
{
|
||||||
|
res = 0xFFFD; // U+FFFD "replacement character"
|
||||||
|
correct = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( correct )
|
||||||
|
{
|
||||||
|
size_t len = int_to_wide(res, out_buffer, max_buffer_len);
|
||||||
|
// if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand)
|
||||||
|
|
||||||
|
if( len == 0 )
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
was_buffer_ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
out_buffer += len;
|
||||||
|
max_buffer_len -= len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*out_buffer = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( was_buffer_sufficient_large )
|
||||||
|
*was_buffer_sufficient_large = was_buffer_ok;
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a UTF-8 stream into a null terminated wide string.

	This is a wrapper: it takes stream.begin() and stream.end() as
	StreamType::const_iterator objects and calls the iterator based utf8_to_wide(...).

	input:
		stream         - a stream for reading from
		max_buffer_len - how many characters can be written (the terminating null character included)
		mode           - what to do with errors when converting (passed through)

	output:
		out_buffer                  - the output wide string
		was_buffer_sufficient_large - a pointer to a bool value, passed through to the iterator based function

	The return value is the one returned by the iterator based utf8_to_wide(...).
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
	using ConstIterator = typename StreamType::const_iterator;

	ConstIterator current = stream.begin(); // advanced by the callee
	ConstIterator finish = stream.end();

	// note the different order of the last two arguments in the callee
	return utf8_to_wide(current, finish, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts one wide character into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
z - wide character
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
the function returns how many characters have been written to the utf8 stream,
|
||||||
|
zero means that 'z' is an incorrect unicode character
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
size_t int_to_utf8(int z, StreamType & utf8)
|
||||||
|
{
|
||||||
|
char buf[10];
|
||||||
|
|
||||||
|
size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char));
|
||||||
|
|
||||||
|
if( len > 0 )
|
||||||
|
utf8.write(buf, len);
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts a wide string into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
wide_string - a wide string for converting
|
||||||
|
string_len - size of the string
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
|
||||||
|
{
|
||||||
|
bool was_error = false;
|
||||||
|
size_t chars;
|
||||||
|
|
||||||
|
while( string_len > 0 )
|
||||||
|
{
|
||||||
|
chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);
|
||||||
|
wide_string += chars;
|
||||||
|
string_len -= chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
this function converts a wide string into UTF-8 stream
|
||||||
|
|
||||||
|
input:
|
||||||
|
wide_string - a null terminated wide string for converting
|
||||||
|
mode - what to do with errors when converting
|
||||||
|
0: skip an invalid character
|
||||||
|
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||||
|
|
||||||
|
output:
|
||||||
|
utf8 - a UTF-8 stream for the output sequence
|
||||||
|
|
||||||
|
this function returns false if there were some errors when converting
|
||||||
|
*/
|
||||||
|
template<typename StreamType>
|
||||||
|
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
|
||||||
|
{
|
||||||
|
bool was_error = false;
|
||||||
|
|
||||||
|
while( *wide_string )
|
||||||
|
wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode);
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a wide string (std::wstring) into a UTF-8 stream.

	This is a wrapper for wide_to_utf8(const wchar_t *, size_t, StreamType &, int),
	the whole string (wide_string.size() characters) is converted.

	input:
		wide_string - a wide string for converting
		mode        - what to do with errors when converting
		              0: skip an invalid character
		              1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a UTF-8 stream for the output sequence

	This function returns false if there were some errors when converting.
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
	const wchar_t * characters = wide_string.c_str();
	const size_t length = wide_string.size();

	return wide_to_utf8(characters, length, utf8, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a wide stream into a UTF-8 std::string.

	input:
		buffer - a wide stream for reading from
		clear  - when true utf8.clear() is called before converting
		mode   - what to do with errors when converting (passed to wide_to_output_function(...))

	output:
		utf8 - the output string, each chunk produced by wide_to_output_function(...) is appended to it

	The return value is the one returned by wide_to_output_function(...).
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
	if( clear )
	{
		utf8.clear();
	}

	// appending to a std::string is always reported as a success
	auto append_chunk = [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
		utf8.append(utf8_buffer, buffer_len);
		return true;
	};

	return wide_to_output_function(buffer, append_chunk, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename StreamType>
|
||||||
|
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
|
||||||
|
{
|
||||||
|
bool was_error = false;
|
||||||
|
|
||||||
|
if( clear )
|
||||||
|
utf8.clear();
|
||||||
|
|
||||||
|
for(size_t i=0 ; i < stream.size() ; ++i)
|
||||||
|
{
|
||||||
|
int c = static_cast<int>(stream.get_wchar(i));
|
||||||
|
bool is_correct = false;
|
||||||
|
|
||||||
|
if( utf8_check_range(c) )
|
||||||
|
{
|
||||||
|
// CHECKME test me when sizeof(wchar_t) == 2
|
||||||
|
if( is_first_surrogate_char(c) )
|
||||||
|
{
|
||||||
|
if( i + 1 < stream.size() )
|
||||||
|
{
|
||||||
|
wchar_t c1 = static_cast<wchar_t>(c);
|
||||||
|
wchar_t c2 = stream.get_wchar(++i);
|
||||||
|
|
||||||
|
if( surrogate_pair_to_int(c1, c2, c) )
|
||||||
|
{
|
||||||
|
is_correct = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
is_correct = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( is_correct )
|
||||||
|
{
|
||||||
|
int_to_utf8(c, utf8);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
was_error = true;
|
||||||
|
|
||||||
|
if( mode == 1 )
|
||||||
|
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return !was_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a wide stream into a UTF-8 stream.

	input:
		buffer - a wide stream for reading from
		clear  - when true utf8.clear() is called before converting
		mode   - what to do with errors when converting (passed to wide_to_output_function(...))

	output:
		utf8 - the output stream, each chunk produced by wide_to_output_function(...)
		       is written with utf8.write(chunk, length)

	The return value is the one returned by wide_to_output_function(...).
*/
template<typename StreamTypeIn, typename StreamTypeOut>
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{
	if( clear )
	{
		utf8.clear();
	}

	// the result of utf8.write(...) is not inspected, a chunk is always reported as written
	auto write_chunk = [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
		utf8.write(utf8_buffer, buffer_len);
		return true;
	};

	return wide_to_output_function(buffer, write_chunk, mode);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	Converts a wide stream into a null terminated UTF-8 string.

	input:
		buffer          - a wide stream for reading from
		max_buffer_size - how many characters can be written (the terminating null character included)
		mode            - what to do with errors when converting (passed to wide_to_output_function(...))

	output:
		utf8                        - the output UTF-8 string; if max_buffer_size is greater than zero
		                              the string is always null terminated, also when the input
		                              stream produces no output at all
		was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true
		                              if the buffer was sufficiently large

	This function returns false if there were some errors when converting or if the output buffer
	was too short (a zero max_buffer_size is reported as too short and nothing is written).
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{
	bool buffer_ok = false;
	bool is_ok = false;

	if( max_buffer_size > 0 )
	{
		buffer_ok = true;
		max_buffer_size -= 1; // for terminating null character

		// terminate the string up front: the callback below is the only other place
		// which writes the null character and it may never be called (e.g. for an empty input)
		*utf8 = 0;

		// utf8 and max_buffer_size are captured by reference: each call continues where the previous one stopped
		is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
			std::size_t i=0;

			for( ; i < buffer_len ; ++i)
			{
				if( i < max_buffer_size )
				{
					*utf8 = utf8_buffer[i];
					utf8 += 1;
				}
				else
				{
					buffer_ok = false;
					break;
				}
			}

			max_buffer_size -= i;
			*utf8 = 0;
			return buffer_ok;
		}, mode);
	}

	if( was_buffer_sufficient_large )
		*was_buffer_sufficient_large = buffer_ok;

	return is_ok;
}
|
||||||
|
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
|
||||||
#include "utf8/utf8_templates.h"
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@@ -1,201 +0,0 @@
|
|||||||
/*
|
|
||||||
* This file is a part of PikoTools
|
|
||||||
* and is distributed under the 2-Clause BSD licence.
|
|
||||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright notice,
|
|
||||||
* this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
||||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
* POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "utf8_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
namespace pt
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace private_namespace
|
|
||||||
{
|
|
||||||
|
|
||||||
/*!
	an auxiliary function for converting from UTF-8 string

	Decodes the first octet of a UTF-8 sequence.

	input:
		uz - the first octet

	output:
		len - the number of leading one bits of uz; when the function succeeds
		      and there are no leading ones (a single octet character) len is set to 1
		res - the payload bits of uz (the bits below the leading ones),
		      res is not changed when the function fails

	returns false if uz has exactly one leading one bit (10xxxxxx)
	or more than four leading one bits
*/
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
	// count the leading ones, each shift pushes one of them out of the octet
	len = 0;

	while( (uz & 0x80) != 0 )
	{
		uz <<= 1;
		len += 1;
	}

	if( len == 1 || len > 4 )
		return false;

	if( len == 0 )
	{
		res = uz;
		len = 1;
	}
	else
	{
		// move the payload back to the lowest bits
		res = uz >> len;
	}

	return true;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
	an auxiliary function for converting from UTF-8 string

	Adds a continuation octet (10xxxxxx) to the character being decoded:
	res is shifted left by six bits and the six low bits of uz are put there.

	returns false (and res is not changed) if uz is not a continuation octet
*/
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
	const bool is_continuation = (uz & 0xc0) == 0x80;

	if( is_continuation )
		res = (res << 6) | (uz & 0x3F);

	return is_continuation;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
|
|
||||||
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
|
||||||
a null terminating character)
|
|
||||||
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
|
||||||
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
|
||||||
if this flag is true then utf8_written is equal to zero
|
|
||||||
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
|
||||||
(was_error will not be true if the utf8 buffer is too small)
|
|
||||||
*/
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
|
||||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
utf8_written = 0;
|
|
||||||
was_utf8_buf_too_small = false;
|
|
||||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
{
|
|
||||||
utf8_written = int_to_utf8(z, utf8, utf8_len);
|
|
||||||
|
|
||||||
if( utf8_written == 0 )
|
|
||||||
was_utf8_buf_too_small = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
{
|
|
||||||
utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
if( utf8_written == 0 )
|
|
||||||
was_utf8_buf_too_small = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = int_to_utf8(z, utf8, false) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if wide_string has at least one character then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = wide_to_int(wide_string, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = int_to_utf8(z, utf8, false) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace private_namespace
|
|
||||||
|
|
||||||
} // namespace pt
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -1,117 +0,0 @@
|
|||||||
/*
|
|
||||||
* This file is a part of PikoTools
|
|
||||||
* and is distributed under the 2-Clause BSD licence.
|
|
||||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright notice,
|
|
||||||
* this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
||||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
* POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef headerfile_pikotools_src_utf8_utf8_private
|
|
||||||
#define headerfile_pikotools_src_utf8_utf8_private
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
|
|
||||||
namespace pt
|
|
||||||
{
|
|
||||||
|
|
||||||
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
|
|
||||||
size_t int_to_utf8(int z, std::string & utf8, bool clear);
|
|
||||||
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
|
|
||||||
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
|
|
||||||
|
|
||||||
|
|
||||||
namespace private_namespace
|
|
||||||
{
|
|
||||||
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
|
|
||||||
bool utf8_to_int_add_next_octet(unsigned char uz, int & res);
|
|
||||||
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
|
||||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
|
|
||||||
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
|
|
||||||
|
|
||||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
an auxiliary function for converting from wide characters to UTF-8
|
|
||||||
|
|
||||||
returns how many wide characters were used
|
|
||||||
if string_len is greater than 0 then the return value is always greater than zero too
|
|
||||||
*/
|
|
||||||
template<typename StreamType>
|
|
||||||
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
correct = int_to_utf8(z, utf8) != 0;
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
	an auxiliary function for converting from wide characters to UTF-8

	A version for a null terminated wide string: it works out how many characters
	(one or two) are available before the terminating null character and calls
	the version with string_len.

	returns zero if wide_string points to the terminating null character,
	otherwise the value returned by the version with string_len
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
	if( *wide_string == 0 )
		return 0;

	// the second character is looked at only when the first one is not the terminator
	size_t available = (wide_string[1] != 0) ? 2 : 1;

	return wide_one_to_utf8(wide_string, available, utf8, was_error, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace private_namespace
|
|
||||||
|
|
||||||
} // namespace pt
|
|
||||||
|
|
||||||
#endif
|
|
@@ -1,808 +0,0 @@
|
|||||||
/*
|
|
||||||
* This file is a part of PikoTools
|
|
||||||
* and is distributed under the 2-Clause BSD licence.
|
|
||||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright notice,
|
|
||||||
* this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
||||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
||||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
||||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
||||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
||||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
||||||
* POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef headerfile_pikotools_src_utf8_utf8_templates
|
|
||||||
#define headerfile_pikotools_src_utf8_utf8_templates
|
|
||||||
|
|
||||||
// this file is included at the end of utf8.h
|
|
||||||
|
|
||||||
#include "utf8_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
namespace pt
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
template<typename StreamType>
|
|
||||||
bool int_to_wide(int c, StreamType & res)
|
|
||||||
{
|
|
||||||
wchar_t buf[2];
|
|
||||||
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
|
||||||
|
|
||||||
if( used == 1 )
|
|
||||||
{
|
|
||||||
res << buf[0];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
if( used == 2 )
|
|
||||||
{
|
|
||||||
res << buf[0];
|
|
||||||
res << buf[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
return used > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts one UTF-8 character into int
|
|
||||||
|
|
||||||
input:
|
|
||||||
iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
|
||||||
iterator_end - an end iterator
|
|
||||||
|
|
||||||
output:
|
|
||||||
res - an output character
|
|
||||||
correct - true if it is a correct character
|
|
||||||
|
|
||||||
the function returns how many characters have been used from the input stream
|
|
||||||
*/
|
|
||||||
template<typename StreamIteratorType>
|
|
||||||
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct)
|
|
||||||
{
|
|
||||||
size_t i, len;
|
|
||||||
unsigned char uz;
|
|
||||||
|
|
||||||
res = 0;
|
|
||||||
correct = false;
|
|
||||||
|
|
||||||
if( iterator_in == iterator_end )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
uz = *iterator_in;
|
|
||||||
++iterator_in;
|
|
||||||
|
|
||||||
if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
for(i=1 ; i<len ; ++i)
|
|
||||||
{
|
|
||||||
if( iterator_in == iterator_end )
|
|
||||||
return i;
|
|
||||||
|
|
||||||
uz = *iterator_in;
|
|
||||||
++iterator_in;
|
|
||||||
|
|
||||||
if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
|
|
||||||
return i + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( utf8_check_range(res, len) )
|
|
||||||
correct = true;
|
|
||||||
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
	Converts a UTF-8 string of a given length into a wide stream
	(e.g. a TextStreamBase<wchar_t,...> stream).

	input:
		utf8     - an input UTF-8 string
		utf8_len - size of the input string
		clear    - when true res.clear() is called before converting
		mode     - what to do with errors when converting (passed to utf8_to_output_function(...))

	output:
		res - the output stream, each character is appended through int_to_wide(...)

	The return value is the one returned by utf8_to_output_function(...).

	(need to be tested)
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
	if( clear )
	{
		res.clear();
	}

	auto append_wide = [&res](int c) {
		int_to_wide(c, res);
	};

	return utf8_to_output_function(utf8, utf8_len, append_wide, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a null terminated UTF-8 string into a wide stream or a wide string

    the length is measured up to (not including) the first zero byte and the work
    is forwarded to the (pointer, length) overload of utf8_to_wide()
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
{
    const char * terminator = utf8;

    while( *terminator != 0 )
        ++terminator;

    return utf8_to_wide(utf8, static_cast<size_t>(terminator - utf8), res, clear, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a UTF-8 std::string into a wide stream or a wide string

    the whole string (utf8.size() bytes) is forwarded to the (pointer, length)
    overload of utf8_to_wide()
*/
template<typename StreamType>
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
{
    const char * bytes = utf8.c_str();
    size_t bytes_len = utf8.size();

    return utf8_to_wide(bytes, bytes_len, res, clear, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<typename StreamType>
|
|
||||||
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
bool correct, was_error = false;
|
|
||||||
|
|
||||||
if( clear )
|
|
||||||
res.clear();
|
|
||||||
|
|
||||||
while( utf8_to_int(utf8, z, correct) > 0 )
|
|
||||||
{
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
res << 0xFFFD; // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
int_to_wide(z, res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<typename OutputFunction>
|
|
||||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode)
|
|
||||||
{
|
|
||||||
int z;
|
|
||||||
size_t len;
|
|
||||||
bool correct, was_error = false;
|
|
||||||
|
|
||||||
while( utf8_len > 0 )
|
|
||||||
{
|
|
||||||
if( (unsigned char)*utf8 <= 0x7f )
|
|
||||||
{
|
|
||||||
// small optimization
|
|
||||||
len = 1;
|
|
||||||
correct = true;
|
|
||||||
z = static_cast<unsigned char>(*utf8);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
|
|
||||||
}
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
output_function(z);
|
|
||||||
}
|
|
||||||
|
|
||||||
utf8 += len;
|
|
||||||
utf8_len -= len;
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<typename StreamType, typename OutputFunction>
|
|
||||||
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
|
|
||||||
{
|
|
||||||
char utf8_buffer[256];
|
|
||||||
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
|
||||||
std::size_t utf8_sequence_max_length = 10;
|
|
||||||
std::size_t index = 0;
|
|
||||||
bool was_error = false;
|
|
||||||
|
|
||||||
typename StreamType::const_iterator i = buffer.begin();
|
|
||||||
|
|
||||||
while( i != buffer.end() )
|
|
||||||
{
|
|
||||||
if( index + utf8_sequence_max_length > buffer_len )
|
|
||||||
{
|
|
||||||
bool write_status = output_function(utf8_buffer, index);
|
|
||||||
index = 0;
|
|
||||||
|
|
||||||
if( !write_status )
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int c = 0xFFFD; // U+FFFD "replacement character";
|
|
||||||
bool seems_to_be_correct = false;
|
|
||||||
wchar_t w1 = *i;
|
|
||||||
|
|
||||||
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
|
|
||||||
{
|
|
||||||
++i;
|
|
||||||
|
|
||||||
if( i != buffer.end() )
|
|
||||||
{
|
|
||||||
wchar_t w2 = *i;
|
|
||||||
|
|
||||||
if( surrogate_pair_to_int(w1, w2, c) )
|
|
||||||
{
|
|
||||||
seems_to_be_correct = true;
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c = w1;
|
|
||||||
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( seems_to_be_correct || mode == 1 )
|
|
||||||
{
|
|
||||||
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
|
|
||||||
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
|
|
||||||
|
|
||||||
if( seq_len == 0 )
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
|
|
||||||
if( mode == 1 )
|
|
||||||
{
|
|
||||||
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
index += seq_len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( index > 0 )
|
|
||||||
{
|
|
||||||
if( !output_function(utf8_buffer, index) )
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
this function converts a UTF-8 stream into a wide stream or a wide string
|
|
||||||
|
|
||||||
input:
|
|
||||||
stream - a UTF-8 stream for converting
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
res - a wide stream or a wide string for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
template<typename StreamOrStringType>
|
|
||||||
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
|
|
||||||
{
|
|
||||||
if( clear )
|
|
||||||
res.clear();
|
|
||||||
|
|
||||||
return utf8_to_output_function(stream, [&](int z) {
|
|
||||||
int_to_wide(z, res);
|
|
||||||
}, mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
this function reads characters from a UTF-8 stream and calls an output_function
|
|
||||||
|
|
||||||
input:
|
|
||||||
stream - a UTF-8 stream for converting
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType
|
|
||||||
and should put the character to the output string/stream, this function should have the signature like this:
|
|
||||||
output_function(int z, StreamOrStringType & res)
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
template<typename OutputFunction>
|
|
||||||
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
|
|
||||||
{
|
|
||||||
size_t len;
|
|
||||||
bool correct;
|
|
||||||
int z;
|
|
||||||
size_t index = 0;
|
|
||||||
bool was_error = false;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
len = utf8_to_int(stream, index, z, correct);
|
|
||||||
|
|
||||||
if( len > 0 )
|
|
||||||
{
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
if( mode == 1 )
|
|
||||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
|
||||||
|
|
||||||
was_error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
output_function(z);
|
|
||||||
}
|
|
||||||
|
|
||||||
index += len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
while( len > 0 );
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a UTF-8 iterator range into a wide stream or a wide string

    input:
      iterator_in  - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
      iterator_end - an end iterator
      clear_stream - if true then out_stream.clear() is called before converting
      mode         - passed unchanged to utf8_to_output_function()

    output:
      out_stream - an output wide stream or wide string, every character is added with int_to_wide()

    this function returns false if there were some errors when converting
*/
template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode)
{
    if( clear_stream )
        out_stream.clear();

    auto append_character = [&out_stream](int character) {
        int_to_wide(character, out_stream);
    };

    return utf8_to_output_function(iterator_in, iterator_end, append_character, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    decode a UTF-8 iterator range and pass every character to output_function

    input:
      iterator_in  - iterator to read from, advanced by utf8_to_int()
      iterator_end - an end iterator
      mode         - what to do with an invalid character
                     1: call output_function with U+FFFD "replacement character"
                     any other value: skip it

    output:
      output_function - called as output_function(int) for every character

    this function returns false if there were some errors when converting
*/
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{
    bool all_correct = true;

    while( iterator_in != iterator_end )
    {
        int character;
        bool correct;

        utf8_to_int(iterator_in, iterator_end, character, correct);

        if( !correct )
        {
            if( mode == 1 )
                output_function(0xFFFD); // U+FFFD "replacement character"

            all_correct = false;
            continue;
        }

        output_function(character);
    }

    return all_correct;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts UTF-8 stream into a wide string
|
|
||||||
|
|
||||||
input:
|
|
||||||
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
|
||||||
iterator_end - an end iterator
|
|
||||||
|
|
||||||
output:
|
|
||||||
out_buffer - an output wide string
|
|
||||||
max_buffer_len - how many characters can be write (we write the terminating null character too)
|
|
||||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
|
||||||
*/
|
|
||||||
template<typename StreamIteratorType>
|
|
||||||
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
|
|
||||||
{
|
|
||||||
int res;
|
|
||||||
bool correct;
|
|
||||||
bool was_error = true;
|
|
||||||
bool was_buffer_ok = false;
|
|
||||||
|
|
||||||
if( max_buffer_len > 0 )
|
|
||||||
{
|
|
||||||
max_buffer_len -= 1; // for terminating null character
|
|
||||||
was_error = false;
|
|
||||||
was_buffer_ok = true;
|
|
||||||
|
|
||||||
while( iterator_in != iterator_end )
|
|
||||||
{
|
|
||||||
utf8_to_int(iterator_in, iterator_end, res, correct);
|
|
||||||
|
|
||||||
if( !correct )
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
|
|
||||||
if( mode == 1 )
|
|
||||||
{
|
|
||||||
res = 0xFFFD; // U+FFFD "replacement character"
|
|
||||||
correct = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( correct )
|
|
||||||
{
|
|
||||||
size_t len = int_to_wide(res, out_buffer, max_buffer_len);
|
|
||||||
// if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand)
|
|
||||||
|
|
||||||
if( len == 0 )
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
was_buffer_ok = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
out_buffer += len;
|
|
||||||
max_buffer_len -= len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*out_buffer = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( was_buffer_sufficient_large )
|
|
||||||
*was_buffer_sufficient_large = was_buffer_ok;
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a UTF-8 stream into a null terminated wide string

    input:
      stream - a stream for reading from, it is read through const_iterator (begin() / end())
      mode   - passed unchanged to the iterator overload of utf8_to_wide()

    output:
      out_buffer     - an output wide string
      max_buffer_len - how many characters can be written (the terminating null character is counted too)
      was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was large enough

    this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
    // the iterator overload takes the first iterator by a non-const reference so a named copy is needed
    typename StreamType::const_iterator first = stream.begin();
    typename StreamType::const_iterator last = stream.end();

    // note the different order of the last two arguments in the iterator overload
    return utf8_to_wide(first, last, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts one wide character into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
z - wide character
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
the function returns how many characters have been written to the utf8 stream,
|
|
||||||
zero means that 'z' is an incorrect unicode character
|
|
||||||
*/
|
|
||||||
template<typename StreamType>
|
|
||||||
size_t int_to_utf8(int z, StreamType & utf8)
|
|
||||||
{
|
|
||||||
char buf[10];
|
|
||||||
|
|
||||||
size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char));
|
|
||||||
|
|
||||||
if( len > 0 )
|
|
||||||
utf8.write(buf, len);
|
|
||||||
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts a wide string into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
wide_string - a wide string for converting
|
|
||||||
string_len - size of the string
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
template<typename StreamType>
|
|
||||||
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
|
|
||||||
{
|
|
||||||
bool was_error = false;
|
|
||||||
size_t chars;
|
|
||||||
|
|
||||||
while( string_len > 0 )
|
|
||||||
{
|
|
||||||
chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);
|
|
||||||
wide_string += chars;
|
|
||||||
string_len -= chars;
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
this function converts a wide string into UTF-8 stream
|
|
||||||
|
|
||||||
input:
|
|
||||||
wide_string - a null terminated wide string for converting
|
|
||||||
mode - what to do with errors when converting
|
|
||||||
0: skip an invalid character
|
|
||||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
|
||||||
|
|
||||||
output:
|
|
||||||
utf8 - a UTF-8 stream for the output sequence
|
|
||||||
|
|
||||||
this function returns false if there were some errors when converting
|
|
||||||
*/
|
|
||||||
template<typename StreamType>
|
|
||||||
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
|
|
||||||
{
|
|
||||||
bool was_error = false;
|
|
||||||
|
|
||||||
while( *wide_string )
|
|
||||||
wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode);
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a wide string (std::wstring) into a UTF-8 stream

    input:
      wide_string - a wide string for converting, all wide_string.size() characters are used
      mode        - what to do with errors when converting
                    0: skip an invalid character
                    1: put U+FFFD "replacement character" instead of the invalid character (default)

    output:
      utf8 - a UTF-8 stream for the output sequence

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
    const wchar_t * characters = wide_string.c_str();
    size_t characters_len = wide_string.size();

    return wide_to_utf8(characters, characters_len, utf8, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a wide stream into a UTF-8 std::string

    input:
      buffer - a wide stream for reading from
      clear  - if true then utf8.clear() is called before converting
      mode   - passed unchanged to wide_to_output_function()

    output:
      utf8 - the converted chunks are appended to this string

    returns the status reported by wide_to_output_function()
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
    if( clear )
        utf8.clear();

    // appending to the string is always reported as a success
    auto append_chunk = [&utf8](const char * chunk, std::size_t chunk_len) -> bool {
        utf8.append(chunk, chunk_len);
        return true;
    };

    return wide_to_output_function(buffer, append_chunk, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<typename StreamType>
|
|
||||||
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
|
|
||||||
{
|
|
||||||
bool was_error = false;
|
|
||||||
|
|
||||||
if( clear )
|
|
||||||
utf8.clear();
|
|
||||||
|
|
||||||
for(size_t i=0 ; i < stream.size() ; ++i)
|
|
||||||
{
|
|
||||||
int c = static_cast<int>(stream.get_wchar(i));
|
|
||||||
bool is_correct = false;
|
|
||||||
|
|
||||||
if( utf8_check_range(c) )
|
|
||||||
{
|
|
||||||
// CHECKME test me when sizeof(wchar_t) == 2
|
|
||||||
if( is_first_surrogate_char(c) )
|
|
||||||
{
|
|
||||||
if( i + 1 < stream.size() )
|
|
||||||
{
|
|
||||||
wchar_t c1 = static_cast<wchar_t>(c);
|
|
||||||
wchar_t c2 = stream.get_wchar(++i);
|
|
||||||
|
|
||||||
if( surrogate_pair_to_int(c1, c2, c) )
|
|
||||||
{
|
|
||||||
is_correct = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
is_correct = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( is_correct )
|
|
||||||
{
|
|
||||||
int_to_utf8(c, utf8);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
was_error = true;
|
|
||||||
|
|
||||||
if( mode == 1 )
|
|
||||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return !was_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a wide stream into a UTF-8 stream

    input:
      buffer - a wide stream for reading from
      clear  - if true then utf8.clear() is called before converting
      mode   - passed unchanged to wide_to_output_function()

    output:
      utf8 - the converted chunks are passed to utf8.write(data, len)

    returns the status reported by wide_to_output_function()
*/
template<typename StreamTypeIn, typename StreamTypeOut>
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{
    if( clear )
        utf8.clear();

    // the result of write() is not examined, every chunk is reported as stored
    auto write_chunk = [&utf8](const char * chunk, std::size_t chunk_len) -> bool {
        utf8.write(chunk, chunk_len);
        return true;
    };

    return wide_to_output_function(buffer, write_chunk, mode);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
    convert a wide stream into a null terminated utf8 string

    input:
      buffer - a wide stream for reading from
      mode   - passed unchanged to wide_to_output_function()

    output:
      utf8            - an output utf8 string
      max_buffer_size - how many characters can be written (the terminating null character is counted too),
                        when it is zero nothing is written at all
      was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was large enough

    when the buffer is too short the output is cut at the buffer size
    NOTE(review): the cut is made per byte so it may fall inside a multi-byte sequence

    this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{
    bool buffer_ok = false;
    bool is_ok = false;

    if( max_buffer_size > 0 )
    {
        buffer_ok = true;
        max_buffer_size -= 1; // for terminating null character

        // terminate the output up front: for an empty input stream
        // wide_to_output_function() never calls the lambda below
        *utf8 = 0;

        is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
            std::size_t to_copy = buffer_len;

            if( to_copy > max_buffer_size )
            {
                // only a part of this chunk fits
                to_copy = max_buffer_size;
                buffer_ok = false;
            }

            for(std::size_t i=0 ; i < to_copy ; ++i)
            {
                *utf8 = utf8_buffer[i];
                utf8 += 1;
            }

            max_buffer_size -= to_copy;
            *utf8 = 0;

            // false stops the conversion in wide_to_output_function()
            return buffer_ok;
        }, mode);
    }

    if( was_buffer_sufficient_large )
        *was_buffer_sufficient_large = buffer_ok;

    return is_ok;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace pt
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -4,16 +4,14 @@
|
|||||||
./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
||||||
./convert.o: ../src/textstream/stream.h ../src/space/space.h
|
./convert.o: ../src/textstream/stream.h ../src/space/space.h
|
||||||
./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||||
./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
./convert.o: ../src/textstream/stream.h ../src/date/date.h
|
||||||
./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
|
|
||||||
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
||||||
./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
||||||
./convert.o: ../src/convert/text.h ../src/convert/misc.h
|
./convert.o: ../src/convert/text.h ../src/convert/misc.h
|
||||||
./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h
|
./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h
|
||||||
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
|
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
|
||||||
./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||||
./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
./csvparser.o: ../src/textstream/stream.h ../src/convert/baseparser.h
|
||||||
./csvparser.o: ../src/utf8/utf8_private.h ../src/convert/baseparser.h
|
|
||||||
./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h
|
./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h
|
||||||
./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h
|
./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h
|
||||||
./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h
|
./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h
|
||||||
@@ -22,8 +20,7 @@
|
|||||||
./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
||||||
./main.o: ../src/textstream/stream.h ../src/space/space.h
|
./main.o: ../src/textstream/stream.h ../src/space/space.h
|
||||||
./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||||
./main.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
./main.o: ../src/textstream/stream.h ../src/date/date.h
|
||||||
./main.o: ../src/utf8/utf8_private.h ../src/date/date.h
|
|
||||||
./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
||||||
./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
||||||
./main.o: ../src/convert/text.h ../src/convert/misc.h
|
./main.o: ../src/convert/text.h ../src/convert/misc.h
|
||||||
@@ -34,9 +31,7 @@
|
|||||||
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
|
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
|
||||||
./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h
|
./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h
|
||||||
./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
|
./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
|
||||||
./mainoptionsparser.o: ../src/utf8/utf8_templates.h
|
./mainoptionsparser.o: ../src/convert/convert.h ../src/convert/inttostr.h
|
||||||
./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h
|
|
||||||
./mainoptionsparser.o: ../src/convert/inttostr.h
|
|
||||||
./mainoptionsparser.o: ../src/convert/patternreplacer.h
|
./mainoptionsparser.o: ../src/convert/patternreplacer.h
|
||||||
./mainoptionsparser.o: ../src/textstream/textstream.h
|
./mainoptionsparser.o: ../src/textstream/textstream.h
|
||||||
./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
|
./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
|
||||||
|
Reference in New Issue
Block a user