leave only utf8.h and utf8.cpp
Remove utf8_private.h, utf8_private.cpp and utf8_templates.h and move their methods to utf8.h/utf8.cpp.
This commit is contained in:
@@ -3,32 +3,28 @@
|
||||
./convert/inttostr.o: ./convert/inttostr.h
|
||||
./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
|
||||
./convert/misc.o: textstream/types.h ./convert/inttostr.h utf8/utf8.h
|
||||
./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h
|
||||
./convert/text.o: ./convert/text.h ./convert/text_private.h
|
||||
./convert/double.o: ./convert/double.h textstream/textstream.h
|
||||
./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||
./convert/double.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
||||
./convert/double.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
||||
./convert/double.o: textstream/types.h textstream/stream_private.h
|
||||
./convert/double.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||
./convert/double.o: membuffer/membuffer.h textstream/types.h
|
||||
./convert/double.o: textstream/stream_private.h
|
||||
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
|
||||
./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||
./convert/baseparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
||||
./convert/baseparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
||||
./convert/baseparser.o: textstream/types.h textstream/stream_private.h
|
||||
./convert/baseparser.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
|
||||
./convert/baseparser.o: textstream/stream_private.h
|
||||
./date/date.o: ./date/date.h convert/inttostr.h
|
||||
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
|
||||
./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h
|
||||
./log/filelog.o: textstream/stream.h utf8/utf8_templates.h
|
||||
./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
||||
./log/filelog.o: textstream/stream.h date/date.h membuffer/membuffer.h
|
||||
./log/filelog.o: textstream/types.h textstream/stream_private.h
|
||||
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
|
||||
./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||
./log/log.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
|
||||
./log/log.o: membuffer/membuffer.h textstream/types.h
|
||||
./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h
|
||||
./log/log.o: textstream/stream_private.h ./log/filelog.h
|
||||
./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h
|
||||
./space/space.o: textstream/stream.h utf8/utf8_templates.h
|
||||
./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h
|
||||
./space/space.o: textstream/stream.h convert/convert.h ./convert/inttostr.h
|
||||
./space/space.o: convert/patternreplacer.h textstream/textstream.h
|
||||
./space/space.o: textstream/stream.h space/space.h date/date.h
|
||||
./space/space.o: membuffer/membuffer.h textstream/types.h
|
||||
@@ -37,7 +33,6 @@
|
||||
./space/space.o: ./convert/double.h
|
||||
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
|
||||
./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||
./space/spaceparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
||||
./space/spaceparser.o: convert/baseparser.h textstream/textstream.h
|
||||
./space/spaceparser.o: textstream/stream.h space/space.h date/date.h
|
||||
./space/spaceparser.o: membuffer/membuffer.h textstream/types.h
|
||||
@@ -45,7 +40,6 @@
|
||||
./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h
|
||||
./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h
|
||||
./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
|
||||
./space/keyvalueparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
||||
./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h
|
||||
./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h
|
||||
./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h
|
||||
@@ -53,29 +47,24 @@
|
||||
./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h
|
||||
./space/keyvalueparser.o: textstream/types.h
|
||||
./textstream/stream_private.o: textstream/stream_private.h
|
||||
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
||||
./utf8/utf8.o: utf8/utf8_private.h
|
||||
./utf8/utf8_private.o: utf8/utf8_private.h
|
||||
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h
|
||||
./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h
|
||||
./csv/csvparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
||||
./csv/csvparser.o: utf8/utf8_private.h convert/baseparser.h
|
||||
./csv/csvparser.o: utf8/utf8.h textstream/stream.h convert/baseparser.h
|
||||
./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h
|
||||
./csv/csvparser.o: membuffer/membuffer.h textstream/types.h
|
||||
./csv/csvparser.o: textstream/stream_private.h
|
||||
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
|
||||
./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h
|
||||
./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h
|
||||
./mainoptions/mainoptionsparser.o: utf8/utf8_templates.h utf8/utf8_private.h
|
||||
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
|
||||
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
|
||||
./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h
|
||||
./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
|
||||
./html/bbcodeparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
||||
./html/bbcodeparser.o: textstream/types.h textstream/stream_private.h
|
||||
./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h date/date.h
|
||||
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
|
||||
./html/bbcodeparser.o: textstream/stream_private.h
|
||||
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
|
||||
./html/htmlparser.o: textstream/textstream.h textstream/stream.h
|
||||
./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h
|
||||
./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
|
||||
./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
|
||||
./html/htmlparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
|
||||
./html/htmlparser.o: textstream/types.h textstream/stream_private.h
|
||||
./html/htmlparser.o: convert/text.h
|
||||
|
@@ -34,29 +34,27 @@
|
||||
|
||||
#include <fstream>
|
||||
#include "utf8.h"
|
||||
#include "utf8_private.h"
|
||||
|
||||
|
||||
|
||||
namespace pt
|
||||
{
|
||||
|
||||
|
||||
/*!
|
||||
returns true if 'c' is a correct unicode character
|
||||
*/
|
||||
/*
|
||||
* returns true if 'c' is a correct unicode character
|
||||
*/
|
||||
bool utf8_check_range(int c)
{
	// outside of the unicode code space
	if( c < 0 || c > 0x10FFFF )
		return false;

	// the surrogate block 0xD800-0xDFFF is rejected
	return c < 0xD800 || c > 0xDFFF;
}
|
||||
|
||||
|
||||
/*!
|
||||
returns true if 'c' is a correct unicode character
|
||||
|
||||
this method is used when reading from an utf8 string
|
||||
how_many_bytes - means how many bytes from the utf8 string were read
|
||||
*/
|
||||
/*
|
||||
* returns true if 'c' is a correct unicode character
|
||||
*
|
||||
* this method is used when reading from an utf8 string
|
||||
* how_many_bytes - means how many bytes from the utf8 string were read
|
||||
*/
|
||||
bool utf8_check_range(int c, int how_many_bytes)
|
||||
{
|
||||
if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )
|
||||
@@ -126,12 +124,12 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
|
||||
|
||||
|
||||
/*
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
converting a wide character into one int
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
* an auxiliary function for converting from wide characters to UTF-8
|
||||
* converting a wide character into one int
|
||||
*
|
||||
* returns how many wide characters were used
|
||||
* if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
|
||||
{
|
||||
if( string_len == 0 )
|
||||
@@ -177,12 +175,12 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
|
||||
|
||||
|
||||
/*
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
converting a wide character into one int
|
||||
* an auxiliary function for converting from wide characters to UTF-8
|
||||
* converting a wide character into one int
|
||||
|
||||
returns how many wide characters were used
|
||||
if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
* returns how many wide characters were used
|
||||
* if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
|
||||
{
|
||||
size_t min_str_len = 1;
|
||||
@@ -235,10 +233,10 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
|
||||
|
||||
|
||||
/*
|
||||
converts an int to a wide string
|
||||
|
||||
returns true if a character was inserted to the string
|
||||
*/
|
||||
* converts an int to a wide string
|
||||
*
|
||||
* returns true if a character was inserted to the string
|
||||
*/
|
||||
bool int_to_wide(int c, std::wstring & res)
|
||||
{
|
||||
wchar_t buf[2];
|
||||
@@ -281,23 +279,23 @@ bool int_to_stream(int c, pt::Stream & stream)
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one UTF-8 character into one wide-character
|
||||
|
||||
input:
|
||||
utf8 - an input UTF-8 string
|
||||
utf8_len - size of the input string,
|
||||
the string should be at least 4 bytes length for correctly
|
||||
recognized the utf-8 sequence
|
||||
|
||||
output:
|
||||
res - an output character
|
||||
correct - true if it is a correct character
|
||||
|
||||
the function returns how many characters have been used from the input string
|
||||
(returns zero only if utf8_len is zero)
|
||||
even if there are errors the functions returns a different from zero value
|
||||
*/
|
||||
/*
|
||||
* this function converts one UTF-8 character into one wide-character
|
||||
*
|
||||
* input:
|
||||
* utf8 - an input UTF-8 string
|
||||
* utf8_len - size of the input string,
|
||||
* the string should be at least 4 bytes length for correctly
|
||||
* recognized the utf-8 sequence
|
||||
*
|
||||
* output:
|
||||
* res - an output character
|
||||
* correct - true if it is a correct character
|
||||
*
|
||||
* the function returns how many characters have been used from the input string
|
||||
* (returns zero only if utf8_len is zero)
|
||||
* even if there are errors the functions returns a different from zero value
|
||||
*/
|
||||
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
|
||||
{
|
||||
size_t i, len;
|
||||
@@ -1016,6 +1014,160 @@ return res;
|
||||
|
||||
|
||||
|
||||
namespace private_namespace
|
||||
{
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from UTF-8 string
|
||||
*/
|
||||
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
	// count the leading one bits of the octet, shifting them out on the way
	len = 0;

	while( (uz & 0x80) != 0 )
	{
		uz <<= 1;
		len += 1;
	}

	// exactly one leading bit or more than four leading bits is rejected,
	// 'res' is left untouched in such a case
	if( len == 1 || len > 4 )
		return false;

	if( len == 0 )
	{
		// no leading bits: the octet is the value itself and the sequence has one byte
		res = uz;
		len = 1;
	}
	else
	{
		// move the remaining payload bits back to the lowest positions
		res = uz >> len;
	}

	return true;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from UTF-8 string
|
||||
*/
|
||||
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
	// the two highest bits of the octet must be '10'
	const bool has_expected_prefix = (uz >> 6) == 0x02;

	// append the six low bits of the octet to the value collected so far
	if( has_expected_prefix )
		res = (res << 6) | (uz & 0x3F);

	return has_expected_prefix;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
|
||||
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
||||
a null terminating character)
|
||||
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
||||
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
||||
if this flag is true then utf8_written is equal to zero
|
||||
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
||||
(was_error will not be true if the utf8 buffer is too small)
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
utf8_written = 0;
|
||||
was_utf8_buf_too_small = false;
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
{
|
||||
utf8_written = int_to_utf8(z, utf8, utf8_len);
|
||||
|
||||
if( utf8_written == 0 )
|
||||
was_utf8_buf_too_small = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( mode == 1 )
|
||||
{
|
||||
utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
||||
|
||||
if( utf8_written == 0 )
|
||||
was_utf8_buf_too_small = true;
|
||||
}
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
} // namespace private_namespace
|
||||
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
|
826
src/utf8/utf8.h
826
src/utf8/utf8.h
@@ -177,7 +177,7 @@ bool int_to_wide(int c, std::wstring & res);
|
||||
call a convert_function for each character from an utf8 string
|
||||
*/
|
||||
template<typename OutputFunction>
|
||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode = 1);
|
||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode = 1);
|
||||
|
||||
|
||||
|
||||
@@ -298,10 +298,832 @@ template<typename StreamType>
|
||||
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
|
||||
|
||||
|
||||
|
||||
|
||||
namespace private_namespace
|
||||
{
|
||||
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
|
||||
bool utf8_to_int_add_next_octet(unsigned char uz, int & res);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
template<typename StreamType>
|
||||
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
*/
|
||||
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
	// an empty string: nothing to convert
	if( *wide_string == 0 )
		return 0;

	// at most two wide characters are handed to the length-based overload:
	// two if the string has a second character, one otherwise
	size_t available = (wide_string[1] != 0) ? 2 : 1;

	return wide_one_to_utf8(wide_string, available, utf8, was_error, mode);
}
|
||||
|
||||
} // namespace private_namespace
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool int_to_wide(int c, StreamType & res)
|
||||
{
|
||||
wchar_t buf[2];
|
||||
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
||||
|
||||
if( used == 1 )
|
||||
{
|
||||
res << buf[0];
|
||||
}
|
||||
else
|
||||
if( used == 2 )
|
||||
{
|
||||
res << buf[0];
|
||||
res << buf[1];
|
||||
}
|
||||
|
||||
return used > 0;
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one UTF-8 character into int
|
||||
|
||||
input:
|
||||
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
res - an output character
|
||||
correct - true if it is a correct character
|
||||
|
||||
the function returns how many characters have been used from the input stream
|
||||
*/
|
||||
template<typename StreamIteratorType>
|
||||
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct)
|
||||
{
|
||||
size_t i, len;
|
||||
unsigned char uz;
|
||||
|
||||
res = 0;
|
||||
correct = false;
|
||||
|
||||
if( iterator_in == iterator_end )
|
||||
return 0;
|
||||
|
||||
uz = *iterator_in;
|
||||
++iterator_in;
|
||||
|
||||
if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
|
||||
return 1;
|
||||
|
||||
for(i=1 ; i<len ; ++i)
|
||||
{
|
||||
if( iterator_in == iterator_end )
|
||||
return i;
|
||||
|
||||
uz = *iterator_in;
|
||||
++iterator_in;
|
||||
|
||||
if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
if( utf8_check_range(res, len) )
|
||||
correct = true;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
|
||||
(need to be tested)
|
||||
*/
|
||||
// need to be tested
|
||||
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
	if( clear )
		res.clear();

	// each decoded character is appended to the output as wide character(s)
	auto append_wide = [&res](int c) {
		int_to_wide(c, res);
	};

	return utf8_to_output_function(utf8, utf8_len, append_wide, mode);
}
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
{
	// find the terminating null character to get the length of the input
	const char * end = utf8;

	while( *end != 0 )
		++end;

	return utf8_to_wide(utf8, static_cast<size_t>(end - utf8), res, clear, mode);
}
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
{
	// forward the string's buffer and size to the pointer/length overload
	const char * bytes = utf8.c_str();
	const size_t bytes_len = utf8.size();

	return utf8_to_wide(bytes, bytes_len, res, clear, mode);
}
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct, was_error = false;
|
||||
|
||||
if( clear )
|
||||
res.clear();
|
||||
|
||||
while( utf8_to_int(utf8, z, correct) > 0 )
|
||||
{
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
res << 0xFFFD; // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
int_to_wide(z, res);
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
template<typename OutputFunction>
|
||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode)
|
||||
{
|
||||
int z;
|
||||
size_t len;
|
||||
bool correct, was_error = false;
|
||||
|
||||
while( utf8_len > 0 )
|
||||
{
|
||||
if( (unsigned char)*utf8 <= 0x7f )
|
||||
{
|
||||
// small optimization
|
||||
len = 1;
|
||||
correct = true;
|
||||
z = static_cast<unsigned char>(*utf8);
|
||||
}
|
||||
else
|
||||
{
|
||||
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
|
||||
}
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_function(z);
|
||||
}
|
||||
|
||||
utf8 += len;
|
||||
utf8_len -= len;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
template<typename StreamType, typename OutputFunction>
|
||||
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
|
||||
{
|
||||
char utf8_buffer[256];
|
||||
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
||||
std::size_t utf8_sequence_max_length = 10;
|
||||
std::size_t index = 0;
|
||||
bool was_error = false;
|
||||
|
||||
typename StreamType::const_iterator i = buffer.begin();
|
||||
|
||||
while( i != buffer.end() )
|
||||
{
|
||||
if( index + utf8_sequence_max_length > buffer_len )
|
||||
{
|
||||
bool write_status = output_function(utf8_buffer, index);
|
||||
index = 0;
|
||||
|
||||
if( !write_status )
|
||||
{
|
||||
was_error = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int c = 0xFFFD; // U+FFFD "replacement character";
|
||||
bool seems_to_be_correct = false;
|
||||
wchar_t w1 = *i;
|
||||
|
||||
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
|
||||
{
|
||||
++i;
|
||||
|
||||
if( i != buffer.end() )
|
||||
{
|
||||
wchar_t w2 = *i;
|
||||
|
||||
if( surrogate_pair_to_int(w1, w2, c) )
|
||||
{
|
||||
seems_to_be_correct = true;
|
||||
++i;
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
c = w1;
|
||||
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
|
||||
++i;
|
||||
}
|
||||
|
||||
if( seems_to_be_correct || mode == 1 )
|
||||
{
|
||||
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
|
||||
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
|
||||
|
||||
if( seq_len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
|
||||
}
|
||||
}
|
||||
|
||||
index += seq_len;
|
||||
}
|
||||
}
|
||||
|
||||
if( index > 0 )
|
||||
{
|
||||
if( !output_function(utf8_buffer, index) )
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this function converts a UTF-8 stream into a wide stream or a wide string
|
||||
|
||||
input:
|
||||
stream - a UTF-8 stream for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" instead of the invalid character (default)
|
||||
|
||||
output:
|
||||
res - a wide stream or a wide string for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamOrStringType>
|
||||
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
|
||||
{
|
||||
if( clear )
|
||||
res.clear();
|
||||
|
||||
return utf8_to_output_function(stream, [&](int z) {
|
||||
int_to_wide(z, res);
|
||||
}, mode);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this function reads characters from a UTF-8 stream and calls an output_function
|
||||
|
||||
input:
|
||||
stream - a UTF-8 stream for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" instead of the invalid character (default)
|
||||
|
||||
output:
|
||||
output_function - is a function which gets one argument: int (the decoded character)
|
||||
and should put the character to the output string/stream, this function should have the signature like this:
|
||||
output_function(int z)
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename OutputFunction>
|
||||
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
|
||||
{
|
||||
size_t len;
|
||||
bool correct;
|
||||
int z;
|
||||
size_t index = 0;
|
||||
bool was_error = false;
|
||||
|
||||
do
|
||||
{
|
||||
len = utf8_to_int(stream, index, z, correct);
|
||||
|
||||
if( len > 0 )
|
||||
{
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_function(z);
|
||||
}
|
||||
|
||||
index += len;
|
||||
}
|
||||
}
|
||||
while( len > 0 );
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts UTF-8 stream into a wide stream or a wide string
|
||||
|
||||
input:
|
||||
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string)
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode)
{
	if( clear_stream )
		out_stream.clear();

	// each decoded character is appended to the output as wide character(s)
	auto append_wide = [&](int z) {
		int_to_wide(z, out_stream);
	};

	return utf8_to_output_function(iterator_in, iterator_end, append_wide, mode);
}
|
||||
|
||||
|
||||
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{
	int code_point;
	bool correct;
	bool all_valid = true;

	while( iterator_in != iterator_end )
	{
		// iterator_in is passed by reference, the returned length is not needed here
		utf8_to_int(iterator_in, iterator_end, code_point, correct);

		if( !correct )
		{
			if( mode == 1 )
				output_function(0xFFFD); // U+FFFD "replacement character"

			all_valid = false;
			continue;
		}

		output_function(code_point);
	}

	return all_valid;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts UTF-8 stream into a wide string
|
||||
|
||||
input:
|
||||
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
out_buffer - an output wide string
|
||||
max_buffer_len - how many characters can be written (we write the terminating null character too)
|
||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||
|
||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||
*/
|
||||
template<typename StreamIteratorType>
|
||||
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
|
||||
{
|
||||
int res;
|
||||
bool correct;
|
||||
bool was_error = true;
|
||||
bool was_buffer_ok = false;
|
||||
|
||||
if( max_buffer_len > 0 )
|
||||
{
|
||||
max_buffer_len -= 1; // for terminating null character
|
||||
was_error = false;
|
||||
was_buffer_ok = true;
|
||||
|
||||
while( iterator_in != iterator_end )
|
||||
{
|
||||
utf8_to_int(iterator_in, iterator_end, res, correct);
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
res = 0xFFFD; // U+FFFD "replacement character"
|
||||
correct = true;
|
||||
}
|
||||
}
|
||||
|
||||
if( correct )
|
||||
{
|
||||
size_t len = int_to_wide(res, out_buffer, max_buffer_len);
|
||||
// if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand)
|
||||
|
||||
if( len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
was_buffer_ok = false;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
out_buffer += len;
|
||||
max_buffer_len -= len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_buffer = 0;
|
||||
}
|
||||
|
||||
if( was_buffer_sufficient_large )
|
||||
*was_buffer_sufficient_large = was_buffer_ok;
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts UTF-8 stream into a wide string
|
||||
|
||||
input:
|
||||
stream - a stream for reading from
|
||||
|
||||
output:
|
||||
out_buffer - an output wide string
|
||||
max_buffer_len - how many characters can be written (we write the terminating null character too)
|
||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||
|
||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||
*/
|
||||
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
	using ConstIterator = typename StreamType::const_iterator;

	// the iterator based overload takes the first iterator by a non-const reference
	// so named objects are needed here
	ConstIterator first = stream.begin();
	ConstIterator last = stream.end();

	// note the different order of the last two arguments in the iterator based overload
	return utf8_to_wide(first, last, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one wide character into UTF-8 stream
|
||||
|
||||
input:
|
||||
z - wide character
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
the function returns how many characters have been written to the utf8 stream,
|
||||
zero means that 'z' is an incorrect unicode character
|
||||
*/
|
||||
template<typename StreamType>
|
||||
size_t int_to_utf8(int z, StreamType & utf8)
|
||||
{
|
||||
char buf[10];
|
||||
|
||||
size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char));
|
||||
|
||||
if( len > 0 )
|
||||
utf8.write(buf, len);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide string into UTF-8 stream
|
||||
|
||||
input:
|
||||
wide_string - a wide string for converting
|
||||
string_len - size of the string
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamType>
|
||||
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
size_t chars;
|
||||
|
||||
while( string_len > 0 )
|
||||
{
|
||||
chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);
|
||||
wide_string += chars;
|
||||
string_len -= chars;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide string into UTF-8 stream
|
||||
|
||||
input:
|
||||
wide_string - a null terminated wide string for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamType>
|
||||
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
|
||||
while( *wide_string )
|
||||
wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode);
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a std::wstring into a UTF-8 stream.

	input:
		wide_string - the wide string to convert
		mode - what to do with an invalid character:
		       0: skip it
		       1: put U+FFFD "replacement character" instead of it

	output:
		utf8 - a UTF-8 stream for the output sequence

	The work is delegated to the (pointer, length) overload, so embedded null
	characters are converted as well. Returns what that overload returns.
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
	const wchar_t * characters = wide_string.c_str();
	const size_t characters_len = wide_string.size();

	return wide_to_utf8(characters, characters_len, utf8, mode);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a wide stream into a UTF-8 std::string.

	input:
		buffer - a wide stream for reading from
		clear - if true the output string is cleared first
		mode - forwarded to wide_to_output_function(...)

	output:
		utf8 - the converted bytes are appended here

	Returns what wide_to_output_function(...) returns.
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
	if( clear )
		utf8.clear();

	// appending to a std::string is always reported as successful
	auto append_chunk = [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
		utf8.append(utf8_buffer, buffer_len);
		return true;
	};

	return wide_to_output_function(buffer, append_chunk, mode);
}
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
|
||||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
for(size_t i=0 ; i < stream.size() ; ++i)
|
||||
{
|
||||
int c = static_cast<int>(stream.get_wchar(i));
|
||||
bool is_correct = false;
|
||||
|
||||
if( utf8_check_range(c) )
|
||||
{
|
||||
// CHECKME test me when sizeof(wchar_t) == 2
|
||||
if( is_first_surrogate_char(c) )
|
||||
{
|
||||
if( i + 1 < stream.size() )
|
||||
{
|
||||
wchar_t c1 = static_cast<wchar_t>(c);
|
||||
wchar_t c2 = stream.get_wchar(++i);
|
||||
|
||||
if( surrogate_pair_to_int(c1, c2, c) )
|
||||
{
|
||||
is_correct = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
is_correct = true;
|
||||
}
|
||||
}
|
||||
|
||||
if( is_correct )
|
||||
{
|
||||
int_to_utf8(c, utf8);
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
/*!
	Convert a wide stream into a UTF-8 stream.

	input:
		buffer - a wide stream for reading from
		clear - if true the output stream is cleared first
		mode - forwarded to wide_to_output_function(...)

	output:
		utf8 - the converted bytes are stored with utf8.write(pointer, length)

	Returns what wide_to_output_function(...) returns.
*/
template<typename StreamTypeIn, typename StreamTypeOut>
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{
	if( clear )
		utf8.clear();

	// the result of write(...) is not inspected, every chunk is reported as stored
	auto write_chunk = [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
		utf8.write(utf8_buffer, buffer_len);
		return true;
	};

	return wide_to_output_function(buffer, write_chunk, mode);
}
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a wide stream into a null terminated UTF-8 string stored in a raw buffer.

	input:
		buffer - a wide stream for reading from
		mode - forwarded to wide_to_output_function(...)

	output:
		utf8 - the output buffer for the UTF-8 string
		max_buffer_size - how many characters can be written (the terminating null character included)
		was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true
		                              if the buffer was sufficiently large

	If max_buffer_size is zero nothing is written and false is returned.
	Otherwise the output is always null terminated, also when the input stream
	is empty (in that case the output callback is never called, so the terminator
	is stored before the conversion starts).

	The bytes are copied one by one until the buffer is full, so a truncated
	result may end in the middle of a multi-byte sequence.

	Returns false if there were some errors when converting or if the output buffer was too short.
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{
	bool buffer_ok = false;
	bool is_ok = false;

	if( max_buffer_size > 0 )
	{
		buffer_ok = true;
		max_buffer_size -= 1; // for terminating null character

		// terminate up front: for an empty input the lambda below is never invoked
		*utf8 = 0;

		is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
			std::size_t to_copy = buffer_len;

			if( to_copy > max_buffer_size )
			{
				// not everything fits: copy what we can and report the shortage
				to_copy = max_buffer_size;
				buffer_ok = false;
			}

			for(std::size_t i = 0 ; i < to_copy ; ++i)
			{
				*utf8 = utf8_buffer[i];
				utf8 += 1;
			}

			max_buffer_size -= to_copy;
			*utf8 = 0;

			// returning false stops the conversion
			return buffer_ok;
		}, mode);
	}

	if( was_buffer_sufficient_large )
		*was_buffer_sufficient_large = buffer_ok;

	return is_ok;
}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "utf8/utf8_templates.h"
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -1,201 +0,0 @@
|
||||
/*
|
||||
* This file is a part of PikoTools
|
||||
* and is distributed under the 2-Clause BSD licence.
|
||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "utf8_private.h"
|
||||
|
||||
|
||||
namespace pt
|
||||
{
|
||||
|
||||
namespace private_namespace
|
||||
{
|
||||
|
||||
/*!
	an auxiliary function for converting from UTF-8 string

	Decodes the first octet of a UTF-8 sequence.

	input:
		uz - the first octet

	output:
		len - the number of leading one bits in uz; on success zero is turned into 1,
		      so len is the length of the whole sequence (1 to 4)
		res - the payload bits of the first octet (set only on success)

	Returns false when the octet has exactly one leading one bit (a continuation
	octet) or more than four; in that case res is left untouched.
*/
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
	len = 0;

	// count the leading ones, shifting them out of the octet at the same time
	while( (uz & 0x80) != 0 )
	{
		uz = static_cast<unsigned char>(uz << 1);
		len += 1;
	}

	if( len == 1 || len > 4 )
		return false;

	if( len == 0 )
	{
		// plain one octet character
		res = uz;
		len = 1;
	}
	else
	{
		// move the payload bits back to their place
		res = uz >> len;
	}

	return true;
}
|
||||
|
||||
|
||||
|
||||
/*!
	an auxiliary function for converting from UTF-8 string

	Appends the six payload bits of a continuation octet to res.

	input:
		uz - the next octet of a sequence, it must have the form 10xxxxxx

	output:
		res - shifted left by six bits and extended with the payload of uz

	Returns false (and leaves res untouched) if uz is not a continuation octet.
*/
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
	const bool is_continuation = (uz >> 6) == 0x2;

	if( !is_continuation )
		return false;

	res = (res << 6) | (uz & 0x3F);

	return true;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
|
||||
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
|
||||
a null terminating character)
|
||||
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
|
||||
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
|
||||
if this flag is true then utf8_written is equal to zero
|
||||
was_error - will be true if there is an error when converting (there was an incorrect wide character)
|
||||
(was_error will not be true if the utf8 buffer is too small)
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
utf8_written = 0;
|
||||
was_utf8_buf_too_small = false;
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
{
|
||||
utf8_written = int_to_utf8(z, utf8, utf8_len);
|
||||
|
||||
if( utf8_written == 0 )
|
||||
was_utf8_buf_too_small = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( mode == 1 )
|
||||
{
|
||||
utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
|
||||
|
||||
if( utf8_written == 0 )
|
||||
was_utf8_buf_too_small = true;
|
||||
}
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if wide_string has at least one character then the return value is always greater than zero too
|
||||
*/
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8, false) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace private_namespace
|
||||
|
||||
} // namespace pt
|
||||
|
||||
|
||||
|
@@ -1,117 +0,0 @@
|
||||
/*
|
||||
* This file is a part of PikoTools
|
||||
* and is distributed under the 2-Clause BSD licence.
|
||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef headerfile_pikotools_src_utf8_utf8_private
|
||||
#define headerfile_pikotools_src_utf8_utf8_private
|
||||
|
||||
#include <string>
|
||||
|
||||
|
||||
namespace pt
|
||||
{
|
||||
|
||||
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
|
||||
size_t int_to_utf8(int z, std::string & utf8, bool clear);
|
||||
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
|
||||
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
|
||||
|
||||
|
||||
namespace private_namespace
|
||||
{
|
||||
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
|
||||
bool utf8_to_int_add_next_octet(unsigned char uz, int & res);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
|
||||
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
|
||||
|
||||
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
|
||||
|
||||
|
||||
/*!
|
||||
an auxiliary function for converting from wide characters to UTF-8
|
||||
|
||||
returns how many wide characters were used
|
||||
if string_len is greater than 0 then the return value is always greater than zero too
|
||||
*/
|
||||
template<typename StreamType>
|
||||
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct;
|
||||
size_t chars;
|
||||
|
||||
chars = wide_to_int(wide_string, string_len, z, correct);
|
||||
|
||||
if( correct )
|
||||
correct = int_to_utf8(z, utf8) != 0;
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
/*!
	an auxiliary function for converting from wide characters to UTF-8

	A variant for a null terminated wide string: returns zero for an empty
	string, otherwise calls the variant taking a length, telling it that one or
	two wide characters are available (two when the second one is not the
	terminating null character).
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
	if( wide_string[0] == 0 )
		return 0;

	size_t available = (wide_string[1] == 0) ? 1 : 2;

	return wide_one_to_utf8(wide_string, available, utf8, was_error, mode);
}
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace private_namespace
|
||||
|
||||
} // namespace pt
|
||||
|
||||
#endif
|
@@ -1,808 +0,0 @@
|
||||
/*
|
||||
* This file is a part of PikoTools
|
||||
* and is distributed under the 2-Clause BSD licence.
|
||||
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021-2024, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef headerfile_pikotools_src_utf8_utf8_templates
|
||||
#define headerfile_pikotools_src_utf8_utf8_templates
|
||||
|
||||
// this file is included at the end of utf8.h
|
||||
|
||||
#include "utf8_private.h"
|
||||
|
||||
|
||||
namespace pt
|
||||
{
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool int_to_wide(int c, StreamType & res)
|
||||
{
|
||||
wchar_t buf[2];
|
||||
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
||||
|
||||
if( used == 1 )
|
||||
{
|
||||
res << buf[0];
|
||||
}
|
||||
else
|
||||
if( used == 2 )
|
||||
{
|
||||
res << buf[0];
|
||||
res << buf[1];
|
||||
}
|
||||
|
||||
return used > 0;
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one UTF-8 character into int
|
||||
|
||||
input:
|
||||
iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
res - an output character
|
||||
correct - true if it is a correct character
|
||||
|
||||
the function returns how many characters have been used from the input stream
|
||||
*/
|
||||
template<typename StreamIteratorType>
|
||||
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct)
|
||||
{
|
||||
size_t i, len;
|
||||
unsigned char uz;
|
||||
|
||||
res = 0;
|
||||
correct = false;
|
||||
|
||||
if( iterator_in == iterator_end )
|
||||
return 0;
|
||||
|
||||
uz = *iterator_in;
|
||||
++iterator_in;
|
||||
|
||||
if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
|
||||
return 1;
|
||||
|
||||
for(i=1 ; i<len ; ++i)
|
||||
{
|
||||
if( iterator_in == iterator_end )
|
||||
return i;
|
||||
|
||||
uz = *iterator_in;
|
||||
++iterator_in;
|
||||
|
||||
if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
if( utf8_check_range(res, len) )
|
||||
correct = true;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a UTF-8 string of a given length to a wide stream
	(e.g. a TextStreamBase<wchar_t,...> stream).

	Every code point delivered by utf8_to_output_function(...) is put into 'res'
	with int_to_wide(...). If 'clear' is true the output is cleared first.

	Returns what utf8_to_output_function(...) returns.

	(needs to be tested)
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
	if( clear )
		res.clear();

	auto put_code_point = [&res](int c) {
		int_to_wide(c, res);
	};

	return utf8_to_output_function(utf8, utf8_len, put_code_point, mode);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a null terminated UTF-8 string to a wide stream.

	The length of the string is measured first (up to the terminating null
	character) and the work is delegated to the (pointer, length) overload.
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
{
	const char * terminator = utf8;

	while( *terminator != 0 )
		++terminator;

	size_t utf8_len = static_cast<size_t>(terminator - utf8);

	return utf8_to_wide(utf8, utf8_len, res, clear, mode);
}
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a UTF-8 std::string to a wide stream.

	The work is delegated to the (pointer, length) overload using the string's
	c_str() and size().
*/
template<typename StreamType>
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
{
	const char * bytes = utf8.c_str();
	const size_t bytes_len = utf8.size();

	return utf8_to_wide(bytes, bytes_len, res, clear, mode);
}
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
|
||||
{
|
||||
int z;
|
||||
bool correct, was_error = false;
|
||||
|
||||
if( clear )
|
||||
res.clear();
|
||||
|
||||
while( utf8_to_int(utf8, z, correct) > 0 )
|
||||
{
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
res << 0xFFFD; // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
int_to_wide(z, res);
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
template<typename OutputFunction>
|
||||
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode)
|
||||
{
|
||||
int z;
|
||||
size_t len;
|
||||
bool correct, was_error = false;
|
||||
|
||||
while( utf8_len > 0 )
|
||||
{
|
||||
if( (unsigned char)*utf8 <= 0x7f )
|
||||
{
|
||||
// small optimization
|
||||
len = 1;
|
||||
correct = true;
|
||||
z = static_cast<unsigned char>(*utf8);
|
||||
}
|
||||
else
|
||||
{
|
||||
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
|
||||
}
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_function(z);
|
||||
}
|
||||
|
||||
utf8 += len;
|
||||
utf8_len -= len;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
template<typename StreamType, typename OutputFunction>
|
||||
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
|
||||
{
|
||||
char utf8_buffer[256];
|
||||
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
||||
std::size_t utf8_sequence_max_length = 10;
|
||||
std::size_t index = 0;
|
||||
bool was_error = false;
|
||||
|
||||
typename StreamType::const_iterator i = buffer.begin();
|
||||
|
||||
while( i != buffer.end() )
|
||||
{
|
||||
if( index + utf8_sequence_max_length > buffer_len )
|
||||
{
|
||||
bool write_status = output_function(utf8_buffer, index);
|
||||
index = 0;
|
||||
|
||||
if( !write_status )
|
||||
{
|
||||
was_error = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int c = 0xFFFD; // U+FFFD "replacement character";
|
||||
bool seems_to_be_correct = false;
|
||||
wchar_t w1 = *i;
|
||||
|
||||
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
|
||||
{
|
||||
++i;
|
||||
|
||||
if( i != buffer.end() )
|
||||
{
|
||||
wchar_t w2 = *i;
|
||||
|
||||
if( surrogate_pair_to_int(w1, w2, c) )
|
||||
{
|
||||
seems_to_be_correct = true;
|
||||
++i;
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
c = w1;
|
||||
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
|
||||
++i;
|
||||
}
|
||||
|
||||
if( seems_to_be_correct || mode == 1 )
|
||||
{
|
||||
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
|
||||
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
|
||||
|
||||
if( seq_len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
|
||||
}
|
||||
}
|
||||
|
||||
index += seq_len;
|
||||
}
|
||||
}
|
||||
|
||||
if( index > 0 )
|
||||
{
|
||||
if( !output_function(utf8_buffer, index) )
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this function converts a UTF-8 stream into a wide stream or a wide string
|
||||
|
||||
input:
|
||||
stream - a UTF-8 stream for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
res - a wide stream or a wide string for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamOrStringType>
|
||||
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
|
||||
{
|
||||
if( clear )
|
||||
res.clear();
|
||||
|
||||
return utf8_to_output_function(stream, [&](int z) {
|
||||
int_to_wide(z, res);
|
||||
}, mode);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
this function reads characters from a UTF-8 stream and calls an output_function
|
||||
|
||||
input:
|
||||
stream - a UTF-8 stream for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType
|
||||
and should put the character to the output string/stream, this function should have the signature like this:
|
||||
output_function(int z, StreamOrStringType & res)
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename OutputFunction>
|
||||
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
|
||||
{
|
||||
size_t len;
|
||||
bool correct;
|
||||
int z;
|
||||
size_t index = 0;
|
||||
bool was_error = false;
|
||||
|
||||
do
|
||||
{
|
||||
len = utf8_to_int(stream, index, z, correct);
|
||||
|
||||
if( len > 0 )
|
||||
{
|
||||
if( !correct )
|
||||
{
|
||||
if( mode == 1 )
|
||||
output_function(0xFFFD); // U+FFFD "replacement character"
|
||||
|
||||
was_error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_function(z);
|
||||
}
|
||||
|
||||
index += len;
|
||||
}
|
||||
}
|
||||
while( len > 0 );
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
	Convert a UTF-8 stream given by a pair of iterators into a wide stream or a wide string.

	input:
		iterator_in - a stream iterator for reading from (the stream can be any stream,
		              only the *, ++ and == operators are used)
		iterator_end - an end iterator
		clear_stream - if true the output is cleared first
		mode - forwarded to utf8_to_output_function(...)

	output:
		out_stream - an output wide stream or wide string, every code point is put
		             there with int_to_wide(...)

	Returns what utf8_to_output_function(...) returns: false if there were some
	errors when converting.
*/
template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode)
{
	if( clear_stream )
		out_stream.clear();

	auto put_code_point = [&](int z) {
		int_to_wide(z, out_stream);
	};

	return utf8_to_output_function(iterator_in, iterator_end, put_code_point, mode);
}
|
||||
|
||||
|
||||
/*!
	Read characters from a UTF-8 stream given by a pair of iterators and call
	an output function for each of them.

	input:
		iterator_in - a stream iterator for reading from, it is advanced by utf8_to_int(...)
		iterator_end - an end iterator
		mode - what to do with an invalid character:
		       0: skip it
		       1: call output_function with U+FFFD "replacement character"

	output:
		output_function - called as output_function(int z), the result is ignored

	Returns false if there were some errors when converting.
*/
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{
	bool all_valid = true;

	while( iterator_in != iterator_end )
	{
		int code;
		bool valid;

		utf8_to_int(iterator_in, iterator_end, code, valid);

		if( valid )
		{
			output_function(code);
			continue;
		}

		if( mode == 1 )
			output_function(0xFFFD); // U+FFFD "replacement character"

		all_valid = false;
	}

	return all_valid;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts UTF-8 stream into a wide string
|
||||
|
||||
input:
|
||||
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
out_buffer - an output wide string
|
||||
max_buffer_len - how many characters can be write (we write the terminating null character too)
|
||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||
|
||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||
*/
|
||||
template<typename StreamIteratorType>
|
||||
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
|
||||
{
|
||||
int res;
|
||||
bool correct;
|
||||
bool was_error = true;
|
||||
bool was_buffer_ok = false;
|
||||
|
||||
if( max_buffer_len > 0 )
|
||||
{
|
||||
max_buffer_len -= 1; // for terminating null character
|
||||
was_error = false;
|
||||
was_buffer_ok = true;
|
||||
|
||||
while( iterator_in != iterator_end )
|
||||
{
|
||||
utf8_to_int(iterator_in, iterator_end, res, correct);
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
res = 0xFFFD; // U+FFFD "replacement character"
|
||||
correct = true;
|
||||
}
|
||||
}
|
||||
|
||||
if( correct )
|
||||
{
|
||||
size_t len = int_to_wide(res, out_buffer, max_buffer_len);
|
||||
// if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand)
|
||||
|
||||
if( len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
was_buffer_ok = false;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
out_buffer += len;
|
||||
max_buffer_len -= len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_buffer = 0;
|
||||
}
|
||||
|
||||
if( was_buffer_sufficient_large )
|
||||
*was_buffer_sufficient_large = was_buffer_ok;
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
	this function decodes a whole UTF-8 stream into a null terminated wide string

	it is a convenience wrapper: the stream's begin() and end() iterators are taken
	and the work is delegated to the iterator based utf8_to_wide(...)

	input:
		stream - a stream to read from (it has to provide const_iterator, begin() and end())
		mode   - passed unchanged to the iterator based utf8_to_wide(...)

	output:
		out_buffer     - the output wide string
		max_buffer_len - how many wide characters can be written (the terminating null character is counted too)
		was_buffer_sufficient_large - an optional pointer to a bool value - if provided it is set to true
		                              when the output buffer was large enough

	the function returns the result of the iterator based utf8_to_wide(...)
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
	typename StreamType::const_iterator cursor = stream.begin();
	typename StreamType::const_iterator finish = stream.end();

	// note the different argument order: the iterator version takes mode before the bool pointer
	const bool converted = utf8_to_wide(cursor, finish, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);

	return converted;
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one wide character into UTF-8 stream
|
||||
|
||||
input:
|
||||
z - wide character
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
the function returns how many characters have been written to the utf8 stream,
|
||||
zero means that 'z' is an incorrect unicode character
|
||||
*/
|
||||
template<typename StreamType>
|
||||
size_t int_to_utf8(int z, StreamType & utf8)
|
||||
{
|
||||
char buf[10];
|
||||
|
||||
size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char));
|
||||
|
||||
if( len > 0 )
|
||||
utf8.write(buf, len);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide string into UTF-8 stream
|
||||
|
||||
input:
|
||||
wide_string - a wide string for converting
|
||||
string_len - size of the string
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamType>
|
||||
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
size_t chars;
|
||||
|
||||
while( string_len > 0 )
|
||||
{
|
||||
chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);
|
||||
wide_string += chars;
|
||||
string_len -= chars;
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide string into UTF-8 stream
|
||||
|
||||
input:
|
||||
wide_string - a null terminated wide string for converting
|
||||
mode - what to do with errors when converting
|
||||
0: skip an invalid character
|
||||
1: put U+FFFD "replacement character" istead of the invalid character (default)
|
||||
|
||||
output:
|
||||
utf8 - a UTF-8 stream for the output sequence
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
template<typename StreamType>
|
||||
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
|
||||
while( *wide_string )
|
||||
wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode);
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
	this function converts a std::wstring into a UTF-8 stream

	the characters and the length of the string are passed to the
	pointer + length version of wide_to_utf8(...)

	input:
		wide_string - the wide string to convert
		mode        - what to do with errors when converting
		              0: skip an invalid character
		              1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
		utf8 - a UTF-8 stream for the output sequence

	the function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
	return wide_to_utf8(
		wide_string.data(),
		wide_string.length(),
		utf8,
		mode
	);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
	this function converts a wide stream into a UTF-8 std::string

	input:
		buffer - a wide stream to read from (it is handed over to wide_to_output_function)
		clear  - if true the output string is cleared before the conversion
		mode   - passed unchanged to wide_to_output_function

	output:
		utf8 - the output string, each converted chunk is appended to it

	the function returns the result of wide_to_output_function
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
	if( clear )
	{
		utf8.clear();
	}

	// appending to a std::string does not report failures, so the sink always returns true
	auto append_chunk = [&utf8](const char * chunk, std::size_t chunk_len) -> bool {
		utf8.append(chunk, chunk_len);
		return true;
	};

	return wide_to_output_function(buffer, append_chunk, mode);
}
|
||||
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
|
||||
{
|
||||
bool was_error = false;
|
||||
|
||||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
for(size_t i=0 ; i < stream.size() ; ++i)
|
||||
{
|
||||
int c = static_cast<int>(stream.get_wchar(i));
|
||||
bool is_correct = false;
|
||||
|
||||
if( utf8_check_range(c) )
|
||||
{
|
||||
// CHECKME test me when sizeof(wchar_t) == 2
|
||||
if( is_first_surrogate_char(c) )
|
||||
{
|
||||
if( i + 1 < stream.size() )
|
||||
{
|
||||
wchar_t c1 = static_cast<wchar_t>(c);
|
||||
wchar_t c2 = stream.get_wchar(++i);
|
||||
|
||||
if( surrogate_pair_to_int(c1, c2, c) )
|
||||
{
|
||||
is_correct = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
is_correct = true;
|
||||
}
|
||||
}
|
||||
|
||||
if( is_correct )
|
||||
{
|
||||
int_to_utf8(c, utf8);
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
/*!
	this function converts a wide stream into a UTF-8 stream

	input:
		buffer - a wide stream to read from (it is handed over to wide_to_output_function)
		clear  - if true the output stream is cleared before the conversion
		mode   - passed unchanged to wide_to_output_function

	output:
		utf8 - the output stream, each converted chunk is passed to its write(const char*, size_t) method

	the function returns the result of wide_to_output_function
*/
template<typename StreamTypeIn, typename StreamTypeOut>
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{
	if( clear )
	{
		utf8.clear();
	}

	// the result of write(...) is not inspected, the sink always returns true
	auto write_chunk = [&utf8](const char * chunk, std::size_t chunk_len) -> bool {
		utf8.write(chunk, chunk_len);
		return true;
	};

	return wide_to_output_function(buffer, write_chunk, mode);
}
|
||||
|
||||
|
||||
|
||||
/*!
	this function converts a wide stream into a null terminated UTF-8 string

	input:
		buffer - a wide stream to read from (it is handed over to wide_to_output_function)
		mode   - passed unchanged to wide_to_output_function

	output:
		utf8            - the output UTF-8 string
		max_buffer_size - how many characters can be written (the terminating null character is counted too)
		was_buffer_sufficient_large - an optional pointer to a bool value - if provided it is set to true
		                              when the output buffer was large enough

	if max_buffer_size is greater than zero the output is always null terminated,
	also when the input produces no output at all

	the function returns the result of wide_to_output_function,
	or false if max_buffer_size is zero (nothing is written in that case)
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{
	bool buffer_ok = false;
	bool is_ok = false;

	if( max_buffer_size > 0 )
	{
		buffer_ok = true;
		max_buffer_size -= 1; // for terminating null character

		// terminate the string up front: the callback below is the only other place which writes
		// the null character and it may never be invoked (e.g. for an empty input)
		*utf8 = 0;

		is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
			// copy as much of the chunk as still fits into the output buffer
			const std::size_t copy_len = (buffer_len < max_buffer_size) ? buffer_len : max_buffer_size;

			for(std::size_t i = 0 ; i < copy_len ; ++i)
				utf8[i] = utf8_buffer[i];

			utf8 += copy_len;
			max_buffer_size -= copy_len;
			*utf8 = 0; // the slot for the null character was reserved above

			if( copy_len < buffer_len )
				buffer_ok = false; // the chunk was truncated

			// buffer_ok is never set back to true, so once the buffer overflowed every later call reports false
			return buffer_ok;
		}, mode);
	}

	if( was_buffer_sufficient_large )
		*was_buffer_sufficient_large = buffer_ok;

	return is_ok;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace pt
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -4,16 +4,14 @@
|
||||
./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
||||
./convert.o: ../src/textstream/stream.h ../src/space/space.h
|
||||
./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||
./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
||||
./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
|
||||
./convert.o: ../src/textstream/stream.h ../src/date/date.h
|
||||
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
||||
./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
||||
./convert.o: ../src/convert/text.h ../src/convert/misc.h
|
||||
./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h
|
||||
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
|
||||
./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||
./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
||||
./csvparser.o: ../src/utf8/utf8_private.h ../src/convert/baseparser.h
|
||||
./csvparser.o: ../src/textstream/stream.h ../src/convert/baseparser.h
|
||||
./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h
|
||||
./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h
|
||||
./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h
|
||||
@@ -22,8 +20,7 @@
|
||||
./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
|
||||
./main.o: ../src/textstream/stream.h ../src/space/space.h
|
||||
./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
|
||||
./main.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
|
||||
./main.o: ../src/utf8/utf8_private.h ../src/date/date.h
|
||||
./main.o: ../src/textstream/stream.h ../src/date/date.h
|
||||
./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
|
||||
./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
|
||||
./main.o: ../src/convert/text.h ../src/convert/misc.h
|
||||
@@ -34,9 +31,7 @@
|
||||
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
|
||||
./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h
|
||||
./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
|
||||
./mainoptionsparser.o: ../src/utf8/utf8_templates.h
|
||||
./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h
|
||||
./mainoptionsparser.o: ../src/convert/inttostr.h
|
||||
./mainoptionsparser.o: ../src/convert/convert.h ../src/convert/inttostr.h
|
||||
./mainoptionsparser.o: ../src/convert/patternreplacer.h
|
||||
./mainoptionsparser.o: ../src/textstream/textstream.h
|
||||
./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
|
||||
|
Reference in New Issue
Block a user