update utf8 functions comments

while here:
- rename pt::utf8_check_range(...) -> pt::is_correct_unicode_char(...)
This commit is contained in:
2024-05-31 00:23:43 +02:00
parent 450c5d55e9
commit 2689c9fece
3 changed files with 595 additions and 591 deletions

View File

@@ -1178,7 +1178,7 @@ void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte)
}
}
if( !ok || !pt::utf8_check_range(lastc) )
if( !ok || !pt::is_correct_unicode_char(lastc) )
{
lastc = 0xFFFD; // U+FFFD "replacement character";
}
@@ -1207,7 +1207,7 @@ int i;
value = (value << 4) | hex_to_int(c);
}
if( i > 0 && c == '}' && pt::utf8_check_range(value) )
if( i > 0 && c == '}' && pt::is_correct_unicode_char(value) )
{
lastc = static_cast<wchar_t>(value);
}

View File

@@ -41,21 +41,21 @@ namespace pt
{
/*
* returns true if 'c' is a correct unicode character
* return true if 'c' is a correct unicode character
*/
bool utf8_check_range(int c)
bool is_correct_unicode_char(int c)
{
return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF);
}
/*
* returns true if 'c' is a correct unicode character
* return true if 'c' is a correct unicode character
*
* this method is used when reading from an utf8 string
* how_many_bytes - means how many bytes from the utf8 string were read
*/
bool utf8_check_range(int c, int how_many_bytes)
bool is_correct_unicode_char(int c, int how_many_bytes)
{
if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )
{
@@ -124,10 +124,9 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
/*
* an auxiliary function for converting from wide characters to UTF-8
* converting a wide character into one int
* convert one wide (or two wide) characters to an int
*
* returns how many wide characters were used
* return how many wide characters were used
* if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
@@ -167,7 +166,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
}
else
{
correct = utf8_check_range(z);
correct = is_correct_unicode_char(z);
return 1;
}
}
@@ -178,7 +177,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
* an auxiliary function for converting from wide characters to UTF-8
* converting a wide character into one int
* returns how many wide characters were used
* return how many wide characters were used
* if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
@@ -201,7 +200,7 @@ return wide_to_int(wide_string, min_str_len, z, correct);
/*
* converts an int to a wide string
* convert an int to a wide string
*
* this method will not terminate the output string with a null character
* return how many characters have been written (0, 1 or 2)
@@ -233,9 +232,9 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
/*
* converts an int to a wide string
* convert an int to a wide string
*
* returns true if a character was inserted to the string
* return true if a character was inserted to the string
*/
bool int_to_wide(int c, std::wstring & res)
{
@@ -258,6 +257,9 @@ bool int_to_wide(int c, std::wstring & res)
/*
* convert one character into a stream
* stream can be an utf8 or a wide stream
*
* return true if c was a correct unicode character
* and has been put into the stream
*/
@@ -280,17 +282,17 @@ bool int_to_stream(int c, pt::Stream & stream)
/*
* this function converts one UTF-8 character into one wide-character
* convert one UTF-8 character into one wide-character
*
* input:
* utf8 - an input UTF-8 string
* utf8_len - size of the input string,
* the string should be at least 4 bytes length for correctly
* recognized the utf-8 sequence
* utf8 - an input UTF-8 string
* utf8_len - size of the input string,
* the string should be at least 4 bytes long so that
* the utf-8 sequence can be recognized correctly
*
* output:
* res - an output character
* correct - true if it is a correct character
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input string
* (returns zero only if utf8_len is zero)
@@ -318,7 +320,7 @@ size_t i, len;
return i;
}
if( utf8_check_range(res, len) )
if( is_correct_unicode_char(res, len) )
correct = true;
return len;
@@ -326,20 +328,20 @@ return len;
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 string (null terminated)
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input string
(returns zero only if the string has '\0' at the first character)
even if there are errors the functions returns a different from zero value
*/
/*
* convert one UTF-8 character into one wide-character
*
* input:
* utf8 - an input UTF-8 string (null terminated)
*
* output:
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input string
* (returns zero only if the string has '\0' at the first character)
* even if there are errors the function returns a non-zero value
*/
size_t utf8_to_int(const char * utf8, int & res, bool & correct)
{
size_t i, len;
@@ -362,7 +364,7 @@ size_t i, len;
return i;
}
if( utf8_check_range(res, len) )
if( is_correct_unicode_char(res, len) )
correct = true;
return len;
@@ -371,20 +373,20 @@ return len;
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 string
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input string
(returns zero only if utf8 is empty)
even if there are errors the functions returns a different from zero value
*/
/*
* convert one UTF-8 character into one wide-character
*
* input:
* utf8 - an input UTF-8 string
*
* output:
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input string
* (returns zero only if utf8 is empty)
* even if there are errors the function returns a non-zero value
*/
size_t utf8_to_int(const std::string & utf8, int & res, bool & correct)
{
return utf8_to_int(utf8.c_str(), utf8.size(), res, correct);
@@ -392,18 +394,18 @@ size_t utf8_to_int(const std::string & utf8, int & res, bool & correct)
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 stream
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input stream
*/
/*
* convert one UTF-8 character into one wide-character
*
* input:
* utf8 - an input UTF-8 stream
*
* output:
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input stream
*/
size_t utf8_to_int(std::istream & utf8, int & res, bool & correct)
{
size_t i, len;
@@ -431,7 +433,7 @@ unsigned char uz;
return i;
}
if( utf8_check_range(res, len) )
if( is_correct_unicode_char(res, len) )
correct = true;
return len;
@@ -465,7 +467,7 @@ unsigned char uz;
return i + 1;
}
if( utf8_check_range(res, len) )
if( is_correct_unicode_char(res, len) )
correct = true;
}
else
@@ -482,21 +484,21 @@ unsigned char uz;
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 string
utf8_len - size of the input string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
/*
* convert an utf8 string into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 string
* utf8_len - size of the input string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode)
{
if( clear )
@@ -513,20 +515,20 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool c
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 null terminated string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
/*
* convert an utf8 string into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 null terminated string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear, int mode)
{
size_t utf8_len = 0;
@@ -539,20 +541,20 @@ return utf8_to_wide(utf8, utf8_len, res, clear, mode);
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
/*
* convert an utf8 string into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int mode)
{
return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode);
@@ -560,20 +562,20 @@ bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int
/*!
this function converts an utf8 stream into wide string (std::wstring)
input:
utf8 - an input utf8 stream
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
/*
* convert an utf8 stream into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 stream
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(std::istream & utf8, std::wstring & res, bool clear, int mode)
{
int z;
@@ -603,26 +605,26 @@ return !was_error;
/*!
this function converts one wide character into UTF-8 sequence
input:
z - wide character
output:
utf8 - a buffer for the output sequence
utf8_len - the size of the buffer
the function returns how many characters have been written to the utf8,
zero means the utf8 buffer is too small or 'z' is an incorrect unicode character
*/
/*
* convert one wide character into an UTF-8 sequence
*
* input:
* z - wide character
*
* output:
* utf8 - a buffer for the output sequence
* utf8_max_len - the size of the buffer
*
* the function returns how many characters have been written to the utf8,
* zero means the utf8 buffer is too small or 'z' is an incorrect unicode character
*/
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len)
{
char buf[10];
int i = 0;
int mask = 0x3f; // 6 first bits set
if( utf8_max_len==0 || !utf8_check_range(z) )
if( utf8_max_len==0 || !is_correct_unicode_char(z) )
return 0;
if( z <= 0x7f )
@@ -658,18 +660,18 @@ return a;
/*!
this function converts one wide character into UTF-8 string
input:
z - wide character
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
the function returns how many characters have been written to the utf8 string,
zero means that 'z' is an incorrect unicode character
*/
/*
* convert one wide character into an UTF-8 string
*
* input:
* z - wide character
*
* output:
* utf8 - a UTF-8 string for the output sequence (the string is not cleared)
*
* the function returns how many characters have been written to the utf8 string,
* zero means that 'z' is an incorrect unicode character
*/
size_t int_to_utf8(int z, std::string & utf8, bool clear)
{
char buf[10];
@@ -688,21 +690,21 @@ return len;
/*!
this function converts a wide string into UTF-8 string
input:
wide_string - a wide string for converting
string_len - the size of the string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
/*
* convert a wide string into an UTF-8 string
*
* input:
* wide_string - a wide string for converting
* string_len - the size of the string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 string for the output sequence (the string is not cleared)
*
* this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
{
bool was_error = false;
@@ -723,20 +725,20 @@ return !was_error;
/*!
this function converts a wide string into UTF-8 string
input:
wide_string - a null terminated wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
/*
* convert a wide string into an UTF-8 string
*
* input:
* wide_string - a null terminated wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 string for the output sequence (the string is not cleared)
*
* this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode)
{
bool was_error = false;
@@ -752,20 +754,20 @@ return !was_error;
/*!
this function converts a wide string (std::wstring) into UTF-8 string
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
/*
* convert a wide string (std::wstring) into an UTF-8 string
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 string for the output sequence (the string is not cleared)
*
* this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, clear, mode);
@@ -775,27 +777,27 @@ bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool cle
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a wide string for converting
string_len - lenght of the wide string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
utf8_written - how many bytes have been written to the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is not null terminated
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* string_len - length of the wide string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
* utf8_written - how many bytes have been written to the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is not null terminated
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
bool was_error = false;
@@ -830,26 +832,26 @@ return !was_error;
/*!
this function converts a wide string (std::wstring) into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
utf8_written - how many bytes have been written to the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is not null terminated
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
*/
/*
* convert a wide string (std::wstring) into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
* utf8_written - how many bytes have been written to the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is not null terminated
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode);
@@ -857,27 +859,27 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a wide string for converting
string_len - lenght of the wide string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is null terminated (even if there were errors during converting)
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
(in both cases the utf8 buffer is null terminated)
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* string_len - length of the wide string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is null terminated (even if there were errors during converting)
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
* (in both cases the utf8 buffer is null terminated)
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode)
{
size_t utf8_saved;
@@ -894,26 +896,26 @@ return res;
/*!
this function converts a wide string (std::wstring) into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is null terminated (even if there were errors during converting)
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
(in both cases the utf8 buffer is null terminated)
*/
/*
* convert a wide string (std::wstring) into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is null terminated (even if there were errors during converting)
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
* (in both cases the utf8 buffer is null terminated)
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode);
@@ -921,26 +923,26 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a null terminated wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
utf8_written - how many bytes have been written to the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is not null terminated
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a null terminated wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
* utf8_written - how many bytes have been written to the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is not null terminated
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
*/
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
bool was_error = false;
@@ -976,26 +978,26 @@ return !was_error;
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a buffer for the UTF-8 stream
utf8_len - the size of the buffer
this function returns false if there were some errors when converting or the output buffer was too small,
the output string is null terminated (even if there were errors during converting)
if there is an error when converting (there is an incorrect character in the wide string) the function
will continue converting but if the buffer is too small the function breaks immediately
(in both cases the utf8 buffer is null terminated)
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is null terminated (even if there were errors during converting)
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
* (in both cases the utf8 buffer is null terminated)
*/
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode)
{
size_t utf8_saved;
@@ -1017,9 +1019,9 @@ return res;
namespace private_namespace
{
/*!
an auxiliary function for converting from UTF-8 string
*/
/*
* an auxiliary function for converting from UTF-8 string
*/
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
for(len=0 ; (uz & 0x80) != 0 ; ++len)
@@ -1041,9 +1043,9 @@ return true;
/*!
an auxiliary function for converting from UTF-8 string
*/
/*
* an auxiliary function for converting from UTF-8 string
*/
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
if( (uz & 0xc0) != 0x80 )
@@ -1057,20 +1059,20 @@ return true;
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
utf8_written - how many characters were saved in the utf8 string (the string doesn't have
a null terminating character)
it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
was_utf8_buf_too_small - will be true if the utf8 buffer is too small
if this flag is true then utf8_written is equal to zero
was_error - will be true if there is an error when converting (there was an incorrect wide character)
(was_error will not be true if the utf8 buffer is too small)
*/
/*
* an auxiliary function for converting from wide characters to UTF-8
*
* return how many wide characters were used
* if string_len is greater than 0 then the return value is always greater than zero too
*
* utf8_written - how many characters were saved in the utf8 string (the string doesn't have
* a null terminating character)
* it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
* was_utf8_buf_too_small - will be true if the utf8 buffer is too small
* if this flag is true then utf8_written is equal to zero
* was_error - will be true if there is an error when converting (there was an incorrect wide character)
* (was_error will not be true if the utf8 buffer is too small)
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
{
@@ -1107,12 +1109,12 @@ return chars;
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
/*
* an auxiliary function for converting from wide characters to UTF-8
*
* return how many wide characters were used
* if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
{
int z;
@@ -1137,12 +1139,12 @@ return chars;
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if wide_string has at least one character then the return value is always greater than zero too
*/
/*
* an auxiliary function for converting from wide characters to UTF-8
*
* return how many wide characters were used
* if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
{
int z;

View File

@@ -42,85 +42,80 @@
namespace pt
{
/*
* public methods are also defined in utf8_stream.h
* UTF-8, a transformation format of ISO 10646
* http://tools.ietf.org/html/rfc3629
*
* when wchar_t is 4 bytes length we use UTF-32
* when wchar_t is 2 bytes length we use UTF-16 (with surrogate pairs)
*
* UTF-16
* http://www.ietf.org/rfc/rfc2781.txt
*/
/*!
UTF-8, a transformation format of ISO 10646
http://tools.ietf.org/html/rfc3629
when wchar_t is 4 bytes length we use UTF-32
when wchar_t is 2 bytes length we use UTF-16 (with surrogate pairs)
UTF-16
http://www.ietf.org/rfc/rfc2781.txt
*/
/*!
returns true if 'c' is a correct unicode character
RENAMEME to is_correct_unicode_char
*/
bool utf8_check_range(int c);
/*!
returns true if 'c' is a correct unicode character
this method is used when reading from an utf8 string
how_many_chars - means how many characters from utf8 string were read
*/
bool utf8_check_range(int c, int how_many_bytes);
/*
* return true if 'c' is a correct unicode character
*/
bool is_correct_unicode_char(int c);
/*
* returns true if 'c' is a characters from the surrogate range
* return true if 'c' is a correct unicode character
*
* this method is used when reading from an utf8 string
* how_many_bytes - means how many bytes from the utf8 string were read
*/
bool is_correct_unicode_char(int c, int how_many_bytes);
/*
* return true if 'c' is a character from the surrogate range
* (c>=0xD800 && c<=0xDFFF)
*
*/
bool is_surrogate_char(int c);
/*
* returns true if 'c' is a first character from the surrogate pair
* return true if 'c' is a first character from the surrogate pair
* (c>=0xD800 && c<=0xDBFF)
*/
bool is_first_surrogate_char(int c);
/*
* returns true if 'c' is a second character from the surrogate pair
* return true if 'c' is a second character from the surrogate pair
* (c>=0xDC00 && c<=0xDFFF)
*/
bool is_second_surrogate_char(int c);
/*
* returns a code point from two surrogate pair characters
* return a code point from two surrogate pair characters
*/
bool surrogate_pair_to_int(int c1, int c2, int & z);
/*
* converting one character into a stream
* stream can be an utf8 or wide stream
* convert one character into a stream
* stream can be an utf8 or a wide stream
*
* return true if c was a correct unicode character
* and has been put to the stream
*/
bool int_to_stream(int c, pt::Stream & stream);
/*
* converting a one unicode character to an int
* such an unicode character can consists of one or two wide characters
* convert one wide (or two wide) characters to an int
*
* return how many wide characters were used
* if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public?
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
@@ -134,9 +129,9 @@ size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
*
*/
/*!
converting one character from UTF-8 to an int
*/
/*
* convert one character from UTF-8 to an int
*/
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct);
size_t utf8_to_int(const char * utf8, int & res, bool & correct);
size_t utf8_to_int(const std::string & utf8, int & res, bool & correct);
@@ -147,43 +142,46 @@ template<typename StreamIteratorType>
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct);
/*!
converting one character from int to wide stream
returns true if a character was inserted to the stream
*/
/*
* convert one character from an int to a wide stream
*
* return true if a character was inserted to the stream
*/
template<typename StreamType>
bool int_to_wide(int c, StreamType & res);
/*!
converting one character from int to wide string
this method will not terminate the output string with a null character
return how many characters have been written (0, 1 or 2)
*/
/*
* convert one character from an int to a wide string
*
* this method will not terminate the output string with a null character
* return how many characters have been written (0, 1 or 2)
*/
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len);
/*!
converting one character from int to wide string
returns true if a character was inserted to the string
*/
/*
* convert one character from an int to a wide string
*
* return true if a character was inserted to the string
*/
bool int_to_wide(int c, std::wstring & res);
/*!
call a convert_function for each character from an utf8 string
/*
* call a convert_function for each character from an utf8 string
*
* this function takes one int parameter:
* output_function(int c)
*/
template<typename OutputFunction>
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode = 1);
/*!
converting UTF-8 string to a wide string
*/
/*
* convert an UTF-8 string to a wide string
*/
bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear = true, int mode = 1);
bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear = true, int mode = 1);
bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear = true, int mode = 1);
@@ -216,7 +214,9 @@ bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamItera
template<typename CharT, size_t stack_size, size_t heap_block_size>
class TextStreamBase;
// defined at the end in textstream.h
/*
* this method is defined at the end of textstream.h
*/
template<size_t stack_size, size_t heap_block_size, typename StreamOrStringType>
bool utf8_to_wide(const TextStreamBase<char, stack_size, heap_block_size> & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
@@ -239,9 +239,9 @@ bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_bu
*/
/*!
converting one int character to UTF-8
*/
/*
* convert one int character to UTF-8
*/
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
size_t int_to_utf8(int z, std::string & utf8, bool clear = true);
@@ -249,21 +249,23 @@ template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8);
/*!
call an output_function for some sequence of wide characters from the stream buffer
output_function has two arguments: const char * buf, size_t len:
output_function(const char * buf, size_t len)
StreamType should have a const_iterator and begin() and end() methods
*/
/*
* call an output_function for some sequence of wide characters from the stream buffer
*
* output_function takes two arguments: const char * buf, size_t len:
* output_function(const char * buf, size_t len)
* this is a buffer which was filled with utf8 characters
* (this buffer can have up to 256 characters)
*
* StreamType should have a const_iterator and begin() and end() methods
*/
template<typename StreamType, typename OutputFunction>
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1);
/*!
converting a wide string to UTF-8 string
*/
/*
* convert a wide string to an UTF-8 string
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear = true, int mode = 1);
bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1);
bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1);
@@ -299,7 +301,9 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe
/*
* some private/auxiliary methods
*/
namespace private_namespace
{
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
@@ -313,12 +317,12 @@ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::str
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
/*
* an auxiliary function for converting from wide characters to UTF-8
*
* return how many wide characters were used
* if string_len is greater than 0 then the return value is always greater than zero too
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
{
@@ -339,13 +343,13 @@ size_t chars;
was_error = true;
}
return chars;
return chars;
}
/*!
an auxiliary function for converting from wide characters to UTF-8
*/
/*
* an auxiliary function for converting from wide characters to UTF-8
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
@@ -357,7 +361,7 @@ static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, b
if( *(wide_string+1) != 0 )
min_str_len = 2;
return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode);
return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode);
}
} // namespace private_namespace
@@ -389,19 +393,19 @@ bool int_to_wide(int c, StreamType & res)
}
/*!
this function converts one UTF-8 character into int
input:
iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
iterator_end - an end iterator
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input stream
*/
/*
* convert one UTF-8 character into int
*
* input:
* iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
* iterator_end - an end iterator
*
* output:
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input stream
*/
template<typename StreamIteratorType>
size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct)
{
@@ -432,7 +436,7 @@ unsigned char uz;
return i + 1;
}
if( utf8_check_range(res, len) )
if( is_correct_unicode_char(res, len) )
correct = true;
return len;
@@ -440,11 +444,10 @@ return len;
/*!
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
(need to be tested)
*/
// need to be tested
/*
* convert UTF-8 string to a TextStreamBase<wchar_t,...> stream
* (needs to be tested)
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
@@ -642,19 +645,19 @@ bool wide_to_output_function(StreamType & buffer, OutputFunction output_function
/*
this function converts a UTF-8 stream into a wide stream or a wide string
input:
stream - a UTF-8 stream for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - a wide stream or a wide string for the output sequence
this function returns false if there were some errors when converting
*/
* convert a UTF-8 stream into a wide stream or a wide string
*
* input:
* stream - a UTF-8 stream for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* res - a wide stream or a wide string for the output sequence
*
* this function returns false if there were some errors when converting
*/
template<typename StreamOrStringType>
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
{
@@ -668,21 +671,20 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i
/*
this function reads characters from a UTF-8 stream and calls an output_function
input:
stream - a UTF-8 stream for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType
and should put the character to the output string/stream, this function should have the signature like this:
output_function(int z, StreamOrStringType & res)
this function returns false if there were some errors when converting
*/
* read characters from an UTF-8 stream and call an output_function
*
* input:
* stream - a UTF-8 stream for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* output_function - a function which takes one argument: an int (a character):
* output_function(int c)
*
* this function returns false if there were some errors when converting
*/
template<typename OutputFunction>
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
{
@@ -723,18 +725,18 @@ bool utf8_to_output_function(const Stream & stream, OutputFunction output_functi
/*!
this function converts UTF-8 stream into a wide stream or a wide string
input:
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
iterator_end - an end iterator
output:
out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string)
this function returns false if there were some errors when converting
*/
/*
* convert an UTF-8 stream into a wide stream or a wide string
*
* input:
* iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
* iterator_end - an end iterator
*
* output:
* out_stream - an output wide stream or wide string (the stream can be of any kind, we use only << operator for a stream and += for a string)
*
* this function returns false if there were some errors when converting
*/
template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode)
{
@@ -776,20 +778,20 @@ bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamItera
/*!
this function converts UTF-8 stream into a wide string
input:
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
iterator_end - an end iterator
output:
out_buffer - an output wide string
max_buffer_len - how many characters can be write (we write the terminating null character too)
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
this function returns false if there were some errors when converting or if the output buffer was too short
*/
/*
* convert an UTF-8 stream into a wide string
*
* input:
* iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
* iterator_end - an end iterator
*
* output:
* out_buffer - an output wide string
* max_buffer_len - how many characters can be written (we write the terminating null character too)
* was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficiently large
*
* this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamIteratorType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
{
@@ -849,19 +851,19 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
/*!
this function converts UTF-8 stream into a wide string
input:
stream - a stream for reading from
output:
out_buffer - an output wide string
max_buffer_len - how many characters can be write (we write the terminating null character too)
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
this function returns false if there were some errors when converting or if the output buffer was too short
*/
/*
* convert an UTF-8 stream into a wide string
*
* input:
* stream - a stream for reading from
*
* output:
* out_buffer - an output wide string
* max_buffer_len - how many characters can be written (we write the terminating null character too)
* was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficiently large
*
* this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
@@ -873,18 +875,18 @@ bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_bu
/*!
this function converts one wide character into UTF-8 stream
input:
z - wide character
output:
utf8 - a UTF-8 stream for the output sequence
the function returns how many characters have been written to the utf8 stream,
zero means that 'z' is an incorrect unicode character
*/
/*
* convert one wide character into an UTF-8 stream
*
* input:
* z - wide character
*
* output:
* utf8 - a UTF-8 stream for the output sequence
*
* the function returns how many characters have been written to the utf8 stream,
* zero means that 'z' is an incorrect unicode character
*/
template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8)
{
@@ -902,21 +904,21 @@ size_t int_to_utf8(int z, StreamType & utf8)
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a wide string for converting
string_len - size of the string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* string_len - size of the string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 stream for the output sequence
*
* this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
{
@@ -937,20 +939,20 @@ return !was_error;
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a null terminated wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
/*
* convert a wide string into an UTF-8 stream
*
* input:
* wide_string - a null terminated wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 stream for the output sequence
*
* this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
{
@@ -964,20 +966,20 @@ return !was_error;
/*!
this function converts a wide string (std::wstring) into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
/*
* convert a wide string (std::wstring) into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" instead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 stream for the output sequence
*
* this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
@@ -1014,7 +1016,7 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
int c = static_cast<int>(stream.get_wchar(i));
bool is_correct = false;
if( utf8_check_range(c) )
if( is_correct_unicode_char(c) )
{
// CHECKME test me when sizeof(wchar_t) == 2
if( is_first_surrogate_char(c) )
@@ -1067,19 +1069,19 @@ bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear
/*!
this function converts a wide stream into a utf8 string
input:
buffer - a wide stream for reading from
output:
utf8 - an output utf8 string
max_buffer_len - how many characters can be write (we write the terminating null character too)
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
this function returns false if there were some errors when converting or if the output buffer was too short
*/
/*
* convert a wide stream into an UTF-8 string
*
* input:
* buffer - a wide stream for reading from
*
* output:
* utf8 - an output utf8 string
* max_buffer_size - how many characters can be written (we write the terminating null character too)
* was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficiently large
*
* this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{