diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp index 9f79f9a..fc841a6 100644 --- a/src/space/spaceparser.cpp +++ b/src/space/spaceparser.cpp @@ -1178,7 +1178,7 @@ void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte) } } - if( !ok || !pt::utf8_check_range(lastc) ) + if( !ok || !pt::is_correct_unicode_char(lastc) ) { lastc = 0xFFFD; // U+FFFD "replacement character"; } @@ -1207,7 +1207,7 @@ int i; value = (value << 4) | hex_to_int(c); } - if( i > 0 && c == '}' && pt::utf8_check_range(value) ) + if( i > 0 && c == '}' && pt::is_correct_unicode_char(value) ) { lastc = static_cast(value); } diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index 26c4f31..a613898 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -41,21 +41,21 @@ namespace pt { /* - * returns true if 'c' is a correct unicode character + * return true if 'c' is a correct unicode character */ -bool utf8_check_range(int c) +bool is_correct_unicode_char(int c) { return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF); } /* - * returns true if 'c' is a correct unicode character + * return true if 'c' is a correct unicode character * * this method is used when reading from an utf8 string * how_many_bytes - means how many bytes from the utf8 string were read */ -bool utf8_check_range(int c, int how_many_bytes) +bool is_correct_unicode_char(int c, int how_many_bytes) { if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 ) { @@ -124,10 +124,9 @@ bool surrogate_pair_to_int(int c1, int c2, int & z) /* - * an auxiliary function for converting from wide characters to UTF-8 - * converting a wide character into one int + * convert one wide (or two wide) characters to an int * - * returns how many wide characters were used + * return how many wide characters were used * if string_len is greater than 0 then the return value is always greater than zero too */ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) @@ -167,7 +166,7 
@@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool } else { - correct = utf8_check_range(z); + correct = is_correct_unicode_char(z); return 1; } } @@ -178,7 +177,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool * an auxiliary function for converting from wide characters to UTF-8 * converting a wide character into one int - * returns how many wide characters were used + * return how many wide characters were used * if wide_string has at least one character then the return value is always greater than zero too */ size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) @@ -201,7 +200,7 @@ return wide_to_int(wide_string, min_str_len, z, correct); /* - * converts an int to a wide string + * convert an int to a wide string * * this method will not terminate the output string with a null character * return how many characters have been written (0, 1 or 2) @@ -233,9 +232,9 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len) /* - * converts an int to a wide string + * convert an int to a wide string * - * returns true if a character was inserted to the string + * return true if a character was inserted to the string */ bool int_to_wide(int c, std::wstring & res) { @@ -258,6 +257,9 @@ bool int_to_wide(int c, std::wstring & res) /* + * convert one character into a stream + * stream can be an utf8 or a wide stream + * * return true if c was a correct unicode character * and has been put the the stream */ @@ -280,17 +282,17 @@ bool int_to_stream(int c, pt::Stream & stream) /* - * this function converts one UTF-8 character into one wide-character + * convert one UTF-8 character into one wide-character * * input: - * utf8 - an input UTF-8 string - * utf8_len - size of the input string, - * the string should be at least 4 bytes length for correctly - * recognized the utf-8 sequence + * utf8 - an input UTF-8 string + * utf8_len - size of the input string, + * the string should be at 
least 4 bytes length for correctly + * recognized the utf-8 sequence * * output: - * res - an output character - * correct - true if it is a correct character + * res - an output character + * correct - true if it is a correct character * * the function returns how many characters have been used from the input string * (returns zero only if utf8_len is zero) @@ -318,7 +320,7 @@ size_t i, len; return i; } - if( utf8_check_range(res, len) ) + if( is_correct_unicode_char(res, len) ) correct = true; return len; @@ -326,20 +328,20 @@ return len; -/*! - this function converts one UTF-8 character into one wide-character - - input: - utf8 - an input UTF-8 string (null terminated) - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input string - (returns zero only if the string has '\0' at the first character) - even if there are errors the functions returns a different from zero value -*/ +/* + * convert one UTF-8 character into one wide-character + * + * input: + * utf8 - an input UTF-8 string (null terminated) + * + * output: + * res - an output character + * correct - true if it is a correct character + * + * the function returns how many characters have been used from the input string + * (returns zero only if the string has '\0' at the first character) + * even if there are errors the functions returns a different from zero value + */ size_t utf8_to_int(const char * utf8, int & res, bool & correct) { size_t i, len; @@ -362,7 +364,7 @@ size_t i, len; return i; } - if( utf8_check_range(res, len) ) + if( is_correct_unicode_char(res, len) ) correct = true; return len; @@ -371,20 +373,20 @@ return len; -/*! 
- this function converts one UTF-8 character into one wide-character - - input: - utf8 - an input UTF-8 string - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input string - (returns zero only if utf8 is empty) - even if there are errors the functions returns a different from zero value -*/ +/* + * convert one UTF-8 character into one wide-character + * + * input: + * utf8 - an input UTF-8 string + * + * output: + * res - an output character + * correct - true if it is a correct character + * + * the function returns how many characters have been used from the input string + * (returns zero only if utf8 is empty) + * even if there are errors the functions returns a different from zero value + */ size_t utf8_to_int(const std::string & utf8, int & res, bool & correct) { return utf8_to_int(utf8.c_str(), utf8.size(), res, correct); @@ -392,18 +394,18 @@ size_t utf8_to_int(const std::string & utf8, int & res, bool & correct) -/*! 
- this function converts one UTF-8 character into one wide-character - - input: - utf8 - an input UTF-8 stream - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input stream -*/ +/* + * convert one UTF-8 character into one wide-character + * + * input: + * utf8 - an input UTF-8 stream + * + * output: + * res - an output character + * correct - true if it is a correct character + * + * the function returns how many characters have been used from the input stream + */ size_t utf8_to_int(std::istream & utf8, int & res, bool & correct) { size_t i, len; @@ -431,7 +433,7 @@ unsigned char uz; return i; } - if( utf8_check_range(res, len) ) + if( is_correct_unicode_char(res, len) ) correct = true; return len; @@ -465,7 +467,7 @@ unsigned char uz; return i + 1; } - if( utf8_check_range(res, len) ) + if( is_correct_unicode_char(res, len) ) correct = true; } else @@ -482,21 +484,21 @@ unsigned char uz; -/*! 
- this function converts an utf8 string into wide string (std::wstring) - - input: - utf8 - an input utf8 string - utf8_len - size of the input string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - res - an output wide string - - the function returns false if there were some errors when converting -*/ +/* + * convert an utf8 string into a wide string (std::wstring) + * + * input: + * utf8 - an input utf8 string + * utf8_len - size of the input string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * res - an output wide string + * + * the function returns false if there were some errors when converting + */ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode) { if( clear ) @@ -513,20 +515,20 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool c -/*! 
- this function converts an utf8 string into wide string (std::wstring) - - input: - utf8 - an input utf8 null terminated string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - res - an output wide string - - the function returns false if there were some errors when converting -*/ +/* + * convert an utf8 string into a wide string (std::wstring) + * + * input: + * utf8 - an input utf8 null terminated string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * res - an output wide string + * + * the function returns false if there were some errors when converting + */ bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear, int mode) { size_t utf8_len = 0; @@ -539,20 +541,20 @@ return utf8_to_wide(utf8, utf8_len, res, clear, mode); -/*! 
- this function converts an utf8 string into wide string (std::wstring) - - input: - utf8 - an input utf8 string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - res - an output wide string - - the function returns false if there were some errors when converting -*/ +/* + * convert an utf8 string into a wide string (std::wstring) + * + * input: + * utf8 - an input utf8 string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * res - an output wide string + * + * the function returns false if there were some errors when converting + */ bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int mode) { return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode); @@ -560,20 +562,20 @@ bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int -/*! 
- this function converts an utf8 stream into wide string (std::wstring) - - input: - utf8 - an input utf8 stream - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - res - an output wide string - - the function returns false if there were some errors when converting -*/ +/* + * convert an utf8 stream into a wide string (std::wstring) + * + * input: + * utf8 - an input utf8 stream + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * res - an output wide string + * + * the function returns false if there were some errors when converting + */ bool utf8_to_wide(std::istream & utf8, std::wstring & res, bool clear, int mode) { int z; @@ -603,26 +605,26 @@ return !was_error; -/*! - this function converts one wide character into UTF-8 sequence - - input: - z - wide character - - output: - utf8 - a buffer for the output sequence - utf8_len - the size of the buffer - - the function returns how many characters have been written to the utf8, - zero means the utf8 buffer is too small or 'z' is an incorrect unicode character -*/ +/* + * convert one wide character into an UTF-8 sequence + * + * input: + * z - wide character + * + * output: + * utf8 - a buffer for the output sequence + * utf8_len - the size of the buffer + * + * the function returns how many characters have been written to the utf8, + * zero means the utf8 buffer is too small or 'z' is an incorrect unicode character + */ size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len) { char buf[10]; int i = 0; int mask = 0x3f; // 6 first bits set - if( utf8_max_len==0 || !utf8_check_range(z) ) + if( utf8_max_len==0 || !is_correct_unicode_char(z) ) return 0; if( z <= 0x7f ) @@ -658,18 +660,18 @@ return a; -/*! 
- this function converts one wide character into UTF-8 string - - input: - z - wide character - - output: - utf8 - a UTF-8 string for the output sequence (the string is not cleared) - - the function returns how many characters have been written to the utf8 string, - zero means that 'z' is an incorrect unicode character -*/ +/* + * convert one wide character into an UTF-8 string + * + * input: + * z - wide character + * + * output: + * utf8 - a UTF-8 string for the output sequence (the string is not cleared) + * + * the function returns how many characters have been written to the utf8 string, + * zero means that 'z' is an incorrect unicode character + */ size_t int_to_utf8(int z, std::string & utf8, bool clear) { char buf[10]; @@ -688,21 +690,21 @@ return len; -/*! - this function converts a wide string into UTF-8 string - - input: - wide_string - a wide string for converting - string_len - the size of the string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 string for the output sequence (the string is not cleared) - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string into an UTF-8 string + * + * input: + * wide_string - a wide string for converting + * string_len - the size of the string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 string for the output sequence (the string is not cleared) + * + * this function returns false if there were some errors when converting + */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode) { bool was_error = false; @@ -723,20 +725,20 @@ return !was_error; -/*! 
- this function converts a wide string into UTF-8 string - - input: - wide_string - a null terminated wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 string for the output sequence (the string is not cleared) - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string into an UTF-8 string + * + * input: + * wide_string - a null terminated wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 string for the output sequence (the string is not cleared) + * + * this function returns false if there were some errors when converting + */ bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode) { bool was_error = false; @@ -752,20 +754,20 @@ return !was_error; -/*! 
- this function converts a wide string (std::wstring) into UTF-8 string - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 string for the output sequence (the string is not cleared) - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string (std::wstring) into an UTF-8 string + * + * input: + * wide_string - a wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 string for the output sequence (the string is not cleared) + * + * this function returns false if there were some errors when converting + */ bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, clear, mode); @@ -775,27 +777,27 @@ bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool cle -/*! 
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - string_len - lenght of the wide string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - utf8_written - how many bytes have been written to the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is not null terminated - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * string_len - lenght of the wide string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * utf8_written - how many bytes have been written to the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is not null terminated + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) { bool was_error = false; @@ -830,26 +832,26 @@ return !was_error; -/*! 
- this function converts a wide string (std::wstring) into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - utf8_written - how many bytes have been written to the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is not null terminated - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately -*/ +/* + * convert a wide string (std::wstring) into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * utf8_written - how many bytes have been written to the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is not null terminated + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + */ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode); @@ -857,27 +859,27 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len -/*! 
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - string_len - lenght of the wide string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is null terminated (even if there were errors during converting) - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately - (in both cases the utf8 buffer is null terminated) -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * string_len - lenght of the wide string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is null terminated (even if there were errors during converting) + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + * (in both cases the utf8 buffer is null terminated) + */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode) { size_t utf8_saved; @@ -894,26 +896,26 @@ return res; -/*! 
- this function converts a wide string (std::wstring) into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is null terminated (even if there were errors during converting) - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately - (in both cases the utf8 buffer is null terminated) -*/ +/* + * convert a wide string (std::wstring) into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is null terminated (even if there were errors during converting) + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + * (in both cases the utf8 buffer is null terminated) + */ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode); @@ -921,26 +923,26 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len -/*! 
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a null terminated wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - utf8_written - how many bytes have been written to the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is not null terminated - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a null terminated wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * utf8_written - how many bytes have been written to the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is not null terminated + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + */ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) { bool was_error = false; @@ -976,26 +978,26 @@ return !was_error; -/*! 
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a buffer for the UTF-8 stream - utf8_len - the size of the buffer - - this function returns false if there were some errors when converting or the output buffer was too small, - the output string is null terminated (even if there were errors during converting) - - if there is an error when converting (there is an incorrect character in the wide string) the function - will continue converting but if the buffer is too small the function breaks immediately - (in both cases the utf8 buffer is null terminated) -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" istead of the invalid character (default) + * + * output: + * utf8 - a buffer for the UTF-8 stream + * utf8_len - the size of the buffer + * + * this function returns false if there were some errors when converting or the output buffer was too small, + * the output string is null terminated (even if there were errors during converting) + * + * if there is an error when converting (there is an incorrect character in the wide string) the function + * will continue converting but if the buffer is too small the function breaks immediately + * (in both cases the utf8 buffer is null terminated) + */ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode) { size_t utf8_saved; @@ -1017,9 +1019,9 @@ return res; namespace private_namespace { -/*! 
- an auxiliary function for converting from UTF-8 string -*/ +/* + * an auxiliary function for converting from UTF-8 string + */ bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res) { for(len=0 ; (uz & 0x80) != 0 ; ++len) @@ -1041,9 +1043,9 @@ return true; -/*! - an auxiliary function for converting from UTF-8 string -*/ +/* + * an auxiliary function for converting from UTF-8 string + */ bool utf8_to_int_add_next_octet(unsigned char uz, int & res) { if( (uz & 0xc0) != 0x80 ) @@ -1057,20 +1059,20 @@ return true; -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too - - utf8_written - how many characters were saved in the utf8 string (the string doesn't have - a null terminating character) - it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read - was_utf8_buf_too_small - will be true if the utf8 buffer is too small - if this flag is true then utf8_written is equal to zero - was_error - will be true if there is an error when converting (there was an incorrect wide character) - (was_error will not be true if the utf8 buffer is too small) -*/ +/* + * an auxiliary function for converting from wide characters to UTF-8 + * + * return how many wide characters were used + * if string_len is greater than 0 then the return value is always greater than zero too + * + * utf8_written - how many characters were saved in the utf8 string (the string doesn't have + * a null terminating character) + * it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read + * was_utf8_buf_too_small - will be true if the utf8 buffer is too small + * if this flag is true then utf8_written is equal to zero + * was_error - will be true if there is an error when converting (there was an incorrect wide character) + * (was_error will not be 
true if the utf8 buffer is too small) + */ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) { @@ -1107,12 +1109,12 @@ return chars; -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ +/* + * an auxiliary function for converting from wide characters to UTF-8 + * + * return how many wide characters were used + * if string_len is greater than 0 then the return value is always greater than zero too + */ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) { int z; @@ -1137,12 +1139,12 @@ return chars; -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ +/* + * an auxiliary function for converting from wide characters to UTF-8 + * + * return how many wide characters were used + * if wide_string has at least one character then the return value is always greater than zero too + */ size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) { int z; diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index 51f619b..1f2106e 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -42,85 +42,80 @@ namespace pt { + /* - * public methods are also defined in utf8_stream.h + * UTF-8, a transformation format of ISO 10646 + * http://tools.ietf.org/html/rfc3629 * + * when wchar_t is 4 bytes length we use UTF-32 + * when wchar_t is 2 bytes length we use UTF-16 (with surrogate pairs) + * + * UTF-16 + * http://www.ietf.org/rfc/rfc2781.txt */ -/*! 
- UTF-8, a transformation format of ISO 10646 - http://tools.ietf.org/html/rfc3629 - - when wchar_t is 4 bytes length we use UTF-32 - when wchar_t is 2 bytes length we use UTF-16 (with surrogate pairs) - - UTF-16 - http://www.ietf.org/rfc/rfc2781.txt -*/ - - - -/*! - returns true if 'c' is a correct unicode character - - RENAMEME to is_correct_unicode_char -*/ -bool utf8_check_range(int c); - - -/*! - returns true if 'c' is a correct unicode character - - this method is used when reading from an utf8 string - how_many_chars - means how many characters from utf8 string were read -*/ -bool utf8_check_range(int c, int how_many_bytes); +/* + * return true if 'c' is a correct unicode character + */ +bool is_correct_unicode_char(int c); /* - * returns true if 'c' is a characters from the surrogate range + * return true if 'c' is a correct unicode character + * + * this method is used when reading from an utf8 string + * how_many_bytes - means how many bytes from the utf8 string were read + */ +bool is_correct_unicode_char(int c, int how_many_bytes); + + +/* + * return true if 'c' is a character from the surrogate range * (c>=0xD800 && c<=0xDFFF) * */ bool is_surrogate_char(int c); - /* - * returns true if 'c' is a first character from the surrogate pair + * return true if 'c' is a first character from the surrogate pair * (c>=0xD800 && c<=0xDBFF) */ bool is_first_surrogate_char(int c); /* - * returns true if 'c' is a second character from the surrogate pair + * return true if 'c' is a second character from the surrogate pair * (c>=0xDC00 && c<=0xDFFF) */ bool is_second_surrogate_char(int c); /* - * returns a code point from two surrogate pair characters + * return a code point from two surrogate pair characters */ bool surrogate_pair_to_int(int c1, int c2, int & z); - /* - * converting one character into a stream - * stream can be an utf8 or wide stream + * convert one character into a stream + * stream can be an utf8 or a wide stream + * + * return true if c was a
correct unicode character + * and has been put to the stream + */ bool int_to_stream(int c, pt::Stream & stream); /* - * converting a one unicode character to an int - * such an unicode character can consists of one or two wide characters + * convert one wide (or two wide) characters to an int + * + * return how many wide characters were used + * if string_len is greater than 0 then the return value is always greater than zero too */ -size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public? +size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); @@ -134,9 +129,9 @@ size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); * */ -/*! - converting one character from UTF-8 to an int -*/ +/* + * convert one character from UTF-8 to an int + */ size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct); size_t utf8_to_int(const char * utf8, int & res, bool & correct); size_t utf8_to_int(const std::string & utf8, int & res, bool & correct); @@ -147,43 +142,46 @@ template size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct); -/*! - converting one character from int to wide stream - - returns true if a character was inserted to the stream -*/ +/* + * convert one character from an int to a wide stream + * + * return true if a character was inserted to the stream + */ template bool int_to_wide(int c, StreamType & res); -/*!
- converting one character from int to wide string - - this method will not terminate the output string with a null character - return how many characters have been written (0, 1 or 2) -*/ +/* + * convert one character from an int to a wide string + * + * this method will not terminate the output string with a null character + * return how many characters have been written (0, 1 or 2) + */ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len); -/*! - converting one character from int to wide string - - returns true if a character was inserted to the string -*/ +/* + * convert one character from an int to a wide string + * + * return true if a character was inserted to the string + */ bool int_to_wide(int c, std::wstring & res); -/*! - call a convert_function for each character from an utf8 string +/* + * call a convert_function for each character from an utf8 string + * + * this function takes one int parameter: + * output_function(int c) */ template bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode = 1); -/*! - converting UTF-8 string to a wide string -*/ +/* + * convert an UTF-8 string to a wide string + */ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear = true, int mode = 1); bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear = true, int mode = 1); bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear = true, int mode = 1); @@ -216,7 +214,9 @@ bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamItera template class TextStreamBase; -// defined at the end in textstream.h +/* + * this method is defined at the end of textstream.h + */ template bool utf8_to_wide(const TextStreamBase & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); @@ -239,9 +239,9 @@ bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_bu */ -/*! 
- converting one int character to UTF-8 -*/ +/* + * convert one int character to UTF-8 + */ size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len); size_t int_to_utf8(int z, std::string & utf8, bool clear = true); @@ -249,21 +249,23 @@ template size_t int_to_utf8(int z, StreamType & utf8); -/*! - call an output_function for some sequence of wide characters from the stream buffer - - output_function has two arguments: const char * buf, size_t len: - output_function(const char * buf, size_t len) - - StreamType should have a const_iterator and begin() and end() methods -*/ +/* + * call an output_function for some sequence of wide characters from the stream buffer + * + * output_function takes two arguments: const char * buf, size_t len: + * output_function(const char * buf, size_t len) + * this is a buffer which was filled with utf8 characters + * (this buffer can have up to 256 characters) + * + * StreamType should have a const_iterator and begin() and end() methods + */ template bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1); -/*! - converting a wide string to UTF-8 string -*/ +/* + * convert a wide string to an UTF-8 string + */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear = true, int mode = 1); bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1); bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1); @@ -299,7 +301,9 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe - +/* + * some private/auxiliary methods + */ namespace private_namespace { bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); @@ -313,12 +317,12 @@ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::str size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode); -/*! 
- an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ +/* + * an auxiliary function for converting from wide characters to UTF-8 + * + * return how many wide characters were used + * if string_len is greater than 0 then the return value is always greater than zero too + */ template static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode) { @@ -339,13 +343,13 @@ size_t chars; was_error = true; } -return chars; + return chars; } -/*! - an auxiliary function for converting from wide characters to UTF-8 -*/ +/* + * an auxiliary function for converting from wide characters to UTF-8 + */ template static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode) { @@ -357,7 +361,7 @@ static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, b if( *(wide_string+1) != 0 ) min_str_len = 2; -return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode); + return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode); } } // namespace private_namespace @@ -389,19 +393,19 @@ bool int_to_wide(int c, StreamType & res) } -/*! 
- this function converts one UTF-8 character into int - - input: - iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input stream -*/ +/* + * convert one UTF-8 character into int + * + * input: + * iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only) + * iterator_end - an end iterator + * + * output: + * res - an output character + * correct - true if it is a correct character + * + * the function returns how many characters have been used from the input stream + */ template size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct) { @@ -432,7 +436,7 @@ unsigned char uz; return i + 1; } - if( utf8_check_range(res, len) ) + if( is_correct_unicode_char(res, len) ) correct = true; return len; @@ -440,11 +444,10 @@ return len; -/*!
- converting UTF-8 string to a TextStreamBase stream - (need to be tested) -*/ -// need to be tested +/* + * convert UTF-8 string to a TextStreamBase stream + * (need to be tested) + */ template bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode) { @@ -642,19 +645,19 @@ bool wide_to_output_function(StreamType & buffer, OutputFunction output_function /* -this function converts a UTF-8 stream into a wide stream or a wide string - -input: - stream - a UTF-8 stream for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - -output: - res - a wide stream or a wide string for the output sequence - - this function returns false if there were some errors when converting -*/ + * convert a UTF-8 stream into a wide stream or a wide string + * + * input: + * stream - a UTF-8 stream for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" instead of the invalid character (default) + * + * output: + * res - a wide stream or a wide string for the output sequence + * + * this function returns false if there were some errors when converting + */ template bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) { @@ -668,21 +671,20 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i /* -this function reads characters from a UTF-8 stream and calls an output_function - -input: - stream - a UTF-8 stream for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - -output: - output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType - and should put the character to the output string/stream, this function should have
the signature like this: - output_function(int z, StreamOrStringType & res) - - this function returns false if there were some errors when converting -*/ + * read characters from an UTF-8 stream and call an output_function + * + * input: + * stream - a UTF-8 stream for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" instead of the invalid character (default) + * + * output: + * output_function - a function which takes one argument: an int (a character): + * output_function(int c) + * + * this function returns false if there were some errors when converting + */ template bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode) { @@ -723,18 +725,18 @@ bool utf8_to_output_function(const Stream & stream, OutputFunction output_functi -/*! - this function converts UTF-8 stream into a wide stream or a wide string - - input: - iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string) - - this function returns false if there were some errors when converting -*/ +/* + * convert an UTF-8 stream into a wide stream or a wide string + * + * input: + * iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only) + * iterator_end - an end iterator + * + * output: + * out_stream - an output wide stream or wide string (the stream can be of any kind, we use only << operator for a stream and += for a string) + * + * this function returns false if there were some errors when converting + */ template bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode) { @@ -776,20
+778,20 @@ bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamItera -/*! - this function converts UTF-8 stream into a wide string - - input: - iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - out_buffer - an output wide string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ +/* + * convert an UTF-8 stream into a wide string + * + * input: + * iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only) + * iterator_end - an end iterator + * + * output: + * out_buffer - an output wide string + * max_buffer_len - how many characters can be written (we write the terminating null character too) + * was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficiently large + * + * this function returns false if there were some errors when converting or if the output buffer was too short + */ template bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large) { @@ -849,19 +851,19 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i -/*!
- this function converts UTF-8 stream into a wide string - - input: - stream - a stream for reading from - - output: - out_buffer - an output wide string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ +/* + * convert an UTF-8 stream into a wide string + * + * input: + * stream - a stream for reading from + * + * output: + * out_buffer - an output wide string + * max_buffer_len - how many characters can be write (we write the terminating null character too) + * was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + * + * this function returns false if there were some errors when converting or if the output buffer was too short + */ template bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode) { @@ -873,18 +875,18 @@ bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_bu -/*! - this function converts one wide character into UTF-8 stream - - input: - z - wide character - - output: - utf8 - a UTF-8 stream for the output sequence - - the function returns how many characters have been written to the utf8 stream, - zero means that 'z' is an incorrect unicode character -*/ +/* + * convert one wide character into an UTF-8 stream + * + * input: + * z - wide character + * + * output: + * utf8 - a UTF-8 stream for the output sequence + * + * the function returns how many characters have been written to the utf8 stream, + * zero means that 'z' is an incorrect unicode character + */ template size_t int_to_utf8(int z, StreamType & utf8) { @@ -902,21 +904,21 @@ size_t int_to_utf8(int z, StreamType & utf8) -/*! 
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - string_len - size of the string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * string_len - size of the string + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" instead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 stream for the output sequence + * + * this function returns false if there were some errors when converting + */ template bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode) { @@ -937,20 +939,20 @@ return !was_error; -/*!
- this function converts a wide string into UTF-8 stream - - input: - wide_string - a null terminated wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string into an UTF-8 stream + * + * input: + * wide_string - a null terminated wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" instead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 stream for the output sequence + * + * this function returns false if there were some errors when converting + */ template bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode) { @@ -964,20 +966,20 @@ return !was_error; -/*!
- this function converts a wide string (std::wstring) into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ +/* + * convert a wide string (std::wstring) into an UTF-8 stream + * + * input: + * wide_string - a wide string for converting + * mode - what to do with errors when converting + * 0: skip an invalid character + * 1: put U+FFFD "replacement character" instead of the invalid character (default) + * + * output: + * utf8 - a UTF-8 stream for the output sequence + * + * this function returns false if there were some errors when converting + */ template bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode) { @@ -1014,7 +1016,7 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i int c = static_cast(stream.get_wchar(i)); bool is_correct = false; - if( utf8_check_range(c) ) + if( is_correct_unicode_char(c) ) { // CHECKME test me when sizeof(wchar_t) == 2 if( is_first_surrogate_char(c) ) @@ -1067,19 +1069,19 @@ bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear -/*!
- this function converts a wide stream into a utf8 string - - input: - buffer - a wide stream for reading from - - output: - utf8 - an output utf8 string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ +/* + * convert a wide stream into an UTF-8 string + * + * input: + * buffer - a wide stream for reading from + * + * output: + * utf8 - an output utf8 string + * max_buffer_len - how many characters can be write (we write the terminating null character too) + * was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + * + * this function returns false if there were some errors when converting or if the output buffer was too short + */ template bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode) {