From 4d70ae9e875eee66d31da0fefb196c0c5b1aadec Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Mon, 14 Jun 2021 13:48:32 +0200 Subject: [PATCH] fixed: using size() when serializing strings - this allows to serialize a string which contain a null character fixed: printing null character in space format: \u0000 (before was \0 which is not correct in json) fixed: in serialize_string_buffer(const char * input_str, ...) a temporary fixed was used when copying input string added support for surrogate pairs when reading \uHHHH format added support to parse \u{H...} format (only if parsing Space format) --- src/space/space.cpp | 2 +- src/space/space.h | 89 ++++++++++++++++++++-------- src/space/spaceparser.cpp | 121 ++++++++++++++++++++++++++++++++++++-- src/space/spaceparser.h | 4 ++ src/utf8/utf8.cpp | 38 ++++++++++++ src/utf8/utf8.h | 31 ++++++++++ src/utf8/utf8_private.cpp | 6 +- src/utf8/utf8_private.h | 4 ++ tests/main.cpp | 6 +- 9 files changed, 262 insertions(+), 39 deletions(-) diff --git a/src/space/space.cpp b/src/space/space.cpp index bc334c2..6d36ef6 100644 --- a/src/space/space.cpp +++ b/src/space/space.cpp @@ -953,7 +953,7 @@ std::string Space::to_str() const if( type == type_wstring ) { TextStream stream; - serialize_string_buffer(value.value_wstring.c_str(), stream, Escape::no_escape); + serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), stream, Escape::no_escape); stream.to_string(str); return str; } diff --git a/src/space/space.h b/src/space/space.h index 8cba9f2..1a79748 100644 --- a/src/space/space.h +++ b/src/space/space.h @@ -768,8 +768,18 @@ protected: template void escape_to_space_format(int c, StreamType & out) const { - // IMPLEMENT ME - escape_to_json_format(c, out); + switch(c) + { + case 0: out << '\\'; out << 'u' << '{' << '0' << '}'; break; + case '\r': out << '\\'; out << 'r'; break; // 13 + case '\n': out << '\\'; out << 'n'; break; // 10 + case '\\': out << '\\'; out << '\\'; break; + case '"': out << '\\'; out << '\"'; break; + case '\b': out << '\\'; out << 'b'; break; // 8 + case '\f': out << '\\'; out << 'f'; break; // 12 + default: + out << static_cast(c); + } } @@ -778,14 +788,14 @@ protected: { switch(c) { - case 0: out << '\\'; out << '0'; break; - case '\r': out << '\\'; out << 'r'; break; - case '\n': out << '\\'; out << 'n'; break; - case '\\': out << '\\'; out << '\\'; break; - case '"': out << '\\'; out << '\"'; break; - //case '(': out << '\\'; out << '('; break; - //case ')': out << '\\'; out << ')'; break; - //case '=': out << '\\'; out << '='; break; + case 0: out << '\\'; out << 'u' << '0' << '0' << '0' << '0'; break; + case '\r': out << '\\'; out << 'r'; break; // 13 + case '\n': out << '\\'; out << 'n'; break; // 10 + case '\\': out << '\\'; out << '\\'; break; + case '"': out << '\\'; out << '\"'; break; + case '\t': out << '\\'; out << 't'; break; // 9 + case '\b': out << '\\'; out << 'b'; break; // 8 + case '\f': out << '\\'; out << 'f'; break; // 12 default: out << static_cast(c); } @@ -811,6 +821,22 @@ protected: } } + template + void copy_input_string_to_output(const CharT * input_str, size_t len, StreamType & out_str, Escape escape) const + { + for(size_t i=0 ; i < len ; ++i) + { + if( escape == Escape::no_escape ) + out_str << static_cast(input_str[i]); + else + if( escape == Escape::escape_space ) + escape_to_space_format(input_str[i], out_str); + else + if( escape == Escape::escape_json ) + escape_to_json_format(input_str[i], out_str); + } + } + template void copy_input_stream_to_output(const StreamType & input_str, StreamType & out_str, Escape escape) const { @@ -833,23 +859,37 @@ protected: template - void serialize_string_buffer(const char * input_str, StreamType & out_str, Escape escape) const + void serialize_string_buffer(const char * input_str, size_t len, StreamType & out_str, Escape escape) const { if constexpr ( sizeof(char) == sizeof(typename StreamType::char_type) ) { // input and output are char (we assume it is utf8) - copy_input_string_to_output(input_str, out_str, escape); + copy_input_string_to_output(input_str, len, out_str, escape); } else { // input is utf8 but output is wide - copy_input_string_to_output(input_str, out_str, escape); // temporarily + StreamType temp_stream; + utf8_to_wide(input_str, len, temp_stream, false); + copy_input_stream_to_output(temp_stream, out_str, escape); + } + } - // !!!!!!!!!!!!!!!!!!! FIXME -// StreamType temp_stream; -// UTF8ToWide(input_str, temp_stream, false); -// -// copy_input_stream_to_output(temp_stream, out_str, escape); + + template + void serialize_string_buffer(const wchar_t * input_str, size_t len, StreamType & out_str, Escape escape) const + { + if constexpr ( sizeof(wchar_t) == sizeof(typename StreamType::char_type) ) + { + // input and output are wide characters + copy_input_string_to_output(input_str, len, out_str, escape); + } + else + { + // input is wide but output is utf8 + StreamType temp_stream; + wide_to_utf8(input_str, len, temp_stream, false); + copy_input_stream_to_output(temp_stream, out_str, escape); } } @@ -864,17 +904,14 @@ protected: } else { - StreamType temp_stream; - // input is wide but output is utf8 + StreamType temp_stream; wide_to_utf8(input_str, temp_stream, false); copy_input_stream_to_output(temp_stream, out_str, escape); } } - - template void serialize_space_null(StreamType & str) const { @@ -937,7 +974,7 @@ protected: void serialize_space_string(StreamType & str) const { str << '"'; - serialize_string_buffer(value.value_string.c_str(), str, Escape::escape_space); + serialize_string_buffer(value.value_string.c_str(), value.value_string.size(), str, Escape::escape_space); str << '"'; } @@ -945,7 +982,7 @@ protected: void serialize_space_wstring(StreamType & str) const { str << '"'; - serialize_string_buffer(value.value_wstring.c_str(), str, Escape::escape_space); + serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), str, Escape::escape_space); str << '"'; } @@ -1121,7 +1158,7 @@ protected: void serialize_json_string(StreamType & str) const { str << '"'; - serialize_string_buffer(value.value_string.c_str(), str, Escape::escape_json); + serialize_string_buffer(value.value_string.c_str(), value.value_string.size(), str, Escape::escape_json); str << '"'; } @@ -1129,7 +1166,7 @@ protected: void serialize_json_wstring(StreamType & str) const { str << '"'; - serialize_string_buffer(value.value_wstring.c_str(), str, Escape::escape_json); + serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), str, Escape::escape_json); str << '"'; } diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp index a5704f8..0fb5570 100644 --- a/src/space/spaceparser.cpp +++ b/src/space/spaceparser.cpp @@ -1037,25 +1037,130 @@ return 0; } -void SpaceParser::read_unicode_code_point() +/* + * format: \uHHHH where H is a hex digit 0-F + */ +bool SpaceParser::read_unicode_four_digit_format(bool has_first_byte, int first_byte) { -wchar_t c; +int c; int value = 0; for(int i=0 ; i<4 ; ++i) { - c = read_char_no_escape(); + if( i == 0 && has_first_byte ) + { + c = first_byte; + } + else + { + c = read_char_no_escape(); + } if( !is_hex_digit(c) ) { - status = syntax_error; - return; + return false; } value = (value << 4) | hex_to_int(c); } - lastc = (wchar_t)value; + lastc = static_cast(value); + return true; +} + + + +/* + * format: \uHHHH and optionally following by \uHHHH + * + */ +void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte) +{ + bool ok = read_unicode_four_digit_format(has_first_byte, first_byte); + + if( ok && pt::is_first_surrogate_char(lastc) ) + { + int c1 = lastc; + int c = read_char_no_escape(); + + ok = ok && (c == '\\'); + + if( ok ) + { + c = read_char_no_escape(); + ok = ok && (c == 'u'); + ok = ok && read_unicode_four_digit_format(false, 0); + + if( ok && pt::is_second_surrogate_char(lastc) ) + { + int c2 = lastc; + ok = ok && pt::surrogate_pair_to_int(c1, c2, lastc); + } + } + } + + if( !ok || !pt::utf8_check_range(lastc) ) + { + lastc = 0xFFFD; // U+FFFD "replacement character"; + } +} + + +/* + * format: \u{H...} where H is a hex digit 0-F, minimum digits: 1, maximum digits: 6 + */ +void SpaceParser::read_unicode_floating_format() +{ +int c; +int value = 0; +int i; + + // max 6 hex digits + '}' + for(i=0 ; i<7 ; ++i) + { + c = read_char_no_escape(); + + if( !is_hex_digit(c) ) + { + break; + } + + value = (value << 4) | hex_to_int(c); + } + + if( i > 0 && c == '}' && pt::utf8_check_range(value) ) + { + lastc = static_cast(value); + } + else + { + lastc = 0xFFFD; // U+FFFD "replacement character"; + } +} + + + + + +void SpaceParser::read_unicode_code_point() +{ + if( parsing_space ) + { + int c = read_char_no_escape(); + + if( c == '{' ) + { + read_unicode_floating_format(); + } + else + { + read_unicode_json_format(true, c); + } + } + else + { + read_unicode_json_format(false, 0); + } } @@ -1087,6 +1192,10 @@ return lastc; + + + + } // namespace diff --git a/src/space/spaceparser.h b/src/space/spaceparser.h index 8b89772..818b260 100644 --- a/src/space/spaceparser.h +++ b/src/space/spaceparser.h @@ -300,6 +300,10 @@ private: void trim_last_white(std::wstring & s); bool is_hex_digit(wchar_t c); int hex_to_int(wchar_t c); + + bool read_unicode_four_digit_format(bool has_first_byte, int first_byte); + void read_unicode_json_format(bool has_first_byte, int first_byte); + void read_unicode_floating_format(); void read_unicode_code_point(); }; diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index 325de87..d891dd6 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -94,6 +94,43 @@ return false; +bool is_surrogate_char(int c) +{ + return (c>=0xD800 && c<=0xDFFF); +} + + +bool is_first_surrogate_char(int c) +{ + return (c>=0xD800 && c<=0xDBFF); +} + + +bool is_second_surrogate_char(int c) +{ + return (c>=0xDC00 && c<=0xDFFF); +} + + +bool surrogate_pair_to_int(int c1, int c2, int & z) +{ + z = 0xFFFD; // U+FFFD "replacement character"; + + if( is_first_surrogate_char(c1) ) + { + if( is_second_surrogate_char(c2) ) + { + z = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF)); + return true; + } + } + + return false; +} + + + + /*! this function converts one UTF-8 character into one wide-character @@ -276,6 +313,7 @@ static void int_to_wide(int c, std::wstring & res) + /*! this function converts an utf8 string into wide string (std::wstring) diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index 65dbda9..7edabda 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -73,6 +73,37 @@ bool utf8_check_range(int c); bool utf8_check_range(int c, int how_many_bytes); + +/* + * returns true if 'c' is a characters from the surrogate range + * (c>=0xD800 && c<=0xDFFF) + * + */ +bool is_surrogate_char(int c); + + + +/* + * returns true if 'c' is a first character from the surrogate pair + * (c>=0xD800 && c<=0xDBFF) + */ +bool is_first_surrogate_char(int c); + + +/* + * returns true if 'c' is a second character from the surrogate pair + * (c>=0xDC00 && c<=0xDFFF) + */ +bool is_second_surrogate_char(int c); + + +/* + * returns a code point from two surrogate pair characters + */ +bool surrogate_pair_to_int(int c1, int c2, int & z); + + + /* * * diff --git a/src/utf8/utf8_private.cpp b/src/utf8/utf8_private.cpp index 54aa3c1..31a68ec 100644 --- a/src/utf8/utf8_private.cpp +++ b/src/utf8/utf8_private.cpp @@ -105,13 +105,13 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool z = static_cast(*wide_string); correct = true; - if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) ) + if( sizeof(wchar_t) == 2 && is_surrogate_char(z) ) { - if( z>=0xD800 && z<=0xDBFF && string_len>1 ) + if( is_first_surrogate_char(z) && string_len>1 ) { int z2 = *(wide_string+1); - if( z2>=0xDC00 && z2<=0xDFFF ) + if( is_second_surrogate_char(z2) ) { z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); return 2; diff --git a/src/utf8/utf8_private.h b/src/utf8/utf8_private.h index 5ea815f..1fb3726 100644 --- a/src/utf8/utf8_private.h +++ b/src/utf8/utf8_private.h @@ -48,6 +48,10 @@ bool utf8_check_range(int c); size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len); size_t int_to_utf8(int z, std::string & utf8, bool clear); size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct); +bool is_surrogate_char(int c); +bool is_first_surrogate_char(int c); +bool is_second_surrogate_char(int c); +bool surrogate_pair_to_int(int c1, int c2, int & z); namespace private_namespace diff --git a/tests/main.cpp b/tests/main.cpp index 7b53280..323ab20 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -44,9 +44,9 @@ namespace pt { -bool was_error = false; -int test_counter = 0; -const char * test_msg = nullptr; + bool was_error = false; + int test_counter = 0; + const char * test_msg = nullptr; }