Merge branch 'api2021'

Added support for surrogate pairs when reading the \uHHHH format; added support for parsing the \u{H...} format (only when parsing the Space format); plus some fixes.
2021-06-14 13:51:53 +02:00
parent 848cdf9c03 4d70ae9e87
commit 55f6bda3ab
9 changed files with 262 additions and 39 deletions
--- a/src/space/space.cpp
+++ b/src/space/space.cpp
@@ -953,7 +953,7 @@ std::string Space::to_str() const
 	if( type == type_wstring )
 	{
 		TextStream stream;
-		serialize_string_buffer(value.value_wstring.c_str(), stream, Escape::no_escape);
+		serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), stream, Escape::no_escape);
 		stream.to_string(str);
 		return str;
 	}
--- a/src/space/space.h
+++ b/src/space/space.h
@@ -768,8 +768,18 @@ protected:
 	template<typename StreamType>
 	void escape_to_space_format(int c, StreamType & out) const
 	{
-		// IMPLEMENT ME
-		escape_to_json_format(c, out);
+		switch(c)
+		{
+		case 0:		out << '\\';	out << 'u' << '{' << '0' << '}';	break;
+		case '\r':	out << '\\';	out << 'r';		break;	// 13
+		case '\n':	out << '\\';	out << 'n';		break;	// 10
+		case '\\':	out << '\\';	out << '\\';	break;
+		case '"':	out << '\\';	out << '\"';	break;
+		case '\b':	out << '\\';	out << 'b';		break;	// 8
+		case '\f':	out << '\\';	out << 'f';		break;	// 12
+		default:
+			out << static_cast<typename StreamType::char_type>(c);
+		}
 	}


@@ -778,14 +788,14 @@ protected:
 	{
 		switch(c)
 		{
-		case 0:		out << '\\';	out << '0';		break;
-		case '\r':	out << '\\';	out << 'r';		break;
-		case '\n':	out << '\\';	out << 'n';		break;
-		case '\\':	out << '\\';	out << '\\';		break;
-		case '"':	out << '\\';	out << '\"';		break;
-		//case '(':	out << '\\';	out << '(';		break;
-		//case ')':	out << '\\';	out << ')';		break;
-		//case '=':	out << '\\';	out << '=';		break;
+		case 0:		out << '\\';	out << 'u' << '0' << '0' << '0' << '0';		break;
+		case '\r':	out << '\\';	out << 'r';		break;	// 13
+		case '\n':	out << '\\';	out << 'n';		break;	// 10
+		case '\\':	out << '\\';	out << '\\';	break;
+		case '"':	out << '\\';	out << '\"';	break;
+		case '\t':	out << '\\';	out << 't';		break;	// 9
+		case '\b':	out << '\\';	out << 'b';		break;	// 8
+		case '\f':	out << '\\';	out << 'f';		break;	// 12
 		default:
 			out << static_cast<typename StreamType::char_type>(c);
 		}
@@ -811,6 +821,22 @@ protected:
 		}
 	}

+	template<typename CharT, typename StreamType>
+	void copy_input_string_to_output(const CharT * input_str, size_t len, StreamType & out_str, Escape escape) const
+	{
+		for(size_t i=0 ; i < len ; ++i)
+		{
+			if( escape == Escape::no_escape )
+				out_str << static_cast<typename StreamType::char_type>(input_str[i]);
+			else
+			if( escape == Escape::escape_space )
+				escape_to_space_format(input_str[i], out_str);
+			else
+			if( escape == Escape::escape_json )
+				escape_to_json_format(input_str[i], out_str);
+		}
+	}
+
 	template<typename StreamType>
 	void copy_input_stream_to_output(const StreamType & input_str, StreamType & out_str, Escape escape) const
 	{
@@ -833,23 +859,37 @@ protected:


 	template<typename StreamType>
-	void serialize_string_buffer(const char * input_str, StreamType & out_str, Escape escape) const
+	void serialize_string_buffer(const char * input_str, size_t len, StreamType & out_str, Escape escape) const
 	{
 		if constexpr ( sizeof(char) == sizeof(typename StreamType::char_type) )
 		{
 			// input and output are char (we assume it is utf8)
-			copy_input_string_to_output(input_str, out_str, escape);
+			copy_input_string_to_output(input_str, len, out_str, escape);
 		}
 		else
 		{
 			// input is utf8 but output is wide
-			copy_input_string_to_output(input_str, out_str, escape); // temporarily
+			StreamType temp_stream;
+			utf8_to_wide(input_str, len, temp_stream, false);
+			copy_input_stream_to_output(temp_stream, out_str, escape);
+		}
+	}

-			// !!!!!!!!!!!!!!!!!!! FIXME
-//			StreamType temp_stream;
-//			UTF8ToWide(input_str, temp_stream, false);
-//
-//			copy_input_stream_to_output(temp_stream, out_str, escape);
+
+	template<typename StreamType>
+	void serialize_string_buffer(const wchar_t * input_str, size_t len, StreamType & out_str, Escape escape) const
+	{
+		if constexpr ( sizeof(wchar_t) == sizeof(typename StreamType::char_type) )
+		{
+			// input and output are wide characters
+			copy_input_string_to_output(input_str, len, out_str, escape);
+		}
+		else
+		{
+			// input is wide but output is utf8
+			StreamType temp_stream;
+			wide_to_utf8(input_str, len, temp_stream, false);
+			copy_input_stream_to_output(temp_stream, out_str, escape);
 		}
 	}

@@ -864,17 +904,14 @@ protected:
 		}
 		else
 		{
-			StreamType temp_stream;
-
 			// input is wide but output is utf8
+			StreamType temp_stream;
 			wide_to_utf8(input_str, temp_stream, false);
 			copy_input_stream_to_output(temp_stream, out_str, escape);
 		}
 	}


-
-
 	template<typename StreamType>
 	void serialize_space_null(StreamType & str) const
 	{
@@ -937,7 +974,7 @@ protected:
 	void serialize_space_string(StreamType & str) const
 	{
 		str << '"';
-		serialize_string_buffer(value.value_string.c_str(), str, Escape::escape_space);
+		serialize_string_buffer(value.value_string.c_str(), value.value_string.size(), str, Escape::escape_space);
 		str << '"';
 	}

@@ -945,7 +982,7 @@ protected:
 	void serialize_space_wstring(StreamType & str) const
 	{
 		str << '"';
-		serialize_string_buffer(value.value_wstring.c_str(), str, Escape::escape_space);
+		serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), str, Escape::escape_space);
 		str << '"';
 	}

@@ -1121,7 +1158,7 @@ protected:
 	void serialize_json_string(StreamType & str) const
 	{
 		str << '"';
-		serialize_string_buffer(value.value_string.c_str(), str, Escape::escape_json);
+		serialize_string_buffer(value.value_string.c_str(), value.value_string.size(), str, Escape::escape_json);
 		str << '"';
 	}

@@ -1129,7 +1166,7 @@ protected:
 	void serialize_json_wstring(StreamType & str) const
 	{
 		str << '"';
-		serialize_string_buffer(value.value_wstring.c_str(), str, Escape::escape_json);
+		serialize_string_buffer(value.value_wstring.c_str(), value.value_wstring.size(), str, Escape::escape_json);
 		str << '"';
 	}

--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -1037,25 +1037,130 @@ return 0;
 }


-void SpaceParser::read_unicode_code_point()
+/*
+ * format: \uHHHH where H is a hex digit 0-F
+ */
+bool SpaceParser::read_unicode_four_digit_format(bool has_first_byte, int first_byte)
 {
-wchar_t c;
+int c;
 int value = 0;

 	for(int i=0 ; i<4 ; ++i)
 	{
-		c = read_char_no_escape();
+		if( i == 0 && has_first_byte )
+		{
+			c = first_byte;
+		}
+		else
+		{
+			c = read_char_no_escape();
+		}

 		if( !is_hex_digit(c) )
 		{
-			status = syntax_error;
-			return;
+			return false;
 		}

 		value = (value << 4) | hex_to_int(c);
 	}

-	lastc = (wchar_t)value;
+	lastc = static_cast<wchar_t>(value);
+	return true;
+}
+
+
+
+/*
+ * format: \uHHHH, optionally followed by a second \uHHHH (a surrogate pair)
+ * if reading fails or the value is out of range then lastc is set to U+FFFD
+ */
+void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte)
+{
+	bool ok = read_unicode_four_digit_format(has_first_byte, first_byte);
+
+	if( ok && pt::is_first_surrogate_char(lastc) )
+	{
+		int c1 = lastc;
+		int c = read_char_no_escape();
+
+		ok = ok && (c == '\\');
+
+		if( ok )
+		{
+			c = read_char_no_escape();
+			ok = ok && (c == 'u');
+			ok = ok && read_unicode_four_digit_format(false, 0);
+
+			if( ok && pt::is_second_surrogate_char(lastc) )
+			{
+				int c2 = lastc;
+				ok = ok && pt::surrogate_pair_to_int(c1, c2, lastc);
+			}
+		}
+	}
+
+	if( !ok || !pt::utf8_check_range(lastc) )
+	{
+		lastc = 0xFFFD; // U+FFFD "replacement character";
+	}
+}
+
+
+/*
+ * format: \u{H...} where H is a hex digit 0-F, minimum digits: 1, maximum digits: 6
+ */
+void SpaceParser::read_unicode_floating_format()
+{
+int c;
+int value = 0;
+int i;
+
+	// max 6 hex digits + '}'
+	for(i=0 ; i<7 ; ++i)
+	{
+		c = read_char_no_escape();
+
+		if( !is_hex_digit(c) )
+		{
+			break;
+		}
+
+		value = (value << 4) | hex_to_int(c);
+	}
+
+	if( i > 0 && c == '}' && pt::utf8_check_range(value) )
+	{
+		lastc = static_cast<wchar_t>(value);
+	}
+	else
+	{
+		lastc = 0xFFFD; // U+FFFD "replacement character";
+	}
+}
+
+
+
+
+
+void SpaceParser::read_unicode_code_point()
+{
+	if( parsing_space )
+	{
+		int c = read_char_no_escape();
+
+		if( c == '{' )
+		{
+			read_unicode_floating_format();
+		}
+		else
+		{
+			read_unicode_json_format(true, c);
+		}
+	}
+	else
+	{
+		read_unicode_json_format(false, 0);
+	}
 }


@@ -1087,6 +1192,10 @@ return lastc;



+
+
+
+
 } // namespace


--- a/src/space/spaceparser.h
+++ b/src/space/spaceparser.h
@@ -300,6 +300,10 @@ private:
 	void trim_last_white(std::wstring & s);
 	bool is_hex_digit(wchar_t c);
 	int  hex_to_int(wchar_t c);
+
+	bool read_unicode_four_digit_format(bool has_first_byte, int first_byte);
+	void read_unicode_json_format(bool has_first_byte, int first_byte);
+	void read_unicode_floating_format();
 	void read_unicode_code_point();

 };
--- a/src/utf8/utf8.cpp
+++ b/src/utf8/utf8.cpp
@@ -94,6 +94,43 @@ return false;



+bool is_surrogate_char(int c)
+{
+	return (c>=0xD800 && c<=0xDFFF);
+}
+
+
+bool is_first_surrogate_char(int c)
+{
+	return (c>=0xD800 && c<=0xDBFF);
+}
+
+
+bool is_second_surrogate_char(int c)
+{
+	return (c>=0xDC00 && c<=0xDFFF);
+}
+
+
+bool surrogate_pair_to_int(int c1, int c2, int & z)
+{
+	z = 0xFFFD; // U+FFFD "replacement character";
+
+	if( is_first_surrogate_char(c1) )
+	{
+		if( is_second_surrogate_char(c2) )
+		{
+			z = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF));
+			return true;
+		}
+	}
+
+	return false;
+}
+
+
+
+
 /*!
 	this function converts one UTF-8 character into one wide-character

@@ -276,6 +313,7 @@ static void int_to_wide(int c, std::wstring & res)



+
 /*!
 	this function converts an utf8 string into wide string (std::wstring)

--- a/src/utf8/utf8.h
+++ b/src/utf8/utf8.h
@@ -73,6 +73,37 @@ bool utf8_check_range(int c);
 bool utf8_check_range(int c, int how_many_bytes);


+
+/*
+ * returns true if 'c' is a character from the surrogate range
+ * (c>=0xD800 && c<=0xDFFF)
+ *
+ */
+bool is_surrogate_char(int c);
+
+
+
+/*
+ * returns true if 'c' is the first character of a surrogate pair
+ * (c>=0xD800 && c<=0xDBFF)
+ */
+bool is_first_surrogate_char(int c);
+
+
+/*
+ * returns true if 'c' is the second character of a surrogate pair
+ * (c>=0xDC00 && c<=0xDFFF)
+ */
+bool is_second_surrogate_char(int c);
+
+
+/*
+ * converts the surrogate pair (c1, c2) to a code point stored in 'z' and returns true; for an invalid pair 'z' is set to U+FFFD and false is returned
+ */
+bool surrogate_pair_to_int(int c1, int c2, int & z);
+
+
+
 /*
 *
 *
--- a/src/utf8/utf8_private.cpp
+++ b/src/utf8/utf8_private.cpp
@@ -105,13 +105,13 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
 	z = static_cast<int>(*wide_string);
 	correct = true;

-	if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
+	if( sizeof(wchar_t) == 2 && is_surrogate_char(z) )
 	{
-		if( z>=0xD800 && z<=0xDBFF && string_len>1 )
+		if( is_first_surrogate_char(z) && string_len>1 )
 		{
 			int z2 = *(wide_string+1);

-			if( z2>=0xDC00 && z2<=0xDFFF )
+			if( is_second_surrogate_char(z2) )
 			{
 				z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
 				return 2;
--- a/src/utf8/utf8_private.h
+++ b/src/utf8/utf8_private.h
@@ -48,6 +48,10 @@ bool utf8_check_range(int c);
 size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
 size_t int_to_utf8(int z, std::string & utf8, bool clear);
 size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct);
+bool is_surrogate_char(int c);
+bool is_first_surrogate_char(int c);
+bool is_second_surrogate_char(int c);
+bool surrogate_pair_to_int(int c1, int c2, int & z);


 namespace private_namespace
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -44,9 +44,9 @@

 namespace pt
 {
-bool was_error = false;
-int test_counter = 0;
-const char * test_msg = nullptr;
+	bool was_error = false;
+	int test_counter = 0;
+	const char * test_msg = nullptr;
 }