fix: correctly escape json/xml/csv wide strings
A wide string was first converted to UTF-8 and then escaped to json/xml/csv, which is incorrect: the string should be escaped first and then converted to UTF-8. Add TextStreamBase<>::iterator and TextStreamBase<>::const_iterator as classes with a method wchar_t get_unicode_and_advance(const iterator & end) that returns one character from either a UTF-8 stream or a wide stream. Make TextStreamBase<>::operator<<(wchar_t v) handle UTF-8 correctly.
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2017-2021, Tomasz Sowa
|
||||
* Copyright (c) 2017-2022, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -51,62 +51,83 @@ void SetOverflow(bool * was_overflow, bool val)
|
||||
}
|
||||
|
||||
|
||||
void esc_to_json(char val, Stream & out)
|
||||
|
||||
void esc_to_json_uformat(wchar_t val, Stream & out)
|
||||
{
|
||||
if( (unsigned char)val < 32 )
|
||||
char buf[10];
|
||||
size_t len;
|
||||
|
||||
Toa((unsigned long)val, buf, sizeof(buf)/sizeof(char), 16, &len);
|
||||
|
||||
out << "\\u";
|
||||
|
||||
if( len < 4 )
|
||||
{
|
||||
char buf[10];
|
||||
size_t len;
|
||||
Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len);
|
||||
|
||||
out << "\\u";
|
||||
|
||||
if( len < 4 )
|
||||
for(size_t i=0 ; i < (4-len) ; ++i)
|
||||
{
|
||||
for(size_t i=0 ; i < (4-len) ; ++i)
|
||||
{
|
||||
out << '0';
|
||||
}
|
||||
}
|
||||
|
||||
out << buf;
|
||||
}
|
||||
else
|
||||
{
|
||||
// CHECKME
|
||||
// \r \n \t are <32 and will be serialized os \u.... above
|
||||
|
||||
switch( val )
|
||||
{
|
||||
case 0: out << '\\'; out << '0'; break; // may to skip this character is better?
|
||||
case '\r': out << '\\'; out << 'r'; break;
|
||||
case '\n': out << '\\'; out << 'n'; break;
|
||||
case '\t': out << '\\'; out << 't'; break;
|
||||
case 0x08: out << '\\'; out << 'b'; break;
|
||||
case 0x0c: out << '\\'; out << 'f'; break;
|
||||
case '\\': out << '\\'; out << '\\'; break;
|
||||
case '"': out << '\\'; out << '\"'; break;
|
||||
default:
|
||||
out << val;
|
||||
out << '0';
|
||||
}
|
||||
}
|
||||
|
||||
out << buf;
|
||||
}
|
||||
|
||||
|
||||
void esc_to_json(wchar_t val, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
|
||||
size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
|
||||
|
||||
for(size_t a = 0 ; a < len ; ++a)
|
||||
if( val == '\r' )
|
||||
{
|
||||
esc_to_json(utf8_buf[a], out);
|
||||
out << '\\' << 'r';
|
||||
}
|
||||
else
|
||||
if( val == '\n' )
|
||||
{
|
||||
out << '\\' << 'n';
|
||||
}
|
||||
else
|
||||
if( val == '\t' )
|
||||
{
|
||||
out << '\\' << 't';
|
||||
}
|
||||
else
|
||||
if( val == 0x08 )
|
||||
{
|
||||
out << '\\' << 'b';
|
||||
}
|
||||
else
|
||||
if( val == 0x0c )
|
||||
{
|
||||
out << '\\' << 'f';
|
||||
}
|
||||
else
|
||||
if( val == '\\' )
|
||||
{
|
||||
out << '\\' << '\\';
|
||||
}
|
||||
else
|
||||
if( val == '"' )
|
||||
{
|
||||
out << '\\' << '\"';
|
||||
}
|
||||
else
|
||||
if( val < 32 )
|
||||
{
|
||||
esc_to_json_uformat(val, out);
|
||||
}
|
||||
else
|
||||
{
|
||||
out << val;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void esc_to_json(char val, Stream & out)
|
||||
{
|
||||
esc_to_json((wchar_t)(unsigned char)val, out);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void esc_to_json(const char * c, pt::Stream & out)
|
||||
{
|
||||
for(size_t i = 0 ; c[i] != 0 ; ++i)
|
||||
@@ -159,10 +180,15 @@ void esc_to_json(const std::wstring & in, Stream & out)
|
||||
|
||||
|
||||
|
||||
void esc_to_xml(char val, Stream & out)
|
||||
void esc_to_xml(wchar_t val, Stream & out)
|
||||
{
|
||||
switch(val)
|
||||
{
|
||||
case 0:
|
||||
// null character is invalid in XML 1.0 and 1.1
|
||||
// https://en.wikipedia.org/wiki/Valid_characters_in_XML
|
||||
break;
|
||||
|
||||
case '<':
|
||||
out << "<";
|
||||
break;
|
||||
@@ -182,22 +208,13 @@ void esc_to_xml(char val, Stream & out)
|
||||
default:
|
||||
out << val;
|
||||
break;
|
||||
|
||||
// what about zero (null) character?
|
||||
}
|
||||
}
|
||||
|
||||
void esc_to_xml(wchar_t val, Stream & out)
|
||||
|
||||
void esc_to_xml(char val, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
|
||||
size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
|
||||
|
||||
for(size_t a = 0 ; a < len ; ++a)
|
||||
{
|
||||
esc_to_xml(utf8_buf[a], out);
|
||||
}
|
||||
esc_to_xml((wchar_t)(unsigned char)val, out);
|
||||
}
|
||||
|
||||
|
||||
@@ -252,10 +269,14 @@ void esc_to_xml(const std::wstring & in, Stream & out)
|
||||
|
||||
|
||||
|
||||
void esc_to_csv(char c, pt::Stream & out)
|
||||
void esc_to_csv(wchar_t c, pt::Stream & out)
|
||||
{
|
||||
switch(c)
|
||||
{
|
||||
case 0:
|
||||
// null characters are invalid in text files
|
||||
break;
|
||||
|
||||
case '"':
|
||||
out << "\"\"";
|
||||
break;
|
||||
@@ -263,27 +284,16 @@ void esc_to_csv(char c, pt::Stream & out)
|
||||
default:
|
||||
out << c;
|
||||
break;
|
||||
|
||||
// what about zero (null) character?
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void esc_to_csv(wchar_t val, Stream & out)
|
||||
void esc_to_csv(char val, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
|
||||
size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
|
||||
|
||||
for(size_t a = 0 ; a < len ; ++a)
|
||||
{
|
||||
esc_to_csv(utf8_buf[a], out);
|
||||
}
|
||||
esc_to_csv((wchar_t)(unsigned char)val, out);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void esc_to_csv(const char * c, pt::Stream & out)
|
||||
{
|
||||
for(size_t i = 0 ; c[i] != 0 ; ++i)
|
||||
|
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2017-2021, Tomasz Sowa
|
||||
* Copyright (c) 2017-2022, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -50,8 +50,8 @@ namespace pt
|
||||
|
||||
void SetOverflow(bool * was_overflow, bool val);
|
||||
|
||||
void esc_to_json(char val, Stream & out);
|
||||
void esc_to_json(wchar_t val, Stream & out);
|
||||
void esc_to_json(char val, Stream & out);
|
||||
void esc_to_json(const char * c, pt::Stream & out);
|
||||
void esc_to_json(const char * c, std::size_t len, Stream & out);
|
||||
void esc_to_json(const wchar_t * c, Stream & out);
|
||||
@@ -59,8 +59,8 @@ void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
|
||||
void esc_to_json(const std::string & in, Stream & out);
|
||||
void esc_to_json(const std::wstring & in, Stream & out);
|
||||
|
||||
void esc_to_xml(char c, pt::Stream & out);
|
||||
void esc_to_xml(wchar_t c, pt::Stream & out);
|
||||
void esc_to_xml(char c, pt::Stream & out);
|
||||
void esc_to_xml(const char * c, pt::Stream & out);
|
||||
void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
|
||||
void esc_to_xml(const wchar_t * c, pt::Stream & out);
|
||||
@@ -68,8 +68,8 @@ void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
|
||||
void esc_to_xml(const std::string & in, Stream & out);
|
||||
void esc_to_xml(const std::wstring & in, Stream & out);
|
||||
|
||||
void esc_to_csv(char c, pt::Stream & out);
|
||||
void esc_to_csv(wchar_t val, Stream & out);
|
||||
void esc_to_csv(char c, pt::Stream & out);
|
||||
void esc_to_csv(const char * c, std::size_t len, Stream & out);
|
||||
void esc_to_csv(const char * c, pt::Stream & out);
|
||||
void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
|
||||
@@ -82,34 +82,13 @@ void esc_to_csv(const std::string & in, Stream & out);
|
||||
template<typename StreamType>
|
||||
void esc_to_json(const StreamType & in, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
typename StreamType::const_iterator i = in.begin();
|
||||
typename StreamType::const_iterator end = in.end();
|
||||
int res;
|
||||
bool correct;
|
||||
|
||||
for( ; i != end ; ++i)
|
||||
while( i != end )
|
||||
{
|
||||
if( in.is_wchar_stream() && out.is_char_stream() )
|
||||
{
|
||||
std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
|
||||
esc_to_json(utf8_buf, len, out);
|
||||
}
|
||||
else
|
||||
if( in.is_char_stream() && out.is_wchar_stream() )
|
||||
{
|
||||
utf8_to_int(i, end, res, correct);
|
||||
|
||||
if( correct )
|
||||
esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
|
||||
|
||||
// put replacement char if not correct?
|
||||
}
|
||||
else
|
||||
{
|
||||
esc_to_json(static_cast<wchar_t>(*i), out);
|
||||
}
|
||||
wchar_t c = i.get_unicode_and_advance(end);
|
||||
esc_to_json(c, out);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,34 +96,13 @@ void esc_to_json(const StreamType & in, Stream & out)
|
||||
template<typename StreamType>
|
||||
void esc_to_xml(const StreamType & in, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
typename StreamType::const_iterator i = in.begin();
|
||||
typename StreamType::const_iterator end = in.end();
|
||||
int res;
|
||||
bool correct;
|
||||
|
||||
for( ; i != end ; ++i)
|
||||
while( i != end )
|
||||
{
|
||||
if( in.is_wchar_stream() && out.is_char_stream() )
|
||||
{
|
||||
std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
|
||||
esc_to_xml(utf8_buf, len, out);
|
||||
}
|
||||
else
|
||||
if( in.is_char_stream() && out.is_wchar_stream() )
|
||||
{
|
||||
utf8_to_int(i, end, res, correct);
|
||||
|
||||
if( correct )
|
||||
esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
|
||||
|
||||
// put replacement char if not correct?
|
||||
}
|
||||
else
|
||||
{
|
||||
esc_to_xml(static_cast<wchar_t>(*i), out);
|
||||
}
|
||||
wchar_t c = i.get_unicode_and_advance(end);
|
||||
esc_to_xml(c, out);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,34 +110,13 @@ void esc_to_xml(const StreamType & in, Stream & out)
|
||||
template<typename StreamType>
|
||||
void esc_to_csv(const StreamType & in, Stream & out)
|
||||
{
|
||||
char utf8_buf[10];
|
||||
std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
|
||||
typename StreamType::const_iterator i = in.begin();
|
||||
typename StreamType::const_iterator end = in.end();
|
||||
int res;
|
||||
bool correct;
|
||||
|
||||
for( ; i != end ; ++i)
|
||||
while( i != end )
|
||||
{
|
||||
if( in.is_wchar_stream() && out.is_char_stream() )
|
||||
{
|
||||
std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
|
||||
esc_to_csv(utf8_buf, len, out);
|
||||
}
|
||||
else
|
||||
if( in.is_char_stream() && out.is_wchar_stream() )
|
||||
{
|
||||
utf8_to_int(i, end, res, correct);
|
||||
|
||||
if( correct )
|
||||
esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
|
||||
|
||||
// put replacement char if not correct?
|
||||
}
|
||||
else
|
||||
{
|
||||
esc_to_csv(static_cast<wchar_t>(*i), out);
|
||||
}
|
||||
wchar_t c = i.get_unicode_and_advance(end);
|
||||
esc_to_csv(c, out);
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user