improve the Space text conversion methods

Read the whole character from a multibyte string (as int/char32_t) and
then check if it needs to be escaped. Also don't use a tmp stream object
when serializing between wide/char strings.

while here:
- add try_esc_to_space(...) global function
- add wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode)
- add wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode)
This commit is contained in:
2024-06-19 04:46:00 +02:00
parent c0838de3a4
commit f85f1dade5
6 changed files with 289 additions and 155 deletions

View File

@@ -5,39 +5,42 @@
./convert/text.o: ./convert/text.h ./convert/text_private.h
./convert/double.o: ./convert/double.h textstream/textstream.h
./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h
./convert/double.o: utf8/utf8.h textstream/stream.h date/date.h
./convert/double.o: membuffer/membuffer.h textstream/types.h
./convert/double.o: convert/misc.h textstream/stream.h utf8/utf8.h
./convert/double.o: date/date.h membuffer/membuffer.h textstream/types.h
./convert/double.o: textstream/stream_private.h
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h
./convert/baseparser.o: utf8/utf8.h textstream/stream.h date/date.h
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: convert/misc.h textstream/stream.h utf8/utf8.h
./convert/baseparser.o: date/date.h membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: textstream/stream_private.h
./date/date.o: ./date/date.h convert/inttostr.h
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h
./log/filelog.o: textstream/stream.h date/date.h membuffer/membuffer.h
./log/filelog.o: textstream/types.h textstream/stream_private.h
./log/filelog.o: space/space.h convert/inttostr.h convert/misc.h
./log/filelog.o: textstream/stream.h utf8/utf8.h date/date.h
./log/filelog.o: membuffer/membuffer.h textstream/types.h
./log/filelog.o: textstream/stream_private.h
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h
./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h
./log/log.o: space/space.h convert/inttostr.h convert/misc.h
./log/log.o: textstream/stream.h utf8/utf8.h date/date.h
./log/log.o: membuffer/membuffer.h textstream/types.h
./log/log.o: textstream/stream_private.h ./log/filelog.h
./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h
./space/space.o: textstream/stream.h convert/convert.h ./convert/inttostr.h
./space/space.o: convert/patternreplacer.h textstream/textstream.h
./space/space.o: textstream/stream.h space/space.h date/date.h
./space/space.o: membuffer/membuffer.h textstream/types.h
./space/space.o: ./space/space.h convert/inttostr.h convert/misc.h
./space/space.o: textstream/stream.h utf8/utf8.h convert/convert.h
./space/space.o: ./convert/inttostr.h convert/patternreplacer.h
./space/space.o: textstream/textstream.h textstream/stream.h space/space.h
./space/space.o: date/date.h membuffer/membuffer.h textstream/types.h
./space/space.o: textstream/stream_private.h convert/strtoint.h
./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./space/spaceparser.o: convert/baseparser.h textstream/textstream.h
./space/spaceparser.o: textstream/stream.h space/space.h date/date.h
./space/spaceparser.o: membuffer/membuffer.h textstream/types.h
./space/spaceparser.o: textstream/stream_private.h convert/strtoint.h
./space/spaceparser.o: ./convert/text.h ./convert/misc.h
./space/spaceparser.o: convert/inttostr.h convert/misc.h textstream/stream.h
./space/spaceparser.o: utf8/utf8.h convert/baseparser.h
./space/spaceparser.o: textstream/textstream.h textstream/stream.h
./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
./space/spaceparser.o: textstream/types.h textstream/stream_private.h
./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h
./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./space/keyvalueparser.o: convert/inttostr.h convert/misc.h
./space/keyvalueparser.o: textstream/stream.h utf8/utf8.h
./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h
./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h
./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h
@@ -46,22 +49,23 @@
./textstream/stream_private.o: textstream/stream_private.h
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h
./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h
./csv/csvparser.o: utf8/utf8.h textstream/stream.h convert/baseparser.h
./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h
./csv/csvparser.o: membuffer/membuffer.h textstream/types.h
./csv/csvparser.o: textstream/stream_private.h
./csv/csvparser.o: convert/misc.h textstream/stream.h utf8/utf8.h
./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
./csv/csvparser.o: textstream/types.h textstream/stream_private.h
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h
./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h
./mainoptions/mainoptionsparser.o: convert/misc.h textstream/stream.h
./mainoptions/mainoptionsparser.o: utf8/utf8.h
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h
./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h date/date.h
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
./html/bbcodeparser.o: convert/misc.h textstream/stream.h utf8/utf8.h
./html/bbcodeparser.o: date/date.h membuffer/membuffer.h textstream/types.h
./html/bbcodeparser.o: textstream/stream_private.h
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
./html/htmlparser.o: textstream/textstream.h textstream/stream.h
./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h
./html/htmlparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
./html/htmlparser.o: textstream/types.h textstream/stream_private.h
./html/htmlparser.o: convert/text.h
./html/htmlparser.o: space/space.h convert/inttostr.h convert/misc.h
./html/htmlparser.o: textstream/stream.h utf8/utf8.h date/date.h
./html/htmlparser.o: membuffer/membuffer.h textstream/types.h
./html/htmlparser.o: textstream/stream_private.h convert/text.h

View File

@@ -525,6 +525,58 @@ bool try_esc_to_html(char32_t c, pt::Stream & out)
}
/*
 * Writes the Space-format escape sequence for c to out, if c has one.
 *
 * Characters with an escape sequence: NUL (written as \u{0}), carriage
 * return (\r), line feed (\n), backslash (\\), double quote (\"),
 * backspace (\b) and form feed (\f).
 *
 * Returns true when an escape sequence was written. Returns false and
 * writes nothing for every other character.
 */
bool try_esc_to_space(char32_t c, pt::Stream & out)
{
	char escape_letter = 0;

	switch(c)
	{
	case 0:
		// NUL has no one-letter escape, it is written in the \u{...} form
		out << '\\';
		out << 'u' << '{' << '0' << '}';
		return true;

	case '\r': // 13
		escape_letter = 'r';
		break;

	case '\n': // 10
		escape_letter = 'n';
		break;

	case '\\':
		escape_letter = '\\';
		break;

	case '"':
		escape_letter = '\"';
		break;

	case '\b': // 8
		escape_letter = 'b';
		break;

	case '\f': // 12
		escape_letter = 'f';
		break;

	default:
		return false;
	}

	out << '\\';
	out << escape_letter;

	return true;
}
}

View File

@@ -76,6 +76,7 @@ void esc_to_csv(const std::wstring & in, Stream & out);
bool try_esc_to_tex(char32_t c, pt::Stream & out);
bool try_esc_to_html(char32_t c, pt::Stream & out);
bool try_esc_to_space(char32_t c, pt::Stream & out);
template<typename StreamType>

View File

@@ -43,6 +43,7 @@
#include <cwchar>
#include <errno.h>
#include "convert/inttostr.h"
#include "convert/misc.h"
#include "utf8/utf8.h"
@@ -844,97 +845,12 @@ protected:
}
/*
 * Writes c to out using Space-format escaping.
 *
 * NUL is written as \u{0}; carriage return, line feed, backslash, double
 * quote, backspace and form feed are written as a backslash followed by
 * r, n, \, ", b and f respectively. Any other value is cast to the
 * stream's char_type and written unchanged.
 */
template<typename StreamType>
void escape_to_space_format(int c, StreamType & out) const
{
	if( c == 0 )
	{
		out << '\\';
		out << 'u' << '{' << '0' << '}';
		return;
	}

	char escape_letter = 0;

	switch(c)
	{
	case '\r': escape_letter = 'r';  break; // 13
	case '\n': escape_letter = 'n';  break; // 10
	case '\\': escape_letter = '\\'; break;
	case '"':  escape_letter = '\"'; break;
	case '\b': escape_letter = 'b';  break; // 8
	case '\f': escape_letter = 'f';  break; // 12
	default:   break;
	}

	if( escape_letter != 0 )
	{
		out << '\\';
		out << escape_letter;
	}
	else
	{
		out << static_cast<typename StreamType::char_type>(c);
	}
}
/*
 * Writes c to out using JSON escaping.
 *
 * NUL is written as \u0000; carriage return, line feed, backslash, double
 * quote, tab, backspace and form feed are written as a backslash followed
 * by r, n, \, ", t, b and f respectively. Any other value is cast to the
 * stream's char_type and written unchanged.
 */
template<typename StreamType>
void escape_to_json_format(int c, StreamType & out) const
{
	if( c == 0 )
	{
		out << '\\';
		out << 'u' << '0' << '0' << '0' << '0';
		return;
	}

	char escape_letter = 0;

	switch(c)
	{
	case '\r': escape_letter = 'r';  break; // 13
	case '\n': escape_letter = 'n';  break; // 10
	case '\\': escape_letter = '\\'; break;
	case '"':  escape_letter = '\"'; break;
	case '\t': escape_letter = 't';  break; // 9
	case '\b': escape_letter = 'b';  break; // 8
	case '\f': escape_letter = 'f';  break; // 12
	default:   break;
	}

	if( escape_letter != 0 )
	{
		out << '\\';
		out << escape_letter;
	}
	else
	{
		out << static_cast<typename StreamType::char_type>(c);
	}
}
/*
 * Copies a zero-terminated string to out_str one element at a time.
 *
 * Depending on escape each element is either cast to the stream's
 * char_type and written as is (Escape::no_escape), passed through
 * escape_to_space_format (Escape::escape_space) or passed through
 * escape_to_json_format (Escape::escape_json). For any other value of
 * escape nothing is written.
 */
template<typename CharT, typename StreamType>
void copy_input_string_to_output(const CharT * input_str, StreamType & out_str, Escape escape) const
{
	for( ; *input_str ; input_str += 1)
	{
		switch(escape)
		{
		case Escape::no_escape:
			out_str << static_cast<typename StreamType::char_type>(*input_str);
			break;

		case Escape::escape_space:
			escape_to_space_format(*input_str, out_str);
			break;

		case Escape::escape_json:
			escape_to_json_format(*input_str, out_str);
			break;

		default:
			break;
		}
	}
}
template<typename CharT, typename StreamType>
void copy_input_string_to_output(const CharT * input_str, size_t len, StreamType & out_str, Escape escape) const
void copy_string_directly(const CharT * input_str, size_t len, StreamType & out_str) const
{
for(size_t i=0 ; i < len ; ++i)
{
if( escape == Escape::no_escape )
out_str << static_cast<typename StreamType::char_type>(input_str[i]);
else
if( escape == Escape::escape_space )
escape_to_space_format(input_str[i], out_str);
else
if( escape == Escape::escape_json )
escape_to_json_format(input_str[i], out_str);
}
}
template<typename StreamType>
void copy_input_stream_to_output(const StreamType & input_str, StreamType & out_str, Escape escape) const
{
typename StreamType::const_iterator i = input_str.begin();
while( i != input_str.end() )
{
if( escape == Escape::no_escape )
out_str << static_cast<typename StreamType::char_type>(*i);
else
if( escape == Escape::escape_space )
escape_to_space_format(*i, out_str);
else
if( escape == Escape::escape_json )
escape_to_json_format(*i, out_str);
++i;
out_str << input_str[i];
}
}
@@ -942,17 +858,36 @@ protected:
template<typename StreamType>
void serialize_string_buffer(const char * input_str, size_t len, StreamType & out_str, Escape escape) const
{
if constexpr ( sizeof(char) == sizeof(typename StreamType::char_type) )
if( escape == Escape::no_escape )
{
// input and output are char (we assume it is utf8)
copy_input_string_to_output(input_str, len, out_str, escape);
if( out_str.is_char_stream() )
{
copy_string_directly(input_str, len, out_str);
}
else
{
utf8_to_wide(input_str, len, out_str, false);
}
}
else
if( escape == Escape::escape_space )
{
// input is utf8 but output is wide
StreamType temp_stream;
utf8_to_wide(input_str, len, temp_stream, false);
copy_input_stream_to_output(temp_stream, out_str, escape);
utf8_to_output_function(input_str, len, [&](int c){
if( !try_esc_to_space(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
else
if( escape == Escape::escape_json )
{
utf8_to_output_function(input_str, len, [&](int c){
if( !try_esc_to_json(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
}
@@ -960,17 +895,36 @@ protected:
template<typename StreamType>
void serialize_string_buffer(const wchar_t * input_str, size_t len, StreamType & out_str, Escape escape) const
{
if constexpr ( sizeof(wchar_t) == sizeof(typename StreamType::char_type) )
if( escape == Escape::no_escape )
{
// input and output are wide characters
copy_input_string_to_output(input_str, len, out_str, escape);
if( out_str.is_char_stream() )
{
wide_to_utf8(input_str, len, out_str);
}
else
{
copy_string_directly(input_str, len, out_str);
}
}
else
if( escape == Escape::escape_space )
{
// input is wide but output is utf8
StreamType temp_stream;
wide_to_utf8(input_str, len, temp_stream, false);
copy_input_stream_to_output(temp_stream, out_str, escape);
wide_to_output_function(input_str, len, [&](int c){
if( !try_esc_to_space(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
else
if( escape == Escape::escape_json )
{
wide_to_output_function(input_str, len, [&](int c){
if( !try_esc_to_json(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
}
@@ -978,17 +932,29 @@ protected:
template<typename StreamType>
void serialize_string_buffer(const wchar_t * input_str, StreamType & out_str, Escape escape) const
{
if constexpr ( sizeof(wchar_t) == sizeof(typename StreamType::char_type) )
if( escape == Escape::no_escape )
{
// input and output are wide characters
copy_input_string_to_output(input_str, out_str, escape);
out_str << input_str;
}
else
if( escape == Escape::escape_space )
{
// input is wide but output is utf8
StreamType temp_stream;
wide_to_utf8(input_str, temp_stream, false);
copy_input_stream_to_output(temp_stream, out_str, escape);
wide_to_output_function(input_str, [&](int c){
if( !try_esc_to_space(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
else
if( escape == Escape::escape_json )
{
wide_to_output_function(input_str, [&](int c){
if( !try_esc_to_json(static_cast<char32_t>(c), out_str) )
{
out_str << static_cast<char32_t>(c);
}
});
}
}
@@ -1108,7 +1074,7 @@ protected:
bool quote_field = should_field_be_quoted(map_item.first);
print_if(quote_field, str, '"');
serialize_string_buffer(map_item.first.c_str(), str, Escape::escape_space);
serialize_string_buffer(map_item.first.c_str(), map_item.first.size(), str, Escape::escape_space);
print_if(quote_field, str, '"');
print_if(pretty_print, str, ' ');
@@ -1272,7 +1238,7 @@ protected:
}
str << '"';
serialize_string_buffer(map_item.first.c_str(), str, Escape::escape_json);
serialize_string_buffer(map_item.first.c_str(), map_item.first.size(), str, Escape::escape_json);
str << '"';
str << ':';
print_if(pretty_print, str, ' ');

View File

@@ -253,6 +253,12 @@ template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8);
// Walks the first len elements of str and calls output_function(int) once
// per decoded character. When wchar_t is two bytes wide a leading surrogate
// is combined with the following element into a single value.
// If an element cannot be decoded, output_function is still called for it
// only when mode == 1 (the default).
// Returns true if every element was decoded, false otherwise.
template<typename OutputFunction>
bool wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode = 1);
// Same as the overload above, but str is read up to its terminating zero
// instead of for a given length.
template<typename OutputFunction>
bool wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode = 1);
template<typename StreamType, typename OutputFunction>
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1);
@@ -541,7 +547,7 @@ bool correct, was_error = false;
}
else
{
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
len = utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
}
if( !correct )
@@ -565,6 +571,109 @@ return !was_error;
/*
 * Decodes the first len elements of a wide string and hands each decoded
 * character to output_function as an int.
 *
 * When wchar_t is two bytes wide, an element recognised by
 * is_first_surrogate_char() is combined with the next element through
 * surrogate_pair_to_int(). If there is no next element, or the pair cannot
 * be combined, the leading element is treated as an error; the following
 * element (if any) is not consumed and is examined again on the next pass.
 * Every other element is passed through unchanged.
 * NOTE(review): a lone trailing surrogate therefore seems to be passed
 * through as a valid character - confirm this is intended.
 *
 * On an error output_function is called only when mode == 1. The value
 * passed starts out as 0xFFFD (U+FFFD "replacement character"); whether
 * surrogate_pair_to_int() changes it on failure is not visible here.
 *
 * Returns true if no error occurred, false otherwise.
 */
template<typename OutputFunction>
bool wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode)
{
	bool all_ok = true;

	for(size_t pos = 0 ; pos < len ; )
	{
		int code_point = 0xFFFD; // U+FFFD "replacement character"
		bool decoded = false;
		wchar_t lead = str[pos];

		// the leading element is always consumed
		pos += 1;

		if( sizeof(wchar_t) != 2 || !is_first_surrogate_char(lead) )
		{
			code_point = lead;
			decoded = true;
		}
		else
		if( pos < len )
		{
			wchar_t trail = str[pos];

			if( surrogate_pair_to_int(lead, trail, code_point) )
			{
				decoded = true;
				pos += 1;
			}
			else
			{
				all_ok = false;
			}
		}
		else
		{
			// a leading surrogate at the very end of the input
			all_ok = false;
		}

		if( decoded || mode == 1 )
		{
			output_function(code_point);
		}
	}

	return all_ok;
}
/*
 * Decodes a zero-terminated wide string and hands each decoded character
 * to output_function as an int.
 *
 * When wchar_t is two bytes wide, an element recognised by
 * is_first_surrogate_char() is combined with the next element through
 * surrogate_pair_to_int(). If the string ends right after it, or the pair
 * cannot be combined, the leading element is treated as an error; the
 * following element (if any) is not consumed and is examined again on the
 * next pass. Every other element is passed through unchanged.
 * NOTE(review): a lone trailing surrogate therefore seems to be passed
 * through as a valid character - confirm this is intended.
 *
 * On an error output_function is called only when mode == 1. The value
 * passed starts out as 0xFFFD (U+FFFD "replacement character"); whether
 * surrogate_pair_to_int() changes it on failure is not visible here.
 *
 * Returns true if no error occurred, false otherwise.
 */
template<typename OutputFunction>
bool wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode)
{
	bool all_ok = true;

	while( *str )
	{
		int code_point = 0xFFFD; // U+FFFD "replacement character"
		bool decoded = false;
		wchar_t lead = *str;

		// the leading element is always consumed
		++str;

		if( sizeof(wchar_t) != 2 || !is_first_surrogate_char(lead) )
		{
			code_point = lead;
			decoded = true;
		}
		else
		if( *str )
		{
			wchar_t trail = *str;

			if( surrogate_pair_to_int(lead, trail, code_point) )
			{
				decoded = true;
				++str;
			}
			else
			{
				all_ok = false;
			}
		}
		else
		{
			// a leading surrogate directly before the terminating zero
			all_ok = false;
		}

		if( decoded || mode == 1 )
		{
			output_function(code_point);
		}
	}

	return all_ok;
}
template<typename StreamType, typename OutputFunction>
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
{
@@ -866,6 +975,7 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
}
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{

View File

@@ -3,24 +3,24 @@
./convert.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h
./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
./convert.o: ../src/textstream/stream.h ../src/space/space.h
./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./convert.o: ../src/textstream/stream.h ../src/date/date.h
./convert.o: ../src/convert/inttostr.h ../src/convert/misc.h
./convert.o: ../src/textstream/stream.h ../src/utf8/utf8.h ../src/date/date.h
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
./convert.o: ../src/convert/text.h ../src/convert/misc.h
./convert.o: ../src/convert/double.h test.h
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./csvparser.o: ../src/textstream/stream.h ../src/convert/baseparser.h
./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h
./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h
./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h
./csvparser.o: test.h
./csvparser.o: ../src/convert/inttostr.h ../src/convert/misc.h
./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8.h
./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./csvparser.o: ../src/textstream/stream_private.h test.h
./main.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h
./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
./main.o: ../src/textstream/stream.h ../src/space/space.h
./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./main.o: ../src/textstream/stream.h ../src/date/date.h
./main.o: ../src/convert/inttostr.h ../src/convert/misc.h
./main.o: ../src/textstream/stream.h ../src/utf8/utf8.h ../src/date/date.h
./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
./main.o: ../src/convert/text.h ../src/convert/misc.h ../src/convert/double.h
@@ -29,8 +29,9 @@
./mainoptionsparser.o: mainoptionsparser.h test.h
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h
./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
./mainoptionsparser.o: ../src/convert/convert.h ../src/convert/inttostr.h
./mainoptionsparser.o: ../src/convert/misc.h ../src/textstream/stream.h
./mainoptionsparser.o: ../src/utf8/utf8.h ../src/convert/convert.h
./mainoptionsparser.o: ../src/convert/inttostr.h
./mainoptionsparser.o: ../src/convert/patternreplacer.h
./mainoptionsparser.o: ../src/textstream/textstream.h
./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h