From f85f1dade5b629fb29313b91cf78d9722b163d71 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Wed, 19 Jun 2024 04:46:00 +0200 Subject: [PATCH] improve the Space text conversion methods Read the whole character from a multibyte string (as int/char32_t) and then check if it needs to be escaped. Also don't use a tmp stream object when serializing between wide/char strings. While here: - add try_esc_to_space(...) global function - add wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode) - add wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode) --- src/Makefile.dep | 68 ++++++++-------- src/convert/misc.cpp | 52 ++++++++++++ src/convert/misc.h | 1 + src/space/space.h | 186 ++++++++++++++++++------------------------- src/utf8/utf8.h | 112 +++++++++++++++++++++++++- tests/Makefile.dep | 25 +++--- 6 files changed, 289 insertions(+), 155 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 879cc07..89500a4 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -5,39 +5,42 @@ ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h -./convert/double.o: utf8/utf8.h textstream/stream.h date/date.h -./convert/double.o: membuffer/membuffer.h textstream/types.h +./convert/double.o: convert/misc.h textstream/stream.h utf8/utf8.h +./convert/double.o: date/date.h membuffer/membuffer.h textstream/types.h ./convert/double.o: textstream/stream_private.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h ./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h -./convert/baseparser.o: utf8/utf8.h textstream/stream.h date/date.h -./convert/baseparser.o: membuffer/membuffer.h textstream/types.h +./convert/baseparser.o: convert/misc.h textstream/stream.h utf8/utf8.h +./convert/baseparser.o: date/date.h 
membuffer/membuffer.h textstream/types.h ./convert/baseparser.o: textstream/stream_private.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h -./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h -./log/filelog.o: textstream/stream.h date/date.h membuffer/membuffer.h -./log/filelog.o: textstream/types.h textstream/stream_private.h +./log/filelog.o: space/space.h convert/inttostr.h convert/misc.h +./log/filelog.o: textstream/stream.h utf8/utf8.h date/date.h +./log/filelog.o: membuffer/membuffer.h textstream/types.h +./log/filelog.o: textstream/stream_private.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h -./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h -./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h +./log/log.o: space/space.h convert/inttostr.h convert/misc.h +./log/log.o: textstream/stream.h utf8/utf8.h date/date.h +./log/log.o: membuffer/membuffer.h textstream/types.h ./log/log.o: textstream/stream_private.h ./log/filelog.h -./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h -./space/space.o: textstream/stream.h convert/convert.h ./convert/inttostr.h -./space/space.o: convert/patternreplacer.h textstream/textstream.h -./space/space.o: textstream/stream.h space/space.h date/date.h -./space/space.o: membuffer/membuffer.h textstream/types.h +./space/space.o: ./space/space.h convert/inttostr.h convert/misc.h +./space/space.o: textstream/stream.h utf8/utf8.h convert/convert.h +./space/space.o: ./convert/inttostr.h convert/patternreplacer.h +./space/space.o: textstream/textstream.h textstream/stream.h space/space.h +./space/space.o: date/date.h membuffer/membuffer.h textstream/types.h ./space/space.o: textstream/stream_private.h convert/strtoint.h ./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h -./space/spaceparser.o: 
convert/inttostr.h utf8/utf8.h textstream/stream.h -./space/spaceparser.o: convert/baseparser.h textstream/textstream.h -./space/spaceparser.o: textstream/stream.h space/space.h date/date.h -./space/spaceparser.o: membuffer/membuffer.h textstream/types.h -./space/spaceparser.o: textstream/stream_private.h convert/strtoint.h -./space/spaceparser.o: ./convert/text.h ./convert/misc.h +./space/spaceparser.o: convert/inttostr.h convert/misc.h textstream/stream.h +./space/spaceparser.o: utf8/utf8.h convert/baseparser.h +./space/spaceparser.o: textstream/textstream.h textstream/stream.h +./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h +./space/spaceparser.o: textstream/types.h textstream/stream_private.h +./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h -./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./space/keyvalueparser.o: convert/inttostr.h convert/misc.h +./space/keyvalueparser.o: textstream/stream.h utf8/utf8.h ./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h ./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h ./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h @@ -46,22 +49,23 @@ ./textstream/stream_private.o: textstream/stream_private.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h -./csv/csvparser.o: utf8/utf8.h textstream/stream.h convert/baseparser.h -./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h -./csv/csvparser.o: membuffer/membuffer.h textstream/types.h -./csv/csvparser.o: textstream/stream_private.h +./csv/csvparser.o: convert/misc.h textstream/stream.h utf8/utf8.h +./csv/csvparser.o: convert/baseparser.h textstream/textstream.h +./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h +./csv/csvparser.o: textstream/types.h 
textstream/stream_private.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h -./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h +./mainoptions/mainoptionsparser.o: convert/misc.h textstream/stream.h +./mainoptions/mainoptionsparser.o: utf8/utf8.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h ./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h -./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h date/date.h -./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h +./html/bbcodeparser.o: convert/misc.h textstream/stream.h utf8/utf8.h +./html/bbcodeparser.o: date/date.h membuffer/membuffer.h textstream/types.h ./html/bbcodeparser.o: textstream/stream_private.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h -./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h -./html/htmlparser.o: textstream/stream.h date/date.h membuffer/membuffer.h -./html/htmlparser.o: textstream/types.h textstream/stream_private.h -./html/htmlparser.o: convert/text.h +./html/htmlparser.o: space/space.h convert/inttostr.h convert/misc.h +./html/htmlparser.o: textstream/stream.h utf8/utf8.h date/date.h +./html/htmlparser.o: membuffer/membuffer.h textstream/types.h +./html/htmlparser.o: textstream/stream_private.h convert/text.h diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp index 3d4099a..0e94316 100644 --- a/src/convert/misc.cpp +++ b/src/convert/misc.cpp @@ -525,6 +525,58 @@ bool try_esc_to_html(char32_t c, pt::Stream & out) } +bool try_esc_to_space(char32_t c, pt::Stream & out) +{ + bool status = false; + + switch(c) + { + case 0: + out << '\\'; + out << 'u' << '{' << '0' << '}'; + status = true; + break; + + case '\r': // 13 + out << '\\'; + out << 'r'; + status = true; + 
break; + + case '\n': // 10 + out << '\\'; + out << 'n'; + status = true; + break; + + case '\\': + out << '\\'; + out << '\\'; + status = true; + break; + + case '"': + out << '\\'; + out << '\"'; + status = true; + break; + + case '\b': // 8 + out << '\\'; + out << 'b'; + status = true; + break; + + case '\f': // 12 + out << '\\'; + out << 'f'; + status = true; + break; + } + + return status; +} + } diff --git a/src/convert/misc.h b/src/convert/misc.h index e8729d1..2d29da3 100644 --- a/src/convert/misc.h +++ b/src/convert/misc.h @@ -76,6 +76,7 @@ void esc_to_csv(const std::wstring & in, Stream & out); bool try_esc_to_tex(char32_t c, pt::Stream & out); bool try_esc_to_html(char32_t c, pt::Stream & out); +bool try_esc_to_space(char32_t c, pt::Stream & out); template diff --git a/src/space/space.h b/src/space/space.h index 4c38b6e..0de7fd5 100644 --- a/src/space/space.h +++ b/src/space/space.h @@ -43,6 +43,7 @@ #include #include #include "convert/inttostr.h" +#include "convert/misc.h" #include "utf8/utf8.h" @@ -844,97 +845,12 @@ protected: } - - - template - void escape_to_space_format(int c, StreamType & out) const - { - switch(c) - { - case 0: out << '\\'; out << 'u' << '{' << '0' << '}'; break; - case '\r': out << '\\'; out << 'r'; break; // 13 - case '\n': out << '\\'; out << 'n'; break; // 10 - case '\\': out << '\\'; out << '\\'; break; - case '"': out << '\\'; out << '\"'; break; - case '\b': out << '\\'; out << 'b'; break; // 8 - case '\f': out << '\\'; out << 'f'; break; // 12 - default: - out << static_cast(c); - } - } - - - template - void escape_to_json_format(int c, StreamType & out) const - { - switch(c) - { - case 0: out << '\\'; out << 'u' << '0' << '0' << '0' << '0'; break; - case '\r': out << '\\'; out << 'r'; break; // 13 - case '\n': out << '\\'; out << 'n'; break; // 10 - case '\\': out << '\\'; out << '\\'; break; - case '"': out << '\\'; out << '\"'; break; - case '\t': out << '\\'; out << 't'; break; // 9 - case '\b': out << '\\'; out << 
'b'; break; // 8 - case '\f': out << '\\'; out << 'f'; break; // 12 - default: - out << static_cast(c); - } - } - - - template - void copy_input_string_to_output(const CharT * input_str, StreamType & out_str, Escape escape) const - { - while( *input_str ) - { - if( escape == Escape::no_escape ) - out_str << static_cast(*input_str); - else - if( escape == Escape::escape_space ) - escape_to_space_format(*input_str, out_str); - else - if( escape == Escape::escape_json ) - escape_to_json_format(*input_str, out_str); - - input_str += 1; - } - } - - template - void copy_input_string_to_output(const CharT * input_str, size_t len, StreamType & out_str, Escape escape) const + void copy_string_directly(const CharT * input_str, size_t len, StreamType & out_str) const { for(size_t i=0 ; i < len ; ++i) { - if( escape == Escape::no_escape ) - out_str << static_cast(input_str[i]); - else - if( escape == Escape::escape_space ) - escape_to_space_format(input_str[i], out_str); - else - if( escape == Escape::escape_json ) - escape_to_json_format(input_str[i], out_str); - } - } - - template - void copy_input_stream_to_output(const StreamType & input_str, StreamType & out_str, Escape escape) const - { - typename StreamType::const_iterator i = input_str.begin(); - - while( i != input_str.end() ) - { - if( escape == Escape::no_escape ) - out_str << static_cast(*i); - else - if( escape == Escape::escape_space ) - escape_to_space_format(*i, out_str); - else - if( escape == Escape::escape_json ) - escape_to_json_format(*i, out_str); - - ++i; + out_str << input_str[i]; } } @@ -942,17 +858,36 @@ protected: template void serialize_string_buffer(const char * input_str, size_t len, StreamType & out_str, Escape escape) const { - if constexpr ( sizeof(char) == sizeof(typename StreamType::char_type) ) + if( escape == Escape::no_escape ) { - // input and output are char (we assume it is utf8) - copy_input_string_to_output(input_str, len, out_str, escape); + if( out_str.is_char_stream() ) + { + 
copy_string_directly(input_str, len, out_str); + } + else + { + utf8_to_wide(input_str, len, out_str, false); + } } else + if( escape == Escape::escape_space ) { - // input is utf8 but output is wide - StreamType temp_stream; - utf8_to_wide(input_str, len, temp_stream, false); - copy_input_stream_to_output(temp_stream, out_str, escape); + utf8_to_output_function(input_str, len, [&](int c){ + if( !try_esc_to_space(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); + } + else + if( escape == Escape::escape_json ) + { + utf8_to_output_function(input_str, len, [&](int c){ + if( !try_esc_to_json(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); } } @@ -960,17 +895,36 @@ protected: template void serialize_string_buffer(const wchar_t * input_str, size_t len, StreamType & out_str, Escape escape) const { - if constexpr ( sizeof(wchar_t) == sizeof(typename StreamType::char_type) ) + if( escape == Escape::no_escape ) { - // input and output are wide characters - copy_input_string_to_output(input_str, len, out_str, escape); + if( out_str.is_char_stream() ) + { + wide_to_utf8(input_str, len, out_str); + } + else + { + copy_string_directly(input_str, len, out_str); + } } else + if( escape == Escape::escape_space ) { - // input is wide but output is utf8 - StreamType temp_stream; - wide_to_utf8(input_str, len, temp_stream, false); - copy_input_stream_to_output(temp_stream, out_str, escape); + wide_to_output_function(input_str, len, [&](int c){ + if( !try_esc_to_space(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); + } + else + if( escape == Escape::escape_json ) + { + wide_to_output_function(input_str, len, [&](int c){ + if( !try_esc_to_json(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); } } @@ -978,17 +932,29 @@ protected: template void serialize_string_buffer(const wchar_t * input_str, StreamType & out_str, Escape escape) const { - if constexpr ( sizeof(wchar_t) == sizeof(typename 
StreamType::char_type) ) + if( escape == Escape::no_escape ) { - // input and output are wide characters - copy_input_string_to_output(input_str, out_str, escape); + out_str << input_str; } else + if( escape == Escape::escape_space ) { - // input is wide but output is utf8 - StreamType temp_stream; - wide_to_utf8(input_str, temp_stream, false); - copy_input_stream_to_output(temp_stream, out_str, escape); + wide_to_output_function(input_str, [&](int c){ + if( !try_esc_to_space(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); + } + else + if( escape == Escape::escape_json ) + { + wide_to_output_function(input_str, [&](int c){ + if( !try_esc_to_json(static_cast(c), out_str) ) + { + out_str << static_cast(c); + } + }); } } @@ -1108,7 +1074,7 @@ protected: bool quote_field = should_field_be_quoted(map_item.first); print_if(quote_field, str, '"'); - serialize_string_buffer(map_item.first.c_str(), str, Escape::escape_space); + serialize_string_buffer(map_item.first.c_str(), map_item.first.size(), str, Escape::escape_space); print_if(quote_field, str, '"'); print_if(pretty_print, str, ' '); @@ -1272,7 +1238,7 @@ protected: } str << '"'; - serialize_string_buffer(map_item.first.c_str(), str, Escape::escape_json); + serialize_string_buffer(map_item.first.c_str(), map_item.first.size(), str, Escape::escape_json); str << '"'; str << ':'; print_if(pretty_print, str, ' '); diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index f3a004f..c2d3f90 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -253,6 +253,12 @@ template size_t int_to_utf8(int z, StreamType & utf8); +template +bool wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode = 1); + +template +bool wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode = 1); + template bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1); @@ -541,7 +547,7 @@ bool correct, was_error = false; } else { 
- len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero + len = utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero } if( !correct ) @@ -565,6 +571,109 @@ return !was_error; +template +bool wide_to_output_function(const wchar_t * str, size_t len, OutputFunction output_function, int mode) +{ + bool was_error = false; + size_t index = 0; + + while( index < len ) + { + int c = 0xFFFD; // U+FFFD "replacement character"; + bool is_correct_char = false; + wchar_t w1 = str[index]; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) + { + ++index; + + if( index < len ) + { + wchar_t w2 = str[index]; + + if( surrogate_pair_to_int(w1, w2, c) ) + { + is_correct_char = true; + ++index; + } + else + { + was_error = true; + } + } + else + { + was_error = true; + } + } + else + { + c = w1; + is_correct_char = true; + ++index; + } + + if( is_correct_char || mode == 1 ) + { + output_function(c); + } + } + + return !was_error; +} + + +template +bool wide_to_output_function(const wchar_t * str, OutputFunction output_function, int mode) +{ + bool was_error = false; + + while( *str ) + { + int c = 0xFFFD; // U+FFFD "replacement character"; + bool is_correct_char = false; + wchar_t w1 = *str; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) + { + ++str; + + if( *str ) + { + wchar_t w2 = *str; + + if( surrogate_pair_to_int(w1, w2, c) ) + { + is_correct_char = true; + ++str; + } + else + { + was_error = true; + } + } + else + { + was_error = true; + } + } + else + { + c = w1; + is_correct_char = true; + ++str; + } + + if( is_correct_char || mode == 1 ) + { + output_function(c); + } + } + + return !was_error; +} + + template bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode) { @@ -866,6 +975,7 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i } + template bool utf8_to_output_function(StreamIteratorType & iterator_in, const 
StreamIteratorType & iterator_end, OutputFunction output_function, int mode) { diff --git a/tests/Makefile.dep b/tests/Makefile.dep index 9d4cb79..5ec1105 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -3,24 +3,24 @@ ./convert.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h ./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./convert.o: ../src/textstream/stream.h ../src/space/space.h -./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./convert.o: ../src/textstream/stream.h ../src/date/date.h +./convert.o: ../src/convert/inttostr.h ../src/convert/misc.h +./convert.o: ../src/textstream/stream.h ../src/utf8/utf8.h ../src/date/date.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h ./convert.o: ../src/convert/text.h ../src/convert/misc.h ./convert.o: ../src/convert/double.h test.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h -./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./csvparser.o: ../src/textstream/stream.h ../src/convert/baseparser.h -./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h -./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h -./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h -./csvparser.o: test.h +./csvparser.o: ../src/convert/inttostr.h ../src/convert/misc.h +./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8.h +./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h +./csvparser.o: ../src/textstream/stream.h ../src/date/date.h +./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h +./csvparser.o: ../src/textstream/stream_private.h test.h ./main.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h ./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./main.o: ../src/textstream/stream.h ../src/space/space.h -./main.o: 
../src/convert/inttostr.h ../src/utf8/utf8.h -./main.o: ../src/textstream/stream.h ../src/date/date.h +./main.o: ../src/convert/inttostr.h ../src/convert/misc.h +./main.o: ../src/textstream/stream.h ../src/utf8/utf8.h ../src/date/date.h ./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h ./main.o: ../src/convert/text.h ../src/convert/misc.h ../src/convert/double.h @@ -29,8 +29,9 @@ ./mainoptionsparser.o: mainoptionsparser.h test.h ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h ./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h -./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h -./mainoptionsparser.o: ../src/convert/convert.h ../src/convert/inttostr.h +./mainoptionsparser.o: ../src/convert/misc.h ../src/textstream/stream.h +./mainoptionsparser.o: ../src/utf8/utf8.h ../src/convert/convert.h +./mainoptionsparser.o: ../src/convert/inttostr.h ./mainoptionsparser.o: ../src/convert/patternreplacer.h ./mainoptionsparser.o: ../src/textstream/textstream.h ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h