From 6b97b1b74acc1d7c89f8426218cec867380f1a3f Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Thu, 3 Feb 2022 19:08:21 +0100 Subject: [PATCH] fix: correctly escape json/xml/csv wide strings A wide string was first changed to utf-8 and then escaped to json/xml/csv which is incorrect. The string should first be escaped and only then changed to utf-8. Add TextStreamBase<>::iterator and TextStreamBase<>::const_iterator as classes with a method wchar_t get_unicode_and_advance(const iterator & end) to return one character either from a utf-8 stream or from a wide stream. Let TextStreamBase<>::operator<<(wchar_t v) correctly use utf-8. --- src/Makefile.dep | 48 ++--- src/convert/misc.cpp | 148 ++++++++------- src/convert/misc.h | 89 ++------- src/textstream/textstream.h | 359 ++++++++++++++++++++++++++++++++++-- src/utf8/utf8_stream.h | 4 +- tests/Makefile.dep | 2 +- 6 files changed, 466 insertions(+), 184 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 7dbbb8e..b037dab 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -13,6 +13,7 @@ ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: membuffer/membuffer.h textstream/types.h +./convert/double.o: utf8/utf8_stream.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h ./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h ./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h @@ -24,19 +25,19 @@ ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h ./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./log/filelog.o: textstream/types.h +./log/filelog.o: textstream/types.h utf8/utf8_stream.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h ./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h 
./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h ./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h -./log/log.o: ./log/filelog.h +./log/log.o: utf8/utf8_stream.h ./log/filelog.h ./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h ./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h -./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h -./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h +./space/space.o: membuffer/membuffer.h textstream/types.h utf8/utf8_stream.h +./space/space.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./space/space.o: ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h @@ -44,31 +45,32 @@ ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h ./space/spaceparser.o: textstream/textstream.h textstream/stream.h ./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h -./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h -./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h +./space/spaceparser.o: textstream/types.h utf8/utf8_stream.h +./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h -./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h -./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h -./csv/csvparser.o: convert/baseparser.h textstream/textstream.h -./csv/csvparser.o: textstream/stream.h 
date/date.h membuffer/membuffer.h -./csv/csvparser.o: textstream/types.h -./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h -./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h -./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h -./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h -./mainoptions/mainoptionsparser.o: utf8/utf8_private.h -./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h -./html/htmlparser.o: textstream/textstream.h textstream/stream.h -./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h -./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./html/htmlparser.o: textstream/types.h convert/text.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h ./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h +./html/bbcodeparser.o: utf8/utf8_stream.h +./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h +./html/htmlparser.o: textstream/textstream.h textstream/stream.h +./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h +./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h +./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h +./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h +./csv/csvparser.o: convert/baseparser.h textstream/textstream.h 
+./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h +./csv/csvparser.o: textstream/types.h utf8/utf8_stream.h +./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h +./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h +./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h +./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h +./mainoptions/mainoptionsparser.o: utf8/utf8_private.h diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp index ffdf457..ffa757e 100644 --- a/src/convert/misc.cpp +++ b/src/convert/misc.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017-2021, Tomasz Sowa + * Copyright (c) 2017-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -51,62 +51,83 @@ void SetOverflow(bool * was_overflow, bool val) } -void esc_to_json(char val, Stream & out) + +void esc_to_json_uformat(wchar_t val, Stream & out) { - if( (unsigned char)val < 32 ) + char buf[10]; + size_t len; + + Toa((unsigned long)val, buf, sizeof(buf)/sizeof(char), 16, &len); + + out << "\\u"; + + if( len < 4 ) { - char buf[10]; - size_t len; - Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len); - - out << "\\u"; - - if( len < 4 ) + for(size_t i=0 ; i < (4-len) ; ++i) { - for(size_t i=0 ; i < (4-len) ; ++i) - { - out << '0'; - } - } - - out << buf; - } - else - { - // CHECKME - // \r \n \t are <32 and will be serialized os \u.... above - - switch( val ) - { - case 0: out << '\\'; out << '0'; break; // may to skip this character is better? 
- case '\r': out << '\\'; out << 'r'; break; - case '\n': out << '\\'; out << 'n'; break; - case '\t': out << '\\'; out << 't'; break; - case 0x08: out << '\\'; out << 'b'; break; - case 0x0c: out << '\\'; out << 'f'; break; - case '\\': out << '\\'; out << '\\'; break; - case '"': out << '\\'; out << '\"'; break; - default: - out << val; + out << '0'; } } + + out << buf; } void esc_to_json(wchar_t val, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); - - size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); - - for(size_t a = 0 ; a < len ; ++a) + if( val == '\r' ) { - esc_to_json(utf8_buf[a], out); + out << '\\' << 'r'; + } + else + if( val == '\n' ) + { + out << '\\' << 'n'; + } + else + if( val == '\t' ) + { + out << '\\' << 't'; + } + else + if( val == 0x08 ) + { + out << '\\' << 'b'; + } + else + if( val == 0x0c ) + { + out << '\\' << 'f'; + } + else + if( val == '\\' ) + { + out << '\\' << '\\'; + } + else + if( val == '"' ) + { + out << '\\' << '\"'; + } + else + if( val < 32 ) + { + esc_to_json_uformat(val, out); + } + else + { + out << val; } } +void esc_to_json(char val, Stream & out) +{ + esc_to_json((wchar_t)(unsigned char)val, out); +} + + + void esc_to_json(const char * c, pt::Stream & out) { for(size_t i = 0 ; c[i] != 0 ; ++i) @@ -159,10 +180,15 @@ void esc_to_json(const std::wstring & in, Stream & out) -void esc_to_xml(char val, Stream & out) +void esc_to_xml(wchar_t val, Stream & out) { switch(val) { + case 0: + // null character is invalid in XML 1.0 and 1.1 + // https://en.wikipedia.org/wiki/Valid_characters_in_XML + break; + case '<': out << "<"; break; @@ -182,22 +208,13 @@ void esc_to_xml(char val, Stream & out) default: out << val; break; - - // what about zero (null) character? 
} } -void esc_to_xml(wchar_t val, Stream & out) + +void esc_to_xml(char val, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); - - size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); - - for(size_t a = 0 ; a < len ; ++a) - { - esc_to_xml(utf8_buf[a], out); - } + esc_to_xml((wchar_t)(unsigned char)val, out); } @@ -252,10 +269,14 @@ void esc_to_xml(const std::wstring & in, Stream & out) -void esc_to_csv(char c, pt::Stream & out) +void esc_to_csv(wchar_t c, pt::Stream & out) { switch(c) { + case 0: + // null characters are invalid in text files + break; + case '"': out << "\"\""; break; @@ -263,27 +284,16 @@ void esc_to_csv(char c, pt::Stream & out) default: out << c; break; - - // what about zero (null) character? } } -void esc_to_csv(wchar_t val, Stream & out) +void esc_to_csv(char val, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); - - size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); - - for(size_t a = 0 ; a < len ; ++a) - { - esc_to_csv(utf8_buf[a], out); - } + esc_to_csv((wchar_t)(unsigned char)val, out); } - void esc_to_csv(const char * c, pt::Stream & out) { for(size_t i = 0 ; c[i] != 0 ; ++i) diff --git a/src/convert/misc.h b/src/convert/misc.h index 51f4159..5070655 100644 --- a/src/convert/misc.h +++ b/src/convert/misc.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017-2021, Tomasz Sowa + * Copyright (c) 2017-2022, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -50,8 +50,8 @@ namespace pt void SetOverflow(bool * was_overflow, bool val); -void esc_to_json(char val, Stream & out); void esc_to_json(wchar_t val, Stream & out); +void esc_to_json(char val, Stream & out); void esc_to_json(const char * c, pt::Stream & out); void esc_to_json(const char * c, std::size_t len, Stream & out); void esc_to_json(const wchar_t * c, Stream & out); @@ -59,8 +59,8 @@ void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out); void esc_to_json(const std::string & in, Stream & out); void esc_to_json(const std::wstring & in, Stream & out); -void esc_to_xml(char c, pt::Stream & out); void esc_to_xml(wchar_t c, pt::Stream & out); +void esc_to_xml(char c, pt::Stream & out); void esc_to_xml(const char * c, pt::Stream & out); void esc_to_xml(const char * c, std::size_t len, pt::Stream & out); void esc_to_xml(const wchar_t * c, pt::Stream & out); @@ -68,8 +68,8 @@ void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out); void esc_to_xml(const std::string & in, Stream & out); void esc_to_xml(const std::wstring & in, Stream & out); -void esc_to_csv(char c, pt::Stream & out); void esc_to_csv(wchar_t val, Stream & out); +void esc_to_csv(char c, pt::Stream & out); void esc_to_csv(const char * c, std::size_t len, Stream & out); void esc_to_csv(const char * c, pt::Stream & out); void esc_to_csv(const char * c, std::size_t len, pt::Stream & out); @@ -82,34 +82,13 @@ void esc_to_csv(const std::string & in, Stream & out); template void esc_to_json(const StreamType & in, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); typename StreamType::const_iterator i = in.begin(); typename StreamType::const_iterator end = in.end(); - int res; - bool correct; - for( ; i != end ; ++i) + while( i != end ) { - if( in.is_wchar_stream() && out.is_char_stream() ) - { - std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); - 
esc_to_json(utf8_buf, len, out); - } - else - if( in.is_char_stream() && out.is_wchar_stream() ) - { - utf8_to_int(i, end, res, correct); - - if( correct ) - esc_to_json(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) - - // put replacement char if not correct? - } - else - { - esc_to_json(static_cast(*i), out); - } + wchar_t c = i.get_unicode_and_advance(end); + esc_to_json(c, out); } } @@ -117,34 +96,13 @@ void esc_to_json(const StreamType & in, Stream & out) template void esc_to_xml(const StreamType & in, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); typename StreamType::const_iterator i = in.begin(); typename StreamType::const_iterator end = in.end(); - int res; - bool correct; - for( ; i != end ; ++i) + while( i != end ) { - if( in.is_wchar_stream() && out.is_char_stream() ) - { - std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); - esc_to_xml(utf8_buf, len, out); - } - else - if( in.is_char_stream() && out.is_wchar_stream() ) - { - utf8_to_int(i, end, res, correct); - - if( correct ) - esc_to_xml(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) - - // put replacement char if not correct? 
- } - else - { - esc_to_xml(static_cast(*i), out); - } + wchar_t c = i.get_unicode_and_advance(end); + esc_to_xml(c, out); } } @@ -152,34 +110,13 @@ void esc_to_xml(const StreamType & in, Stream & out) template void esc_to_csv(const StreamType & in, Stream & out) { - char utf8_buf[10]; - std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); typename StreamType::const_iterator i = in.begin(); typename StreamType::const_iterator end = in.end(); - int res; - bool correct; - for( ; i != end ; ++i) + while( i != end ) { - if( in.is_wchar_stream() && out.is_char_stream() ) - { - std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); - esc_to_csv(utf8_buf, len, out); - } - else - if( in.is_char_stream() && out.is_wchar_stream() ) - { - utf8_to_int(i, end, res, correct); - - if( correct ) - esc_to_csv(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) - - // put replacement char if not correct? - } - else - { - esc_to_csv(static_cast(*i), out); - } + wchar_t c = i.get_unicode_and_advance(end); + esc_to_csv(c, out); } } diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h index 772a073..6f0053d 100644 --- a/src/textstream/textstream.h +++ b/src/textstream/textstream.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2012-2021, Tomasz Sowa + * Copyright (c) 2012-2022, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -46,6 +46,7 @@ #include "membuffer/membuffer.h" #include "types.h" #include "utf8/utf8.h" +#include "utf8/utf8_stream.h" // for snprintf #include @@ -71,8 +72,67 @@ public: typedef CharT char_type; typedef MemBuffer buffer_type; - typedef typename buffer_type::iterator iterator; - typedef typename buffer_type::const_iterator const_iterator; + + + class iterator + { + public: + + typename buffer_type::iterator membuffer_iterator; + + iterator(); + iterator(const iterator & i); + iterator & operator=(const iterator & i); + + iterator(const typename buffer_type::iterator & i); + iterator & operator=(const typename buffer_type::iterator & i); + + bool operator==(const iterator & i) const; + bool operator!=(const iterator & i) const; + + iterator & operator++(); // prefix ++ + iterator operator++(int); // postfix ++ + + iterator & operator--(); // prefix -- + iterator operator--(int); // postfix -- + + CharT & operator*(); + + wchar_t get_unicode_and_advance(const iterator & end); + }; + + + class const_iterator + { + public: + + typename buffer_type::const_iterator membuffer_const_iterator; + + const_iterator(); + const_iterator(const const_iterator & i); + const_iterator(const iterator & i); + const_iterator & operator=(const const_iterator & i); + const_iterator & operator=(const iterator & i); + + const_iterator(const typename buffer_type::const_iterator & i); + const_iterator(const typename buffer_type::iterator & i); + const_iterator & operator=(const typename buffer_type::const_iterator & i); + const_iterator & operator=(const typename buffer_type::iterator & i); + + bool operator==(const const_iterator & i) const; + bool operator!=(const const_iterator & i) const; + + const_iterator & operator++(); // prefix ++ + const_iterator operator++(int); // postfix ++ + + const_iterator & operator--(); // prefix -- + const_iterator operator--(int); // postfix -- + + CharT operator*(); + + wchar_t 
get_unicode_and_advance(const const_iterator & end); + + }; bool is_char_stream() const; @@ -112,7 +172,7 @@ public: TextStreamBase & operator<<(char); TextStreamBase & operator<<(unsigned char); - TextStreamBase & operator<<(wchar_t); + TextStreamBase & operator<<(wchar_t); // no surrogate pairs are used TextStreamBase & operator<<(bool); TextStreamBase & operator<<(short); TextStreamBase & operator<<(int); @@ -173,6 +233,272 @@ TextStreamBase::TextStreamBase() } + +template +TextStreamBase::iterator::iterator() +{ +} + + +template +TextStreamBase::iterator::iterator(const iterator & i) : membuffer_iterator(i) +{ +} + + +template +TextStreamBase::iterator & +TextStreamBase::iterator::operator=(const iterator & i) +{ + membuffer_iterator = i; +} + + +template +TextStreamBase::iterator::iterator(const typename buffer_type::iterator & i) : membuffer_iterator(i) +{ +} + + +template +TextStreamBase::iterator & +TextStreamBase::iterator::operator=(const typename buffer_type::iterator & i) +{ + membuffer_iterator = i; +} + + + + +template +bool TextStreamBase::iterator::operator==(const iterator & i) const +{ + return membuffer_iterator == i.membuffer_iterator; +} + +template +bool TextStreamBase::iterator::operator!=(const iterator & i) const +{ + return membuffer_iterator != i.membuffer_iterator; +} + +template +TextStreamBase::iterator & +TextStreamBase::iterator::operator++() +{ + ++membuffer_iterator; + return *this; +} + +template +TextStreamBase::iterator +TextStreamBase::iterator::operator++(int) +{ + const_iterator old(*this); + membuffer_iterator++; + return old; +} + +template +TextStreamBase::iterator & +TextStreamBase::iterator::operator--() +{ + --membuffer_iterator; + return *this; +} + +template +TextStreamBase::iterator +TextStreamBase::iterator::operator--(int) +{ + const_iterator old(*this); + membuffer_iterator--; + return old; +} + +template +char_type & TextStreamBase::iterator::operator*() +{ + return *membuffer_iterator; +} + + +template +wchar_t 
TextStreamBase::iterator::get_unicode_and_advance(const iterator & end) +{ + if( *this != end ) + { + if constexpr (sizeof(char_type) == sizeof(char) ) + { + int res; + bool correct; + utf8_to_int(*this, end, res, correct); + + if( correct ) + return static_cast(res); + else + return static_cast(0xFFFD); // U+FFFD "replacement character" + } + else + { + wchar_t c = operator*(); + ++membuffer_iterator; + return c; + } + } + + return 0; +} + + + + + +template +TextStreamBase::const_iterator::const_iterator() +{ +} + +template +TextStreamBase::const_iterator::const_iterator(const const_iterator & i) : membuffer_const_iterator(i.membuffer_const_iterator) +{ +} + +template +TextStreamBase::const_iterator::const_iterator(const iterator & i) : membuffer_const_iterator(i.membuffer_iterator) +{ +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator=(const const_iterator & i) +{ + membuffer_const_iterator = i.membuffer_const_iterator; + return *this; +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator=(const iterator & i) +{ + membuffer_const_iterator = i.membuffer_iterator; + return *this; +} + + + +template +TextStreamBase::const_iterator::const_iterator(const typename buffer_type::const_iterator & i) : membuffer_const_iterator(i) +{ +} + +template +TextStreamBase::const_iterator::const_iterator(const typename buffer_type::iterator & i) : membuffer_const_iterator(i) +{ +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator=(const typename buffer_type::const_iterator & i) +{ + membuffer_const_iterator = i; + return *this; +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator=(const typename buffer_type::iterator & i) +{ + membuffer_const_iterator = i; + return *this; +} + + + +template +bool TextStreamBase::const_iterator::operator==(const const_iterator & i) const +{ + return membuffer_const_iterator == 
i.membuffer_const_iterator; +} + +template +bool TextStreamBase::const_iterator::operator!=(const const_iterator & i) const +{ + return membuffer_const_iterator != i.membuffer_const_iterator; +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator++() +{ + ++membuffer_const_iterator; + return *this; +} + +template +TextStreamBase::const_iterator +TextStreamBase::const_iterator::operator++(int) +{ + const_iterator old(*this); + membuffer_const_iterator++; + return old; +} + +template +TextStreamBase::const_iterator & +TextStreamBase::const_iterator::operator--() +{ + --membuffer_const_iterator; + return *this; +} + +template +TextStreamBase::const_iterator +TextStreamBase::const_iterator::operator--(int) +{ + const_iterator old(*this); + membuffer_const_iterator--; + return old; +} + +template +char_type TextStreamBase::const_iterator::operator*() +{ + return *membuffer_const_iterator; +} + + +template +wchar_t TextStreamBase::const_iterator::get_unicode_and_advance(const const_iterator & end) +{ + if( *this != end ) + { + if constexpr (sizeof(char_type) == sizeof(char) ) + { + int res; + bool correct; + pt::utf8_to_int(*this, end, res, correct); + + if( correct ) + return static_cast(res); + else + return static_cast(0xFFFD); // U+FFFD "replacement character" + } + else + { + wchar_t c = operator*(); + ++membuffer_const_iterator; + return c; + } + } + + return 0; +} + + + + + template bool TextStreamBase::is_char_stream() const { @@ -433,10 +759,14 @@ template TextStreamBase & TextStreamBase::operator<<(char v) { - // IMPROVEME - // if char_type == 1 then if v <= 127 then put that char but if (unsigned)v > 127 put replacement character - // if char_type > 1 then simply put that character - buffer.append(static_cast(v)); + if constexpr (sizeof(char_type) == sizeof(wchar_t) ) + { + buffer.append(static_cast(static_cast(v))); + } + else + { + buffer.append(v); + } return *this; } @@ -446,9 +776,6 @@ template TextStreamBase & 
TextStreamBase::operator<<(unsigned char v) { - // IMPROVEME - // if char_type == 1 then if v <= 127 then put that char but if v > 127 put replacement character - // if char_type > 1 then simply put that character buffer.append(static_cast(v)); return *this; @@ -459,8 +786,14 @@ template TextStreamBase & TextStreamBase::operator<<(wchar_t v) { - // IMPROVEME add utf8/wide conversion, if v is from surrogate pair we can skip it - buffer.append(static_cast(v)); + if constexpr (sizeof(char_type) == sizeof(wchar_t) ) + { + buffer.append(v); + } + else + { + pt::int_to_utf8(static_cast(v), *this); + } return *this; } diff --git a/src/utf8/utf8_stream.h b/src/utf8/utf8_stream.h index 3adf848..565381d 100644 --- a/src/utf8/utf8_stream.h +++ b/src/utf8/utf8_stream.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021, Tomasz Sowa + * Copyright (c) 2021-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -60,7 +60,7 @@ namespace pt template size_t utf8_to_int( StreamIteratorType & iterator_in, - StreamIteratorType & iterator_end, + const StreamIteratorType & iterator_end, int & res, bool & correct) { diff --git a/tests/Makefile.dep b/tests/Makefile.dep index a9228ca..8685629 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -19,7 +19,6 @@ ./csvparser.o: ../src/textstream/stream.h ../src/date/date.h ./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h ./main.o: convert.h mainoptionsparser.h csvparser.h -./test.o: test.h ./mainoptionsparser.o: mainoptionsparser.h test.h ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h ./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h @@ -35,3 +34,4 @@ ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h ./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h ./mainoptionsparser.o: ../src/convert/double.h +./test.o: test.h