diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h index 352e3e5..772a073 100644 --- a/src/textstream/textstream.h +++ b/src/textstream/textstream.h @@ -157,8 +157,6 @@ public: protected: void put_stream(const Stream & stream); - void put_utf8_to_wide(const Stream & stream); - void put_wide_to_utf8(const Stream & stream); }; @@ -435,6 +433,9 @@ template TextStreamBase & TextStreamBase::operator<<(char v) { + // IMPROVEME + // if char_type == 1 then if v <= 127 then put that char but if (unsigned)v > 127 put replacement character + // if char_type > 1 then simply put that character buffer.append(static_cast(v)); return *this; @@ -445,6 +446,9 @@ template TextStreamBase & TextStreamBase::operator<<(unsigned char v) { + // IMPROVEME + // if char_type == 1 then if v <= 127 then put that char but if v > 127 put replacement character + // if char_type > 1 then simply put that character buffer.append(static_cast(v)); return *this; @@ -455,6 +459,7 @@ template TextStreamBase & TextStreamBase::operator<<(wchar_t v) { + // IMPROVEME add utf8/wide conversion, if v is from surrogate pair we can skip it buffer.append(static_cast(v)); return *this; @@ -652,6 +657,7 @@ TextStreamBase::operator<<(const Stream } + template void TextStreamBase::put_stream(const Stream & stream) { @@ -669,13 +675,13 @@ void TextStreamBase::put_stream(const St if( sizeof(char_type) == sizeof(wchar_t) && stream.is_char_stream() ) { // from utf8 to wide - put_utf8_to_wide(stream); + utf8_to_wide(stream, *this, false); } else if( sizeof(char_type) == sizeof(char) && stream.is_wchar_stream() ) { // from wide to utf8 - put_wide_to_utf8(stream); + wide_stream_to_utf8(stream, *this, false); } else if( sizeof(char_type) == sizeof(wchar_t) && stream.is_wchar_stream() ) @@ -695,68 +701,6 @@ void TextStreamBase::put_stream(const St } -// test me when sizeof(wchar_t) is 2 -// or may move me to utf8 functions? 
-template -void TextStreamBase::put_utf8_to_wide(const Stream & stream) -{ - size_t len; - bool correct; - size_t index = 0; - int z; - - do - { - len = utf8_to_int(stream, index, z, correct); - - if( len > 0 ) - { - if( !correct ) - { - z = 0xFFFD; // U+FFFD "replacement character" - } - - int_to_wide(z, *this); - index += len; - } - } - while( len > 0 ); -} - - -// test me when sizeof(wchar_t) is 2 -// or may move me to utf8 functions? -template -void TextStreamBase::put_wide_to_utf8(const Stream & stream) -{ - char utf8_buf[10]; - size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); - - for(size_t i=0 ; i < stream.size() ; ++i) - { - int c = static_cast(stream.get_wchar(i)); - - if( is_first_surrogate_char(c) && i + 1 < stream.size() ) - { - wchar_t c1 = static_cast(c); - wchar_t c2 = stream.get_wchar(++i); - surrogate_pair_to_int(c1, c2, c); - } - - size_t len = int_to_utf8(c, utf8_buf, utf8_buf_len); - - for(size_t u=0 ; u < len ; ++u) - { - operator<<(utf8_buf[u]); - } - } -} - - - - - - template TextStreamBase & diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index 13df593..bdf28f3 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -60,6 +60,8 @@ namespace pt /*! 
returns true if 'c' is a correct unicode character + + RENAMEME to is_correct_unicode_char */ bool utf8_check_range(int c); @@ -152,6 +154,8 @@ bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear = true, template bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear = true, int mode = 1); // need to be tested +template +bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear = true, int mode = 1); /* @@ -205,8 +209,11 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, s template void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested +template +bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1); + template -void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested +void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used diff --git a/src/utf8/utf8_templates.h b/src/utf8/utf8_templates.h index fc44491..a0f7613 100644 --- a/src/utf8/utf8_templates.h +++ b/src/utf8/utf8_templates.h @@ -136,6 +136,59 @@ return !was_error; } +/* +this function converts a UTF-8 stream into wide stream + +input: + stream - a UTF-8 stream for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + +output: + res - a wide stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode) +{ + size_t len; + bool correct; + int z; + size_t index = 0; + bool was_error = false; + + if( clear ) + res.clear(); + + // CHECKME test me when sizeof(wchar_t) is 2 + + do + { + len = utf8_to_int(stream, index, z, correct); + + if( len > 0 ) + { + 
if( !correct ) + { + if( mode == 1 ) + int_to_wide(0xFFFD, res); // U+FFFD "replacement character" (appended as a code point, same path as valid characters) + + was_error = true; + } + else + { + int_to_wide(z, res); + } + + index += len; + } + } + while( len > 0 ); + + return !was_error; +} @@ -268,6 +321,59 @@ void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, in } + +template +bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode) +{ + bool was_error = false; + + if( clear ) + utf8.clear(); + + for(size_t i=0 ; i < stream.size() ; ++i) + { + int c = static_cast(stream.get_wchar(i)); + bool is_correct = false; + + if( utf8_check_range(c) ) + { + // CHECKME test me when sizeof(wchar_t) == 2; NOTE(review): if utf8_check_range() rejects surrogate code points this branch is unreachable and every surrogate pair becomes U+FFFD - confirm + if( is_first_surrogate_char(c) ) + { + if( i + 1 < stream.size() ) + { + wchar_t c1 = static_cast(c); + wchar_t c2 = stream.get_wchar(++i); + + if( surrogate_pair_to_int(c1, c2, c) ) + { + is_correct = true; + } + } + } + else + { + is_correct = true; + } + } + + if( is_correct ) + { + int_to_utf8(c, utf8); + } + else + { + was_error = true; + + if( mode == 1 ) + int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" + } + } + + return !was_error; +} + + // not tested template void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)