diff --git a/src/Makefile.dep b/src/Makefile.dep index 88924d2..53fbb00 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -2,51 +2,45 @@ ./convert/inttostr.o: ./convert/inttostr.h ./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h -./convert/misc.o: textstream/types.h utf8/utf8_stream.h -./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h -./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h -./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./convert/misc.o: textstream/types.h ./convert/inttostr.h +./convert/misc.o: textstream/types.h ./convert/inttostr.h utf8/utf8.h +./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: textstream/stream.h space/space.h textstream/types.h ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: membuffer/membuffer.h textstream/types.h -./convert/double.o: utf8/utf8_stream.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h ./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h ./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/baseparser.o: membuffer/membuffer.h textstream/types.h -./convert/baseparser.o: utf8/utf8_stream.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h ./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./log/filelog.o: textstream/types.h utf8/utf8_stream.h +./log/filelog.o: 
textstream/types.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h ./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h ./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h ./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h -./log/log.o: utf8/utf8_stream.h ./log/filelog.h +./log/log.o: ./log/filelog.h ./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h ./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h -./space/space.o: membuffer/membuffer.h textstream/types.h utf8/utf8_stream.h -./space/space.o: convert/strtoint.h ./convert/text.h ./convert/misc.h -./space/space.o: ./convert/double.h +./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h +./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h ./space/spaceparser.o: textstream/textstream.h textstream/stream.h ./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h -./space/spaceparser.o: textstream/types.h utf8/utf8_stream.h -./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h +./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h +./space/spaceparser.o: ./convert/misc.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h @@ -55,7 +49,7 @@ ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./csv/csvparser.o: convert/baseparser.h 
textstream/textstream.h ./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h -./csv/csvparser.o: textstream/types.h utf8/utf8_stream.h +./csv/csvparser.o: textstream/types.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h @@ -67,10 +61,9 @@ ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h -./html/bbcodeparser.o: utf8/utf8_stream.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h ./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h ./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h +./html/htmlparser.o: textstream/types.h convert/text.h diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp index 4a8b88e..e58d2ae 100644 --- a/src/convert/baseparser.cpp +++ b/src/convert/baseparser.cpp @@ -34,7 +34,6 @@ #include "baseparser.h" #include "utf8/utf8.h" -#include "utf8/utf8_stream.h" namespace pt diff --git a/src/convert/misc.h b/src/convert/misc.h index 6228b67..ebc6cad 100644 --- a/src/convert/misc.h +++ b/src/convert/misc.h @@ -39,7 +39,6 @@ #include "text.h" #include "textstream/stream.h" #include "textstream/types.h" -#include "utf8/utf8_stream.h" namespace pt diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h index a5c3df7..d43a8c7 100644 --- a/src/textstream/textstream.h +++ b/src/textstream/textstream.h @@ -43,7 +43,6 @@ #include "membuffer/membuffer.h" #include "types.h" #include "utf8/utf8.h" -#include "utf8/utf8_stream.h" // 
for snprintf #include @@ -213,6 +212,9 @@ public: protected: + template + void put_stream(const TextStreamBase & stream); + void put_stream(const Stream & stream); @@ -599,8 +601,9 @@ void TextStreamBase::to_str(std::string if constexpr (sizeof(char_type) == sizeof(char) ) { const_iterator i = begin(); + const_iterator i_end = end(); - for( ; i != end() ; ++i) + for( ; i != i_end ; ++i) str += *i; } else @@ -629,10 +632,7 @@ void TextStreamBase::to_str(std::wstring } else { - // IMPROVE ME don't use a temporary object - std::string utf8; - to_str(utf8); - utf8_to_wide(utf8, str, false); + utf8_to_wide(*this, str, false); } } @@ -715,7 +715,14 @@ template TextStreamBase & TextStreamBase::operator<<(const std::string & str) { - operator<<(str.c_str()); + if constexpr ( sizeof(char_type) == sizeof(char) ) + { + buffer.append(str.c_str(), str.size()); + } + else + { + utf8_to_wide(str, *this, false); + } return *this; } @@ -733,7 +740,7 @@ TextStreamBase::operator<<(const wchar_t } else { - wide_to_utf8(str, *this); + wide_to_utf8(str, *this, false); } return *this; @@ -745,7 +752,14 @@ template TextStreamBase & TextStreamBase::operator<<(const std::wstring & str) { - operator<<(str.c_str()); + if constexpr (sizeof(char_type) == sizeof(wchar_t) ) + { + buffer.append(str.c_str(), str.size()); + } + else + { + wide_to_utf8(str, *this, false); + } return *this; } @@ -988,6 +1002,44 @@ TextStreamBase::operator<<(const Stream +template +template +void TextStreamBase::put_stream( + const TextStreamBase & stream + ) +{ + if( (sizeof(char_type) == sizeof(char) && stream.is_char_stream()) || + (sizeof(char_type) == sizeof(wchar_t) && stream.is_wchar_stream()) ) + { + // from utf8 to utf8 or from wide to wide + + typename TextStreamBase::const_iterator i = stream.begin(); + + for( ; i != stream.end() ; ++i) + { + operator<<(*i); + } + } + else + if( sizeof(char_type) == sizeof(wchar_t) && stream.is_char_stream() ) + { + // from utf8 to wide + utf8_to_wide(stream, *this, false); 
+ } + else + if( sizeof(char_type) == sizeof(char) && stream.is_wchar_stream() ) + { + // from wide to utf8 + wide_stream_to_utf8(stream, *this, false); + } + else + { + operator<<("such conversion is not implemented"); + } +} + + + template void TextStreamBase::put_stream(const Stream & stream) { @@ -1036,7 +1088,7 @@ template TextStreamBase & TextStreamBase::operator<<(const Space & space) { - space.serialize_to_space_stream(*this, true); + space.serialize_to_json_stream(*this, true); return *this; } @@ -1059,11 +1111,6 @@ TextStreamBase & TextStreamBase::operator<<( const TextStreamBase & arg) { - /* - * in the future we can have a faster implementation - * which uses iterators instead of get_char() and get_wchar() methods - * - */ put_stream(arg); return *this; @@ -1101,6 +1148,35 @@ TextStreamBase::fill_up_if_needed(wchar_ } + + + +/*! + this function converts an UTF-8 stream into wide stream or wide string + (is declared in utf8/utf8.h) + + input: + iterator_in - an TextStream iterator for reading from + iterator_end - an end iterator (can be returned by end() method from TextStream) + + output: + out_stream - an output wide stream or wide string + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_wide( + const TextStreamBase & utf8, + StreamOrStringType & out_stream, + bool clear_stream, + int mode + ) +{ + typename TextStreamBase::const_iterator i_begin = utf8.begin(); + return utf8_to_wide(i_begin, utf8.end(), out_stream, clear_stream, mode); +} + + } // namespace diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index 98c91c6..4cdb506 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2021, Tomasz Sowa + * Copyright (c) 2010-2022, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -128,6 +128,25 @@ bool surrogate_pair_to_int(int c1, int c2, int & z) +/* + converts an int to a wide string +*/ +void int_to_wide(int c, std::wstring & res) +{ + if( sizeof(wchar_t)==2 && c>0xffff ) + { + // UTF16 surrogate pairs + c -= 0x10000; + res += static_cast(((c >> 10) & 0x3FF) + 0xD800); + res += static_cast((c & 0x3FF) + 0xDC00); + } + else + { + res += static_cast(c); + } +} + + /*! this function converts one UTF-8 character into one wide-character @@ -312,7 +331,7 @@ unsigned char uz; uz = utf8.get_char(stream_index + i); if( !private_namespace::utf8_to_int_add_next_octet(uz, res) ) - return i; + return i + 1; } if( utf8_check_range(res, len) ) @@ -330,26 +349,6 @@ unsigned char uz; -/* - -*/ -static void int_to_wide(int c, std::wstring & res) -{ - if( sizeof(wchar_t)==2 && c>0xffff ) - { - // UTF16 surrogate pairs - c -= 0x10000; - res += static_cast(((c >> 10) & 0x3FF) + 0xD800); - res += static_cast((c & 0x3FF) + 0xDC00); - } - else - { - res += static_cast(c); - } -} - - - /*! diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index d2f21bf..19c0cb6 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2021, Tomasz Sowa + * Copyright (c) 2010-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -128,6 +128,9 @@ size_t utf8_to_int(const std::string & utf8, int & res, bool & correct size_t utf8_to_int(std::istream & utf8, int & res, bool & correct); size_t utf8_to_int(const Stream & utf8, size_t stream_index, int & res, bool & correct); +template +size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct); + /*! converting one character from int to wide stream @@ -136,6 +139,11 @@ template void int_to_wide(int c, StreamType & res); +/*! 
+ converting one character from int to wide string +*/ +void int_to_wide(int c, std::wstring & res); + /*! converting UTF-8 string to a wide string @@ -157,8 +165,18 @@ bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear = true, template bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear = true, int mode = 1); // need to be tested -template -bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear = true, int mode = 1); +template +bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear = true, int mode = 1); + +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); + +template +class TextStreamBase; + +// defined at the end in textstream.h +template +bool utf8_to_wide(const TextStreamBase & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); /* diff --git a/src/utf8/utf8_private.h b/src/utf8/utf8_private.h index 0122a49..c12a8c5 100644 --- a/src/utf8/utf8_private.h +++ b/src/utf8/utf8_private.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021, Tomasz Sowa + * Copyright (c) 2021-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -56,7 +56,7 @@ namespace private_namespace bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); bool utf8_to_int_add_next_octet(unsigned char uz, int & res); -size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); +size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public? 
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, diff --git a/src/utf8/utf8_stream.h b/src/utf8/utf8_stream.h deleted file mode 100644 index a50af92..0000000 --- a/src/utf8/utf8_stream.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * This file is a part of PikoTools - * and is distributed under the 2-Clause BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2021-2022, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#ifndef headerfile_pikotools_src_utf8_utf8_stream -#define headerfile_pikotools_src_utf8_utf8_stream - -#include "textstream/textstream.h" - -namespace pt -{ - - -/*! - this function converts one UTF-8 character into one wide-character - - input: - iterator_in - an TextStream iterator for reading from - iterator_end - an end iterator (can be returned by end() method from TextStream) - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input stream -*/ -template -size_t utf8_to_int( - StreamIteratorType & iterator_in, - const StreamIteratorType & iterator_end, - int & res, - bool & correct) -{ -size_t i, len; -unsigned char uz; - - res = 0; - correct = false; - - if( iterator_in == iterator_end ) - return 0; - - uz = *iterator_in; - ++iterator_in; - - if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) - return 1; - - for(i=1 ; i +size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct) +{ +size_t i, len; +unsigned char uz; + + res = 0; + correct = false; + + if( iterator_in == iterator_end ) + return 0; + + uz = *iterator_in; + ++iterator_in; + + if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) + return 1; + + for(i=1 ; i -bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode) +template +bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) { size_t len; bool correct; @@ -172,7 +221,7 @@ bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode) if( !correct ) { if( mode == 1 ) - res << 0xFFFD; // U+FFFD "replacement character" + int_to_wide(0xFFFD, res); // U+FFFD "replacement character" was_error = true; } @@ -194,6 +243,51 @@ bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode) +/*! 
+ this function converts UTF-8 characters from an iterator range into a wide stream or a wide string + + input: + iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only) + iterator_end - an end iterator + + output: + out_stream - an output wide stream or wide string (the stream can be of any kind, we use only << operator) + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode) +{ + if( clear_stream ) + out_stream.clear(); + + int res; + bool correct; + bool was_error = false; + + while( iterator_in != iterator_end ) + { + utf8_to_int(iterator_in, iterator_end, res, correct); + + if( correct ) + { + int_to_wide(res, out_stream); + } + else + { + if( mode == 1 ) + int_to_wide(0xFFFD, out_stream); // U+FFFD "replacement character" + + was_error = true; + } + } + + return !was_error; +} + + + + /*! 
this function converts one wide character into UTF-8 stream @@ -387,6 +481,11 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear + + + + + } // namespace pt #endif diff --git a/tests/Makefile.dep b/tests/Makefile.dep index 8cf551c..faa24dd 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -8,17 +8,15 @@ ./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h -./convert.o: ../src/utf8/utf8_stream.h ../src/convert/strtoint.h -./convert.o: ../src/convert/text.h ../src/convert/misc.h -./convert.o: ../src/convert/double.h +./convert.o: ../src/convert/strtoint.h ../src/convert/text.h +./convert.o: ../src/convert/misc.h ../src/convert/double.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h ./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h ./csvparser.o: ../src/textstream/stream.h ../src/date/date.h -./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h -./csvparser.o: ../src/utf8/utf8_stream.h test.h +./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h ./main.o: convert.h mainoptionsparser.h csvparser.h ./test.o: test.h ./mainoptionsparser.o: mainoptionsparser.h test.h @@ -33,6 +31,5 @@ ./mainoptionsparser.o: ../src/textstream/textstream.h ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h -./mainoptionsparser.o: ../src/utf8/utf8_stream.h ../src/convert/strtoint.h -./mainoptionsparser.o: ../src/convert/text.h ../src/convert/misc.h -./mainoptionsparser.o: ../src/convert/double.h +./mainoptionsparser.o: 
../src/convert/strtoint.h ../src/convert/text.h +./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h