From aacb1f43ae4912d7ec9e3718f54b18097bfc7ae0 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Thu, 30 May 2024 20:19:04 +0200 Subject: [PATCH] add some utf8 converting methods add new methods: - bool int_to_stream(int c, pt::Stream & stream); - template bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode = 1); - template bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode = 1); - template bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1); make some methods public: - size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) - size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) rename and make some methods public: - template utf8_to_wide_generic(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode) -> utf8_to_output_function(...) while here: - fix: correctly convert characters in Log::put_multiline_generic() --- src/Makefile.dep | 97 +++++++++-------- src/log/log.h | 31 +++++- src/space/space.cpp | 4 +- src/space/space.h | 1 - src/textstream/textstream.h | 2 +- src/utf8/utf8.cpp | 106 +++++++++++++++++-- src/utf8/utf8.h | 42 +++++++- src/utf8/utf8_private.cpp | 81 +-------------- src/utf8/utf8_private.h | 151 +-------------------------- src/utf8/utf8_templates.h | 201 ++++++++++++++++++++++++++++++++---- tests/Makefile.dep | 50 ++++----- 11 files changed, 428 insertions(+), 338 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index a3e1eb4..530234e 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -6,79 +6,76 @@ ./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h -./convert/double.o: textstream/stream.h space/space.h textstream/types.h -./convert/double.o: 
convert/inttostr.h utf8/utf8.h textstream/stream.h -./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h -./convert/double.o: membuffer/membuffer.h textstream/types.h -./convert/double.o: textstream/stream_private.h +./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h +./convert/double.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./convert/double.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./convert/double.o: textstream/types.h textstream/stream_private.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h -./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h -./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h -./convert/baseparser.o: membuffer/membuffer.h textstream/types.h -./convert/baseparser.o: textstream/stream_private.h +./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h +./convert/baseparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./convert/baseparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./convert/baseparser.o: textstream/types.h textstream/stream_private.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h -./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h -./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h +./log/filelog.o: textstream/stream.h utf8/utf8_templates.h ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h ./log/filelog.o: textstream/types.h textstream/stream_private.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h -./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h -./log/log.o: textstream/stream.h utf8/utf8_templates.h 
utf8/utf8_private.h -./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h +./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h +./log/log.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h +./log/log.o: membuffer/membuffer.h textstream/types.h ./log/log.o: textstream/stream_private.h ./log/filelog.h -./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h -./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h +./space/space.o: textstream/stream.h utf8/utf8_templates.h ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h ./space/space.o: membuffer/membuffer.h textstream/types.h ./space/space.o: textstream/stream_private.h convert/strtoint.h -./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h +./space/space.o: ./convert/text.h ./convert/misc.h textstream/types.h +./space/space.o: ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h -./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h -./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h -./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h -./space/spaceparser.o: textstream/textstream.h textstream/stream.h -./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h -./space/spaceparser.o: textstream/types.h textstream/stream_private.h -./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h +./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./space/spaceparser.o: utf8/utf8_templates.h utf8/utf8_private.h +./space/spaceparser.o: convert/baseparser.h textstream/textstream.h +./space/spaceparser.o: textstream/stream.h space/space.h date/date.h +./space/spaceparser.o: 
membuffer/membuffer.h textstream/types.h +./space/spaceparser.o: textstream/stream_private.h convert/strtoint.h +./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h ./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h -./space/keyvalueparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h -./space/keyvalueparser.o: textstream/stream.h utf8/utf8_templates.h -./space/keyvalueparser.o: utf8/utf8_private.h convert/baseparser.h -./space/keyvalueparser.o: textstream/textstream.h textstream/stream.h -./space/keyvalueparser.o: space/space.h date/date.h membuffer/membuffer.h -./space/keyvalueparser.o: textstream/types.h textstream/stream_private.h -./space/keyvalueparser.o: convert/strtoint.h ./convert/text.h -./space/keyvalueparser.o: ./convert/misc.h +./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./space/keyvalueparser.o: utf8/utf8_templates.h utf8/utf8_private.h +./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h +./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h +./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h +./space/keyvalueparser.o: textstream/stream_private.h convert/strtoint.h +./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h +./space/keyvalueparser.o: textstream/types.h ./textstream/stream_private.o: textstream/stream_private.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h -./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h -./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h -./csv/csvparser.o: convert/baseparser.h textstream/textstream.h -./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h -./csv/csvparser.o: textstream/types.h textstream/stream_private.h +./csv/csvparser.o: ./csv/csvparser.h 
space/space.h convert/inttostr.h +./csv/csvparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./csv/csvparser.o: utf8/utf8_private.h convert/baseparser.h +./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h +./csv/csvparser.o: membuffer/membuffer.h textstream/types.h +./csv/csvparser.o: textstream/stream_private.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h -./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h -./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h -./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h -./mainoptions/mainoptionsparser.o: utf8/utf8_private.h +./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h +./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h +./mainoptions/mainoptionsparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h -./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h -./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h -./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h -./html/bbcodeparser.o: textstream/stream_private.h +./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h +./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./html/bbcodeparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./html/bbcodeparser.o: textstream/types.h textstream/stream_private.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h -./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h -./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./html/htmlparser.o: 
space/space.h convert/inttostr.h utf8/utf8.h +./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h ./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h ./html/htmlparser.o: textstream/types.h textstream/stream_private.h ./html/htmlparser.o: convert/text.h diff --git a/src/log/log.h b/src/log/log.h index af1fb82..330129b 100644 --- a/src/log/log.h +++ b/src/log/log.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2018-2022, Tomasz Sowa + * Copyright (c) 2018-2024, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,7 +36,6 @@ #define headerfile_pikotools_src_log_log #include -#include #include "textstream/textstream.h" #include "filelog.h" @@ -246,6 +245,7 @@ void Log::put_multiline_generic(const CharType * prefix, const CharType * msg) { was_new_line = true; put_prefix = true; + msg += 1; } else { @@ -265,11 +265,32 @@ void Log::put_multiline_generic(const CharType * prefix, const CharType * msg) put_prefix = false; } - operator<<(*msg); + if constexpr ( sizeof(CharType) == sizeof(char) ) + { + int c; + bool correct; + msg += utf8_to_int(msg, c, correct); + + if( correct ) + int_to_stream(c, *this); + else + int_to_stream(0xFFFD, *this); // replacement character + } + else + if constexpr ( sizeof(CharType) == sizeof(wchar_t) ) + { + operator<<(*msg); + msg += 1; + } + else + { + // what is the CharType? 
+ // at the moment do not print anything + msg += 1; + } + was_something_printed = true; } - - msg += 1; } if( was_something_printed ) diff --git a/src/space/space.cpp b/src/space/space.cpp index 8d7146d..880d894 100644 --- a/src/space/space.cpp +++ b/src/space/space.cpp @@ -34,8 +34,10 @@ #include #include "space.h" -#include "utf8/utf8.h" #include "convert/convert.h" +#include "textstream/textstream.h" +#include "utf8/utf8.h" + namespace pt diff --git a/src/space/space.h b/src/space/space.h index 3964a09..4c38b6e 100644 --- a/src/space/space.h +++ b/src/space/space.h @@ -42,7 +42,6 @@ #include #include #include -#include "textstream/types.h" #include "convert/inttostr.h" #include "utf8/utf8.h" diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h index f703e4a..f52d237 100644 --- a/src/textstream/textstream.h +++ b/src/textstream/textstream.h @@ -203,7 +203,6 @@ public: template TextStreamBase & operator<<(const TextStreamBase & arg); - template bool operator==(const TextStreamBase & stream) const; @@ -1235,6 +1234,7 @@ return *this; } + template template bool TextStreamBase::operator==(const TextStreamBase & stream) const diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index abf201a..fd8c3aa 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2023, Tomasz Sowa + * Copyright (c) 2010-2024, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,9 +42,6 @@ namespace pt { - - - /*! 
returns true if 'c' is a correct unicode character */ @@ -128,6 +125,83 @@ bool surrogate_pair_to_int(int c1, int c2, int & z) +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) +{ + if( string_len == 0 ) + { + z = 0; + correct = false; + return 0; + } + + z = static_cast(*wide_string); + correct = true; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) ) + { + if( string_len > 1 ) + { + int z2 = *(wide_string+1); + + if( is_second_surrogate_char(z2) ) + { + z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); + return 2; + } + else + { + correct = false; + return 1; + } + } + else + { + correct = false; + return 1; + } + } + else + { + correct = utf8_check_range(z); + return 1; + } +} + + + +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + { + z = 0; + correct = false; + return 0; + } + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return wide_to_int(wide_string, min_str_len, z, correct); +} + + + /* * converts an int to a wide string * @@ -185,6 +259,26 @@ bool int_to_wide(int c, std::wstring & res) } +/* + * returns true if c was a correct unicode character + * and has been put to the stream + */ +bool int_to_stream(int c, pt::Stream & stream) +{ + if( stream.is_char_stream() ) + { + return int_to_utf8(c, stream) > 0; + } + else + if( stream.is_wchar_stream() ) + { + return int_to_wide(c, stream); + } + + return 
false; +} + + /*! @@ -410,9 +504,9 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool c if( clear ) res.clear(); - bool status = private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, [&res](int c) { + bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) { int_to_wide(c, res); - }); + }, mode); return status; } diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index fabc585..fb5abac 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2023, Tomasz Sowa + * Copyright (c) 2010-2024, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -109,6 +109,21 @@ bool surrogate_pair_to_int(int c1, int c2, int & z); +/* + * converting one character into a stream + * stream can be a utf8 or wide stream + */ +bool int_to_stream(int c, pt::Stream & stream); + + +/* + * converting one unicode character to an int + * such a unicode character can consist of one or two wide characters + */ +size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public? +size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); + + /* * * @@ -158,6 +173,14 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len); bool int_to_wide(int c, std::wstring & res); +/*! + call a convert_function for each character from an utf8 string + */ +template +bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode = 1); + + + /*! 
converting UTF-8 string to a wide string */ @@ -181,9 +204,15 @@ bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear = true, int template bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear = true, int mode = 1); +template +bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode = 1); + template bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); +template +bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode = 1); + template class TextStreamBase; @@ -220,6 +249,17 @@ template size_t int_to_utf8(int z, StreamType & utf8); +/*! + call an output_function for some sequence of wide characters from the stream buffer + + output_function has two arguments: const char * buf, size_t len: + output_function(const char * buf, size_t len) + + StreamType should have a const_iterator and begin() and end() methods +*/ +template +bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1); + /*! converting a wide string to UTF-8 string diff --git a/src/utf8/utf8_private.cpp b/src/utf8/utf8_private.cpp index 8e235bf..a4fa2b0 100644 --- a/src/utf8/utf8_private.cpp +++ b/src/utf8/utf8_private.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021-2023, Tomasz Sowa + * Copyright (c) 2021-2024, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -81,85 +81,6 @@ return true; - - -/* - an auxiliary function for converting from wide characters to UTF-8 - converting a wide character into one int - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ -size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) -{ - if( string_len == 0 ) - { - z = 0; - correct = false; - return 0; - } - - z = static_cast(*wide_string); - correct = true; - - if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) ) - { - if( string_len > 1 ) - { - int z2 = *(wide_string+1); - - if( is_second_surrogate_char(z2) ) - { - z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); - return 2; - } - else - { - correct = false; - return 1; - } - } - else - { - correct = false; - return 1; - } - } - else - { - correct = utf8_check_range(z); - return 1; - } -} - - - -/* - an auxiliary function for converting from wide characters to UTF-8 - converting a wide character into one int - - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ -size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) -{ -size_t min_str_len = 1; - - if( *wide_string == 0 ) - { - z = 0; - correct = false; - return 0; - } - - if( *(wide_string+1) != 0 ) - min_str_len = 2; - -return wide_to_int(wide_string, min_str_len, z, correct); -} - - - /*! an auxiliary function for converting from wide characters to UTF-8 diff --git a/src/utf8/utf8_private.h b/src/utf8/utf8_private.h index 118b2fb..66208cc 100644 --- a/src/utf8/utf8_private.h +++ b/src/utf8/utf8_private.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021-2023, Tomasz Sowa + * Copyright (c) 2021-2024, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -41,14 +41,10 @@ namespace pt { -bool utf8_check_range(int c); size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len); size_t int_to_utf8(int z, std::string & utf8, bool clear); -size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct); -bool is_surrogate_char(int c); -bool is_first_surrogate_char(int c); -bool is_second_surrogate_char(int c); -bool surrogate_pair_to_int(int c1, int c2, int & z); +size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); +size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); namespace private_namespace @@ -56,9 +52,6 @@ namespace private_namespace bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); bool utf8_to_int_add_next_octet(unsigned char uz, int & res); -size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public? 
-size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); - size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); @@ -116,144 +109,6 @@ return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode); -// declared in utf8.h, defined in utf8.cpp -size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct); - - - -template -bool utf8_to_wide_generic(const char * utf8, size_t utf8_len, int mode, function_type convert_function) -{ -int z; -size_t len; -bool correct, was_error = false; - - while( utf8_len > 0 ) - { - if( (unsigned char)*utf8 <= 0x7f ) - { - // small optimization - len = 1; - correct = true; - z = static_cast(*utf8); - } - else - { - len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero - } - - if( !correct ) - { - if( mode == 1 ) - convert_function(0xFFFD); // U+FFFD "replacement character" - - was_error = true; - } - else - { - convert_function(z); - } - - utf8 += len; - utf8_len -= len; - } - -return !was_error; -} - - - -template -bool wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function) -{ - char utf8_buffer[256]; - std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); - std::size_t utf8_sequence_max_length = 10; - std::size_t index = 0; - bool was_error = false; - - typename StreamType::const_iterator i = buffer.begin(); - - while( i != buffer.end() ) - { - if( index + utf8_sequence_max_length > buffer_len ) - { - bool write_status = write_function(utf8_buffer, index); - index = 0; - - if( !write_status ) - { - was_error = true; - break; - } - } - - int c = 0xFFFD; // U+FFFD "replacement character"; - bool seems_to_be_correct = false; - wchar_t w1 = *i; - - if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) - { - ++i; - - if( i != buffer.end() ) - { - wchar_t w2 = *i; - - if( 
surrogate_pair_to_int(w1, w2, c) ) - { - seems_to_be_correct = true; - ++i; - } - else - { - was_error = true; - } - } - else - { - was_error = true; - } - } - else - { - c = w1; - seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below - ++i; - } - - if( seems_to_be_correct || mode == 1 ) - { - size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index); - // here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough) - - if( seq_len == 0 ) - { - was_error = true; - - if( mode == 1 ) - { - seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character"; - } - } - - index += seq_len; - } - } - - if( index > 0 ) - { - if( !write_function(utf8_buffer, index) ) - { - was_error = true; - } - } - - return !was_error; -} - - - } // namespace private_namespace diff --git a/src/utf8/utf8_templates.h b/src/utf8/utf8_templates.h index f5988be..eafbc29 100644 --- a/src/utf8/utf8_templates.h +++ b/src/utf8/utf8_templates.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021-2023, Tomasz Sowa + * Copyright (c) 2021-2024, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -44,7 +44,6 @@ namespace pt { - template bool int_to_wide(int c, StreamType & res) { @@ -128,9 +127,9 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool cle if( clear ) res.clear(); - bool status = private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, [&res](int c) { + bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) { int_to_wide(c, res); - }); + }, mode); return status; } @@ -187,6 +186,137 @@ return !was_error; } +template +bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode) +{ +int z; +size_t len; +bool correct, was_error = false; + + while( utf8_len > 0 ) + { + if( (unsigned char)*utf8 <= 0x7f ) + { + // small optimization + len = 1; + correct = true; + z = static_cast(*utf8); + } + else + { + len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero + } + + if( !correct ) + { + if( mode == 1 ) + output_function(0xFFFD); // U+FFFD "replacement character" + + was_error = true; + } + else + { + output_function(z); + } + + utf8 += len; + utf8_len -= len; + } + +return !was_error; +} + + +template +bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode) +{ + char utf8_buffer[256]; + std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); + std::size_t utf8_sequence_max_length = 10; + std::size_t index = 0; + bool was_error = false; + + typename StreamType::const_iterator i = buffer.begin(); + + while( i != buffer.end() ) + { + if( index + utf8_sequence_max_length > buffer_len ) + { + bool write_status = output_function(utf8_buffer, index); + index = 0; + + if( !write_status ) + { + was_error = true; + break; + } + } + + int c = 0xFFFD; // U+FFFD "replacement character"; + bool seems_to_be_correct = false; + wchar_t w1 = *i; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) + { + ++i; + + if( i != 
buffer.end() ) + { + wchar_t w2 = *i; + + if( surrogate_pair_to_int(w1, w2, c) ) + { + seems_to_be_correct = true; + ++i; + } + else + { + was_error = true; + } + } + else + { + was_error = true; + } + } + else + { + c = w1; + seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below + ++i; + } + + if( seems_to_be_correct || mode == 1 ) + { + size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index); + // here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough) + + if( seq_len == 0 ) + { + was_error = true; + + if( mode == 1 ) + { + seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character"; + } + } + + index += seq_len; + } + } + + if( index > 0 ) + { + if( !output_function(utf8_buffer, index) ) + { + was_error = true; + } + } + + return !was_error; +} + + /* this function converts a UTF-8 stream into a wide stream or a wide string @@ -203,6 +333,34 @@ output: */ template bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) +{ + if( clear ) + res.clear(); + + return utf8_to_output_function(stream, [&](int z) { + int_to_wide(z, res); + }, mode); +} + + +/* +this function reads characters from a UTF-8 stream and calls an output_function + +input: + stream - a UTF-8 stream for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + +output: + output_function - is a function which gets one argument: int (character) + and should put the character to the output string/stream, this function should have the signature like this: + output_function(int z) + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_output_function(const Stream & stream, 
OutputFunction output_function, int mode) { size_t len; bool correct; @@ -210,11 +368,6 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i size_t index = 0; bool was_error = false; - if( clear ) - res.clear(); - - // CHECKME test me when sizeof(wchar_t) is 2 - do { len = utf8_to_int(stream, index, z, correct); @@ -224,13 +377,13 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i if( !correct ) { if( mode == 1 ) - int_to_wide(0xFFFD, res); // U+FFFD "replacement character" + output_function(0xFFFD); // U+FFFD "replacement character" was_error = true; } else { - int_to_wide(z, res); + output_function(z); } index += len; @@ -264,6 +417,15 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i if( clear_stream ) out_stream.clear(); + return utf8_to_output_function(iterator_in, iterator_end, [&](int z){ + int_to_wide(z, out_stream); + }, mode); +} + + +template +bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode) +{ int res; bool correct; bool was_error = false; @@ -274,12 +436,12 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i if( correct ) { - int_to_wide(res, out_stream); + output_function(res); } else { if( mode == 1 ) - int_to_wide(0xFFFD, out_stream); // U+FFFD "replacement character" + output_function(0xFFFD); // U+FFFD "replacement character" was_error = true; } @@ -290,7 +452,6 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i - /*! 
this function converts UTF-8 stream into a wide string @@ -508,10 +669,10 @@ bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, in if( clear ) utf8.clear(); - return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { + return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.append(utf8_buffer, buffer_len); return true; - }); + }, mode); } @@ -574,10 +735,10 @@ bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear if( clear ) utf8.clear(); - return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { + return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.write(utf8_buffer, buffer_len); return true; - }); + }, mode); } @@ -606,7 +767,7 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe buffer_ok = true; max_buffer_size -= 1; // for terminating null character - is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { + is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { std::size_t i=0; for( ; i < buffer_len ; ++i) @@ -626,7 +787,7 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe max_buffer_size -= i; *utf8 = 0; return buffer_ok; - }); + }, mode); } if( was_buffer_sufficient_large ) diff --git a/tests/Makefile.dep b/tests/Makefile.dep index 0356e4f..f60d315 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -3,38 +3,37 @@ ./convert.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h ./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./convert.o: 
../src/textstream/stream.h ../src/space/space.h -./convert.o: ../src/textstream/types.h ../src/convert/inttostr.h -./convert.o: ../src/utf8/utf8.h ../src/textstream/stream.h -./convert.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h -./convert.o: ../src/date/date.h ../src/membuffer/membuffer.h -./convert.o: ../src/textstream/types.h ../src/textstream/stream_private.h -./convert.o: ../src/convert/strtoint.h ../src/convert/text.h -./convert.o: ../src/convert/misc.h ../src/convert/double.h test.h +./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h +./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h +./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h +./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h +./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h +./convert.o: ../src/convert/text.h ../src/convert/misc.h +./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h -./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h -./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h -./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h -./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h -./csvparser.o: ../src/textstream/stream.h ../src/date/date.h -./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h -./csvparser.o: ../src/textstream/stream_private.h test.h +./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h +./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h +./csvparser.o: ../src/utf8/utf8_private.h ../src/convert/baseparser.h +./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h +./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h +./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h +./csvparser.o: test.h ./main.o: convert.h ../src/convert/convert.h 
../src/convert/inttostr.h ./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./main.o: ../src/textstream/stream.h ../src/space/space.h -./main.o: ../src/textstream/types.h ../src/convert/inttostr.h -./main.o: ../src/utf8/utf8.h ../src/textstream/stream.h -./main.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h -./main.o: ../src/date/date.h ../src/membuffer/membuffer.h -./main.o: ../src/textstream/types.h ../src/textstream/stream_private.h -./main.o: ../src/convert/strtoint.h ../src/convert/text.h -./main.o: ../src/convert/misc.h ../src/convert/double.h test.h +./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h +./main.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h +./main.o: ../src/utf8/utf8_private.h ../src/date/date.h +./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h +./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h +./main.o: ../src/convert/text.h ../src/convert/misc.h +./main.o: ../src/textstream/types.h ../src/convert/double.h test.h ./main.o: mainoptionsparser.h csvparser.h ./test.o: test.h ./mainoptionsparser.o: mainoptionsparser.h test.h ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h -./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h -./mainoptionsparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./mainoptionsparser.o: ../src/textstream/stream.h +./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h +./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./mainoptionsparser.o: ../src/utf8/utf8_templates.h ./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h ./mainoptionsparser.o: ../src/convert/inttostr.h @@ -44,4 +43,5 @@ ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./mainoptionsparser.o: ../src/textstream/stream_private.h ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h -./mainoptionsparser.o: ../src/convert/misc.h 
../src/convert/double.h +./mainoptionsparser.o: ../src/convert/misc.h ../src/textstream/types.h +./mainoptionsparser.o: ../src/convert/double.h