diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index 4cdb506..abf201a 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2022, Tomasz Sowa + * Copyright (c) 2010-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -129,24 +129,63 @@ bool surrogate_pair_to_int(int c1, int c2, int & z) /* - converts an int to a wide string -*/ -void int_to_wide(int c, std::wstring & res) + * converts an int to a wide string + * + * this method will not terminate the output string with a null character + * return how many characters have been written (0, 1 or 2) + */ +size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len) { if( sizeof(wchar_t)==2 && c>0xffff ) { - // UTF16 surrogate pairs - c -= 0x10000; - res += static_cast(((c >> 10) & 0x3FF) + 0xD800); - res += static_cast((c & 0x3FF) + 0xDC00); + if( max_buf_len > 1 ) + { + // UTF16 surrogate pairs + c -= 0x10000; + res[0] = static_cast(((c >> 10) & 0x3FF) + 0xD800); + res[1] = static_cast((c & 0x3FF) + 0xDC00); + return 2; + } } else { - res += static_cast(c); + if( max_buf_len > 0 ) + { + res[0] = static_cast(c); + return 1; + } } + + return 0; } +/* + converts an int to a wide string + + returns true if a character was inserted to the string +*/ +bool int_to_wide(int c, std::wstring & res) +{ + wchar_t buf[2]; + size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); + + if( used == 1 ) + { + res += buf[0]; + } + else + if( used == 2 ) + { + res += buf[0]; + res += buf[1]; + } + + return used > 0; +} + + + /*! this function converts one UTF-8 character into one wide-character diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index 1a479cc..fabc585 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2022, Tomasz Sowa + * Copyright (c) 2010-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -134,15 +134,28 @@ size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & /*! converting one character from int to wide stream + + returns true if a character was inserted to the stream */ template -void int_to_wide(int c, StreamType & res); +bool int_to_wide(int c, StreamType & res); /*! converting one character from int to wide string + + this method will not terminate the output string with a null character + return how many characters have been written (0, 1 or 2) */ -void int_to_wide(int c, std::wstring & res); +size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len); + + +/*! + converting one character from int to wide string + + returns true if a character was inserted to the string +*/ +bool int_to_wide(int c, std::wstring & res); /*! @@ -176,7 +189,14 @@ class TextStreamBase; // defined at the end in textstream.h template -bool utf8_to_wide(const TextStreamBase & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); +bool utf8_to_wide(const TextStreamBase & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); + +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode = 1, bool * was_buffer_sufficient_large = nullptr); + +template +bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large = nullptr, int mode = 1); + /* @@ -217,8 +237,6 @@ bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode = 1); template bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode = 1); - - bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); @@ -228,13 +246,16 @@ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, s bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1); template -void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested +bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); template bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1); template -void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used +bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); + +template +bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1); } // namespace diff --git a/src/utf8/utf8_private.cpp b/src/utf8/utf8_private.cpp index 4ee6a73..8e235bf 100644 --- a/src/utf8/utf8_private.cpp +++ b/src/utf8/utf8_private.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021, Tomasz Sowa + * Copyright (c) 2021-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -102,9 +102,9 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool z = static_cast(*wide_string); correct = true; - if( sizeof(wchar_t) == 2 && is_surrogate_char(z) ) + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) ) { - if( is_first_surrogate_char(z) && string_len>1 ) + if( string_len > 1 ) { int z2 = *(wide_string+1); @@ -116,7 +116,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool else { correct = false; - return 2; + return 1; } } else diff --git a/src/utf8/utf8_private.h b/src/utf8/utf8_private.h index c12a8c5..118b2fb 100644 --- a/src/utf8/utf8_private.h +++ b/src/utf8/utf8_private.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021-2022, Tomasz Sowa + * Copyright (c) 2021-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -163,15 +163,14 @@ return !was_error; -// FIX ME it is not using surrogate pairs from input stream -// and is not using mode parameter template -void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function) +bool wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function) { char utf8_buffer[256]; std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); std::size_t utf8_sequence_max_length = 10; std::size_t index = 0; + bool was_error = false; typename StreamType::const_iterator i = buffer.begin(); @@ -179,18 +178,78 @@ void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_fun { if( index + utf8_sequence_max_length > buffer_len ) { - write_function(utf8_buffer, index); + bool write_status = write_function(utf8_buffer, index); index = 0; + + if( !write_status ) + { + was_error = true; + break; + } } - index += int_to_utf8(*i, utf8_buffer + index, buffer_len - index); - ++i; + int c = 0xFFFD; // U+FFFD "replacement character"; + bool seems_to_be_correct = false; + wchar_t w1 = *i; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) + { + ++i; + + if( i != buffer.end() ) + { + wchar_t w2 = *i; + + if( surrogate_pair_to_int(w1, w2, c) ) + { + seems_to_be_correct = true; + ++i; + } + else + { + was_error = true; + } + } + else + { + was_error = true; + } + } + else + { + c = w1; + seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below + ++i; + } + + if( seems_to_be_correct || mode == 1 ) + { + size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index); + // here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough) + + if( seq_len == 0 ) + { + was_error = true; + + if( mode == 1 ) + { + seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character"; + } + } + + index += seq_len; + } } if( index > 0 ) { - write_function(utf8_buffer, index); + if( !write_function(utf8_buffer, index) ) + { + was_error = true; + } } + + return !was_error; } diff --git a/src/utf8/utf8_templates.h b/src/utf8/utf8_templates.h index 15d8383..f5988be 100644 --- a/src/utf8/utf8_templates.h +++ b/src/utf8/utf8_templates.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021-2022, Tomasz Sowa + * Copyright (c) 2021-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -46,19 +46,23 @@ namespace pt template -void int_to_wide(int c, StreamType & res) +bool int_to_wide(int c, StreamType & res) { - if( sizeof(wchar_t)==2 && c>0xffff ) + wchar_t buf[2]; + size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); + + if( used == 1 ) { - // UTF16 surrogate pairs - c -= 0x10000; - res << static_cast(((c >> 10) & 0x3FF) + 0xD800); - res << static_cast((c & 0x3FF) + 0xDC00); + res << buf[0]; } else + if( used == 2 ) { - res << static_cast(c); + res << buf[0]; + res << buf[1]; } + + return used > 0; } @@ -243,14 +247,14 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i /*! - this function converts one UTF-8 character into a wide stream or a wide string + this function converts UTF-8 stream into a wide stream or a wide string input: - iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) + iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) iterator_end - an end iterator output: - out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator) + out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string) this function returns false if there were some errors when converting */ @@ -287,6 +291,103 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i +/*! + this function converts UTF-8 stream into a wide string + + input: + iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) + iterator_end - an end iterator + + output: + out_buffer - an output wide string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large) +{ + int res; + bool correct; + bool was_error = true; + bool was_buffer_ok = false; + + if( max_buffer_len > 0 ) + { + max_buffer_len -= 1; // for terminating null character + was_error = false; + was_buffer_ok = true; + + while( iterator_in != iterator_end ) + { + utf8_to_int(iterator_in, iterator_end, res, correct); + + if( !correct ) + { + was_error = true; + + if( mode == 1 ) + { + res = 0xFFFD; // U+FFFD "replacement character" + correct = true; + } + } + + if( correct ) + { + size_t len = int_to_wide(res, out_buffer, max_buffer_len); + // if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand) + + if( len == 0 ) + { + was_error = true; + was_buffer_ok = false; + break; + } + else + { + out_buffer += len; + max_buffer_len -= len; + } + } + } + + *out_buffer = 0; + } + + if( was_buffer_sufficient_large ) + *was_buffer_sufficient_large = was_buffer_ok; + + return !was_error; +} + + + +/*! + this function converts UTF-8 stream into a wide string + + input: + stream - a stream for reading from + + output: + out_buffer - an output wide string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode) +{ + typename StreamType::const_iterator stream_begin = stream.begin(); + typename StreamType::const_iterator stream_end = stream.end(); + + return utf8_to_wide(stream_begin, stream_end, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large); +} + + + /*! this function converts one wide character into UTF-8 stream @@ -402,13 +503,14 @@ bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode) template -void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode) +bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode) { if( clear ) utf8.clear(); - private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ + return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.append(utf8_buffer, buffer_len); + return true; }); } @@ -466,20 +568,72 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i } -// not tested template -void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) +bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) { if( clear ) utf8.clear(); - private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ + return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.write(utf8_buffer, buffer_len); + return true; }); } +/*! + this function converts a wide stream into a utf8 string + + input: + buffer - a wide stream for reading from + + output: + utf8 - an output utf8 string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode) +{ + bool buffer_ok = false; + bool is_ok = false; + + if( max_buffer_size > 0 ) + { + buffer_ok = true; + max_buffer_size -= 1; // for terminating null character + + is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { + std::size_t i=0; + + for( ; i < buffer_len ; ++i) + { + if( i < max_buffer_size ) + { + *utf8 = utf8_buffer[i]; + utf8 += 1; + } + else + { + buffer_ok = false; + break; + } + } + + max_buffer_size -= i; + *utf8 = 0; + return buffer_ok; + }); + } + + if( was_buffer_sufficient_large ) + *was_buffer_sufficient_large = buffer_ok; + + return is_ok; +}