From fac3a7eb7189e40e396649b73a075baa43ec1f07 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Mon, 15 Mar 2021 19:34:51 +0100 Subject: [PATCH] reorganization in utf8 - utf8 auxiliary functions moved to utf8_private.h file - in utf8.h are shown only functions available for consumers - template functions has been moved to utf8_template.h (in utf8.h are only declarations) utf8_template.h is included at the end of utf8.h - functions which take std::ostream changed to template (the stream is a template argument now) --- utf8/utf8.cpp | 406 ++---------------------------------------- utf8/utf8.h | 268 ++++++---------------------- utf8/utf8_private.cpp | 286 +++++++++++++++++++++++++++++ utf8/utf8_private.h | 220 +++++++++++++++++++++++ utf8/utf8_templates.h | 271 ++++++++++++++++++++++++++++ 5 files changed, 845 insertions(+), 606 deletions(-) create mode 100644 utf8/utf8_private.cpp create mode 100644 utf8/utf8_private.h create mode 100644 utf8/utf8_templates.h diff --git a/utf8/utf8.cpp b/utf8/utf8.cpp index cfb20e5..b804e1f 100644 --- a/utf8/utf8.cpp +++ b/utf8/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2018, Tomasz Sowa + * Copyright (c) 2010-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ */ #include "utf8.h" +#include "utf8_private.h" @@ -44,48 +45,6 @@ namespace PT -/*! - an auxiliary function for converting from UTF-8 string -*/ -static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res) -{ - for(len=0 ; (uz & 0x80) != 0 ; ++len) - uz <<= 1; - - if( len == 1 ) - return false; - - res = uz; - - if( len > 0 ) - res >>= len; - - if( res == 0 ) - return false; - - if( len == 0 ) - len = 1; - -return true; -} - - - -/*! - an auxiliary function for converting from UTF-8 string -*/ -static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res) -{ - if( (uz & 0xc0) != 0x80 ) - return false; - - res <<= 6; - res |= (uz & 0x3F); - -return true; -} - - /*! @@ -126,15 +85,17 @@ size_t i, len; if( utf8_len == 0 ) return 0; - if( !UTF8ToInt_FirstOctet(utf8[0], len, res) ) + if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) ) return 1; if( utf8_len < len ) return utf8_len; for(i=1 ; i(*wide_string); - correct = true; - - if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) ) - { - if( z>=0xD800 && z<=0xDBFF && string_len>1 ) - { - int z2 = *(wide_string+1); - - if( z2>=0xDC00 && z2<=0xDFFF ) - { - z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); - return 2; - } - else - { - correct = false; - return 2; - } - } - else - { - correct = false; - return 1; - } - } - else - { - correct = UTF8_CheckRange(z); - return 1; - } -} - - - -/* - an auxiliary function for converting from wide characters to UTF-8 - converting a wide character into one int - - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ -static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct) -{ -size_t min_str_len = 1; - - if( *wide_string == 0 ) - { - z = 0; - correct = false; - return 0; - } - - if( *(wide_string+1) != 0 ) - min_str_len = 2; - -return WideToInt(wide_string, min_str_len, z, correct); -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too - - utf8_written - how many characters were saved in the utf8 string (the string doesn't have - a null terminating character) - it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read - was_utf8_buf_too_small - will be true if the utf8 buffer is too small - if this flag is true then utf8_written is equal to zero - was_error - will be true if there is an error when converting (there was an incorrect wide character) - (was_error will not be true if the utf8 buffer is too small) -*/ -static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, - size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - utf8_written = 0; - was_utf8_buf_too_small = false; - chars = WideToInt(wide_string, string_len, z, correct); - - if( correct ) - { - utf8_written = IntToUTF8(z, utf8, utf8_len); - - if( utf8_written == 0 ) - was_utf8_buf_too_small = true; - } - else - { - if( mode == 1 ) - { - utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" - - if( utf8_written == 0 ) - was_utf8_buf_too_small = true; - } - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ -static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = WideToInt(wide_string, string_len, z, correct); - - if( correct ) - correct = IntToUTF8(z, utf8, false) != 0; - - if( !correct ) - { - if( mode == 1 ) - IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ -static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = WideToInt(wide_string, z, correct); - - if( correct ) - correct = IntToUTF8(z, utf8, false) != 0; - - if( !correct ) - { - if( mode == 1 ) - IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ -static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = WideToInt(wide_string, string_len, z, correct); - - if( correct ) - correct = IntToUTF8(z, utf8) != 0; - - if( !correct ) - { - if( mode == 1 ) - IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 -*/ -static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode) -{ -size_t min_str_len = 1; - - if( *wide_string == 0 ) - return 0; - - if( *(wide_string+1) != 0 ) - min_str_len = 2; - -return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); -} - - - /*! this function converts a wide string into UTF-8 string @@ -772,7 +471,7 @@ size_t chars; while( string_len > 0 ) { - chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); wide_string += chars; string_len -= chars; } @@ -804,7 +503,7 @@ bool was_error = false; utf8.clear(); while( *wide_string ) - wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); + wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode); return !was_error; } @@ -832,83 +531,6 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear -/*! - this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - string_len - size of the string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode) -{ -bool was_error = false; -size_t chars; - - while( string_len > 0 ) - { - chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); - wide_string += chars; - string_len -= chars; - } - -return !was_error; -} - - - -/*! - this function converts a wide string into UTF-8 stream - - input: - wide_string - a null terminated wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode) -{ -bool was_error = false; - - while( *wide_string ) - wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); - -return !was_error; -} - - - -/*! - this function converts a wide string (std::wstring) into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode) -{ - return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode); -} - /*! @@ -942,7 +564,7 @@ size_t chars, utf8_saved; while( string_len > 0 ) { - chars = WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); + chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); if( was_buffer_to_small ) { @@ -1089,7 +711,7 @@ size_t len; while( *wide_string ) { len = (*(wide_string+1) == 0) ? 1 : 2; - chars = WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); + chars = private_namespace::WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); if( was_buffer_to_small ) { diff --git a/utf8/utf8.h b/utf8/utf8.h index c33a68d..2cd5b72 100644 --- a/utf8/utf8.h +++ b/utf8/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2018, Tomasz Sowa + * Copyright (c) 2010-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,7 +38,6 @@ #ifndef headerfile_picotools_utf8_utf8 #define headerfile_picotools_utf8_utf8 -#include #include #include "textstream/textstream.h" @@ -46,8 +45,6 @@ namespace PT { - - /*! UTF-8, a transformation format of ISO 10646 http://tools.ietf.org/html/rfc3629 @@ -68,6 +65,16 @@ bool UTF8_CheckRange(int c); +/* + * + * + * + * convertions from UTF-8 + * + * + * + */ + /*! converting one character from UTF-8 to an int */ @@ -85,25 +92,40 @@ bool UTF8ToWide(const char * utf8, std::wstring & res, bool cle bool UTF8ToWide(const std::string & utf8, std::wstring & res, bool clear = true, int mode = 1); bool UTF8ToWide(std::istream & utf8, std::wstring & res, bool clear = true, int mode = 1); +template +bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase & res, bool clear = true, int mode = 1); // need to be tested + +template +bool UTF8ToWide(const char * utf8, TextStreamBase & res, bool clear = true, int mode = 1); // need to be tested + +template +bool UTF8ToWide(const std::string & utf8, TextStreamBase & res, bool clear = true, int mode = 1); // need to be tested + +template +bool UTF8ToWide(std::istream & utf8, TextStreamBase & res, bool clear = true, int mode = 1); // need to be tested + + -/*! - converting UTF-8 string to a WTextStream stream - (need to be tested) -*/ /* - implemented as templates below -bool UTF8ToWide(const char * utf8, size_t utf8_len, WTextStream & res, bool clear = true, int mode = 1); -bool UTF8ToWide(const char * utf8, WTextStream & res, bool clear = true, int mode = 1); -bool UTF8ToWide(const std::string & utf8, WTextStream & res, bool clear = true, int mode = 1); -bool UTF8ToWide(std::istream & utf8, WTextStream & res, bool clear = true, int mode = 1); -*/ + * + * + * + * convertions to UTF-8 + * + * + * + */ + /*! converting one int character to UTF-8 */ size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len); -size_t IntToUTF8(int z, std::string & utf8, bool clear = true ); -size_t IntToUTF8(int z, std::ostream & utf8); +size_t IntToUTF8(int z, std::string & utf8, bool clear = true); + +template +size_t IntToUTF8(int z, StreamType & utf8); + /*! @@ -113,216 +135,32 @@ bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & ut bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1); bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1); -// implemented as a template below -//void WideToUTF8(PT::WTextStream & buffer, std::string & utf8, bool clear = true, int mode = 1);// not tested +template +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode = 1); + +template +bool WideToUTF8(const wchar_t * wide_string, StreamType & utf8, int mode = 1); + +template +bool WideToUTF8(const std::wstring & wide_string, StreamType & utf8, int mode = 1); -bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode = 1); -bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode = 1); -bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode = 1); -// implemented as a template below -//void WideToUTF8(PT::WTextStream & buffer, std::ostream & utf8, int mode = 1);// not tested bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); -// implement void WideToUTF8(PT::WTextStream & buffer, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); +// implement template bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode = 1); bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode = 1); bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1); -// implement void WideToUTF8(PT::WTextStream & buffer, char * utf8, size_t utf8_len, int mode = 1); - - - -namespace private_namespace -{ -template -bool UTF8ToWideGeneric(const char * utf8, size_t utf8_len, int mode, function_type convert_function) -{ -int z; -size_t len; -bool correct, was_error = false; - - while( utf8_len > 0 ) - { - if( (unsigned char)*utf8 <= 0x7f ) - { - // small optimization - len = 1; - correct = true; - z = static_cast(*utf8); - } - else - { - len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero - } - - if( !correct ) - { - if( mode == 1 ) - convert_function(0xFFFD); // U+FFFD "replacement character" - - was_error = true; - } - else - { - convert_function(z); - } - - utf8 += len; - utf8_len -= len; - } - -return !was_error; -} - - - -template -void IntToWide(int c, TextStreamBase & res) -{ - if( sizeof(wchar_t)==2 && c>0xffff ) - { - // UTF16 surrogate pairs - c -= 0x10000; - res << static_cast(((c >> 10) & 0x3FF) + 0xD800); - res << static_cast((c & 0x3FF) + 0xDC00); - } - else - { - res << static_cast(c); - } -} - - -// not tested -// FIX ME it is not using surrogate pairs from input stream -// and mode parameter -template -void WideToUTF8Generic(TextStreamBase & buffer, int mode, function_type write_function) -{ - char utf8_buffer[256]; - std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); - std::size_t utf8_sequence_max_length = 10; - std::size_t index = 0; - - typename TextStreamBase::const_iterator i = buffer.begin(); - - while( i != buffer.end() ) - { - if( index + utf8_sequence_max_length > buffer_len ) - { - write_function(utf8_buffer, index); - index = 0; - } - - index += PT::IntToUTF8(*i, utf8_buffer + index, buffer_len - index); - ++i; - } - - if( index > 0 ) - { - write_function(utf8_buffer, index); - } -} - - -} // namespace - - - - -// need to be tested -template -bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase & res, bool clear = true, int mode = 1) -{ - if( clear ) - res.clear(); - - bool status = private_namespace::UTF8ToWideGeneric(utf8, utf8_len, mode, [&res](int c) { - private_namespace::IntToWide(c, res); - }); - - return status; -} - - -// need to be tested -template -bool UTF8ToWide(const char * utf8, TextStreamBase & res, bool clear = true, int mode = 1) -{ -size_t utf8_len = 0; - - while( utf8[utf8_len] != 0 ) - utf8_len += 1; - -return UTF8ToWide(utf8, utf8_len, res, clear, mode); -} - - -// need to be tested -template -bool UTF8ToWide(const std::string & utf8, TextStreamBase & res, bool clear = true, int mode = 1) -{ - return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode); -} - - -// need to be tested -template -bool UTF8ToWide(std::istream & utf8, TextStreamBase & res, bool clear = true, int mode = 1) -{ -int z; -bool correct, was_error = false; - - if( clear ) - res.clear(); - - while( UTF8ToInt(utf8, z, correct) > 0 ) - { - if( !correct ) - { - if( mode == 1 ) - res << 0xFFFD; // U+FFFD "replacement character" - - was_error = true; - } - else - { - private_namespace::IntToWide(z, res); - } - } - -return !was_error; -} - - - - -// not tested -template -void WideToUTF8(TextStreamBase & buffer, std::string & utf8, bool clear = true, int mode = 1) -{ - if( clear ) - utf8.clear(); - - private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ - utf8.append(utf8_buffer, buffer_len); - }); -} - - -// not tested -template -void WideToUTF8(TextStreamBase & buffer, std::ostream & utf8, int mode = 1) -{ - private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ - utf8.write(utf8_buffer, buffer_len); - }); -} +// implement template +template +void WideToUTF8(TextStreamBase & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested +template +void WideToUTF8(TextStreamBase & buffer, std::ostream & utf8, int mode = 1); // not tested @@ -330,5 +168,7 @@ void WideToUTF8(TextStreamBase & buffer, } // namespace +#include "utf8/utf8_templates.h" + #endif diff --git a/utf8/utf8_private.cpp b/utf8/utf8_private.cpp new file mode 100644 index 0000000..cba2dfe --- /dev/null +++ b/utf8/utf8_private.cpp @@ -0,0 +1,286 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "utf8_private.h" + + +namespace PT +{ + +namespace private_namespace +{ + +/*! + an auxiliary function for converting from UTF-8 string +*/ +bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res) +{ + for(len=0 ; (uz & 0x80) != 0 ; ++len) + uz <<= 1; + + if( len == 1 ) + return false; + + res = uz; + + if( len > 0 ) + res >>= len; + + if( res == 0 ) + return false; + + if( len == 0 ) + len = 1; + +return true; +} + + + +/*! + an auxiliary function for converting from UTF-8 string +*/ +bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res) +{ + if( (uz & 0xc0) != 0x80 ) + return false; + + res <<= 6; + res |= (uz & 0x3F); + +return true; +} + + + + + +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) +{ + if( string_len == 0 ) + { + z = 0; + correct = false; + return 0; + } + + z = static_cast(*wide_string); + correct = true; + + if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) ) + { + if( z>=0xD800 && z<=0xDBFF && string_len>1 ) + { + int z2 = *(wide_string+1); + + if( z2>=0xDC00 && z2<=0xDFFF ) + { + z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); + return 2; + } + else + { + correct = false; + return 2; + } + } + else + { + correct = false; + return 1; + } + } + else + { + correct = UTF8_CheckRange(z); + return 1; + } +} + + + +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + { + z = 0; + correct = false; + return 0; + } + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideToInt(wide_string, min_str_len, z, correct); +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too + + utf8_written - how many characters were saved in the utf8 string (the string doesn't have + a null terminating character) + it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read + was_utf8_buf_too_small - will be true if the utf8 buffer is too small + if this flag is true then utf8_written is equal to zero + was_error - will be true if there is an error when converting (there was an incorrect wide character) + (was_error will not be true if the utf8 buffer is too small) +*/ +size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, + size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + utf8_written = 0; + was_utf8_buf_too_small = false; + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + { + utf8_written = IntToUTF8(z, utf8, utf8_len); + + if( utf8_written == 0 ) + was_utf8_buf_too_small = true; + } + else + { + if( mode == 1 ) + { + utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" + + if( utf8_written == 0 ) + was_utf8_buf_too_small = true; + } + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + + + +} // namespace private_namespace + +} // namespace PT + + + diff --git a/utf8/utf8_private.h b/utf8/utf8_private.h new file mode 100644 index 0000000..d28e525 --- /dev/null +++ b/utf8/utf8_private.h @@ -0,0 +1,220 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef headerfile_picotools_utf8_utf8_private +#define headerfile_picotools_utf8_utf8_private + +#include "textstream/textstream.h" + + +namespace PT +{ + +bool UTF8_CheckRange(int c); +size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len); +size_t IntToUTF8(int z, std::string & utf8, bool clear); +size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct); + + +namespace private_namespace +{ +bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res); +bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res); + +size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); +size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct); + +size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, + size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); + +size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode); + +size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode); + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +template +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + +/*! + an auxiliary function for converting from wide characters to UTF-8 +*/ +template +static size_t WideOneToUTF8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode) +{ + size_t min_str_len = 1; + + if( *wide_string == 0 ) + return 0; + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); +} + + + +// declared in utf8.h, defined in utf8.cpp +size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct); + + + +template +bool UTF8ToWideGeneric(const char * utf8, size_t utf8_len, int mode, function_type convert_function) +{ +int z; +size_t len; +bool correct, was_error = false; + + while( utf8_len > 0 ) + { + if( (unsigned char)*utf8 <= 0x7f ) + { + // small optimization + len = 1; + correct = true; + z = static_cast(*utf8); + } + else + { + len = PT::UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero + } + + if( !correct ) + { + if( mode == 1 ) + convert_function(0xFFFD); // U+FFFD "replacement character" + + was_error = true; + } + else + { + convert_function(z); + } + + utf8 += len; + utf8_len -= len; + } + +return !was_error; +} + + + +template +void IntToWide(int c, TextStreamBase & res) +{ + if( sizeof(wchar_t)==2 && c>0xffff ) + { + // UTF16 surrogate pairs + c -= 0x10000; + res << static_cast(((c >> 10) & 0x3FF) + 0xD800); + res << static_cast((c & 0x3FF) + 0xDC00); + } + else + { + res << static_cast(c); + } +} + + +// not tested +// FIX ME it is not using surrogate pairs from input stream +// and mode parameter +template +void WideToUTF8Generic(TextStreamBase & buffer, int mode, function_type write_function) +{ + char utf8_buffer[256]; + std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); + std::size_t utf8_sequence_max_length = 10; + std::size_t index = 0; + + typename TextStreamBase::const_iterator i = buffer.begin(); + + while( i != buffer.end() ) + { + if( index + utf8_sequence_max_length > buffer_len ) + { + write_function(utf8_buffer, index); + index = 0; + } + + index += IntToUTF8(*i, utf8_buffer + index, buffer_len - index); + ++i; + } + + if( index > 0 ) + { + write_function(utf8_buffer, index); + } +} + + + + +} // namespace private_namespace + +} // namespace PT + +#endif diff --git a/utf8/utf8_templates.h b/utf8/utf8_templates.h new file mode 100644 index 0000000..bbf9ceb --- /dev/null +++ b/utf8/utf8_templates.h @@ -0,0 +1,271 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef headerfile_picotools_utf8_utf8_templates +#define headerfile_picotools_utf8_utf8_templates + +// this file is included at the end of utf8.h + +#include "utf8_private.h" + + +namespace PT +{ + + +/*! + converting UTF-8 string to a TextStreamBase stream + (need to be tested) +*/ +// need to be tested +template +bool UTF8ToWide(const char * utf8, size_t utf8_len, TextStreamBase & res, bool clear, int mode) +{ + if( clear ) + res.clear(); + + bool status = private_namespace::UTF8ToWideGeneric(utf8, utf8_len, mode, [&res](int c) { + private_namespace::IntToWide(c, res); + }); + + return status; +} + + + + +template +bool UTF8ToWide(const char * utf8, TextStreamBase & res, bool clear, int mode) +{ +size_t utf8_len = 0; + + while( utf8[utf8_len] != 0 ) + utf8_len += 1; + +return UTF8ToWide(utf8, utf8_len, res, clear, mode); +} + + + +template +bool UTF8ToWide(const std::string & utf8, TextStreamBase & res, bool clear, int mode) +{ + return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode); +} + + + +// need to be tested +template +bool UTF8ToWide(std::istream & utf8, TextStreamBase & res, bool clear, int mode) +{ +int z; +bool correct, was_error = false; + + if( clear ) + res.clear(); + + while( UTF8ToInt(utf8, z, correct) > 0 ) + { + if( !correct ) + { + if( mode == 1 ) + res << 0xFFFD; // U+FFFD "replacement character" + + was_error = true; + } + else + { + private_namespace::IntToWide(z, res); + } + } + +return !was_error; +} + + + + + + + + +/*! + this function converts one wide character into UTF-8 stream + + input: + z - wide character + + output: + utf8 - a UTF-8 stream for the output sequence + + the function returns how many characters have been written to the utf8 stream, + zero means that 'z' is an incorrect unicode character +*/ +template +size_t IntToUTF8(int z, StreamType & utf8) +{ + char buf[10]; + + size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char)); + + if( len > 0 ) + utf8.write(buf, len); + + return len; +} + + + + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a wide string for converting + string_len - size of the string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode) +{ +bool was_error = false; +size_t chars; + + while( string_len > 0 ) + { + chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } + +return !was_error; +} + + + + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool WideToUTF8(const wchar_t * wide_string, StreamType & utf8, int mode) +{ +bool was_error = false; + + while( *wide_string ) + wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode); + +return !was_error; +} + + + +/*! + this function converts a wide string (std::wstring) into UTF-8 stream + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool WideToUTF8(const std::wstring & wide_string, StreamType & utf8, int mode) +{ + return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode); +} + + + + +template +void WideToUTF8(TextStreamBase & buffer, std::string & utf8, bool clear, int mode) +{ + if( clear ) + utf8.clear(); + + private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ + utf8.append(utf8_buffer, buffer_len); + }); +} + + +// not tested +template +void WideToUTF8(TextStreamBase & buffer, std::ostream & utf8, int mode) +{ + private_namespace::WideToUTF8Generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ + utf8.write(utf8_buffer, buffer_len); + }); +} + + + + + +} // namespace PT + +#endif + + +