/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef headerfile_picotools_utf8_utf8_private #define headerfile_picotools_utf8_utf8_private #include "textstream/textstream.h" namespace pt { bool UTF8_CheckRange(int c); size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len); size_t IntToUTF8(int z, std::string & utf8, bool clear); size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct); namespace private_namespace { bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res); bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res); size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct); size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode); size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode); /*! an auxiliary function for converting from wide characters to UTF-8 returns how many wide characters were used if string_len is greater than 0 then the return value is always greater than zero too */ template static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode) { int z; bool correct; size_t chars; chars = WideToInt(wide_string, string_len, z, correct); if( correct ) correct = IntToUTF8(z, utf8) != 0; if( !correct ) { if( mode == 1 ) IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character" was_error = true; } return chars; } /*! an auxiliary function for converting from wide characters to UTF-8 */ template static size_t WideOneToUTF8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode) { size_t min_str_len = 1; if( *wide_string == 0 ) return 0; if( *(wide_string+1) != 0 ) min_str_len = 2; return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); } // declared in utf8.h, defined in utf8.cpp size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct); template bool UTF8ToWideGeneric(const char * utf8, size_t utf8_len, int mode, function_type convert_function) { int z; size_t len; bool correct, was_error = false; while( utf8_len > 0 ) { if( (unsigned char)*utf8 <= 0x7f ) { // small optimization len = 1; correct = true; z = static_cast(*utf8); } else { len = pt::UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero } if( !correct ) { if( mode == 1 ) convert_function(0xFFFD); // U+FFFD "replacement character" was_error = true; } else { convert_function(z); } utf8 += len; utf8_len -= len; } return !was_error; } template void IntToWide(int c, StreamType & res) { if( sizeof(wchar_t)==2 && c>0xffff ) { // UTF16 surrogate pairs c -= 0x10000; res << static_cast(((c >> 10) & 0x3FF) + 0xD800); res << static_cast((c & 0x3FF) + 0xDC00); } else { res << static_cast(c); } } // not tested // FIX ME it is not using surrogate pairs from input stream // and mode parameter template void WideToUTF8Generic(TextStreamBase & buffer, int mode, function_type write_function) { char utf8_buffer[256]; std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); std::size_t utf8_sequence_max_length = 10; std::size_t index = 0; typename TextStreamBase::const_iterator i = buffer.begin(); while( i != buffer.end() ) { if( index + utf8_sequence_max_length > buffer_len ) { write_function(utf8_buffer, index); index = 0; } index += IntToUTF8(*i, utf8_buffer + index, buffer_len - index); ++i; } if( index > 0 ) { write_function(utf8_buffer, index); } } } // namespace private_namespace } // namespace pt #endif