/*
 * This file is a part of PikoTools
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa
 */

/*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifndef headerfile_picotools_utf8_utf8_private
#define headerfile_picotools_utf8_utf8_private

#include <string>


namespace pt
{

bool utf8_check_range(int c);
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
size_t int_to_utf8(int z, std::string & utf8, bool clear);
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct);

bool is_surrogate_char(int c);
bool is_first_surrogate_char(int c);
bool is_second_surrogate_char(int c);
bool surrogate_pair_to_int(int c1, int c2, int & z);


namespace private_namespace
{

bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
bool utf8_to_int_add_next_octet(unsigned char uz, int & res);

size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);

size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
                        size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);

size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode);
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode);


/*!
    an auxiliary function for converting from wide characters to UTF-8

    One code point is read from wide_string with wide_to_int() and written
    to the utf8 stream with int_to_utf8(z, utf8) (a two-argument overload
    that is not declared in this header -- presumably provided elsewhere
    for stream-like types).

    If reading fails, or int_to_utf8() returns zero, then:
     - when mode is 1 a U+FFFD "replacement character" is written instead,
     - was_error is set to true (it is never reset to false here, so the
       caller is expected to initialise it).

    returns how many wide characters were used
    if string_len is greater than 0 then the return value is always greater than zero too
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode)
{
    int z;
    bool correct;

    const size_t chars = wide_to_int(wide_string, string_len, z, correct);

    if( correct )
        correct = int_to_utf8(z, utf8) != 0;

    if( !correct )
    {
        if( mode == 1 )
            int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"

        was_error = true;
    }

    return chars;
}


/*!
    an auxiliary function for converting from wide characters to UTF-8

    Variant for a null-terminated wide_string: returns 0 immediately when
    the first character is the terminator, otherwise forwards to the
    length-taking overload with a length of 2 when a second non-zero
    character follows, or 1 when it does not.
*/
template<typename StreamType>
static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode)
{
    if( *wide_string == 0 )
        return 0;

    size_t min_str_len = 1;

    if( *(wide_string+1) != 0 )
        min_str_len = 2;

    return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode);
}


// declared in utf8.h, defined in utf8.cpp
// NOTE(review): this declaration sits inside private_namespace, so it names
// pt::private_namespace::utf8_to_int and not the pt::utf8_to_int declared at
// the top of this header; the code below calls the qualified pt::utf8_to_int.
// Confirm whether this redeclaration is still needed.
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct);


/*!
    an auxiliary function for converting from UTF-8 to code points

    Walks utf8_len bytes starting at utf8 and calls convert_function(z)
    for each decoded code point. Bytes not greater than 0x7f are passed
    through directly, everything else is decoded with pt::utf8_to_int().

    When a sequence is reported as incorrect:
     - if mode is 1 then convert_function(0xFFFD) is called
       (U+FFFD "replacement character"), otherwise nothing is emitted,
     - the error is remembered.

    returns true if no incorrect sequence was found
*/
template<typename function_type>
bool utf8_to_wide_generic(const char * utf8, size_t utf8_len, int mode, function_type convert_function)
{
    int z;
    size_t len;
    bool correct, was_error = false;

    while( utf8_len > 0 )
    {
        if( static_cast<unsigned char>(*utf8) <= 0x7f )
        {
            // small optimization
            len = 1;
            correct = true;
            z = static_cast<int>(*utf8);
        }
        else
        {
            len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
        }

        if( !correct )
        {
            if( mode == 1 )
                convert_function(0xFFFD); // U+FFFD "replacement character"

            was_error = true;
        }
        else
        {
            convert_function(z);
        }

        utf8 += len;
        utf8_len -= len;
    }

    return !was_error;
}


/*!
    an auxiliary function for converting a whole container of characters to UTF-8

    Every element of buffer is encoded with int_to_utf8() into a local
    256 byte array; the array is handed to write_function(pointer, length)
    whenever fewer than utf8_sequence_max_length bytes may remain free,
    and once more at the end if anything is left in it.

    FIX ME it is not using surrogate pairs from input stream
    and is not using mode parameter
*/
template<typename StreamType, typename function_type>
void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
{
    char utf8_buffer[256];
    std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
    std::size_t utf8_sequence_max_length = 10;
    std::size_t index = 0;

    typename StreamType::const_iterator i = buffer.begin();

    while( i != buffer.end() )
    {
        // flush first so that the next sequence always has room in the array
        if( index + utf8_sequence_max_length > buffer_len )
        {
            write_function(utf8_buffer, index);
            index = 0;
        }

        index += int_to_utf8(*i, utf8_buffer + index, buffer_len - index);
        ++i;
    }

    if( index > 0 )
    {
        write_function(utf8_buffer, index);
    }
}


} // namespace private_namespace

} // namespace pt

#endif