/* * This file is a part of PikoTools * and is distributed under the 2-Clause BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2010-2024, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */ #include #include "utf8.h" namespace pt { /* * NOTE(review): this copy of the file looks damaged, confirm everything below against the upstream file: * - the first #include has no header name and every static_cast has no target type * (presumably the text in angle brackets was lost), * - the body of utf8_to_int(const char *, size_t, int &, bool &) and the body of * int_to_utf8(int, std::string &, bool) break off inside a for loop header and the text continues * in the middle of other functions, * - the file is collapsed onto a few physical lines so each line comment runs to the end of its physical line */ /* * return true if 'c' is a correct unicode character * (a value from 0 to 0x10FFFF which is not a surrogate 0xD800 - 0xDFFF) */ bool is_correct_unicode_char(int c) { return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF); } /* * return true if 'c' is a correct unicode character * * this method is used when reading from an utf8 string * how_many_bytes - means how many bytes from the utf8 string were read * * the value of 'c' must be in the range which belongs to how_many_bytes: * 1: 0x0000 - 0x007f * 2: 0x0080 - 0x07ff * 3: 0x0800 - 0xffff (without surrogates 0xD800 - 0xDFFF) * 4: 0x10000 - 0x10FFFF * in other cases (e.g. a small value which was read from too many bytes) false is returned */ bool is_correct_unicode_char(int c, int how_many_bytes) { if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 ) { return true; } if( c >= 0x0080 && c <= 0x07ff && how_many_bytes == 2 ) { return true; } if( c >= 0x0800 && c < 0xD800 && how_many_bytes == 3) { return true; } if( c > 0xDFFF && c <= 0xffff && how_many_bytes == 3) { return true; } if( c >= 0x10000 && c <= 0x10FFFF && how_many_bytes == 4 ) { return true; } return false; } /* * return true if 'c' is a surrogate (0xD800 - 0xDFFF) */ bool is_surrogate_char(int c) { return (c>=0xD800 && c<=0xDFFF); } /* * return true if 'c' is a first surrogate from a surrogate pair (0xD800 - 0xDBFF) */ bool is_first_surrogate_char(int c) { return (c>=0xD800 && c<=0xDBFF); } /* * return true if 'c' is a second surrogate from a surrogate pair (0xDC00 - 0xDFFF) */ bool is_second_surrogate_char(int c) { return (c>=0xDC00 && c<=0xDFFF); } /* * convert a surrogate pair to one character * * input: * c1 - the first surrogate * c2 - the second surrogate * * output: * z - the output character, it is set to 0xFFFD if the pair is not correct * * return true if c1 is a first surrogate and c2 is a second surrogate */ bool surrogate_pair_to_int(int c1, int c2, int & z) { z = 0xFFFD; // U+FFFD "replacement character"; if( is_first_surrogate_char(c1) ) { if( is_second_surrogate_char(c2) ) { z = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF)); return true; } } return false; } /* * convert one wide (or two wide) characters to an int * * output: * z - the output character (zero if string_len is zero) * correct - true if a correct character was read, it is false when string_len is zero, * when wchar_t has two bytes and a first surrogate is not followed by a second surrogate * or when the value is not a correct unicode character * * return how many wide characters were used * if string_len is greater than 0 then the return value is always greater than zero too */ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) { if( string_len == 0 ) { z = 0; correct = false; return 0; } z = static_cast(*wide_string); correct = true; if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) ) { if( string_len > 1 ) { int z2 = *(wide_string+1); if( is_second_surrogate_char(z2) ) { z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); return 2; } else { correct = false; return 1; } } else { correct = false; return 1; } } else { correct = 
is_correct_unicode_char(z); return 1; } } /* * an auxiliary function for converting from wide characters to UTF-8 * converting a wide character into one int * wide_string should be a null terminated string (the function looks for the terminating zero) * return how many wide characters were used * if wide_string has at least one character then the return value is always greater than zero too */ size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) { size_t min_str_len = 1; if( *wide_string == 0 ) { z = 0; correct = false; return 0; } if( *(wide_string+1) != 0 ) min_str_len = 2; return wide_to_int(wide_string, min_str_len, z, correct); } /* * convert an int to a wide string * * the value of c is not validated here * if wchar_t has two bytes and c is greater than 0xffff then two wide characters are written (a surrogate pair) * * this method will not terminate the output string with a null character * return how many characters have been written (0, 1 or 2), * zero means that the buffer (max_buf_len) is too small */ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len) { if( sizeof(wchar_t)==2 && c>0xffff ) { if( max_buf_len > 1 ) { // UTF16 surrogate pairs c -= 0x10000; res[0] = static_cast(((c >> 10) & 0x3FF) + 0xD800); res[1] = static_cast((c & 0x3FF) + 0xDC00); return 2; } } else { if( max_buf_len > 0 ) { res[0] = static_cast(c); return 1; } } return 0; } /* * convert an int to a wide string * the character (one or two wide characters) is appended at the end of res * * return true if a character was inserted to the string */ bool int_to_wide(int c, std::wstring & res) { wchar_t buf[2]; size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); if( used == 1 ) { res += buf[0]; } else if( used == 2 ) { res += buf[0]; res += buf[1]; } return used > 0; } /* * convert one character into a stream * stream can be an utf8 or a wide stream * * return true if c was a correct unicode character * and has been put to the stream, * false is returned also when the stream is neither a char stream nor a wchar stream */ bool int_to_stream(int c, pt::Stream & stream) { if( stream.is_char_stream() ) { return int_to_utf8(c, stream) > 0; } else if( stream.is_wchar_stream() ) { return int_to_wide(c, stream); } return false; } /* * convert one UTF-8 character into one wide-character * * input: * utf8 - an input UTF-8 string * utf8_len - size of the input string, * the string should be at least 4 bytes length for 
correctly * recognized the utf-8 sequence * * output: * res - an output character * correct - true if it is a correct character * * the function returns how many characters have been used from the input string * (returns zero only if utf8_len is zero) * even if there are errors the functions returns a different from zero value */ /* * NOTE(review): the for loop header below breaks off after 'i' and the following code uses * 'mode', 'z' and 'was_error' which are not declared in this function - see the note at the top of the namespace */ size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct) { size_t i, len; res = 0; correct = false; if( utf8_len == 0 ) return 0; if( !private_namespace::utf8_to_int_first_octet(utf8[0], len, res) ) return 1; if( utf8_len < len ) return utf8_len; for(i=1 ; i 0 ) { if( !correct ) { if( mode == 1 ) res += 0xFFFD; // U+FFFD "replacement character" was_error = true; } else { int_to_wide(z, res); } } return !was_error; } /* * convert one wide character into an UTF-8 sequence * * input: * z - wide character * * output: * utf8 - a buffer for the output sequence (a terminating null character is not written) * utf8_max_len - the size of the buffer * * the function returns how many characters have been written to the utf8, * zero means the utf8 buffer is too small or 'z' is an incorrect unicode character */ size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len) { char buf[10]; int i = 0; int mask = 0x3f; // 6 first bits set if( utf8_max_len==0 || !is_correct_unicode_char(z) ) return 0; if( z <= 0x7f ) { utf8[0] = static_cast(z); return 1; } do { buf[i] = 0x80 | (z & 0x3f); i += 1; z >>= 6; mask >>= 1; } while( (z & (~mask)) != 0 ); unsigned int first = -1; first <<= (7 - i); first |= (z & mask); if( size_t(i+1) > utf8_max_len ) return 0; utf8[0] = static_cast(first); int a = 1; for(--i; i>=0 ; --i, ++a) utf8[a] = buf[i]; return a; } /* * convert one wide character into an UTF-8 string * * input: * z - wide character * clear - if true then the utf8 string is cleared at the beginning * * output: * utf8 - a UTF-8 string for the output sequence (the string is cleared only if 'clear' is true) * * the function returns how many characters have been written to the utf8 string, * zero means that 'z' is an incorrect unicode character */ size_t int_to_utf8(int z, 
std::string & utf8, bool clear) { char buf[10]; if( clear ) utf8.clear(); size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char)); size_t i; /* * NOTE(review): the for loop header below breaks off after 'i' and the following code belongs * to a different function - see the note at the top of the namespace */ for(i=0 ; i 0 ) { chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode); wide_string += chars; string_len -= chars; } return !was_error; } /* * convert a wide string into an UTF-8 string * * input: * wide_string - a null terminated wide string for converting * clear - if true then the utf8 string is cleared at the beginning * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a UTF-8 string for the output sequence (the string is cleared only if 'clear' is true) * * this function returns false if there were some errors when converting */ bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode) { bool was_error = false; if( clear ) utf8.clear(); while( *wide_string ) wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode); return !was_error; } /* * convert a wide string (std::wstring) into an UTF-8 string * * input: * wide_string - a wide string for converting * clear - if true then the utf8 string is cleared at the beginning * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a UTF-8 string for the output sequence (the string is cleared only if 'clear' is true) * * this function returns false if there were some errors when converting */ bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, clear, mode); } /* * convert a wide string into an UTF-8 stream * * input: * wide_string - a wide string for converting * string_len - length of the wide string * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer 
for the UTF-8 stream * utf8_len - the size of the buffer * utf8_written - how many bytes have been written to the buffer * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is not null terminated * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too small the function breaks immediately */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) { bool was_error = false; bool was_buffer_to_small; size_t chars, utf8_saved; utf8_written = 0; while( string_len > 0 ) { chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); if( was_buffer_to_small ) { /* * if the buffer was too small break immediately * and set the was_error flag */ was_error = true; break; } wide_string += chars; string_len -= chars; utf8 += utf8_saved; utf8_len -= utf8_saved; utf8_written += utf8_saved; } return !was_error; } /* * convert a wide string (std::wstring) into an UTF-8 stream * * input: * wide_string - a wide string for converting * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer for the UTF-8 stream * utf8_len - the size of the buffer * utf8_written - how many bytes have been written to the buffer * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is not null terminated * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too small the function breaks immediately */ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t 
utf8_len, size_t & utf8_written, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode); } /* * convert a wide string into an UTF-8 stream * * input: * wide_string - a wide string for converting * string_len - length of the wide string * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer for the UTF-8 stream * utf8_len - the size of the buffer (one byte is reserved for the terminating null character) * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is null terminated (even if there were errors during converting) * if utf8_len is zero the function returns false and nothing is written to the buffer * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too small the function breaks immediately * (in both cases the utf8 buffer is null terminated) */ bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode) { size_t utf8_saved; bool res; if( utf8_len == 0 ) return false; res = wide_to_utf8(wide_string, string_len, utf8, utf8_len - 1, utf8_saved, mode); utf8[utf8_saved] = 0; return res; } /* * convert a wide string (std::wstring) into an UTF-8 stream * * input: * wide_string - a wide string for converting * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer for the UTF-8 stream * utf8_len - the size of the buffer * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is null terminated (even if there were errors during converting) * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too 
small the function breaks immediately * (in both cases the utf8 buffer is null terminated) */ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode); } /* * convert a wide string into an UTF-8 stream * * input: * wide_string - a null terminated wide string for converting * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer for the UTF-8 stream * utf8_len - the size of the buffer * utf8_written - how many bytes have been written to the buffer * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is not null terminated * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too small the function breaks immediately */ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) { bool was_error = false; bool was_buffer_to_small; size_t chars, utf8_saved; size_t len; utf8_written = 0; while( *wide_string ) { len = (*(wide_string+1) == 0) ? 
1 : 2; chars = private_namespace::wide_one_to_utf8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); if( was_buffer_to_small ) { /* * if the buffer was too small break immediately * and set the was_error flag */ was_error = true; break; } wide_string += chars; utf8 += utf8_saved; utf8_len -= utf8_saved; utf8_written += utf8_saved; } return !was_error; } /* * convert a wide string into an UTF-8 stream * * input: * wide_string - a null terminated wide string for converting * mode - what to do with errors when converting * 0: skip an invalid character * 1: put U+FFFD "replacement character" instead of the invalid character (default) * * output: * utf8 - a buffer for the UTF-8 stream * utf8_len - the size of the buffer (one byte is reserved for the terminating null character) * * this function returns false if there were some errors when converting or the output buffer was too small, * the output string is null terminated (even if there were errors during converting) * if utf8_len is zero the function returns false and nothing is written to the buffer * * if there is an error when converting (there is an incorrect character in the wide string) the function * will continue converting but if the buffer is too small the function breaks immediately * (in both cases the utf8 buffer is null terminated) */ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode) { size_t utf8_saved; bool res; if( utf8_len == 0 ) return false; res = wide_to_utf8(wide_string, utf8, utf8_len - 1, utf8_saved, mode); utf8[utf8_saved] = 0; return res; } namespace private_namespace { /* * an auxiliary function for converting from UTF-8 string * * read the first octet of an UTF-8 sequence: * uz - the first octet * len - the number of leading one bits from uz, if it is zero then len is changed to one * (when true is returned this is the length of the whole sequence: 1, 2, 3 or 4) * res - the value of the remaining bits from the first octet * * return false if uz has exactly one leading one bit or more than four leading one bits, * in such a case res is not changed */ bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res) { for(len=0 ; (uz & 0x80) != 0 ; ++len) uz <<= 1; if( len == 1 || len > 4 ) return false; res = uz; if( len > 0 ) res >>= len; if( len == 0 ) len = 1; return true; } /* * an auxiliary function for converting from UTF-8 string * * add a next octet of an UTF-8 sequence to res: * res is shifted left by six bits and the six lowest bits from uz are put there * * return false (res is not changed) if the two highest bits of uz are different from 10 */ bool utf8_to_int_add_next_octet(unsigned char uz, int & res) { if( (uz & 0xc0) != 0x80 ) return false; res <<= 6; res |= (uz & 0x3F); return true; } /* * 
an auxiliary function for converting from wide characters to UTF-8 * * return how many wide characters were used * if string_len is greater than 0 then the return value is always greater than zero too * * utf8_written - how many characters were saved in the utf8 string (the string doesn't have * a null terminating character) * it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read * was_utf8_buf_too_small - will be true if the utf8 buffer is too small * if this flag is true then utf8_written is equal to zero * was_error - will be true if there is an error when converting (there was an incorrect wide character) * (was_error will not be true if the utf8 buffer is too small) */ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) { int z; bool correct; size_t chars; utf8_written = 0; was_utf8_buf_too_small = false; chars = wide_to_int(wide_string, string_len, z, correct); if( correct ) { utf8_written = int_to_utf8(z, utf8, utf8_len); if( utf8_written == 0 ) was_utf8_buf_too_small = true; } else { if( mode == 1 ) { utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" if( utf8_written == 0 ) was_utf8_buf_too_small = true; } was_error = true; } return chars; } /* * an auxiliary function for converting from wide characters to UTF-8 * * return how many wide characters were used * if string_len is greater than 0 then the return value is always greater than zero too */ size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) { int z; bool correct; size_t chars; chars = wide_to_int(wide_string, string_len, z, correct); if( correct ) correct = int_to_utf8(z, utf8, false) != 0; if( !correct ) { if( mode == 1 ) int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" was_error = true; } return 
chars; } /* * an auxiliary function for converting from wide characters to UTF-8 * * return how many wide characters were used * if wide_string has at least one character then the return value is always greater than zero too */ size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) { int z; bool correct; size_t chars; chars = wide_to_int(wide_string, z, correct); if( correct ) correct = int_to_utf8(z, utf8, false) != 0; if( !correct ) { if( mode == 1 ) int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" was_error = true; } return chars; } } // namespace private_namespace } // namespace