/* * This file is a part of PikoTools * and is distributed under the 2-Clause BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2021-2023, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */ #ifndef headerfile_pikotools_src_utf8_utf8_templates #define headerfile_pikotools_src_utf8_utf8_templates // this file is included at the end of utf8.h #include "utf8_private.h" namespace pt { template bool int_to_wide(int c, StreamType & res) { wchar_t buf[2]; size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); if( used == 1 ) { res << buf[0]; } else if( used == 2 ) { res << buf[0]; res << buf[1]; } return used > 0; } /*! this function converts one UTF-8 character into int input: iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) iterator_end - an end iterator output: res - an output character correct - true if it is a correct character the function returns how many characters have been used from the input stream */ template size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct) { size_t i, len; unsigned char uz; res = 0; correct = false; if( iterator_in == iterator_end ) return 0; uz = *iterator_in; ++iterator_in; if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) return 1; for(i=1 ; i stream (need to be tested) */ // need to be tested template bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode) { if( clear ) res.clear(); bool status = private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, [&res](int c) { int_to_wide(c, res); }); return status; } template bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode) { size_t utf8_len = 0; while( utf8[utf8_len] != 0 ) utf8_len += 1; return utf8_to_wide(utf8, utf8_len, res, clear, mode); } template bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode) { return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode); } template bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode) { int z; bool correct, was_error = false; if( clear ) res.clear(); while( utf8_to_int(utf8, z, correct) > 0 ) { if( !correct ) { if( mode == 1 ) res << 0xFFFD; // U+FFFD "replacement character" was_error = true; } else { int_to_wide(z, res); } } return !was_error; } /* this function converts a UTF-8 stream into a wide stream or a wide string input: stream - a UTF-8 stream for converting mode - what to do with errors when converting 0: skip an invalid character 1: put U+FFFD "replacement character" istead of the invalid character (default) output: res - a wide stream or a wide string for the output sequence this function returns false if there were some errors when converting */ template bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) { size_t len; bool correct; int z; size_t index = 0; bool was_error = false; if( clear ) res.clear(); // CHECKME test me when sizeof(wchar_t) is 2 do { len = utf8_to_int(stream, index, z, correct); if( len > 0 ) { if( !correct ) { if( mode == 1 ) int_to_wide(0xFFFD, res); // U+FFFD "replacement character" was_error = true; } else { int_to_wide(z, res); } index += len; } } while( len > 0 ); return !was_error; } /*! this function converts UTF-8 stream into a wide stream or a wide string input: iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) iterator_end - an end iterator output: out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string) this function returns false if there were some errors when converting */ template bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode) { if( clear_stream ) out_stream.clear(); int res; bool correct; bool was_error = false; while( iterator_in != iterator_end ) { utf8_to_int(iterator_in, iterator_end, res, correct); if( correct ) { int_to_wide(res, out_stream); } else { if( mode == 1 ) int_to_wide(0xFFFD, out_stream); // U+FFFD "replacement character" was_error = true; } } return !was_error; } /*! this function converts UTF-8 stream into a wide string input: iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) iterator_end - an end iterator output: out_buffer - an output wide string max_buffer_len - how many characters can be write (we write the terminating null character too) was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large this function returns false if there were some errors when converting or if the output buffer was too short */ template bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large) { int res; bool correct; bool was_error = true; bool was_buffer_ok = false; if( max_buffer_len > 0 ) { max_buffer_len -= 1; // for terminating null character was_error = false; was_buffer_ok = true; while( iterator_in != iterator_end ) { utf8_to_int(iterator_in, iterator_end, res, correct); if( !correct ) { was_error = true; if( mode == 1 ) { res = 0xFFFD; // U+FFFD "replacement character" correct = true; } } if( correct ) { size_t len = int_to_wide(res, out_buffer, max_buffer_len); // if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand) if( len == 0 ) { was_error = true; was_buffer_ok = false; break; } else { out_buffer += len; max_buffer_len -= len; } } } *out_buffer = 0; } if( was_buffer_sufficient_large ) *was_buffer_sufficient_large = was_buffer_ok; return !was_error; } /*! this function converts UTF-8 stream into a wide string input: stream - a stream for reading from output: out_buffer - an output wide string max_buffer_len - how many characters can be write (we write the terminating null character too) was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large this function returns false if there were some errors when converting or if the output buffer was too short */ template bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode) { typename StreamType::const_iterator stream_begin = stream.begin(); typename StreamType::const_iterator stream_end = stream.end(); return utf8_to_wide(stream_begin, stream_end, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large); } /*! this function converts one wide character into UTF-8 stream input: z - wide character output: utf8 - a UTF-8 stream for the output sequence the function returns how many characters have been written to the utf8 stream, zero means that 'z' is an incorrect unicode character */ template size_t int_to_utf8(int z, StreamType & utf8) { char buf[10]; size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char)); if( len > 0 ) utf8.write(buf, len); return len; } /*! this function converts a wide string into UTF-8 stream input: wide_string - a wide string for converting string_len - size of the string mode - what to do with errors when converting 0: skip an invalid character 1: put U+FFFD "replacement character" istead of the invalid character (default) output: utf8 - a UTF-8 stream for the output sequence this function returns false if there were some errors when converting */ template bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode) { bool was_error = false; size_t chars; while( string_len > 0 ) { chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode); wide_string += chars; string_len -= chars; } return !was_error; } /*! this function converts a wide string into UTF-8 stream input: wide_string - a null terminated wide string for converting mode - what to do with errors when converting 0: skip an invalid character 1: put U+FFFD "replacement character" istead of the invalid character (default) output: utf8 - a UTF-8 stream for the output sequence this function returns false if there were some errors when converting */ template bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode) { bool was_error = false; while( *wide_string ) wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode); return !was_error; } /*! this function converts a wide string (std::wstring) into UTF-8 stream input: wide_string - a wide string for converting mode - what to do with errors when converting 0: skip an invalid character 1: put U+FFFD "replacement character" istead of the invalid character (default) output: utf8 - a UTF-8 stream for the output sequence this function returns false if there were some errors when converting */ template bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode) { return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, mode); } template bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode) { if( clear ) utf8.clear(); return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.append(utf8_buffer, buffer_len); return true; }); } template bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode) { bool was_error = false; if( clear ) utf8.clear(); for(size_t i=0 ; i < stream.size() ; ++i) { int c = static_cast(stream.get_wchar(i)); bool is_correct = false; if( utf8_check_range(c) ) { // CHECKME test me when sizeof(wchar_t) == 2 if( is_first_surrogate_char(c) ) { if( i + 1 < stream.size() ) { wchar_t c1 = static_cast(c); wchar_t c2 = stream.get_wchar(++i); if( surrogate_pair_to_int(c1, c2, c) ) { is_correct = true; } } } else { is_correct = true; } } if( is_correct ) { int_to_utf8(c, utf8); } else { was_error = true; if( mode == 1 ) int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" } } return !was_error; } template bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) { if( clear ) utf8.clear(); return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { utf8.write(utf8_buffer, buffer_len); return true; }); } /*! this function converts a wide stream into a utf8 string input: buffer - a wide stream for reading from output: utf8 - an output utf8 string max_buffer_len - how many characters can be write (we write the terminating null character too) was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large this function returns false if there were some errors when converting or if the output buffer was too short */ template bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode) { bool buffer_ok = false; bool is_ok = false; if( max_buffer_size > 0 ) { buffer_ok = true; max_buffer_size -= 1; // for terminating null character is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { std::size_t i=0; for( ; i < buffer_len ; ++i) { if( i < max_buffer_size ) { *utf8 = utf8_buffer[i]; utf8 += 1; } else { buffer_ok = false; break; } } max_buffer_size -= i; *utf8 = 0; return buffer_ok; }); } if( was_buffer_sufficient_large ) *was_buffer_sufficient_large = buffer_ok; return is_ok; } } // namespace pt #endif