Files
pikotools/src/utf8/utf8.cpp
Tomasz Sowa 2689c9fece update utf8 functions comments
while here:
- rename pt::utf8_check_range(...) -> pt::is_correct_unicode_char(...)
2024-05-31 00:23:43 +02:00

1176 lines
26 KiB
C++

/*
* This file is a part of PikoTools
* and is distributed under the 2-Clause BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2010-2024, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <fstream>
#include "utf8.h"
namespace pt
{
/*
 * check whether 'c' can be used as a unicode character
 *
 * the value must lie in the range 0 .. 0x10FFFF and must not be
 * one of the UTF-16 surrogate code points (0xD800 .. 0xDFFF)
 */
bool is_correct_unicode_char(int c)
{
    if( c < 0 || c > 0x10FFFF )
        return false;

    const bool surrogate = (c >= 0xD800 && c <= 0xDFFF);

    return !surrogate;
}
/*
 * check whether 'c' is a correct unicode character which was decoded
 * from an utf8 sequence of the given length
 *
 * how_many_bytes - how many bytes of the utf8 string were read to build 'c'
 *
 * each sequence length is allowed to carry only its own range of values:
 *   1 byte:  0x0000  .. 0x007f
 *   2 bytes: 0x0080  .. 0x07ff
 *   3 bytes: 0x0800  .. 0xffff without the surrogates 0xD800 .. 0xDFFF
 *   4 bytes: 0x10000 .. 0x10FFFF
 * any other combination (e.g. a value stored on more bytes than needed)
 * gives false
 */
bool is_correct_unicode_char(int c, int how_many_bytes)
{
    switch( how_many_bytes )
    {
    case 1:
        return c >= 0x0000 && c <= 0x007f;

    case 2:
        return c >= 0x0080 && c <= 0x07ff;

    case 3:
        return (c >= 0x0800 && c < 0xD800) || (c > 0xDFFF && c <= 0xffff);

    case 4:
        return c >= 0x10000 && c <= 0x10FFFF;

    default:
        return false;
    }
}
/*
 * return true if 'c' is an UTF-16 surrogate code point (0xD800 .. 0xDFFF)
 */
bool is_surrogate_char(int c)
{
    const int surrogate_first = 0xD800;
    const int surrogate_last  = 0xDFFF;

    return surrogate_first <= c && c <= surrogate_last;
}
/*
 * return true if 'c' is the first (high) element of an UTF-16
 * surrogate pair (0xD800 .. 0xDBFF)
 */
bool is_first_surrogate_char(int c)
{
    const int high_first = 0xD800;
    const int high_last  = 0xDBFF;

    return high_first <= c && c <= high_last;
}
/*
 * return true if 'c' is the second (low) element of an UTF-16
 * surrogate pair (0xDC00 .. 0xDFFF)
 */
bool is_second_surrogate_char(int c)
{
    const int low_first = 0xDC00;
    const int low_last  = 0xDFFF;

    return low_first <= c && c <= low_last;
}
/*
 * combine an UTF-16 surrogate pair into one code point
 *
 * input:
 *   c1 - the first (high) surrogate
 *   c2 - the second (low) surrogate
 *
 * output:
 *   z - the combined code point,
 *       or U+FFFD "replacement character" if c1/c2 do not form a pair
 *
 * return true if c1 and c2 were a correct surrogate pair
 */
bool surrogate_pair_to_int(int c1, int c2, int & z)
{
    if( !is_first_surrogate_char(c1) || !is_second_surrogate_char(c2) )
    {
        z = 0xFFFD; // U+FFFD "replacement character"
        return false;
    }

    // each surrogate carries 10 bits of the value above 0x10000
    const int high_bits = (c1 & 0x3FF) << 10;
    const int low_bits  = (c2 & 0x3FF);

    z = 0x10000 + (high_bits | low_bits);

    return true;
}
/*
 * convert one wide character (or a pair of wide characters) to an int
 *
 * input:
 *   wide_string - an input wide string
 *   string_len  - how many wide characters are available in wide_string
 *
 * output:
 *   z       - the character read (0 if string_len is zero)
 *   correct - true if 'z' is a correct unicode character
 *
 * when wchar_t is two bytes long a first surrogate must be followed
 * by a second surrogate, both of them are then combined into one value
 *
 * return how many wide characters were used (0, 1 or 2),
 * if string_len is greater than 0 then the return value is greater than zero too
 */
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
    if( string_len == 0 )
    {
        z = 0;
        correct = false;
        return 0;
    }

    z = static_cast<int>(wide_string[0]);

    const bool starts_pair = (sizeof(wchar_t) == 2 && is_first_surrogate_char(z));

    if( !starts_pair )
    {
        correct = is_correct_unicode_char(z);
        return 1;
    }

    if( string_len > 1 )
    {
        const int second = wide_string[1];

        if( is_second_surrogate_char(second) )
        {
            z = 0x10000 + (((z & 0x3FF) << 10) | (second & 0x3FF));
            correct = true;
            return 2;
        }
    }

    // a first surrogate without its second half,
    // only the first wide character is reported as used
    correct = false;
    return 1;
}
/*
 * convert one wide character (or a pair of wide characters) to an int,
 * the input string is null terminated
 *
 * output:
 *   z       - the character read (0 if the string is empty)
 *   correct - true if 'z' is a correct unicode character
 *
 * return how many wide characters were used,
 * if wide_string has at least one character then the return value is greater than zero
 */
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
{
    if( wide_string[0] == 0 )
    {
        z = 0;
        correct = false;
        return 0;
    }

    // at most two wide characters are needed for one code point,
    // the second one is looked at only if it is not the terminating null
    const size_t available = (wide_string[1] != 0) ? 2 : 1;

    return wide_to_int(wide_string, available, z, correct);
}
/*
 * convert an int to wide characters stored in a buffer
 *
 * input:
 *   c           - a character to convert (the value is not validated here)
 *   max_buf_len - how many wide characters can be stored in 'res'
 *
 * output:
 *   res - the buffer for the wide characters,
 *         it is not terminated with a null character
 *
 * if wchar_t is two bytes long and 'c' is greater than 0xffff
 * then an UTF-16 surrogate pair is written
 *
 * return how many wide characters have been written (0, 1 or 2),
 * zero means that the buffer was too small
 */
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
{
    const bool needs_pair = (sizeof(wchar_t) == 2 && c > 0xffff);

    if( needs_pair )
    {
        if( max_buf_len < 2 )
            return 0;

        // UTF-16 surrogate pair: 10 bits in each half
        const int offset = c - 0x10000;

        res[0] = static_cast<wchar_t>(((offset >> 10) & 0x3FF) + 0xD800);
        res[1] = static_cast<wchar_t>((offset & 0x3FF) + 0xDC00);

        return 2;
    }

    if( max_buf_len == 0 )
        return 0;

    res[0] = static_cast<wchar_t>(c);

    return 1;
}
/*
 * convert an int to wide characters and append them to a wide string
 *
 * input:
 *   c - a character to convert
 *
 * output:
 *   res - the wide string to which one or two wide characters are appended
 *
 * return true if something was appended to the string
 */
bool int_to_wide(int c, std::wstring & res)
{
    wchar_t units[2];
    const size_t count = int_to_wide(c, units, sizeof(units) / sizeof(units[0]));

    // count is 0, 1 or 2 so this appends exactly what was produced
    res.append(units, count);

    return count != 0;
}
/*
 * put one character to a stream,
 * the stream can be a char (utf8) stream or a wide stream
 *
 * input:
 *   c - a character to put
 *
 * output:
 *   stream - the output stream
 *
 * return the result of the conversion selected by the stream kind
 * (int_to_utf8 for a char stream, int_to_wide for a wide stream),
 * false is returned if the stream is neither a char nor a wide stream
 */
bool int_to_stream(int c, pt::Stream & stream)
{
    if( stream.is_char_stream() )
        return int_to_utf8(c, stream) > 0;

    if( stream.is_wchar_stream() )
        return int_to_wide(c, stream);

    return false;
}
/*
 * decode one character from an UTF-8 buffer
 *
 * input:
 *   utf8     - an input UTF-8 string
 *   utf8_len - size of the input string,
 *              up to 4 bytes are needed to decode one sequence
 *
 * output:
 *   res     - the decoded character
 *   correct - true if a complete and correct character was decoded
 *
 * the function returns how many bytes of the input string have been used:
 *   - zero only if utf8_len is zero
 *   - one if the first byte cannot start a sequence
 *   - utf8_len if the sequence is longer than the rest of the input
 *   - the number of accepted bytes if a following byte is not a continuation byte
 *   - the sequence length otherwise
 * so even if there are errors the return value is different from zero
 */
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
{
    res = 0;
    correct = false;

    if( utf8_len == 0 )
        return 0;

    size_t seq_len;

    if( !private_namespace::utf8_to_int_first_octet(utf8[0], seq_len, res) )
        return 1;

    if( utf8_len < seq_len )
        return utf8_len;

    for(size_t pos = 1 ; pos < seq_len ; ++pos)
    {
        if( !private_namespace::utf8_to_int_add_next_octet(utf8[pos], res) )
            return pos;
    }

    // rejects overlong forms, surrogates and values above 0x10FFFF
    correct = is_correct_unicode_char(res, seq_len);

    return seq_len;
}
/*
 * decode one character from a null terminated UTF-8 string
 *
 * input:
 *   utf8 - an input UTF-8 string (null terminated)
 *
 * output:
 *   res     - the decoded character
 *   correct - true if a complete and correct character was decoded
 *
 * the function returns how many bytes of the input string have been used:
 *   - zero only if the string has '\0' at the first position
 *   - one if the first byte cannot start a sequence
 *   - the number of accepted bytes if the string ends inside the sequence
 *     or a following byte is not a continuation byte
 *   - the sequence length otherwise
 * so even if there are errors the return value is different from zero
 */
size_t utf8_to_int(const char * utf8, int & res, bool & correct)
{
    res = 0;
    correct = false;

    if( utf8[0] == 0 )
        return 0;

    size_t seq_len;

    if( !private_namespace::utf8_to_int_first_octet(utf8[0], seq_len, res) )
        return 1;

    for(size_t pos = 1 ; pos < seq_len ; ++pos)
    {
        const char octet = utf8[pos];

        // the terminating null is checked first so we never read past it
        if( octet == 0 || !private_namespace::utf8_to_int_add_next_octet(octet, res) )
            return pos;
    }

    correct = is_correct_unicode_char(res, seq_len);

    return seq_len;
}
/*
* convert one UTF-8 character into one wide-character
*
* input:
* utf8 - an input UTF-8 string
*
* output:
* res - an output character
* correct - true if it is a correct character
*
* the function returns how many characters have been used from the input string
* (returns zero only if utf8 is empty)
* even if there are errors the functions returns a different from zero value
*/
size_t utf8_to_int(const std::string & utf8, int & res, bool & correct)
{
return utf8_to_int(utf8.c_str(), utf8.size(), res, correct);
}
/*
 * decode one character from an UTF-8 input stream
 *
 * input:
 *   utf8 - an input UTF-8 stream
 *
 * output:
 *   res     - the decoded character
 *   correct - true if a complete and correct character was decoded
 *
 * the function returns how many bytes have been extracted from the input stream:
 *   - zero if the first byte could not be read
 *   - one if the first byte cannot start a sequence
 *   - the number of accepted bytes if the stream ends inside the sequence
 *     or a following byte is not a continuation byte
 *   - the sequence length otherwise
 *
 * a byte which is not a continuation byte is only peeked and is left in the stream,
 * thanks to that it can be decoded by the next call (it can be a start of a correct
 * character) and the return value is equal to the number of extracted bytes
 * (in the same way as the functions reading from a buffer do)
 */
size_t utf8_to_int(std::istream & utf8, int & res, bool & correct)
{
    res = 0;
    correct = false;

    const std::istream::int_type first = utf8.get();

    if( !utf8 )
        return 0;

    size_t len;

    if( !private_namespace::utf8_to_int_first_octet(static_cast<unsigned char>(first), len, res) )
        return 1;

    for(size_t i = 1 ; i < len ; ++i)
    {
        // look at the next byte without extracting it
        const std::istream::int_type next = utf8.peek();

        if( !utf8 || next == std::istream::traits_type::eof() )
            return i;

        if( !private_namespace::utf8_to_int_add_next_octet(static_cast<unsigned char>(next), res) )
            return i;

        // the byte was accepted as a part of the sequence, now it can be extracted
        utf8.get();
    }

    if( is_correct_unicode_char(res, len) )
        correct = true;

    return len;
}
/*
 * decode one character from an UTF-8 Stream starting at the given index
 *
 * new function, need to be tested a little more especially when sizeof(wchar_t) is 2
 *
 * input:
 *   utf8         - an input stream
 *   stream_index - index of the first byte of the sequence
 *
 * output:
 *   res     - the decoded character
 *   correct - true if a complete and correct character was decoded
 *
 * the function returns how many bytes should be skipped in the stream:
 *   - zero if stream_index is not less than utf8.size()
 *   - one if the first byte cannot start a sequence
 *   - the number of remaining bytes if the sequence is longer than the rest of the stream
 *   - the sequence length if the whole sequence was read
 *
 * NOTE(review): when a following byte is not a continuation byte this function
 * returns the number of accepted bytes plus one (the offending byte is counted),
 * the functions reading from a char buffer do not count that byte - confirm
 * which behaviour is intended
 */
size_t utf8_to_int(const Stream & utf8, size_t stream_index, int & res, bool & correct)
{
    res = 0;
    correct = false;

    if( stream_index >= utf8.size() )
        return 0;

    size_t seq_len = 0;
    unsigned char octet = utf8.get_char(stream_index);

    if( !private_namespace::utf8_to_int_first_octet(octet, seq_len, res) )
        return 1;

    if( stream_index + seq_len >= utf8.size() + 1 )
    {
        // the sequence does not fit in the stream
        return utf8.size() - stream_index;
    }

    for(size_t offset = 1 ; offset < seq_len ; ++offset)
    {
        octet = utf8.get_char(stream_index + offset);

        if( !private_namespace::utf8_to_int_add_next_octet(octet, res) )
            return offset + 1;
    }

    correct = is_correct_unicode_char(res, seq_len);

    return seq_len;
}
/*
 * convert an utf8 string into a wide string (std::wstring)
 *
 * input:
 *   utf8     - an input utf8 string
 *   utf8_len - size of the input string
 *   clear    - if true then 'res' is cleared before converting
 *   mode     - what to do with errors when converting
 *              0: skip an invalid character
 *              1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   res - an output wide string, the converted characters are appended to it
 *
 * the function returns false if there were some errors when converting
 */
bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode)
{
    if( clear )
        res.clear();

    // every character reported by the decoder is appended to the output string
    auto append_to_res = [&res](int c) {
        int_to_wide(c, res);
    };

    return utf8_to_output_function(utf8, utf8_len, append_to_res, mode);
}
/*
* convert an utf8 string into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 null terminated string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" istead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear, int mode)
{
size_t utf8_len = 0;
while( utf8[utf8_len] != 0 )
utf8_len += 1;
return utf8_to_wide(utf8, utf8_len, res, clear, mode);
}
/*
* convert an utf8 string into a wide string (std::wstring)
*
* input:
* utf8 - an input utf8 string
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" istead of the invalid character (default)
*
* output:
* res - an output wide string
*
* the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int mode)
{
return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode);
}
/*
 * convert an utf8 stream into a wide string (std::wstring)
 *
 * input:
 *   utf8  - an input utf8 stream, it is read until no more bytes can be extracted
 *   clear - if true then 'res' is cleared before converting
 *   mode  - what to do with errors when converting
 *           0: skip an invalid character
 *           1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   res - an output wide string
 *
 * the function returns false if there were some errors when converting
 */
bool utf8_to_wide(std::istream & utf8, std::wstring & res, bool clear, int mode)
{
    if( clear )
        res.clear();

    bool all_correct = true;
    int code_point;
    bool correct;

    while( utf8_to_int(utf8, code_point, correct) > 0 )
    {
        if( correct )
        {
            int_to_wide(code_point, res);
            continue;
        }

        all_correct = false;

        if( mode == 1 )
            res += static_cast<wchar_t>(0xFFFD); // U+FFFD "replacement character"
    }

    return all_correct;
}
/*
 * convert one character into an UTF-8 sequence
 *
 * input:
 *   z            - a character to convert
 *   utf8_max_len - the size of the output buffer
 *
 * output:
 *   utf8 - a buffer for the output sequence (it is not null terminated)
 *
 * the function returns how many bytes have been written to the utf8 buffer (1 - 4),
 * zero means the utf8 buffer is too small or 'z' is an incorrect unicode character,
 * in such a case nothing is written to the buffer
 */
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len)
{
    if( utf8_max_len == 0 || !is_correct_unicode_char(z) )
        return 0;

    if( z <= 0x7f )
    {
        utf8[0] = static_cast<char>(z);
        return 1;
    }

    // 'z' is a correct character so it is not greater than 0x10FFFF
    const size_t total = (z <= 0x7ff) ? 2 : ((z <= 0xffff) ? 3 : 4);

    if( total > utf8_max_len )
        return 0;

    // continuation bytes are filled from the last one, each takes 6 bits: 10xxxxxx
    for(size_t pos = total - 1 ; pos > 0 ; --pos)
    {
        utf8[pos] = static_cast<char>(0x80 | (z & 0x3f));
        z >>= 6;
    }

    // the first byte: as many leading ones as bytes in the sequence
    // followed by the remaining bits of the value
    static const int lead_marker[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

    utf8[0] = static_cast<char>(lead_marker[total] | z);

    return total;
}
/*
 * convert one character into UTF-8 and append it to a string
 *
 * input:
 *   z     - a character to convert
 *   clear - if true then the utf8 string is cleared before appending
 *
 * output:
 *   utf8 - a UTF-8 string for the output sequence
 *
 * the function returns how many bytes have been appended to the utf8 string,
 * zero means that 'z' is an incorrect unicode character
 */
size_t int_to_utf8(int z, std::string & utf8, bool clear)
{
    if( clear )
        utf8.clear();

    char encoded[10];
    const size_t count = int_to_utf8(z, encoded, sizeof(encoded) / sizeof(char));

    utf8.append(encoded, count);

    return count;
}
/*
 * convert a wide string into an UTF-8 string
 *
 * input:
 *   wide_string - a wide string for converting
 *   string_len  - the size of the wide string
 *   clear       - if true then the utf8 string is cleared before converting
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8 - a UTF-8 string for the output sequence
 *
 * this function returns false if there were some errors when converting
 */
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
{
    if( clear )
        utf8.clear();

    bool was_error = false;
    const wchar_t * cursor = wide_string;
    size_t remaining = string_len;

    while( remaining != 0 )
    {
        const size_t consumed = private_namespace::wide_one_to_utf8(cursor, remaining, utf8, was_error, mode);

        cursor += consumed;
        remaining -= consumed;
    }

    return !was_error;
}
/*
 * convert a null terminated wide string into an UTF-8 string
 *
 * input:
 *   wide_string - a null terminated wide string for converting
 *   clear       - if true then the utf8 string is cleared before converting
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8 - a UTF-8 string for the output sequence
 *
 * this function returns false if there were some errors when converting
 */
bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode)
{
    if( clear )
        utf8.clear();

    bool was_error = false;

    for(const wchar_t * cursor = wide_string ; *cursor != 0 ; )
    {
        cursor += private_namespace::wide_one_to_utf8(cursor, utf8, was_error, mode);
    }

    return !was_error;
}
/*
* convert a wide string (std::wstring) into an UTF-8 string
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" istead of the invalid character (default)
*
* output:
* utf8 - a UTF-8 string for the output sequence (the string is not cleared)
*
* this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, clear, mode);
}
/*
 * convert a wide string into an UTF-8 sequence stored in a buffer
 *
 * input:
 *   wide_string - a wide string for converting
 *   string_len  - length of the wide string
 *   utf8_len    - the size of the output buffer
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8         - a buffer for the UTF-8 sequence
 *   utf8_written - how many bytes have been written to the buffer
 *
 * this function returns false if there were some errors when converting or the output buffer was too small,
 * the output string is not null terminated
 *
 * if there is an error when converting (there is an incorrect character in the wide string) the function
 * will continue converting but if the buffer is too small the function breaks immediately
 */
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
    bool was_error = false;

    utf8_written = 0;

    while( string_len > 0 )
    {
        bool buffer_too_small;
        size_t bytes_saved;

        const size_t consumed = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, utf8_len,
                                                                    bytes_saved, buffer_too_small, was_error, mode);

        // a too small buffer is reported as an error and stops the conversion immediately
        if( buffer_too_small )
            return false;

        wide_string += consumed;
        string_len -= consumed;

        utf8 += bytes_saved;
        utf8_len -= bytes_saved;
        utf8_written += bytes_saved;
    }

    return !was_error;
}
/*
* convert a wide string (std::wstring) into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" istead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
* utf8_written - how many bytes have been written to the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is not null terminated
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode);
}
/*
 * convert a wide string into a null terminated UTF-8 string stored in a buffer
 *
 * input:
 *   wide_string - a wide string for converting
 *   string_len  - length of the wide string
 *   utf8_len    - the size of the output buffer (one byte is reserved for the terminating null)
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8 - a buffer for the UTF-8 string
 *
 * this function returns false if there were some errors when converting or the output buffer was too small,
 * if utf8_len is zero then false is returned and nothing is written,
 * otherwise the output string is null terminated (even if there were errors during converting)
 *
 * if there is an error when converting (there is an incorrect character in the wide string) the function
 * will continue converting but if the buffer is too small the function breaks immediately
 * (in both cases the utf8 buffer is null terminated)
 */
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode)
{
    if( utf8_len == 0 )
        return false;

    size_t written;
    const bool converted = wide_to_utf8(wide_string, string_len, utf8, utf8_len - 1, written, mode);

    // 'written' is at most utf8_len - 1 so there is always a place for the null
    utf8[written] = 0;

    return converted;
}
/*
* convert a wide string (std::wstring) into an UTF-8 stream
*
* input:
* wide_string - a wide string for converting
* mode - what to do with errors when converting
* 0: skip an invalid character
* 1: put U+FFFD "replacement character" istead of the invalid character (default)
*
* output:
* utf8 - a buffer for the UTF-8 stream
* utf8_len - the size of the buffer
*
* this function returns false if there were some errors when converting or the output buffer was too small,
* the output string is null terminated (even if there were errors during converting)
*
* if there is an error when converting (there is an incorrect character in the wide string) the function
* will continue converting but if the buffer is too small the function breaks immediately
* (in both cases the utf8 buffer is null terminated)
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode)
{
return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode);
}
/*
 * convert a null terminated wide string into an UTF-8 sequence stored in a buffer
 *
 * input:
 *   wide_string - a null terminated wide string for converting
 *   utf8_len    - the size of the output buffer
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8         - a buffer for the UTF-8 sequence
 *   utf8_written - how many bytes have been written to the buffer
 *
 * this function returns false if there were some errors when converting or the output buffer was too small,
 * the output string is not null terminated
 *
 * if there is an error when converting (there is an incorrect character in the wide string) the function
 * will continue converting but if the buffer is too small the function breaks immediately
 */
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
    bool was_error = false;

    utf8_written = 0;

    while( *wide_string != 0 )
    {
        // one code point needs at most two wide characters,
        // the second one is offered only if it is not the terminating null
        const size_t available = (wide_string[1] == 0) ? 1 : 2;

        bool buffer_too_small;
        size_t bytes_saved;

        const size_t consumed = private_namespace::wide_one_to_utf8(wide_string, available, utf8, utf8_len,
                                                                    bytes_saved, buffer_too_small, was_error, mode);

        // a too small buffer is reported as an error and stops the conversion immediately
        if( buffer_too_small )
            return false;

        wide_string += consumed;

        utf8 += bytes_saved;
        utf8_len -= bytes_saved;
        utf8_written += bytes_saved;
    }

    return !was_error;
}
/*
 * convert a null terminated wide string into a null terminated UTF-8 string stored in a buffer
 *
 * input:
 *   wide_string - a null terminated wide string for converting
 *   utf8_len    - the size of the output buffer (one byte is reserved for the terminating null)
 *   mode        - what to do with errors when converting
 *                 0: skip an invalid character
 *                 1: put U+FFFD "replacement character" instead of the invalid character (default)
 *
 * output:
 *   utf8 - a buffer for the UTF-8 string
 *
 * this function returns false if there were some errors when converting or the output buffer was too small,
 * if utf8_len is zero then false is returned and nothing is written,
 * otherwise the output string is null terminated (even if there were errors during converting)
 *
 * if there is an error when converting (there is an incorrect character in the wide string) the function
 * will continue converting but if the buffer is too small the function breaks immediately
 * (in both cases the utf8 buffer is null terminated)
 */
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode)
{
    if( utf8_len == 0 )
        return false;

    size_t written;
    const bool converted = wide_to_utf8(wide_string, utf8, utf8_len - 1, written, mode);

    // 'written' is at most utf8_len - 1 so there is always a place for the null
    utf8[written] = 0;

    return converted;
}
namespace private_namespace
{
/*
 * an auxiliary function for converting from UTF-8 string
 *
 * examine the first byte of an UTF-8 sequence
 *
 * input:
 *   uz - the first byte of the sequence
 *
 * output:
 *   len - length of the whole sequence in bytes (1 - 4) when true is returned,
 *         when false is returned it holds the number of leading one bits of 'uz'
 *   res - the value bits stored in the first byte (not changed when false is returned)
 *
 * return false if the byte cannot start a sequence: it is a continuation
 * byte (one leading one bit) or it has more than four leading one bits
 */
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
    size_t leading_ones = 0;

    // the leading one bits are shifted out, the value bits stay in the upper part of the byte
    while( (uz & 0x80) != 0 )
    {
        uz = static_cast<unsigned char>(uz << 1);
        leading_ones += 1;
    }

    len = leading_ones;

    if( leading_ones == 1 || leading_ones > 4 )
        return false;

    if( leading_ones == 0 )
    {
        // a single byte character
        res = uz;
        len = 1;
    }
    else
    {
        // move the value bits back to the lowest positions
        res = uz >> leading_ones;
    }

    return true;
}
/*
 * an auxiliary function for converting from UTF-8 string
 *
 * add the six value bits of a continuation byte to 'res'
 *
 * input:
 *   uz - the next byte of the sequence
 *
 * output:
 *   res - shifted left by six bits and completed with the bits from 'uz',
 *         it is not changed when false is returned
 *
 * return false if 'uz' is not a continuation byte (its two highest bits are not 10)
 */
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
    const bool is_continuation = ((uz & 0xc0) == 0x80);

    if( is_continuation )
    {
        res <<= 6;
        res |= (uz & 0x3F);
    }

    return is_continuation;
}
/*
 * an auxiliary function for converting from wide characters to UTF-8
 *
 * convert one character from the wide string and store it in the utf8 buffer
 *
 * return how many wide characters were used
 * if string_len is greater than 0 then the return value is always greater than zero too
 *
 * utf8_written - how many bytes were saved in the utf8 buffer (the buffer doesn't get
 *                a null terminating character)
 *                it can be equal to zero if the utf8 buffer is too small or there was
 *                an incorrect wide character read and mode is different from 1
 * was_utf8_buf_too_small - will be true if nothing could be written to the utf8 buffer,
 *                if this flag is true then utf8_written is equal to zero
 * was_error - will be set to true if there was an incorrect wide character,
 *             it is never set to false here and it is not set only because
 *             the buffer is too small for a correct character
 * mode - 0: skip an incorrect character
 *        1: put U+FFFD "replacement character" instead of the incorrect character
 */
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
{
    utf8_written = 0;
    was_utf8_buf_too_small = false;

    int code_point;
    bool correct;
    const size_t consumed = wide_to_int(wide_string, string_len, code_point, correct);

    if( !correct )
    {
        was_error = true;

        if( mode != 1 )
            return consumed;

        code_point = 0xFFFD; // U+FFFD "replacement character"
    }

    utf8_written = int_to_utf8(code_point, utf8, utf8_len);
    was_utf8_buf_too_small = (utf8_written == 0);

    return consumed;
}
/*
 * an auxiliary function for converting from wide characters to UTF-8
 *
 * convert one character from the wide string and append it to the utf8 string
 *
 * was_error - will be set to true if the character could not be converted
 *             (it is never set to false here)
 * mode - 0: skip an incorrect character
 *        1: append U+FFFD "replacement character" instead of the incorrect character
 *
 * return how many wide characters were used
 * if string_len is greater than 0 then the return value is always greater than zero too
 */
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
{
    int code_point;
    bool correct;
    const size_t consumed = wide_to_int(wide_string, string_len, code_point, correct);

    // the character is appended only if it was read correctly
    const bool stored = correct && int_to_utf8(code_point, utf8, false) != 0;

    if( !stored )
    {
        was_error = true;

        if( mode == 1 )
            int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
    }

    return consumed;
}
/*
 * an auxiliary function for converting from wide characters to UTF-8
 *
 * convert one character from the null terminated wide string
 * and append it to the utf8 string
 *
 * was_error - will be set to true if the character could not be converted
 *             (it is never set to false here)
 * mode - 0: skip an incorrect character
 *        1: append U+FFFD "replacement character" instead of the incorrect character
 *
 * return how many wide characters were used
 * if wide_string has at least one character then the return value is always greater than zero too
 */
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
{
    int code_point;
    bool correct;
    const size_t consumed = wide_to_int(wide_string, code_point, correct);

    // the character is appended only if it was read correctly
    const bool stored = correct && int_to_utf8(code_point, utf8, false) != 0;

    if( !stored )
    {
        was_error = true;

        if( mode == 1 )
            int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
    }

    return consumed;
}
} // namespace private_namespace
} // namespace