add some wide/utf8 conversion methods

add the following methods:
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len);

template<typename StreamIteratorType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode = 1, bool * was_buffer_sufficient_large = nullptr);

template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large = nullptr, int mode = 1);

template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
This commit is contained in:
Tomasz Sowa 2023-07-14 07:41:14 +02:00
parent 7e92b5d9d7
commit 78d31861de
Signed by: tomasz.sowa
GPG Key ID: 662CC1438638588B
5 changed files with 318 additions and 45 deletions

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2010-2022, Tomasz Sowa
* Copyright (c) 2010-2023, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -129,24 +129,63 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
/*
 * converts an int (a character code) to a wide string
 *
 * input:
 *   c           - the character code to convert
 *   max_buf_len - how many wide characters can be written to res
 *
 * output:
 *   res         - the output buffer, the characters are written from res[0]
 *
 * when wchar_t is 2 bytes long and c is greater than 0xffff the character
 * is written as a UTF-16 surrogate pair (two wide characters),
 * in all other cases c is written as a single wide character
 *
 * this method will not terminate the output string with a null character
 * return how many characters have been written (0, 1 or 2),
 * zero is returned when the buffer is too short - nothing is written in such a case
 */
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
{
	if( sizeof(wchar_t)==2 && c>0xffff )
	{
		if( max_buf_len > 1 )
		{
			// UTF16 surrogate pairs
			c -= 0x10000;
			res[0] = static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
			res[1] = static_cast<wchar_t>((c & 0x3FF) + 0xDC00);

			return 2;
		}
	}
	else
	{
		if( max_buf_len > 0 )
		{
			res[0] = static_cast<wchar_t>(c);

			return 1;
		}
	}

	// the buffer is too short for this character
	return 0;
}
/*
 * converts an int to wide character(s) and appends them to the wide string
 *
 * the conversion itself is done by int_to_wide(int, wchar_t *, size_t)
 * on a local two-character buffer
 *
 * returns true if a character was inserted to the string
 */
bool int_to_wide(int c, std::wstring & res)
{
	wchar_t pair[2];
	const size_t written = int_to_wide(c, pair, sizeof(pair) / sizeof(wchar_t));

	switch( written )
	{
	case 1:
		res += pair[0];
		break;

	case 2:
		// a surrogate pair
		res.append(pair, 2);
		break;

	default:
		break;
	}

	return written != 0;
}
/*!
this function converts one UTF-8 character into one wide-character

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2010-2022, Tomasz Sowa
* Copyright (c) 2010-2023, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -134,15 +134,28 @@ size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType &
/*!
converting one character from int to wide stream
returns true if a character was inserted to the stream
*/
template<typename StreamType>
void int_to_wide(int c, StreamType & res);
bool int_to_wide(int c, StreamType & res);
/*!
converting one character from int to wide string
this method will not terminate the output string with a null character
return how many characters have been written (0, 1 or 2)
*/
void int_to_wide(int c, std::wstring & res);
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len);
/*!
converting one character from int to wide string
returns true if a character was inserted to the string
*/
bool int_to_wide(int c, std::wstring & res);
/*!
@ -176,7 +189,14 @@ class TextStreamBase;
// defined at the end in textstream.h
template<size_t stack_size, size_t heap_block_size, typename StreamOrStringType>
bool utf8_to_wide(const TextStreamBase<char, stack_size, heap_block_size> & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
bool utf8_to_wide(const TextStreamBase<char, stack_size, heap_block_size> & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
template<typename StreamIteratorType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode = 1, bool * was_buffer_sufficient_large = nullptr);
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
/*
@ -217,8 +237,6 @@ bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode = 1);
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode = 1);
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
@ -228,13 +246,16 @@ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, s
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1);
template<typename StreamType>
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1);
template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1);
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
} // namespace

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2021, Tomasz Sowa
* Copyright (c) 2021-2023, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -102,9 +102,9 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
z = static_cast<int>(*wide_string);
correct = true;
if( sizeof(wchar_t) == 2 && is_surrogate_char(z) )
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) )
{
if( is_first_surrogate_char(z) && string_len>1 )
if( string_len > 1 )
{
int z2 = *(wide_string+1);
@ -116,7 +116,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
else
{
correct = false;
return 2;
return 1;
}
}
else

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2021-2022, Tomasz Sowa
* Copyright (c) 2021-2023, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -163,15 +163,14 @@ return !was_error;
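// converts a wide stream to UTF-8 and passes the output in chunks to write_function
// (when wchar_t is 2 bytes long surrogate pairs are combined, if mode is 1 incorrect input is replaced with U+FFFD)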
template<typename StreamType, typename function_type>
void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
bool wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
{
char utf8_buffer[256];
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
std::size_t utf8_sequence_max_length = 10;
std::size_t index = 0;
bool was_error = false;
typename StreamType::const_iterator i = buffer.begin();
@ -179,18 +178,78 @@ void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_fun
{
if( index + utf8_sequence_max_length > buffer_len )
{
write_function(utf8_buffer, index);
bool write_status = write_function(utf8_buffer, index);
index = 0;
if( !write_status )
{
was_error = true;
break;
}
}
index += int_to_utf8(*i, utf8_buffer + index, buffer_len - index);
++i;
int c = 0xFFFD; // U+FFFD "replacement character";
bool seems_to_be_correct = false;
wchar_t w1 = *i;
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
{
++i;
if( i != buffer.end() )
{
wchar_t w2 = *i;
if( surrogate_pair_to_int(w1, w2, c) )
{
seems_to_be_correct = true;
++i;
}
else
{
was_error = true;
}
}
else
{
was_error = true;
}
}
else
{
c = w1;
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
++i;
}
if( seems_to_be_correct || mode == 1 )
{
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
if( seq_len == 0 )
{
was_error = true;
if( mode == 1 )
{
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
}
}
index += seq_len;
}
}
if( index > 0 )
{
write_function(utf8_buffer, index);
if( !write_function(utf8_buffer, index) )
{
was_error = true;
}
}
return !was_error;
}

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2021-2022, Tomasz Sowa
* Copyright (c) 2021-2023, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -46,19 +46,23 @@ namespace pt
template<typename StreamType>
void int_to_wide(int c, StreamType & res)
bool int_to_wide(int c, StreamType & res)
{
if( sizeof(wchar_t)==2 && c>0xffff )
wchar_t buf[2];
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
if( used == 1 )
{
// UTF16 surrogate pairs
c -= 0x10000;
res << static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
res << static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
res << buf[0];
}
else
if( used == 2 )
{
res << static_cast<wchar_t>(c);
res << buf[0];
res << buf[1];
}
return used > 0;
}
@ -243,14 +247,14 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i
/*!
this function converts one UTF-8 character into a wide stream or a wide string
this function converts UTF-8 stream into a wide stream or a wide string
input:
iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
iterator_end - an end iterator
output:
out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator)
out_stream - an output wide stream or wide string (the stream can be of any kind, we use only << operator for a stream and += for a string)
this function returns false if there were some errors when converting
*/
@ -287,6 +291,103 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
/*!
	this function converts a UTF-8 stream into a null terminated wide string

	input:
		iterator_in    - a stream iterator for reading from, it is passed to utf8_to_int(...)
		                 and compared with iterator_end with the != operator
		iterator_end   - an end iterator
		max_buffer_len - how many characters can be written (the terminating null character is counted too)
		mode           - if equal to 1 then an incorrect input sequence is replaced with U+FFFD,
		                 otherwise an incorrect sequence produces no output

	output:
		out_buffer     - an output wide string
		was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true
		                 if the buffer was sufficiently large

	if max_buffer_len is zero nothing is written and the function returns false

	this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamIteratorType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
{
	int code_point;
	bool decoded_ok;
	bool conversion_ok = false;
	bool buffer_ok = false;

	if( max_buffer_len != 0 )
	{
		size_t free_slots = max_buffer_len - 1; // one slot is reserved for the terminating null character
		wchar_t * out = out_buffer;

		conversion_ok = true;
		buffer_ok = true;

		while( iterator_in != iterator_end )
		{
			utf8_to_int(iterator_in, iterator_end, code_point, decoded_ok);

			if( !decoded_ok )
			{
				conversion_ok = false;

				if( mode != 1 )
					continue; // nothing is written for the incorrect sequence

				code_point = 0xFFFD; // U+FFFD "replacement character"
			}

			const size_t written = int_to_wide(code_point, out, free_slots);

			// int_to_wide(...) returns zero when the output buffer is too short
			if( written == 0 )
			{
				conversion_ok = false;
				buffer_ok = false;
				break;
			}

			out += written;
			free_slots -= written;
		}

		*out = 0;
	}

	if( was_buffer_sufficient_large )
		*was_buffer_sufficient_large = buffer_ok;

	return conversion_ok;
}
/*!
	this function converts a UTF-8 stream into a wide string

	it takes the begin() and end() const iterators of the stream and calls
	the iterator based utf8_to_wide(...) version

	input:
		stream         - a stream for reading from
		max_buffer_len - how many characters can be written (the terminating null character is counted too)
		mode           - passed unchanged to the iterator based version

	output:
		out_buffer     - an output wide string
		was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true
		                 if the buffer was sufficiently large

	this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
	typedef typename StreamType::const_iterator ConstIterator;

	ConstIterator first = stream.begin();
	ConstIterator last = stream.end();

	return utf8_to_wide(first, last, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);
}
/*!
this function converts one wide character into UTF-8 stream
@ -402,13 +503,14 @@ bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
template<typename StreamType>
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
if( clear )
utf8.clear();
private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
utf8.append(utf8_buffer, buffer_len);
return true;
});
}
@ -466,20 +568,72 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
}
// not tested
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{
if( clear )
utf8.clear();
private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
utf8.write(utf8_buffer, buffer_len);
return true;
});
}
/*!
	this function converts a wide stream into a null terminated utf8 string

	input:
		buffer          - a wide stream for reading from
		max_buffer_size - how many characters can be written (the terminating null character is counted too)
		mode            - passed unchanged to wide_to_utf8_generic(...)

	output:
		utf8            - an output utf8 string
		was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true
		                  if the buffer was sufficiently large

	if max_buffer_size is zero nothing is written and the function returns false,
	in other cases the output is always null terminated (also when the input stream is empty)

	if the buffer is too short the output is cut at the buffer boundary,
	the cut can be made in the middle of a multi-byte utf8 sequence

	this function returns false if there were some errors when converting or if the output buffer was too short
*/
template<typename StreamType>
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
{
	bool buffer_ok = false;
	bool is_ok = false;

	if( max_buffer_size > 0 )
	{
		buffer_ok = true;
		max_buffer_size -= 1; // for terminating null character

		// terminate the output up front: wide_to_utf8_generic(...) does not call the write callback
		// when there is nothing to write (e.g. the input stream is empty) and the string
		// would be left without the terminating null character
		*utf8 = 0;

		is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
			// copy as much as fits in the remaining space
			const std::size_t to_copy = (buffer_len < max_buffer_size) ? buffer_len : max_buffer_size;

			for(std::size_t i = 0 ; i < to_copy ; ++i)
			{
				*utf8 = utf8_buffer[i];
				utf8 += 1;
			}

			if( to_copy < buffer_len )
				buffer_ok = false;

			max_buffer_size -= to_copy;
			*utf8 = 0;

			// returning false stops the conversion
			return buffer_ok;
		});
	}

	if( was_buffer_sufficient_large )
		*was_buffer_sufficient_large = buffer_ok;

	return is_ok;
}