add some wide/utf8 conversion methods
add following methods: size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len); template<typename StreamIteratorType> bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode = 1, bool * was_buffer_sufficient_large = nullptr); template<typename StreamType> bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large = nullptr, int mode = 1); template<typename StreamType> bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010-2022, Tomasz Sowa
|
||||
* Copyright (c) 2010-2023, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -129,24 +129,63 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
|
||||
|
||||
|
||||
/*
|
||||
converts an int to a wide string
|
||||
*/
|
||||
void int_to_wide(int c, std::wstring & res)
|
||||
* converts an int to a wide string
|
||||
*
|
||||
* this method will not terminate the output string with a null character
|
||||
* return how many characters have been written (0, 1 or 2)
|
||||
*/
|
||||
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)
|
||||
{
|
||||
if( sizeof(wchar_t)==2 && c>0xffff )
|
||||
{
|
||||
// UTF16 surrogate pairs
|
||||
c -= 0x10000;
|
||||
res += static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
|
||||
res += static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
|
||||
if( max_buf_len > 1 )
|
||||
{
|
||||
// UTF16 surrogate pairs
|
||||
c -= 0x10000;
|
||||
res[0] = static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
|
||||
res[1] = static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
res += static_cast<wchar_t>(c);
|
||||
if( max_buf_len > 0 )
|
||||
{
|
||||
res[0] = static_cast<wchar_t>(c);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
converts an int to a wide string
|
||||
|
||||
returns true if a character was inserted to the string
|
||||
*/
|
||||
bool int_to_wide(int c, std::wstring & res)
|
||||
{
|
||||
wchar_t buf[2];
|
||||
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
||||
|
||||
if( used == 1 )
|
||||
{
|
||||
res += buf[0];
|
||||
}
|
||||
else
|
||||
if( used == 2 )
|
||||
{
|
||||
res += buf[0];
|
||||
res += buf[1];
|
||||
}
|
||||
|
||||
return used > 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one UTF-8 character into one wide-character
|
||||
|
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010-2022, Tomasz Sowa
|
||||
* Copyright (c) 2010-2023, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -134,15 +134,28 @@ size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType &
|
||||
|
||||
/*!
|
||||
converting one character from int to wide stream
|
||||
|
||||
returns true if a character was inserted to the stream
|
||||
*/
|
||||
template<typename StreamType>
|
||||
void int_to_wide(int c, StreamType & res);
|
||||
bool int_to_wide(int c, StreamType & res);
|
||||
|
||||
|
||||
/*!
|
||||
converting one character from int to wide string
|
||||
|
||||
this method will not terminate the output string with a null character
|
||||
return how many characters have been written (0, 1 or 2)
|
||||
*/
|
||||
void int_to_wide(int c, std::wstring & res);
|
||||
size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len);
|
||||
|
||||
|
||||
/*!
|
||||
converting one character from int to wide string
|
||||
|
||||
returns true if a character was inserted to the string
|
||||
*/
|
||||
bool int_to_wide(int c, std::wstring & res);
|
||||
|
||||
|
||||
/*!
|
||||
@@ -176,7 +189,14 @@ class TextStreamBase;
|
||||
|
||||
// defined at the end in textstream.h
|
||||
template<size_t stack_size, size_t heap_block_size, typename StreamOrStringType>
|
||||
bool utf8_to_wide(const TextStreamBase<char, stack_size, heap_block_size> & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
|
||||
bool utf8_to_wide(const TextStreamBase<char, stack_size, heap_block_size> & utf8, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
|
||||
|
||||
template<typename StreamIteratorType>
|
||||
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode = 1, bool * was_buffer_sufficient_large = nullptr);
|
||||
|
||||
template<typename StreamType>
|
||||
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
@@ -217,8 +237,6 @@ bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode = 1);
|
||||
template<typename StreamType>
|
||||
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode = 1);
|
||||
|
||||
|
||||
|
||||
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1);
|
||||
@@ -228,13 +246,16 @@ bool wide_to_utf8(const wchar_t * wide_string, char * utf8, s
|
||||
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1);
|
||||
|
||||
template<typename StreamType>
|
||||
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested
|
||||
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1);
|
||||
|
||||
template<typename StreamType>
|
||||
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
|
||||
|
||||
template<typename StreamTypeIn, typename StreamTypeOut>
|
||||
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
|
||||
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1);
|
||||
|
||||
template<typename StreamType>
|
||||
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1);
|
||||
|
||||
|
||||
} // namespace
|
||||
|
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021, Tomasz Sowa
|
||||
* Copyright (c) 2021-2023, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -102,9 +102,9 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
|
||||
z = static_cast<int>(*wide_string);
|
||||
correct = true;
|
||||
|
||||
if( sizeof(wchar_t) == 2 && is_surrogate_char(z) )
|
||||
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) )
|
||||
{
|
||||
if( is_first_surrogate_char(z) && string_len>1 )
|
||||
if( string_len > 1 )
|
||||
{
|
||||
int z2 = *(wide_string+1);
|
||||
|
||||
@@ -116,7 +116,7 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool
|
||||
else
|
||||
{
|
||||
correct = false;
|
||||
return 2;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021-2022, Tomasz Sowa
|
||||
* Copyright (c) 2021-2023, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -163,15 +163,14 @@ return !was_error;
|
||||
|
||||
|
||||
|
||||
// FIX ME it is not using surrogate pairs from input stream
|
||||
// and is not using mode parameter
|
||||
template<typename StreamType, typename function_type>
|
||||
void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
|
||||
bool wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
|
||||
{
|
||||
char utf8_buffer[256];
|
||||
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
|
||||
std::size_t utf8_sequence_max_length = 10;
|
||||
std::size_t index = 0;
|
||||
bool was_error = false;
|
||||
|
||||
typename StreamType::const_iterator i = buffer.begin();
|
||||
|
||||
@@ -179,18 +178,78 @@ void wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_fun
|
||||
{
|
||||
if( index + utf8_sequence_max_length > buffer_len )
|
||||
{
|
||||
write_function(utf8_buffer, index);
|
||||
bool write_status = write_function(utf8_buffer, index);
|
||||
index = 0;
|
||||
|
||||
if( !write_status )
|
||||
{
|
||||
was_error = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
index += int_to_utf8(*i, utf8_buffer + index, buffer_len - index);
|
||||
++i;
|
||||
int c = 0xFFFD; // U+FFFD "replacement character";
|
||||
bool seems_to_be_correct = false;
|
||||
wchar_t w1 = *i;
|
||||
|
||||
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
|
||||
{
|
||||
++i;
|
||||
|
||||
if( i != buffer.end() )
|
||||
{
|
||||
wchar_t w2 = *i;
|
||||
|
||||
if( surrogate_pair_to_int(w1, w2, c) )
|
||||
{
|
||||
seems_to_be_correct = true;
|
||||
++i;
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
c = w1;
|
||||
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
|
||||
++i;
|
||||
}
|
||||
|
||||
if( seems_to_be_correct || mode == 1 )
|
||||
{
|
||||
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
|
||||
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
|
||||
|
||||
if( seq_len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
|
||||
}
|
||||
}
|
||||
|
||||
index += seq_len;
|
||||
}
|
||||
}
|
||||
|
||||
if( index > 0 )
|
||||
{
|
||||
write_function(utf8_buffer, index);
|
||||
if( !write_function(utf8_buffer, index) )
|
||||
{
|
||||
was_error = true;
|
||||
}
|
||||
}
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021-2022, Tomasz Sowa
|
||||
* Copyright (c) 2021-2023, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -46,19 +46,23 @@ namespace pt
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
void int_to_wide(int c, StreamType & res)
|
||||
bool int_to_wide(int c, StreamType & res)
|
||||
{
|
||||
if( sizeof(wchar_t)==2 && c>0xffff )
|
||||
wchar_t buf[2];
|
||||
size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t));
|
||||
|
||||
if( used == 1 )
|
||||
{
|
||||
// UTF16 surrogate pairs
|
||||
c -= 0x10000;
|
||||
res << static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
|
||||
res << static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
|
||||
res << buf[0];
|
||||
}
|
||||
else
|
||||
if( used == 2 )
|
||||
{
|
||||
res << static_cast<wchar_t>(c);
|
||||
res << buf[0];
|
||||
res << buf[1];
|
||||
}
|
||||
|
||||
return used > 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -243,14 +247,14 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one UTF-8 character into a wide stream or a wide string
|
||||
this function converts UTF-8 stream into a wide stream or a wide string
|
||||
|
||||
input:
|
||||
iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
||||
iterator_in - a stream iterator for reading from (the stream can be any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator)
|
||||
out_stream - an output wide stream or wide string (the stream can be of any kind, we use only << operator for a stream and += for a string)
|
||||
|
||||
this function returns false if there were some errors when converting
|
||||
*/
|
||||
@@ -287,6 +291,103 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts UTF-8 stream into a wide string
|
||||
|
||||
input:
|
||||
iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only)
|
||||
iterator_end - an end iterator
|
||||
|
||||
output:
|
||||
out_buffer - an output wide string
|
||||
max_buffer_len - how many characters can be write (we write the terminating null character too)
|
||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||
|
||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||
*/
|
||||
template<typename StreamIteratorType>
|
||||
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large)
|
||||
{
|
||||
int res;
|
||||
bool correct;
|
||||
bool was_error = true;
|
||||
bool was_buffer_ok = false;
|
||||
|
||||
if( max_buffer_len > 0 )
|
||||
{
|
||||
max_buffer_len -= 1; // for terminating null character
|
||||
was_error = false;
|
||||
was_buffer_ok = true;
|
||||
|
||||
while( iterator_in != iterator_end )
|
||||
{
|
||||
utf8_to_int(iterator_in, iterator_end, res, correct);
|
||||
|
||||
if( !correct )
|
||||
{
|
||||
was_error = true;
|
||||
|
||||
if( mode == 1 )
|
||||
{
|
||||
res = 0xFFFD; // U+FFFD "replacement character"
|
||||
correct = true;
|
||||
}
|
||||
}
|
||||
|
||||
if( correct )
|
||||
{
|
||||
size_t len = int_to_wide(res, out_buffer, max_buffer_len);
|
||||
// if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand)
|
||||
|
||||
if( len == 0 )
|
||||
{
|
||||
was_error = true;
|
||||
was_buffer_ok = false;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
out_buffer += len;
|
||||
max_buffer_len -= len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_buffer = 0;
|
||||
}
|
||||
|
||||
if( was_buffer_sufficient_large )
|
||||
*was_buffer_sufficient_large = was_buffer_ok;
|
||||
|
||||
return !was_error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
	this function converts a whole UTF-8 stream into a null terminated wide string

	input:
		stream - a stream for reading from, it is read from stream.begin() to stream.end()
		mode   - forwarded unchanged to the iterator based utf8_to_wide(...)

	output:
		out_buffer     - an output wide string
		max_buffer_len - how many characters can be written to out_buffer (the terminating null character is counted in)
		was_buffer_sufficient_large - a pointer to a bool value, forwarded unchanged to the iterator based utf8_to_wide(...)

	the return value is the one returned by the iterator based utf8_to_wide(...)

	beware: the two last parameters are in a different order here (was_buffer_sufficient_large, mode)
	than in the iterator based overload (mode, was_buffer_sufficient_large)
*/
template<typename StreamType>
bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode)
{
	using ConstIterator = typename StreamType::const_iterator;

	// the iterator based overload takes its first iterator by non-const reference
	// so named local objects are required here
	ConstIterator first = stream.begin();
	ConstIterator last = stream.end();

	return utf8_to_wide(first, last, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large);
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts one wide character into UTF-8 stream
|
||||
|
||||
@@ -402,13 +503,14 @@ bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
|
||||
|
||||
|
||||
template<typename StreamType>
|
||||
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
|
||||
bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
|
||||
{
|
||||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
||||
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
|
||||
utf8.append(utf8_buffer, buffer_len);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -466,20 +568,72 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
|
||||
}
|
||||
|
||||
|
||||
// not tested
|
||||
template<typename StreamTypeIn, typename StreamTypeOut>
|
||||
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
|
||||
bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
|
||||
{
|
||||
if( clear )
|
||||
utf8.clear();
|
||||
|
||||
private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
|
||||
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
|
||||
utf8.write(utf8_buffer, buffer_len);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
this function converts a wide stream into a utf8 string
|
||||
|
||||
input:
|
||||
buffer - a wide stream for reading from
|
||||
|
||||
output:
|
||||
utf8 - an output utf8 string
|
||||
max_buffer_len - how many characters can be write (we write the terminating null character too)
|
||||
was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large
|
||||
|
||||
this function returns false if there were some errors when converting or if the output buffer was too short
|
||||
*/
|
||||
template<typename StreamType>
|
||||
bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode)
|
||||
{
|
||||
bool buffer_ok = false;
|
||||
bool is_ok = false;
|
||||
|
||||
if( max_buffer_size > 0 )
|
||||
{
|
||||
buffer_ok = true;
|
||||
max_buffer_size -= 1; // for terminating null character
|
||||
|
||||
is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
|
||||
std::size_t i=0;
|
||||
|
||||
for( ; i < buffer_len ; ++i)
|
||||
{
|
||||
if( i < max_buffer_size )
|
||||
{
|
||||
*utf8 = utf8_buffer[i];
|
||||
utf8 += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
buffer_ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
max_buffer_size -= i;
|
||||
*utf8 = 0;
|
||||
return buffer_ok;
|
||||
});
|
||||
}
|
||||
|
||||
if( was_buffer_sufficient_large )
|
||||
*was_buffer_sufficient_large = buffer_ok;
|
||||
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user