Browse Source

added two functions to utf8:

template<typename StreamType> bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear = true, int mode = 1);
template<typename StreamType> bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);

these functions are moved from TextStreamBase
htmlparserlistener
Tomasz Sowa 1 year ago
parent
commit
8ec9350d52
  1. 76
      src/textstream/textstream.h
  2. 9
      src/utf8/utf8.h
  3. 106
      src/utf8/utf8_templates.h

76
src/textstream/textstream.h

@ -157,8 +157,6 @@ public:
protected:
void put_stream(const Stream & stream);
void put_utf8_to_wide(const Stream & stream);
void put_wide_to_utf8(const Stream & stream);
};
@ -435,6 +433,9 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(char v)
{
// IMPROVEME
// if sizeof(char_type) == 1: put the char when v <= 127, but put the replacement character when (unsigned)v > 127
// if sizeof(char_type) > 1: simply put that character
buffer.append(static_cast<char_type>(v));
return *this;
@ -445,6 +446,9 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(unsigned char v)
{
// IMPROVEME
// if sizeof(char_type) == 1: put the char when v <= 127, but put the replacement character when v > 127
// if sizeof(char_type) > 1: simply put that character
buffer.append(static_cast<char_type>(v));
return *this;
@ -455,6 +459,7 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(wchar_t v)
{
// IMPROVEME add utf8/wide conversion, if v is from surrogate pair we can skip it
buffer.append(static_cast<char_type>(v));
return *this;
@ -652,6 +657,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(const Stream
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
void TextStreamBase<char_type, stack_size, heap_block_size>::put_stream(const Stream & stream)
{
@ -669,13 +675,13 @@ void TextStreamBase<char_type, stack_size, heap_block_size>::put_stream(const St
if( sizeof(char_type) == sizeof(wchar_t) && stream.is_char_stream() )
{
// from utf8 to wide
put_utf8_to_wide(stream);
utf8_to_wide(stream, *this, false);
}
else
if( sizeof(char_type) == sizeof(char) && stream.is_wchar_stream() )
{
// from wide to utf8
put_wide_to_utf8(stream);
wide_stream_to_utf8(stream, *this, false);
}
else
if( sizeof(char_type) == sizeof(wchar_t) && stream.is_wchar_stream() )
@ -695,68 +701,6 @@ void TextStreamBase<char_type, stack_size, heap_block_size>::put_stream(const St
}
// TODO: test this when sizeof(wchar_t) is 2
// (this could possibly be moved to the utf8 functions)
//
// Reads 'stream' as UTF-8 with utf8_to_int(), one sequence at a time, and
// hands every decoded value to int_to_wide() with this stream as the target.
// A sequence reported as incorrect is replaced with U+FFFD.
// The loop ends when utf8_to_int() reports that zero units were consumed.
template<typename char_type, size_t stack_size, size_t heap_block_size>
void TextStreamBase<char_type, stack_size, heap_block_size>::put_utf8_to_wide(const Stream & stream)
{
	bool is_valid;
	int code_point;
	size_t pos = 0;
	size_t consumed = utf8_to_int(stream, pos, code_point, is_valid);

	while( consumed > 0 )
	{
		if( !is_valid )
		{
			code_point = 0xFFFD; // U+FFFD "replacement character"
		}

		int_to_wide(code_point, *this);
		pos += consumed;

		// decode the next sequence starting right after the one just handled
		consumed = utf8_to_int(stream, pos, code_point, is_valid);
	}
}
// TODO: test this when sizeof(wchar_t) is 2
// (this could possibly be moved to the utf8 functions)
//
// Walks the wide characters of 'stream', converts each one with int_to_utf8()
// into a small local buffer and appends the produced bytes to this stream
// through operator<<(char).
// When a character is a first surrogate and another character follows, the
// two are passed to surrogate_pair_to_int() and both positions are consumed;
// the result of that call is not checked here.
template<typename char_type, size_t stack_size, size_t heap_block_size>
void TextStreamBase<char_type, stack_size, heap_block_size>::put_wide_to_utf8(const Stream & stream)
{
	char bytes[10];
	size_t bytes_capacity = sizeof(bytes) / sizeof(char);
	size_t pos = 0;

	while( pos < stream.size() )
	{
		int code_point = static_cast<int>(stream.get_wchar(pos));

		if( is_first_surrogate_char(code_point) && pos + 1 < stream.size() )
		{
			wchar_t high = static_cast<wchar_t>(code_point);
			pos += 1;
			wchar_t low = stream.get_wchar(pos);
			surrogate_pair_to_int(high, low, code_point);
		}

		size_t produced = int_to_utf8(code_point, bytes, bytes_capacity);

		for(size_t k = 0 ; k < produced ; ++k)
		{
			this->operator<<(bytes[k]);
		}

		pos += 1;
	}
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size> &

9
src/utf8/utf8.h

@ -60,6 +60,8 @@ namespace pt
/*!
returns true if 'c' is a correct unicode character
RENAMEME to is_correct_unicode_char
*/
bool utf8_check_range(int c);
@ -152,6 +154,8 @@ bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear = true,
template<typename StreamType>
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear = true, int mode = 1); // need to be tested
template<typename StreamType>
bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear = true, int mode = 1);
/*
@ -205,8 +209,11 @@ bool wide_to_utf8(const std::wstring & wide_string, char * utf8, s
template<typename StreamType>
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear = true, int mode = 1); // not tested
template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used

106
src/utf8/utf8_templates.h

@ -136,6 +136,59 @@ return !was_error;
}
/*
	this function converts a UTF-8 stream into a wide stream

	input:
		stream - a UTF-8 stream for converting
		clear  - if true then res.clear() is called before converting
		mode   - what to do with errors when converting
		         0: skip an invalid character
		         1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
		res - a wide stream for the output sequence

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode)
{
	size_t len;
	bool correct;
	int z;
	size_t index = 0;
	bool was_error = false;

	if( clear )
		res.clear();

	// CHECKME test me when sizeof(wchar_t) is 2
	do
	{
		len = utf8_to_int(stream, index, z, correct);

		if( len > 0 )
		{
			if( !correct )
			{
				was_error = true;
				z = 0xFFFD; // U+FFFD "replacement character"
			}

			// The replacement character is written with int_to_wide(), exactly like
			// a correctly decoded character. Inserting the int with operator<<
			// would, for stream types with the usual insertion semantics, write the
			// number formatted as text ("65533") instead of the character itself.
			if( correct || mode == 1 )
			{
				int_to_wide(z, res);
			}

			index += len;
		}
	}
	while( len > 0 );

	return !was_error;
}
@ -268,6 +321,59 @@ void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, in
}
/*
	this function converts a wide stream into a UTF-8 stream

	input:
		stream - a wide stream for converting
		clear  - if true then utf8.clear() is called before converting
		mode   - what to do with errors when converting
		         0: skip an invalid character
		         1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a stream for the UTF-8 output sequence

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
{
	bool was_error = false;

	if( clear )
		utf8.clear();

	for(size_t i=0 ; i < stream.size() ; ++i)
	{
		int c = static_cast<int>(stream.get_wchar(i));
		bool is_correct = false;

		// CHECKME test me when sizeof(wchar_t) == 2
		//
		// The surrogate test is made before utf8_check_range():
		// NOTE(review): if utf8_check_range() rejects surrogate code points then
		// testing the range first would make the pair handling unreachable -- confirm.
		if( is_first_surrogate_char(c) )
		{
			if( i + 1 < stream.size() )
			{
				wchar_t c1 = static_cast<wchar_t>(c);
				wchar_t c2 = stream.get_wchar(i + 1);

				if( surrogate_pair_to_int(c1, c2, c) )
				{
					// the second unit is consumed only when the pair was combined,
					// otherwise it is examined on its own in the next iteration
					// so a valid character after a lone surrogate is not lost
					++i;
					is_correct = true;
				}
			}
		}
		else
		{
			is_correct = utf8_check_range(c);
		}

		if( is_correct )
		{
			int_to_utf8(c, utf8);
		}
		else
		{
			was_error = true;

			if( mode == 1 )
				int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
		}
	}

	return !was_error;
}
// not tested
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)

Loading…
Cancel
Save