add some utf8 converting methods

add new methods:
- bool int_to_stream(int c, pt::Stream & stream);
- template<typename OutputFunction>
  bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode = 1);
- template<typename StreamIteratorType, typename OutputFunction>
  bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode = 1);
- template<typename StreamType, typename OutputFunction>
  bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1);

make some methods public:
- size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
- size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)

rename and make some methods public:
- template<typename OutputFunction>
  utf8_to_wide_generic(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode) -> utf8_to_output_function(...)

while here:
- fix: correctly convert characters in Log::put_multiline_generic()
This commit is contained in:
2024-05-30 20:19:04 +02:00
parent 5fd17175c1
commit aacb1f43ae
11 changed files with 428 additions and 338 deletions

View File

@@ -6,79 +6,76 @@
./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h ./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h
./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/text.o: ./convert/text.h ./convert/text_private.h
./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: ./convert/double.h textstream/textstream.h
./convert/double.o: textstream/stream.h space/space.h textstream/types.h ./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h
./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./convert/double.o: membuffer/membuffer.h textstream/types.h ./convert/double.o: textstream/types.h textstream/stream_private.h
./convert/double.o: textstream/stream_private.h
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h ./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h
./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/baseparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/baseparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h ./convert/baseparser.o: textstream/types.h textstream/stream_private.h
./convert/baseparser.o: textstream/stream_private.h
./date/date.o: ./date/date.h convert/inttostr.h ./date/date.o: ./date/date.h convert/inttostr.h
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h ./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h
./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./log/filelog.o: textstream/stream.h utf8/utf8_templates.h
./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./log/filelog.o: textstream/types.h textstream/stream_private.h ./log/filelog.o: textstream/types.h textstream/stream_private.h
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h ./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h
./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h ./log/log.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h ./log/log.o: membuffer/membuffer.h textstream/types.h
./log/log.o: textstream/stream_private.h ./log/filelog.h ./log/log.o: textstream/stream_private.h ./log/filelog.h
./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h ./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h
./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./space/space.o: textstream/stream.h utf8/utf8_templates.h
./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h
./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h
./space/space.o: textstream/stream.h space/space.h date/date.h ./space/space.o: textstream/stream.h space/space.h date/date.h
./space/space.o: membuffer/membuffer.h textstream/types.h ./space/space.o: membuffer/membuffer.h textstream/types.h
./space/space.o: textstream/stream_private.h convert/strtoint.h ./space/space.o: textstream/stream_private.h convert/strtoint.h
./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h ./space/space.o: ./convert/text.h ./convert/misc.h textstream/types.h
./space/space.o: ./convert/double.h
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h ./space/spaceparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h ./space/spaceparser.o: convert/baseparser.h textstream/textstream.h
./space/spaceparser.o: textstream/textstream.h textstream/stream.h ./space/spaceparser.o: textstream/stream.h space/space.h date/date.h
./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h ./space/spaceparser.o: membuffer/membuffer.h textstream/types.h
./space/spaceparser.o: textstream/types.h textstream/stream_private.h ./space/spaceparser.o: textstream/stream_private.h convert/strtoint.h
./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h
./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h ./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h
./space/keyvalueparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./space/keyvalueparser.o: textstream/stream.h utf8/utf8_templates.h ./space/keyvalueparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./space/keyvalueparser.o: utf8/utf8_private.h convert/baseparser.h ./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h
./space/keyvalueparser.o: textstream/textstream.h textstream/stream.h ./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h
./space/keyvalueparser.o: space/space.h date/date.h membuffer/membuffer.h ./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h
./space/keyvalueparser.o: textstream/types.h textstream/stream_private.h ./space/keyvalueparser.o: textstream/stream_private.h convert/strtoint.h
./space/keyvalueparser.o: convert/strtoint.h ./convert/text.h ./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h
./space/keyvalueparser.o: ./convert/misc.h ./space/keyvalueparser.o: textstream/types.h
./textstream/stream_private.o: textstream/stream_private.h ./textstream/stream_private.o: textstream/stream_private.h
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8.o: utf8/utf8_private.h
./utf8/utf8_private.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h
./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h
./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./csv/csvparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./csv/csvparser.o: utf8/utf8_private.h convert/baseparser.h
./csv/csvparser.o: convert/baseparser.h textstream/textstream.h ./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h
./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h ./csv/csvparser.o: membuffer/membuffer.h textstream/types.h
./csv/csvparser.o: textstream/types.h textstream/stream_private.h ./csv/csvparser.o: textstream/stream_private.h
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h ./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h ./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h ./mainoptions/mainoptionsparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h ./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h
./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./html/bbcodeparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h ./html/bbcodeparser.o: textstream/types.h textstream/stream_private.h
./html/bbcodeparser.o: textstream/stream_private.h
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
./html/htmlparser.o: textstream/textstream.h textstream/stream.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h
./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h ./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h
./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h ./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./html/htmlparser.o: textstream/types.h textstream/stream_private.h ./html/htmlparser.o: textstream/types.h textstream/stream_private.h
./html/htmlparser.o: convert/text.h ./html/htmlparser.o: convert/text.h

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2018-2022, Tomasz Sowa * Copyright (c) 2018-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,6 @@
#define headerfile_pikotools_src_log_log #define headerfile_pikotools_src_log_log
#include <string> #include <string>
#include <fstream>
#include "textstream/textstream.h" #include "textstream/textstream.h"
#include "filelog.h" #include "filelog.h"
@@ -246,6 +245,7 @@ void Log::put_multiline_generic(const CharType * prefix, const CharType * msg)
{ {
was_new_line = true; was_new_line = true;
put_prefix = true; put_prefix = true;
msg += 1;
} }
else else
{ {
@@ -265,11 +265,32 @@ void Log::put_multiline_generic(const CharType * prefix, const CharType * msg)
put_prefix = false; put_prefix = false;
} }
operator<<(*msg); if constexpr ( sizeof(CharType) == sizeof(char) )
{
int c;
bool correct;
msg += utf8_to_int(msg, c, correct);
if( correct )
int_to_stream(c, *this);
else
int_to_stream(0xFFFD, *this); // replacement character
}
else
if constexpr ( sizeof(CharType) == sizeof(wchar_t) )
{
operator<<(*msg);
msg += 1;
}
else
{
// what is the CharType?
// at the moment do not print anything
msg += 1;
}
was_something_printed = true; was_something_printed = true;
} }
msg += 1;
} }
if( was_something_printed ) if( was_something_printed )

View File

@@ -34,8 +34,10 @@
#include <wchar.h> #include <wchar.h>
#include "space.h" #include "space.h"
#include "utf8/utf8.h"
#include "convert/convert.h" #include "convert/convert.h"
#include "textstream/textstream.h"
#include "utf8/utf8.h"
namespace pt namespace pt

View File

@@ -42,7 +42,6 @@
#include <cstdio> #include <cstdio>
#include <cwchar> #include <cwchar>
#include <errno.h> #include <errno.h>
#include "textstream/types.h"
#include "convert/inttostr.h" #include "convert/inttostr.h"
#include "utf8/utf8.h" #include "utf8/utf8.h"

View File

@@ -203,7 +203,6 @@ public:
template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size> template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size>
TextStreamBase & operator<<(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & arg); TextStreamBase & operator<<(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & arg);
template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size> template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size>
bool operator==(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & stream) const; bool operator==(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & stream) const;
@@ -1235,6 +1234,7 @@ return *this;
} }
template<typename char_type, size_t stack_size, size_t heap_block_size> template<typename char_type, size_t stack_size, size_t heap_block_size>
template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size> template<typename arg_char_type, size_t arg_stack_size, size_t arg_heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::operator==(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & stream) const bool TextStreamBase<char_type, stack_size, heap_block_size>::operator==(const TextStreamBase<arg_char_type, arg_stack_size, arg_heap_block_size> & stream) const

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2010-2023, Tomasz Sowa * Copyright (c) 2010-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,6 @@ namespace pt
{ {
/*! /*!
returns true if 'c' is a correct unicode character returns true if 'c' is a correct unicode character
*/ */
@@ -128,6 +125,83 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
if( string_len == 0 )
{
z = 0;
correct = false;
return 0;
}
z = static_cast<int>(*wide_string);
correct = true;
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) )
{
if( string_len > 1 )
{
int z2 = *(wide_string+1);
if( is_second_surrogate_char(z2) )
{
z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
return 2;
}
else
{
correct = false;
return 1;
}
}
else
{
correct = false;
return 1;
}
}
else
{
correct = utf8_check_range(z);
return 1;
}
}
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
{
	// an empty (zero terminated) string: nothing is consumed
	if( wide_string[0] == 0 )
	{
		z = 0;
		correct = false;
		return 0;
	}

	// look at most one character ahead and never past the terminating zero,
	// two characters are enough to decode a surrogate pair
	const size_t available_len = (wide_string[1] != 0) ? 2 : 1;

	return wide_to_int(wide_string, available_len, z, correct);
}
/* /*
* converts an int to a wide string * converts an int to a wide string
* *
@@ -185,6 +259,26 @@ bool int_to_wide(int c, std::wstring & res)
} }
/*
* returns true if c was a correct unicode character
* and has been put to the stream
*/
bool int_to_stream(int c, pt::Stream & stream)
{
	// a char stream gets the character as an UTF-8 sequence,
	// a zero length sequence means that nothing was written
	if( stream.is_char_stream() )
		return int_to_utf8(c, stream) > 0;

	// a wchar_t stream gets the character as wide character(s)
	if( stream.is_wchar_stream() )
		return int_to_wide(c, stream);

	// neither a char nor a wchar_t stream: nothing is written
	return false;
}
/*! /*!
@@ -410,9 +504,9 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool c
if( clear ) if( clear )
res.clear(); res.clear();
bool status = private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, [&res](int c) { bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) {
int_to_wide(c, res); int_to_wide(c, res);
}); }, mode);
return status; return status;
} }

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2010-2023, Tomasz Sowa * Copyright (c) 2010-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -109,6 +109,21 @@ bool surrogate_pair_to_int(int c1, int c2, int & z);
/*
* converting one character into a stream
* stream can be an utf8 or wide stream
*/
bool int_to_stream(int c, pt::Stream & stream);
/*
* converting one unicode character to an int
* such a unicode character can consist of one or two wide characters
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public?
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
/* /*
* *
* *
@@ -158,6 +173,14 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len);
bool int_to_wide(int c, std::wstring & res); bool int_to_wide(int c, std::wstring & res);
/*!
call a convert_function for each character from an utf8 string
*/
template<typename OutputFunction>
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode = 1);
/*! /*!
converting UTF-8 string to a wide string converting UTF-8 string to a wide string
*/ */
@@ -181,9 +204,15 @@ bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear = true, int
template<typename StreamOrStringType> template<typename StreamOrStringType>
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear = true, int mode = 1); bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear = true, int mode = 1);
template<typename OutputFunction>
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode = 1);
template<typename StreamIteratorType, typename StreamOrStringType> template<typename StreamIteratorType, typename StreamOrStringType>
bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1); bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream = true, int mode = 1);
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode = 1);
template<typename CharT, size_t stack_size, size_t heap_block_size> template<typename CharT, size_t stack_size, size_t heap_block_size>
class TextStreamBase; class TextStreamBase;
@@ -220,6 +249,17 @@ template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8); size_t int_to_utf8(int z, StreamType & utf8);
/*!
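convert wide characters from the stream buffer to UTF-8 and call an output_function
for each chunk of the converted UTF-8 bytes
output_function has two arguments: const char * buf, size_t len, and returns a bool:
bool output_function(const char * buf, size_t len)
if output_function returns false the conversion is stopped and false is returned
StreamType should have a const_iterator and begin() and end() methods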
*/
template<typename StreamType, typename OutputFunction>
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode = 1);
/*! /*!
converting a wide string to UTF-8 string converting a wide string to UTF-8 string

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2021-2023, Tomasz Sowa * Copyright (c) 2021-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -81,85 +81,6 @@ return true;
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
// an empty input: nothing is consumed, z is zeroed and the result is flagged as incorrect
if( string_len == 0 )
{
z = 0;
correct = false;
return 0;
}
z = static_cast<int>(*wide_string);
correct = true;
// surrogate pairs are only interpreted when wchar_t is 16 bits wide
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(z) )
{
if( string_len > 1 )
{
int z2 = *(wide_string+1);
if( is_second_surrogate_char(z2) )
{
// combine the low 10 bits of both surrogates into one code point above U+FFFF,
// two wide characters were used
z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
return 2;
}
else
{
// the first surrogate is not followed by a second surrogate:
// only the first wide character is consumed, z still holds its value
correct = false;
return 1;
}
}
else
{
// the first surrogate is the last character of the input
correct = false;
return 1;
}
}
else
{
// a single wide character: correct is whatever utf8_check_range(z) reports
correct = utf8_check_range(z);
return 1;
}
}
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
{
// how many wide characters can be safely passed to the length-based overload
size_t min_str_len = 1;
// an empty (zero terminated) string: nothing is consumed
if( *wide_string == 0 )
{
z = 0;
correct = false;
return 0;
}
// the second character is read only after the first one was found to be non-zero,
// so the terminating zero is never passed; two characters are enough for a surrogate pair
if( *(wide_string+1) != 0 )
min_str_len = 2;
return wide_to_int(wide_string, min_str_len, z, correct);
}
/*! /*!
an auxiliary function for converting from wide characters to UTF-8 an auxiliary function for converting from wide characters to UTF-8

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2021-2023, Tomasz Sowa * Copyright (c) 2021-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -41,14 +41,10 @@
namespace pt namespace pt
{ {
bool utf8_check_range(int c);
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len); size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len);
size_t int_to_utf8(int z, std::string & utf8, bool clear); size_t int_to_utf8(int z, std::string & utf8, bool clear);
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct); size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct);
bool is_surrogate_char(int c); size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
bool is_first_surrogate_char(int c);
bool is_second_surrogate_char(int c);
bool surrogate_pair_to_int(int c1, int c2, int & z);
namespace private_namespace namespace private_namespace
@@ -56,9 +52,6 @@ namespace private_namespace
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res);
bool utf8_to_int_add_next_octet(unsigned char uz, int & res); bool utf8_to_int_add_next_octet(unsigned char uz, int & res);
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); // may these methods make public?
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct);
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode);
@@ -116,144 +109,6 @@ return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode);
// declared in utf8.h, defined in utf8.cpp
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct);
// calls convert_function(int) for each character decoded from the utf8 buffer (utf8_len bytes)
// mode == 1: convert_function(0xFFFD) is called in place of an invalid sequence,
// any other mode: an invalid sequence is skipped
// returns false if at least one invalid sequence was found
template<typename function_type>
bool utf8_to_wide_generic(const char * utf8, size_t utf8_len, int mode, function_type convert_function)
{
int z;
size_t len;
bool correct, was_error = false;
while( utf8_len > 0 )
{
// a byte in the range 0x00-0x7f is taken directly as the character value
if( (unsigned char)*utf8 <= 0x7f )
{
// small optimization
len = 1;
correct = true;
z = static_cast<unsigned char>(*utf8);
}
else
{
len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero
}
if( !correct )
{
if( mode == 1 )
convert_function(0xFFFD); // U+FFFD "replacement character"
was_error = true;
}
else
{
convert_function(z);
}
// skip the bytes reported as used by the decoder
utf8 += len;
utf8_len -= len;
}
return !was_error;
}
// converts the wide characters of 'buffer' to UTF-8 and passes the result in chunks
// to write_function(const char * buf, size_t len), which should return true on success
// mode == 1: U+FFFD is emitted in place of an invalid character,
// any other mode: an invalid character is skipped
// returns false if there was an invalid character or write_function returned false
template<typename StreamType, typename function_type>
bool wide_to_utf8_generic(StreamType & buffer, int mode, function_type write_function)
{
// a local staging buffer, the converted bytes are collected here before they are written out
char utf8_buffer[256];
std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);
// the space kept free in the staging buffer before converting the next character
std::size_t utf8_sequence_max_length = 10;
std::size_t index = 0;
bool was_error = false;
typename StreamType::const_iterator i = buffer.begin();
while( i != buffer.end() )
{
// flush the staging buffer when there may be not enough room for the next sequence
if( index + utf8_sequence_max_length > buffer_len )
{
bool write_status = write_function(utf8_buffer, index);
index = 0;
if( !write_status )
{
// the writer failed: stop converting (index is zero so nothing more is written below)
was_error = true;
break;
}
}
int c = 0xFFFD; // U+FFFD "replacement character";
bool seems_to_be_correct = false;
wchar_t w1 = *i;
// surrogate pairs are only interpreted when wchar_t is 16 bits wide
if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
{
++i;
if( i != buffer.end() )
{
wchar_t w2 = *i;
if( surrogate_pair_to_int(w1, w2, c) )
{
seems_to_be_correct = true;
++i;
}
else
{
// w2 is not consumed here, it will be read again as w1 in the next iteration
was_error = true;
}
}
else
{
// the first surrogate was the last character in the buffer
was_error = true;
}
}
else
{
c = w1;
seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
++i;
}
// a broken surrogate pair is written only in mode 1
if( seems_to_be_correct || mode == 1 )
{
size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);
// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
if( seq_len == 0 )
{
was_error = true;
if( mode == 1 )
{
seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
}
}
index += seq_len;
}
}
// write out what is left in the staging buffer
if( index > 0 )
{
if( !write_function(utf8_buffer, index) )
{
was_error = true;
}
}
return !was_error;
}
} // namespace private_namespace } // namespace private_namespace

View File

@@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2021-2023, Tomasz Sowa * Copyright (c) 2021-2024, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@@ -44,7 +44,6 @@ namespace pt
{ {
template<typename StreamType> template<typename StreamType>
bool int_to_wide(int c, StreamType & res) bool int_to_wide(int c, StreamType & res)
{ {
@@ -128,9 +127,9 @@ bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool cle
if( clear ) if( clear )
res.clear(); res.clear();
bool status = private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, [&res](int c) { bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) {
int_to_wide(c, res); int_to_wide(c, res);
}); }, mode);
return status; return status;
} }
@@ -187,6 +186,137 @@ return !was_error;
} }
/*
	decodes the utf8 buffer (utf8_len bytes) and calls output_function(int)
	for each decoded character

	mode == 1: output_function(0xFFFD) is called in place of an invalid sequence
	other modes: an invalid sequence is skipped

	returns false if at least one invalid sequence was found
*/
template<typename OutputFunction>
bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode)
{
	bool all_correct = true;

	while( utf8_len > 0 )
	{
		int code_point;
		size_t used;
		bool correct;

		const unsigned char first_octet = static_cast<unsigned char>(*utf8);

		if( first_octet <= 0x7f )
		{
			// small optimization: such a byte is the character itself
			code_point = first_octet;
			used = 1;
			correct = true;
		}
		else
		{
			// 'used' will be different from zero
			used = pt::utf8_to_int(utf8, utf8_len, code_point, correct);
		}

		if( correct )
		{
			output_function(code_point);
		}
		else
		{
			if( mode == 1 )
				output_function(0xFFFD); // U+FFFD "replacement character"

			all_correct = false;
		}

		// skip the bytes reported as used by the decoder
		utf8 += used;
		utf8_len -= used;
	}

	return all_correct;
}
/*
	converts the wide characters of 'buffer' to UTF-8 and passes the converted bytes
	in chunks to output_function(const char * buf, size_t len),
	output_function should return true on success

	mode == 1: U+FFFD is emitted in place of an invalid character
	other modes: an invalid character is skipped

	returns false if there was an invalid character or output_function returned false
*/
template<typename StreamType, typename OutputFunction>
bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode)
{
	// converted bytes are collected here before they are handed over to output_function
	char utf8_buffer[256];
	const std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char);

	// the space kept free in utf8_buffer before converting the next character
	const std::size_t utf8_sequence_max_length = 10;

	std::size_t index = 0;
	bool was_error = false;

	typename StreamType::const_iterator i = buffer.begin();

	while( i != buffer.end() )
	{
		if( index + utf8_sequence_max_length > buffer_len )
		{
			// there may be not enough room for the next sequence: flush the buffer
			const bool flushed = output_function(utf8_buffer, index);
			index = 0;

			if( !flushed )
			{
				was_error = true;
				break;
			}
		}

		int c = 0xFFFD; // U+FFFD "replacement character";
		bool decoded = false;

		const wchar_t w1 = *i;
		++i; // w1 is always consumed

		if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) )
		{
			if( i == buffer.end() )
			{
				// the first surrogate was the last character in the buffer
				was_error = true;
			}
			else
			{
				wchar_t w2 = *i;

				if( surrogate_pair_to_int(w1, w2, c) )
				{
					decoded = true;
					++i;
				}
				else
				{
					// w2 is not consumed, it will be read again as w1 in the next iteration
					was_error = true;
				}
			}
		}
		else
		{
			// we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below
			c = w1;
			decoded = true;
		}

		// a broken surrogate pair is written only in mode 1
		if( !decoded && mode != 1 )
			continue;

		size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index);

		// here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough)
		if( seq_len == 0 )
		{
			was_error = true;

			if( mode == 1 )
				seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character";
		}

		index += seq_len;
	}

	// hand over what is left in the buffer
	if( index > 0 && !output_function(utf8_buffer, index) )
		was_error = true;

	return !was_error;
}
/* /*
this function converts a UTF-8 stream into a wide stream or a wide string this function converts a UTF-8 stream into a wide stream or a wide string
@@ -203,6 +333,34 @@ output:
*/ */
template<typename StreamOrStringType> template<typename StreamOrStringType>
bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode)
{
if( clear )
res.clear();
return utf8_to_output_function(stream, [&](int z) {
int_to_wide(z, res);
}, mode);
}
/*
this function reads characters from a UTF-8 stream and calls an output_function
input:
stream - a UTF-8 stream for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" instead of the invalid character (default)
output:
output_function - is a function which gets one argument: int (a unicode character)
and should put the character to the output string/stream, this function should have the signature like this:
output_function(int z)
this function returns false if there were some errors when converting
*/
template<typename OutputFunction>
bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode)
{ {
size_t len; size_t len;
bool correct; bool correct;
@@ -210,11 +368,6 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i
size_t index = 0; size_t index = 0;
bool was_error = false; bool was_error = false;
if( clear )
res.clear();
// CHECKME test me when sizeof(wchar_t) is 2
do do
{ {
len = utf8_to_int(stream, index, z, correct); len = utf8_to_int(stream, index, z, correct);
@@ -224,13 +377,13 @@ bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, i
if( !correct ) if( !correct )
{ {
if( mode == 1 ) if( mode == 1 )
int_to_wide(0xFFFD, res); // U+FFFD "replacement character" output_function(0xFFFD); // U+FFFD "replacement character"
was_error = true; was_error = true;
} }
else else
{ {
int_to_wide(z, res); output_function(z);
} }
index += len; index += len;
@@ -264,6 +417,15 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
if( clear_stream ) if( clear_stream )
out_stream.clear(); out_stream.clear();
return utf8_to_output_function(iterator_in, iterator_end, [&](int z){
int_to_wide(z, out_stream);
}, mode);
}
template<typename StreamIteratorType, typename OutputFunction>
bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode)
{
int res; int res;
bool correct; bool correct;
bool was_error = false; bool was_error = false;
@@ -274,12 +436,12 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
if( correct ) if( correct )
{ {
int_to_wide(res, out_stream); output_function(res);
} }
else else
{ {
if( mode == 1 ) if( mode == 1 )
int_to_wide(0xFFFD, out_stream); // U+FFFD "replacement character" output_function(0xFFFD); // U+FFFD "replacement character"
was_error = true; was_error = true;
} }
@@ -290,7 +452,6 @@ bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & i
/*! /*!
this function converts UTF-8 stream into a wide string this function converts UTF-8 stream into a wide string
@@ -508,10 +669,10 @@ bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, in
if( clear ) if( clear )
utf8.clear(); utf8.clear();
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
utf8.append(utf8_buffer, buffer_len); utf8.append(utf8_buffer, buffer_len);
return true; return true;
}); }, mode);
} }
@@ -574,10 +735,10 @@ bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear
if( clear ) if( clear )
utf8.clear(); utf8.clear();
return private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool {
utf8.write(utf8_buffer, buffer_len); utf8.write(utf8_buffer, buffer_len);
return true; return true;
}); }, mode);
} }
@@ -606,7 +767,7 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe
buffer_ok = true; buffer_ok = true;
max_buffer_size -= 1; // for terminating null character max_buffer_size -= 1; // for terminating null character
is_ok = private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool {
std::size_t i=0; std::size_t i=0;
for( ; i < buffer_len ; ++i) for( ; i < buffer_len ; ++i)
@@ -626,7 +787,7 @@ bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffe
max_buffer_size -= i; max_buffer_size -= i;
*utf8 = 0; *utf8 = 0;
return buffer_ok; return buffer_ok;
}); }, mode);
} }
if( was_buffer_sufficient_large ) if( was_buffer_sufficient_large )

View File

@@ -3,38 +3,37 @@
./convert.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h ./convert.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h
./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
./convert.o: ../src/textstream/stream.h ../src/space/space.h ./convert.o: ../src/textstream/stream.h ../src/space/space.h
./convert.o: ../src/textstream/types.h ../src/convert/inttostr.h ./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./convert.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
./convert.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
./convert.o: ../src/date/date.h ../src/membuffer/membuffer.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./convert.o: ../src/textstream/types.h ../src/textstream/stream_private.h ./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
./convert.o: ../src/convert/strtoint.h ../src/convert/text.h ./convert.o: ../src/convert/text.h ../src/convert/misc.h
./convert.o: ../src/convert/misc.h ../src/convert/double.h test.h ./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h ./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h ./csvparser.o: ../src/utf8/utf8_private.h ../src/convert/baseparser.h
./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h ./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h
./csvparser.o: ../src/textstream/stream.h ../src/date/date.h ./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h
./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h
./csvparser.o: ../src/textstream/stream_private.h test.h ./csvparser.o: test.h
./main.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h ./main.o: convert.h ../src/convert/convert.h ../src/convert/inttostr.h
./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h
./main.o: ../src/textstream/stream.h ../src/space/space.h ./main.o: ../src/textstream/stream.h ../src/space/space.h
./main.o: ../src/textstream/types.h ../src/convert/inttostr.h ./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h
./main.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./main.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
./main.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h ./main.o: ../src/utf8/utf8_private.h ../src/date/date.h
./main.o: ../src/date/date.h ../src/membuffer/membuffer.h ./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./main.o: ../src/textstream/types.h ../src/textstream/stream_private.h ./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h
./main.o: ../src/convert/strtoint.h ../src/convert/text.h ./main.o: ../src/convert/text.h ../src/convert/misc.h
./main.o: ../src/convert/misc.h ../src/convert/double.h test.h ./main.o: ../src/textstream/types.h ../src/convert/double.h test.h
./main.o: mainoptionsparser.h csvparser.h ./main.o: mainoptionsparser.h csvparser.h
./test.o: test.h ./test.o: test.h
./mainoptionsparser.o: mainoptionsparser.h test.h ./mainoptionsparser.o: mainoptionsparser.h test.h
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h ./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h
./mainoptionsparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h ./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
./mainoptionsparser.o: ../src/textstream/stream.h
./mainoptionsparser.o: ../src/utf8/utf8_templates.h ./mainoptionsparser.o: ../src/utf8/utf8_templates.h
./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h ./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h
./mainoptionsparser.o: ../src/convert/inttostr.h ./mainoptionsparser.o: ../src/convert/inttostr.h
@@ -44,4 +43,5 @@
./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./mainoptionsparser.o: ../src/textstream/stream_private.h ./mainoptionsparser.o: ../src/textstream/stream_private.h
./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h ./mainoptionsparser.o: ../src/convert/misc.h ../src/textstream/types.h
./mainoptionsparser.o: ../src/convert/double.h