diff --git a/src/Makefile.dep b/src/Makefile.dep index 530234e..2623c91 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -3,32 +3,28 @@ ./convert/inttostr.o: ./convert/inttostr.h ./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h ./convert/misc.o: textstream/types.h ./convert/inttostr.h utf8/utf8.h -./convert/misc.o: utf8/utf8_templates.h utf8/utf8_private.h ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: textstream/stream.h space/space.h convert/inttostr.h -./convert/double.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./convert/double.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./convert/double.o: textstream/types.h textstream/stream_private.h +./convert/double.o: utf8/utf8.h textstream/stream.h date/date.h +./convert/double.o: membuffer/membuffer.h textstream/types.h +./convert/double.o: textstream/stream_private.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h ./convert/baseparser.o: textstream/stream.h space/space.h convert/inttostr.h -./convert/baseparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./convert/baseparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./convert/baseparser.o: textstream/types.h textstream/stream_private.h +./convert/baseparser.o: utf8/utf8.h textstream/stream.h date/date.h +./convert/baseparser.o: membuffer/membuffer.h textstream/types.h +./convert/baseparser.o: textstream/stream_private.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: space/space.h convert/inttostr.h utf8/utf8.h -./log/filelog.o: textstream/stream.h utf8/utf8_templates.h -./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./log/filelog.o: textstream/stream.h date/date.h membuffer/membuffer.h ./log/filelog.o: textstream/types.h textstream/stream_private.h ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h ./log/log.o: space/space.h convert/inttostr.h utf8/utf8.h textstream/stream.h -./log/log.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h -./log/log.o: membuffer/membuffer.h textstream/types.h +./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h ./log/log.o: textstream/stream_private.h ./log/filelog.h ./space/space.o: ./space/space.h convert/inttostr.h utf8/utf8.h -./space/space.o: textstream/stream.h utf8/utf8_templates.h -./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h +./space/space.o: textstream/stream.h convert/convert.h ./convert/inttostr.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h ./space/space.o: membuffer/membuffer.h textstream/types.h @@ -37,7 +33,6 @@ ./space/space.o: ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./space/spaceparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./space/spaceparser.o: convert/baseparser.h textstream/textstream.h ./space/spaceparser.o: textstream/stream.h space/space.h date/date.h ./space/spaceparser.o: membuffer/membuffer.h textstream/types.h @@ -45,7 +40,6 @@ ./space/spaceparser.o: ./convert/text.h ./convert/misc.h textstream/types.h ./space/keyvalueparser.o: ./space/keyvalueparser.h ./space/space.h ./space/keyvalueparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h -./space/keyvalueparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./space/keyvalueparser.o: convert/baseparser.h textstream/textstream.h ./space/keyvalueparser.o: textstream/stream.h space/space.h date/date.h ./space/keyvalueparser.o: membuffer/membuffer.h textstream/types.h @@ -53,29 +47,24 @@ ./space/keyvalueparser.o: ./convert/text.h ./convert/misc.h ./space/keyvalueparser.o: textstream/types.h ./textstream/stream_private.o: textstream/stream_private.h -./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./utf8/utf8.o: utf8/utf8_private.h -./utf8/utf8_private.o: utf8/utf8_private.h +./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h convert/inttostr.h -./csv/csvparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./csv/csvparser.o: utf8/utf8_private.h convert/baseparser.h +./csv/csvparser.o: utf8/utf8.h textstream/stream.h convert/baseparser.h ./csv/csvparser.o: textstream/textstream.h textstream/stream.h date/date.h ./csv/csvparser.o: membuffer/membuffer.h textstream/types.h ./csv/csvparser.o: textstream/stream_private.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: space/space.h convert/inttostr.h ./mainoptions/mainoptionsparser.o: utf8/utf8.h textstream/stream.h -./mainoptions/mainoptionsparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h ./html/bbcodeparser.o: textstream/stream.h space/space.h convert/inttostr.h -./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h -./html/bbcodeparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./html/bbcodeparser.o: textstream/types.h textstream/stream_private.h +./html/bbcodeparser.o: utf8/utf8.h textstream/stream.h date/date.h +./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h +./html/bbcodeparser.o: textstream/stream_private.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h ./html/htmlparser.o: space/space.h convert/inttostr.h utf8/utf8.h -./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h -./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./html/htmlparser.o: textstream/stream.h date/date.h membuffer/membuffer.h ./html/htmlparser.o: textstream/types.h textstream/stream_private.h ./html/htmlparser.o: convert/text.h diff --git a/src/utf8/utf8.cpp b/src/utf8/utf8.cpp index fd8c3aa..26c4f31 100644 --- a/src/utf8/utf8.cpp +++ b/src/utf8/utf8.cpp @@ -34,29 +34,27 @@ #include #include "utf8.h" -#include "utf8_private.h" namespace pt { - -/*! - returns true if 'c' is a correct unicode character -*/ +/* + * returns true if 'c' is a correct unicode character + */ bool utf8_check_range(int c) { return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF); } -/*! - returns true if 'c' is a correct unicode character - - this method is used when reading from an utf8 string - how_many_bytes - means how many bytes from the utf8 string were read -*/ +/* + * returns true if 'c' is a correct unicode character + * + * this method is used when reading from an utf8 string + * how_many_bytes - means how many bytes from the utf8 string were read + */ bool utf8_check_range(int c, int how_many_bytes) { if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 ) @@ -126,12 +124,12 @@ bool surrogate_pair_to_int(int c1, int c2, int & z) /* - an auxiliary function for converting from wide characters to UTF-8 - converting a wide character into one int - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ + * an auxiliary function for converting from wide characters to UTF-8 + * converting a wide character into one int + * + * returns how many wide characters were used + * if string_len is greater than 0 then the return value is always greater than zero too + */ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct) { if( string_len == 0 ) @@ -177,12 +175,12 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool /* - an auxiliary function for converting from wide characters to UTF-8 - converting a wide character into one int + * an auxiliary function for converting from wide characters to UTF-8 + * converting a wide character into one int - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ + * returns how many wide characters were used + * if wide_string has at least one character then the return value is always greater than zero too + */ size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct) { size_t min_str_len = 1; @@ -235,10 +233,10 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len) /* - converts an int to a wide string - - returns true if a character was inserted to the string -*/ + * converts an int to a wide string + * + * returns true if a character was inserted to the string + */ bool int_to_wide(int c, std::wstring & res) { wchar_t buf[2]; @@ -281,23 +279,23 @@ bool int_to_stream(int c, pt::Stream & stream) -/*! - this function converts one UTF-8 character into one wide-character - - input: - utf8 - an input UTF-8 string - utf8_len - size of the input string, - the string should be at least 4 bytes length for correctly - recognized the utf-8 sequence - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input string - (returns zero only if utf8_len is zero) - even if there are errors the functions returns a different from zero value -*/ +/* + * this function converts one UTF-8 character into one wide-character + * + * input: + * utf8 - an input UTF-8 string + * utf8_len - size of the input string, + * the string should be at least 4 bytes length for correctly + * recognized the utf-8 sequence + * + * output: + * res - an output character + * correct - true if it is a correct character + * + * the function returns how many characters have been used from the input string + * (returns zero only if utf8_len is zero) + * even if there are errors the functions returns a different from zero value + */ size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct) { size_t i, len; @@ -1016,6 +1014,160 @@ return res; +namespace private_namespace +{ + +/*! + an auxiliary function for converting from UTF-8 string +*/ +bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res) +{ + for(len=0 ; (uz & 0x80) != 0 ; ++len) + uz <<= 1; + + if( len == 1 || len > 4 ) + return false; + + res = uz; + + if( len > 0 ) + res >>= len; + + if( len == 0 ) + len = 1; + +return true; +} + + + +/*! + an auxiliary function for converting from UTF-8 string +*/ +bool utf8_to_int_add_next_octet(unsigned char uz, int & res) +{ + if( (uz & 0xc0) != 0x80 ) + return false; + + res <<= 6; + res |= (uz & 0x3F); + +return true; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too + + utf8_written - how many characters were saved in the utf8 string (the string doesn't have + a null terminating character) + it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read + was_utf8_buf_too_small - will be true if the utf8 buffer is too small + if this flag is true then utf8_written is equal to zero + was_error - will be true if there is an error when converting (there was an incorrect wide character) + (was_error will not be true if the utf8 buffer is too small) +*/ +size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, + size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + utf8_written = 0; + was_utf8_buf_too_small = false; + chars = wide_to_int(wide_string, string_len, z, correct); + + if( correct ) + { + utf8_written = int_to_utf8(z, utf8, utf8_len); + + if( utf8_written == 0 ) + was_utf8_buf_too_small = true; + } + else + { + if( mode == 1 ) + { + utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" + + if( utf8_written == 0 ) + was_utf8_buf_too_small = true; + } + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = wide_to_int(wide_string, string_len, z, correct); + + if( correct ) + correct = int_to_utf8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = wide_to_int(wide_string, z, correct); + + if( correct ) + correct = int_to_utf8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + +} // namespace private_namespace + + } // namespace diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h index fb5abac..51f619b 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -177,7 +177,7 @@ bool int_to_wide(int c, std::wstring & res); call a convert_function for each character from an utf8 string */ template -bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction convert_function, int mode = 1); +bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode = 1); @@ -298,10 +298,832 @@ template bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large = nullptr, int mode = 1); + + +namespace private_namespace +{ +bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); +bool utf8_to_int_add_next_octet(unsigned char uz, int & res); + +size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, + size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); + +size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode); + +size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode); + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +template +static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = wide_to_int(wide_string, string_len, z, correct); + + if( correct ) + correct = int_to_utf8(z, utf8) != 0; + + if( !correct ) + { + if( mode == 1 ) + int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + +/*! + an auxiliary function for converting from wide characters to UTF-8 +*/ +template +static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode) +{ + size_t min_str_len = 1; + + if( *wide_string == 0 ) + return 0; + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode); +} + +} // namespace private_namespace + + + + + + + +template +bool int_to_wide(int c, StreamType & res) +{ + wchar_t buf[2]; + size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); + + if( used == 1 ) + { + res << buf[0]; + } + else + if( used == 2 ) + { + res << buf[0]; + res << buf[1]; + } + + return used > 0; +} + + +/*! + this function converts one UTF-8 character into int + + input: + iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) + iterator_end - an end iterator + + output: + res - an output character + correct - true if it is a correct character + + the function returns how many characters have been used from the input stream +*/ +template +size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct) +{ +size_t i, len; +unsigned char uz; + + res = 0; + correct = false; + + if( iterator_in == iterator_end ) + return 0; + + uz = *iterator_in; + ++iterator_in; + + if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) + return 1; + + for(i=1 ; i stream + (need to be tested) +*/ +// need to be tested +template +bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode) +{ + if( clear ) + res.clear(); + + bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) { + int_to_wide(c, res); + }, mode); + + return status; +} + + + + +template +bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode) +{ +size_t utf8_len = 0; + + while( utf8[utf8_len] != 0 ) + utf8_len += 1; + +return utf8_to_wide(utf8, utf8_len, res, clear, mode); +} + + + +template +bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode) +{ + return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode); +} + + + +template +bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode) +{ +int z; +bool correct, was_error = false; + + if( clear ) + res.clear(); + + while( utf8_to_int(utf8, z, correct) > 0 ) + { + if( !correct ) + { + if( mode == 1 ) + res << 0xFFFD; // U+FFFD "replacement character" + + was_error = true; + } + else + { + int_to_wide(z, res); + } + } + +return !was_error; +} + + +template +bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode) +{ +int z; +size_t len; +bool correct, was_error = false; + + while( utf8_len > 0 ) + { + if( (unsigned char)*utf8 <= 0x7f ) + { + // small optimization + len = 1; + correct = true; + z = static_cast(*utf8); + } + else + { + len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero + } + + if( !correct ) + { + if( mode == 1 ) + output_function(0xFFFD); // U+FFFD "replacement character" + + was_error = true; + } + else + { + output_function(z); + } + + utf8 += len; + utf8_len -= len; + } + +return !was_error; +} + + +template +bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode) +{ + char utf8_buffer[256]; + std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); + std::size_t utf8_sequence_max_length = 10; + std::size_t index = 0; + bool was_error = false; + + typename StreamType::const_iterator i = buffer.begin(); + + while( i != buffer.end() ) + { + if( index + utf8_sequence_max_length > buffer_len ) + { + bool write_status = output_function(utf8_buffer, index); + index = 0; + + if( !write_status ) + { + was_error = true; + break; + } + } + + int c = 0xFFFD; // U+FFFD "replacement character"; + bool seems_to_be_correct = false; + wchar_t w1 = *i; + + if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) + { + ++i; + + if( i != buffer.end() ) + { + wchar_t w2 = *i; + + if( surrogate_pair_to_int(w1, w2, c) ) + { + seems_to_be_correct = true; + ++i; + } + else + { + was_error = true; + } + } + else + { + was_error = true; + } + } + else + { + c = w1; + seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below + ++i; + } + + if( seems_to_be_correct || mode == 1 ) + { + size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index); + // here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough) + + if( seq_len == 0 ) + { + was_error = true; + + if( mode == 1 ) + { + seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character"; + } + } + + index += seq_len; + } + } + + if( index > 0 ) + { + if( !output_function(utf8_buffer, index) ) + { + was_error = true; + } + } + + return !was_error; +} + + +/* +this function converts a UTF-8 stream into a wide stream or a wide string + +input: + stream - a UTF-8 stream for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + +output: + res - a wide stream or a wide string for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) +{ + if( clear ) + res.clear(); + + return utf8_to_output_function(stream, [&](int z) { + int_to_wide(z, res); + }, mode); +} + + +/* +this function reads characters from a UTF-8 stream and calls an output_function + +input: + stream - a UTF-8 stream for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + +output: + output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType + and should put the character to the output string/stream, this function should have the signature like this: + output_function(int z, StreamOrStringType & res) + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode) +{ + size_t len; + bool correct; + int z; + size_t index = 0; + bool was_error = false; + + do + { + len = utf8_to_int(stream, index, z, correct); + + if( len > 0 ) + { + if( !correct ) + { + if( mode == 1 ) + output_function(0xFFFD); // U+FFFD "replacement character" + + was_error = true; + } + else + { + output_function(z); + } + + index += len; + } + } + while( len > 0 ); + + return !was_error; +} + + + + + + +/*! + this function converts UTF-8 stream into a wide stream or a wide string + + input: + iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) + iterator_end - an end iterator + + output: + out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string) + + this function returns false if there were some errors when converting +*/ +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode) +{ + if( clear_stream ) + out_stream.clear(); + + return utf8_to_output_function(iterator_in, iterator_end, [&](int z){ + int_to_wide(z, out_stream); + }, mode); +} + + +template +bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode) +{ + int res; + bool correct; + bool was_error = false; + + while( iterator_in != iterator_end ) + { + utf8_to_int(iterator_in, iterator_end, res, correct); + + if( correct ) + { + output_function(res); + } + else + { + if( mode == 1 ) + output_function(0xFFFD); // U+FFFD "replacement character" + + was_error = true; + } + } + + return !was_error; +} + + + +/*! + this function converts UTF-8 stream into a wide string + + input: + iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) + iterator_end - an end iterator + + output: + out_buffer - an output wide string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large) +{ + int res; + bool correct; + bool was_error = true; + bool was_buffer_ok = false; + + if( max_buffer_len > 0 ) + { + max_buffer_len -= 1; // for terminating null character + was_error = false; + was_buffer_ok = true; + + while( iterator_in != iterator_end ) + { + utf8_to_int(iterator_in, iterator_end, res, correct); + + if( !correct ) + { + was_error = true; + + if( mode == 1 ) + { + res = 0xFFFD; // U+FFFD "replacement character" + correct = true; + } + } + + if( correct ) + { + size_t len = int_to_wide(res, out_buffer, max_buffer_len); + // if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand) + + if( len == 0 ) + { + was_error = true; + was_buffer_ok = false; + break; + } + else + { + out_buffer += len; + max_buffer_len -= len; + } + } + } + + *out_buffer = 0; + } + + if( was_buffer_sufficient_large ) + *was_buffer_sufficient_large = was_buffer_ok; + + return !was_error; +} + + + +/*! + this function converts UTF-8 stream into a wide string + + input: + stream - a stream for reading from + + output: + out_buffer - an output wide string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode) +{ + typename StreamType::const_iterator stream_begin = stream.begin(); + typename StreamType::const_iterator stream_end = stream.end(); + + return utf8_to_wide(stream_begin, stream_end, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large); +} + + + +/*! + this function converts one wide character into UTF-8 stream + + input: + z - wide character + + output: + utf8 - a UTF-8 stream for the output sequence + + the function returns how many characters have been written to the utf8 stream, + zero means that 'z' is an incorrect unicode character +*/ +template +size_t int_to_utf8(int z, StreamType & utf8) +{ + char buf[10]; + + size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char)); + + if( len > 0 ) + utf8.write(buf, len); + + return len; +} + + + + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a wide string for converting + string_len - size of the string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode) +{ +bool was_error = false; +size_t chars; + + while( string_len > 0 ) + { + chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } + +return !was_error; +} + + + + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode) +{ +bool was_error = false; + + while( *wide_string ) + wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode); + +return !was_error; +} + + + +/*! + this function converts a wide string (std::wstring) into UTF-8 stream + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +template +bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode) +{ + return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, mode); +} + + + + +template +bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode) +{ + if( clear ) + utf8.clear(); + + return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { + utf8.append(utf8_buffer, buffer_len); + return true; + }, mode); +} + + + +template +bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode) +{ + bool was_error = false; + + if( clear ) + utf8.clear(); + + for(size_t i=0 ; i < stream.size() ; ++i) + { + int c = static_cast(stream.get_wchar(i)); + bool is_correct = false; + + if( utf8_check_range(c) ) + { + // CHECKME test me when sizeof(wchar_t) == 2 + if( is_first_surrogate_char(c) ) + { + if( i + 1 < stream.size() ) + { + wchar_t c1 = static_cast(c); + wchar_t c2 = stream.get_wchar(++i); + + if( surrogate_pair_to_int(c1, c2, c) ) + { + is_correct = true; + } + } + } + else + { + is_correct = true; + } + } + + if( is_correct ) + { + int_to_utf8(c, utf8); + } + else + { + was_error = true; + + if( mode == 1 ) + int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" + } + } + + return !was_error; +} + + +template +bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) +{ + if( clear ) + utf8.clear(); + + return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { + utf8.write(utf8_buffer, buffer_len); + return true; + }, mode); +} + + + +/*! + this function converts a wide stream into a utf8 string + + input: + buffer - a wide stream for reading from + + output: + utf8 - an output utf8 string + max_buffer_len - how many characters can be write (we write the terminating null character too) + was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large + + this function returns false if there were some errors when converting or if the output buffer was too short +*/ +template +bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode) +{ + bool buffer_ok = false; + bool is_ok = false; + + if( max_buffer_size > 0 ) + { + buffer_ok = true; + max_buffer_size -= 1; // for terminating null character + + is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { + std::size_t i=0; + + for( ; i < buffer_len ; ++i) + { + if( i < max_buffer_size ) + { + *utf8 = utf8_buffer[i]; + utf8 += 1; + } + else + { + buffer_ok = false; + break; + } + } + + max_buffer_size -= i; + *utf8 = 0; + return buffer_ok; + }, mode); + } + + if( was_buffer_sufficient_large ) + *was_buffer_sufficient_large = buffer_ok; + + return is_ok; +} + + } // namespace -#include "utf8/utf8_templates.h" #endif diff --git a/src/utf8/utf8_private.cpp b/src/utf8/utf8_private.cpp deleted file mode 100644 index a4fa2b0..0000000 --- a/src/utf8/utf8_private.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * This file is a part of PikoTools - * and is distributed under the 2-Clause BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2021-2024, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include "utf8_private.h" - - -namespace pt -{ - -namespace private_namespace -{ - -/*! - an auxiliary function for converting from UTF-8 string -*/ -bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res) -{ - for(len=0 ; (uz & 0x80) != 0 ; ++len) - uz <<= 1; - - if( len == 1 || len > 4 ) - return false; - - res = uz; - - if( len > 0 ) - res >>= len; - - if( len == 0 ) - len = 1; - -return true; -} - - - -/*! - an auxiliary function for converting from UTF-8 string -*/ -bool utf8_to_int_add_next_octet(unsigned char uz, int & res) -{ - if( (uz & 0xc0) != 0x80 ) - return false; - - res <<= 6; - res |= (uz & 0x3F); - -return true; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too - - utf8_written - how many characters were saved in the utf8 string (the string doesn't have - a null terminating character) - it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read - was_utf8_buf_too_small - will be true if the utf8 buffer is too small - if this flag is true then utf8_written is equal to zero - was_error - will be true if there is an error when converting (there was an incorrect wide character) - (was_error will not be true if the utf8 buffer is too small) -*/ -size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, - size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - utf8_written = 0; - was_utf8_buf_too_small = false; - chars = wide_to_int(wide_string, string_len, z, correct); - - if( correct ) - { - utf8_written = int_to_utf8(z, utf8, utf8_len); - - if( utf8_written == 0 ) - was_utf8_buf_too_small = true; - } - else - { - if( mode == 1 ) - { - utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" - - if( utf8_written == 0 ) - was_utf8_buf_too_small = true; - } - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ -size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = wide_to_int(wide_string, string_len, z, correct); - - if( correct ) - correct = int_to_utf8(z, utf8, false) != 0; - - if( !correct ) - { - if( mode == 1 ) - int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if wide_string has at least one character then the return value is always greater than zero too -*/ -size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = wide_to_int(wide_string, z, correct); - - if( correct ) - correct = int_to_utf8(z, utf8, false) != 0; - - if( !correct ) - { - if( mode == 1 ) - int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - - - - -} // namespace private_namespace - -} // namespace pt - - - diff --git a/src/utf8/utf8_private.h b/src/utf8/utf8_private.h deleted file mode 100644 index 66208cc..0000000 --- a/src/utf8/utf8_private.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This file is a part of PikoTools - * and is distributed under the 2-Clause BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2021-2024, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef headerfile_pikotools_src_utf8_utf8_private -#define headerfile_pikotools_src_utf8_utf8_private - -#include - - -namespace pt -{ - -size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len); -size_t int_to_utf8(int z, std::string & utf8, bool clear); -size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct); -size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct); - - -namespace private_namespace -{ -bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res); -bool utf8_to_int_add_next_octet(unsigned char uz, int & res); - -size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, - size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode); - -size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode); - -size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode); - - -/*! - an auxiliary function for converting from wide characters to UTF-8 - - returns how many wide characters were used - if string_len is greater than 0 then the return value is always greater than zero too -*/ -template -static size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, bool & was_error, int mode) -{ -int z; -bool correct; -size_t chars; - - chars = wide_to_int(wide_string, string_len, z, correct); - - if( correct ) - correct = int_to_utf8(z, utf8) != 0; - - if( !correct ) - { - if( mode == 1 ) - int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" - - was_error = true; - } - -return chars; -} - - -/*! - an auxiliary function for converting from wide characters to UTF-8 -*/ -template -static size_t wide_one_to_utf8(const wchar_t * wide_string, StreamType & utf8, bool & was_error, int mode) -{ - size_t min_str_len = 1; - - if( *wide_string == 0 ) - return 0; - - if( *(wide_string+1) != 0 ) - min_str_len = 2; - -return wide_one_to_utf8(wide_string, min_str_len, utf8, was_error, mode); -} - - - - -} // namespace private_namespace - -} // namespace pt - -#endif diff --git a/src/utf8/utf8_templates.h b/src/utf8/utf8_templates.h deleted file mode 100644 index eafbc29..0000000 --- a/src/utf8/utf8_templates.h +++ /dev/null @@ -1,808 +0,0 @@ -/* - * This file is a part of PikoTools - * and is distributed under the 2-Clause BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2021-2024, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef headerfile_pikotools_src_utf8_utf8_templates -#define headerfile_pikotools_src_utf8_utf8_templates - -// this file is included at the end of utf8.h - -#include "utf8_private.h" - - -namespace pt -{ - - -template -bool int_to_wide(int c, StreamType & res) -{ - wchar_t buf[2]; - size_t used = int_to_wide(c, buf, sizeof(buf) / sizeof(wchar_t)); - - if( used == 1 ) - { - res << buf[0]; - } - else - if( used == 2 ) - { - res << buf[0]; - res << buf[1]; - } - - return used > 0; -} - - -/*! - this function converts one UTF-8 character into int - - input: - iterator_in - an stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - res - an output character - correct - true if it is a correct character - - the function returns how many characters have been used from the input stream -*/ -template -size_t utf8_to_int(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, int & res, bool & correct) -{ -size_t i, len; -unsigned char uz; - - res = 0; - correct = false; - - if( iterator_in == iterator_end ) - return 0; - - uz = *iterator_in; - ++iterator_in; - - if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) - return 1; - - for(i=1 ; i stream - (need to be tested) -*/ -// need to be tested -template -bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode) -{ - if( clear ) - res.clear(); - - bool status = utf8_to_output_function(utf8, utf8_len, [&res](int c) { - int_to_wide(c, res); - }, mode); - - return status; -} - - - - -template -bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode) -{ -size_t utf8_len = 0; - - while( utf8[utf8_len] != 0 ) - utf8_len += 1; - -return utf8_to_wide(utf8, utf8_len, res, clear, mode); -} - - - -template -bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode) -{ - return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode); -} - - - -template -bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode) -{ -int z; -bool correct, was_error = false; - - if( clear ) - res.clear(); - - while( utf8_to_int(utf8, z, correct) > 0 ) - { - if( !correct ) - { - if( mode == 1 ) - res << 0xFFFD; // U+FFFD "replacement character" - - was_error = true; - } - else - { - int_to_wide(z, res); - } - } - -return !was_error; -} - - -template -bool utf8_to_output_function(const char * utf8, size_t utf8_len, OutputFunction output_function, int mode) -{ -int z; -size_t len; -bool correct, was_error = false; - - while( utf8_len > 0 ) - { - if( (unsigned char)*utf8 <= 0x7f ) - { - // small optimization - len = 1; - correct = true; - z = static_cast(*utf8); - } - else - { - len = pt::utf8_to_int(utf8, utf8_len, z, correct); // the len will be different from zero - } - - if( !correct ) - { - if( mode == 1 ) - output_function(0xFFFD); // U+FFFD "replacement character" - - was_error = true; - } - else - { - output_function(z); - } - - utf8 += len; - utf8_len -= len; - } - -return !was_error; -} - - -template -bool wide_to_output_function(StreamType & buffer, OutputFunction output_function, int mode) -{ - char utf8_buffer[256]; - std::size_t buffer_len = sizeof(utf8_buffer) / sizeof(char); - std::size_t utf8_sequence_max_length = 10; - std::size_t index = 0; - bool was_error = false; - - typename StreamType::const_iterator i = buffer.begin(); - - while( i != buffer.end() ) - { - if( index + utf8_sequence_max_length > buffer_len ) - { - bool write_status = output_function(utf8_buffer, index); - index = 0; - - if( !write_status ) - { - was_error = true; - break; - } - } - - int c = 0xFFFD; // U+FFFD "replacement character"; - bool seems_to_be_correct = false; - wchar_t w1 = *i; - - if( sizeof(wchar_t) == 2 && is_first_surrogate_char(w1) ) - { - ++i; - - if( i != buffer.end() ) - { - wchar_t w2 = *i; - - if( surrogate_pair_to_int(w1, w2, c) ) - { - seems_to_be_correct = true; - ++i; - } - else - { - was_error = true; - } - } - else - { - was_error = true; - } - } - else - { - c = w1; - seems_to_be_correct = true; // we do not test utf8_check_range(...) here because it is tested in int_to_utf8(...) below - ++i; - } - - if( seems_to_be_correct || mode == 1 ) - { - size_t seq_len = int_to_utf8(c, utf8_buffer + index, buffer_len - index); - // here seq_len can be zero only when c is an incorrect unicode char (the buffer is large enough) - - if( seq_len == 0 ) - { - was_error = true; - - if( mode == 1 ) - { - seq_len = int_to_utf8(0xFFFD, utf8_buffer + index, buffer_len - index); // U+FFFD "replacement character"; - } - } - - index += seq_len; - } - } - - if( index > 0 ) - { - if( !output_function(utf8_buffer, index) ) - { - was_error = true; - } - } - - return !was_error; -} - - -/* -this function converts a UTF-8 stream into a wide stream or a wide string - -input: - stream - a UTF-8 stream for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - -output: - res - a wide stream or a wide string for the output sequence - - this function returns false if there were some errors when converting -*/ -template -bool utf8_to_wide(const Stream & stream, StreamOrStringType & res, bool clear, int mode) -{ - if( clear ) - res.clear(); - - return utf8_to_output_function(stream, [&](int z) { - int_to_wide(z, res); - }, mode); -} - - -/* -this function reads characters from a UTF-8 stream and calls an output_function - -input: - stream - a UTF-8 stream for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - -output: - output_function - is a function which gets two artuments: int (character) and a reference to StreamOrStringType - and should put the character to the output string/stream, this function should have the signature like this: - output_function(int z, StreamOrStringType & res) - - this function returns false if there were some errors when converting -*/ -template -bool utf8_to_output_function(const Stream & stream, OutputFunction output_function, int mode) -{ - size_t len; - bool correct; - int z; - size_t index = 0; - bool was_error = false; - - do - { - len = utf8_to_int(stream, index, z, correct); - - if( len > 0 ) - { - if( !correct ) - { - if( mode == 1 ) - output_function(0xFFFD); // U+FFFD "replacement character" - - was_error = true; - } - else - { - output_function(z); - } - - index += len; - } - } - while( len > 0 ); - - return !was_error; -} - - - - - - -/*! - this function converts UTF-8 stream into a wide stream or a wide string - - input: - iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - out_stream - an output wide stream or wide string (the stream can by of any kind, we use only << operator for a stream and += for a string) - - this function returns false if there were some errors when converting -*/ -template -bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, StreamOrStringType & out_stream, bool clear_stream, int mode) -{ - if( clear_stream ) - out_stream.clear(); - - return utf8_to_output_function(iterator_in, iterator_end, [&](int z){ - int_to_wide(z, out_stream); - }, mode); -} - - -template -bool utf8_to_output_function(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, OutputFunction output_function, int mode) -{ - int res; - bool correct; - bool was_error = false; - - while( iterator_in != iterator_end ) - { - utf8_to_int(iterator_in, iterator_end, res, correct); - - if( correct ) - { - output_function(res); - } - else - { - if( mode == 1 ) - output_function(0xFFFD); // U+FFFD "replacement character" - - was_error = true; - } - } - - return !was_error; -} - - - -/*! - this function converts UTF-8 stream into a wide string - - input: - iterator_in - a stream iterator for reading from (the stream can by any stream, we use *, ++ and == operators only) - iterator_end - an end iterator - - output: - out_buffer - an output wide string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ -template -bool utf8_to_wide(StreamIteratorType & iterator_in, const StreamIteratorType & iterator_end, wchar_t * out_buffer, size_t max_buffer_len, int mode, bool * was_buffer_sufficient_large) -{ - int res; - bool correct; - bool was_error = true; - bool was_buffer_ok = false; - - if( max_buffer_len > 0 ) - { - max_buffer_len -= 1; // for terminating null character - was_error = false; - was_buffer_ok = true; - - while( iterator_in != iterator_end ) - { - utf8_to_int(iterator_in, iterator_end, res, correct); - - if( !correct ) - { - was_error = true; - - if( mode == 1 ) - { - res = 0xFFFD; // U+FFFD "replacement character" - correct = true; - } - } - - if( correct ) - { - size_t len = int_to_wide(res, out_buffer, max_buffer_len); - // if len is zero then the output buffer is too short - the res input value was correct (it was returned from utf_to_int(...) beforehand) - - if( len == 0 ) - { - was_error = true; - was_buffer_ok = false; - break; - } - else - { - out_buffer += len; - max_buffer_len -= len; - } - } - } - - *out_buffer = 0; - } - - if( was_buffer_sufficient_large ) - *was_buffer_sufficient_large = was_buffer_ok; - - return !was_error; -} - - - -/*! - this function converts UTF-8 stream into a wide string - - input: - stream - a stream for reading from - - output: - out_buffer - an output wide string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ -template -bool utf8_to_wide(const StreamType & stream, wchar_t * out_buffer, size_t max_buffer_len, bool * was_buffer_sufficient_large, int mode) -{ - typename StreamType::const_iterator stream_begin = stream.begin(); - typename StreamType::const_iterator stream_end = stream.end(); - - return utf8_to_wide(stream_begin, stream_end, out_buffer, max_buffer_len, mode, was_buffer_sufficient_large); -} - - - -/*! - this function converts one wide character into UTF-8 stream - - input: - z - wide character - - output: - utf8 - a UTF-8 stream for the output sequence - - the function returns how many characters have been written to the utf8 stream, - zero means that 'z' is an incorrect unicode character -*/ -template -size_t int_to_utf8(int z, StreamType & utf8) -{ - char buf[10]; - - size_t len = int_to_utf8(z, buf, sizeof(buf)/sizeof(char)); - - if( len > 0 ) - utf8.write(buf, len); - - return len; -} - - - - - -/*! - this function converts a wide string into UTF-8 stream - - input: - wide_string - a wide string for converting - string_len - size of the string - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -template -bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode) -{ -bool was_error = false; -size_t chars; - - while( string_len > 0 ) - { - chars = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode); - wide_string += chars; - string_len -= chars; - } - -return !was_error; -} - - - - - -/*! - this function converts a wide string into UTF-8 stream - - input: - wide_string - a null terminated wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -template -bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode) -{ -bool was_error = false; - - while( *wide_string ) - wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode); - -return !was_error; -} - - - -/*! - this function converts a wide string (std::wstring) into UTF-8 stream - - input: - wide_string - a wide string for converting - mode - what to do with errors when converting - 0: skip an invalid character - 1: put U+FFFD "replacement character" istead of the invalid character (default) - - output: - utf8 - a UTF-8 stream for the output sequence - - this function returns false if there were some errors when converting -*/ -template -bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode) -{ - return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, mode); -} - - - - -template -bool wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode) -{ - if( clear ) - utf8.clear(); - - return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { - utf8.append(utf8_buffer, buffer_len); - return true; - }, mode); -} - - - -template -bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode) -{ - bool was_error = false; - - if( clear ) - utf8.clear(); - - for(size_t i=0 ; i < stream.size() ; ++i) - { - int c = static_cast(stream.get_wchar(i)); - bool is_correct = false; - - if( utf8_check_range(c) ) - { - // CHECKME test me when sizeof(wchar_t) == 2 - if( is_first_surrogate_char(c) ) - { - if( i + 1 < stream.size() ) - { - wchar_t c1 = static_cast(c); - wchar_t c2 = stream.get_wchar(++i); - - if( surrogate_pair_to_int(c1, c2, c) ) - { - is_correct = true; - } - } - } - else - { - is_correct = true; - } - } - - if( is_correct ) - { - int_to_utf8(c, utf8); - } - else - { - was_error = true; - - if( mode == 1 ) - int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character" - } - } - - return !was_error; -} - - -template -bool wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) -{ - if( clear ) - utf8.clear(); - - return wide_to_output_function(buffer, [&utf8](const char * utf8_buffer, std::size_t buffer_len) -> bool { - utf8.write(utf8_buffer, buffer_len); - return true; - }, mode); -} - - - -/*! - this function converts a wide stream into a utf8 string - - input: - buffer - a wide stream for reading from - - output: - utf8 - an output utf8 string - max_buffer_len - how many characters can be write (we write the terminating null character too) - was_buffer_sufficient_large - a pointer to a bool value - if provided it is set to true if the buffer was sufficient large - - this function returns false if there were some errors when converting or if the output buffer was too short -*/ -template -bool wide_stream_to_utf8(StreamType & buffer, char * utf8, std::size_t max_buffer_size, bool * was_buffer_sufficient_large, int mode) -{ - bool buffer_ok = false; - bool is_ok = false; - - if( max_buffer_size > 0 ) - { - buffer_ok = true; - max_buffer_size -= 1; // for terminating null character - - is_ok = wide_to_output_function(buffer, [&utf8, &max_buffer_size, &buffer_ok](const char * utf8_buffer, std::size_t buffer_len) -> bool { - std::size_t i=0; - - for( ; i < buffer_len ; ++i) - { - if( i < max_buffer_size ) - { - *utf8 = utf8_buffer[i]; - utf8 += 1; - } - else - { - buffer_ok = false; - break; - } - } - - max_buffer_size -= i; - *utf8 = 0; - return buffer_ok; - }, mode); - } - - if( was_buffer_sufficient_large ) - *was_buffer_sufficient_large = buffer_ok; - - return is_ok; -} - - - - - -} // namespace pt - -#endif - - - diff --git a/tests/Makefile.dep b/tests/Makefile.dep index f60d315..3feebfa 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -4,16 +4,14 @@ ./convert.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./convert.o: ../src/textstream/stream.h ../src/space/space.h ./convert.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h -./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h +./convert.o: ../src/textstream/stream.h ../src/date/date.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./convert.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h ./convert.o: ../src/convert/text.h ../src/convert/misc.h ./convert.o: ../src/textstream/types.h ../src/convert/double.h test.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h ./csvparser.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./csvparser.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h -./csvparser.o: ../src/utf8/utf8_private.h ../src/convert/baseparser.h +./csvparser.o: ../src/textstream/stream.h ../src/convert/baseparser.h ./csvparser.o: ../src/textstream/textstream.h ../src/textstream/stream.h ./csvparser.o: ../src/date/date.h ../src/membuffer/membuffer.h ./csvparser.o: ../src/textstream/types.h ../src/textstream/stream_private.h @@ -22,8 +20,7 @@ ./main.o: ../src/convert/patternreplacer.h ../src/textstream/textstream.h ./main.o: ../src/textstream/stream.h ../src/space/space.h ./main.o: ../src/convert/inttostr.h ../src/utf8/utf8.h -./main.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h -./main.o: ../src/utf8/utf8_private.h ../src/date/date.h +./main.o: ../src/textstream/stream.h ../src/date/date.h ./main.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./main.o: ../src/textstream/stream_private.h ../src/convert/strtoint.h ./main.o: ../src/convert/text.h ../src/convert/misc.h @@ -34,9 +31,7 @@ ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h ./mainoptionsparser.o: ../src/space/space.h ../src/convert/inttostr.h ./mainoptionsparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h -./mainoptionsparser.o: ../src/utf8/utf8_templates.h -./mainoptionsparser.o: ../src/utf8/utf8_private.h ../src/convert/convert.h -./mainoptionsparser.o: ../src/convert/inttostr.h +./mainoptionsparser.o: ../src/convert/convert.h ../src/convert/inttostr.h ./mainoptionsparser.o: ../src/convert/patternreplacer.h ./mainoptionsparser.o: ../src/textstream/textstream.h ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h