diff --git a/utf8/utf8.cpp b/utf8/utf8.cpp index 65ee931..5bee161 100755 --- a/utf8/utf8.cpp +++ b/utf8/utf8.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2012, Tomasz Sowa + * Copyright (c) 2010-2014, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -426,7 +426,6 @@ return !was_error; - /*! this function converts one wide character into UTF-8 sequence @@ -539,7 +538,6 @@ return len; - /* an auxiliary function for converting from wide characters to UTF-8 converting a wide character into one int @@ -584,6 +582,7 @@ static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, } else { + correct = UTF8_CheckRange(z); return 1; } } @@ -616,6 +615,55 @@ return WideToInt(wide_string, min_str_len, z, correct); +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too + + utf8_written - how many characters were saved in the utf8 string (the string doesn't have + a null terminating character) + it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read + was_utf8_buf_too_small - will be true if the utf8 buffer is too small + if this flag is true then utf8_written is equal to zero + was_error - will be true if there is an error when converting (there was an incorrect wide character) + (was_error will not be true if the utf8 buffer is too small) +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, + size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + utf8_written = 0; + was_utf8_buf_too_small = false; + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + { + utf8_written = IntToUTF8(z, utf8, utf8_len); + + if( utf8_written == 0 ) + 
was_utf8_buf_too_small = true; + } + else + { + if( mode == 1 ) + { + utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character" + + if( utf8_written == 0 ) + was_utf8_buf_too_small = true; + } + + was_error = true; + } + +return chars; +} + + /*! an auxiliary function for converting from wide characters to UTF-8 @@ -725,7 +773,6 @@ return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); - /*! this function converts a wide string into UTF-8 string @@ -842,6 +889,7 @@ return !was_error; } + /*! this function converts a wide string into UTF-8 stream @@ -889,8 +937,244 @@ bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode) +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a wide string for converting + string_len - length of the wide string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + utf8_written - how many bytes have been written to the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is not null terminated + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately +*/ +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) +{ +bool was_error = false; +bool was_buffer_to_small; +size_t chars, utf8_saved; + + utf8_written = 0; + + while( string_len > 0 ) + { + chars = WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); + + if( was_buffer_to_small ) + { + /* + * if the buffer was too small break immediately + * and
set the was_error flag + */ + was_error = true; + break; + } + + wide_string += chars; + string_len -= chars; + utf8 += utf8_saved; + utf8_len -= utf8_saved; + utf8_written += utf8_saved; + } + +return !was_error; +} + + + +/*! + this function converts a wide string (std::wstring) into UTF-8 stream + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + utf8_written - how many bytes have been written to the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is not null terminated + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately +*/ +bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) +{ + return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode); +} + + + +/*!
+ this function converts a wide string into UTF-8 stream + + input: + wide_string - a wide string for converting + string_len - length of the wide string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is null terminated (even if there were errors during converting) + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately + (in both cases the utf8 buffer is null terminated) +*/ +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode) +{ +size_t utf8_saved; +bool res; + + if( utf8_len == 0 ) + return false; + + res = WideToUTF8(wide_string, string_len, utf8, utf8_len - 1, utf8_saved, mode); + utf8[utf8_saved] = 0; + +return res; +} + + + +/*!
+ this function converts a wide string (std::wstring) into UTF-8 stream + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is null terminated (even if there were errors during converting) + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately + (in both cases the utf8 buffer is null terminated) +*/ +bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode) +{ + return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode); +} + + + +/*!
+ this function converts a wide string into UTF-8 stream + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + utf8_written - how many bytes have been written to the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is not null terminated + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately +*/ +bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode) +{ +bool was_error = false; +bool was_buffer_to_small; +size_t chars, utf8_saved; +size_t len; + + utf8_written = 0; + + while( *wide_string ) + { + len = (*(wide_string+1) == 0) ? 1 : 2; + chars = WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode); + + if( was_buffer_to_small ) + { + /* + * if the buffer was too small break immediately + * and set the was_error flag + */ + was_error = true; + break; + } + + wide_string += chars; + utf8 += utf8_saved; + utf8_len -= utf8_saved; + utf8_written += utf8_saved; + } + +return !was_error; +} + + + +/*!
+ this function converts a wide string into UTF-8 stream + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" instead of the invalid character (default) + + output: + utf8 - a buffer for the UTF-8 stream + utf8_len - the size of the buffer + + this function returns false if there were some errors when converting or the output buffer was too small, + the output string is null terminated (even if there were errors during converting) + + if there is an error when converting (there is an incorrect character in the wide string) the function + will continue converting but if the buffer is too small the function breaks immediately + (in both cases the utf8 buffer is null terminated) +*/ +bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode) +{ +size_t utf8_saved; +bool res; + + if( utf8_len == 0 ) + return false; + + res = WideToUTF8(wide_string, utf8, utf8_len - 1, utf8_saved, mode); + utf8[utf8_saved] = 0; + +return res; +} + + + + } // namespace - - diff --git a/utf8/utf8.h b/utf8/utf8.h index 8030fcd..416d594 100755 --- a/utf8/utf8.h +++ b/utf8/utf8.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2010-2012, Tomasz Sowa + * Copyright (c) 2010-2014, Tomasz Sowa * All rights reserved.
* * Redistribution and use in source and binary forms, with or without @@ -97,10 +97,18 @@ size_t IntToUTF8(int z, std::ostream & utf8); bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear = true, int mode = 1); bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear = true, int mode = 1); bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear = true, int mode = 1); + bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode = 1); bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode = 1); bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode = 1); +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); +bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); +bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode = 1); + +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode = 1); +bool WideToUTF8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode = 1); +bool WideToUTF8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode = 1);