diff --git a/utf8/utf8.cpp b/utf8/utf8.cpp new file mode 100755 index 0000000..b78d314 --- /dev/null +++ b/utf8/utf8.cpp @@ -0,0 +1,896 @@ +/* + * This file is a part of EZC -- Easy templating in C++ + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2010-2011, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "utf8.h" + + +namespace Ezc +{ + + + +/*! + an auxiliary function for converting from UTF-8 string +*/ +static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res) +{ + for(len=0 ; (uz & 0x80) != 0 ; ++len) + uz <<= 1; + + if( len == 1 ) + return false; + + res = uz; + + if( len > 0 ) + res >>= len; + + if( res == 0 ) + return false; + + if( len == 0 ) + len = 1; + +return true; +} + + + +/*! + an auxiliary function for converting from UTF-8 string +*/ +static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res) +{ + if( (uz & 0xc0) != 0x80 ) + return false; + + res <<= 6; + res |= (uz & 0x3F); + +return true; +} + + + + +/*! + returns true if 'c' is a correct unicode character +*/ +bool UTF8_CheckRange(int c) +{ + return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF); +} + + + + +/*! + this function converts one UTF-8 character into one wide-character + + input: + utf8 - an input UTF-8 string + utf8_len - size of the input string, + the string should be at least 4 bytes length for correctly + recognized the utf-8 sequence + + output: + res - an output character + correct - true if it is a correct character + + the function returns how many characters have been used from the input string + (returns zero only if utf8_len is zero) + even if there are errors the functions returns a different from zero value +*/ +size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct) +{ +size_t i, len; + + res = 0; + correct = false; + + if( utf8_len == 0 ) + return 0; + + if( !UTF8ToInt_FirstOctet(utf8[0], len, res) ) + return 1; + + if( utf8_len < len ) + return utf8_len; + + for(i=1 ; i0xffff ) + { + c -= 0x10000; + res += static_cast(((c >> 10) & 0x3FF) + 0xD800); + res += static_cast((c & 0x3FF) + 0xDC00); + } + else + { + res += static_cast(c); + } +} + + + + + +/*! + this function converts an utf8 string into wide string (std::wstring) + + input: + utf8 - an input utf8 string + utf8_len - size of the input string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + res - an output wide string + + the function returns false if there were some errors when converting +*/ +bool UTF8ToWide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode) +{ +int z; +size_t len; +bool correct, was_error = false; + + if( clear ) + res.clear(); + + while( utf8_len > 0 ) + { + if( (unsigned char)*utf8 <= 0x7f ) + { + // small optimization + len = 1; + correct = true; + z = static_cast(*utf8); + } + else + { + len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero + } + + if( !correct ) + { + if( mode == 1 ) + res += 0xFFFD; // U+FFFD "replacement character" + + was_error = true; + } + else + { + IntToWide(z, res); + } + + utf8 += len; + utf8_len -= len; + } + +return !was_error; +} + + + +/*! + this function converts an utf8 string into wide string (std::wstring) + + input: + utf8 - an input utf8 null terminated string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + res - an output wide string + + the function returns false if there were some errors when converting +*/ +bool UTF8ToWide(const char * utf8, std::wstring & res, bool clear, int mode) +{ +size_t utf8_len = 0; + + while( utf8[utf8_len] != 0 ) + utf8_len += 1; + +return UTF8ToWide(utf8, utf8_len, res, clear, mode); +} + + + +/*! + this function converts an utf8 string into wide string (std::wstring) + + input: + utf8 - an input utf8 string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + res - an output wide string + + the function returns false if there were some errors when converting +*/ +bool UTF8ToWide(const std::string & utf8, std::wstring & res, bool clear, int mode) +{ + return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode); +} + + + +/*! + this function converts an utf8 stream into wide string (std::wstring) + + input: + utf8 - an input utf8 stream + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + res - an output wide string + + the function returns false if there were some errors when converting +*/ +bool UTF8ToWide(std::istream & utf8, std::wstring & res, bool clear, int mode) +{ +int z; +bool correct, was_error = false; + + if( clear ) + res.clear(); + + while( UTF8ToInt(utf8, z, correct) > 0 ) + { + if( !correct ) + { + if( mode == 1 ) + res += 0xFFFD; // U+FFFD "replacement character" + + was_error = true; + } + else + { + IntToWide(z, res); + } + } + +return !was_error; +} + + + + +/*! + this function converts one wide character into UTF-8 sequence + + input: + z - wide character + + output: + utf8 - a buffer for the output sequence + utf8_len - the size of the buffer + + the function returns how many characters have been written to the utf8, + zero means the utf8 buffer is too small or 'z' is an incorrect unicode character +*/ +size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len) +{ +char buf[10]; +int i = 0; +int mask = 0x3f; // 6 first bits set + + if( utf8_max_len==0 || !UTF8_CheckRange(z) ) + return 0; + + if( z <= 0x7f ) + { + utf8[0] = static_cast(z); + return 1; + } + + do + { + buf[i] = 0x80 | (z & 0x3f); + i += 1; + z >>= 6; + mask >>= 1; + } + while( (z & (~mask)) != 0 ); + + unsigned int first = -1; + first <<= (7 - i); + first |= (z & mask); + + if( size_t(i+1) > utf8_max_len ) + return 0; + + utf8[0] = static_cast(first); + + int a = 1; + for(--i; i>=0 ; --i, ++a) + utf8[a] = buf[i]; + +return a; +} + + + +/*! + this function converts one wide character into UTF-8 string + + input: + z - wide character + + output: + utf8 - a UTF-8 string for the output sequence (the string is not cleared) + + the function returns how many characters have been written to the utf8 string, + zero means that 'z' is an incorrect unicode character +*/ +size_t IntToUTF8(int z, std::string & utf8, bool clear) +{ +char buf[10]; + + if( clear ) + utf8.clear(); + + size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char)); + size_t i; + + for(i=0 ; i(*wide_string); + correct = true; + + if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) ) + { + if( z>=0xD800 && z<=0xDBFF && string_len>1 ) + { + int z2 = *(wide_string+1); + + if( z2>=0xDC00 && z2<=0xDFFF ) + { + z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); + return 2; + } + else + { + correct = false; + return 2; + } + } + else + { + correct = false; + return 1; + } + } + else + { + return 1; + } +} + + + +/* + an auxiliary function for converting from wide characters to UTF-8 + converting a wide character into one int + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + { + z = 0; + correct = false; + return 0; + } + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideToInt(wide_string, min_str_len, z, correct); +} + + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if wide_string has at least one character then the return value is always greater than zero too +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8, false) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 + + returns how many wide characters were used + if string_len is greater than 0 then the return value is always greater than zero too +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode) +{ +int z; +bool correct; +size_t chars; + + chars = WideToInt(wide_string, string_len, z, correct); + + if( correct ) + correct = IntToUTF8(z, utf8) != 0; + + if( !correct ) + { + if( mode == 1 ) + IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character" + + was_error = true; + } + +return chars; +} + + + +/*! + an auxiliary function for converting from wide characters to UTF-8 +*/ +static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode) +{ +size_t min_str_len = 1; + + if( *wide_string == 0 ) + return 0; + + if( *(wide_string+1) != 0 ) + min_str_len = 2; + +return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode); +} + + + + +/*! + this function converts a wide string into UTF-8 string + + input: + wide_string - a wide string for converting + string_len - the size of the string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 string for the output sequence (the string is not cleared) + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode) +{ +bool was_error = false; +size_t chars; + + if( clear ) + utf8.clear(); + + while( string_len > 0 ) + { + chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } + +return !was_error; +} + + + +/*! + this function converts a wide string into UTF-8 string + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 string for the output sequence (the string is not cleared) + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode) +{ +bool was_error = false; + + if( clear ) + utf8.clear(); + + while( *wide_string ) + wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); + +return !was_error; +} + + + +/*! + this function converts a wide string (std::wstring) into UTF-8 string + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 string for the output sequence (the string is not cleared) + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode) +{ + return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, clear, mode); +} + + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a wide string for converting + string_len - size of the string + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode) +{ +bool was_error = false; +size_t chars; + + while( string_len > 0 ) + { + chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); + wide_string += chars; + string_len -= chars; + } + +return !was_error; +} + + +/*! + this function converts a wide string into UTF-8 stream + + input: + wide_string - a null terminated wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode) +{ +bool was_error = false; + + while( *wide_string ) + wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode); + +return !was_error; +} + + + +/*! + this function converts a wide string (std::wstring) into UTF-8 stream + + input: + wide_string - a wide string for converting + mode - what to do with errors when converting + 0: skip an invalid character + 1: put U+FFFD "replacement character" istead of the invalid character (default) + + output: + utf8 - a UTF-8 stream for the output sequence + + this function returns false if there were some errors when converting +*/ +bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode) +{ + return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode); +} + + + + +} // namespace Ezc + + +