moving utf8.cpp from ezc to pikotools

git-svn-id: svn://ttmath.org/publicrep/pikotools/trunk@367 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Tomasz Sowa 2012-01-11 11:15:36 +00:00
parent 8f9887d01a
commit 8c3d76ce13
1 changed files with 896 additions and 0 deletions

896
utf8/utf8.cpp Executable file
View File

@ -0,0 +1,896 @@
/*
* This file is a part of EZC -- Easy templating in C++
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2010-2011, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "utf8.h"
namespace Ezc
{
/*!
an auxiliary function for converting from UTF-8 string
*/
static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
{
for(len=0 ; (uz & 0x80) != 0 ; ++len)
uz <<= 1;
if( len == 1 )
return false;
res = uz;
if( len > 0 )
res >>= len;
if( res == 0 )
return false;
if( len == 0 )
len = 1;
return true;
}
/*!
an auxiliary function for converting from UTF-8 string
*/
static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res)
{
if( (uz & 0xc0) != 0x80 )
return false;
res <<= 6;
res |= (uz & 0x3F);
return true;
}
/*!
returns true if 'c' is a correct unicode character
*/
bool UTF8_CheckRange(int c)
{
return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF);
}
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 string
utf8_len - size of the input string,
the string should be at least 4 bytes length for correctly
recognized the utf-8 sequence
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input string
(returns zero only if utf8_len is zero)
even if there are errors the functions returns a different from zero value
*/
size_t UTF8ToInt(const char * utf8, size_t utf8_len, int & res, bool & correct)
{
size_t i, len;
res = 0;
correct = false;
if( utf8_len == 0 )
return 0;
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
return 1;
if( utf8_len < len )
return utf8_len;
for(i=1 ; i<len ; ++i)
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
return i;
if( UTF8_CheckRange(res) )
correct = true;
return len;
}
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 string (null terminated)
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input string
(returns zero only if the string has '\0' at the first character)
even if there are errors the functions returns a different from zero value
*/
size_t UTF8ToInt(const char * utf8, int & res, bool & correct)
{
size_t i, len;
res = 0;
correct = false;
if( *utf8 == 0 )
return 0;
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
return 1;
for(i=1 ; i<len ; ++i)
{
if( utf8[i] == 0 )
return i;
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
return i;
}
if( UTF8_CheckRange(res) )
correct = true;
return len;
}
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 string
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input string
(returns zero only if utf8 is empty)
even if there are errors the functions returns a different from zero value
*/
size_t UTF8ToInt(const std::string & utf8, int & res, bool & correct)
{
return UTF8ToInt(utf8.c_str(), utf8.size(), res, correct);
}
/*!
this function converts one UTF-8 character into one wide-character
input:
utf8 - an input UTF-8 stream
output:
res - an output character
correct - true if it is a correct character
the function returns how many characters have been used from the input stream
*/
size_t UTF8ToInt(std::istream & utf8, int & res, bool & correct)
{
size_t i, len;
unsigned char uz;
res = 0;
correct = false;
uz = utf8.get();
if( !utf8 )
return 0;
if( !UTF8ToInt_FirstOctet(uz, len, res) )
return 1;
for(i=1 ; i<len ; ++i)
{
uz = utf8.get();
if( !utf8 )
return i;
if( !UTF8ToInt_AddNextOctet(uz, res) )
return i;
}
if( UTF8_CheckRange(res) )
correct = true;
return len;
}
/*
*/
static void IntToWide(int c, std::wstring & res)
{
if( sizeof(wchar_t)==2 && c>0xffff )
{
c -= 0x10000;
res += static_cast<wchar_t>(((c >> 10) & 0x3FF) + 0xD800);
res += static_cast<wchar_t>((c & 0x3FF) + 0xDC00);
}
else
{
res += static_cast<wchar_t>(c);
}
}
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 string
utf8_len - size of the input string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
bool UTF8ToWide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode)
{
int z;
size_t len;
bool correct, was_error = false;
if( clear )
res.clear();
while( utf8_len > 0 )
{
if( (unsigned char)*utf8 <= 0x7f )
{
// small optimization
len = 1;
correct = true;
z = static_cast<unsigned char>(*utf8);
}
else
{
len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero
}
if( !correct )
{
if( mode == 1 )
res += 0xFFFD; // U+FFFD "replacement character"
was_error = true;
}
else
{
IntToWide(z, res);
}
utf8 += len;
utf8_len -= len;
}
return !was_error;
}
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 null terminated string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
bool UTF8ToWide(const char * utf8, std::wstring & res, bool clear, int mode)
{
size_t utf8_len = 0;
while( utf8[utf8_len] != 0 )
utf8_len += 1;
return UTF8ToWide(utf8, utf8_len, res, clear, mode);
}
/*!
this function converts an utf8 string into wide string (std::wstring)
input:
utf8 - an input utf8 string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
bool UTF8ToWide(const std::string & utf8, std::wstring & res, bool clear, int mode)
{
return UTF8ToWide(utf8.c_str(), utf8.size(), res, clear, mode);
}
/*!
this function converts an utf8 stream into wide string (std::wstring)
input:
utf8 - an input utf8 stream
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
res - an output wide string
the function returns false if there were some errors when converting
*/
bool UTF8ToWide(std::istream & utf8, std::wstring & res, bool clear, int mode)
{
int z;
bool correct, was_error = false;
if( clear )
res.clear();
while( UTF8ToInt(utf8, z, correct) > 0 )
{
if( !correct )
{
if( mode == 1 )
res += 0xFFFD; // U+FFFD "replacement character"
was_error = true;
}
else
{
IntToWide(z, res);
}
}
return !was_error;
}
/*!
this function converts one wide character into UTF-8 sequence
input:
z - wide character
output:
utf8 - a buffer for the output sequence
utf8_len - the size of the buffer
the function returns how many characters have been written to the utf8,
zero means the utf8 buffer is too small or 'z' is an incorrect unicode character
*/
size_t IntToUTF8(int z, char * utf8, size_t utf8_max_len)
{
char buf[10];
int i = 0;
int mask = 0x3f; // 6 first bits set
if( utf8_max_len==0 || !UTF8_CheckRange(z) )
return 0;
if( z <= 0x7f )
{
utf8[0] = static_cast<char>(z);
return 1;
}
do
{
buf[i] = 0x80 | (z & 0x3f);
i += 1;
z >>= 6;
mask >>= 1;
}
while( (z & (~mask)) != 0 );
unsigned int first = -1;
first <<= (7 - i);
first |= (z & mask);
if( size_t(i+1) > utf8_max_len )
return 0;
utf8[0] = static_cast<char>(first);
int a = 1;
for(--i; i>=0 ; --i, ++a)
utf8[a] = buf[i];
return a;
}
/*!
this function converts one wide character into UTF-8 string
input:
z - wide character
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
the function returns how many characters have been written to the utf8 string,
zero means that 'z' is an incorrect unicode character
*/
size_t IntToUTF8(int z, std::string & utf8, bool clear)
{
char buf[10];
if( clear )
utf8.clear();
size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char));
size_t i;
for(i=0 ; i<len ; ++i)
utf8 += buf[i];
return len;
}
/*!
this function converts one wide character into UTF-8 stream
input:
z - wide character
output:
utf8 - a UTF-8 stream for the output sequence
the function returns how many characters have been written to the utf8 stream,
zero means that 'z' is an incorrect unicode character
*/
size_t IntToUTF8(int z, std::ostream & utf8)
{
char buf[10];
size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char));
size_t i;
for(i=0 ; i<len ; ++i)
utf8 << buf[i];
return len;
}
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
if( string_len == 0 )
{
z = 0;
correct = false;
return 0;
}
z = static_cast<int>(*wide_string);
correct = true;
if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
{
if( z>=0xD800 && z<=0xDBFF && string_len>1 )
{
int z2 = *(wide_string+1);
if( z2>=0xDC00 && z2<=0xDFFF )
{
z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
return 2;
}
else
{
correct = false;
return 2;
}
}
else
{
correct = false;
return 1;
}
}
else
{
return 1;
}
}
/*
an auxiliary function for converting from wide characters to UTF-8
converting a wide character into one int
returns how many wide characters were used
if wide_string has at least one character then the return value is always greater than zero too
*/
static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
{
size_t min_str_len = 1;
if( *wide_string == 0 )
{
z = 0;
correct = false;
return 0;
}
if( *(wide_string+1) != 0 )
min_str_len = 2;
return WideToInt(wide_string, min_str_len, z, correct);
}
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
{
int z;
bool correct;
size_t chars;
chars = WideToInt(wide_string, string_len, z, correct);
if( correct )
correct = IntToUTF8(z, utf8, false) != 0;
if( !correct )
{
if( mode == 1 )
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
was_error = true;
}
return chars;
}
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if wide_string has at least one character then the return value is always greater than zero too
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
{
int z;
bool correct;
size_t chars;
chars = WideToInt(wide_string, z, correct);
if( correct )
correct = IntToUTF8(z, utf8, false) != 0;
if( !correct )
{
if( mode == 1 )
IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
was_error = true;
}
return chars;
}
/*!
an auxiliary function for converting from wide characters to UTF-8
returns how many wide characters were used
if string_len is greater than 0 then the return value is always greater than zero too
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
{
int z;
bool correct;
size_t chars;
chars = WideToInt(wide_string, string_len, z, correct);
if( correct )
correct = IntToUTF8(z, utf8) != 0;
if( !correct )
{
if( mode == 1 )
IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
was_error = true;
}
return chars;
}
/*!
an auxiliary function for converting from wide characters to UTF-8
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
{
size_t min_str_len = 1;
if( *wide_string == 0 )
return 0;
if( *(wide_string+1) != 0 )
min_str_len = 2;
return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
}
/*!
this function converts a wide string into UTF-8 string
input:
wide_string - a wide string for converting
string_len - the size of the string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
{
bool was_error = false;
size_t chars;
if( clear )
utf8.clear();
while( string_len > 0 )
{
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
wide_string += chars;
string_len -= chars;
}
return !was_error;
}
/*!
this function converts a wide string into UTF-8 string
input:
wide_string - a null terminated wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode)
{
bool was_error = false;
if( clear )
utf8.clear();
while( *wide_string )
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
return !was_error;
}
/*!
this function converts a wide string (std::wstring) into UTF-8 string
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 string for the output sequence (the string is not cleared)
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode)
{
return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, clear, mode);
}
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a wide string for converting
string_len - size of the string
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
{
bool was_error = false;
size_t chars;
while( string_len > 0 )
{
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
wide_string += chars;
string_len -= chars;
}
return !was_error;
}
/*!
this function converts a wide string into UTF-8 stream
input:
wide_string - a null terminated wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
{
bool was_error = false;
while( *wide_string )
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
return !was_error;
}
/*!
this function converts a wide string (std::wstring) into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode)
{
return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode);
}
} // namespace Ezc