From 2d304a9714abe337a927257a739efbd116e8a2c9 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 25 May 2011 21:43:40 +0000
Subject: [PATCH] changed to UTF-8: added support for UTF-16 when wchar_t is 2
 bytes long

git-svn-id: svn://ttmath.org/publicrep/ezc/trunk@345 e52654a7-88a9-db11-a3e9-0013d4bc506e
---
 src/utf8.cpp | 215 +++++++++++++++++++++++++++++++++++++++++++++++----
 src/utf8.h   |  13 ++--
 2 files changed, 205 insertions(+), 23 deletions(-)
diff --git a/src/utf8.cpp b/src/utf8.cpp
index 0bc8bbb..fb8365a 100755
--- a/src/utf8.cpp
+++ b/src/utf8.cpp
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2010, Tomasz Sowa
+ * Copyright (c) 2010-2011, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -212,6 +212,23 @@ return len;
 
 
 
+/*
+	appends the code point 'c' to 'res'; when wchar_t has 2 bytes and 'c' is
+	above U+FFFF it is stored as a UTF-16 surrogate pair (high unit first)
+*/
+static void IntToWide(int c, std::wstring & res)
+{
+	if( sizeof(wchar_t) != 2 || c <= 0xffff )
+	{
+		res += static_cast<wchar_t>(c);
+		return;
+	}
+
+	const int v = c - 0x10000; // the remaining bits are split into two 10-bit halves
+	res += static_cast<wchar_t>(0xD800 + ((v >> 10) & 0x3FF));
+	res += static_cast<wchar_t>(0xDC00 + (v & 0x3FF));
+}
+
 
 
 
@@ -254,7 +271,7 @@ bool correct, was_error = false;
 			len = UTF8ToInt(utf8, utf8_len, z, correct); // the len will be different from zero
 		}
 
-		if( !correct || (sizeof(wchar_t)==2 && z>0xffff) )
+		if( !correct )
 		{
 			if( mode == 1 )
 				res += 0xFFFD; // U+FFFD "replacement character"
@@ -263,7 +280,7 @@ bool correct, was_error = false;
 		}
 		else
 		{
-			res += static_cast<wchar_t>(z);
+			IntToWide(z, res);
 		}
 
 		utf8     += len;
@@ -346,7 +363,7 @@ bool correct, was_error = false;
 
 	while( UTF8ToInt(utf8, z, correct) > 0 )
 	{
-		if( !correct || (sizeof(wchar_t)==2 && z>0xffff) )
+		if( !correct )
 		{
 			if( mode == 1 )
 				res += 0xFFFD; // U+FFFD "replacement character"
@@ -355,7 +372,7 @@ bool correct, was_error = false;
 		}
 		else
 		{
-			res += z;
+			IntToWide(z, res);
 		}
 	}
 
@@ -478,38 +495,192 @@ return len;
 
 
 
+/*
+	an auxiliary function for converting from wide characters to UTF-8
+	converting one wide character (or one UTF-16 surrogate pair when wchar_t is 2 bytes long) into one int
+
+	returns how many wide characters were used (zero only if string_len is zero, otherwise 1 or 2)
+	correct is set to false for an empty input and for an unpaired surrogate (only that one unit is used then)
+*/
+static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
+{
+	if( string_len == 0 )
+	{
+		z = 0;
+		correct = false;
+		return 0;
+	}
+
+	z = static_cast<int>(*wide_string);
+	correct = true;
+
+	if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
+	{
+		if( z>=0xD800 && z<=0xDBFF && string_len>1 )
+		{
+			int z2 = *(wide_string+1);
+			
+			if( z2>=0xDC00 && z2<=0xDFFF )
+			{
+				z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF)); // explicit parentheses: '+' binds tighter than '|'
+				return 2;
+			}
+			else
+			{
+				correct = false;
+				return 1; // only the lone high surrogate is used, z2 can be a valid character for the next call
+			}
+		}
+		else
+		{
+			correct = false;
+			return 1;
+		}
+	}
+	else
+	{
+		return 1;
+	}
+}
+
+
+
+/*
+	an auxiliary function for converting from wide characters to UTF-8
+	the null terminated string version of WideToInt() taking string_len
+	(the conversion itself is done by that overload)
+
+	returns how many wide characters were used, zero only for an empty string
+	(z is set to zero and correct to false in such a case)
+*/
+static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
+{
+	if( wide_string[0] == 0 )
+	{
+		correct = false;
+		z = 0;
+		return 0;
+	}
+
+	// one code point takes at most two wide characters (a surrogate pair)
+	// so it is enough to check whether the second one is the terminator
+	const size_t visible_len = (wide_string[1] != 0) ? 2 : 1;
+
+return WideToInt(wide_string, visible_len, z, correct);
+}
+
+
 
 
 /*!
 	an auxiliary function for converting from wide characters to UTF-8
+	one code point is read with WideToInt() and passed to IntToUTF8(); on failure was_error is set (and U+FFFD is written if mode is 1)
+	returns how many wide characters were used
+	if string_len is greater than 0 then the return value is always greater than zero too
 */
-static void WideToUTF8(wchar_t z, std::string & utf8, bool & was_error, int mode)
+static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
 {
-	if( IntToUTF8(z, utf8, false) == 0 )
+int z;
+bool correct;
+size_t chars;
+
+	chars = WideToInt(wide_string, string_len, z, correct); // correct is false e.g. for an unpaired surrogate
+
+	if( correct )
+		correct = IntToUTF8(z, utf8, false) != 0; // zero returned by IntToUTF8() is treated as an error
+
+	if( !correct )
 	{
 		if( mode == 1 )
 			IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
 
 		was_error = true;
 	}
+
+return chars;
 }
 
 
+
 /*!
 	an auxiliary function for converting from wide characters to UTF-8
+	(a null terminated string version, the conversion is done by the overload taking string_len)
+	returns how many wide characters were used
+	zero is returned only for an empty string: utf8 and was_error are not changed in such a case
 */
-static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mode)
+static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
 {
-	if( IntToUTF8(z, utf8) == 0 )
+size_t min_str_len = 1;
+
+	// an empty string is not an error, there is simply nothing to convert
+	// (the std::ostream version of this function behaves in the same way)
+	if( *wide_string == 0 )
+		return 0;
+
+	// one code point takes at most two wide characters (a surrogate pair)
+	// so we do not have to look for the terminating zero any further
+	if( *(wide_string+1) != 0 )
+		min_str_len = 2;
+
+	// the overload taking string_len calls WideToInt() and IntToUTF8(),
+	// sets was_error and writes U+FFFD (when mode is 1) if something is wrong
+	// so there is no need to repeat that code here
+	// for a non-empty string the result is the same as before
+
+return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
+}
+
+
+
+/*!
+	an auxiliary function for converting from wide characters to UTF-8
+	one code point is read with WideToInt() and passed to IntToUTF8(); on failure was_error is set (and U+FFFD is written if mode is 1)
+	returns how many wide characters were used
+	if string_len is greater than 0 then the return value is always greater than zero too
+*/
+static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
+{
+int z;
+bool correct;
+size_t chars;
+
+	chars = WideToInt(wide_string, string_len, z, correct); // correct is false e.g. for an unpaired surrogate
+
+	if( correct )
+		correct = IntToUTF8(z, utf8) != 0; // zero returned by IntToUTF8() is treated as an error
+
+	if( !correct )
 	{
 		if( mode == 1 )
 			IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
 
 		was_error = true;
 	}
+
+return chars;
 }
 
 
+
+/*!
+	an auxiliary function for converting from wide characters to UTF-8
+	(a null terminated string version, returns zero for an empty string)
+*/
+static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
+{
+	if( wide_string[0] == 0 )
+		return 0;
+
+	// one code point takes at most two wide characters (a surrogate pair)
+	// so there is no need to look for the terminator any further
+	const size_t visible_len = (wide_string[1] != 0) ? 2 : 1;
+
+return WideOneToUTF8(wide_string, visible_len, utf8, was_error, mode);
+}
+
+
+
+
 /*!
 	this function converts a wide string into UTF-8 string
 
@@ -528,12 +699,17 @@ static void WideToUTF8(wchar_t z, std::ostream & utf8, bool & was_error, int mod
 bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
 {
 bool was_error = false;
+size_t chars; // how many wide characters the last WideOneToUTF8() call used
 
 	if( clear )
 		utf8.clear();
 
-	for(size_t i=0 ; i<string_len ; ++i)
-		WideToUTF8(wide_string[i], utf8, was_error, mode);
+	while( string_len > 0 )
+	{
+		chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); // one code point at a time
+		wide_string += chars;
+		string_len  -= chars; // WideToInt() uses two characters only if string_len is greater than one
+	}
 
 return !was_error;
 }
@@ -561,8 +737,8 @@ bool was_error = false;
 	if( clear )
 		utf8.clear();
 
-	for( ; *wide_string != 0 ; ++wide_string )
-		WideToUTF8(*wide_string, utf8, was_error, mode);
+	while( *wide_string )
+		wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
 
 return !was_error;
 }
@@ -608,9 +784,14 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear
 bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
 {
 bool was_error = false;
+size_t chars; // how many wide characters the last WideOneToUTF8() call used
 
-	for(size_t i=0 ; i<string_len ; ++i)
-		WideToUTF8(wide_string[i], utf8, was_error, mode);
+	while( string_len > 0 )
+	{
+		chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode); // one code point at a time
+		wide_string += chars;
+		string_len  -= chars; // WideToInt() uses two characters only if string_len is greater than one
+	}
 
 return !was_error;
 }
@@ -634,8 +815,8 @@ bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
 {
 bool was_error = false;
 
-	for( ; *wide_string != 0 ; ++wide_string )
-		WideToUTF8(*wide_string, utf8, was_error, mode);
+	while( *wide_string )
+		wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
 
 return !was_error;
 }
diff --git a/src/utf8.h b/src/utf8.h
index 1b7e4e7..8615f70 100755
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2010, Tomasz Sowa
+ * Copyright (c) 2010-2011, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -48,6 +48,12 @@ namespace Ezc
 /*!
 	UTF-8, a transformation format of ISO 10646
 	http://tools.ietf.org/html/rfc3629
+
+	when wchar_t is 4 bytes long we use UTF-32
+	when wchar_t is 2 bytes long we use UTF-16 (with surrogate pairs)
+
+	UTF-16
+	http://www.ietf.org/rfc/rfc2781.txt
 */
 
 
@@ -69,11 +75,6 @@ size_t UTF8ToInt(std::istream & utf8,                int & res, bool & correct);
 
 /*!
 	converting UTF-8 string to a wide string
-	
-	warning: current limitation
-	 on MS Windows wide characters consist of two bytes only
-	 and we tread them as UCS-2 (not UTF-16 with surrogate pairs as it should be trated)
-	 so unicode characters above 0xffff value are ignored (depending on 'mode' parameter)
 */
 bool UTF8ToWide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear = true, int mode = 1);
 bool UTF8ToWide(const char * utf8,                  std::wstring & res, bool clear = true, int mode = 1);