reorganization in utf8

- utf8 auxiliary functions have been moved to the utf8_private.h file
- utf8.h now shows only the functions available to consumers
- template functions have been moved to utf8_template.h (utf8.h contains only their declarations); utf8_template.h is included at the end of utf8.h
- functions which take std::ostream have been changed to templates (the stream is now a template argument)
2021-03-15 19:34:51 +01:00
parent effe9be0a3
commit fac3a7eb71
5 changed files with 845 additions and 606 deletions
--- a/utf8/utf8.cpp
+++ b/utf8/utf8.cpp
@@ -5,7 +5,7 @@
 */

 /* 
- * Copyright (c) 2010-2018, Tomasz Sowa
+ * Copyright (c) 2010-2021, Tomasz Sowa
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@
 */

 #include "utf8.h"
+#include "utf8_private.h"



@@ -44,48 +45,6 @@ namespace PT



-/*!
-	an auxiliary function for converting from UTF-8 string
-*/
-static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
-{
-	for(len=0 ; (uz & 0x80) != 0 ; ++len)
-		uz <<= 1;
-
-	if( len == 1 )
-		return false;
-
-	res = uz;
-	
-	if( len > 0 )
-		res >>= len;
-
-	if( res == 0 )
-		return false;
-
-	if( len == 0 )
-		len = 1;
-	
-return true;
-}
-
-
-
-/*!
-	an auxiliary function for converting from UTF-8 string
-*/
-static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res)
-{
-	if( (uz & 0xc0) != 0x80 )
-		return false;
-
-	res <<= 6;
-	res |= (uz & 0x3F);
-
-return true;
-}
-
-


 /*!
@@ -126,15 +85,17 @@ size_t i, len;
 	if( utf8_len == 0 )
 		return 0;

-	if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
+	if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
 		return 1;

 	if( utf8_len < len )
 		return utf8_len;

 	for(i=1 ; i<len ; ++i)
-		if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
+	{
+		if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
 			return i;
+	}

 	if( UTF8_CheckRange(res) )
 		correct = true;
@@ -168,7 +129,7 @@ size_t i, len;
 	if( *utf8 == 0 )
 		return 0;

-	if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
+	if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
 		return 1;

 	for(i=1 ; i<len ; ++i)
@@ -176,7 +137,7 @@ size_t i, len;
 		if( utf8[i] == 0 )
 			return i;

-		if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
+		if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
 			return i;
 	}

@@ -235,7 +196,7 @@ unsigned char uz;
 	if( !utf8 )
 		return 0;

-	if( !UTF8ToInt_FirstOctet(uz, len, res) )
+	if( !private_namespace::UTF8ToInt_FirstOctet(uz, len, res) )
 		return 1;

 	for(i=1 ; i<len ; ++i)
@@ -245,7 +206,7 @@ unsigned char uz;
 		if( !utf8 )
 			return i;

-		if( !UTF8ToInt_AddNextOctet(uz, res) )
+		if( !private_namespace::UTF8ToInt_AddNextOctet(uz, res) )
 			return i;
 	}

@@ -485,268 +446,6 @@ return len;



-/*!
-	this function converts one wide character into UTF-8 stream
-
-	input:
-		z - wide character
-
-	output:
-		utf8 - a UTF-8 stream for the output sequence
-
-	the function returns how many characters have been written to the utf8 stream,
-	zero means that 'z' is an incorrect unicode character
-*/
-size_t IntToUTF8(int z, std::ostream & utf8)
-{
-char buf[10];
-
-	size_t len = IntToUTF8(z, buf, sizeof(buf)/sizeof(char));
-	size_t i;
-	
-	for(i=0 ; i<len ; ++i)
-		utf8 << buf[i];
-
-return len;
-}
-
-
-
-/*
-	an auxiliary function for converting from wide characters to UTF-8
-	converting a wide character into one int
-
-	returns how many wide characters were used
-	if string_len is greater than 0 then the return value is always greater than zero too
-*/
-static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
-{
-	if( string_len == 0 )
-	{
-		z = 0;
-		correct = false;
-		return 0;
-	}
-
-	z = static_cast<int>(*wide_string);
-	correct = true;
-
-	if( sizeof(wchar_t) == 2 && (z>=0xD800 && z<=0xDFFF) )
-	{
-		if( z>=0xD800 && z<=0xDBFF && string_len>1 )
-		{
-			int z2 = *(wide_string+1);
-			
-			if( z2>=0xDC00 && z2<=0xDFFF )
-			{
-				z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
-				return 2;
-			}
-			else
-			{
-				correct = false;
-				return 2;
-			}
-		}
-		else
-		{
-			correct = false;
-			return 1;
-		}
-	}
-	else
-	{
-		correct = UTF8_CheckRange(z);
-		return 1;
-	}
-}
-
-
-
-/*
-	an auxiliary function for converting from wide characters to UTF-8
-	converting a wide character into one int
-
-	returns how many wide characters were used
-	if wide_string has at least one character then the return value is always greater than zero too
-*/
-static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
-{
-size_t min_str_len = 1;
-
-	if( *wide_string == 0 )
-	{
-		z = 0;
-		correct = false;
-		return 0;
-	}
-
-	if( *(wide_string+1) != 0 )
-		min_str_len = 2;
-
-return WideToInt(wide_string, min_str_len, z, correct);
-}
-
-
-
-/*!
-	an auxiliary function for converting from wide characters to UTF-8
-
-	returns how many wide characters were used
-	if string_len is greater than 0 then the return value is always greater than zero too
-
-	utf8_written - how many characters were saved in the utf8 string (the string doesn't have
-	               a null terminating character)
-	               it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
-	was_utf8_buf_too_small -  will be true if the utf8 buffer is too small
-	               if this flag is true then utf8_written is equal to zero
-	was_error    - will be true if there is an error when converting (there was an incorrect wide character)
-	               (was_error will not be true if the utf8 buffer is too small)
-*/
-static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
-							size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
-{
-int z;
-bool correct;
-size_t chars;
-
-	utf8_written = 0;
-	was_utf8_buf_too_small = false;
-	chars = WideToInt(wide_string, string_len, z, correct);
-
-	if( correct )
-	{
-		utf8_written = IntToUTF8(z, utf8, utf8_len);
-
-		if( utf8_written == 0 )
-			was_utf8_buf_too_small = true;
-	}
-	else
-	{
-		if( mode == 1 )
-		{
-			utf8_written = IntToUTF8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
-
-			if( utf8_written == 0 )
-				was_utf8_buf_too_small = true;
-		}
-
-		was_error = true;
-	}
-
-return chars;
-}
-
-
-
-/*!
-	an auxiliary function for converting from wide characters to UTF-8
-
-	returns how many wide characters were used
-	if string_len is greater than 0 then the return value is always greater than zero too
-*/
-static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
-{
-int z;
-bool correct;
-size_t chars;
-
-	chars = WideToInt(wide_string, string_len, z, correct);
-
-	if( correct )
-		correct = IntToUTF8(z, utf8, false) != 0;
-
-	if( !correct )
-	{
-		if( mode == 1 )
-			IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
-
-		was_error = true;
-	}
-
-return chars;
-}
-
-
-
-/*!
-	an auxiliary function for converting from wide characters to UTF-8
-
-	returns how many wide characters were used
-	if wide_string has at least one character then the return value is always greater than zero too
-*/
-static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
-{
-int z;
-bool correct;
-size_t chars;
-
-	chars = WideToInt(wide_string, z, correct);
-
-	if( correct )
-		correct = IntToUTF8(z, utf8, false) != 0;
-
-	if( !correct )
-	{
-		if( mode == 1 )
-			IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"
-
-		was_error = true;
-	}
-
-return chars;
-}
-
-
-
-/*!
-	an auxiliary function for converting from wide characters to UTF-8
-
-	returns how many wide characters were used
-	if string_len is greater than 0 then the return value is always greater than zero too
-*/
-static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
-{
-int z;
-bool correct;
-size_t chars;
-
-	chars = WideToInt(wide_string, string_len, z, correct);
-
-	if( correct )
-		correct = IntToUTF8(z, utf8) != 0;
-
-	if( !correct )
-	{
-		if( mode == 1 )
-			IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"
-
-		was_error = true;
-	}
-
-return chars;
-}
-
-
-
-/*!
-	an auxiliary function for converting from wide characters to UTF-8
-*/
-static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
-{
-size_t min_str_len = 1;
-
-	if( *wide_string == 0 )
-		return 0;
-
-	if( *(wide_string+1) != 0 )
-		min_str_len = 2;
-
-return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
-}
-
-
-
 /*!
 	this function converts a wide string into UTF-8 string

@@ -772,7 +471,7 @@ size_t chars;

 	while( string_len > 0 )
 	{
-		chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
+		chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
 		wide_string += chars;
 		string_len  -= chars;
 	}
@@ -804,7 +503,7 @@ bool was_error = false;
 		utf8.clear();

 	while( *wide_string )
-		wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
+		wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode);

 return !was_error;
 }
@@ -832,83 +531,6 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear



-/*!
-	this function converts a wide string into UTF-8 stream
-
-	input:
-		wide_string - a wide string for converting
-		string_len - size of the string
-		mode - what to do with errors when converting
-			0: skip an invalid character
-			1: put U+FFFD "replacement character" istead of the invalid character (default)
-
-	output:
-		utf8 - a UTF-8 stream for the output sequence
-
-	this function returns false if there were some errors when converting
-*/
-bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
-{
-bool was_error = false;
-size_t chars;
-
-	while( string_len > 0 )
-	{
-		chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
-		wide_string += chars;
-		string_len  -= chars;
-	}
-
-return !was_error;
-}
-
-
-
-/*!
-	this function converts a wide string into UTF-8 stream
-
-	input:
-		wide_string - a null terminated wide string for converting
-		mode - what to do with errors when converting
-			0: skip an invalid character
-			1: put U+FFFD "replacement character" istead of the invalid character (default)
-
-	output:
-		utf8 - a UTF-8 stream for the output sequence
-
-	this function returns false if there were some errors when converting
-*/
-bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
-{
-bool was_error = false;
-
-	while( *wide_string )
-		wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
-
-return !was_error;
-}
-
-
-
-/*!
-	this function converts a wide string (std::wstring) into UTF-8 stream
-
-	input:
-		wide_string - a wide string for converting
-		mode - what to do with errors when converting
-			0: skip an invalid character
-			1: put U+FFFD "replacement character" istead of the invalid character (default)
-
-	output:
-		utf8 - a UTF-8 stream for the output sequence
-
-	this function returns false if there were some errors when converting
-*/
-bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode)
-{
-	return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode);
-}
-


 /*!
@@ -942,7 +564,7 @@ size_t chars, utf8_saved;

 	while( string_len > 0 )
 	{
-		chars = WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
+		chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);

 		if( was_buffer_to_small )
 		{
@@ -1089,7 +711,7 @@ size_t len;
 	while( *wide_string )
 	{
 		len = (*(wide_string+1) == 0) ? 1 : 2;
-		chars = WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
+		chars = private_namespace::WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);

 		if( was_buffer_to_small )
 		{