leave only utf8.h and utf8.cpp

Remove utf8_private.h, utf8_private.cpp and utf8_templates.h and move their methods to utf8.h/utf8.cpp.
2024-05-30 21:20:25 +02:00
parent aacb1f43ae
commit 450c5d55e9
7 changed files with 1038 additions and 1206 deletions
--- a/src/utf8/utf8.cpp
+++ b/src/utf8/utf8.cpp
@@ -34,29 +34,27 @@

 #include <fstream>
 #include "utf8.h"
-#include "utf8_private.h"



 namespace pt
 {

-
-/*!
-	returns true if 'c' is a correct unicode character
-*/
+/*
+ * returns true if 'c' is a correct unicode character: in the range 0..0x10FFFF and not a surrogate (0xD800..0xDFFF)
+ */
 bool utf8_check_range(int c)
 {
 	return c>=0 && c<=0x10FFFF && !(c>=0xD800 && c<=0xDFFF);
 }


-/*!
-	returns true if 'c' is a correct unicode character
-
-	this method is used when reading from an utf8 string
-	how_many_bytes - means how many bytes from the utf8 string were read
-*/
+/*
+ * returns true if 'c' is a correct unicode character
+ *
+ * this method is used when reading from an utf8 string
+ * how_many_bytes - means how many bytes from the utf8 string were read
+ */
 bool utf8_check_range(int c, int how_many_bytes)
 {
 	if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )
@@ -126,12 +124,12 @@ bool surrogate_pair_to_int(int c1, int c2, int & z)


 /*
-	an auxiliary function for converting from wide characters to UTF-8
-	converting a wide character into one int
-
-	returns how many wide characters were used
-	if string_len is greater than 0 then the return value is always greater than zero too
-*/
+ * an auxiliary function for converting from wide characters to UTF-8
+ * converting a wide character into one int
+ *
+ * returns how many wide characters were used
+ * if string_len is greater than 0 then the return value is always greater than zero too
+ */
 size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
 {
 	if( string_len == 0 )
@@ -177,12 +175,12 @@ size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool


 /*
-	an auxiliary function for converting from wide characters to UTF-8
-	converting a wide character into one int
+ * an auxiliary function for converting from wide characters to UTF-8
+ * converting a wide character into one int

-	returns how many wide characters were used
-	if wide_string has at least one character then the return value is always greater than zero too
-*/
+ * returns how many wide characters were used
+ * if wide_string has at least one character then the return value is always greater than zero too
+ */
 size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
 {
 size_t min_str_len = 1;
@@ -235,10 +233,10 @@ size_t int_to_wide(int c, wchar_t * res, size_t max_buf_len)


 /*
-	converts an int to a wide string
-
-	returns true if a character was inserted to the string
-*/
+ * converts an int to a wide string
+ *
+ * returns true if a character was inserted to the string
+ */
 bool int_to_wide(int c, std::wstring & res)
 {
 	wchar_t buf[2];
@@ -281,23 +279,23 @@ bool int_to_stream(int c, pt::Stream & stream)



-/*!
-	this function converts one UTF-8 character into one wide-character
-
-	input:
-		utf8 - an input UTF-8 string
-		utf8_len - size of the input string,
-		           the string should be at least 4 bytes length for correctly
-				   recognized the utf-8 sequence
-
-	output:
-		res - an output character
-		correct - true if it is a correct character
-
-		the function returns how many characters have been used from the input string
-		(returns zero only if utf8_len is zero)
-		even if there are errors the functions returns a different from zero value
-*/
+/*
+ * this function converts one UTF-8 character into one wide-character
+ *
+ * input:
+ *  utf8 - an input UTF-8 string
+ *  utf8_len - size of the input string,
+ *             the string should be at least 4 bytes long for the utf-8 sequence
+ *             to be recognized correctly
+ *
+ * output:
+ *  res - an output character
+ *  correct - true if it is a correct character
+ *
+ * the function returns how many characters have been used from the input string
+ * (returns zero only if utf8_len is zero)
+ * even if there are errors the function returns a non-zero value
+ */
 size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
 {
 size_t i, len;
@@ -1016,6 +1014,160 @@ return res;



+namespace private_namespace
+{
+
+/*!
+	an auxiliary function for converting from UTF-8 string: decodes the first octet (uz) of a sequence into len and res
+*/
+bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
+{
+	for(len=0 ; (uz & 0x80) != 0 ; ++len) // len = number of leading 1-bits of uz, they are shifted out of uz here
+		uz <<= 1;
+
+	if( len == 1 || len > 4 ) // 10xxxxxx is a continuation octet, more than four leading 1-bits is rejected as well
+		return false; // res is left untouched in this case
+
+	res = uz;
+
+	if( len > 0 )
+		res >>= len; // uz was shifted left len times, so move the payload bits back to the lowest positions
+
+	if( len == 0 )
+		len = 1; // no leading 1-bit (0xxxxxxx): the sequence consists of this single octet
+
+return true; // here len is the sequence length in octets (1, 2, 3 or 4) and res holds the payload bits of uz
+}
+
+
+
+/*!
+	an auxiliary function for converting from UTF-8 string: appends the payload of a following octet (uz) to res
+*/
+bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
+{
+	if( (uz & 0xc0) != 0x80 ) // the octet must have the form 10xxxxxx
+		return false; // res is not changed in this case
+
+	res <<= 6; // make room for the six payload bits of uz
+	res |= (uz & 0x3F);
+
+return true;
+}
+
+
+
+/*!
+	an auxiliary function for converting from wide characters to UTF-8
+	one character is read from wide_string (by wide_to_int) and saved to the utf8 buffer (by int_to_utf8)
+	returns how many wide characters were used
+	if string_len is greater than 0 then the return value is always greater than zero too
+
+	utf8_written - how many characters were saved in the utf8 string (without a null terminating character)
+				   it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
+	was_utf8_buf_too_small -  will be true if the utf8 buffer is too small
+				   if this flag is true then utf8_written is equal to zero
+	was_error    - will be set to true if there is an error when converting (there was an incorrect wide character)
+				   (it is not set if only the utf8 buffer is too small)
+				   this function never sets it to false so the caller should initialize it before the call
+	mode         - if equal to 1 then U+FFFD is saved to the utf8 buffer in place of an incorrect wide character
+*/
+size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
+							size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
+{
+int z;
+bool correct;
+size_t chars;
+
+	utf8_written = 0;
+	was_utf8_buf_too_small = false;
+	chars = wide_to_int(wide_string, string_len, z, correct);
+
+	if( correct )
+	{
+		utf8_written = int_to_utf8(z, utf8, utf8_len);
+
+		if( utf8_written == 0 ) // nothing was saved: reported as a too small buffer
+			was_utf8_buf_too_small = true;
+	}
+	else
+	{
+		if( mode == 1 )
+		{
+			utf8_written = int_to_utf8(0xFFFD, utf8, utf8_len); // U+FFFD "replacement character"
+
+			if( utf8_written == 0 ) // even the replacement character did not fit
+				was_utf8_buf_too_small = true;
+		}
+
+		was_error = true; // set regardless of mode, this is the only place where was_error is changed
+	}
+
+return chars;
+}
+
+
+
+/*!
+	an auxiliary function for converting from wide characters to UTF-8 (the result is passed to int_to_utf8 with the utf8 string)
+	was_error is set to true if the conversion fails (never set to false here), if mode is 1 then U+FFFD is saved instead
+	returns how many wide characters were used
+	if string_len is greater than 0 then the return value is always greater than zero too
+*/
+size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
+{
+int z;
+bool correct;
+size_t chars;
+
+	chars = wide_to_int(wide_string, string_len, z, correct);
+
+	if( correct )
+		correct = int_to_utf8(z, utf8, false) != 0; // a zero result from int_to_utf8 is treated as an error too
+
+	if( !correct )
+	{
+		if( mode == 1 )
+			int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
+
+		was_error = true; // set regardless of mode, this is the only place where was_error is changed
+	}
+
+return chars;
+}
+
+
+
+/*!
+	an auxiliary function for converting from wide characters to UTF-8 (this version takes no length of wide_string)
+	was_error is set to true if the conversion fails (never set to false here), if mode is 1 then U+FFFD is saved instead
+	returns how many wide characters were used
+	if wide_string has at least one character then the return value is always greater than zero too
+*/
+size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
+{
+int z;
+bool correct;
+size_t chars;
+
+	chars = wide_to_int(wide_string, z, correct);
+
+	if( correct )
+		correct = int_to_utf8(z, utf8, false) != 0; // a zero result from int_to_utf8 is treated as an error too
+
+	if( !correct )
+	{
+		if( mode == 1 )
+			int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"
+
+		was_error = true; // set regardless of mode, this is the only place where was_error is changed
+	}
+
+return chars;
+}
+
+} // namespace private_namespace
+
+

 } // namespace