From 463cec328331e89792334e949dee661723fe52c1 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Fri, 7 May 2021 15:53:19 +0200
Subject: [PATCH] fixed #2: Procedures for reading an utf8 string incorrectly
 read some utf-8 characters. Those characters were treated as invalid
 characters.

UTF8ToInt_FirstOctet incorrectly rejected the character when the first octet was zero (after removing the leading length bits).
That check is valid only if the utf-8 character consists of two bytes. For 3 or 4 bytes
the payload bits of the first octet can all be equal to zero.
---
 utf8/utf8.cpp         | 42 +++++++++++++++++++++++++++++++++++++++---
 utf8/utf8.h           |  8 ++++++++
 utf8/utf8_private.cpp |  5 +----
 3 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/utf8/utf8.cpp b/utf8/utf8.cpp
index b804e1f..6090e59 100644
--- a/utf8/utf8.cpp
+++ b/utf8/utf8.cpp
@@ -56,6 +56,42 @@ bool UTF8_CheckRange(int c)
 }
 
 
+/*!
+	returns true if 'c' is a correct unicode character and its value lies
+	in the range which is accepted below for exactly 'how_many_bytes' bytes
+	this method is used when reading from an utf8 string
+	how_many_bytes - means how many bytes from the utf8 string were read
+*/
+bool UTF8_CheckRange(int c, int how_many_bytes)
+{
+	if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 )	// one byte: 0x00 - 0x7f
+	{
+		return true;
+	}
+
+	if( c >= 0x0080 && c <= 0x07ff && how_many_bytes == 2 )	// two bytes: 0x80 - 0x7ff
+	{
+		return true;
+	}
+
+	if( c >= 0x0800 && c < 0xD800 && how_many_bytes == 3)	// three bytes, values below 0xD800
+	{
+		return true;
+	}
+
+	if( c > 0xDFFF && c <= 0xffff && how_many_bytes == 3)	// three bytes, values above 0xDFFF (0xD800 - 0xDFFF is never accepted)
+	{
+		return true;
+	}
+
+	if( c >= 0x10000 && c <= 0x10FFFF && how_many_bytes == 4 )	// four bytes: 0x10000 - 0x10FFFF
+	{
+		return true;
+	}
+
+return false;	// value outside of all ranges or stored with a different number of bytes
+}
+
 
 
 /*!
@@ -97,7 +133,7 @@ size_t i, len;
 			return i;
 	}
 
-	if( UTF8_CheckRange(res) )
+	if( UTF8_CheckRange(res, len) )
 		correct = true;
 
 return len;
@@ -141,7 +177,7 @@ size_t i, len;
 			return i;
 	}
 
-	if( UTF8_CheckRange(res) )
+	if( UTF8_CheckRange(res, len) )
 		correct = true;
 
 return len;
@@ -210,7 +246,7 @@ unsigned char uz;
 			return i;
 	}
 
-	if( UTF8_CheckRange(res) )
+	if( UTF8_CheckRange(res, len) )
 		correct = true;
 
 return len;
diff --git a/utf8/utf8.h b/utf8/utf8.h
index a8c0362..979dd9e 100644
--- a/utf8/utf8.h
+++ b/utf8/utf8.h
@@ -64,6 +64,14 @@ namespace PT
 bool UTF8_CheckRange(int c);
 
 
+/*!
+	returns true if 'c' is a correct unicode character
+
+	this method is used when reading from an utf8 string
+	how_many_bytes - means how many bytes from the utf8 string were read
+*/
+bool UTF8_CheckRange(int c, int how_many_bytes);
+
 
 /*
  *
diff --git a/utf8/utf8_private.cpp b/utf8/utf8_private.cpp
index cba2dfe..71248d0 100644
--- a/utf8/utf8_private.cpp
+++ b/utf8/utf8_private.cpp
@@ -52,7 +52,7 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
 	for(len=0 ; (uz & 0x80) != 0 ; ++len)
 		uz <<= 1;
 
-	if( len == 1 )
+	if( len == 1 || len > 4 )
 		return false;
 
 	res = uz;
@@ -60,9 +60,6 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
 	if( len > 0 )
 		res >>= len;
 
-	if( res == 0 )
-		return false;
-
 	if( len == 0 )
 		len = 1;