From 463cec328331e89792334e949dee661723fe52c1 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Fri, 7 May 2021 15:53:19 +0200 Subject: [PATCH] fixed #2: Procedures for reading a utf8 string incorrectly read some utf-8 characters. Those characters were treated as invalid characters. UTF8ToInt_FirstOctet incorrectly checked if the first octet is zero (after removing the first bits). This is the case only if the utf-8 character consists of two bytes. For 3 or 4 bytes the first part can have all bits equal to zero. --- utf8/utf8.cpp | 42 +++++++++++++++++++++++++++++++++++++++--- utf8/utf8.h | 8 ++++++++ utf8/utf8_private.cpp | 5 +---- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/utf8/utf8.cpp b/utf8/utf8.cpp index b804e1f..6090e59 100644 --- a/utf8/utf8.cpp +++ b/utf8/utf8.cpp @@ -56,6 +56,42 @@ bool UTF8_CheckRange(int c) } +/*! + returns true if 'c' is a correct unicode character + + this method is used when reading from a utf8 string + how_many_bytes - means how many bytes from the utf8 string were read +*/ +bool UTF8_CheckRange(int c, int how_many_bytes) +{ + if( c >= 0x0000 && c <= 0x007f && how_many_bytes == 1 ) + { + return true; + } + + if( c >= 0x0080 && c <= 0x07ff && how_many_bytes == 2 ) + { + return true; + } + + if( c >= 0x0800 && c < 0xD800 && how_many_bytes == 3) + { + return true; + } + + if( c > 0xDFFF && c <= 0xffff && how_many_bytes == 3) + { + return true; + } + + if( c >= 0x10000 && c <= 0x10FFFF && how_many_bytes == 4 ) + { + return true; + } + +return false; +} + /*! 
@@ -97,7 +133,7 @@ size_t i, len; return i; } - if( UTF8_CheckRange(res) ) + if( UTF8_CheckRange(res, len) ) correct = true; return len; @@ -141,7 +177,7 @@ size_t i, len; return i; } - if( UTF8_CheckRange(res) ) + if( UTF8_CheckRange(res, len) ) correct = true; return len; @@ -210,7 +246,7 @@ unsigned char uz; return i; } - if( UTF8_CheckRange(res) ) + if( UTF8_CheckRange(res, len) ) correct = true; return len; diff --git a/utf8/utf8.h b/utf8/utf8.h index a8c0362..979dd9e 100644 --- a/utf8/utf8.h +++ b/utf8/utf8.h @@ -64,6 +64,14 @@ namespace PT bool UTF8_CheckRange(int c); +/*! + returns true if 'c' is a correct unicode character + + this method is used when reading from a utf8 string + how_many_bytes - means how many bytes from the utf8 string were read +*/ +bool UTF8_CheckRange(int c, int how_many_bytes); + /* * diff --git a/utf8/utf8_private.cpp b/utf8/utf8_private.cpp index cba2dfe..71248d0 100644 --- a/utf8/utf8_private.cpp +++ b/utf8/utf8_private.cpp @@ -52,7 +52,7 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res) for(len=0 ; (uz & 0x80) != 0 ; ++len) uz <<= 1; - if( len == 1 ) + if( len == 1 || len > 4 ) return false; res = uz; @@ -60,9 +60,6 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res) if( len > 0 ) res >>= len; - if( res == 0 ) - return false; - if( len == 0 ) len = 1;