fixed #2: Procedures for reading a UTF-8 string incorrectly read some UTF-8 characters.
Those characters were treated as invalid characters. UTF8ToInt_FirstOctet incorrectly checked whether the first octet is zero (after removing the leading length bits). This is the case only if the UTF-8 character consists of two bytes. For 3 or 4 bytes the first part can have all bits equal to zero.
This commit is contained in:
parent
96eedd9be9
commit
463cec3283
|
@ -56,6 +56,42 @@ bool UTF8_CheckRange(int c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*!
	returns true if 'c' is a correct unicode character for the number
	of bytes it was encoded with

	this method is used when reading from an utf8 string

	c              - the decoded code point
	how_many_bytes - means how many bytes from the utf8 string were read

	each byte count accepts only its own code point range, so a value
	encoded with more bytes than necessary (overlong form) is rejected:
	  1 byte  : 0x0000  - 0x007f
	  2 bytes : 0x0080  - 0x07ff
	  3 bytes : 0x0800  - 0xffff   (without 0xD800 - 0xDFFF)
	  4 bytes : 0x10000 - 0x10FFFF

	any other byte count (or a negative 'c') gives false
*/
bool UTF8_CheckRange(int c, int how_many_bytes)
{
	switch( how_many_bytes )
	{
	case 1:
		return c >= 0x0000 && c <= 0x007f;

	case 2:
		return c >= 0x0080 && c <= 0x07ff;

	case 3:
		// the gap 0xD800 - 0xDFFF (surrogate range) is not a valid character
		return (c >= 0x0800 && c < 0xD800) || (c > 0xDFFF && c <= 0xffff);

	case 4:
		return c >= 0x10000 && c <= 0x10FFFF;

	default:
		return false;
	}
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -97,7 +133,7 @@ size_t i, len;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( UTF8_CheckRange(res) )
|
if( UTF8_CheckRange(res, len) )
|
||||||
correct = true;
|
correct = true;
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
|
@ -141,7 +177,7 @@ size_t i, len;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( UTF8_CheckRange(res) )
|
if( UTF8_CheckRange(res, len) )
|
||||||
correct = true;
|
correct = true;
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
|
@ -210,7 +246,7 @@ unsigned char uz;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( UTF8_CheckRange(res) )
|
if( UTF8_CheckRange(res, len) )
|
||||||
correct = true;
|
correct = true;
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
|
|
|
@ -64,6 +64,14 @@ namespace PT
|
||||||
bool UTF8_CheckRange(int c);
|
bool UTF8_CheckRange(int c);
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
returns true if 'c' is a correct unicode character
|
||||||
|
|
||||||
|
this method is used when reading from an utf8 string
|
||||||
|
how_many_bytes - means how many bytes from the utf8 string were read
|
||||||
|
*/
|
||||||
|
bool UTF8_CheckRange(int c, int how_many_bytes);
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*
|
*
|
||||||
|
|
|
@ -52,7 +52,7 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
|
||||||
for(len=0 ; (uz & 0x80) != 0 ; ++len)
|
for(len=0 ; (uz & 0x80) != 0 ; ++len)
|
||||||
uz <<= 1;
|
uz <<= 1;
|
||||||
|
|
||||||
if( len == 1 )
|
if( len == 1 || len > 4 )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
res = uz;
|
res = uz;
|
||||||
|
@ -60,9 +60,6 @@ bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
|
||||||
if( len > 0 )
|
if( len > 0 )
|
||||||
res >>= len;
|
res >>= len;
|
||||||
|
|
||||||
if( res == 0 )
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if( len == 0 )
|
if( len == 0 )
|
||||||
len = 1;
|
len = 1;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue