reorganization in utf8

- utf8 auxiliary functions have been moved to the utf8_private.h file
- utf8.h now shows only the functions available to consumers
- template functions have been moved to utf8_template.h (utf8.h contains only their declarations);
  utf8_template.h is included at the end of utf8.h
- functions which take std::ostream have been changed to templates (the stream is now a template argument)
This commit is contained in:
2021-03-15 19:34:51 +01:00
parent effe9be0a3
commit fac3a7eb71
5 changed files with 845 additions and 606 deletions

View File

@@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2010-2018, Tomasz Sowa
* Copyright (c) 2010-2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@
*/
#include "utf8.h"
#include "utf8_private.h"
@@ -44,48 +45,6 @@ namespace PT
/*!
    an auxiliary function for converting from UTF-8 string

    decodes the first octet of a UTF-8 sequence

    input:
        uz  - the first octet of the sequence

    output:
        len - the length of the whole sequence in octets, taken from the number
              of leading one-bits of uz (for a single octet character len is 1)
        res - the payload bits carried by the first octet
              (not touched when uz is a continuation octet)

    the function returns false when uz cannot start a sequence:
        - uz is a continuation octet (10xxxxxx)
        - uz is zero
        - the payload is zero and uz is not a three or four octet lead
          (e.g. 0xC0, which could only start an overlong form)

    0xE0 and 0xF0 carry no payload bits of their own but they are valid leads
    (for U+0800..U+0FFF and U+10000..U+3FFFF), so a zero payload must be
    accepted for them, otherwise all characters from those ranges are rejected
*/
static bool UTF8ToInt_FirstOctet(unsigned char uz, size_t & len, int & res)
{
    // every leading one-bit is shifted out, what is left are the payload
    // bits moved 'len' positions to the left
    for(len=0 ; (uz & 0x80) != 0 ; ++len)
        uz <<= 1;

    if( len == 1 )
        return false;

    res = uz;

    // move the payload bits back to the least significant positions
    if( len > 0 )
        res >>= len;

    const bool payload_can_be_zero = (len == 3 || len == 4);

    if( res == 0 && !payload_can_be_zero )
        return false;

    if( len == 0 )
        len = 1;

    return true;
}
/*!
    an auxiliary function for converting from UTF-8 string

    appends the six payload bits of a continuation octet (10xxxxxx) to res

    returns false and leaves res untouched if uz is not a continuation octet
*/
static bool UTF8ToInt_AddNextOctet(unsigned char uz, int & res)
{
    // the two most significant bits of a continuation octet are always '10'
    const bool is_continuation = ((uz >> 6) == 0x02);

    if( is_continuation )
        res = (res << 6) | (uz & 0x3F);

    return is_continuation;
}
/*!
@@ -126,15 +85,17 @@ size_t i, len;
if( utf8_len == 0 )
return 0;
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
return 1;
if( utf8_len < len )
return utf8_len;
for(i=1 ; i<len ; ++i)
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
{
if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
return i;
}
if( UTF8_CheckRange(res) )
correct = true;
@@ -168,7 +129,7 @@ size_t i, len;
if( *utf8 == 0 )
return 0;
if( !UTF8ToInt_FirstOctet(utf8[0], len, res) )
if( !private_namespace::UTF8ToInt_FirstOctet(utf8[0], len, res) )
return 1;
for(i=1 ; i<len ; ++i)
@@ -176,7 +137,7 @@ size_t i, len;
if( utf8[i] == 0 )
return i;
if( !UTF8ToInt_AddNextOctet(utf8[i], res) )
if( !private_namespace::UTF8ToInt_AddNextOctet(utf8[i], res) )
return i;
}
@@ -235,7 +196,7 @@ unsigned char uz;
if( !utf8 )
return 0;
if( !UTF8ToInt_FirstOctet(uz, len, res) )
if( !private_namespace::UTF8ToInt_FirstOctet(uz, len, res) )
return 1;
for(i=1 ; i<len ; ++i)
@@ -245,7 +206,7 @@ unsigned char uz;
if( !utf8 )
return i;
if( !UTF8ToInt_AddNextOctet(uz, res) )
if( !private_namespace::UTF8ToInt_AddNextOctet(uz, res) )
return i;
}
@@ -485,268 +446,6 @@ return len;
/*!
    this function converts one wide character into UTF-8 stream

    input:
        z - wide character

    output:
        utf8 - a UTF-8 stream for the output sequence

    the character is first encoded into a small local buffer by the buffer
    version of IntToUTF8 and then each octet is sent to the stream

    the function returns how many characters have been written to the utf8 stream,
    zero means that 'z' is an incorrect unicode character
*/
size_t IntToUTF8(int z, std::ostream & utf8)
{
    char octets[10];
    const size_t written = IntToUTF8(z, octets, sizeof(octets));

    for(const char * octet = octets ; octet != octets + written ; ++octet)
        utf8 << *octet;

    return written;
}
/*
    an auxiliary function for converting from wide characters to UTF-8

    reads one character from wide_string into the int 'z'
    (when wchar_t is two bytes long a surrogate pair is joined into one value)

    returns how many wide characters were used
    if string_len is greater than 0 then the return value is always greater than zero too

    correct is set to false when:
        - string_len is zero (z is set to zero and nothing is used)
        - a lone low surrogate or a high surrogate at the very end of the string was read
          (one wide character is used)
        - a high surrogate is followed by something other than a low surrogate
          (both wide characters are used)
        - UTF8_CheckRange() does not accept the value
*/
static size_t WideToInt(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
    if( string_len == 0 )
    {
        z = 0;
        correct = false;
        return 0;
    }

    z = static_cast<int>(*wide_string);

    const bool two_byte_wchar = (sizeof(wchar_t) == 2);
    const bool is_surrogate   = (z >= 0xD800 && z <= 0xDFFF);

    if( !two_byte_wchar || !is_surrogate )
    {
        // an ordinary code unit, only the range has to be verified
        correct = UTF8_CheckRange(z);
        return 1;
    }

    const bool is_high_surrogate = (z <= 0xDBFF);

    if( !is_high_surrogate || string_len < 2 )
    {
        correct = false;
        return 1;
    }

    const int low = *(wide_string + 1);
    correct = (low >= 0xDC00 && low <= 0xDFFF);

    if( correct )
        z = 0x10000 + (((z & 0x3FF) << 10) | (low & 0x3FF));

    return 2;
}
/*
    an auxiliary function for converting from wide characters to UTF-8

    a version of WideToInt for a null terminated string: at most two wide
    characters are made available to the version which takes a length

    returns how many wide characters were used
    if wide_string has at least one character then the return value is always greater than zero too
    (for an empty string z is set to zero, correct to false and zero is returned)
*/
static size_t WideToInt(const wchar_t * wide_string, int & z, bool & correct)
{
    if( wide_string[0] == 0 )
    {
        z = 0;
        correct = false;
        return 0;
    }

    // the second character is offered only if it is not the terminating null
    const size_t available = (wide_string[1] != 0) ? 2 : 1;

    return WideToInt(wide_string, available, z, correct);
}
/*!
    an auxiliary function for converting from wide characters to UTF-8

    converts one character from wide_string and saves it in the utf8 buffer

    returns how many wide characters were used
    if string_len is greater than 0 then the return value is always greater than zero too

    utf8_written - how many characters were saved in the utf8 string (the string doesn't have
                   a null terminating character)
                   it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
    was_utf8_buf_too_small - will be true if the utf8 buffer is too small
                             if this flag is true then utf8_written is equal to zero
    was_error - will be true if there is an error when converting (there was an incorrect wide character)
                (was_error will not be true if the utf8 buffer is too small)
                this flag is never cleared here
    mode - if equal to 1 then U+FFFD "replacement character" is saved instead of an incorrect character,
           for other values nothing is saved
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
                            size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
{
    int code_point;
    bool is_valid;

    utf8_written = 0;
    was_utf8_buf_too_small = false;

    const size_t consumed = WideToInt(wide_string, string_len, code_point, is_valid);

    if( !is_valid )
    {
        was_error = true;

        if( mode != 1 )
            return consumed;

        code_point = 0xFFFD; // U+FFFD "replacement character"
    }

    utf8_written = IntToUTF8(code_point, utf8, utf8_len);
    was_utf8_buf_too_small = (utf8_written == 0);

    return consumed;
}
/*!
    an auxiliary function for converting from wide characters to UTF-8

    converts one character from wide_string and appends it to the utf8 string
    (the string is not cleared)

    returns how many wide characters were used
    if string_len is greater than 0 then the return value is always greater than zero too

    was_error is set to true if the character was incorrect or IntToUTF8() returned zero,
    in such a case U+FFFD "replacement character" is appended when mode is equal to 1
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
{
    int code_point;
    bool is_valid;

    const size_t consumed = WideToInt(wide_string, string_len, code_point, is_valid);
    const bool converted = is_valid && IntToUTF8(code_point, utf8, false) != 0;

    if( converted )
        return consumed;

    if( mode == 1 )
        IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"

    was_error = true;

    return consumed;
}
/*!
    an auxiliary function for converting from wide characters to UTF-8

    a version for a null terminated wide string: converts one character
    and appends it to the utf8 string (the string is not cleared)

    returns how many wide characters were used
    if wide_string has at least one character then the return value is always greater than zero too

    was_error is set to true if the character was incorrect or IntToUTF8() returned zero,
    in such a case U+FFFD "replacement character" is appended when mode is equal to 1
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
{
    int code_point;
    bool is_valid;

    const size_t consumed = WideToInt(wide_string, code_point, is_valid);

    if( is_valid && IntToUTF8(code_point, utf8, false) != 0 )
        return consumed;

    was_error = true;

    if( mode == 1 )
        IntToUTF8(0xFFFD, utf8, false); // U+FFFD "replacement character"

    return consumed;
}
/*!
    an auxiliary function for converting from wide characters to UTF-8

    converts one character from wide_string and sends it to the utf8 stream

    returns how many wide characters were used
    if string_len is greater than 0 then the return value is always greater than zero too

    was_error is set to true if the character was incorrect or IntToUTF8() returned zero,
    in such a case U+FFFD "replacement character" is sent to the stream when mode is equal to 1
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, bool & was_error, int mode)
{
    int code_point;
    bool is_valid;

    const size_t consumed = WideToInt(wide_string, string_len, code_point, is_valid);
    const bool converted = is_valid && IntToUTF8(code_point, utf8) != 0;

    if( converted )
        return consumed;

    if( mode == 1 )
        IntToUTF8(0xFFFD, utf8); // U+FFFD "replacement character"

    was_error = true;

    return consumed;
}
/*!
an auxiliary function for converting from wide characters to UTF-8
*/
static size_t WideOneToUTF8(const wchar_t * wide_string, std::ostream & utf8, bool & was_error, int mode)
{
size_t min_str_len = 1;
if( *wide_string == 0 )
return 0;
if( *(wide_string+1) != 0 )
min_str_len = 2;
return WideOneToUTF8(wide_string, min_str_len, utf8, was_error, mode);
}
/*!
this function converts a wide string into UTF-8 string
@@ -772,7 +471,7 @@ size_t chars;
while( string_len > 0 )
{
chars = WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, was_error, mode);
wide_string += chars;
string_len -= chars;
}
@@ -804,7 +503,7 @@ bool was_error = false;
utf8.clear();
while( *wide_string )
wide_string += WideOneToUTF8(wide_string, utf8, was_error, mode);
wide_string += private_namespace::WideOneToUTF8(wide_string, utf8, was_error, mode);
return !was_error;
}
@@ -832,83 +531,6 @@ bool WideToUTF8(const std::wstring & wide_string, std::string & utf8, bool clear
/*!
    this function converts a wide string into UTF-8 stream

    input:
        wide_string - a wide string for converting
        string_len  - size of the string
        mode        - what to do with errors when converting
                      0: skip an invalid character
                      1: put U+FFFD "replacement character" instead of the invalid character (default)

    output:
        utf8 - a UTF-8 stream for the output sequence

    this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, size_t string_len, std::ostream & utf8, int mode)
{
    bool had_error = false;

    // after each step the input is moved forward by the number of wide characters just used
    for(size_t used = 0 ; string_len > 0 ; wide_string += used, string_len -= used)
        used = WideOneToUTF8(wide_string, string_len, utf8, had_error, mode);

    return !had_error;
}
/*!
    this function converts a wide string into UTF-8 stream

    input:
        wide_string - a null terminated wide string for converting
        mode        - what to do with errors when converting
                      0: skip an invalid character
                      1: put U+FFFD "replacement character" instead of the invalid character (default)

    output:
        utf8 - a UTF-8 stream for the output sequence

    this function returns false if there were some errors when converting
*/
bool WideToUTF8(const wchar_t * wide_string, std::ostream & utf8, int mode)
{
    bool had_error = false;
    const wchar_t * cursor = wide_string;

    while( *cursor != 0 )
    {
        const size_t used = WideOneToUTF8(cursor, utf8, had_error, mode);
        cursor += used;
    }

    return !had_error;
}
/*!
this function converts a wide string (std::wstring) into UTF-8 stream
input:
wide_string - a wide string for converting
mode - what to do with errors when converting
0: skip an invalid character
1: put U+FFFD "replacement character" istead of the invalid character (default)
output:
utf8 - a UTF-8 stream for the output sequence
this function returns false if there were some errors when converting
*/
bool WideToUTF8(const std::wstring & wide_string, std::ostream & utf8, int mode)
{
return WideToUTF8(wide_string.c_str(), wide_string.size(), utf8, mode);
}
/*!
@@ -942,7 +564,7 @@ size_t chars, utf8_saved;
while( string_len > 0 )
{
chars = WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
chars = private_namespace::WideOneToUTF8(wide_string, string_len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
if( was_buffer_to_small )
{
@@ -1089,7 +711,7 @@ size_t len;
while( *wide_string )
{
len = (*(wide_string+1) == 0) ? 1 : 2;
chars = WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
chars = private_namespace::WideOneToUTF8(wide_string, len, utf8, utf8_len, utf8_saved, was_buffer_to_small, was_error, mode);
if( was_buffer_to_small )
{