pikotools/src/utf8/utf8.cpp

/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/* 
 * Copyright (c) 2010-2021, Tomasz Sowa
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *    
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *    
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <fstream>
#include "utf8.h"
#include "utf8_private.h"


namespace pt
{


/*!
	tells whether 'c' is a valid unicode code point

	a valid code point lies in the range 0 - 0x10FFFF
	and is not one of the UTF-16 surrogates (0xD800 - 0xDFFF)
*/
bool utf8_check_range(int c)
{
	if( c < 0 || c > 0x10FFFF )
		return false;

	const bool surrogate = (c >= 0xD800 && c <= 0xDFFF);

	return !surrogate;
}


/*!
	tells whether 'c' is a valid unicode code point for a sequence of the given length

	this variant is used when decoding an utf8 string:
	how_many_bytes - how many bytes of the utf8 string were read to build 'c'

	each length accepts only its own range of values, so a value stored
	in more bytes than necessary (an overlong form) is rejected,
	as are the surrogates 0xD800 - 0xDFFF and any length other than 1 - 4
*/
bool utf8_check_range(int c, int how_many_bytes)
{
	switch( how_many_bytes )
	{
	case 1:
		return c >= 0x0000 && c <= 0x007f;

	case 2:
		return c >= 0x0080 && c <= 0x07ff;

	case 3:
		// the three byte range has a hole for the surrogates
		return (c >= 0x0800 && c < 0xD800) || (c > 0xDFFF && c <= 0xffff);

	case 4:
		return c >= 0x10000 && c <= 0x10FFFF;

	default:
		return false;
	}
}


/*!
	returns true if 'c' is an UTF-16 surrogate (either half),
	it means a value from the range 0xD800 - 0xDFFF
*/
bool is_surrogate_char(int c)
{
	return !(c < 0xD800 || c > 0xDFFF);
}


/*!
	returns true if 'c' is the first (high) half of an UTF-16 surrogate pair,
	it means a value from the range 0xD800 - 0xDBFF
*/
bool is_first_surrogate_char(int c)
{
	if( c < 0xD800 )
		return false;

	return c <= 0xDBFF;
}


/*!
	returns true if 'c' is the second (low) half of an UTF-16 surrogate pair,
	it means a value from the range 0xDC00 - 0xDFFF
*/
bool is_second_surrogate_char(int c)
{
	if( c < 0xDC00 )
		return false;

	return c <= 0xDFFF;
}


/*!
	combines an UTF-16 surrogate pair into one code point

	input:
		c1 - the first surrogate (expected in the range 0xD800 - 0xDBFF)
		c2 - the second surrogate (expected in the range 0xDC00 - 0xDFFF)

	output:
		z - the combined code point,
		    or U+FFFD "replacement character" if c1 and c2 do not form a pair

	the function returns true if c1 and c2 form a correct pair
*/
bool surrogate_pair_to_int(int c1, int c2, int & z)
{
	const bool valid_pair = is_first_surrogate_char(c1) && is_second_surrogate_char(c2);

	if( !valid_pair )
	{
		z = 0xFFFD; // U+FFFD "replacement character"
		return false;
	}

	// each surrogate carries ten bits of the value counted from 0x10000
	const int high_bits = (c1 & 0x3FF) << 10;
	const int low_bits  = (c2 & 0x3FF);

	z = 0x10000 + (high_bits | low_bits);

	return true;
}


/*!
	this function decodes one UTF-8 sequence from the beginning of a buffer

	input:
		utf8 - a pointer to the input UTF-8 bytes
		utf8_len - how many bytes are available in the buffer
		           (a sequence longer than utf8_len cannot be decoded)

	output:
		res - the decoded character (it is set to zero before decoding starts)
		correct - true only if the whole sequence was read
		          and the value passed utf8_check_range()

	the function returns how many bytes of the input have been used:
		- zero only if utf8_len is zero
		- one if the first octet was rejected
		- utf8_len if the buffer ends before the sequence is complete
		- the index of the offending octet if one of the next octets was rejected
		- the length of the sequence in other cases (also when the range check fails)
*/
size_t utf8_to_int(const char * utf8, size_t utf8_len, int & res, bool & correct)
{
	res = 0;
	correct = false;

	if( utf8_len == 0 )
		return 0;

	size_t len;

	if( !private_namespace::utf8_to_int_first_octet(utf8[0], len, res) )
		return 1;

	if( len > utf8_len )
		return utf8_len;

	size_t index = 1;

	while( index < len )
	{
		if( !private_namespace::utf8_to_int_add_next_octet(utf8[index], res) )
			return index;

		index += 1;
	}

	correct = utf8_check_range(res, len);

	return len;
}


/*!
	this function decodes one UTF-8 sequence from the beginning of a null terminated string

	input:
		utf8 - an input UTF-8 string (null terminated)

	output:
		res - the decoded character (it is set to zero before decoding starts)
		correct - true only if the whole sequence was read
		          and the value passed utf8_check_range()

	the function returns how many bytes of the input have been used:
		- zero only if the string has '\0' at the first position
		- one if the first octet was rejected
		- the index of the offending position if the terminating '\0' was found
		  inside the sequence or one of the next octets was rejected
		- the length of the sequence in other cases (also when the range check fails)
*/
size_t utf8_to_int(const char * utf8, int & res, bool & correct)
{
	res = 0;
	correct = false;

	if( utf8[0] == 0 )
		return 0;

	size_t len;

	if( !private_namespace::utf8_to_int_first_octet(utf8[0], len, res) )
		return 1;

	for(size_t pos = 1 ; pos < len ; ++pos)
	{
		// the terminator is checked first so we never read past the end of the string
		const bool at_terminator = (utf8[pos] == 0);

		if( at_terminator || !private_namespace::utf8_to_int_add_next_octet(utf8[pos], res) )
			return pos;
	}

	correct = utf8_check_range(res, len);

	return len;
}


/*!
	this function converts one UTF-8 character into one wide-character

	input:
		utf8 - an input UTF-8 string

	output:
		res - an output character
		correct - true if it is a correct character

		the function returns how many characters have been used from the input string
		(returns zero only if utf8 is empty)
		even if there are errors the functions returns a different from zero value
*/
size_t utf8_to_int(const std::string & utf8, int & res, bool & correct)
{
	return utf8_to_int(utf8.c_str(), utf8.size(), res, correct);
}


/*!
	this function decodes one UTF-8 sequence read from an input stream

	input:
		utf8 - an input UTF-8 stream

	output:
		res - the decoded character (it is set to zero before decoding starts)
		correct - true only if the whole sequence was read
		          and the value passed utf8_check_range()

	the function returns how many bytes have been extracted from the stream:
		- zero if the first byte could not be read (e.g. end of the stream)
		- one if the first octet was rejected
		- the number of bytes extracted so far if the stream ended inside the sequence
		  or one of the next octets was rejected
		- the length of the sequence in other cases (also when the range check fails)

	an octet which is rejected as a continuation of the sequence is NOT extracted
	from the stream (it is only peeked), so the next call starts decoding from
	that octet -- the same way as the pointer based versions do -- and the
	returned value always matches the number of bytes really taken from the stream
*/
size_t utf8_to_int(std::istream & utf8, int & res, bool & correct)
{
size_t i, len;
unsigned char uz;

	res = 0;
	correct = false;

	uz = utf8.get();

	if( !utf8 )
		return 0;

	if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
		return 1;

	for(i=1 ; i<len ; ++i)
	{
		// look at the next octet without extracting it
		const std::istream::int_type next = utf8.peek();

		if( !utf8 || next == std::istream::traits_type::eof() )
		{
			// nothing more to read: this get() fails and leaves the stream
			// in the failed state so the following call returns zero
			utf8.get();
			return i;
		}

		uz = static_cast<unsigned char>(next);

		if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
			return i; // the rejected octet stays in the stream

		// the octet was accepted, now really extract it
		utf8.get();
	}

	if( utf8_check_range(res, len) )
		correct = true;

return len;
}


/*!
	this function decodes one UTF-8 sequence from a Stream object, starting at a given index

	(new function, need to be tested a little more especially when sizeof(wchar_t) is 2)

	input:
		utf8 - an input stream with UTF-8 bytes
		stream_index - the index of the first byte of the sequence

	output:
		res - the decoded character (it is set to zero before decoding starts)
		correct - true only if the whole sequence was read
		          and the value passed utf8_check_range()

	the function returns how many bytes have been used:
		- zero if stream_index is not lower than utf8.size()
		- one if the first octet was rejected
		- the number of bytes left in the stream if the sequence does not fit in it
		- the index (relative to stream_index) of the offending octet
		  if one of the next octets was rejected
		- the length of the sequence in other cases (also when the range check fails)
*/
size_t utf8_to_int(const Stream & utf8, size_t stream_index, int & res, bool & correct)
{
	res = 0;
	correct = false;

	if( stream_index >= utf8.size() )
		return 0;

	size_t len = 0;
	unsigned char octet = utf8.get_char(stream_index);

	if( !private_namespace::utf8_to_int_first_octet(octet, len, res) )
		return 1;

	// the whole sequence has to fit between stream_index and the end of the stream
	const bool sequence_fits = (stream_index + len < utf8.size() + 1);

	if( !sequence_fits )
		return utf8.size() - stream_index;

	for(size_t offset = 1 ; offset < len ; ++offset)
	{
		octet = utf8.get_char(stream_index + offset);

		if( !private_namespace::utf8_to_int_add_next_octet(octet, res) )
			return offset;
	}

	correct = utf8_check_range(res, len);

	return len;
}


/*
	appends the code point 'c' to the wide string 'res'

	if wchar_t has two bytes and 'c' is greater than 0xffff
	then 'c' is stored as an UTF-16 surrogate pair (two wide characters),
	in other cases 'c' is simply cast to wchar_t and stored as one wide character
*/
static void int_to_wide(int c, std::wstring & res)
{
	const bool needs_surrogate_pair = (sizeof(wchar_t) == 2 && c > 0xffff);

	if( !needs_surrogate_pair )
	{
		res += static_cast<wchar_t>(c);
		return;
	}

	// UTF16 surrogate pairs: twenty bits counted from 0x10000, ten bits in each half
	const int offset = c - 0x10000;
	const wchar_t first_half  = static_cast<wchar_t>(((offset >> 10) & 0x3FF) + 0xD800);
	const wchar_t second_half = static_cast<wchar_t>((offset & 0x3FF) + 0xDC00);

	res += first_half;
	res += second_half;
}


/*!
	this function converts an utf8 buffer into a wide string (std::wstring)

	input:
		utf8 - an input utf8 string
		utf8_len - size of the input string
		clear - if true then 'res' is cleared before converting
		mode - what to do with errors when converting
		       (passed to private_namespace::utf8_to_wide_generic)
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		res - an output wide string, each decoded character is appended with int_to_wide()

	the function returns the status reported by private_namespace::utf8_to_wide_generic
	(false if there were some errors when converting)
*/
bool utf8_to_wide(const char * utf8, size_t utf8_len, std::wstring & res, bool clear, int mode)
{
	if( clear )
		res.clear();

	// called by the generic converter for every character which should be stored
	auto append_code_point = [&res](int c) {
		int_to_wide(c, res);
	};

	return private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, append_code_point);
}


/*!
	this function converts an utf8 string into wide string (std::wstring)

	input:
		utf8 - an input utf8 null terminated string
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" istead of the invalid character (default)

	output:
		res - an output wide string

		the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const char * utf8, std::wstring & res, bool clear, int mode)
{
size_t utf8_len = 0;

	while( utf8[utf8_len] != 0 )
		utf8_len += 1;

return utf8_to_wide(utf8, utf8_len, res, clear, mode);
}


/*!
	this function converts an utf8 string into wide string (std::wstring)

	input:
		utf8 - an input utf8 string
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" istead of the invalid character (default)

	output:
		res - an output wide string

		the function returns false if there were some errors when converting
*/
bool utf8_to_wide(const std::string & utf8, std::wstring & res, bool clear, int mode)
{
	return utf8_to_wide(utf8.c_str(), utf8.size(), res, clear, mode);
}


/*!
	this function converts an utf8 stream into a wide string (std::wstring)

	input:
		utf8 - an input utf8 stream, it is read until utf8_to_int() returns zero
		clear - if true then 'res' is cleared before converting
		mode - what to do with errors when converting
			1: put U+FFFD "replacement character" instead of the invalid character
			any other value (e.g. 0): skip the invalid character

	output:
		res - an output wide string

	the function returns false if there were some errors when converting
*/
bool utf8_to_wide(std::istream & utf8, std::wstring & res, bool clear, int mode)
{
	if( clear )
		res.clear();

	bool all_correct = true;
	int code_point;
	bool correct;

	for(;;)
	{
		if( utf8_to_int(utf8, code_point, correct) == 0 )
			break;

		if( correct )
		{
			int_to_wide(code_point, res);
			continue;
		}

		all_correct = false;

		if( mode == 1 )
			res += static_cast<wchar_t>(0xFFFD); // U+FFFD "replacement character"
	}

	return all_correct;
}


/*!
	this function converts one unicode character into an UTF-8 sequence

	input:
		z - the character (code point) to encode
		utf8_max_len - the size of the output buffer

	output:
		utf8 - a buffer for the output sequence (it is not null terminated)

	the function returns how many bytes have been written to the utf8 buffer,
	zero means the buffer is too small or 'z' is an incorrect unicode character
	(in such a case nothing is written to the buffer)
*/
size_t int_to_utf8(int z, char * utf8, size_t utf8_max_len)
{
	if( utf8_max_len == 0 )
		return 0;

	if( !utf8_check_range(z) )
		return 0;

	if( z <= 0x7f )
	{
		// plain ASCII: exactly one byte
		utf8[0] = static_cast<char>(z);
		return 1;
	}

	char trailing[10];       // continuation octets, the least significant one first
	int trailing_count = 0;
	int lead_mask = 0x3f;    // bits of 'z' which fit into the leading octet

	for(;;)
	{
		trailing[trailing_count] = 0x80 | (z & 0x3f);
		++trailing_count;
		z >>= 6;
		lead_mask >>= 1; // each continuation octet takes one payload bit from the leading octet

		if( (z & ~lead_mask) == 0 )
			break;
	}

	const size_t total = static_cast<size_t>(trailing_count) + 1;

	if( total > utf8_max_len )
		return 0;

	// the leading octet: as many most significant ones as bytes in the sequence,
	// then a zero bit, then the rest of 'z'
	unsigned int lead = ~0u;
	lead <<= (7 - trailing_count);
	lead |= (z & lead_mask);

	utf8[0] = static_cast<char>(lead);

	// the continuation octets were collected in the reverse order
	size_t out = 1;

	while( trailing_count > 0 )
	{
		--trailing_count;
		utf8[out] = trailing[trailing_count];
		++out;
	}

	return out;
}


/*!
	this function converts one unicode character into UTF-8 and appends it to a string

	input:
		z - the character (code point) to encode
		clear - if true then the utf8 string is cleared before appending

	output:
		utf8 - a UTF-8 string, the encoded sequence is added at the end

	the function returns how many bytes have been added to the utf8 string,
	zero means that 'z' is an incorrect unicode character
*/
size_t int_to_utf8(int z, std::string & utf8, bool clear)
{
	if( clear )
		utf8.clear();

	char encoded[10];
	const size_t encoded_len = int_to_utf8(z, encoded, sizeof(encoded));

	utf8.append(encoded, encoded_len);

	return encoded_len;
}


/*!
	this function converts a wide string into a UTF-8 string

	input:
		wide_string - a wide string for converting
		string_len - the size of the wide string
		clear - if true then the utf8 string is cleared before converting
		mode - what to do with errors when converting
		       (passed to private_namespace::wide_one_to_utf8)
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a UTF-8 string for the output sequence

	the input is consumed by private_namespace::wide_one_to_utf8 which reports
	how many wide characters it has used in each step,
	this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool clear, int mode)
{
	if( clear )
		utf8.clear();

	bool was_error = false;

	while( string_len != 0 )
	{
		const size_t consumed = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);

		wide_string += consumed;
		string_len  -= consumed;
	}

	return !was_error;
}


/*!
	this function converts a null terminated wide string into a UTF-8 string

	input:
		wide_string - a null terminated wide string for converting
		clear - if true then the utf8 string is cleared before converting
		mode - what to do with errors when converting
		       (passed to private_namespace::wide_one_to_utf8)
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a UTF-8 string for the output sequence

	this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const wchar_t * wide_string, std::string & utf8, bool clear, int mode)
{
	if( clear )
		utf8.clear();

	bool was_error = false;

	// the helper returns how many wide characters it has used
	for(const wchar_t * cursor = wide_string ; *cursor != 0 ; )
		cursor += private_namespace::wide_one_to_utf8(cursor, utf8, was_error, mode);

	return !was_error;
}


/*!
	this function converts a wide string (std::wstring) into UTF-8 string

	input:
		wide_string - a wide string for converting
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" istead of the invalid character (default)

	output:
		utf8 - a UTF-8 string for the output sequence (the string is not cleared)

	this function returns false if there were some errors when converting
*/
bool wide_to_utf8(const std::wstring & wide_string, std::string & utf8, bool clear, int mode)
{
	return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, clear, mode);
}


/*!
	this function converts a wide string into a UTF-8 sequence stored in a char buffer

	input:
		wide_string - a wide string for converting
		string_len  - length of the wide string
		utf8_len - the size of the output buffer
		mode - what to do with errors when converting
		       (passed to private_namespace::wide_one_to_utf8)
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a buffer for the UTF-8 sequence
		utf8_written - how many bytes have been written to the buffer

	this function returns false if there were some errors when converting
	or the output buffer was too small, the output is not null terminated

	if there is an incorrect character in the wide string the function continues
	converting but if the buffer is too small the function stops immediately
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
	bool was_error = false;
	bool buffer_too_small;
	size_t bytes_saved;

	utf8_written = 0;

	while( string_len != 0 )
	{
		const size_t consumed = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, utf8_len, bytes_saved, buffer_too_small, was_error, mode);

		// a too small buffer is treated as an error and stops the conversion at once
		if( buffer_too_small )
			return false;

		wide_string  += consumed;
		string_len   -= consumed;
		utf8         += bytes_saved;
		utf8_len     -= bytes_saved;
		utf8_written += bytes_saved;
	}

	return !was_error;
}


/*!
	this function converts a wide string (std::wstring) into UTF-8 stream

	input:
		wide_string - a wide string for converting
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" istead of the invalid character (default)

	output:
		utf8 - a buffer for the UTF-8 stream
		utf8_len - the size of the buffer
		utf8_written - how many bytes have been written to the buffer

	this function returns false if there were some errors when converting or the output buffer was too small,
	the output string is not null terminated

	if there is an error when converting (there is an incorrect character in the wide string) the function
	will continue converting but if the buffer is too small the function breaks immediately
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
	return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, utf8_written, mode);
}


/*!
	this function converts a wide string into a null terminated UTF-8 string stored in a char buffer

	input:
		wide_string - a wide string for converting
		string_len  - length of the wide string
		utf8_len - the size of the output buffer (including the place for the terminating zero)
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a buffer for the UTF-8 string

	this function returns false if there were some errors when converting
	or the output buffer was too small

	if utf8_len is zero the function returns false and the buffer is not touched,
	in all other cases the output is null terminated (even if there were errors):
	the conversion gets only utf8_len - 1 bytes of the buffer
	so there is always a place for the terminating zero
*/
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len, int mode)
{
	if( utf8_len == 0 )
		return false;

	size_t written;
	const bool ok = wide_to_utf8(wide_string, string_len, utf8, utf8_len - 1, written, mode);

	utf8[written] = '\0';

	return ok;
}


/*!
	this function converts a wide string (std::wstring) into UTF-8 stream

	input:
		wide_string - a wide string for converting
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" istead of the invalid character (default)

	output:
		utf8 - a buffer for the UTF-8 stream
		utf8_len - the size of the buffer

	this function returns false if there were some errors when converting or the output buffer was too small,
	the output string is null terminated (even if there were errors during converting)

	if there is an error when converting (there is an incorrect character in the wide string) the function
	will continue converting but if the buffer is too small the function breaks immediately
	(in both cases the utf8 buffer is null terminated)
*/
bool wide_to_utf8(const std::wstring & wide_string, char * utf8, size_t utf8_len, int mode)
{
	return wide_to_utf8(wide_string.c_str(), wide_string.size(), utf8, utf8_len, mode);
}


/*!
	this function converts a null terminated wide string into a UTF-8 sequence stored in a char buffer

	input:
		wide_string - a null terminated wide string for converting
		utf8_len - the size of the output buffer
		mode - what to do with errors when converting
		       (passed to private_namespace::wide_one_to_utf8)
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a buffer for the UTF-8 sequence
		utf8_written - how many bytes have been written to the buffer

	this function returns false if there were some errors when converting
	or the output buffer was too small, the output is not null terminated

	if there is an incorrect character in the wide string the function continues
	converting but if the buffer is too small the function stops immediately
*/
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, size_t & utf8_written, int mode)
{
	bool was_error = false;
	bool buffer_too_small;
	size_t bytes_saved;

	utf8_written = 0;

	while( *wide_string )
	{
		// the helper gets at most two wide characters
		// (only one if the next character is the terminating zero)
		size_t available = 2;

		if( wide_string[1] == 0 )
			available = 1;

		const size_t consumed = private_namespace::wide_one_to_utf8(wide_string, available, utf8, utf8_len, bytes_saved, buffer_too_small, was_error, mode);

		// a too small buffer is treated as an error and stops the conversion at once
		if( buffer_too_small )
			return false;

		wide_string  += consumed;
		utf8         += bytes_saved;
		utf8_len     -= bytes_saved;
		utf8_written += bytes_saved;
	}

	return !was_error;
}


/*!
	this function converts a null terminated wide string into a null terminated UTF-8 string
	stored in a char buffer

	input:
		wide_string - a null terminated wide string for converting
		utf8_len - the size of the output buffer (including the place for the terminating zero)
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character

	output:
		utf8 - a buffer for the UTF-8 string

	this function returns false if there were some errors when converting
	or the output buffer was too small

	if utf8_len is zero the function returns false and the buffer is not touched,
	in all other cases the output is null terminated (even if there were errors):
	the conversion gets only utf8_len - 1 bytes of the buffer
	so there is always a place for the terminating zero
*/
bool wide_to_utf8(const wchar_t * wide_string, char * utf8, size_t utf8_len, int mode)
{
	if( utf8_len == 0 )
		return false;

	size_t written;
	const bool ok = wide_to_utf8(wide_string, utf8, utf8_len - 1, written, mode);

	utf8[written] = '\0';

	return ok;
}


} // namespace