pikotools/src/utf8/utf8_private.cpp

/*
 * This file is a part of PikoTools
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "utf8_private.h"


namespace pt
{

namespace private_namespace
{

/*!
	an auxiliary function for converting from UTF-8 string
	reading the first octet of a sequence

	uz  - the first octet
	len - set to the number of leading one-bits found in uz,
	      or to 1 when there were none and the function succeeds
	res - on success the bits of uz which lie below the leading one-bits
	      (res is not changed when the function returns false)

	returns false if uz has exactly one or more than four leading one-bits
*/
bool utf8_to_int_first_octet(unsigned char uz, size_t & len, int & res)
{
	len = 0;

	// consume the leading one-bits by shifting them out of the octet
	while( (uz & 0x80) != 0 )
	{
		uz <<= 1;
		++len;
	}

	if( len == 1 || len > 4 )
		return false;

	if( len == 0 )
	{
		// the highest bit was not set: the octet is taken as it is
		res = uz;
		len = 1;
	}
	else
	{
		// uz was shifted left len times, move the remaining bits back to the lowest positions
		res = uz >> len;
	}

	return true;
}


/*!
	an auxiliary function for converting from UTF-8 string
	adding a following octet to the value collected so far

	uz  - the next octet, its two highest bits have to be equal to binary 10
	res - on success shifted left by six bits and the six lowest bits of uz are put there
	      (res is not changed when the function returns false)

	returns false if the two highest bits of uz are different from binary 10
*/
bool utf8_to_int_add_next_octet(unsigned char uz, int & res)
{
	const bool is_following_octet = ((uz & 0xc0) == 0x80);

	if( is_following_octet )
		res = (res << 6) | (uz & 0x3F);

	return is_following_octet;
}


/*!
	an auxiliary function for converting from wide characters to UTF-8
	converting a wide character (or a pair of surrogate characters) into one int

	wide_string - pointer to the first wide character to read
	string_len  - how many wide characters are available in wide_string
	z           - the output value: zero if string_len is zero,
	              the combined value if a surrogate pair was read,
	              in other cases the value of the first wide character
	correct     - false if string_len is zero or if there is an unpaired surrogate character,
	              true for a surrogate pair, in other cases the result of utf8_check_range(z)

	returns how many wide characters were used
	if string_len is greater than 0 then the return value is always greater than zero too

	surrogate characters are only looked for if wchar_t has two bytes

	if a surrogate character has no matching second surrogate character then only
	this one wide character is used (the return value is 1), the following wide character
	is left untouched so that it can be read with the next call
	(it may be a valid character or a beginning of a valid surrogate pair)
*/
size_t wide_to_int(const wchar_t * wide_string, size_t string_len, int & z, bool & correct)
{
	if( string_len == 0 )
	{
		z = 0;
		correct = false;
		return 0;
	}

	z = static_cast<int>(*wide_string);

	if( sizeof(wchar_t) == 2 && is_surrogate_char(z) )
	{
		// pessimistic default, changed only when a complete pair is found
		correct = false;

		if( is_first_surrogate_char(z) && string_len > 1 )
		{
			int z2 = static_cast<int>(*(wide_string+1));

			if( is_second_surrogate_char(z2) )
			{
				z = 0x10000 + (((z & 0x3FF) << 10) | (z2 & 0x3FF));
				correct = true;
				return 2;
			}
		}

		// an unpaired surrogate: consume only this character,
		// the next one was not a part of the pair and must not be lost
		return 1;
	}

	correct = utf8_check_range(z);
	return 1;
}


/*!
	an auxiliary function for converting from wide characters to UTF-8
	converting a wide character into one int
	this version reads from a null terminated string

	wide_string - pointer to a null terminated string of wide characters
	z, correct  - output values filled in the same way as in the wide_to_int() version
	              taking a string length; if the string is empty then z is zero and correct is false

	returns how many wide characters were used
	if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_to_int(const wchar_t * wide_string, int & z, bool & correct)
{
	if( wide_string[0] == 0 )
	{
		z = 0;
		correct = false;
		return 0;
	}

	// only the first two characters are inspected: the length passed below is either 1 or 2,
	// there is no need to measure the whole string
	const size_t available_len = (wide_string[1] != 0) ? 2 : 1;

	return wide_to_int(wide_string, available_len, z, correct);
}


/*!
	an auxiliary function for converting from wide characters to UTF-8
	converting one character from wide_string and saving it in the utf8 buffer

	returns how many wide characters were used (the value returned by wide_to_int())
	if string_len is greater than 0 then the return value is always greater than zero too

	wide_string  - pointer to the first wide character to read
	string_len   - how many wide characters are available in wide_string
	utf8         - the output buffer
	utf8_len     - the size of the output buffer
	utf8_written - how many characters were saved in the utf8 string (the string doesn't have
	               a null terminating character), this is the value returned by int_to_utf8()
	               it can be equal to zero if the utf8 buffer is too small or there was an incorrect wide character read
	was_utf8_buf_too_small - will be true if int_to_utf8() was called and returned zero
	               if this flag is true then utf8_written is equal to zero
	               in other cases the flag is set to false
	was_error    - will be set to true if there is an error when converting (there was an incorrect wide character)
	               this function never sets was_error to false, the caller should initialize it
	mode         - if equal to 1 then U+FFFD "replacement character" is saved instead of an incorrect wide character
	               for other values nothing is saved for an incorrect wide character
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, char * utf8, size_t utf8_len,
							size_t & utf8_written, bool & was_utf8_buf_too_small, bool & was_error, int mode)
{
	int code_point;
	bool is_correct;

	utf8_written = 0;
	was_utf8_buf_too_small = false;

	const size_t wide_used = wide_to_int(wide_string, string_len, code_point, is_correct);

	// something is written either for a correct character or, in mode 1, as a replacement for an incorrect one
	if( is_correct || mode == 1 )
	{
		const int value_to_save = is_correct ? code_point : 0xFFFD; // U+FFFD "replacement character"

		utf8_written = int_to_utf8(value_to_save, utf8, utf8_len);
		was_utf8_buf_too_small = (utf8_written == 0);
	}

	if( !is_correct )
		was_error = true;

	return wide_used;
}


/*!
	an auxiliary function for converting from wide characters to UTF-8
	converting one character from wide_string and passing it to int_to_utf8() together with the utf8 string

	wide_string - pointer to the first wide character to read
	string_len  - how many wide characters are available in wide_string
	utf8        - the output string
	was_error   - will be set to true if the wide character was incorrect or int_to_utf8() returned zero
	              this function never sets was_error to false, the caller should initialize it
	mode        - if equal to 1 then U+FFFD "replacement character" is passed to int_to_utf8() when there was an error

	returns how many wide characters were used (the value returned by wide_to_int())
	if string_len is greater than 0 then the return value is always greater than zero too
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, size_t string_len, std::string & utf8, bool & was_error, int mode)
{
	int code_point;
	bool is_correct;

	const size_t wide_used = wide_to_int(wide_string, string_len, code_point, is_correct);

	// int_to_utf8() is called only for a correct character, a zero result is treated as an error too
	const bool was_saved = is_correct && int_to_utf8(code_point, utf8, false) != 0;

	if( was_saved )
		return wide_used;

	if( mode == 1 )
		int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"

	was_error = true;

	return wide_used;
}


/*!
	an auxiliary function for converting from wide characters to UTF-8
	converting one character from a null terminated wide_string
	and passing it to int_to_utf8() together with the utf8 string

	wide_string - pointer to a null terminated string of wide characters
	utf8        - the output string
	was_error   - will be set to true if the wide character was incorrect or int_to_utf8() returned zero
	              this function never sets was_error to false, the caller should initialize it
	mode        - if equal to 1 then U+FFFD "replacement character" is passed to int_to_utf8() when there was an error

	returns how many wide characters were used (the value returned by wide_to_int())
	if wide_string has at least one character then the return value is always greater than zero too
*/
size_t wide_one_to_utf8(const wchar_t * wide_string, std::string & utf8, bool & was_error, int mode)
{
	int code_point;
	bool is_correct;

	const size_t wide_used = wide_to_int(wide_string, code_point, is_correct);

	// int_to_utf8() is called only for a correct character, a zero result is treated as an error too
	const bool was_saved = is_correct && int_to_utf8(code_point, utf8, false) != 0;

	if( was_saved )
		return wide_used;

	if( mode == 1 )
		int_to_utf8(0xFFFD, utf8, false); // U+FFFD "replacement character"

	was_error = true;

	return wide_used;
}


} // namespace private_namespace

} // namespace pt