pikotools/src/utf8/utf8_templates.h

/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef headerfile_picotools_utf8_utf8_templates
#define headerfile_picotools_utf8_utf8_templates

// this file is included at the end of utf8.h

#include "utf8_private.h"


namespace pt
{


/*
	puts one unicode code point into a wide stream

	input:
		c - the code point to be written

	output:
		res - a stream accepting wchar_t through operator<<

	when wchar_t has two bytes and the code point is greater than 0xffff
	the value is written as two UTF-16 surrogates (first 0xD800 based, second 0xDC00 based),
	in all other cases the value is simply cast to wchar_t and written as one character
*/
template<typename StreamType>
void int_to_wide(int c, StreamType & res)
{
	const bool needs_surrogates = (sizeof(wchar_t) == 2) && (c > 0xffff);

	if( !needs_surrogates )
	{
		res << static_cast<wchar_t>(c);
		return;
	}

	// UTF-16: the value above the BMP is split into two 10-bit halves
	const int offset = c - 0x10000;
	const int first  = 0xD800 + ((offset >> 10) & 0x3FF);
	const int second = 0xDC00 + (offset & 0x3FF);

	res << static_cast<wchar_t>(first);
	res << static_cast<wchar_t>(second);
}


/*!
	this function converts a UTF-8 buffer into a wide stream
	(e.g. a TextStreamBase<wchar_t,...> stream)

	needs to be tested

	input:
		utf8     - pointer to the UTF-8 input
		utf8_len - length of the input, passed on together with the pointer
		clear    - if true then res.clear() is called before the conversion
		mode     - passed unchanged to private_namespace::utf8_to_wide_generic
		           (presumably 0: skip invalid input, 1: put U+FFFD - confirm in that helper)

	output:
		res - a wide stream, each code point reported by the helper is written with int_to_wide()

	this function returns the status returned by private_namespace::utf8_to_wide_generic
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
	if( clear )
	{
		res.clear();
	}

	// callback used by the generic decoder for every code point it produces
	auto put_code_point = [&res](int code_point) {
		int_to_wide(code_point, res);
	};

	return private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, put_code_point);
}


/*
	this function converts a null terminated UTF-8 string into a wide stream

	input:
		utf8  - a null terminated UTF-8 string (must not be a null pointer, it is dereferenced here)
		clear - passed to the (pointer, length) variant of utf8_to_wide
		mode  - passed to the (pointer, length) variant of utf8_to_wide

	output:
		res - a wide stream for the output sequence

	the length of the input is measured up to (not including) the terminating zero byte
	and the whole work is delegated to the (pointer, length) variant, its result is returned
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
{
	const char * terminator = utf8;

	// look for the terminating zero byte
	while( *terminator != 0 )
	{
		++terminator;
	}

	size_t byte_count = static_cast<size_t>(terminator - utf8);

	return utf8_to_wide(utf8, byte_count, res, clear, mode);
}


/*
	this function converts a UTF-8 std::string into a wide stream

	input:
		utf8  - a string with the UTF-8 input (its whole size() is used)
		clear - passed to the (pointer, length) variant of utf8_to_wide
		mode  - passed to the (pointer, length) variant of utf8_to_wide

	output:
		res - a wide stream for the output sequence

	the result of the (pointer, length) variant is returned
*/
template<typename StreamType>
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
{
	const char * bytes = utf8.c_str();
	size_t byte_count  = utf8.size();

	return utf8_to_wide(bytes, byte_count, res, clear, mode);
}


/*
	this function converts a UTF-8 input stream (std::istream) into a wide stream

	needs to be tested

	input:
		utf8  - an input stream, characters are read with utf8_to_int(utf8, z, correct)
		        as long as that call returns a value greater than zero
		clear - if true then res.clear() is called before the conversion
		mode  - what to do with errors when converting
			1: put U+FFFD "replacement character" instead of the invalid character
			any other value: skip the invalid character

	output:
		res - a wide stream for the output sequence

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
{
	int z;
	bool correct;
	bool was_error = false;

	if( clear )
		res.clear();

	while( utf8_to_int(utf8, z, correct) > 0 )
	{
		if( correct )
		{
			int_to_wide(z, res);
		}
		else
		{
			// the replacement character is written with int_to_wide() so that the stream
			// receives a wchar_t; 'res << 0xFFFD' would pass an int and e.g. a std::wostream
			// would then print the digits "65533" instead of the character U+FFFD
			if( mode == 1 )
				int_to_wide(0xFFFD, res); // U+FFFD "replacement character"

			was_error = true;
		}
	}

	return !was_error;
}


/*
this function converts a UTF-8 stream into wide stream

input:
	stream - a UTF-8 stream for converting, read with utf8_to_int(stream, index, z, correct)
	clear - if true then res.clear() is called before the conversion
	mode - what to do with errors when converting
		0: skip an invalid character
		1: put U+FFFD "replacement character" instead of the invalid character (default)

output:
	res - a wide stream for the output sequence

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode)
{
	bool correct;
	int z;
	size_t index = 0;
	bool was_error = false;

	if( clear )
		res.clear();

	// CHECKME test me when sizeof(wchar_t) is 2

	for(;;)
	{
		// len tells how far the index should be moved, zero ends the conversion
		const size_t len = utf8_to_int(stream, index, z, correct);

		if( len == 0 )
			break;

		if( correct )
		{
			int_to_wide(z, res);
		}
		else
		{
			// the replacement character is written with int_to_wide() so that the stream
			// receives a wchar_t; 'res << 0xFFFD' would pass an int and e.g. a std::wostream
			// would then print the digits "65533" instead of the character U+FFFD
			if( mode == 1 )
				int_to_wide(0xFFFD, res); // U+FFFD "replacement character"

			was_error = true;
		}

		index += len;
	}

	return !was_error;
}


/*!
	this function converts one unicode character into UTF-8 and writes it to a stream

	input:
		z - unicode character (code point)

	output:
		utf8 - a UTF-8 stream for the output sequence, the bytes are passed
		       to utf8.write(buffer, length)

	the character is first encoded into a local buffer by the buffer variant of int_to_utf8()

	the function returns how many characters have been written to the utf8 stream,
	zero means that nothing was written (according to the original documentation
	'z' is then an incorrect unicode character)
*/
template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8)
{
	char encoded[10];
	size_t encoded_len = int_to_utf8(z, encoded, sizeof(encoded) / sizeof(char));

	if( encoded_len == 0 )
	{
		return 0;
	}

	utf8.write(encoded, encoded_len);

	return encoded_len;
}


/*!
	this function converts a wide string into UTF-8 stream

	input:
		wide_string - a wide string for converting
		string_len - size of the string
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
		utf8 - a UTF-8 stream for the output sequence

	the input is processed piece by piece with private_namespace::wide_one_to_utf8(),
	the value returned by the helper tells how many wide characters should be skipped
	before the next call (the loop relies on it being greater than zero and not
	greater than the remaining length)

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
{
	bool had_error = false;
	const wchar_t * cursor = wide_string;
	size_t remaining = string_len;

	while( remaining != 0 )
	{
		size_t consumed = private_namespace::wide_one_to_utf8(cursor, remaining, utf8, had_error, mode);

		cursor    += consumed;
		remaining -= consumed;
	}

	return !had_error;
}


/*!
	this function converts a null terminated wide string into UTF-8 stream

	input:
		wide_string - a null terminated wide string for converting
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
		utf8 - a UTF-8 stream for the output sequence

	the input is processed with private_namespace::wide_one_to_utf8() until
	the terminating zero character is reached, the value returned by the helper
	is the distance to the next not yet converted character

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
{
	bool had_error = false;
	const wchar_t * cursor = wide_string;

	while( *cursor )
	{
		auto consumed = private_namespace::wide_one_to_utf8(cursor, utf8, had_error, mode);
		cursor += consumed;
	}

	return !had_error;
}


/*!
	this function converts a wide string (std::wstring) into UTF-8 stream

	input:
		wide_string - a wide string for converting (its whole size() is used)
		mode - what to do with errors when converting
			0: skip an invalid character
			1: put U+FFFD "replacement character" instead of the invalid character (default)

	output:
		utf8 - a UTF-8 stream for the output sequence

	the work is delegated to the (pointer, length) variant of wide_to_utf8()

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
	const wchar_t * chars = wide_string.c_str();
	size_t chars_len      = wide_string.size();

	return wide_to_utf8(chars, chars_len, utf8, mode);
}


/*
	this function converts a wide stream into a UTF-8 std::string

	input:
		buffer - a stream passed to private_namespace::wide_to_utf8_generic()
		         (presumably a stream of wide characters - confirm in that helper)
		clear  - if true then utf8.clear() is called before the conversion
		mode   - passed unchanged to private_namespace::wide_to_utf8_generic()

	output:
		utf8 - every chunk of bytes reported by the helper is appended to this string
*/
template<typename StreamType>
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
	if( clear )
	{
		utf8.clear();
	}

	// callback used by the generic converter for each produced chunk of UTF-8 bytes
	auto append_chunk = [&utf8](const char * chunk, std::size_t chunk_len){
		utf8.append(chunk, chunk_len);
	};

	private_namespace::wide_to_utf8_generic(buffer, mode, append_chunk);
}


/*
	this function converts a wide Stream into a UTF-8 stream

	input:
		stream - the source, characters are read with stream.get_wchar(index)
		         for indices from zero up to stream.size()
		clear  - if true then utf8.clear() is called before the conversion
		mode   - what to do with errors when converting
			1: put U+FFFD "replacement character" instead of the invalid character
			any other value: skip the invalid character

	output:
		utf8 - a UTF-8 stream for the output sequence (written with int_to_utf8())

	a character is treated as invalid when utf8_check_range() rejects it,
	when it is a first surrogate placed at the very end of the stream
	or when surrogate_pair_to_int() rejects the pair made with the next character

	NOTE(review): the character following a first surrogate is consumed even if
	the pair is rejected, so it is not converted on its own - confirm this is intended

	this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
{
	if( clear )
		utf8.clear();

	bool all_valid = true;
	size_t pos = 0;

	while( pos < stream.size() )
	{
		int code = static_cast<int>(stream.get_wchar(pos));
		bool valid = utf8_check_range(code);

		// CHECKME test me when sizeof(wchar_t) == 2
		if( valid && is_first_surrogate_char(code) )
		{
			// a first surrogate is valid only together with a matching second one
			valid = false;

			if( pos + 1 < stream.size() )
			{
				wchar_t first = static_cast<wchar_t>(code);
				pos += 1;
				wchar_t second = stream.get_wchar(pos);

				if( surrogate_pair_to_int(first, second, code) )
					valid = true;
			}
		}

		if( valid )
		{
			int_to_utf8(code, utf8);
		}
		else
		{
			all_valid = false;

			if( mode == 1 )
				int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
		}

		pos += 1;
	}

	return all_valid;
}


/*
	this function converts a wide stream into a UTF-8 stream

	not tested

	input:
		buffer - a stream passed to private_namespace::wide_to_utf8_generic()
		         (presumably a stream of wide characters - confirm in that helper)
		mode   - passed unchanged to private_namespace::wide_to_utf8_generic()

	output:
		utf8 - every chunk of bytes reported by the helper is passed
		       to utf8.write(chunk, length); the stream is not cleared here
*/
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
{
	// callback used by the generic converter for each produced chunk of UTF-8 bytes
	auto write_chunk = [&utf8](const char * chunk, std::size_t chunk_len){
		utf8.write(chunk, chunk_len);
	};

	private_namespace::wide_to_utf8_generic(buffer, mode, write_chunk);
}


} // namespace pt

#endif