pikotools/src/utf8/utf8_templates.h

396 lines
8.5 KiB
C++

/*
* This file is a part of PikoTools
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef headerfile_picotools_utf8_utf8_templates
#define headerfile_picotools_utf8_utf8_templates
// this file is included at the end of utf8.h
#include "utf8_private.h"
namespace pt
{
/*!
    appending one unicode code point to a wide stream

    input:
      c   - a code point to append
    output:
      res - a stream which accepts wchar_t values through operator<<

    when wchar_t has two bytes and the code point is greater than 0xffff
    the value is written as a UTF-16 surrogate pair (two wchar_t units),
    in every other case the value is just cast to wchar_t and written once
*/
template<typename StreamType>
void int_to_wide(int c, StreamType & res)
{
    const bool needs_surrogate_pair = (sizeof(wchar_t) == 2 && c > 0xffff);

    if( !needs_surrogate_pair )
    {
        res << static_cast<wchar_t>(c);
        return;
    }

    // UTF16 surrogate pairs: the value is lowered by 0x10000 and split into two 10-bit halves
    const int offset = c - 0x10000;
    const wchar_t high_unit = static_cast<wchar_t>(((offset >> 10) & 0x3FF) + 0xD800);
    const wchar_t low_unit = static_cast<wchar_t>((offset & 0x3FF) + 0xDC00);

    res << high_unit;
    res << low_unit;
}
/*!
    converting a UTF-8 string (a pointer and a length) to a wide stream
    (need to be tested)

    input:
      utf8     - a pointer to the UTF-8 input
      utf8_len - the length of the input, passed on to the generic converter
      clear    - if true then res.clear() is called before converting
      mode     - passed unchanged to private_namespace::utf8_to_wide_generic()
    output:
      res      - a wide stream for the output sequence

    the callback given to the generic converter appends every value
    it receives to 'res' with int_to_wide()

    the function returns the status reported by private_namespace::utf8_to_wide_generic()
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, size_t utf8_len, StreamType & res, bool clear, int mode)
{
    if( clear )
    {
        res.clear();
    }

    auto append_code_point = [&res](int code_point) {
        int_to_wide(code_point, res);
    };

    return private_namespace::utf8_to_wide_generic(utf8, utf8_len, mode, append_code_point);
}
/*!
    converting a null terminated UTF-8 string to a wide stream

    the length of the input is counted up to (not including) the first zero byte
    and then the work is delegated to the (pointer, length) overload of utf8_to_wide(),
    'res', 'clear' and 'mode' are passed on unchanged
*/
template<typename StreamType>
bool utf8_to_wide(const char * utf8, StreamType & res, bool clear, int mode)
{
    const char * terminator = utf8;

    while( *terminator != 0 )
        ++terminator;

    const size_t utf8_len = static_cast<size_t>(terminator - utf8);

    return utf8_to_wide(utf8, utf8_len, res, clear, mode);
}
/*!
    converting a UTF-8 std::string to a wide stream

    the string's buffer and its size() are handed over to the
    (pointer, length) overload of utf8_to_wide(),
    'res', 'clear' and 'mode' are passed on unchanged
*/
template<typename StreamType>
bool utf8_to_wide(const std::string & utf8, StreamType & res, bool clear, int mode)
{
    const char * bytes = utf8.c_str();
    const size_t bytes_len = utf8.size();

    return utf8_to_wide(bytes, bytes_len, res, clear, mode);
}
/*!
    converting a UTF-8 input stream (std::istream) to a wide stream
    (need to be tested)

    input:
      utf8  - an input stream, characters are read one by one with utf8_to_int()
              for as long as it returns a value greater than zero
      clear - if true then res.clear() is called before converting
      mode  - what to do with errors when converting
              1: put U+FFFD "replacement character" instead of the invalid character
              any other value (e.g. 0): skip the invalid character
    output:
      res   - a wide stream for the output sequence

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool utf8_to_wide(std::istream & utf8, StreamType & res, bool clear, int mode)
{
    int z;
    bool correct;
    bool was_error = false;

    if( clear )
        res.clear();

    while( utf8_to_int(utf8, z, correct) > 0 )
    {
        if( correct )
        {
            int_to_wide(z, res);
            continue;
        }

        was_error = true;

        if( mode == 1 )
        {
            // U+FFFD "replacement character"
            // it is appended through int_to_wide() so that it reaches the stream as a wchar_t,
            // a bare 'res << 0xFFFD' passes an int, which stream inserters print as the number 65533
            int_to_wide(0xFFFD, res);
        }
    }

    return !was_error;
}
/*
    this function converts a UTF-8 stream into wide stream

    input:
      stream - a UTF-8 stream for converting, it is read with utf8_to_int()
               starting from index zero
      clear  - if true then res.clear() is called before converting
      mode   - what to do with errors when converting
               0: skip an invalid character (any value other than 1 behaves this way)
               1: put U+FFFD "replacement character" instead of the invalid character (default)
    output:
      res    - a wide stream for the output sequence

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool utf8_to_wide(const Stream & stream, StreamType & res, bool clear, int mode)
{
    size_t len;
    bool correct;
    int z;
    size_t index = 0;
    bool was_error = false;

    if( clear )
        res.clear();

    // CHECKME test me when sizeof(wchar_t) is 2
    do
    {
        // len tells how far to move the index, zero ends the loop
        len = utf8_to_int(stream, index, z, correct);

        if( len > 0 )
        {
            if( !correct )
            {
                if( mode == 1 )
                {
                    // U+FFFD "replacement character"
                    // it is appended through int_to_wide() so that it reaches the stream as a wchar_t,
                    // a bare 'res << 0xFFFD' passes an int, which stream inserters print as the number 65533
                    int_to_wide(0xFFFD, res);
                }

                was_error = true;
            }
            else
            {
                int_to_wide(z, res);
            }

            index += len;
        }
    }
    while( len > 0 );

    return !was_error;
}
/*!
    this function converts one wide character into UTF-8 stream

    input:
      z    - wide character
    output:
      utf8 - a UTF-8 stream for the output sequence, the bytes are passed to utf8.write()

    the character is first encoded into a small local buffer by the
    (int, char*, size_t) overload of int_to_utf8() and only then written to the stream

    the function returns how many characters have been written to the utf8 stream,
    zero means that 'z' is an incorrect unicode character (nothing is written then)
*/
template<typename StreamType>
size_t int_to_utf8(int z, StreamType & utf8)
{
    char encoded[10];
    const size_t encoded_len = int_to_utf8(z, encoded, sizeof(encoded));

    if( encoded_len == 0 )
        return 0;

    utf8.write(encoded, encoded_len);
    return encoded_len;
}
/*!
    this function converts a wide string into UTF-8 stream

    input:
      wide_string - a wide string for converting
      string_len  - size of the string
      mode        - what to do with errors when converting
                    0: skip an invalid character
                    1: put U+FFFD "replacement character" instead of the invalid character (default)
                    (the value is only passed on to private_namespace::wide_one_to_utf8())
    output:
      utf8        - a UTF-8 stream for the output sequence

    each step calls private_namespace::wide_one_to_utf8() and moves forward
    by the number of wide characters it returned, until string_len drops to zero

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, size_t string_len, StreamType & utf8, int mode)
{
    bool was_error = false;

    for(size_t consumed = 0 ; string_len > 0 ; wide_string += consumed, string_len -= consumed)
    {
        consumed = private_namespace::wide_one_to_utf8(wide_string, string_len, utf8, was_error, mode);
    }

    return !was_error;
}
/*!
    this function converts a wide string into UTF-8 stream

    input:
      wide_string - a null terminated wide string for converting
      mode        - what to do with errors when converting
                    0: skip an invalid character
                    1: put U+FFFD "replacement character" instead of the invalid character (default)
                    (the value is only passed on to private_namespace::wide_one_to_utf8())
    output:
      utf8        - a UTF-8 stream for the output sequence

    the pointer is advanced by the value returned from
    private_namespace::wide_one_to_utf8() until it points at the terminating zero

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const wchar_t * wide_string, StreamType & utf8, int mode)
{
    bool was_error = false;

    for( ; *wide_string != 0 ; )
    {
        wide_string += private_namespace::wide_one_to_utf8(wide_string, utf8, was_error, mode);
    }

    return !was_error;
}
/*!
    this function converts a wide string (std::wstring) into UTF-8 stream

    input:
      wide_string - a wide string for converting
      mode        - what to do with errors when converting
                    0: skip an invalid character
                    1: put U+FFFD "replacement character" instead of the invalid character (default)
    output:
      utf8        - a UTF-8 stream for the output sequence

    the string's buffer and its size() are handed over to the
    (pointer, length) overload of wide_to_utf8()

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_to_utf8(const std::wstring & wide_string, StreamType & utf8, int mode)
{
    const wchar_t * first_char = wide_string.c_str();
    const size_t char_count = wide_string.size();

    return wide_to_utf8(first_char, char_count, utf8, mode);
}
/*!
    this function converts a wide stream into a UTF-8 std::string

    input:
      buffer - a wide stream for converting, passed to private_namespace::wide_to_utf8_generic()
      clear  - if true then utf8.clear() is called before converting
      mode   - passed unchanged to private_namespace::wide_to_utf8_generic()
    output:
      utf8   - a string for the output sequence, every chunk of bytes
               delivered by the generic converter is appended to it
*/
template<typename StreamType>
void wide_stream_to_utf8(StreamType & buffer, std::string & utf8, bool clear, int mode)
{
    if( clear )
    {
        utf8.clear();
    }

    auto append_chunk = [&utf8](const char * chunk, std::size_t chunk_len) {
        utf8.append(chunk, chunk_len);
    };

    private_namespace::wide_to_utf8_generic(buffer, mode, append_chunk);
}
/*!
    this function converts a wide stream (Stream) into UTF-8 stream

    input:
      stream - a stream for converting, read with stream.get_wchar(index)
               for indexes from zero up to stream.size()
      clear  - if true then utf8.clear() is called before converting
      mode   - what to do with errors when converting
               1: put U+FFFD "replacement character" instead of the invalid character
               any other value: skip the invalid character
    output:
      utf8   - a UTF-8 stream for the output sequence (written with int_to_utf8())

    a character is treated as invalid when utf8_check_range() rejects it,
    when it is a first surrogate placed at the very end of the stream
    or when surrogate_pair_to_int() fails for it and the following character

    this function returns false if there were some errors when converting
*/
template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, int mode)
{
    if( clear )
        utf8.clear();

    bool was_error = false;
    size_t pos = 0;

    while( pos < stream.size() )
    {
        int code = static_cast<int>(stream.get_wchar(pos));
        bool valid = false;

        if( utf8_check_range(code) )
        {
            // CHECKME test me when sizeof(wchar_t) == 2
            // NOTE(review): this surrogate branch can be reached only if utf8_check_range()
            // accepts a first surrogate value -- confirm against its definition
            if( !is_first_surrogate_char(code) )
            {
                valid = true;
            }
            else
            if( pos + 1 < stream.size() )
            {
                wchar_t first_unit = static_cast<wchar_t>(code);

                // the second unit is consumed here, also when the pairing below fails
                pos += 1;
                wchar_t second_unit = stream.get_wchar(pos);

                if( surrogate_pair_to_int(first_unit, second_unit, code) )
                    valid = true;
            }
        }

        if( !valid )
        {
            was_error = true;

            if( mode == 1 )
                int_to_utf8(0xFFFD, utf8); // U+FFFD "replacement character"
        }
        else
        {
            int_to_utf8(code, utf8);
        }

        pos += 1;
    }

    return !was_error;
}
/*!
    this function converts a wide stream into a UTF-8 stream
    (not tested)

    input:
      buffer - a wide stream for converting, passed to private_namespace::wide_to_utf8_generic()
      mode   - passed unchanged to private_namespace::wide_to_utf8_generic()
    output:
      utf8   - a UTF-8 stream for the output sequence, every chunk of bytes
               delivered by the generic converter is passed to utf8.write()

    the output stream is not cleared by this function
*/
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
{
    auto write_chunk = [&utf8](const char * chunk, std::size_t chunk_len) {
        utf8.write(chunk, chunk_len);
    };

    private_namespace::wide_to_utf8_generic(buffer, mode, write_chunk);
}
} // namespace pt
#endif