fix: correctly escape json/xml/csv wide strings

Previously a wide string was first converted to utf-8 and then escaped for json/xml/csv,
which is incorrect. It should be escaped first and then converted to utf-8.

Add TextStreamBase<>::iterator and TextStreamBase<>::const_iterator as classes
with a method wchar_t get_unicode_and_advance(const iterator & end)
that returns one character either from a utf-8 stream or from a wide stream.

Let TextStreamBase<>::operator<<(wchar_t v) correctly use utf-8.
This commit is contained in:
2022-02-03 19:08:21 +01:00
parent fd1a8270cd
commit 6b97b1b74a
6 changed files with 466 additions and 184 deletions

View File

@@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2012-2021, Tomasz Sowa
* Copyright (c) 2012-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -46,6 +46,7 @@
#include "membuffer/membuffer.h"
#include "types.h"
#include "utf8/utf8.h"
#include "utf8/utf8_stream.h"
// for snprintf
#include <cstdio>
@@ -71,8 +72,67 @@ public:
typedef CharT char_type;
typedef MemBuffer<char_type, stack_size, heap_block_size> buffer_type;
typedef typename buffer_type::iterator iterator;
typedef typename buffer_type::const_iterator const_iterator;
// Mutable iterator over the characters of the stream.
// It is a thin wrapper around the iterator of the underlying buffer_type.
class iterator
{
public:
// the wrapped buffer iterator; it is public and is read directly
// by const_iterator when converting from an iterator
typename buffer_type::iterator membuffer_iterator;
iterator();
iterator(const iterator & i);
iterator & operator=(const iterator & i);
// construction/assignment directly from the buffer's own iterator
iterator(const typename buffer_type::iterator & i);
iterator & operator=(const typename buffer_type::iterator & i);
// two iterators are equal when the wrapped buffer iterators are equal
bool operator==(const iterator & i) const;
bool operator!=(const iterator & i) const;
iterator & operator++(); // prefix ++
iterator operator++(int); // postfix ++
iterator & operator--(); // prefix --
iterator operator--(int); // postfix --
// reference to the current item of the buffer
CharT & operator*();
// returns one unicode character and moves the iterator forward,
// 0 is returned if the iterator is equal to 'end'
wchar_t get_unicode_and_advance(const iterator & end);
};
// Read-only iterator over the characters of the stream.
// It is a thin wrapper around the const_iterator of the underlying buffer_type.
class const_iterator
{
public:
// the wrapped buffer const iterator
typename buffer_type::const_iterator membuffer_const_iterator;
const_iterator();
const_iterator(const const_iterator & i);
// conversion from the mutable iterator
const_iterator(const iterator & i);
const_iterator & operator=(const const_iterator & i);
const_iterator & operator=(const iterator & i);
// construction/assignment directly from the buffer's own iterators
const_iterator(const typename buffer_type::const_iterator & i);
const_iterator(const typename buffer_type::iterator & i);
const_iterator & operator=(const typename buffer_type::const_iterator & i);
const_iterator & operator=(const typename buffer_type::iterator & i);
// two iterators are equal when the wrapped buffer iterators are equal
bool operator==(const const_iterator & i) const;
bool operator!=(const const_iterator & i) const;
const_iterator & operator++(); // prefix ++
const_iterator operator++(int); // postfix ++
const_iterator & operator--(); // prefix --
const_iterator operator--(int); // postfix --
// the current item of the buffer, returned by value
CharT operator*();
// returns one unicode character and moves the iterator forward,
// 0 is returned if the iterator is equal to 'end'
wchar_t get_unicode_and_advance(const const_iterator & end);
};
bool is_char_stream() const;
@@ -112,7 +172,7 @@ public:
TextStreamBase & operator<<(char);
TextStreamBase & operator<<(unsigned char);
TextStreamBase & operator<<(wchar_t);
TextStreamBase & operator<<(wchar_t); // no surrogate pairs are used
TextStreamBase & operator<<(bool);
TextStreamBase & operator<<(short);
TextStreamBase & operator<<(int);
@@ -173,6 +233,272 @@ TextStreamBase<char_type, stack_size, heap_block_size>::TextStreamBase()
}
/*
 * Default constructor: the wrapped membuffer_iterator is default-initialized.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator() = default;
/*
 * Copy constructor: copies the wrapped buffer iterator from i.
 *
 * The member has to be initialized from i.membuffer_iterator (not from i itself):
 * the buffer iterator knows nothing about this wrapper class.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const iterator & i)
	: membuffer_iterator(i.membuffer_iterator)
{
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const iterator & i)
{
membuffer_iterator = i;
}
/*
 * Wraps an iterator of the underlying buffer.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const typename buffer_type::iterator & buffer_it)
	: membuffer_iterator(buffer_it)
{
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const typename buffer_type::iterator & i)
{
membuffer_iterator = i;
}
/*
 * Equality: true when both wrappers hold equal buffer iterators.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator==(const iterator & other) const
{
	return this->membuffer_iterator == other.membuffer_iterator;
}
/*
 * Inequality: true when the wrapped buffer iterators differ.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator!=(const iterator & other) const
{
	return this->membuffer_iterator != other.membuffer_iterator;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++()
{
++membuffer_iterator;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++(int)
{
const_iterator old(*this);
membuffer_iterator++;
return old;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--()
{
--membuffer_iterator;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::iterator
TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--(int)
{
const_iterator old(*this);
membuffer_iterator--;
return old;
}
/*
 * Dereference: gives a reference to the item
 * the wrapped buffer iterator currently points at.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
char_type & TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator*()
{
	return *(this->membuffer_iterator);
}
/*
 * Returns one unicode character read at the current position.
 *
 * - if the iterator is equal to 'end' nothing is read and 0 is returned
 * - for a stream of one-byte items the character is decoded with pt::utf8_to_int(),
 *   if the decoder reports an incorrect sequence U+FFFD is returned
 *   (NOTE(review): pt::utf8_to_int() is presumably the one that moves the iterator
 *   forward in this branch -- confirm against utf8/utf8_stream.h)
 * - for a wide stream the current item is returned and the iterator is moved by one
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
wchar_t TextStreamBase<char_type, stack_size, heap_block_size>::iterator::get_unicode_and_advance(const iterator & end)
{
	if( *this != end )
	{
		if constexpr (sizeof(char_type) == sizeof(char) )
		{
			// initialized so that they have defined values whatever the decoder does
			int res = 0;
			bool correct = false;

			// qualified in the same way as in const_iterator::get_unicode_and_advance()
			pt::utf8_to_int(*this, end, res, correct);

			if( correct )
				return static_cast<wchar_t>(res);
			else
				return static_cast<wchar_t>(0xFFFD); // U+FFFD "replacement character"
		}
		else
		{
			wchar_t c = operator*();
			++membuffer_iterator;

			return c;
		}
	}

	return 0;
}
/*
 * Default constructor: the wrapped membuffer_const_iterator is default-initialized.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator() = default;
/*
 * Copy constructor: copies the wrapped buffer const iterator.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const const_iterator & other)
	: membuffer_const_iterator(other.membuffer_const_iterator)
{
}
/*
 * Conversion from the mutable iterator:
 * the const buffer iterator is built from the mutable one held by 'other'.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const iterator & other)
	: membuffer_const_iterator(other.membuffer_iterator)
{
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const const_iterator & i)
{
membuffer_const_iterator = i.membuffer_const_iterator;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const iterator & i)
{
membuffer_const_iterator = i.membuffer_iterator;
return *this;
}
/*
 * Wraps a const iterator of the underlying buffer.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const typename buffer_type::const_iterator & buffer_it)
	: membuffer_const_iterator(buffer_it)
{
}
/*
 * Wraps a mutable iterator of the underlying buffer
 * (it is converted to the buffer's const iterator).
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const typename buffer_type::iterator & buffer_it)
	: membuffer_const_iterator(buffer_it)
{
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::const_iterator & i)
{
membuffer_const_iterator = i;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::iterator & i)
{
membuffer_const_iterator = i;
return *this;
}
/*
 * Equality: true when both wrappers hold equal buffer const iterators.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator==(const const_iterator & other) const
{
	return this->membuffer_const_iterator == other.membuffer_const_iterator;
}
/*
 * Inequality: true when the wrapped buffer const iterators differ.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator!=(const const_iterator & other) const
{
	return this->membuffer_const_iterator != other.membuffer_const_iterator;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++()
{
++membuffer_const_iterator;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++(int)
{
const_iterator old(*this);
membuffer_const_iterator++;
return old;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--()
{
--membuffer_const_iterator;
return *this;
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--(int)
{
const_iterator old(*this);
membuffer_const_iterator--;
return old;
}
/*
 * Dereference: gives (by value) the item
 * the wrapped buffer const iterator currently points at.
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
char_type TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator*()
{
	return *(this->membuffer_const_iterator);
}
/*
 * Returns one unicode character read at the current position.
 *
 * - if the iterator is equal to 'end' nothing is read and 0 is returned
 * - for a stream of one-byte items the character is decoded with pt::utf8_to_int(),
 *   if the decoder reports an incorrect sequence U+FFFD is returned
 *   (NOTE(review): pt::utf8_to_int() is presumably the one that moves the iterator
 *   forward in this branch -- confirm against utf8/utf8_stream.h)
 * - for a wide stream the current item is returned and the iterator is moved by one
 */
template<typename char_type, size_t stack_size, size_t heap_block_size>
wchar_t TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::get_unicode_and_advance(const const_iterator & end)
{
	const bool at_end = !(*this != end);

	if( at_end )
		return 0;

	if constexpr (sizeof(char_type) == sizeof(char) )
	{
		int code_point;
		bool is_valid;

		pt::utf8_to_int(*this, end, code_point, is_valid);

		// U+FFFD "replacement character" stands in for an incorrect sequence
		return static_cast<wchar_t>(is_valid ? code_point : 0xFFFD);
	}
	else
	{
		const wchar_t current = operator*();
		++membuffer_const_iterator;

		return current;
	}
}
template<typename char_type, size_t stack_size, size_t heap_block_size>
bool TextStreamBase<char_type, stack_size, heap_block_size>::is_char_stream() const
{
@@ -433,10 +759,14 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
/*
 * Appends a single char to the stream and returns the stream.
 *
 * - wide stream (items of the size of wchar_t): the value goes through unsigned char
 *   first, so a negative char is not sign-extended into a large wide value
 * - otherwise the char is appended as it is
 *
 * The character is appended exactly once: the unconditional append
 * from before this change must not stay next to the branches below.
 */
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(char v)
{
	if constexpr (sizeof(char_type) == sizeof(wchar_t) )
	{
		buffer.append(static_cast<char_type>(static_cast<unsigned char>(v)));
	}
	else
	{
		buffer.append(v);
	}

	return *this;
}
@@ -446,9 +776,6 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(unsigned char v)
{
// IMPROVEME
// if char_type == 1 then if v <= 127 then put that char but if v > 127 put replacement character
// if char_type > 1 then simply put that character
buffer.append(static_cast<char_type>(v));
return *this;
@@ -459,8 +786,14 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
/*
 * Appends a single wide character to the stream and returns the stream.
 *
 * - wide stream (items of the size of wchar_t): the character is appended as it is
 * - otherwise the value is handed to pt::int_to_utf8() which writes it to this stream
 *
 * v is handled on its own, surrogate pairs are not combined here.
 *
 * The character is written exactly once: the old truncating append
 * (static_cast<char_type>(v)) must not stay in front of the branches below.
 */
TextStreamBase<char_type, stack_size, heap_block_size> &
TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(wchar_t v)
{
	if constexpr (sizeof(char_type) == sizeof(wchar_t) )
	{
		buffer.append(v);
	}
	else
	{
		pt::int_to_utf8(static_cast<int>(v), *this);
	}

	return *this;
}