- added some converting methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h)

- BaseParser: added possibility to read from TextStream and WTextStream
- HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method
- added utf8_stream.h with one method:
  template<typename StreamIteratorType>
  size_t utf8_to_int(
    StreamIteratorType & iterator_in,
    StreamIteratorType & iterator_end,
    int & res,
    bool & correct)
This commit is contained in:
Tomasz Sowa 2021-10-12 19:53:11 +02:00
parent 4902eb6037
commit 17d2c0fb25
13 changed files with 807 additions and 128 deletions

View File

@ -1,16 +1,24 @@
# DO NOT DELETE # DO NOT DELETE
./convert/inttostr.o: ./convert/inttostr.h ./convert/inttostr.o: ./convert/inttostr.h
./convert/misc.o: ./convert/misc.h ./convert/text.h ./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
./convert/misc.o: textstream/types.h utf8/utf8_stream.h
./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./convert/misc.o: textstream/types.h ./convert/inttostr.h
./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/text.o: ./convert/text.h ./convert/text_private.h
./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: ./convert/double.h textstream/textstream.h
./convert/double.o: textstream/stream.h space/space.h textstream/types.h ./convert/double.o: textstream/stream.h space/space.h textstream/types.h
./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./convert/double.o: membuffer/membuffer.h textstream/types.h ./convert/double.o: membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h ./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
./convert/baseparser.o: utf8/utf8_private.h ./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: utf8/utf8_stream.h
./date/date.o: ./date/date.h convert/inttostr.h ./date/date.o: ./date/date.h convert/inttostr.h
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@ -28,29 +36,39 @@
./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h
./space/space.o: textstream/stream.h space/space.h date/date.h ./space/space.o: textstream/stream.h space/space.h date/date.h
./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h ./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h ./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
./space/space.o: ./convert/double.h
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./space/spaceparser.o: textstream/textstream.h textstream/stream.h
./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8.o: utf8/utf8_private.h
./utf8/utf8_private.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h
./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./csv/csvparser.o: convert/baseparser.h ./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
./csv/csvparser.o: textstream/types.h
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h ./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./html/htmlparser.o: textstream/textstream.h textstream/stream.h
./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h ./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
./html/htmlparser.o: utf8/utf8_private.h convert/text.h ./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./html/htmlparser.o: textstream/types.h convert/text.h
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h ./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h

View File

@ -37,7 +37,7 @@
#include "baseparser.h" #include "baseparser.h"
#include "utf8/utf8.h" #include "utf8/utf8.h"
#include "utf8/utf8_stream.h"
namespace pt namespace pt
@ -45,19 +45,27 @@ namespace pt
BaseParser::BaseParser() BaseParser::BaseParser()
{ {
clear(); clear_input_flags();
} }
void BaseParser::clear() void BaseParser::clear_input_flags()
{ {
line = 0; line = 0;
reading_from_file = false; reading_from_file = false;
pchar_ascii = nullptr; pchar_ascii = nullptr;
pchar_unicode = nullptr; pchar_unicode = nullptr;
reading_from_wchar_string = false; wtext_stream_iterator = nullptr;
wtext_stream_iterator_end = nullptr;
text_stream_iterator = nullptr;
text_stream_iterator_end = nullptr;
lastc = -1; lastc = -1;
input_as_utf8 = true; input_as_utf8 = true;
if( file.is_open() )
file.close();
file.clear();
} }
@ -132,7 +140,6 @@ bool correct;
++line; ++line;
return lastc; return lastc;
} }
@ -150,6 +157,67 @@ return lastc;
} }
int BaseParser::read_char_from_wtext_stream()
{
if( (*wtext_stream_iterator) != (*wtext_stream_iterator_end) )
{
lastc = *(*wtext_stream_iterator);
++(*wtext_stream_iterator);
}
else
{
lastc = -1;
}
if( lastc == '\n' )
++line;
return lastc;
}
/*
 * Read the next UTF-8 encoded character from the TextStream iterator pair.
 *
 * utf8_to_int() is called repeatedly until it reports a correct character
 * or the iterator reaches the end iterator (incorrect sequences are skipped).
 * 'lastc' is set to the decoded character or to -1 if none was decoded.
 * The line counter is incremented when a new line character was read.
 */
int BaseParser::read_char_from_utf8_text_stream()
{
	int decoded;
	bool is_correct;

	lastc = -1;

	for(;;)
	{
		utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, decoded, is_correct);

		if( is_correct )
			break;

		// stop when there is nothing more to read
		if( !((*text_stream_iterator) != (*text_stream_iterator_end)) )
			break;
	}

	if( is_correct )
		lastc = decoded;

	if( lastc == '\n' )
		++line;

	return lastc;
}
int BaseParser::read_char_from_ascii_text_stream()
{
if( (*text_stream_iterator) != (*text_stream_iterator_end) )
{
lastc = *(*text_stream_iterator);
++(*text_stream_iterator);
}
else
{
lastc = -1;
}
if( lastc == '\n' )
++line;
return lastc;
}
int BaseParser::read_char_no_escape() int BaseParser::read_char_no_escape()
{ {
if( reading_from_file ) if( reading_from_file )
@ -161,17 +229,33 @@ int BaseParser::read_char_no_escape()
} }
else else
{ {
if( reading_from_wchar_string ) if( pchar_ascii )
{
return read_char_from_wchar_string();
}
else
{ {
if( input_as_utf8 ) if( input_as_utf8 )
return read_char_from_utf8_string(); return read_char_from_utf8_string();
else else
return read_char_from_ascii_string(); return read_char_from_ascii_string();
} }
else if( pchar_unicode )
{
return read_char_from_wchar_string();
}
else if( wtext_stream_iterator && wtext_stream_iterator_end )
{
return read_char_from_wtext_stream();
}
else if( text_stream_iterator && text_stream_iterator_end )
{
if( input_as_utf8 )
return read_char_from_utf8_text_stream();
else
return read_char_from_ascii_text_stream();
}
else
{
lastc = -1;
return lastc;
}
} }
} }

View File

@ -40,6 +40,7 @@
#include <string> #include <string>
#include <fstream> #include <fstream>
#include "textstream/textstream.h"
namespace pt namespace pt
@ -51,15 +52,18 @@ protected:
BaseParser(); BaseParser();
void clear(); virtual void clear_input_flags();
int read_utf8_char(); virtual int read_utf8_char();
int read_ascii_char(); virtual int read_ascii_char();
int read_char_from_wchar_string(); virtual int read_char_from_wchar_string();
int read_char_from_utf8_string(); virtual int read_char_from_utf8_string();
int read_char_from_ascii_string(); virtual int read_char_from_ascii_string();
int read_char_no_escape(); virtual int read_char_from_wtext_stream();
int read_char(); virtual int read_char_from_utf8_text_stream();
virtual int read_char_from_ascii_text_stream();
virtual int read_char_no_escape();
virtual int read_char();
@ -75,6 +79,7 @@ protected:
*/ */
bool reading_from_file; bool reading_from_file;
/* /*
pointers to the current character pointers to the current character
if ParseString() is in used if ParseString() is in used
@ -84,9 +89,20 @@ protected:
/* /*
true if ParseString(wchar_t *) or ParseString(std::wstring&) was called pointers to WTextStream iterators
*/ if set then both of them should be set
bool reading_from_wchar_string; */
WTextStream::const_iterator * wtext_stream_iterator;
WTextStream::const_iterator * wtext_stream_iterator_end;
/*
pointers to TextStream iterators
if set then both of them should be set
*/
TextStream::const_iterator * text_stream_iterator;
TextStream::const_iterator * text_stream_iterator_end;
/* /*
last read char last read char
@ -112,7 +128,6 @@ protected:
}; };
} }

View File

@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2017, Tomasz Sowa * Copyright (c) 2017-2021, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -36,6 +36,8 @@
*/ */
#include "misc.h" #include "misc.h"
#include "inttostr.h"
#include "utf8/utf8.h"
namespace pt namespace pt
@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val)
} }
/*
 * Write a single narrow character to 'out' escaped for use inside a JSON string.
 *
 * - characters with a value below 32 (this includes the null character, \r, \n, \t,
 *   0x08 and 0x0c) are written as "\u" followed by the value converted with Toa()
 *   (called with 16 as the base argument) and left-padded with '0' to four digits
 * - a backslash and a double quote are preceded by a backslash
 * - every other character is written unchanged
 *
 * The previous version had additional 'case' labels for 0, \r, \n, \t, 0x08 and 0x0c
 * in the second branch; they could never be reached because all of those values are
 * below 32 and are handled by the first branch. They were removed, the output is the same.
 */
void esc_to_json(char val, Stream & out)
{
	if( (unsigned char)val < 32 )
	{
		char buf[10];
		size_t len;

		Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len);

		out << "\\u";

		// pad with zeros up to four digits
		for(size_t i = len ; i < 4 ; ++i)
		{
			out << '0';
		}

		out << buf;
	}
	else
	{
		switch( val )
		{
		case '\\': out << '\\'; out << '\\'; break;
		case '"':  out << '\\'; out << '\"'; break;

		default:
			out << val;
		}
	}
}
/*
 * Write a single wide character to 'out' escaped for JSON.
 * The character is first converted with int_to_utf8() and then each
 * resulting byte is passed through the narrow character version.
 */
void esc_to_json(wchar_t val, Stream & out)
{
	char encoded[10];
	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);

	size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
	size_t index = 0;

	while( index < encoded_len )
	{
		esc_to_json(encoded[index], out);
		++index;
	}
}
/*
 * Write a null-terminated narrow string to 'out' escaped for JSON,
 * character by character (the terminating null is not written).
 */
void esc_to_json(const char * c, pt::Stream & out)
{
	const char * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_json(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the narrow buffer 'c' to 'out' escaped for JSON.
 */
void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_json(c[index], out);
		++index;
	}
}
/*
 * Write a null-terminated wide string to 'out' escaped for JSON,
 * character by character (the terminating null is not written).
 */
void esc_to_json(const wchar_t * c, pt::Stream & out)
{
	const wchar_t * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_json(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the wide buffer 'c' to 'out' escaped for JSON.
 */
void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_json(c[index], out);
		++index;
	}
}
void esc_to_json(const std::string & in, Stream & out)
{
esc_to_json(in.c_str(), in.size(), out);
}
void esc_to_json(const std::wstring & in, Stream & out)
{
esc_to_json(in.c_str(), in.size(), out);
}
/*
 * Write a single narrow character to 'out' escaped for XML.
 *
 * '<', '>', '&' and '"' are replaced with the &lt; &gt; &amp; and &quot; entities,
 * every other character (the null character too) is written unchanged.
 */
void esc_to_xml(char val, Stream & out)
{
	if( val == '<' )
	{
		out << "&lt;";
	}
	else
	if( val == '>' )
	{
		out << "&gt;";
	}
	else
	if( val == '&' )
	{
		out << "&amp;";
	}
	else
	if( val == '"' )
	{
		out << "&quot;";
	}
	else
	{
		// open question left by the author: should the zero (null) character be treated specially?
		out << val;
	}
}
/*
 * Write a single wide character to 'out' escaped for XML.
 * The character is first converted with int_to_utf8() and then each
 * resulting byte is passed through the narrow character version.
 */
void esc_to_xml(wchar_t val, Stream & out)
{
	char encoded[10];
	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);

	size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
	size_t index = 0;

	while( index < encoded_len )
	{
		esc_to_xml(encoded[index], out);
		++index;
	}
}
/*
 * Write a null-terminated narrow string to 'out' escaped for XML,
 * character by character (the terminating null is not written).
 */
void esc_to_xml(const char * c, pt::Stream & out)
{
	const char * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_xml(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the narrow buffer 'c' to 'out' escaped for XML.
 */
void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_xml(c[index], out);
		++index;
	}
}
/*
 * Write a null-terminated wide string to 'out' escaped for XML,
 * character by character (the terminating null is not written).
 */
void esc_to_xml(const wchar_t * c, pt::Stream & out)
{
	const wchar_t * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_xml(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the wide buffer 'c' to 'out' escaped for XML.
 */
void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_xml(c[index], out);
		++index;
	}
}
void esc_to_xml(const std::string & in, Stream & out)
{
esc_to_xml(in.c_str(), in.size(), out);
}
void esc_to_xml(const std::wstring & in, Stream & out)
{
esc_to_xml(in.c_str(), in.size(), out);
}
/*
 * Write a single narrow character to 'out' escaped for CSV.
 *
 * A double quote is written as two double quotes,
 * every other character (the null character too) is written unchanged.
 */
void esc_to_csv(char c, pt::Stream & out)
{
	if( c == '"' )
	{
		out << "\"\"";
	}
	else
	{
		// open question left by the author: should the zero (null) character be treated specially?
		out << c;
	}
}
/*
 * Write a single wide character to 'out' escaped for CSV.
 * The character is first converted with int_to_utf8() and then each
 * resulting byte is passed through the narrow character version.
 */
void esc_to_csv(wchar_t val, Stream & out)
{
	char encoded[10];
	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);

	size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
	size_t index = 0;

	while( index < encoded_len )
	{
		esc_to_csv(encoded[index], out);
		++index;
	}
}
/*
 * Write a null-terminated narrow string to 'out' escaped for CSV,
 * character by character (the terminating null is not written).
 */
void esc_to_csv(const char * c, pt::Stream & out)
{
	const char * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_csv(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the narrow buffer 'c' to 'out' escaped for CSV.
 */
void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_csv(c[index], out);
		++index;
	}
}
/*
 * Write a null-terminated wide string to 'out' escaped for CSV,
 * character by character (the terminating null is not written).
 */
void esc_to_csv(const wchar_t * c, pt::Stream & out)
{
	const wchar_t * cursor = c;

	while( *cursor != 0 )
	{
		esc_to_csv(*cursor, out);
		++cursor;
	}
}
/*
 * Write exactly 'len' characters from the wide buffer 'c' to 'out' escaped for CSV.
 */
void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
{
	size_t index = 0;

	while( index < len )
	{
		esc_to_csv(c[index], out);
		++index;
	}
}
void esc_to_csv(const std::string & in, Stream & out)
{
esc_to_csv(in.c_str(), in.size(), out);
}
void esc_to_csv(const std::wstring & in, Stream & out)
{
esc_to_csv(in.c_str(), in.size(), out);
}
} }

View File

@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2017, Tomasz Sowa * Copyright (c) 2017-2021, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -40,6 +40,9 @@
#include <limits> #include <limits>
#include "text.h" #include "text.h"
#include "textstream/stream.h"
#include "textstream/types.h"
#include "utf8/utf8_stream.h"
namespace pt namespace pt
@ -47,6 +50,138 @@ namespace pt
void SetOverflow(bool * was_overflow, bool val); void SetOverflow(bool * was_overflow, bool val);
/*
 * esc_to_json(): write the input to 'out' escaped for use inside a JSON string
 *
 * overloads: a single narrow/wide character, a null-terminated narrow/wide string,
 * a narrow/wide buffer with an explicit length, std::string and std::wstring
 */
void esc_to_json(char val, Stream & out);
void esc_to_json(wchar_t val, Stream & out);
void esc_to_json(const char * c, pt::Stream & out);
void esc_to_json(const char * c, std::size_t len, Stream & out);
void esc_to_json(const wchar_t * c, Stream & out);
void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_json(const std::string & in, Stream & out);
void esc_to_json(const std::wstring & in, Stream & out);
/*
 * esc_to_xml(): write the input to 'out' escaped for XML
 *
 * overloads: a single narrow/wide character, a null-terminated narrow/wide string,
 * a narrow/wide buffer with an explicit length, std::string and std::wstring
 */
void esc_to_xml(char c, pt::Stream & out);
void esc_to_xml(wchar_t c, pt::Stream & out);
void esc_to_xml(const char * c, pt::Stream & out);
void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
void esc_to_xml(const wchar_t * c, pt::Stream & out);
void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_xml(const std::string & in, Stream & out);
void esc_to_xml(const std::wstring & in, Stream & out);
/*
 * esc_to_csv(): write the input to 'out' escaped for CSV
 *
 * overloads: a single narrow/wide character, a null-terminated narrow/wide string,
 * a narrow/wide buffer with an explicit length, std::string and std::wstring
 *
 * The (const char *, std::size_t, Stream &) overload used to be declared twice and the
 * std::wstring overload (which is defined in misc.cpp) was not declared at all, so a call
 * with a std::wstring argument would have been matched against the stream template
 * version instead.
 */
void esc_to_csv(char c, pt::Stream & out);
void esc_to_csv(wchar_t val, Stream & out);
void esc_to_csv(const char * c, pt::Stream & out);
void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
void esc_to_csv(const wchar_t * c, pt::Stream & out);
void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_csv(const std::string & in, Stream & out);
void esc_to_csv(const std::wstring & in, Stream & out);
/*
 * Write all elements of the stream 'in' to 'out' escaped for use inside a JSON string.
 *
 * - wide input, narrow output: each element is converted with int_to_utf8()
 *   and the resulting bytes are escaped
 * - narrow input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently skipped
 * - otherwise: each element is cast to wchar_t and escaped
 *   (NOTE(review): for a narrow input and a narrow output this widens every single
 *   element and passes it to esc_to_json(wchar_t, ...) -- TODO confirm that this is
 *   intended for bytes >= 0x80)
 *
 * utf8_to_int() takes the iterator by reference and moves it past the elements it has
 * consumed (BaseParser::read_char_from_utf8_text_stream() depends on this too), so in
 * that branch the iterator must not be incremented once more. The previous version
 * incremented it in the 'for' statement in every branch, which skipped one element after
 * each decoded character and could move the iterator beyond 'end'.
 */
template<typename StreamType>
void esc_to_json(const StreamType & in, Stream & out)
{
	char utf8_buf[10];
	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

	typename StreamType::const_iterator i = in.begin();
	typename StreamType::const_iterator end = in.end();

	int res;
	bool correct;

	while( i != end )
	{
		if( in.is_wchar_stream() && out.is_char_stream() )
		{
			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
			esc_to_json(utf8_buf, len, out);
			++i;
		}
		else
		if( in.is_char_stream() && out.is_wchar_stream() )
		{
			// 'i' is advanced by utf8_to_int()
			utf8_to_int(i, end, res, correct);

			if( correct )
				esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

			// put replacement char if not correct?
		}
		else
		{
			esc_to_json(static_cast<wchar_t>(*i), out);
			++i;
		}
	}
}
/*
 * Write all elements of the stream 'in' to 'out' escaped for XML.
 *
 * - wide input, narrow output: each element is converted with int_to_utf8()
 *   and the resulting bytes are escaped
 * - narrow input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently skipped
 * - otherwise: each element is cast to wchar_t and escaped
 *   (NOTE(review): for a narrow input and a narrow output this widens every single
 *   element and passes it to esc_to_xml(wchar_t, ...) -- TODO confirm that this is
 *   intended for bytes >= 0x80)
 *
 * utf8_to_int() takes the iterator by reference and moves it past the elements it has
 * consumed (BaseParser::read_char_from_utf8_text_stream() depends on this too), so in
 * that branch the iterator must not be incremented once more. The previous version
 * incremented it in the 'for' statement in every branch, which skipped one element after
 * each decoded character and could move the iterator beyond 'end'.
 */
template<typename StreamType>
void esc_to_xml(const StreamType & in, Stream & out)
{
	char utf8_buf[10];
	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

	typename StreamType::const_iterator i = in.begin();
	typename StreamType::const_iterator end = in.end();

	int res;
	bool correct;

	while( i != end )
	{
		if( in.is_wchar_stream() && out.is_char_stream() )
		{
			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
			esc_to_xml(utf8_buf, len, out);
			++i;
		}
		else
		if( in.is_char_stream() && out.is_wchar_stream() )
		{
			// 'i' is advanced by utf8_to_int()
			utf8_to_int(i, end, res, correct);

			if( correct )
				esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

			// put replacement char if not correct?
		}
		else
		{
			esc_to_xml(static_cast<wchar_t>(*i), out);
			++i;
		}
	}
}
/*
 * Write all elements of the stream 'in' to 'out' escaped for CSV.
 *
 * - wide input, narrow output: each element is converted with int_to_utf8()
 *   and the resulting bytes are escaped
 * - narrow input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently skipped
 * - otherwise: each element is cast to wchar_t and escaped
 *   (NOTE(review): for a narrow input and a narrow output this widens every single
 *   element and passes it to esc_to_csv(wchar_t, ...) -- TODO confirm that this is
 *   intended for bytes >= 0x80)
 *
 * utf8_to_int() takes the iterator by reference and moves it past the elements it has
 * consumed (BaseParser::read_char_from_utf8_text_stream() depends on this too), so in
 * that branch the iterator must not be incremented once more. The previous version
 * incremented it in the 'for' statement in every branch, which skipped one element after
 * each decoded character and could move the iterator beyond 'end'.
 */
template<typename StreamType>
void esc_to_csv(const StreamType & in, Stream & out)
{
	char utf8_buf[10];
	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

	typename StreamType::const_iterator i = in.begin();
	typename StreamType::const_iterator end = in.end();

	int res;
	bool correct;

	while( i != end )
	{
		if( in.is_wchar_stream() && out.is_char_stream() )
		{
			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
			esc_to_csv(utf8_buf, len, out);
			++i;
		}
		else
		if( in.is_char_stream() && out.is_wchar_stream() )
		{
			// 'i' is advanced by utf8_to_int()
			utf8_to_int(i, end, res, correct);

			if( correct )
				esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

			// put replacement char if not correct?
		}
		else
		{
			esc_to_csv(static_cast<wchar_t>(*i), out);
			++i;
		}
	}
}
} }

View File

@ -53,6 +53,8 @@ CSVParser::CSVParser()
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space) CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{ {
clear_input_flags();
reading_from_file = true; reading_from_file = true;
space = &out_space; space = &out_space;
@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &
CSVParser::Status CSVParser::parse(const char * str, Space & out_space) CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = false;
pchar_ascii = str; pchar_ascii = str;
pchar_unicode = 0; space = &out_space;
space = &out_space;
parse(); parse();
@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space) CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = true;
pchar_unicode = str; pchar_unicode = str;
pchar_ascii = 0; space = &out_space;
space = &out_space;
parse(); parse();

View File

@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;
/*
 * Reset the parser before a new input is parsed.
 *
 * The input related state is reset by BaseParser::clear_input_flags() first,
 * then the HTMLParser defaults are set: html parsing, xml compact mode,
 * status 'ok', no output targets and an empty stack.
 */
void HTMLParser::clear_input_flags()
{
	BaseParser::clear_input_flags();

	// must be set after the base class call, which sets 'line' to zero
	line = 1;
	line_len = 0;
	stack_len = 0;
	status = ok;

	// default modes
	parsing_html = true;
	xml_compact_mode = true;

	// no output is selected yet
	out_string = nullptr;
	out_stream = nullptr;
	out_space = nullptr;
}
void HTMLParser::Item::Clear() void HTMLParser::Item::Clear()
{ {
name.clear(); name.clear();
@ -71,21 +89,11 @@ HTMLParser::Item::Item()
void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode) void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
{ {
parsing_html = true; clear_input_flags();
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
xml_compact_mode = compact_mode;
status = ok; pchar_unicode = in;
line = 1; xml_compact_mode = compact_mode;
out_space = &space;
stack_len = 0;
out_string = nullptr;
out_space = &space;
//last_new_line = false;
line_len = 0;
out_space->clear(); out_space->clear();
Init(); Init();
@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{ {
clear_input_flags();
parsing_html = false; parsing_html = false;
reading_from_file = true; reading_from_file = true;
xml_compact_mode = compact_mode; xml_compact_mode = compact_mode;
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
line_len = 0;
this->out_space = &out_space; this->out_space = &out_space;
if( clear_space ) if( clear_space )
@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp
void HTMLParser::Filter(const wchar_t * in, std::wstring & out) void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
{ {
parsing_html = true; clear_input_flags();
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
stack_len = 0; pchar_unicode = in;
out_string = &out; out_string = &out;
out_space = nullptr;
//last_new_line = false; if( clear_out_string )
line_len = 0; out_string->clear();
out_string->clear();
Init(); Init();
Read(); Read();
@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
} }
void HTMLParser::Filter(const std::wstring & in, std::wstring & out) void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
{ {
if( &in == &out ) if( &in == &out )
{ {
@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
if( out.capacity() < out_projected_len ) if( out.capacity() < out_projected_len )
out.reserve(out_projected_len); out.reserve(out_projected_len);
Filter(in.c_str(), out); filter(in.c_str(), out, clear_out_string);
} }
void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
{ {
parsing_html = true; clear_input_flags();
WTextStream::const_iterator begin = in.begin();
WTextStream::const_iterator end = in.end();
wtext_stream_iterator = &begin;
wtext_stream_iterator_end = &end;
out_stream = &out;
if( clear_out_stream )
out_stream->clear();
Init();
Read();
Uninit();
}
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
{
clear_input_flags();
reading_from_file = true; reading_from_file = true;
// open the file before clearing 'out' string, 'out' string can be the same string as the file_name // open the file before clearing 'out' string, 'out' string can be the same string as the file_name
file.clear(); file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in); file.open(file_name, std::ios_base::binary | std::ios_base::in);
status = ok; out_string = &out;
line = 1;
stack_len = 0; if( clear_out_stream )
out_string = &out; out_string->clear();
out_space = nullptr;
line_len = 0;
out_string->clear();
if( file ) if( file )
{ {
@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
} }
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out) HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
{ {
return filter_file(file_name.c_str(), out); return filter_file(file_name.c_str(), out, clear_out_stream);
} }
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out) HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
{ {
std::string file_name_utf8; std::string file_name_utf8;
pt::wide_to_utf8(file_name, file_name_utf8); pt::wide_to_utf8(file_name, file_name_utf8);
return filter_file(file_name_utf8, out); return filter_file(file_name_utf8, out, clear_out_stream);
} }
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out) HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
{ {
return filter_file(file_name.c_str(), out); return filter_file(file_name.c_str(), out, clear_out_stream);
} }
@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
if( out_string ) if( out_string )
(*out_string) += c; (*out_string) += c;
if( out_stream )
(*out_stream) << c;
CheckChar(c); CheckChar(c);
} }
@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
if( out_string ) if( out_string )
out_string->append(str, len); out_string->append(str, len);
if( out_stream )
out_stream->write(str, len);
for( ; str < end ; ++str) for( ; str < end ; ++str)
CheckChar(*str); CheckChar(*str);
} }
@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
if( out_string ) if( out_string )
out_string->append(str); out_string->append(str);
if( out_stream )
out_stream->write(str.c_str(), str.size());
for(size_t i=0 ; i < str.size() ; ++i) for(size_t i=0 ; i < str.size() ; ++i)
CheckChar(str[i]); CheckChar(str[i]);
} }
@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
{ {
if( out_string ) if( out_string )
(*out_string) += ' '; // we do not add them to 'line_len' (*out_string) += ' '; // we do not add them to 'line_len'
if( out_stream )
(*out_stream) << ' ';
} }
} }

View File

@ -44,6 +44,7 @@
#include <algorithm> #include <algorithm>
#include "convert/baseparser.h" #include "convert/baseparser.h"
#include "space/space.h" #include "space/space.h"
#include "textstream/stream.h"
namespace pt namespace pt
@ -130,14 +131,15 @@ public:
// main methods used for filtering // main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out); void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
void Filter(const std::wstring & in, std::wstring & out); void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const char * file_name, std::wstring & out); HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out); HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out); HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out); HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
/* /*
@ -278,7 +280,7 @@ protected:
void clear_input_flags();
/* /*
@ -403,6 +405,7 @@ protected:
size_t stack_len; // length of the stack size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing wchar_t * buffer; // buffer used when printing
std::wstring * out_string; std::wstring * out_string;
Stream * out_stream;
Space * out_space; Space * out_space;
Space text_space_tmp; Space text_space_tmp;

View File

@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line()
SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
{ {
clear_input_flags();
reading_from_file = true; reading_from_file = true;
parsing_space = false; parsing_space = false;
root_space = &out_space; root_space = &out_space;
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in); file.open(file_name, std::ios_base::binary | std::ios_base::in);
if( file ) if( file )
@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,
SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
{ {
clear_input_flags();
reading_from_file = true; reading_from_file = true;
parsing_space = true; parsing_space = true;
root_space = &out_space; root_space = &out_space;
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in); file.open(file_name, std::ios_base::binary | std::ios_base::in);
if( file ) if( file )
@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name
SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = false;
pchar_ascii = str; pchar_ascii = str;
pchar_unicode = 0;
parsing_space = false; parsing_space = false;
root_space = &out_space; root_space = &out_space;
@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out
SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = true;
pchar_unicode = str; pchar_unicode = str;
pchar_ascii = 0;
parsing_space = false; parsing_space = false;
root_space = &out_space; root_space = &out_space;
@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou
SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = false;
pchar_ascii = str; pchar_ascii = str;
pchar_unicode = 0;
parsing_space = true; parsing_space = true;
root_space = &out_space; root_space = &out_space;
@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou
SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space) SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
{ {
reading_from_file = false; clear_input_flags();
reading_from_wchar_string = true;
pchar_unicode = str; pchar_unicode = str;
pchar_ascii = 0;
parsing_space = true; parsing_space = true;
root_space = &out_space; root_space = &out_space;

View File

@ -45,6 +45,12 @@
namespace pt namespace pt
{ {
/*
* public methods are also defined in utf8_stream.h
*
*/
/*! /*!
UTF-8, a transformation format of ISO 10646 UTF-8, a transformation format of ISO 10646
http://tools.ietf.org/html/rfc3629 http://tools.ietf.org/html/rfc3629
@ -213,9 +219,7 @@ template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1); bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
template<typename StreamTypeIn, typename StreamTypeOut> template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
} // namespace } // namespace

104
src/utf8/utf8_stream.h Normal file
View File

@ -0,0 +1,104 @@
/*
* This file is a part of PikoTools
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef headerfile_picotools_utf8_utf8_stream
#define headerfile_picotools_utf8_utf8_stream
#include "textstream/textstream.h"
namespace pt
{
/*!
    this function converts one UTF-8 character into one wide-character

    input:
        iterator_in  - an iterator for reading octets from (e.g. a TextStream iterator),
                       it is moved forward past every octet which has been used
        iterator_end - an end iterator (can be returned by end() method from TextStream)

    output:
        res     - an output character (set to zero at the beginning)
        correct - true only if all octets of the sequence have been accepted
                  and utf8_check_range(res, len) is satisfied, false otherwise

    the function returns how many characters have been used from the input stream,
    iterator_in is always moved forward by exactly this number of positions:
      - zero if iterator_in is equal to iterator_end,
      - one if the first octet is rejected (the first octet is always consumed),
      - the number of octets read so far if the stream ends in the middle of a sequence
        or if a following octet is rejected

    an octet which is rejected as a following (continuation) octet is not consumed,
    it is left in the stream so the next call can examine it as a first octet
    of a next character
*/
template<typename StreamIteratorType>
size_t utf8_to_int(
    StreamIteratorType & iterator_in,
    StreamIteratorType & iterator_end,
    int & res,
    bool & correct)
{
    size_t len;
    unsigned char uz;

    res = 0;
    correct = false;

    if( iterator_in == iterator_end )
        return 0;

    uz = *iterator_in;
    ++iterator_in;

    if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
        return 1;

    for(size_t i=1 ; i<len ; ++i)
    {
        if( iterator_in == iterator_end )
            return i;

        // only peek at the octet here, the iterator is moved forward after the octet
        // has been accepted, thanks to that the returned value is equal to the number
        // of consumed octets and a rejected octet is not lost
        uz = *iterator_in;

        if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
            return i;

        ++iterator_in;
    }

    if( utf8_check_range(res, len) )
        correct = true;

    return len;
}
}
#endif

View File

@ -47,6 +47,7 @@ namespace pt
{ {
template<typename StreamType> template<typename StreamType>
void int_to_wide(int c, StreamType & res) void int_to_wide(int c, StreamType & res)
{ {
@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res)
/*! /*!
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
(need to be tested) (need to be tested)
@ -376,8 +378,11 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
// not tested // not tested
template<typename StreamTypeIn, typename StreamTypeOut> template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode) void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
{ {
if( clear )
utf8.clear();
private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
utf8.write(utf8_buffer, buffer_len); utf8.write(utf8_buffer, buffer_len);
}); });
@ -385,8 +390,6 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
} // namespace pt } // namespace pt
#endif #endif

View File

@ -9,12 +9,15 @@
./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./convert.o: ../src/convert/strtoint.h ../src/convert/text.h ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
./convert.o: ../src/convert/misc.h ../src/convert/double.h ./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
./convert.o: ../src/convert/double.h
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
./csvparser.o: ../src/convert/baseparser.h test.h ./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
./main.o: convert.h mainoptionsparser.h csvparser.h ./main.o: convert.h mainoptionsparser.h csvparser.h
./test.o: test.h ./test.o: test.h
./mainoptionsparser.o: mainoptionsparser.h test.h ./mainoptionsparser.o: mainoptionsparser.h test.h
@ -30,4 +33,5 @@
./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h ./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
./mainoptionsparser.o: ../src/convert/double.h