Browse Source

- added some converting methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h)

- BaseParser: added possibility to read from TextStream and WTextStream
- HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method
- added utf8_stream.h with one method:
  template<typename StreamIteratorType>
  size_t utf8_to_int(
    StreamIteratorType & iterator_in,
    StreamIteratorType & iterator_end,
    int & res,
    bool & correct)
htmlparserlistener
Tomasz Sowa 1 year ago
parent
commit
17d2c0fb25
  1. 44
      src/Makefile.dep
  2. 102
      src/convert/baseparser.cpp
  3. 39
      src/convert/baseparser.h
  4. 285
      src/convert/misc.cpp
  5. 137
      src/convert/misc.h
  6. 20
      src/csv/csvparser.cpp
  7. 130
      src/html/htmlparser.cpp
  8. 17
      src/html/htmlparser.h
  9. 26
      src/space/spaceparser.cpp
  10. 10
      src/utf8/utf8.h
  11. 104
      src/utf8/utf8_stream.h
  12. 9
      src/utf8/utf8_templates.h
  13. 10
      tests/Makefile.dep

44
src/Makefile.dep

@ -1,16 +1,24 @@
# DO NOT DELETE
./convert/inttostr.o: ./convert/inttostr.h
./convert/misc.o: ./convert/misc.h ./convert/text.h
./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
./convert/misc.o: textstream/types.h utf8/utf8_stream.h
./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./convert/misc.o: textstream/types.h ./convert/inttostr.h
./convert/text.o: ./convert/text.h ./convert/text_private.h
./convert/double.o: ./convert/double.h textstream/textstream.h
./convert/double.o: textstream/stream.h space/space.h textstream/types.h
./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./convert/double.o: membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
./convert/baseparser.o: utf8/utf8_private.h
./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
./convert/baseparser.o: utf8/utf8_stream.h
./date/date.o: ./date/date.h convert/inttostr.h
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@ -28,29 +36,39 @@
./space/space.o: convert/patternreplacer.h textstream/textstream.h
./space/space.o: textstream/stream.h space/space.h date/date.h
./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
./space/space.o: ./convert/double.h
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
./space/spaceparser.o: textstream/textstream.h textstream/stream.h
./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./utf8/utf8.o: utf8/utf8_private.h
./utf8/utf8_private.o: utf8/utf8_private.h
./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./csv/csvparser.o: convert/baseparser.h
./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
./csv/csvparser.o: textstream/types.h
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
./html/htmlparser.o: utf8/utf8_private.h convert/text.h
./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
./html/htmlparser.o: textstream/textstream.h textstream/stream.h
./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
./html/htmlparser.o: textstream/types.h convert/text.h
./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h

102
src/convert/baseparser.cpp

@ -37,7 +37,7 @@
#include "baseparser.h"
#include "utf8/utf8.h"
#include "utf8/utf8_stream.h"
namespace pt
@ -45,19 +45,27 @@ namespace pt
BaseParser::BaseParser()
{
clear();
clear_input_flags();
}
void BaseParser::clear()
void BaseParser::clear_input_flags()
{
line = 0;
reading_from_file = false;
pchar_ascii = nullptr;
pchar_unicode = nullptr;
reading_from_wchar_string = false;
wtext_stream_iterator = nullptr;
wtext_stream_iterator_end = nullptr;
text_stream_iterator = nullptr;
text_stream_iterator_end = nullptr;
lastc = -1;
input_as_utf8 = true;
if( file.is_open() )
file.close();
file.clear();
}
@ -132,7 +140,6 @@ bool correct;
++line;
return lastc;
}
@ -150,6 +157,67 @@ return lastc;
}
int BaseParser::read_char_from_wtext_stream()
{
if( (*wtext_stream_iterator) != (*wtext_stream_iterator_end) )
{
lastc = *(*wtext_stream_iterator);
++(*wtext_stream_iterator);
}
else
{
lastc = -1;
}
if( lastc == '\n' )
++line;
return lastc;
}
/*
 * Read the next UTF-8 encoded character from the TextStream iterator pair.
 *
 * utf8_to_int() is called repeatedly until it reports a correct character
 * or the current iterator reaches the end iterator, so incorrect sequences
 * are skipped. The decoded character is stored in 'lastc' and returned;
 * -1 is stored and returned when no correct character has been read.
 * 'line' is incremented when a new line character has been read.
 */
int BaseParser::read_char_from_utf8_text_stream()
{
    int decoded;
    bool is_correct;

    lastc = -1;

    for(;;)
    {
        utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, decoded, is_correct);

        if( is_correct )
            break;

        if( !((*text_stream_iterator) != (*text_stream_iterator_end)) )
            break;
    }

    if( is_correct )
    {
        lastc = decoded;

        if( lastc == '\n' )
            ++line;
    }

    return lastc;
}
/*
 * Read the next character from the TextStream iterator pair without any
 * UTF-8 decoding (one stream element is one character).
 *
 * The character is stored in 'lastc' and returned; -1 is stored and returned
 * when the current iterator is equal to the end iterator.
 * 'line' is incremented when a new line character has been read.
 */
int BaseParser::read_char_from_ascii_text_stream()
{
    if( (*text_stream_iterator) != (*text_stream_iterator_end) )
    {
        // Convert through unsigned char: where plain char is signed a byte above 127
        // would otherwise become a negative int, and the byte 0xFF would become -1
        // which is the end-of-input marker.
        // NOTE(review): assumes TextStream stores plain char elements - confirm.
        lastc = static_cast<unsigned char>(*(*text_stream_iterator));
        ++(*text_stream_iterator);
    }
    else
    {
        lastc = -1;
    }

    if( lastc == '\n' )
        ++line;

    return lastc;
}
int BaseParser::read_char_no_escape()
{
if( reading_from_file )
@ -161,16 +229,32 @@ int BaseParser::read_char_no_escape()
}
else
{
if( reading_from_wchar_string )
if( pchar_ascii )
{
if( input_as_utf8 )
return read_char_from_utf8_string();
else
return read_char_from_ascii_string();
}
else if( pchar_unicode )
{
return read_char_from_wchar_string();
}
else
else if( wtext_stream_iterator && wtext_stream_iterator_end )
{
return read_char_from_wtext_stream();
}
else if( text_stream_iterator && text_stream_iterator_end )
{
if( input_as_utf8 )
return read_char_from_utf8_string();
return read_char_from_utf8_text_stream();
else
return read_char_from_ascii_string();
return read_char_from_ascii_text_stream();
}
else
{
lastc = -1;
return lastc;
}
}
}

39
src/convert/baseparser.h

@ -40,6 +40,7 @@
#include <string>
#include <fstream>
#include "textstream/textstream.h"
namespace pt
@ -51,15 +52,18 @@ protected:
BaseParser();
void clear();
virtual void clear_input_flags();
int read_utf8_char();
int read_ascii_char();
int read_char_from_wchar_string();
int read_char_from_utf8_string();
int read_char_from_ascii_string();
int read_char_no_escape();
int read_char();
virtual int read_utf8_char();
virtual int read_ascii_char();
virtual int read_char_from_wchar_string();
virtual int read_char_from_utf8_string();
virtual int read_char_from_ascii_string();
virtual int read_char_from_wtext_stream();
virtual int read_char_from_utf8_text_stream();
virtual int read_char_from_ascii_text_stream();
virtual int read_char_no_escape();
virtual int read_char();
@ -75,6 +79,7 @@ protected:
*/
bool reading_from_file;
/*
pointers to the current character
if ParseString() is in use
@ -84,9 +89,20 @@ protected:
/*
true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
*/
bool reading_from_wchar_string;
pointers to WTextStream iterators
if set then both of them should be set
*/
WTextStream::const_iterator * wtext_stream_iterator;
WTextStream::const_iterator * wtext_stream_iterator_end;
/*
pointers to TextStream iterators
if set then both of them should be set
*/
TextStream::const_iterator * text_stream_iterator;
TextStream::const_iterator * text_stream_iterator_end;
/*
last read char
@ -112,7 +128,6 @@ protected:
};
}

285
src/convert/misc.cpp

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2017, Tomasz Sowa
* Copyright (c) 2017-2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -36,6 +36,8 @@
*/
#include "misc.h"
#include "inttostr.h"
#include "utf8/utf8.h"
namespace pt
@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val)
}
/*
 * Write one byte to 'out', escaped so that it can be put inside a JSON string.
 *
 * - every byte below 32 (this includes 0, \b, \t, \n, \f and \r) is written
 *   as \u followed by the digits produced by Toa() in base 16, left-padded
 *   with zeros to four characters
 * - a backslash and a double quote are preceded by a backslash
 * - any other byte is copied unchanged
 *
 * The short forms (\n, \t, ...) are intentionally not produced: they were never
 * reachable here because such bytes are handled by the first branch, and JSON
 * has no \0 escape at all.
 */
void esc_to_json(char val, Stream & out)
{
    if( static_cast<unsigned char>(val) < 32 )
    {
        char buf[10];
        size_t len;

        Toa(static_cast<unsigned char>(val), buf, sizeof(buf)/sizeof(char), 16, &len);

        out << "\\u";

        // pad with leading zeros, the \u escape needs exactly four digits
        for(size_t i = len ; i < 4 ; ++i)
        {
            out << '0';
        }

        out << buf;
    }
    else
    {
        switch( val )
        {
        case '\\':
            out << '\\';
            out << '\\';
            break;

        case '"':
            out << '\\';
            out << '\"';
            break;

        default:
            out << val;
        }
    }
}
/*
 * Escape one wide character for JSON: the character is encoded with
 * int_to_utf8() and every resulting byte goes through esc_to_json(char, Stream &).
 */
void esc_to_json(wchar_t val, Stream & out)
{
    char encoded[10];
    std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
    size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
    size_t pos = 0;

    while( pos < encoded_len )
    {
        esc_to_json(encoded[pos], out);
        ++pos;
    }
}
/*
 * Escape a zero-terminated char string for JSON, character by character.
 */
void esc_to_json(const char * c, pt::Stream & out)
{
    const char * current = c;

    while( *current != 0 )
    {
        esc_to_json(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' characters starting at 'c' for JSON
 * (zero characters inside the range are escaped as well).
 */
void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const char * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_json(*current, out);
    }
}
/*
 * Escape a zero-terminated wide string for JSON, character by character.
 */
void esc_to_json(const wchar_t * c, pt::Stream & out)
{
    const wchar_t * current = c;

    while( *current != 0 )
    {
        esc_to_json(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' wide characters starting at 'c' for JSON.
 */
void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const wchar_t * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_json(*current, out);
    }
}
void esc_to_json(const std::string & in, Stream & out)
{
esc_to_json(in.c_str(), in.size(), out);
}
void esc_to_json(const std::wstring & in, Stream & out)
{
esc_to_json(in.c_str(), in.size(), out);
}
/*
 * Write one character to 'out' escaped for XML: the characters < > & "
 * are replaced with their entities, any other character is copied unchanged.
 */
void esc_to_xml(char val, Stream & out)
{
    // what about zero (null) character?

    if( val == '<' )
    {
        out << "&lt;";
    }
    else
    if( val == '>' )
    {
        out << "&gt;";
    }
    else
    if( val == '&' )
    {
        out << "&amp;";
    }
    else
    if( val == '"' )
    {
        out << "&quot;";
    }
    else
    {
        out << val;
    }
}
/*
 * Escape one wide character for XML: the character is encoded with
 * int_to_utf8() and every resulting byte goes through esc_to_xml(char, Stream &).
 */
void esc_to_xml(wchar_t val, Stream & out)
{
    char encoded[10];
    std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
    size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
    size_t pos = 0;

    while( pos < encoded_len )
    {
        esc_to_xml(encoded[pos], out);
        ++pos;
    }
}
/*
 * Escape a zero-terminated char string for XML, character by character.
 */
void esc_to_xml(const char * c, pt::Stream & out)
{
    const char * current = c;

    while( *current != 0 )
    {
        esc_to_xml(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' characters starting at 'c' for XML.
 */
void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const char * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_xml(*current, out);
    }
}
/*
 * Escape a zero-terminated wide string for XML, character by character.
 */
void esc_to_xml(const wchar_t * c, pt::Stream & out)
{
    const wchar_t * current = c;

    while( *current != 0 )
    {
        esc_to_xml(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' wide characters starting at 'c' for XML.
 */
void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const wchar_t * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_xml(*current, out);
    }
}
void esc_to_xml(const std::string & in, Stream & out)
{
esc_to_xml(in.c_str(), in.size(), out);
}
void esc_to_xml(const std::wstring & in, Stream & out)
{
esc_to_xml(in.c_str(), in.size(), out);
}
/*
 * Write one character to 'out' escaped for a quoted CSV field:
 * a double quote is doubled, any other character is copied unchanged.
 */
void esc_to_csv(char c, pt::Stream & out)
{
    // what about zero (null) character?

    if( c == '"' )
    {
        out << "\"\"";
    }
    else
    {
        out << c;
    }
}
/*
 * Escape one wide character for CSV: the character is encoded with
 * int_to_utf8() and every resulting byte goes through esc_to_csv(char, Stream &).
 */
void esc_to_csv(wchar_t val, Stream & out)
{
    char encoded[10];
    std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
    size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
    size_t pos = 0;

    while( pos < encoded_len )
    {
        esc_to_csv(encoded[pos], out);
        ++pos;
    }
}
/*
 * Escape a zero-terminated char string for CSV, character by character.
 */
void esc_to_csv(const char * c, pt::Stream & out)
{
    const char * current = c;

    while( *current != 0 )
    {
        esc_to_csv(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' characters starting at 'c' for CSV.
 */
void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const char * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_csv(*current, out);
    }
}
/*
 * Escape a zero-terminated wide string for CSV, character by character.
 */
void esc_to_csv(const wchar_t * c, pt::Stream & out)
{
    const wchar_t * current = c;

    while( *current != 0 )
    {
        esc_to_csv(*current, out);
        ++current;
    }
}
/*
 * Escape exactly 'len' wide characters starting at 'c' for CSV.
 */
void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
{
    size_t remaining = len;

    for(const wchar_t * current = c ; remaining > 0 ; ++current, --remaining)
    {
        esc_to_csv(*current, out);
    }
}
void esc_to_csv(const std::string & in, Stream & out)
{
esc_to_csv(in.c_str(), in.size(), out);
}
void esc_to_csv(const std::wstring & in, Stream & out)
{
esc_to_csv(in.c_str(), in.size(), out);
}
}

137
src/convert/misc.h

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2017, Tomasz Sowa
* Copyright (c) 2017-2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -40,6 +40,9 @@
#include <limits>
#include "text.h"
#include "textstream/stream.h"
#include "textstream/types.h"
#include "utf8/utf8_stream.h"
namespace pt
@ -47,6 +50,138 @@ namespace pt
void SetOverflow(bool * was_overflow, bool val);
/*
 * Escaping for JSON strings.
 * Each overload writes the escaped form of its input to 'out'.
 */
void esc_to_json(char val, Stream & out);
void esc_to_json(wchar_t val, Stream & out);
void esc_to_json(const char * c, pt::Stream & out);
void esc_to_json(const char * c, std::size_t len, Stream & out);
void esc_to_json(const wchar_t * c, Stream & out);
void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_json(const std::string & in, Stream & out);
void esc_to_json(const std::wstring & in, Stream & out);

/*
 * Escaping for XML.
 */
void esc_to_xml(char c, pt::Stream & out);
void esc_to_xml(wchar_t c, pt::Stream & out);
void esc_to_xml(const char * c, pt::Stream & out);
void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
void esc_to_xml(const wchar_t * c, pt::Stream & out);
void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_xml(const std::string & in, Stream & out);
void esc_to_xml(const std::wstring & in, Stream & out);

/*
 * Escaping for CSV.
 * The std::wstring overload is declared here like its json/xml counterparts;
 * without the declaration a call with a std::wstring would select the stream template.
 */
void esc_to_csv(char c, pt::Stream & out);
void esc_to_csv(wchar_t val, Stream & out);
void esc_to_csv(const char * c, pt::Stream & out);
void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
void esc_to_csv(const wchar_t * c, pt::Stream & out);
void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
void esc_to_csv(const std::string & in, Stream & out);
void esc_to_csv(const std::wstring & in, Stream & out);
/*
 * Escape the content of a stream for JSON and write it to 'out'.
 *
 * - wide input, char output: every input character is encoded with int_to_utf8()
 *   and the resulting bytes are escaped
 * - char input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently dropped
 * - otherwise: every input element is escaped as a wide character
 *
 * The iterator is advanced explicitly in each branch: utf8_to_int() moves 'i'
 * by itself, so incrementing it once more in the loop header would skip
 * a character and could step past 'end'.
 */
template<typename StreamType>
void esc_to_json(const StreamType & in, Stream & out)
{
    char utf8_buf[10];
    std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

    typename StreamType::const_iterator i = in.begin();
    typename StreamType::const_iterator end = in.end();
    int res;
    bool correct;

    while( i != end )
    {
        if( in.is_wchar_stream() && out.is_char_stream() )
        {
            std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
            esc_to_json(utf8_buf, len, out);
            ++i;
        }
        else
        if( in.is_char_stream() && out.is_wchar_stream() )
        {
            // 'i' is advanced by utf8_to_int() (at least one element as i != end here)
            utf8_to_int(i, end, res, correct);

            if( correct )
                esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

            // put replacement char if not correct?
        }
        else
        {
            esc_to_json(static_cast<wchar_t>(*i), out);
            ++i;
        }
    }
}
/*
 * Escape the content of a stream for XML and write it to 'out'.
 *
 * - wide input, char output: every input character is encoded with int_to_utf8()
 *   and the resulting bytes are escaped
 * - char input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently dropped
 * - otherwise: every input element is escaped as a wide character
 *
 * The iterator is advanced explicitly in each branch: utf8_to_int() moves 'i'
 * by itself, so incrementing it once more in the loop header would skip
 * a character and could step past 'end'.
 */
template<typename StreamType>
void esc_to_xml(const StreamType & in, Stream & out)
{
    char utf8_buf[10];
    std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

    typename StreamType::const_iterator i = in.begin();
    typename StreamType::const_iterator end = in.end();
    int res;
    bool correct;

    while( i != end )
    {
        if( in.is_wchar_stream() && out.is_char_stream() )
        {
            std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
            esc_to_xml(utf8_buf, len, out);
            ++i;
        }
        else
        if( in.is_char_stream() && out.is_wchar_stream() )
        {
            // 'i' is advanced by utf8_to_int() (at least one element as i != end here)
            utf8_to_int(i, end, res, correct);

            if( correct )
                esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

            // put replacement char if not correct?
        }
        else
        {
            esc_to_xml(static_cast<wchar_t>(*i), out);
            ++i;
        }
    }
}
/*
 * Escape the content of a stream for CSV and write it to 'out'.
 *
 * - wide input, char output: every input character is encoded with int_to_utf8()
 *   and the resulting bytes are escaped
 * - char input, wide output: the input is decoded with utf8_to_int(),
 *   incorrect sequences are silently dropped
 * - otherwise: every input element is escaped as a wide character
 *
 * The iterator is advanced explicitly in each branch: utf8_to_int() moves 'i'
 * by itself, so incrementing it once more in the loop header would skip
 * a character and could step past 'end'.
 */
template<typename StreamType>
void esc_to_csv(const StreamType & in, Stream & out)
{
    char utf8_buf[10];
    std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);

    typename StreamType::const_iterator i = in.begin();
    typename StreamType::const_iterator end = in.end();
    int res;
    bool correct;

    while( i != end )
    {
        if( in.is_wchar_stream() && out.is_char_stream() )
        {
            std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
            esc_to_csv(utf8_buf, len, out);
            ++i;
        }
        else
        if( in.is_char_stream() && out.is_wchar_stream() )
        {
            // 'i' is advanced by utf8_to_int() (at least one element as i != end here)
            utf8_to_int(i, end, res, correct);

            if( correct )
                esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)

            // put replacement char if not correct?
        }
        else
        {
            esc_to_csv(static_cast<wchar_t>(*i), out);
            ++i;
        }
    }
}
}

20
src/csv/csvparser.cpp

@ -53,6 +53,8 @@ CSVParser::CSVParser()
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
clear_input_flags();
reading_from_file = true;
space = &out_space;
@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
reading_from_file = false;
reading_from_wchar_string = false;
pchar_ascii = str;
pchar_unicode = 0;
space = &out_space;
clear_input_flags();
pchar_ascii = str;
space = &out_space;
parse();
@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = str;
pchar_ascii = 0;
space = &out_space;
clear_input_flags();
pchar_unicode = str;
space = &out_space;
parse();

130
src/html/htmlparser.cpp

@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;
/*
 * Reset the input state of the base parser and then the state of this parser:
 * no output target is selected, html parsing and the xml compact mode are
 * turned on, the status is 'ok' and the line counter starts from one.
 */
void HTMLParser::clear_input_flags()
{
    BaseParser::clear_input_flags();

    // output targets, an entry method selects one of them afterwards
    out_space = nullptr;
    out_stream = nullptr;
    out_string = nullptr;

    // parsing modes
    xml_compact_mode = true;
    parsing_html = true;

    // parser state
    status = ok;
    line_len = 0;
    stack_len = 0;
    line = 1; // must stay after the base call which sets 'line' too
}
void HTMLParser::Item::Clear()
{
name.clear();
@ -71,21 +89,11 @@ HTMLParser::Item::Item()
void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
{
parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
xml_compact_mode = compact_mode;
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
out_space = &space;
//last_new_line = false;
line_len = 0;
clear_input_flags();
pchar_unicode = in;
xml_compact_mode = compact_mode;
out_space = &space;
out_space->clear();
Init();
@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
clear_input_flags();
parsing_html = false;
reading_from_file = true;
xml_compact_mode = compact_mode;
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
line_len = 0;
xml_compact_mode = compact_mode;
this->out_space = &out_space;
if( clear_space )
@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
{
parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
clear_input_flags();
stack_len = 0;
pchar_unicode = in;
out_string = &out;
out_space = nullptr;
//last_new_line = false;
line_len = 0;
out_string->clear();
if( clear_out_string )
out_string->clear();
Init();
Read();
@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
}
void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
{
if( &in == &out )
{
@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
if( out.capacity() < out_projected_len )
out.reserve(out_projected_len);
Filter(in.c_str(), out);
filter(in.c_str(), out, clear_out_string);
}
void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
{
clear_input_flags();
WTextStream::const_iterator begin = in.begin();
WTextStream::const_iterator end = in.end();
wtext_stream_iterator = &begin;
wtext_stream_iterator_end = &end;
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
out_stream = &out;
if( clear_out_stream )
out_stream->clear();
Init();
Read();
Uninit();
}
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
{
parsing_html = true;
clear_input_flags();
reading_from_file = true;
// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in);
status = ok;
line = 1;
stack_len = 0;
out_string = &out;
out_space = nullptr;
line_len = 0;
out_string->clear();
out_string = &out;
if( clear_out_stream )
out_string->clear();
if( file )
{
@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
}
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
{
return filter_file(file_name.c_str(), out);
return filter_file(file_name.c_str(), out, clear_out_stream);
}
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
{
std::string file_name_utf8;
pt::wide_to_utf8(file_name, file_name_utf8);
return filter_file(file_name_utf8, out);
return filter_file(file_name_utf8, out, clear_out_stream);
}
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
{
return filter_file(file_name.c_str(), out);
return filter_file(file_name.c_str(), out, clear_out_stream);
}
@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
if( out_string )
(*out_string) += c;
if( out_stream )
(*out_stream) << c;
CheckChar(c);
}
@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
if( out_string )
out_string->append(str, len);
if( out_stream )
out_stream->write(str, len);
for( ; str < end ; ++str)
CheckChar(*str);
}
@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
if( out_string )
out_string->append(str);
if( out_stream )
out_stream->write(str.c_str(), str.size());
for(size_t i=0 ; i < str.size() ; ++i)
CheckChar(str[i]);
}
@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
{
if( out_string )
(*out_string) += ' '; // we do not add them to 'line_len'
if( out_stream )
(*out_stream) << ' ';
}
}

17
src/html/htmlparser.h

@ -44,6 +44,7 @@
#include <algorithm>
#include "convert/baseparser.h"
#include "space/space.h"
#include "textstream/stream.h"
namespace pt
@ -130,14 +131,15 @@ public:
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
/*
@ -278,7 +280,7 @@ protected:
void clear_input_flags();
/*
@ -403,6 +405,7 @@ protected:
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
Stream * out_stream;
Space * out_space;
Space text_space_tmp;

26
src/space/spaceparser.cpp

@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line()
SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
{
clear_input_flags();
reading_from_file = true;
parsing_space = false;
root_space = &out_space;
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in);
if( file )
@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,
SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
{
clear_input_flags();
reading_from_file = true;
parsing_space = true;
root_space = &out_space;
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in);
if( file )
@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name
SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
{
reading_from_file = false;
reading_from_wchar_string = false;
clear_input_flags();
pchar_ascii = str;
pchar_unicode = 0;
parsing_space = false;
root_space = &out_space;
@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out
SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
{
reading_from_file = false;
reading_from_wchar_string = true;
clear_input_flags();
pchar_unicode = str;
pchar_ascii = 0;
parsing_space = false;
root_space = &out_space;
@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou
SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
{
reading_from_file = false;
reading_from_wchar_string = false;
clear_input_flags();
pchar_ascii = str;
pchar_unicode = 0;
parsing_space = true;
root_space = &out_space;
@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou
SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
{
reading_from_file = false;
reading_from_wchar_string = true;
clear_input_flags();
pchar_unicode = str;
pchar_ascii = 0;
parsing_space = true;
root_space = &out_space;

10
src/utf8/utf8.h

@ -45,6 +45,12 @@
namespace pt
{
/*
* public methods are also defined in utf8_stream.h
*
*/
/*!
UTF-8, a transformation format of ISO 10646
http://tools.ietf.org/html/rfc3629
@ -213,9 +219,7 @@ template<typename StreamType>
bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
template<typename StreamTypeIn, typename StreamTypeOut>
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used
void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
} // namespace

104
src/utf8/utf8_stream.h

@ -0,0 +1,104 @@
/*
* This file is a part of PikoTools
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef headerfile_picotools_utf8_utf8_stream
#define headerfile_picotools_utf8_utf8_stream
#include "textstream/textstream.h"
namespace pt
{
/*!
    this function converts one UTF-8 character into one wide-character

    input:
      iterator_in  - a TextStream iterator for reading from,
                     it is advanced past every octet which has been read
      iterator_end - an end iterator (can be returned by end() method from TextStream)

    output:
      res - an output character
      correct - true if it is a correct character

    the function returns how many characters have been used from the input stream
    (zero only if iterator_in was equal to iterator_end at the beginning)
*/
template<typename StreamIteratorType>
size_t utf8_to_int(
    StreamIteratorType & iterator_in,
    StreamIteratorType & iterator_end,
    int & res,
    bool & correct)
{
    size_t i, len;
    unsigned char uz;

    res = 0;
    correct = false;

    if( iterator_in == iterator_end )
        return 0;

    uz = *iterator_in;
    ++iterator_in;

    if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
        return 1;

    for(i=1 ; i<len ; ++i)
    {
        // the input ended in the middle of a sequence, 'i' octets have been read
        if( iterator_in == iterator_end )
            return i;

        uz = *iterator_in;
        ++iterator_in;

        // the rejected octet has already been taken from the stream
        // so it is counted in the returned value as well
        if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
            return i + 1;
    }

    if( utf8_check_range(res, len) )
        correct = true;

    return len;
}
}
#endif

9
src/utf8/utf8_templates.h

@ -47,6 +47,7 @@ namespace pt
{
template<typename StreamType>
void int_to_wide(int c, StreamType & res)
{
@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res)
/*!
converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
(need to be tested)