- added some conversion methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h)

- BaseParser: added the ability to read from TextStream and WTextStream
- HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method
- added utf8_stream.h with one function template:
  template<typename StreamIteratorType>
  size_t utf8_to_int(
    StreamIteratorType & iterator_in,
    StreamIteratorType & iterator_end,
    int & res,
    bool & correct)
This commit is contained in:
2021-10-12 19:53:11 +02:00
parent 4902eb6037
commit 17d2c0fb25
13 changed files with 807 additions and 128 deletions

View File

@@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;
/*
 * Resets the parser state to its defaults before a new parse/filter run.
 *
 * The base-class flags are reset first through BaseParser::clear_input_flags(),
 * then the HTMLParser specific members are set to their initial values.
 * Callers are expected to override the defaults they need afterwards
 * (e.g. parse_xml_file() sets parsing_html to false and xml_compact_mode
 * to the value of its compact_mode argument).
 */
void HTMLParser::clear_input_flags()
{
BaseParser::clear_input_flags();
// default mode is html parsing with the compact mode turned on
parsing_html = true;
xml_compact_mode = true;
// start with a clean status, at the first line and with an empty stack
status = ok;
line = 1;
stack_len = 0;
// no output target is selected yet, the caller sets the one it wants to use
out_string = nullptr;
out_stream = nullptr;
out_space = nullptr;
line_len = 0;
}
void HTMLParser::Item::Clear()
{
name.clear();
@@ -71,21 +89,11 @@ HTMLParser::Item::Item()
void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
{
parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
xml_compact_mode = compact_mode;
clear_input_flags();
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
out_space = &space;
//last_new_line = false;
line_len = 0;
pchar_unicode = in;
xml_compact_mode = compact_mode;
out_space = &space;
out_space->clear();
Init();
@@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
clear_input_flags();
parsing_html = false;
reading_from_file = true;
xml_compact_mode = compact_mode;
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
line_len = 0;
xml_compact_mode = compact_mode;
this->out_space = &out_space;
if( clear_space )
@@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
{
parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
pchar_ascii = 0;
clear_input_flags();
stack_len = 0;
pchar_unicode = in;
out_string = &out;
out_space = nullptr;
//last_new_line = false;
line_len = 0;
out_string->clear();
if( clear_out_string )
out_string->clear();
Init();
Read();
@@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
}
void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
{
if( &in == &out )
{
@@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
if( out.capacity() < out_projected_len )
out.reserve(out_projected_len);
Filter(in.c_str(), out);
filter(in.c_str(), out, clear_out_string);
}
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
/*
 * Filters the content of a wide text stream and writes the result to an output stream.
 *
 * in               - input stream, read from in.begin() up to in.end()
 * out              - output stream, registered as out_stream for the time of the run
 * clear_out_stream - if true then 'out' is cleared before the run
 */
void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
{
parsing_html = true;
clear_input_flags();
WTextStream::const_iterator begin = in.begin();
WTextStream::const_iterator end = in.end();
// the input is handed over as pointers to the two iterators above
// (presumably the base parser reads through them -- TODO confirm)
// NOTE(review): both pointers refer to local variables, so they dangle as soon as
// this method returns -- confirm they are reset (e.g. in Uninit() or in the next
// clear_input_flags() call) before anything reads them again
wtext_stream_iterator = &begin;
wtext_stream_iterator_end = &end;
out_stream = &out;
if( clear_out_stream )
out_stream->clear();
Init();
Read();
Uninit();
}
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
{
clear_input_flags();
reading_from_file = true;
// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
file.clear();
file.open(file_name, std::ios_base::binary | std::ios_base::in);
status = ok;
line = 1;
stack_len = 0;
out_string = &out;
out_space = nullptr;
line_len = 0;
out_string->clear();
out_string = &out;
if( clear_out_stream )
out_string->clear();
if( file )
{
@@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
}
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
{
return filter_file(file_name.c_str(), out);
return filter_file(file_name.c_str(), out, clear_out_stream);
}
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
{
std::string file_name_utf8;
pt::wide_to_utf8(file_name, file_name_utf8);
return filter_file(file_name_utf8, out);
return filter_file(file_name_utf8, out, clear_out_stream);
}
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
{
return filter_file(file_name.c_str(), out);
return filter_file(file_name.c_str(), out, clear_out_stream);
}
@@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
if( out_string )
(*out_string) += c;
if( out_stream )
(*out_stream) << c;
CheckChar(c);
}
@@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
if( out_string )
out_string->append(str, len);
if( out_stream )
out_stream->write(str, len);
for( ; str < end ; ++str)
CheckChar(*str);
}
@@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
if( out_string )
out_string->append(str);
if( out_stream )
out_stream->write(str.c_str(), str.size());
for(size_t i=0 ; i < str.size() ; ++i)
CheckChar(str[i]);
}
@@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
{
if( out_string )
(*out_string) += ' '; // we do not add them to 'line_len'
if( out_stream )
(*out_stream) << ' ';
}
}

View File

@@ -44,6 +44,7 @@
#include <algorithm>
#include "convert/baseparser.h"
#include "space/space.h"
#include "textstream/stream.h"
namespace pt
@@ -130,14 +131,15 @@ public:
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
/*
@@ -278,7 +280,7 @@ protected:
void clear_input_flags();
/*
@@ -403,6 +405,7 @@ protected:
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
Stream * out_stream;
Space * out_space;
Space text_space_tmp;