added: HTMLParser::ItemParsedListener - a listener interface whose item_parsed(...) method is called each time the parser has parsed a tag

This commit is contained in:
Tomasz Sowa 2021-11-30 16:27:27 +01:00
parent bb9205a55e
commit 2dadfc0809
2 changed files with 132 additions and 45 deletions

View File

@ -36,7 +36,6 @@
*/
#include "htmlparser.h"
#include "convert/text.h"
@ -102,6 +101,13 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
}
/*
 * registers the listener which is consulted by RemoveIfNeeded() for parsed items
 *
 * only the pointer is stored here; passing nullptr switches the notification off
 * (RemoveIfNeeded() tests the pointer before calling item_parsed())
 */
void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener)
{
    this->item_parsed_listener = listener;
}
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
clear_input_flags();
@ -296,6 +302,7 @@ void HTMLParser::SetSomeDefaults()
skip_commentaries = false;
skip_entities = false;
analyze_entities = false;
item_parsed_listener = nullptr;
}
@ -1941,6 +1948,12 @@ void HTMLParser::CheckClosingTags()
return;
}
// CHECK ME
if( RemoveIfNeeded(stack_len - 2) )
{
RemoveLastSpace(i);
}
for(int z=(int)stack_len-2 ; z >= i ; --z)
{
CheckWhiteCharsExceptions(pstack[z]);
@ -2047,6 +2060,36 @@ void HTMLParser::AddItemToSpace()
}
/*
 * removes the last entry of the parent's "childs" table, provided that this entry
 * is the space pointed to by pstack[stack_len-2], and then clears that stack pointer
 *
 * index - stack index of the item whose parent should be used:
 *         for index > 0 the parent is pstack[index - 1].space,
 *         for index == 0 the parent is out_space
 *
 * nothing is done when there is no output space, when the parent is missing,
 * when the parent has no "childs" table or when the stack has fewer than two items
 *
 * NOTE(review): the callers pass different indices (CheckClosingTags() passes the index
 * of the matching tag, ReadLoop() passes stack_len - 1) but the comparison below always
 * uses pstack[stack_len-2] - confirm that this is intended
 */
void HTMLParser::RemoveLastSpace(size_t index)
{
    // pstack[stack_len-2] is read below, with fewer than two items stack_len-2 would wrap around
    if( !out_space || stack_len < 2 )
        return;

    Space * parent = out_space;

    if( index > 0 )
    {
        parent = pstack[index - 1].space;
    }

    // an item's space can be nullptr (this method clears it after removing), so do not dereference it blindly
    if( !parent )
        return;

    if( xml_compact_mode )
    {
        // IMPLEMENT ME
    }
    else
    {
        Space * childs_tab = parent->get_space(L"childs");

        // the pointer has to be tested before table_size() is called on it
        if( childs_tab && childs_tab->is_table() )
        {
            size_t len = childs_tab->table_size();

            if( len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space )
            {
                childs_tab->remove(len - 1);
                pstack[stack_len-2].space = nullptr;
            }
        }
    }
}
void HTMLParser::AddSpaceToSpaceTree(const Space & space)
{
const std::wstring * text = space.get_wstr(L"text");
@ -2086,6 +2129,22 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
/*
 * asks the registered listener about the item at pstack[index]
 *
 * returns true only when a listener is set and its item_parsed() returned false
 * (the callers then call RemoveLastSpace()); returns false when there is no listener
 * or when the listener returned true
 */
bool HTMLParser::RemoveIfNeeded(size_t index)
{
    if( !item_parsed_listener )
        return false;

    const bool listener_accepted = item_parsed_listener->item_parsed(pstack[index]);

    return !listener_accepted;
}
void HTMLParser::ReadLoop()
{
while( status == ok && ReadItem() )
@ -2111,6 +2170,12 @@ void HTMLParser::ReadLoop()
else
if( LastItem().type == Item::simple )
{
if( stack_len > 0 )
{
if( RemoveIfNeeded(stack_len - 1) )
RemoveLastSpace(stack_len - 1);
}
PopStack();
}
else

View File

@ -52,7 +52,6 @@ namespace pt
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30
@ -110,6 +109,66 @@ public:
};
// orphans for one language
// (the struct has no constructor or member initializers, so max_len
// holds no defined value until it is assigned)
struct Orphans
{
// table of orphan strings
std::vector<std::wstring> tab;
// presumably the length of the longest string in tab,
// see CalcOrphansMaxLen(Orphans &) - TODO confirm
size_t max_len;
};
// one item kept on the parser stack (pstack is an array of Item),
// a reference to an Item is also passed to ItemParsedListener::item_parsed()
struct Item
{
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
// kind of the tag
enum Type
{
opening, /* sample: <h1> */
closing, /* sample: </h1> */
simple, /* sample: <br/> */
special, /* sample: <!doctype> */
none
} type;
// NOTE(review): presumably set when the item is a commentary (<!-- ... -->) - confirm
bool is_commentary;
// NOTE(review): presumably: is there a new line before this tag - confirm
bool new_line_before;
// is there a new line after this tag
bool new_line;
// is there a new line in the middle
// NOTE(review): the original comment was cut off after "is there a new",
// the exact meaning of "in the middle" should be confirmed
bool new_line_in_the_middle;
// current orphans table
// (will be propagated)
Orphans * porphans;
// this item or one from its parents is a 'body' html tag
// (will be propagated)
bool has_body_tag;
// NOTE(review): meaning not visible in this part of the file - confirm
size_t tree_index;
// space connected with this item,
// can be nullptr (RemoveLastSpace() sets it to nullptr after removing the space)
Space * space;
// both are defined outside of this struct
// (presumably Clear() resets the fields to their defaults - TODO confirm)
void Clear();
Item();
};
/*
 * a listener which can be registered with HTMLParser::set_item_parsed_listener()
 *
 * derive from this class and override item_parsed() to be notified about parsed items
 */
class ItemParsedListener
{
public:

    ItemParsedListener() = default;

    /*
     * called by the parser (through HTMLParser::RemoveIfNeeded()) with a parsed item
     *
     * when false is returned RemoveIfNeeded() returns true and the parser calls RemoveLastSpace(),
     * this default implementation always returns true
     */
    virtual bool item_parsed(const Item & item)
    {
        return true;
    }

    virtual ~ItemParsedListener() = default;
};
/*
the last status of parsing, set by parse() methods
*/
@ -120,6 +179,8 @@ public:
HTMLParser & operator=(const HTMLParser & f);
virtual ~HTMLParser();
void set_item_parsed_listener(ItemParsedListener * listener);
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
@ -222,12 +283,6 @@ protected:
bool xml_compact_mode;
// orphans for one language
// (the struct has no constructor or member initializers, so max_len
// holds no defined value until it is assigned)
struct Orphans
{
// table of orphan strings
std::vector<std::wstring> tab;
// presumably the length of the longest string in tab,
// see CalcOrphansMaxLen(Orphans &) - TODO confirm
size_t max_len;
};
// orphans for all languages
@ -238,45 +293,9 @@ protected:
// html <nofilter> tag name
std::wstring no_filter_tag;
ItemParsedListener * item_parsed_listener;
// one item kept on the parser stack (pstack is an array of Item)
struct Item
{
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
// kind of the tag
enum Type
{
opening, /* sample: <h1> */
closing, /* sample: </h1> */
simple, /* sample: <br/> */
special, /* sample: <!doctype> */
none
} type;
// NOTE(review): presumably set when the item is a commentary (<!-- ... -->) - confirm
bool is_commentary;
// NOTE(review): presumably: is there a new line before this tag - confirm
bool new_line_before;
// is there a new line after this tag
bool new_line;
// is there a new line in the middle
// NOTE(review): the original comment was cut off after "is there a new",
// the exact meaning of "in the middle" should be confirmed
bool new_line_in_the_middle;
// current orphans table
// (will be propagated)
Orphans * porphans;
// this item or one from its parents is a 'body' html tag
// (will be propagated)
bool has_body_tag;
// NOTE(review): meaning not visible in this part of the file - confirm
size_t tree_index;
// space connected with this item,
// can be nullptr (RemoveLastSpace() sets it to nullptr after removing the space)
Space * space;
// both are defined outside of this struct
// (presumably Clear() resets the fields to their defaults - TODO confirm)
void Clear();
Item();
};
@ -398,8 +417,11 @@ protected:
void CalcOrphansMaxLen(Orphans & orphans);
void AddItemToSpace();
void RemoveLastSpace(size_t index);
void AddSpaceToSpaceTree(const Space & space);
bool RemoveIfNeeded(size_t index);
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack