diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 21a826f..9b24071 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -36,7 +36,6 @@
*/
#include "htmlparser.h"
-
#include "convert/text.h"
@@ -102,6 +101,13 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
}
+/*
+ * Registers the listener that RemoveIfNeeded() consults for parsed items.
+ * Only the raw pointer is stored here; passing nullptr means that no
+ * listener is consulted.
+ */
+void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener)
+{
+	this->item_parsed_listener = listener;
+}
+
+
+
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
clear_input_flags();
@@ -296,6 +302,7 @@ void HTMLParser::SetSomeDefaults()
skip_commentaries = false;
skip_entities = false;
analyze_entities = false;
+ item_parsed_listener = nullptr;
}
@@ -1941,6 +1948,12 @@ void HTMLParser::CheckClosingTags()
return;
}
+ // CHECK ME
+ if( RemoveIfNeeded(stack_len - 2) )
+ {
+ RemoveLastSpace(i);
+ }
+
for(int z=(int)stack_len-2 ; z >= i ; --z)
{
CheckWhiteCharsExceptions(pstack[z]);
@@ -2047,6 +2060,36 @@ void HTMLParser::AddItemToSpace()
}
+
+/*
+ * Detaches the last entry of the "childs" table that belongs to the parent
+ * of the stack item at 'index'.
+ *
+ * The parent is out_space when index is zero, otherwise the space of the
+ * stack item at index-1. The last child is removed only when it is the same
+ * Space object as pstack[stack_len-2].space; that pointer is then reset to
+ * nullptr so the stack item no longer refers to the removed entry.
+ *
+ * Nothing is done when there is no output space, when the parent has no
+ * space, when the parent has no "childs" table, or when fewer than two
+ * items are on the stack. Compact mode is not implemented yet.
+ *
+ * NOTE(review): the compared item is always pstack[stack_len-2], regardless
+ * of 'index'. ReadLoop() passes index == stack_len-1, in which case
+ * pstack[stack_len-2] is the parent itself and no removal can match.
+ * Confirm whether pstack[index] was intended.
+ */
+void HTMLParser::RemoveLastSpace(size_t index)
+{
+	if( !out_space )
+		return;
+
+	Space * parent = out_space;
+
+	if( index > 0 )
+	{
+		parent = pstack[index - 1].space;
+	}
+
+	// item spaces can be nullptr (this method resets one below)
+	if( !parent )
+		return;
+
+	if( xml_compact_mode )
+	{
+		// IMPLEMENT ME
+		return;
+	}
+
+	// stack_len is unsigned: stack_len-2 would wrap around for a shorter stack
+	if( stack_len < 2 )
+		return;
+
+	Space * childs_tab = parent->get_space(L"childs");
+
+	// check the pointer before asking for the table size
+	if( !childs_tab || !childs_tab->is_table() )
+		return;
+
+	size_t len = childs_tab->table_size();
+
+	if( len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space )
+	{
+		childs_tab->remove(len - 1);
+		pstack[stack_len-2].space = nullptr;
+	}
+}
+
void HTMLParser::AddSpaceToSpaceTree(const Space & space)
{
const std::wstring * text = space.get_wstr(L"text");
@@ -2086,6 +2129,22 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
+
+/*
+ * Asks the registered listener about the stack item at 'index'.
+ *
+ * Returns true when a listener is set and its item_parsed() returned false;
+ * the visible callers then call RemoveLastSpace(). Returns false otherwise,
+ * including when no listener is registered.
+ */
+bool HTMLParser::RemoveIfNeeded(size_t index)
+{
+	if( !item_parsed_listener )
+		return false;
+
+	const bool accepted = item_parsed_listener->item_parsed(pstack[index]);
+
+	return !accepted;
+}
+
+
+
void HTMLParser::ReadLoop()
{
while( status == ok && ReadItem() )
@@ -2111,6 +2170,12 @@ void HTMLParser::ReadLoop()
else
if( LastItem().type == Item::simple )
{
+ if( stack_len > 0 )
+ {
+ if( RemoveIfNeeded(stack_len - 1) )
+ RemoveLastSpace(stack_len - 1);
+ }
+
PopStack();
}
else
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index caf5cf1..da0074e 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -52,7 +52,6 @@ namespace pt
-
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30
@@ -110,6 +109,66 @@ public:
};
+ // orphans for one language
+ struct Orphans
+ {
+ std::vector tab; // NOTE(review): the element type (template argument) appears to be missing in this copy - confirm against the repository
+ size_t max_len; // presumably the length of the longest entry in tab (see CalcOrphansMaxLen) - confirm
+ };
+
+
+ struct Item // one entry of the parser stack (see pstack)
+ {
+ std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
+
+ enum Type // NOTE(review): the sample tags below were lost in this copy and are reconstructed - confirm
+ {
+ opening, /* sample: <h1> */
+ closing, /* sample: </h1> */
+ simple, /* sample: <br/> */
+ special, /* sample: <!doctype> */
+ none
+ } type;
+
+ bool is_commentary;
+
+ bool new_line_before; // presumably: is there a new line before this tag - confirm
+
+ // is there a new line after this tag
+ bool new_line;
+
+ // is there a new line in the middle of this tag (the original comment was truncated; wording inferred from the field name - confirm)
+ bool new_line_in_the_middle;
+
+ // current orphans table
+ // (will be propagated)
+ Orphans * porphans;
+
+ // this item or one from its parents is a 'body' html tag
+ // (will be propagated)
+ bool has_body_tag;
+
+ size_t tree_index;
+
+ Space * space; // Space node of this item; RemoveLastSpace() resets it to nullptr after removing the node from its parent's "childs" table
+
+ void Clear();
+ Item();
+ };
+
+
+ /*
+  * Callback interface registered with set_item_parsed_listener().
+  *
+  * item_parsed() is called by RemoveIfNeeded() with an item from the parser
+  * stack. Returning false makes RemoveIfNeeded() report that the item should
+  * be removed; the default implementation returns true for every item.
+  */
+ class ItemParsedListener
+ {
+ public:
+
+     ItemParsedListener() = default;
+
+     virtual bool item_parsed(const Item & item)
+     {
+         return true;
+     }
+
+     virtual ~ItemParsedListener() = default;
+
+ };
+
+
/*
the last status of parsing, set by parse() methods
*/
@@ -120,6 +179,8 @@ public:
HTMLParser & operator=(const HTMLParser & f);
virtual ~HTMLParser();
+ void set_item_parsed_listener(ItemParsedListener * listener);
+
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
@@ -222,12 +283,6 @@ protected:
bool xml_compact_mode;
- // orphans for one language
- struct Orphans
- {
- std::vector tab;
- size_t max_len;
- };
// orphans for all languages
@@ -238,45 +293,9 @@ protected:
// html tag name
std::wstring no_filter_tag;
+ ItemParsedListener * item_parsed_listener;
- struct Item
- {
- std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
- enum Type
- {
- opening, /* sample: */
- closing, /* sample:
*/
- simple, /* sample:
*/
- special, /* sample: */
- none
- } type;
-
- bool is_commentary;
-
- bool new_line_before;
-
- // is there a new line after this tag
- bool new_line;
-
- // is there a new
- bool new_line_in_the_middle;
-
- // current orphans table
- // (will be propagated)
- Orphans * porphans;
-
- // this item or one from its parents is a 'body' html tag
- // (will be propagated)
- bool has_body_tag;
-
- size_t tree_index;
-
- Space * space;
-
- void Clear();
- Item();
- };
@@ -398,8 +417,11 @@ protected:
void CalcOrphansMaxLen(Orphans & orphans);
void AddItemToSpace();
+ void RemoveLastSpace(size_t index);
void AddSpaceToSpaceTree(const Space & space);
+ bool RemoveIfNeeded(size_t index);
+
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack