diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 21a826f..9b24071 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -36,7 +36,6 @@ */ #include "htmlparser.h" - #include "convert/text.h" @@ -102,6 +101,13 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode } +void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener) +{ + item_parsed_listener = listener; +} + + + HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) { clear_input_flags(); @@ -296,6 +302,7 @@ void HTMLParser::SetSomeDefaults() skip_commentaries = false; skip_entities = false; analyze_entities = false; + item_parsed_listener = nullptr; } @@ -1941,6 +1948,12 @@ void HTMLParser::CheckClosingTags() return; } + // CHECK ME + if( RemoveIfNeeded(stack_len - 2) ) + { + RemoveLastSpace(i); + } + for(int z=(int)stack_len-2 ; z >= i ; --z) { CheckWhiteCharsExceptions(pstack[z]); @@ -2047,6 +2060,36 @@ void HTMLParser::AddItemToSpace() } + +void HTMLParser::RemoveLastSpace(size_t index) +{ + if( out_space ) + { + Space * parent = out_space; + + if( index > 0 ) + { + parent = pstack[index - 1].space; + } + + if( xml_compact_mode ) + { + // IMPLEMENT ME + } + else + { + Space * childs_tab = parent->get_space(L"childs"); + size_t len = childs_tab->table_size(); + + if( childs_tab && childs_tab->is_table() && len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space ) + { + childs_tab->remove(len - 1); + pstack[stack_len-2].space = nullptr; + } + } + } +} + void HTMLParser::AddSpaceToSpaceTree(const Space & space) { const std::wstring * text = space.get_wstr(L"text"); @@ -2086,6 +2129,22 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space) + +bool HTMLParser::RemoveIfNeeded(size_t index) +{ + if( item_parsed_listener ) + { + if( !item_parsed_listener->item_parsed(pstack[index]) ) + { + return true; + } + } + + return false; +} + + + void HTMLParser::ReadLoop() { while( status == ok && ReadItem() ) @@ -2111,6 +2170,12 @@ void HTMLParser::ReadLoop() else if( LastItem().type == Item::simple ) { + if( stack_len > 0 ) + { + if( RemoveIfNeeded(stack_len - 1) ) + RemoveLastSpace(stack_len - 1); + } + PopStack(); } else diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index caf5cf1..da0074e 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -52,7 +52,6 @@ namespace pt - // max length of a name of a html tag (with terminating null) #define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30 @@ -110,6 +109,66 @@ public: }; + // orphans for one language + struct Orphans + { + std::vector tab; + size_t max_len; + }; + + + struct Item + { + std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN + + enum Type + { + opening, /* sample:

*/ + closing, /* sample:

*/ + simple, /* sample:
*/ + special, /* sample: */ + none + } type; + + bool is_commentary; + + bool new_line_before; + + // is there a new line after this tag + bool new_line; + + // is there a new + bool new_line_in_the_middle; + + // current orphans table + // (will be propagated) + Orphans * porphans; + + // this item or one from its parents is a 'body' html tag + // (will be propagated) + bool has_body_tag; + + size_t tree_index; + + Space * space; + + void Clear(); + Item(); + }; + + + class ItemParsedListener + { + public: + + ItemParsedListener() {} + + virtual bool item_parsed(const Item & item) { return true; } + virtual ~ItemParsedListener() {} + + }; + + /* the last status of parsing, set by parse() methods */ @@ -120,6 +179,8 @@ public: HTMLParser & operator=(const HTMLParser & f); virtual ~HTMLParser(); + void set_item_parsed_listener(ItemParsedListener * listener); + void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); @@ -222,12 +283,6 @@ protected: bool xml_compact_mode; - // orphans for one language - struct Orphans - { - std::vector tab; - size_t max_len; - }; // orphans for all languages @@ -238,45 +293,9 @@ protected: // html tag name std::wstring no_filter_tag; + ItemParsedListener * item_parsed_listener; - struct Item - { - std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN - enum Type - { - opening, /* sample:

*/ - closing, /* sample:

*/ - simple, /* sample:
*/ - special, /* sample: */ - none - } type; - - bool is_commentary; - - bool new_line_before; - - // is there a new line after this tag - bool new_line; - - // is there a new - bool new_line_in_the_middle; - - // current orphans table - // (will be propagated) - Orphans * porphans; - - // this item or one from its parents is a 'body' html tag - // (will be propagated) - bool has_body_tag; - - size_t tree_index; - - Space * space; - - void Clear(); - Item(); - }; @@ -398,8 +417,11 @@ protected: void CalcOrphansMaxLen(Orphans & orphans); void AddItemToSpace(); + void RemoveLastSpace(size_t index); void AddSpaceToSpaceTree(const Space & space); + bool RemoveIfNeeded(size_t index); + Item empty; Item * pstack; // stack pointer size_t stack_len; // length of the stack