// Patch summary (review header; the patch text itself follows unchanged below).
//
// Purpose: adds an optional "compact" output mode to HTMLParser when the parsed
// document is written into a Space tree.
//
//  * parse_html() gains a third parameter, compact_mode (declared with the
//    default value false in htmlparser.h); the value is stored in the new
//    protected member xml_compact_mode.
//  * ReadItemOpening() adds the L"name" field to the item's space only when
//    xml_compact_mode is false.
//  * AddItemToSpace():
//      - compact mode: the item's space is stored in the parent under the item's
//        own name. If an entry with that name already exists and is_table(), a
//        new empty space is appended to it; if it exists and is not a table, a
//        new Space is allocated, the existing entry is added to it, a new empty
//        space is appended, and the parent's value_object entry is replaced.
//      - otherwise: the item's space is a new empty entry in the parent's
//        L"childs" table.
//  * AddSpaceToSpaceTree() reads L"text" from the given space and does nothing
//    when it is absent. In compact mode the text is stored under L"text" on the
//    last item's space (an existing non-table entry is replaced by a newly
//    allocated Space holding both the old and the new value); otherwise the
//    whole space is appended to L"childs".
//
// NOTE(review): for stack_len == 1 in non-compact mode the removed code assigned
//   out_space directly to the root item; the added code creates an entry in
//   out_space's L"childs" table instead -- confirm this change is intended.
// NOTE(review): the "text" requirement in AddSpaceToSpaceTree() also gates the
//   non-compact branch, which previously added every space -- confirm.
// NOTE(review): parent (taken from pstack[stack_len-2].space) is dereferenced
//   without a null check -- confirm it can never be null here.
// NOTE(review): both `new Space()` sites overwrite a value_object[...] entry;
//   confirm who owns/frees the previously stored object (possible leak or
//   double ownership, depending on what Space::add does with its argument).
// NOTE(review): xml_compact_mode is assigned only in parse_html() within this
//   patch -- confirm it is initialised for other entry points / the constructor.
// NOTE(review): in compact mode text is stored under the key L"text"; presumably
//   this can collide with a child element named "text" -- verify.
// NOTE(review): the line breaks of this patch appear to have been lost (the
//   whole patch is on two physical lines); restore them before applying.
//
diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 43e7d57..63dcc61 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -69,13 +69,14 @@ HTMLParser::Item::Item() } -void HTMLParser::parse_html(const wchar_t * in, Space & space) +void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode) { parsing_html = true; reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = in; pchar_ascii = 0; + xml_compact_mode = compact_mode; stack_len = 0; out_string = nullptr; @@ -1440,7 +1441,7 @@ void HTMLParser::ReadItemOpening() AddItemToSpace(); Space * space = LastItem().space; - if( space ) + if( !xml_compact_mode && space ) space->add(L"name", LastItem().name); if( PrintOpeningItem() ) @@ -1837,27 +1838,85 @@ void HTMLParser::AddItemToSpace() { if( out_space && stack_len > 0 ) { - if( stack_len == 1 ) + Space * parent = out_space; + + if( stack_len > 1 ) { - pstack[stack_len-1].space = out_space; + parent = pstack[stack_len-2].space; + } + + if( xml_compact_mode ) + { + Space * space = parent->get_space(pstack[stack_len-1].name); + + if( space ) + { + if( space->is_table() ) + { + Space & child = space->add_empty_space(); + pstack[stack_len-1].space = &child; + } + else + { + Space * tab = new Space(); + tab->add(space); + Space & child = tab->add_empty_space(); + + parent->value.value_object[pstack[stack_len-1].name] = tab; + pstack[stack_len-1].space = &child; + } + } + else + { + Space & space = parent->add_empty_space(pstack[stack_len-1].name); + pstack[stack_len-1].space = &space; + } } else { - // stack_len > 1 - Space & childs_tab = pstack[stack_len-2].space->get_add_space(L"childs"); + Space & childs_tab = parent->get_add_space(L"childs"); Space & child = childs_tab.add_empty_space(); pstack[stack_len-1].space = &child; } + } } void HTMLParser::AddSpaceToSpaceTree(const Space & space) { - if( out_space && stack_len > 0 ) + const std::wstring * text = space.get_wstr(L"text"); + + if( out_space 
&& stack_len > 0 && text ) { - Space & childs_tab = LastItem().space->get_add_space(L"childs"); - childs_tab.add(space); + if( xml_compact_mode ) + { + Space * child_text = LastItem().space->get_space(L"text"); + + if( child_text ) + { + if( child_text->is_table() ) + { + child_text->add(*text); + } + else + { + Space * tab = new Space(); + tab->add(*child_text); + tab->add(*text); + LastItem().space->value.value_object[L"text"] = tab; + } + } + else + { + LastItem().space->add(L"text", *text); + } + } + else + { + Space & childs_tab = LastItem().space->get_add_space(L"childs"); + childs_tab.add(space); + } } } diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index 9575f93..c90a3cc 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -107,7 +107,7 @@ public: virtual ~HTMLParser(); - void parse_html(const wchar_t * in, Space & space); + void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); // main methods used for filtering @@ -182,6 +182,8 @@ protected: bool parsing_html; + bool xml_compact_mode; + // orphans for one language struct Orphans {