diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 7b422f5..43e7d57 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -59,6 +59,7 @@ void HTMLParser::Item::Clear() new_line_in_the_middle = false; has_body_tag = false; tree_index = 0; + space = nullptr; } @@ -68,6 +69,27 @@ HTMLParser::Item::Item() } +void HTMLParser::parse_html(const wchar_t * in, Space & space) +{ + parsing_html = true; + reading_from_file = false; + reading_from_wchar_string = true; + pchar_unicode = in; + pchar_ascii = 0; + + stack_len = 0; + out_string = nullptr; + out_space = &space; + //last_new_line = false; + line_len = 0; + out_space->clear(); + + Init(); + Read(); + Uninit(); +} + + void HTMLParser::Filter(const wchar_t * in, std::wstring & out) { @@ -79,6 +101,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out) stack_len = 0; out_string = &out; + out_space = nullptr; //last_new_line = false; line_len = 0; out_string->clear(); @@ -347,6 +370,8 @@ bool HTMLParser::PushStack() return true; } + + void HTMLParser::PopStack() { if( stack_len == 0 ) @@ -609,7 +634,9 @@ void HTMLParser::CheckChar(wchar_t c) void HTMLParser::Put(wchar_t c) { - (*out_string) += c; + if( out_string ) + (*out_string) += c; + CheckChar(c); } @@ -620,7 +647,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end) return; size_t len = end - str; - out_string->append(str, len); + + if( out_string ) + out_string->append(str, len); for( ; str < end ; ++str) CheckChar(*str); @@ -632,7 +661,8 @@ void HTMLParser::Put(const std::wstring & str) { if( !str.empty() ) { - out_string->append(str); + if( out_string ) + out_string->append(str); for(size_t i=0 ; i < str.size() ; ++i) CheckChar(str[i]); @@ -805,7 +835,7 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, } -void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line) +void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text) { was_white_char = false; was_new_line = false; @@ -817,6 +847,9 @@ void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line) else was_white_char = true; + if( result_text ) + (*result_text) += lastc; + if( current_white_char_mode() == WHITE_MODE_ORIGIN ) { Put(lastc); @@ -939,7 +972,10 @@ void HTMLParser::PutTabs(size_t len) len = 30; for(size_t i=0 ; i < (len*tab_size) ; ++i) - (*out_string) += ' '; // we do not add them to 'line_len' + { + if( out_string ) + (*out_string) += ' '; // we do not add them to 'line_len' + } } @@ -1140,6 +1176,18 @@ void HTMLParser::ReadText() } } + Space * text_space = nullptr; + std::wstring * text_space_wstr = nullptr; + + if( out_space ) + { + text_space = &text_space_tmp; + text_space->clear(); + text_space->add(L"name", L""); + Space & wstr_space = text_space->add(L"text", L""); + text_space_wstr = &wstr_space.value.value_wstring; + } + while( lastc != -1 && !IsOpeningTagMark(lastc) ) { tmp_text.clear(); @@ -1150,19 +1198,22 @@ void HTMLParser::ReadText() allow_put_new_line = false; allow_put_space = false; was_non_white_text = true; + + if( text_space_wstr ) + (*text_space_wstr) += tmp_text; } if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) ) { if( lastc == 10 || IsWhite(lastc) ) { - SkipWhiteLines(); + SkipWhiteLines(text_space_wstr); PutNonBreakingSpace(); } } else { - PutNormalWhite(was_white_char, was_new_line); + PutNormalWhite(was_white_char, was_new_line, text_space_wstr); if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE ) { @@ -1190,6 +1241,12 @@ void HTMLParser::ReadText() } } + if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text ) + { + AddSpaceToSpaceTree(*text_space); + } + + text_space_tmp.clear(); new_item_has_new_line_before = was_new_line; } @@ -1292,6 +1349,28 @@ size_t i; } +void HTMLParser::PutItemAttrToSpace() +{ + Space * space = LastItem().space; + + if( space ) + { + Space & attr_tab = space->get_add_space(L"attr"); + Space & attr = attr_tab.add_empty_space(attr_name); + + if( attr_has_value ) + { + attr.set_empty_table(); + + for(size_t i=0 ; i < attr_value.size() ; ++i) + { + attr.add(attr_value[i]); + } + } + } +} + + void HTMLParser::ReadItemClosing() { read_char(); // skipping '/' @@ -1358,13 +1437,19 @@ void HTMLParser::ReadItemOpening() { LastItem().type = Item::opening; ReadItemName(LastItem().name); + AddItemToSpace(); + Space * space = LastItem().space; + if( space ) + space->add(L"name", LastItem().name); + if( PrintOpeningItem() ) { while( ReadItemAttr() ) { CheckItemLangAttr(); PrintItemAttr(); + PutItemAttrToSpace(); } SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' @@ -1748,6 +1833,36 @@ bool HTMLParser::PrintRest() +void HTMLParser::AddItemToSpace() +{ + if( out_space && stack_len > 0 ) + { + if( stack_len == 1 ) + { + pstack[stack_len-1].space = out_space; + } + else + { + // stack_len > 1 + Space & childs_tab = pstack[stack_len-2].space->get_add_space(L"childs"); + Space & child = childs_tab.add_empty_space(); + pstack[stack_len-1].space = &child; + } + } +} + + +void HTMLParser::AddSpaceToSpaceTree(const Space & space) +{ + if( out_space && stack_len > 0 ) + { + Space & childs_tab = LastItem().space->get_add_space(L"childs"); + childs_tab.add(space); + } +} + + + void HTMLParser::ReadLoop() { while( ReadItem() ) @@ -1759,6 +1874,7 @@ void HTMLParser::ReadLoop() CheckSingleItemExceptions(); } + CheckWhiteCharsExceptions(LastItem()); CheckDifferentContentExceptions(LastItem()); } @@ -1804,7 +1920,8 @@ void HTMLParser::Read() if( current_white_char_mode() != WHITE_MODE_ORIGIN ) SkipWhiteLines(); - // it can be some text or white lines before the first html tag (we print it) + // it can be some text or white lines before the first html tag (we print it if using filtering) + // but they are not added to the Space tree ReadText(); // reading the whole html source diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index 8bf6969..9575f93 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -43,6 +43,7 @@ #include #include #include "convert/baseparser.h" +#include "space/space.h" namespace pt @@ -106,6 +107,9 @@ public: virtual ~HTMLParser(); + void parse_html(const wchar_t * in, Space & space); + + // main methods used for filtering void Filter(const wchar_t * in, std::wstring & out); void Filter(const std::wstring & in, std::wstring & out); @@ -228,6 +232,8 @@ protected: size_t tree_index; + Space * space; + void Clear(); Item(); }; @@ -331,6 +337,7 @@ protected: bool ReadItemAttr(); void CheckItemLangAttr(); void PrintItemAttr(); + void PutItemAttrToSpace(); void ReadItemClosing(); void ReadItemSpecial(); @@ -342,17 +349,22 @@ protected: void CheckChar(wchar_t c); void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space); - void PutNormalWhite(bool & was_white_char, bool & was_new_line); + void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr); void PutTabs(size_t len); void PutNonBreakingSpace(); void CalcOrphansMaxLen(Orphans & orphans); + void AddItemToSpace(); + void AddSpaceToSpaceTree(const Space & space); + Item empty; Item * pstack; // stack pointer size_t stack_len; // length of the stack wchar_t * buffer; // buffer used when printing std::wstring * out_string; + Space * out_space; + Space text_space_tmp; std::vector white_char_mode_tab;