diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 1187a67..7b422f5 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -71,6 +71,7 @@ HTMLParser::Item::Item() void HTMLParser::Filter(const wchar_t * in, std::wstring & out) { + parsing_html = true; reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = in; @@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out) stack_len = 0; out_string = &out; //last_new_line = false; - was_ending_commentary = false; line_len = 0; out_string->clear(); @@ -369,17 +369,27 @@ return false; } -void HTMLParser::SkipWhite() +void HTMLParser::SkipWhite(std::wstring * out_string) { while( IsWhite(lastc) ) + { + if( out_string ) + (*out_string) += lastc; + read_char(); + } } -void HTMLParser::SkipWhiteLines() +void HTMLParser::SkipWhiteLines(std::wstring * out_string) { while( lastc==10 || IsWhite(lastc) ) + { + if( out_string ) + (*out_string) += lastc; + read_char(); + } } @@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine() + + void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text) { bool is_quoted = false; @@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, { str += lastc; read_char(); - - if( IsEndingCommentaryTagMarkAtEndOfString(str) ) - { - str.erase(str.size() - 3); // IMPROVEME define a function or what - was_ending_commentary = true; - break; - } } if( !str.empty() ) @@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item) if( skip_tags || !IsTagSafe(item.name) ) return; - if( !item.is_commentary ) + if( item.is_commentary ) + { + Put('-'); + Put('-'); + PutClosingTagMark(); + } + else { PutOpeningTagMark(); Put('/'); + Put(item.name); + PutClosingTagMark(); } - - Put(item.name); - PutClosingTagMark(); } @@ -991,20 +1001,6 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c) } -bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str) -{ - static wchar_t comm_end[] = L"-->"; - size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1; - - if( str.size() >= comm_end_len ) - { - return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end); - } - - return false; -} - - bool HTMLParser::IsStartingEntityMark(wchar_t c) { return (c == '&'); @@ -1018,6 +1014,113 @@ bool HTMLParser::IsEndingEntityMark(wchar_t c) +// used for such tags as: script, pre, textarea +void HTMLParser::ReadTextUntilClosingCommentary() +{ + while( lastc != -1 ) + { + if( lastc == '-' ) + { + tmp_text.clear(); + tmp_text += lastc; + read_char(); + + if( lastc == '-' ) + { + tmp_text += lastc; + read_char(); + + if( IsClosingTagMark(lastc) ) + { + tmp_text += lastc; + read_char(); + Put(tmp_text); + + break; + } + } + + Put(tmp_text); + } + else + { + Put(lastc); + read_char(); + } + } +} + + +bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well) +{ + tmp_text.clear(); + tmp_text += lastc; // opening tag mark + read_char(); + + SkipWhiteLines(&tmp_text); + + if( IsClosingTagIndicator(lastc) ) + { + tmp_text += lastc; + read_char(); + SkipWhiteLines(&tmp_text); + ReadItemName(tmp_name); + + if( IsNameEqual(tmp_name, LastItem().name) ) + { + SkipAndCheckClosingTag(); + + if( put_closing_tag_as_well ) + { + Put('<'); + Put('/'); + Put(tmp_name); + Put('>'); + } + + return true; + } + else + { + Put(tmp_text); + Put(tmp_name); + } + } + else + { + Put(tmp_text); + } + +return false; +} + + + + +// used for such tags as: script, pre, textarea +void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well) +{ + while( lastc != -1 ) + { + if( IsOpeningTagMark(lastc) ) + { + if( IsClosingTagForLastItem(put_closing_tag_as_well) ) + { + //CheckNewLine(); + break; + } + } + else + { + Put(lastc); + read_char(); + } + } +} + + + + // reading text between html tags void HTMLParser::ReadText() { @@ -1026,8 +1129,6 @@ void HTMLParser::ReadText() bool was_non_white_text = false; - was_ending_commentary = false; - bool allow_put_new_line = false; bool allow_put_space = false; @@ -1061,9 +1162,6 @@ void HTMLParser::ReadText() } else { - if( was_ending_commentary ) - break; - PutNormalWhite(was_white_char, was_new_line); if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE ) @@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem() if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) LastItem().tree_index += 1; - if( was_ending_commentary ) - { - LastItem().type = Item::closing; - LastItem().is_commentary = true; - LastItem().name = L"--"; - was_ending_commentary = false; - } - else - { - read_char(); // skipping the first opening tag mark '<' - SkipWhiteLines(); + read_char(); // skipping the first opening tag mark '<' + SkipWhiteLines(); - if( IsSpecialTagIndicator(lastc) ) - ReadItemSpecial(); - else - if( IsClosingTagIndicator(lastc) ) - ReadItemClosing(); - else - ReadItemOpening(); - } + if( IsSpecialTagIndicator(lastc) ) + ReadItemSpecial(); + else + if( IsClosingTagIndicator(lastc) ) + ReadItemClosing(); + else + ReadItemOpening(); // IMPROVE ME later CheckSingleItemExceptions() can change opening to single type ItemFound(); @@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item) bool change_white_mode = false; // in safe_mode the script tag is ignored - if( !safe_mode && IsNameEqual(item.name, L"script") ) - { - change_white_mode = true; - } - - if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") ) +// if( !safe_mode && IsNameEqual(item.name, L"script") ) +// { +// change_white_mode = true; +// } + +// if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") ) +// { +// change_white_mode = true; +// } + + if( IsNameEqual(item.name, L"pre") ) { change_white_mode = true; } + // move to CheckDifferentContentExceptions? if( IsNameEqual(item.name, no_filter_tag) ) { change_white_mode = true; @@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item) +void HTMLParser::CheckDifferentContentExceptions(Item & item) +{ + if( !safe_mode && IsNameEqual(item.name, L"script") ) + { + ReadTextUntilClosingTag(true); + PopStack(); + } + + if( IsNameEqual(item.name, L"textarea") ) + { + ReadTextUntilClosingTag(true); + PopStack(); + } +} + + + + + void HTMLParser::AddForgottenTags() { @@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop() { if( LastItem().type == Item::opening ) { - CheckSingleItemExceptions(); + if( parsing_html ) + { + CheckSingleItemExceptions(); + } + CheckWhiteCharsExceptions(LastItem()); + CheckDifferentContentExceptions(LastItem()); } else if( LastItem().type == Item::special ) { - if( !LastItem().is_commentary ) - PopStack(); + if( LastItem().is_commentary ) + ReadTextUntilClosingCommentary(); + + PopStack(); } else if( LastItem().type == Item::simple ) @@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop() } ReadText(); + is_first_item = false; } } diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index 7e42eea..8bf6969 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -172,6 +172,12 @@ public: protected: + /* + * true when parsing html input, false for parsing xml + */ + bool parsing_html; + + // orphans for one language struct Orphans { @@ -246,14 +252,13 @@ protected: virtual bool IsStartingEntityMark(wchar_t c); virtual bool IsEndingEntityMark(wchar_t c); - virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str); - virtual bool IsValidCharForName(int c); virtual bool IsValidCharForAttrName(int c); virtual bool IsValidCharForEntityName(int c); virtual void CheckSingleItemExceptions(); virtual void CheckWhiteCharsExceptions(Item & item); + virtual void CheckDifferentContentExceptions(Item & item); virtual void Put(wchar_t c); virtual void Put(const wchar_t * str, const wchar_t * end); @@ -299,12 +304,15 @@ protected: bool CheckOrphan(const wchar_t * str, const wchar_t * end); bool IsWhite(int c); - void SkipWhite(); - void SkipWhiteLines(); + void SkipWhite(std::wstring * out_string = nullptr); + void SkipWhiteLines(std::wstring * out_string = nullptr); void SkipWhiteWithFirstNewLine(); int current_white_char_mode(); + void ReadTextUntilClosingCommentary(); + bool IsClosingTagForLastItem(bool put_closing_tag_as_well); + void ReadTextUntilClosingTag(bool put_closing_tag_as_well); void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr); void PopStack(); @@ -354,7 +362,6 @@ protected: bool is_first_item; size_t wrap_line; // insert a new line character into long lines size_t tab_size; - bool was_ending_commentary; OrphanMode orphan_mode; std::wstring attr_name; std::vector attr_value;