HTMLParser: add support for CDATA sections (<![CDATA[ ... ]]>)

* Item gets an is_cdata flag which is reset in Item::Clear().
* '[' is accepted by IsValidCharForName() so that "![CDATA[" can be read as
  the name of a special item; ReadItemName() sets is_cdata for that name.
* ReadItemSpecial() reads nothing more for a CDATA item; ReadLoop() passes
  the flag to ReadText(), which forwards it to PutNormalNonWhite().
* In CDATA mode PutNormalNonWhite() does not stop at an opening tag mark,
  it ends the text at "]]>". A ']' which does not start "]]>" is copied to
  the text and the character following it goes through the loop test again,
  so EOF, a new line or a white character is never appended to the word.
  In a run of three or more ']' only the last two can belong to "]]>".

NOTE(review): indentation and blank context lines should be verified against
the tree before applying.

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 4983010..4186445 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
 	name.clear();
 	type = none;
 	is_commentary = false;
+	is_cdata = false;
 	porphans = nullptr;
 	new_line_before = false;
 	new_line = false;
@@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
+		c=='-' || c=='!' || c==':' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
 		return true;
 
 	return false;
@@ -696,6 +697,13 @@ size_t i;
 				read_char();
 				break;
 			}
+
+			if( LastItem().type == Item::special && name == L"![CDATA[" )
+			{
+				LastItem().is_cdata = true;
+				read_char();
+				break;
+			}
 		}
 
 		read_char();
@@ -1002,10 +1010,59 @@
 	return CheckOrphan(str, end, LastItem().porphans->tab);
 }
 
-void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
+// returns true if the text was ended by a tag mark: an unescaped opening tag mark
+// in the normal mode or the ]]> terminator (which is consumed) when is_cdata is true
+bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
 {
-	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+	bool was_closing_tag = false;
+
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
 	{
+		if( is_cdata )
+		{
+			if( lastc == ']' )
+			{
+				read_char();
+
+				if( lastc == ']' )
+				{
+					read_char();
+
+					// in a run of three or more ']' only the last two can belong to ]]>
+					while( lastc == ']' )
+					{
+						str += ']';
+						read_char();
+					}
+
+					if( IsClosingTagMark(lastc) )
+					{
+						read_char();
+						was_closing_tag = true;
+						break;
+					}
+
+					str += ']';
+					str += ']';
+				}
+				else
+				{
+					str += ']';
+				}
+
+				// lastc was not tested yet (it can be EOF, a new line or a white char)
+				continue;
+			}
+		}
+		else
+		{
+			if( !char_was_escaped && IsOpeningTagMark(lastc) )
+			{
+				was_closing_tag = true;
+				break;
+			}
+		}
+
 		str += lastc;
 		read_char();
 	}
@@ -1028,6 +1085,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 		AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
 	else
 		Put(str);
+
+	return was_closing_tag;
 }
 
 
@@ -1290,6 +1349,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
 }
 
 
+
 bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
 {
 	tmp_text.clear();
@@ -1361,7 +1421,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 
 
 // reading text between html tags
-void HTMLParser::ReadText()
+void HTMLParser::ReadText(bool is_cdata)
 {
 	bool was_white_char = false;
 	bool was_new_line = false;
@@ -1391,10 +1451,12 @@ void HTMLParser::ReadText()
 		text_space_wstr = &wstr_space.value.value_wstring;
 	}
 
-	while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+	bool was_closing_tag = false;
+
+	while( lastc != -1 && !was_closing_tag )
 	{
 		tmp_text.clear();
-		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
+		was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
 
 		if( !tmp_text.empty() )
 		{
@@ -1627,6 +1689,11 @@ void HTMLParser::ReadItemSpecial()
 		Put(LastItem().name);
 	}
 	else
+	if( LastItem().is_cdata )
+	{
+		// nothing to read here: ReadLoop() passes the CDATA flag to ReadText()
+	}
+	else
 	{
 		tmp_text.clear();
 		SkipWhiteLines();
@@ -2158,6 +2225,8 @@ void HTMLParser::ReadLoop()
 {
 	while( status == ok && ReadItem() )
 	{
+		bool was_cdata = false;
+
 		if( LastItem().type == Item::opening )
 		{
 			if( parsing_html )
@@ -2174,6 +2243,9 @@ void HTMLParser::ReadLoop()
 			if( LastItem().is_commentary )
 				ReadTextUntilClosingCommentary();
 
+			if( LastItem().is_cdata )
+				was_cdata = true;
+
 			PopStack();
 		}
 		else
@@ -2199,7 +2271,7 @@ void HTMLParser::ReadLoop()
 
 		if( status == ok )
 		{
-			ReadText();
+			ReadText(was_cdata);
 		}
 
 		is_first_item = false;
@@ -2331,7 +2403,7 @@ void HTMLParser::Read()
 
 	// it can be some text or white lines before the first html tag (we print it if using filtering)
 	// but they are not added to the Space tree
-	ReadText();
+	ReadText(false);
 
 	// reading the whole html source
 	ReadLoop();
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 0f352a9..7797b51 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -132,6 +132,8 @@ public:
 
 		bool is_commentary;
 
+		bool is_cdata;
+
 		bool new_line_before;
 
 		// is there a new line after this tag
@@ -392,7 +394,7 @@ protected:
 	void CheckStackPrintRest();
 	void AddForgottenTags();
 	void CheckClosingTags();
-	void ReadText();
+	void ReadText(bool is_cdata);
 	bool PrintRest();
 	bool PrintOpeningItem();
 	void ReadItemName(std::wstring & name, bool clear_name = true);
@@ -415,7 +417,7 @@ protected:
 
 	void CheckChar(wchar_t c);
 
-	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
+	bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
 	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
 	void PutTabs(size_t len);
 