diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 4983010..4186445 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
name.clear();
type = none;
is_commentary = false;
+ is_cdata = false;
porphans = nullptr;
new_line_before = false;
new_line = false;
@@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') ||
(c>='0' && c<='9') ||
- c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
+ c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
return true;
return false;
@@ -696,6 +697,13 @@ size_t i;
read_char();
break;
}
+
+ if( LastItem().type == Item::special && name == L"![CDATA[" )
+ {
+ LastItem().is_cdata = true;
+ read_char();
+ break;
+ }
}
read_char();
@@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
}
-void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
+bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
{
- while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+ bool was_closing_tag = false;
+
+ while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
{
+ if( is_cdata )
+ {
+ if( lastc == ']' )
+ {
+ read_char();
+
+ if( lastc == ']' )
+ {
+ read_char();
+
+ if( IsClosingTagMark(lastc) )
+ {
+ read_char();
+ was_closing_tag = true;
+ break;
+ }
+ else
+ {
+ str += ']';
+ str += ']';
+ }
+ }
+ else
+ {
+ str += ']';
+ }
+ }
+ }
+ else
+ {
+ if( !char_was_escaped && IsOpeningTagMark(lastc) )
+ {
+ was_closing_tag = true;
+ break;
+ }
+ }
+
str += lastc;
read_char();
}
@@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
else
Put(str);
+
+ return was_closing_tag;
}
@@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
}
+
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{
tmp_text.clear();
@@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
// reading text between html tags
-void HTMLParser::ReadText()
+void HTMLParser::ReadText(bool is_cdata)
{
bool was_white_char = false;
bool was_new_line = false;
@@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
text_space_wstr = &wstr_space.value.value_wstring;
}
- while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+ bool was_closing_tag = false;
+
+ while( lastc != -1 && !was_closing_tag )
{
tmp_text.clear();
- PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
+ was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
if( !tmp_text.empty() )
{
@@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
Put(LastItem().name);
}
else
+ if( LastItem().is_cdata )
+ {
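+ // nothing to read here: the CDATA content is consumed later by ReadText(is_cdata = true), called from ReadLoop(), which stops at the closing "]]>"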
+ }
+ else
{
tmp_text.clear();
SkipWhiteLines();
@@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
{
while( status == ok && ReadItem() )
{
+ bool was_cdata = false;
+
if( LastItem().type == Item::opening )
{
if( parsing_html )
@@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
if( LastItem().is_commentary )
ReadTextUntilClosingCommentary();
+ if( LastItem().is_cdata )
+ was_cdata = true;
+
PopStack();
}
else
@@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
if( status == ok )
{
- ReadText();
+ ReadText(was_cdata);
}
is_first_item = false;
@@ -2331,7 +2393,7 @@ void HTMLParser::Read()
// it can be some text or white lines before the first html tag (we print it if using filtering)
// but they are not added to the Space tree
- ReadText();
+ ReadText(false);
// reading the whole html source
ReadLoop();
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 0f352a9..7797b51 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -132,6 +132,8 @@ public:
bool is_commentary;
+ bool is_cdata;
+
bool new_line_before;
// is there a new line after this tag
@@ -392,7 +394,7 @@ protected:
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
- void ReadText();
+ void ReadText(bool is_cdata);
bool PrintRest();
bool PrintOpeningItem();
void ReadItemName(std::wstring & name, bool clear_name = true);
@@ -415,7 +417,7 @@ protected:
void CheckChar(wchar_t c);
- void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
+ bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
void PutTabs(size_t len);