read CDATA as ordinary text

This commit is contained in:
Tomasz Sowa 2022-01-18 19:36:40 +01:00
parent b781948f21
commit fd1a8270cd
2 changed files with 74 additions and 10 deletions

View File

@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
name.clear(); name.clear();
type = none; type = none;
is_commentary = false; is_commentary = false;
is_cdata = false;
porphans = nullptr; porphans = nullptr;
new_line_before = false; new_line_before = false;
new_line = false; new_line = false;
@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
if( (c>='a' && c<='z') || if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') || (c>='A' && c<='Z') ||
(c>='0' && c<='9') || (c>='0' && c<='9') ||
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
return true; return true;
return false; return false;
@ -696,6 +697,13 @@ size_t i;
read_char(); read_char();
break; break;
} }
if( LastItem().type == Item::special && name == L"![CDATA[" )
{
LastItem().is_cdata = true;
read_char();
break;
}
} }
read_char(); read_char();
@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
} }
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space) bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
{ {
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) ) bool was_closing_tag = false;
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
{ {
if( is_cdata )
{
if( lastc == ']' )
{
read_char();
if( lastc == ']' )
{
read_char();
if( IsClosingTagMark(lastc) )
{
read_char();
was_closing_tag = true;
break;
}
else
{
str += ']';
str += ']';
}
}
else
{
str += ']';
}
}
}
else
{
if( !char_was_escaped && IsOpeningTagMark(lastc) )
{
was_closing_tag = true;
break;
}
}
str += lastc; str += lastc;
read_char(); read_char();
} }
@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr); AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
else else
Put(str); Put(str);
return was_closing_tag;
} }
@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
} }
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well) bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{ {
tmp_text.clear(); tmp_text.clear();
@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
// reading text between html tags // reading text between html tags
void HTMLParser::ReadText() void HTMLParser::ReadText(bool is_cdata)
{ {
bool was_white_char = false; bool was_white_char = false;
bool was_new_line = false; bool was_new_line = false;
@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
text_space_wstr = &wstr_space.value.value_wstring; text_space_wstr = &wstr_space.value.value_wstring;
} }
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) ) bool was_closing_tag = false;
while( lastc != -1 && !was_closing_tag )
{ {
tmp_text.clear(); tmp_text.clear();
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space); was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
if( !tmp_text.empty() ) if( !tmp_text.empty() )
{ {
@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
Put(LastItem().name); Put(LastItem().name);
} }
else else
if( LastItem().is_cdata )
{
// do nothing
}
else
{ {
tmp_text.clear(); tmp_text.clear();
SkipWhiteLines(); SkipWhiteLines();
@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
{ {
while( status == ok && ReadItem() ) while( status == ok && ReadItem() )
{ {
bool was_cdata = false;
if( LastItem().type == Item::opening ) if( LastItem().type == Item::opening )
{ {
if( parsing_html ) if( parsing_html )
@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
if( LastItem().is_commentary ) if( LastItem().is_commentary )
ReadTextUntilClosingCommentary(); ReadTextUntilClosingCommentary();
if( LastItem().is_cdata )
was_cdata = true;
PopStack(); PopStack();
} }
else else
@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
if( status == ok ) if( status == ok )
{ {
ReadText(); ReadText(was_cdata);
} }
is_first_item = false; is_first_item = false;
@ -2331,7 +2393,7 @@ void HTMLParser::Read()
// it can be some text or white lines before the first html tag (we print it if using filtering) // it can be some text or white lines before the first html tag (we print it if using filtering)
// but they are not added to the Space tree // but they are not added to the Space tree
ReadText(); ReadText(false);
// reading the whole html source // reading the whole html source
ReadLoop(); ReadLoop();

View File

@ -132,6 +132,8 @@ public:
bool is_commentary; bool is_commentary;
bool is_cdata;
bool new_line_before; bool new_line_before;
// is there a new line after this tag // is there a new line after this tag
@ -392,7 +394,7 @@ protected:
void CheckStackPrintRest(); void CheckStackPrintRest();
void AddForgottenTags(); void AddForgottenTags();
void CheckClosingTags(); void CheckClosingTags();
void ReadText(); void ReadText(bool is_cdata);
bool PrintRest(); bool PrintRest();
bool PrintOpeningItem(); bool PrintOpeningItem();
void ReadItemName(std::wstring & name, bool clear_name = true); void ReadItemName(std::wstring & name, bool clear_name = true);
@ -415,7 +417,7 @@ protected:
void CheckChar(wchar_t c); void CheckChar(wchar_t c);
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space); bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr); void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
void PutTabs(size_t len); void PutTabs(size_t len);