Read CDATA as ordinary text

This commit is contained in:
Tomasz Sowa 2022-01-18 19:36:40 +01:00
parent b781948f21
commit fd1a8270cd
2 changed files with 74 additions and 10 deletions

View File

@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
name.clear();
type = none;
is_commentary = false;
is_cdata = false;
porphans = nullptr;
new_line_before = false;
new_line = false;
@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') ||
(c>='0' && c<='9') ||
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
return true;
return false;
@ -696,6 +697,13 @@ size_t i;
read_char();
break;
}
if( LastItem().type == Item::special && name == L"![CDATA[" )
{
LastItem().is_cdata = true;
read_char();
break;
}
}
read_char();
@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
}
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
{
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
bool was_closing_tag = false;
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
{
if( is_cdata )
{
if( lastc == ']' )
{
read_char();
if( lastc == ']' )
{
read_char();
if( IsClosingTagMark(lastc) )
{
read_char();
was_closing_tag = true;
break;
}
else
{
str += ']';
str += ']';
}
}
else
{
str += ']';
}
}
}
else
{
if( !char_was_escaped && IsOpeningTagMark(lastc) )
{
was_closing_tag = true;
break;
}
}
str += lastc;
read_char();
}
@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
else
Put(str);
return was_closing_tag;
}
@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
}
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{
tmp_text.clear();
@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
// reading text between html tags
void HTMLParser::ReadText()
void HTMLParser::ReadText(bool is_cdata)
{
bool was_white_char = false;
bool was_new_line = false;
@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
text_space_wstr = &wstr_space.value.value_wstring;
}
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
bool was_closing_tag = false;
while( lastc != -1 && !was_closing_tag )
{
tmp_text.clear();
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
if( !tmp_text.empty() )
{
@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
Put(LastItem().name);
}
else
if( LastItem().is_cdata )
{
// do nothing
}
else
{
tmp_text.clear();
SkipWhiteLines();
@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
{
while( status == ok && ReadItem() )
{
bool was_cdata = false;
if( LastItem().type == Item::opening )
{
if( parsing_html )
@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
if( LastItem().is_commentary )
ReadTextUntilClosingCommentary();
if( LastItem().is_cdata )
was_cdata = true;
PopStack();
}
else
@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
if( status == ok )
{
ReadText();
ReadText(was_cdata);
}
is_first_item = false;
@ -2331,7 +2393,7 @@ void HTMLParser::Read()
// it can be some text or white lines before the first html tag (we print it if using filtering)
// but they are not added to the Space tree
ReadText();
ReadText(false);
// reading the whole html source
ReadLoop();

View File

@ -132,6 +132,8 @@ public:
bool is_commentary;
bool is_cdata;
bool new_line_before;
// is there a new line after this tag
@ -392,7 +394,7 @@ protected:
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
void ReadText();
void ReadText(bool is_cdata);
bool PrintRest();
bool PrintOpeningItem();
void ReadItemName(std::wstring & name, bool clear_name = true);
@ -415,7 +417,7 @@ protected:
void CheckChar(wchar_t c);
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
void PutTabs(size_t len);