read CDATA as ordinary text
parent
b781948f21
commit
fd1a8270cd
|
@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
|
|||
name.clear();
|
||||
type = none;
|
||||
is_commentary = false;
|
||||
is_cdata = false;
|
||||
porphans = nullptr;
|
||||
new_line_before = false;
|
||||
new_line = false;
|
||||
|
@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
|
|||
if( (c>='a' && c<='z') ||
|
||||
(c>='A' && c<='Z') ||
|
||||
(c>='0' && c<='9') ||
|
||||
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
|
||||
c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -696,6 +697,13 @@ size_t i;
|
|||
read_char();
|
||||
break;
|
||||
}
|
||||
|
||||
if( LastItem().type == Item::special && name == L"![CDATA[" )
|
||||
{
|
||||
LastItem().is_cdata = true;
|
||||
read_char();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
read_char();
|
||||
|
@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
|
|||
}
|
||||
|
||||
|
||||
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
||||
bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
|
||||
{
|
||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||
bool was_closing_tag = false;
|
||||
|
||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
|
||||
{
|
||||
if( is_cdata )
|
||||
{
|
||||
if( lastc == ']' )
|
||||
{
|
||||
read_char();
|
||||
|
||||
if( lastc == ']' )
|
||||
{
|
||||
read_char();
|
||||
|
||||
if( IsClosingTagMark(lastc) )
|
||||
{
|
||||
read_char();
|
||||
was_closing_tag = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
str += ']';
|
||||
str += ']';
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
str += ']';
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
||||
{
|
||||
was_closing_tag = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
str += lastc;
|
||||
read_char();
|
||||
}
|
||||
|
@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
|
|||
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
|
||||
else
|
||||
Put(str);
|
||||
|
||||
return was_closing_tag;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
|
|||
}
|
||||
|
||||
|
||||
|
||||
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
|
||||
{
|
||||
tmp_text.clear();
|
||||
|
@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
|||
|
||||
|
||||
// reading text between html tags
|
||||
void HTMLParser::ReadText()
|
||||
void HTMLParser::ReadText(bool is_cdata)
|
||||
{
|
||||
bool was_white_char = false;
|
||||
bool was_new_line = false;
|
||||
|
@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
|
|||
text_space_wstr = &wstr_space.value.value_wstring;
|
||||
}
|
||||
|
||||
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||
bool was_closing_tag = false;
|
||||
|
||||
while( lastc != -1 && !was_closing_tag )
|
||||
{
|
||||
tmp_text.clear();
|
||||
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
|
||||
was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
|
||||
|
||||
if( !tmp_text.empty() )
|
||||
{
|
||||
|
@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
|
|||
Put(LastItem().name);
|
||||
}
|
||||
else
|
||||
if( LastItem().is_cdata )
|
||||
{
|
||||
// do nothing
|
||||
}
|
||||
else
|
||||
{
|
||||
tmp_text.clear();
|
||||
SkipWhiteLines();
|
||||
|
@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
|
|||
{
|
||||
while( status == ok && ReadItem() )
|
||||
{
|
||||
bool was_cdata = false;
|
||||
|
||||
if( LastItem().type == Item::opening )
|
||||
{
|
||||
if( parsing_html )
|
||||
|
@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
|
|||
if( LastItem().is_commentary )
|
||||
ReadTextUntilClosingCommentary();
|
||||
|
||||
if( LastItem().is_cdata )
|
||||
was_cdata = true;
|
||||
|
||||
PopStack();
|
||||
}
|
||||
else
|
||||
|
@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
|
|||
|
||||
if( status == ok )
|
||||
{
|
||||
ReadText();
|
||||
ReadText(was_cdata);
|
||||
}
|
||||
|
||||
is_first_item = false;
|
||||
|
@ -2331,7 +2393,7 @@ void HTMLParser::Read()
|
|||
|
||||
// there may be some text or white lines before the first html tag (we print them if filtering is used)
|
||||
// but they are not added to the Space tree
|
||||
ReadText();
|
||||
ReadText(false);
|
||||
|
||||
// reading the whole html source
|
||||
ReadLoop();
|
||||
|
|
|
@ -132,6 +132,8 @@ public:
|
|||
|
||||
bool is_commentary;
|
||||
|
||||
bool is_cdata;
|
||||
|
||||
bool new_line_before;
|
||||
|
||||
// is there a new line after this tag
|
||||
|
@ -392,7 +394,7 @@ protected:
|
|||
void CheckStackPrintRest();
|
||||
void AddForgottenTags();
|
||||
void CheckClosingTags();
|
||||
void ReadText();
|
||||
void ReadText(bool is_cdata);
|
||||
bool PrintRest();
|
||||
bool PrintOpeningItem();
|
||||
void ReadItemName(std::wstring & name, bool clear_name = true);
|
||||
|
@ -415,7 +417,7 @@ protected:
|
|||
|
||||
void CheckChar(wchar_t c);
|
||||
|
||||
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
|
||||
bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
|
||||
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
|
||||
|
||||
void PutTabs(size_t len);
|
||||
|
|
Loading…
Reference in New Issue