Read CDATA as ordinary text
This commit is contained in:
parent
b781948f21
commit
fd1a8270cd
|
@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
|
||||||
name.clear();
|
name.clear();
|
||||||
type = none;
|
type = none;
|
||||||
is_commentary = false;
|
is_commentary = false;
|
||||||
|
is_cdata = false;
|
||||||
porphans = nullptr;
|
porphans = nullptr;
|
||||||
new_line_before = false;
|
new_line_before = false;
|
||||||
new_line = false;
|
new_line = false;
|
||||||
|
@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
|
||||||
if( (c>='a' && c<='z') ||
|
if( (c>='a' && c<='z') ||
|
||||||
(c>='A' && c<='Z') ||
|
(c>='A' && c<='Z') ||
|
||||||
(c>='0' && c<='9') ||
|
(c>='0' && c<='9') ||
|
||||||
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
|
c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -696,6 +697,13 @@ size_t i;
|
||||||
read_char();
|
read_char();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if( LastItem().type == Item::special && name == L"![CDATA[" )
|
||||||
|
{
|
||||||
|
LastItem().is_cdata = true;
|
||||||
|
read_char();
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
read_char();
|
read_char();
|
||||||
|
@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
|
||||||
{
|
{
|
||||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
bool was_closing_tag = false;
|
||||||
|
|
||||||
|
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
|
||||||
{
|
{
|
||||||
|
if( is_cdata )
|
||||||
|
{
|
||||||
|
if( lastc == ']' )
|
||||||
|
{
|
||||||
|
read_char();
|
||||||
|
|
||||||
|
if( lastc == ']' )
|
||||||
|
{
|
||||||
|
read_char();
|
||||||
|
|
||||||
|
if( IsClosingTagMark(lastc) )
|
||||||
|
{
|
||||||
|
read_char();
|
||||||
|
was_closing_tag = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
str += ']';
|
||||||
|
str += ']';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
str += ']';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
||||||
|
{
|
||||||
|
was_closing_tag = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
str += lastc;
|
str += lastc;
|
||||||
read_char();
|
read_char();
|
||||||
}
|
}
|
||||||
|
@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
|
||||||
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
|
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
|
||||||
else
|
else
|
||||||
Put(str);
|
Put(str);
|
||||||
|
|
||||||
|
return was_closing_tag;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
|
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
|
||||||
{
|
{
|
||||||
tmp_text.clear();
|
tmp_text.clear();
|
||||||
|
@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
||||||
|
|
||||||
|
|
||||||
// reading text between html tags
|
// reading text between html tags
|
||||||
void HTMLParser::ReadText()
|
void HTMLParser::ReadText(bool is_cdata)
|
||||||
{
|
{
|
||||||
bool was_white_char = false;
|
bool was_white_char = false;
|
||||||
bool was_new_line = false;
|
bool was_new_line = false;
|
||||||
|
@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
|
||||||
text_space_wstr = &wstr_space.value.value_wstring;
|
text_space_wstr = &wstr_space.value.value_wstring;
|
||||||
}
|
}
|
||||||
|
|
||||||
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
bool was_closing_tag = false;
|
||||||
|
|
||||||
|
while( lastc != -1 && !was_closing_tag )
|
||||||
{
|
{
|
||||||
tmp_text.clear();
|
tmp_text.clear();
|
||||||
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
|
was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
|
||||||
|
|
||||||
if( !tmp_text.empty() )
|
if( !tmp_text.empty() )
|
||||||
{
|
{
|
||||||
|
@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
|
||||||
Put(LastItem().name);
|
Put(LastItem().name);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
if( LastItem().is_cdata )
|
||||||
|
{
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
tmp_text.clear();
|
tmp_text.clear();
|
||||||
SkipWhiteLines();
|
SkipWhiteLines();
|
||||||
|
@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
|
||||||
{
|
{
|
||||||
while( status == ok && ReadItem() )
|
while( status == ok && ReadItem() )
|
||||||
{
|
{
|
||||||
|
bool was_cdata = false;
|
||||||
|
|
||||||
if( LastItem().type == Item::opening )
|
if( LastItem().type == Item::opening )
|
||||||
{
|
{
|
||||||
if( parsing_html )
|
if( parsing_html )
|
||||||
|
@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
|
||||||
if( LastItem().is_commentary )
|
if( LastItem().is_commentary )
|
||||||
ReadTextUntilClosingCommentary();
|
ReadTextUntilClosingCommentary();
|
||||||
|
|
||||||
|
if( LastItem().is_cdata )
|
||||||
|
was_cdata = true;
|
||||||
|
|
||||||
PopStack();
|
PopStack();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
|
||||||
|
|
||||||
if( status == ok )
|
if( status == ok )
|
||||||
{
|
{
|
||||||
ReadText();
|
ReadText(was_cdata);
|
||||||
}
|
}
|
||||||
|
|
||||||
is_first_item = false;
|
is_first_item = false;
|
||||||
|
@ -2331,7 +2393,7 @@ void HTMLParser::Read()
|
||||||
|
|
||||||
// it can be some text or white lines before the first html tag (we print it if using filtering)
|
// it can be some text or white lines before the first html tag (we print it if using filtering)
|
||||||
// but they are not added to the Space tree
|
// but they are not added to the Space tree
|
||||||
ReadText();
|
ReadText(false);
|
||||||
|
|
||||||
// reading the whole html source
|
// reading the whole html source
|
||||||
ReadLoop();
|
ReadLoop();
|
||||||
|
|
|
@ -132,6 +132,8 @@ public:
|
||||||
|
|
||||||
bool is_commentary;
|
bool is_commentary;
|
||||||
|
|
||||||
|
bool is_cdata;
|
||||||
|
|
||||||
bool new_line_before;
|
bool new_line_before;
|
||||||
|
|
||||||
// is there a new line after this tag
|
// is there a new line after this tag
|
||||||
|
@ -392,7 +394,7 @@ protected:
|
||||||
void CheckStackPrintRest();
|
void CheckStackPrintRest();
|
||||||
void AddForgottenTags();
|
void AddForgottenTags();
|
||||||
void CheckClosingTags();
|
void CheckClosingTags();
|
||||||
void ReadText();
|
void ReadText(bool is_cdata);
|
||||||
bool PrintRest();
|
bool PrintRest();
|
||||||
bool PrintOpeningItem();
|
bool PrintOpeningItem();
|
||||||
void ReadItemName(std::wstring & name, bool clear_name = true);
|
void ReadItemName(std::wstring & name, bool clear_name = true);
|
||||||
|
@ -415,7 +417,7 @@ protected:
|
||||||
|
|
||||||
void CheckChar(wchar_t c);
|
void CheckChar(wchar_t c);
|
||||||
|
|
||||||
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
|
bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
|
||||||
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
|
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
|
||||||
|
|
||||||
void PutTabs(size_t len);
|
void PutTabs(size_t len);
|
||||||
|
|
Loading…
Reference in New Issue