HTMLParser: for <script> and <!-- (comments) we copy the content without parsing

This commit is contained in:
Tomasz Sowa 2021-08-07 02:13:13 +02:00
parent fdfd0b1385
commit 8c5ede5cf3
2 changed files with 194 additions and 66 deletions

View File

@ -71,6 +71,7 @@ HTMLParser::Item::Item()
void HTMLParser::Filter(const wchar_t * in, std::wstring & out) void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
{ {
parsing_html = true;
reading_from_file = false; reading_from_file = false;
reading_from_wchar_string = true; reading_from_wchar_string = true;
pchar_unicode = in; pchar_unicode = in;
@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0; stack_len = 0;
out_string = &out; out_string = &out;
//last_new_line = false; //last_new_line = false;
was_ending_commentary = false;
line_len = 0; line_len = 0;
out_string->clear(); out_string->clear();
@ -369,17 +369,27 @@ return false;
} }
void HTMLParser::SkipWhite() void HTMLParser::SkipWhite(std::wstring * out_string)
{ {
while( IsWhite(lastc) ) while( IsWhite(lastc) )
{
if( out_string )
(*out_string) += lastc;
read_char(); read_char();
}
} }
void HTMLParser::SkipWhiteLines() void HTMLParser::SkipWhiteLines(std::wstring * out_string)
{ {
while( lastc==10 || IsWhite(lastc) ) while( lastc==10 || IsWhite(lastc) )
{
if( out_string )
(*out_string) += lastc;
read_char(); read_char();
}
} }
@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine()
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text) void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
{ {
bool is_quoted = false; bool is_quoted = false;
@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
{ {
str += lastc; str += lastc;
read_char(); read_char();
if( IsEndingCommentaryTagMarkAtEndOfString(str) )
{
str.erase(str.size() - 3); // IMPROVEME define a function or what
was_ending_commentary = true;
break;
}
} }
if( !str.empty() ) if( !str.empty() )
@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item)
if( skip_tags || !IsTagSafe(item.name) ) if( skip_tags || !IsTagSafe(item.name) )
return; return;
if( !item.is_commentary ) if( item.is_commentary )
{
Put('-');
Put('-');
PutClosingTagMark();
}
else
{ {
PutOpeningTagMark(); PutOpeningTagMark();
Put('/'); Put('/');
Put(item.name);
PutClosingTagMark();
} }
Put(item.name);
PutClosingTagMark();
} }
@ -991,20 +1001,6 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
} }
// Returns true when the last three characters of str are the comment
// terminator "-->" (compared with IsNameEqual); strings shorter than the
// terminator never match.
bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
{
	static wchar_t comm_end[] = L"-->";

	// the array length includes the terminating zero, hence the -1
	const size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;

	if( str.size() < comm_end_len )
		return false;

	const wchar_t * tail = str.c_str() + (str.size() - comm_end_len);

	return IsNameEqual(tail, comm_end);
}
bool HTMLParser::IsStartingEntityMark(wchar_t c) bool HTMLParser::IsStartingEntityMark(wchar_t c)
{ {
return (c == '&'); return (c == '&');
@ -1018,6 +1014,113 @@ bool HTMLParser::IsEndingEntityMark(wchar_t c)
// Copies the body of a commentary to the output without parsing it.
//
// Characters are read (starting with the current lastc) and passed to Put()
// unchanged until the end of input (lastc == -1) or until the closing
// sequence is found: two or more '-' characters directly followed by the
// closing tag mark. The closing sequence itself is written to the output
// as well and the character after it is left in lastc.
//
// A whole run of dashes is collected before checking for the closing tag
// mark, so a commentary ending with more than two dashes (e.g. "--->")
// is correctly recognised as closed.
void HTMLParser::ReadTextUntilClosingCommentary()
{
	while( lastc != -1 )
	{
		if( lastc != '-' )
		{
			Put(lastc);
			read_char();
			continue;
		}

		// gather the whole run of consecutive dashes
		tmp_text.clear();

		while( lastc == '-' )
		{
			tmp_text += lastc;
			read_char();
		}

		// at least "--" followed by the closing tag mark ends the commentary
		bool is_commentary_end = (tmp_text.size() >= 2 && IsClosingTagMark(lastc));

		if( is_commentary_end )
		{
			tmp_text += lastc;
			read_char();
		}

		Put(tmp_text);

		if( is_commentary_end )
			break;
	}
}
// Checks whether the input at the current position is a closing tag for
// the item on the top of the stack (LastItem()).
//
// On entry lastc holds the opening tag mark. The characters consumed while
// looking for the closing tag indicator and the tag name are remembered in
// tmp_text / tmp_name.
//
// Returns true when the tag name is equal to LastItem().name; the rest of
// the tag is then consumed with SkipAndCheckClosingTag() and, if
// put_closing_tag_as_well is set, "</name>" is written to the output.
//
// Returns false otherwise; everything consumed so far is written to the
// output so the caller can continue copying the text.
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{
	tmp_text.clear();
	tmp_text += lastc; // opening tag mark
	read_char();
	SkipWhiteLines(&tmp_text);

	if( !IsClosingTagIndicator(lastc) )
	{
		// not a closing tag at all, give back what was consumed
		Put(tmp_text);
		return false;
	}

	tmp_text += lastc;
	read_char();
	SkipWhiteLines(&tmp_text);
	ReadItemName(tmp_name);

	if( !IsNameEqual(tmp_name, LastItem().name) )
	{
		// a closing tag but for a different item, treat it as ordinary text
		Put(tmp_text);
		Put(tmp_name);
		return false;
	}

	SkipAndCheckClosingTag();

	if( put_closing_tag_as_well )
	{
		Put('<');
		Put('/');
		Put(tmp_name);
		Put('>');
	}

	return true;
}
// Copies the content of the last item to the output without parsing it,
// until a closing tag for that item is found or the input ends
// (lastc == -1). Called from CheckDifferentContentExceptions() for
// the script and textarea tags.
//
// put_closing_tag_as_well is passed to IsClosingTagForLastItem() and
// decides whether the closing tag itself is written to the output.
void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
{
	bool closing_tag_found = false;

	while( !closing_tag_found && lastc != -1 )
	{
		if( !IsOpeningTagMark(lastc) )
		{
			Put(lastc);
			read_char();
		}
		else
		{
			// consumes the input itself, whether or not it was the closing tag
			closing_tag_found = IsClosingTagForLastItem(put_closing_tag_as_well);
		}
	}
}
// reading text between html tags // reading text between html tags
void HTMLParser::ReadText() void HTMLParser::ReadText()
{ {
@ -1026,8 +1129,6 @@ void HTMLParser::ReadText()
bool was_non_white_text = false; bool was_non_white_text = false;
was_ending_commentary = false;
bool allow_put_new_line = false; bool allow_put_new_line = false;
bool allow_put_space = false; bool allow_put_space = false;
@ -1061,9 +1162,6 @@ void HTMLParser::ReadText()
} }
else else
{ {
if( was_ending_commentary )
break;
PutNormalWhite(was_white_char, was_new_line); PutNormalWhite(was_white_char, was_new_line);
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE ) if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem()
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
LastItem().tree_index += 1; LastItem().tree_index += 1;
if( was_ending_commentary ) read_char(); // skipping the first opening tag mark '<'
{ SkipWhiteLines();
LastItem().type = Item::closing;
LastItem().is_commentary = true;
LastItem().name = L"--";
was_ending_commentary = false;
}
else
{
read_char(); // skipping the first opening tag mark '<'
SkipWhiteLines();
if( IsSpecialTagIndicator(lastc) ) if( IsSpecialTagIndicator(lastc) )
ReadItemSpecial(); ReadItemSpecial();
else else
if( IsClosingTagIndicator(lastc) ) if( IsClosingTagIndicator(lastc) )
ReadItemClosing(); ReadItemClosing();
else else
ReadItemOpening(); ReadItemOpening();
}
// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type // IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
ItemFound(); ItemFound();
@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
bool change_white_mode = false; bool change_white_mode = false;
// in safe_mode the script tag is ignored // in safe_mode the script tag is ignored
if( !safe_mode && IsNameEqual(item.name, L"script") ) // if( !safe_mode && IsNameEqual(item.name, L"script") )
{ // {
change_white_mode = true; // change_white_mode = true;
} // }
if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") ) // if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
// {
// change_white_mode = true;
// }
if( IsNameEqual(item.name, L"pre") )
{ {
change_white_mode = true; change_white_mode = true;
} }
// move to CheckDifferentContentExceptions?
if( IsNameEqual(item.name, no_filter_tag) ) if( IsNameEqual(item.name, no_filter_tag) )
{ {
change_white_mode = true; change_white_mode = true;
@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
// Handles tags whose content is copied to the output verbatim instead of
// being parsed: script (only when safe_mode is off) and textarea.
//
// For such a tag the content up to and including the closing tag is copied
// with ReadTextUntilClosingTag(true) and the item is removed from the stack.
//
// The decision is made once, before PopStack() is called: the caller passes
// LastItem() as item, so item must not be examined after the pop.
void HTMLParser::CheckDifferentContentExceptions(Item & item)
{
	bool copy_content_verbatim =
		(!safe_mode && IsNameEqual(item.name, L"script")) ||
		IsNameEqual(item.name, L"textarea");

	if( copy_content_verbatim )
	{
		ReadTextUntilClosingTag(true);
		PopStack();
	}
}
void HTMLParser::AddForgottenTags() void HTMLParser::AddForgottenTags()
{ {
@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop()
{ {
if( LastItem().type == Item::opening ) if( LastItem().type == Item::opening )
{ {
CheckSingleItemExceptions(); if( parsing_html )
{
CheckSingleItemExceptions();
}
CheckWhiteCharsExceptions(LastItem()); CheckWhiteCharsExceptions(LastItem());
CheckDifferentContentExceptions(LastItem());
} }
else else
if( LastItem().type == Item::special ) if( LastItem().type == Item::special )
{ {
if( !LastItem().is_commentary ) if( LastItem().is_commentary )
PopStack(); ReadTextUntilClosingCommentary();
PopStack();
} }
else else
if( LastItem().type == Item::simple ) if( LastItem().type == Item::simple )
@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop()
} }
ReadText(); ReadText();
is_first_item = false; is_first_item = false;
} }
} }

View File

@ -172,6 +172,12 @@ public:
protected: protected:
/*
* true when parsing html input, false for parsing xml
*/
bool parsing_html;
// orphans for one language // orphans for one language
struct Orphans struct Orphans
{ {
@ -246,14 +252,13 @@ protected:
virtual bool IsStartingEntityMark(wchar_t c); virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c); virtual bool IsEndingEntityMark(wchar_t c);
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
virtual bool IsValidCharForName(int c); virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c); virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c); virtual bool IsValidCharForEntityName(int c);
virtual void CheckSingleItemExceptions(); virtual void CheckSingleItemExceptions();
virtual void CheckWhiteCharsExceptions(Item & item); virtual void CheckWhiteCharsExceptions(Item & item);
virtual void CheckDifferentContentExceptions(Item & item);
virtual void Put(wchar_t c); virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str, const wchar_t * end); virtual void Put(const wchar_t * str, const wchar_t * end);
@ -299,12 +304,15 @@ protected:
bool CheckOrphan(const wchar_t * str, const wchar_t * end); bool CheckOrphan(const wchar_t * str, const wchar_t * end);
bool IsWhite(int c); bool IsWhite(int c);
void SkipWhite(); void SkipWhite(std::wstring * out_string = nullptr);
void SkipWhiteLines(); void SkipWhiteLines(std::wstring * out_string = nullptr);
void SkipWhiteWithFirstNewLine(); void SkipWhiteWithFirstNewLine();
int current_white_char_mode(); int current_white_char_mode();
void ReadTextUntilClosingCommentary();
bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr); void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
void PopStack(); void PopStack();
@ -354,7 +362,6 @@ protected:
bool is_first_item; bool is_first_item;
size_t wrap_line; // insert a new line character into long lines size_t wrap_line; // insert a new line character into long lines
size_t tab_size; size_t tab_size;
bool was_ending_commentary;
OrphanMode orphan_mode; OrphanMode orphan_mode;
std::wstring attr_name; std::wstring attr_name;
std::vector<std::wstring> attr_value; std::vector<std::wstring> attr_value;