Browse Source

HTMLParser: for <script> and <!-- (comments) we copy the content without parsing

htmlparserlistener
Tomasz Sowa 1 year ago
parent
commit
8c5ede5cf3
  1. 235
      src/html/htmlparser.cpp
  2. 17
      src/html/htmlparser.h

235
src/html/htmlparser.cpp

@ -71,6 +71,7 @@ HTMLParser::Item::Item()
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
{
parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0;
out_string = &out;
//last_new_line = false;
was_ending_commentary = false;
line_len = 0;
out_string->clear();
@ -369,17 +369,27 @@ return false;
}
void HTMLParser::SkipWhite()
// Consumes consecutive white characters from the input (as reported by IsWhite()).
// When out_string is not null every consumed character is appended to it,
// so the caller can later reproduce the skipped text.
void HTMLParser::SkipWhite(std::wstring * out_string)
{
	for( ; IsWhite(lastc) ; read_char() )
	{
		if( out_string != nullptr )
			out_string->push_back(lastc);
	}
}
void HTMLParser::SkipWhiteLines()
// Consumes consecutive white characters and line feeds (character code 10).
// When out_string is not null every consumed character is appended to it,
// so the caller can later reproduce the skipped text.
void HTMLParser::SkipWhiteLines(std::wstring * out_string)
{
	for(;;)
	{
		const bool is_line_feed = (lastc == 10);

		if( !is_line_feed && !IsWhite(lastc) )
			break;

		if( out_string != nullptr )
			out_string->push_back(lastc);

		read_char();
	}
}
@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine()
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
{
bool is_quoted = false;
@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
{
str += lastc;
read_char();
if( IsEndingCommentaryTagMarkAtEndOfString(str) )
{
str.erase(str.size() - 3); // IMPROVEME define a function or what
was_ending_commentary = true;
break;
}
}
if( !str.empty() )
@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item)
if( skip_tags || !IsTagSafe(item.name) )
return;
if( !item.is_commentary )
if( item.is_commentary )
{
Put('-');
Put('-');
PutClosingTagMark();
}
else
{
PutOpeningTagMark();
Put('/');
Put(item.name);
PutClosingTagMark();
}
Put(item.name);
PutClosingTagMark();
}
@ -991,33 +1001,126 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
}
bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
bool HTMLParser::IsStartingEntityMark(wchar_t c)
{
static wchar_t comm_end[] = L"-->";
size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
return (c == '&');
}
// Returns true when c is the character which terminates an entity: a semicolon.
bool HTMLParser::IsEndingEntityMark(wchar_t c)
{
	const bool is_semicolon = (c == L';');
	return is_semicolon;
}
if( str.size() >= comm_end_len )
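// used for commentaries: copies the text to the output without parsing, up to and including the ending mark (two '-' followed by a closing tag mark)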
void HTMLParser::ReadTextUntilClosingCommentary()
{
while( lastc != -1 )
{
return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
}
if( lastc == '-' )
{
tmp_text.clear();
tmp_text += lastc;
read_char();
return false;
if( lastc == '-' )
{
tmp_text += lastc;
read_char();
if( IsClosingTagMark(lastc) )
{
tmp_text += lastc;
read_char();
Put(tmp_text);
break;
}
}
Put(tmp_text);
}
else
{
Put(lastc);
read_char();
}
}
}
bool HTMLParser::IsStartingEntityMark(wchar_t c)
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{
return (c == '&');
tmp_text.clear();
tmp_text += lastc; // opening tag mark
read_char();
SkipWhiteLines(&tmp_text);
if( IsClosingTagIndicator(lastc) )
{
tmp_text += lastc;
read_char();
SkipWhiteLines(&tmp_text);
ReadItemName(tmp_name);
if( IsNameEqual(tmp_name, LastItem().name) )
{
SkipAndCheckClosingTag();
if( put_closing_tag_as_well )
{
Put('<');
Put('/');
Put(tmp_name);
Put('>');
}
return true;
}
else
{
Put(tmp_text);
Put(tmp_name);
}
}
else
{
Put(tmp_text);
}
return false;
}
bool HTMLParser::IsEndingEntityMark(wchar_t c)
// used for such tags as: script, textarea (their content is copied to the output without parsing)
void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
{
return (c == ';');
while( lastc != -1 )
{
if( IsOpeningTagMark(lastc) )
{
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
{
//CheckNewLine();
break;
}
}
else
{
Put(lastc);
read_char();
}
}
}
// reading text between html tags
void HTMLParser::ReadText()
{
@ -1026,8 +1129,6 @@ void HTMLParser::ReadText()
bool was_non_white_text = false;
was_ending_commentary = false;
bool allow_put_new_line = false;
bool allow_put_space = false;
@ -1061,9 +1162,6 @@ void HTMLParser::ReadText()
}
else
{
if( was_ending_commentary )
break;
PutNormalWhite(was_white_char, was_new_line);
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem()
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
LastItem().tree_index += 1;
if( was_ending_commentary )
{
LastItem().type = Item::closing;
LastItem().is_commentary = true;
LastItem().name = L"--";
was_ending_commentary = false;
}
else
{
read_char(); // skipping the first opening tag mark '<'
SkipWhiteLines();
read_char(); // skipping the first opening tag mark '<'
SkipWhiteLines();
if( IsSpecialTagIndicator(lastc) )
ReadItemSpecial();
else
if( IsClosingTagIndicator(lastc) )
ReadItemClosing();
else
ReadItemOpening();
}
if( IsSpecialTagIndicator(lastc) )
ReadItemSpecial();
else
if( IsClosingTagIndicator(lastc) )
ReadItemClosing();
else
ReadItemOpening();
// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
ItemFound();
@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
bool change_white_mode = false;
// in safe_mode the script tag is ignored
if( !safe_mode && IsNameEqual(item.name, L"script") )
{
change_white_mode = true;
}
// if( !safe_mode && IsNameEqual(item.name, L"script") )
// {
// change_white_mode = true;
// }
// if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
// {
// change_white_mode = true;
// }
if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
if( IsNameEqual(item.name, L"pre") )
{
change_white_mode = true;
}
// move to CheckDifferentContentExceptions?
if( IsNameEqual(item.name, no_filter_tag) )
{
change_white_mode = true;
@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
// Handles tags whose content must be copied to the output without parsing.
//
// For <script> (only when safe_mode is off) and for <textarea> the text up to
// the matching closing tag is copied by ReadTextUntilClosingTag(true) -- the
// closing tag is put to the output as well -- and then PopStack() is called.
//
// item: the tag which has just been opened; ReadLoop() passes LastItem() here.
//
// The tag is classified before anything is read or popped: 'item' refers to the
// last stack item, so it should not be inspected any more once PopStack() has
// been called for it.
void HTMLParser::CheckDifferentContentExceptions(Item & item)
{
	const bool is_script   = !safe_mode && IsNameEqual(item.name, L"script");
	const bool is_textarea = IsNameEqual(item.name, L"textarea");

	if( is_script || is_textarea )
	{
		ReadTextUntilClosingTag(true);
		PopStack();
	}
}
void HTMLParser::AddForgottenTags()
{
@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop()
{
if( LastItem().type == Item::opening )
{
CheckSingleItemExceptions();
if( parsing_html )
{
CheckSingleItemExceptions();
}
CheckWhiteCharsExceptions(LastItem());
CheckDifferentContentExceptions(LastItem());
}
else
if( LastItem().type == Item::special )
{
if( !LastItem().is_commentary )
PopStack();
if( LastItem().is_commentary )
ReadTextUntilClosingCommentary();
PopStack();
}
else
if( LastItem().type == Item::simple )
@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop()
}
ReadText();
is_first_item = false;
}
}

17
src/html/htmlparser.h

@ -172,6 +172,12 @@ public:
protected:
/*
* true when parsing html input, false for parsing xml
*/
bool parsing_html;
// orphans for one language
struct Orphans
{
@ -246,14 +252,13 @@ protected:
virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c);
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c);
virtual void CheckSingleItemExceptions();
virtual void CheckWhiteCharsExceptions(Item & item);
virtual void CheckDifferentContentExceptions(Item & item);
virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str, const wchar_t * end);
@ -299,12 +304,15 @@ protected:
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
bool IsWhite(int c);
void SkipWhite();
void SkipWhiteLines();
void SkipWhite(std::wstring * out_string = nullptr);
void SkipWhiteLines(std::wstring * out_string = nullptr);
void SkipWhiteWithFirstNewLine();
int current_white_char_mode();
void ReadTextUntilClosingCommentary();
bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
void PopStack();
@ -354,7 +362,6 @@ protected:
bool is_first_item;
size_t wrap_line; // insert a new line character into long lines
size_t tab_size;
bool was_ending_commentary;
OrphanMode orphan_mode;
std::wstring attr_name;
std::vector<std::wstring> attr_value;

Loading…
Cancel
Save