HTMLParser: for <script> and <!-- (comments) we copy the content without parsing
This commit is contained in:
parent
fdfd0b1385
commit
8c5ede5cf3
|
@ -71,6 +71,7 @@ HTMLParser::Item::Item()
|
|||
|
||||
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
||||
{
|
||||
parsing_html = true;
|
||||
reading_from_file = false;
|
||||
reading_from_wchar_string = true;
|
||||
pchar_unicode = in;
|
||||
|
@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
|||
stack_len = 0;
|
||||
out_string = &out;
|
||||
//last_new_line = false;
|
||||
was_ending_commentary = false;
|
||||
line_len = 0;
|
||||
out_string->clear();
|
||||
|
||||
|
@ -369,17 +369,27 @@ return false;
|
|||
}
|
||||
|
||||
|
||||
void HTMLParser::SkipWhite()
|
||||
void HTMLParser::SkipWhite(std::wstring * out_string)
|
||||
{
|
||||
while( IsWhite(lastc) )
|
||||
{
|
||||
if( out_string )
|
||||
(*out_string) += lastc;
|
||||
|
||||
read_char();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void HTMLParser::SkipWhiteLines()
|
||||
void HTMLParser::SkipWhiteLines(std::wstring * out_string)
|
||||
{
|
||||
while( lastc==10 || IsWhite(lastc) )
|
||||
{
|
||||
if( out_string )
|
||||
(*out_string) += lastc;
|
||||
|
||||
read_char();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine()
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
||||
{
|
||||
bool is_quoted = false;
|
||||
|
@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
|
|||
{
|
||||
str += lastc;
|
||||
read_char();
|
||||
|
||||
if( IsEndingCommentaryTagMarkAtEndOfString(str) )
|
||||
{
|
||||
str.erase(str.size() - 3); // IMPROVEME define a function or what
|
||||
was_ending_commentary = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( !str.empty() )
|
||||
|
@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item)
|
|||
if( skip_tags || !IsTagSafe(item.name) )
|
||||
return;
|
||||
|
||||
if( !item.is_commentary )
|
||||
if( item.is_commentary )
|
||||
{
|
||||
Put('-');
|
||||
Put('-');
|
||||
PutClosingTagMark();
|
||||
}
|
||||
else
|
||||
{
|
||||
PutOpeningTagMark();
|
||||
Put('/');
|
||||
Put(item.name);
|
||||
PutClosingTagMark();
|
||||
}
|
||||
|
||||
Put(item.name);
|
||||
PutClosingTagMark();
|
||||
}
|
||||
|
||||
|
||||
|
@ -991,20 +1001,6 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
|
|||
}
|
||||
|
||||
|
||||
bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
|
||||
{
|
||||
static wchar_t comm_end[] = L"-->";
|
||||
size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
|
||||
|
||||
if( str.size() >= comm_end_len )
|
||||
{
|
||||
return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool HTMLParser::IsStartingEntityMark(wchar_t c)
|
||||
{
|
||||
return (c == '&');
|
||||
|
@ -1018,6 +1014,113 @@ bool HTMLParser::IsEndingEntityMark(wchar_t c)
|
|||
|
||||
|
||||
|
||||
// used for such tags as: script, pre, textarea
|
||||
void HTMLParser::ReadTextUntilClosingCommentary()
|
||||
{
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( lastc == '-' )
|
||||
{
|
||||
tmp_text.clear();
|
||||
tmp_text += lastc;
|
||||
read_char();
|
||||
|
||||
if( lastc == '-' )
|
||||
{
|
||||
tmp_text += lastc;
|
||||
read_char();
|
||||
|
||||
if( IsClosingTagMark(lastc) )
|
||||
{
|
||||
tmp_text += lastc;
|
||||
read_char();
|
||||
Put(tmp_text);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Put(tmp_text);
|
||||
}
|
||||
else
|
||||
{
|
||||
Put(lastc);
|
||||
read_char();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
|
||||
{
|
||||
tmp_text.clear();
|
||||
tmp_text += lastc; // opening tag mark
|
||||
read_char();
|
||||
|
||||
SkipWhiteLines(&tmp_text);
|
||||
|
||||
if( IsClosingTagIndicator(lastc) )
|
||||
{
|
||||
tmp_text += lastc;
|
||||
read_char();
|
||||
SkipWhiteLines(&tmp_text);
|
||||
ReadItemName(tmp_name);
|
||||
|
||||
if( IsNameEqual(tmp_name, LastItem().name) )
|
||||
{
|
||||
SkipAndCheckClosingTag();
|
||||
|
||||
if( put_closing_tag_as_well )
|
||||
{
|
||||
Put('<');
|
||||
Put('/');
|
||||
Put(tmp_name);
|
||||
Put('>');
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
Put(tmp_text);
|
||||
Put(tmp_name);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Put(tmp_text);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// used for such tags as: script, pre, textarea
|
||||
void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
||||
{
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( IsOpeningTagMark(lastc) )
|
||||
{
|
||||
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
|
||||
{
|
||||
//CheckNewLine();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Put(lastc);
|
||||
read_char();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// reading text between html tags
|
||||
void HTMLParser::ReadText()
|
||||
{
|
||||
|
@ -1026,8 +1129,6 @@ void HTMLParser::ReadText()
|
|||
|
||||
bool was_non_white_text = false;
|
||||
|
||||
was_ending_commentary = false;
|
||||
|
||||
bool allow_put_new_line = false;
|
||||
bool allow_put_space = false;
|
||||
|
||||
|
@ -1061,9 +1162,6 @@ void HTMLParser::ReadText()
|
|||
}
|
||||
else
|
||||
{
|
||||
if( was_ending_commentary )
|
||||
break;
|
||||
|
||||
PutNormalWhite(was_white_char, was_new_line);
|
||||
|
||||
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
|
||||
|
@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem()
|
|||
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
|
||||
LastItem().tree_index += 1;
|
||||
|
||||
if( was_ending_commentary )
|
||||
{
|
||||
LastItem().type = Item::closing;
|
||||
LastItem().is_commentary = true;
|
||||
LastItem().name = L"--";
|
||||
was_ending_commentary = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
read_char(); // skipping the first opening tag mark '<'
|
||||
SkipWhiteLines();
|
||||
read_char(); // skipping the first opening tag mark '<'
|
||||
SkipWhiteLines();
|
||||
|
||||
if( IsSpecialTagIndicator(lastc) )
|
||||
ReadItemSpecial();
|
||||
else
|
||||
if( IsClosingTagIndicator(lastc) )
|
||||
ReadItemClosing();
|
||||
else
|
||||
ReadItemOpening();
|
||||
}
|
||||
if( IsSpecialTagIndicator(lastc) )
|
||||
ReadItemSpecial();
|
||||
else
|
||||
if( IsClosingTagIndicator(lastc) )
|
||||
ReadItemClosing();
|
||||
else
|
||||
ReadItemOpening();
|
||||
|
||||
// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
|
||||
ItemFound();
|
||||
|
@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
|
|||
bool change_white_mode = false;
|
||||
|
||||
// in safe_mode the script tag is ignored
|
||||
if( !safe_mode && IsNameEqual(item.name, L"script") )
|
||||
{
|
||||
change_white_mode = true;
|
||||
}
|
||||
|
||||
if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
|
||||
// if( !safe_mode && IsNameEqual(item.name, L"script") )
|
||||
// {
|
||||
// change_white_mode = true;
|
||||
// }
|
||||
|
||||
// if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
|
||||
// {
|
||||
// change_white_mode = true;
|
||||
// }
|
||||
|
||||
if( IsNameEqual(item.name, L"pre") )
|
||||
{
|
||||
change_white_mode = true;
|
||||
}
|
||||
|
||||
// move to CheckDifferentContentExceptions?
|
||||
if( IsNameEqual(item.name, no_filter_tag) )
|
||||
{
|
||||
change_white_mode = true;
|
||||
|
@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
|
|||
|
||||
|
||||
|
||||
// Handles opening tags whose content is copied to the output without parsing:
//  - script (only when safe_mode is off),
//  - textarea.
// For such a tag the content together with the closing tag is copied by
// ReadTextUntilClosingTag(true) and the item is removed from the stack.
//
// item - the item to check; ReadLoop() passes LastItem() here and the code below
//        relies on it, as both ReadTextUntilClosingTag() and PopStack() work on
//        the last item of the stack.
void HTMLParser::CheckDifferentContentExceptions(Item & item)
{
    // both names are tested before PopStack() is called: after the call
    // 'item' refers to an entry which is no longer on the stack and must not be read
    const bool is_script   = !safe_mode && IsNameEqual(item.name, L"script");
    const bool is_textarea = !is_script && IsNameEqual(item.name, L"textarea");

    if( is_script || is_textarea )
    {
        ReadTextUntilClosingTag(true);
        PopStack();
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void HTMLParser::AddForgottenTags()
|
||||
{
|
||||
|
@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop()
|
|||
{
|
||||
if( LastItem().type == Item::opening )
|
||||
{
|
||||
CheckSingleItemExceptions();
|
||||
if( parsing_html )
|
||||
{
|
||||
CheckSingleItemExceptions();
|
||||
}
|
||||
|
||||
CheckWhiteCharsExceptions(LastItem());
|
||||
CheckDifferentContentExceptions(LastItem());
|
||||
}
|
||||
else
|
||||
if( LastItem().type == Item::special )
|
||||
{
|
||||
if( !LastItem().is_commentary )
|
||||
PopStack();
|
||||
if( LastItem().is_commentary )
|
||||
ReadTextUntilClosingCommentary();
|
||||
|
||||
PopStack();
|
||||
}
|
||||
else
|
||||
if( LastItem().type == Item::simple )
|
||||
|
@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop()
|
|||
}
|
||||
|
||||
ReadText();
|
||||
|
||||
is_first_item = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -172,6 +172,12 @@ public:
|
|||
|
||||
protected:
|
||||
|
||||
/*
|
||||
* true when parsing html input, false for parsing xml
|
||||
*/
|
||||
bool parsing_html;
|
||||
|
||||
|
||||
// orphans for one language
|
||||
struct Orphans
|
||||
{
|
||||
|
@ -246,14 +252,13 @@ protected:
|
|||
virtual bool IsStartingEntityMark(wchar_t c);
|
||||
virtual bool IsEndingEntityMark(wchar_t c);
|
||||
|
||||
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
|
||||
|
||||
virtual bool IsValidCharForName(int c);
|
||||
virtual bool IsValidCharForAttrName(int c);
|
||||
virtual bool IsValidCharForEntityName(int c);
|
||||
|
||||
virtual void CheckSingleItemExceptions();
|
||||
virtual void CheckWhiteCharsExceptions(Item & item);
|
||||
virtual void CheckDifferentContentExceptions(Item & item);
|
||||
|
||||
virtual void Put(wchar_t c);
|
||||
virtual void Put(const wchar_t * str, const wchar_t * end);
|
||||
|
@ -299,12 +304,15 @@ protected:
|
|||
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
|
||||
|
||||
bool IsWhite(int c);
|
||||
void SkipWhite();
|
||||
void SkipWhiteLines();
|
||||
void SkipWhite(std::wstring * out_string = nullptr);
|
||||
void SkipWhiteLines(std::wstring * out_string = nullptr);
|
||||
void SkipWhiteWithFirstNewLine();
|
||||
|
||||
int current_white_char_mode();
|
||||
|
||||
void ReadTextUntilClosingCommentary();
|
||||
bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
|
||||
void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
|
||||
void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
|
||||
|
||||
void PopStack();
|
||||
|
@ -354,7 +362,6 @@ protected:
|
|||
bool is_first_item;
|
||||
size_t wrap_line; // insert a new line character into long lines
|
||||
size_t tab_size;
|
||||
bool was_ending_commentary;
|
||||
OrphanMode orphan_mode;
|
||||
std::wstring attr_name;
|
||||
std::vector<std::wstring> attr_value;
|
||||
|
|
Loading…
Reference in New Issue