HTMLParser now correctly parses the following entities: &amp; &lt; &gt; &quot; &apos;

This commit is contained in:
Tomasz Sowa 2021-12-02 17:44:41 +01:00
parent 2dadfc0809
commit b781948f21
2 changed files with 155 additions and 25 deletions

View File

@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
out_stream = nullptr;
out_space = nullptr;
line_len = 0;
char_was_escaped = false;
escaped_chars_buffer.clear();
escaped_char_index = 0;
}
@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
while( lastc != -1 )
{
if( lastc == '"' || lastc == '\'' )
if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
{
if( is_quoted )
{
@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
LastItem().type = Item::simple;
}
else
if( !is_quoted && IsClosingTagMark(lastc) )
if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
{
read_char();
break;
@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
while( lastc != -1 )
{
if( has_quote )
if( !char_was_escaped )
{
if( lastc == quote_char )
break;
}
else
{
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
break;
if( has_quote )
{
if( lastc == quote_char )
break;
}
else
{
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
break;
}
}
if( lastc==10 || IsWhite(lastc) )
@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
while( lastc != -1 )
{
if( has_quote )
if( !char_was_escaped )
{
if( lastc == quote_char )
break;
}
else
{
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
break;
if( has_quote )
{
if( lastc == quote_char )
break;
}
else
{
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
break;
}
}
// IMPROVEME add support for analyze_entities?
@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
{
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
{
str += lastc;
read_char();
@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
tmp_text += lastc;
read_char();
if( IsClosingTagMark(lastc) )
if( !char_was_escaped && IsClosingTagMark(lastc) )
{
tmp_text += lastc;
read_char();
@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
{
while( lastc != -1 )
{
if( IsOpeningTagMark(lastc) )
if( !char_was_escaped && IsOpeningTagMark(lastc) )
{
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
{
@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
text_space_wstr = &wstr_space.value.value_wstring;
}
while( lastc != -1 && !IsOpeningTagMark(lastc) )
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
{
tmp_text.clear();
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
read_char(); // skipping '='
SkipWhiteLines();
bool has_quote = (lastc == '\"' || lastc == '\'');
bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
wchar_t quote_char = lastc;
if( has_quote )
@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
else
ReadXMLItemAttrValue(has_quote, quote_char);
if( has_quote && lastc == quote_char )
if( has_quote && !char_was_escaped && lastc == quote_char )
read_char(); // skipping the last quote mark
return true;
@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
}
/*
    Takes the next pending character from escaped_chars_buffer and stores it in lastc.

    When nothing is pending, lastc is set to -1.
    Once the last pending character has been handed out, the buffer and its
    read index are reset so that the following read_char() call goes back
    to the regular input.
*/
void HTMLParser::read_char_from_entity_buffer()
{
    // nothing left to replay
    if( escaped_char_index >= escaped_chars_buffer.size() )
    {
        lastc = -1;
        return;
    }

    lastc = escaped_chars_buffer[escaped_char_index++];

    // the buffer has just been drained: reset it for the next entity
    if( escaped_char_index >= escaped_chars_buffer.size() )
    {
        escaped_chars_buffer.clear();
        escaped_char_index = 0;
    }
}
/*
    Collects a candidate entity into escaped_chars_buffer.

    The buffer is restarted with a leading '&' and then filled with raw
    characters (read with read_char_no_escape()) until one of these happens:
     - a ';' has been stored,
     - the buffer has reached the length of the longest supported entity,
     - lastc is -1 after a read (that value is not stored).

    The method only gathers characters; whether they form a known entity
    is decided by check_escape_sequentions().

    NOTE(review): every gathered character is stored as-is, so a second '&'
    inside this look-ahead window is not examined as the start of another entity.
*/
void HTMLParser::read_xml_entity()
{
    const size_t entity_length_limit = 6; // length of "&apos;" string

    escaped_char_index = 0;
    escaped_chars_buffer.assign(1, L'&');

    for(;;)
    {
        read_char_no_escape();

        if( lastc == -1 )
            break;

        escaped_chars_buffer += lastc;

        if( lastc == ';' || escaped_chars_buffer.size() >= entity_length_limit )
            break;
    }
}
/*
    Checks whether escaped_chars_buffer holds exactly one of the supported
    entities: &amp; &lt; &gt; &quot; &apos;

    On a match lastc is set to the decoded character, char_was_escaped is
    set to true and the buffer (with its read index) is reset.
    Without a match the buffer is left untouched so it can be replayed.

    Returns the current value of char_was_escaped. The flag is not reset here:
    the caller is expected to have set it to false beforehand (read_char() does).
*/
bool HTMLParser::check_escape_sequentions()
{
    struct EntityMapping
    {
        const wchar_t * name;
        wchar_t decoded;
    };

    static const EntityMapping known_entities[] = {
        { L"&amp;",  L'&'  },
        { L"&lt;",   L'<'  },
        { L"&gt;",   L'>'  },
        { L"&quot;", L'"'  },
        { L"&apos;", L'\'' },
    };

    for( const EntityMapping & entity : known_entities )
    {
        if( escaped_chars_buffer == entity.name )
        {
            lastc = entity.decoded;
            char_was_escaped = true;
            break;
        }
    }

    if( char_was_escaped )
    {
        // the whole sequence has been consumed, nothing to replay
        escaped_chars_buffer.clear();
        escaped_char_index = 0;
    }

    return char_was_escaped;
}
/*
    Reads the next character into lastc and returns it.

    char_was_escaped is cleared first and is set again only when the
    character comes from a decoded entity.

    Order of sources:
     1. characters still pending in escaped_chars_buffer (an earlier sequence
        that was not a supported entity) are replayed one by one,
     2. otherwise a raw character is read; if it is '&' the following
        characters are gathered and either decoded to a single character
        or, when they are not a supported entity, replayed starting with the '&'.
*/
int HTMLParser::read_char()
{
    char_was_escaped = false;

    // still replaying a sequence which was not a supported entity
    if( escaped_char_index < escaped_chars_buffer.size() )
    {
        read_char_from_entity_buffer();
        return lastc;
    }

    read_char_no_escape();

    if( lastc == '&' )
    {
        read_xml_entity();
        const bool entity_decoded = check_escape_sequentions();

        // not an entity: hand the gathered characters out unchanged
        if( !entity_decoded )
            read_char_from_entity_buffer();
    }

    return lastc;
}
void HTMLParser::Read()
{

View File

@ -295,8 +295,14 @@ protected:
ItemParsedListener * item_parsed_listener;
/*
true if lastc was produced by decoding one of the entities: &amp; &lt; &gt; &quot; &apos;
we have to know whether the last sequence was e.g. &quot; or just "
*/
bool char_was_escaped;
std::wstring escaped_chars_buffer;
size_t escaped_char_index;
void clear_input_flags();
@ -422,6 +428,11 @@ protected:
bool RemoveIfNeeded(size_t index);
bool check_escape_sequentions();
void read_xml_entity();
void read_char_from_entity_buffer();
int read_char() override;
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack