HTMLParser now correctly parses the following entities: &amp;amp; &amp;lt; &amp;gt; &amp;quot; &amp;apos; (i.e. the escaped forms of & < > " ')
parent
2dadfc0809
commit
b781948f21
|
@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
|
|||
out_stream = nullptr;
|
||||
out_space = nullptr;
|
||||
line_len = 0;
|
||||
char_was_escaped = false;
|
||||
escaped_chars_buffer.clear();
|
||||
escaped_char_index = 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|||
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( lastc == '"' || lastc == '\'' )
|
||||
if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
|
||||
{
|
||||
if( is_quoted )
|
||||
{
|
||||
|
@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|||
LastItem().type = Item::simple;
|
||||
}
|
||||
else
|
||||
if( !is_quoted && IsClosingTagMark(lastc) )
|
||||
if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
|
||||
{
|
||||
read_char();
|
||||
break;
|
||||
|
@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|||
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( has_quote )
|
||||
if( !char_was_escaped )
|
||||
{
|
||||
if( lastc == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||
break;
|
||||
if( has_quote )
|
||||
{
|
||||
if( lastc == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( lastc==10 || IsWhite(lastc) )
|
||||
|
@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
|||
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( has_quote )
|
||||
if( !char_was_escaped )
|
||||
{
|
||||
if( lastc == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||
break;
|
||||
if( has_quote )
|
||||
{
|
||||
if( lastc == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// IMPROVEME add support for analyze_entities?
|
||||
|
@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
|
|||
|
||||
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
||||
{
|
||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
|
||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||
{
|
||||
str += lastc;
|
||||
read_char();
|
||||
|
@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
|
|||
tmp_text += lastc;
|
||||
read_char();
|
||||
|
||||
if( IsClosingTagMark(lastc) )
|
||||
if( !char_was_escaped && IsClosingTagMark(lastc) )
|
||||
{
|
||||
tmp_text += lastc;
|
||||
read_char();
|
||||
|
@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
|||
{
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( IsOpeningTagMark(lastc) )
|
||||
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
||||
{
|
||||
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
|
||||
{
|
||||
|
@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
|
|||
text_space_wstr = &wstr_space.value.value_wstring;
|
||||
}
|
||||
|
||||
while( lastc != -1 && !IsOpeningTagMark(lastc) )
|
||||
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||
{
|
||||
tmp_text.clear();
|
||||
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
|
||||
|
@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
|
|||
read_char(); // skipping '='
|
||||
SkipWhiteLines();
|
||||
|
||||
bool has_quote = (lastc == '\"' || lastc == '\'');
|
||||
bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
|
||||
wchar_t quote_char = lastc;
|
||||
|
||||
if( has_quote )
|
||||
|
@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
|
|||
else
|
||||
ReadXMLItemAttrValue(has_quote, quote_char);
|
||||
|
||||
if( has_quote && lastc == quote_char )
|
||||
if( has_quote && !char_was_escaped && lastc == quote_char )
|
||||
read_char(); // skipping the last quote mark
|
||||
|
||||
return true;
|
||||
|
@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
|
|||
}
|
||||
|
||||
|
||||
void HTMLParser::read_char_from_entity_buffer()
|
||||
{
|
||||
if( escaped_char_index < escaped_chars_buffer.size() )
|
||||
{
|
||||
lastc = escaped_chars_buffer[escaped_char_index];
|
||||
escaped_char_index += 1;
|
||||
|
||||
if( escaped_char_index >= escaped_chars_buffer.size() )
|
||||
{
|
||||
escaped_chars_buffer.clear();
|
||||
escaped_char_index = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
lastc = -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void HTMLParser::read_xml_entity()
|
||||
{
|
||||
const size_t max_entity_length = 6; // length of "'" string
|
||||
escaped_chars_buffer.clear();
|
||||
escaped_char_index = 0;
|
||||
escaped_chars_buffer += '&';
|
||||
|
||||
do
|
||||
{
|
||||
read_char_no_escape();
|
||||
|
||||
if( lastc != -1 )
|
||||
{
|
||||
escaped_chars_buffer += lastc;
|
||||
}
|
||||
}
|
||||
while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' );
|
||||
}
|
||||
|
||||
|
||||
bool HTMLParser::check_escape_sequentions()
|
||||
{
|
||||
if( escaped_chars_buffer == L"&" )
|
||||
{
|
||||
lastc = '&';
|
||||
char_was_escaped = true;
|
||||
}
|
||||
else
|
||||
if( escaped_chars_buffer == L"<" )
|
||||
{
|
||||
lastc = '<';
|
||||
char_was_escaped = true;
|
||||
}
|
||||
else
|
||||
if( escaped_chars_buffer == L">" )
|
||||
{
|
||||
lastc = '>';
|
||||
char_was_escaped = true;
|
||||
}
|
||||
else
|
||||
if( escaped_chars_buffer == L""" )
|
||||
{
|
||||
lastc = '"';
|
||||
char_was_escaped = true;
|
||||
}
|
||||
else
|
||||
if( escaped_chars_buffer == L"'" )
|
||||
{
|
||||
lastc = '\'';
|
||||
char_was_escaped = true;
|
||||
}
|
||||
|
||||
if( char_was_escaped )
|
||||
{
|
||||
escaped_chars_buffer.clear();
|
||||
escaped_char_index = 0;
|
||||
}
|
||||
|
||||
return char_was_escaped;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int HTMLParser::read_char()
|
||||
{
|
||||
char_was_escaped = false;
|
||||
|
||||
if( escaped_char_index < escaped_chars_buffer.size() )
|
||||
{
|
||||
read_char_from_entity_buffer();
|
||||
}
|
||||
else
|
||||
{
|
||||
read_char_no_escape();
|
||||
|
||||
if( lastc == '&' )
|
||||
{
|
||||
read_xml_entity();
|
||||
|
||||
if( !check_escape_sequentions() )
|
||||
{
|
||||
read_char_from_entity_buffer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return lastc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void HTMLParser::Read()
|
||||
{
|
||||
|
|
|
@ -295,8 +295,14 @@ protected:
|
|||
|
||||
ItemParsedListener * item_parsed_listener;
|
||||
|
||||
/*
|
||||
true if the lastc was escaped (in this parser: decoded from an entity such as &amp;quot;)
|
||||
we have to know if the last sequence was an escaped quote (&amp;quot;) or just a literal "
|
||||
*/
|
||||
bool char_was_escaped;
|
||||
|
||||
|
||||
std::wstring escaped_chars_buffer;
|
||||
size_t escaped_char_index;
|
||||
|
||||
|
||||
void clear_input_flags();
|
||||
|
@ -422,6 +428,11 @@ protected:
|
|||
|
||||
bool RemoveIfNeeded(size_t index);
|
||||
|
||||
bool check_escape_sequentions();
|
||||
void read_xml_entity();
|
||||
void read_char_from_entity_buffer();
|
||||
int read_char() override;
|
||||
|
||||
Item empty;
|
||||
Item * pstack; // stack pointer
|
||||
size_t stack_len; // length of the stack
|
||||
|
|
Loading…
Reference in New Issue