HTMLParser now correctly parses these entities: &amp; &lt; &gt; &quot; &apos;
This commit is contained in:
parent
2dadfc0809
commit
b781948f21
|
@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
|
||||||
out_stream = nullptr;
|
out_stream = nullptr;
|
||||||
out_space = nullptr;
|
out_space = nullptr;
|
||||||
line_len = 0;
|
line_len = 0;
|
||||||
|
char_was_escaped = false;
|
||||||
|
escaped_chars_buffer.clear();
|
||||||
|
escaped_char_index = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
||||||
|
|
||||||
while( lastc != -1 )
|
while( lastc != -1 )
|
||||||
{
|
{
|
||||||
if( lastc == '"' || lastc == '\'' )
|
if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
|
||||||
{
|
{
|
||||||
if( is_quoted )
|
if( is_quoted )
|
||||||
{
|
{
|
||||||
|
@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
||||||
LastItem().type = Item::simple;
|
LastItem().type = Item::simple;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
if( !is_quoted && IsClosingTagMark(lastc) )
|
if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
|
||||||
{
|
{
|
||||||
read_char();
|
read_char();
|
||||||
break;
|
break;
|
||||||
|
@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||||
|
|
||||||
while( lastc != -1 )
|
while( lastc != -1 )
|
||||||
{
|
{
|
||||||
if( has_quote )
|
if( !char_was_escaped )
|
||||||
{
|
{
|
||||||
if( lastc == quote_char )
|
if( has_quote )
|
||||||
break;
|
{
|
||||||
}
|
if( lastc == quote_char )
|
||||||
else
|
break;
|
||||||
{
|
}
|
||||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
else
|
||||||
break;
|
{
|
||||||
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if( lastc==10 || IsWhite(lastc) )
|
if( lastc==10 || IsWhite(lastc) )
|
||||||
|
@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||||
|
|
||||||
while( lastc != -1 )
|
while( lastc != -1 )
|
||||||
{
|
{
|
||||||
if( has_quote )
|
if( !char_was_escaped )
|
||||||
{
|
{
|
||||||
if( lastc == quote_char )
|
if( has_quote )
|
||||||
break;
|
{
|
||||||
}
|
if( lastc == quote_char )
|
||||||
else
|
break;
|
||||||
{
|
}
|
||||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
else
|
||||||
break;
|
{
|
||||||
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// IMPROVEME add support for analyze_entities?
|
// IMPROVEME add support for analyze_entities?
|
||||||
|
@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
|
||||||
|
|
||||||
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
||||||
{
|
{
|
||||||
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
|
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||||
{
|
{
|
||||||
str += lastc;
|
str += lastc;
|
||||||
read_char();
|
read_char();
|
||||||
|
@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
|
||||||
tmp_text += lastc;
|
tmp_text += lastc;
|
||||||
read_char();
|
read_char();
|
||||||
|
|
||||||
if( IsClosingTagMark(lastc) )
|
if( !char_was_escaped && IsClosingTagMark(lastc) )
|
||||||
{
|
{
|
||||||
tmp_text += lastc;
|
tmp_text += lastc;
|
||||||
read_char();
|
read_char();
|
||||||
|
@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
||||||
{
|
{
|
||||||
while( lastc != -1 )
|
while( lastc != -1 )
|
||||||
{
|
{
|
||||||
if( IsOpeningTagMark(lastc) )
|
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
||||||
{
|
{
|
||||||
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
|
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
|
||||||
{
|
{
|
||||||
|
@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
|
||||||
text_space_wstr = &wstr_space.value.value_wstring;
|
text_space_wstr = &wstr_space.value.value_wstring;
|
||||||
}
|
}
|
||||||
|
|
||||||
while( lastc != -1 && !IsOpeningTagMark(lastc) )
|
while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
|
||||||
{
|
{
|
||||||
tmp_text.clear();
|
tmp_text.clear();
|
||||||
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
|
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
|
||||||
|
@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
|
||||||
read_char(); // skipping '='
|
read_char(); // skipping '='
|
||||||
SkipWhiteLines();
|
SkipWhiteLines();
|
||||||
|
|
||||||
bool has_quote = (lastc == '\"' || lastc == '\'');
|
bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
|
||||||
wchar_t quote_char = lastc;
|
wchar_t quote_char = lastc;
|
||||||
|
|
||||||
if( has_quote )
|
if( has_quote )
|
||||||
|
@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
|
||||||
else
|
else
|
||||||
ReadXMLItemAttrValue(has_quote, quote_char);
|
ReadXMLItemAttrValue(has_quote, quote_char);
|
||||||
|
|
||||||
if( has_quote && lastc == quote_char )
|
if( has_quote && !char_was_escaped && lastc == quote_char )
|
||||||
read_char(); // skipping the last quote mark
|
read_char(); // skipping the last quote mark
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void HTMLParser::read_char_from_entity_buffer()
|
||||||
|
{
|
||||||
|
if( escaped_char_index < escaped_chars_buffer.size() )
|
||||||
|
{
|
||||||
|
lastc = escaped_chars_buffer[escaped_char_index];
|
||||||
|
escaped_char_index += 1;
|
||||||
|
|
||||||
|
if( escaped_char_index >= escaped_chars_buffer.size() )
|
||||||
|
{
|
||||||
|
escaped_chars_buffer.clear();
|
||||||
|
escaped_char_index = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lastc = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void HTMLParser::read_xml_entity()
|
||||||
|
{
|
||||||
|
const size_t max_entity_length = 6; // length of "'" string
|
||||||
|
escaped_chars_buffer.clear();
|
||||||
|
escaped_char_index = 0;
|
||||||
|
escaped_chars_buffer += '&';
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
read_char_no_escape();
|
||||||
|
|
||||||
|
if( lastc != -1 )
|
||||||
|
{
|
||||||
|
escaped_chars_buffer += lastc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool HTMLParser::check_escape_sequentions()
|
||||||
|
{
|
||||||
|
if( escaped_chars_buffer == L"&" )
|
||||||
|
{
|
||||||
|
lastc = '&';
|
||||||
|
char_was_escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( escaped_chars_buffer == L"<" )
|
||||||
|
{
|
||||||
|
lastc = '<';
|
||||||
|
char_was_escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( escaped_chars_buffer == L">" )
|
||||||
|
{
|
||||||
|
lastc = '>';
|
||||||
|
char_was_escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( escaped_chars_buffer == L""" )
|
||||||
|
{
|
||||||
|
lastc = '"';
|
||||||
|
char_was_escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( escaped_chars_buffer == L"'" )
|
||||||
|
{
|
||||||
|
lastc = '\'';
|
||||||
|
char_was_escaped = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( char_was_escaped )
|
||||||
|
{
|
||||||
|
escaped_chars_buffer.clear();
|
||||||
|
escaped_char_index = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return char_was_escaped;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int HTMLParser::read_char()
|
||||||
|
{
|
||||||
|
char_was_escaped = false;
|
||||||
|
|
||||||
|
if( escaped_char_index < escaped_chars_buffer.size() )
|
||||||
|
{
|
||||||
|
read_char_from_entity_buffer();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_char_no_escape();
|
||||||
|
|
||||||
|
if( lastc == '&' )
|
||||||
|
{
|
||||||
|
read_xml_entity();
|
||||||
|
|
||||||
|
if( !check_escape_sequentions() )
|
||||||
|
{
|
||||||
|
read_char_from_entity_buffer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return lastc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void HTMLParser::Read()
|
void HTMLParser::Read()
|
||||||
{
|
{
|
||||||
|
|
|
@ -295,8 +295,14 @@ protected:
|
||||||
|
|
||||||
ItemParsedListener * item_parsed_listener;
|
ItemParsedListener * item_parsed_listener;
|
||||||
|
|
||||||
|
/*
|
||||||
|
true if lastc was decoded from an entity (e.g. &lt; or &quot;) rather than read as a literal character
|
||||||
|
we have to know if the last sequence was &quot; or just "
|
||||||
|
*/
|
||||||
|
bool char_was_escaped;
|
||||||
|
|
||||||
|
std::wstring escaped_chars_buffer;
|
||||||
|
size_t escaped_char_index;
|
||||||
|
|
||||||
|
|
||||||
void clear_input_flags();
|
void clear_input_flags();
|
||||||
|
@ -422,6 +428,11 @@ protected:
|
||||||
|
|
||||||
bool RemoveIfNeeded(size_t index);
|
bool RemoveIfNeeded(size_t index);
|
||||||
|
|
||||||
|
bool check_escape_sequentions();
|
||||||
|
void read_xml_entity();
|
||||||
|
void read_char_from_entity_buffer();
|
||||||
|
int read_char() override;
|
||||||
|
|
||||||
Item empty;
|
Item empty;
|
||||||
Item * pstack; // stack pointer
|
Item * pstack; // stack pointer
|
||||||
size_t stack_len; // length of the stack
|
size_t stack_len; // length of the stack
|
||||||
|
|
Loading…
Reference in New Issue