diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 9b24071..4983010 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
out_stream = nullptr;
out_space = nullptr;
line_len = 0;
+ char_was_escaped = false;
+ escaped_chars_buffer.clear();
+ escaped_char_index = 0;
}
@@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
while( lastc != -1 )
{
- if( lastc == '"' || lastc == '\'' )
+ if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
{
if( is_quoted )
{
@@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
LastItem().type = Item::simple;
}
else
- if( !is_quoted && IsClosingTagMark(lastc) )
+ if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
{
read_char();
break;
@@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
while( lastc != -1 )
{
- if( has_quote )
+ if( !char_was_escaped )
{
- if( lastc == quote_char )
- break;
- }
- else
- {
- if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
- break;
+ if( has_quote )
+ {
+ if( lastc == quote_char )
+ break;
+ }
+ else
+ {
+ if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+ break;
+ }
}
if( lastc==10 || IsWhite(lastc) )
@@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
while( lastc != -1 )
{
- if( has_quote )
+ if( !char_was_escaped )
{
- if( lastc == quote_char )
- break;
- }
- else
- {
- if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
- break;
+ if( has_quote )
+ {
+ if( lastc == quote_char )
+ break;
+ }
+ else
+ {
+ if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+ break;
+ }
}
// IMPROVEME add support for analyze_entities?
@@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
{
- while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
+ while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
{
str += lastc;
read_char();
@@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
tmp_text += lastc;
read_char();
- if( IsClosingTagMark(lastc) )
+ if( !char_was_escaped && IsClosingTagMark(lastc) )
{
tmp_text += lastc;
read_char();
@@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
{
while( lastc != -1 )
{
- if( IsOpeningTagMark(lastc) )
+ if( !char_was_escaped && IsOpeningTagMark(lastc) )
{
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
{
@@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
text_space_wstr = &wstr_space.value.value_wstring;
}
- while( lastc != -1 && !IsOpeningTagMark(lastc) )
+ while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
{
tmp_text.clear();
PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
@@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
read_char(); // skipping '='
SkipWhiteLines();
- bool has_quote = (lastc == '\"' || lastc == '\'');
+ bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
wchar_t quote_char = lastc;
if( has_quote )
@@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
else
ReadXMLItemAttrValue(has_quote, quote_char);
- if( has_quote && lastc == quote_char )
+ if( has_quote && !char_was_escaped && lastc == quote_char )
read_char(); // skipping the last quote mark
return true;
@@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
}
+/*
+ * Hands out the next pending character of escaped_chars_buffer through lastc.
+ *
+ * When the buffer has nothing left at escaped_char_index, lastc is set to -1.
+ * After the last pending character was handed out the buffer is emptied and
+ * escaped_char_index goes back to zero.
+ */
+void HTMLParser::read_char_from_entity_buffer()
+{
+    const size_t pending_size = escaped_chars_buffer.size();
+
+    if( escaped_char_index >= pending_size )
+    {
+        // nothing buffered
+        lastc = -1;
+        return;
+    }
+
+    lastc = escaped_chars_buffer[escaped_char_index];
+    escaped_char_index += 1;
+
+    if( escaped_char_index >= pending_size )
+    {
+        // that was the last buffered character, start over with an empty buffer
+        escaped_chars_buffer.clear();
+        escaped_char_index = 0;
+    }
+}
+
+
+/*
+ * Collects a candidate XML entity in escaped_chars_buffer.
+ *
+ * The buffer is reset to "&" and escaped_char_index to zero, then characters
+ * are appended with read_char_no_escape() until a ';' has been stored, the
+ * end of input (-1) is reported or the buffer holds max_entity_length
+ * characters. The end-of-input marker itself is never stored.
+ */
+void HTMLParser::read_xml_entity()
+{
+    const size_t max_entity_length = 6; // length of "&apos;" (and "&quot;")
+
+    escaped_char_index = 0;
+    escaped_chars_buffer.clear();
+    escaped_chars_buffer += '&';
+
+    for( ;; )
+    {
+        read_char_no_escape();
+
+        if( lastc == -1 )
+            break;
+
+        escaped_chars_buffer += lastc;
+
+        if( lastc == ';' || escaped_chars_buffer.size() >= max_entity_length )
+            break;
+    }
+}
+
+
+/*
+ * Tries to interpret the text collected in escaped_chars_buffer as one of the
+ * five predefined XML entities: &amp; &lt; &gt; &quot; &apos;
+ *
+ * On a match lastc is set to the decoded character, char_was_escaped is set
+ * to true and the buffer is emptied. Without a match nothing is modified, so
+ * the caller can still replay the buffered characters literally.
+ *
+ * char_was_escaped is only raised here, never lowered: read_char() resets it
+ * before each character is read.
+ *
+ * returns the value of char_was_escaped
+ */
+bool HTMLParser::check_escape_sequentions()
+{
+    struct Entity
+    {
+        const wchar_t * name;
+        wchar_t decoded;
+    };
+
+    // the whole sequence is compared, including the leading '&' and the trailing ';'
+    static const Entity entities[] = {
+        { L"&amp;",  L'&'  },
+        { L"&lt;",   L'<'  },
+        { L"&gt;",   L'>'  },
+        { L"&quot;", L'"'  },
+        { L"&apos;", L'\'' },
+    };
+
+    for( const Entity & entity : entities )
+    {
+        if( escaped_chars_buffer == entity.name )
+        {
+            lastc = entity.decoded;
+            char_was_escaped = true;
+            break;
+        }
+    }
+
+    if( char_was_escaped )
+    {
+        // the entity has been consumed, nothing is left to replay
+        escaped_chars_buffer.clear();
+        escaped_char_index = 0;
+    }
+
+    return char_was_escaped;
+}
+
+
+
+/*
+ * Reads the next character into lastc and returns it.
+ *
+ * char_was_escaped is reset first. Characters still pending in
+ * escaped_chars_buffer are handed out before any new input is read.
+ * When a freshly read character is '&', the following characters are
+ * collected with read_xml_entity(); if check_escape_sequentions() recognises
+ * them, lastc holds the decoded character and char_was_escaped is true,
+ * otherwise the collected characters are replayed one by one, starting with
+ * the '&' itself.
+ *
+ * Replayed characters are not scanned again, so an '&' that was consumed as
+ * lookahead does not start a new entity.
+ */
+int HTMLParser::read_char()
+{
+    char_was_escaped = false;
+
+    const bool has_pending_chars = escaped_char_index < escaped_chars_buffer.size();
+
+    if( has_pending_chars )
+    {
+        read_char_from_entity_buffer();
+        return lastc;
+    }
+
+    read_char_no_escape();
+
+    if( lastc != '&' )
+        return lastc;
+
+    read_xml_entity();
+
+    if( !check_escape_sequentions() )
+    {
+        // not an entity: give back the '&' now, the rest on the next calls
+        read_char_from_entity_buffer();
+    }
+
+    return lastc;
+}
+
+
void HTMLParser::Read()
{
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index da0074e..0f352a9 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -295,8 +295,14 @@ protected:
ItemParsedListener * item_parsed_listener;
+ /*
+ true if lastc was decoded from an XML entity (e.g. &quot;) and not read literally
+ we have to know if the last sequence was &quot; or just "
+ */
+ bool char_was_escaped;
-
+ // characters read ahead while looking for an XML entity (starting with the '&'),
+ // and the index of the next one read_char() hands out before reading new input
+ std::wstring escaped_chars_buffer;
+ size_t escaped_char_index;
void clear_input_flags();
@@ -422,6 +428,11 @@ protected:
bool RemoveIfNeeded(size_t index);
+ // compares escaped_chars_buffer with the known escape sequences; on a match
+ // sets lastc and char_was_escaped, empties the buffer and returns true
+ bool check_escape_sequentions();
+ // resets escaped_chars_buffer to "&" and appends the characters that follow in the input
+ void read_xml_entity();
+ // puts the next buffered character (or -1 when none is left) into lastc
+ void read_char_from_entity_buffer();
+ // reads the next character, serving buffered ones first; returns lastc
+ int read_char() override;
+
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack