HTMLParser now correctly parses the following entities: &amp; &lt; &gt; &quot; &apos;

2021-12-02 17:44:41 +01:00
parent 2dadfc0809
commit b781948f21
2 changed files with 155 additions and 25 deletions
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
 	out_stream       = nullptr;
 	out_space        = nullptr;
 	line_len         = 0;
+	char_was_escaped = false;
+	escaped_chars_buffer.clear();
+	escaped_char_index = 0;
 }


@@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)

 	while( lastc != -1 )
 	{
-		if( lastc == '"' || lastc == '\'' )
+		if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
 		{
 			if( is_quoted )
 			{
@@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 			LastItem().type = Item::simple;
 		}
 		else
-		if( !is_quoted && IsClosingTagMark(lastc) )
+		if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
 		{
 			read_char();
 			break;
@@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)

 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
-				break;
-		}
-		else
-		{
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
-				break;
+			if( has_quote )
+			{
+				if( lastc == quote_char )
+					break;
+			}
+			else
+			{
+				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+					break;
+			}
 		}

 		if( lastc==10 || IsWhite(lastc) )
@@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)

 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
-				break;
-		}
-		else
-		{
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
-				break;
+			if( has_quote )
+			{
+				if( lastc == quote_char )
+					break;
+			}
+			else
+			{
+				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+					break;
+			}
 		}

 		// IMPROVEME add support for analyze_entities?
@@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);

 void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
 {
-	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		str += lastc;
 		read_char();
@@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
 				tmp_text += lastc;
 				read_char();

-				if( IsClosingTagMark(lastc) )
+				if( !char_was_escaped && IsClosingTagMark(lastc) )
 				{
 					tmp_text += lastc;
 					read_char();
@@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 {
 	while( lastc != -1 )
 	{
-		if( IsOpeningTagMark(lastc) )
+		if( !char_was_escaped && IsOpeningTagMark(lastc) )
 		{
 			if( IsClosingTagForLastItem(put_closing_tag_as_well) )
 			{
@@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
 		text_space_wstr = &wstr_space.value.value_wstring;
 	}

-	while( lastc != -1 && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		tmp_text.clear();
 		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
@@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
 	read_char();				// skipping '='
 	SkipWhiteLines();

-	bool has_quote = (lastc == '\"' || lastc == '\'');
+	bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
 	wchar_t quote_char = lastc;

 	if( has_quote )
@@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
 	else
 		ReadXMLItemAttrValue(has_quote, quote_char);

-	if( has_quote && lastc == quote_char )
+	if( has_quote && !char_was_escaped && lastc == quote_char )
 		read_char();			// skipping the last quote mark

 return true;
@@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
 }


+// Pops the next character of escaped_chars_buffer into lastc (-1 if none is
+// left); the buffer and index are reset once the last character was taken.
+void HTMLParser::read_char_from_entity_buffer()
+{
+	if( escaped_char_index >= escaped_chars_buffer.size() )
+	{
+		lastc = -1;
+		return;
+	}
+
+	lastc = escaped_chars_buffer[escaped_char_index++];
+
+	if( escaped_char_index >= escaped_chars_buffer.size() )
+	{
+		escaped_chars_buffer.clear();
+		escaped_char_index = 0;
+	}
+}
+
+
+// Fills escaped_chars_buffer with '&' followed by raw characters, stopping
+// after ';', at end of input (-1) or when max_entity_length is reached.
+void HTMLParser::read_xml_entity()
+{
+	const size_t max_entity_length = 6; // length of "&apos;" string
+	escaped_chars_buffer = L"&";
+	escaped_char_index = 0;
+
+	for(;;)
+	{
+		read_char_no_escape();
+		if( lastc == -1 )
+			break;	// end of input is not stored in the buffer
+		escaped_chars_buffer += lastc;
+		if( lastc == ';' || escaped_chars_buffer.size() >= max_entity_length )
+			break;
+	}
+}
+
+
+/*
+	Matches escaped_chars_buffer against &amp; &lt; &gt; &quot; &apos;
+	On a match lastc gets the decoded character and char_was_escaped is set.
+	The buffer is emptied whenever char_was_escaped is true on exit.
+	Returns char_was_escaped.
+*/
+bool HTMLParser::check_escape_sequentions()
+{
+	struct Entity
+	{
+		const wchar_t * name;
+		wchar_t decoded;
+	};
+
+	static const Entity entities[] = {
+		{ L"&amp;",  L'&'  },
+		{ L"&lt;",   L'<'  },
+		{ L"&gt;",   L'>'  },
+		{ L"&quot;", L'"'  },
+		{ L"&apos;", L'\'' },
+	};
+
+	for( const Entity & entity : entities )
+	{
+		if( escaped_chars_buffer == entity.name )
+		{
+			lastc = entity.decoded;
+			char_was_escaped = true;
+			break;
+		}
+	}
+
+	if( char_was_escaped )
+	{
+		escaped_chars_buffer.clear();
+		escaped_char_index = 0;
+	}
+
+	return char_was_escaped;
+}
+
+
+
+/*
+	Reads the next character into lastc, decoding &amp; &lt; &gt; &quot; &apos;
+	(char_was_escaped is then true); other '&' sequences are replayed unchanged.
+*/
+int HTMLParser::read_char()
+{
+	char_was_escaped = false;
+
+	if( escaped_char_index < escaped_chars_buffer.size() )
+	{
+		read_char_from_entity_buffer();	// still replaying a rejected sequence
+		return lastc;
+	}
+
+	read_char_no_escape();
+
+	if( lastc != '&' )
+		return lastc;
+
+	read_xml_entity();
+	if( !check_escape_sequentions() )
+		read_char_from_entity_buffer();	// not an entity: hand back the '&'
+
+	return lastc;
+}
+
+

 void HTMLParser::Read()
 {
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -295,8 +295,14 @@ protected:

 	ItemParsedListener * item_parsed_listener;

+	/*
+		true if lastc was decoded from an entity (&amp; &lt; &gt; &quot; &apos;)
+		we have to know if the last sequence was e.g. &quot; or just "
+	*/
+	bool char_was_escaped;

-
+	std::wstring escaped_chars_buffer;
+	size_t escaped_char_index;


 	void clear_input_flags();
@@ -422,6 +428,11 @@ protected:

 	bool RemoveIfNeeded(size_t index);

+	bool check_escape_sequentions();
+	void read_xml_entity();
+	void read_char_from_entity_buffer();
+	int read_char() override;
+
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack