HTMLParser now correctly parses these entities: &amp;amp; &amp;lt; &amp;gt; &amp;quot; &amp;apos; (decoded to & < > " ')

2021-12-02 17:44:41 +01:00 · 2021-12-02 17:44:41 +01:00 · b781948f21
parent 2dadfc0809
commit b781948f21
2 changed files with 155 additions and 25 deletions
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
 	out_stream       = nullptr;
 	out_space        = nullptr;
 	line_len         = 0;
 	char_was_escaped = false;
 	escaped_chars_buffer.clear();
 	escaped_char_index = 0;
 }
@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 	while( lastc != -1 )
 	{
-		if( lastc == '"' || lastc == '\'' )
+		if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
 		{
 			if( is_quoted )
 			{
@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 			LastItem().type = Item::simple;
 		}
 		else
-		if( !is_quoted && IsClosingTagMark(lastc) )
+		if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
 		{
 			read_char();
 			break;
@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
+			if( has_quote )
-				break;
+			{
-		}
+				if( lastc == quote_char )
-		else
+					break;
-		{
+			}
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+			else
-				break;
+			{
 				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
 					break;
 			}
 		}
 		if( lastc==10 || IsWhite(lastc) )
@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
+			if( has_quote )
-				break;
+			{
-		}
+				if( lastc == quote_char )
-		else
+					break;
-		{
+			}
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+			else
-				break;
+			{
 				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
 					break;
 			}
 		}
 		// IMPROVEME add support for analyze_entities?
@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
 void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
 {
-	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		str += lastc;
 		read_char();
@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
 				tmp_text += lastc;
 				read_char();
-				if( IsClosingTagMark(lastc) )
+				if( !char_was_escaped && IsClosingTagMark(lastc) )
 				{
 					tmp_text += lastc;
 					read_char();
@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 {
 	while( lastc != -1 )
 	{
-		if( IsOpeningTagMark(lastc) )
+		if( !char_was_escaped && IsOpeningTagMark(lastc) )
 		{
 			if( IsClosingTagForLastItem(put_closing_tag_as_well) )
 			{
@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
 		text_space_wstr = &wstr_space.value.value_wstring;
 	}
-	while( lastc != -1 && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		tmp_text.clear();
 		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
 	read_char();				// skipping '='
 	SkipWhiteLines();
-	bool has_quote = (lastc == '\"' || lastc == '\'');
+	bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
 	wchar_t quote_char = lastc;
 	if( has_quote )
@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
 	else
 		ReadXMLItemAttrValue(has_quote, quote_char);
-	if( has_quote && lastc == quote_char )
+	if( has_quote && !char_was_escaped && lastc == quote_char )
 		read_char();			// skipping the last quote mark
 return true;
@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
 }
 /*
 	takes the next pending character from escaped_chars_buffer and stores it in lastc

 	when the character just taken was the last one, the buffer and its index
 	are reset so that the following read does not see stale data

 	if nothing is pending lastc is set to -1 and the buffer is left untouched
 */
 void HTMLParser::read_char_from_entity_buffer()
 {
 	const size_t pending_size = escaped_chars_buffer.size();
 
 	// nothing buffered: report -1
 	if( escaped_char_index >= pending_size )
 	{
 		lastc = -1;
 		return;
 	}
 
 	lastc = escaped_chars_buffer[escaped_char_index];
 	++escaped_char_index;
 
 	// the buffer has been fully consumed, reset it
 	if( escaped_char_index >= pending_size )
 	{
 		escaped_chars_buffer.clear();
 		escaped_char_index = 0;
 	}
 }
 /*
 	collects a candidate entity in escaped_chars_buffer

 	the buffer is restarted with a single '&' and then raw characters
 	(taken with read_char_no_escape()) are appended until one of:
 	 - a ';' has been appended,
 	 - read_char_no_escape() left -1 in lastc (this value is not appended),
 	 - the buffer has reached max_entity_length characters

 	no decoding is done here, see check_escape_sequentions()
 */
 void HTMLParser::read_xml_entity()
 {
 	const size_t max_entity_length = 6; // length of "&apos;" string
 
 	escaped_chars_buffer.clear();
 	escaped_char_index = 0;
 	escaped_chars_buffer += '&';
 
 	for(;;)
 	{
 		read_char_no_escape();
 
 		if( lastc == -1 )
 			break;
 
 		escaped_chars_buffer += lastc;
 
 		if( lastc == ';' || escaped_chars_buffer.size() >= max_entity_length )
 			break;
 	}
 }
 /*
 	checks whether escaped_chars_buffer holds exactly one of the supported entities:
 	&amp; &lt; &gt; &quot; &apos;

 	on a match lastc is set to the decoded character and char_was_escaped to true

 	whenever char_was_escaped is true at the end (it is not reset in this method,
 	the caller is expected to do it) the buffer and its index are cleared

 	returns the value of char_was_escaped
 */
 bool HTMLParser::check_escape_sequentions()
 {
 	struct KnownEntity
 	{
 		const wchar_t * name;
 		wchar_t         decoded;
 	};
 
 	static const KnownEntity known_entities[] = {
 		{ L"&amp;",  L'&'  },
 		{ L"&lt;",   L'<'  },
 		{ L"&gt;",   L'>'  },
 		{ L"&quot;", L'"'  },
 		{ L"&apos;", L'\'' },
 	};
 
 	for( const KnownEntity & entity : known_entities )
 	{
 		if( escaped_chars_buffer == entity.name )
 		{
 			lastc = entity.decoded;
 			char_was_escaped = true;
 			break;
 		}
 	}
 
 	if( !char_was_escaped )
 		return false;
 
 	// the entity was consumed, nothing should be replayed from the buffer
 	escaped_chars_buffer.clear();
 	escaped_char_index = 0;
 
 	return true;
 }
 /*
 	reads the next character into lastc and returns it

 	char_was_escaped is reset on every call

 	if there are characters pending in escaped_chars_buffer the next one of them is returned,
 	otherwise a raw character is read with read_char_no_escape() and if it is '&'
 	a candidate entity is collected and checked: a recognized entity yields its decoded
 	character (char_was_escaped is then true), an unrecognized one is replayed
 	character by character from the buffer, starting with the '&'
 */
 int HTMLParser::read_char()
 {
 	char_was_escaped = false;
 
 	const bool has_pending_chars = escaped_char_index < escaped_chars_buffer.size();
 
 	if( has_pending_chars )
 	{
 		read_char_from_entity_buffer();
 		return lastc;
 	}
 
 	read_char_no_escape();
 
 	if( lastc == '&' )
 	{
 		read_xml_entity();
 		const bool entity_decoded = check_escape_sequentions();
 
 		// not a known entity: give back the first buffered character
 		if( !entity_decoded )
 			read_char_from_entity_buffer();
 	}
 
 	return lastc;
 }
 void HTMLParser::Read()
 {
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@ -295,8 +295,14 @@ protected:
 	ItemParsedListener * item_parsed_listener;
 	/*
 		true if the lastc was escaped in the input (e.g. written as the entity &quot;)
 		we have to know if the last sequence was an escaped quote (&quot;) or just "
 	*/
 	bool char_was_escaped;
-
+	std::wstring escaped_chars_buffer;
 	size_t escaped_char_index;
 	void clear_input_flags();
@ -422,6 +428,11 @@ protected:
 	bool RemoveIfNeeded(size_t index);
 	bool check_escape_sequentions();
 	void read_xml_entity();
 	void read_char_from_entity_buffer();
 	int read_char() override;
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack