diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 9b24071..4983010 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags() out_stream = nullptr; out_space = nullptr; line_len = 0; + char_was_escaped = false; + escaped_chars_buffer.clear(); + escaped_char_index = 0; } @@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text) while( lastc != -1 ) { - if( lastc == '"' || lastc == '\'' ) + if( !char_was_escaped && (lastc == '"' || lastc == '\'') ) { if( is_quoted ) { @@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text) LastItem().type = Item::simple; } else - if( !is_quoted && IsClosingTagMark(lastc) ) + if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) ) { read_char(); break; @@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char) while( lastc != -1 ) { - if( has_quote ) + if( !char_was_escaped ) { - if( lastc == quote_char ) - break; - } - else - { - if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) - break; + if( has_quote ) + { + if( lastc == quote_char ) + break; + } + else + { + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) + break; + } } if( lastc==10 || IsWhite(lastc) ) @@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char) while( lastc != -1 ) { - if( has_quote ) + if( !char_was_escaped ) { - if( lastc == quote_char ) - break; - } - else - { - if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) - break; + if( has_quote ) + { + if( lastc == quote_char ) + break; + } + else + { + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) + break; + } } // IMPROVEME add support for analyze_entities? 
@@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab); void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space) { - while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) ) + while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) ) { str += lastc; read_char(); @@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary() tmp_text += lastc; read_char(); - if( IsClosingTagMark(lastc) ) + if( !char_was_escaped && IsClosingTagMark(lastc) ) { tmp_text += lastc; read_char(); @@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well) { while( lastc != -1 ) { - if( IsOpeningTagMark(lastc) ) + if( !char_was_escaped && IsOpeningTagMark(lastc) ) { if( IsClosingTagForLastItem(put_closing_tag_as_well) ) { @@ -1382,7 +1391,7 @@ void HTMLParser::ReadText() text_space_wstr = &wstr_space.value.value_wstring; } - while( lastc != -1 && !IsOpeningTagMark(lastc) ) + while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) ) { tmp_text.clear(); PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space); @@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr() read_char(); // skipping '=' SkipWhiteLines(); - bool has_quote = (lastc == '\"' || lastc == '\''); + bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\''); wchar_t quote_char = lastc; if( has_quote ) @@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr() else ReadXMLItemAttrValue(has_quote, quote_char); - if( has_quote && lastc == quote_char ) + if( has_quote && !char_was_escaped && lastc == quote_char ) read_char(); // skipping the last quote mark return true; @@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop() } +void HTMLParser::read_char_from_entity_buffer() +{ + if( escaped_char_index < escaped_chars_buffer.size() ) + { + lastc = escaped_chars_buffer[escaped_char_index]; + escaped_char_index += 1; + + if( 
escaped_char_index >= escaped_chars_buffer.size() ) + { + escaped_chars_buffer.clear(); + escaped_char_index = 0; + } + } + else + { + lastc = -1; + } +} + + +void HTMLParser::read_xml_entity() +{ + const size_t max_entity_length = 6; // length of "&apos;" string + escaped_chars_buffer.clear(); + escaped_char_index = 0; + escaped_chars_buffer += '&'; + + do + { + read_char_no_escape(); + + if( lastc != -1 ) + { + escaped_chars_buffer += lastc; + } + } + while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' ); +} + + +bool HTMLParser::check_escape_sequentions() +{ + if( escaped_chars_buffer == L"&amp;" ) + { + lastc = '&'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&lt;" ) + { + lastc = '<'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&gt;" ) + { + lastc = '>'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&quot;" ) + { + lastc = '"'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&apos;" ) + { + lastc = '\''; + char_was_escaped = true; + } + + if( char_was_escaped ) + { + escaped_chars_buffer.clear(); + escaped_char_index = 0; + } + + return char_was_escaped; +} + + + +int HTMLParser::read_char() +{ + char_was_escaped = false; + + if( escaped_char_index < escaped_chars_buffer.size() ) + { + read_char_from_entity_buffer(); + } + else + { + read_char_no_escape(); + + if( lastc == '&' ) + { + read_xml_entity(); + + if( !check_escape_sequentions() ) + { + read_char_from_entity_buffer(); + } + } + } + + return lastc; +} + + void HTMLParser::Read() { diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index da0074e..0f352a9 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -295,8 +295,14 @@ protected: ItemParsedListener * item_parsed_listener; + /* + true if the lastc was escaped (with a backslash) + we have to know if the last sequence was \" or just " + */ + bool char_was_escaped; - + std::wstring escaped_chars_buffer; + size_t 
escaped_char_index; void clear_input_flags(); @@ -422,6 +428,11 @@ protected: bool RemoveIfNeeded(size_t index); + bool check_escape_sequentions(); + void read_xml_entity(); + void read_char_from_entity_buffer(); + int read_char() override; + Item empty; Item * pstack; // stack pointer size_t stack_len; // length of the stack