From 5253963c84ece446ab502e22ef32cf2e9ea30b3c Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Tue, 8 Feb 2022 16:34:54 +0100 Subject: [PATCH] fix: put a white char before an opening tag in tree mode if it was in the source html --- src/html/htmlparser.cpp | 49 +++++++++++++++++++++++++++-------------- src/html/htmlparser.h | 13 +++++++---- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 57f8d00..c5e37cf 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2008-2021, Tomasz Sowa + * Copyright (c) 2008-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -71,16 +71,17 @@ void HTMLParser::clear_input_flags() void HTMLParser::Item::Clear() { name.clear(); - type = none; - is_commentary = false; - is_cdata = false; - porphans = nullptr; - new_line_before = false; - new_line = false; + type = none; + is_commentary = false; + is_cdata = false; + porphans = nullptr; + new_line_before = false; + new_line_after = false; new_line_in_the_middle = false; - has_body_tag = false; - tree_index = 0; - space = nullptr; + white_char_before = false; + has_body_tag = false; + tree_index = 0; + space = nullptr; } @@ -1175,10 +1176,18 @@ bool HTMLParser::PutOpeningTag() return false; } - if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before ) + if( current_white_char_mode() == WHITE_MODE_TREE ) { - Put(10); - PutTabs(LastItem().tree_index); + if( LastItem().new_line_before ) + { + Put(10); + PutTabs(LastItem().tree_index); + } + else + if( LastItem().white_char_before ) + { + Put(' '); + } } PutOpeningTagMark(); @@ -1414,6 +1423,8 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well) void HTMLParser::ReadText(bool is_cdata) { new_item_has_new_line_before = false; + new_item_has_white_char_before = false; + bool was_white_char = false; bool was_new_line = 
false; @@ -1424,7 +1435,7 @@ void HTMLParser::ReadText(bool is_cdata) if( current_white_char_mode() == WHITE_MODE_TREE ) { - if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) ) + if( LastItem().new_line_after || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) ) { allow_put_new_line = true; } @@ -1450,7 +1461,10 @@ void HTMLParser::ReadText(bool is_cdata) was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata); if( lastc == -1 || was_closing_tag ) + { new_item_has_new_line_before = was_new_line; + new_item_has_white_char_before = was_white_char; + } if( !tmp_text.empty() ) { @@ -1486,7 +1500,7 @@ void HTMLParser::ReadText(bool is_cdata) LastItem().new_line_in_the_middle = true; if( !was_non_white_text ) - LastItem().new_line = true; + LastItem().new_line_after = true; } else { @@ -1758,6 +1772,7 @@ bool HTMLParser::ReadItem() return false; LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method + LastItem().white_char_before = new_item_has_white_char_before; // new_item_has_white_char_before is set by ReadText() method if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) LastItem().tree_index += 1; @@ -1971,7 +1986,7 @@ void HTMLParser::CheckStackPrintRest() { while( stack_len-- > 0 ) { - if( stack_len==0 || pstack[stack_len-1].new_line ) + if( stack_len==0 || pstack[stack_len-1].new_line_after ) { if( current_white_char_mode() == WHITE_MODE_TREE ) { @@ -2030,7 +2045,7 @@ void HTMLParser::CheckClosingTags() if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) ) { - if( pstack[z].new_line ) + if( pstack[z].new_line_after ) { if( current_white_char_mode() == WHITE_MODE_TREE ) { diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index 7797b51..fb63809 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -5,7 +5,7 @@ */ /* - * 
Copyright (c) 2008-2021, Tomasz Sowa + * Copyright (c) 2008-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -134,14 +134,18 @@ public: bool is_cdata; + // is there a new line before this tag (or just a new line and some white characters) bool new_line_before; - // is there a new line after this tag - bool new_line; + // is there a new line after this tag (or just some white characters and a new line) + bool new_line_after; - // is there a new + // is there a new line in the middle after this tag and before the next tag bool new_line_in_the_middle; + // is there a white char (but not a new line) before this tag + bool white_char_before; + // current orphans table // (will be propagated) Orphans * porphans; @@ -448,6 +452,7 @@ protected: //bool last_new_line; bool new_item_has_new_line_before; + bool new_item_has_white_char_before; int white_mode; bool is_first_item; size_t wrap_line; // insert a new line character into long lines