From c0e940c5008e2038e551279fa16fdbd2119eacfd Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Wed, 21 Jul 2021 11:30:49 +0200 Subject: [PATCH] fixed improper new line character after items, added Item::new_line_before flag --- src/html/htmlfilter.cpp | 174 +++++++++++++++------------------------- src/html/htmlfilter.h | 14 ++-- 2 files changed, 70 insertions(+), 118 deletions(-) diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp index 5274950..04888c3 100644 --- a/src/html/htmlfilter.cpp +++ b/src/html/htmlfilter.cpp @@ -36,7 +36,7 @@ */ #include "htmlfilter.h" - +#include "convert/text.h" namespace pt @@ -48,13 +48,14 @@ namespace pt void HTMLFilter::Item::Clear() { name.clear(); - type = none; - is_commentary = false; - porphans = nullptr; - new_line = false; + type = none; + is_commentary = false; + porphans = nullptr; + new_line_before = false; + new_line = false; new_line_in_the_middle = false; - has_body_tag = false; - tree_index = 0; + has_body_tag = false; + tree_index = 0; } @@ -74,7 +75,7 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out) stack_len = 0; out_string = &out; - last_new_line = false; + //last_new_line = false; was_ending_commentary = false; line_len = 0; out_string->clear(); @@ -382,15 +383,15 @@ void HTMLFilter::SkipWhiteWithFirstNewLine() } -void HTMLFilter::CheckNewLine() -{ - if( white_mode == WHITE_MODE_TREE ) - { - SkipWhite(); - } - - last_new_line = (lastc==10); -} +//void HTMLFilter::CheckNewLine() +//{ +// if( white_mode == WHITE_MODE_TREE ) +// { +// SkipWhite(); +// } +// +// last_new_line = (lastc==10); +//} @@ -440,7 +441,7 @@ void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well) was_closing_tag = true; PopStack(); - CheckNewLine(); + //CheckNewLine(); break; } } @@ -857,17 +858,17 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, } -bool HTMLFilter::PutNormalWhite() +void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line) { - bool was_white_char = false; - bool was_new_line = false; + was_white_char = false; + was_new_line = false; while( lastc == 10 || IsWhite(lastc) ) { - was_white_char = true; // anyone white char even new line - if( lastc == 10 ) was_new_line = true; + else + was_white_char = true; if( white_mode == WHITE_MODE_ORIGIN ) { @@ -877,18 +878,12 @@ bool HTMLFilter::PutNormalWhite() read_char(); } - if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char ) + if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) ) { Put(' '); } - if( white_mode == WHITE_MODE_TREE && was_new_line ) - { - // in WHITE_MODE_TREE white characters are written at the beginning of a or text - } - - last_new_line = was_new_line; - return was_white_char; + // in WHITE_MODE_TREE white characters are written at the beginning of a or text } @@ -955,7 +950,7 @@ bool HTMLFilter::PutOpeningTag() return false; } - if( white_mode == WHITE_MODE_TREE && last_new_line ) + if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before ) { Put(10); PutTabs(LastItem().tree_index); @@ -991,7 +986,7 @@ void HTMLFilter::PutTabs(size_t len) if( len > 30 ) len = 30; - for(int i=0 ; i < (len*tab_size) ; ++i) + for(size_t i=0 ; i < (len*tab_size) ; ++i) (*out_string) += ' '; // we do not add them to 'line_len' } @@ -1010,15 +1005,6 @@ void HTMLFilter::PutNonBreakingSpace() - -//void HTMLFilter::PutNewLine() -//{ -// buffer[0] = 10; // CHECKME for what purpose is this buffer? -// Put(10); -// line_len = 0; -//} - - // we assume the size of the opening mark to be one bool HTMLFilter::IsOpeningTagMark(wchar_t c) { @@ -1063,22 +1049,6 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c) } -//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str) -//{ -//static wchar_t comm_open[] = L""; @@ -1106,35 +1076,12 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c) -// skipping the commentary tag if exists -bool HTMLFilter::SkipCommentaryTagIfExists() -{ -wchar_t comm_close[] = L"-->"; -size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1; -/* - if( !IsOpeningCommentaryTagMark(pchar) ) - return false; - - pchar += OpeningCommentaryTagMarkSize(); - - // looking for "-->" - while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) ) - ++pchar; - - if( *pchar!= 0 ) - pchar += comm_close_len; - - CheckNewLine(); -*/ - - -return true; -} - - // reading text between html tags -void HTMLFilter::ReadNormalText() +void HTMLFilter::ReadText() { + bool was_white_char = false; + bool was_new_line = false; + bool was_non_white_text = false; was_ending_commentary = false; @@ -1175,13 +1122,16 @@ void HTMLFilter::ReadNormalText() if( was_ending_commentary ) break; - if( PutNormalWhite() && white_mode == WHITE_MODE_TREE ) + PutNormalWhite(was_white_char, was_new_line); + + if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE ) { - if( last_new_line ) + allow_put_new_line = false; + allow_put_space = false; + + if( was_new_line ) { allow_put_new_line = true; - allow_put_space = false; - LastItem().new_line_in_the_middle = true; if( !was_non_white_text ) @@ -1189,7 +1139,6 @@ void HTMLFilter::ReadNormalText() } else { - allow_put_new_line = false; allow_put_space = true; } @@ -1200,6 +1149,8 @@ void HTMLFilter::ReadNormalText() } } } + + new_item_has_new_line_before = was_new_line; } @@ -1319,7 +1270,7 @@ void HTMLFilter::ReadItemSpecial() if( !skip_tags ) { - if( white_mode == WHITE_MODE_TREE && last_new_line ) + if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before ) { Put(10); PutTabs(LastItem().tree_index); @@ -1351,6 +1302,13 @@ void HTMLFilter::ReadItemSpecial() Put(' '); Put(tmp_text); Put('>'); + + if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") ) + { + Put(10); + Put(10); + SkipWhiteLines(); + } } } } @@ -1399,6 +1357,8 @@ bool HTMLFilter::ReadItem() if( !PushStack() ) return false; + LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method + if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) LastItem().tree_index += 1; @@ -1602,7 +1562,7 @@ int i; pstack[z].Clear(); } - last_new_line = pstack[stack_len-1].new_line; + //last_new_line = pstack[stack_len-1].new_line; // invalidate tags stack_len = i; @@ -1661,7 +1621,7 @@ void HTMLFilter::CheckClosingTags() } PutClosingTag(pstack[stack_len-1]); - last_new_line = pstack[stack_len-1].new_line; + //last_new_line = pstack[stack_len-1].new_line; PopStack(); PopStack(); } @@ -1711,27 +1671,17 @@ void HTMLFilter::ReadLoop() CheckExceptions(); } else - if( LastItem().type == Item::special || LastItem().type == Item::simple ) + if( LastItem().type == Item::special ) { - if( stack_len > 1 ) - { - //pstack[stack_len-2].new_line = LastItem().new_line; - } - else - if( white_mode == WHITE_MODE_TREE ) - { - // one new line after a simple or special tag - // (if the tag has level 0 in the tree - it not means that this is a first tag) - // for example can be DOCTYPE - - if( !LastItem().is_commentary ) - Put(10); - } - if( !LastItem().is_commentary ) PopStack(); } else + if( LastItem().type == Item::simple ) + { + PopStack(); + } + else if( LastItem().type == Item::closing ) { CheckClosingTags(); @@ -1741,7 +1691,8 @@ void HTMLFilter::ReadLoop() PopStack(); } - ReadNormalText(); + ReadText(); + is_first_item = false; } } @@ -1750,12 +1701,13 @@ void HTMLFilter::ReadLoop() void HTMLFilter::Read() { read_char(); // put first character to lastc + is_first_item = true; if( white_mode != WHITE_MODE_ORIGIN ) SkipWhiteLines(); // it can be some text or white lines before the first html tag (we print it) - ReadNormalText(); + ReadText(); // reading the whole html source ReadLoop(); diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h index 6407e0e..4b20ef4 100644 --- a/src/html/htmlfilter.h +++ b/src/html/htmlfilter.h @@ -204,6 +204,8 @@ protected: bool is_commentary; + bool new_line_before; + // is there a new line after this tag bool new_line; @@ -244,15 +246,12 @@ protected: virtual bool IsStartingEntityMark(wchar_t c); virtual bool IsEndingEntityMark(wchar_t c); -// virtual bool IsOpeningCommentaryTagMark(const wchar_t * str); -// virtual size_t OpeningCommentaryTagMarkSize(); virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str); virtual bool IsValidCharForName(int c); virtual bool IsValidCharForAttrName(int c); virtual bool IsValidCharForEntityName(int c); virtual void CheckExceptions(); - virtual bool SkipCommentaryTagIfExists(); virtual void Put(wchar_t c); virtual void Put(const wchar_t * str, const wchar_t * end); @@ -306,11 +305,10 @@ protected: void PopStack(); bool PushStack(); - void CheckNewLine(); void CheckStackPrintRest(); void AddForgottenTags(); void CheckClosingTags(); - void ReadNormalText(); + void ReadText(); bool PrintRest(); bool PrintOpeningItem(); void ReadItemName(std::wstring & name, bool clear_name = true); @@ -332,7 +330,7 @@ protected: void CheckChar(wchar_t c); void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space); - bool PutNormalWhite(); + void PutNormalWhite(bool & was_white_char, bool & was_new_line); void PutEverythingUntilClosingTag(bool put_closing_tag_as_well); void PutTabs(size_t len); void PutNonBreakingSpace(); @@ -343,8 +341,10 @@ protected: size_t stack_len; // length of the stack wchar_t * buffer; // buffer used when printing std::wstring * out_string; - bool last_new_line; + //bool last_new_line; + bool new_item_has_new_line_before; int white_mode; + bool is_first_item; size_t wrap_line; // insert a new line character into long lines size_t tab_size; bool was_ending_commentary;