Fixed an improper newline character emitted after <single/> items; added the Item::new_line_before flag

This commit is contained in:
Tomasz Sowa 2021-07-21 11:30:49 +02:00
parent 4f8ae6ce29
commit c0e940c500
2 changed files with 70 additions and 118 deletions

View File

@ -36,7 +36,7 @@
*/ */
#include "htmlfilter.h" #include "htmlfilter.h"
#include "convert/text.h"
namespace pt namespace pt
@ -48,13 +48,14 @@ namespace pt
void HTMLFilter::Item::Clear() void HTMLFilter::Item::Clear()
{ {
name.clear(); name.clear();
type = none; type = none;
is_commentary = false; is_commentary = false;
porphans = nullptr; porphans = nullptr;
new_line = false; new_line_before = false;
new_line = false;
new_line_in_the_middle = false; new_line_in_the_middle = false;
has_body_tag = false; has_body_tag = false;
tree_index = 0; tree_index = 0;
} }
@ -74,7 +75,7 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0; stack_len = 0;
out_string = &out; out_string = &out;
last_new_line = false; //last_new_line = false;
was_ending_commentary = false; was_ending_commentary = false;
line_len = 0; line_len = 0;
out_string->clear(); out_string->clear();
@ -382,15 +383,15 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
} }
void HTMLFilter::CheckNewLine() //void HTMLFilter::CheckNewLine()
{ //{
if( white_mode == WHITE_MODE_TREE ) // if( white_mode == WHITE_MODE_TREE )
{ // {
SkipWhite(); // SkipWhite();
} // }
//
last_new_line = (lastc==10); // last_new_line = (lastc==10);
} //}
@ -440,7 +441,7 @@ void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
was_closing_tag = true; was_closing_tag = true;
PopStack(); PopStack();
CheckNewLine(); //CheckNewLine();
break; break;
} }
} }
@ -857,17 +858,17 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
} }
bool HTMLFilter::PutNormalWhite() void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
{ {
bool was_white_char = false; was_white_char = false;
bool was_new_line = false; was_new_line = false;
while( lastc == 10 || IsWhite(lastc) ) while( lastc == 10 || IsWhite(lastc) )
{ {
was_white_char = true; // anyone white char even new line
if( lastc == 10 ) if( lastc == 10 )
was_new_line = true; was_new_line = true;
else
was_white_char = true;
if( white_mode == WHITE_MODE_ORIGIN ) if( white_mode == WHITE_MODE_ORIGIN )
{ {
@ -877,18 +878,12 @@ bool HTMLFilter::PutNormalWhite()
read_char(); read_char();
} }
if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char ) if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
{ {
Put(' '); Put(' ');
} }
if( white_mode == WHITE_MODE_TREE && was_new_line ) // in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
{
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
}
last_new_line = was_new_line;
return was_white_char;
} }
@ -955,7 +950,7 @@ bool HTMLFilter::PutOpeningTag()
return false; return false;
} }
if( white_mode == WHITE_MODE_TREE && last_new_line ) if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
{ {
Put(10); Put(10);
PutTabs(LastItem().tree_index); PutTabs(LastItem().tree_index);
@ -991,7 +986,7 @@ void HTMLFilter::PutTabs(size_t len)
if( len > 30 ) if( len > 30 )
len = 30; len = 30;
for(int i=0 ; i < (len*tab_size) ; ++i) for(size_t i=0 ; i < (len*tab_size) ; ++i)
(*out_string) += ' '; // we do not add them to 'line_len' (*out_string) += ' '; // we do not add them to 'line_len'
} }
@ -1010,15 +1005,6 @@ void HTMLFilter::PutNonBreakingSpace()
//void HTMLFilter::PutNewLine()
//{
// buffer[0] = 10; // CHECKME for what purpose is this buffer?
// Put(10);
// line_len = 0;
//}
// we assume the size of the opening mark to be one // we assume the size of the opening mark to be one
bool HTMLFilter::IsOpeningTagMark(wchar_t c) bool HTMLFilter::IsOpeningTagMark(wchar_t c)
{ {
@ -1063,22 +1049,6 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
} }
//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
//{
//static wchar_t comm_open[] = L"<!--";
//size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
//
// //return IsNameEqual(pchar, comm_open, comm_open_len);
// return false;
//}
//
//
//size_t HTMLFilter::OpeningCommentaryTagMarkSize()
//{
// return 4; // size of "<!--"
//}
bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str) bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
{ {
static wchar_t comm_end[] = L"-->"; static wchar_t comm_end[] = L"-->";
@ -1106,35 +1076,12 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
// skipping the commentary tag if exists
bool HTMLFilter::SkipCommentaryTagIfExists()
{
wchar_t comm_close[] = L"-->";
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
/*
if( !IsOpeningCommentaryTagMark(pchar) )
return false;
pchar += OpeningCommentaryTagMarkSize();
// looking for "-->"
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
++pchar;
if( *pchar!= 0 )
pchar += comm_close_len;
CheckNewLine();
*/
return true;
}
// reading text between html tags // reading text between html tags
void HTMLFilter::ReadNormalText() void HTMLFilter::ReadText()
{ {
bool was_white_char = false;
bool was_new_line = false;
bool was_non_white_text = false; bool was_non_white_text = false;
was_ending_commentary = false; was_ending_commentary = false;
@ -1175,13 +1122,16 @@ void HTMLFilter::ReadNormalText()
if( was_ending_commentary ) if( was_ending_commentary )
break; break;
if( PutNormalWhite() && white_mode == WHITE_MODE_TREE ) PutNormalWhite(was_white_char, was_new_line);
if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE )
{ {
if( last_new_line ) allow_put_new_line = false;
allow_put_space = false;
if( was_new_line )
{ {
allow_put_new_line = true; allow_put_new_line = true;
allow_put_space = false;
LastItem().new_line_in_the_middle = true; LastItem().new_line_in_the_middle = true;
if( !was_non_white_text ) if( !was_non_white_text )
@ -1189,7 +1139,6 @@ void HTMLFilter::ReadNormalText()
} }
else else
{ {
allow_put_new_line = false;
allow_put_space = true; allow_put_space = true;
} }
@ -1200,6 +1149,8 @@ void HTMLFilter::ReadNormalText()
} }
} }
} }
new_item_has_new_line_before = was_new_line;
} }
@ -1319,7 +1270,7 @@ void HTMLFilter::ReadItemSpecial()
if( !skip_tags ) if( !skip_tags )
{ {
if( white_mode == WHITE_MODE_TREE && last_new_line ) if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
{ {
Put(10); Put(10);
PutTabs(LastItem().tree_index); PutTabs(LastItem().tree_index);
@ -1351,6 +1302,13 @@ void HTMLFilter::ReadItemSpecial()
Put(' '); Put(' ');
Put(tmp_text); Put(tmp_text);
Put('>'); Put('>');
if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
{
Put(10);
Put(10);
SkipWhiteLines();
}
} }
} }
} }
@ -1399,6 +1357,8 @@ bool HTMLFilter::ReadItem()
if( !PushStack() ) if( !PushStack() )
return false; return false;
LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
LastItem().tree_index += 1; LastItem().tree_index += 1;
@ -1602,7 +1562,7 @@ int i;
pstack[z].Clear(); pstack[z].Clear();
} }
last_new_line = pstack[stack_len-1].new_line; //last_new_line = pstack[stack_len-1].new_line;
// invalidate tags // invalidate tags
stack_len = i; stack_len = i;
@ -1661,7 +1621,7 @@ void HTMLFilter::CheckClosingTags()
} }
PutClosingTag(pstack[stack_len-1]); PutClosingTag(pstack[stack_len-1]);
last_new_line = pstack[stack_len-1].new_line; //last_new_line = pstack[stack_len-1].new_line;
PopStack(); PopStack();
PopStack(); PopStack();
} }
@ -1711,27 +1671,17 @@ void HTMLFilter::ReadLoop()
CheckExceptions(); CheckExceptions();
} }
else else
if( LastItem().type == Item::special || LastItem().type == Item::simple ) if( LastItem().type == Item::special )
{ {
if( stack_len > 1 )
{
//pstack[stack_len-2].new_line = LastItem().new_line;
}
else
if( white_mode == WHITE_MODE_TREE )
{
// one new line after a simple or special tag
// (if the tag has level 0 in the tree - it not means that this is a first tag)
// for example can be DOCTYPE
if( !LastItem().is_commentary )
Put(10);
}
if( !LastItem().is_commentary ) if( !LastItem().is_commentary )
PopStack(); PopStack();
} }
else else
if( LastItem().type == Item::simple )
{
PopStack();
}
else
if( LastItem().type == Item::closing ) if( LastItem().type == Item::closing )
{ {
CheckClosingTags(); CheckClosingTags();
@ -1741,7 +1691,8 @@ void HTMLFilter::ReadLoop()
PopStack(); PopStack();
} }
ReadNormalText(); ReadText();
is_first_item = false;
} }
} }
@ -1750,12 +1701,13 @@ void HTMLFilter::ReadLoop()
void HTMLFilter::Read() void HTMLFilter::Read()
{ {
read_char(); // put first character to lastc read_char(); // put first character to lastc
is_first_item = true;
if( white_mode != WHITE_MODE_ORIGIN ) if( white_mode != WHITE_MODE_ORIGIN )
SkipWhiteLines(); SkipWhiteLines();
// it can be some text or white lines before the first html tag (we print it) // it can be some text or white lines before the first html tag (we print it)
ReadNormalText(); ReadText();
// reading the whole html source // reading the whole html source
ReadLoop(); ReadLoop();

View File

@ -204,6 +204,8 @@ protected:
bool is_commentary; bool is_commentary;
bool new_line_before;
// is there a new line after this tag // is there a new line after this tag
bool new_line; bool new_line;
@ -244,15 +246,12 @@ protected:
virtual bool IsStartingEntityMark(wchar_t c); virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c); virtual bool IsEndingEntityMark(wchar_t c);
// virtual bool IsOpeningCommentaryTagMark(const wchar_t * str);
// virtual size_t OpeningCommentaryTagMarkSize();
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str); virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
virtual bool IsValidCharForName(int c); virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c); virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c); virtual bool IsValidCharForEntityName(int c);
virtual void CheckExceptions(); virtual void CheckExceptions();
virtual bool SkipCommentaryTagIfExists();
virtual void Put(wchar_t c); virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str, const wchar_t * end); virtual void Put(const wchar_t * str, const wchar_t * end);
@ -306,11 +305,10 @@ protected:
void PopStack(); void PopStack();
bool PushStack(); bool PushStack();
void CheckNewLine();
void CheckStackPrintRest(); void CheckStackPrintRest();
void AddForgottenTags(); void AddForgottenTags();
void CheckClosingTags(); void CheckClosingTags();
void ReadNormalText(); void ReadText();
bool PrintRest(); bool PrintRest();
bool PrintOpeningItem(); bool PrintOpeningItem();
void ReadItemName(std::wstring & name, bool clear_name = true); void ReadItemName(std::wstring & name, bool clear_name = true);
@ -332,7 +330,7 @@ protected:
void CheckChar(wchar_t c); void CheckChar(wchar_t c);
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space); void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
bool PutNormalWhite(); void PutNormalWhite(bool & was_white_char, bool & was_new_line);
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well); void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
void PutTabs(size_t len); void PutTabs(size_t len);
void PutNonBreakingSpace(); void PutNonBreakingSpace();
@ -343,8 +341,10 @@ protected:
size_t stack_len; // length of the stack size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing wchar_t * buffer; // buffer used when printing
std::wstring * out_string; std::wstring * out_string;
bool last_new_line; //bool last_new_line;
bool new_item_has_new_line_before;
int white_mode; int white_mode;
bool is_first_item;
size_t wrap_line; // insert a new line character into long lines size_t wrap_line; // insert a new line character into long lines
size_t tab_size; size_t tab_size;
bool was_ending_commentary; bool was_ending_commentary;