Browse Source

fixed improper new line character after <single/> items, added Item::new_line_before flag

htmlparserlistener
Tomasz Sowa 1 year ago
parent
commit
c0e940c500
  1. 174
      src/html/htmlfilter.cpp
  2. 14
      src/html/htmlfilter.h

174
src/html/htmlfilter.cpp

@ -36,7 +36,7 @@
*/
#include "htmlfilter.h"
#include "convert/text.h"
namespace pt
@ -48,13 +48,14 @@ namespace pt
void HTMLFilter::Item::Clear()
{
name.clear();
type = none;
is_commentary = false;
porphans = nullptr;
new_line = false;
type = none;
is_commentary = false;
porphans = nullptr;
new_line_before = false;
new_line = false;
new_line_in_the_middle = false;
has_body_tag = false;
tree_index = 0;
has_body_tag = false;
tree_index = 0;
}
@ -74,7 +75,7 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0;
out_string = &out;
last_new_line = false;
//last_new_line = false;
was_ending_commentary = false;
line_len = 0;
out_string->clear();
@ -382,15 +383,15 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
}
void HTMLFilter::CheckNewLine()
{
if( white_mode == WHITE_MODE_TREE )
{
SkipWhite();
}
last_new_line = (lastc==10);
}
//void HTMLFilter::CheckNewLine()
//{
// if( white_mode == WHITE_MODE_TREE )
// {
// SkipWhite();
// }
//
// last_new_line = (lastc==10);
//}
@ -440,7 +441,7 @@ void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
was_closing_tag = true;
PopStack();
CheckNewLine();
//CheckNewLine();
break;
}
}
@ -857,17 +858,17 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
}
bool HTMLFilter::PutNormalWhite()
void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
{
bool was_white_char = false;
bool was_new_line = false;
was_white_char = false;
was_new_line = false;
while( lastc == 10 || IsWhite(lastc) )
{
was_white_char = true; // any white character, even a new line
if( lastc == 10 )
was_new_line = true;
else
was_white_char = true;
if( white_mode == WHITE_MODE_ORIGIN )
{
@ -877,18 +878,12 @@ bool HTMLFilter::PutNormalWhite()
read_char();
}
if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char )
if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
{
Put(' ');
}
if( white_mode == WHITE_MODE_TREE && was_new_line )
{
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
}
last_new_line = was_new_line;
return was_white_char;
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
}
@ -955,7 +950,7 @@ bool HTMLFilter::PutOpeningTag()
return false;
}
if( white_mode == WHITE_MODE_TREE && last_new_line )
if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
{
Put(10);
PutTabs(LastItem().tree_index);
@ -991,7 +986,7 @@ void HTMLFilter::PutTabs(size_t len)
if( len > 30 )
len = 30;
for(int i=0 ; i < (len*tab_size) ; ++i)
for(size_t i=0 ; i < (len*tab_size) ; ++i)
(*out_string) += ' '; // we do not add them to 'line_len'
}
@ -1010,15 +1005,6 @@ void HTMLFilter::PutNonBreakingSpace()
//void HTMLFilter::PutNewLine()
//{
// buffer[0] = 10; // CHECKME for what purpose is this buffer?
// Put(10);
// line_len = 0;
//}
// we assume the size of the opening mark to be one
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
{
@ -1063,22 +1049,6 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
}
//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
//{
//static wchar_t comm_open[] = L"<!--";
//size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
//
// //return IsNameEqual(pchar, comm_open, comm_open_len);
// return false;
//}
//
//
//size_t HTMLFilter::OpeningCommentaryTagMarkSize()
//{
// return 4; // size of "<!--"
//}
bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
{
static wchar_t comm_end[] = L"-->";
@ -1106,35 +1076,12 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
// skips the commentary tag if it exists
bool HTMLFilter::SkipCommentaryTagIfExists()
{
wchar_t comm_close[] = L"-->";
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
/*
if( !IsOpeningCommentaryTagMark(pchar) )
return false;
pchar += OpeningCommentaryTagMarkSize();
// looking for "-->"
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
++pchar;
if( *pchar!= 0 )
pchar += comm_close_len;
CheckNewLine();
*/
return true;
}
// reading text between html tags
void HTMLFilter::ReadNormalText()
void HTMLFilter::ReadText()
{
bool was_white_char = false;
bool was_new_line = false;
bool was_non_white_text = false;
was_ending_commentary = false;
@ -1175,13 +1122,16 @@ void HTMLFilter::ReadNormalText()
if( was_ending_commentary )
break;
if( PutNormalWhite() && white_mode == WHITE_MODE_TREE )
PutNormalWhite(was_white_char, was_new_line);
if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE )
{
if( last_new_line )
allow_put_new_line = false;
allow_put_space = false;
if( was_new_line )
{
allow_put_new_line = true;
allow_put_space = false;
LastItem().new_line_in_the_middle = true;
if( !was_non_white_text )
@ -1189,7 +1139,6 @@ void HTMLFilter::ReadNormalText()
}
else
{
allow_put_new_line = false;
allow_put_space = true;
}
@ -1200,6 +1149,8 @@ void HTMLFilter::ReadNormalText()
}
}
}
new_item_has_new_line_before = was_new_line;
}
@ -1319,7 +1270,7 @@ void HTMLFilter::ReadItemSpecial()
if( !skip_tags )
{
if( white_mode == WHITE_MODE_TREE && last_new_line )
if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
{
Put(10);
PutTabs(LastItem().tree_index);
@ -1351,6 +1302,13 @@ void HTMLFilter::ReadItemSpecial()
Put(' ');
Put(tmp_text);
Put('>');
if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
{
Put(10);
Put(10);
SkipWhiteLines();
}
}
}
}
@ -1399,6 +1357,8 @@ bool HTMLFilter::ReadItem()
if( !PushStack() )
return false;
LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
LastItem().tree_index += 1;
@ -1602,7 +1562,7 @@ int i;
pstack[z].Clear();
}
last_new_line = pstack[stack_len-1].new_line;
//last_new_line = pstack[stack_len-1].new_line;
// invalidate tags
stack_len = i;
@ -1661,7 +1621,7 @@ void HTMLFilter::CheckClosingTags()
}
PutClosingTag(pstack[stack_len-1]);
last_new_line = pstack[stack_len-1].new_line;
//last_new_line = pstack[stack_len-1].new_line;
PopStack();
PopStack();
}
@ -1711,27 +1671,17 @@ void HTMLFilter::ReadLoop()
CheckExceptions();
}
else
if( LastItem().type == Item::special || LastItem().type == Item::simple )
if( LastItem().type == Item::special )
{
if( stack_len > 1 )
{
//pstack[stack_len-2].new_line = LastItem().new_line;
}
else
if( white_mode == WHITE_MODE_TREE )
{
// one new line after a simple or special tag
// (a tag at level 0 in the tree is not necessarily the first tag),
// for example it can be DOCTYPE
if( !LastItem().is_commentary )
Put(10);
}
if( !LastItem().is_commentary )
PopStack();
}
else
if( LastItem().type == Item::simple )
{
PopStack();
}
else
if( LastItem().type == Item::closing )
{
CheckClosingTags();
@ -1741,7 +1691,8 @@ void HTMLFilter::ReadLoop()
PopStack();
}
ReadNormalText();
ReadText();
is_first_item = false;
}
}
@ -1750,12 +1701,13 @@ void HTMLFilter::ReadLoop()
void HTMLFilter::Read()
{
read_char(); // put first character to lastc
is_first_item = true;
if( white_mode != WHITE_MODE_ORIGIN )
SkipWhiteLines();
// there can be some text or white lines before the first html tag (we print them)
ReadNormalText();
ReadText();
// reading the whole html source
ReadLoop();

14
src/html/htmlfilter.h

@ -204,6 +204,8 @@ protected:
bool is_commentary;
bool new_line_before;
// is there a new line after this tag
bool new_line;
@ -244,15 +246,12 @@ protected:
virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c);
// virtual bool IsOpeningCommentaryTagMark(const wchar_t * str);
// virtual size_t OpeningCommentaryTagMarkSize();
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c);
virtual void CheckExceptions();
virtual bool SkipCommentaryTagIfExists();
virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str, const wchar_t * end);
@ -306,11 +305,10 @@ protected:
void PopStack();
bool PushStack();
void CheckNewLine();
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
void ReadNormalText();
void ReadText();
bool PrintRest();
bool PrintOpeningItem();
void ReadItemName(std::wstring & name, bool clear_name = true);
@ -332,7 +330,7 @@ protected:
void CheckChar(wchar_t c);
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
bool PutNormalWhite();
void PutNormalWhite(bool & was_white_char, bool & was_new_line);
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
void PutTabs(size_t len);
void PutNonBreakingSpace();
@ -343,8 +341,10 @@ protected:
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
bool last_new_line;
//bool last_new_line;
bool new_item_has_new_line_before;
int white_mode;
bool is_first_item;
size_t wrap_line; // insert a new line character into long lines
size_t tab_size;
bool was_ending_commentary;

Loading…
Cancel
Save