fixed improper new line character after <single/> items, added Item::new_line_before flag
This commit is contained in:
parent
4f8ae6ce29
commit
c0e940c500
|
@ -36,7 +36,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "htmlfilter.h"
|
#include "htmlfilter.h"
|
||||||
|
#include "convert/text.h"
|
||||||
|
|
||||||
|
|
||||||
namespace pt
|
namespace pt
|
||||||
|
@ -48,13 +48,14 @@ namespace pt
|
||||||
void HTMLFilter::Item::Clear()
|
void HTMLFilter::Item::Clear()
|
||||||
{
|
{
|
||||||
name.clear();
|
name.clear();
|
||||||
type = none;
|
type = none;
|
||||||
is_commentary = false;
|
is_commentary = false;
|
||||||
porphans = nullptr;
|
porphans = nullptr;
|
||||||
new_line = false;
|
new_line_before = false;
|
||||||
|
new_line = false;
|
||||||
new_line_in_the_middle = false;
|
new_line_in_the_middle = false;
|
||||||
has_body_tag = false;
|
has_body_tag = false;
|
||||||
tree_index = 0;
|
tree_index = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -74,7 +75,7 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
|
||||||
|
|
||||||
stack_len = 0;
|
stack_len = 0;
|
||||||
out_string = &out;
|
out_string = &out;
|
||||||
last_new_line = false;
|
//last_new_line = false;
|
||||||
was_ending_commentary = false;
|
was_ending_commentary = false;
|
||||||
line_len = 0;
|
line_len = 0;
|
||||||
out_string->clear();
|
out_string->clear();
|
||||||
|
@ -382,15 +383,15 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void HTMLFilter::CheckNewLine()
|
//void HTMLFilter::CheckNewLine()
|
||||||
{
|
//{
|
||||||
if( white_mode == WHITE_MODE_TREE )
|
// if( white_mode == WHITE_MODE_TREE )
|
||||||
{
|
// {
|
||||||
SkipWhite();
|
// SkipWhite();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
last_new_line = (lastc==10);
|
// last_new_line = (lastc==10);
|
||||||
}
|
//}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -440,7 +441,7 @@ void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
|
||||||
was_closing_tag = true;
|
was_closing_tag = true;
|
||||||
|
|
||||||
PopStack();
|
PopStack();
|
||||||
CheckNewLine();
|
//CheckNewLine();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -857,17 +858,17 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool HTMLFilter::PutNormalWhite()
|
void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
|
||||||
{
|
{
|
||||||
bool was_white_char = false;
|
was_white_char = false;
|
||||||
bool was_new_line = false;
|
was_new_line = false;
|
||||||
|
|
||||||
while( lastc == 10 || IsWhite(lastc) )
|
while( lastc == 10 || IsWhite(lastc) )
|
||||||
{
|
{
|
||||||
was_white_char = true; // anyone white char even new line
|
|
||||||
|
|
||||||
if( lastc == 10 )
|
if( lastc == 10 )
|
||||||
was_new_line = true;
|
was_new_line = true;
|
||||||
|
else
|
||||||
|
was_white_char = true;
|
||||||
|
|
||||||
if( white_mode == WHITE_MODE_ORIGIN )
|
if( white_mode == WHITE_MODE_ORIGIN )
|
||||||
{
|
{
|
||||||
|
@ -877,18 +878,12 @@ bool HTMLFilter::PutNormalWhite()
|
||||||
read_char();
|
read_char();
|
||||||
}
|
}
|
||||||
|
|
||||||
if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char )
|
if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
|
||||||
{
|
{
|
||||||
Put(' ');
|
Put(' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
if( white_mode == WHITE_MODE_TREE && was_new_line )
|
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
|
||||||
{
|
|
||||||
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
|
|
||||||
}
|
|
||||||
|
|
||||||
last_new_line = was_new_line;
|
|
||||||
return was_white_char;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -955,7 +950,7 @@ bool HTMLFilter::PutOpeningTag()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( white_mode == WHITE_MODE_TREE && last_new_line )
|
if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
|
||||||
{
|
{
|
||||||
Put(10);
|
Put(10);
|
||||||
PutTabs(LastItem().tree_index);
|
PutTabs(LastItem().tree_index);
|
||||||
|
@ -991,7 +986,7 @@ void HTMLFilter::PutTabs(size_t len)
|
||||||
if( len > 30 )
|
if( len > 30 )
|
||||||
len = 30;
|
len = 30;
|
||||||
|
|
||||||
for(int i=0 ; i < (len*tab_size) ; ++i)
|
for(size_t i=0 ; i < (len*tab_size) ; ++i)
|
||||||
(*out_string) += ' '; // we do not add them to 'line_len'
|
(*out_string) += ' '; // we do not add them to 'line_len'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1010,15 +1005,6 @@ void HTMLFilter::PutNonBreakingSpace()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//void HTMLFilter::PutNewLine()
|
|
||||||
//{
|
|
||||||
// buffer[0] = 10; // CHECKME for what purpose is this buffer?
|
|
||||||
// Put(10);
|
|
||||||
// line_len = 0;
|
|
||||||
//}
|
|
||||||
|
|
||||||
|
|
||||||
// we assume the size of the opening mark to be one
|
// we assume the size of the opening mark to be one
|
||||||
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
|
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
|
||||||
{
|
{
|
||||||
|
@ -1063,22 +1049,6 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
|
|
||||||
//{
|
|
||||||
//static wchar_t comm_open[] = L"<!--";
|
|
||||||
//size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
|
|
||||||
//
|
|
||||||
// //return IsNameEqual(pchar, comm_open, comm_open_len);
|
|
||||||
// return false;
|
|
||||||
//}
|
|
||||||
//
|
|
||||||
//
|
|
||||||
//size_t HTMLFilter::OpeningCommentaryTagMarkSize()
|
|
||||||
//{
|
|
||||||
// return 4; // size of "<!--"
|
|
||||||
//}
|
|
||||||
|
|
||||||
|
|
||||||
bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
|
bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
|
||||||
{
|
{
|
||||||
static wchar_t comm_end[] = L"-->";
|
static wchar_t comm_end[] = L"-->";
|
||||||
|
@ -1106,35 +1076,12 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// skipping the commentary tag if exists
|
|
||||||
bool HTMLFilter::SkipCommentaryTagIfExists()
|
|
||||||
{
|
|
||||||
wchar_t comm_close[] = L"-->";
|
|
||||||
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
|
|
||||||
/*
|
|
||||||
if( !IsOpeningCommentaryTagMark(pchar) )
|
|
||||||
return false;
|
|
||||||
|
|
||||||
pchar += OpeningCommentaryTagMarkSize();
|
|
||||||
|
|
||||||
// looking for "-->"
|
|
||||||
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
|
|
||||||
++pchar;
|
|
||||||
|
|
||||||
if( *pchar!= 0 )
|
|
||||||
pchar += comm_close_len;
|
|
||||||
|
|
||||||
CheckNewLine();
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// reading text between html tags
|
// reading text between html tags
|
||||||
void HTMLFilter::ReadNormalText()
|
void HTMLFilter::ReadText()
|
||||||
{
|
{
|
||||||
|
bool was_white_char = false;
|
||||||
|
bool was_new_line = false;
|
||||||
|
|
||||||
bool was_non_white_text = false;
|
bool was_non_white_text = false;
|
||||||
|
|
||||||
was_ending_commentary = false;
|
was_ending_commentary = false;
|
||||||
|
@ -1175,13 +1122,16 @@ void HTMLFilter::ReadNormalText()
|
||||||
if( was_ending_commentary )
|
if( was_ending_commentary )
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if( PutNormalWhite() && white_mode == WHITE_MODE_TREE )
|
PutNormalWhite(was_white_char, was_new_line);
|
||||||
|
|
||||||
|
if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE )
|
||||||
{
|
{
|
||||||
if( last_new_line )
|
allow_put_new_line = false;
|
||||||
|
allow_put_space = false;
|
||||||
|
|
||||||
|
if( was_new_line )
|
||||||
{
|
{
|
||||||
allow_put_new_line = true;
|
allow_put_new_line = true;
|
||||||
allow_put_space = false;
|
|
||||||
|
|
||||||
LastItem().new_line_in_the_middle = true;
|
LastItem().new_line_in_the_middle = true;
|
||||||
|
|
||||||
if( !was_non_white_text )
|
if( !was_non_white_text )
|
||||||
|
@ -1189,7 +1139,6 @@ void HTMLFilter::ReadNormalText()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
allow_put_new_line = false;
|
|
||||||
allow_put_space = true;
|
allow_put_space = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1200,6 +1149,8 @@ void HTMLFilter::ReadNormalText()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
new_item_has_new_line_before = was_new_line;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1319,7 +1270,7 @@ void HTMLFilter::ReadItemSpecial()
|
||||||
|
|
||||||
if( !skip_tags )
|
if( !skip_tags )
|
||||||
{
|
{
|
||||||
if( white_mode == WHITE_MODE_TREE && last_new_line )
|
if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
|
||||||
{
|
{
|
||||||
Put(10);
|
Put(10);
|
||||||
PutTabs(LastItem().tree_index);
|
PutTabs(LastItem().tree_index);
|
||||||
|
@ -1351,6 +1302,13 @@ void HTMLFilter::ReadItemSpecial()
|
||||||
Put(' ');
|
Put(' ');
|
||||||
Put(tmp_text);
|
Put(tmp_text);
|
||||||
Put('>');
|
Put('>');
|
||||||
|
|
||||||
|
if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
|
||||||
|
{
|
||||||
|
Put(10);
|
||||||
|
Put(10);
|
||||||
|
SkipWhiteLines();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1399,6 +1357,8 @@ bool HTMLFilter::ReadItem()
|
||||||
if( !PushStack() )
|
if( !PushStack() )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
|
||||||
|
|
||||||
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
|
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
|
||||||
LastItem().tree_index += 1;
|
LastItem().tree_index += 1;
|
||||||
|
|
||||||
|
@ -1602,7 +1562,7 @@ int i;
|
||||||
pstack[z].Clear();
|
pstack[z].Clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
last_new_line = pstack[stack_len-1].new_line;
|
//last_new_line = pstack[stack_len-1].new_line;
|
||||||
|
|
||||||
// invalidate tags
|
// invalidate tags
|
||||||
stack_len = i;
|
stack_len = i;
|
||||||
|
@ -1661,7 +1621,7 @@ void HTMLFilter::CheckClosingTags()
|
||||||
}
|
}
|
||||||
|
|
||||||
PutClosingTag(pstack[stack_len-1]);
|
PutClosingTag(pstack[stack_len-1]);
|
||||||
last_new_line = pstack[stack_len-1].new_line;
|
//last_new_line = pstack[stack_len-1].new_line;
|
||||||
PopStack();
|
PopStack();
|
||||||
PopStack();
|
PopStack();
|
||||||
}
|
}
|
||||||
|
@ -1711,27 +1671,17 @@ void HTMLFilter::ReadLoop()
|
||||||
CheckExceptions();
|
CheckExceptions();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
if( LastItem().type == Item::special || LastItem().type == Item::simple )
|
if( LastItem().type == Item::special )
|
||||||
{
|
{
|
||||||
if( stack_len > 1 )
|
|
||||||
{
|
|
||||||
//pstack[stack_len-2].new_line = LastItem().new_line;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
if( white_mode == WHITE_MODE_TREE )
|
|
||||||
{
|
|
||||||
// one new line after a simple or special tag
|
|
||||||
// (if the tag has level 0 in the tree - this does not mean that it is the first tag)
|
|
||||||
// for example can be DOCTYPE
|
|
||||||
|
|
||||||
if( !LastItem().is_commentary )
|
|
||||||
Put(10);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( !LastItem().is_commentary )
|
if( !LastItem().is_commentary )
|
||||||
PopStack();
|
PopStack();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
if( LastItem().type == Item::simple )
|
||||||
|
{
|
||||||
|
PopStack();
|
||||||
|
}
|
||||||
|
else
|
||||||
if( LastItem().type == Item::closing )
|
if( LastItem().type == Item::closing )
|
||||||
{
|
{
|
||||||
CheckClosingTags();
|
CheckClosingTags();
|
||||||
|
@ -1741,7 +1691,8 @@ void HTMLFilter::ReadLoop()
|
||||||
PopStack();
|
PopStack();
|
||||||
}
|
}
|
||||||
|
|
||||||
ReadNormalText();
|
ReadText();
|
||||||
|
is_first_item = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1750,12 +1701,13 @@ void HTMLFilter::ReadLoop()
|
||||||
void HTMLFilter::Read()
|
void HTMLFilter::Read()
|
||||||
{
|
{
|
||||||
read_char(); // put first character to lastc
|
read_char(); // put first character to lastc
|
||||||
|
is_first_item = true;
|
||||||
|
|
||||||
if( white_mode != WHITE_MODE_ORIGIN )
|
if( white_mode != WHITE_MODE_ORIGIN )
|
||||||
SkipWhiteLines();
|
SkipWhiteLines();
|
||||||
|
|
||||||
// there may be some text or white lines before the first html tag (we print it)
|
// there may be some text or white lines before the first html tag (we print it)
|
||||||
ReadNormalText();
|
ReadText();
|
||||||
|
|
||||||
// reading the whole html source
|
// reading the whole html source
|
||||||
ReadLoop();
|
ReadLoop();
|
||||||
|
|
|
@ -204,6 +204,8 @@ protected:
|
||||||
|
|
||||||
bool is_commentary;
|
bool is_commentary;
|
||||||
|
|
||||||
|
bool new_line_before;
|
||||||
|
|
||||||
// is there a new line after this tag
|
// is there a new line after this tag
|
||||||
bool new_line;
|
bool new_line;
|
||||||
|
|
||||||
|
@ -244,15 +246,12 @@ protected:
|
||||||
virtual bool IsStartingEntityMark(wchar_t c);
|
virtual bool IsStartingEntityMark(wchar_t c);
|
||||||
virtual bool IsEndingEntityMark(wchar_t c);
|
virtual bool IsEndingEntityMark(wchar_t c);
|
||||||
|
|
||||||
// virtual bool IsOpeningCommentaryTagMark(const wchar_t * str);
|
|
||||||
// virtual size_t OpeningCommentaryTagMarkSize();
|
|
||||||
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
|
virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
|
||||||
|
|
||||||
virtual bool IsValidCharForName(int c);
|
virtual bool IsValidCharForName(int c);
|
||||||
virtual bool IsValidCharForAttrName(int c);
|
virtual bool IsValidCharForAttrName(int c);
|
||||||
virtual bool IsValidCharForEntityName(int c);
|
virtual bool IsValidCharForEntityName(int c);
|
||||||
virtual void CheckExceptions();
|
virtual void CheckExceptions();
|
||||||
virtual bool SkipCommentaryTagIfExists();
|
|
||||||
|
|
||||||
virtual void Put(wchar_t c);
|
virtual void Put(wchar_t c);
|
||||||
virtual void Put(const wchar_t * str, const wchar_t * end);
|
virtual void Put(const wchar_t * str, const wchar_t * end);
|
||||||
|
@ -306,11 +305,10 @@ protected:
|
||||||
|
|
||||||
void PopStack();
|
void PopStack();
|
||||||
bool PushStack();
|
bool PushStack();
|
||||||
void CheckNewLine();
|
|
||||||
void CheckStackPrintRest();
|
void CheckStackPrintRest();
|
||||||
void AddForgottenTags();
|
void AddForgottenTags();
|
||||||
void CheckClosingTags();
|
void CheckClosingTags();
|
||||||
void ReadNormalText();
|
void ReadText();
|
||||||
bool PrintRest();
|
bool PrintRest();
|
||||||
bool PrintOpeningItem();
|
bool PrintOpeningItem();
|
||||||
void ReadItemName(std::wstring & name, bool clear_name = true);
|
void ReadItemName(std::wstring & name, bool clear_name = true);
|
||||||
|
@ -332,7 +330,7 @@ protected:
|
||||||
void CheckChar(wchar_t c);
|
void CheckChar(wchar_t c);
|
||||||
|
|
||||||
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
|
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
|
||||||
bool PutNormalWhite();
|
void PutNormalWhite(bool & was_white_char, bool & was_new_line);
|
||||||
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
|
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
|
||||||
void PutTabs(size_t len);
|
void PutTabs(size_t len);
|
||||||
void PutNonBreakingSpace();
|
void PutNonBreakingSpace();
|
||||||
|
@ -343,8 +341,10 @@ protected:
|
||||||
size_t stack_len; // length of the stack
|
size_t stack_len; // length of the stack
|
||||||
wchar_t * buffer; // buffer used when printing
|
wchar_t * buffer; // buffer used when printing
|
||||||
std::wstring * out_string;
|
std::wstring * out_string;
|
||||||
bool last_new_line;
|
//bool last_new_line;
|
||||||
|
bool new_item_has_new_line_before;
|
||||||
int white_mode;
|
int white_mode;
|
||||||
|
bool is_first_item;
|
||||||
size_t wrap_line; // insert a new line character into long lines
|
size_t wrap_line; // insert a new line character into long lines
|
||||||
size_t tab_size;
|
size_t tab_size;
|
||||||
bool was_ending_commentary;
|
bool was_ending_commentary;
|
||||||
|
|
Loading…
Reference in New Issue