added to HTMLFilter:

- now we can parse both " and ' as quote characters around html attribute values
- we can skip html tags and comments (commentaries); a new method was added:
  void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
- there is a new virtual method: virtual void ItemFound();
  which is called when an html tag has been parsed




git-svn-id: svn://ttmath.org/publicrep/winix/trunk@1129 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Tomasz Sowa 2018-10-23 23:28:45 +00:00
parent d1e7765e98
commit 027a8ec428
3 changed files with 130 additions and 69 deletions

File diff suppressed because one or more lines are too long

View File

@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2008-2014, Tomasz Sowa * Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -97,17 +97,25 @@ size_t out_projected_len = in.size() * 2 + 1;
} }
HTMLFilter::HTMLFilter() void HTMLFilter::SetSomeDefaults()
{ {
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
tab_size = 2; tab_size = 2;
trim_white = false; trim_white = false;
break_after = 0; break_after = 0;
wrap_line = 0; wrap_line = 0;
orphan_mode = orphan_nbsp; orphan_mode = orphan_nbsp;
safe_mode = false; safe_mode = false;
skip_all_tags = false;
skip_commentaries = false;
}
HTMLFilter::HTMLFilter()
{
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
SetSomeDefaults();
} }
@ -116,6 +124,8 @@ HTMLFilter::HTMLFilter(const HTMLFilter & f)
// don't need to copy the stack // don't need to copy the stack
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
SetSomeDefaults();
} }
@ -125,6 +135,8 @@ HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
// we can copy some fields from f
return *this; return *this;
} }
@ -136,6 +148,8 @@ HTMLFilter::~HTMLFilter()
} }
void HTMLFilter::BreakWord(size_t break_after_) void HTMLFilter::BreakWord(size_t break_after_)
{ {
break_after = break_after_; break_after = break_after_;
@ -224,6 +238,13 @@ void HTMLFilter::SafeMode(bool safe_mode_)
} }
// Sets the two skipping flags of the filter.
//
// skip_all_tags     - when true the code guarded by this flag does not print
//                     html tags (opening tags, attributes and closing tags),
//                     so only the text without markup is produced
// skip_commentaries - when true ReadNormalText() does not copy html
//                     commentaries to the output
//
// Both flags are reset to false by SetSomeDefaults().
void HTMLFilter::SkipAllTags(bool skip_all_tags, bool skip_commentaries)
{
	// the parameters have the same names as the members, so this-> is required
	this->skip_all_tags     = skip_all_tags;
	this->skip_commentaries = skip_commentaries;
}
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name) void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{ {
no_filter_tag = tag_name; no_filter_tag = tag_name;
@ -374,9 +395,10 @@ return false;
// used for such tags as: script, pre, textarea // used for such tags as: script, pre, textarea
void HTMLFilter::PutLastTagWithClosingTag() void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
{ {
const wchar_t * start = pchar; const wchar_t * start = pchar;
const wchar_t * end = pchar;
while( *pchar != 0 ) while( *pchar != 0 )
{ {
@ -384,32 +406,9 @@ const wchar_t * start = pchar;
{ {
if( IsClosingTagForLastItem() ) if( IsClosingTagForLastItem() )
{ {
PopStack(); if( put_closing_tag_as_well )
CheckNewLine(); end = pchar;
break;
}
}
else
{
pchar += 1;
}
}
Put(start, pchar);
}
// used with <nofilter> </nofilter> tags
void HTMLFilter::PutTextBetweenLastTagWithClosingTag()
{
const wchar_t * start = pchar, * end = pchar;
while( *pchar != 0 )
{
if( IsOpeningTagMark() )
{
if( IsClosingTagForLastItem() )
{
PopStack(); PopStack();
CheckNewLine(); CheckNewLine();
break; break;
@ -427,16 +426,36 @@ const wchar_t * start = pchar, * end = pchar;
void HTMLFilter::SkipAndCheckClosingTag() void HTMLFilter::SkipAndCheckClosingTag()
{ {
bool is_quoted = false;
wchar_t quote_char = 0;
for( ; *pchar ; ++pchar ) for( ; *pchar ; ++pchar )
{ {
if( LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/' if( *pchar == '"' || *pchar == '\'' )
{
if( is_quoted )
{
if( *pchar == quote_char )
{
is_quoted = false;
}
}
else
{
is_quoted = true;
quote_char = *pchar;
}
}
else
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
{ {
LastItem().type = Item::simple; LastItem().type = Item::simple;
} }
else
if( IsClosingTagMark() ) if( !is_quoted && IsClosingTagMark() )
{ {
++pchar; ++pchar;
break; break;
@ -502,18 +521,26 @@ size_t i;
void HTMLFilter::ReadItemAttrValue(bool has_quote) void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
{ {
size_t i; size_t i;
// sprawdzic to wszedzie bo teraz jest tablica
attr_value.clear(); attr_value.clear();
attr_value_temp.clear(); attr_value_temp.clear();
// !! dodac obsluge pojedynczego cudzyslowu for(i=0 ; *pchar ; ++i, ++pchar )
for(i=0 ; *pchar && *pchar != '\"' && !IsClosingTagMark() && (has_quote || (*pchar!=10 && !IsWhite(*pchar)) ); ++i )
{ {
if( has_quote )
{
if( *pchar == quote_char )
break;
}
else
{
if( IsClosingTagMark() || *pchar == 10 || IsWhite(*pchar) )
break;
}
if( *pchar==10 || IsWhite(*pchar) ) if( *pchar==10 || IsWhite(*pchar) )
{ {
if( !attr_value_temp.empty() ) if( !attr_value_temp.empty() )
@ -524,9 +551,9 @@ size_t i;
} }
else else
if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
{
attr_value_temp += *pchar; attr_value_temp += *pchar;
}
++pchar;
} }
if( !attr_value_temp.empty() ) if( !attr_value_temp.empty() )
@ -795,8 +822,8 @@ void HTMLFilter::PutClosingTagMark()
// !! zmienic na lepsza nazwe // !! IMPROVE ME change to a better name
// bo to nie zwraca true jesli tag jest safe // this function does not return true when the tag is safe
bool HTMLFilter::IsTagSafe(const wchar_t * tag) bool HTMLFilter::IsTagSafe(const wchar_t * tag)
{ {
if( !safe_mode ) if( !safe_mode )
@ -838,9 +865,10 @@ bool HTMLFilter::IsTagSafe(const std::wstring & tag)
bool HTMLFilter::PutOpeningTag() bool HTMLFilter::PutOpeningTag()
{ {
if( !IsTagSafe(LastItem().name) ) if( !IsTagSafe(LastItem().name) )
// !! IMPROVE ME {
// !! dodac tutaj skipniecie calego tagu SkipAndCheckClosingTag();
return false; return false;
}
PutOpeningTagMark(); PutOpeningTagMark();
Put(LastItem().name); Put(LastItem().name);
@ -852,7 +880,7 @@ return true;
void HTMLFilter::PutClosingTag(const wchar_t * tag) void HTMLFilter::PutClosingTag(const wchar_t * tag)
{ {
if( !IsTagSafe(tag) ) if( skip_all_tags || !IsTagSafe(tag) )
return; return;
PutOpeningTagMark(); PutOpeningTagMark();
@ -1005,9 +1033,19 @@ const wchar_t * last_non_white = pchar;
while( *pchar != 0 ) while( *pchar != 0 )
{ {
const wchar_t * commentary_start = pchar;
if( SkipCommentaryTagIfExists() ) if( SkipCommentaryTagIfExists() )
{ {
last_non_white = pchar - 1; // pointing at the last '>' from a commentary last_non_white = pchar - 1; // pointing at the last '>' from a commentary
PutNormalText(start, commentary_start);
if( !skip_commentaries )
{
PutNormalText(commentary_start, pchar);
}
start = pchar;
} }
else else
{ {
@ -1029,7 +1067,7 @@ const wchar_t * last_non_white = pchar;
bool HTMLFilter::PrintOpeningItem() bool HTMLFilter::PrintOpeningItem()
{ {
if( IsNameEqual(no_filter_tag, LastItem().name) ) if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return true; return true;
if( last_new_line ) if( last_new_line )
@ -1068,16 +1106,15 @@ bool HTMLFilter::ReadItemAttr()
pchar += 1; // skipping '=' pchar += 1; // skipping '='
SkipWhiteLines(); SkipWhiteLines();
bool has_quote = (*pchar == '\"' || *pchar == '\'');
// !! dodac obsluge pojedynczego cudzyslowu wchar_t quote_char = *pchar;
bool has_quote = (*pchar == '\"');
if( has_quote ) if( has_quote )
pchar += 1; // skipping the first quote mark pchar += 1; // skipping the first quote mark
ReadItemAttrValue(has_quote); ReadItemAttrValue(has_quote, quote_char);
if( *pchar == '\"' ) if( has_quote && *pchar == quote_char )
pchar += 1; // skipping the last quote mark pchar += 1; // skipping the last quote mark
return true; return true;
@ -1112,7 +1149,7 @@ void HTMLFilter::PrintItemAttr()
{ {
size_t i; size_t i;
if( IsNameEqual(no_filter_tag, LastItem().name) ) if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return; return;
Put(' '); Put(' ');
@ -1150,11 +1187,17 @@ void HTMLFilter::ReadItemClosing()
void HTMLFilter::ReadItemSpecial() void HTMLFilter::ReadItemSpecial()
{ {
LastItem().type = Item::special; LastItem().type = Item::special;
PutOpeningTagMark();
if( !skip_all_tags )
PutOpeningTagMark();
const wchar_t * start = pchar; const wchar_t * start = pchar;
pchar += 1; // skipping '!'
ReadItemName();
SkipAndCheckClosingTag(); SkipAndCheckClosingTag();
if( pchar > start ) if( !skip_all_tags && pchar > start )
Put(start, pchar); Put(start, pchar);
// closing tag mark is printed directly from the source // closing tag mark is printed directly from the source
@ -1176,7 +1219,7 @@ void HTMLFilter::ReadItemOpening()
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
if( !IsNameEqual(no_filter_tag, LastItem().name) ) if( !skip_all_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
{ {
if( LastItem().type == Item::simple ) if( LastItem().type == Item::simple )
Put(L" /"); Put(L" /");
@ -1187,6 +1230,11 @@ void HTMLFilter::ReadItemOpening()
} }
// Hook called by ReadItem() after an item (html tag) has been read,
// just before ReadItem() returns true.
// The method is declared virtual in the class; this default implementation
// intentionally does nothing.
void HTMLFilter::ItemFound()
{
}
bool HTMLFilter::ReadItem() bool HTMLFilter::ReadItem()
{ {
if( *pchar == 0 ) if( *pchar == 0 )
@ -1209,6 +1257,8 @@ bool HTMLFilter::ReadItem()
CheckNewLine(); CheckNewLine();
LastItem().new_line = last_new_line; LastItem().new_line = last_new_line;
ItemFound();
return true; return true;
} }
@ -1332,13 +1382,13 @@ void HTMLFilter::CheckExceptions()
// in safe_mode the script tag is ignored // in safe_mode the script tag is ignored
if( !safe_mode && IsLastTag(L"script") ) if( !safe_mode && IsLastTag(L"script") )
PutLastTagWithClosingTag(); PutEverythingUntilClosingTag(!skip_all_tags);
if( IsLastTag(L"pre") || IsLastTag(L"textarea") ) if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
PutLastTagWithClosingTag(); PutEverythingUntilClosingTag(!skip_all_tags);
if( IsLastTag(no_filter_tag) ) if( IsLastTag(no_filter_tag) )
PutTextBetweenLastTagWithClosingTag(); PutEverythingUntilClosingTag(false);
if( IsLastTag(L"body") ) if( IsLastTag(L"body") )
LastItem().has_body_tag = true; LastItem().has_body_tag = true;
@ -1371,7 +1421,7 @@ int i;
for(int z=(int)stack_len-2 ; z>=i ; --z) for(int z=(int)stack_len-2 ; z>=i ; --z)
{ {
if( pstack[z].new_line ) if( !skip_all_tags && pstack[z].new_line )
{ {
PutNewLine(); PutNewLine();
PutTabs(z); PutTabs(z);
@ -1421,7 +1471,7 @@ void HTMLFilter::CheckClosingTags()
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
{ {
// last closing tag is from the previous one // last closing tag is from the previous one
if( pstack[stack_len-2].new_line ) if( !skip_all_tags && pstack[stack_len-2].new_line )
{ {
PutNewLine(); PutNewLine();
PutTabs(stack_len-2); PutTabs(stack_len-2);
@ -1444,7 +1494,7 @@ bool HTMLFilter::PrintRest()
const wchar_t * start = pchar; const wchar_t * start = pchar;
// in safe mode we do not print the rest html code // in safe mode we do not print the rest html code
if( safe_mode ) if( safe_mode || skip_all_tags )
return false; return false;
while( *pchar ) while( *pchar )
@ -1474,7 +1524,7 @@ void HTMLFilter::ReadLoop()
{ {
if( stack_len > 1 ) if( stack_len > 1 )
{ {
pstack[stack_len-2].new_line = LastItem().new_line; //pstack[stack_len-2].new_line = LastItem().new_line;
} }
else else
if( trim_white ) if( trim_white )
@ -1492,6 +1542,10 @@ void HTMLFilter::ReadLoop()
{ {
CheckClosingTags(); CheckClosingTags();
} }
else
{
PopStack();
}
ReadNormalText(); ReadNormalText();
} }

View File

@ -5,7 +5,7 @@
*/ */
/* /*
* Copyright (c) 2008-2014, Tomasz Sowa * Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -99,7 +99,7 @@ public:
HTMLFilter(); HTMLFilter();
HTMLFilter(const HTMLFilter & f); HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f); HTMLFilter & operator=(const HTMLFilter & f);
~HTMLFilter(); virtual ~HTMLFilter();
// main methods used for filtering // main methods used for filtering
@ -156,6 +156,9 @@ public:
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...) // (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_); void SafeMode(bool safe_mode_);
// skip all html tags
// gives only text without markup
void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
protected: protected:
@ -239,10 +242,13 @@ protected:
virtual void PutNormalText(const wchar_t * str, const wchar_t * end); virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white); virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
virtual void ItemFound();
/* /*
others others
*/ */
void SetSomeDefaults();
Item & GetItem(size_t i); Item & GetItem(size_t i);
Item & LastItem(); Item & LastItem();
@ -288,7 +294,7 @@ protected:
bool PrintOpeningItem(); bool PrintOpeningItem();
void ReadItemName(); void ReadItemName();
void ReadItemAttrName(); void ReadItemAttrName();
void ReadItemAttrValue(bool has_quote); void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
bool ReadItemAttr(); bool ReadItemAttr();
bool CheckItemAttr(); bool CheckItemAttr();
@ -307,8 +313,7 @@ protected:
bool HasSemiloconAround(const wchar_t * str, const wchar_t * end); bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);
void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end); void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
void PutNormalWhite(const wchar_t * & str, const wchar_t * end); void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
void PutLastTagWithClosingTag(); void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
void PutTextBetweenLastTagWithClosingTag();
void PutTabs(size_t len); void PutTabs(size_t len);
void PutNonBreakingSpace(); void PutNonBreakingSpace();
void PutNewLine(); void PutNewLine();
@ -335,6 +340,8 @@ protected:
size_t line_len; //length of the current line (without first spaces which create the html tree) size_t line_len; //length of the current line (without first spaces which create the html tree)
bool safe_mode; // skipping some unsafe tags bool safe_mode; // skipping some unsafe tags
Orphans orphans_temp; Orphans orphans_temp;
bool skip_all_tags;
bool skip_commentaries;
}; };