added to HTMLFilter:

- now we can parse both " and ' as quote characters in HTML attributes
- we can skip HTML tags and comments; added method:
  void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
- there is a new virtual method: virtual void ItemFound();
  which is called when an HTML tag is parsed




git-svn-id: svn://ttmath.org/publicrep/winix/trunk@1129 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Tomasz Sowa 2018-10-23 23:28:45 +00:00
parent d1e7765e98
commit 027a8ec428
3 changed files with 130 additions and 69 deletions

File diff suppressed because one or more lines are too long

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2008-2014, Tomasz Sowa
* Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -97,17 +97,25 @@ size_t out_projected_len = in.size() * 2 + 1;
}
HTMLFilter::HTMLFilter()
// Sets the filter's configuration fields to their default values.
// HTMLFilter::HTMLFilter() calls this right after allocating pstack and buffer.
void HTMLFilter::SetSomeDefaults()
{
// NOTE(review): HTMLFilter::HTMLFilter() allocates pstack and buffer itself
// before calling SetSomeDefaults(); if the two allocations below really are
// part of this function, those earlier arrays would be overwritten without
// being freed -- confirm that these two lines belong here.
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
tab_size = 2;
trim_white = false;
break_after = 0; // can be changed later with BreakWord()
wrap_line = 0;
orphan_mode = orphan_nbsp;
safe_mode = false; // can be changed later with SafeMode()
// both skip flags are off by default, they can be changed with SkipAllTags()
skip_all_tags = false;
skip_commentaries = false;
}
// Default constructor: allocates the items stack and the character buffer
// and then sets the default values of the configuration fields.
HTMLFilter::HTMLFilter()
{
// an array of WINIX_HTMLFILTER_STACK_MAXLEN Item objects used as the stack
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
// an array of WINIX_HTMLFILTER_BUFFER_MAXLEN wide characters
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
SetSomeDefaults();
}
@ -116,6 +124,8 @@ HTMLFilter::HTMLFilter(const HTMLFilter & f)
// don't need to copy the stack
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
SetSomeDefaults();
}
@ -125,6 +135,8 @@ HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
// we can copy some fields from f
return *this;
}
@ -136,6 +148,8 @@ HTMLFilter::~HTMLFilter()
}
void HTMLFilter::BreakWord(size_t break_after_)
{
break_after = break_after_;
@ -224,6 +238,13 @@ void HTMLFilter::SafeMode(bool safe_mode_)
}
// Stores the two "skip" options used later by the filter.
//
// skip_all_tags     - when true the methods which print tags (opening tags,
//                     closing tags, attributes, special items) put nothing
//                     to the output
// skip_commentaries - when true the text of an html commentary found in
//                     normal text is not copied to the output
void HTMLFilter::SkipAllTags(bool skip_all_tags, bool skip_commentaries)
{
// the parameters have the same names as the member fields,
// so the fields have to be accessed through this->
this->skip_all_tags = skip_all_tags;
this->skip_commentaries = skip_commentaries;
}
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{
no_filter_tag = tag_name;
@ -374,9 +395,10 @@ return false;
// used for such tags as: script, pre, textarea
void HTMLFilter::PutLastTagWithClosingTag()
void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
{
const wchar_t * start = pchar;
const wchar_t * end = pchar;
while( *pchar != 0 )
{
@ -384,32 +406,9 @@ const wchar_t * start = pchar;
{
if( IsClosingTagForLastItem() )
{
PopStack();
CheckNewLine();
break;
}
}
else
{
pchar += 1;
}
}
if( put_closing_tag_as_well )
end = pchar;
Put(start, pchar);
}
// used with <nofilter> </nofilter> tags
void HTMLFilter::PutTextBetweenLastTagWithClosingTag()
{
const wchar_t * start = pchar, * end = pchar;
while( *pchar != 0 )
{
if( IsOpeningTagMark() )
{
if( IsClosingTagForLastItem() )
{
PopStack();
CheckNewLine();
break;
@ -427,16 +426,36 @@ const wchar_t * start = pchar, * end = pchar;
void HTMLFilter::SkipAndCheckClosingTag()
{
bool is_quoted = false;
wchar_t quote_char = 0;
for( ; *pchar ; ++pchar )
{
if( LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
if( *pchar == '"' || *pchar == '\'' )
{
if( is_quoted )
{
if( *pchar == quote_char )
{
is_quoted = false;
}
}
else
{
is_quoted = true;
quote_char = *pchar;
}
}
else
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
{
LastItem().type = Item::simple;
}
if( IsClosingTagMark() )
else
if( !is_quoted && IsClosingTagMark() )
{
++pchar;
break;
@ -502,18 +521,26 @@ size_t i;
void HTMLFilter::ReadItemAttrValue(bool has_quote)
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
{
size_t i;
// sprawdzic to wszedzie bo teraz jest tablica
attr_value.clear();
attr_value_temp.clear();
// !! dodac obsluge pojedynczego cudzyslowu
for(i=0 ; *pchar && *pchar != '\"' && !IsClosingTagMark() && (has_quote || (*pchar!=10 && !IsWhite(*pchar)) ); ++i )
for(i=0 ; *pchar ; ++i, ++pchar )
{
if( has_quote )
{
if( *pchar == quote_char )
break;
}
else
{
if( IsClosingTagMark() || *pchar == 10 || IsWhite(*pchar) )
break;
}
if( *pchar==10 || IsWhite(*pchar) )
{
if( !attr_value_temp.empty() )
@ -524,9 +551,9 @@ size_t i;
}
else
if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
{
attr_value_temp += *pchar;
++pchar;
}
}
if( !attr_value_temp.empty() )
@ -795,8 +822,8 @@ void HTMLFilter::PutClosingTagMark()
// !! zmienic na lepsza nazwe
// bo to nie zwraca true jesli tag jest safe
// !! IMPROVE ME change to a better name
// this function does not return true when the tag is safe
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
{
if( !safe_mode )
@ -838,9 +865,10 @@ bool HTMLFilter::IsTagSafe(const std::wstring & tag)
bool HTMLFilter::PutOpeningTag()
{
if( !IsTagSafe(LastItem().name) )
// !! IMPROVE ME
// !! dodac tutaj skipniecie calego tagu
{
SkipAndCheckClosingTag();
return false;
}
PutOpeningTagMark();
Put(LastItem().name);
@ -852,7 +880,7 @@ return true;
void HTMLFilter::PutClosingTag(const wchar_t * tag)
{
if( !IsTagSafe(tag) )
if( skip_all_tags || !IsTagSafe(tag) )
return;
PutOpeningTagMark();
@ -1005,9 +1033,19 @@ const wchar_t * last_non_white = pchar;
while( *pchar != 0 )
{
const wchar_t * commentary_start = pchar;
if( SkipCommentaryTagIfExists() )
{
last_non_white = pchar - 1; // pointing at the last '>' from a commentary
PutNormalText(start, commentary_start);
if( !skip_commentaries )
{
PutNormalText(commentary_start, pchar);
}
start = pchar;
}
else
{
@ -1029,7 +1067,7 @@ const wchar_t * last_non_white = pchar;
bool HTMLFilter::PrintOpeningItem()
{
if( IsNameEqual(no_filter_tag, LastItem().name) )
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return true;
if( last_new_line )
@ -1068,16 +1106,15 @@ bool HTMLFilter::ReadItemAttr()
pchar += 1; // skipping '='
SkipWhiteLines();
// !! dodac obsluge pojedynczego cudzyslowu
bool has_quote = (*pchar == '\"');
bool has_quote = (*pchar == '\"' || *pchar == '\'');
wchar_t quote_char = *pchar;
if( has_quote )
pchar += 1; // skipping the first quote mark
ReadItemAttrValue(has_quote);
ReadItemAttrValue(has_quote, quote_char);
if( *pchar == '\"' )
if( has_quote && *pchar == quote_char )
pchar += 1; // skipping the last quote mark
return true;
@ -1112,7 +1149,7 @@ void HTMLFilter::PrintItemAttr()
{
size_t i;
if( IsNameEqual(no_filter_tag, LastItem().name) )
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return;
Put(' ');
@ -1150,11 +1187,17 @@ void HTMLFilter::ReadItemClosing()
void HTMLFilter::ReadItemSpecial()
{
LastItem().type = Item::special;
PutOpeningTagMark();
if( !skip_all_tags )
PutOpeningTagMark();
const wchar_t * start = pchar;
pchar += 1; // skipping '!'
ReadItemName();
SkipAndCheckClosingTag();
if( pchar > start )
if( !skip_all_tags && pchar > start )
Put(start, pchar);
// closing tag mark is printed directly from the source
@ -1176,7 +1219,7 @@ void HTMLFilter::ReadItemOpening()
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
if( !IsNameEqual(no_filter_tag, LastItem().name) )
if( !skip_all_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
{
if( LastItem().type == Item::simple )
Put(L" /");
@ -1187,6 +1230,11 @@ void HTMLFilter::ReadItemOpening()
}
// A hook called from ReadItem() each time an item (html tag) has been read.
// It is declared virtual in the class so a derived class can override it;
// this default implementation intentionally does nothing.
void HTMLFilter::ItemFound()
{
}
bool HTMLFilter::ReadItem()
{
if( *pchar == 0 )
@ -1209,6 +1257,8 @@ bool HTMLFilter::ReadItem()
CheckNewLine();
LastItem().new_line = last_new_line;
ItemFound();
return true;
}
@ -1332,13 +1382,13 @@ void HTMLFilter::CheckExceptions()
// in safe_mode the script tag is ignored
if( !safe_mode && IsLastTag(L"script") )
PutLastTagWithClosingTag();
PutEverythingUntilClosingTag(!skip_all_tags);
if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
PutLastTagWithClosingTag();
PutEverythingUntilClosingTag(!skip_all_tags);
if( IsLastTag(no_filter_tag) )
PutTextBetweenLastTagWithClosingTag();
PutEverythingUntilClosingTag(false);
if( IsLastTag(L"body") )
LastItem().has_body_tag = true;
@ -1371,7 +1421,7 @@ int i;
for(int z=(int)stack_len-2 ; z>=i ; --z)
{
if( pstack[z].new_line )
if( !skip_all_tags && pstack[z].new_line )
{
PutNewLine();
PutTabs(z);
@ -1421,7 +1471,7 @@ void HTMLFilter::CheckClosingTags()
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
{
// last closing tag is from the previous one
if( pstack[stack_len-2].new_line )
if( !skip_all_tags && pstack[stack_len-2].new_line )
{
PutNewLine();
PutTabs(stack_len-2);
@ -1444,7 +1494,7 @@ bool HTMLFilter::PrintRest()
const wchar_t * start = pchar;
// in safe mode we do not print the rest html code
if( safe_mode )
if( safe_mode || skip_all_tags )
return false;
while( *pchar )
@ -1474,7 +1524,7 @@ void HTMLFilter::ReadLoop()
{
if( stack_len > 1 )
{
pstack[stack_len-2].new_line = LastItem().new_line;
//pstack[stack_len-2].new_line = LastItem().new_line;
}
else
if( trim_white )
@ -1492,6 +1542,10 @@ void HTMLFilter::ReadLoop()
{
CheckClosingTags();
}
else
{
PopStack();
}
ReadNormalText();
}

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2008-2014, Tomasz Sowa
* Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -99,7 +99,7 @@ public:
HTMLFilter();
HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f);
~HTMLFilter();
virtual ~HTMLFilter();
// main methods used for filtering
@ -156,6 +156,9 @@ public:
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_);
// skip all html tags
// gives only text without markup
void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
protected:
@ -239,10 +242,13 @@ protected:
virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
virtual void ItemFound();
/*
others
*/
void SetSomeDefaults();
Item & GetItem(size_t i);
Item & LastItem();
@ -288,7 +294,7 @@ protected:
bool PrintOpeningItem();
void ReadItemName();
void ReadItemAttrName();
void ReadItemAttrValue(bool has_quote);
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
bool ReadItemAttr();
bool CheckItemAttr();
@ -307,8 +313,7 @@ protected:
bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);
void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
void PutLastTagWithClosingTag();
void PutTextBetweenLastTagWithClosingTag();
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
void PutTabs(size_t len);
void PutNonBreakingSpace();
void PutNewLine();
@ -335,6 +340,8 @@ protected:
size_t line_len; //length of the current line (without first spaces which create the html tree)
bool safe_mode; // skipping some unsafe tags
Orphans orphans_temp;
bool skip_all_tags;
bool skip_commentaries;
};