added to HTMLFilter:
- the filter can now parse both " and ' as quote characters in html attribute values
- html tags and comments can be skipped; added method: void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
- added a virtual method: virtual void ItemFound(); which is called when an html tag has been parsed
git-svn-id: svn://ttmath.org/publicrep/winix/trunk@1129 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
parent
d1e7765e98
commit
027a8ec428
File diff suppressed because one or more lines are too long
|
@ -5,7 +5,7 @@
|
|||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2008-2014, Tomasz Sowa
|
||||
* Copyright (c) 2008-2018, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -97,17 +97,25 @@ size_t out_projected_len = in.size() * 2 + 1;
|
|||
}
|
||||
|
||||
|
||||
HTMLFilter::HTMLFilter()
|
||||
void HTMLFilter::SetSomeDefaults()
|
||||
{
|
||||
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
||||
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
|
||||
|
||||
tab_size = 2;
|
||||
trim_white = false;
|
||||
break_after = 0;
|
||||
wrap_line = 0;
|
||||
orphan_mode = orphan_nbsp;
|
||||
safe_mode = false;
|
||||
skip_all_tags = false;
|
||||
skip_commentaries = false;
|
||||
}
|
||||
|
||||
|
||||
// Default constructor.
//
// Allocates the item stack (WINIX_HTMLFILTER_STACK_MAXLEN entries) and the
// wide-character buffer (WINIX_HTMLFILTER_BUFFER_MAXLEN characters), then
// sets the option fields to their defaults through SetSomeDefaults().
HTMLFilter::HTMLFilter()
	: pstack(new Item[WINIX_HTMLFILTER_STACK_MAXLEN]),
	  buffer(new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN])
{
	SetSomeDefaults();
}
|
||||
|
||||
|
||||
|
@ -116,6 +124,8 @@ HTMLFilter::HTMLFilter(const HTMLFilter & f)
|
|||
// don't need to copy the stack
|
||||
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
||||
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
|
||||
|
||||
SetSomeDefaults();
|
||||
}
|
||||
|
||||
|
||||
|
@ -125,6 +135,8 @@ HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
|
|||
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
||||
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
|
||||
|
||||
// we can copy some fields from f
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -136,6 +148,8 @@ HTMLFilter::~HTMLFilter()
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void HTMLFilter::BreakWord(size_t break_after_)
|
||||
{
|
||||
break_after = break_after_;
|
||||
|
@ -224,6 +238,13 @@ void HTMLFilter::SafeMode(bool safe_mode_)
|
|||
}
|
||||
|
||||
|
||||
// Stores the two "skip" options of the filter.
//
// skip_all_tags     - new value of the skip_all_tags option
// skip_commentaries - new value of the skip_commentaries option
//
// The parameters have the same names as the member fields, so the members
// are referred to by their qualified names.
void HTMLFilter::SkipAllTags(bool skip_all_tags, bool skip_commentaries)
{
	HTMLFilter::skip_all_tags = skip_all_tags;
	HTMLFilter::skip_commentaries = skip_commentaries;
}
|
||||
|
||||
|
||||
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
|
||||
{
|
||||
no_filter_tag = tag_name;
|
||||
|
@ -374,9 +395,10 @@ return false;
|
|||
|
||||
|
||||
// used for such tags as: script, pre, textarea
|
||||
void HTMLFilter::PutLastTagWithClosingTag()
|
||||
void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
|
||||
{
|
||||
const wchar_t * start = pchar;
|
||||
const wchar_t * end = pchar;
|
||||
|
||||
while( *pchar != 0 )
|
||||
{
|
||||
|
@ -384,32 +406,9 @@ const wchar_t * start = pchar;
|
|||
{
|
||||
if( IsClosingTagForLastItem() )
|
||||
{
|
||||
PopStack();
|
||||
CheckNewLine();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pchar += 1;
|
||||
}
|
||||
}
|
||||
if( put_closing_tag_as_well )
|
||||
end = pchar;
|
||||
|
||||
Put(start, pchar);
|
||||
}
|
||||
|
||||
|
||||
// used with <nofilter> </nofilter> tags
|
||||
void HTMLFilter::PutTextBetweenLastTagWithClosingTag()
|
||||
{
|
||||
const wchar_t * start = pchar, * end = pchar;
|
||||
|
||||
while( *pchar != 0 )
|
||||
{
|
||||
if( IsOpeningTagMark() )
|
||||
{
|
||||
if( IsClosingTagForLastItem() )
|
||||
{
|
||||
PopStack();
|
||||
CheckNewLine();
|
||||
break;
|
||||
|
@ -427,16 +426,36 @@ const wchar_t * start = pchar, * end = pchar;
|
|||
|
||||
|
||||
|
||||
|
||||
void HTMLFilter::SkipAndCheckClosingTag()
|
||||
{
|
||||
bool is_quoted = false;
|
||||
wchar_t quote_char = 0;
|
||||
|
||||
for( ; *pchar ; ++pchar )
|
||||
{
|
||||
if( LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
|
||||
if( *pchar == '"' || *pchar == '\'' )
|
||||
{
|
||||
if( is_quoted )
|
||||
{
|
||||
if( *pchar == quote_char )
|
||||
{
|
||||
is_quoted = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
is_quoted = true;
|
||||
quote_char = *pchar;
|
||||
}
|
||||
}
|
||||
else
|
||||
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
|
||||
{
|
||||
LastItem().type = Item::simple;
|
||||
}
|
||||
|
||||
if( IsClosingTagMark() )
|
||||
else
|
||||
if( !is_quoted && IsClosingTagMark() )
|
||||
{
|
||||
++pchar;
|
||||
break;
|
||||
|
@ -502,18 +521,26 @@ size_t i;
|
|||
|
||||
|
||||
|
||||
void HTMLFilter::ReadItemAttrValue(bool has_quote)
|
||||
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
// sprawdzic to wszedzie bo teraz jest tablica
|
||||
attr_value.clear();
|
||||
attr_value_temp.clear();
|
||||
|
||||
// !! dodac obsluge pojedynczego cudzyslowu
|
||||
|
||||
for(i=0 ; *pchar && *pchar != '\"' && !IsClosingTagMark() && (has_quote || (*pchar!=10 && !IsWhite(*pchar)) ); ++i )
|
||||
for(i=0 ; *pchar ; ++i, ++pchar )
|
||||
{
|
||||
if( has_quote )
|
||||
{
|
||||
if( *pchar == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark() || *pchar == 10 || IsWhite(*pchar) )
|
||||
break;
|
||||
}
|
||||
|
||||
if( *pchar==10 || IsWhite(*pchar) )
|
||||
{
|
||||
if( !attr_value_temp.empty() )
|
||||
|
@ -524,9 +551,9 @@ size_t i;
|
|||
}
|
||||
else
|
||||
if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
||||
{
|
||||
attr_value_temp += *pchar;
|
||||
|
||||
++pchar;
|
||||
}
|
||||
}
|
||||
|
||||
if( !attr_value_temp.empty() )
|
||||
|
@ -795,8 +822,8 @@ void HTMLFilter::PutClosingTagMark()
|
|||
|
||||
|
||||
|
||||
// !! zmienic na lepsza nazwe
|
||||
// bo to nie zwraca true jesli tag jest safe
|
||||
// !! IMPROVE ME change to a better name
|
||||
// this function does not return true when the tag is safe
|
||||
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
|
||||
{
|
||||
if( !safe_mode )
|
||||
|
@ -838,9 +865,10 @@ bool HTMLFilter::IsTagSafe(const std::wstring & tag)
|
|||
bool HTMLFilter::PutOpeningTag()
|
||||
{
|
||||
if( !IsTagSafe(LastItem().name) )
|
||||
// !! IMPROVE ME
|
||||
// !! dodac tutaj skipniecie calego tagu
|
||||
{
|
||||
SkipAndCheckClosingTag();
|
||||
return false;
|
||||
}
|
||||
|
||||
PutOpeningTagMark();
|
||||
Put(LastItem().name);
|
||||
|
@ -852,7 +880,7 @@ return true;
|
|||
|
||||
void HTMLFilter::PutClosingTag(const wchar_t * tag)
|
||||
{
|
||||
if( !IsTagSafe(tag) )
|
||||
if( skip_all_tags || !IsTagSafe(tag) )
|
||||
return;
|
||||
|
||||
PutOpeningTagMark();
|
||||
|
@ -1005,9 +1033,19 @@ const wchar_t * last_non_white = pchar;
|
|||
|
||||
while( *pchar != 0 )
|
||||
{
|
||||
const wchar_t * commentary_start = pchar;
|
||||
|
||||
if( SkipCommentaryTagIfExists() )
|
||||
{
|
||||
last_non_white = pchar - 1; // pointing at the last '>' from a commentary
|
||||
PutNormalText(start, commentary_start);
|
||||
|
||||
if( !skip_commentaries )
|
||||
{
|
||||
PutNormalText(commentary_start, pchar);
|
||||
}
|
||||
|
||||
start = pchar;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1029,7 +1067,7 @@ const wchar_t * last_non_white = pchar;
|
|||
|
||||
bool HTMLFilter::PrintOpeningItem()
|
||||
{
|
||||
if( IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
return true;
|
||||
|
||||
if( last_new_line )
|
||||
|
@ -1068,16 +1106,15 @@ bool HTMLFilter::ReadItemAttr()
|
|||
pchar += 1; // skipping '='
|
||||
SkipWhiteLines();
|
||||
|
||||
|
||||
// !! dodac obsluge pojedynczego cudzyslowu
|
||||
bool has_quote = (*pchar == '\"');
|
||||
bool has_quote = (*pchar == '\"' || *pchar == '\'');
|
||||
wchar_t quote_char = *pchar;
|
||||
|
||||
if( has_quote )
|
||||
pchar += 1; // skipping the first quote mark
|
||||
|
||||
ReadItemAttrValue(has_quote);
|
||||
ReadItemAttrValue(has_quote, quote_char);
|
||||
|
||||
if( *pchar == '\"' )
|
||||
if( has_quote && *pchar == quote_char )
|
||||
pchar += 1; // skipping the last quote mark
|
||||
|
||||
return true;
|
||||
|
@ -1112,7 +1149,7 @@ void HTMLFilter::PrintItemAttr()
|
|||
{
|
||||
size_t i;
|
||||
|
||||
if( IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
return;
|
||||
|
||||
Put(' ');
|
||||
|
@ -1150,11 +1187,17 @@ void HTMLFilter::ReadItemClosing()
|
|||
void HTMLFilter::ReadItemSpecial()
|
||||
{
|
||||
LastItem().type = Item::special;
|
||||
|
||||
if( !skip_all_tags )
|
||||
PutOpeningTagMark();
|
||||
|
||||
const wchar_t * start = pchar;
|
||||
pchar += 1; // skipping '!'
|
||||
|
||||
ReadItemName();
|
||||
SkipAndCheckClosingTag();
|
||||
|
||||
if( pchar > start )
|
||||
if( !skip_all_tags && pchar > start )
|
||||
Put(start, pchar);
|
||||
|
||||
// closing tag mark is printed directly from the source
|
||||
|
@ -1176,7 +1219,7 @@ void HTMLFilter::ReadItemOpening()
|
|||
|
||||
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
|
||||
|
||||
if( !IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
if( !skip_all_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
|
||||
{
|
||||
if( LastItem().type == Item::simple )
|
||||
Put(L" /");
|
||||
|
@ -1187,6 +1230,11 @@ void HTMLFilter::ReadItemOpening()
|
|||
}
|
||||
|
||||
|
||||
// Hook called from ReadItem() after an item (html tag) has been read.
// The default implementation intentionally does nothing; the method is
// declared virtual so that a derived class can provide its own version.
void HTMLFilter::ItemFound() {}
|
||||
|
||||
|
||||
bool HTMLFilter::ReadItem()
|
||||
{
|
||||
if( *pchar == 0 )
|
||||
|
@ -1209,6 +1257,8 @@ bool HTMLFilter::ReadItem()
|
|||
CheckNewLine();
|
||||
LastItem().new_line = last_new_line;
|
||||
|
||||
ItemFound();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1332,13 +1382,13 @@ void HTMLFilter::CheckExceptions()
|
|||
|
||||
// in safe_mode the script tag is ignored
|
||||
if( !safe_mode && IsLastTag(L"script") )
|
||||
PutLastTagWithClosingTag();
|
||||
PutEverythingUntilClosingTag(!skip_all_tags);
|
||||
|
||||
if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
|
||||
PutLastTagWithClosingTag();
|
||||
PutEverythingUntilClosingTag(!skip_all_tags);
|
||||
|
||||
if( IsLastTag(no_filter_tag) )
|
||||
PutTextBetweenLastTagWithClosingTag();
|
||||
PutEverythingUntilClosingTag(false);
|
||||
|
||||
if( IsLastTag(L"body") )
|
||||
LastItem().has_body_tag = true;
|
||||
|
@ -1371,7 +1421,7 @@ int i;
|
|||
|
||||
for(int z=(int)stack_len-2 ; z>=i ; --z)
|
||||
{
|
||||
if( pstack[z].new_line )
|
||||
if( !skip_all_tags && pstack[z].new_line )
|
||||
{
|
||||
PutNewLine();
|
||||
PutTabs(z);
|
||||
|
@ -1421,7 +1471,7 @@ void HTMLFilter::CheckClosingTags()
|
|||
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
|
||||
{
|
||||
// last closing tag is from the previous one
|
||||
if( pstack[stack_len-2].new_line )
|
||||
if( !skip_all_tags && pstack[stack_len-2].new_line )
|
||||
{
|
||||
PutNewLine();
|
||||
PutTabs(stack_len-2);
|
||||
|
@ -1444,7 +1494,7 @@ bool HTMLFilter::PrintRest()
|
|||
const wchar_t * start = pchar;
|
||||
|
||||
// in safe mode we do not print the rest html code
|
||||
if( safe_mode )
|
||||
if( safe_mode || skip_all_tags )
|
||||
return false;
|
||||
|
||||
while( *pchar )
|
||||
|
@ -1474,7 +1524,7 @@ void HTMLFilter::ReadLoop()
|
|||
{
|
||||
if( stack_len > 1 )
|
||||
{
|
||||
pstack[stack_len-2].new_line = LastItem().new_line;
|
||||
//pstack[stack_len-2].new_line = LastItem().new_line;
|
||||
}
|
||||
else
|
||||
if( trim_white )
|
||||
|
@ -1492,6 +1542,10 @@ void HTMLFilter::ReadLoop()
|
|||
{
|
||||
CheckClosingTags();
|
||||
}
|
||||
else
|
||||
{
|
||||
PopStack();
|
||||
}
|
||||
|
||||
ReadNormalText();
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2008-2014, Tomasz Sowa
|
||||
* Copyright (c) 2008-2018, Tomasz Sowa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -99,7 +99,7 @@ public:
|
|||
HTMLFilter();
|
||||
HTMLFilter(const HTMLFilter & f);
|
||||
HTMLFilter & operator=(const HTMLFilter & f);
|
||||
~HTMLFilter();
|
||||
virtual ~HTMLFilter();
|
||||
|
||||
|
||||
// main methods used for filtering
|
||||
|
@ -156,6 +156,9 @@ public:
|
|||
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
|
||||
void SafeMode(bool safe_mode_);
|
||||
|
||||
// skip all html tags
|
||||
// gives only text without markup
|
||||
void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
|
||||
|
||||
|
||||
protected:
|
||||
|
@ -239,10 +242,13 @@ protected:
|
|||
virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
|
||||
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
|
||||
|
||||
virtual void ItemFound();
|
||||
|
||||
/*
|
||||
others
|
||||
*/
|
||||
void SetSomeDefaults();
|
||||
|
||||
Item & GetItem(size_t i);
|
||||
Item & LastItem();
|
||||
|
||||
|
@ -288,7 +294,7 @@ protected:
|
|||
bool PrintOpeningItem();
|
||||
void ReadItemName();
|
||||
void ReadItemAttrName();
|
||||
void ReadItemAttrValue(bool has_quote);
|
||||
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
|
||||
|
||||
bool ReadItemAttr();
|
||||
bool CheckItemAttr();
|
||||
|
@ -307,8 +313,7 @@ protected:
|
|||
bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);
|
||||
void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
|
||||
void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
|
||||
void PutLastTagWithClosingTag();
|
||||
void PutTextBetweenLastTagWithClosingTag();
|
||||
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
|
||||
void PutTabs(size_t len);
|
||||
void PutNonBreakingSpace();
|
||||
void PutNewLine();
|
||||
|
@ -335,6 +340,8 @@ protected:
|
|||
size_t line_len; //length of the current line (without first spaces which create the html tree)
|
||||
bool safe_mode; // skipping some unsafe tags
|
||||
Orphans orphans_temp;
|
||||
bool skip_all_tags;
|
||||
bool skip_commentaries;
|
||||
};
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue