added to HTMLFilter:

- possibility to remove html entities
  method: SkipEntities(bool)




git-svn-id: svn://ttmath.org/publicrep/winix/trunk@1132 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Tomasz Sowa 2018-10-24 16:31:42 +00:00
parent 027a8ec428
commit 1b8f5dc673
5 changed files with 242 additions and 84 deletions

File diff suppressed because one or more lines are too long

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2008-2014, Tomasz Sowa
* Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -66,36 +66,46 @@ return false;
}
bool BBCODEParser::IsOpeningTagMark()
bool BBCODEParser::IsOpeningTagMark(wchar_t c)
{
return (*pchar == '[');
return (c == '[');
}
// there are no commentaries in bbcode
bool BBCODEParser::IsOpeningCommentaryTagMark()
// a bbcode tag is closed by the right square bracket
// returns true if c is that character
bool BBCODEParser::IsClosingTagMark(wchar_t c)
{
    const wchar_t closing_mark = L']';

    return closing_mark == c;
}
// in bbcode no character is reported as the xml "simple tag" mark,
// the answer is always false regardless of c
bool BBCODEParser::IsClosingXmlSimpleTagMark(wchar_t c)
{
    static_cast<void>(c); // the character is intentionally not inspected

    return false;
}
// bbcode has no commentaries so an opening commentary mark
// is never reported, the input string is not inspected at all
bool BBCODEParser::IsOpeningCommentaryTagMark(const wchar_t *)
{
    const bool commentary_found = false;

    return commentary_found;
}
// length (in characters) of the opening commentary mark,
// this parser always reports zero
size_t BBCODEParser::OpeningCommentaryTagMarkSize()
{
    const size_t mark_size = 0;

    return mark_size;
}
// nothing is ever skipped here,
// the method always returns false and does not touch the parser state
bool BBCODEParser::SkipCommentaryTagIfExists()
{
    const bool commentary_skipped = false;

    return commentary_skipped;
}
bool BBCODEParser::IsClosingTagMark()
{
return (*pchar == ']');
}
bool BBCODEParser::IsClosingXmlSimpleTagMark()
{
return false;
}
@ -446,7 +456,7 @@ bool first_tag_removed = false;
while( *pchar != 0 )
{
if( IsOpeningTagMark() )
if( IsOpeningTagMark(*pchar) )
{
if( IsClosingTagForLastItem() )
{

View File

@ -5,7 +5,7 @@
*/
/*
* Copyright (c) 2008-2014, Tomasz Sowa
* Copyright (c) 2008-2018, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -62,10 +62,12 @@ class BBCODEParser : public HTMLFilter
virtual void Init();
virtual void Uninit();
virtual bool IsOpeningTagMark();
virtual bool IsOpeningCommentaryTagMark();
virtual bool IsClosingTagMark();
virtual bool IsClosingXmlSimpleTagMark();
virtual bool IsOpeningTagMark(wchar_t c);
virtual bool IsClosingTagMark(wchar_t c);
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
virtual bool IsOpeningCommentaryTagMark(const wchar_t *);
virtual size_t OpeningCommentaryTagMarkSize();
virtual bool IsValidCharForName(int c);
virtual void CheckExceptions();

View File

@ -88,7 +88,13 @@ void HTMLFilter::Uninit()
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
size_t out_projected_len = in.size() * 2 + 1;
if( &in == &out )
{
// out cannot be the same string as in
return;
}
size_t out_projected_len = in.size() * 2 + 1;
if( out.capacity() < out_projected_len )
out.reserve(out_projected_len);
@ -105,8 +111,10 @@ void HTMLFilter::SetSomeDefaults()
wrap_line = 0;
orphan_mode = orphan_nbsp;
safe_mode = false;
skip_all_tags = false;
skip_tags = false;
skip_commentaries = false;
skip_entities = false;
analyze_entities = false;
}
@ -238,13 +246,34 @@ void HTMLFilter::SafeMode(bool safe_mode_)
}
void HTMLFilter::SkipAllTags(bool skip_all_tags, bool skip_commentaries)
// remember whether html tags should be skipped
// the flag is only stored here, it is consulted later
// by the methods which print tags (e.g. PutClosingTag, PrintOpeningItem)
void HTMLFilter::SkipTags(bool skip_tags)
{
    // the parameter hides the member of the same name
    // so the member has to be reached through 'this'
    this->skip_tags = skip_tags;
}
void HTMLFilter::SkipCommentaries(bool skip_commentaries)
{
this->skip_all_tags = skip_all_tags;
this->skip_commentaries = skip_commentaries;
}
// remember whether html entities should be removed from the output
// turning the skipping on also turns on analyze_entities,
// turning the skipping off leaves analyze_entities as it was
void HTMLFilter::SkipEntities(bool skip_entities)
{
    this->skip_entities = skip_entities;

    // the parameter holds the same value which has just been stored in the member
    if( !skip_entities )
        return;

    this->analyze_entities = true;
}
// remember whether html entities should be analyzed
// the flag is consulted by ReadItemAttrValueAdd() and PutNormalNonWhite()
// which then go through AnalyzeEntitiesAndPut()
//
// NOTE(review): skip_entities is not changed here; in the visible code the
// skipping is performed only inside AnalyzeEntitiesAndPut(), so calling
// AnalyzeEntities(false) after SkipEntities(true) seems to disable the
// skipping as well -- confirm this is intended
void HTMLFilter::AnalyzeEntities(bool analyze_entities)
{
    // the parameter hides the member of the same name
    this->analyze_entities = analyze_entities;
}
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{
no_filter_tag = tag_name;
@ -380,7 +409,7 @@ bool HTMLFilter::IsClosingTagForLastItem()
pchar += LastItem().name.size();
SkipWhite();
if( IsClosingTagMark() )
if( IsClosingTagMark(*pchar) )
{
pchar += 1;
return true;
@ -402,7 +431,7 @@ const wchar_t * end = pchar;
while( *pchar != 0 )
{
if( IsOpeningTagMark() )
if( IsOpeningTagMark(*pchar) )
{
if( IsClosingTagForLastItem() )
{
@ -450,12 +479,12 @@ void HTMLFilter::SkipAndCheckClosingTag()
}
}
else
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/'
{
LastItem().type = Item::simple;
}
else
if( !is_quoted && IsClosingTagMark() )
if( !is_quoted && IsClosingTagMark(*pchar) )
{
++pchar;
break;
@ -470,7 +499,7 @@ bool HTMLFilter::IsValidCharForName(int c)
if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') ||
(c>='0' && c<='9') ||
c=='-' || c=='!' )
c=='-' || c=='!' || c==':') // : for namespace character
return true;
return false;
@ -489,6 +518,18 @@ return false;
}
// returns true if c can be a part of an entity name
// (the text between the starting and the ending entity mark)
// allowed are: ascii letters, ascii digits and the hash character
bool HTMLFilter::IsValidCharForEntityName(int c)
{
    const bool is_small_letter = (c >= 'a' && c <= 'z');
    const bool is_big_letter   = (c >= 'A' && c <= 'Z');
    const bool is_digit        = (c >= '0' && c <= '9');
    const bool is_hash         = (c == '#');

    return is_small_letter || is_big_letter || is_digit || is_hash;
}
void HTMLFilter::ReadItemName()
{
size_t i;
@ -521,12 +562,28 @@ size_t i;
// add a new item at the end of attr_value and fill it
// with the characters from [value_start, value_end)
//
// if analyze_entities is set the text goes through AnalyzeEntitiesAndPut()
// (with the new item as the output string), otherwise it is copied as it is
void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
{
    attr_value.push_back(std::wstring());
    std::wstring & new_value = attr_value.back();

    if( !analyze_entities )
    {
        new_value.append(value_start, value_end);
        return;
    }

    AnalyzeEntitiesAndPut(value_start, value_end, &new_value);
}
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
{
size_t i;
attr_value.clear();
attr_value_temp.clear();
const wchar_t * value_start = pchar;
size_t value_len = 0; // how many non white characters
for(i=0 ; *pchar ; ++i, ++pchar )
{
@ -537,30 +594,28 @@ size_t i;
}
else
{
if( IsClosingTagMark() || *pchar == 10 || IsWhite(*pchar) )
if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) )
break;
}
if( *pchar==10 || IsWhite(*pchar) )
{
if( !attr_value_temp.empty() )
{
attr_value.push_back(attr_value_temp);
attr_value_temp.clear();
}
if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
ReadItemAttrValueAdd(value_start, pchar);
value_len = 0;
}
else
if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
{
attr_value_temp += *pchar;
if( value_len == 0 )
value_start = pchar;
value_len += 1;
}
}
if( !attr_value_temp.empty() )
{
attr_value.push_back(attr_value_temp);
attr_value_temp.clear();
}
if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
ReadItemAttrValueAdd(value_start, pchar);
}
@ -611,6 +666,57 @@ void HTMLFilter::Put(const std::wstring & str)
}
// out can be null
void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
{
size_t epsilon = 8; // !! IMPROVE ME put as a constant
const wchar_t * old_str = str;
while( str < end )
{
if( IsStartingEntityMark(*str) )
{
const wchar_t * entity_start = str;
str += 1; // skip &
for(size_t i=0 ; *str && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
{
}
if( IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
{
if( out )
out->append(old_str, entity_start);
else
Put(old_str, entity_start);
str += 1; // skip ;
if( !skip_entities )
{
if( out )
out->append(entity_start, str);
else
Put(entity_start, str);
}
EntityFound(entity_start + 1, str - 1); // without & and ;
old_str = str;
}
}
else
{
str += 1;
}
}
if( out )
out->append(old_str, end);
else
Put(old_str, end);
}
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
@ -700,12 +806,12 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
// (useful in html entities)
// !! also add a check whether the string length (end-str) is small,
// and in such a case do not add any character either
bool HTMLFilter::HasSemiloconAround(const wchar_t * str, const wchar_t * end)
bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
{
size_t i, epsilon = 8;
size_t i, epsilon = 8;// !! IMPROVE ME put as a constant
for(i=0 ; str < end && i<epsilon ; ++i, ++str)
if( *str == ';' )
if( IsEndingEntityMark(*str) )
return true;
return false;
@ -726,11 +832,11 @@ void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
{
const wchar_t * word = str;
size_t non_whites = 0;
bool was_semicolon = false;
bool was_entity_end = false;
for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
{
if( break_after != 0 && non_whites >= break_after && (was_semicolon || !HasSemiloconAround(str, end)) )
if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
{
Put(word, str);
word = str;
@ -739,10 +845,13 @@ bool was_semicolon = false;
CheckLineWrap();
}
was_semicolon = (*str == ';');
was_entity_end = (IsEndingEntityMark(*str));
}
Put(word, str);
if( analyze_entities )
AnalyzeEntitiesAndPut(word, str, nullptr);
else
Put(word, str);
}
@ -880,7 +989,7 @@ return true;
void HTMLFilter::PutClosingTag(const wchar_t * tag)
{
if( skip_all_tags || !IsTagSafe(tag) )
if( skip_tags || !IsTagSafe(tag) )
return;
PutOpeningTagMark();
@ -925,28 +1034,28 @@ void HTMLFilter::PutNewLine()
// we assume the size of the opening mark to be one
bool HTMLFilter::IsOpeningTagMark()
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
{
return (*pchar == '<');
return (c == '<');
}
// we assume the size of the closing mark to be one
bool HTMLFilter::IsClosingTagMark()
bool HTMLFilter::IsClosingTagMark(wchar_t c)
{
return (*pchar == '>');
return (c == '>');
}
// the slash at the end <img src=".." /> (without '>' character)
// we assume the size of the mark to be one
bool HTMLFilter::IsClosingXmlSimpleTagMark()
bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
{
return (*pchar == '/');
return (c == '/');
}
bool HTMLFilter::IsOpeningCommentaryTagMark()
bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
{
static wchar_t comm_open[] = L"<!--";
size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
@ -961,6 +1070,18 @@ size_t HTMLFilter::OpeningCommentaryTagMarkSize()
}
// returns true if c is the character which starts an html entity (an ampersand)
// we assume the size of the mark to be one
bool HTMLFilter::IsStartingEntityMark(wchar_t c)
{
    const wchar_t entity_start_mark = L'&';

    return entity_start_mark == c;
}
// returns true if c is the character which ends an html entity (a semicolon)
// we assume the size of the mark to be one
bool HTMLFilter::IsEndingEntityMark(wchar_t c)
{
    const wchar_t entity_end_mark = L';';

    return entity_end_mark == c;
}
// skipping the commentary tag if exists
bool HTMLFilter::SkipCommentaryTagIfExists()
@ -968,7 +1089,7 @@ bool HTMLFilter::SkipCommentaryTagIfExists()
static wchar_t comm_close[] = L"-->";
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
if( !IsOpeningCommentaryTagMark() )
if( !IsOpeningCommentaryTagMark(pchar) )
return false;
pchar += OpeningCommentaryTagMarkSize();
@ -1012,7 +1133,7 @@ void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t
start = pchar;
// exception for the commentary tag
if( IsOpeningCommentaryTagMark() || !IsOpeningTagMark() )
if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
{
PutNewLine();
PutTabs(stack_len);
@ -1049,7 +1170,7 @@ const wchar_t * last_non_white = pchar;
}
else
{
if( IsOpeningTagMark() )
if( IsOpeningTagMark(*pchar) )
break;
if( !IsWhite(*pchar) )
@ -1067,7 +1188,7 @@ const wchar_t * last_non_white = pchar;
bool HTMLFilter::PrintOpeningItem()
{
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return true;
if( last_new_line )
@ -1149,7 +1270,7 @@ void HTMLFilter::PrintItemAttr()
{
size_t i;
if( skip_all_tags || IsNameEqual(no_filter_tag, LastItem().name) )
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return;
Put(' ');
@ -1188,7 +1309,7 @@ void HTMLFilter::ReadItemSpecial()
{
LastItem().type = Item::special;
if( !skip_all_tags )
if( !skip_tags )
PutOpeningTagMark();
const wchar_t * start = pchar;
@ -1197,7 +1318,7 @@ void HTMLFilter::ReadItemSpecial()
ReadItemName();
SkipAndCheckClosingTag();
if( !skip_all_tags && pchar > start )
if( !skip_tags && pchar > start )
Put(start, pchar);
// closing tag mark is printed directly from the source
@ -1219,7 +1340,7 @@ void HTMLFilter::ReadItemOpening()
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
if( !skip_all_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
{
if( LastItem().type == Item::simple )
Put(L" /");
@ -1234,6 +1355,10 @@ void HTMLFilter::ItemFound()
{
}
// a callback called from AnalyzeEntitiesAndPut() for each recognized entity
// [str, end) is the entity name without the starting and the ending mark
// this implementation does nothing
void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
{
    // both arguments are intentionally ignored
    static_cast<void>(str);
    static_cast<void>(end);
}
bool HTMLFilter::ReadItem()
{
@ -1315,6 +1440,8 @@ bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & na
// len characters from both strings must be equal
// IMPROVE ME change name to something like IsBeginningNameEqual
// and move to text.h (pikotools)
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
{
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
@ -1382,10 +1509,10 @@ void HTMLFilter::CheckExceptions()
// in safe_mode the script tag is ignored
if( !safe_mode && IsLastTag(L"script") )
PutEverythingUntilClosingTag(!skip_all_tags);
PutEverythingUntilClosingTag(!skip_tags);
if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
PutEverythingUntilClosingTag(!skip_all_tags);
PutEverythingUntilClosingTag(!skip_tags);
if( IsLastTag(no_filter_tag) )
PutEverythingUntilClosingTag(false);
@ -1421,7 +1548,7 @@ int i;
for(int z=(int)stack_len-2 ; z>=i ; --z)
{
if( !skip_all_tags && pstack[z].new_line )
if( !skip_tags && pstack[z].new_line )
{
PutNewLine();
PutTabs(z);
@ -1471,7 +1598,7 @@ void HTMLFilter::CheckClosingTags()
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
{
// last closing tag is from the previous one
if( !skip_all_tags && pstack[stack_len-2].new_line )
if( !skip_tags && pstack[stack_len-2].new_line )
{
PutNewLine();
PutTabs(stack_len-2);
@ -1494,7 +1621,7 @@ bool HTMLFilter::PrintRest()
const wchar_t * start = pchar;
// in safe mode we do not print the rest html code
if( safe_mode || skip_all_tags )
if( safe_mode || skip_tags )
return false;
while( *pchar )

View File

@ -128,13 +128,11 @@ public:
// false by default
void TrimWhite(bool trim);
// first tabs in a tree
// default: 2 (spaces)
// set 0 to turn off
void InsertTabs(size_t tabsize);
// set a name of a html tag which will be used as 'nofilter' tag
// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
// these tags (opening and closing) will not be placed in the html output
@ -145,20 +143,32 @@ public:
void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
void ClearOrphans();
// check 'orphans' for the specific language
// if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
// default disable (lang_none)
void OrphansMode(OrphanMode mode = orphan_nbsp);
// skipping some unsafe tags
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_);
// skip all html tags
// gives only text without markup
void SkipAllTags(bool skip_all_tags, bool skip_commentaries);
// but there can be commentaries
void SkipTags(bool skip_tags);
// skip commentaries
void SkipCommentaries(bool skip_commentaries);
// if true then entities such as &nbsp; are skipped
// this automatically turns on AnalyzeEntities
// in such a case EntityFound callbacks are sent
void SkipEntities(bool skip_entities);
// analyze html entities such as &nbsp;
// virtual method: EntityFound is called
// entities are analyzed in normal text and in attribute values such as <p class="a&nbsp;">
void AnalyzeEntities(bool analyze_entities);
protected:
@ -219,13 +229,18 @@ protected:
virtual void Init();
virtual void Uninit();
virtual bool IsOpeningTagMark();
virtual bool IsOpeningCommentaryTagMark();
virtual bool IsClosingTagMark();
virtual bool IsClosingXmlSimpleTagMark();
virtual bool IsOpeningTagMark(wchar_t c);
virtual bool IsClosingTagMark(wchar_t c);
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c);
virtual bool IsOpeningCommentaryTagMark(const wchar_t * str);
virtual size_t OpeningCommentaryTagMarkSize();
virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c);
virtual void CheckExceptions();
virtual bool SkipCommentaryTagIfExists();
@ -233,6 +248,7 @@ protected:
virtual void Put(const wchar_t * str);
virtual void Put(const wchar_t * str, const wchar_t * end);
virtual void Put(const std::wstring & str);
virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
virtual void PutOpeningTagMark();
virtual void PutClosingTagMark();
@ -243,6 +259,7 @@ protected:
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
virtual void ItemFound();
virtual void EntityFound(const wchar_t * str, const wchar_t * end);
/*
others
@ -280,7 +297,6 @@ protected:
void SkipWhiteWithFirstNewLine();
void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
bool IsClosingTagForLastItem();
size_t OpeningCommentaryTagMarkSize();
void SkipAndCheckClosingTag();
void PopStack();
@ -294,6 +310,7 @@ protected:
bool PrintOpeningItem();
void ReadItemName();
void ReadItemAttrName();
void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end);
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
bool ReadItemAttr();
@ -310,7 +327,7 @@ protected:
void CheckChar(wchar_t c);
void CheckLineWrap();
bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);
bool HasEntityEndAround(const wchar_t * str, const wchar_t * end);
void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
@ -340,8 +357,10 @@ protected:
size_t line_len; //length of the current line (without first spaces which create the html tree)
bool safe_mode; // skipping some unsafe tags
Orphans orphans_temp;
bool skip_all_tags;
bool skip_tags;
bool skip_commentaries;
bool skip_entities;
bool analyze_entities;
};