// Source file: winix/core/htmlfilter.cpp
// (repository listing metadata: 1208 lines, 19 KiB, C++, executable file)

/*
* This file is a part of Winix
* and is not publicly distributed
*
* Copyright (c) 2008-2010, Tomasz Sowa
* All rights reserved.
*
*/
#include "htmlfilter.h"
// Resets the item to its empty state:
// an empty name, the type set to 'none' and no new line remembered.
void HTMLFilter::Item::Clear()
{
    new_line = false;
    type     = none;
    name_len = 0;
    name[0]  = 0;
}
// A freshly constructed item is always in the cleared (empty) state.
HTMLFilter::Item::Item()
{
    this->Clear();
}
// Filters the null-terminated html text 'in' and writes the result to 'out'.
// The previous content of 'out' is discarded before the parsing starts.
void HTMLFilter::Filter(const char * in, std::string & out)
{
    // resetting the parser state for this run
    out.clear();
    out_string    = &out;
    pchar         = in;
    stack_len     = 0;
    last_new_line = false;

    Init();
    Read();
    Deinit();
}
// Called by Filter() just before the parsing starts.
// Currently there is nothing to prepare here.
void HTMLFilter::Init()
{
}
// Called by Filter() right after the parsing has finished.
// Currently there is nothing to clean up here.
void HTMLFilter::Deinit()
{
}
// Convenience overload taking a std::string as the input.
// The output capacity is reserved up front (twice the input size plus one)
// and then the work is delegated to the 'const char *' version.
void HTMLFilter::Filter(const std::string & in, std::string & out)
{
    const std::string::size_type expected_size = in.size() * 2 + 1;

    out.reserve(expected_size);
    Filter(in.c_str(), out);
}
// Creates a filter with the default settings:
// tabs of two spaces, no white-space trimming, no line breaking,
// orphans checking turned off (lang_none), orphans joined with &nbsp; and safe mode off.
// The tag stack and the work buffer are allocated here and released in the destructor.
HTMLFilter::HTMLFilter()
{
    // default options
    safe_mode   = false;
    orphan_mode = orphan_nbsp;
    lang        = lang_none;
    break_after = 0;
    trim_white  = false;
    tab_size    = 2;

    // working memory
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN];
}
// Copy constructor.
// The stack and the buffer are only scratch memory used while Filter() is running,
// so their content is not copied - the new object gets its own fresh arrays.
// The settings, however, have to be copied: without that they would be left
// uninitialized in the new object.
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
    // don't need to copy the stack
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN];

    // options set by InsertTabs(), TrimWhite(), BreakLines(), CheckOrphans() and SafeMode()
    tab_size    = f.tab_size;
    trim_white  = f.trim_white;
    break_after = f.break_after;
    lang        = f.lang;
    orphan_mode = f.orphan_mode;
    safe_mode   = f.safe_mode;
}
// Copy assignment.
// This object already owns a stack and a buffer of the fixed size, and their content
// is only scratch memory, so they are kept as they are (allocating new arrays here
// would leak the old ones). Only the settings are taken from 'f'.
// Self-assignment is harmless.
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
    // don't need to copy the stack

    tab_size    = f.tab_size;
    trim_white  = f.trim_white;
    break_after = f.break_after;
    lang        = f.lang;
    orphan_mode = f.orphan_mode;
    safe_mode   = f.safe_mode;

    return *this;
}
// Releases the work buffer and the tag stack allocated in the constructors.
HTMLFilter::~HTMLFilter()
{
    delete [] buffer;
    delete [] pstack;
}
// Sets after how many non white characters in a row a space should be inserted
// (zero turns the feature off); values greater than 10000 are limited to 10000.
void HTMLFilter::BreakLines(size_t break_after_)
{
    const size_t max_break_after = 10000;

    break_after = (break_after_ > max_break_after) ? max_break_after : break_after_;
}
void HTMLFilter::TrimWhite(bool trim)
{
trim_white = trim;
}
// Sets how many spaces are used for one level of indentation;
// values greater than 1000 are limited to 1000.
void HTMLFilter::InsertTabs(size_t tabsize)
{
    const size_t max_tab_size = 1000;

    tab_size = (tabsize > max_tab_size) ? max_tab_size : tabsize;
}
// Selects the language whose orphans table is used (lang_none disables the check)
// and the way an orphan is glued to the following word.
void HTMLFilter::CheckOrphans(HTMLFilter::Lang lang_, HTMLFilter::OrphanMode mode)
{
    this->orphan_mode = mode;
    this->lang        = lang_;
}
void HTMLFilter::SafeMode(bool safe_mode_)
{
safe_mode = safe_mode_;
}
// Returns the i-th item from the stack.
// When the index is out of range a cleared 'empty' item is returned instead.
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
    if( i < stack_len )
        return pstack[i];

    empty.Clear();

    return empty;
}
// Returns the item from the top of the stack
// or a cleared 'empty' item when the stack is empty.
HTMLFilter::Item & HTMLFilter::LastItem()
{
    if( stack_len > 0 )
        return pstack[stack_len-1];

    empty.Clear();

    return empty;
}
// Puts a new cleared item on the top of the stack.
// Returns false (and changes nothing) when the stack is full.
bool HTMLFilter::PushStack()
{
    const bool has_room = (stack_len != WINIX_HTMLFILTER_STACK_MAXLEN);

    if( has_room )
    {
        pstack[stack_len].Clear();
        ++stack_len;
    }

    return has_room;
}
// Removes the item from the top of the stack and clears it.
// Does nothing when the stack is already empty.
void HTMLFilter::PopStack()
{
    if( stack_len != 0 )
    {
        --stack_len;
        pstack[stack_len].Clear();
    }
}
// Returns true for a space, a tab, a carriage return (13) and the code 160.
// The new line character (10) is deliberately not treated as white here,
// the callers test it separately.
// NOTE(review): the callers pass a plain 'char'; where 'char' is signed the byte 0xA0
// arrives as a negative value and never equals 160 - confirm whether this is intended.
bool HTMLFilter::IsWhite(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 13:
    case 160:
        return true;

    default:
        return false;
    }
}
// Moves pchar forward over white characters (new lines are not skipped).
void HTMLFilter::SkipWhite()
{
    for( ; IsWhite(*pchar) ; ++pchar )
    {
    }
}
// Moves pchar forward over white characters and new lines.
void HTMLFilter::SkipWhiteLines()
{
    for( ;; )
    {
        if( *pchar != 10 && !IsWhite(*pchar) )
            break;

        ++pchar;
    }
}
// Skips white characters, then at most one new line character
// and the white characters following it.
void HTMLFilter::SkipWhiteWithFirstNewLine()
{
    SkipWhite();

    if( *pchar != 10 )
        return;

    ++pchar;
    SkipWhite();
}
void HTMLFilter::CheckNewLine()
{
const char * start = pchar;
SkipWhite();
last_new_line = (*pchar==10);
pchar = start;
}
bool HTMLFilter::IsClosingTagForLastItem()
{
pchar += 1;
SkipWhite();
if( *pchar == '/' )
{
pchar += 1;
SkipWhite();
if( IsNameEqual(pchar, LastItem().name, LastItem().name_len) )
{
pchar += LastItem().name_len;
SkipWhite();
if( IsClosingTagMark() )
{
pchar += 1;
return true;
}
}
}
return false;
}
// used for such tags as: script, pre, textarea
// The body of the last tag is copied to the output without any filtering,
// up to and including its closing tag (or up to the end of the input when there is no closing tag).
// When the closing tag has been found the item is removed from the stack.
void HTMLFilter::PutLastTagWithClosingTag()
{
    const char * const begin = pchar;
    bool closed = false;

    while( !closed && *pchar != 0 )
    {
        if( !IsOpeningTagMark() )
        {
            ++pchar;
            continue;
        }

        // IsClosingTagForLastItem() always moves pchar forward
        if( IsClosingTagForLastItem() )
        {
            PopStack();
            CheckNewLine();
            closed = true;
        }
    }

    Put(begin, pchar);
}
// Skips the rest of a tag (its arguments) up to and including the closing '>'.
// When the tag ends with "/>" (white characters are allowed between the two marks)
// the type of the last item is changed to 'simple'.
// The returned pointer marks the end of the arguments: it points at the last '/' or '>' seen
// (or at the starting position when none of them has been found).
const char * HTMLFilter::SkipItemCheckXmlSimple()
{
    const char * args_end = pchar;

    while( *pchar != 0 )
    {
        // looking for the nearest '>' or '/'
        for( ; *pchar != 0 ; ++pchar )
            if( IsClosingTagMark() || IsClosingXmlSimpleTagMark() )
                break;

        if( IsClosingTagMark() )
        {
            args_end = pchar;
            ++pchar;
            break;
        }

        if( IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
        {
            args_end = pchar;
            ++pchar;
            SkipWhite();

            if( IsClosingTagMark() )
            {
                ++pchar;
                LastItem().type = Item::simple;
                break;
            }

            // it was a slash somewhere inside the arguments, we are looking further
        }
    }

    return args_end;
}
// A tag name can consist of ascii letters, digits, '-' and '!'
// (the exclamation mark is accepted because of such tags as <!DOCTYPE>).
bool HTMLFilter::IsValidCharForName(int c)
{
    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
    const bool is_digit  = (c>='0' && c<='9');

    return is_letter || is_digit || c=='-' || c=='!';
}
void HTMLFilter::ReadItemName()
{
size_t i;
for( i=0 ; IsValidCharForName(*pchar) && i<WINIX_HTMLFILTER_ITEM_MAXLEN-1 ; ++i )
{
LastItem().name[i] = *pchar;
++pchar;
}
LastItem().name[i] = 0;
LastItem().name_len = i;
}
// Appends the characters from the range [str, end) to the output string.
// An empty or reversed range is ignored.
void HTMLFilter::Put(const char * str, const char * end)
{
    if( str < end )
        out_string->append(str, static_cast<size_t>(end - str));
}
// Compares the word from the range [str, end) with the null-terminated 'orphan'.
// Characters are compared as unsigned bytes, ascii letters without regard to case.
// Returns zero when both are equal, a negative value when the word is 'less' than the orphan
// and a positive value when it is 'greater' (like strcmp).
int HTMLFilter::CheckOrphan(const char * str, const char * end, const char * orphan)
{
    for( ; str<end && *orphan!=0 ; ++str, ++orphan )
    {
        // the difference is kept in a signed type so that its sign is not lost
        const int diff = ToLower(static_cast<unsigned char>(*str)) -
                         ToLower(static_cast<unsigned char>(*orphan));

        if( diff != 0 )
            return diff;
    }

    // the word is longer than the orphan
    if( str < end )
        return ToLower(static_cast<unsigned char>(*str));

    // the word has ended: zero when the orphan has ended too, a negative value otherwise
    return -ToLower(static_cast<unsigned char>(*orphan));
}
// binary search in table (the table has to be sorted in the order used by CheckOrphan)
// o1 - index of the first element
// o2 - index of the last element
// returns true when the word from [str, end) is equal to one of the table entries
bool HTMLFilter::CheckOrphanTable(const char * str, const char * end, const char ** table, size_t o1, size_t o2)
{
    // the word has to lie between the first and the last element
    const int cmp_first = CheckOrphan(str, end, table[o1]);

    if( cmp_first <= 0 )
        return cmp_first == 0;

    const int cmp_last = CheckOrphan(str, end, table[o2]);

    if( cmp_last >= 0 )
        return cmp_last == 0;

    // now the word is strictly between table[low] and table[high]
    size_t low  = o1;
    size_t high = o2;

    while( low + 1 < high )
    {
        const size_t mid = (low + high) / 2;
        const int cmp    = CheckOrphan(str, end, table[mid]);

        if( cmp == 0 )
            return true;

        if( cmp < 0 )
            high = mid;
        else
            low = mid;
    }

    return false;
}
// Returns true when the word from [str, end) is a polish orphan
// (a short word which should not be left at the end of a line).
bool HTMLFilter::CheckOrphanLangPl(const char * str, const char * end)
{
    // the table must be sorted in the order used by CheckOrphan()
    // (bytes compared as unsigned values) - the binary search depends on it,
    // that is why "s." is placed after "po" and not at the beginning of the table
    //
    // polish letters coded in iso-8859-2, written as hex escapes so that
    // the table does not depend on the encoding of this source file:
    //   \xb1 - a with ogonek, \xea - e with ogonek, \xbf - z with dot above, \xf3 - o with acute
    // NOTE(review): the accented words were reconstructed from garbled literals
    // (juz, ja, sa, ta, te, ze, ow with diacritics) - verify against the upstream word list
    static const char * orphans[] = {
        "(np.", "a", "ale", "bo", "by", "co", "czy", "do", "go", "i",
        "ich", "ja", "je", "jej", "jest", "ju\xbf", "j\xb1", "ku", "li", "mi", "na",
        "nie", "np.", "nr", "o", "od", "po", "s.", "s\xb1", "ta", "to", "tu", "t\xb1",
        "t\xea", "u", "w", "we", "wy", "z", "za", "ze", "\xbf" "e", "\xf3w"
    };

    size_t o1 = 0;
    size_t o2 = sizeof(orphans) / sizeof(const char*) - 1;

    return CheckOrphanTable(str, end, orphans, o1, o2);
}
// SK and CZ
// Returns true when the word from [str, end) is a czech/slovak orphan.
bool HTMLFilter::CheckOrphanLangCz(const char * str, const char * end)
{
    // the binary search requires the table to be sorted alphabetically
    static const char * orphans[] = {
        "a", "i", "k", "o", "s", "u", "v", "z"
    };

    const size_t count = sizeof(orphans) / sizeof(orphans[0]);

    return CheckOrphanTable(str, end, orphans, 0, count - 1);
}
// Returns true when the word from [str, end) is an orphan in the selected language.
// An empty word is never an orphan and with lang_none nothing is checked.
// For lang_cz and lang_sk the czech table is used, for any other language the polish one.
bool HTMLFilter::CheckOrphan(const char * str, const char * end)
{
    if( lang == lang_none || str == end )
        return false;

    const bool czech_or_slovak = (lang == lang_cz || lang == lang_sk);

    return czech_or_slovak ? CheckOrphanLangCz(str, end) : CheckOrphanLangPl(str, end);
}
// Copies a part of the text from [str, end) to the work buffer (white characters are preserved).
// While copying:
//  - the white characters after an orphan are replaced with a non-breaking space,
//  - after each new line (except when it is the last character of the range) an indentation is inserted,
//  - when break_after is set, a space is inserted after break_after non white characters in a row.
// 'str' is moved forward to the first character which has not been consumed yet,
// the method stops when the whole range is read or the buffer is (almost) full.
// Returns how many characters have been put to the buffer.
size_t HTMLFilter::PutNormalTextFillBuffer(const char * & str, const char * & end)
{
    const char * word = str; // pointing at the beginning of a word
    size_t i = 0;
    // some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
    size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;
    bool is_white;
    bool was_white = true;
    size_t non_whites = 0;

    for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
    {
        is_white = (*str==10 || IsWhite(*str));

        if( is_white && !was_white )
        {
            // a word has just ended
            if( CheckOrphan(word, str) )
            {
                i += PutNonBreakSpaceToBuffer(i);

                // here we have to skip the whole white string
                // (the range is checked first, *str is read only inside [str, end))
                for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str );

                if( str == end )
                    break;

                // now str is pointing at the first character of the next word
                is_white = false;
                was_white = true;
            }
        }

        // skipping the last new line character (if exists)
        if( *str == 10 && str < end-1 )
        {
            buffer[i] = *str;
            i += 1;
            i += PutTabsToBuffer(i, stack_len);
        }
        else
        if( *str != 10 )
        {
            buffer[i] = *str;
            i += 1;
        }

        if( was_white && !is_white )
            word = str;

        if( !is_white )
            non_whites += 1;
        else
            non_whites = 0;

        if( break_after!=0 && non_whites>=break_after )
        {
            buffer[i] = ' ';
            i += 1;
            non_whites = 0;
        }

        was_white = is_white;
    }

    return i;
}
// Copies a part of the text from [str, end) to the work buffer with trimming:
// each run of white characters (new lines included) is replaced with one space,
// or with a non-breaking space when the run follows an orphan.
// When break_after is set, a space is inserted into sequences of non white characters
// longer than break_after.
// 'str' is moved forward over the consumed characters,
// the method stops when the whole range is read or the buffer is (almost) full.
// Returns how many characters have been put to the buffer.
size_t HTMLFilter::PutNormalTextTrimFillBuffer(const char * & str, const char * & end)
{
    const char * word = str; // pointing at the beginning of a word
    size_t non_whites = 0;
    size_t i = 0;
    bool is_white;
    // some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
    size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;

    for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
    {
        is_white = (*str==10 || IsWhite(*str));

        if( is_white )
        {
            if( CheckOrphan(word, str) )
            {
                // the non-breaking space replaces the whole white string
                i += PutNonBreakSpaceToBuffer(i);
                is_white = false;
            }
            else
            {
                non_whites = 0;
            }

            // skipping the whole white string
            // (the range is checked first, *str is read only inside [str, end))
            for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str );

            word = str;
        }

        if( !is_white )
            non_whites += 1;
        else
            non_whites = 0;

        if( (break_after!=0 && non_whites>break_after) || is_white )
        {
            buffer[i] = ' ';
            i += 1;
            non_whites = 1;
        }

        // str can be equal to end after skipping a white string
        if( str < end )
        {
            buffer[i] = *str;
            i += 1;
        }
    }

    return i;
}
// Prints the text from [str, end) to the output (without trimming white characters).
// The text is processed in pieces which fit into the work buffer.
void HTMLFilter::PutNormalText(const char * str, const char * end)
{
    while( str < end )
    {
        // str is moved forward by the call
        const size_t filled = PutNormalTextFillBuffer(str, end);
        Put(buffer, buffer + filled);
    }
}
// Prints the text from [str, end) to the output with white characters trimmed.
// The text is processed in pieces which fit into the work buffer.
void HTMLFilter::PutNormalTextTrim(const char * str, const char * end)
{
    while( str < end )
    {
        // str is moved forward by the call
        const size_t filled = PutNormalTextTrimFillBuffer(str, end);
        Put(buffer, buffer + filled);
    }
}
// Appends '<' to the output.
void HTMLFilter::PutOpeningTagMark()
{
    out_string->push_back('<');
}
// Appends '>' to the output.
void HTMLFilter::PutClosingTagMark()
{
    out_string->push_back('>');
}
// Appends the null-terminated tag name to the output.
void HTMLFilter::PutTagName(const char * name)
{
    out_string->append(name);
}
// Returns false when the safe mode is on and the tag is on the list of unsafe tags
// (names are compared without regard to case). Without the safe mode every tag is safe.
bool HTMLFilter::IsTagSafe(const char * tag)
{
    if( !safe_mode )
        return true;

    static const char * unsafe_tags[] = {
        "script", "iframe", "frame", "frameset",
        "applet", "head", "meta", "html", "link", "body"
    };

    const size_t count = sizeof(unsafe_tags) / sizeof(unsafe_tags[0]);
    size_t index = 0;

    while( index < count )
    {
        if( IsNameEqual(tag, unsafe_tags[index]) )
            return false;

        ++index;
    }

    return true;
}
// start, end - arguments
void HTMLFilter::PutOpeningTag(const char * start, const char * end)
{
if( !IsTagSafe(LastItem().name) )
return;
PutOpeningTagMark();
PutTagName(LastItem().name);
if( start != end )
{
(*out_string) += ' ';
Put(start, end);
}
PutClosingTagMark();
}
// Prints the closing tag: </tag>
// Nothing is printed when the tag is not safe.
void HTMLFilter::PutClosingTag(const char * tag)
{
    if( IsTagSafe(tag) )
    {
        PutOpeningTagMark();
        out_string->push_back('/');
        PutTagName(tag);
        PutClosingTagMark();
    }
}
// Puts an indentation for the level 'len' to the buffer, starting at the position 'index'.
// One level takes tab_size spaces, levels deeper than 20 are indented like the level 20.
// Returns how many spaces have been written (zero when they would not fit into the buffer).
size_t HTMLFilter::PutTabsToBuffer(size_t index, size_t len)
{
    if( len == 0 )
        return 0;

    const size_t max_level = 20;
    const size_t level     = (len > max_level) ? max_level : len;

    // how many spaces do you want
    const size_t spaces = level * tab_size;

    if( index + spaces >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
        return 0;

    char * out = buffer + index;

    for( size_t left = spaces ; left > 0 ; --left )
        *out++ = ' ';

    return spaces;
}
// Puts a non-breaking space to the buffer at the position 'index':
// the "&nbsp;" entity in the orphan_nbsp mode, a single character with the code 160 otherwise.
// Returns how many characters have been written (can be zero when there is no room in the buffer).
size_t HTMLFilter::PutNonBreakSpaceToBuffer(size_t index)
{
    if( orphan_mode != orphan_nbsp )
    {
        if( index + 1 >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
            return 0;

        buffer[index] = static_cast<char>(160);

        return 1;
    }

    static const char nb[] = "&nbsp;";
    const size_t len = sizeof(nb) / sizeof(char) - 1; // without the terminating zero

    if( index + len >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
        return 0;

    for( size_t k = 0 ; k < len ; ++k )
        buffer[index + k] = nb[k];

    return len;
}
// Prints an indentation for the level 'len' to the output
// (the spaces are prepared at the beginning of the work buffer).
void HTMLFilter::PutTabs(size_t len)
{
    const size_t count = PutTabsToBuffer(0, len);

    Put(buffer, buffer + count);
}
// Prints a new line character (10) to the output
// (the first byte of the work buffer is used as a temporary).
void HTMLFilter::PutNewLine()
{
    char * const line = buffer;

    *line = 10;
    Put(line, line + 1);
}
// we assume the size of the opening mark to be one
// returns true when the current character is '<'
bool HTMLFilter::IsOpeningTagMark()
{
    return pchar[0] == '<';
}
// we assume the size of the closing mark to be one
// returns true when the current character is '>'
bool HTMLFilter::IsClosingTagMark()
{
    return pchar[0] == '>';
}
// the slash at the end <img src=".." /> (without '>' character)
// we assume the size of the mark to be one
// returns true when the current character is '/'
bool HTMLFilter::IsClosingXmlSimpleTagMark()
{
    return pchar[0] == '/';
}
// Returns true when a commentary ("<!--") begins at the current position.
bool HTMLFilter::IsOpeningCommentaryTagMark()
{
    static char comm_open[] = "<!--";

    // the length without the terminating zero
    return IsNameEqual(pchar, comm_open, sizeof(comm_open) / sizeof(char) - 1);
}
// Returns the length of the mark which opens a commentary.
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
    return sizeof("<!--") - 1; // four characters, the terminating zero is not counted
}
// skipping the commentary tag if exists
// Returns false (and does nothing) when there is no commentary at the current position.
// Otherwise pchar is set just after the closing "-->"
// (or at the end of the input when the commentary is not closed) and true is returned.
bool HTMLFilter::SkipCommentaryTagIfExists()
{
    if( !IsOpeningCommentaryTagMark() )
        return false;

    static char comm_close[] = "-->";
    const size_t comm_close_len = sizeof(comm_close) / sizeof(char) - 1;

    pchar += OpeningCommentaryTagMarkSize();

    // looking for "-->"
    for( ; *pchar != 0 ; ++pchar )
    {
        if( IsNameEqual(pchar, comm_close, comm_close_len) )
        {
            pchar += comm_close_len;
            break;
        }
    }

    CheckNewLine();

    return true;
}
// Used at the beginning of a text when the previous element has ended with a new line.
// Leading white characters are skipped, 'start' is set to the first character to print
// and 'last_non_white' is updated by the skipping.
// When something other than a tag follows (a text or a commentary),
// a new line and an indentation are printed.
void HTMLFilter::ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white)
{
    if( trim_white )
    {
        // skipping all white chars (with new lines),
        // the position of the last new line character is remembered
        while( *pchar==10 || IsWhite(*pchar) )
        {
            if( *pchar == 10 )
                last_non_white = pchar;

            ++pchar;
        }
    }
    else
    {
        // skipping first white chars with only one line between them
        SkipWhite();
        last_non_white = pchar;

        if( *pchar == 10 )
        {
            ++pchar;
            SkipWhite();
        }
    }

    start = pchar;

    // a commentary begins with '<' too but it is treated as a part of the text
    const bool is_tag = IsOpeningTagMark() && !IsOpeningCommentaryTagMark();

    if( !is_tag )
    {
        PutNewLine();
        PutTabs(stack_len);
    }
}
// reading text between html tags
// The text (commentaries included) is read up to the next tag or the end of the input
// and printed to the output. last_new_line is set to true when the last
// character other than a white one was a new line.
void HTMLFilter::ReadNormalText()
{
    const char * start          = pchar;
    const char * last_non_white = pchar;

    if( last_new_line )
        ReadNormalTextSkipWhite(start, last_non_white);

    for( ;; )
    {
        if( *pchar == 0 )
            break;

        if( SkipCommentaryTagIfExists() )
        {
            // the last character of the commentary
            last_non_white = pchar - 1;
            continue;
        }

        if( IsOpeningTagMark() )
            break;

        // a new line is not a white character for IsWhite()
        if( !IsWhite(*pchar) )
            last_non_white = pchar;

        ++pchar;
    }

    last_new_line = (*last_non_white == 10);

    if( trim_white )
        PutNormalTextTrim(start, pchar);
    else
        PutNormalText(start, pchar);
}
// start, end - parameters to a tag
// Prints the opening tag of the last item. When the previous element has ended with a new line
// the tag is put on a new line, indented one level less than the current stack size.
void HTMLFilter::PrintItem(const char * start, const char * end)
{
    const bool with_indent = last_new_line && stack_len > 1;

    if( last_new_line )
        PutNewLine();

    if( with_indent )
        PutTabs(stack_len - 1);

    PutOpeningTag(start, end);
}
bool HTMLFilter::ReadItem()
{
const char * start = pchar;
if( *pchar == 0 )
return false;
if( !PushStack() )
return false;
// we have '<'
pchar += 1;
SkipWhite();
if( *pchar == '/' ) // we have a closing tag
{
pchar += 1;
SkipWhite();
LastItem().type = Item::closing;
}
ReadItemName();
SkipWhite();
start = pchar; // arguments start here
if( LastItem().type != Item::closing )
LastItem().type = (LastItem().name[0] == '!') ? Item::special : Item::opening;
const char * end = SkipItemCheckXmlSimple();
if( LastItem().type != Item::closing )
PrintItem(start, end);
CheckNewLine();
LastItem().new_line = last_new_line;
return true;
}
// Converts an ascii capital letter to the small one, other values are returned unchanged.
int HTMLFilter::ToLower(int c)
{
    return (c>='A' && c<='Z') ? (c - 'A' + 'a') : c;
}
// Returns true when both null-terminated names are equal
// (ascii letters are compared without regard to case).
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2)
{
    while( *name1 != 0 && *name2 != 0 )
    {
        if( ToLower(*name1) != ToLower(*name2) )
            return false;

        ++name1;
        ++name2;
    }

    // both names have to end at the same place
    return *name1 == 0 && *name2 == 0;
}
// len characters from both strings must be equal
// (ascii letters are compared without regard to case)
// false is returned when one of the strings ends before len characters have been compared
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2, size_t len)
{
    size_t left = len;

    while( left > 0 && *name1 != 0 && *name2 != 0 )
    {
        if( ToLower(*name1) != ToLower(*name2) )
            return false;

        ++name1;
        ++name2;
        --left;
    }

    return left == 0;
}
// Returns true when the item from the top of the stack has the given name.
bool HTMLFilter::IsLastTag(const char * name)
{
    return IsNameEqual(name, LastItem().name);
}
// checking exceptions for opening tags
//  - tags without a closing tag (meta, input, br, hr, img, link) are removed from the stack at once,
//  - the body of script (only when the safe mode is off), pre and textarea
//    is copied to the output without filtering.
void HTMLFilter::CheckExceptions()
{
    static const char * simple_tags[] = {
        "meta", "input", "br", "hr", "img", "link"
    };

    const size_t simple_tags_len = sizeof(simple_tags) / sizeof(simple_tags[0]);

    for( size_t t = 0 ; t < simple_tags_len ; ++t )
    {
        if( IsLastTag(simple_tags[t]) )
        {
            LastItem().type = Item::simple;
            PopStack();
            return;
        }
    }

    // in safe_mode the script tag is ignored
    if( !safe_mode && IsLastTag("script") )
        PutLastTagWithClosingTag();

    if( IsLastTag("pre") || IsLastTag("textarea") )
        PutLastTagWithClosingTag();
}
void HTMLFilter::AddForgottenTags()
{
int i;
if( stack_len < 3 )
return;
// we have forgotten to close some tags
// looking whether there is a matching opening tag
for(i=int(stack_len)-3 ; i>=0 ; --i)
if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
break;
if( i < 0 )
{
// oops, there is no such a tag
// we don't print the closing and the missing opening tag
PopStack();
return;
}
for(int z=(int)stack_len-2 ; z>=i ; --z)
{
if( pstack[z].new_line )
{
PutNewLine();
PutTabs(z);
}
PutClosingTag(pstack[z].name);
pstack[z].Clear();
}
last_new_line = pstack[stack_len-1].new_line;
// invalidate tags
stack_len = i;
}
// Prints closing tags for all items which are still on the stack
// (tags which have not been closed in the input), from the top to the bottom.
// When the method returns the stack is empty (stack_len is exactly zero -
// the counter is decremented inside the loop so that it cannot wrap around below zero).
void HTMLFilter::CheckStackPrintRest()
{
    while( stack_len > 0 )
    {
        --stack_len;

        // stack_len is now the index of the item being closed
        if( stack_len==0 || pstack[stack_len-1].new_line )
            PutNewLine();

        PutTabs(stack_len);
        PutClosingTag(pstack[stack_len].name);
    }
}
void HTMLFilter::CheckClosingTags()
{
if( stack_len == 0 )
return;
// on the stack we have only opening tags
// but only the last tag is a closing tag
if( stack_len == 1 )
{
// there is only last closing tag
// we dont print it
PopStack();
return;
}
// there are more than one tag
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
{
// last closing tag is from the previous one
if( pstack[stack_len-2].new_line )
{
PutNewLine();
PutTabs(stack_len-2);
}
PutClosingTag(pstack[stack_len-1].name);
last_new_line = pstack[stack_len-1].new_line;
PopStack();
PopStack();
return;
}
AddForgottenTags();
}
// Prints everything from the current position to the end of the input without filtering.
// Returns true when something has been printed.
bool HTMLFilter::PrintRest()
{
    const char * const begin = pchar;

    while( *pchar != 0 )
        ++pchar;

    if( pchar == begin )
        return false;

    Put(begin, pchar);

    return true;
}
// The main loop of the filter: a text and a tag are read alternately
// and each tag is handled according to its type.
void HTMLFilter::Read()
{
    if( trim_white )
        SkipWhiteLines();

    // it can be some text or white lines before the first html tag (we print it)
    ReadNormalText();

    while( ReadItem() )
    {
        switch( LastItem().type )
        {
        case Item::opening:
            CheckExceptions();
            break;

        case Item::special:
        case Item::simple:
            if( stack_len > 1 )
            {
                // the parent tag remembers whether a new line follows this tag
                pstack[stack_len-2].new_line = LastItem().new_line;
            }
            else
            if( trim_white )
            {
                // one new line after a simple or special tag
                // (the tag has level 0 in the tree - it doesn't mean that this is the first tag)
                // for example it can be DOCTYPE
                PutNewLine();
            }

            PopStack();
            break;

        case Item::closing:
            CheckClosingTags();
            break;

        default:
            break;
        }

        ReadNormalText();
    }

    // sometimes ReadItem() can return false (when there is no space on the stack)
    // we print the rest of the html without filtering
    if( !PrintRest() )
        CheckStackPrintRest();
}