800 lines
12 KiB
C++
Executable File
800 lines
12 KiB
C++
Executable File
/*
|
|
* This file is a part of CMSLU -- Content Management System like Unix
|
|
* and is not publicly distributed
|
|
*
|
|
* Copyright (c) 2008-2009, Tomasz Sowa
|
|
* All rights reserved.
|
|
*
|
|
*/
|
|
|
|
#include "htmlfilter.h"
|
|
|
|
|
|
|
|
void HTMLFilter::Item::Clear()
|
|
{
|
|
name[0] = 0;
|
|
name_len = 0;
|
|
type = none;
|
|
new_line = false;
|
|
}
|
|
|
|
|
|
// a newly created item is always in the cleared state
HTMLFilter::Item::Item()
{
    this->Clear();
}
|
|
|
|
|
|
|
|
void HTMLFilter::Filter(const char * in, std::string & out)
|
|
{
|
|
pchar = in;
|
|
stack_len = 0;
|
|
out_string = &out;
|
|
last_new_line = false;
|
|
out_string->clear();
|
|
|
|
Read();
|
|
}
|
|
|
|
|
|
// the same as Filter(const char *, std::string &) but the input is taken from a std::string
// (room for twice the input size plus one is reserved in 'out' up front)
void HTMLFilter::Filter(const std::string & in, std::string & out)
{
    const size_t expected_size = in.size() * 2 + 1;

    out.reserve(expected_size);
    Filter(in.c_str(), out);
}
|
|
|
|
|
|
// creates a filter with default settings:
// tab size equal to two, no trimming of white characters, no breaking of long lines
HTMLFilter::HTMLFilter()
{
    // settings
    break_long_lines = false;
    trim_white       = false;
    tab_size         = 2;

    // working areas: the stack of items and the output scratch buffer
    pstack = new Item[CMSLU_HTMLFILTER_STACK_MAXLEN];
    buffer = new char[CMSLU_HTMLFILTER_BUFFER_MAXLEN];
}
|
|
|
|
|
|
// copy constructor
// the copy gets its own stack and its own buffer (their contents are not copied,
// they are only working areas) and the same settings as 'f'
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
    // don't need to copy the stack
    pstack = new Item[CMSLU_HTMLFILTER_STACK_MAXLEN];
    buffer = new char[CMSLU_HTMLFILTER_BUFFER_MAXLEN];

    // the settings have to be copied, otherwise they would be left uninitialized
    tab_size         = f.tab_size;
    trim_white       = f.trim_white;
    break_long_lines = f.break_long_lines;
}
|
|
|
|
|
|
// assignment operator
// only the settings are taken from 'f'
//
// the stack and the buffer are fixed size working areas which both constructors
// in this file have already allocated, so they are simply reused here
// (allocating them again would leak the previous arrays)
// assigning an object to itself is harmless
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
    // don't need to copy the stack
    tab_size         = f.tab_size;
    trim_white       = f.trim_white;
    break_long_lines = f.break_long_lines;

    return *this;
}
|
|
|
|
|
|
// releases both arrays allocated in the constructors
HTMLFilter::~HTMLFilter()
{
    delete [] buffer;
    delete [] pstack;
}
|
|
|
|
|
|
void HTMLFilter::BreakLongLines(bool break_lines)
|
|
{
|
|
break_long_lines = break_lines;
|
|
}
|
|
|
|
|
|
void HTMLFilter::TrimWhite(bool trim)
|
|
{
|
|
trim_white = trim;
|
|
}
|
|
|
|
|
|
void HTMLFilter::InsertTabs(size_t tabsize)
|
|
{
|
|
tab_size = tabsize;
|
|
|
|
if( tab_size > 1000 )
|
|
tab_size = 1000;
|
|
}
|
|
|
|
|
|
// returns the item with index 'i' from the stack
// if there is no such item a reference to the (freshly cleared) 'empty' item is returned
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
    if( i < stack_len )
        return pstack[i];

    empty.Clear();

    return empty;
}
|
|
|
|
|
|
// returns the item from the top of the stack
// if the stack is empty a reference to the (freshly cleared) 'empty' item is returned
HTMLFilter::Item & HTMLFilter::LastItem()
{
    if( stack_len != 0 )
        return pstack[stack_len-1];

    empty.Clear();

    return empty;
}
|
|
|
|
|
|
bool HTMLFilter::PushStack()
|
|
{
|
|
if( stack_len == CMSLU_HTMLFILTER_STACK_MAXLEN )
|
|
// oops, too many items
|
|
return false;
|
|
|
|
pstack[stack_len].Clear();
|
|
stack_len += 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
void HTMLFilter::PopStack()
|
|
{
|
|
if( stack_len == 0 )
|
|
// oops
|
|
return;
|
|
|
|
stack_len -= 1;
|
|
pstack[stack_len].Clear();
|
|
}
|
|
|
|
|
|
// returns true for a space, a tab and the character with code 13
// the character with code 10 (new line) is deliberately not treated as white here
bool HTMLFilter::IsWhite(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 13:
        return true;

    default:
        return false;
    }
}
|
|
|
|
|
|
void HTMLFilter::SkipWhite()
|
|
{
|
|
while( IsWhite(*pchar) )
|
|
++pchar;
|
|
}
|
|
|
|
|
|
void HTMLFilter::SkipWhiteLines()
|
|
{
|
|
while( *pchar==10 || IsWhite(*pchar) )
|
|
++pchar;
|
|
}
|
|
|
|
|
|
bool HTMLFilter::SkipTagCheck()
|
|
{
|
|
pchar += 1;
|
|
SkipWhite();
|
|
|
|
if( *pchar == '/' )
|
|
{
|
|
pchar += 1;
|
|
SkipWhite();
|
|
|
|
if( IsNameEqual(pchar, LastItem().name, LastItem().name_len) )
|
|
{
|
|
pchar += LastItem().name_len;
|
|
SkipWhite();
|
|
|
|
if( *pchar == '>' )
|
|
{
|
|
pchar += 1;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
void HTMLFilter::SkipNormalText()
|
|
{
|
|
while( *pchar!=0 && *pchar!='<' )
|
|
++pchar;
|
|
}
|
|
|
|
void HTMLFilter::CheckNewLine()
|
|
{
|
|
const char * start = pchar;
|
|
|
|
SkipWhite();
|
|
last_new_line = (*pchar==10);
|
|
|
|
pchar = start;
|
|
}
|
|
|
|
|
|
void HTMLFilter::PutLastTagWithClosingTag()
|
|
{
|
|
const char * start = pchar;
|
|
|
|
|
|
while( *pchar != 0 )
|
|
{
|
|
if( *pchar == '<' )
|
|
{
|
|
if( SkipTagCheck() )
|
|
{
|
|
PopStack();
|
|
CheckNewLine();
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pchar += 1;
|
|
}
|
|
}
|
|
|
|
Put(start, pchar);
|
|
}
|
|
|
|
|
|
|
|
void HTMLFilter::SkipItem()
|
|
{
|
|
while( *pchar!=0 && *pchar!='>' )
|
|
++pchar;
|
|
|
|
if( *pchar == '>' )
|
|
++pchar;
|
|
}
|
|
|
|
|
|
void HTMLFilter::SkipItemCheckXmlSimple()
|
|
{
|
|
while( *pchar!=0 )
|
|
{
|
|
while( *pchar!=0 && *pchar!='>' && *pchar!='/')
|
|
++pchar;
|
|
|
|
if( *pchar == '/' ) // closing xml tag
|
|
{
|
|
++pchar;
|
|
SkipWhite();
|
|
|
|
if( *pchar == '>' )
|
|
{
|
|
++pchar;
|
|
LastItem().type = Item::simple;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
if( *pchar == '>' )
|
|
{
|
|
++pchar;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// returns true if 'c' can be a part of a tag name:
// a letter a-z or A-Z, a digit, a minus sign or an exclamation mark
bool HTMLFilter::IsValidCharForName(int c)
{
    const bool is_small_letter = (c>='a' && c<='z');
    const bool is_big_letter   = (c>='A' && c<='Z');
    const bool is_digit        = (c>='0' && c<='9');
    const bool is_special      = (c=='-' || c=='!');

    return is_small_letter || is_big_letter || is_digit || is_special;
}
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemName()
|
|
{
|
|
size_t i;
|
|
|
|
for( i=0 ; IsValidCharForName(*pchar) && i<CMSLU_HTMLFILTER_ITEM_MAXLEN-1 ; ++i )
|
|
{
|
|
LastItem().name[i] = *pchar;
|
|
++pchar;
|
|
}
|
|
|
|
LastItem().name[i] = 0;
|
|
LastItem().name_len = i;
|
|
}
|
|
|
|
|
|
|
|
void HTMLFilter::Put(const char * str, const char * end)
|
|
{
|
|
if( str>=end )
|
|
return;
|
|
|
|
size_t len = end - str;
|
|
out_string->append(str, len);
|
|
}
|
|
|
|
|
|
size_t HTMLFilter::PutTrimFillBuffer(const char * & str, const char * & end)
|
|
{
|
|
size_t non_whites = 0;
|
|
size_t i = 0;
|
|
bool was_white;
|
|
|
|
|
|
for( ; str < end && i<CMSLU_HTMLFILTER_BUFFER_MAXLEN-1 ; ++str )
|
|
{
|
|
was_white = false;
|
|
|
|
if( *str==10 || IsWhite(*str) )
|
|
{
|
|
// skipping the whole white string
|
|
for( ; (*str==10 || IsWhite(*str)) && str < end ; ++str );
|
|
|
|
was_white = true;
|
|
non_whites = 0;
|
|
}
|
|
else
|
|
{
|
|
non_whites += 1;
|
|
}
|
|
|
|
if( (non_whites>60 && break_long_lines) || was_white )
|
|
{
|
|
buffer[i] = ' ';
|
|
i += 1;
|
|
non_whites = 0;
|
|
}
|
|
|
|
if( str < end )
|
|
{
|
|
buffer[i] = *str;
|
|
i += 1;
|
|
}
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// if there are more than one white characters then they will be changed into one space
|
|
// and putting a space between 50 characters (if there were not any other white characters between them)
|
|
void HTMLFilter::PutTrim(const char * str, const char * end)
|
|
{
|
|
size_t buf_len;
|
|
|
|
// this buffer must have at least 2 bytes (PutTrimFillBuffer needs it)
|
|
if( !trim_white || CMSLU_HTMLFILTER_BUFFER_MAXLEN < 2 )
|
|
{
|
|
Put(str, end);
|
|
return;
|
|
}
|
|
|
|
while( str < end )
|
|
{
|
|
buf_len = PutTrimFillBuffer(str, end);
|
|
Put(buffer, buffer+buf_len);
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLFilter::PutOpeningTag(const char * tag)
|
|
{
|
|
size_t i;
|
|
|
|
if( CMSLU_HTMLFILTER_BUFFER_MAXLEN < CMSLU_HTMLFILTER_ITEM_MAXLEN+2 )
|
|
return;
|
|
|
|
buffer[0] = '<';
|
|
|
|
for(i=1 ; *tag ; ++i, ++tag)
|
|
buffer[i] = *tag;
|
|
|
|
buffer[i] = '>';
|
|
i += 1;
|
|
|
|
Put(buffer, buffer+i);
|
|
}
|
|
|
|
|
|
|
|
void HTMLFilter::PutClosingTag(const char * tag)
|
|
{
|
|
size_t i;
|
|
|
|
if( CMSLU_HTMLFILTER_BUFFER_MAXLEN < CMSLU_HTMLFILTER_ITEM_MAXLEN+3 )
|
|
return;
|
|
|
|
buffer[0] = '<';
|
|
buffer[1] = '/';
|
|
|
|
for(i=2 ; *tag ; ++i, ++tag)
|
|
buffer[i] = *tag;
|
|
|
|
buffer[i] = '>';
|
|
i += 1;
|
|
|
|
Put(buffer, buffer+i);
|
|
}
|
|
|
|
|
|
void HTMLFilter::PutTabs(size_t len)
|
|
{
|
|
if( len == 0 )
|
|
return;
|
|
|
|
if( len > 20 )
|
|
len = 20;
|
|
|
|
// how many spaces do you want
|
|
size_t spaces = (len-1) * tab_size;
|
|
size_t i;
|
|
|
|
if( spaces < CMSLU_HTMLFILTER_BUFFER_MAXLEN )
|
|
{
|
|
for(i=0 ; i<spaces ; ++i)
|
|
buffer[i] = ' ';
|
|
|
|
Put(buffer, buffer+i);
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLFilter::PutNewLine()
|
|
{
|
|
if( !trim_white )
|
|
// new line characters will be directly from the source html
|
|
return;
|
|
|
|
buffer[0] = 10;
|
|
Put(buffer, buffer+1);
|
|
}
|
|
|
|
|
|
|
|
bool HTMLFilter::IsOpeningCommentaryTag()
|
|
{
|
|
static char comm_open[] = "<!--";
|
|
size_t comm_open_len = sizeof(comm_open) / sizeof(char) - 1;
|
|
|
|
return IsNameEqual(pchar, comm_open, comm_open_len);
|
|
}
|
|
|
|
|
|
|
|
// skipping the commentary tag if exists
|
|
bool HTMLFilter::SkipCommentaryTagIfExists()
|
|
{
|
|
static char comm_close[] = "-->";
|
|
size_t comm_close_len = sizeof(comm_close) / sizeof(char) - 1;
|
|
|
|
if( !IsOpeningCommentaryTag() )
|
|
return false;
|
|
|
|
pchar += 4; // size of commentary opening tag
|
|
|
|
// looking for "-->"
|
|
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
|
|
++pchar;
|
|
|
|
if( *pchar!= 0 )
|
|
pchar += comm_close_len;
|
|
|
|
CheckNewLine();
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void HTMLFilter::ReadNormalText()
|
|
{
|
|
const char * start = pchar;
|
|
|
|
SkipWhiteLines();
|
|
|
|
if( *pchar!=0 && (IsOpeningCommentaryTag() || *pchar!='<') && last_new_line )
|
|
{
|
|
if( *pchar == '<' )
|
|
// skipping some white characters before a opening commentary tag
|
|
// (but only a tag in a new line)
|
|
start = pchar;
|
|
|
|
PutNewLine();
|
|
PutTabs(stack_len+1);
|
|
last_new_line = false; // in normal text we don't allow a new line character
|
|
}
|
|
|
|
|
|
while( *pchar != 0 )
|
|
{
|
|
if( !SkipCommentaryTagIfExists() )
|
|
{
|
|
if( *pchar == '<' )
|
|
break;
|
|
|
|
pchar += 1;
|
|
}
|
|
}
|
|
|
|
PutTrim(start, pchar);
|
|
}
|
|
|
|
|
|
void HTMLFilter::PrintItem(const char * start, const char * end)
|
|
{
|
|
if( LastItem().type != Item::closing )
|
|
{
|
|
if( last_new_line )
|
|
{
|
|
if( stack_len > 0 )
|
|
PutNewLine();
|
|
|
|
if( stack_len > 1 )
|
|
PutTabs(stack_len);
|
|
}
|
|
|
|
// closing tags will be printed later
|
|
// (when we check the stack)
|
|
|
|
Put(start, pchar);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::ReadItem()
|
|
{
|
|
const char * start;
|
|
|
|
if( *pchar == 0 )
|
|
return false;
|
|
|
|
// we have '<'
|
|
start = pchar;
|
|
pchar += 1;
|
|
SkipWhite();
|
|
|
|
if( PushStack() )
|
|
{
|
|
if( *pchar == '/' ) // we have a closing tag
|
|
{
|
|
pchar += 1;
|
|
SkipWhite();
|
|
LastItem().type = Item::closing;
|
|
}
|
|
|
|
ReadItemName();
|
|
|
|
if( LastItem().type != Item::closing )
|
|
LastItem().type = (LastItem().name[0] == '!') ? Item::special : Item::opening;
|
|
|
|
SkipItemCheckXmlSimple();
|
|
PrintItem(start, pchar);
|
|
CheckNewLine();
|
|
LastItem().new_line = last_new_line;
|
|
return true;
|
|
}
|
|
|
|
pchar = start;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// changes the letters A-Z into a-z, other values are returned without any change
int HTMLFilter::ToLower(int c)
{
    const bool is_big_letter = (c>='A' && c<='Z');

    return is_big_letter ? (c - 'A' + 'a') : c;
}
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2)
|
|
{
|
|
for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
return false;
|
|
|
|
if( *name1==0 && *name2==0 )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// len characters from both strings must be equal
|
|
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2, size_t len)
|
|
{
|
|
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
return false;
|
|
|
|
if( len == 0 )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
bool HTMLFilter::IsLastTag(const char * name)
|
|
{
|
|
const char * tag = LastItem().name;
|
|
|
|
return IsNameEqual(name, tag);
|
|
}
|
|
|
|
|
|
// checking exceptions for opening tags
|
|
void HTMLFilter::CheckExceptions()
|
|
{
|
|
if( IsLastTag("meta") ||
|
|
IsLastTag("input") ||
|
|
IsLastTag("br") ||
|
|
IsLastTag("img") ||
|
|
IsLastTag("link") )
|
|
{
|
|
LastItem().type = Item::simple;
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
|
|
if( IsLastTag("script") || IsLastTag("pre") || IsLastTag("textarea") )
|
|
PutLastTagWithClosingTag();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AddForgottenTags()
|
|
{
|
|
int i;
|
|
|
|
if( stack_len < 3 )
|
|
return;
|
|
|
|
// we have forgotten to close some tags
|
|
|
|
// looking whether there is a matching opening tag
|
|
for(i=int(stack_len)-3 ; i>=0 ; --i)
|
|
if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
|
|
break;
|
|
|
|
if( i < 0 )
|
|
{
|
|
// oops, there is no such a tag
|
|
// we don't print the closing and the missing opening tag
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
for(int z=(int)stack_len-2 ; z>=i ; --z)
|
|
{
|
|
if( pstack[z].new_line )
|
|
{
|
|
PutNewLine();
|
|
PutTabs(z+1);
|
|
}
|
|
|
|
PutClosingTag(pstack[z].name);
|
|
pstack[z].Clear();
|
|
}
|
|
|
|
last_new_line = pstack[stack_len-1].new_line;
|
|
|
|
// invalidate tags
|
|
stack_len = i;
|
|
}
|
|
|
|
|
|
void HTMLFilter::CheckClosingTags()
|
|
{
|
|
if( stack_len == 0 )
|
|
return;
|
|
|
|
// on the stack we have only opening tags
|
|
// but only the last tag is a closing tag
|
|
|
|
if( stack_len == 1 )
|
|
{
|
|
// there is only last closing tag
|
|
// we dont print it
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
// there are more than one tag
|
|
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
|
|
{
|
|
// last closing tag is from the previous one
|
|
if( pstack[stack_len-2].new_line )
|
|
{
|
|
PutNewLine();
|
|
PutTabs(stack_len-1);
|
|
}
|
|
|
|
PutClosingTag(pstack[stack_len-1].name);
|
|
last_new_line = pstack[stack_len-1].new_line;
|
|
PopStack();
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
AddForgottenTags();
|
|
}
|
|
|
|
|
|
void HTMLFilter::PrintRest()
|
|
{
|
|
const char * start = pchar;
|
|
|
|
while( *pchar )
|
|
++pchar;
|
|
|
|
if( pchar > start )
|
|
Put(start, pchar);
|
|
}
|
|
|
|
|
|
void HTMLFilter::Read()
|
|
{
|
|
// white characters at the beginning are skipped
|
|
SkipWhiteLines();
|
|
|
|
// it can be some text before the first html tag (we print it)
|
|
ReadNormalText();
|
|
|
|
while( *pchar && ReadItem() )
|
|
{
|
|
if( LastItem().type == Item::opening )
|
|
{
|
|
CheckExceptions();
|
|
}
|
|
else
|
|
if( LastItem().type == Item::special || LastItem().type == Item::simple )
|
|
{
|
|
if( stack_len > 1 )
|
|
{
|
|
pstack[stack_len-2].new_line = LastItem().new_line;
|
|
}
|
|
else
|
|
{
|
|
// one new line after a simple or special tag
|
|
// (if the tag has level 0 in the tree - it not means that this is a first tag)
|
|
// for example can be DOCTYPE
|
|
PutNewLine();
|
|
}
|
|
|
|
PopStack();
|
|
}
|
|
else
|
|
if( LastItem().type == Item::closing )
|
|
{
|
|
CheckClosingTags();
|
|
}
|
|
|
|
ReadNormalText();
|
|
}
|
|
|
|
// sometimes ReadItem() can return a false (when there is no space on the stack)
|
|
// we print the rest html without filtering
|
|
PrintRest();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|