winix/core/htmlfilter.cpp

/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2010, Tomasz Sowa
 * All rights reserved.
 *
 */

#include "htmlfilter.h"


// Puts the item back into its pristine state:
// an empty name, no type and no pending new line.
void HTMLFilter::Item::Clear()
{
	new_line = false;
	type     = none;
	name_len = 0;
	name[0]  = '\0';
}


// A freshly constructed item starts in the cleared state.
HTMLFilter::Item::Item()
{
	this->Clear();
}


// Filters the zero-terminated html text 'in' and stores the result in 'out'.
//
// in  - input text; a null pointer is treated as an empty document
// out - receives the filtered text (its previous content is discarded)
//
// The parser state (read position, tag stack, new-line flag) is reset
// on every call, so one object can be used for many documents.
void HTMLFilter::Filter(const char * in, std::string & out)
{
	// Read() dereferences pchar unconditionally, so a null input
	// must be replaced with an empty string before parsing starts
	pchar         = (in != 0) ? in : "";
	stack_len     = 0;
	out_string    = &out;
	last_new_line = false;
	out_string->clear();

	Init();
	Read();
	Deinit();
}


// Called by Filter() right before the input is read.
// Currently there is nothing to prepare.
void HTMLFilter::Init()
{
}


// Called by Filter() right after the input has been read.
// Currently there is nothing to release.
void HTMLFilter::Deinit()
{
}


// Convenience overload: filters the content of 'in' into 'out'.
// The output is pre-sized to twice the input length (plus one)
// before the work is handed to the const char* overload.
void HTMLFilter::Filter(const std::string & in, std::string & out)
{
	const std::string::size_type expected_size = in.size() * 2 + 1;

	out.reserve(expected_size);
	Filter(in.c_str(), out);
}


// Creates a filter with its own tag stack and scratch buffer
// and with the default options set.
HTMLFilter::HTMLFilter()
{
	// working memory
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	// default options
	safe_mode   = false;
	orphan_mode = orphan_nbsp;
	lang        = lang_none;
	break_after = 0;
	trim_white  = false;
	tab_size    = 2;
}


// Copy constructor.
//
// The tag stack and the scratch buffer are working memory which is
// reset by every Filter() call, so their content is not copied - the new
// object only gets fresh allocations. The options, however, have to be
// taken from 'f', otherwise the copy would work with indeterminate values.
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
	// don't need to copy the stack
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	tab_size    = f.tab_size;
	trim_white  = f.trim_white;
	break_after = f.break_after;
	lang        = f.lang;
	orphan_mode = f.orphan_mode;
	safe_mode   = f.safe_mode;
}


// Copy assignment.
//
// This object already owns a tag stack and a scratch buffer (allocated
// by its constructor) and their content is only working memory, so they
// are kept as they are - allocating new ones here would leak the old ones.
// Only the options are taken from 'f'. Self-assignment is harmless.
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
	tab_size    = f.tab_size;
	trim_white  = f.trim_white;
	break_after = f.break_after;
	lang        = f.lang;
	orphan_mode = f.orphan_mode;
	safe_mode   = f.safe_mode;

	return *this;
}


// Releases the scratch buffer and the tag stack.
HTMLFilter::~HTMLFilter()
{
	delete [] buffer;
	delete [] pstack;
}


// Sets after how many consecutive non-white characters a space is inserted
// (zero turns the feature off). Values above 10000 are clamped to 10000.
void HTMLFilter::BreakLines(size_t break_after_)
{
	const size_t limit = 10000;

	break_after = (break_after_ > limit) ? limit : break_after_;
}


void HTMLFilter::TrimWhite(bool trim)
{
	trim_white = trim;
}


// Sets how many spaces make one indentation level.
// Values above 1000 are clamped to 1000.
void HTMLFilter::InsertTabs(size_t tabsize)
{
	const size_t limit = 1000;

	tab_size = (tabsize > limit) ? limit : tabsize;
}


// Selects the language whose orphan table is used (lang_none disables
// the check) and the kind of non-breaking space put after an orphan.
void HTMLFilter::CheckOrphans(HTMLFilter::Lang lang_, HTMLFilter::OrphanMode mode)
{
	orphan_mode = mode;
	lang        = lang_;
}


void HTMLFilter::SafeMode(bool safe_mode_)
{
	safe_mode = safe_mode_;
}


// Returns the i-th item from the stack.
// For an index outside the stack the shared 'empty' item is returned
// (it is cleared first).
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
	if( i < stack_len )
		return pstack[i];

	empty.Clear();

	return empty;
}


// Returns the item from the top of the stack.
// When the stack has no items the shared 'empty' item is returned
// (it is cleared first).
HTMLFilter::Item & HTMLFilter::LastItem()
{
	if( stack_len != 0 )
		return pstack[stack_len-1];

	empty.Clear();

	return empty;
}


// Puts a new, cleared item on the top of the stack.
// Returns false (and changes nothing) when the stack is already full.
bool HTMLFilter::PushStack()
{
	const bool has_room = (stack_len != WINIX_HTMLFILTER_STACK_MAXLEN);

	if( has_room )
	{
		pstack[stack_len].Clear();
		++stack_len;
	}

	return has_room;
}

// Removes the item from the top of the stack and clears it.
// Nothing happens when the stack has no items.
void HTMLFilter::PopStack()
{
	if( stack_len != 0 )
	{
		--stack_len;
		pstack[stack_len].Clear();
	}
}


// Returns true for a space, a tab, a carriage return (13) and the value 160.
// The line feed (10) is deliberately not a white character here,
// callers test it separately.
//
// NOTE(review): callers pass plain char values; where char is signed
// the byte 0xA0 arrives as a negative int and never equals 160 -
// confirm whether the non-breaking space is really meant to be matched.
bool HTMLFilter::IsWhite(int c)
{
	return c==' ' || c=='\t' || c==13 || c==160;
}


// Moves the read position over white characters (a line feed stops it).
void HTMLFilter::SkipWhite()
{
	for( ; IsWhite(*pchar) ; ++pchar )
	{
	}
}


// Moves the read position over white characters and line feeds.
void HTMLFilter::SkipWhiteLines()
{
	for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar )
	{
	}
}


// Skips white characters, then at most one line feed
// and the white characters which follow it.
void HTMLFilter::SkipWhiteWithFirstNewLine()
{
	SkipWhite();

	if( *pchar != 10 )
		return;

	++pchar;
	SkipWhite();
}


// Sets last_new_line to true when only white characters stand between
// the read position and the next line feed.
// The read position itself is left untouched.
void HTMLFilter::CheckNewLine()
{
	const char * const saved_position = pchar;

	SkipWhite();
	last_new_line = (*pchar == 10);

	pchar = saved_position;
}


bool HTMLFilter::IsClosingTagForLastItem()
{
	pchar += 1;
	SkipWhite();

	if( *pchar == '/' )
	{
		pchar += 1;
		SkipWhite();

		if( IsNameEqual(pchar, LastItem().name, LastItem().name_len) )
		{
			pchar += LastItem().name_len;
			SkipWhite();

			if( IsClosingTagMark() )
			{
				pchar += 1;
				return true;
			}
		}
	}

return false;
}


// Used for such tags as: script, pre, textarea.
// The text from the read position up to (and including) the closing tag
// of the last item is printed without any changes. When the closing tag
// is found the item is removed from the stack; if there is no closing
// tag everything up to the end of the input is printed.
void HTMLFilter::PutLastTagWithClosingTag()
{
	const char * const begin = pchar;
	bool closed = false;

	while( !closed && *pchar != 0 )
	{
		if( !IsOpeningTagMark() )
		{
			++pchar;
			continue;
		}

		// IsClosingTagForLastItem() always moves the read position forward
		if( IsClosingTagForLastItem() )
		{
			PopStack();
			CheckNewLine();
			closed = true;
		}
	}

	Put(begin, pchar);
}


// Moves the read position behind the end of the current tag.
//
// If the tag ends with "/>" (white characters are allowed between the two
// marks) the last item is marked as a simple one.
// Returns a pointer to the end of the tag arguments: the closing '>'
// or the last '/' seen before it (or the initial read position if neither
// of them was found).
const char * HTMLFilter::SkipItemCheckXmlSimple()
{
	const char * args_end = pchar;

	while( *pchar != 0 )
	{
		// looking for the nearest '>' or '/'
		for( ; *pchar != 0 ; ++pchar )
			if( IsClosingTagMark() || IsClosingXmlSimpleTagMark() )
				break;

		if( IsClosingTagMark() )
		{
			args_end = pchar;
			++pchar;
			break;
		}

		if( IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
		{
			args_end = pchar;
			++pchar;
			SkipWhite();

			if( IsClosingTagMark() )
			{
				++pchar;
				LastItem().type = Item::simple;
				break;
			}

			// a slash in the middle of the arguments, keep looking
		}
	}

	return args_end;
}


// Returns true if 'c' can be a part of a tag name:
// an ascii letter, a digit, '-' or '!'.
bool HTMLFilter::IsValidCharForName(int c)
{
	if( c>='a' && c<='z' )
		return true;

	if( c>='A' && c<='Z' )
		return true;

	if( c>='0' && c<='9' )
		return true;

	return c=='-' || c=='!';
}


void HTMLFilter::ReadItemName()
{
size_t i;

	for( i=0 ; IsValidCharForName(*pchar) && i<WINIX_HTMLFILTER_ITEM_MAXLEN-1 ; ++i )
	{
		LastItem().name[i] = *pchar;
		++pchar;
	}

	LastItem().name[i] = 0;
	LastItem().name_len = i;
}


// Appends the characters from the range [str, end) to the output.
// An empty or reversed range is ignored.
void HTMLFilter::Put(const char * str, const char * end)
{
	if( str < end )
	{
		const size_t count = end - str;
		out_string->append(str, count);
	}
}


// Compares the word from the range [str, end) with the zero terminated
// string 'orphan'. Characters are compared as unsigned bytes,
// ascii letters without regard to their case.
//
// Returns zero when both are equal, a negative value when the word
// is ordered before 'orphan' and a positive value otherwise
// (a proper prefix is ordered before the longer string).
int HTMLFilter::CheckOrphan(const char * str, const char * end, const char * orphan)
{
	for( ; str<end && *orphan!=0 ; ++str, ++orphan )
	{
		// the difference is signed so it is kept in an int
		// (no round trip through an unsigned type)
		const int diff = ToLower(static_cast<unsigned char>(*str)) -
						 ToLower(static_cast<unsigned char>(*orphan));

		if( diff != 0 )
			return diff;
	}

	// the word is longer than the orphan
	if( str < end )
		return ToLower(static_cast<unsigned char>(*str));

	// the orphan is longer (or both have ended - then the result is zero)
	return -ToLower(static_cast<unsigned char>(*orphan));
}


// Binary search for the word [str, end) in a sorted table of orphans.
// o1 - index of the first element
// o2 - index of the last element
// Returns true when the word is in the table.
bool HTMLFilter::CheckOrphanTable(const char * str, const char * end, const char ** table, size_t o1, size_t o2)
{
	// first the two borders
	int cmp = CheckOrphan(str, end, table[o1]);

	if( cmp <= 0 )
		return cmp == 0; // found, or the word is before the first element

	cmp = CheckOrphan(str, end, table[o2]);

	if( cmp >= 0 )
		return cmp == 0; // found, or the word is after the last element

	// now the word is somewhere between table[o1] and table[o2] (exclusive)
	while( o1 + 1 < o2 )
	{
		const size_t mid = (o1 + o2) / 2;
		cmp = CheckOrphan(str, end, table[mid]);

		if( cmp == 0 )
			return true;

		if( cmp > 0 )
			o1 = mid;
		else
			o2 = mid;
	}

	return false;
}


// Returns true if the word [str, end) is a Polish orphan
// (a short word which should not be left at the end of a line).
bool HTMLFilter::CheckOrphanLangPl(const char * str, const char * end)
{
	// The table must be sorted in the order used by CheckOrphan()
	// (unsigned byte values), because it is searched with a binary search.
	//
	// Polish letters are coded in iso-8859-2 and written as hex escapes,
	// so they do not depend on the encoding of this source file:
	//   \xB1 - a with ogonek, \xEA - e with ogonek,
	//   \xBF - z with dot above, \xF3 - o with acute
	// ("\xBF" "e" is split in two because 'e' would be taken
	// as a part of the hex escape)
	static const char * orphans[] = {
		"(np.", "a",      "ale", "bo",  "by",   "co",     "czy",    "do",     "go",      "i",
		"ich",  "ja",     "je",  "jej", "jest", "ju\xBF", "j\xB1",  "ku",     "li",      "mi",  "na",
		"nie",  "np.",    "nr",  "o",   "od",   "po",     "s.",     "s\xB1",  "ta",      "to",  "tu",
		"t\xB1", "t\xEA", "u",   "w",   "we",   "wy",     "z",      "za",     "ze",      "\xBF" "e", "\xF3w"
	};

	size_t o1 = 0;
	size_t o2 = sizeof(orphans) / sizeof(const char*) - 1;

	return CheckOrphanTable(str, end, orphans, o1, o2);
}


// Slovak and Czech.
// Returns true if the word [str, end) is an orphan in these languages.
bool HTMLFilter::CheckOrphanLangCz(const char * str, const char * end)
{
	// the table must be sorted in alphabetical order
	static const char * orphans[] = {
		"a", "i", "k", "o", "s", "u", "v", "z"
	};

	const size_t count = sizeof(orphans) / sizeof(orphans[0]);
	const size_t first = 0;
	const size_t last  = count - 1;

	return CheckOrphanTable(str, end, orphans, first, last);
}


// Returns true if the word [str, end) is an orphan in the selected language.
// An empty word is never an orphan; lang_none turns the check off.
// The Czech/Slovak table is used for lang_cz and lang_sk,
// the Polish one for every other language.
bool HTMLFilter::CheckOrphan(const char * str, const char * end)
{
	if( lang == lang_none || str == end )
		return false;

	const bool czech_or_slovak = (lang == lang_cz || lang == lang_sk);

	if( czech_or_slovak )
		return CheckOrphanLangCz(str, end);

	return CheckOrphanLangPl(str, end);
}


// Copies a part of the text [str, end) to the scratch buffer
// (the version used when white characters are not trimmed).
//
// While copying:
//  - white characters after an orphan are replaced with a non-breaking space,
//  - indentation is put after each line feed,
//  - a line feed being the very last character of the text is dropped,
//  - a space is inserted after break_after consecutive non-white characters
//    (if break_after is not zero).
//
// str - in/out: moved behind the characters which have been consumed
// end - end of the text (not changed)
// Returns how many characters were put to the buffer. The function stops
// either at 'end' or when the buffer is nearly full (some room is left for
// the characters inserted above) - the caller calls it again for the rest.
size_t HTMLFilter::PutNormalTextFillBuffer(const char * & str, const char * & end)
{
	const char * word = str; // pointing at the beginning of a word
	size_t i = 0;
	// some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
	size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;
	bool is_white;
	bool was_white = true;
	size_t non_whites = 0;


	for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
	{
		is_white = (*str==10 || IsWhite(*str));

		if( is_white && !was_white )
		{
			// a word has just ended
			if( CheckOrphan(word, str) )
			{
				i += PutNonBreakSpaceToBuffer(i);

				// here we have to skip the whole white string
				// (the range is checked first so *str is never read at 'end')
				for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str );

				if( str == end )
					break;

				// now str points at the first character of the next word
				is_white  = false;
				was_white = true;
			}
		}


		// skipping the last new line character (if exists)

		if( *str == 10 && str < end-1 )
		{
			buffer[i] = *str;
			i += 1;
			i += PutTabsToBuffer(i, stack_len);
		}
		else
		if( *str != 10 )
		{
			buffer[i] = *str;
			i += 1;
		}

		if( was_white && !is_white )
			word = str;

		if( !is_white )
			non_whites += 1;
		else
			non_whites = 0;

		if( break_after!=0 && non_whites>=break_after )
		{
			// too long run of non-white characters, breaking it
			buffer[i] = ' ';
			i += 1;
			non_whites = 0;
		}

		was_white = is_white;
	}

	return i;
}


// Copies a part of the text [str, end) to the scratch buffer
// (the version used when white characters are trimmed).
//
// While copying:
//  - each run of white characters (line feeds included) is replaced with
//    one space, or with a non-breaking space if the word before it
//    is an orphan,
//  - a space is inserted when a run of non-white characters gets longer
//    than break_after (if break_after is not zero).
//
// str - in/out: moved behind the characters which have been consumed
// end - end of the text (not changed)
// Returns how many characters were put to the buffer. The function stops
// either at 'end' or when the buffer is nearly full (some room is left for
// the inserted characters) - the caller calls it again for the rest.
size_t HTMLFilter::PutNormalTextTrimFillBuffer(const char * & str, const char * & end)
{
	const char * word = str; // pointing at the beginning of a word
	size_t non_whites = 0;
	size_t i = 0;
	bool is_white;
	// some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
	size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;


	for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
	{
		is_white = (*str==10 || IsWhite(*str));

		if( is_white )
		{
			if( CheckOrphan(word, str) )
			{
				// the non-breaking space stands for the whole white string
				i += PutNonBreakSpaceToBuffer(i);
				is_white = false;
			}
			else
			{
				non_whites = 0;
			}

			// skipping the whole white string
			// (the range is checked first so *str is never read at 'end')
			for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str );

			word = str;
		}

		if( !is_white )
			non_whites += 1;
		else
			non_whites = 0;

		if( (break_after!=0 && non_whites>break_after) || is_white )
		{
			buffer[i] = ' ';
			i += 1;
			non_whites = 1;
		}

		// str can be equal to 'end' here if the text ended with white characters
		if( str < end )
		{
			buffer[i] = *str;
			i += 1;
		}
	}

	return i;
}


// Prints the text [str, end) without trimming white characters.
// The text goes through the scratch buffer piece by piece.
void HTMLFilter::PutNormalText(const char * str, const char * end)
{
	for( ; str < end ; )
	{
		const size_t filled = PutNormalTextFillBuffer(str, end);
		Put(buffer, buffer + filled);
	}
}


// Prints the text [str, end) with white characters trimmed.
// The text goes through the scratch buffer piece by piece.
void HTMLFilter::PutNormalTextTrim(const char * str, const char * end)
{
	for( ; str < end ; )
	{
		const size_t filled = PutNormalTextTrimFillBuffer(str, end);
		Put(buffer, buffer + filled);
	}
}


// Prints the mark which begins a tag.
void HTMLFilter::PutOpeningTagMark()
{
	out_string->push_back('<');
}


// Prints the mark which ends a tag.
void HTMLFilter::PutClosingTagMark()
{
	out_string->push_back('>');
}


// Prints a (zero terminated) tag name.
void HTMLFilter::PutTagName(const char * name)
{
	out_string->append(name);
}


// Returns false if the tag must not be printed.
// Outside the safe mode every tag is allowed; in the safe mode
// the tags from the table below are rejected (case-insensitive).
bool HTMLFilter::IsTagSafe(const char * tag)
{
	if( safe_mode )
	{
		static const char * unsafe_tags[] = {
			"script", "iframe", "frame", "frameset",
			"applet", "head", "meta", "html", "link", "body"
		};

		const size_t count = sizeof(unsafe_tags) / sizeof(unsafe_tags[0]);

		for(size_t t=0 ; t != count ; ++t)
			if( IsNameEqual(tag, unsafe_tags[t]) )
				return false;
	}

	return true;
}


// start, end - arguments
void HTMLFilter::PutOpeningTag(const char * start, const char * end)
{
	if( !IsTagSafe(LastItem().name) )
		return;

	PutOpeningTagMark();
	PutTagName(LastItem().name);

	if( start != end )
	{
		(*out_string) += ' ';
		Put(start, end);
	}

	PutClosingTagMark();
}


// Prints the closing tag for the given name (nothing if the tag is not safe).
void HTMLFilter::PutClosingTag(const char * tag)
{
	if( IsTagSafe(tag) )
	{
		PutOpeningTagMark();
		out_string->push_back('/');
		PutTagName(tag);
		PutClosingTagMark();
	}
}


// Puts indentation for the level 'len' to the buffer, starting at 'index'.
// One level is tab_size spaces, at most 20 levels are used.
// Returns how many spaces were put: all of them,
// or zero if they would not fit in the buffer.
size_t HTMLFilter::PutTabsToBuffer(size_t index, size_t len)
{
	if( len == 0 )
		return 0;

	const size_t levels = (len > 20) ? 20 : len;

	// how many spaces do you want
	const size_t spaces = levels * tab_size;

	if( index+spaces >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
		return 0; // no room for them

	for(size_t k=0 ; k<spaces ; ++k)
		buffer[index+k] = ' ';

	return spaces;
}


// Puts a non-breaking space to the buffer, starting at 'index':
// the "&nbsp;" entity in the orphan_nbsp mode, the single character 160
// in any other mode.
// Returns how many characters were put (zero if there was no room for them).
size_t HTMLFilter::PutNonBreakSpaceToBuffer(size_t index)
{
	if( orphan_mode != orphan_nbsp )
	{
		if( index+1 >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
			return 0;

		buffer[index] = (char)160;

		return 1;
	}

	static const char nb[] = "&nbsp;";
	const size_t len = sizeof(nb) / sizeof(char) - 1; // without the terminating zero

	if( index+len >= WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
		return 0;

	for(size_t k=0 ; k<len ; ++k)
		buffer[index+k] = nb[k];

	return len;
}


// Prints indentation for the level 'len'
// (prepared at the beginning of the scratch buffer).
void HTMLFilter::PutTabs(size_t len)
{
	const size_t spaces = PutTabsToBuffer(0, len);

	Put(buffer, buffer + spaces);
}


// Prints one line feed (10), the scratch buffer is used for it.
void HTMLFilter::PutNewLine()
{
	char * const line_feed = buffer;

	*line_feed = 10;
	Put(line_feed, line_feed + 1);
}


// Returns true if a tag begins at the read position.
// We assume the size of the opening mark to be one.
bool HTMLFilter::IsOpeningTagMark()
{
	const char current = *pchar;

	return current == '<';
}


// Returns true if a tag ends at the read position.
// We assume the size of the closing mark to be one.
bool HTMLFilter::IsClosingTagMark()
{
	const char current = *pchar;

	return current == '>';
}


// Returns true if the read position is at the slash which ends
// a simple xml tag: <img src=".." /> (the '>' character is not checked).
// We assume the size of the mark to be one.
bool HTMLFilter::IsClosingXmlSimpleTagMark()
{
	const char current = *pchar;

	return current == '/';
}


// Returns true if a commentary ("<!--") begins at the read position.
bool HTMLFilter::IsOpeningCommentaryTagMark()
{
	static const char comm_open[] = "<!--";

	return IsNameEqual(pchar, comm_open, sizeof(comm_open) - 1);
}


// Returns the length of the mark which begins a commentary.
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
	return sizeof("<!--") - 1; // four characters, without the terminating zero
}


// Skips a commentary if one begins at the read position.
// The read position is moved behind "-->" (or to the end of the input
// if the commentary is not closed) and last_new_line is refreshed.
// Returns false (and changes nothing) when there is no commentary here.
bool HTMLFilter::SkipCommentaryTagIfExists()
{
	if( !IsOpeningCommentaryTagMark() )
		return false;

	static const char comm_close[] = "-->";
	const size_t comm_close_len = sizeof(comm_close) - 1;

	pchar += OpeningCommentaryTagMarkSize();

	// looking for "-->"
	for( ; *pchar != 0 ; ++pchar )
	{
		if( IsNameEqual(pchar, comm_close, comm_close_len) )
		{
			pchar += comm_close_len;
			break;
		}
	}

	CheckNewLine();

	return true;
}


// Helper for ReadNormalText(), called when the previous element
// was followed by a new line.
//
// Skips the leading white characters of the text and, unless a normal tag
// (not a commentary) follows directly, prints a new line with indentation
// for the current stack depth.
//
// start          - out: set to the read position after the skipping
//                  (the text to print begins there)
// last_non_white - in/out: updated as described in the branches below
void HTMLFilter::ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white)
{
	if( trim_white )
	{
		// skipping all white chars (with new lines)
		// but with remembering the last non white character
		// (only line feeds are remembered here; IsWhite() does not treat
		// the line feed as a white character)
		for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
			if( *pchar == 10 )
				last_non_white = pchar;
	}
	else
	{
		// skipping first white chars with only one line between them
		SkipWhite();
		// pointing at the first character after the white ones
		// (this is the line feed if there is one)
		last_non_white = pchar;

		if( *pchar == 10 )
		{
			++pchar;
			SkipWhite();
		}
	}

	start = pchar;

	// exception for the commentary tag:
	// a new line and indentation are printed before a text or a commentary
	// but not before a normal tag
	if( IsOpeningCommentaryTagMark() || !IsOpeningTagMark() )
	{
		PutNewLine();
		PutTabs(stack_len);
	}
}


// reading text between html tags
//
// The text from the read position up to the next opening tag mark
// (or the end of the input) is printed; commentaries are skipped over
// as a whole and belong to the printed text.
// On return the read position is at the opening tag mark (or at the end)
// and last_new_line tells whether the last non white character
// of the text was a line feed.
void HTMLFilter::ReadNormalText()
{
const char * start = pchar;
const char * last_non_white = pchar;

	// the previous element was followed by a new line:
	// leading white characters are handled separately
	if( last_new_line )
		ReadNormalTextSkipWhite(start, last_non_white);


	while( *pchar != 0 )
	{
		if( SkipCommentaryTagIfExists() )
		{
			last_non_white = pchar - 1; // pointing at the last '>' from a commentary
		}
		else
		{
			if( IsOpeningTagMark() )
				break;

			// a line feed is not white for IsWhite() so it is remembered as well
			if( !IsWhite(*pchar) )
				last_non_white = pchar;

			pchar += 1;
		}
	}


	last_new_line = (*last_non_white == 10);

	if( trim_white )
		PutNormalTextTrim(start, pchar);
	else
		PutNormalText(start, pchar);
}


// start, end - parameters to a tag
void HTMLFilter::PrintItem(const char * start, const char * end)
{
	if( last_new_line )
	{
		PutNewLine();

		if( stack_len > 1 )
			PutTabs(stack_len-1);
	}

	PutOpeningTag(start, end);
}


// Reads one tag from the read position (which should be at '<')
// and puts it on the stack: its name, type and the new_line flag.
// Tags other than closing ones are printed immediately.
//
// Returns false when the input has ended or when there is no room
// on the stack (in both cases nothing is read).
bool HTMLFilter::ReadItem()
{
	if( *pchar == 0 || !PushStack() )
		return false;

	// we have '<'
	++pchar;
	SkipWhite();

	const bool is_closing = (*pchar == '/');

	if( is_closing )
	{
		++pchar;
		SkipWhite();
		LastItem().type = Item::closing;
	}

	ReadItemName();
	SkipWhite();

	const char * const args_begin = pchar; // arguments start here

	if( !is_closing )
	{
		if( LastItem().name[0] == '!' )
			LastItem().type = Item::special;
		else
			LastItem().type = Item::opening;
	}

	const char * const args_end = SkipItemCheckXmlSimple();

	// the type has to be read again: SkipItemCheckXmlSimple() can change it
	if( LastItem().type != Item::closing )
		PrintItem(args_begin, args_end);

	CheckNewLine();
	LastItem().new_line = last_new_line;

	return true;
}


// Changes an ascii capital letter to the small one,
// other values are returned unchanged.
int HTMLFilter::ToLower(int c)
{
	const bool is_capital = (c>='A' && c<='Z');

	return is_capital ? (c - 'A' + 'a') : c;
}


// Returns true if both zero terminated names are equal
// (the case of ascii letters is ignored).
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2)
{
	while( *name1!=0 && *name2!=0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
	}

	// both names have to end at the same place
	return *name1==0 && *name2==0;
}


// Returns true if the first len characters of both strings are equal
// (the case of ascii letters is ignored).
// If one of the strings ends before len characters the result is false.
bool HTMLFilter::IsNameEqual(const char * name1, const char * name2, size_t len)
{
	while( *name1!=0 && *name2!=0 && len>0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
		--len;
	}

	return len == 0;
}


// Returns true if the item from the top of the stack has the given name
// (the case of ascii letters is ignored).
bool HTMLFilter::IsLastTag(const char * name)
{
	return IsNameEqual(name, LastItem().name);
}


// Checking exceptions for opening tags:
//  - tags which never have a closing tag are treated as simple ones
//    and are removed from the stack at once,
//  - the content of script, pre and textarea is printed as it is,
//    up to their closing tag.
//
// NOTE(review): the raw printing for pre/textarea is done in the safe mode
// as well, so tags nested in them do not go through IsTagSafe() -
// confirm that this is acceptable.
void HTMLFilter::CheckExceptions()
{
	static const char * const without_closing_tag[] = {
		"meta", "input", "br", "hr", "img", "link"
	};

	const size_t count = sizeof(without_closing_tag) / sizeof(without_closing_tag[0]);

	for(size_t t=0 ; t<count ; ++t)
	{
		if( IsLastTag(without_closing_tag[t]) )
		{
			LastItem().type = Item::simple;
			PopStack();
			return;
		}
	}

	// in safe_mode the script tag is ignored
	if( !safe_mode && IsLastTag("script") )
		PutLastTagWithClosingTag();

	if( IsLastTag("pre") || IsLastTag("textarea") )
		PutLastTagWithClosingTag();
}


// Called when the closing tag from the top of the stack does not match
// the tag just below it.
//
// If a matching opening tag exists deeper on the stack, closing tags are
// printed for all tags between them (they were forgotten in the input)
// and for the matching tag itself, and all of them are removed from the
// stack. If there is no matching opening tag, the closing tag is simply
// dropped.
void HTMLFilter::AddForgottenTags()
{
int i;

	if( stack_len < 3 )
	{
		// there is nothing below the item which was already compared,
		// so the closing tag has no matching opening tag - we drop it
		// (leaving it on the stack would make it look like an open tag
		// and a closing tag would be printed for it later)
		PopStack();
		return;
	}

	// we have forgotten to close some tags

	// looking whether there is a matching opening tag
	for(i=int(stack_len)-3 ; i>=0 ; --i)
		if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
			break;

	if( i < 0 )
	{
		// oops, there is no such a tag
		// we don't print the closing and the missing opening tag
		PopStack();
		return;
	}

	// closing everything from the tag below the top down to the matching one
	for(int z=(int)stack_len-2 ; z>=i ; --z)
	{
		if( pstack[z].new_line )
		{
			PutNewLine();
			PutTabs(z);
		}

		PutClosingTag(pstack[z].name);
		pstack[z].Clear();
	}

	last_new_line = pstack[stack_len-1].new_line;

	// invalidate tags
	stack_len = i;
}


// Prints closing tags for all items which are still on the stack
// (from the top to the bottom) and leaves the stack empty.
void HTMLFilter::CheckStackPrintRest()
{
	// the counter is decremented inside the loop so it ends as zero
	// (a decrement in the condition would wrap it around after the last item)
	while( stack_len > 0 )
	{
		--stack_len;

		if( stack_len==0 || pstack[stack_len-1].new_line )
			PutNewLine();

		PutTabs(stack_len);
		PutClosingTag(pstack[stack_len].name);
	}
}


void HTMLFilter::CheckClosingTags()
{
	if( stack_len == 0 )
		return;

	// on the stack we have only opening tags
	// but only the last tag is a closing tag

	if( stack_len == 1 )
	{
		// there is only last closing tag
		// we dont print it
		PopStack();
		return;
	}

	// there are more than one tag
	if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
	{
		// last closing tag is from the previous one
		if( pstack[stack_len-2].new_line )
		{
			PutNewLine();
			PutTabs(stack_len-2);
		}

		PutClosingTag(pstack[stack_len-1].name);
		last_new_line = pstack[stack_len-1].new_line;
		PopStack();
		PopStack();
		return;
	}

	AddForgottenTags();
}


// Prints everything from the read position to the end of the input
// without any filtering.
// Returns true if there was something to print.
bool HTMLFilter::PrintRest()
{
	const char * const begin = pchar;

	for( ; *pchar != 0 ; ++pchar )
	{
	}

	if( pchar == begin )
		return false;

	Put(begin, pchar);

	return true;
}


void HTMLFilter::Read()
{
	if( trim_white )
		SkipWhiteLines();

	// it can be some text or white lines before the first html tag (we print it)
	ReadNormalText();

	while( ReadItem() )
	{
		if( LastItem().type == Item::opening )
		{
			CheckExceptions();
		}
		else
		if( LastItem().type == Item::special || LastItem().type == Item::simple )
		{
			if( stack_len > 1 )
			{
				pstack[stack_len-2].new_line = LastItem().new_line;
			}
			else
			if( trim_white )
			{
				// one new line after a simple or special tag
				// (if the tag has level 0 in the tree - it not means that this is a first tag)
				// for example can be DOCTYPE
				PutNewLine();
			}

			PopStack();
		}
		else
		if( LastItem().type == Item::closing )
		{
			CheckClosingTags();
		}

		ReadNormalText();
	}

	// sometimes ReadItem() can return a false (when there is no space on the stack)
	// we print the rest html without filtering
	if( !PrintRest() )
		CheckStackPrintRest();
}