winix/core/htmlfilter.cpp

/*
 * This file is a part of Winix
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2008-2014, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "htmlfilter.h"


namespace Winix
{


// resets the item: empty name, type 'none', no orphans table attached
// and both flags (new_line, has_body_tag) cleared
void HTMLFilter::Item::Clear()
{
	type         = none;
	porphans     = 0;
	has_body_tag = false;
	new_line     = false;
	name.clear();
}


// a newly created item starts in the cleared state
HTMLFilter::Item::Item()
{
	this->Clear();
}


// filters the null terminated html source 'in' and stores the result in 'out'
// (the previous content of 'out' is discarded)
// a null 'in' pointer is treated as an empty input
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
{
	// the parser dereferences pchar immediately, so it must never be null
	pchar         = (in != 0) ? in : L"";
	stack_len     = 0;
	out_string    = &out;
	last_new_line = false;
	line_len      = 0;
	out_string->clear();

	Init();
	Read();
	Uninit();
}


// called by Filter() just before the source is read
// nothing to do here
// NOTE(review): presumably a customization hook - confirm in the header
void HTMLFilter::Init()
{
}


// called by Filter() right after the source has been read
// nothing to do here
// NOTE(review): presumably a customization hook - confirm in the header
void HTMLFilter::Uninit()
{
}


// filters the html source 'in' and stores the result in 'out'
// 'in' and 'out' may be the same object
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
	if( &in == &out )
	{
		// the output string is cleared before the parsing starts,
		// so when both arguments are the same object we have to read from a copy
		const std::wstring in_copy(in);
		Filter(in_copy, out);
		return;
	}

	// reserving the memory up front to limit reallocations while the output grows
	size_t out_projected_len = in.size() * 2 + 1;

	if( out.capacity() < out_projected_len )
		out.reserve(out_projected_len);

	Filter(in.c_str(), out);
}


// creates a filter with the default configuration
// and allocates the working memory (the tag stack and the scratch buffer)
HTMLFilter::HTMLFilter()
{
	// working memory, released in the destructor
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	// default configuration
	safe_mode   = false;
	trim_white  = false;
	orphan_mode = orphan_nbsp;
	tab_size    = 2;
	wrap_line   = 0;
	break_after = 0;
}


// copy constructor
// the configuration of 'f' is copied, the working memory is not:
// the stack and the buffer are (re)initialized by Filter() on each run
// so every object simply gets its own fresh ones
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
	// don't need to copy the stack
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	// without copying these the new object would work with indeterminate settings
	tab_size      = f.tab_size;
	trim_white    = f.trim_white;
	break_after   = f.break_after;
	wrap_line     = f.wrap_line;
	orphan_mode   = f.orphan_mode;
	safe_mode     = f.safe_mode;
	no_filter_tag = f.no_filter_tag;
	orphans_tab   = f.orphans_tab;
}


// assignment operator
// only the configuration of 'f' is copied
// the stack and the buffer have a fixed size and are (re)initialized by Filter(),
// so the ones already owned by this object are kept
// (allocating new ones here would leak the old memory)
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
	if( this != &f )
	{
		tab_size      = f.tab_size;
		trim_white    = f.trim_white;
		break_after   = f.break_after;
		wrap_line     = f.wrap_line;
		orphan_mode   = f.orphan_mode;
		safe_mode     = f.safe_mode;
		no_filter_tag = f.no_filter_tag;
		orphans_tab   = f.orphans_tab;
	}

	return *this;
}


// releases the working memory allocated in the constructors
HTMLFilter::~HTMLFilter()
{
	delete [] buffer;
	delete [] pstack;
}


// sets after how many non white characters a long word is broken with a space
// (zero turns breaking off, see PutNormalNonWhite())
// values greater than 10000 are limited to 10000
void HTMLFilter::BreakWord(size_t break_after_)
{
	const size_t limit = 10000;

	break_after = (break_after_ > limit) ? limit : break_after_;
}


// sets the line length after which a new line is inserted
// (zero turns wrapping off, see CheckLineWrap())
// values greater than 10000 are limited to 10000
void HTMLFilter::WrapLine(size_t wrap_line_)
{
	const size_t limit = 10000;

	wrap_line = (wrap_line_ > limit) ? limit : wrap_line_;
}


void HTMLFilter::TrimWhite(bool trim)
{
	trim_white = trim;
}


// sets how many spaces are used for one level of indentation (see PutTabs())
// values greater than 1000 are limited to 1000
void HTMLFilter::InsertTabs(size_t tabsize)
{
	const size_t limit = 1000;

	tab_size = (tabsize > limit) ? limit : tabsize;
}


// stores in orphans.max_len the length of the longest word from orphans.tab
// (zero if the table is empty)
void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
{
	std::vector<std::wstring>::const_iterator word = orphans.tab.begin();

	orphans.max_len = 0;

	while( word != orphans.tab.end() )
	{
		if( word->size() > orphans.max_len )
			orphans.max_len = word->size();

		++word;
	}
}


// registers a table of orphans (words which should not be left at the end of a line)
// for the given language code
// the language code is stored in lower case
// an existing table for the same language is replaced
void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
{
	lang_code_lower = lang_code;
	ToLower(lang_code_lower);

	orphans_temp.tab = otab;

	// the binary search in CheckOrphan() compares words case-insensitively,
	// so the table has to be ordered by the lower case form of its entries,
	// otherwise entries with capital letters would break the search
	for(size_t i=0 ; i<orphans_temp.tab.size() ; ++i)
		ToLower(orphans_temp.tab[i]);

	std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
	CalcOrphansMaxLen(orphans_temp);

	orphans_tab[lang_code_lower] = orphans_temp;
}


// the same as AssignOrphans(const wchar_t *, ...) but the language code is given as std::wstring
void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
	const wchar_t * code = lang_code.c_str();

	AssignOrphans(code, otab);
}


void HTMLFilter::ClearOrphans()
{
	orphans_tab.clear();
}


// selects what is put after an orphan:
// with orphan_nbsp it is the "&nbsp;" entity, otherwise the character 160
// (see PutNonBreakingSpace())
void HTMLFilter::OrphansMode(HTMLFilter::OrphanMode mode)
{
	this->orphan_mode = mode;
}


void HTMLFilter::SafeMode(bool safe_mode_)
{
	safe_mode = safe_mode_;
}


void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{
	no_filter_tag = tag_name;
}


// returns the item at the position 'i' on the stack
// if there is no such item then a cleared 'empty' item is returned
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
	if( i < stack_len )
		return pstack[i];

	empty.Clear();

	return empty;
}


// returns the item from the top of the stack
// if the stack is empty then a cleared 'empty' item is returned
HTMLFilter::Item & HTMLFilter::LastItem()
{
	if( stack_len > 0 )
		return pstack[stack_len-1];

	empty.Clear();

	return empty;
}


bool HTMLFilter::PushStack()
{
	if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
		// oops, too many items
		return false;

	pstack[stack_len].Clear();

	if( stack_len > 0 )
	{
		// 'porphans' and 'has_body_tag' attributes are propagated
		pstack[stack_len].porphans     = pstack[stack_len-1].porphans;
		pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
	}

	stack_len += 1;

return true;
}

// removes the item from the top of the stack (the removed item is cleared)
// does nothing if the stack is empty
void HTMLFilter::PopStack()
{
	if( stack_len > 0 )
	{
		--stack_len;
		pstack[stack_len].Clear();
	}
}


// returns true for a space, a tab, the character 13 and the character 160
// the new line character (10) is deliberately not treated as white here
bool HTMLFilter::IsWhite(int c)
{
	// dont use c==10 here

	switch( c )
	{
	case ' ':
	case '\t':
	case 13:
	case 160:
		return true;

	default:
		return false;
	}
}


// moves pchar to the first character which is not white (a new line stops the skipping)
void HTMLFilter::SkipWhite()
{
	for( ; IsWhite(*pchar) ; ++pchar )
	{
	}
}


// moves pchar to the first character which is neither white nor a new line
void HTMLFilter::SkipWhiteLines()
{
	for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar )
	{
	}
}


// skips white characters, at most one new line character
// and the white characters directly after that new line
void HTMLFilter::SkipWhiteWithFirstNewLine()
{
	SkipWhite();

	if( *pchar != 10 )
		return;

	++pchar;
	SkipWhite();
}


// moves 'str' to the first character which is neither white nor a new line
// but not further than 'end'
void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
{
	for( ; str < end ; ++str )
	{
		if( *str != 10 && !IsWhite(*str) )
			break;
	}
}


// sets last_new_line to true if there are only white characters
// between the current position and the nearest new line character
// the current position (pchar) is not changed
void HTMLFilter::CheckNewLine()
{
	const wchar_t * saved_position = pchar;

	SkipWhite();
	last_new_line = (*pchar == 10);

	// looking ahead only, restoring the position
	pchar = saved_position;
}


// pchar should point at the opening tag mark
// returns true if there is a closing tag for the last item from the stack
// and then pchar is set just after the closing tag mark
// if false is returned pchar stays where the matching stopped
// (at least one character further than at the beginning)
bool HTMLFilter::IsClosingTagForLastItem()
{
	++pchar; // skipping the opening tag mark
	SkipWhite();

	if( *pchar != '/' )
		return false;

	++pchar;
	SkipWhite();

	const std::wstring & tag_name = LastItem().name;

	if( !IsNameEqual(pchar, tag_name, tag_name.size()) )
		return false;

	pchar += tag_name.size();
	SkipWhite();

	if( !IsClosingTagMark() )
		return false;

	++pchar;

	return true;
}


// used for such tags as: script, pre, textarea
// prints the source without any filtering up to (and including) the closing tag
// of the last item, or up to the end of the source if there is no such tag
// if the closing tag was found the item is removed from the stack
void HTMLFilter::PutLastTagWithClosingTag()
{
	const wchar_t * raw_begin = pchar;
	bool closed = false;

	while( !closed && *pchar != 0 )
	{
		if( !IsOpeningTagMark() )
		{
			++pchar;
			continue;
		}

		// IsClosingTagForLastItem() always moves pchar forward
		if( IsClosingTagForLastItem() )
		{
			PopStack();
			CheckNewLine();
			closed = true;
		}
	}

	Put(raw_begin, pchar);
}


// used with <nofilter> </nofilter> tags
// prints the source without any filtering up to the closing tag of the last item
// (or up to the end of the source if there is no such tag)
// the closing tag itself is not printed
// if the closing tag was found the item is removed from the stack
void HTMLFilter::PutTextBetweenLastTagWithClosingTag()
{
	const wchar_t * start = pchar;
	const wchar_t * end   = pchar;

	while( *pchar != 0 )
	{
		if( IsOpeningTagMark() )
		{
			// if this is the closing tag then the text ends just before it
			end = pchar;

			if( IsClosingTagForLastItem() )
			{
				PopStack();
				CheckNewLine();
				break;
			}
		}
		else
		{
			pchar += 1;
		}

		// it was not the closing tag: everything consumed so far belongs to the text
		// (also the characters skipped by an unsuccessful matching of the closing tag)
		end = pchar;
	}

	Put(start, end);
}


// skips everything up to (and including) the closing tag mark
// if a closing xml mark (default '/') is met on the way
// then the type of the last item is changed from 'opening' to 'simple'
void HTMLFilter::SkipAndCheckClosingTag()
{
	while( *pchar )
	{
		if( LastItem().type == Item::opening && IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
			LastItem().type = Item::simple;

		if( IsClosingTagMark() )
		{
			++pchar;
			break;
		}

		++pchar;
	}
}


// returns true if 'c' can be a part of a tag name:
// an ascii letter, a digit, '-' or '!'
bool HTMLFilter::IsValidCharForName(int c)
{
	const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
	const bool is_digit  = (c>='0' && c<='9');

	return is_letter || is_digit || c=='-' || c=='!';
}


// returns true if 'c' can be a part of an attribute name:
// an ascii letter, a digit, '-' or ':'
bool HTMLFilter::IsValidCharForAttrName(int c)
{
	const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
	const bool is_digit  = (c>='0' && c<='9');

	return is_letter || is_digit || c=='-' || c==':';
}


// reads a tag name and appends it to the name of the last item
// the whole name is skipped in the source but only the first
// WINIX_HTMLFILTER_ITEM_NAME_MAXLEN characters are stored
void HTMLFilter::ReadItemName()
{
	size_t read_chars = 0;

	while( IsValidCharForName(*pchar) )
	{
		if( read_chars < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
			LastItem().name += *pchar;

		++pchar;
		++read_chars;
	}
}


// reads an attribute name into attr_name
// the whole name is skipped in the source but only the first
// WINIX_HTMLFILTER_ATTR_NAME_MAXLEN characters are stored
void HTMLFilter::ReadItemAttrName()
{
	size_t read_chars = 0;

	attr_name.clear();

	while( *pchar && IsValidCharForAttrName(*pchar) )
	{
		if( read_chars < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
			attr_name += *pchar;

		++pchar;
		++read_chars;
	}
}


// reads an attribute value into the attr_value table
// the value is split on white characters (and new lines), each part is one entry
// reading stops at a quotation mark, at the closing tag mark, at the end of the source
// and - if the value was not quoted (has_quote is false) - at the first white character
// characters after the first WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ones are skipped but not stored
void HTMLFilter::ReadItemAttrValue(bool has_quote)
{
	size_t read_chars = 0;

	// check this everywhere because now it is a table
	attr_value.clear();
	attr_value_temp.clear();

	// !! add support for a single quotation mark

	while( *pchar && *pchar != '\"' && !IsClosingTagMark() && (has_quote || (*pchar!=10 && !IsWhite(*pchar)) ) )
	{
		const bool is_separator = (*pchar==10 || IsWhite(*pchar));

		if( is_separator )
		{
			// the end of one part of the value
			if( !attr_value_temp.empty() )
			{
				attr_value.push_back(attr_value_temp);
				attr_value_temp.clear();
			}
		}
		else
		if( read_chars < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
		{
			attr_value_temp += *pchar;
		}

		++pchar;
		++read_chars;
	}

	// the last part
	if( !attr_value_temp.empty() )
	{
		attr_value.push_back(attr_value_temp);
		attr_value_temp.clear();
	}
}


// updates the length of the current output line:
// a new line character resets it, any other character increments it
void HTMLFilter::CheckChar(wchar_t c)
{
	line_len = (c == 10) ? 0 : line_len + 1;
}


// appends one character to the output (and updates the line length)
void HTMLFilter::Put(wchar_t c)
{
	out_string->push_back(c);
	CheckChar(c);
}


// appends a null terminated string to the output (and updates the line length)
void HTMLFilter::Put(const wchar_t * str)
{
	out_string->append(str);

	while( *str != 0 )
	{
		CheckChar(*str);
		++str;
	}
}


// appends characters from the range [str, end) to the output (and updates the line length)
// does nothing if the range is empty
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
{
	if( str < end )
	{
		out_string->append(str, size_t(end - str));

		while( str != end )
		{
			CheckChar(*str);
			++str;
		}
	}
}


// appends a string to the output (and updates the line length)
void HTMLFilter::Put(const std::wstring & str)
{
	out_string->append(str);

	for(std::wstring::const_iterator c = str.begin() ; c != str.end() ; ++c)
		CheckChar(*c);
}


// compares the word from the range [str, end) with 'orphan_str'
// the comparison is case-insensitive (ToLower() is applied to the characters of both)
// returns:
//   zero             - the word is equal to orphan_str
//   a negative value - the word is lexicographically before orphan_str
//   a positive value - the word is lexicographically after orphan_str
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
{
	const wchar_t * orphan = orphan_str.c_str();

	for( ; str<end && *orphan!=0  ; ++str, ++orphan )
	{
		// the difference can be negative so it is kept in a signed variable
		int res = int(ToLower(*str)) - int(ToLower(*orphan));

		if( res != 0 )
			return res;
	}

	// orphan_str is a prefix of the word: the word is greater
	if( str < end )
		return ToLower(*str);

	// the word ended: it is either equal to orphan_str (zero is returned) or its prefix
	return -int(ToLower(*orphan));
}


// binary search in table (table should be sorted)
// returns true if the word from the range [str, end) is in the table
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
{
	if( table.empty() )
		return false;

	size_t lower = 0;
	size_t upper = table.size() - 1;

	// checking the first entry: the word cannot be before it
	int cmp = CheckOrphan(str, end, table[lower]);

	if( cmp == 0 )
		return true;

	if( cmp < 0 )
		return false;

	// checking the last entry: the word cannot be after it
	cmp = CheckOrphan(str, end, table[upper]);

	if( cmp == 0 )
		return true;

	if( cmp > 0 )
		return false;

	// here the word is strictly between table[lower] and table[upper]
	while( lower + 1 < upper )
	{
		const size_t middle = (lower + upper) / 2;
		cmp = CheckOrphan(str, end, table[middle]);

		if( cmp == 0 )
			return true;

		if( cmp > 0 )
			lower = middle;
		else
			upper = middle;
	}

	return false;
}


bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
{
	if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
		return false;

	size_t len = end - str;

	if( len > LastItem().porphans->max_len )
		return false;

return CheckOrphan(str, end, LastItem().porphans->tab);
}


// if there is a semicolon nearby then we break the line after it
// (useful in html entities)
// returns true if there is a semicolon among the first 8 characters of the range [str, end)
// !! add a check whether the length of the string (end-str) is small as well
// and in such a case do not add any character either
bool HTMLFilter::HasSemiloconAround(const wchar_t * str, const wchar_t * end)
{
	const size_t epsilon = 8;
	size_t checked = 0;

	while( str < end && checked < epsilon )
	{
		if( *str == ';' )
			return true;

		++str;
		++checked;
	}

	return false;
}


// inserts a new line (with an indentation) if wrapping is turned on,
// we are inside a body tag and the current line is longer than wrap_line
void HTMLFilter::CheckLineWrap()
{
	if( wrap_line == 0 || !LastItem().has_body_tag )
		return;

	if( line_len > wrap_line )
	{
		Put(10);
		PutTabs(stack_len);
	}
}


// prints one word: characters from 'str' up to the first white character,
// a new line or 'end' ('str' is moved accordingly)
// if break_after is not zero then a space is inserted after each break_after characters
// of the word, but not when there is a semicolon a few characters ahead
// (unless the previous character was a semicolon)
void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
{
	const wchar_t * chunk = str; // the part of the word which has not been printed yet
	size_t chunk_len = 0;
	bool prev_semicolon = false;

	while( str < end && *str != 10 && !IsWhite(*str) )
	{
		const bool too_long = (break_after != 0 && chunk_len >= break_after);

		if( too_long && (prev_semicolon || !HasSemiloconAround(str, end)) )
		{
			Put(chunk, str);
			Put(' ');
			CheckLineWrap();

			chunk     = str;
			chunk_len = 0;
		}

		prev_semicolon = (*str == ';');

		++str;
		++chunk_len;
	}

	Put(chunk, str);
}


// prints white characters (and new lines) from the beginning of the range [str, end)
// ('str' is moved after them)
// with trim_white one space is printed instead of them
// without trim_white they are printed as they are and an indentation is added after each new line
void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
{
	if( str >= end )
		return;

	if( trim_white )
	{
		Put(' ');
		SkipWhiteLines(str, end);
		return;
	}

	for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str )
	{
		Put(*str);

		if( *str == 10 )
			PutTabs(stack_len);
	}
}


// prints a text (the range [str, end)) word by word
// white characters after an orphan are replaced with a non-breaking space
// lines are wrapped if needed (see CheckLineWrap())
void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
{
	if( str < end )
		CheckLineWrap();

	while( str < end )
	{
		const wchar_t * word_begin = str;
		PutNormalNonWhite(str, end);

		if( !CheckOrphan(word_begin, str) )
		{
			PutNormalWhite(str, end);

			if( str < end ) // !! or maybe do it in a similar way as at the beginning of this method?
				CheckLineWrap();
		}
		else
		{
			const wchar_t * white_begin = str;
			SkipWhiteLines(str, end);

			if( white_begin < str )
				PutNonBreakingSpace();
		}

		// for safety (if str was not incremented then there is an infinite loop)
		if( word_begin == str )
			break;
	}
}


// prints the opening tag mark: '<'
void HTMLFilter::PutOpeningTagMark()
{
	const wchar_t opening_mark = '<';

	Put(opening_mark);
}


// prints the closing tag mark: '>'
void HTMLFilter::PutClosingTagMark()
{
	const wchar_t closing_mark = '>';

	Put(closing_mark);
}


// returns true if the tag can be printed
// without the safe mode every tag can be printed
// in the safe mode the nofilter tag and the tags from the list below are rejected
// (the names are compared case-insensitively)
// !! change the name to a better one
// because this does not return true if the tag is safe
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
{
	if( !safe_mode )
		return true;

	if( IsNameEqual(tag, no_filter_tag.c_str()) )
		return false;

	// each entry must be followed by a comma:
	// adjacent string literals are silently joined into one string
	static const wchar_t * unsafe_tags[] = {
		L"applet", 	L"base",	L"body",
		L"embed",	L"head",	L"html",
		L"frame",	L"frameset",L"iframe",
		L"link",	L"meta",	L"param",
		L"object",	L"script"
	};

	size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
	size_t i;

	for(i=0 ; i<len ; ++i)
	{
		if( IsNameEqual(tag, unsafe_tags[i]) )
			return false;
	}

	return true;
}


// the same as IsTagSafe(const wchar_t *) but the tag name is given as std::wstring
bool HTMLFilter::IsTagSafe(const std::wstring & tag)
{
	const wchar_t * tag_name = tag.c_str();

	return IsTagSafe(tag_name);
}


// prints the opening tag mark and the name of the last item
// returns false (and prints nothing) if the tag is rejected by IsTagSafe()
bool HTMLFilter::PutOpeningTag()
{
	const std::wstring & tag_name = LastItem().name;

	if( !IsTagSafe(tag_name) )
	{
		// !! IMPROVE ME
		// !! add skipping of the whole tag here
		return false;
	}

	PutOpeningTagMark();
	Put(tag_name);

	return true;
}


// prints a closing tag: the opening tag mark, '/', the tag name and the closing tag mark
// nothing is printed if the tag is rejected by IsTagSafe()
void HTMLFilter::PutClosingTag(const wchar_t * tag)
{
	if( IsTagSafe(tag) )
	{
		PutOpeningTagMark();
		Put('/');
		Put(tag);
		PutClosingTagMark();
	}
}


// prints an indentation for the level 'len': len * tab_size spaces
// levels greater than 30 are treated as 30
void HTMLFilter::PutTabs(size_t len)
{
	const size_t max_level = 30;

	if( len > max_level )
		len = max_level;

	// we do not add them to 'line_len'
	out_string->append(len * tab_size, L' ');
}


// prints a non-breaking space:
// the "&nbsp;" entity in the orphan_nbsp mode, the character 160 otherwise
void HTMLFilter::PutNonBreakingSpace()
{
	if( orphan_mode != orphan_nbsp )
	{
		Put(160);
		return;
	}

	Put(L"&nbsp;");
}


// prints a new line character (10) and resets the length of the current line
void HTMLFilter::PutNewLine()
{
	wchar_t * new_line_char = buffer;

	*new_line_char = 10;
	Put(new_line_char, new_line_char + 1);
	line_len = 0;
}


// returns true if pchar points at the opening tag mark ('<')
// we assume the size of the opening mark to be one
bool HTMLFilter::IsOpeningTagMark()
{
	const wchar_t current = *pchar;

	return current == '<';
}


// returns true if pchar points at the closing tag mark ('>')
// we assume the size of the closing mark to be one
bool HTMLFilter::IsClosingTagMark()
{
	const wchar_t current = *pchar;

	return current == '>';
}


// returns true if pchar points at the slash
// which closes a simple tag: <img src=".." /> (without '>' character)
// we assume the size of the mark to be one
bool HTMLFilter::IsClosingXmlSimpleTagMark()
{
	const wchar_t current = *pchar;

	return current == '/';
}


// returns true if there is the beginning of a commentary ("<!--") at the current position
bool HTMLFilter::IsOpeningCommentaryTagMark()
{
	static wchar_t opening_mark[] = L"<!--";

	// the length without the terminating null character
	return IsNameEqual(pchar, opening_mark, sizeof(opening_mark) / sizeof(wchar_t) - 1);
}


// returns the number of characters of the mark which begins a commentary
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
	const size_t mark_size = 4; // size of "<!--"

	return mark_size;
}


// skipping the commentary tag if exists
// returns false (and does nothing) if there is no commentary at the current position
// otherwise pchar is moved after the "-->" mark
// (or to the end of the source if the commentary is not closed)
bool HTMLFilter::SkipCommentaryTagIfExists()
{
	static wchar_t closing_mark[] = L"-->";
	const size_t closing_mark_len = sizeof(closing_mark) / sizeof(wchar_t) - 1;

	if( !IsOpeningCommentaryTagMark() )
		return false;

	pchar += OpeningCommentaryTagMarkSize();

	// looking for "-->"
	for( ; *pchar != 0 ; ++pchar )
	{
		if( IsNameEqual(pchar, closing_mark, closing_mark_len) )
			break;
	}

	if( *pchar != 0 )
		pchar += closing_mark_len;

	CheckNewLine();

	return true;
}


// skips white characters at the beginning of a text which starts in a new line
// 'start' is set to the first character after the skipped ones
// 'last_non_white' is updated while skipping (see the comments below)
// a new line with an indentation is printed unless an ordinary tag
// (not a commentary) follows
void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
{
	if( !trim_white )
	{
		// skipping first white chars with only one line between them
		SkipWhite();
		last_non_white = pchar;

		if( *pchar == 10 )
		{
			++pchar;
			SkipWhite();
		}
	}
	else
	{
		// skipping all white chars (with new lines)
		// but with remembering the last new line character
		while( *pchar==10 || IsWhite(*pchar) )
		{
			if( *pchar == 10 )
				last_non_white = pchar;

			++pchar;
		}
	}

	start = pchar;

	// exception for the commentary tag
	if( IsOpeningCommentaryTagMark() || !IsOpeningTagMark() )
	{
		PutNewLine();
		PutTabs(stack_len);
	}
}


// reading text between html tags
// the text (together with commentaries) is read up to the nearest opening tag mark
// or up to the end of the source and then it is printed with PutNormalText()
// last_new_line is set to true if the last remembered character was a new line
void HTMLFilter::ReadNormalText()
{
	const wchar_t * text_begin = pchar;
	const wchar_t * last_seen  = pchar; // the last character which is not white (can be a new line)

	if( last_new_line )
		ReadNormalTextSkipWhite(text_begin, last_seen);

	while( *pchar != 0 )
	{
		if( SkipCommentaryTagIfExists() )
		{
			last_seen = pchar - 1; // pointing at the last '>' from a commentary
			continue;
		}

		if( IsOpeningTagMark() )
			break;

		if( !IsWhite(*pchar) )
			last_seen = pchar;

		++pchar;
	}

	last_new_line = (*last_seen == 10);
	PutNormalText(text_begin, pchar);
}


// prints the beginning of an opening tag (the mark and the name)
// preceded by a new line and an indentation if the previous output ended a line
// the nofilter tag is not printed at all (true is returned)
// returns false if the tag was rejected (see PutOpeningTag())
bool HTMLFilter::PrintOpeningItem()
{
	if( IsNameEqual(no_filter_tag, LastItem().name) )
		return true;

	if( !last_new_line )
		return PutOpeningTag();

	PutNewLine();

	if( stack_len > 1 )
		PutTabs(stack_len-1);

	return PutOpeningTag();
}


// reads one attribute: its name to attr_name and (if there is '=') its value to attr_value
// attr_has_value tells whether the attribute has a value
// returns false if there is no attribute name at the current position
bool HTMLFilter::ReadItemAttr()
{
	attr_has_value = false;
	attr_name.clear();
	attr_value.clear();

	SkipWhiteLines();
	ReadItemAttrName();

	if( attr_name.empty() )
		return false;

	SkipWhiteLines();

	if( *pchar != '=' )
	{
		// an attribute without a value
		return true;
	}

	attr_has_value = true;
	++pchar;				// skipping '='
	SkipWhiteLines();

	// !! add support for a single quotation mark
	const bool quoted = (*pchar == '\"');

	if( quoted )
		++pchar;			// skipping the first quote mark

	ReadItemAttrValue(quoted);

	if( *pchar == '\"' )
		++pchar;			// skipping the last quote mark

	return true;
}


bool HTMLFilter::CheckItemAttr()
{
	if( attr_has_value && IsNameEqual(L"lang", attr_name) )
	{
		LastItem().porphans = 0;

		if( !attr_value.empty() )
		{
			// we are taking the first value only
			attr_value_lower = attr_value[0];
			ToLower(attr_value_lower);

			OrphansTab::iterator i = orphans_tab.find(attr_value_lower);

			if( i != orphans_tab.end()  )
				LastItem().porphans = &i->second;
		}
	}

return true;
}


// prints the attribute which has just been read: a space, the name
// and (if the attribute has a value) the value in quotation marks
// (the parts of the value are separated with single spaces)
// attributes of the nofilter tag are not printed
void HTMLFilter::PrintItemAttr()
{
	if( IsNameEqual(no_filter_tag, LastItem().name) )
		return;

	Put(' ');
	Put(attr_name);

	if( !attr_has_value )
		return;

	Put(L"=\"");

	for(size_t part = 0 ; part < attr_value.size() ; ++part)
	{
		if( part > 0 )
			Put(' ');

		Put(attr_value[part]);
	}

	Put('\"');
}


void HTMLFilter::ReadItemClosing()
{
	pchar += 1; // skipping '/'
	SkipWhiteLines();
	ReadItemName();
	LastItem().type = Item::closing;
	SkipAndCheckClosingTag();

	// closing tags are printed later
}


// reads a special tag (a tag which begins with '!')
// the tag is printed as it is in the source
void HTMLFilter::ReadItemSpecial()
{
	LastItem().type = Item::special;
	PutOpeningTagMark();

	const wchar_t * content_begin = pchar;
	SkipAndCheckClosingTag();

	// closing tag mark is printed directly from the source
	// (Put() does nothing if the range is empty)
	Put(content_begin, pchar);
}


// reads an opening tag: the name, the attributes and the closing tag mark
// and prints the tag (the nofilter tag is read but not printed)
// a tag rejected by PrintOpeningItem() is read in the same way but nothing is printed
void HTMLFilter::ReadItemOpening()
{
	LastItem().type = Item::opening;
	ReadItemName();

	if( PrintOpeningItem() )
	{
		while( ReadItemAttr() )
		{
			if( CheckItemAttr() )
				PrintItemAttr();
		}

		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'

		if( !IsNameEqual(no_filter_tag, LastItem().name) )
		{
			if( LastItem().type == Item::simple )
				Put(L" /");

			PutClosingTagMark();
		}
	}
	else
	{
		// the tag was rejected: its attributes and the closing tag mark have to be consumed
		// otherwise they would be treated as a normal text and printed to the output
		while( ReadItemAttr() )
		{
		}

		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
	}
}


// reads one tag (pchar should point at the opening tag mark)
// a new item is put on the stack and filled in
// returns false if the source has ended or there is no space on the stack
bool HTMLFilter::ReadItem()
{
	if( *pchar == 0 || !PushStack() )
		return false;

	++pchar;	// skipping the first '<'
	SkipWhiteLines();

	switch( *pchar )
	{
	case '!':
		ReadItemSpecial();
		break;

	case '/': // we have a closing tag (add it as a virtual method) !!
		ReadItemClosing();
		break;

	default:
		ReadItemOpening();
		break;
	}

	CheckNewLine();
	LastItem().new_line = last_new_line;

	return true;
}


// converts an ascii capital letter to a small one
// other characters are returned unchanged
wchar_t HTMLFilter::ToLower(wchar_t c)
{
	const bool is_capital = (c>='A' && c<='Z');

	if( !is_capital )
		return c;

	return c - 'A' + 'a';
}


void HTMLFilter::ToLower(std::wstring & str)
{
size_t i;

	for(i=0 ; i<str.size() ; ++i)
		str[i] = ToLower(str[i]);
}


// returns true if both null terminated names are equal
// (ascii letters are compared case-insensitively)
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
{
	while( *name1 != 0 && *name2 != 0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
	}

	// both names have to end at the same place
	return *name1 == 0 && *name2 == 0;
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *)
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
{
	const wchar_t * second = name2.c_str();

	return IsNameEqual(name1, second);
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *)
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
{
	const wchar_t * first = name1.c_str();

	return IsNameEqual(first, name2);
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *)
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
{
	const wchar_t * first  = name1.c_str();
	const wchar_t * second = name2.c_str();

	return IsNameEqual(first, second);
}


// len characters from both strings must be equal
// (ascii letters are compared case-insensitively)
// returns false if either string ends before 'len' characters have been compared
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
{
	while( len > 0 && *name1 != 0 && *name2 != 0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
		--len;
	}

	return len == 0;
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *, size_t)
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
{
	const wchar_t * second = name2.c_str();

	return IsNameEqual(name1, second, len);
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *, size_t)
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
{
	const wchar_t * first = name1.c_str();

	return IsNameEqual(first, name2, len);
}


// the same as IsNameEqual(const wchar_t *, const wchar_t *, size_t)
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
{
	const wchar_t * first  = name1.c_str();
	const wchar_t * second = name2.c_str();

	return IsNameEqual(first, second, len);
}


// returns true if the last item from the stack has the given name
// (ascii letters are compared case-insensitively)
bool HTMLFilter::IsLastTag(const wchar_t * name)
{
	const std::wstring & last_name = LastItem().name;

	return IsNameEqual(name, last_name);
}


// returns true if the last item from the stack has the given name
// (ascii letters are compared case-insensitively)
bool HTMLFilter::IsLastTag(const std::wstring & name)
{
	const std::wstring & last_name = LastItem().name;

	return IsNameEqual(name, last_name);
}


// checking exceptions for opening tags
// - tags which have no content and no closing tag are removed from the stack at once
// - the content of script, pre, textarea is printed without filtering (together with the closing tag)
// - the content of the nofilter tag is printed without filtering (without the closing tag)
// - the body tag marks the item as being inside the body
// only one of the above is applied to a tag: when the raw content has been printed
// the tag can already be removed from the stack and LastItem() is then a different item
void HTMLFilter::CheckExceptions()
{
	if( IsLastTag(L"meta")	||
		IsLastTag(L"input")	||
		IsLastTag(L"br")	||
		IsLastTag(L"hr")	||
		IsLastTag(L"img")	||
		IsLastTag(L"link")	||
		IsLastTag(L"param")	||
		IsLastTag(L"col")	||
		IsLastTag(L"area")   )
	{
		LastItem().type = Item::simple;
		PopStack();
		return;
	}

	// in safe_mode the script tag is ignored
	// NOTE(review): the content of pre and textarea is printed raw also in safe_mode
	// - confirm that this is intended
	if( (!safe_mode && IsLastTag(L"script")) || IsLastTag(L"pre") || IsLastTag(L"textarea") )
	{
		PutLastTagWithClosingTag();
		return;
	}

	// an empty nofilter name means that the feature is not used
	// (without this test a tag with an empty name would be treated as the nofilter tag)
	if( !no_filter_tag.empty() && IsLastTag(no_filter_tag) )
	{
		PutTextBetweenLastTagWithClosingTag();
		return;
	}

	if( IsLastTag(L"body") )
		LastItem().has_body_tag = true;
}


// called when the closing tag from the top of the stack does not match the previous item
// if there is a matching opening tag deeper on the stack then all the tags
// opened after it (and the matching tag itself) are closed
// if there is no matching opening tag the closing tag is just dropped
void HTMLFilter::AddForgottenTags()
{
	int i;

	if( stack_len < 3 )
	{
		// there is only the closing tag and at most one opening tag (with a different name)
		// so there is no matching opening tag, we don't print the closing tag
		// (it has to be removed, otherwise it would stay on the stack as if it was an opening tag)
		PopStack();
		return;
	}

	// we have forgotten to close some tags

	// looking whether there is a matching opening tag
	for(i=int(stack_len)-3 ; i>=0 ; --i)
		if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
			break;

	if( i < 0 )
	{
		// oops, there is no such a tag
		// we don't print the closing and the missing opening tag
		PopStack();
		return;
	}

	// closing the tags from the one just below the closing tag down to the matching one
	for(int z=(int)stack_len-2 ; z>=i ; --z)
	{
		if( pstack[z].new_line )
		{
			PutNewLine();
			PutTabs(z);
		}

		PutClosingTag(pstack[z].name.c_str());
		pstack[z].Clear();
	}

	last_new_line = pstack[stack_len-1].new_line;

	// invalidate tags
	stack_len = i;
}


// prints closing tags for all items which are still on the stack
// (from the top of the stack to the bottom)
// the stack is empty afterwards
void HTMLFilter::CheckStackPrintRest()
{
	// stack_len is decremented inside the loop so it ends with zero
	// (and not with a value decremented below zero)
	while( stack_len > 0 )
	{
		stack_len -= 1;

		if( stack_len==0 || pstack[stack_len-1].new_line )
			PutNewLine();

		PutTabs(stack_len);
		PutClosingTag(pstack[stack_len].name.c_str());
	}
}


void HTMLFilter::CheckClosingTags()
{
	if( stack_len == 0 )
		return;

	// on the stack we have only opening tags
	// but only the last tag is a closing tag

	if( stack_len == 1 )
	{
		// there is only last closing tag
		// we dont print it
		PopStack();
		return;
	}

	// there are more than one tag
	if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
	{
		// last closing tag is from the previous one
		if( pstack[stack_len-2].new_line )
		{
			PutNewLine();
			PutTabs(stack_len-2);
		}

		PutClosingTag(pstack[stack_len-1].name.c_str());
		last_new_line = pstack[stack_len-1].new_line;
		PopStack();
		PopStack();
	}
	else
	{
		AddForgottenTags();
	}
}


// prints the rest of the source (from the current position to the end) without filtering
// returns true if something was printed
// in the safe mode nothing is printed and false is returned
bool HTMLFilter::PrintRest()
{
	// in safe mode we do not print the rest html code
	if( safe_mode )
		return false;

	const wchar_t * rest_begin = pchar;

	while( *pchar )
		++pchar;

	if( pchar == rest_begin )
		return false;

	Put(rest_begin, pchar);

	return true;
}


// the main loop: reads tags one by one (each followed by a normal text)
// until the source ends or there is no space on the stack
void HTMLFilter::ReadLoop()
{
	while( ReadItem() )
	{
		switch( LastItem().type )
		{
		case Item::opening:
			CheckExceptions();
			break;

		case Item::special:
		case Item::simple:
			if( stack_len > 1 )
			{
				// the parent item takes the new_line flag from this tag
				pstack[stack_len-2].new_line = LastItem().new_line;
			}
			else
			if( trim_white )
			{
				// one new line after a simple or special tag
				// (if the tag has level 0 in the tree - it does not mean that this is the first tag)
				// for example it can be DOCTYPE
				PutNewLine();
			}

			PopStack();
			break;

		case Item::closing:
			CheckClosingTags();
			break;

		default:
			break;
		}

		ReadNormalText();
	}
}


// reads and filters the whole source
void HTMLFilter::Read()
{
	if( trim_white )
	{
		// white characters from the beginning of the source are dropped
		SkipWhiteLines();
	}

	// it can be some text or white lines before the first html tag (we print it)
	ReadNormalText();

	// reading the whole html source
	ReadLoop();

	// sometimes there can remain some html source (when there is no space on the stack)
	// we print the rest html without filtering (only if safe_mode is false)
	const bool rest_printed = PrintRest();

	// if nothing was left then tags which are still open are closed
	if( !rest_printed )
		CheckStackPrintRest();
}


} // namespace Winix