// pikotools/src/html/htmlparser.cpp

/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2008-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "htmlparser.h"

#include "convert/text.h"


namespace pt
{
// out-of-class definitions of the static white-character mode constants;
// no initializer is given here, so the values presumably come from the
// in-class declarations in htmlparser.h
const int HTMLParser::WHITE_MODE_ORIGIN;
const int HTMLParser::WHITE_MODE_SINGLE_LINE;
const int HTMLParser::WHITE_MODE_TREE;


// Resets the per-run state so that a new input can be processed:
// first the state kept by BaseParser, then the fields owned by this class.
// Every parse_* and filter* entry point calls this before doing anything else.
void HTMLParser::clear_input_flags()
{
	BaseParser::clear_input_flags();

	// no output target is selected until the caller picks one
	out_string = nullptr;
	out_stream = nullptr;
	out_space  = nullptr;

	// default parsing mode
	parsing_html     = true;
	xml_compact_mode = true;

	// position and bookkeeping
	status    = ok;
	line      = 1;
	line_len  = 0;
	stack_len = 0;
}


// Puts the item back into its empty state: no name, type 'none',
// all flags false, no orphans table and no attached space.
void HTMLParser::Item::Clear()
{
	name.clear();
	type = none;

	// pointers to objects owned elsewhere
	porphans = nullptr;
	space    = nullptr;

	// flags
	is_commentary          = false;
	has_body_tag           = false;
	new_line_before        = false;
	new_line               = false;
	new_line_in_the_middle = false;

	tree_index = 0;
}


// A freshly constructed item is in the same state as a cleared one.
HTMLParser::Item::Item()
{
	this->Clear();
}


void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
{
	clear_input_flags();

	pchar_unicode    = in;
	xml_compact_mode = compact_mode;
	out_space = &space;
	out_space->clear();

	Init();
	Read();
	Uninit();
}


// Parses an xml file and stores the result in 'out_space'.
//   file_name    - path of the file, opened in binary mode
//   out_space    - output tree
//   compact_mode - copied to xml_compact_mode
//   clear_space  - when true the output tree is cleared before parsing
// Returns the parser status; cant_open_file if the file could not be opened.
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	clear_input_flags();

	reading_from_file = true;
	parsing_html      = false;
	xml_compact_mode  = compact_mode;
	this->out_space   = &out_space;

	if( clear_space )
		out_space.clear();

	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	Init();
	Read();
	Uninit();

	file.close();

	return status;
}


// Convenience overload: the file name is given as a std::string.
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const char * raw_file_name = file_name.c_str();

	return parse_xml_file(raw_file_name, out_space, compact_mode, clear_space);
}


// Convenience overload: the file name is given as a wide string,
// it is converted with wide_to_utf8() before the file is opened.
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	std::string utf8_name;

	wide_to_utf8(file_name, utf8_name);
	const char * raw_file_name = utf8_name.c_str();

	return parse_xml_file(raw_file_name, out_space, compact_mode, clear_space);
}


// Convenience overload: the file name is given as a std::wstring.
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const wchar_t * wide_file_name = file_name.c_str();

	return parse_xml_file(wide_file_name, out_space, compact_mode, clear_space);
}


// Filters html text given as a wide string and writes the result to 'out'.
//   in               - input text
//   out              - output string
//   clear_out_string - when true 'out' is cleared before filtering
void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
{
	clear_input_flags();

	out_string    = &out;
	pchar_unicode = in;

	if( clear_out_string )
		out.clear();

	Init();
	Read();
	Uninit();
}


// Filters html text from 'in' to 'out'.
// 'in' and 'out' must be different objects; if they are the same string
// nothing is done. Before filtering, the capacity of 'out' is raised
// to at least in.size() * 2 + 1 characters.
void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
{
	if( &in != &out )
	{
		const size_t projected_len = in.size() * 2 + 1;

		if( projected_len > out.capacity() )
			out.reserve(projected_len);

		filter(in.c_str(), out, clear_out_string);
	}
}


void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
{
	clear_input_flags();

	WTextStream::const_iterator begin = in.begin();
	WTextStream::const_iterator end = in.end();

	wtext_stream_iterator = &begin;
	wtext_stream_iterator_end = &end;

	out_stream = &out;

	if( clear_out_stream )
		out_stream->clear();

	Init();
	Read();
	Uninit();
}


// Filters html read from a file and writes the result to 'out'.
//   file_name        - path of the file, opened in binary mode
//   out              - output string
//   clear_out_stream - when true 'out' is cleared before filtering
// Returns the parser status; cant_open_file if the file could not be opened.
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
{
	clear_input_flags();

	reading_from_file = true;

	// the file has to be opened before 'out' is cleared,
	// 'out' can be the same string as the file_name
	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	out_string = &out;

	if( clear_out_stream )
		out.clear();

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	Init();
	Read();
	Uninit();

	file.close();

	return status;
}


// Convenience overload: the file name is given as a std::string.
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
{
	const char * raw_file_name = file_name.c_str();

	return filter_file(raw_file_name, out, clear_out_stream);
}


// Convenience overload: the file name is given as a wide string,
// it is converted with wide_to_utf8() before the file is opened.
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
{
	std::string utf8_name;
	pt::wide_to_utf8(file_name, utf8_name);

	const std::string & narrow_name = utf8_name;

	return filter_file(narrow_name, out, clear_out_stream);
}


// Convenience overload: the file name is given as a std::wstring.
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
{
	const wchar_t * wide_file_name = file_name.c_str();

	return filter_file(wide_file_name, out, clear_out_stream);
}


// Called by the parse_* and filter* entry points right before Read().
// It does nothing in this class.
void HTMLParser::Init()
{
}


// Called by the parse_* and filter* entry points right after Read().
// It does nothing in this class.
void HTMLParser::Uninit()
{
}


// Returns the current value of the 'line' counter
// (clear_input_flags() sets it to 1 at the beginning of each run).
int HTMLParser::get_last_parsed_line()
{
	return line;
}


// Sets the user-configurable options to their default values:
// original white characters, tab size 2, no line wrapping,
// orphans joined with &nbsp;, all skipping/analyzing switched off.
void HTMLParser::SetSomeDefaults()
{
	// formatting
	white_mode = WHITE_MODE_ORIGIN;
	tab_size   = 2;
	wrap_line  = 0;

	// orphans
	orphan_mode = orphan_nbsp;

	// filtering switches
	safe_mode         = false;
	skip_tags         = false;
	skip_commentaries = false;
	skip_entities     = false;
	analyze_entities  = false;
}


// Allocates the item stack and the character buffer (both are released
// in the destructor) and sets the options to their defaults.
HTMLParser::HTMLParser()
{
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	SetSomeDefaults();
}


// Copy constructor. Nothing is actually copied from 'f': the new object
// gets its own item stack and buffer and the default options.
HTMLParser::HTMLParser(const HTMLParser & f)
{
	// don't need to copy the stack
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	SetSomeDefaults();
}


// Assignment operator. Nothing is copied from 'f' (the parsing stack and
// the buffer hold only per-run state).
//
// The item stack and the buffer have already been allocated by a constructor
// and are kept as they are. They must not be allocated again here: doing so
// without releasing the previous arrays leaked memory on every assignment.
HTMLParser & HTMLParser::operator=(const HTMLParser & f)
{
	// don't need to copy the stack

	// we can copy some fields from f
	(void)f;

	return *this;
}


// Releases the arrays allocated in the constructors.
HTMLParser::~HTMLParser()
{
	delete [] buffer;
	delete [] pstack;
}


// Selects how white characters are treated in the output.
// Values outside of WHITE_MODE_ORIGIN..WHITE_MODE_TREE are ignored.
void HTMLParser::white_chars_mode(int mode)
{
	if( mode < WHITE_MODE_ORIGIN || mode > WHITE_MODE_TREE )
		return;

	white_mode = mode;
}


// Sets the line length after which lines are wrapped.
// The value is limited to 10000; ReadText() treats zero as "no wrapping".
void HTMLParser::WrapLine(size_t wrap_line_)
{
	wrap_line = wrap_line_;

	if( wrap_line <= 10000 )
		return;

	wrap_line = 10000;
}


// Sets how many spaces PutTabs() writes for one indentation level.
// The value is limited to 1000.
void HTMLParser::InsertTabs(size_t tabsize)
{
	tab_size = tabsize;

	if( tab_size <= 1000 )
		return;

	tab_size = 1000;
}


// Returns the white character mode which is in force at the moment:
// the last entry of white_char_mode_tab, or WHITE_MODE_ORIGIN
// if the table is empty.
int HTMLParser::current_white_char_mode()
{
	if( white_char_mode_tab.empty() )
		return WHITE_MODE_ORIGIN;

	return white_char_mode_tab.back();
}


// Stores in orphans.max_len the length of the longest word from orphans.tab
// (zero if the table is empty).
void HTMLParser::CalcOrphansMaxLen(Orphans & orphans)
{
	orphans.max_len = 0;

	for(const std::wstring & word : orphans.tab)
	{
		if( word.size() > orphans.max_len )
			orphans.max_len = word.size();
	}
}


void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
{
	lang_code_lower = lang_code;
	ToLower(lang_code_lower);

	orphans_temp.tab = otab;
	std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
	CalcOrphansMaxLen(orphans_temp);

	orphans_tab[lang_code_lower] = orphans_temp;
}


// Convenience overload: the language code is given as a std::wstring.
void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
	const wchar_t * raw_lang_code = lang_code.c_str();

	AssignOrphans(raw_lang_code, otab);
}


// Removes the orphan tables of all languages.
void HTMLParser::ClearOrphans()
{
	orphans_tab.clear();
}


// Selects what is written after an orphan (see PutNonBreakingSpace()):
//   L"160"         - the character with code 160
//   anything else  - the &nbsp; entity
void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str)
{
	if( orphan_mode_str != L"160" )
	{
		orphan_mode = orphan_nbsp;
		return;
	}

	orphan_mode = orphan_160space;
}


// Turns the safe mode on or off.
// In the safe mode IsTagSafe() rejects tags from its list of unsafe tags.
void HTMLParser::SafeMode(bool safe_mode_)
{
	safe_mode = safe_mode_;
}


// Turns skipping of tags on or off; when it is on, the tags
// (opening, closing and their attributes) are not written to the output.
void HTMLParser::SkipTags(bool skip_tags)
{
	this->skip_tags = skip_tags;
}

// Stores the skip_commentaries flag
// (the code which reads the flag is outside of this part of the file).
void HTMLParser::SkipCommentaries(bool skip_commentaries)
{
	this->skip_commentaries = skip_commentaries;
}


// Turns skipping of entities (such as &nbsp;) on or off.
// Entities have to be recognized in order to be skipped, so turning this
// option on also turns analyze_entities on (turning it off leaves
// analyze_entities unchanged).
void HTMLParser::SkipEntities(bool skip_entities)
{
	this->skip_entities = skip_entities;

	if( skip_entities )
		this->analyze_entities = true;
}


// Turns recognizing of entities on or off; when it is on, text is passed
// through AnalyzeEntitiesAndPut() which calls EntityFound() for each entity.
void HTMLParser::AnalyzeEntities(bool analyze_entities)
{
	this->analyze_entities = analyze_entities;
}


// Sets the name of the 'no filter' tag. The tag itself is never printed
// and white characters inside of it are kept in their original form
// (see PrintOpeningItem() and CheckWhiteCharsExceptions()).
void HTMLParser::SetNoFilterTag(const std::wstring & tag_name)
{
	no_filter_tag = tag_name;
}


// Returns the item at index 'i' of the stack.
// If there is no such item, the special 'empty' item is cleared and returned.
HTMLParser::Item & HTMLParser::GetItem(size_t i)
{
	if( i < stack_len )
		return pstack[i];

	empty.Clear();

	return empty;
}


// Returns the item from the top of the stack.
// If the stack is empty, the special 'empty' item is cleared and returned.
HTMLParser::Item & HTMLParser::LastItem()
{
	if( stack_len > 0 )
		return pstack[stack_len - 1];

	empty.Clear();

	return empty;
}


bool HTMLParser::PushStack()
{
	if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
		// oops, too many items
		return false;

	pstack[stack_len].Clear();

	if( stack_len > 0 )
	{
		// 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated
		pstack[stack_len].porphans     = pstack[stack_len-1].porphans;
		pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
		pstack[stack_len].tree_index   = pstack[stack_len-1].tree_index;
	}

	stack_len += 1;

return true;
}


// Removes the item from the top of the stack and clears it.
// Does nothing when the stack is empty.
void HTMLParser::PopStack()
{
	if( stack_len > 0 )
	{
		--stack_len;
		pstack[stack_len].Clear();
	}
}


// Returns true for a space, a tab, the character 13 and the character 160.
// The new line character (10) is deliberately not treated as white here,
// callers test it separately.
bool HTMLParser::IsWhite(int c)
{
	// dont use c==10 here
	return c == ' ' || c == '\t' || c == 13 || c == 160;
}


// Reads characters as long as they are white (see IsWhite(), new lines
// are not skipped). If out_string is not null, the skipped characters
// are appended to it.
void HTMLParser::SkipWhite(std::wstring * out_string)
{
	for( ; IsWhite(lastc) ; read_char() )
	{
		if( out_string )
			(*out_string) += lastc;
	}
}


// Reads characters as long as they are white or new line characters.
// If out_string is not null, the skipped characters are appended to it.
void HTMLParser::SkipWhiteLines(std::wstring * out_string)
{
	for( ; lastc == 10 || IsWhite(lastc) ; read_char() )
	{
		if( out_string )
			(*out_string) += lastc;
	}
}


// Skips white characters, then at most one new line character
// and the white characters which follow it.
void HTMLParser::SkipWhiteWithFirstNewLine()
{
	SkipWhite();

	if( lastc != 10 )
		return;

	read_char();
	SkipWhite();
}


//void HTMLParser::CheckNewLine()
//{
//	if( white_mode == WHITE_MODE_TREE )
//	{
//		SkipWhite();
//	}
//
//	last_new_line = (lastc==10);
//}


// Skips the rest of the current tag: reads characters until the closing
// tag mark ('>') is found outside of a quoted string, and reads past it.
// Reading also stops when lastc becomes -1.
//
// If the xml closing mark ('/') is met outside of quotes and the last item
// is an opening one, its type is changed to Item::simple.
//
// remember_text - can be null; otherwise all the skipped characters
//                 (without the final '>') are appended to it
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
{
	bool is_quoted = false;
	wchar_t quote_char = 0;

	while( lastc != -1 )
	{
		if( lastc == '"' || lastc == '\'' )
		{
			if( is_quoted )
			{
				// only the same kind of quote closes the string
				if( lastc == quote_char )
				{
					is_quoted = false;
				}
			}
			else
			{
				is_quoted = true;
				quote_char = lastc;
			}
		}
		else
		if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/'
		{
			LastItem().type = Item::simple;
		}
		else
		if( !is_quoted && IsClosingTagMark(lastc) )
		{
			// the closing mark is consumed but not remembered
			read_char();
			break;
		}

		if( remember_text )
			(*remember_text) += lastc;

		read_char();
	}
}


// Returns true if 'c' can be a part of a tag name:
// an ASCII letter, a digit or one of: - ! : _
// (: is for a namespace character, ! and - are for a commentary)
bool HTMLParser::IsValidCharForName(int c)
{
	const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	const bool is_digit  = (c >= '0' && c <= '9');
	const bool is_extra  = (c == '-' || c == '!' || c == ':' || c == '_');

	return is_letter || is_digit || is_extra;
}


// Returns true if 'c' can be a part of an attribute name:
// an ASCII letter, a digit or one of: - : _
bool HTMLParser::IsValidCharForAttrName(int c)
{
	const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	const bool is_digit  = (c >= '0' && c <= '9');
	const bool is_extra  = (c == '-' || c == ':' || c == '_');

	return is_letter || is_digit || is_extra;
}


// Returns true if 'c' can be a part of an entity name:
// an ASCII letter, a digit or the # character.
bool HTMLParser::IsValidCharForEntityName(int c)
{
	const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	const bool is_digit  = (c >= '0' && c <= '9');

	return is_letter || is_digit || c == '#';
}


// Reads a tag name from the input and appends it to 'name'.
//   name       - the output string
//   clear_name - when true 'name' is cleared first
//
// All characters valid for a name are consumed, but only the first
// WINIX_HTMLFILTER_ITEM_NAME_MAXLEN of them are stored.
// For a special item reading stops as soon as the name is equal to "!--":
// the item is marked as a commentary and the last '-' is consumed.
void HTMLParser::ReadItemName(std::wstring & name, bool clear_name)
{
size_t i;

	if( clear_name )
		name.clear();

	for(i=0 ; IsValidCharForName(lastc) ; ++i)
	{
		if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
		{
			name += lastc;

			// the beginning of a commentary: <!--
			if( LastItem().type == Item::special && name == L"!--" )
			{
				LastItem().is_commentary = true;
				read_char();
				break;
			}
		}

		read_char();
	}
}


// Reads an attribute name from the input to attr_name.
// All characters valid for an attribute name are consumed, but only
// the first WINIX_HTMLFILTER_ATTR_NAME_MAXLEN of them are stored.
void HTMLParser::ReadItemAttrName()
{
	attr_name.clear();

	size_t consumed = 0;

	while( lastc != -1 && IsValidCharForAttrName(lastc) )
	{
		if( consumed < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
			attr_name += lastc;

		++consumed;
		read_char();
	}
}


// Adds one value to the attr_value table.
// When analyze_entities is set, the value goes through
// AnalyzeEntitiesAndPut(), otherwise it is stored unchanged.
void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str)
{
	if( !analyze_entities )
	{
		attr_value.push_back(str);
		return;
	}

	attr_value.push_back(std::wstring());

	const wchar_t * first = str.c_str();
	AnalyzeEntitiesAndPut(first, first + str.size(), &attr_value.back());
}


// Reads an html attribute value and splits it on white characters
// (and new lines) into the attr_value table.
//   has_quote  - true if the value is quoted
//   quote_char - the quote mark which opened the value
//
// A quoted value ends at the matching quote mark (it is not consumed here);
// a value without quotes ends at '>', at a new line or at a white character.
// Parts longer than WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN are not added.
void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
{
	attr_value.clear();
	tmp_text.clear();

	while( lastc != -1 )
	{
		if( has_quote )
		{
			if( lastc == quote_char )
				break;
		}
		else
		{
			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
				break;
		}

		if( lastc==10 || IsWhite(lastc) )
		{
			// a separator: the part collected so far is complete
			if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
				ReadItemAttrValueAdd(tmp_text);

			tmp_text.clear();
		}
		else
		{
			// NOTE(review): when the part becomes too long it is dropped and
			// collecting starts again, so the tail of a too long part
			// can still be added as a value -- confirm this is intended
			if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
				tmp_text.clear();

			tmp_text += lastc;
		}

		read_char();
	}

	// the last part (there was no separator after it)
	if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
		ReadItemAttrValueAdd(tmp_text);
}


// Reads an xml attribute value as one string to tmp_text
// (attr_value is only cleared).
//   has_quote  - true if the value is quoted
//   quote_char - the quote mark which opened the value
//
// A quoted value ends at the matching quote mark (it is not consumed here);
// a value without quotes ends at '>', at a new line or at a white character.
// Characters beyond the WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN limit are
// consumed but not stored.
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
{
	attr_value.clear();
	tmp_text.clear();

	for( ; lastc != -1 ; read_char() )
	{
		const bool value_finished = has_quote
			? (lastc == quote_char)
			: (IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc));

		if( value_finished )
			break;

		// IMPROVEME add support for analyze_entities?
		if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
			tmp_text += lastc;
	}
}


// Updates line_len (the length of the current output line)
// for a character which has been written: a new line character
// resets the counter, any other character increments it.
void HTMLParser::CheckChar(wchar_t c)
{
	if( c != 10 )
	{
		line_len += 1;
		return;
	}

	line_len = 0;
}


// Writes one character to the output string and/or the output stream
// (whichever is set) and updates the line length counter.
void HTMLParser::Put(wchar_t c)
{
	if( out_string )
		out_string->push_back(c);

	if( out_stream )
		(*out_stream) << c;

	CheckChar(c);
}


// Writes the characters from the range [str, end) to the output string
// and/or the output stream (whichever is set) and updates the line length
// counter. Nothing is done for an empty range.
void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
{
	if( str < end )
	{
		const size_t len = end - str;

		if( out_string )
			out_string->append(str, len);

		if( out_stream )
			out_stream->write(str, len);

		for(size_t i=0 ; i < len ; ++i)
			CheckChar(str[i]);
	}
}


// Writes a string to the output string and/or the output stream
// (whichever is set) and updates the line length counter.
// Nothing is done for an empty string.
void HTMLParser::Put(const std::wstring & str)
{
	if( str.empty() )
		return;

	if( out_string )
		out_string->append(str);

	if( out_stream )
		out_stream->write(str.c_str(), str.size());

	for(wchar_t c : str)
		CheckChar(c);
}


// Copies the text from the range [str, end) to the output and looks for
// entities in it. An entity is: '&', then from 1 to 8 characters valid for
// an entity name, then ';'.
//
//   str, end - the text
//   out      - can be null; if it is set, the text is appended to it,
//              otherwise the text is written with Put()
//
// For each entity EntityFound() is called with the name (without & and ;).
// If skip_entities is set, the entities are not copied to the output.
//
// The input is never read at or beyond 'end', so the range does not have
// to be followed by a terminating zero.
void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
{
	const size_t epsilon = 8; // !! IMPROVE ME put as a constant
	const wchar_t * old_str = str; // the beginning of the text not copied yet

	while( str < end )
	{
		if( IsStartingEntityMark(*str) )
		{
			const wchar_t * entity_start = str;
			str += 1; // skip &

			for(size_t i=0 ; str < end && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
			{
			}

			if( str < end && IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
			{
				// the text before the entity
				if( out )
					out->append(old_str, entity_start);
				else
					Put(old_str, entity_start);

				str += 1; // skip ;

				if( !skip_entities )
				{
					if( out )
						out->append(entity_start, str);
					else
						Put(entity_start, str);
				}

				EntityFound(entity_start + 1, str - 1); // without & and ;
				old_str = str;
			}

			// if it was not an entity, the characters stay in the pending
			// text and the scanning goes on from the current position
		}
		else
		{
			str += 1;
		}
	}

	// the rest of the text
	if( out )
		out->append(old_str, end);
	else
		Put(old_str, end);
}


// Compares the word from the range [str, end) with orphan_str.
// Characters are compared through ToLower() (so A-Z are case insensitive).
// Returns:
//   zero             - the words are equal
//   a negative value - the word sorts before orphan_str
//   a positive value - the word sorts after orphan_str
//
// The difference is calculated as a signed int; storing it in an unsigned
// type and converting back would rely on an implementation-defined
// conversion.
int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
{
	const wchar_t * orphan = orphan_str.c_str();

	for( ; str<end && *orphan!=0  ; ++str, ++orphan )
	{
		const int res = int(ToLower(*str)) - int(ToLower(*orphan));

		if( res != 0 )
			return res;
	}

	// the word is longer than the orphan
	if( str < end )
		return ToLower(*str);

	// zero if both have ended, otherwise the orphan is longer
	return -int(ToLower(*orphan));
}


// Returns true if the word from the range [str, end) is in the table.
// It is a binary search, so the table should be sorted
// (by the order which the three-argument CheckOrphan() defines).
bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
{
	if( table.empty() )
		return false;

	size_t low  = 0;
	size_t high = table.size() - 1;

	// a word before the first entry cannot be in the table
	const int first_cmp = CheckOrphan(str, end, table[low]);

	if( first_cmp <= 0 )
		return first_cmp == 0;

	// neither can a word after the last entry
	const int last_cmp = CheckOrphan(str, end, table[high]);

	if( last_cmp >= 0 )
		return last_cmp == 0;

	// now the word is strictly between table[low] and table[high]
	while( low + 1 < high )
	{
		const size_t middle = (low + high) / 2;
		const int cmp = CheckOrphan(str, end, table[middle]);

		if( cmp == 0 )
			return true;

		if( cmp < 0 )
			high = middle;
		else
			low = middle;
	}

	return false;
}


bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end)
{
	if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
		return false;

	size_t len = end - str;

	if( len > LastItem().porphans->max_len )
		return false;

return CheckOrphan(str, end, LastItem().porphans->tab);
}


// Reads a run of non-white characters (up to a white character, a new line,
// the opening tag mark or the end of the input), appends it to 'str'
// and writes 'str' to the output.
//
// If 'str' is not empty, it is preceded by:
//   - a new line with an indentation, if allow_put_new_line is set
//   - otherwise one space, if allow_put_space is set
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
{
	for( ; lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) ; read_char() )
		str += lastc;

	const bool has_text = !str.empty();

	if( has_text && allow_put_new_line )
	{
		Put(10);
		PutTabs(LastItem().tree_index + 1);
	}
	else
	if( has_text && allow_put_space )
	{
		Put(' ');
	}

	if( !analyze_entities )
	{
		Put(str);
		return;
	}

	AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
}


// Reads a run of white characters and new lines.
//   was_white_char - set to true if a white character (not a new line) was read
//   was_new_line   - set to true if a new line character was read
//   result_text    - can be null; otherwise the characters are appended to it
//
// What is written to the output depends on the current white mode:
//   WHITE_MODE_ORIGIN      - the characters are written as they are
//   WHITE_MODE_SINGLE_LINE - one space is written for the whole run
//   WHITE_MODE_TREE        - nothing is written here, white characters are
//                            written at the beginning of a <tag> or text
void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
{
	was_white_char = false;
	was_new_line   = false;

	for( ; lastc == 10 || IsWhite(lastc) ; read_char() )
	{
		if( lastc != 10 )
			was_white_char = true;
		else
			was_new_line = true;

		if( result_text )
			(*result_text) += lastc;

		if( current_white_char_mode() == WHITE_MODE_ORIGIN )
			Put(lastc);
	}

	const bool anything_read = was_white_char || was_new_line;

	if( anything_read && current_white_char_mode() == WHITE_MODE_SINGLE_LINE )
		Put(' ');
}


// Writes the mark which opens a tag ('<') to the output.
void HTMLParser::PutOpeningTagMark()
{
	Put('<');
}


// Writes the mark which closes a tag ('>') to the output.
void HTMLParser::PutClosingTagMark()
{
	Put('>');
}


// Returns true if the tag can be written to the output.
//   - when the safe mode is off, every tag is accepted
//   - in the safe mode the no_filter_tag and the tags from the list below
//     are rejected (names are compared case insensitively)
//
// !! IMPROVE ME change to a better name
// (a 'true' answer does not mean that the tag is safe: with the safe mode
// turned off the function returns true for every tag)
bool HTMLParser::IsTagSafe(const wchar_t * tag)
{
	if( !safe_mode )
		return true;

	if( IsNameEqual(tag, no_filter_tag.c_str()) )
		return false;

	// every name must be followed by a comma: two adjacent string literals
	// are silently joined into one (L"param" L"object" is L"paramobject")
	static const wchar_t * unsafe_tags[] = {
		L"applet", 	L"base",	L"body",
		L"embed",	L"head",	L"html",
		L"frame",	L"frameset",L"iframe",
		L"link",	L"meta",	L"param",
		L"object",	L"script"
	};

	for(const wchar_t * unsafe_tag : unsafe_tags)
	{
		if( IsNameEqual(tag, unsafe_tag) )
			return false;
	}

	return true;
}


// Convenience overload: the tag name is given as a std::wstring.
bool HTMLParser::IsTagSafe(const std::wstring & tag)
{
	const wchar_t * raw_tag = tag.c_str();

	return IsTagSafe(raw_tag);
}


// Writes the beginning of the last item as an opening tag: '<' and the name
// (in WHITE_MODE_TREE preceded by a new line and an indentation if the item
// has new_line_before set).
// If the tag is rejected by IsTagSafe(), the whole tag is skipped
// in the input and false is returned.
bool HTMLParser::PutOpeningTag()
{
	if( !IsTagSafe(LastItem().name) )
	{
		SkipAndCheckClosingTag();
		return false;
	}

	const bool start_new_line = (current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before);

	if( start_new_line )
	{
		Put(10);
		PutTabs(LastItem().tree_index);
	}

	PutOpeningTagMark();
	Put(LastItem().name);

	return true;
}


// Writes the closing tag for 'item': "-->" for a commentary,
// "</name>" for other items.
// Nothing is written when tags are skipped, when the tag is rejected
// by IsTagSafe() or when the last item on the stack is the no_filter_tag.
// NOTE(review): the no_filter_tag test looks at LastItem().name and not
// at item.name -- confirm this is intended
void HTMLParser::PutClosingTag(const Item & item)
{
	const bool suppressed = skip_tags || !IsTagSafe(item.name) || IsNameEqual(no_filter_tag, LastItem().name);

	if( suppressed )
		return;

	if( !item.is_commentary )
	{
		PutOpeningTagMark();
		Put('/');
		Put(item.name);
		PutClosingTagMark();
		return;
	}

	Put('-');
	Put('-');
	PutClosingTagMark();
}


// Writes an indentation: 'len' levels (at most 30), each of them
// consisting of tab_size spaces. The spaces are written directly,
// so they are not counted in 'line_len'.
void HTMLParser::PutTabs(size_t len)
{
	if( len > 30 )
		len = 30;

	const auto spaces = len * tab_size;

	for(size_t i=0 ; i < spaces ; ++i)
	{
		if( out_string )
			(*out_string) += ' ';

		if( out_stream )
			(*out_stream) << ' ';
	}
}


// Writes a non-breaking space in the form selected by orphan_mode:
// the &nbsp; entity or the character with code 160.
void HTMLParser::PutNonBreakingSpace()
{
	if( orphan_mode != orphan_nbsp )
	{
		Put(160);
		return;
	}

	Put(L"&nbsp;");
}


// Returns true if 'c' is the mark which opens a tag: '<'
// (we assume the size of the opening mark to be one)
bool HTMLParser::IsOpeningTagMark(wchar_t c)
{
	return (c == '<');
}


// Returns true if 'c' is the mark which closes a tag: '>'
// (we assume the size of the closing mark to be one)
bool HTMLParser::IsClosingTagMark(wchar_t c)
{
	return (c == '>');
}


// Returns true if 'c' is the slash which begins a closing tag e.g. </p>
bool HTMLParser::IsClosingTagIndicator(wchar_t c)
{
	return (c == '/');
}


// Returns true if 'c' is the exclamation mark which begins a special tag
// e.g. <!doctype ...> or a commentary <!-- ... -->
bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
{
	return (c == '!');
}

// Returns true if 'c' is the question mark which begins an xml special tag
// e.g. <?xml ... ?>
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
	return (c == '?');
}

// Returns true if 'c' is the '=' operator which separates an attribute name
// from its value e.g. class="value"
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
{
	return (c == '=');
}


// Returns true if 'c' is the slash at the end of a simple tag
// e.g. <img src=".." /> (without the '>' character)
// we assume the size of the mark to be one
bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
{
	return (c == '/');
}


// Returns true if 'c' is the character which begins an entity: '&'
bool HTMLParser::IsStartingEntityMark(wchar_t c)
{
	return (c == '&');
}


// Returns true if 'c' is the character which ends an entity: ';'
bool HTMLParser::IsEndingEntityMark(wchar_t c)
{
	return (c == ';');
}


// Copies the input to the output until the end of a commentary: the
// sequence of two '-' characters followed by the closing tag mark ('>').
// The closing sequence is written to the output as well.
// Reading also stops when lastc becomes -1.
void HTMLParser::ReadTextUntilClosingCommentary()
{
	while( lastc != -1 )
	{
		if( lastc == '-' )
		{
			// tmp_text collects the candidate for the closing sequence
			tmp_text.clear();
			tmp_text += lastc;
			read_char();

			if( lastc == '-' )
			{
				tmp_text += lastc;
				read_char();

				if( IsClosingTagMark(lastc) )
				{
					tmp_text += lastc;
					read_char();
					Put(tmp_text);

					break;
				}
			}

			// it was not the closing sequence: the one or two '-' characters
			// are written as normal text (the current character has not been
			// consumed, it is examined in the next iteration)
			Put(tmp_text);
		}
		else
		{
			Put(lastc);
			read_char();
		}
	}
}


// Called when lastc is the opening tag mark. Checks whether the input
// contains the closing tag of the last item from the stack.
//
// If it does, the whole closing tag is consumed and true is returned;
// when put_closing_tag_as_well is set, "</name>" is written to the output
// (with the name in the form it has in the input).
//
// If it does not, the characters read so far are written to the output
// unchanged and false is returned.
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
{
	tmp_text.clear();
	tmp_text += lastc; // opening tag mark
	read_char();

	SkipWhiteLines(&tmp_text);

	if( IsClosingTagIndicator(lastc) )
	{
		tmp_text += lastc;
		read_char();
		SkipWhiteLines(&tmp_text);
		ReadItemName(tmp_name);

		if( IsNameEqual(tmp_name, LastItem().name) )
		{
			// skipping the rest of the tag (up to and including '>')
			SkipAndCheckClosingTag();

			if( put_closing_tag_as_well )
			{
				Put('<');
				Put('/');
				Put(tmp_name);
				Put('>');
			}

			return true;
		}
		else
		{
			// a closing tag of some other item, it is treated as a text
			Put(tmp_text);
			Put(tmp_name);
		}
	}
	else
	{
		// not a closing tag at all
		Put(tmp_text);
	}

return false;
}


// Copies the input to the output unchanged until the closing tag of the
// last item from the stack is found (or until the end of the input).
// put_closing_tag_as_well tells whether the closing tag is written too.
// used for such tags as: script, textarea
void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
{
	while( lastc != -1 )
	{
		if( !IsOpeningTagMark(lastc) )
		{
			Put(lastc);
			read_char();
			continue;
		}

		// a '<' character: either our closing tag or a part of the text
		if( IsClosingTagForLastItem(put_closing_tag_as_well) )
			break;
	}
}


// reading text between html tags
//
// The text is read up to the next opening tag mark (or to the end of the
// input) as a sequence of: a run of non-white characters, a run of white
// characters. Non-white runs are written with PutNormalNonWhite(); white
// runs are written according to the current white mode, or replaced with
// a non-breaking space if the word before them is an orphan.
//
// If out_space is set and there was a non-white text, the text is added
// to the space tree as well.
//
// At the end new_item_has_new_line_before is set (ReadItem() uses it).
void HTMLParser::ReadText()
{
	bool was_white_char = false;
	bool was_new_line = false;

	bool was_non_white_text = false;

	// what should be put before the next non-white run (WHITE_MODE_TREE only)
	bool allow_put_new_line = false;
	bool allow_put_space = false;

	if( current_white_char_mode() == WHITE_MODE_TREE )
	{
		if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
		{
			allow_put_new_line = true;
		}
	}

	Space * text_space = nullptr;
	std::wstring * text_space_wstr = nullptr;

	if( out_space )
	{
		// a temporary node: an empty "name" and a "text" in which
		// the whole text is collected
		text_space = &text_space_tmp;
		text_space->clear();
		text_space->add(L"name", L"");
		Space & wstr_space = text_space->add(L"text", L"");
		text_space_wstr = &wstr_space.value.value_wstring;
	}

	while( lastc != -1 && !IsOpeningTagMark(lastc) )
	{
		tmp_text.clear();
		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);

		if( !tmp_text.empty() )
		{
			allow_put_new_line = false;
			allow_put_space = false;
			was_non_white_text = true;

			if( text_space_wstr )
				(*text_space_wstr) += tmp_text;
		}

		if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
		{
			// white characters after an orphan are replaced
			// with one non-breaking space
			if( lastc == 10 || IsWhite(lastc) )
			{
				SkipWhiteLines(text_space_wstr);
				PutNonBreakingSpace();
			}
		}
		else
		{
			PutNormalWhite(was_white_char, was_new_line, text_space_wstr);

			if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
			{
				allow_put_new_line = false;
				allow_put_space = false;

				if( was_new_line )
				{
					allow_put_new_line = true;
					LastItem().new_line_in_the_middle = true;

					// a new line before any text in this item
					if( !was_non_white_text )
						LastItem().new_line = true;
				}
				else
				{
					allow_put_space = true;
				}

				// the line is too long
				if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line )
				{
					allow_put_new_line = true;
				}
			}
		}
	}

	if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
	{
		AddSpaceToSpaceTree(*text_space);
	}

	text_space_tmp.clear();

	// was_new_line comes from the last call of PutNormalWhite()
	new_item_has_new_line_before = was_new_line;
}


// Writes the beginning of the opening tag of the last item, unless tags
// are skipped or the item is the no_filter_tag (nothing is written then).
// Returns false only if PutOpeningTag() has rejected the tag.
bool HTMLParser::PrintOpeningItem()
{
	if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
		return PutOpeningTag();

	return true;
}


// Reads one attribute of a tag: the name to attr_name and (if there is the
// '=' character) the value; attr_has_value tells whether there was a value.
// When html is parsed the value is split into the attr_value table,
// when xml is parsed it is stored in tmp_text.
// Returns false if there is no attribute name at the current position.
bool HTMLParser::ReadItemAttr()
{
	attr_has_value = false;
	attr_name.clear();
	attr_value.clear();

	SkipWhiteLines();
	ReadItemAttrName();

	if( attr_name.empty() )
		return false;

	SkipWhiteLines();

	// an attribute without a value
	if( !IsAttributeAssignmentMark(lastc) ) // '='
		return true;

	attr_has_value = true;
	read_char();				// skipping '='
	SkipWhiteLines();

	bool has_quote = (lastc == '\"' || lastc == '\'');
	wchar_t quote_char = lastc;

	if( has_quote )
		read_char();			// skipping the first quote mark

	// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
	if( parsing_html )
		ReadItemAttrValue(has_quote, quote_char);
	else
		ReadXMLItemAttrValue(has_quote, quote_char);

	if( has_quote && lastc == quote_char )
		read_char();			// skipping the last quote mark

return true;
}


void HTMLParser::CheckItemLangAttr()
{
	if( attr_has_value && IsNameEqual(L"lang", attr_name) )
	{
		LastItem().porphans = nullptr;

		if( !attr_value.empty() )
		{
			// we are taking the first value only
			attr_value_lower = attr_value[0];
			ToLower(attr_value_lower);

			OrphansTab::iterator i = orphans_tab.find(attr_value_lower);

			if( i != orphans_tab.end()  )
				LastItem().porphans = &i->second;
		}
	}
}


// Writes the attribute which has just been read: a space, the name and
// (if there is a value) ="..." with the values from attr_value separated
// by single spaces.
// Nothing is written when tags are skipped or the last item is
// the no_filter_tag.
void HTMLParser::PrintItemAttr()
{
	if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
		return;

	Put(' ');
	Put(attr_name);

	if( !attr_has_value )
		return;

	Put(L"=\"");

	for(size_t index=0 ; index < attr_value.size() ; ++index)
	{
		if( index > 0 )
			Put(' ');

		Put(attr_value[index]);
	}

	Put('\"');
}


// Adds the attribute which has just been read to the space of the last
// item (nothing is done if the item has no space): an entry called
// attr_name is added to the "attr" child space.
// If the attribute has a value:
//   - for html the entry is a table of the values from attr_value
//   - for xml the entry is set to tmp_text
void HTMLParser::PutItemAttrToSpace()
{
	Space * space = LastItem().space;

	if( !space )
		return;

	Space & attr_tab = space->get_add_space(L"attr");
	Space & attr = attr_tab.add_empty_space(attr_name);

	if( !attr_has_value )
		return;

	if( !parsing_html )
	{
		attr.set(tmp_text);
		return;
	}

	attr.set_empty_table();

	for(size_t index=0 ; index < attr_value.size() ; ++index)
		attr.add(attr_value[index]);
}


// Reads a closing tag (lastc is the '/' character): the name is stored
// in the last item, the item type is set to Item::closing and the rest
// of the tag is skipped. Nothing is written to the output here.
void HTMLParser::ReadItemClosing()
{
	read_char(); // skipping '/'
	SkipWhiteLines();
	ReadItemName(LastItem().name);
	LastItem().type = Item::closing;
	SkipAndCheckClosingTag();

	// closing tags are printed later
}


// Reads a special tag (lastc is the '!' or '?' character), for example
// <!doctype ...>, <?xml ...?> or the beginning of a commentary <!--
//
// The name of the item begins with the '!' or '?' character.
// When tags are skipped, the tag is only consumed. Otherwise:
//   - for a commentary only "<!--" is written here
//   - other tags are written in whole as: <name, a space, the rest, '>'
//     and if it is !doctype being the first item in WHITE_MODE_TREE,
//     two new lines are added and the following white lines are skipped
void HTMLParser::ReadItemSpecial()
{
	LastItem().type = Item::special;

	if( !skip_tags )
	{
		if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
		{
			Put(10);
			PutTabs(LastItem().tree_index);
		}

		PutOpeningTagMark();
	}

	LastItem().name = lastc;
	read_char(); // skipping '!' or '?'
	ReadItemName(LastItem().name, false); // is_commentary can be set here

	if( skip_tags )
	{
		SkipAndCheckClosingTag();
	}
	else
	{
		if( LastItem().is_commentary )
		{
			Put(LastItem().name);
		}
		else
		{
			// the rest of the tag (without '>') is collected in tmp_text
			tmp_text.clear();
			SkipWhiteLines();
			SkipAndCheckClosingTag(&tmp_text);
			Put(LastItem().name);
			Put(' ');
			Put(tmp_text);
			Put('>');

			if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
			{
				Put(10);
				Put(10);
				SkipWhiteLines();
			}
		}
	}
}


// Reads an opening tag: the name, then all the attributes, then the rest
// of the tag up to the closing mark.
// The item is added to the output space with AddItemToSpace(); if it has
// got a space and xml_compact_mode is not set, its name is stored there
// as "name".
// The tag and the attributes are written to the output unless the tag is
// rejected by PrintOpeningItem() (the attributes are not even read then).
void HTMLParser::ReadItemOpening()
{
	LastItem().type = Item::opening;
	ReadItemName(LastItem().name);
	AddItemToSpace();
	Space * space = LastItem().space;

	if( !xml_compact_mode && space )
		space->add(L"name", LastItem().name);

	if( PrintOpeningItem() )
	{
		while( ReadItemAttr() )
		{
			CheckItemLangAttr();
			PrintItemAttr();
			PutItemAttrToSpace();
		}

		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'

		if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
		{
			// a simple tag is closed with " />"
			if( LastItem().type == Item::simple )
				Put(L" /");

			PutClosingTagMark();
		}
	}
}


// Called by ReadItem() after an item (a tag) has been read.
// It does nothing in this class.
void HTMLParser::ItemFound()
{
}

// Called by AnalyzeEntitiesAndPut() for each entity which has been found;
// the range [str, end) is the name of the entity without '&' and ';'.
// It does nothing in this class.
void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end)
{
}


// Reads one tag (lastc is expected to be the opening tag mark): a new item
// is pushed on the stack and filled in by ReadItemSpecial(),
// ReadItemClosing() or ReadItemOpening(), then ItemFound() is called.
// Returns false if the input has ended (lastc is -1) or the stack is full.
bool HTMLParser::ReadItem()
{
	if( lastc == -1 )
		return false;

	if( !PushStack() )
		return false;

	LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method

	// the item is one level deeper if there was a new line
	// in the text of its parent
	if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
		LastItem().tree_index += 1;

	read_char();	// skipping the first opening tag mark '<'
	SkipWhiteLines();

	if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
		ReadItemSpecial();
	else
	if( IsClosingTagIndicator(lastc) )
		ReadItemClosing();
	else
		ReadItemOpening();

	// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
	ItemFound();

return true;
}


// Returns the lower case version of an ASCII letter (A-Z),
// other characters are returned unchanged.
wchar_t HTMLParser::ToLower(wchar_t c)
{
	const bool is_upper = (c >= 'A' && c <= 'Z');

	if( !is_upper )
		return c;

	return c - 'A' + 'a';
}


void HTMLParser::ToLower(std::wstring & str)
{
size_t i;

	for(i=0 ; i<str.size() ; ++i)
		str[i] = ToLower(str[i]);
}


// Returns true if both zero-terminated names are equal;
// the ASCII letters are compared case insensitively.
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
{
	while( *name1 != 0 && *name2 != 0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
	}

	// equal only if both names have ended at the same place
	return *name1 == 0 && *name2 == 0;
}


// Convenience overload: the second name is given as a std::wstring.
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
{
	const wchar_t * second = name2.c_str();

	return IsNameEqual(name1, second);
}


// Convenience overload: the first name is given as a std::wstring.
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
{
	const wchar_t * first = name1.c_str();

	return IsNameEqual(first, name2);
}


// Convenience overload: both names are given as std::wstring.
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
{
	const wchar_t * first  = name1.c_str();
	const wchar_t * second = name2.c_str();

	return IsNameEqual(first, second);
}


// Returns true if the first 'len' characters of both zero-terminated names
// are equal (the ASCII letters are compared case insensitively).
// If any of the names is shorter than 'len', the answer is false.
// IMPROVE ME change name to something like IsBeginningNameEqual
// and move to text.h (pikotools)
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
{
	while( *name1 != 0 && *name2 != 0 && len > 0 )
	{
		if( ToLower(*name1) != ToLower(*name2) )
			return false;

		++name1;
		++name2;
		--len;
	}

	return len == 0;
}


// Convenience overload: the second name is given as a std::wstring.
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
{
	const wchar_t * second = name2.c_str();

	return IsNameEqual(name1, second, len);
}


// Convenience overload: the first name is given as a std::wstring.
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
{
	const wchar_t * first = name1.c_str();

	return IsNameEqual(first, name2, len);
}


// Convenience overload: both names are given as std::wstring.
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
{
	const wchar_t * first  = name1.c_str();
	const wchar_t * second = name2.c_str();

	return IsNameEqual(first, second, len);
}


// Returns true if the last item from the stack has the given name
// (the ASCII letters are compared case insensitively).
bool HTMLParser::IsLastTag(const wchar_t * name)
{
	const std::wstring & last_name = LastItem().name;

	return IsNameEqual(name, last_name);
}


// Convenience overload: the name is given as a std::wstring.
bool HTMLParser::IsLastTag(const std::wstring & name)
{
	const std::wstring & last_name = LastItem().name;

	return IsNameEqual(name, last_name);
}


// checking exceptions for opening tags
//
// Tags which never have a closing tag (meta, input, br, ...) are changed
// to Item::simple and removed from the stack at once.
// Additionally the body tag sets has_body_tag in its item.
void HTMLParser::CheckSingleItemExceptions()
{
	static const wchar_t * const single_tags[] = {
		L"meta",	L"input",	L"br",
		L"hr",		L"img",		L"link",
		L"param",	L"col",		L"area"
	};

	for(const wchar_t * single_tag : single_tags)
	{
		if( IsLastTag(single_tag) )
		{
			LastItem().type = Item::simple;
			PopStack();
			return;
		}
	}

	// move me to a better place
	if( IsLastTag(L"body") )
		LastItem().has_body_tag = true;
}


// Adjusts the stack of white character modes (white_char_mode_tab)
// for tags inside which white characters are kept as in the origin.
//
// Only two names are taken into account: "pre" and the name stored
// in no_filter_tag (move the latter to CheckDifferentContentExceptions?).
// The script and textarea tags do not change the white mode here.
//
// For an opening item WHITE_MODE_ORIGIN is pushed, for any other
// item type the most recent mode is popped (if there is one).
void HTMLParser::CheckWhiteCharsExceptions(Item & item)
{
	const bool keep_white_chars =
		IsNameEqual(item.name, L"pre") ||
		IsNameEqual(item.name, no_filter_tag);

	if( !keep_white_chars )
		return;

	if( item.type == Item::opening )
	{
		white_char_mode_tab.push_back(WHITE_MODE_ORIGIN);
		return;
	}

	if( !white_char_mode_tab.empty() )
		white_char_mode_tab.pop_back();
}


// Tags whose content is not parsed as html.
//
// For a textarea tag, and for a script tag when safe_mode is off,
// the content is consumed with ReadTextUntilClosingTag(true)
// and the item is removed from the stack afterwards.
void HTMLParser::CheckDifferentContentExceptions(Item & item)
{
	const bool is_raw_script   = !safe_mode && IsNameEqual(item.name, L"script");
	const bool has_raw_content = is_raw_script || IsNameEqual(item.name, L"textarea");

	if( has_raw_content )
	{
		ReadTextUntilClosingTag(true);
		PopStack();
	}
}


// Prints closing tags for all items which are still on the stack,
// starting from the most recently added one, and empties the stack.
//
// Before a closing tag a separator is printed when the item is
// the outermost one (index 0) or when the item directly below it
// on the stack has the new_line flag set:
//  - in WHITE_MODE_TREE: a new line (10) followed by tabs taken from
//    tree_index of the item below,
//  - in other modes: a single space.
//
// When the function returns stack_len is zero.
void HTMLParser::CheckStackPrintRest()
{
	// the decrement is done inside the loop so that stack_len
	// stops at zero instead of being decremented once more after the last item
	while( stack_len > 0 )
	{
		--stack_len;

		if( stack_len == 0 || pstack[stack_len-1].new_line )
		{
			if( current_white_char_mode() == WHITE_MODE_TREE )
			{
				Put(10);

				// the outermost item has no item below it on the stack,
				// pstack[stack_len-1] would be out of range so no tabs are printed
				if( stack_len > 0 )
					PutTabs(pstack[stack_len-1].tree_index);
			}
			else
			{
				Put(' ');
			}
		}

		PutClosingTag(pstack[stack_len]);
	}
}


// Handles a closing tag which has just been put at the top of the stack.
//
// The function looks downwards on the stack for an item which matches
// the closing tag (the same name, or both items are commentaries).
// If there is none, only the closing tag is removed from the stack.
// Otherwise closing tags are printed for the matching item and for
// all items lying between it and the closing tag, and the stack is
// truncated so that the matching item becomes the first free slot.
void HTMLParser::CheckClosingTags()
{
	int i;

	// nothing to do with an empty stack
	if( stack_len == 0 )
		return;

	// on the stack we have only opening tags
	// but only the last tag is a closing tag

	// the closing tag is the only item so it cannot have an opening counterpart
	if( stack_len == 1 )
	{
		PopStack();
		return;
	}

	// looking whether there is a matching opening tag
	// (searching starts just below the closing tag and goes to the bottom of the stack)
	for(i=int(stack_len)-2 ; i >= 0 ; --i)
		if( (pstack[i].is_commentary && pstack[stack_len-1].is_commentary) || IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
			break;

	if( i < 0 )
	{
		// oops, there is no such an opening tag on the stack
		// we don't print the closing and the missing opening tag
		PopStack();
		return;
	}

	// closing every item from the one just below the closing tag
	// down to (and including) the matching item at index i
	for(int z=(int)stack_len-2 ; z >= i ; --z)
	{
		// NOTE(review): CheckWhiteCharsExceptions() pushes a mode when item.type
		// is Item::opening and pops otherwise; the items visited here are
		// presumably still marked as opening -- confirm this is intended
		CheckWhiteCharsExceptions(pstack[z]);

		// NOTE(review): the test below uses the name of LastItem() (the closing tag)
		// and not the name of pstack[z] -- confirm this is intended
		if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) )
		{
			// in the tree mode a closing tag of an item with the new_line flag
			// is printed in a new line, indented with the item's tree_index
			if( pstack[z].new_line )
			{
				if( current_white_char_mode() == WHITE_MODE_TREE )
				{
					Put(10);
					PutTabs(pstack[z].tree_index);
				}
			}

			// IMPROVEME
			// in PutClosingTag we test IsTagSafe() and no_filter_tag too
			PutClosingTag(pstack[z]);
			pstack[z].Clear();
		}
	}

	// invalidate items on the stack
	// (the matching item at index i, everything above it and the closing tag itself)
	stack_len = i;
}


// Prints the remaining input without any filtering.
//
// Characters are passed to Put() one by one until lastc is -1.
// Nothing is printed when safe_mode or skip_tags is set.
//
// Returns true if at least one character has been printed.
bool HTMLParser::PrintRest()
{
	// in safe mode we do not print the rest html code
	if( safe_mode || skip_tags )
		return false;

	bool printed_anything = false;

	for( ; lastc != -1 ; read_char() )
	{
		Put(lastc);
		printed_anything = true;
	}

	return printed_anything;
}


void HTMLParser::AddItemToSpace()
{
	if( out_space && stack_len > 0 )
	{
		Space * parent = out_space;

		if( stack_len > 1 )
		{
			parent = pstack[stack_len-2].space;
		}

		if( xml_compact_mode )
		{
			Space * space = parent->get_space(pstack[stack_len-1].name);

			if( space )
			{
				if( space->is_table() )
				{
					Space & child = space->add_empty_space();
					pstack[stack_len-1].space = &child;
				}
				else
				{
					Space * tab = new Space();
					tab->add(space);
					Space & child = tab->add_empty_space();

					parent->value.value_object[pstack[stack_len-1].name] = tab;
					pstack[stack_len-1].space = &child;
				}
			}
			else
			{
				Space & space = parent->add_empty_space(pstack[stack_len-1].name);
				pstack[stack_len-1].space = &space;
			}
		}
		else
		{
			Space & childs_tab = parent->get_add_space(L"childs");
			Space & child = childs_tab.add_empty_space();
			pstack[stack_len-1].space = &child;
		}

	}
}


void HTMLParser::AddSpaceToSpaceTree(const Space & space)
{
	const std::wstring * text = space.get_wstr(L"text");

	if( out_space && stack_len > 0 && text )
	{
		if( xml_compact_mode )
		{
			Space * child_text = LastItem().space->get_space(L"text");

			if( child_text )
			{
				if( child_text->is_table() )
				{
					child_text->add(*text);
				}
				else
				{
					Space * tab = new Space();
					tab->add(*child_text);
					tab->add(*text);
					LastItem().space->value.value_object[L"text"] = tab;
				}
			}
			else
			{
				LastItem().space->add(L"text", *text);
			}
		}
		else
		{
			Space & childs_tab = LastItem().space->get_add_space(L"childs");
			childs_tab.add(space);
		}
	}
}


// The main loop of the parser.
//
// Items are read with ReadItem() as long as status is ok.
// What happens next depends on the type of the item just read:
//  - opening: exceptions for opening tags are checked
//    (single tags only when parsing_html is set),
//  - closing: CheckClosingTags() is called,
//  - any other type: the item is removed from the stack; for a special
//    item being a commentary its content is read first.
// After each item the following text is read (if status is still ok).
void HTMLParser::ReadLoop()
{
	while( status == ok && ReadItem() )
	{
		if( LastItem().type == Item::opening )
		{
			if( parsing_html )
				CheckSingleItemExceptions();

			// LastItem() is evaluated again for each call,
			// CheckSingleItemExceptions() could have removed an item from the stack
			CheckWhiteCharsExceptions(LastItem());
			CheckDifferentContentExceptions(LastItem());
		}
		else
		if( LastItem().type == Item::closing )
		{
			CheckClosingTags();
		}
		else
		{
			// special, simple and all other kinds of items
			if( LastItem().type == Item::special && LastItem().is_commentary )
				ReadTextUntilClosingCommentary();

			PopStack();
		}

		if( status == ok )
			ReadText();

		is_first_item = false;
	}
}


// Reads the whole input: the text before the first tag,
// then all items in ReadLoop() and finally what has been left.
void HTMLParser::Read()
{
	// put first character to lastc
	read_char();
	is_first_item = true;

	// the stack of white modes starts with the mode selected in white_mode
	white_char_mode_tab.clear();
	white_char_mode_tab.push_back(white_mode);

	const bool keep_origin_white_chars = (current_white_char_mode() == WHITE_MODE_ORIGIN);

	if( !keep_origin_white_chars )
		SkipWhiteLines();

	// it can be some text or white lines before the first html tag (we print it if using filtering)
	// but they are not added to the Space tree
	ReadText();

	// reading the whole html source
	ReadLoop();

	// sometimes there can remain some html source (when there is no space on the stack)
	// we print the rest html without filtering (only if safe_mode is false),
	// if nothing has been printed the tags still lying on the stack are closed
	const bool rest_printed = PrintRest();

	if( !rest_printed )
		CheckStackPrintRest();
}


}