From bdb2616f32c4a2e9fa2f56abb7877583e9a76664 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 17 Jul 2021 13:35:10 +0200
Subject: [PATCH 01/37] added: HTMLFilter (html/htmlfilter.h|cpp) - copied from
 winix project

---
 src/Makefile.dep        |    1 +
 src/html/htmlfilter.cpp | 1711 +++++++++++++++++++++++++++++++++++++++
 src/html/htmlfilter.h   |  376 +++++++++
 tests/Makefile.dep      |   10 +-
 4 files changed, 2093 insertions(+), 5 deletions(-)
 create mode 100644 src/html/htmlfilter.cpp
 create mode 100644 src/html/htmlfilter.h

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 683e3cf..2a8cf37 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -42,3 +42,4 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlfilter.o: ./html/htmlfilter.h
diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp
new file mode 100644
index 0000000..d103b9e
--- /dev/null
+++ b/src/html/htmlfilter.cpp
@@ -0,0 +1,1711 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/* 
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "htmlfilter.h"
+
+
+
+namespace pt
+{
+
+
+
+
+void HTMLFilter::Item::Clear()
+{
+	// return every field to the state of an unused stack slot
+	has_body_tag = new_line = false;
+	porphans = nullptr;
+	type = none;
+	name.clear();
+}
+
+// a newly constructed item starts in the cleared state
+HTMLFilter::Item::Item()
+{
+	Clear();
+}
+
+
+
+void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
+{
+	// reset the per-run parser state; 'out' is emptied before anything is written
+	out.clear();
+	out_string    = &out;
+	pchar         = in;
+	stack_len     = line_len = 0;
+	last_new_line = false;
+	// the two hooks surround the actual parsing pass
+	Init();
+	Read();
+	Uninit();
+}
+
+
+
+void HTMLFilter::Init()	// called by Filter() just before Read(); does nothing here
+{
+}
+
+
+void HTMLFilter::Uninit()	// called by Filter() right after Read(); does nothing here
+{
+}
+
+
+
+void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
+{
+	// filtering in place is not supported: when 'in' and 'out' are the same
+	// object nothing is done and the string is left untouched
+	if( &in != &out )
+	{
+		// reserve roughly twice the input size up front so that appending
+		// the filtered text needs fewer reallocations
+		const size_t wanted_capacity = in.size() * 2 + 1;
+		if( wanted_capacity > out.capacity() )
+			out.reserve(wanted_capacity);
+
+		Filter(in.c_str(), out);
+	}
+}
+
+
+void HTMLFilter::SetSomeDefaults()
+{
+	// output formatting
+	tab_size = 2;
+	wrap_line = 0;
+	break_after = 0;
+	trim_white = false;
+	orphan_mode = orphan_nbsp;
+	// filtering switches, all of them off by default
+	safe_mode = false;
+	skip_tags = skip_commentaries = skip_entities = false;
+	analyze_entities = false;
+}
+
+
+HTMLFilter::HTMLFilter()
+{
+	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];	// the stack of items, released in the destructor
+	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];	// a work buffer, released in the destructor
+	// all options start with their default values
+	SetSomeDefaults();
+}
+
+
+HTMLFilter::HTMLFilter(const HTMLFilter & f)
+{
+	// 'f' is not read here: the copy gets its own freshly allocated stack and buffer and
+	// default options (NOTE(review): options of 'f' are not copied - confirm this is intended)
+	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
+	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
+	SetSomeDefaults();
+}
+
+
+HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
+{
+	// The stack and the work buffer already belong to this object (they are allocated
+	// in the constructors) and hold nothing worth copying, so they are simply kept.
+	// They must not be allocated again here: overwriting the two pointers would
+	// leak the arrays owned so far, once per assignment.
+	// NOTE(review): the options of 'f' are not copied either - confirm this is intended.
+	(void)f;
+	return *this;
+}
+
+
+HTMLFilter::~HTMLFilter()
+{
+	delete [] pstack;	// both arrays were allocated with new[]
+	delete [] buffer;
+}
+
+
+
+
+void HTMLFilter::BreakWord(size_t break_after_)
+{
+	// the value is capped at 10000; PutNormalNonWhite treats 0 as "do not break words"
+	const size_t upper_bound = 10000;
+
+	break_after = (break_after_ > upper_bound) ? upper_bound : break_after_;
+}
+
+
+void HTMLFilter::WrapLine(size_t wrap_line_)
+{
+	// the value is capped at 10000; CheckLineWrap treats 0 as "do not wrap lines"
+	const size_t upper_bound = 10000;
+
+	wrap_line = (wrap_line_ > upper_bound) ? upper_bound : wrap_line_;
+}
+
+
+
+void HTMLFilter::TrimWhite(bool trim)
+{
+	this->trim_white = trim;	// read by Read(), ReadLoop(), PutNormalWhite() and ReadNormalTextSkipWhite()
+}
+
+
+void HTMLFilter::InsertTabs(size_t tabsize)
+{
+	// one indentation level is 'tab_size' spaces wide (see PutTabs), at most 1000
+	const size_t upper_bound = 1000;
+
+	tab_size = (tabsize > upper_bound) ? upper_bound : tabsize;
+}
+
+
+void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
+{
+	// max_len becomes the length of the longest entry of orphans.tab (0 for an empty table)
+	size_t longest = 0;
+
+	for(size_t idx = 0 ; idx < orphans.tab.size() ; ++idx)
+	{
+		const size_t current = orphans.tab[idx].size();
+		longest = (current > longest) ? current : longest;
+	}
+	orphans.max_len = longest;
+}
+
+
+void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
+{
+	lang_code_lower = lang_code;
+	ToLower(lang_code_lower);
+	orphans_temp.tab = otab;
+	// CheckOrphan() binary-searches the table ignoring case, so the entries are lowered before sorting
+	for(size_t i=0 ; i<orphans_temp.tab.size() ; ++i) ToLower(orphans_temp.tab[i]);
+	std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
+	CalcOrphansMaxLen(orphans_temp);
+	orphans_tab[lang_code_lower] = orphans_temp;	// replaces a table assigned earlier for this (lowered) language code
+}
+
+
+
+void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
+{
+	AssignOrphans(lang_code.c_str(), otab);	// forwards to the const wchar_t * overload
+}
+
+// removes the orphan tables of all languages
+void HTMLFilter::ClearOrphans()
+{
+	orphans_tab.clear();
+}
+
+
+
+
+void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
+{
+	// L"160" selects orphan_160space, any other string selects orphan_nbsp
+	// (the mode is consulted in PutNonBreakingSpace)
+	const bool use_raw_char = (orphan_mode_str == L"160");
+	orphan_mode = use_raw_char ? orphan_160space : orphan_nbsp;
+}
+
+
+void HTMLFilter::SafeMode(bool safe_mode_)
+{
+	safe_mode = safe_mode_;	// consulted by IsTagSafe(), CheckExceptions() and PrintRest()
+}
+
+// when set, tags and their attributes are not printed (see PrintOpeningItem, PrintItemAttr, PutClosingTag)
+void HTMLFilter::SkipTags(bool skip_tags)
+{
+	this->skip_tags = skip_tags;
+}
+// when set, ReadNormalText() does not print commentaries
+void HTMLFilter::SkipCommentaries(bool skip_commentaries)
+{
+	this->skip_commentaries = skip_commentaries;
+}
+
+
+void HTMLFilter::SkipEntities(bool skip_entities)
+{
+	// entities are dropped in AnalyzeEntitiesAndPut, which runs only when
+	// analyze_entities is set, so turning skipping on turns analysing on as well
+	// (turning skipping off leaves analyze_entities as it was)
+	this->skip_entities = skip_entities;
+	if( skip_entities )
+		this->analyze_entities = true;
+}
+
+
+void HTMLFilter::AnalyzeEntities(bool analyze_entities)
+{
+	this->analyze_entities = analyze_entities;	// when set, text and attribute values go through AnalyzeEntitiesAndPut()
+}
+
+// the tag named here has its content copied without filtering and is itself not printed (see CheckExceptions)
+void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
+{
+	no_filter_tag = tag_name;
+}
+
+
+
+
+HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
+{
+	// a valid index gives the stack slot itself
+	if( i < stack_len )
+		return pstack[i];
+
+	// out of range: hand out the shared 'empty' item, cleared first
+	empty.Clear();
+	return empty;
+}
+
+
+HTMLFilter::Item & HTMLFilter::LastItem()
+{
+	// the top of the stack, when there is one
+	if( stack_len != 0 )
+		return pstack[stack_len - 1];
+
+	// empty stack: hand out the shared 'empty' item, cleared first
+	empty.Clear();
+	return empty;
+}
+
+
+bool HTMLFilter::PushStack()
+{
+	// the stack has a fixed capacity, a push beyond it is refused
+	if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
+		return false;
+
+	Item & fresh = pstack[stack_len];
+	fresh.Clear();
+
+	if( stack_len != 0 )
+	{
+		// a new item inherits 'porphans' and 'has_body_tag' from the item below it
+		const Item & parent = pstack[stack_len - 1];
+		fresh.porphans      = parent.porphans;
+		fresh.has_body_tag  = parent.has_body_tag;
+	}
+	++stack_len;
+	return true;
+}
+
+void HTMLFilter::PopStack()
+{
+	// popping an empty stack is silently ignored
+	if( stack_len > 0 )
+	{
+		--stack_len;
+		pstack[stack_len].Clear();	// the freed slot is left in a clean state
+	}
+}
+
+
+bool HTMLFilter::IsWhite(int c)
+{
+	// White characters are: space, tab, carriage return (13) and the
+	// character 160.
+	// The line feed (10) must not be added here;
+	// callers test for it separately.
+
+	return c == ' ' || c == '\t' || c == 13 || c == 160;
+}
+
+
+void HTMLFilter::SkipWhite()
+{
+	for( ; IsWhite(*pchar) ; ++pchar )
+		;	// stops at a line feed and at the terminating zero
+}
+
+
+void HTMLFilter::SkipWhiteLines()
+{
+	for( ; IsWhite(*pchar) || *pchar == 10 ; ++pchar )
+		;	// like SkipWhite() but line feeds are skipped too
+}
+
+
+void HTMLFilter::SkipWhiteWithFirstNewLine()
+{
+	// skips white characters, at most one line feed,
+	// and the white characters following that line feed
+	SkipWhite();
+	if( *pchar != 10 )
+		return;
+	++pchar;
+	SkipWhite();
+}
+
+
+void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
+{
+	for( ; str < end && (IsWhite(*str) || *str == 10) ; ++str )
+		;	// moves the caller's pointer, never past 'end'
+}
+
+
+void HTMLFilter::CheckNewLine()
+{
+	// looks ahead without consuming input: last_new_line is set when only
+	// white characters separate the current position from a line feed
+	const wchar_t * lookahead = pchar;
+	while( IsWhite(*lookahead) )
+		++lookahead;
+	last_new_line = (*lookahead == 10);
+}
+
+
+
+
+bool HTMLFilter::IsClosingTagForLastItem()
+{
+	// Called with pchar at an opening mark. Checks for "</name>" where name is the name
+	// of the last item on the stack (white characters are allowed between the parts).
+	// On success pchar is left just after the closing mark; on failure pchar
+	// stays wherever the matching stopped (it is not restored).
+	pchar += 1;
+	SkipWhite();
+
+	if( *pchar != '/' )
+		return false;
+
+	pchar += 1;
+	SkipWhite();
+	const std::wstring & wanted = LastItem().name;
+	if( !IsNameEqual(pchar, wanted, wanted.size()) )
+		return false;
+
+	pchar += wanted.size();
+	SkipWhite();
+	if( !IsClosingTagMark(*pchar) )
+		return false;
+	pchar += 1;
+	return true;
+}
+
+
+
+
+// used for such tags as: script, pre, textarea: the raw source is copied up to the closing tag of
+// the last item (the tag is printed only if put_closing_tag_as_well is set) or to the end of input
+void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
+{
+const wchar_t * start = pchar;
+const wchar_t * end = pchar;
+
+	while( *pchar != 0 )
+	{
+		if( IsOpeningTagMark(*pchar) )
+		{
+			const wchar_t * mark = pchar;
+			if( IsClosingTagForLastItem() )
+			{
+				end = put_closing_tag_as_well ? pchar : mark;
+				PopStack();
+				CheckNewLine();
+				break;
+			}
+			// some other '<': it is a part of the raw content, so the copied range has to
+			// include it (IsClosingTagForLastItem may have moved pchar, hence the reset)
+			pchar = mark;
+		}
+		pchar += 1;
+		end = pchar;
+	}
+
+	Put(start, end);
+}
+
+
+
+
+void HTMLFilter::SkipAndCheckClosingTag()	// skips the rest of the current tag, its closing mark included
+{
+	bool is_quoted = false;	// true while inside a quoted attribute value
+	wchar_t quote_char = 0;	// the quote character which opened that value
+	// the loop ends just after the first unquoted closing mark or at the terminating zero
+	for( ; *pchar ; ++pchar )
+	{
+		if( *pchar == '"' || *pchar == '\'' )
+		{
+			if( is_quoted )
+			{
+				if( *pchar == quote_char )
+				{
+					is_quoted = false;
+				}
+			}
+			else
+			{
+				is_quoted = true;
+				quote_char = *pchar;
+			}
+		}
+		else
+		if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/'
+		{
+			LastItem().type = Item::simple;	// any unquoted '/' met here turns an opening item into a simple one
+		}
+		else
+		if( !is_quoted && IsClosingTagMark(*pchar) )
+		{
+			++pchar;
+			break;
+		}
+	}
+}
+
+
+
+bool HTMLFilter::IsValidCharForName(int c)
+{
+	// characters allowed in a tag name:
+	// ASCII letters and digits...
+	if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') )
+		return true;
+
+	// ...and '-', '!' and ':' (the colon is the namespace character)
+	return c=='-' || c=='!' || c==':';
+}
+
+
+bool HTMLFilter::IsValidCharForAttrName(int c)
+{
+	// characters allowed in an attribute name:
+	// ASCII letters and digits...
+	if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') )
+		return true;
+
+	// ...and '-' and ':'
+	return c=='-' || c==':';
+}
+
+
+bool HTMLFilter::IsValidCharForEntityName(int c)
+{
+	// characters allowed in an entity name (between '&' and ';'):
+	// ASCII letters and digits...
+	if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') )
+		return true;
+
+	// ...and '#'
+	return c=='#';
+}
+
+
+void HTMLFilter::ReadItemName()
+{
+	// the whole name is consumed from the input but only its first
+	// WINIX_HTMLFILTER_ITEM_NAME_MAXLEN characters are appended to the last item's name
+	size_t consumed = 0;
+	while( IsValidCharForName(*pchar) )
+	{
+		if( consumed++ < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
+			LastItem().name += *pchar;
+		++pchar;
+	}
+}
+
+
+
+void HTMLFilter::ReadItemAttrName()
+{
+	// attr_name receives at most WINIX_HTMLFILTER_ATTR_NAME_MAXLEN characters,
+	// the rest of an overlong name is consumed and dropped
+	size_t consumed = 0;
+	attr_name.clear();
+
+	while( *pchar && IsValidCharForAttrName(*pchar) )
+	{
+		if( consumed++ < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
+			attr_name += *pchar;
+		++pchar;
+	}
+}
+
+
+
+void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
+{
+	// every call appends one new word to the attr_value list
+	attr_value.push_back(std::wstring());
+	std::wstring & word = attr_value.back();
+
+	// with analyze_entities the word goes through AnalyzeEntitiesAndPut,
+	// otherwise it is copied as it is
+	if( !analyze_entities )
+		word.append(value_start, value_end);
+	else
+		AnalyzeEntitiesAndPut(value_start, value_end, &word);
+}
+
+
+void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
+{
+size_t i;
+	// splits the value into words separated by white characters, stored one by one in attr_value
+	attr_value.clear();
+	const wchar_t * value_start = pchar;
+	size_t value_len = 0; // how many non white characters
+	// on return pchar points at the terminator (which is not consumed) or at the end of input
+	for(i=0 ; *pchar ; ++i, ++pchar )
+	{
+		if( has_quote )
+		{
+			if( *pchar == quote_char )	// a quoted value ends at the matching quote
+				break;
+		}
+		else
+		{
+			if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) )	// an unquoted one at '>' or at the first white character
+				break;
+		}
+		// a white character ends the current word, anything else extends it
+		if( *pchar==10 || IsWhite(*pchar) )
+		{
+			if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )	// words longer than the limit are dropped
+				ReadItemAttrValueAdd(value_start, pchar);
+
+			value_len = 0;
+		}
+		else
+		{
+			if( value_len == 0 )
+				value_start = pchar;
+
+			value_len += 1;
+		}
+	}
+	// the last word (the one which was being read when the loop ended)
+	if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+		ReadItemAttrValueAdd(value_start, pchar);
+}
+
+
+void HTMLFilter::CheckChar(wchar_t c)
+{
+	if( c != 10 )
+		++line_len;	// line_len counts the characters put since the last line feed
+	else
+		line_len = 0;
+}
+
+
+void HTMLFilter::Put(wchar_t c)
+{
+	out_string->push_back(c);	// every Put() overload also keeps line_len up to date
+	CheckChar(c);
+}
+
+
+void HTMLFilter::Put(const wchar_t * str)
+{
+	// 'str' has to be zero-terminated
+	out_string->append(str);
+
+	while( *str != 0 )
+		CheckChar(*str++);
+}
+
+
+void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
+{
+	// an empty or reversed range puts nothing
+	if( str < end )
+	{
+		out_string->append(str, static_cast<size_t>(end - str));
+
+		while( str != end )
+			CheckChar(*str++);
+	}
+}
+
+
+void HTMLFilter::Put(const std::wstring & str)
+{
+	out_string->append(str);
+	// line_len is updated character by character
+	for(size_t pos = 0 ; pos != str.size() ; ++pos)
+		CheckChar(str[pos]);
+}
+
+
+// Copies [str, end) to 'out' or, when out is null, to the output through Put(). EntityFound() is
+// called for every "&name;" entity met; the entities themselves are dropped when skip_entities is set.
+void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
+{
+	const size_t epsilon = 8; // max length of an entity name; !! IMPROVE ME put as a constant
+	const wchar_t * old_str = str; // beginning of the text which has not been copied yet
+	while( str < end )
+	{
+		if( IsStartingEntityMark(*str) )
+		{
+			const wchar_t * entity_start = str;
+			str += 1; // skip &
+			// the name and the ending mark have to lie inside [str, end): 'str' never passes 'end'
+			for(size_t i=0 ; str < end && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
+			{
+			}
+
+			if( str < end && IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
+			{
+				if( out )
+					out->append(old_str, entity_start);
+				else
+					Put(old_str, entity_start);
+
+				str += 1; // skip ;
+
+				if( !skip_entities )
+				{
+					if( out )
+						out->append(entity_start, str);
+					else
+						Put(entity_start, str);
+				}
+
+				EntityFound(entity_start + 1, str - 1); // without & and ;
+				old_str = str;
+			}
+		}
+		else
+		{
+			str += 1;
+		}
+	}
+
+	if( out )
+		out->append(old_str, end);
+	else
+		Put(old_str, end);
+}
+
+
+
+
+int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
+{
+	// three-way comparison of [str, end) with orphan_str, the case of ASCII letters is ignored:
+	// 0 means equal, a negative / positive value means the text sorts before / after orphan_str
+	const wchar_t * orphan = orphan_str.c_str();
+
+	for( ; str<end && *orphan!=0  ; ++str, ++orphan )
+	{
+		// the difference is signed, so it is kept in an int (not in an unsigned type)
+		const int res = int(ToLower(*str)) - int(ToLower(*orphan));
+		if( res != 0 )
+			return res;
+	}
+
+	if( str < end )
+		return ToLower(*str);
+
+return -int(ToLower(*orphan));
+}
+
+
+
+
+// binary search in table (table should be sorted)
+bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
+{
+int res;
+	// returns true when the text [str, end) is equal to one of the table entries
+	if( table.empty() )
+		return false;
+	// o1 and o2 are the lowest and the highest index still taken into account
+	size_t o1 = 0;
+	size_t o2 = table.size() - 1;
+
+	res = CheckOrphan(str, end, table[o1]);
+
+	if( res == 0 )
+		return true;
+
+	if( res < 0 )	// the text sorts before the first entry
+		return false;
+
+	res = CheckOrphan(str, end, table[o2]);
+
+	if( res == 0 )
+		return true;
+
+	if( res > 0 )	// the text sorts after the last entry
+		return false;
+
+	// here the text lies strictly between table[o1] and table[o2]; the range is halved until the two indices are adjacent
+	while( o1 + 1 < o2 )
+	{
+		size_t o = (o1 + o2) / 2;
+		res = CheckOrphan(str, end, table[o]);
+
+		if( res == 0 )
+			return true;
+
+		if( res < 0 )
+			o2 = o;
+		else
+			o1 = o;
+	}
+
+return false;
+}
+
+
+bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
+{
+	// orphans are looked for only when the last item has has_body_tag set
+	// and has an orphan table (assigned in CheckItemAttr from a lang attribute)
+	const Item & current = LastItem();
+	if( str == end || !current.has_body_tag || !current.porphans )
+		return false;
+
+	// a word longer than the longest orphan cannot be in the table
+	const size_t word_len = end - str;
+	return word_len <= current.porphans->max_len && CheckOrphan(str, end, current.porphans->tab);
+}
+
+
+// if there is a semicolon nearby then we break the line after it
+// (useful in html entities)
+// !! TODO: also check whether the string (end-str) is short
+// and in such a case do not add any character either
+bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
+{
+size_t i, epsilon = 8;// !! IMPROVE ME put as a constant
+	// only the first 'epsilon' characters of [str, end) are inspected
+	for(i=0 ; str < end && i<epsilon ; ++i, ++str)
+		if( IsEndingEntityMark(*str) )
+			return true;
+
+return false;
+}
+
+
+void HTMLFilter::CheckLineWrap()
+{
+	// wrapping is off when wrap_line is 0 and is done only where has_body_tag is set
+	if( wrap_line == 0 || !LastItem().has_body_tag || line_len <= wrap_line )
+		return;
+	Put(10);	// the line feed resets line_len
+	PutTabs(stack_len);
+}
+
+
+void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
+{
+const wchar_t * word = str;	// beginning of the part of the word which has not been put yet
+size_t non_whites = 0;	// characters read since the beginning or since the last break
+bool was_entity_end = false;	// whether the previous character was ';'
+	// puts one word: 'str' is moved to the first white character or line feed (or to 'end')
+	for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
+	{
+		if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
+		{
+			Put(word, str);
+			word           = str;
+			non_whites     = 0;
+			Put(' ');
+			CheckLineWrap();
+		}
+		// remembered so that a break can be made right after an entity
+		was_entity_end = (IsEndingEntityMark(*str));
+	}
+	// the rest of the word (the whole word when no break was inserted)
+	if( analyze_entities )
+		AnalyzeEntitiesAndPut(word, str, nullptr);
+	else
+		Put(word, str);
+}
+
+
+void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
+{
+	// nothing left in the range: nothing is put
+	if( str >= end )
+		return;
+	if( trim_white )
+	{
+		// the whole run of white characters and line feeds becomes one space
+		// (the space is put even when the run turns out to be empty)
+		Put(' ');
+		SkipWhiteLines(str, end);
+		return;
+	}
+
+	// otherwise the run is copied as it is,
+	// with the indentation put after each line feed
+	for( ; str < end && (*str == 10 || IsWhite(*str)) ; ++str )
+	{
+		Put(*str);
+		if( *str == 10 )
+			PutTabs(stack_len);
+	}
+}
+
+
+void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
+{
+const wchar_t * word, * white;
+	// puts the text [str, end) word by word
+	if( str < end )
+		CheckLineWrap();
+
+	while( str < end )
+	{
+		word = str;
+		PutNormalNonWhite(str, end);
+		// after an orphan the following white characters are replaced with a non-breaking space
+		if( CheckOrphan(word, str) )
+		{
+			white = str;
+			SkipWhiteLines(str, end);
+
+			if( white < str )
+				PutNonBreakingSpace();
+		}
+		else
+		{
+			PutNormalWhite(str, end);
+
+			if( str < end ) // !! or maybe it should be done here like at the top? (left open by the author)
+				CheckLineWrap();
+		}
+
+		// for safety (if str was not incremented then there is an infinite loop)
+		if( word == str )
+			break;
+	}
+}
+
+
+
+
+void HTMLFilter::PutOpeningTagMark()
+{
+	Put('<');	// the character which begins a tag in the output
+}
+
+
+void HTMLFilter::PutClosingTagMark()
+{
+	Put('>');	// the character which ends a tag in the output
+}
+
+
+
+
+// !! IMPROVE ME change to a better name
+// returns false for the tags which must not be printed in safe mode; without safe mode every tag gives true
+bool HTMLFilter::IsTagSafe(const wchar_t * tag)
+{
+	if( !safe_mode )
+		return true;
+
+	if( IsNameEqual(tag, no_filter_tag.c_str()) )
+		return false;
+
+	static const wchar_t * unsafe_tags[] = {
+		L"applet", 	L"base",	L"body",
+		L"embed",	L"head",	L"html",
+		L"frame",	L"frameset",L"iframe",
+		L"link",	L"meta",	L"param",	// the comma after "param" is essential: without it the compiler
+		L"object",	L"script"				// joins "param" and "object" into one literal and neither is matched
+	};
+
+	size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
+	size_t i;
+
+	for(i=0 ; i<len ; ++i)
+	{
+		if( IsNameEqual(tag, unsafe_tags[i]) )
+			return false;
+	}
+
+return true;
+}
+
+
+bool HTMLFilter::IsTagSafe(const std::wstring & tag)
+{
+	return IsTagSafe(tag.c_str());	// forwards to the const wchar_t * overload
+}
+
+
+
+
+
+bool HTMLFilter::PutOpeningTag()
+{
+	const std::wstring & tag_name = LastItem().name;
+	if( IsTagSafe(tag_name) )
+	{
+		// prints the opening mark and the name only; nothing else of the tag is put here
+		PutOpeningTagMark();
+		Put(tag_name);
+		return true;
+	}
+	SkipAndCheckClosingTag();	// a rejected tag is skipped up to its closing mark
+	return false;
+}
+
+
+
+void HTMLFilter::PutClosingTag(const wchar_t * tag)
+{
+	if( !skip_tags && IsTagSafe(tag) )	// nothing is printed when tags are skipped or the tag is rejected
+	{
+		PutOpeningTagMark();
+		Put('/');
+		Put(tag);
+		PutClosingTagMark();
+	}
+}
+
+
+
+void HTMLFilter::PutTabs(size_t len)
+{
+	// at most 30 levels are indented, each level is tab_size spaces wide;
+	// the spaces bypass Put() so they are not added to 'line_len'
+	const size_t levels = (len > 30) ? 30 : len;
+
+	out_string->append(levels * tab_size, L' ');
+}
+
+
+void HTMLFilter::PutNonBreakingSpace()
+{
+	// written either as the "&nbsp;" entity or as the character 160,
+	// depending on orphan_mode (set with OrphansMode)
+	if( orphan_mode != orphan_nbsp )
+	{
+		Put(160);
+		return;
+	}
+	Put(L"&nbsp;");
+}
+
+
+
+
+void HTMLFilter::PutNewLine()
+{
+	buffer[0] = 10;	// the line feed is put through the work buffer
+	Put(buffer, buffer+1);
+	line_len = 0;	// a new output line starts here
+}
+
+
+// true for the character which begins a tag; the size of the opening mark is assumed to be one
+bool HTMLFilter::IsOpeningTagMark(wchar_t c)
+{
+	return (c == '<');
+}
+
+
+// true for the character which ends a tag; the size of the closing mark is assumed to be one
+bool HTMLFilter::IsClosingTagMark(wchar_t c)
+{
+	return (c == '>');
+}
+
+
+// the slash at the end <img src=".." /> (without '>' character)
+// the size of the mark is assumed to be one
+bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
+{
+	return (c == '/');
+}
+
+
+bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+{
+	// true when 'str' begins with "<!--"; it is the argument which is tested, not the current position 'pchar'
+	static const wchar_t comm_open[] = L"<!--";
+	const size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
+	return IsNameEqual(str, comm_open, comm_open_len);
+}
+
+
+size_t HTMLFilter::OpeningCommentaryTagMarkSize()
+{
+	return 4; // size of "<!--"
+}
+
+// true for the character which begins an entity
+bool HTMLFilter::IsStartingEntityMark(wchar_t c)
+{
+	return (c == '&');
+}
+
+// true for the character which ends an entity
+bool HTMLFilter::IsEndingEntityMark(wchar_t c)
+{
+	return (c == ';');
+}
+
+
+
+// If there is a commentary at pchar, the whole of it is skipped (up to and including
+// "-->", or to the end of input when it is not closed) and true is returned
+bool HTMLFilter::SkipCommentaryTagIfExists()
+{
+	static const wchar_t closing[] = L"-->";
+	const size_t closing_len = sizeof(closing) / sizeof(wchar_t) - 1;
+
+	if( !IsOpeningCommentaryTagMark(pchar) )
+		return false;
+
+	// step over the opening mark, then search for the closing sequence
+	for( pchar += OpeningCommentaryTagMarkSize() ; *pchar != 0 ; ++pchar )
+	{
+		if( IsNameEqual(pchar, closing, closing_len) )
+		{
+			pchar += closing_len;
+			break;
+		}
+	}
+	CheckNewLine();	// refreshes last_new_line for what follows the commentary
+	return true;
+}
+
+
+void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
+{
+	if( trim_white )
+	{
+		// skipping all white chars (with new lines);
+		// last_non_white is set to the position of each line feed skipped
+		for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
+			if( *pchar == 10 )
+				last_non_white = pchar;
+	}
+	else
+	{
+		// skipping first white chars with only one line between them
+		SkipWhite();
+		last_non_white = pchar;
+
+		if( *pchar == 10 )
+		{
+			++pchar;
+			SkipWhite();
+		}
+	}
+	// the caller's text starts after the skipped characters
+	start = pchar;
+
+	// exception for the commentary tag: a new indented line is started before a
+	// commentary and before anything which is not an opening tag mark
+	if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
+	{
+		PutNewLine();
+		PutTabs(stack_len);
+	}
+}
+
+
+
+// reading text between html tags
+void HTMLFilter::ReadNormalText()
+{
+const wchar_t * start = pchar;
+const wchar_t * last_non_white = pchar;
+	// 'start' is the beginning of the text not printed yet
+	if( last_new_line )
+		ReadNormalTextSkipWhite(start, last_non_white);
+
+	// the loop ends at the first opening tag mark which does not begin a commentary, or at the end of input
+	while( *pchar != 0 )
+	{
+		const wchar_t * commentary_start = pchar;
+
+		if( SkipCommentaryTagIfExists() )
+		{
+			last_non_white = pchar - 1; // pointing at the last '>' from a commentary
+			PutNormalText(start, commentary_start);
+			// the commentary itself is printed unless skip_commentaries is set
+			if( !skip_commentaries )
+			{
+				PutNormalText(commentary_start, pchar);
+			}
+
+			start = pchar;
+		}
+		else
+		{
+			if( IsOpeningTagMark(*pchar) )
+				break;
+			// a line feed is not a white character for IsWhite(), so it is remembered here too
+			if( !IsWhite(*pchar) )
+				last_non_white = pchar;
+
+			pchar += 1;
+		}
+	}
+	// true when a line feed was the last character remembered above
+	last_new_line = (*last_non_white == 10);
+	PutNormalText(start, pchar);
+}
+
+
+
+bool HTMLFilter::PrintOpeningItem()
+{
+	// nothing is printed when tags are skipped or when the item is the
+	// no_filter_tag; true is still returned in both cases
+	const bool silent = skip_tags || IsNameEqual(no_filter_tag, LastItem().name);
+	if( silent )
+		return true;
+
+	// with last_new_line set the tag is put on a new line, indented when the stack has more than one item
+	if( last_new_line )
+		PutNewLine();
+	if( last_new_line && stack_len > 1 )
+		PutTabs(stack_len - 1);
+	return PutOpeningTag();
+}
+
+
+
+
+
+bool HTMLFilter::ReadItemAttr()
+{
+	attr_has_value = false;
+	attr_name.clear();
+	attr_value.clear();
+	// reads one attribute into attr_name, attr_has_value and attr_value
+	SkipWhiteLines();
+	ReadItemAttrName();
+	// false means that there is no attribute name at the current position
+	if( attr_name.empty() )
+		return false;
+
+	SkipWhiteLines();
+	// an attribute without '=' has no value
+	if( *pchar != '=' )
+		return true;
+	
+	attr_has_value = true;
+	pchar += 1;				// skipping '='
+	SkipWhiteLines();
+
+	bool has_quote = (*pchar == '\"' || *pchar == '\'');
+	wchar_t quote_char = *pchar;
+
+	if( has_quote )
+		pchar += 1;			// skipping the first quote mark
+
+	ReadItemAttrValue(has_quote, quote_char);
+
+	if( has_quote && *pchar == quote_char )
+		pchar += 1;			// skipping the last quote mark
+
+return true;
+}
+
+
+
+bool HTMLFilter::CheckItemAttr()
+{
+	// only a "lang" attribute with a value is of interest here: it selects the
+	// orphan table of the last item; the result is always true
+	if( !attr_has_value || !IsNameEqual(L"lang", attr_name) )
+		return true;
+
+	Item & current = LastItem();
+	current.porphans = 0;
+
+	if( attr_value.empty() )
+		return true;
+
+	// only the first word of the value is used, in lower case
+	attr_value_lower = attr_value[0];
+	ToLower(attr_value_lower);
+	OrphansTab::iterator found = orphans_tab.find(attr_value_lower);
+	if( found != orphans_tab.end() )
+		current.porphans = &found->second;
+	return true;
+}
+
+
+void HTMLFilter::PrintItemAttr()
+{
+	// attributes are not printed when tags are skipped nor for the no_filter_tag
+	if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
+		return;
+	Put(' ');
+	Put(attr_name);
+	if( !attr_has_value )
+		return;
+
+	// The value is always printed in double quotes, its words joined with single spaces. A double quote
+	// inside the value (possible with single quotes in the source) is written as &quot; so it cannot end the value.
+	Put(L"=\"");
+	for(size_t w = 0 ; w < attr_value.size() ; ++w)
+	{
+		if( w != 0 )
+			Put(' ');
+		for(size_t c = 0 ; c < attr_value[w].size() ; ++c)
+			if( attr_value[w][c] == '\"' )
+				Put(L"&quot;");
+			else
+				Put(attr_value[w][c]);
+	}
+	Put('\"');
+}
+
+
+void HTMLFilter::ReadItemClosing()
+{
+	pchar += 1; // skipping '/'
+	SkipWhiteLines();
+	ReadItemName();	// the name goes to the last item on the stack
+	LastItem().type = Item::closing;
+	SkipAndCheckClosingTag();	// the type is no longer 'opening', so a '/' met here does not change it
+
+	// closing tags are printed later
+}
+
+
+void HTMLFilter::ReadItemSpecial()
+{
+	LastItem().type = Item::special;
+	// called with pchar at '!'; unless tags are skipped the whole tag is printed as it is in the source
+	if( !skip_tags )
+		PutOpeningTagMark();
+
+	const wchar_t * start = pchar;
+	pchar += 1; // skipping '!'
+
+	ReadItemName();
+	SkipAndCheckClosingTag();
+
+	if( !skip_tags && pchar > start )
+		Put(start, pchar);
+
+	// closing tag mark is printed directly from the source
+}
+
+
+void HTMLFilter::ReadItemOpening()
+{
+	LastItem().type = Item::opening;
+	ReadItemName();
+	// PrintOpeningItem() returns false when the tag was rejected (it has been skipped already then)
+	if( PrintOpeningItem() )
+	{
+		while( ReadItemAttr() )
+		{
+			if( CheckItemAttr() )
+				PrintItemAttr();
+		}
+
+		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
+
+		if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
+		{
+			if( LastItem().type == Item::simple )
+				Put(L" /");
+
+			PutClosingTagMark();
+		}
+	}
+}
+
+
+void HTMLFilter::ItemFound()	// called by ReadItem() after each item has been read; does nothing here
+{
+}
+// called by AnalyzeEntitiesAndPut() with the name of each entity found (without '&' and ';'); does nothing here
+void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
+{
+}
+
+
+bool HTMLFilter::ReadItem()
+{
+	if( *pchar == 0 )
+		return false;
+	// false is returned at the end of input and when the stack is full
+	if( !PushStack() )
+		return false;
+
+	pchar += 1;	// skipping the first '<'
+	SkipWhiteLines();
+
+	if( *pchar == '!' )
+		ReadItemSpecial();
+	else
+	if( *pchar == '/' ) // we have a closing tag (add it as a virtual method) !!
+		ReadItemClosing();
+	else
+		ReadItemOpening();
+
+	CheckNewLine();
+	LastItem().new_line = last_new_line;	// whether a line feed follows the tag
+
+	ItemFound();
+
+return true;
+}
+
+
+
+wchar_t HTMLFilter::ToLower(wchar_t c)
+{
+	// only the ASCII letters A-Z are converted, anything else is returned as it is
+	const bool is_ascii_upper = (c >= L'A' && c <= L'Z');
+
+	return is_ascii_upper ? static_cast<wchar_t>(c + (L'a' - L'A')) : c;
+}
+
+
+void HTMLFilter::ToLower(std::wstring & str)
+{
+	// the string is changed in place, character by character,
+	// with the single character overload
+	for(std::wstring::iterator it = str.begin() ; it != str.end() ; ++it)
+		*it = ToLower(*it);
+}
+
+
+bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
+{
+	// comparison of two zero-terminated strings, the case of ASCII letters is ignored
+	while( *name1 != 0 && *name2 != 0 )
+	{
+		if( ToLower(*name1++) != ToLower(*name2++) )
+			return false;
+	}
+	// equal only when both strings ended together
+	return *name1 == 0 && *name2 == 0;
+}
+
+
+bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
+{
+	return IsNameEqual(name1, name2.c_str());	// forwards to the overload taking two pointers
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
+{
+	return IsNameEqual(name1.c_str(), name2);	// forwards to the overload taking two pointers
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
+{
+	return IsNameEqual(name1.c_str(), name2.c_str());	// forwards to the overload taking two pointers
+}
+
+
+
+// the first len characters of both strings must be equal (the case of ASCII letters is ignored)
+// IMPROVE ME change name to something like IsBeginningNameEqual
+// and move to text.h (pikotools)
+bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
+{
+	while( *name1 != 0 && *name2 != 0 && len > 0 )
+	{
+		if( ToLower(*name1++) != ToLower(*name2++) )
+			return false;
+		--len;
+	}
+	// false when either string ended before len characters were compared
+	return len == 0;
+}
+
+
+
+bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
+{
+	return IsNameEqual(name1, name2.c_str(), len);	// forwards to the overload taking two pointers and len
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
+{
+	return IsNameEqual(name1.c_str(), name2, len);	// forwards to the overload taking two pointers and len
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
+{
+	return IsNameEqual(name1.c_str(), name2.c_str(), len);	// forwards to the overload taking two pointers and len
+}
+
+
+
+
+
+bool HTMLFilter::IsLastTag(const wchar_t * name)
+{
+	return IsNameEqual(name, LastItem().name);	// compares with the name of the last item on the stack
+}
+
+
+bool HTMLFilter::IsLastTag(const std::wstring & name)
+{
+	return IsNameEqual(name, LastItem().name);	// compares with the name of the last item on the stack
+}
+
+
+// checking exceptions for opening tags
+void HTMLFilter::CheckExceptions()
+{
+	if( IsLastTag(L"meta")	||
+		IsLastTag(L"input")	||
+		IsLastTag(L"br")	||
+		IsLastTag(L"hr")	||
+		IsLastTag(L"img")	||
+		IsLastTag(L"link")	||
+		IsLastTag(L"param")	||
+		IsLastTag(L"col")	||
+		IsLastTag(L"area")   )
+	{
+		LastItem().type = Item::simple;
+		PopStack();
+		return;
+	}
+
+	// in safe_mode the script tag is ignored
+	if( !safe_mode && IsLastTag(L"script") )
+		PutEverythingUntilClosingTag(!skip_tags);
+
+	if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
+		PutEverythingUntilClosingTag(!skip_tags);
+
+	if( IsLastTag(no_filter_tag) )
+		PutEverythingUntilClosingTag(false);
+
+	if( IsLastTag(L"body") )
+		LastItem().has_body_tag = true;
+}
+
+
+
+
+void HTMLFilter::AddForgottenTags()
+{
+	// Used when the closing tag on the top of the stack does not match the item just below it.
+	// If it matches an item deeper in the stack, all the items above the closing tag's match
+	// are closed (we have forgotten to close some tags); otherwise the closing tag is dropped.
+	int i;
+
+	// looking whether there is a matching opening tag
+	// (with less than three items on the stack there is nothing to look at: i starts below
+	// zero and the unmatched closing tag is popped below, it must not stay on the stack)
+	for(i=int(stack_len)-3 ; i>=0 ; --i)
+		if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
+			break;
+
+	if( i < 0 )
+	{
+		// oops, there is no such a tag
+		// we don't print the closing and the missing opening tag
+		PopStack();
+		return;
+	}
+
+	for(int z=(int)stack_len-2 ; z>=i ; --z)
+	{
+		if( !skip_tags && pstack[z].new_line )
+		{
+			PutNewLine();
+			PutTabs(z);
+		}
+
+		PutClosingTag(pstack[z].name.c_str());
+		pstack[z].Clear();
+	}
+
+	last_new_line = pstack[stack_len-1].new_line;
+
+	// invalidate tags
+	stack_len = i;
+}
+
+
+void HTMLFilter::CheckStackPrintRest()
+{
+	while( stack_len > 0 )	// closing tags are printed for all items left on the stack, the last one first
+	{
+		stack_len -= 1;	// decremented here and not in the condition, so that it ends at 0 instead of wrapping around
+		if( stack_len==0 || pstack[stack_len-1].new_line )
+			PutNewLine();
+		PutTabs(stack_len);
+		PutClosingTag(pstack[stack_len].name.c_str());
+	}
+}
+
+
+void HTMLFilter::CheckClosingTags()
+{
+	if( stack_len == 0 )
+		return;
+
+	// on the stack we have only opening tags
+	// but only the last tag is a closing tag
+
+	if( stack_len == 1 )
+	{
+		// there is only last closing tag
+		// we dont print it
+		PopStack();
+		return;
+	}
+
+	// there are more than one tag 
+	if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
+	{
+		// last closing tag is from the previous one
+		if( !skip_tags && pstack[stack_len-2].new_line )
+		{
+			PutNewLine();
+			PutTabs(stack_len-2);
+		}
+
+		PutClosingTag(pstack[stack_len-1].name.c_str());
+		last_new_line = pstack[stack_len-1].new_line;
+		PopStack();	// the closing tag...
+		PopStack();	// ...and the opening tag it matches
+	}
+	else
+	{
+		AddForgottenTags();	// the closing tag does not match the item just below it
+	}
+}
+
+
+bool HTMLFilter::PrintRest()
+{
+	// the rest of the html code is printed neither in safe mode
+	// nor when tags are skipped
+	if( safe_mode || skip_tags )
+		return false;
+
+	const wchar_t * const rest_begin = pchar;
+	// move to the terminating zero
+	for( ; *pchar != 0 ; ++pchar )
+	{
+	}
+
+	// true only when something was actually printed
+	const bool anything_left = (pchar != rest_begin);
+	if( anything_left )
+		Put(rest_begin, pchar);
+	return anything_left;
+}
+
+
+
+void HTMLFilter::ReadLoop()
+{
+	while( ReadItem() )	// one item (tag) per iteration, followed by the text up to the next tag
+	{
+		if( LastItem().type == Item::opening )
+		{
+			CheckExceptions();
+		}
+		else
+		if( LastItem().type == Item::special || LastItem().type == Item::simple )
+		{
+			if( stack_len > 1 )
+			{
+				//pstack[stack_len-2].new_line = LastItem().new_line;
+			}
+			else
+			if( trim_white )
+			{
+				// one new line after a simple or special tag
+				// (if the tag has level 0 in the tree - it not means that this is a first tag)
+				// for example can be DOCTYPE 
+				PutNewLine(); 
+			}
+
+			PopStack();	// simple and special tags do not stay on the stack
+		}
+		else
+		if( LastItem().type == Item::closing )
+		{
+			CheckClosingTags();
+		}
+		else
+		{
+			PopStack();	// any other type: the item is just removed
+		}
+
+		ReadNormalText();
+	}
+}
+
+
+
+void HTMLFilter::Read()
+{
+	if( trim_white )
+		SkipWhiteLines();
+
+	// it can be some text or white lines before the first html tag (we print it)
+	ReadNormalText();
+
+	// reading the whole html source
+	ReadLoop();
+
+	// sometimes there can remain some html source (when there is no space on the stack)
+	// we print the rest html without filtering (only if safe_mode is false);
+	// when nothing was printed, closing tags are put for the items left on the stack
+	if( !PrintRest() )
+		CheckStackPrintRest();
+}
+
+
+
+
+
+}
+
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
new file mode 100644
index 0000000..35710d3
--- /dev/null
+++ b/src/html/htmlfilter.h
@@ -0,0 +1,376 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/* 
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_html_htmlfilter
+#define headerfile_picotools_html_htmlfilter
+
+#include <string>
+#include <map>
+#include <vector>
+#include <algorithm>
+
+
+
+namespace pt
+{
+
+
+
+
+// max length of a name of a html tag (with terminating null)
+#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN	30
+
+// max length of a html lang attribute (e.g. "en", "pl")
+#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN	10
+
+
+#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN	40
+
+
+#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN	500
+
+
+// depth of the html tree
+#define WINIX_HTMLFILTER_STACK_MAXLEN		100
+
+// length of a buffer used for printing
+// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
+#define WINIX_HTMLFILTER_BUFFER_MAXLEN	2048
+
+
+
+
+/*!
+	very lightweight filter for html
+	(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
+	this filter has O(n) complexity over the whole html string
+
+	such tags as: <script> <pre> <textarea> are treated in a special way
+	all characters between the opening and closing tag (<script>....</script>) are untouched
+
+	if the filter finds that there are not closed tags it will close them,
+	if the filter finds a closing tag which doesn't have an opening tag - it will skip it
+
+	tags which don't need to be closed: meta, input, br, img, link
+	look at CheckExceptions() method
+
+	the filter recognizes xml simple tags (with / at the end) such as: <br />
+*/
+class HTMLFilter
+{
+public:
+
+	enum OrphanMode
+	{
+		orphan_nbsp,		// putting "&nbsp;" string
+		orphan_160space		// putting a character with code 160
+	};
+
+	HTMLFilter();
+	HTMLFilter(const HTMLFilter & f);
+	HTMLFilter & operator=(const HTMLFilter & f);
+	virtual ~HTMLFilter();
+
+
+	// main methods used for filtering
+	void Filter(const wchar_t * in, std::wstring & out);
+	void Filter(const std::wstring & in, std::wstring & out);
+
+
+	// insert a white space into long words
+	// (only between html tags)
+	// skipped in such tags: script, pre, textarea
+	// break_after_ - after how many characters insert a space (0 - off)
+	void BreakWord(size_t break_after_);
+
+	// insert a new line character into long lines
+	// (only between html tags)
+	// skipped in such tags: script, pre, textarea
+	// wrap_line_ - after how many characters wrap a line (0 - off)
+	// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
+	void WrapLine(size_t wrap_line_);
+
+	// trimming white characters (with new lines)
+	// at the beginning, at the end and in the middle of a string
+	// only between html tags
+	// at the beginning and at the end only one space is left
+	// skipped in such tags: script, pre, textarea
+	// false by default
+	void TrimWhite(bool trim);
+
+	// first tabs in a tree
+	// default: 2 (spaces)
+	// set 0 to turn off
+	void InsertTabs(size_t tabsize);
+
+	// set a name of a html tag which will be used as 'nofilter' tag
+	// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
+	// these tags (opening and closing) will not be placed in the html output
+	void SetNoFilterTag(const std::wstring & tag_name);
+
+	// orphans are checked only in 'body' tag
+	void AssignOrphans(const wchar_t * lang_code,      const std::vector<std::wstring> & otab);
+	void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
+	void ClearOrphans();
+
+	// check 'orphans' for the specific language
+	// if an orphan is detected then the non-break space ("&nbsp;" or a character with code 160) will be put
+	// default disable (lang_none) - NOTE(review): lang_none is not declared in this header, confirm
+	void OrphansMode(const std::wstring & orphan_mode);
+
+	// skipping some unsafe tags
+	// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
+	void SafeMode(bool safe_mode_);
+
+	// skip all html tags
+	// gives only text without markup
+	// but there can be commentaries
+	void SkipTags(bool skip_tags);
+
+	// skip commentaries
+	void SkipCommentaries(bool skip_commentaries);
+
+	// if true then entities such as &nbsp; are skipped
+	// this automatically turns on AnalyzeEntities
+	// in such a case FoundEntity callbacks are sent
+	void SkipEntities(bool skip_entities);
+
+	// analyze html entities such as &nbsp;
+	// virtual method: FoundEntity is called
+	// entities are analyzed in normal text and in attribute values such as <p class="a&nbsp;">
+	void AnalyzeEntities(bool analyze_entities);
+
+
+protected:
+
+	// orphans for one language
+	struct Orphans
+	{
+		std::vector<std::wstring> tab;
+		size_t max_len;
+	};
+
+
+	// orphans for all languages
+	// map<language_code, Orphans>
+	typedef std::map<std::wstring, Orphans> OrphansTab;
+	OrphansTab orphans_tab;
+
+	// html <nofilter> tag name
+	std::wstring no_filter_tag;
+
+
+	struct Item
+	{
+		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
+
+		enum Type
+		{
+			opening,		/* sample:  <h1>		*/
+			closing,		/* sample:  </h1>		*/
+			simple,			/* sample:  <br/>		*/
+			special,		/* sample:  <!doctype>	*/
+			none
+		} type;
+
+		// is there a new line after this tag
+		bool new_line;
+
+		// current orphans table
+		// (will be propagated)
+		Orphans * porphans;
+
+		// this item or one from its parents is a 'body' html tag
+		// (will be propagated)
+		bool has_body_tag;
+
+		void Clear();
+		Item();
+	};
+
+
+
+
+
+
+	/*
+		virtual methods
+	*/
+	virtual void Init();
+	virtual void Uninit();
+
+	virtual bool IsOpeningTagMark(wchar_t c);
+	virtual bool IsClosingTagMark(wchar_t c);
+	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
+	virtual bool IsStartingEntityMark(wchar_t c);
+	virtual bool IsEndingEntityMark(wchar_t c);
+
+	virtual bool   IsOpeningCommentaryTagMark(const wchar_t * str);
+	virtual size_t OpeningCommentaryTagMarkSize();
+
+	virtual bool IsValidCharForName(int c);
+	virtual bool IsValidCharForAttrName(int c);
+	virtual bool IsValidCharForEntityName(int c);
+	virtual void CheckExceptions();
+	virtual bool SkipCommentaryTagIfExists();
+
+	virtual void Put(wchar_t c);
+	virtual void Put(const wchar_t * str);
+	virtual void Put(const wchar_t * str, const wchar_t * end);
+	virtual void Put(const std::wstring & str);
+	virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
+
+	virtual void PutOpeningTagMark();
+	virtual void PutClosingTagMark();
+	virtual bool PutOpeningTag();
+	virtual void PutClosingTag(const wchar_t * tag);
+
+	virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
+	virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
+
+	virtual void ItemFound();
+	virtual void EntityFound(const wchar_t * str, const wchar_t * end);
+
+	/*
+		other (non virtual) methods
+	*/
+	void SetSomeDefaults();
+
+	Item & GetItem(size_t i);
+	Item & LastItem();
+
+	wchar_t ToLower(wchar_t c);
+	void    ToLower(std::wstring & str);
+
+	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2);
+	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2);
+	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
+	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);
+
+	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2,      size_t len);
+	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2, size_t len);
+	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2,      size_t len);
+	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);
+
+	bool IsLastTag(const wchar_t * name);
+	bool IsLastTag(const std::wstring & name);
+	bool IsTagSafe(const wchar_t * tag);
+	bool IsTagSafe(const std::wstring & tag);
+
+	int  CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
+	bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
+	bool CheckOrphan(const wchar_t * str, const wchar_t * end);
+
+	bool IsWhite(int c);
+	void SkipWhite();
+	void SkipWhiteLines();
+	void SkipWhiteWithFirstNewLine();
+	void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
+	bool IsClosingTagForLastItem();
+	void SkipAndCheckClosingTag();
+
+	void PopStack();
+	bool PushStack();
+	void CheckNewLine();
+	void CheckStackPrintRest();
+	void AddForgottenTags();
+	void CheckClosingTags();
+	void ReadNormalText();
+	bool PrintRest();
+	bool PrintOpeningItem();
+	void ReadItemName();
+	void ReadItemAttrName();
+	void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end);
+	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
+
+	bool ReadItemAttr();
+	bool CheckItemAttr();
+	void PrintItemAttr();
+
+	void ReadItemClosing();
+	void ReadItemSpecial();
+	void ReadItemOpening();
+	bool ReadItem();
+	void ReadLoop();
+	void Read();
+
+	void CheckChar(wchar_t c);
+
+	void CheckLineWrap();
+	bool HasEntityEndAround(const wchar_t * str, const wchar_t * end);
+	void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
+	void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
+	void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
+	void PutTabs(size_t len);
+	void PutNonBreakingSpace();
+	void PutNewLine();
+	void CalcOrphansMaxLen(Orphans & orphans);
+
+	const wchar_t * pchar;
+	Item empty;
+	Item * pstack;			// stack pointer
+	size_t stack_len;		// length of the stack
+	wchar_t * buffer;		// buffer used when printing
+	std::wstring * out_string;
+	bool last_new_line;
+	size_t break_after;		// insert a space into long words after 'break_after' characters
+	size_t wrap_line;		// insert a new line character into long lines
+	bool trim_white;		// trimming white characters
+	size_t tab_size;
+	OrphanMode orphan_mode;
+	std::wstring attr_name;
+	std::vector<std::wstring> attr_value;
+	std::wstring attr_value_temp;
+	std::wstring attr_value_lower;
+	bool attr_has_value;
+	std::wstring lang_code_lower;
+	size_t line_len;		// length of the current line (without first spaces which create the html tree)
+	bool safe_mode;			// skipping some unsafe tags
+	Orphans orphans_temp;
+	bool skip_tags;
+	bool skip_commentaries;
+	bool skip_entities;
+	bool analyze_entities;
+};
+
+
+
+}
+
+
+
+#endif
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index 85bda87..60ed660 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -1,6 +1,5 @@
 # DO NOT DELETE
 
-./main.o: convert.h mainoptionsparser.h csvparser.h
 ./convert.o: convert.h test.h ../src/convert/convert.h
 ./convert.o: ../src/convert/inttostr.h ../src/convert/patternreplacer.h
 ./convert.o: ../src/textstream/textstream.h ../src/textstream/stream.h
@@ -11,6 +10,11 @@
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./convert.o: ../src/convert/misc.h ../src/convert/double.h
+./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
+./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
+./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
+./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
+./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
@@ -26,7 +30,3 @@
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
-./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
-./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
-./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
-./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h

From 2a3f43c5c302eb60a18cd07783937caae25f0ce4 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 17 Jul 2021 13:54:03 +0200
Subject: [PATCH 02/37] added BBCODEParser (html/bbcodeparser.h|cpp) - copied
 from winix project

---
 src/Makefile.dep          |   1 +
 src/html/bbcodeparser.cpp | 639 ++++++++++++++++++++++++++++++++++++++
 src/html/bbcodeparser.h   | 128 ++++++++
 3 files changed, 768 insertions(+)
 create mode 100644 src/html/bbcodeparser.cpp
 create mode 100644 src/html/bbcodeparser.h

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 2a8cf37..16e85d6 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -43,3 +43,4 @@
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
 ./html/htmlfilter.o: ./html/htmlfilter.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlfilter.h
diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp
new file mode 100644
index 0000000..0a60273
--- /dev/null
+++ b/src/html/bbcodeparser.cpp
@@ -0,0 +1,639 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/* 
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bbcodeparser.h"
+
+
+namespace pt
+{
+
+
+
+
+bool BBCODEParser::Equal(const wchar_t * str1, const wchar_t * str2)
+{
+	// exact (case sensitive) comparison of two null terminated strings
+	for( ; *str1 != 0 ; ++str1, ++str2 )
+	{
+		if( *str1 != *str2 )
+			return false;
+	}
+	return *str2 == 0;   // str1 is exhausted, str2 has to be too
+}
+
+
+
+
+bool BBCODEParser::IsValidCharForName(int c)
+{
+	// a bbcode tag name consists of ascii letters, '*' and '_'
+	const bool is_lower = (c >= 'a' && c <= 'z');
+	const bool is_upper = (c >= 'A' && c <= 'Z');
+	const bool is_extra = (c == '*' || c == '_');
+
+return is_lower || is_upper || is_extra;
+}
+
+
+bool BBCODEParser::IsOpeningTagMark(wchar_t c)
+{
+	return c == L'[';	// a bbcode tag is opened with '['
+}
+
+
+bool BBCODEParser::IsClosingTagMark(wchar_t c)
+{
+	return c == L']';	// a bbcode tag is closed with ']'
+}
+
+bool BBCODEParser::IsClosingXmlSimpleTagMark(wchar_t)
+{
+	return false;	// no character marks a simple (self-closing) tag in bbcode
+}
+
+
+
+// there are no commentaries in bbcode so nothing is ever recognized as their opening mark
+bool BBCODEParser::IsOpeningCommentaryTagMark(const wchar_t *)
+{
+	return false;
+}
+
+
+size_t BBCODEParser::OpeningCommentaryTagMarkSize()
+{
+	return 0;	// there is no commentary mark in bbcode so its size is zero
+}
+
+
+
+bool BBCODEParser::SkipCommentaryTagIfExists()
+{
+	return false;	// nothing is skipped: this parser does not recognize commentaries
+}
+
+
+
+
+
+
+
+
+
+
+// Prints user text: one enter gives one <br>, two or more enters
+// give exactly two (<br><br>); no <br> is put while a list tag is open.
+void BBCODEParser::PutNormalText(const wchar_t * str, const wchar_t * end)
+{
+	const bool at_input_end = (*pchar == 0);
+
+	// the user text is right-trimmed when nothing follows it in the input
+	while( at_input_end && str < end && (IsWhite(*(end-1)) || *(end-1) == 10) )
+		end -= 1;
+
+	while( str < end )
+	{
+		if( *str != 10 )
+		{
+			// an ordinary character is printed escaped
+			PrintEscape(*str);
+			++str;
+			continue;
+		}
+
+		// a new line character: check whether it is a single or a multiple one
+		int br_len = 1;
+		++str;
+
+		// white characters (without new lines) after the first enter
+		while( str < end && IsWhite(*str) )
+			++str;
+
+		if( str < end && *str == 10 )
+		{
+			// a second enter: swallow the whole run of white characters and enters
+			br_len = 2;
+
+			while( str < end && (IsWhite(*str) || *str == 10) )
+				++str;
+		}
+
+		const bool inside_list = has_open_ol_tag || has_open_ul_tag || has_open_li_tag;
+
+		// the enters are consumed but not printed inside a list
+		if( inside_list )
+			continue;
+
+		for(int i = 0 ; i < br_len ; ++i)
+			(*out_string) += L"<br>\n";
+	}
+}
+
+
+
+void BBCODEParser::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
+{	// empty override: both 'start' and 'last_non_white' are left untouched
+}
+
+
+void BBCODEParser::CheckExceptions()
+{
+	// two opening [*] items directly one on another on the stack:
+	// </li> was put automatically for the previous one
+	// so the last [*] is simply removed from the stack
+	if( stack_len < 2 )
+		return;
+
+	const Item & last = pstack[stack_len-1];
+	const Item & prev = pstack[stack_len-2];
+
+	if( last.type == Item::opening && prev.type == Item::opening &&
+		IsNameEqual(L"*", last.name) && IsNameEqual(L"*", prev.name) )
+		PopStack();
+}
+
+
+
+
+/*
+	bbcode format:
+	[bbcodetag=value]some text[/bbcodetag]
+	the value can be quoted, e.g.
+	[bbcodetag="value"]some text[/bbcodetag], or
+	[bbcodetag='value']some text[/bbcodetag]
+
+	the third string below (in the tags table) is 'html_argument' from Tags,
+	it can contain a special character % followed by a string which means:
+	 %1  - "value" escaped as for html
+	 %2  - "some text" escaped as for html
+	 %u1 - "value" trimmed and encoded as for urls
+	 %u2 - "some text" trimmed and encoded as for urls
+	 %%  - one %
+
+	 if you are using %2 or %u2 then "some text" is not treated as bbcode, e.g.
+	 [bbcodetag=value]some [b]text[/b][/bbcodetag] will produce:
+	 <htmltag arg="value">some [b]text[/b]</htmltag> (the inner tags [b][/b] are not parsed)
+
+	 also when using %2 or %u2 the closing bbcode tag is skipped
+	 (if you want this tag then you can put it in 'html_argument')
+
+	 and when using u (%u1 or %u2) the argument is trimmed from whitespaces and new lines
+	 at the beginning and at the end
+	 (because otherwise a space would be changed to %20 and this is probably not what you really want)
+*/
+const BBCODEParser::Tags * BBCODEParser::FindTag(const wchar_t * tag)
+{
+	static const Tags known_tags[] = {
+		{L"*",     L"li",    L">",                        false},
+		{L"b",     L"em",    L">",                        true},
+		{L"i",     L"span",  L" class=\"bbitalic\">",     true},
+		{L"u",     L"span",  L" class=\"bbunderline\">",  true},
+		{L"s",     L"span",  L" class=\"bbstrike\">",     true},
+		{L"code",  L"code",  L" class=\"bbcode\">",       false},
+		{L"list",  L"ul",    L" class=\"bblist\">",       false},
+		{L"color", L"span",  L" class=\"bbcol%1\">",      true},
+		{L"url",   L"a",     L" href=\"%u1\">",           true},
+		{L"img",   L"img",   L" alt=\"%1\" src=\"%u2\">", true},
+		{L"quote", L"div",   L" class=\"bbquote\">\n<span class=\"bbquotewho\">%1</span><br>\n", false},
+	};
+
+	const Tags * const table_end = known_tags + sizeof(known_tags) / sizeof(known_tags[0]);
+
+	// linear search, the names are compared case sensitive (see Equal)
+	for(const Tags * entry = known_tags ; entry != table_end ; ++entry)
+	{
+		if( Equal(tag, entry->bbcode) )
+			return entry;
+	}
+
+return 0;
+}
+
+const BBCODEParser::Tags * BBCODEParser::FindTag(const std::wstring & tag)
+{
+	return FindTag(tag.c_str());	// forwards to the const wchar_t * overload
+}
+
+
+
+void BBCODEParser::PrintArgumentCheckQuotes(const wchar_t * & start, const wchar_t * & end)
+{
+	// Narrows [start, end) to the bare value of a tag argument: the leading
+	// '=', surrounding white characters and an optional pair of quotes
+	// are left outside the range.
+
+	while( start < end && IsWhite(*start) )
+		start += 1;
+
+	// the '=' in front of the value (if present)
+	if( start < end && *start == '=' )
+		start += 1;
+
+	while( start < end && IsWhite(*start) )
+		start += 1;
+
+	while( start < end && IsWhite(*(end-1)) )
+		end -= 1;
+
+	if( start >= end || (*start != '\'' && *start != '\"') )
+		return;
+
+	const wchar_t quote = *start++;
+
+	// the closing quote is dropped only when it matches the opening one
+	if( start < end && *(end-1) == quote )
+		--end;
+
+	// white characters after the first quote char: [url  =  "   ww...."]
+	while( start < end && IsWhite(*start) )
+		++start;
+}
+
+
+
+void BBCODEParser::PrintEncode(int c)
+{
+	// Puts one character of an url: '&' as "&amp;", url-safe ascii characters
+	// unchanged, the rest percent-encoded as UTF-8 bytes (RFC 3986) - the old
+	// "%02X" of the raw code gave e.g. "%141" for U+0141. Invalid codes and
+	// surrogates become U+FFFD. NOTE(review): the scheme is not checked, so
+	// e.g. "javascript:" passes through unchanged - confirm callers validate it.
+	static const wchar_t hex[] = L"0123456789ABCDEF";
+	static const unsigned long lead[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+	if( c == '&' )
+		{ (*out_string) += L"&amp;"; return; }
+	if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') ||
+		c=='_' || c=='?' || c=='.' || c==',' || c=='/' || c=='-' ||
+		c=='+' || c=='*' || c=='(' || c==')' || c=='=' || c==':' )
+		{ (*out_string) += static_cast<wchar_t>(c); return; }
+	const unsigned long u = (c < 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) ? 0xFFFDUL : static_cast<unsigned long>(c);
+	const int top = (u < 0x80) ? 0 : (u < 0x800) ? 6 : (u < 0x10000) ? 12 : 18;	// shift of the leading byte
+	for(int shift = top ; shift >= 0 ; shift -= 6)
+	{
+		const unsigned long b = (shift == top) ? (lead[top/6] | (u >> shift)) : (0x80 | ((u >> shift) & 0x3F));
+		(*out_string) += L'%';
+		(*out_string) += hex[(b >> 4) & 0x0F];
+		(*out_string) += hex[b & 0x0F];
+	}
+}
+
+
+void BBCODEParser::PrintEscape(int c, bool change_quote)
+{
+	// Puts one character escaped for html: '<', '>' and '&' are always
+	// replaced with entities, '"' only when change_quote is true.
+	switch( c )
+	{
+	case '<':
+		(*out_string) += L"&lt;";
+		break;
+
+	case '>':
+		(*out_string) += L"&gt;";
+		break;
+
+	case '&':
+		(*out_string) += L"&amp;";
+		break;
+
+	default:
+		if( c == '\"' && change_quote )
+			(*out_string) += L"&quot;";
+		else
+			(*out_string) += c;
+		break;
+	}
+}
+
+
+void BBCODEParser::PrintArgumentEncode(const wchar_t * start, const wchar_t * end)
+{
+	// the value is taken out of '=' and quotes, trimmed (with new lines)
+	// and then every character is printed with PrintEncode()
+	PrintArgumentCheckQuotes(start, end);
+	TrimWhiteWithNewLines(start, end);
+	PrintEncode(start, end);
+}
+
+
+void BBCODEParser::PrintArgumentEscape(const wchar_t * start, const wchar_t * end)
+{
+	// the value is taken out of '=' and quotes and printed escaped
+	// as for html, quotes are escaped as well here
+	PrintArgumentCheckQuotes(start, end);
+	PrintEscape(start, end, true);
+}
+
+
+void BBCODEParser::CheckOpeningTag(const Tags * tag, const wchar_t * tag_name, bool & condition)
+{
+	// 'condition' is the "has open <tag_name>" flag; when the same html tag
+	// is being opened again the previous one is closed first
+	if( !Equal(tag->html_tag, tag_name) )
+		return;
+	if( condition )
+	{
+		PutClosingTag(tag);
+		(*out_string) += '\n';
+	}
+	condition = true;
+}
+
+
+void BBCODEParser::CheckOpeningTag(const Tags * tag)
+{
+	// remembered before the flags below are updated for the new tag
+	const bool was_in_list = has_open_ul_tag || has_open_ol_tag;
+
+	CheckOpeningTag(tag, L"li", has_open_li_tag);
+	CheckOpeningTag(tag, L"ul", has_open_ul_tag);
+	CheckOpeningTag(tag, L"ol", has_open_ol_tag);
+
+	// an open list item while no list was open: <ul> is put automatically
+	if( !has_open_li_tag || was_in_list ) return;
+	(*out_string) += L"<ul>\n";
+	has_open_ul_tag = true;
+}
+
+
+
+
+
+void BBCODEParser::PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote)
+{
+	while( start < end )	// every character of [start, end) escaped as for html
+		PrintEscape(*start++, change_quote);
+}
+
+
+
+void BBCODEParser::PrintEncode(const wchar_t * start, const wchar_t * end)
+{
+	while( start < end )	// every character of [start, end) goes through PrintEncode(int)
+		PrintEncode(*start++);
+}
+
+
+
+void BBCODEParser::PutOpeningTagFromEzc()
+{
+	// not a known bbcode tag (it can be a tag from Ezc templates system):
+	// '[' and the name are put back, the rest up to and including the first ']' goes to Put()
+	const wchar_t * const rest_begin = pchar;
+
+	(*out_string) += '[';
+	(*out_string) += LastItem().name;
+
+	for( ; *pchar != 0 ; ++pchar )
+	{
+		if( *pchar == ']' )
+			{ ++pchar; break; }
+	}
+	Put(rest_begin, pchar);
+}
+
+
+
+
+
+void BBCODEParser::PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u)
+{	// %u1 - the value printed with PrintArgumentEncode(), %1 - with PrintArgumentEscape()
+	if( !has_u )
+		PrintArgumentEscape(arg_start, arg_end);
+	else
+		PrintArgumentEncode(arg_start, arg_end);
+}
+
+
+
+void BBCODEParser::TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end)
+{
+	// white characters and new lines (code 10) are cut off from both sides
+	for( ; start < end && (IsWhite(*start) || *start == 10) ; ++start )
+		;
+	for( ; start < end && (IsWhite(*(end-1)) || *(end-1) == 10) ; --end )
+		;
+}
+
+
+
+void BBCODEParser::PutHtmlArgument2(const Tags * tag, bool has_u)
+{
+	// Handles %2 and %u2: everything up to the closing tag of the last item
+	// is taken as a plain text (inner bbcode tags are not parsed) and the
+	// closing tag itself is skipped.
+	const wchar_t * text_begin = pchar;
+	const wchar_t * text_end   = pchar;
+
+	while( *pchar != 0 )
+	{
+		if( !IsOpeningTagMark(*pchar) )
+		{
+			// an ordinary character belongs to the text
+			++pchar;
+			text_end = pchar;
+			continue;
+		}
+
+		// NOTE(review): pchar is not moved here, IsClosingTagForLastItem()
+		// is presumably advancing it - verify in HTMLFilter
+		if( IsClosingTagForLastItem() )
+			break;
+	}
+
+	// the opening tag is removed from the stack in both cases:
+	// when its closing tag was found and when the user has forgotten
+	// to close the tag (end of the input)
+	PopStack();
+
+	if( has_u )
+	{
+		TrimWhiteWithNewLines(text_begin, text_end);
+		PrintEncode(text_begin, text_end);
+	}
+	else
+	{
+		PrintEscape(text_begin, text_end);
+	}
+}
+
+
+
+void BBCODEParser::PutHtmlArgument(const Tags * tag, const wchar_t * arg_start, const wchar_t * arg_end)
+{
+	// Prints tag->html_argument replacing the % sequences:
+	// %1 %u1 (tag value), %2 %u2 (text up to the closing tag), %% (one %).
+	for(const wchar_t * pattern = tag->html_argument ; *pattern != 0 ; )
+	{
+		if( *pattern != '%' )
+		{
+			// a normal character
+			(*out_string) += *pattern;
+			++pattern;
+			continue;
+		}
+
+		++pattern;
+		const bool has_u = (*pattern == 'u');
+
+		if( has_u )
+			++pattern;
+
+		switch( *pattern )
+		{
+		case '1':
+			++pattern;
+			PutHtmlArgument1(arg_start, arg_end, has_u);
+			break;
+
+		case '2':
+			++pattern;
+			PutHtmlArgument2(tag, has_u);
+			break;
+		case '%':
+			(*out_string) += '%';
+			++pattern;
+			break;
+
+		default:
+			// unrecognized: the '%' (and 'u') is dropped, the current
+			// character will be printed next time as a normal character
+			break;
+		}
+	}
+}
+
+
+void BBCODEParser::PutOpeningTagFromBBCode(const Tags * tag)
+{
+	// the html tag name is printed first, then the argument of the bbcode tag
+	// (everything up to ']') is put through the html_argument pattern
+	CheckOpeningTag(tag);
+	PutOpeningTagMark();
+	Put(tag->html_tag);
+
+	const wchar_t * const arg_begin = pchar;
+
+	for( ; *pchar != 0 && *pchar != ']' ; ++pchar )
+		;
+
+	PutHtmlArgument(tag, arg_begin, pchar);
+
+	// NOTE(review): pchar may have been moved by PutHtmlArgument() (%2, %u2)
+	if( *pchar == ']' ) ++pchar;
+
+	if( tag->inline_tag ) return;
+	Put(10);
+	SkipWhiteLines();
+}
+
+
+bool BBCODEParser::PutOpeningTag()
+{
+	// a tag found in the table is translated to html,
+	// any other one is handled as a possible tag from Ezc
+	const Tags * const known = FindTag(LastItem().name);
+	if( known != 0 )
+		PutOpeningTagFromBBCode(known);
+	else
+		PutOpeningTagFromEzc();
+return false;	// this override always returns false
+}
+
+
+void BBCODEParser::PutClosingTag(const Tags * tag)
+{
+	// an unknown tag (null pointer) is skipped
+	if( tag == 0 )
+		return;
+
+	const wchar_t * const html_name = tag->html_tag;
+
+	// the closing mark of the html tag: '/' and the name between the tag marks
+	PutOpeningTagMark();
+	(*out_string) += '/';
+	(*out_string) += html_name;
+	PutClosingTagMark();
+
+	if( !tag->inline_tag )
+	{
+		(*out_string) += L"\n";
+		SkipWhiteLines();
+	}
+
+	// the list flags follow the tag which has just been closed
+	if( Equal(html_name, L"li") ) has_open_li_tag = false;
+	if( Equal(html_name, L"ol") ) has_open_ol_tag = false;
+	if( Equal(html_name, L"ul") ) has_open_ul_tag = false;
+}
+
+
+void BBCODEParser::PutClosingTag(const wchar_t * tag_name)
+{
+	// FindTag() returns null for an unknown name and such a tag is then skipped
+	PutClosingTag(FindTag(tag_name));
+}
+
+
+
+void BBCODEParser::Init()
+{
+	// no html list tag is open at the beginning
+	has_open_li_tag = has_open_ol_tag = has_open_ul_tag = false;
+
+	// see HTMLFilter::SkipWhiteLines()
+	SkipWhiteLines();
+}
+
+
+void BBCODEParser::Uninit()
+{
+	// html list tags which are still marked as open are closed here,
+	// always in the order: li, ol, ul
+	// (the flags themselves are not changed)
+	std::wstring & out = *out_string;
+
+	if( has_open_li_tag ) out += L"</li>\n";
+	if( has_open_ol_tag ) out += L"</ol>\n";
+	if( has_open_ul_tag ) out += L"</ul>\n";
+}
+
+
+
+}
+
diff --git a/src/html/bbcodeparser.h b/src/html/bbcodeparser.h
new file mode 100644
index 0000000..bd36e4d
--- /dev/null
+++ b/src/html/bbcodeparser.h
@@ -0,0 +1,128 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/* 
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_html_bbcodeparser
+#define headerfile_picotools_html_bbcodeparser
+
+#include "htmlfilter.h"
+
+namespace pt
+{
+// BBCODEParser translates bbcode tags to html; it is built on HTMLFilter
+// by overriding its virtual methods (tags are marked with '[' and ']').
+class BBCODEParser : public HTMLFilter
+{
+	// one row of the translation table (see FindTag)
+	struct Tags
+	{
+		const wchar_t * bbcode;        // name of the bbcode tag
+		const wchar_t * html_tag;      // name of the html tag put to the output
+		const wchar_t * html_argument; // with closing '>'
+		bool inline_tag;               // if false then a new line is put after the tag
+	};
+
+
+	/*
+		virtual methods
+		(from HTMLFilter class)
+	*/
+	virtual void Init();
+	virtual void Uninit();
+
+	virtual bool IsOpeningTagMark(wchar_t c);
+	virtual bool IsClosingTagMark(wchar_t c);
+	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
+
+	virtual bool IsOpeningCommentaryTagMark(const wchar_t *);
+	virtual size_t OpeningCommentaryTagMarkSize();
+
+	virtual bool IsValidCharForName(int c);
+	virtual void CheckExceptions();
+	virtual bool SkipCommentaryTagIfExists();
+
+	virtual bool PutOpeningTag();
+	virtual void PutClosingTag(const wchar_t * tag);
+
+	virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
+	virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
+
+
+
+	/*
+		others
+	*/
+	bool Equal(const wchar_t * str1, const wchar_t * str2);
+
+	void PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u);
+	void PutHtmlArgument2(const Tags * tag, bool has_u);
+	void PutHtmlArgument(const Tags * tag, const wchar_t * arg_start, const wchar_t * arg_end);
+
+	void PutOpeningTagFromEzc();
+	void PutOpeningTagFromBBCode(const Tags * tag);
+
+	const Tags * FindTag(const wchar_t * tag);
+	const Tags * FindTag(const std::wstring & tag);
+	void PrintArgumentCheckQuotes(const wchar_t * & start, const wchar_t * & end);
+
+	void PrintEscape(int c, bool change_quote = false);
+	void PrintEncode(int c);
+
+	void PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote = false);
+	void PrintEncode(const wchar_t * start, const wchar_t * end);
+
+	void PrintArgumentEncode(const wchar_t * start, const wchar_t * end);
+	void PrintArgumentEscape(const wchar_t * start, const wchar_t * end);
+
+	void PutClosingTag(const Tags * tag);
+
+	void CheckOpeningTag(const Tags * tag, const wchar_t * tag_name, bool & condition);
+	void CheckOpeningTag(const Tags * tag);
+
+	void TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end);
+
+
+
+	bool has_open_ol_tag; // has open html <ol> tag
+	bool has_open_ul_tag; // has open html <ul> tag
+	bool has_open_li_tag; // has open html <li> tag
+};
+
+
+}
+
+
+#endif

From 7ce07c57f51f1841e78202575d1f49d16e3c816f Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 17 Jul 2021 14:38:22 +0200
Subject: [PATCH 03/37] added a base class for parsers: BaseParser
 (convert/baseparser.h|cpp)   there are methods for reading from string/files
 there   those methods were moved from SpaceParser and CSVParser fixed:
 CSVParser didn't set input_as_utf8 flag

---
 src/Makefile.dep           |   8 +-
 src/convert/baseparser.cpp | 188 +++++++++++++++++++++++++++++++++++++
 src/convert/baseparser.h   | 120 +++++++++++++++++++++++
 src/csv/csvparser.cpp      | 131 +-------------------------
 src/csv/csvparser.h        |  68 ++------------
 src/space/spaceparser.cpp  | 116 -----------------------
 src/space/spaceparser.h    |  58 +-----------
 tests/Makefile.dep         |   3 +-
 8 files changed, 329 insertions(+), 363 deletions(-)
 create mode 100644 src/convert/baseparser.cpp
 create mode 100644 src/convert/baseparser.h

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 16e85d6..84ddfa4 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -8,6 +8,9 @@
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
+./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
+./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
+./convert/baseparser.o: utf8/utf8_private.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -29,14 +32,15 @@
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
-./space/spaceparser.o: utf8/utf8_private.h convert/strtoint.h
-./space/spaceparser.o: ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
+./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./csv/csvparser.o: convert/baseparser.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp
new file mode 100644
index 0000000..b95933d
--- /dev/null
+++ b/src/convert/baseparser.cpp
@@ -0,0 +1,188 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "baseparser.h"
+#include "utf8/utf8.h"
+
+
+
+namespace pt
+{
+
+BaseParser::BaseParser()
+{
+	clear(); // start with no input attached: lastc == -1, line == 0, UTF-8 decoding enabled
+}
+
+
+void BaseParser::clear()
+{
+	input_as_utf8 = true;              // 8-bit input is decoded as UTF-8 by default
+	reading_from_file = false;         // note: the 'file' object itself is not closed or reset here
+	reading_from_wchar_string = false;
+	pchar_unicode = nullptr;
+	pchar_ascii = nullptr;
+	lastc = -1;                        // nothing has been read yet
+	line = 0;
+}
+
+
+int BaseParser::read_utf8_char()
+{
+	// returns -1 when the stream ends or fails before a valid code point is decoded
+	int code_point = 0;
+	bool is_valid = false;
+
+	lastc = -1;
+
+	while( !is_valid )
+	{
+		utf8_to_int(file, code_point, is_valid);
+
+		if( !file )
+			return lastc;
+	}
+
+	if( code_point == '\n' )
+		++line;
+
+	lastc = code_point;
+
+	return lastc;
+}
+
+
+int BaseParser::read_ascii_char()
+{
+	const int next_byte = file.get(); // get() returns the eof value once nothing more can be read
+	lastc = next_byte;
+	if( next_byte == '\n' )
+		++line;
+
+	return lastc;
+}
+
+
+int BaseParser::read_char_from_wchar_string()
+{
+	lastc = -1; // the terminating zero is not stepped over: further calls keep returning -1
+
+	if( *pchar_unicode != 0 )
+		lastc = *(pchar_unicode++);
+
+	if( lastc == '\n' )
+		++line;
+
+	return lastc;
+}
+
+
+int BaseParser::read_char_from_utf8_string()
+{
+	int code_point = 0;
+	bool is_valid = false;
+
+	// decode at least once; keep stepping over malformed sequences until
+	// a valid code point is found or the terminating zero is reached
+	do
+	{
+		const size_t consumed = utf8_to_int(pchar_ascii, code_point, is_valid);
+		pchar_ascii += consumed;
+	}
+	while( !is_valid && *pchar_ascii != 0 );
+
+	// no valid code point before the terminating zero: report -1
+	lastc = is_valid ? code_point : -1;
+
+	if( lastc == '\n' )
+		++line;
+
+	return lastc;
+
+}
+
+
+int BaseParser::read_char_from_ascii_string()
+{
+	// reads one byte of the zero-terminated 8-bit input; returns -1 at the terminator (which is not consumed)
+	if( *pchar_ascii == 0 )
+		lastc = -1;
+	else
+		lastc = static_cast<unsigned char>(*(pchar_ascii++)); // like file.get(): 0x80..0xFF stay positive, 0xFF is not mistaken for -1
+
+	if( lastc == '\n' )
+		++line;
+	return lastc;
+}
+
+
+int BaseParser::read_char_no_escape()
+{
+	// picks the reader matching the current input source: a file,
+	// a wide string or an 8-bit string; 8-bit input is decoded as UTF-8
+	// when input_as_utf8 is set and is taken byte by byte otherwise
+
+	if( reading_from_file )
+	{
+		return input_as_utf8 ? read_utf8_char() : read_ascii_char();
+	}
+
+	if( reading_from_wchar_string )
+	{
+		return read_char_from_wchar_string();
+	}
+
+	if( input_as_utf8 )
+	{
+		return read_char_from_utf8_string();
+	}
+
+	// a plain 8-bit string
+	return read_char_from_ascii_string();
+}
+
+
+int BaseParser::read_char()
+{
+	return read_char_no_escape(); // the base class does no escape processing, it only forwards
+}
+
+
+
+
+}
+
diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h
new file mode 100644
index 0000000..381568f
--- /dev/null
+++ b/src/convert/baseparser.h
@@ -0,0 +1,120 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_convert_baseparser
+#define headerfile_picotools_convert_baseparser
+
+#include <string>
+#include <fstream>
+
+
+namespace pt
+{
+
+class BaseParser // common character-reading layer for parsers; state is reset by clear()
+{
+protected:
+
+	BaseParser(); // calls clear()
+
+	void clear();
+
+	int read_utf8_char();
+	int read_ascii_char();
+	int read_char_from_wchar_string();
+	int read_char_from_utf8_string();
+	int read_char_from_ascii_string();
+	int read_char_no_escape();
+	int read_char();
+	// every read_* method above stores the character it read in 'lastc'
+	// (-1 at the end of the input), increments 'line' after '\n' and returns 'lastc'
+
+	/*
+		a line counter, incremented after each '\n' read (meant for reporting where a syntax error is)
+	*/
+	int line;
+
+
+	/*
+		true if the input is read from 'file'
+		false if it is read from a string (pchar_ascii or pchar_unicode)
+	*/
+	bool reading_from_file;
+
+	/*
+		pointers to the current character
+		used when the input is a string (which one is read depends on reading_from_wchar_string)
+	*/
+	const char    * pchar_ascii;
+	const wchar_t * pchar_unicode;
+
+
+	/*
+		true if the input string is a wide one (pchar_unicode), false for an 8-bit one (pchar_ascii)
+	*/
+	bool reading_from_wchar_string;
+
+	/*
+		last read char
+		or -1 if the end
+	*/
+	int lastc;
+
+
+	/*
+		current file
+
+		maybe it would be better to make it a pointer?
+		if we parse only a string then there is no sense to have such an object
+	*/
+	std::ifstream file;
+
+
+	/*
+		8-bit input (a file or a char string) is in UTF-8
+		default: true
+	*/
+	bool input_as_utf8;
+
+
+
+
+};
+
+}
+
+#endif
diff --git a/src/csv/csvparser.cpp b/src/csv/csvparser.cpp
index 4ab1480..583eee3 100644
--- a/src/csv/csvparser.cpp
+++ b/src/csv/csvparser.cpp
@@ -44,6 +44,11 @@ namespace pt
 {
 
 
+CSVParser::CSVParser()
+{
+	input_as_utf8    = true; // BaseParser::clear(), run by the base constructor, sets the same value; kept explicit here
+}
+
 
 
 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
@@ -285,132 +290,6 @@ bool CSVParser::read_non_quoted_value_to(std::wstring & value)
 
 
 
-
-int CSVParser::read_utf8_char()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		utf8_to_int(file, c, correct);
-
-		if( !file )
-			return lastc;
-	}
-	while( !correct );
-
-	lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-
-int CSVParser::read_ascii_char()
-{
-	lastc = file.get();
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-
-
-int CSVParser::read_char_from_wchar_string()
-{
-	if( *pchar_unicode == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_unicode++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_utf8_string()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		size_t len = utf8_to_int(pchar_ascii, c, correct);
-		pchar_ascii += len;
-	}
-	while( *pchar_ascii && !correct );
-
-	if( correct )
-		lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_ascii_string()
-{
-	if( *pchar_ascii == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_ascii++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_no_escape()
-{
-	if( reading_from_file )
-	{
-		if( input_as_utf8 )
-			return read_utf8_char();
-		else
-			return read_ascii_char();
-	}
-	else
-	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
-		{
-			if( input_as_utf8 )
-				return read_char_from_utf8_string();
-			else
-				return read_char_from_ascii_string();
-		}
-	}
-}
-
-
-
-
-int CSVParser::read_char()
-{
-	return read_char_no_escape();
-}
-
-
 }
 
 
diff --git a/src/csv/csvparser.h b/src/csv/csvparser.h
index 8370867..c549fa5 100644
--- a/src/csv/csvparser.h
+++ b/src/csv/csvparser.h
@@ -38,9 +38,11 @@
 #ifndef headerfile_picotools_csv_csvparser
 #define headerfile_picotools_csv_csvparser
 
-#include "space/space.h"
 #include <string>
 #include <fstream>
+#include "space/space.h"
+#include "convert/baseparser.h"
+
 
 
 namespace pt
@@ -51,10 +53,12 @@ namespace pt
  * https://datatracker.ietf.org/doc/html/rfc4180
  *
  */
-class CSVParser
+class CSVParser : public BaseParser
 {
 public:
 
+	CSVParser();
+
 	enum Status
 	{
 		ok,
@@ -85,53 +89,6 @@ protected:
 
 	Space * space;
 
-	/*
-		true if parse_file() method was called
-		false if parse() was called
-	*/
-	bool reading_from_file;
-
-	/*
-		true if parse(wchar_t *) or parse(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
-
-	/*
-		pointers to the current character
-		if parse() is being used
-	*/
-	const char    * pchar_ascii;
-	const wchar_t * pchar_unicode;
-
-
-	/*
-		last read char
-		or -1 if the end
-	*/
-	int lastc;
-
-
-
-	/*
-		a number of a line in which there is a syntax_error
-	*/
-	int line;
-
-	/*
-		current file
-
-		may it would be better to make a pointer?
-		if we parse only a string then there is no sense to have such an object
-	*/
-	std::ifstream file;
-
-	/*
-		input file is in UTF-8
-		default: true
-	*/
-	bool input_as_utf8;
-
-
 
 
 	void parse();
@@ -142,19 +99,6 @@ protected:
 	bool read_non_quoted_value_to(std::wstring & value);
 
 
-
-	/*
-	 * copied from SpaceParser
-	 * may it would be better to have a class with those methods and inherit from it?
-	 */
-	int read_utf8_char();
-	int read_ascii_char();
-	int read_char_from_wchar_string();
-	int read_char_from_utf8_string();
-	int read_char_from_ascii_string();
-	int read_char_no_escape();
-
-	int read_char();
 };
 
 }
diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp
index ccd905e..9c334a7 100644
--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -891,122 +891,6 @@ void SpaceParser::read_key()
 
 
 
-int SpaceParser::read_utf8_char()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		utf8_to_int(file, c, correct);
-
-		if( !file )
-			return lastc;
-	}
-	while( !correct );
-
-	lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-}
-
-
-
-int SpaceParser::read_ascii_char()
-{
-	lastc = file.get();
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-}
-
-
-
-
-int SpaceParser::read_char_from_wchar_string()
-{
-	if( *pchar_unicode == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_unicode++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int SpaceParser::read_char_from_utf8_string()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		size_t len = utf8_to_int(pchar_ascii, c, correct);
-		pchar_ascii += len;
-	}
-	while( *pchar_ascii && !correct );
-
-	if( correct )
-		lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-	
-}
-
-
-int SpaceParser::read_char_from_ascii_string()
-{
-	if( *pchar_ascii == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_ascii++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int SpaceParser::read_char_no_escape()
-{
-	if( reading_from_file )
-	{
-		if( input_as_utf8 )
-			return read_utf8_char();
-		else
-			return read_ascii_char();
-	}
-	else
-	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
-		{
-			if( input_as_utf8 )
-				return read_char_from_utf8_string();
-			else
-				return read_char_from_ascii_string();
-		}
-	}
-}
 
 bool SpaceParser::is_hex_digit(wchar_t c)
 {
diff --git a/src/space/spaceparser.h b/src/space/spaceparser.h
index 818b260..6805b88 100644
--- a/src/space/spaceparser.h
+++ b/src/space/spaceparser.h
@@ -40,6 +40,7 @@
 
 #include <fstream>
 #include "space.h"
+#include "convert/baseparser.h"
 
 
 
@@ -49,7 +50,7 @@ namespace pt
 
 
 
-class SpaceParser
+class SpaceParser : public BaseParser
 {
 public:
 
@@ -154,32 +155,6 @@ private:
 	Space * root_space;
 
 
-	/*
-		a number of a line in which there is a syntax_error
-	*/
-	int line;
-
-	/*
-		true if parse() method was called
-		false if ParseString() was called
-	*/
-	bool reading_from_file;
-
-
-	/*
-		pointers to the current character
-		if ParseString() is in used
-	*/
-	const char    * pchar_ascii;
-	const wchar_t * pchar_unicode;
-
-
-	/*
-		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
-
-
 	/*
 		last read token
 	*/
@@ -222,13 +197,6 @@ private:
 	int option_delimiter;
 
 
-	/*
-		last read char
-		or -1 if the end
-	*/
-	int lastc;
-
-
 	/*
 		true if the lastc was escaped (with a backslash)
 		we have to know if the last sequence was \" or just "
@@ -236,22 +204,6 @@ private:
 	bool char_was_escaped;
 
 
-	/*
-		current file
-
-		may it would be better to make a pointer?
-		if we parse only a string then there is no sense to have such an object
-	*/
-	std::ifstream file;
-
-
-	/*
-		input file is in UTF-8
-		default: true
-	*/
-	bool input_as_utf8;
-
-
 	/*
 	 * if parsing_space is false then it means we are parsing JSON format
 	 *
@@ -287,12 +239,6 @@ private:
 	void read_token_quoted(std::wstring & token);
 	void read_multiline_token_quoted(std::wstring & token);
 
-	int  read_utf8_char();
-	int  read_ascii_char();
-	int  read_char_from_wchar_string();
-	int  read_char_from_utf8_string();
-	int  read_char_from_ascii_string();
-	int  read_char_no_escape();
 	int  read_char();
 	bool is_white(int c);
 	void skip_line();
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index 60ed660..e83e777 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -13,7 +13,8 @@
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
-./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
+./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
+./csvparser.o: ../src/convert/baseparser.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h

From 4f8ae6ce291d7bc535c39ef102e5bf85b351080e Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 20 Jul 2021 20:48:01 +0200
Subject: [PATCH 04/37] some work in HTMLFilter - instead of directly using
 pchar pointer now we use pointers/streams from BaseParser - removed support
 for putting a white char in long words: removed BreakWord(size_t
 break_after_) method - changed the way how white characters are treated:
 added white_chars_mode(int mode) method   mode 0: WHITE_MODE_ORIGIN   mode 1:
 WHITE_MODE_SINGLE_LINE   mode 2: WHITE_MODE_TREE

---
 src/html/bbcodeparser.cpp |  59 +--
 src/html/htmlfilter.cpp   | 755 +++++++++++++++++++++-----------------
 src/html/htmlfilter.h     |  75 ++--
 3 files changed, 478 insertions(+), 411 deletions(-)

diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp
index 0a60273..ec39de6 100644
--- a/src/html/bbcodeparser.cpp
+++ b/src/html/bbcodeparser.cpp
@@ -121,7 +121,7 @@ void BBCODEParser::PutNormalText(const wchar_t * str, const wchar_t * end)
 {
 int br_len;
 
-	if( *pchar == 0 )
+	if( lastc == -1 ) // -1 marks the end of input, the counterpart of the former '*pchar == 0'
 	{
 		// trimming last white characters at end of the user text
 		while( str<end && (IsWhite(*(end-1)) || *(end-1)==10) )
@@ -415,15 +415,17 @@ void BBCODEParser::PutOpeningTagFromEzc()
 	(*out_string) += '[';
 	(*out_string) += LastItem().name;
 
-	const wchar_t * start = pchar;
 
-	while( *pchar && *pchar!=']' )
-		++pchar;
-
-	if( *pchar == ']' )
-		++pchar;
-
-	Put(start, pchar);
+// FIXME
+//	const wchar_t * start = pchar;
+//
+//	while( *pchar && *pchar!=']' )
+//		++pchar;
+//
+//	if( *pchar == ']' )
+//		++pchar;
+//
+//	Put(start, pchar);
 }
 
 
@@ -453,13 +455,13 @@ void BBCODEParser::TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t
 
 void BBCODEParser::PutHtmlArgument2(const Tags * tag, bool has_u)
 {
-const wchar_t * start = pchar;
-const wchar_t * end   = pchar;
+//const wchar_t * start = pchar;
+//const wchar_t * end   = pchar;
 bool first_tag_removed = false;
 
-	while( *pchar != 0 )
+	while( lastc != -1 )
 	{
-		if( IsOpeningTagMark(*pchar) )
+		if( IsOpeningTagMark(lastc) )
 		{
 			if( IsClosingTagForLastItem() )
 			{
@@ -472,8 +474,8 @@ bool first_tag_removed = false;
 		}
 		else
 		{
-			pchar += 1;
-			end = pchar;
+			read_char();
+			//end = pchar;
 		}
 	}
 
@@ -482,12 +484,14 @@ bool first_tag_removed = false;
 
 	if( has_u )
 	{
-		TrimWhiteWithNewLines(start, end);
-		PrintEncode(start, end);
+// FIXME
+//		TrimWhiteWithNewLines(start, end);
+//		PrintEncode(start, end);
 	}
 	else
 	{
-		PrintEscape(start, end);
+		// FIXME
+//		PrintEscape(start, end);
 	}
 }
 
@@ -545,15 +549,16 @@ void BBCODEParser::PutOpeningTagFromBBCode(const Tags * tag)
 	PutOpeningTagMark();
 	Put(tag->html_tag);
 
-	const wchar_t * start = pchar;
-
-	while( *pchar && *pchar != ']' )
-		++pchar;
-
-	PutHtmlArgument(tag, start, pchar);
-
-	if( *pchar == ']' )
-		++pchar;
+// FIXME
+//	const wchar_t * start = pchar;
+//
+//	while( *pchar && *pchar != ']' )
+//		++pchar;
+//
+//	PutHtmlArgument(tag, start, pchar);
+//
+//	if( *pchar == ']' )
+//		++pchar;
 
 	if( !tag->inline_tag )
 	{
diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp
index d103b9e..5274950 100644
--- a/src/html/htmlfilter.cpp
+++ b/src/html/htmlfilter.cpp
@@ -48,10 +48,13 @@ namespace pt
 void HTMLFilter::Item::Clear()
 {
 	name.clear();
-	type         = none;
-	porphans     = 0;
-	new_line     = false;
-	has_body_tag = false;
+	type          = none;
+	is_commentary = false;
+	porphans      = nullptr;
+	new_line      = false;
+	new_line_in_the_middle = false;
+	has_body_tag  = false;
+	tree_index    = 0;
 }
 
 
@@ -64,10 +67,15 @@ HTMLFilter::Item::Item()
 
 void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
 {
-	pchar         = in;
+	reading_from_file         = false;
+	reading_from_wchar_string = true;
+	pchar_unicode             = in;
+	pchar_ascii               = 0;
+
 	stack_len     = 0;
 	out_string    = &out;
 	last_new_line = false;
+	was_ending_commentary = false;
 	line_len      = 0;
 	out_string->clear();
 
@@ -108,9 +116,9 @@ void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
 
 void HTMLFilter::SetSomeDefaults()
 {
+	white_mode  = WHITE_MODE_ORIGIN;
+
 	tab_size    = 2;
-	trim_white  = false;
-	break_after = 0;
 	wrap_line   = 0;
 	orphan_mode = orphan_nbsp;
 	safe_mode   = false;
@@ -160,16 +168,15 @@ HTMLFilter::~HTMLFilter()
 
 
 
-
-void HTMLFilter::BreakWord(size_t break_after_)
+void HTMLFilter::white_chars_mode(int mode)
 {
-	break_after = break_after_;
-
-	if( break_after > 10000 )
-		break_after = 10000;
+	if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE )
+		white_mode = mode;
 }
 
 
+
+
 void HTMLFilter::WrapLine(size_t wrap_line_)
 {
 	wrap_line = wrap_line_;
@@ -180,12 +187,6 @@ void HTMLFilter::WrapLine(size_t wrap_line_)
 
 
 
-void HTMLFilter::TrimWhite(bool trim)
-{
-	trim_white = trim;
-}
-
-
 void HTMLFilter::InsertTabs(size_t tabsize)
 {
 	tab_size = tabsize;
@@ -322,9 +323,10 @@ bool HTMLFilter::PushStack()
 
 	if( stack_len > 0 )
 	{
-		// 'porphans' and 'has_body_tag' attributes are propagated
+		// 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated
 		pstack[stack_len].porphans     = pstack[stack_len-1].porphans;
 		pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
+		pstack[stack_len].tree_index   = pstack[stack_len-1].tree_index;
 	}
 
 	stack_len += 1;
@@ -356,15 +358,15 @@ return false;
 
 void HTMLFilter::SkipWhite()
 {
-	while( IsWhite(*pchar) )
-		++pchar;
+	while( IsWhite(lastc) )
+		read_char();
 }
 
 
 void HTMLFilter::SkipWhiteLines()
 {
-	while( *pchar==10 || IsWhite(*pchar) )
-		++pchar;
+	while( lastc==10 || IsWhite(lastc) )
+		read_char();
 }
 
 
@@ -372,29 +374,22 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
 {
 	SkipWhite();
 
-	if( *pchar == 10 )
+	if( lastc == 10 )
 	{
-		pchar += 1;
+		read_char();
 		SkipWhite();
 	}
 }
 
 
-void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
-{
-	while( str < end && (*str==10 || IsWhite(*str)) )
-		++str;
-}
-
-
 void HTMLFilter::CheckNewLine()
 {
-const wchar_t * start = pchar;
+	if( white_mode == WHITE_MODE_TREE )
+	{
+		SkipWhite();
+	}
 
-	SkipWhite();
-	last_new_line = (*pchar==10);
-
-	pchar = start;
+	last_new_line = (lastc==10);
 }
 
 
@@ -402,22 +397,23 @@ const wchar_t * start = pchar;
 
 bool HTMLFilter::IsClosingTagForLastItem()
 {
-	pchar += 1;
+	read_char();
 	SkipWhite();
 
-	if( *pchar == '/' )
+	if( lastc == '/' )
 	{
-		pchar += 1;
+		read_char();
 		SkipWhite();
 
-		if( IsNameEqual(pchar, LastItem().name, LastItem().name.size()) )
+		ReadItemName(tmp_name);
+
+		if( IsNameEqual(tmp_name, LastItem().name) )
 		{
-			pchar += LastItem().name.size();
 			SkipWhite();
 
-			if( IsClosingTagMark(*pchar) )
+			if( IsClosingTagMark(lastc) )
 			{
-				pchar += 1;
+				read_char();
 				return true;
 			}
 		}
@@ -432,17 +428,16 @@ return false;
 // used for such tags as: script, pre, textarea
 void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
 {
-const wchar_t * start = pchar;
-const wchar_t * end = pchar;
+	bool was_closing_tag = false;
+	tmp_text.clear();
 
-	while( *pchar != 0 )
+	while( lastc != -1 )
 	{
-		if( IsOpeningTagMark(*pchar) )
+		if( IsOpeningTagMark(lastc) )
 		{
 			if( IsClosingTagForLastItem() )
 			{
-				if( put_closing_tag_as_well )
-					end = pchar;
+				was_closing_tag = true;
 
 				PopStack();
 				CheckNewLine();
@@ -451,29 +446,37 @@ const wchar_t * end = pchar;
 		}
 		else
 		{
-			pchar += 1;
-			end = pchar;
+			tmp_text += lastc;
+			read_char();
 		}
 	}
 
-	Put(start, end);
+	Put(tmp_text);
+
+	if( was_closing_tag && put_closing_tag_as_well )
+	{
+		Put('<');
+		Put('/');
+		Put(tmp_name);
+		Put('>');
+	}
 }
 
 
 
 
-void HTMLFilter::SkipAndCheckClosingTag()
+void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
 {
 	bool is_quoted = false;
 	wchar_t quote_char = 0;
 
-	for( ; *pchar ; ++pchar )
+	while( lastc != -1 )
 	{
-		if( *pchar == '"' || *pchar == '\'' )
+		if( lastc == '"' || lastc == '\'' )
 		{
 			if( is_quoted )
 			{
-				if( *pchar == quote_char )
+				if( lastc == quote_char )
 				{
 					is_quoted = false;
 				}
@@ -481,20 +484,25 @@ void HTMLFilter::SkipAndCheckClosingTag()
 			else
 			{
 				is_quoted = true;
-				quote_char = *pchar;
+				quote_char = lastc;
 			}
 		}
 		else
-		if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/'
+		if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/'
 		{
 			LastItem().type = Item::simple;
 		}
 		else
-		if( !is_quoted && IsClosingTagMark(*pchar) )
+		if( !is_quoted && IsClosingTagMark(lastc) )
 		{
-			++pchar;
+			read_char();
 			break;
 		}
+
+		if( remember_text )
+			(*remember_text) += lastc;
+
+		read_char();
 	}
 }
 
@@ -505,7 +513,7 @@ bool HTMLFilter::IsValidCharForName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c=='!' || c==':') // : for namespace character
+		c=='-' || c=='!' || c==':') // : is for a namespace character, ! and - let "!--" (a commentary opening) be read as a name
 		return true;
 
 return false;
@@ -536,16 +544,28 @@ return false;
 }
 
 
-void HTMLFilter::ReadItemName()
+void HTMLFilter::ReadItemName(std::wstring & name, bool clear_name)
 {
 size_t i;
 
-	for( i=0 ; IsValidCharForName(*pchar) ; ++i )
+	if( clear_name )
+		name.clear();
+
+	for(i=0 ; IsValidCharForName(lastc) ; ++i)
 	{
 		if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
-			LastItem().name += *pchar;
+		{
+			name += lastc;
 
-		++pchar;
+			if( LastItem().type == Item::special && name == L"!--" )
+			{
+				LastItem().is_commentary = true;
+				read_char();
+				break;
+			}
+		}
+
+		read_char();
 	}
 }
 
@@ -557,71 +577,69 @@ size_t i;
 
 	attr_name.clear();
 
-	for( i=0 ; *pchar && IsValidCharForAttrName(*pchar) ; ++i )
+	for( i=0 ; lastc != -1 && IsValidCharForAttrName(lastc) ; ++i )
 	{
 		if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
-			attr_name += *pchar;
+			attr_name += lastc;
 
-		++pchar;
+		read_char();
 	}
 }
 
 
 
-void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
+void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str)
 {
-	attr_value.push_back(std::wstring());
-
 	if( analyze_entities )
 	{
-		AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back());
+		attr_value.push_back(std::wstring());
+		AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back());
 	}
 	else
 	{
-		attr_value.back().append(value_start, value_end);
+		attr_value.push_back(str);
 	}
 }
 
 
 void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 {
-size_t i;
-
 	attr_value.clear();
-	const wchar_t * value_start = pchar;
-	size_t value_len = 0; // how many non white characters
+	tmp_text.clear();
 
-	for(i=0 ; *pchar ; ++i, ++pchar )
+	while( lastc != -1 )
 	{
 		if( has_quote )
 		{
-			if( *pchar == quote_char )
+			if( lastc == quote_char )
 				break;
 		}
 		else
 		{
-			if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) )
+			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
 				break;
 		}
 
-		if( *pchar==10 || IsWhite(*pchar) )
+		if( lastc==10 || IsWhite(lastc) )
 		{
-			if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
-				ReadItemAttrValueAdd(value_start, pchar);
+			if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+				ReadItemAttrValueAdd(tmp_text);
 
-			value_len = 0;
+			tmp_text.clear();
 		}
 		else
 		{
-			if( value_len == 0 )
-				value_start = pchar;
+			if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+				tmp_text.pop_back(); // cap at MAXLEN+1: an oversized word stays oversized and is dropped whole (clear() let its tail through)
 
-			value_len += 1;
+			tmp_text += lastc;
 		}
+
+		read_char();
 	}
 
-	if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
-		ReadItemAttrValueAdd(value_start, pchar);
+	if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+		ReadItemAttrValueAdd(tmp_text);
 }
 
 
@@ -641,15 +659,6 @@ void HTMLFilter::Put(wchar_t c)
 }
 
 
-void HTMLFilter::Put(const wchar_t * str)
-{
-	out_string->append(str);
-
-	for( ; *str ; ++str)
-		CheckChar(*str);
-}
-
-
 void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
 {
 	if( str >= end )
@@ -663,12 +672,16 @@ void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
 }
 
 
+
 void HTMLFilter::Put(const std::wstring & str)
 {
-	out_string->append(str);
+	if( !str.empty() )
+	{
+		out_string->append(str);
 
-	for(size_t i=0 ; i<str.size() ; ++i)
-		CheckChar(str[i]);
+		for(size_t i=0 ; i < str.size() ; ++i)
+			CheckChar(str[i]);
+	}
 }
 
 
@@ -808,118 +821,75 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
 }
 
 
-// if there is a semicolon nearby then we break the line after it
-// (useful in html entities)
-// !! dodac sprawdzanie czy dlugosc stringu nie jest mala tez (end-str)
-// i wtedy tez nie dodajemy zadnego znaku
-bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
+void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
 {
-size_t i, epsilon = 8;// !! IMPROVE ME put as a constant
-
-	for(i=0 ; str < end && i<epsilon ; ++i, ++str)
-		if( IsEndingEntityMark(*str) )
-			return true;
-
-return false;
-}
-
-
-void HTMLFilter::CheckLineWrap()
-{
-	if( wrap_line != 0 && LastItem().has_body_tag && line_len > wrap_line )
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
 	{
-		Put(10);
-		PutTabs(stack_len);
-	}
-}
+		str += lastc;
+		read_char();
 
-
-void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
-{
-const wchar_t * word = str;
-size_t non_whites = 0;
-bool was_entity_end = false;
-
-	for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
-	{
-		if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
+		if( IsEndingCommentaryTagMarkAtEndOfString(str) )
 		{
-			Put(word, str);
-			word           = str;
-			non_whites     = 0;
-			Put(' ');
-			CheckLineWrap();
+			str.erase(str.size() - 3); // IMPROVEME define a function or what
+			was_ending_commentary = true;
+			break;
 		}
+	}
 
-		was_entity_end = (IsEndingEntityMark(*str));
+	if( !str.empty() )
+	{
+		if( allow_put_new_line )
+		{
+			Put(10);
+			PutTabs(LastItem().tree_index + 1);
+		}
+		else
+		if( allow_put_space )
+		{
+			Put(' ');
+		}
 	}
 
 	if( analyze_entities )
-		AnalyzeEntitiesAndPut(word, str, nullptr);
+		AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
 	else
-		Put(word, str);
+		Put(str);
 }
 
 
-void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
+bool HTMLFilter::PutNormalWhite()
 {
-	if( str < end )
+	bool was_white_char = false;
+	bool was_new_line = false;
+
+	while( lastc == 10 || IsWhite(lastc) )
 	{
-		if( trim_white )
-		{
-			Put(' ');
-			SkipWhiteLines(str, end);
-		}
-		else
-		{
-			while( str < end && (*str==10 || IsWhite(*str)) )
-			{
-				Put(*str);
+		was_white_char = true; // anyone white char even new line
 
-				if( *str == 10 )
-					PutTabs(stack_len);
+		if( lastc == 10 )
+			was_new_line = true;
 
-				++str;
-			}
+		if( white_mode == WHITE_MODE_ORIGIN )
+		{
+			Put(lastc);
 		}
+
+		read_char();
 	}
-}
 
-
-void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
-{
-const wchar_t * word, * white;
-
-	if( str < end )
-		CheckLineWrap();
-
-	while( str < end )
+	if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char )
 	{
-		word = str;
-		PutNormalNonWhite(str, end);
-
-		if( CheckOrphan(word, str) )
-		{
-			white = str;
-			SkipWhiteLines(str, end);
-
-			if( white < str )
-				PutNonBreakingSpace();
-		}
-		else
-		{
-			PutNormalWhite(str, end);
-
-			if( str < end ) // !! lub moze podobnie jak jest na gorze tutaj? juz nie mam sily myslec :(
-				CheckLineWrap();
-		}
-
-		// for safety (if str was not incremented then there is an infinite loop)
-		if( word == str )
-			break;
+		Put(' ');
 	}
-}
 
+	if( white_mode == WHITE_MODE_TREE && was_new_line )
+	{
+		// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
+	}
+
+	last_new_line = was_new_line;
+	return was_white_char;
+}
 
 
 
@@ -985,6 +955,12 @@ bool HTMLFilter::PutOpeningTag()
 		return false;
 	}
 
+	if( white_mode == WHITE_MODE_TREE && last_new_line )
+	{
+		Put(10);
+		PutTabs(LastItem().tree_index);
+	}
+
 	PutOpeningTagMark();
 	Put(LastItem().name);
 
@@ -993,14 +969,18 @@ return true;
 
 
 
-void HTMLFilter::PutClosingTag(const wchar_t * tag)
+void HTMLFilter::PutClosingTag(const Item & item)
 {
-	if( skip_tags || !IsTagSafe(tag) )
+	if( skip_tags || !IsTagSafe(item.name) )
 		return;
 
-	PutOpeningTagMark();
-	Put('/');
-	Put(tag);
+	if( !item.is_commentary )
+	{
+		PutOpeningTagMark();
+		Put('/');
+	}
+
+	Put(item.name);
 	PutClosingTagMark();
 }
 
@@ -1011,7 +991,7 @@ void HTMLFilter::PutTabs(size_t len)
 	if( len > 30 )
 		len = 30;
 
-	for(size_t i=0 ; i < (len*tab_size) ; ++i)
+	for(int i=0 ; i < (len*tab_size) ; ++i)
 		(*out_string) += ' '; // we do not add them to 'line_len'
 }
 
@@ -1031,12 +1011,12 @@ void HTMLFilter::PutNonBreakingSpace()
 
 
 
-void HTMLFilter::PutNewLine()
-{
-	buffer[0] = 10;
-	Put(buffer, buffer+1);
-	line_len = 0;
-}
+//void HTMLFilter::PutNewLine()
+//{
+//	buffer[0] = 10; // CHECKME for what purpose is this buffer?
+//	Put(10);
+//	line_len = 0;
+//}
 
 
 // we assume the size of the opening mark to be one
@@ -1053,6 +1033,28 @@ bool HTMLFilter::IsClosingTagMark(wchar_t c)
 }
 
 
+// the slash in the closing tag mark e.g. </p>
+bool HTMLFilter::IsClosingTagIndicator(wchar_t c)
+{
+	return (c == '/');
+}
+
+
+// the exclamation mark right after the opening tag mark in a special tag e.g. <!doctype html>
+bool HTMLFilter::IsSpecialTagIndicator(wchar_t c)
+{
+	return (c == '!');
+}
+
+
+// the '=' operator e.g. class="value"
+bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
+{
+	return (c == '=');
+}
+
+
+
 // the slash at the end <img src=".." /> (without '>' character)
 // we assume the size of the mark to be one
 bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
@@ -1061,18 +1063,33 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
 }
 
 
-bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+//{
+//static wchar_t comm_open[] = L"<!--";
+//size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
+//
+//	//return IsNameEqual(pchar, comm_open, comm_open_len);
+//	return false;
+//}
+//
+//
+//size_t HTMLFilter::OpeningCommentaryTagMarkSize()
+//{
+//	return 4; // size of "<!--"
+//}
+
+
+bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
 {
-static wchar_t comm_open[] = L"<!--";
-size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
+	static wchar_t comm_end[] = L"-->";
+	size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
 
-	return IsNameEqual(pchar, comm_open, comm_open_len);
-}
+	if( str.size() >= comm_end_len )
+	{
+		return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
+	}
 
-
-size_t HTMLFilter::OpeningCommentaryTagMarkSize()
-{
-	return 4; // size of "<!--"
+	return false;
 }
 
 
@@ -1092,9 +1109,9 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
 // skipping the commentary tag if exists
 bool HTMLFilter::SkipCommentaryTagIfExists()
 {
-static wchar_t comm_close[] = L"-->";
+wchar_t comm_close[] = L"-->";
 size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
-
+/*
 	if( !IsOpeningCommentaryTagMark(pchar) )
 		return false;
 
@@ -1108,86 +1125,81 @@ size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
 		pchar += comm_close_len;
 
 	CheckNewLine();
+*/
+
 
 return true;
 }
 
 
-void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
-{
-	if( trim_white )
-	{
-		// skipping all white chars (with new lines)
-		// but with remembering the last non white character
-		for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
-			if( *pchar == 10 )
-				last_non_white = pchar;
-	}
-	else
-	{
-		// skipping first white chars with only one line between them
-		SkipWhite();
-		last_non_white = pchar;
-
-		if( *pchar == 10 )
-		{
-			++pchar;
-			SkipWhite();
-		}
-	}
-
-	start = pchar;
-
-	// exception for the commentary tag
-	if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
-	{
-		PutNewLine();
-		PutTabs(stack_len);
-	}
-}
-
-
-
 // reading text between html tags
 void HTMLFilter::ReadNormalText()
 {
-const wchar_t * start = pchar;
-const wchar_t * last_non_white = pchar;
+	bool was_non_white_text = false;
 
-	if( last_new_line )
-		ReadNormalTextSkipWhite(start, last_non_white);
+	was_ending_commentary = false;
 
+	bool allow_put_new_line = false;
+	bool allow_put_space = false;
 
-	while( *pchar != 0 )
+	if( white_mode == WHITE_MODE_TREE )
 	{
-		const wchar_t * commentary_start = pchar;
-
-		if( SkipCommentaryTagIfExists() )
+		if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
 		{
-			last_non_white = pchar - 1; // pointing at the last '>' from a commentary
-			PutNormalText(start, commentary_start);
-
-			if( !skip_commentaries )
-			{
-				PutNormalText(commentary_start, pchar);
-			}
-
-			start = pchar;
-		}
-		else
-		{
-			if( IsOpeningTagMark(*pchar) )
-				break;
-
-			if( !IsWhite(*pchar) )
-				last_non_white = pchar;
-
-			pchar += 1;
+			allow_put_new_line = true;
 		}
 	}
 
-	last_new_line = (*last_non_white == 10);
-	PutNormalText(start, pchar);
+	while( lastc != -1 && !IsOpeningTagMark(lastc) )
+	{
+		tmp_text.clear();
+		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
+
+		if( !tmp_text.empty() )
+		{
+			allow_put_new_line = false;
+			allow_put_space = false;
+			was_non_white_text = true;
+		}
+
+		if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
+		{
+			if( lastc == 10 || IsWhite(lastc) )
+			{
+				SkipWhiteLines();
+				PutNonBreakingSpace();
+			}
+		}
+		else
+		{
+			if( was_ending_commentary )
+				break;
+
+			if( PutNormalWhite() && white_mode == WHITE_MODE_TREE )
+			{
+				if( last_new_line )
+				{
+					allow_put_new_line = true;
+					allow_put_space = false;
+
+					LastItem().new_line_in_the_middle = true;
+
+					if( !was_non_white_text )
+						LastItem().new_line = true;
+				}
+				else
+				{
+					allow_put_new_line = false;
+					allow_put_space = true;
+				}
+
+				if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line )
+				{
+					allow_put_new_line = true;
+				}
+			}
+		}
+	}
 }
 
 
@@ -1197,15 +1209,7 @@ bool HTMLFilter::PrintOpeningItem()
 	if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
 		return true;
 
-	if( last_new_line )
-	{
-		PutNewLine();
-
-		if( stack_len > 1 )
-			PutTabs(stack_len-1);
-	}
-
-return PutOpeningTag();
+	return PutOpeningTag();
 }
 
 
@@ -1226,34 +1230,34 @@ bool HTMLFilter::ReadItemAttr()
 
 	SkipWhiteLines();
 
-	if( *pchar != '=' )
+	if( !IsAttributeAssignmentMark(lastc) ) // '='
 		return true;
 	
 	attr_has_value = true;
-	pchar += 1;				// skipping '='
+	read_char();				// skipping '='
 	SkipWhiteLines();
 
-	bool has_quote = (*pchar == '\"' || *pchar == '\'');
-	wchar_t quote_char = *pchar;
+	bool has_quote = (lastc == '\"' || lastc == '\'');
+	wchar_t quote_char = lastc;
 
 	if( has_quote )
-		pchar += 1;			// skipping the first quote mark
+		read_char();			// skipping the first quote mark
 
 	ReadItemAttrValue(has_quote, quote_char);
 
-	if( has_quote && *pchar == quote_char )
-		pchar += 1;			// skipping the last quote mark
+	if( has_quote && lastc == quote_char )
+		read_char();			// skipping the last quote mark
 
 return true;
 }
 
 
 
-bool HTMLFilter::CheckItemAttr()
+void HTMLFilter::CheckItemLangAttr()
 {
 	if( attr_has_value && IsNameEqual(L"lang", attr_name) )
 	{
-		LastItem().porphans = 0;
+		LastItem().porphans = nullptr;
 
 		if( !attr_value.empty() )
 		{
@@ -1267,8 +1271,6 @@ bool HTMLFilter::CheckItemAttr()
 				LastItem().porphans = &i->second;
 		}
 	}
-
-return true;
 }
 
 
@@ -1301,9 +1303,9 @@ size_t i;
 
 void HTMLFilter::ReadItemClosing()
 {
-	pchar += 1; // skipping '/'
+	read_char(); // skipping '/'
 	SkipWhiteLines();
-	ReadItemName();
+	ReadItemName(LastItem().name);
 	LastItem().type = Item::closing;
 	SkipAndCheckClosingTag();
 
@@ -1316,32 +1318,55 @@ void HTMLFilter::ReadItemSpecial()
 	LastItem().type = Item::special;
 
 	if( !skip_tags )
+	{
+		if( white_mode == WHITE_MODE_TREE && last_new_line )
+		{
+			Put(10);
+			PutTabs(LastItem().tree_index);
+		}
+
 		PutOpeningTagMark();
+	}
 
-	const wchar_t * start = pchar;
-	pchar += 1; // skipping '!'
+	read_char(); // skipping '!'
+	LastItem().name = '!';
+	ReadItemName(LastItem().name, false);
 
-	ReadItemName();
-	SkipAndCheckClosingTag();
-
-	if( !skip_tags && pchar > start )
-		Put(start, pchar);
-
-	// closing tag mark is printed directly from the source
+	if( skip_tags )
+	{
+		SkipAndCheckClosingTag();
+	}
+	else
+	{
+		if( LastItem().is_commentary )
+		{
+			Put(LastItem().name);
+		}
+		else
+		{
+			tmp_text.clear();
+			SkipWhiteLines();
+			SkipAndCheckClosingTag(&tmp_text);
+			Put(LastItem().name);
+			Put(' ');
+			Put(tmp_text);
+			Put('>');
+		}
+	}
 }
 
 
 void HTMLFilter::ReadItemOpening()
 {
 	LastItem().type = Item::opening;
-	ReadItemName();
+	ReadItemName(LastItem().name);
 	
 	if( PrintOpeningItem() )
 	{
 		while( ReadItemAttr() )
 		{
-			if( CheckItemAttr() )
-				PrintItemAttr();
+			CheckItemLangAttr();
+			PrintItemAttr();
 		}
 
 		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
@@ -1368,25 +1393,35 @@ void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
 
 bool HTMLFilter::ReadItem()
 {
-	if( *pchar == 0 )
+	if( lastc == -1 )
 		return false;
 
 	if( !PushStack() )
 		return false;
 
-	pchar += 1;	// skipping the first '<'
-	SkipWhiteLines();
+	if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
+		LastItem().tree_index += 1;
 
-	if( *pchar == '!' )
-		ReadItemSpecial();
+	if( was_ending_commentary )
+	{
+		LastItem().type = Item::closing;
+		LastItem().is_commentary = true;
+		LastItem().name = L"--";
+		was_ending_commentary = false;
+	}
 	else
-	if( *pchar == '/' ) // we have a closing tag (dodac jako metode wirtualna) !!
-		ReadItemClosing();
-	else
-		ReadItemOpening();
+	{
+		read_char();	// skipping the first opening tag mark '<'
+		SkipWhiteLines();
 
-	CheckNewLine();
-	LastItem().new_line = last_new_line;
+		if( IsSpecialTagIndicator(lastc) )
+			ReadItemSpecial();
+		else
+		if( IsClosingTagIndicator(lastc) )
+			ReadItemClosing();
+		else
+			ReadItemOpening();
+	}
 
 	ItemFound();
 
@@ -1556,11 +1591,14 @@ int i;
 	{
 		if( !skip_tags && pstack[z].new_line )
 		{
-			PutNewLine();
-			PutTabs(z);
+			if( white_mode == WHITE_MODE_TREE )
+			{
+				Put(10);
+				PutTabs(pstack[z].tree_index);
+			}
 		}
 
-		PutClosingTag(pstack[z].name.c_str());
+		PutClosingTag(pstack[z]);
 		pstack[z].Clear();
 	}
 
@@ -1576,10 +1614,19 @@ void HTMLFilter::CheckStackPrintRest()
 	while( stack_len-- > 0 )
 	{
 		if( stack_len==0 || pstack[stack_len-1].new_line )
-			PutNewLine();
+		{
+			if( white_mode == WHITE_MODE_TREE )
+			{
+				Put(10);
+				PutTabs(pstack[stack_len-1].tree_index);
+			}
+			else
+			{
+				Put(' ');
+			}
+		}
 
-		PutTabs(stack_len);
-		PutClosingTag(pstack[stack_len].name.c_str());
+		PutClosingTag(pstack[stack_len]);
 	}
 }
 
@@ -1601,16 +1648,19 @@ void HTMLFilter::CheckClosingTags()
 	}
 
 	// there are more than one tag 
-	if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
+	if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
 	{
 		// last closing tag is from the previous one
 		if( !skip_tags && pstack[stack_len-2].new_line )
 		{
-			PutNewLine();
-			PutTabs(stack_len-2);
+			if( white_mode == WHITE_MODE_TREE )
+			{
+				Put(10);
+				PutTabs(pstack[stack_len-2].tree_index);
+			}
 		}
 
-		PutClosingTag(pstack[stack_len-1].name.c_str());
+		PutClosingTag(pstack[stack_len-1]);
 		last_new_line = pstack[stack_len-1].new_line;
 		PopStack();
 		PopStack();
@@ -1624,22 +1674,30 @@ void HTMLFilter::CheckClosingTags()
 
 bool HTMLFilter::PrintRest()
 {
-const wchar_t * start = pchar;
+//const wchar_t * start = pchar;
 
 	// in safe mode we do not print the rest html code
 	if( safe_mode || skip_tags )
 		return false;
 
-	while( *pchar )
-		++pchar;
+	bool was_chars = false;
 
-	if( pchar > start )
+	while( lastc != -1 )
 	{
-		Put(start, pchar);
-		return true;
+		Put(lastc);
+		read_char();
+		was_chars = true;
 	}
 
-return false;
+	return was_chars;
+
+//	if( pchar > start )
+//	{
+//		Put(start, pchar);
+//		return true;
+//	}
+
+//return false;
 }
 
 
@@ -1660,15 +1718,18 @@ void HTMLFilter::ReadLoop()
 				//pstack[stack_len-2].new_line = LastItem().new_line;
 			}
 			else
-			if( trim_white )
+			if( white_mode == WHITE_MODE_TREE )
 			{
 				// one new line after a simple or special tag
 				// (if the tag has level 0 in the tree - it not means that this is a first tag)
-				// for example can be DOCTYPE 
-				PutNewLine(); 
+				// for example can be DOCTYPE
+
+				if( !LastItem().is_commentary )
+					Put(10);
 			}
 
-			PopStack();
+			if( !LastItem().is_commentary )
+				PopStack();
 		}
 		else
 		if( LastItem().type == Item::closing )
@@ -1688,7 +1749,9 @@ void HTMLFilter::ReadLoop()
 
 void HTMLFilter::Read()
 {
-	if( trim_white )
+	read_char(); // put first character to lastc
+
+	if( white_mode != WHITE_MODE_ORIGIN )
 		SkipWhiteLines();
 
 	// it can be some text or white lines before the first html tag (we print it)
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
index 35710d3..6407e0e 100644
--- a/src/html/htmlfilter.h
+++ b/src/html/htmlfilter.h
@@ -42,7 +42,7 @@
 #include <map>
 #include <vector>
 #include <algorithm>
-
+#include "convert/baseparser.h"
 
 
 namespace pt
@@ -90,7 +90,7 @@ namespace pt
 
 	the filter recognizes xml simple tags (with / at the end) such as: <br />
 */
-class HTMLFilter
+class HTMLFilter : public BaseParser
 {
 public:
 
@@ -111,27 +111,22 @@ public:
 	void Filter(const std::wstring & in, std::wstring & out);
 
 
-	// insert a white space into long words
-	// (only between html tags)
-	// skipped in such tags: script, pre, textarea
-	// break_after - after how many characters insert a space (0 - off)
-	void BreakWord(size_t break_after_);
+	const static int WHITE_MODE_ORIGIN = 0;
+	const static int WHITE_MODE_SINGLE_LINE = 1;
+	const static int WHITE_MODE_TREE = 2;
 
-	// insert a new line character into long lines
-	// (only between html tags)
+
+	// white chars mode
+	//
+	void white_chars_mode(int mode);
+
+	// if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char)
+	// (only between html tags and only in <body> subtree)
 	// skipped in such tags: script, pre, textarea
-	// wrap_line - after how many characters wrap a line (0 - off)
+	// 0 - off
 	// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
 	void WrapLine(size_t wrap_line_);
 
-	// trimming white characters (with new lines)
-	// at the beginning, at the end and in the middle of a string
-	// only between html tags
-	// at the beginning and at the end only one space is left
-	// skipped in such tags: script, pre, textarea
-	// false by default
-	void TrimWhite(bool trim);
-
 	// first tabs in a tree
 	// default: 2 (spaces)
 	// set 0 to turn off
@@ -207,9 +202,14 @@ protected:
 			none
 		} type;
 
+		bool is_commentary;
+
 		// is there a new line after this tag
 		bool new_line;
 
+		// is there a new line anywhere in the text read while this item is the last one on the stack
+		bool new_line_in_the_middle;
+
 		// current orphans table
 		// (will be propagated)
 		Orphans * porphans;
@@ -218,6 +218,8 @@ protected:
 		// (will be propagated)
 		bool has_body_tag;
 
+		size_t tree_index;
+
 		void Clear();
 		Item();
 	};
@@ -235,12 +237,16 @@ protected:
 
 	virtual bool IsOpeningTagMark(wchar_t c);
 	virtual bool IsClosingTagMark(wchar_t c);
+	virtual bool IsClosingTagIndicator(wchar_t c);
+	virtual bool IsSpecialTagIndicator(wchar_t c);
+	virtual bool IsAttributeAssignmentMark(wchar_t c);
 	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
 	virtual bool IsStartingEntityMark(wchar_t c);
 	virtual bool IsEndingEntityMark(wchar_t c);
 
-	virtual bool   IsOpeningCommentaryTagMark(const wchar_t * str);
-	virtual size_t OpeningCommentaryTagMarkSize();
+//	virtual bool   IsOpeningCommentaryTagMark(const wchar_t * str);
+//	virtual size_t OpeningCommentaryTagMarkSize();
+	virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
 
 	virtual bool IsValidCharForName(int c);
 	virtual bool IsValidCharForAttrName(int c);
@@ -249,7 +255,6 @@ protected:
 	virtual bool SkipCommentaryTagIfExists();
 
 	virtual void Put(wchar_t c);
-	virtual void Put(const wchar_t * str);
 	virtual void Put(const wchar_t * str, const wchar_t * end);
 	virtual void Put(const std::wstring & str);
 	virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
@@ -257,10 +262,7 @@ protected:
 	virtual void PutOpeningTagMark();
 	virtual void PutClosingTagMark();
 	virtual bool PutOpeningTag();
-	virtual void PutClosingTag(const wchar_t * tag);
-
-	virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
-	virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
+	virtual void PutClosingTag(const Item & item);
 
 	virtual void ItemFound();
 	virtual void EntityFound(const wchar_t * str, const wchar_t * end);
@@ -299,9 +301,8 @@ protected:
 	void SkipWhite();
 	void SkipWhiteLines();
 	void SkipWhiteWithFirstNewLine();
-	void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
 	bool IsClosingTagForLastItem();
-	void SkipAndCheckClosingTag();
+	void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
 
 	void PopStack();
 	bool PushStack();
@@ -312,13 +313,13 @@ protected:
 	void ReadNormalText();
 	bool PrintRest();
 	bool PrintOpeningItem();
-	void ReadItemName();
+	void ReadItemName(std::wstring & name, bool clear_name = true);
 	void ReadItemAttrName();
-	void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end);
+	void ReadItemAttrValueAdd(const std::wstring & str);
 	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
 
 	bool ReadItemAttr();
-	bool CheckItemAttr();
+	void CheckItemLangAttr();
 	void PrintItemAttr();
 
 	void ReadItemClosing();
@@ -330,27 +331,23 @@ protected:
 
 	void CheckChar(wchar_t c);
 
-	void CheckLineWrap();
-	bool HasEntityEndAround(const wchar_t * str, const wchar_t * end);
-	void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
-	void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
+	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
+	bool PutNormalWhite();
 	void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
 	void PutTabs(size_t len);
 	void PutNonBreakingSpace();
-	void PutNewLine();
 	void CalcOrphansMaxLen(Orphans & orphans);
 
-	const wchar_t * pchar;
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
 	bool last_new_line;
-	size_t break_after;		// insert a space into long words after 'break_after' characters
+	int white_mode;
 	size_t wrap_line;		// insert a new line character into long lines
-	bool trim_white;		// trimming white characters
 	size_t tab_size;
+	bool was_ending_commentary;
 	OrphanMode orphan_mode;
 	std::wstring attr_name;
 	std::vector<std::wstring> attr_value;
@@ -365,6 +362,8 @@ protected:
 	bool skip_commentaries;
 	bool skip_entities;
 	bool analyze_entities;
+	std::wstring tmp_text;
+	std::wstring tmp_name;
 };
 
 

From c0e940c5008e2038e551279fa16fdbd2119eacfd Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 21 Jul 2021 11:30:49 +0200
Subject: [PATCH 05/37] fixed improper new line character after <single/>
 items, added Item::new_line_before flag

---
 src/html/htmlfilter.cpp | 174 +++++++++++++++-------------------------
 src/html/htmlfilter.h   |  14 ++--
 2 files changed, 70 insertions(+), 118 deletions(-)

diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp
index 5274950..04888c3 100644
--- a/src/html/htmlfilter.cpp
+++ b/src/html/htmlfilter.cpp
@@ -36,7 +36,7 @@
  */
 
 #include "htmlfilter.h"
-
+#include "convert/text.h"
 
 
 namespace pt
@@ -48,13 +48,14 @@ namespace pt
 void HTMLFilter::Item::Clear()
 {
 	name.clear();
-	type          = none;
-	is_commentary = false;
-	porphans      = nullptr;
-	new_line      = false;
+	type            = none;
+	is_commentary   = false;
+	porphans        = nullptr;
+	new_line_before = false;
+	new_line        = false;
 	new_line_in_the_middle = false;
-	has_body_tag  = false;
-	tree_index    = 0;
+	has_body_tag    = false;
+	tree_index      = 0;
 }
 
 
@@ -74,7 +75,7 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
 
 	stack_len     = 0;
 	out_string    = &out;
-	last_new_line = false;
+	//last_new_line = false;
 	was_ending_commentary = false;
 	line_len      = 0;
 	out_string->clear();
@@ -382,15 +383,15 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
 }
 
 
-void HTMLFilter::CheckNewLine()
-{
-	if( white_mode == WHITE_MODE_TREE )
-	{
-		SkipWhite();
-	}
-
-	last_new_line = (lastc==10);
-}
+//void HTMLFilter::CheckNewLine()
+//{
+//	if( white_mode == WHITE_MODE_TREE )
+//	{
+//		SkipWhite();
+//	}
+//
+//	last_new_line = (lastc==10);
+//}
 
 
 
@@ -440,7 +441,7 @@ void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
 				was_closing_tag = true;
 
 				PopStack();
-				CheckNewLine();
+				//CheckNewLine();
 				break;
 			}
 		}
@@ -857,17 +858,17 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 }
 
 
-bool HTMLFilter::PutNormalWhite()
+void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 {
-	bool was_white_char = false;
-	bool was_new_line = false;
+	was_white_char = false;
+	was_new_line = false;
 
 	while( lastc == 10 || IsWhite(lastc) )
 	{
-		was_white_char = true; // anyone white char even new line
-
 		if( lastc == 10 )
 			was_new_line = true;
+		else
+			was_white_char = true;
 
 		if( white_mode == WHITE_MODE_ORIGIN )
 		{
@@ -877,18 +878,12 @@ bool HTMLFilter::PutNormalWhite()
 		read_char();
 	}
 
-	if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char )
+	if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
 	{
 		Put(' ');
 	}
 
-	if( white_mode == WHITE_MODE_TREE && was_new_line )
-	{
-		// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
-	}
-
-	last_new_line = was_new_line;
-	return was_white_char;
+	// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
 }
 
 
@@ -955,7 +950,7 @@ bool HTMLFilter::PutOpeningTag()
 		return false;
 	}
 
-	if( white_mode == WHITE_MODE_TREE && last_new_line )
+	if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
 	{
 		Put(10);
 		PutTabs(LastItem().tree_index);
@@ -991,7 +986,7 @@ void HTMLFilter::PutTabs(size_t len)
 	if( len > 30 )
 		len = 30;
 
-	for(int i=0 ; i < (len*tab_size) ; ++i)
+	for(size_t i=0 ; i < (len*tab_size) ; ++i)
 		(*out_string) += ' '; // we do not add them to 'line_len'
 }
 
@@ -1010,15 +1005,6 @@ void HTMLFilter::PutNonBreakingSpace()
 
 
 
-
-//void HTMLFilter::PutNewLine()
-//{
-//	buffer[0] = 10; // CHECKME for what purpose is this buffer?
-//	Put(10);
-//	line_len = 0;
-//}
-
-
 // we assume the size of the opening mark to be one
 bool HTMLFilter::IsOpeningTagMark(wchar_t c)
 {
@@ -1063,22 +1049,6 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
 }
 
 
-//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
-//{
-//static wchar_t comm_open[] = L"<!--";
-//size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
-//
-//	//return IsNameEqual(pchar, comm_open, comm_open_len);
-//	return false;
-//}
-//
-//
-//size_t HTMLFilter::OpeningCommentaryTagMarkSize()
-//{
-//	return 4; // size of "<!--"
-//}
-
-
 bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
 {
 	static wchar_t comm_end[] = L"-->";
@@ -1106,35 +1076,12 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
 
 
 
-// skipping the commentary tag if exists
-bool HTMLFilter::SkipCommentaryTagIfExists()
-{
-wchar_t comm_close[] = L"-->";
-size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
-/*
-	if( !IsOpeningCommentaryTagMark(pchar) )
-		return false;
-
-	pchar += OpeningCommentaryTagMarkSize();
-
-	// looking for "-->"
-	while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
-		++pchar;
-
-	if( *pchar!= 0 )
-		pchar += comm_close_len;
-
-	CheckNewLine();
-*/
-
-
-return true;
-}
-
-
 // reading text between html tags
-void HTMLFilter::ReadNormalText()
+void HTMLFilter::ReadText()
 {
+	bool was_white_char = false;
+	bool was_new_line = false;
+
 	bool was_non_white_text = false;
 
 	was_ending_commentary = false;
@@ -1175,13 +1122,16 @@ void HTMLFilter::ReadNormalText()
 			if( was_ending_commentary )
 				break;
 
-			if( PutNormalWhite() && white_mode == WHITE_MODE_TREE )
+			PutNormalWhite(was_white_char, was_new_line);
+
+			if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE )
 			{
-				if( last_new_line )
+				allow_put_new_line = false;
+				allow_put_space = false;
+
+				if( was_new_line )
 				{
 					allow_put_new_line = true;
-					allow_put_space = false;
-
 					LastItem().new_line_in_the_middle = true;
 
 					if( !was_non_white_text )
@@ -1189,7 +1139,6 @@ void HTMLFilter::ReadNormalText()
 				}
 				else
 				{
-					allow_put_new_line = false;
 					allow_put_space = true;
 				}
 
@@ -1200,6 +1149,8 @@ void HTMLFilter::ReadNormalText()
 			}
 		}
 	}
+
+	new_item_has_new_line_before = was_new_line;
 }
 
 
@@ -1319,7 +1270,7 @@ void HTMLFilter::ReadItemSpecial()
 
 	if( !skip_tags )
 	{
-		if( white_mode == WHITE_MODE_TREE && last_new_line )
+		if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
 		{
 			Put(10);
 			PutTabs(LastItem().tree_index);
@@ -1351,6 +1302,13 @@ void HTMLFilter::ReadItemSpecial()
 			Put(' ');
 			Put(tmp_text);
 			Put('>');
+
+			if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
+			{
+				Put(10);
+				Put(10);
+				SkipWhiteLines();
+			}
 		}
 	}
 }
@@ -1399,6 +1357,8 @@ bool HTMLFilter::ReadItem()
 	if( !PushStack() )
 		return false;
 
+	LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
+
 	if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
 		LastItem().tree_index += 1;
 
@@ -1602,7 +1562,7 @@ int i;
 		pstack[z].Clear();
 	}
 
-	last_new_line = pstack[stack_len-1].new_line;
+	//last_new_line = pstack[stack_len-1].new_line;
 
 	// invalidate tags
 	stack_len = i;
@@ -1661,7 +1621,7 @@ void HTMLFilter::CheckClosingTags()
 		}
 
 		PutClosingTag(pstack[stack_len-1]);
-		last_new_line = pstack[stack_len-1].new_line;
+		//last_new_line = pstack[stack_len-1].new_line;
 		PopStack();
 		PopStack();
 	}
@@ -1711,27 +1671,17 @@ void HTMLFilter::ReadLoop()
 			CheckExceptions();
 		}
 		else
-		if( LastItem().type == Item::special || LastItem().type == Item::simple )
+		if( LastItem().type == Item::special )
 		{
-			if( stack_len > 1 )
-			{
-				//pstack[stack_len-2].new_line = LastItem().new_line;
-			}
-			else
-			if( white_mode == WHITE_MODE_TREE )
-			{
-				// one new line after a simple or special tag
-				// (if the tag has level 0 in the tree - it not means that this is a first tag)
-				// for example can be DOCTYPE
-
-				if( !LastItem().is_commentary )
-					Put(10);
-			}
-
 			if( !LastItem().is_commentary )
 				PopStack();
 		}
 		else
+		if( LastItem().type == Item::simple )
+		{
+			PopStack();
+		}
+		else
 		if( LastItem().type == Item::closing )
 		{
 			CheckClosingTags();
@@ -1741,7 +1691,8 @@ void HTMLFilter::ReadLoop()
 			PopStack();
 		}
 
-		ReadNormalText();
+		ReadText();
+		is_first_item = false;
 	}
 }
 
@@ -1750,12 +1701,13 @@ void HTMLFilter::ReadLoop()
 void HTMLFilter::Read()
 {
 	read_char(); // put first character to lastc
+	is_first_item = true;
 
 	if( white_mode != WHITE_MODE_ORIGIN )
 		SkipWhiteLines();
 
 	// it can be some text or white lines before the first html tag (we print it)
-	ReadNormalText();
+	ReadText();
 
 	// reading the whole html source
 	ReadLoop();
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
index 6407e0e..4b20ef4 100644
--- a/src/html/htmlfilter.h
+++ b/src/html/htmlfilter.h
@@ -204,6 +204,8 @@ protected:
 
 		bool is_commentary;
 
+		bool new_line_before;
+
 		// is there a new line after this tag
 		bool new_line;
 
@@ -244,15 +246,12 @@ protected:
 	virtual bool IsStartingEntityMark(wchar_t c);
 	virtual bool IsEndingEntityMark(wchar_t c);
 
-//	virtual bool   IsOpeningCommentaryTagMark(const wchar_t * str);
-//	virtual size_t OpeningCommentaryTagMarkSize();
 	virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
 
 	virtual bool IsValidCharForName(int c);
 	virtual bool IsValidCharForAttrName(int c);
 	virtual bool IsValidCharForEntityName(int c);
 	virtual void CheckExceptions();
-	virtual bool SkipCommentaryTagIfExists();
 
 	virtual void Put(wchar_t c);
 	virtual void Put(const wchar_t * str, const wchar_t * end);
@@ -306,11 +305,10 @@ protected:
 
 	void PopStack();
 	bool PushStack();
-	void CheckNewLine();
 	void CheckStackPrintRest();
 	void AddForgottenTags();
 	void CheckClosingTags();
-	void ReadNormalText();
+	void ReadText();
 	bool PrintRest();
 	bool PrintOpeningItem();
 	void ReadItemName(std::wstring & name, bool clear_name = true);
@@ -332,7 +330,7 @@ protected:
 	void CheckChar(wchar_t c);
 
 	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
-	bool PutNormalWhite();
+	void PutNormalWhite(bool & was_white_char, bool & was_new_line);
 	void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
 	void PutTabs(size_t len);
 	void PutNonBreakingSpace();
@@ -343,8 +341,10 @@ protected:
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
-	bool last_new_line;
+	//bool last_new_line;
+	bool new_item_has_new_line_before;
 	int white_mode;
+	bool is_first_item;
 	size_t wrap_line;		// insert a new line character into long lines
 	size_t tab_size;
 	bool was_ending_commentary;

From f6df8bc1bcf7b1f1095a6cda16fd6bec088c226f Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 21 Jul 2021 15:57:46 +0200
Subject: [PATCH 06/37] HTMLFilter: added a std::vector<int> stack for a
 current white mode - white chars mode can be changed by such tags:
 <textarea>, <pre>, <script>, <nofilter>

---
 src/html/bbcodeparser.cpp |  17 ++--
 src/html/htmlfilter.cpp   | 164 ++++++++++++++++----------------------
 src/html/htmlfilter.h     |  13 ++-
 3 files changed, 89 insertions(+), 105 deletions(-)

diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp
index ec39de6..254de60 100644
--- a/src/html/bbcodeparser.cpp
+++ b/src/html/bbcodeparser.cpp
@@ -463,14 +463,15 @@ bool first_tag_removed = false;
 	{
 		if( IsOpeningTagMark(lastc) )
 		{
-			if( IsClosingTagForLastItem() )
-			{
-				// the last tag is skipped when using patterns with %2 or %u2
-
-				PopStack(); // removing opening tag from the stack
-				first_tag_removed = true;
-				break;
-			}
+			// FIXME
+//			if( IsClosingTagForLastItem() )
+//			{
+//				// the last tag is skipped when using patterns with %2 or %u2
+//
+//				PopStack(); // removing opening tag from the stack
+//				first_tag_removed = true;
+//				break;
+//			}
 		}
 		else
 		{
diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp
index 04888c3..9084462 100644
--- a/src/html/htmlfilter.cpp
+++ b/src/html/htmlfilter.cpp
@@ -41,7 +41,9 @@
 
 namespace pt
 {
-
+const int HTMLFilter::WHITE_MODE_ORIGIN;
+const int HTMLFilter::WHITE_MODE_SINGLE_LINE;
+const int HTMLFilter::WHITE_MODE_TREE;
 
 
 
@@ -197,6 +199,15 @@ void HTMLFilter::InsertTabs(size_t tabsize)
 }
 
 
+int HTMLFilter::current_white_char_mode()
+{
+	if( !white_char_mode_tab.empty() )
+		return white_char_mode_tab.back();
+
+	return WHITE_MODE_ORIGIN;
+}
+
+
 void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
 {
 size_t i;
@@ -396,76 +407,6 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
 
 
 
-bool HTMLFilter::IsClosingTagForLastItem()
-{
-	read_char();
-	SkipWhite();
-
-	if( lastc == '/' )
-	{
-		read_char();
-		SkipWhite();
-
-		ReadItemName(tmp_name);
-
-		if( IsNameEqual(tmp_name, LastItem().name) )
-		{
-			SkipWhite();
-
-			if( IsClosingTagMark(lastc) )
-			{
-				read_char();
-				return true;
-			}
-		}
-	}
-
-return false;
-}
-
-
-
-
-// used for such tags as: script, pre, textarea
-void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
-{
-	bool was_closing_tag = false;
-	tmp_text.clear();
-
-	while( lastc != -1 )
-	{
-		if( IsOpeningTagMark(lastc) )
-		{
-			if( IsClosingTagForLastItem() )
-			{
-				was_closing_tag = true;
-
-				PopStack();
-				//CheckNewLine();
-				break;
-			}
-		}
-		else
-		{
-			tmp_text += lastc;
-			read_char();
-		}
-	}
-
-	Put(tmp_text);
-
-	if( was_closing_tag && put_closing_tag_as_well )
-	{
-		Put('<');
-		Put('/');
-		Put(tmp_name);
-		Put('>');
-	}
-}
-
-
-
-
 void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
 {
 	bool is_quoted = false;
@@ -870,7 +811,7 @@ void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 		else
 			was_white_char = true;
 
-		if( white_mode == WHITE_MODE_ORIGIN )
+		if( current_white_char_mode() == WHITE_MODE_ORIGIN )
 		{
 			Put(lastc);
 		}
@@ -878,7 +819,7 @@ void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 		read_char();
 	}
 
-	if( white_mode == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
+	if( current_white_char_mode() == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
 	{
 		Put(' ');
 	}
@@ -950,7 +891,7 @@ bool HTMLFilter::PutOpeningTag()
 		return false;
 	}
 
-	if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
+	if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
 	{
 		Put(10);
 		PutTabs(LastItem().tree_index);
@@ -1089,7 +1030,7 @@ void HTMLFilter::ReadText()
 	bool allow_put_new_line = false;
 	bool allow_put_space = false;
 
-	if( white_mode == WHITE_MODE_TREE )
+	if( current_white_char_mode() == WHITE_MODE_TREE )
 	{
 		if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
 		{
@@ -1124,7 +1065,7 @@ void HTMLFilter::ReadText()
 
 			PutNormalWhite(was_white_char, was_new_line);
 
-			if( (was_white_char || was_new_line) && white_mode == WHITE_MODE_TREE )
+			if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
 			{
 				allow_put_new_line = false;
 				allow_put_space = false;
@@ -1270,7 +1211,7 @@ void HTMLFilter::ReadItemSpecial()
 
 	if( !skip_tags )
 	{
-		if( white_mode == WHITE_MODE_TREE && LastItem().new_line_before )
+		if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
 		{
 			Put(10);
 			PutTabs(LastItem().tree_index);
@@ -1303,7 +1244,7 @@ void HTMLFilter::ReadItemSpecial()
 			Put(tmp_text);
 			Put('>');
 
-			if( is_first_item && white_mode == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
+			if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
 			{
 				Put(10);
 				Put(10);
@@ -1383,6 +1324,7 @@ bool HTMLFilter::ReadItem()
 			ReadItemOpening();
 	}
 
+	// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
 	ItemFound();
 
 return true;
@@ -1491,7 +1433,7 @@ bool HTMLFilter::IsLastTag(const std::wstring & name)
 
 
 // checking exceptions for opening tags
-void HTMLFilter::CheckExceptions()
+void HTMLFilter::CheckSingleItemExceptions()
 {
 	if( IsLastTag(L"meta")	||
 		IsLastTag(L"input")	||
@@ -1508,21 +1450,47 @@ void HTMLFilter::CheckExceptions()
 		return;
 	}
 
-	// in safe_mode the script tag is ignored
-	if( !safe_mode && IsLastTag(L"script") )
-		PutEverythingUntilClosingTag(!skip_tags);
-
-	if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
-		PutEverythingUntilClosingTag(!skip_tags);
-
-	if( IsLastTag(no_filter_tag) )
-		PutEverythingUntilClosingTag(false);
-
+	// move me to a better place
 	if( IsLastTag(L"body") )
 		LastItem().has_body_tag = true;
 }
 
 
+void HTMLFilter::CheckWhiteCharsExceptions(Item & item)
+{
+	bool change_white_mode = false;
+
+	// in safe_mode the script tag is ignored
+	if( !safe_mode && IsNameEqual(item.name, L"script") )
+	{
+		change_white_mode = true;
+	}
+
+	if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
+	{
+		change_white_mode = true;
+	}
+
+	if( IsNameEqual(item.name, no_filter_tag) )
+	{
+		change_white_mode = true;
+	}
+
+	if( change_white_mode )
+	{
+		if( item.type == Item::opening )
+		{
+			white_char_mode_tab.push_back(WHITE_MODE_ORIGIN);
+		}
+		else
+		{
+			if( !white_char_mode_tab.empty() )
+				white_char_mode_tab.pop_back();
+		}
+	}
+}
+
+
 
 
 void HTMLFilter::AddForgottenTags()
@@ -1549,9 +1517,11 @@ int i;
 
 	for(int z=(int)stack_len-2 ; z>=i ; --z)
 	{
+		CheckWhiteCharsExceptions(pstack[z]);
+
 		if( !skip_tags && pstack[z].new_line )
 		{
-			if( white_mode == WHITE_MODE_TREE )
+			if( current_white_char_mode() == WHITE_MODE_TREE )
 			{
 				Put(10);
 				PutTabs(pstack[z].tree_index);
@@ -1575,7 +1545,7 @@ void HTMLFilter::CheckStackPrintRest()
 	{
 		if( stack_len==0 || pstack[stack_len-1].new_line )
 		{
-			if( white_mode == WHITE_MODE_TREE )
+			if( current_white_char_mode() == WHITE_MODE_TREE )
 			{
 				Put(10);
 				PutTabs(pstack[stack_len-1].tree_index);
@@ -1610,10 +1580,12 @@ void HTMLFilter::CheckClosingTags()
 	// there are more than one tag 
 	if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
 	{
+		CheckWhiteCharsExceptions(pstack[stack_len-1]);
+
 		// last closing tag is from the previous one
 		if( !skip_tags && pstack[stack_len-2].new_line )
 		{
-			if( white_mode == WHITE_MODE_TREE )
+			if( current_white_char_mode() == WHITE_MODE_TREE )
 			{
 				Put(10);
 				PutTabs(pstack[stack_len-2].tree_index);
@@ -1668,7 +1640,8 @@ void HTMLFilter::ReadLoop()
 	{
 		if( LastItem().type == Item::opening )
 		{
-			CheckExceptions();
+			CheckSingleItemExceptions();
+			CheckWhiteCharsExceptions(LastItem());
 		}
 		else
 		if( LastItem().type == Item::special )
@@ -1703,7 +1676,10 @@ void HTMLFilter::Read()
 	read_char(); // put first character to lastc
 	is_first_item = true;
 
-	if( white_mode != WHITE_MODE_ORIGIN )
+	white_char_mode_tab.clear();
+	white_char_mode_tab.push_back(white_mode);
+
+	if( current_white_char_mode() != WHITE_MODE_ORIGIN )
 		SkipWhiteLines();
 
 	// it can be some text or white lines before the first html tag (we print it)
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
index 4b20ef4..b24e925 100644
--- a/src/html/htmlfilter.h
+++ b/src/html/htmlfilter.h
@@ -251,7 +251,9 @@ protected:
 	virtual bool IsValidCharForName(int c);
 	virtual bool IsValidCharForAttrName(int c);
 	virtual bool IsValidCharForEntityName(int c);
-	virtual void CheckExceptions();
+
+	virtual void CheckSingleItemExceptions();
+	virtual void CheckWhiteCharsExceptions(Item & item);
 
 	virtual void Put(wchar_t c);
 	virtual void Put(const wchar_t * str, const wchar_t * end);
@@ -300,7 +302,9 @@ protected:
 	void SkipWhite();
 	void SkipWhiteLines();
 	void SkipWhiteWithFirstNewLine();
-	bool IsClosingTagForLastItem();
+
+	int current_white_char_mode();
+
 	void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
 
 	void PopStack();
@@ -331,7 +335,7 @@ protected:
 
 	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
 	void PutNormalWhite(bool & was_white_char, bool & was_new_line);
-	void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
+
 	void PutTabs(size_t len);
 	void PutNonBreakingSpace();
 	void CalcOrphansMaxLen(Orphans & orphans);
@@ -341,6 +345,9 @@ protected:
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
+
+	std::vector<int> white_char_mode_tab;
+
 	//bool last_new_line;
 	bool new_item_has_new_line_before;
 	int white_mode;

From fdfd0b13857599e9aab6d79b0ea85363a92e4bf2 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Fri, 6 Aug 2021 17:10:19 +0200
Subject: [PATCH 07/37] renamed: HTMLFilter -> HTMLParser

---
 src/Makefile.dep                            |   5 +-
 src/html/bbcodeparser.h                     |   6 +-
 src/html/{htmlfilter.cpp => htmlparser.cpp} | 211 ++++++++++----------
 src/html/{htmlfilter.h => htmlparser.h}     |  10 +-
 4 files changed, 117 insertions(+), 115 deletions(-)
 rename src/html/{htmlfilter.cpp => htmlparser.cpp} (83%)
 rename src/html/{htmlfilter.h => htmlparser.h} (98%)

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 84ddfa4..1561a29 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -46,5 +46,6 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlfilter.o: ./html/htmlfilter.h
-./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlfilter.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h convert/text.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
+./html/bbcodeparser.o: convert/baseparser.h
diff --git a/src/html/bbcodeparser.h b/src/html/bbcodeparser.h
index bd36e4d..a2e2e7f 100644
--- a/src/html/bbcodeparser.h
+++ b/src/html/bbcodeparser.h
@@ -38,13 +38,13 @@
 #ifndef headerfile_winix_core_bbcodeparser
 #define headerfile_winix_core_bbcodeparser
 
-#include "htmlfilter.h"
+#include "htmlparser.h"
 
 namespace pt
 {
 
 
-class BBCODEParser : public HTMLFilter
+class BBCODEParser : public HTMLParser
 {
 
 	struct Tags
@@ -58,7 +58,7 @@ class BBCODEParser : public HTMLFilter
 
 	/*
 		virtual methods
-		(from HTMLFilter class)
+		(from HTMLParser class)
 	*/
 	virtual void Init();
 	virtual void Uninit();
diff --git a/src/html/htmlfilter.cpp b/src/html/htmlparser.cpp
similarity index 83%
rename from src/html/htmlfilter.cpp
rename to src/html/htmlparser.cpp
index 9084462..1187a67 100644
--- a/src/html/htmlfilter.cpp
+++ b/src/html/htmlparser.cpp
@@ -35,19 +35,20 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "htmlfilter.h"
+#include "htmlparser.h"
+
 #include "convert/text.h"
 
 
 namespace pt
 {
-const int HTMLFilter::WHITE_MODE_ORIGIN;
-const int HTMLFilter::WHITE_MODE_SINGLE_LINE;
-const int HTMLFilter::WHITE_MODE_TREE;
+const int HTMLParser::WHITE_MODE_ORIGIN;
+const int HTMLParser::WHITE_MODE_SINGLE_LINE;
+const int HTMLParser::WHITE_MODE_TREE;
 
 
 
-void HTMLFilter::Item::Clear()
+void HTMLParser::Item::Clear()
 {
 	name.clear();
 	type            = none;
@@ -61,14 +62,14 @@ void HTMLFilter::Item::Clear()
 }
 
 
-HTMLFilter::Item::Item()
+HTMLParser::Item::Item()
 {
 	Clear();
 }
 
 
 
-void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
+void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
 	reading_from_file         = false;
 	reading_from_wchar_string = true;
@@ -89,18 +90,18 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
 
 
 
-void HTMLFilter::Init()
+void HTMLParser::Init()
 {
 }
 
 
-void HTMLFilter::Uninit()
+void HTMLParser::Uninit()
 {
 }
 
 
 
-void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
+void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 {
 	if( &in == &out )
 	{
@@ -117,7 +118,7 @@ void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
 }
 
 
-void HTMLFilter::SetSomeDefaults()
+void HTMLParser::SetSomeDefaults()
 {
 	white_mode  = WHITE_MODE_ORIGIN;
 
@@ -132,7 +133,7 @@ void HTMLFilter::SetSomeDefaults()
 }
 
 
-HTMLFilter::HTMLFilter()
+HTMLParser::HTMLParser()
 {
 	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
 	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
@@ -141,7 +142,7 @@ HTMLFilter::HTMLFilter()
 }
 
 
-HTMLFilter::HTMLFilter(const HTMLFilter & f)
+HTMLParser::HTMLParser(const HTMLParser & f)
 {
 	// don't need to copy the stack
 	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
@@ -151,7 +152,7 @@ HTMLFilter::HTMLFilter(const HTMLFilter & f)
 }
 
 
-HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
+HTMLParser & HTMLParser::operator=(const HTMLParser & f)
 {
 	// don't need to copy the stack
 	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
@@ -163,7 +164,7 @@ return *this;
 }
 
 
-HTMLFilter::~HTMLFilter()
+HTMLParser::~HTMLParser()
 {
 	delete [] pstack;
 	delete [] buffer;
@@ -171,7 +172,7 @@ HTMLFilter::~HTMLFilter()
 
 
 
-void HTMLFilter::white_chars_mode(int mode)
+void HTMLParser::white_chars_mode(int mode)
 {
 	if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE )
 		white_mode = mode;
@@ -180,7 +181,7 @@ void HTMLFilter::white_chars_mode(int mode)
 
 
 
-void HTMLFilter::WrapLine(size_t wrap_line_)
+void HTMLParser::WrapLine(size_t wrap_line_)
 {
 	wrap_line = wrap_line_;
 
@@ -190,7 +191,7 @@ void HTMLFilter::WrapLine(size_t wrap_line_)
 
 
 
-void HTMLFilter::InsertTabs(size_t tabsize)
+void HTMLParser::InsertTabs(size_t tabsize)
 {
 	tab_size = tabsize;
 
@@ -199,7 +200,7 @@ void HTMLFilter::InsertTabs(size_t tabsize)
 }
 
 
-int HTMLFilter::current_white_char_mode()
+int HTMLParser::current_white_char_mode()
 {
 	if( !white_char_mode_tab.empty() )
 		return white_char_mode_tab.back();
@@ -208,7 +209,7 @@ int HTMLFilter::current_white_char_mode()
 }
 
 
-void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
+void HTMLParser::CalcOrphansMaxLen(Orphans & orphans)
 {
 size_t i;
 
@@ -222,7 +223,7 @@ size_t i;
 }
 
 
-void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
+void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
 {
 	lang_code_lower = lang_code;
 	ToLower(lang_code_lower);
@@ -236,13 +237,13 @@ void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std:
 
 
 
-void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
+void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
 {
 	AssignOrphans(lang_code.c_str(), otab);
 }
 
 
-void HTMLFilter::ClearOrphans()
+void HTMLParser::ClearOrphans()
 {
 	orphans_tab.clear();
 }
@@ -250,7 +251,7 @@ void HTMLFilter::ClearOrphans()
 
 
 
-void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
+void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str)
 {
 	if( orphan_mode_str == L"160" )
 		orphan_mode = orphan_160space;
@@ -259,24 +260,24 @@ void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
 }
 
 
-void HTMLFilter::SafeMode(bool safe_mode_)
+void HTMLParser::SafeMode(bool safe_mode_)
 {
 	safe_mode = safe_mode_;
 }
 
 
-void HTMLFilter::SkipTags(bool skip_tags)
+void HTMLParser::SkipTags(bool skip_tags)
 {
 	this->skip_tags = skip_tags;
 }
 
-void HTMLFilter::SkipCommentaries(bool skip_commentaries)
+void HTMLParser::SkipCommentaries(bool skip_commentaries)
 {
 	this->skip_commentaries = skip_commentaries;
 }
 
 
-void HTMLFilter::SkipEntities(bool skip_entities)
+void HTMLParser::SkipEntities(bool skip_entities)
 {
 	this->skip_entities = skip_entities;
 
@@ -287,13 +288,13 @@ void HTMLFilter::SkipEntities(bool skip_entities)
 }
 
 
-void HTMLFilter::AnalyzeEntities(bool analyze_entities)
+void HTMLParser::AnalyzeEntities(bool analyze_entities)
 {
 	this->analyze_entities = analyze_entities;
 }
 
 
-void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
+void HTMLParser::SetNoFilterTag(const std::wstring & tag_name)
 {
 	no_filter_tag = tag_name;
 }
@@ -301,7 +302,7 @@ void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
 
 
 
-HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
+HTMLParser::Item & HTMLParser::GetItem(size_t i)
 {
 	if( i >= stack_len )
 	{
@@ -313,7 +314,7 @@ return pstack[i];
 }
 
 
-HTMLFilter::Item & HTMLFilter::LastItem()
+HTMLParser::Item & HTMLParser::LastItem()
 {
 	if( stack_len == 0 )
 	{
@@ -325,7 +326,7 @@ return pstack[stack_len-1];
 }
 
 
-bool HTMLFilter::PushStack()
+bool HTMLParser::PushStack()
 {
 	if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
 		// oops, too many items
@@ -346,7 +347,7 @@ bool HTMLFilter::PushStack()
 return true;
 }
 
-void HTMLFilter::PopStack()
+void HTMLParser::PopStack()
 {
 	if( stack_len == 0 )
 		// oops
@@ -357,7 +358,7 @@ void HTMLFilter::PopStack()
 }
 
 
-bool HTMLFilter::IsWhite(int c)
+bool HTMLParser::IsWhite(int c)
 {
 	// dont use c==10 here
 
@@ -368,21 +369,21 @@ return false;
 }
 
 
-void HTMLFilter::SkipWhite()
+void HTMLParser::SkipWhite()
 {
 	while( IsWhite(lastc) )
 		read_char();
 }
 
 
-void HTMLFilter::SkipWhiteLines()
+void HTMLParser::SkipWhiteLines()
 {
 	while( lastc==10 || IsWhite(lastc) )
 		read_char();
 }
 
 
-void HTMLFilter::SkipWhiteWithFirstNewLine()
+void HTMLParser::SkipWhiteWithFirstNewLine()
 {
 	SkipWhite();
 
@@ -394,7 +395,7 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
 }
 
 
-//void HTMLFilter::CheckNewLine()
+//void HTMLParser::CheckNewLine()
 //{
 //	if( white_mode == WHITE_MODE_TREE )
 //	{
@@ -407,7 +408,7 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
 
 
 
-void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
+void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 {
 	bool is_quoted = false;
 	wchar_t quote_char = 0;
@@ -450,7 +451,7 @@ void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
 
 
 
-bool HTMLFilter::IsValidCharForName(int c)
+bool HTMLParser::IsValidCharForName(int c)
 {
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
@@ -462,7 +463,7 @@ return false;
 }
 
 
-bool HTMLFilter::IsValidCharForAttrName(int c)
+bool HTMLParser::IsValidCharForAttrName(int c)
 {
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
@@ -474,7 +475,7 @@ return false;
 }
 
 
-bool HTMLFilter::IsValidCharForEntityName(int c)
+bool HTMLParser::IsValidCharForEntityName(int c)
 {
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
@@ -486,7 +487,7 @@ return false;
 }
 
 
-void HTMLFilter::ReadItemName(std::wstring & name, bool clear_name)
+void HTMLParser::ReadItemName(std::wstring & name, bool clear_name)
 {
 size_t i;
 
@@ -513,7 +514,7 @@ size_t i;
 
 
 
-void HTMLFilter::ReadItemAttrName()
+void HTMLParser::ReadItemAttrName()
 {
 size_t i;
 
@@ -530,7 +531,7 @@ size_t i;
 
 
 
-void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str)
+void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str)
 {
 	if( analyze_entities )
 	{
@@ -544,7 +545,7 @@ void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str)
 }
 
 
-void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
+void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 {
 	attr_value.clear();
 	tmp_text.clear();
@@ -585,7 +586,7 @@ void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 }
 
 
-void HTMLFilter::CheckChar(wchar_t c)
+void HTMLParser::CheckChar(wchar_t c)
 {
 	if( c == 10 )
 		line_len = 0;
@@ -594,14 +595,14 @@ void HTMLFilter::CheckChar(wchar_t c)
 }
 
 
-void HTMLFilter::Put(wchar_t c)
+void HTMLParser::Put(wchar_t c)
 {
 	(*out_string) += c;
 	CheckChar(c);
 }
 
 
-void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
+void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 {
 	if( str >= end )
 		return;
@@ -615,7 +616,7 @@ void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
 
 
 
-void HTMLFilter::Put(const std::wstring & str)
+void HTMLParser::Put(const std::wstring & str)
 {
 	if( !str.empty() )
 	{
@@ -628,7 +629,7 @@ void HTMLFilter::Put(const std::wstring & str)
 
 
 // out can be null
-void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
+void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
 {
 	size_t epsilon = 8; // !! IMPROVE ME put as a constant
 	const wchar_t * old_str = str;
@@ -680,7 +681,7 @@ void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end,
 
 
 
-int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
+int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
 {
 size_t res;
 
@@ -704,7 +705,7 @@ return -int(ToLower(*orphan));
 
 
 // binary search in table (table should be sorted)
-bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
+bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
 {
 int res;
 
@@ -749,7 +750,7 @@ return false;
 }
 
 
-bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
+bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end)
 {
 	if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
 		return false;
@@ -763,7 +764,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
 }
 
 
-void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
+void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
 {
 	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
 	{
@@ -799,7 +800,7 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 }
 
 
-void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
+void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 {
 	was_white_char = false;
 	was_new_line = false;
@@ -829,13 +830,13 @@ void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 
 
 
-void HTMLFilter::PutOpeningTagMark()
+void HTMLParser::PutOpeningTagMark()
 {
 	Put('<');
 }
 
 
-void HTMLFilter::PutClosingTagMark()
+void HTMLParser::PutClosingTagMark()
 {
 	Put('>');
 }
@@ -845,7 +846,7 @@ void HTMLFilter::PutClosingTagMark()
 
 // !! IMPROVE ME change to a better name
 // this functions does not return true when the tag is safe
-bool HTMLFilter::IsTagSafe(const wchar_t * tag)
+bool HTMLParser::IsTagSafe(const wchar_t * tag)
 {
 	if( !safe_mode )
 		return true;
@@ -874,7 +875,7 @@ return true;
 }
 
 
-bool HTMLFilter::IsTagSafe(const std::wstring & tag)
+bool HTMLParser::IsTagSafe(const std::wstring & tag)
 {
 	return IsTagSafe(tag.c_str());
 }
@@ -883,7 +884,7 @@ bool HTMLFilter::IsTagSafe(const std::wstring & tag)
 
 
 
-bool HTMLFilter::PutOpeningTag()
+bool HTMLParser::PutOpeningTag()
 {
 	if( !IsTagSafe(LastItem().name) )
 	{
@@ -905,7 +906,7 @@ return true;
 
 
 
-void HTMLFilter::PutClosingTag(const Item & item)
+void HTMLParser::PutClosingTag(const Item & item)
 {
 	if( skip_tags || !IsTagSafe(item.name) )
 		return;
@@ -922,7 +923,7 @@ void HTMLFilter::PutClosingTag(const Item & item)
 
 
 
-void HTMLFilter::PutTabs(size_t len)
+void HTMLParser::PutTabs(size_t len)
 {
 	if( len > 30 )
 		len = 30;
@@ -932,7 +933,7 @@ void HTMLFilter::PutTabs(size_t len)
 }
 
 
-void HTMLFilter::PutNonBreakingSpace()
+void HTMLParser::PutNonBreakingSpace()
 {
 	if( orphan_mode == orphan_nbsp )
 	{
@@ -947,35 +948,35 @@ void HTMLFilter::PutNonBreakingSpace()
 
 
 // we assume the size of the opening mark to be one
-bool HTMLFilter::IsOpeningTagMark(wchar_t c)
+bool HTMLParser::IsOpeningTagMark(wchar_t c)
 {
 	return (c == '<');
 }
 
 
 // we assume the size of the closing mark to be one
-bool HTMLFilter::IsClosingTagMark(wchar_t c)
+bool HTMLParser::IsClosingTagMark(wchar_t c)
 {
 	return (c == '>');
 }
 
 
 // the slash in the closing tag mark e.g. </p>
-bool HTMLFilter::IsClosingTagIndicator(wchar_t c)
+bool HTMLParser::IsClosingTagIndicator(wchar_t c)
 {
 	return (c == '/');
 }
 
 
 // the slash in the closing tag mark e.g. </p>
-bool HTMLFilter::IsSpecialTagIndicator(wchar_t c)
+bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
 {
 	return (c == '!');
 }
 
 
 // the '=' operator e.g. class="value"
-bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
+bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
 {
 	return (c == '=');
 }
@@ -984,13 +985,13 @@ bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
 
 // the slash at the end <img src=".." /> (without '>' character)
 // we assume the size of the mark to be one
-bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
+bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
 {
 	return (c == '/');
 }
 
 
-bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
+bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
 {
 	static wchar_t comm_end[] = L"-->";
 	size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
@@ -1004,13 +1005,13 @@ bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str
 }
 
 
-bool HTMLFilter::IsStartingEntityMark(wchar_t c)
+bool HTMLParser::IsStartingEntityMark(wchar_t c)
 {
 	return (c == '&');
 }
 
 
-bool HTMLFilter::IsEndingEntityMark(wchar_t c)
+bool HTMLParser::IsEndingEntityMark(wchar_t c)
 {
 	return (c == ';');
 }
@@ -1018,7 +1019,7 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
 
 
 // reading text between html tags
-void HTMLFilter::ReadText()
+void HTMLParser::ReadText()
 {
 	bool was_white_char = false;
 	bool was_new_line = false;
@@ -1096,7 +1097,7 @@ void HTMLFilter::ReadText()
 
 
 
-bool HTMLFilter::PrintOpeningItem()
+bool HTMLParser::PrintOpeningItem()
 {
 	if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
 		return true;
@@ -1108,7 +1109,7 @@ bool HTMLFilter::PrintOpeningItem()
 
 
 
-bool HTMLFilter::ReadItemAttr()
+bool HTMLParser::ReadItemAttr()
 {
 	attr_has_value = false;
 	attr_name.clear();
@@ -1145,7 +1146,7 @@ return true;
 
 
 
-void HTMLFilter::CheckItemLangAttr()
+void HTMLParser::CheckItemLangAttr()
 {
 	if( attr_has_value && IsNameEqual(L"lang", attr_name) )
 	{
@@ -1166,7 +1167,7 @@ void HTMLFilter::CheckItemLangAttr()
 }
 
 
-void HTMLFilter::PrintItemAttr()
+void HTMLParser::PrintItemAttr()
 {
 size_t i;
 
@@ -1193,7 +1194,7 @@ size_t i;
 }
 
 
-void HTMLFilter::ReadItemClosing()
+void HTMLParser::ReadItemClosing()
 {
 	read_char(); // skipping '/'
 	SkipWhiteLines();
@@ -1205,7 +1206,7 @@ void HTMLFilter::ReadItemClosing()
 }
 
 
-void HTMLFilter::ReadItemSpecial()
+void HTMLParser::ReadItemSpecial()
 {
 	LastItem().type = Item::special;
 
@@ -1255,7 +1256,7 @@ void HTMLFilter::ReadItemSpecial()
 }
 
 
-void HTMLFilter::ReadItemOpening()
+void HTMLParser::ReadItemOpening()
 {
 	LastItem().type = Item::opening;
 	ReadItemName(LastItem().name);
@@ -1281,16 +1282,16 @@ void HTMLFilter::ReadItemOpening()
 }
 
 
-void HTMLFilter::ItemFound()
+void HTMLParser::ItemFound()
 {
 }
 
-void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
+void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end)
 {
 }
 
 
-bool HTMLFilter::ReadItem()
+bool HTMLParser::ReadItem()
 {
 	if( lastc == -1 )
 		return false;
@@ -1332,7 +1333,7 @@ return true;
 
 
 
-wchar_t HTMLFilter::ToLower(wchar_t c)
+wchar_t HTMLParser::ToLower(wchar_t c)
 {
 	if( c>='A' && c<='Z' )
 		return c - 'A' + 'a';
@@ -1341,7 +1342,7 @@ return c;
 }
 
 
-void HTMLFilter::ToLower(std::wstring & str)
+void HTMLParser::ToLower(std::wstring & str)
 {
 size_t i;
 
@@ -1350,7 +1351,7 @@ size_t i;
 }
 
 
-bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
+bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
 {
 	for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
 		if( ToLower(*name1) != ToLower(*name2) )
@@ -1363,19 +1364,19 @@ return false;
 }
 
 
-bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
+bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
 {
 	return IsNameEqual(name1, name2.c_str());
 }
 
 
-bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
 {
 	return IsNameEqual(name1.c_str(), name2);
 }
 
 
-bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
 {
 	return IsNameEqual(name1.c_str(), name2.c_str());
 }
@@ -1385,7 +1386,7 @@ bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & na
 // len characters from both strings must be equal
 // IMPROVE ME change name to something like IsBeginningNameEqual
 // and move to text.h (pikotools)
-bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
+bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
 {
 	for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
 		if( ToLower(*name1) != ToLower(*name2) )
@@ -1399,19 +1400,19 @@ return false;
 
 
 
-bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
+bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
 {
 	return IsNameEqual(name1, name2.c_str(), len);
 }
 
 
-bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
 {
 	return IsNameEqual(name1.c_str(), name2, len);
 }
 
 
-bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
 {
 	return IsNameEqual(name1.c_str(), name2.c_str(), len);
 }
@@ -1420,20 +1421,20 @@ bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & na
 
 
 
-bool HTMLFilter::IsLastTag(const wchar_t * name)
+bool HTMLParser::IsLastTag(const wchar_t * name)
 {
 	return IsNameEqual(name, LastItem().name);
 }
 
 
-bool HTMLFilter::IsLastTag(const std::wstring & name)
+bool HTMLParser::IsLastTag(const std::wstring & name)
 {
 	return IsNameEqual(name, LastItem().name);
 }
 
 
 // checking exceptions for opening tags
-void HTMLFilter::CheckSingleItemExceptions()
+void HTMLParser::CheckSingleItemExceptions()
 {
 	if( IsLastTag(L"meta")	||
 		IsLastTag(L"input")	||
@@ -1456,7 +1457,7 @@ void HTMLFilter::CheckSingleItemExceptions()
 }
 
 
-void HTMLFilter::CheckWhiteCharsExceptions(Item & item)
+void HTMLParser::CheckWhiteCharsExceptions(Item & item)
 {
 	bool change_white_mode = false;
 
@@ -1493,7 +1494,7 @@ void HTMLFilter::CheckWhiteCharsExceptions(Item & item)
 
 
 
-void HTMLFilter::AddForgottenTags()
+void HTMLParser::AddForgottenTags()
 {
 int i;
 
@@ -1539,7 +1540,7 @@ int i;
 }
 
 
-void HTMLFilter::CheckStackPrintRest()
+void HTMLParser::CheckStackPrintRest()
 {
 	while( stack_len-- > 0 )
 	{
@@ -1561,7 +1562,7 @@ void HTMLFilter::CheckStackPrintRest()
 }
 
 
-void HTMLFilter::CheckClosingTags()
+void HTMLParser::CheckClosingTags()
 {
 	if( stack_len == 0 )
 		return;
@@ -1604,7 +1605,7 @@ void HTMLFilter::CheckClosingTags()
 }
 
 
-bool HTMLFilter::PrintRest()
+bool HTMLParser::PrintRest()
 {
 //const wchar_t * start = pchar;
 
@@ -1634,7 +1635,7 @@ bool HTMLFilter::PrintRest()
 
 
 
-void HTMLFilter::ReadLoop()
+void HTMLParser::ReadLoop()
 {
 	while( ReadItem() )
 	{
@@ -1671,7 +1672,7 @@ void HTMLFilter::ReadLoop()
 
 
 
-void HTMLFilter::Read()
+void HTMLParser::Read()
 {
 	read_char(); // put first character to lastc
 	is_first_item = true;
diff --git a/src/html/htmlfilter.h b/src/html/htmlparser.h
similarity index 98%
rename from src/html/htmlfilter.h
rename to src/html/htmlparser.h
index b24e925..7e42eea 100644
--- a/src/html/htmlfilter.h
+++ b/src/html/htmlparser.h
@@ -90,7 +90,7 @@ namespace pt
 
 	the filter recognizes xml simple tags (with / at the end) such as: <br />
 */
-class HTMLFilter : public BaseParser
+class HTMLParser : public BaseParser
 {
 public:
 
@@ -100,10 +100,10 @@ public:
 		orphan_160space		// putting 160 ascii code
 	};
 
-	HTMLFilter();
-	HTMLFilter(const HTMLFilter & f);
-	HTMLFilter & operator=(const HTMLFilter & f);
-	virtual ~HTMLFilter();
+	HTMLParser();
+	HTMLParser(const HTMLParser & f);
+	HTMLParser & operator=(const HTMLParser & f);
+	virtual ~HTMLParser();
 
 
 	// main methods used for filtering

From 8c5ede5cf395ab8807ca5854f1dd2557df4ed1d7 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 7 Aug 2021 02:13:13 +0200
Subject: [PATCH 08/37] HTMLParser: for <script> and <!-- (comments) we copy the
 content without parsing

---
 src/html/htmlparser.cpp | 243 ++++++++++++++++++++++++++++++----------
 src/html/htmlparser.h   |  17 ++-
 2 files changed, 194 insertions(+), 66 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 1187a67..7b422f5 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -71,6 +71,7 @@ HTMLParser::Item::Item()
 
 void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
+	parsing_html              = true;
 	reading_from_file         = false;
 	reading_from_wchar_string = true;
 	pchar_unicode             = in;
@@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 	stack_len     = 0;
 	out_string    = &out;
 	//last_new_line = false;
-	was_ending_commentary = false;
 	line_len      = 0;
 	out_string->clear();
 
@@ -369,17 +369,27 @@ return false;
 }
 
 
-void HTMLParser::SkipWhite()
+void HTMLParser::SkipWhite(std::wstring * out_string)
 {
 	while( IsWhite(lastc) )
+	{
+		if( out_string )
+			(*out_string) += lastc;
+
 		read_char();
+	}
 }
 
 
-void HTMLParser::SkipWhiteLines()
+void HTMLParser::SkipWhiteLines(std::wstring * out_string)
 {
 	while( lastc==10 || IsWhite(lastc) )
+	{
+		if( out_string )
+			(*out_string) += lastc;
+
 		read_char();
+	}
 }
 
 
@@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine()
 
 
 
+
+
 void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 {
 	bool is_quoted = false;
@@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 	{
 		str += lastc;
 		read_char();
-
-		if( IsEndingCommentaryTagMarkAtEndOfString(str) )
-		{
-			str.erase(str.size() - 3); // IMPROVEME define a function or what
-			was_ending_commentary = true;
-			break;
-		}
 	}
 
 	if( !str.empty() )
@@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item)
 	if( skip_tags || !IsTagSafe(item.name) )
 		return;
 
-	if( !item.is_commentary )
+	if( item.is_commentary )
+	{
+		Put('-');
+		Put('-');
+		PutClosingTagMark();
+	}
+	else
 	{
 		PutOpeningTagMark();
 		Put('/');
+		Put(item.name);
+		PutClosingTagMark();
 	}
-
-	Put(item.name);
-	PutClosingTagMark();
 }
 
 
@@ -991,20 +1001,6 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
 }
 
 
-bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
-{
-	static wchar_t comm_end[] = L"-->";
-	size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
-
-	if( str.size() >= comm_end_len )
-	{
-		return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
-	}
-
-	return false;
-}
-
-
 bool HTMLParser::IsStartingEntityMark(wchar_t c)
 {
 	return (c == '&');
@@ -1018,6 +1014,113 @@ bool HTMLParser::IsEndingEntityMark(wchar_t c)
 
 
 
+// used for html comments: copies the input verbatim up to and including the closing "-->"
+void HTMLParser::ReadTextUntilClosingCommentary()
+{
+	while( lastc != -1 )
+	{
+		if( lastc == '-' )
+		{
+			tmp_text.clear();
+			tmp_text += lastc;
+			read_char();
+
+			if( lastc == '-' )
+			{
+				tmp_text += lastc;
+				read_char();
+
+				if( IsClosingTagMark(lastc) )
+				{
+					tmp_text += lastc;
+					read_char();
+					Put(tmp_text);
+
+					break;
+				}
+			}
+
+			Put(tmp_text);
+		}
+		else
+		{
+			Put(lastc);
+			read_char();
+		}
+	}
+}
+
+
+bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
+{
+	tmp_text.clear();
+	tmp_text += lastc; // opening tag mark
+	read_char();
+
+	SkipWhiteLines(&tmp_text);
+
+	if( IsClosingTagIndicator(lastc) )
+	{
+		tmp_text += lastc;
+		read_char();
+		SkipWhiteLines(&tmp_text);
+		ReadItemName(tmp_name);
+
+		if( IsNameEqual(tmp_name, LastItem().name) )
+		{
+			SkipAndCheckClosingTag();
+
+			if( put_closing_tag_as_well )
+			{
+				Put('<');
+				Put('/');
+				Put(tmp_name);
+				Put('>');
+			}
+
+			return true;
+		}
+		else
+		{
+			Put(tmp_text);
+			Put(tmp_name);
+		}
+	}
+	else
+	{
+		Put(tmp_text);
+	}
+
+return false;
+}
+
+
+
+
+// used for such tags as: script, pre, textarea
+void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
+{
+	while( lastc != -1 )
+	{
+		if( IsOpeningTagMark(lastc) )
+		{
+			if( IsClosingTagForLastItem(put_closing_tag_as_well) )
+			{
+				//CheckNewLine();
+				break;
+			}
+		}
+		else
+		{
+			Put(lastc);
+			read_char();
+		}
+	}
+}
+
+
+
+
 // reading text between html tags
 void HTMLParser::ReadText()
 {
@@ -1026,8 +1129,6 @@ void HTMLParser::ReadText()
 
 	bool was_non_white_text = false;
 
-	was_ending_commentary = false;
-
 	bool allow_put_new_line = false;
 	bool allow_put_space = false;
 
@@ -1061,9 +1162,6 @@ void HTMLParser::ReadText()
 		}
 		else
 		{
-			if( was_ending_commentary )
-				break;
-
 			PutNormalWhite(was_white_char, was_new_line);
 
 			if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
@@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem()
 	if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
 		LastItem().tree_index += 1;
 
-	if( was_ending_commentary )
-	{
-		LastItem().type = Item::closing;
-		LastItem().is_commentary = true;
-		LastItem().name = L"--";
-		was_ending_commentary = false;
-	}
-	else
-	{
-		read_char();	// skipping the first opening tag mark '<'
-		SkipWhiteLines();
+	read_char();	// skipping the first opening tag mark '<'
+	SkipWhiteLines();
 
-		if( IsSpecialTagIndicator(lastc) )
-			ReadItemSpecial();
-		else
-		if( IsClosingTagIndicator(lastc) )
-			ReadItemClosing();
-		else
-			ReadItemOpening();
-	}
+	if( IsSpecialTagIndicator(lastc) )
+		ReadItemSpecial();
+	else
+	if( IsClosingTagIndicator(lastc) )
+		ReadItemClosing();
+	else
+		ReadItemOpening();
 
 	// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
 	ItemFound();
@@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
 	bool change_white_mode = false;
 
 	// in safe_mode the script tag is ignored
-	if( !safe_mode && IsNameEqual(item.name, L"script") )
-	{
-		change_white_mode = true;
-	}
-
-	if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
+//	if( !safe_mode && IsNameEqual(item.name, L"script") )
+//	{
+//		change_white_mode = true;
+//	}
+
+//	if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
+//	{
+//		change_white_mode = true;
+//	}
+
+	if( IsNameEqual(item.name, L"pre") )
 	{
 		change_white_mode = true;
 	}
 
+	// move to CheckDifferentContentExceptions?
 	if( IsNameEqual(item.name, no_filter_tag) )
 	{
 		change_white_mode = true;
@@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
 
 
 
+void HTMLParser::CheckDifferentContentExceptions(Item & item)
+{
+	if( !safe_mode && IsNameEqual(item.name, L"script") )
+	{
+		ReadTextUntilClosingTag(true);
+		PopStack();
+	}
+
+	if( IsNameEqual(item.name, L"textarea") )
+	{
+		ReadTextUntilClosingTag(true);
+		PopStack();
+	}
+}
+
+
+
+
+
 
 void HTMLParser::AddForgottenTags()
 {
@@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop()
 	{
 		if( LastItem().type == Item::opening )
 		{
-			CheckSingleItemExceptions();
+			if( parsing_html )
+			{
+				CheckSingleItemExceptions();
+			}
+
 			CheckWhiteCharsExceptions(LastItem());
+			CheckDifferentContentExceptions(LastItem());
 		}
 		else
 		if( LastItem().type == Item::special )
 		{
-			if( !LastItem().is_commentary )
-				PopStack();
+			if( LastItem().is_commentary )
+				ReadTextUntilClosingCommentary();
+
+			PopStack();
 		}
 		else
 		if( LastItem().type == Item::simple )
@@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop()
 		}
 
 		ReadText();
+
 		is_first_item = false;
 	}
 }
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 7e42eea..8bf6969 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -172,6 +172,12 @@ public:
 
 protected:
 
+	/*
+	 * true when parsing html input, false for parsing xml
+	 */
+	bool parsing_html;
+
+
 	// orphans for one language
 	struct Orphans
 	{
@@ -246,14 +252,13 @@ protected:
 	virtual bool IsStartingEntityMark(wchar_t c);
 	virtual bool IsEndingEntityMark(wchar_t c);
 
-	virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
-
 	virtual bool IsValidCharForName(int c);
 	virtual bool IsValidCharForAttrName(int c);
 	virtual bool IsValidCharForEntityName(int c);
 
 	virtual void CheckSingleItemExceptions();
 	virtual void CheckWhiteCharsExceptions(Item & item);
+	virtual void CheckDifferentContentExceptions(Item & item);
 
 	virtual void Put(wchar_t c);
 	virtual void Put(const wchar_t * str, const wchar_t * end);
@@ -299,12 +304,15 @@ protected:
 	bool CheckOrphan(const wchar_t * str, const wchar_t * end);
 
 	bool IsWhite(int c);
-	void SkipWhite();
-	void SkipWhiteLines();
+	void SkipWhite(std::wstring * out_string = nullptr);
+	void SkipWhiteLines(std::wstring * out_string = nullptr);
 	void SkipWhiteWithFirstNewLine();
 
 	int current_white_char_mode();
 
+	void ReadTextUntilClosingCommentary();
+	bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
+	void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
 	void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
 
 	void PopStack();
@@ -354,7 +362,6 @@ protected:
 	bool is_first_item;
 	size_t wrap_line;		// insert a new line character into long lines
 	size_t tab_size;
-	bool was_ending_commentary;
 	OrphanMode orphan_mode;
 	std::wstring attr_name;
 	std::vector<std::wstring> attr_value;

From 7fcfdac52fff18328e4786bac5c7d25ba40adc79 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 7 Aug 2021 21:19:38 +0200
Subject: [PATCH 09/37] Space: added pretty_print parameter to some json
 serializing methods

---
 src/space/space.cpp | 16 ++++++++--------
 src/space/space.h   |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/space/space.cpp b/src/space/space.cpp
index ca816f3..5ad6dba 100644
--- a/src/space/space.cpp
+++ b/src/space/space.cpp
@@ -1474,35 +1474,35 @@ void Space::serialize_to_space_to(std::wstring & str, bool pretty_print) const
 
 
 
-std::string Space::serialize_to_json_str() const
+std::string Space::serialize_to_json_str(bool pretty_print) const
 {
 	std::string str;
-	serialize_to_json_to(str);
+	serialize_to_json_to(str, pretty_print);
 	return str;
 }
 
 
-std::wstring Space::serialize_to_json_wstr() const
+std::wstring Space::serialize_to_json_wstr(bool pretty_print) const
 {
 	std::wstring str;
-	serialize_to_json_to(str);
+	serialize_to_json_to(str, pretty_print);
 	return str;
 }
 
 
-void Space::serialize_to_json_to(std::string & str) const
+void Space::serialize_to_json_to(std::string & str, bool pretty_print) const
 {
 	TextStream stream;
-	serialize_to_json_stream(stream);
+	serialize_to_json_stream(stream, pretty_print);
 
 	stream.to_str(str);
 }
 
 
-void Space::serialize_to_json_to(std::wstring & str) const
+void Space::serialize_to_json_to(std::wstring & str, bool pretty_print) const
 {
 	WTextStream stream;
-	serialize_to_json_stream(stream);
+	serialize_to_json_stream(stream, pretty_print);
 
 	stream.to_str(str);
 }
diff --git a/src/space/space.h b/src/space/space.h
index 02241d4..a943cee 100644
--- a/src/space/space.h
+++ b/src/space/space.h
@@ -555,10 +555,10 @@ public:
 
 
 
-	std::string  serialize_to_json_str() const;
-	std::wstring serialize_to_json_wstr() const;
-	void serialize_to_json_to(std::string & str) const;
-	void serialize_to_json_to(std::wstring & str) const;
+	std::string  serialize_to_json_str(bool pretty_print = false) const;
+	std::wstring serialize_to_json_wstr(bool pretty_print = false) const;
+	void serialize_to_json_to(std::string & str, bool pretty_print = false) const;
+	void serialize_to_json_to(std::wstring & str, bool pretty_print = false) const;
 
 
 	template<typename StreamType>

From b8a03bf85245bb08cf83f55e76d21126d40e5623 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 7 Aug 2021 21:21:16 +0200
Subject: [PATCH 10/37] HTMLParser: added possibility to parse html to Space
 class; added method: HTMLParser::parse_html(const wchar_t * in, Space & space)

---
 src/html/htmlparser.cpp | 133 +++++++++++++++++++++++++++++++++++++---
 src/html/htmlparser.h   |  14 ++++-
 2 files changed, 138 insertions(+), 9 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 7b422f5..43e7d57 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -59,6 +59,7 @@ void HTMLParser::Item::Clear()
 	new_line_in_the_middle = false;
 	has_body_tag    = false;
 	tree_index      = 0;
+	space           = nullptr;
 }
 
 
@@ -68,6 +69,27 @@ HTMLParser::Item::Item()
 }
 
 
+void HTMLParser::parse_html(const wchar_t * in, Space & space)
+{
+	parsing_html              = true;
+	reading_from_file         = false;
+	reading_from_wchar_string = true;
+	pchar_unicode             = in;
+	pchar_ascii               = 0;
+
+	stack_len     = 0;
+	out_string    = nullptr;
+	out_space     = &space;
+	//last_new_line = false;
+	line_len      = 0;
+	out_space->clear();
+
+	Init();
+	Read();
+	Uninit();
+}
+
+
 
 void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
@@ -79,6 +101,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 
 	stack_len     = 0;
 	out_string    = &out;
+	out_space     = nullptr;
 	//last_new_line = false;
 	line_len      = 0;
 	out_string->clear();
@@ -347,6 +370,8 @@ bool HTMLParser::PushStack()
 return true;
 }
 
+
+
 void HTMLParser::PopStack()
 {
 	if( stack_len == 0 )
@@ -609,7 +634,9 @@ void HTMLParser::CheckChar(wchar_t c)
 
 void HTMLParser::Put(wchar_t c)
 {
-	(*out_string) += c;
+	if( out_string )
+		(*out_string) += c;
+
 	CheckChar(c);
 }
 
@@ -620,7 +647,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 		return;
 
 	size_t len = end - str;
-	out_string->append(str, len);
+
+	if( out_string )
+		out_string->append(str, len);
 
 	for( ; str < end ; ++str)
 		CheckChar(*str);
@@ -632,7 +661,8 @@ void HTMLParser::Put(const std::wstring & str)
 {
 	if( !str.empty() )
 	{
-		out_string->append(str);
+		if( out_string )
+			out_string->append(str);
 
 		for(size_t i=0 ; i < str.size() ; ++i)
 			CheckChar(str[i]);
@@ -805,7 +835,7 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 }
 
 
-void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
+void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
 {
 	was_white_char = false;
 	was_new_line = false;
@@ -817,6 +847,9 @@ void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 		else
 			was_white_char = true;
 
+		if( result_text )
+			(*result_text) += lastc;
+
 		if( current_white_char_mode() == WHITE_MODE_ORIGIN )
 		{
 			Put(lastc);
@@ -939,7 +972,10 @@ void HTMLParser::PutTabs(size_t len)
 		len = 30;
 
 	for(size_t i=0 ; i < (len*tab_size) ; ++i)
-		(*out_string) += ' '; // we do not add them to 'line_len'
+	{
+		if( out_string )
+			(*out_string) += ' '; // we do not add them to 'line_len'
+	}
 }
 
 
@@ -1140,6 +1176,18 @@ void HTMLParser::ReadText()
 		}
 	}
 
+	Space * text_space = nullptr;
+	std::wstring * text_space_wstr = nullptr;
+
+	if( out_space )
+	{
+		text_space = &text_space_tmp;
+		text_space->clear();
+		text_space->add(L"name", L"");
+		Space & wstr_space = text_space->add(L"text", L"");
+		text_space_wstr = &wstr_space.value.value_wstring;
+	}
+
 	while( lastc != -1 && !IsOpeningTagMark(lastc) )
 	{
 		tmp_text.clear();
@@ -1150,19 +1198,22 @@ void HTMLParser::ReadText()
 			allow_put_new_line = false;
 			allow_put_space = false;
 			was_non_white_text = true;
+
+			if( text_space_wstr )
+				(*text_space_wstr) += tmp_text;
 		}
 
 		if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
 		{
 			if( lastc == 10 || IsWhite(lastc) )
 			{
-				SkipWhiteLines();
+				SkipWhiteLines(text_space_wstr);
 				PutNonBreakingSpace();
 			}
 		}
 		else
 		{
-			PutNormalWhite(was_white_char, was_new_line);
+			PutNormalWhite(was_white_char, was_new_line, text_space_wstr);
 
 			if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
 			{
@@ -1190,6 +1241,12 @@ void HTMLParser::ReadText()
 		}
 	}
 
+	if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
+	{
+		AddSpaceToSpaceTree(*text_space);
+	}
+
+	text_space_tmp.clear();
 	new_item_has_new_line_before = was_new_line;
 }
 
@@ -1292,6 +1349,28 @@ size_t i;
 }
 
 
+void HTMLParser::PutItemAttrToSpace()
+{
+	Space * space = LastItem().space;
+
+	if( space )
+	{
+		Space & attr_tab = space->get_add_space(L"attr");
+		Space & attr = attr_tab.add_empty_space(attr_name);
+
+		if( attr_has_value )
+		{
+			attr.set_empty_table();
+
+			for(size_t i=0 ; i < attr_value.size() ; ++i)
+			{
+				attr.add(attr_value[i]);
+			}
+		}
+	}
+}
+
+
 void HTMLParser::ReadItemClosing()
 {
 	read_char(); // skipping '/'
@@ -1358,13 +1437,19 @@ void HTMLParser::ReadItemOpening()
 {
 	LastItem().type = Item::opening;
 	ReadItemName(LastItem().name);
+	AddItemToSpace();
+	Space * space = LastItem().space;
 	
+	if( space )
+		space->add(L"name", LastItem().name);
+
 	if( PrintOpeningItem() )
 	{
 		while( ReadItemAttr() )
 		{
 			CheckItemLangAttr();
 			PrintItemAttr();
+			PutItemAttrToSpace();
 		}
 
 		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
@@ -1748,6 +1833,36 @@ bool HTMLParser::PrintRest()
 
 
 
+void HTMLParser::AddItemToSpace()
+{
+	if( out_space && stack_len > 0 )
+	{
+		if( stack_len == 1 )
+		{
+			pstack[stack_len-1].space = out_space;
+		}
+		else
+		{
+			// stack_len > 1
+			Space & childs_tab = pstack[stack_len-2].space->get_add_space(L"childs");
+			Space & child = childs_tab.add_empty_space();
+			pstack[stack_len-1].space = &child;
+		}
+	}
+}
+
+
+void HTMLParser::AddSpaceToSpaceTree(const Space & space)
+{
+	if( out_space && stack_len > 0 )
+	{
+		Space & childs_tab = LastItem().space->get_add_space(L"childs");
+		childs_tab.add(space);
+	}
+}
+
+
+
 void HTMLParser::ReadLoop()
 {
 	while( ReadItem() )
@@ -1759,6 +1874,7 @@ void HTMLParser::ReadLoop()
 				CheckSingleItemExceptions();
 			}
 
+
 			CheckWhiteCharsExceptions(LastItem());
 			CheckDifferentContentExceptions(LastItem());
 		}
@@ -1804,7 +1920,8 @@ void HTMLParser::Read()
 	if( current_white_char_mode() != WHITE_MODE_ORIGIN )
 		SkipWhiteLines();
 
-	// it can be some text or white lines before the first html tag (we print it)
+	// there can be some text or white lines before the first html tag (we print them when filtering)
+	// but they are not added to the Space tree
 	ReadText();
 
 	// reading the whole html source
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 8bf6969..9575f93 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -43,6 +43,7 @@
 #include <vector>
 #include <algorithm>
 #include "convert/baseparser.h"
+#include "space/space.h"
 
 
 namespace pt
@@ -106,6 +107,9 @@ public:
 	virtual ~HTMLParser();
 
 
+	void parse_html(const wchar_t * in, Space & space);
+
+
 	// main methods used for filtering
 	void Filter(const wchar_t * in, std::wstring & out);
 	void Filter(const std::wstring & in, std::wstring & out);
@@ -228,6 +232,8 @@ protected:
 
 		size_t tree_index;
 
+		Space * space;
+
 		void Clear();
 		Item();
 	};
@@ -331,6 +337,7 @@ protected:
 	bool ReadItemAttr();
 	void CheckItemLangAttr();
 	void PrintItemAttr();
+	void PutItemAttrToSpace();
 
 	void ReadItemClosing();
 	void ReadItemSpecial();
@@ -342,17 +349,22 @@ protected:
 	void CheckChar(wchar_t c);
 
 	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
-	void PutNormalWhite(bool & was_white_char, bool & was_new_line);
+	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
 
 	void PutTabs(size_t len);
 	void PutNonBreakingSpace();
 	void CalcOrphansMaxLen(Orphans & orphans);
 
+	void AddItemToSpace();
+	void AddSpaceToSpaceTree(const Space & space);
+
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
+	Space * out_space;
+	Space text_space_tmp;
 
 	std::vector<int> white_char_mode_tab;
 

From b1cc64a29b73ab687bfdf85b046c1090a697f129 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 10 Aug 2021 01:45:10 +0200
Subject: [PATCH 11/37] added a compact_mode option when creating a space
 output

---
 src/html/htmlparser.cpp | 77 ++++++++++++++++++++++++++++++++++++-----
 src/html/htmlparser.h   |  4 ++-
 2 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 43e7d57..63dcc61 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -69,13 +69,14 @@ HTMLParser::Item::Item()
 }
 
 
-void HTMLParser::parse_html(const wchar_t * in, Space & space)
+void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
 {
 	parsing_html              = true;
 	reading_from_file         = false;
 	reading_from_wchar_string = true;
 	pchar_unicode             = in;
 	pchar_ascii               = 0;
+	xml_compact_mode          = compact_mode;
 
 	stack_len     = 0;
 	out_string    = nullptr;
@@ -1440,7 +1441,7 @@ void HTMLParser::ReadItemOpening()
 	AddItemToSpace();
 	Space * space = LastItem().space;
 	
-	if( space )
+	if( !xml_compact_mode && space )
 		space->add(L"name", LastItem().name);
 
 	if( PrintOpeningItem() )
@@ -1837,27 +1838,85 @@ void HTMLParser::AddItemToSpace()
 {
 	if( out_space && stack_len > 0 )
 	{
-		if( stack_len == 1 )
+		Space * parent = out_space;
+
+		if( stack_len > 1 )
 		{
-			pstack[stack_len-1].space = out_space;
+			parent = pstack[stack_len-2].space;
+		}
+
+		if( xml_compact_mode )
+		{
+			Space * space = parent->get_space(pstack[stack_len-1].name);
+
+			if( space )
+			{
+				if( space->is_table() )
+				{
+					Space & child = space->add_empty_space();
+					pstack[stack_len-1].space = &child;
+				}
+				else
+				{
+					Space * tab = new Space();
+					tab->add(space);
+					Space & child = tab->add_empty_space();
+
+					parent->value.value_object[pstack[stack_len-1].name] = tab;
+					pstack[stack_len-1].space = &child;
+				}
+			}
+			else
+			{
+				Space & space = parent->add_empty_space(pstack[stack_len-1].name);
+				pstack[stack_len-1].space = &space;
+			}
 		}
 		else
 		{
-			// stack_len > 1
-			Space & childs_tab = pstack[stack_len-2].space->get_add_space(L"childs");
+			Space & childs_tab = parent->get_add_space(L"childs");
 			Space & child = childs_tab.add_empty_space();
 			pstack[stack_len-1].space = &child;
 		}
+
 	}
 }
 
 
 void HTMLParser::AddSpaceToSpaceTree(const Space & space)
 {
-	if( out_space && stack_len > 0 )
+	const std::wstring * text = space.get_wstr(L"text");
+
+	if( out_space && stack_len > 0 && text )
 	{
-		Space & childs_tab = LastItem().space->get_add_space(L"childs");
-		childs_tab.add(space);
+		if( xml_compact_mode )
+		{
+			Space * child_text = LastItem().space->get_space(L"text");
+
+			if( child_text )
+			{
+				if( child_text->is_table() )
+				{
+					child_text->add(*text);
+				}
+				else
+				{
+					Space * tab = new Space();
+					tab->add(*child_text);
+					tab->add(*text);
+					LastItem().space->value.value_object[L"text"] = tab;
+				}
+			}
+			else
+			{
+				LastItem().space->add(L"text", *text);
+			}
+		}
+		else
+		{
+			Space & childs_tab = LastItem().space->get_add_space(L"childs");
+			childs_tab.add(space);
+		}
 	}
 }
 
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 9575f93..c90a3cc 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -107,7 +107,7 @@ public:
 	virtual ~HTMLParser();
 
 
-	void parse_html(const wchar_t * in, Space & space);
+	void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
 
 
 	// main methods used for filtering
@@ -182,6 +182,8 @@ protected:
 	bool parsing_html;
 
 
+	bool xml_compact_mode;
+
 	// orphans for one language
 	struct Orphans
 	{

From 2576eb12d1a3a0ba39ffb1edfd962e8d3253a408 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 10 Aug 2021 21:56:04 +0200
Subject: [PATCH 12/37] HTMLParser: start working on xml mode

added methods:
Status parse_xml_file(const char * file_name,         Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::string & file_name,  Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const wchar_t * file_name,      Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);

---
 src/html/htmlparser.cpp | 138 ++++++++++++++++++++++++++++++++++++----
 src/html/htmlparser.h   |  33 ++++++++++
 2 files changed, 158 insertions(+), 13 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 63dcc61..cf703cc 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 	pchar_ascii               = 0;
 	xml_compact_mode          = compact_mode;
 
+	status = ok;
+	line = 1;
+
 	stack_len     = 0;
 	out_string    = nullptr;
 	out_space     = &space;
@@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 }
 
 
+HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	parsing_html = false;
+	reading_from_file = true;
+	xml_compact_mode          = compact_mode;
+
+	status = ok;
+	line = 1;
+	stack_len     = 0;
+	out_string    = nullptr;
+	line_len      = 0;
+
+	this->out_space = &out_space;
+
+	if( clear_space )
+		this->out_space->clear();
+
+	file.clear();
+	file.open(file_name, std::ios_base::binary | std::ios_base::in);
+
+	if( file )
+	{
+		Init();
+		Read();
+		Uninit();
+
+		file.close();
+	}
+	else
+	{
+		status = cant_open_file;
+	}
+
+	return status;
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	std::string file_name_utf8;
+
+	wide_to_utf8(file_name, file_name_utf8);
+	return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+
 
 void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
@@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 }
 
 
+int HTMLParser::get_last_parsed_line()
+{
+	return line;
+}
+
+
 void HTMLParser::SetSomeDefaults()
 {
 	white_mode  = WHITE_MODE_ORIGIN;
@@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
+		c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
 		return true;
 
 return false;
@@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c==':' )
+		c=='-' || c==':' || c=='_')
 		return true;
 
 return false;
@@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 }
 
 
+void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
+{
+	attr_value.clear();
+	tmp_text.clear();
+
+	while( lastc != -1 )
+	{
+		if( has_quote )
+		{
+			if( lastc == quote_char )
+				break;
+		}
+		else
+		{
+			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+				break;
+		}
+
+		// IMPROVEME add support for analyze_entities?
+		if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+			tmp_text += lastc;
+
+		read_char();
+	}
+}
+
+
+
 void HTMLParser::CheckChar(wchar_t c)
 {
 	if( c == 10 )
@@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
 	return (c == '!');
 }
 
+bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
+{
+	return (c == '?');
+}
 
 // the '=' operator e.g. class="value"
 bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
@@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
 	if( has_quote )
 		read_char();			// skipping the first quote mark
 
-	ReadItemAttrValue(has_quote, quote_char);
+	// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
+	if( parsing_html )
+		ReadItemAttrValue(has_quote, quote_char);
+	else
+		ReadXMLItemAttrValue(has_quote, quote_char);
 
 	if( has_quote && lastc == quote_char )
 		read_char();			// skipping the last quote mark
@@ -1361,11 +1464,18 @@ void HTMLParser::PutItemAttrToSpace()
 
 		if( attr_has_value )
 		{
-			attr.set_empty_table();
-
-			for(size_t i=0 ; i < attr_value.size() ; ++i)
+			if( parsing_html )
 			{
-				attr.add(attr_value[i]);
+				attr.set_empty_table();
+
+				for(size_t i=0 ; i < attr_value.size() ; ++i)
+				{
+					attr.add(attr_value[i]);
+				}
+			}
+			else
+			{
+				attr.set(tmp_text);
 			}
 		}
 	}
@@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
 		PutOpeningTagMark();
 	}
 
-	read_char(); // skipping '!'
-	LastItem().name = '!';
+	LastItem().name = lastc;
+	read_char(); // skipping '!' or '?'
 	ReadItemName(LastItem().name, false);
 
 	if( skip_tags )
@@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
 	read_char();	// skipping the first opening tag mark '<'
 	SkipWhiteLines();
 
-	if( IsSpecialTagIndicator(lastc) )
+	if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
 		ReadItemSpecial();
 	else
 	if( IsClosingTagIndicator(lastc) )
@@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
 
 void HTMLParser::ReadLoop()
 {
-	while( ReadItem() )
+	while( status == ok && ReadItem() )
 	{
 		if( LastItem().type == Item::opening )
 		{
@@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
 				CheckSingleItemExceptions();
 			}
 
-
 			CheckWhiteCharsExceptions(LastItem());
 			CheckDifferentContentExceptions(LastItem());
 		}
@@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
 			PopStack();
 		}
 
-		ReadText();
+		if( status == ok )
+		{
+			ReadText();
+		}
 
 		is_first_item = false;
 	}
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index c90a3cc..940eb39 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -95,12 +95,25 @@ class HTMLParser : public BaseParser
 {
 public:
 
+
+	/*
+		status of parsing
+	*/
+	enum Status { ok, cant_open_file, syntax_error };
+
+
 	enum OrphanMode
 	{
 		orphan_nbsp,		// putting "&nbsp;" string
 		orphan_160space		// putting 160 ascii code
 	};
 
+
+	/*
+		the last status of parsing, set by parse() methods
+	*/
+	Status status;
+
 	HTMLParser();
 	HTMLParser(const HTMLParser & f);
 	HTMLParser & operator=(const HTMLParser & f);
@@ -109,12 +122,30 @@ public:
 
 	void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
 
+	Status parse_xml_file(const char * file_name,         Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const std::string & file_name,  Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const wchar_t * file_name,      Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
+
+
 
 	// main methods used for filtering
 	void Filter(const wchar_t * in, std::wstring & out);
 	void Filter(const std::wstring & in, std::wstring & out);
 
 
+
+	/*
+	 *
+	 * returns a number of a last parsed line
+	 * can be used to obtain the line in which there was a syntax error
+	 *
+	 */
+	int get_last_parsed_line();
+
+
+
+
 	const static int WHITE_MODE_ORIGIN = 0;
 	const static int WHITE_MODE_SINGLE_LINE = 1;
 	const static int WHITE_MODE_TREE = 2;
@@ -255,6 +286,7 @@ protected:
 	virtual bool IsClosingTagMark(wchar_t c);
 	virtual bool IsClosingTagIndicator(wchar_t c);
 	virtual bool IsSpecialTagIndicator(wchar_t c);
+	virtual bool IsXMLSpecialTagIndicator(wchar_t c);
 	virtual bool IsAttributeAssignmentMark(wchar_t c);
 	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
 	virtual bool IsStartingEntityMark(wchar_t c);
@@ -335,6 +367,7 @@ protected:
 	void ReadItemAttrName();
 	void ReadItemAttrValueAdd(const std::wstring & str);
 	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
+	void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);
 
 	bool ReadItemAttr();
 	void CheckItemLangAttr();

From 2cc9dd69a377b1cd18536f63238716550c5ad73b Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Thu, 12 Aug 2021 21:53:52 +0200
Subject: [PATCH 13/37] make depend

---
 src/Makefile.dep   | 11 ++++++++---
 tests/Makefile.dep |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 1561a29..dfd5461 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -37,6 +37,14 @@
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
+./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
+./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
+./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h convert/text.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
@@ -46,6 +54,3 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h convert/text.h
-./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index e83e777..d76bfd2 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -16,7 +16,6 @@
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
 ./csvparser.o: ../src/convert/baseparser.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
-./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
 ./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h
@@ -31,3 +30,4 @@
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
+./test.o: test.h

From 5b2583b566ff2c973ac205436fcbe8aeacea2505 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 2 Oct 2021 18:45:02 +0200
Subject: [PATCH 14/37] fixed in HTMLParser: sometimes a closing item left on
 the stack, for stack_len < 3 there was not PopStack() called

---
 src/html/htmlparser.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index cf703cc..d05817d 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -1808,7 +1808,10 @@ void HTMLParser::AddForgottenTags()
 int i;
 
 	if( stack_len < 3 )
+	{
+		PopStack();
 		return;
+	}
 
 	// we have forgotten to close some tags
 

From f23cabfb2f0f1852ee16790217e26cf53744c517 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 2 Oct 2021 20:34:19 +0200
Subject: [PATCH 15/37] added to HTMLParser: filter_file(...) methods for
 filtering from a file

---
 src/html/htmlparser.cpp | 85 ++++++++++++++++++++++++++++++++++-------
 src/html/htmlparser.h   |  5 +++
 2 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index d05817d..90a32f1 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -140,8 +140,8 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Spa
 HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
 {
 	std::string file_name_utf8;
-
 	wide_to_utf8(file_name, file_name_utf8);
+
 	return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space);
 }
 
@@ -174,18 +174,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 }
 
 
-
-void HTMLParser::Init()
-{
-}
-
-
-void HTMLParser::Uninit()
-{
-}
-
-
-
 void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 {
 	if( &in == &out )
@@ -203,6 +191,77 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 }
 
 
+
+/*
+	filters the html file file_name and puts the result to 'out' ('out' is cleared first),
+	returns cant_open_file if the file cannot be opened, otherwise the status left after reading
+*/
+HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
+{
+	// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
+	file.clear();
+	file.open(file_name, std::ios_base::binary | std::ios_base::in);
+
+	parsing_html = true;
+	reading_from_file = true;
+	line = 1;
+	line_len = 0;
+	stack_len = 0;
+	out_space = nullptr;
+	out_string = &out;
+	out.clear();
+
+	if( !file )
+	{
+		status = cant_open_file;
+		return status;
+	}
+
+	status = ok;
+	Init();
+	Read();
+	Uninit();
+	file.close();
+	return status;
+}
+
+
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
+{
+	return filter_file(file_name.data(), out);
+}
+
+// the wide file name is converted to utf-8 before the file is opened
+HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
+{
+	std::string utf8_name;
+
+	pt::wide_to_utf8(file_name, utf8_name);
+	return filter_file(utf8_name.c_str(), out);
+}
+
+// delegates to the const wchar_t * version
+HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
+{
+	return filter_file(file_name.data(), out);
+}
+
+
+
+
+void HTMLParser::Init()	// empty here, filter_file() calls it just before Read()
+{
+}
+
+// empty here, filter_file() calls it right after Read()
+void HTMLParser::Uninit()
+{
+}
+
+
+
+
+
 int HTMLParser::get_last_parsed_line()
 {
 	return line;
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 940eb39..50df603 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -134,6 +134,11 @@ public:
 	void Filter(const std::wstring & in, std::wstring & out);
 
 
+	HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
+
 
 	/*
 	 *

From abe349be3445b77230c37fcf65f5438c45249a55 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 2 Oct 2021 21:01:09 +0200
Subject: [PATCH 16/37] small refactoring in HTMLParser

---
 src/html/htmlparser.cpp | 93 ++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 67 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 90a32f1..e5d3b66 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -1862,55 +1862,6 @@ void HTMLParser::CheckDifferentContentExceptions(Item & item)
 
 
 
-void HTMLParser::AddForgottenTags()
-{
-int i;
-
-	if( stack_len < 3 )
-	{
-		PopStack();
-		return;
-	}
-
-	// we have forgotten to close some tags
-
-	// looking whether there is a matching opening tag
-	for(i=int(stack_len)-3 ; i>=0 ; --i)
-		if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
-			break;
-
-	if( i < 0 )
-	{
-		// oops, there is no such a tag
-		// we don't print the closing and the missing opening tag
-		PopStack();
-		return;
-	}
-
-	for(int z=(int)stack_len-2 ; z>=i ; --z)
-	{
-		CheckWhiteCharsExceptions(pstack[z]);
-
-		if( !skip_tags && pstack[z].new_line )
-		{
-			if( current_white_char_mode() == WHITE_MODE_TREE )
-			{
-				Put(10);
-				PutTabs(pstack[z].tree_index);
-			}
-		}
-
-		PutClosingTag(pstack[z]);
-		pstack[z].Clear();
-	}
-
-	//last_new_line = pstack[stack_len-1].new_line;
-
-	// invalidate tags
-	stack_len = i;
-}
-
-
 void HTMLParser::CheckStackPrintRest()
 {
 	while( stack_len-- > 0 )
@@ -1935,44 +1886,52 @@ void HTMLParser::CheckStackPrintRest()
 
 void HTMLParser::CheckClosingTags()
 {
+	int i;
+
 	if( stack_len == 0 )
 		return;
 
 	// on the stack we have only opening tags
 	// but only the last tag is a closing tag
 
-	if( stack_len == 1 )
+	if( stack_len < 3 )
 	{
-		// there is only last closing tag
-		// we dont print it
 		PopStack();
 		return;
 	}
 
-	// there are more than one tag 
-	if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
-	{
-		CheckWhiteCharsExceptions(pstack[stack_len-1]);
+	// looking whether there is a matching opening tag
+	for(i=int(stack_len)-2 ; i >= 0 ; --i)
+		if( (pstack[i].is_commentary && pstack[stack_len-1].is_commentary) || IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
+			break;
 
-		// last closing tag is from the previous one
-		if( !skip_tags && pstack[stack_len-2].new_line )
+	if( i < 0 )
+	{
+		// oops, there is no such an opening tag on the stack
+		// we don't print the closing and the missing opening tag
+		PopStack();
+		return;
+	}
+
+	for(int z=(int)stack_len-2 ; z >= i ; --z)
+	{
+		CheckWhiteCharsExceptions(pstack[z]);
+
+		if( !skip_tags && pstack[z].new_line )
 		{
 			if( current_white_char_mode() == WHITE_MODE_TREE )
 			{
 				Put(10);
-				PutTabs(pstack[stack_len-2].tree_index);
+				PutTabs(pstack[z].tree_index);
 			}
 		}
 
-		PutClosingTag(pstack[stack_len-1]);
-		//last_new_line = pstack[stack_len-1].new_line;
-		PopStack();
-		PopStack();
-	}
-	else
-	{
-		AddForgottenTags();
+		PutClosingTag(pstack[z]);
+		pstack[z].Clear();
 	}
+
+	// invalidate items on the stack
+	stack_len = i;
 }
 
 

From 5e4c7e9929d6032561c5f2cdfda484a9036b0779 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 2 Oct 2021 21:01:19 +0200
Subject: [PATCH 17/37] make depend

---
 src/Makefile.dep   | 16 ++++++++--------
 tests/Makefile.dep |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/Makefile.dep b/src/Makefile.dep
index dfd5461..a2e4d54 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -37,14 +37,6 @@
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
-./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
-./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
-./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
-./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
-./html/htmlparser.o: utf8/utf8_private.h convert/text.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
@@ -54,3 +46,11 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
+./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
+./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h convert/text.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
+./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index d76bfd2..e83e777 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -16,6 +16,7 @@
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
 ./csvparser.o: ../src/convert/baseparser.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
+./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
 ./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h
@@ -30,4 +31,3 @@
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
-./test.o: test.h

From 4902eb60377b289172205bb2dc1aed928376639b Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sun, 3 Oct 2021 13:22:49 +0200
Subject: [PATCH 18/37] fixed: in HTMLParser::CheckClosingTags() don't return
 immediately if stack_len is equal to 2

---
 src/html/htmlparser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index e5d3b66..e35f181 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -1894,7 +1894,7 @@ void HTMLParser::CheckClosingTags()
 	// on the stack we have only opening tags
 	// but only the last tag is a closing tag
 
-	if( stack_len < 3 )
+	if( stack_len == 1 )
 	{
 		PopStack();
 		return;

From 17d2c0fb25a85ed44ed7cbe80f5554cfd8c747c4 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 12 Oct 2021 19:53:11 +0200
Subject: [PATCH 19/37] - added some converting methods: esc_to_json(...),
 esc_to_xml(...), esc_to_csv() (convert/misc.h) - BaseParser: added
 possibility to read from TextStream and WTextStream - HTMLParser: added
 filter(const WTextStream & in, Stream & out, ...) method - added
 utf8_stream.h with one method:   template<typename StreamIteratorType>  
 size_t utf8_to_int(     StreamIteratorType & iterator_in,    
 StreamIteratorType & iterator_end,     int & res,     bool & correct)

---
 src/Makefile.dep           |  44 ++++--
 src/convert/baseparser.cpp | 104 ++++++++++++--
 src/convert/baseparser.h   |  39 +++--
 src/convert/misc.cpp       | 285 ++++++++++++++++++++++++++++++++++++-
 src/convert/misc.h         | 137 +++++++++++++++++-
 src/csv/csvparser.cpp      |  20 +--
 src/html/htmlparser.cpp    | 130 ++++++++++-------
 src/html/htmlparser.h      |  17 ++-
 src/space/spaceparser.cpp  |  26 ++--
 src/utf8/utf8.h            |  10 +-
 src/utf8/utf8_stream.h     | 104 ++++++++++++++
 src/utf8/utf8_templates.h  |   9 +-
 tests/Makefile.dep         |  10 +-
 13 files changed, 807 insertions(+), 128 deletions(-)
 create mode 100644 src/utf8/utf8_stream.h

diff --git a/src/Makefile.dep b/src/Makefile.dep
index a2e4d54..7dbbb8e 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -1,16 +1,24 @@
 # DO NOT DELETE
 
 ./convert/inttostr.o: ./convert/inttostr.h
-./convert/misc.o: ./convert/misc.h ./convert/text.h
+./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
+./convert/misc.o: textstream/types.h utf8/utf8_stream.h
+./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
+./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
+./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./convert/misc.o: textstream/types.h ./convert/inttostr.h
 ./convert/text.o: ./convert/text.h ./convert/text_private.h
 ./convert/double.o: ./convert/double.h textstream/textstream.h
 ./convert/double.o: textstream/stream.h space/space.h textstream/types.h
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
-./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
-./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
-./convert/baseparser.o: utf8/utf8_private.h
+./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
+./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
+./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
+./convert/baseparser.o: utf8/utf8_stream.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -28,29 +36,39 @@
 ./space/space.o: convert/patternreplacer.h textstream/textstream.h
 ./space/space.o: textstream/stream.h space/space.h date/date.h
 ./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
-./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
+./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
+./space/space.o: ./convert/double.h
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
 ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
-./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: textstream/textstream.h textstream/stream.h
+./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
+./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
+./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
-./csv/csvparser.o: convert/baseparser.h
+./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
+./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
+./csv/csvparser.o: textstream/types.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
-./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
-./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
-./html/htmlparser.o: utf8/utf8_private.h convert/text.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./html/htmlparser.o: textstream/types.h convert/text.h
 ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
+./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
 ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp
index b95933d..37fbbbf 100644
--- a/src/convert/baseparser.cpp
+++ b/src/convert/baseparser.cpp
@@ -37,7 +37,7 @@
 
 #include "baseparser.h"
 #include "utf8/utf8.h"
-
+#include "utf8/utf8_stream.h"
 
 
 namespace pt
@@ -45,19 +45,27 @@ namespace pt
 
 BaseParser::BaseParser()
 {
-	clear();
+	clear_input_flags();
 }
 
 
-void BaseParser::clear()
+void BaseParser::clear_input_flags()
 {
 	line = 0;
 	reading_from_file = false;
 	pchar_ascii = nullptr;
 	pchar_unicode = nullptr;
-	reading_from_wchar_string = false;
+	wtext_stream_iterator = nullptr;
+	wtext_stream_iterator_end = nullptr;
+	text_stream_iterator = nullptr;
+	text_stream_iterator_end = nullptr;
 	lastc = -1;
 	input_as_utf8 = true;
+
+	if( file.is_open() )
+		file.close();
+
+	file.clear();
 }
 
 
@@ -132,7 +140,6 @@ bool correct;
 		++line;
 
 return lastc;
-
 }
 
 
@@ -150,6 +157,67 @@ return lastc;
 }
 
 
+// reads the next character from the WTextStream iterators into lastc,
+// lastc is set to -1 when the end iterator has been reached
+int BaseParser::read_char_from_wtext_stream()
+{
+	WTextStream::const_iterator & current = *wtext_stream_iterator;
+	lastc = -1;
+
+	if( current != *wtext_stream_iterator_end )
+	{
+		lastc = *current;
+		++current;
+	}
+
+	if( lastc == '\n' )
+		++line;
+	return lastc;
+}
+
+
+// reads one utf-8 encoded character from the TextStream iterators into lastc,
+// incorrect sequences are skipped,
+// lastc is set to -1 if no correct character has been read
+int BaseParser::read_char_from_utf8_text_stream()
+{
+	int c = -1;
+	bool correct = false;	// initialized here so the checks below do not depend on utf8_to_int() always setting it
+
+	do
+	{
+		utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, c, correct);
+	}
+	while( !correct && (*text_stream_iterator) != (*text_stream_iterator_end) );
+
+	lastc = correct ? c : -1;
+
+	if( lastc == '\n' )
+		++line;
+
+	return lastc;
+}
+
+
+// reads the next character (without utf-8 decoding) from the TextStream iterators into lastc,
+// lastc is set to -1 when the end iterator has been reached
+int BaseParser::read_char_from_ascii_text_stream()
+{
+	TextStream::const_iterator & current = *text_stream_iterator;
+	lastc = -1;
+
+	if( current != *text_stream_iterator_end )
+	{
+		lastc = *current;
+		++current;
+	}
+
+	if( lastc == '\n' )
+		++line;
+	return lastc;
+}
+
+
 int BaseParser::read_char_no_escape()
 {
 	if( reading_from_file )
@@ -161,17 +229,33 @@ int BaseParser::read_char_no_escape()
 	}
 	else
 	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
+		if( pchar_ascii )
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_string();
 			else
 				return read_char_from_ascii_string();
 		}
+		else if( pchar_unicode )
+		{
+			return read_char_from_wchar_string();
+		}
+		else if( wtext_stream_iterator && wtext_stream_iterator_end )
+		{
+			return read_char_from_wtext_stream();
+		}
+		else if( text_stream_iterator && text_stream_iterator_end )
+		{
+			if( input_as_utf8 )
+				return read_char_from_utf8_text_stream();
+			else
+				return read_char_from_ascii_text_stream();
+		}
+		else
+		{
+			lastc = -1;
+			return lastc;
+		}
 	}
 }
 
diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h
index 381568f..a8c648d 100644
--- a/src/convert/baseparser.h
+++ b/src/convert/baseparser.h
@@ -40,6 +40,7 @@
 
 #include <string>
 #include <fstream>
+#include "textstream/textstream.h"
 
 
 namespace pt
@@ -51,15 +52,18 @@ protected:
 
 	BaseParser();
 
-	void clear();
+	virtual void clear_input_flags();
 
-	int read_utf8_char();
-	int read_ascii_char();
-	int read_char_from_wchar_string();
-	int read_char_from_utf8_string();
-	int read_char_from_ascii_string();
-	int read_char_no_escape();
-	int read_char();
+	virtual int read_utf8_char();
+	virtual int read_ascii_char();
+	virtual int read_char_from_wchar_string();
+	virtual int read_char_from_utf8_string();
+	virtual int read_char_from_ascii_string();
+	virtual int read_char_from_wtext_stream();
+	virtual int read_char_from_utf8_text_stream();
+	virtual int read_char_from_ascii_text_stream();
+	virtual int read_char_no_escape();
+	virtual int read_char();
 
 
 
@@ -75,6 +79,7 @@ protected:
 	*/
 	bool reading_from_file;
 
+
 	/*
 		pointers to the current character
 		if ParseString() is in used
@@ -84,9 +89,20 @@ protected:
 
 
 	/*
-		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
+		pointers to WTextStream iterators
+		if set then both of them should be set
+	 */
+	WTextStream::const_iterator * wtext_stream_iterator;
+	WTextStream::const_iterator * wtext_stream_iterator_end;
+
+
+	/*
+		pointers to TextStream iterators
+		if set then both of them should be set
+	 */
+	TextStream::const_iterator * text_stream_iterator;
+	TextStream::const_iterator * text_stream_iterator_end;
+
 
 	/*
 		last read char
@@ -112,7 +128,6 @@ protected:
 
 
 
-
 };
 
 }
diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp
index 978cce7..ffdf457 100644
--- a/src/convert/misc.cpp
+++ b/src/convert/misc.cpp
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,8 @@
  */
 
 #include "misc.h"
+#include "inttostr.h"
+#include "utf8/utf8.h"
 
 
 namespace pt
@@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val)
 }
 
 
+/*
+	writes val to 'out' escaped for a JSON string:
+	 - a character with a code below 32 is written as \u followed by its code
+	   converted by Toa() with base 16 and left-padded with '0' to four characters
+	 - a backslash and a double quote are preceded by a backslash,
+	 - any other character is written unchanged
+*/
+void esc_to_json(char val, Stream & out)
+{
+	if( static_cast<unsigned char>(val) < 32 )
+	{
+		char buf[10];
+		size_t len;
+		Toa(static_cast<unsigned char>(val), buf, sizeof(buf)/sizeof(char), 16, &len);
+
+		out << "\\u";
+
+		for(size_t i = len ; i < 4 ; ++i)
+		{
+			out << '0';
+		}
+
+		out << buf;
+	}
+	else
+	{
+		// control characters (0, \r, \n, \t, 0x08, 0x0c) never get here,
+		// they are below 32 and have already been written in the branch above
+		switch( val )
+		{
+		case '\\':
+		case '"':
+			out << '\\';
+			out << val;
+			break;
+		default:
+			out << val;
+		}
+	}
+}
+
+
+/*
+	val is converted to utf-8 first, then every produced byte is escaped with esc_to_json(char, Stream &)
+*/
+void esc_to_json(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_max = sizeof(bytes) / sizeof(char);
+	const size_t bytes_len = int_to_utf8(static_cast<int>(val), bytes, bytes_max);
+
+	for(size_t n = 0 ; n < bytes_len ; ++n)
+		esc_to_json(bytes[n], out);
+}
+
+
+// escapes a null-terminated string,
+// every character goes through esc_to_json(char, Stream &)
+void esc_to_json(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_json(*c, out);
+}
+
+// escapes exactly len characters starting at c (a zero character does not stop the loop)
+void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_json(*c, out);
+	}
+}
+
+// escapes a null-terminated wide string, every character goes through esc_to_json(wchar_t, Stream &)
+void esc_to_json(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+	{
+		esc_to_json(*c, out);
+	}
+}
+
+// escapes exactly len wide characters starting at c (a zero character does not stop the loop)
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_json(*c, out);
+	}
+}
+
+// escapes the whole std::string, its size() is used as the length
+void esc_to_json(const std::string & in, Stream & out)
+{
+	esc_to_json(in.data(), in.size(), out);
+}
+
+// escapes the whole std::wstring, its size() is used as the length
+void esc_to_json(const std::wstring & in, Stream & out)
+{
+	esc_to_json(in.data(), in.size(), out);
+}
+
+
+
+
+
+
+/*
+	writes val to 'out' escaped for XML: < > & " are replaced with &lt; &gt; &amp; &quot;
+	other characters are written unchanged (what about zero (null) character?)
+*/
+void esc_to_xml(char val, Stream & out)
+{
+	if( val == '<' )
+	{
+		out << "&lt;";
+	}
+	else if( val == '>' )
+	{
+		out << "&gt;";
+	}
+	else if( val == '&' )
+	{
+		out << "&amp;";
+	}
+	else if( val == '"' )
+	{
+		out << "&quot;";
+	}
+	else
+	{
+		out << val;
+	}
+}
+
+/*
+	val is converted to utf-8 first, then every produced byte is escaped with esc_to_xml(char, Stream &)
+*/
+void esc_to_xml(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_max = sizeof(bytes) / sizeof(char);
+	const size_t bytes_len = int_to_utf8(static_cast<int>(val), bytes, bytes_max);
+
+	for(size_t n = 0 ; n < bytes_len ; ++n)
+		esc_to_xml(bytes[n], out);
+}
+
+
+// escapes a null-terminated string,
+// every character goes through esc_to_xml(char, Stream &)
+void esc_to_xml(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_xml(*c, out);
+}
+
+// escapes exactly len characters starting at c (a zero character does not stop the loop)
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_xml(*c, out);
+	}
+}
+
+// escapes a null-terminated wide string, every character goes through esc_to_xml(wchar_t, Stream &)
+void esc_to_xml(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+	{
+		esc_to_xml(*c, out);
+	}
+}
+
+// escapes exactly len wide characters starting at c (a zero character does not stop the loop)
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_xml(*c, out);
+	}
+}
+
+// escapes the whole std::string, its size() is used as the length
+void esc_to_xml(const std::string & in, Stream & out)
+{
+	esc_to_xml(in.data(), in.size(), out);
+}
+
+// escapes the whole std::wstring, its size() is used as the length
+void esc_to_xml(const std::wstring & in, Stream & out)
+{
+	esc_to_xml(in.data(), in.size(), out);
+}
+
+
+
+
+
+/*
+	writes c to 'out' escaped for CSV: a double quote is written twice,
+	other characters are written unchanged (what about zero (null) character?)
+*/
+void esc_to_csv(char c, pt::Stream & out)
+{
+	if( c == '"' )
+	{
+		out << "\"\"";
+	}
+	else
+	{
+		out << c;
+	}
+}
+
+
+/*
+	val is converted to utf-8 first, then every produced byte is escaped with esc_to_csv(char, Stream &)
+*/
+void esc_to_csv(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_max = sizeof(bytes) / sizeof(char);
+	const size_t bytes_len = int_to_utf8(static_cast<int>(val), bytes, bytes_max);
+
+	for(size_t n = 0 ; n < bytes_len ; ++n)
+		esc_to_csv(bytes[n], out);
+}
+
+
+
+// escapes a null-terminated string,
+// every character goes through esc_to_csv(char, Stream &)
+void esc_to_csv(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_csv(*c, out);
+}
+
+// escapes exactly len characters starting at c (a zero character does not stop the loop)
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_csv(*c, out);
+	}
+}
+
+// escapes a null-terminated wide string, every character goes through esc_to_csv(wchar_t, Stream &)
+void esc_to_csv(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+	{
+		esc_to_csv(*c, out);
+	}
+}
+
+// escapes exactly len wide characters starting at c (a zero character does not stop the loop)
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	for( ; len > 0 ; --len, ++c)
+	{
+		esc_to_csv(*c, out);
+	}
+}
+
+// escapes the whole std::string, its size() is used as the length
+void esc_to_csv(const std::string & in, Stream & out)
+{
+	esc_to_csv(in.data(), in.size(), out);
+}
+
+// escapes the whole std::wstring, its size() is used as the length
+void esc_to_csv(const std::wstring & in, Stream & out)
+{
+	esc_to_csv(in.data(), in.size(), out);
+}
+
+
 
 
 }
diff --git a/src/convert/misc.h b/src/convert/misc.h
index 7dbb128..51f4159 100644
--- a/src/convert/misc.h
+++ b/src/convert/misc.h
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,9 @@
 
 #include <limits>
 #include "text.h"
+#include "textstream/stream.h"
+#include "textstream/types.h"
+#include "utf8/utf8_stream.h"
 
 
 namespace pt
@@ -47,6 +50,138 @@ namespace pt
 
 void SetOverflow(bool * was_overflow, bool val);
 
+void esc_to_json(char val, Stream & out);
+void esc_to_json(wchar_t val, Stream & out);
+void esc_to_json(const char * c, pt::Stream & out);
+void esc_to_json(const char * c, std::size_t len, Stream & out);
+void esc_to_json(const wchar_t * c, Stream & out);
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_json(const std::string & in, Stream & out);
+void esc_to_json(const std::wstring & in, Stream & out);
+
+void esc_to_xml(char c, pt::Stream & out);
+void esc_to_xml(wchar_t c, pt::Stream & out);
+void esc_to_xml(const char * c, pt::Stream & out);
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_xml(const std::string & in, Stream & out);
+void esc_to_xml(const std::wstring & in, Stream & out);
+// CSV escaping: one overload per input kind, the same set as the esc_to_json / esc_to_xml families above
+void esc_to_csv(char c, pt::Stream & out);
+void esc_to_csv(wchar_t val, Stream & out);
+void esc_to_csv(const char * c, pt::Stream & out);
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_csv(const std::string & in, Stream & out);
+void esc_to_csv(const std::wstring & in, Stream & out);
+
+
+// Escape the content of stream 'in' for JSON and write it to 'out': wide->char input is encoded with int_to_utf8(), char->wide input is decoded with utf8_to_int() (incorrect sequences are dropped), otherwise each item is cast to wchar_t.
+template<typename StreamType>
+void esc_to_json(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_json(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			utf8_to_int(i, end, res, correct); // moves 'i' past the octets it reads, so 'i' must not be incremented again in this branch
+			if( correct )
+				esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_json(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}
+
+// Escape the content of stream 'in' for XML and write it to 'out': wide->char input is encoded with int_to_utf8(), char->wide input is decoded with utf8_to_int() (incorrect sequences are dropped), otherwise each item is cast to wchar_t.
+template<typename StreamType>
+void esc_to_xml(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_xml(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			utf8_to_int(i, end, res, correct); // moves 'i' past the octets it reads, so 'i' must not be incremented again in this branch
+			if( correct )
+				esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_xml(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}
+
+// Escape the content of stream 'in' for CSV and write it to 'out': wide->char input is encoded with int_to_utf8(), char->wide input is decoded with utf8_to_int() (incorrect sequences are dropped), otherwise each item is cast to wchar_t.
+template<typename StreamType>
+void esc_to_csv(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_csv(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			utf8_to_int(i, end, res, correct); // moves 'i' past the octets it reads, so 'i' must not be incremented again in this branch
+			if( correct )
+				esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_csv(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}
 
 }
 
diff --git a/src/csv/csvparser.cpp b/src/csv/csvparser.cpp
index 583eee3..0a83e92 100644
--- a/src/csv/csvparser.cpp
+++ b/src/csv/csvparser.cpp
@@ -53,6 +53,8 @@ CSVParser::CSVParser()
 
 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	space = &out_space;
 
@@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &
 
 CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
-	pchar_ascii               = str;
-	pchar_unicode             = 0;
-	space                     = &out_space;
+	clear_input_flags();
+
+	pchar_ascii = str;
+	space       = &out_space;
 
 	parse();
 
@@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
 
 CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = str;
-	pchar_ascii               = 0;
-	space                     = &out_space;
+	clear_input_flags();
+
+	pchar_unicode = str;
+	space         = &out_space;
 
 	parse();
 
diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index e35f181..9b61b1d 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;
 
 
 
+// Reset the parser before a new input is processed: the base-class flags first, then HTMLParser's own fields.
+void HTMLParser::clear_input_flags()
+{
+	BaseParser::clear_input_flags();
+
+	status           = ok;      // parsing state
+	line             = 1;
+	line_len         = 0;
+	stack_len        = 0;
+	parsing_html     = true;    // default modes, the parse_* methods overwrite them where needed
+	xml_compact_mode = true;
+	out_string       = nullptr; // no output target selected yet
+	out_stream       = nullptr;
+	out_space        = nullptr;
+}
+
+
+
 void HTMLParser::Item::Clear()
 {
 	name.clear();
@@ -71,21 +89,11 @@ HTMLParser::Item::Item()
 
 void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
 {
-	parsing_html              = true;
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = in;
-	pchar_ascii               = 0;
-	xml_compact_mode          = compact_mode;
+	clear_input_flags();
 
-	status = ok;
-	line = 1;
-
-	stack_len     = 0;
-	out_string    = nullptr;
-	out_space     = &space;
-	//last_new_line = false;
-	line_len      = 0;
+	pchar_unicode    = in;
+	xml_compact_mode = compact_mode;
+	out_space = &space;
 	out_space->clear();
 
 	Init();
@@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 
 HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
 {
+	clear_input_flags();
+
 	parsing_html = false;
 	reading_from_file = true;
-	xml_compact_mode          = compact_mode;
-
-	status = ok;
-	line = 1;
-	stack_len     = 0;
-	out_string    = nullptr;
-	line_len      = 0;
-
+	xml_compact_mode = compact_mode;
 	this->out_space = &out_space;
 
 	if( clear_space )
@@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp
 
 
 
-void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
+void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
 {
-	parsing_html              = true;
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = in;
-	pchar_ascii               = 0;
+	clear_input_flags();
 
-	stack_len     = 0;
+	pchar_unicode = in;
 	out_string    = &out;
-	out_space     = nullptr;
-	//last_new_line = false;
-	line_len      = 0;
-	out_string->clear();
+
+	if( clear_out_string )
+		out_string->clear();
 
 	Init();
 	Read();
@@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 }
 
 
-void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
+void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
 {
 	if( &in == &out )
 	{
@@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 	if( out.capacity() < out_projected_len )
 		out.reserve(out_projected_len);
 
-	Filter(in.c_str(), out);
+	filter(in.c_str(), out, clear_out_string);
 }
 
 
-
-HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
+// Filter the wide text stream 'in' and write the result to 'out' ('out' is cleared first when clear_out_stream is true).
+void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
 {
-	parsing_html      = true;
+	clear_input_flags();
+
+	WTextStream::const_iterator begin = in.begin();
+	WTextStream::const_iterator end = in.end();
+	wtext_stream_iterator = &begin;
+	wtext_stream_iterator_end = &end;
+	out_stream = &out;
+
+	if( clear_out_stream )
+		out_stream->clear();
+
+	Init();
+	Read();
+	Uninit();
+	wtext_stream_iterator = nullptr;     // 'begin' and 'end' are locals of this call,
+	wtext_stream_iterator_end = nullptr; // so no pointer to them may stay in the parser
+}
+
+HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
+{
+	clear_input_flags();
+
 	reading_from_file = true;
 
 	// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
 	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 
-	status        = ok;
-	line          = 1;
-	stack_len     = 0;
-	out_string    = &out;
-	out_space     = nullptr;
-	line_len      = 0;
-	out_string->clear();
+	out_string = &out;
+
+	if( clear_out_stream )
+		out_string->clear();
 
 	if( file )
 	{
@@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
 }
 
 
-HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
 {
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }
 
 
-HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
 {
 	std::string file_name_utf8;
 	pt::wide_to_utf8(file_name, file_name_utf8);
 
-	return filter_file(file_name_utf8, out);
+	return filter_file(file_name_utf8, out, clear_out_stream);
 }
 
 
-HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
 {
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }
 
 
@@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
 	if( out_string )
 		(*out_string) += c;
 
+	if( out_stream )
+		(*out_stream) << c;
+
 	CheckChar(c);
 }
 
@@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 	if( out_string )
 		out_string->append(str, len);
 
+	if( out_stream )
+		out_stream->write(str, len);
+
 	for( ; str < end ; ++str)
 		CheckChar(*str);
 }
@@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
 		if( out_string )
 			out_string->append(str);
 
+		if( out_stream )
+			out_stream->write(str.c_str(), str.size());
+
 		for(size_t i=0 ; i < str.size() ; ++i)
 			CheckChar(str[i]);
 	}
@@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
 	{
 		if( out_string )
 			(*out_string) += ' '; // we do not add them to 'line_len'
+
+		if( out_stream )
+			(*out_stream) << ' ';
 	}
 }
 
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 50df603..caf5cf1 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -44,6 +44,7 @@
 #include <algorithm>
 #include "convert/baseparser.h"
 #include "space/space.h"
+#include "textstream/stream.h"
 
 
 namespace pt
@@ -130,14 +131,15 @@ public:
 
 
 	// main methods used for filtering
-	void Filter(const wchar_t * in, std::wstring & out);
-	void Filter(const std::wstring & in, std::wstring & out);
+	void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
+	void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
 
+	void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
 
-	HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
 
 
 	/*
@@ -278,7 +280,7 @@ protected:
 
 
 
-
+	void clear_input_flags();
 
 
 	/*
@@ -403,6 +405,7 @@ protected:
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
+	Stream * out_stream;
 	Space * out_space;
 	Space text_space_tmp;
 
diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp
index 9c334a7..46f0aa4 100644
--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line()
 
 SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	parsing_space = false;
 	root_space = &out_space;
 
-	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 	
 	if( file )
@@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,
 
 SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	parsing_space = true;
 	root_space = &out_space;
 
-	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 
 	if( file )
@@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name
 
 SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
+	clear_input_flags();
+
 	pchar_ascii               = str;
-	pchar_unicode             = 0;
 	parsing_space             = false;
 	root_space                = &out_space;
 
@@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out
 
 SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
+	clear_input_flags();
+
 	pchar_unicode             = str;
-	pchar_ascii               = 0;
 	parsing_space             = false;
 	root_space                = &out_space;
 
@@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou
 
 SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
+	clear_input_flags();
+
 	pchar_ascii               = str;
-	pchar_unicode             = 0;
 	parsing_space             = true;
 	root_space                = &out_space;
 
@@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou
 
 SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
+	clear_input_flags();
+
 	pchar_unicode             = str;
-	pchar_ascii               = 0;
 	parsing_space             = true;
 	root_space                = &out_space;
 
diff --git a/src/utf8/utf8.h b/src/utf8/utf8.h
index bdf28f3..4857184 100644
--- a/src/utf8/utf8.h
+++ b/src/utf8/utf8.h
@@ -45,6 +45,12 @@
 namespace pt
 {
 
+/*
+ * public methods are also defined in utf8_stream.h
+ *
+ */
+
+
 /*!
 	UTF-8, a transformation format of ISO 10646
 	http://tools.ietf.org/html/rfc3629
@@ -213,9 +219,7 @@ template<typename StreamType>
 bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
 
 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used
-
-
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
 
 
 } // namespace
diff --git a/src/utf8/utf8_stream.h b/src/utf8/utf8_stream.h
new file mode 100644
index 0000000..3adf848
--- /dev/null
+++ b/src/utf8/utf8_stream.h
@@ -0,0 +1,104 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_utf8_utf8_stream
+#define headerfile_picotools_utf8_utf8_stream
+
+#include "textstream/textstream.h"
+
+namespace pt
+{
+
+
+/*!
+	this function converts one UTF-8 character into one wide-character
+
+	input:
+		iterator_in - a TextStream iterator for reading from, it is moved past every octet which was read
+		iterator_end - an end iterator (can be returned by end() method from TextStream)
+
+	output:
+		res - an output character
+		correct - true if it is a correct character
+		returns the length of the character, or for a broken sequence the number of octets read
+		before the problem (a rejected continuation octet is consumed but not counted)
+*/
+template<typename StreamIteratorType>
+size_t utf8_to_int(
+		StreamIteratorType & iterator_in,
+		StreamIteratorType & iterator_end,
+		int & res,
+		bool & correct)
+{
+	size_t len;
+	unsigned char octet;
+
+	res = 0;
+	correct = false;
+
+	if( iterator_in == iterator_end )
+		return 0;
+
+	octet = *iterator_in;
+	++iterator_in;
+	size_t used = 1;
+
+	if( !private_namespace::utf8_to_int_first_octet(octet, len, res) )
+		return used;
+
+	while( used < len )
+	{
+		if( iterator_in == iterator_end )
+			return used;
+
+		octet = *iterator_in;
+		++iterator_in;
+
+		if( !private_namespace::utf8_to_int_add_next_octet(octet, res) )
+			return used;
+		++used;
+	}
+
+	correct = utf8_check_range(res, len);
+	return len;
+}
+
+
+
+}
+
+#endif
diff --git a/src/utf8/utf8_templates.h b/src/utf8/utf8_templates.h
index a0f7613..8e374d0 100644
--- a/src/utf8/utf8_templates.h
+++ b/src/utf8/utf8_templates.h
@@ -47,6 +47,7 @@ namespace pt
 {
 
 
+
 template<typename StreamType>
 void int_to_wide(int c, StreamType & res)
 {
@@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res)
 
 
 
+
 /*!
 	converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
 	(need to be tested)
@@ -376,8 +378,11 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
 
 // not tested
 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
 {
+	if( clear )
+		utf8.clear();
+
 	private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
 		utf8.write(utf8_buffer, buffer_len);
 	});
@@ -385,8 +390,6 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
 
 
 
-
-
 } // namespace pt
 
 #endif
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index e83e777..a9228ca 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -9,12 +9,15 @@
 ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
-./convert.o: ../src/convert/misc.h ../src/convert/double.h
+./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./convert.o: ../src/convert/double.h
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
-./csvparser.o: ../src/convert/baseparser.h test.h
+./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
+./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
+./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
@@ -30,4 +33,5 @@
 ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
-./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
+./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./mainoptionsparser.o: ../src/convert/double.h

From c54c39882841f194ce30a17aedd4329b127538ab Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 13 Oct 2021 00:40:55 +0200
Subject: [PATCH 20/37] fixed in HTMLParser: </nofilter> tag was printed

---
 src/html/htmlparser.cpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 9b61b1d..21a826f 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -1126,7 +1126,7 @@ return true;
 
 void HTMLParser::PutClosingTag(const Item & item)
 {
-	if( skip_tags || !IsTagSafe(item.name) )
+	if( skip_tags || !IsTagSafe(item.name) || IsNameEqual(no_filter_tag, LastItem().name) )
 		return;
 
 	if( item.is_commentary )
@@ -1945,17 +1945,22 @@ void HTMLParser::CheckClosingTags()
 	{
 		CheckWhiteCharsExceptions(pstack[z]);
 
-		if( !skip_tags && pstack[z].new_line )
+		if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) )
 		{
-			if( current_white_char_mode() == WHITE_MODE_TREE )
+			if( pstack[z].new_line )
 			{
-				Put(10);
-				PutTabs(pstack[z].tree_index);
+				if( current_white_char_mode() == WHITE_MODE_TREE )
+				{
+					Put(10);
+					PutTabs(pstack[z].tree_index);
+				}
 			}
-		}
 
-		PutClosingTag(pstack[z]);
-		pstack[z].Clear();
+			// IMPROVEME
+			// in PutClosingTag we test IsTagSafe() and no_filter_tag too
+			PutClosingTag(pstack[z]);
+			pstack[z].Clear();
+		}
 	}
 
 	// invalidate items on the stack

From 5eff9a5f4f259a56ce013e919bbf03529680a544 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 20 Oct 2021 08:30:57 +0200
Subject: [PATCH 21/37] Space::to_bool() return true now when a string/object
 or table is non empty

---
 src/space/space.cpp | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/space/space.cpp b/src/space/space.cpp
index 5ad6dba..48a42bb 100644
--- a/src/space/space.cpp
+++ b/src/space/space.cpp
@@ -827,8 +827,31 @@ bool Space::to_bool() const
 	if( type == type_bool )
 		return value.value_bool;
 
-	long long val = to_long_long();
-	return (val != 0) ? true : false;
+	if( type == type_long )
+		return value.value_long != 0;
+
+	if( type == type_float )
+		return value.value_float != 0.0f;
+
+	if( type == type_double )
+		return value.value_double != 0.0;
+
+	if( type == type_long_double )
+		return value.value_long_double != 0.0L;
+
+	if( type == type_string )
+		return !value.value_string.empty();
+
+	if( type == type_wstring )
+		return !value.value_wstring.empty();
+
+	if( type == type_table )
+		return !value.value_table.empty();
+
+	if( type == type_object )
+		return !value.value_object.empty();
+
+	return false;
 }
 
 short Space::to_short() const

From bb9205a55eef1217a7665792d9dc2c637e5a664d Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Fri, 5 Nov 2021 09:27:32 +0100
Subject: [PATCH 22/37] added: Space::Space(const Date & date),
 Space::set(const Date & date), Space::add(const Date & date),
 Space::add(const wchar_t * field, const Date & date)

---
 src/space/space.cpp | 31 +++++++++++++++++++++++++++++++
 src/space/space.h   |  5 +++++
 2 files changed, 36 insertions(+)

diff --git a/src/space/space.cpp b/src/space/space.cpp
index 48a42bb..9baa939 100644
--- a/src/space/space.cpp
+++ b/src/space/space.cpp
@@ -188,6 +188,12 @@ Space::Space(const Space * space)
 	set(space);
 }
 
+// Construct a Space from a date: initialize() first, then the same assignment as set(const Date &).
+Space::Space(const Date & date)
+{
+	initialize();
+	set(date);
+}
 
 void Space::clear()
 {
@@ -427,6 +433,13 @@ void Space::set(Space && space)
 	move_from(std::move(space));
 }
 
+void Space::set(const Date & date) // stores the text produced by Date::SerializeISO() as the wide-string value
+{
+	initialize_value_wstring_if_needed();
+
+	WTextStream iso_text;
+	date.SerializeISO(iso_text);
+	iso_text.to_str(value.value_wstring);
+}
 
 
 Space & Space::add(bool val)
@@ -528,6 +541,12 @@ Space & Space::add(Space && space)
 }
 
 
+// Date overload of add(): forwards to add_generic() and returns the reference it yields.
+Space & Space::add(const Date & date)
+{
+	return add_generic(date);
+}
+
 Space & Space::add_empty_space()
 {
 	return add_generic(static_cast<Space*>(nullptr));
@@ -643,6 +662,13 @@ Space & Space::add(const wchar_t * field, Space && space)
 	return *(insert_res.first->second);
 }
 
+// Date overload of add() for a named field: forwards to add_generic() and returns the reference it yields.
+Space & Space::add(const wchar_t * field, const Date & date)
+{
+	return add_generic(field, date);
+}
+
+
 Space & Space::add_empty_space(const wchar_t * field)
 {
 	return add_generic(field, static_cast<Space*>(nullptr));
@@ -746,6 +772,11 @@ Space & Space::add(const std::wstring & field, Space && space)
 	return add(field.c_str(), std::move(space));
 }
 
+// Date overload of add() for a field given as std::wstring: forwards to add_generic() and returns the reference it yields.
+Space & Space::add(const std::wstring & field, const Date & date)
+{
+	return add_generic(field, date);
+}
 Space & Space::add_empty_space(const std::wstring & field)
 {
 	return add_generic(field, static_cast<Space*>(nullptr));
diff --git a/src/space/space.h b/src/space/space.h
index a943cee..cb9ebbe 100644
--- a/src/space/space.h
+++ b/src/space/space.h
@@ -207,6 +207,7 @@ public:
 	Space(const std::string & str);
 	Space(const std::wstring & str);
 	Space(const Space * space);
+	Space(const Date & date);
 
 
 	void clear();
@@ -243,6 +244,7 @@ public:
 	void set(const Space & space);
 	void set(const Space * space);
 	void set(Space && space);
+	void set(const Date & date);
 
 
 	// add a value to the table, change to table if needed, return the reference to the new inserted item
@@ -265,6 +267,7 @@ public:
 	Space & add(const Space & space);
 	Space & add(const Space * space);
 	Space & add(Space && space);
+	Space & add(const Date & date);
 	Space & add_empty_space(); // IMPROVEME rename me to something better
 
 
@@ -289,6 +292,7 @@ public:
 	Space & add(const wchar_t * field, const Space & space);
 	Space & add(const wchar_t * field, const Space * space);
 	Space & add(const wchar_t * field, Space && space);
+	Space & add(const wchar_t * field, const Date & date);
 	Space & add_empty_space(const wchar_t * field); // IMPROVEME rename me to something better
 
 	Space & add(const std::wstring & field, bool val);
@@ -310,6 +314,7 @@ public:
 	Space & add(const std::wstring & field, const Space & space);
 	Space & add(const std::wstring & field, const Space * space);
 	Space & add(const std::wstring & field, Space && space);
+	Space & add(const std::wstring & field, const Date & date);
 	Space & add_empty_space(const std::wstring & field); // IMPROVEME rename me to something better
 
 

From 2dadfc0809e8e4e9529c0f03dab94789ab9b74d9 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 30 Nov 2021 16:27:27 +0100
Subject: [PATCH 23/37] added: HTMLParser::ItemParsedListener listener with an
 item_parsed(...) method which is called when a tag is parsed by the parser

---
 src/html/htmlparser.cpp |  67 +++++++++++++++++++++++-
 src/html/htmlparser.h   | 110 ++++++++++++++++++++++++----------------
 2 files changed, 132 insertions(+), 45 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 21a826f..9b24071 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -36,7 +36,6 @@
  */
 
 #include "htmlparser.h"
-
 #include "convert/text.h"
 
 
@@ -102,6 +101,13 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 }
 
 
+// Remember the listener which RemoveIfNeeded() consults; with a null pointer no listener is consulted.
+void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener)
+{
+	this->item_parsed_listener = listener;
+}
+
+
 HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
 {
 	clear_input_flags();
@@ -296,6 +302,7 @@ void HTMLParser::SetSomeDefaults()
 	skip_commentaries = false;
 	skip_entities = false;
 	analyze_entities = false;
+	item_parsed_listener = nullptr;
 }
 
 
@@ -1941,6 +1948,12 @@ void HTMLParser::CheckClosingTags()
 		return;
 	}
 
+	// CHECK ME
+	if( RemoveIfNeeded(stack_len - 2) )
+	{
+		RemoveLastSpace(i);
+	}
+
 	for(int z=(int)stack_len-2 ; z >= i ; --z)
 	{
 		CheckWhiteCharsExceptions(pstack[z]);
@@ -2047,6 +2060,36 @@ void HTMLParser::AddItemToSpace()
 }
 
 
+// Remove the Space of the item at pstack[stack_len-2] from its parent's "childs" table (the parent is out_space for index 0, pstack[index-1].space otherwise); nothing is done in xml compact mode yet.
+void HTMLParser::RemoveLastSpace(size_t index)
+{
+	if( out_space )
+	{
+		Space * parent = out_space;
+
+		if( index > 0 )
+		{
+			parent = pstack[index - 1].space;
+		}
+
+		if( xml_compact_mode )
+		{
+			// IMPLEMENT ME
+		}
+		else
+		{
+			Space * childs_tab = parent ? parent->get_space(L"childs") : nullptr; // the parent item may have no Space
+			size_t len = childs_tab ? childs_tab->table_size() : 0; // ask for the size only when the table exists
+			// stack_len >= 2 keeps pstack[stack_len-2] inside the stack (NOTE(review): 'index' is not used to pick the item - confirm)
+			if( childs_tab && stack_len >= 2 && childs_tab->is_table() && len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space )
+			{
+				childs_tab->remove(len - 1);
+				pstack[stack_len-2].space = nullptr;
+			}
+		}
+	}
+}
+
 void HTMLParser::AddSpaceToSpaceTree(const Space & space)
 {
 	const std::wstring * text = space.get_wstr(L"text");
@@ -2086,6 +2129,22 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
 
 
 
+// asks the registered listener (if any) about pstack[index]; returns true only when item_parsed() returned false, callers then call RemoveLastSpace()
+bool HTMLParser::RemoveIfNeeded(size_t index)
+{
+	if( item_parsed_listener )
+	{
+		if( !item_parsed_listener->item_parsed(pstack[index]) )
+		{
+			return true;
+		}
+	}
+
+	return false;
+}
+
+
+
 void HTMLParser::ReadLoop()
 {
 	while( status == ok && ReadItem() )
@@ -2111,6 +2170,12 @@ void HTMLParser::ReadLoop()
 		else
 		if( LastItem().type == Item::simple )
 		{
+			if( stack_len > 0 )
+			{
+				if( RemoveIfNeeded(stack_len - 1) )
+					RemoveLastSpace(stack_len - 1);
+			}
+
 			PopStack();
 		}
 		else
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index caf5cf1..da0074e 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -52,7 +52,6 @@ namespace pt
 
 
 
-
 // max length of a name of a html tag (with terminating null)
 #define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN	30
 
@@ -110,6 +109,66 @@ public:
 	};
 
 
+	// orphans for one language
+	struct Orphans
+	{
+		std::vector<std::wstring> tab;
+		size_t max_len;
+	};
+
+
+	struct Item
+	{
+		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
+
+		enum Type
+		{
+			opening,		/* sample:  <h1>		*/
+			closing,		/* sample:  </h1>		*/
+			simple,			/* sample:  <br/>		*/
+			special,		/* sample:  <!doctype>	*/
+			none
+		} type;
+
+		bool is_commentary;
+
+		bool new_line_before;
+
+		// is there a new line after this tag
+		bool new_line;
+
+		// is there a new line in the middle of the tag (presumably between its attributes - TODO confirm)
+		bool new_line_in_the_middle;
+
+		// current orphans table
+		// (will be propagated)
+		Orphans * porphans;
+
+		// this item or one from its parents is a 'body' html tag
+		// (will be propagated)
+		bool has_body_tag;
+
+		size_t tree_index;
+
+		Space * space;
+
+		void Clear();
+		Item();
+	};
+	// callback interface registered with set_item_parsed_listener(): the parser passes a parsed item to item_parsed()
+	// a false result makes the parser call RemoveLastSpace() for that item; the default implementation returns true
+	class ItemParsedListener
+	{
+	public:
+
+		ItemParsedListener() {}
+
+		virtual bool item_parsed(const Item & item) { return true; }
+		virtual ~ItemParsedListener() {}
+
+	};
+
+
 	/*
 		the last status of parsing, set by parse() methods
 	*/
@@ -120,6 +179,8 @@ public:
 	HTMLParser & operator=(const HTMLParser & f);
 	virtual ~HTMLParser();
 
+	void set_item_parsed_listener(ItemParsedListener * listener);
+
 
 	void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
 
@@ -222,12 +283,6 @@ protected:
 
 	bool xml_compact_mode;
 
-	// orphans for one language
-	struct Orphans
-	{
-		std::vector<std::wstring> tab;
-		size_t max_len;
-	};
 
 
 	// orphans for all languages
@@ -238,45 +293,9 @@ protected:
 	// html <nofilter> tag name
 	std::wstring no_filter_tag;
 
+	ItemParsedListener * item_parsed_listener;
 
-	struct Item
-	{
-		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
 
-		enum Type
-		{
-			opening,		/* sample:  <h1>		*/
-			closing,		/* sample:  </h1>		*/
-			simple,			/* sample:  <br/>		*/
-			special,		/* sample:  <!doctype>	*/
-			none
-		} type;
-
-		bool is_commentary;
-
-		bool new_line_before;
-
-		// is there a new line after this tag
-		bool new_line;
-
-		// is there a new
-		bool new_line_in_the_middle;
-
-		// current orphans table
-		// (will be propagated)
-		Orphans * porphans;
-
-		// this item or one from its parents is a 'body' html tag
-		// (will be propagated)
-		bool has_body_tag;
-
-		size_t tree_index;
-
-		Space * space;
-
-		void Clear();
-		Item();
-	};
 
 
 
@@ -398,8 +417,11 @@ protected:
 	void CalcOrphansMaxLen(Orphans & orphans);
 
 	void AddItemToSpace();
+	void RemoveLastSpace(size_t index);
 	void AddSpaceToSpaceTree(const Space & space);
 
+	bool RemoveIfNeeded(size_t index);
+
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack

From b781948f211528f3e5255fdd24f969f46bb3d70c Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Thu, 2 Dec 2021 17:44:41 +0100
Subject: [PATCH 24/37] HTMLParser now parses correctly such entities: &amp;
 &lt; &gt; &quot; &apos;

---
 src/html/htmlparser.cpp | 167 ++++++++++++++++++++++++++++++++++------
 src/html/htmlparser.h   |  13 +++-
 2 files changed, 155 insertions(+), 25 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 9b24071..4983010 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -60,6 +60,9 @@ void HTMLParser::clear_input_flags()
 	out_stream       = nullptr;
 	out_space        = nullptr;
 	line_len         = 0;
+	char_was_escaped = false;
+	escaped_chars_buffer.clear();
+	escaped_char_index = 0;
 }
 
 
@@ -602,7 +605,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 
 	while( lastc != -1 )
 	{
-		if( lastc == '"' || lastc == '\'' )
+		if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
 		{
 			if( is_quoted )
 			{
@@ -623,7 +626,7 @@ void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
 			LastItem().type = Item::simple;
 		}
 		else
-		if( !is_quoted && IsClosingTagMark(lastc) )
+		if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
 		{
 			read_char();
 			break;
@@ -739,15 +742,18 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 
 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
-				break;
-		}
-		else
-		{
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
-				break;
+			if( has_quote )
+			{
+				if( lastc == quote_char )
+					break;
+			}
+			else
+			{
+				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+					break;
+			}
 		}
 
 		if( lastc==10 || IsWhite(lastc) )
@@ -780,15 +786,18 @@ void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
 
 	while( lastc != -1 )
 	{
-		if( has_quote )
+		if( !char_was_escaped )
 		{
-			if( lastc == quote_char )
-				break;
-		}
-		else
-		{
-			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
-				break;
+			if( has_quote )
+			{
+				if( lastc == quote_char )
+					break;
+			}
+			else
+			{
+				if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+					break;
+			}
 		}
 
 		// IMPROVEME add support for analyze_entities?
@@ -995,7 +1004,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
 
 void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
 {
-	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		str += lastc;
 		read_char();
@@ -1260,7 +1269,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
 				tmp_text += lastc;
 				read_char();
 
-				if( IsClosingTagMark(lastc) )
+				if( !char_was_escaped && IsClosingTagMark(lastc) )
 				{
 					tmp_text += lastc;
 					read_char();
@@ -1332,7 +1341,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 {
 	while( lastc != -1 )
 	{
-		if( IsOpeningTagMark(lastc) )
+		if( !char_was_escaped && IsOpeningTagMark(lastc) )
 		{
 			if( IsClosingTagForLastItem(put_closing_tag_as_well) )
 			{
@@ -1382,7 +1391,7 @@ void HTMLParser::ReadText()
 		text_space_wstr = &wstr_space.value.value_wstring;
 	}
 
-	while( lastc != -1 && !IsOpeningTagMark(lastc) )
+	while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
 	{
 		tmp_text.clear();
 		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
@@ -1479,7 +1488,7 @@ bool HTMLParser::ReadItemAttr()
 	read_char();				// skipping '='
 	SkipWhiteLines();
 
-	bool has_quote = (lastc == '\"' || lastc == '\'');
+	bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
 	wchar_t quote_char = lastc;
 
 	if( has_quote )
@@ -1491,7 +1500,7 @@ bool HTMLParser::ReadItemAttr()
 	else
 		ReadXMLItemAttrValue(has_quote, quote_char);
 
-	if( has_quote && lastc == quote_char )
+	if( has_quote && !char_was_escaped && lastc == quote_char )
 		read_char();			// skipping the last quote mark
 
 return true;
@@ -2198,6 +2207,116 @@ void HTMLParser::ReadLoop()
 }
 
 
+void HTMLParser::read_char_from_entity_buffer()	// puts the next character of escaped_chars_buffer into lastc
+{
+	if( escaped_char_index < escaped_chars_buffer.size() )
+	{
+		lastc = escaped_chars_buffer[escaped_char_index];
+		escaped_char_index += 1;
+
+		if( escaped_char_index >= escaped_chars_buffer.size() )	// the last buffered char was just taken: reset the buffer
+		{
+			escaped_chars_buffer.clear();
+			escaped_char_index = 0;
+		}
+	}
+	else
+	{
+		lastc = -1;	// nothing buffered; -1 is the value on which the reading loops stop
+	}
+}
+// collects a candidate entity in escaped_chars_buffer: '&' (already consumed by read_char) plus chars read with read_char_no_escape()
+// reading stops after ';', on lastc == -1, or when the buffer holds max_entity_length characters; escaped_char_index is reset to 0
+void HTMLParser::read_xml_entity()
+{
+	const size_t max_entity_length = 6; // length of "&apos;" string
+	escaped_chars_buffer.clear();
+	escaped_char_index = 0;
+	escaped_chars_buffer += '&';
+
+	do
+	{
+		read_char_no_escape();
+
+		if( lastc != -1 )
+		{
+			escaped_chars_buffer += lastc;
+		}
+	}
+	while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' );
+}
+// if escaped_chars_buffer is exactly one of &amp; &lt; &gt; &quot; &apos; then lastc becomes the decoded character,
+// char_was_escaped is set and the buffer is emptied; returns char_was_escaped (read_char() clears the flag before calling this)
+bool HTMLParser::check_escape_sequentions()
+{
+	if( escaped_chars_buffer == L"&amp;" )
+	{
+		lastc = '&';
+		char_was_escaped = true;
+	}
+	else
+	if( escaped_chars_buffer == L"&lt;" )
+	{
+		lastc = '<';
+		char_was_escaped = true;
+	}
+	else
+	if( escaped_chars_buffer == L"&gt;" )
+	{
+		lastc = '>';
+		char_was_escaped = true;
+	}
+	else
+	if( escaped_chars_buffer == L"&quot;" )
+	{
+		lastc = '"';
+		char_was_escaped = true;
+	}
+	else
+	if( escaped_chars_buffer == L"&apos;" )
+	{
+		lastc = '\'';
+		char_was_escaped = true;
+	}
+
+	if( char_was_escaped )
+	{
+		escaped_chars_buffer.clear();
+		escaped_char_index = 0;
+	}
+
+	return char_was_escaped;
+}
+// reads the next character into lastc and returns it
+// characters buffered from an unrecognized '&...' sequence are replayed first; otherwise a fresh char is read and a '&' starts entity decoding
+// char_was_escaped ends up true only when lastc was produced by a recognized entity
+int HTMLParser::read_char()
+{
+	char_was_escaped = false;
+
+	if( escaped_char_index < escaped_chars_buffer.size() )
+	{
+		read_char_from_entity_buffer();
+	}
+	else
+	{
+		read_char_no_escape();
+
+		if( lastc == '&' )
+		{
+			read_xml_entity();
+
+			if( !check_escape_sequentions() )
+			{
+				read_char_from_entity_buffer();
+			}
+		}
+	}
+
+	return lastc;
+}
+
+
 
 void HTMLParser::Read()
 {
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index da0074e..0f352a9 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -295,8 +295,14 @@ protected:
 
 	ItemParsedListener * item_parsed_listener;
 
+	/*
+		true if lastc was decoded from one of the entities: &amp; &lt; &gt; &quot; &apos;
+		we have to know if the last sequence was e.g. &quot; or just "
+	*/
+	bool char_was_escaped;
 
-
+	std::wstring escaped_chars_buffer;
+	size_t escaped_char_index;
 
 
 	void clear_input_flags();
@@ -422,6 +428,11 @@ protected:
 
 	bool RemoveIfNeeded(size_t index);
 
+	bool check_escape_sequentions();
+	void read_xml_entity();
+	void read_char_from_entity_buffer();
+	int read_char() override;
+
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack

From fd1a8270cd166c32d116b83b276ace8e44afd257 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 18 Jan 2022 19:36:40 +0100
Subject: [PATCH 25/37] read CDATA as an ordinary text

---
 src/html/htmlparser.cpp | 78 ++++++++++++++++++++++++++++++++++++-----
 src/html/htmlparser.h   |  6 ++--
 2 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 4983010..4186445 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -73,6 +73,7 @@ void HTMLParser::Item::Clear()
 	name.clear();
 	type            = none;
 	is_commentary   = false;
+	is_cdata		= false;
 	porphans        = nullptr;
 	new_line_before = false;
 	new_line        = false;
@@ -646,7 +647,7 @@ bool HTMLParser::IsValidCharForName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
+		c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA
 		return true;
 
 return false;
@@ -696,6 +697,13 @@ size_t i;
 				read_char();
 				break;
 			}
+
+			if( LastItem().type == Item::special && name == L"![CDATA[" )
+			{
+				LastItem().is_cdata = true;
+				read_char();
+				break;
+			}
 		}
 
 		read_char();
@@ -1002,10 +1010,49 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
 }
 
 
-void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
+bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
 {
-	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+	bool was_closing_tag = false;
+
+	while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
 	{
+		if( is_cdata )
+		{
+			if( lastc == ']' )
+			{
+				read_char();
+
+				if( lastc == ']' )
+				{
+					read_char();
+
+					if( IsClosingTagMark(lastc) )
+					{
+						read_char();
+						was_closing_tag = true;
+						break;
+					}
+					else
+					{
+						str += ']';
+						str += ']';
+					}
+				}
+				else
+				{
+					str += ']';
+				}
+			}
+		}
+		else
+		{
+			if( !char_was_escaped && IsOpeningTagMark(lastc) )
+			{
+				was_closing_tag = true;
+				break;
+			}
+		}
+
 		str += lastc;
 		read_char();
 	}
@@ -1028,6 +1075,8 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 		AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
 	else
 		Put(str);
+
+	return was_closing_tag;
 }
 
 
@@ -1290,6 +1339,7 @@ void HTMLParser::ReadTextUntilClosingCommentary()
 }
 
 
+
 bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
 {
 	tmp_text.clear();
@@ -1361,7 +1411,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 
 
 // reading text between html tags
-void HTMLParser::ReadText()
+void HTMLParser::ReadText(bool is_cdata)
 {
 	bool was_white_char = false;
 	bool was_new_line = false;
@@ -1391,10 +1441,12 @@ void HTMLParser::ReadText()
 		text_space_wstr = &wstr_space.value.value_wstring;
 	}
 
-	while( lastc != -1 && !(!char_was_escaped && IsOpeningTagMark(lastc)) )
+	bool was_closing_tag = false;
+
+	while( lastc != -1 && !was_closing_tag )
 	{
 		tmp_text.clear();
-		PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
+		was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
 
 		if( !tmp_text.empty() )
 		{
@@ -1627,6 +1679,11 @@ void HTMLParser::ReadItemSpecial()
 			Put(LastItem().name);
 		}
 		else
+		if( LastItem().is_cdata )
+		{
+			// do nothing
+		}
+		else
 		{
 			tmp_text.clear();
 			SkipWhiteLines();
@@ -2158,6 +2215,8 @@ void HTMLParser::ReadLoop()
 {
 	while( status == ok && ReadItem() )
 	{
+		bool was_cdata = false;
+
 		if( LastItem().type == Item::opening )
 		{
 			if( parsing_html )
@@ -2174,6 +2233,9 @@ void HTMLParser::ReadLoop()
 			if( LastItem().is_commentary )
 				ReadTextUntilClosingCommentary();
 
+			if( LastItem().is_cdata )
+				was_cdata = true;
+
 			PopStack();
 		}
 		else
@@ -2199,7 +2261,7 @@ void HTMLParser::ReadLoop()
 
 		if( status == ok )
 		{
-			ReadText();
+			ReadText(was_cdata);
 		}
 
 		is_first_item = false;
@@ -2331,7 +2393,7 @@ void HTMLParser::Read()
 
 	// it can be some text or white lines before the first html tag (we print it if using filtering)
 	// but they are not added to the Space tree
-	ReadText();
+	ReadText(false);
 
 	// reading the whole html source
 	ReadLoop();
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 0f352a9..7797b51 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -132,6 +132,8 @@ public:
 
 		bool is_commentary;
 
+		bool is_cdata;
+
 		bool new_line_before;
 
 		// is there a new line after this tag
@@ -392,7 +394,7 @@ protected:
 	void CheckStackPrintRest();
 	void AddForgottenTags();
 	void CheckClosingTags();
-	void ReadText();
+	void ReadText(bool is_cdata);
 	bool PrintRest();
 	bool PrintOpeningItem();
 	void ReadItemName(std::wstring & name, bool clear_name = true);
@@ -415,7 +417,7 @@ protected:
 
 	void CheckChar(wchar_t c);
 
-	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
+	bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
 	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
 
 	void PutTabs(size_t len);

From 6b97b1b74acc1d7c89f8426218cec867380f1a3f Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Thu, 3 Feb 2022 19:08:21 +0100
Subject: [PATCH 26/37] fix: correctly escape json/xml/csv wide strings

A wide string was first changed to utf-8 and then escaped to json/xml/csv
which is incorrect. First should be escaped and then changed to utf-8.

Add TextStreamBase<>::iterator and TextStreamBase<>::const_iterator as classes
with a method wchar_t get_unicode_and_advance(const iterator & end)
to return one character either from utf-8 stream or from wide stream.

Let TextStreamBase<>::operator<<(wchar_t v) correctly use utf-8.
---
 src/Makefile.dep            |  48 ++---
 src/convert/misc.cpp        | 148 ++++++++-------
 src/convert/misc.h          |  89 ++-------
 src/textstream/textstream.h | 359 ++++++++++++++++++++++++++++++++++--
 src/utf8/utf8_stream.h      |   4 +-
 tests/Makefile.dep          |   2 +-
 6 files changed, 466 insertions(+), 184 deletions(-)

diff --git a/src/Makefile.dep b/src/Makefile.dep
index 7dbbb8e..b037dab 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -13,6 +13,7 @@
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
+./convert/double.o: utf8/utf8_stream.h
 ./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
 ./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
 ./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
@@ -24,19 +25,19 @@
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
 ./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
-./log/filelog.o: textstream/types.h
+./log/filelog.o: textstream/types.h utf8/utf8_stream.h
 ./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
 ./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h
 ./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h
 ./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h
-./log/log.o: ./log/filelog.h
+./log/log.o: utf8/utf8_stream.h ./log/filelog.h
 ./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h
 ./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h
 ./space/space.o: convert/patternreplacer.h textstream/textstream.h
 ./space/space.o: textstream/stream.h space/space.h date/date.h
-./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
-./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
+./space/space.o: membuffer/membuffer.h textstream/types.h utf8/utf8_stream.h
+./space/space.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
 ./space/space.o: ./convert/double.h
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
@@ -44,31 +45,32 @@
 ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
 ./space/spaceparser.o: textstream/textstream.h textstream/stream.h
 ./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
-./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
-./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
+./space/spaceparser.o: textstream/types.h utf8/utf8_stream.h
+./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
-./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
-./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
-./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
-./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
-./csv/csvparser.o: textstream/types.h
-./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
-./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
-./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
-./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
-./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
-./html/htmlparser.o: textstream/textstream.h textstream/stream.h
-./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
-./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
-./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
-./html/htmlparser.o: textstream/types.h convert/text.h
 ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
 ./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
 ./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
 ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
+./html/bbcodeparser.o: utf8/utf8_stream.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h
+./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
+./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
+./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
+./csv/csvparser.o: textstream/types.h utf8/utf8_stream.h
+./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
+./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
+./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
+./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
+./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp
index ffdf457..ffa757e 100644
--- a/src/convert/misc.cpp
+++ b/src/convert/misc.cpp
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2017-2021, Tomasz Sowa
+ * Copyright (c) 2017-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,62 +51,83 @@ void SetOverflow(bool * was_overflow, bool val)
 }
 
 
-void esc_to_json(char val, Stream & out)
+
+void esc_to_json_uformat(wchar_t val, Stream & out)
 {
-	if( (unsigned char)val < 32 )
+	char buf[10];
+	size_t len;
+
+	Toa((unsigned long)val, buf, sizeof(buf)/sizeof(char), 16, &len);
+
+	out << "\\u";
+
+	if( len < 4 )
 	{
-		char buf[10];
-		size_t len;
-		Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len);
-
-		out << "\\u";
-
-		if( len < 4 )
+		for(size_t i=0 ; i < (4-len) ; ++i)
 		{
-			for(size_t i=0 ; i < (4-len) ; ++i)
-			{
-				out << '0';
-			}
-		}
-
-		out << buf;
-	}
-	else
-	{
-	// CHECKME
-	// \r \n \t are <32 and will be serialized os \u.... above
-
-		switch( val )
-		{
-		case 0:		out << '\\';	out << '0';		break;	// may to skip this character is better?
-		case '\r':	out << '\\';	out << 'r';		break;
-		case '\n':	out << '\\';	out << 'n';		break;
-		case '\t':	out << '\\';	out << 't';		break;
-		case 0x08:	out << '\\';	out << 'b';		break;
-		case 0x0c:	out << '\\';	out << 'f';		break;
-		case '\\':	out << '\\';	out << '\\';		break;
-		case '"':	out << '\\';	out << '\"';		break;
-		default:
-			out << val;
+			out << '0';
 		}
 	}
+
+	out << buf;
 }
 
 
 void esc_to_json(wchar_t val, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
-
-	size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
-
-	for(size_t a = 0 ; a < len ; ++a)
+	if( val == '\r' )
 	{
-		esc_to_json(utf8_buf[a], out);
+		out << '\\' << 'r';
+	}
+	else
+	if( val == '\n' )
+	{
+		out << '\\' << 'n';
+	}
+	else
+	if( val == '\t' )
+	{
+		out << '\\' << 't';
+	}
+	else
+	if( val == 0x08 )
+	{
+		out << '\\' << 'b';
+	}
+	else
+	if( val == 0x0c )
+	{
+		out << '\\' << 'f';
+	}
+	else
+	if( val == '\\' )
+	{
+		out << '\\' << '\\';
+	}
+	else
+	if( val == '"' )
+	{
+		out << '\\' << '\"';
+	}
+	else
+	if( val < 32 )
+	{
+		esc_to_json_uformat(val, out);
+	}
+	else
+	{
+		out << val;
 	}
 }
 
 
+void esc_to_json(char val, Stream & out)
+{
+	esc_to_json((wchar_t)(unsigned char)val, out);
+}
+
+
+
 void esc_to_json(const char * c, pt::Stream & out)
 {
 	for(size_t i = 0 ; c[i] != 0 ; ++i)
@@ -159,10 +180,15 @@ void esc_to_json(const std::wstring & in, Stream & out)
 
 
 
-void esc_to_xml(char val, Stream & out)
+void esc_to_xml(wchar_t val, Stream & out)
 {
 	switch(val)
 	{
+	case 0:
+		// null character is invalid in XML 1.0 and 1.1
+		// https://en.wikipedia.org/wiki/Valid_characters_in_XML
+		break;
+
 	case '<':
 		out << "&lt;";
 		break;
@@ -182,22 +208,13 @@ void esc_to_xml(char val, Stream & out)
 	default:
 		out << val;
 		break;
-
-	// what about zero (null) character?
 	}
 }
 
-void esc_to_xml(wchar_t val, Stream & out)
+
+void esc_to_xml(char val, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
-
-	size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
-
-	for(size_t a = 0 ; a < len ; ++a)
-	{
-		esc_to_xml(utf8_buf[a], out);
-	}
+	esc_to_xml((wchar_t)(unsigned char)val, out);
 }
 
 
@@ -252,10 +269,14 @@ void esc_to_xml(const std::wstring & in, Stream & out)
 
 
 
-void esc_to_csv(char c, pt::Stream & out)
+void esc_to_csv(wchar_t c, pt::Stream & out)
 {
 	switch(c)
 	{
+	case 0:
+		// null characters are invalid in text files
+		break;
+
 	case '"':
 		out << "\"\"";
 		break;
@@ -263,27 +284,16 @@ void esc_to_csv(char c, pt::Stream & out)
 	default:
 		out << c;
 		break;
-
-	// what about zero (null) character?
 	}
 }
 
 
-void esc_to_csv(wchar_t val, Stream & out)
+void esc_to_csv(char val, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
-
-	size_t len = int_to_utf8(static_cast<int>(val), utf8_buf, utf8_buf_len);
-
-	for(size_t a = 0 ; a < len ; ++a)
-	{
-		esc_to_csv(utf8_buf[a], out);
-	}
+	esc_to_csv((wchar_t)(unsigned char)val, out);
 }
 
 
-
 void esc_to_csv(const char * c, pt::Stream & out)
 {
 	for(size_t i = 0 ; c[i] != 0 ; ++i)
diff --git a/src/convert/misc.h b/src/convert/misc.h
index 51f4159..5070655 100644
--- a/src/convert/misc.h
+++ b/src/convert/misc.h
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2017-2021, Tomasz Sowa
+ * Copyright (c) 2017-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -50,8 +50,8 @@ namespace pt
 
 void SetOverflow(bool * was_overflow, bool val);
 
-void esc_to_json(char val, Stream & out);
 void esc_to_json(wchar_t val, Stream & out);
+void esc_to_json(char val, Stream & out);
 void esc_to_json(const char * c, pt::Stream & out);
 void esc_to_json(const char * c, std::size_t len, Stream & out);
 void esc_to_json(const wchar_t * c, Stream & out);
@@ -59,8 +59,8 @@ void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_json(const std::string & in, Stream & out);
 void esc_to_json(const std::wstring & in, Stream & out);
 
-void esc_to_xml(char c, pt::Stream & out);
 void esc_to_xml(wchar_t c, pt::Stream & out);
+void esc_to_xml(char c, pt::Stream & out);
 void esc_to_xml(const char * c, pt::Stream & out);
 void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
 void esc_to_xml(const wchar_t * c, pt::Stream & out);
@@ -68,8 +68,8 @@ void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_xml(const std::string & in, Stream & out);
 void esc_to_xml(const std::wstring & in, Stream & out);
 
-void esc_to_csv(char c, pt::Stream & out);
 void esc_to_csv(wchar_t val, Stream & out);
+void esc_to_csv(char c, pt::Stream & out);
 void esc_to_csv(const char * c, std::size_t len, Stream & out);
 void esc_to_csv(const char * c, pt::Stream & out);
 void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
@@ -82,34 +82,13 @@ void esc_to_csv(const std::string & in, Stream & out);
 template<typename StreamType>
 void esc_to_json(const StreamType & in, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
-	int res;
-	bool correct;
 
-	for( ; i != end ; ++i)
+	while( i != end )
 	{
-		if( in.is_wchar_stream() && out.is_char_stream() )
-		{
-			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
-			esc_to_json(utf8_buf, len, out);
-		}
-		else
-		if( in.is_char_stream() && out.is_wchar_stream() )
-		{
-			utf8_to_int(i, end, res, correct);
-
-			if( correct )
-				esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
-
-			// put replacement char if not correct?
-		}
-		else
-		{
-			esc_to_json(static_cast<wchar_t>(*i), out);
-		}
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_json(c, out);
 	}
 }
 
@@ -117,34 +96,13 @@ void esc_to_json(const StreamType & in, Stream & out)
 template<typename StreamType>
 void esc_to_xml(const StreamType & in, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
-	int res;
-	bool correct;
 
-	for( ; i != end ; ++i)
+	while( i != end )
 	{
-		if( in.is_wchar_stream() && out.is_char_stream() )
-		{
-			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
-			esc_to_xml(utf8_buf, len, out);
-		}
-		else
-		if( in.is_char_stream() && out.is_wchar_stream() )
-		{
-			utf8_to_int(i, end, res, correct);
-
-			if( correct )
-				esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
-
-			// put replacement char if not correct?
-		}
-		else
-		{
-			esc_to_xml(static_cast<wchar_t>(*i), out);
-		}
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_xml(c, out);
 	}
 }
 
@@ -152,34 +110,13 @@ void esc_to_xml(const StreamType & in, Stream & out)
 template<typename StreamType>
 void esc_to_csv(const StreamType & in, Stream & out)
 {
-	char utf8_buf[10];
-	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
-	int res;
-	bool correct;
 
-	for( ; i != end ; ++i)
+	while( i != end )
 	{
-		if( in.is_wchar_stream() && out.is_char_stream() )
-		{
-			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
-			esc_to_csv(utf8_buf, len, out);
-		}
-		else
-		if( in.is_char_stream() && out.is_wchar_stream() )
-		{
-			utf8_to_int(i, end, res, correct);
-
-			if( correct )
-				esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
-
-			// put replacement char if not correct?
-		}
-		else
-		{
-			esc_to_csv(static_cast<wchar_t>(*i), out);
-		}
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_csv(c, out);
 	}
 }
 
diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h
index 772a073..6f0053d 100644
--- a/src/textstream/textstream.h
+++ b/src/textstream/textstream.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2012-2021, Tomasz Sowa
+ * Copyright (c) 2012-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -46,6 +46,7 @@
 #include "membuffer/membuffer.h"
 #include "types.h"
 #include "utf8/utf8.h"
+#include "utf8/utf8_stream.h"
 
 // for snprintf
 #include <cstdio>
@@ -71,8 +72,67 @@ public:
 	typedef CharT char_type;
 
 	typedef MemBuffer<char_type, stack_size, heap_block_size> buffer_type;
-	typedef typename buffer_type::iterator iterator;
-	typedef typename buffer_type::const_iterator const_iterator;
+
+
+	class iterator
+	{
+	public:
+
+		typename buffer_type::iterator membuffer_iterator;
+
+		iterator();
+		iterator(const iterator & i);
+		iterator & operator=(const iterator & i);
+
+		iterator(const typename buffer_type::iterator & i);
+		iterator & operator=(const typename buffer_type::iterator & i);
+
+		bool operator==(const iterator & i) const;
+		bool operator!=(const iterator & i) const;
+
+		iterator & operator++(); 		// prefix  ++
+		iterator   operator++(int); 	// postfix ++
+
+		iterator & operator--(); 		// prefix  --
+		iterator   operator--(int); 	// postfix --
+
+		CharT & operator*();
+
+		wchar_t get_unicode_and_advance(const iterator & end);
+	};
+
+
+	class const_iterator
+	{
+	public:
+
+		typename buffer_type::const_iterator membuffer_const_iterator;
+
+		const_iterator();
+		const_iterator(const const_iterator & i);
+		const_iterator(const iterator & i);
+		const_iterator & operator=(const const_iterator & i);
+		const_iterator & operator=(const iterator & i);
+
+		const_iterator(const typename buffer_type::const_iterator & i);
+		const_iterator(const typename buffer_type::iterator & i);
+		const_iterator & operator=(const typename buffer_type::const_iterator & i);
+		const_iterator & operator=(const typename buffer_type::iterator & i);
+
+		bool operator==(const const_iterator & i) const;
+		bool operator!=(const const_iterator & i) const;
+
+		const_iterator & operator++(); 		// prefix  ++
+		const_iterator   operator++(int); 	// postfix ++
+
+		const_iterator & operator--(); 		// prefix  --
+		const_iterator   operator--(int); 	// postfix --
+
+		CharT operator*();
+
+		wchar_t get_unicode_and_advance(const const_iterator & end);
+
+	};
 
 
 	bool is_char_stream() const;
@@ -112,7 +172,7 @@ public:
 
 	TextStreamBase & operator<<(char);
 	TextStreamBase & operator<<(unsigned char);
-	TextStreamBase & operator<<(wchar_t);
+	TextStreamBase & operator<<(wchar_t); // no surrogate pairs are used
 	TextStreamBase & operator<<(bool);
 	TextStreamBase & operator<<(short);
 	TextStreamBase & operator<<(int);
@@ -173,6 +233,272 @@ TextStreamBase<char_type, stack_size, heap_block_size>::TextStreamBase()
 }
 
 
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator()
+{
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const iterator & i) : membuffer_iterator(i)
+{
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const iterator & i)
+{
+	membuffer_iterator = i;
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const typename buffer_type::iterator & i) : membuffer_iterator(i)
+{
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const typename buffer_type::iterator & i)
+{
+	membuffer_iterator = i;
+}
+
+
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+bool TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator==(const iterator & i) const
+{
+	return membuffer_iterator == i.membuffer_iterator;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+bool TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator!=(const iterator & i) const
+{
+	return membuffer_iterator != i.membuffer_iterator;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++()
+{
+	++membuffer_iterator;
+	return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++(int)
+{
+	const_iterator old(*this);
+	membuffer_iterator++;
+	return old;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--()
+{
+	--membuffer_iterator;
+	return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator
+TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--(int)
+{
+	const_iterator old(*this);
+	membuffer_iterator--;
+	return old;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+char_type & TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator*()
+{
+	return *membuffer_iterator;
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+wchar_t TextStreamBase<char_type, stack_size, heap_block_size>::iterator::get_unicode_and_advance(const iterator & end)
+{
+	if( *this != end )
+	{
+		if constexpr (sizeof(char_type) == sizeof(char) )
+		{
+			int res;
+			bool correct;
+			utf8_to_int(*this, end, res, correct);
+
+			if( correct )
+				return static_cast<wchar_t>(res);
+			else
+				return static_cast<wchar_t>(0xFFFD); // U+FFFD "replacement character"
+		}
+		else
+		{
+			wchar_t c = operator*();
+			++membuffer_iterator;
+			return c;
+		}
+	}
+
+	return 0;
+}
+
+
+
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator()
+{
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const const_iterator & i) : membuffer_const_iterator(i.membuffer_const_iterator)
+{
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const iterator & i) : membuffer_const_iterator(i.membuffer_iterator)
+{
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const const_iterator & i)
+{
+	membuffer_const_iterator = i.membuffer_const_iterator;
+	 return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const iterator & i)
+{
+	membuffer_const_iterator = i.membuffer_iterator;
+	 return *this;
+}
+
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const typename buffer_type::const_iterator & i) : membuffer_const_iterator(i)
+{
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_iterator(const typename buffer_type::iterator & i) : membuffer_const_iterator(i)
+{
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::const_iterator & i)
+{
+	 membuffer_const_iterator = i;
+	 return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::iterator & i)
+{
+	 membuffer_const_iterator = i;
+	 return *this;
+}
+
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+bool TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator==(const const_iterator & i) const
+{
+	return membuffer_const_iterator == i.membuffer_const_iterator;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+bool TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator!=(const const_iterator & i) const
+{
+	return membuffer_const_iterator != i.membuffer_const_iterator;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++()
+{
+	++membuffer_const_iterator;
+	return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++(int)
+{
+	const_iterator old(*this);
+	membuffer_const_iterator++;
+	return old;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--()
+{
+	--membuffer_const_iterator;
+	return *this;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
+TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--(int)
+{
+	const_iterator old(*this);
+	membuffer_const_iterator--;
+	return old;
+}
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+char_type TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator*()
+{
+	return *membuffer_const_iterator;
+}
+
+
+template<typename char_type, size_t stack_size, size_t heap_block_size>
+wchar_t TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::get_unicode_and_advance(const const_iterator & end)
+{
+	if( *this != end )
+	{
+		if constexpr (sizeof(char_type) == sizeof(char) )
+		{
+			int res;
+			bool correct;
+			pt::utf8_to_int(*this, end, res, correct);
+
+			if( correct )
+				return static_cast<wchar_t>(res);
+			else
+				return static_cast<wchar_t>(0xFFFD); // U+FFFD "replacement character"
+		}
+		else
+		{
+			wchar_t c = operator*();
+			++membuffer_const_iterator;
+			return c;
+		}
+	}
+
+	return 0;
+}
+
+
+
+
+
 template<typename char_type, size_t stack_size, size_t heap_block_size>
 bool TextStreamBase<char_type, stack_size, heap_block_size>::is_char_stream() const
 {
@@ -433,10 +759,14 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
 TextStreamBase<char_type, stack_size, heap_block_size> &
 TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(char v)
 {
-	// IMPROVEME
-	// if char_type == 1 then if v <= 127 then put that char but if (unsigned)v > 127 put replacement character
-	// if char_type > 1 then simply put that character
-	buffer.append(static_cast<char_type>(v));
+	if constexpr (sizeof(char_type) == sizeof(wchar_t) )
+	{
+		buffer.append(static_cast<char_type>(static_cast<unsigned char>(v)));
+	}
+	else
+	{
+		buffer.append(v);
+	}
 
 return *this;
 }
@@ -446,9 +776,6 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
 TextStreamBase<char_type, stack_size, heap_block_size> &
 TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(unsigned char v)
 {
-	// IMPROVEME
-	// if char_type == 1 then if v <= 127 then put that char but if v > 127 put replacement character
-	// if char_type > 1 then simply put that character
 	buffer.append(static_cast<char_type>(v));
 
 return *this;
@@ -459,8 +786,14 @@ template<typename char_type, size_t stack_size, size_t heap_block_size>
 TextStreamBase<char_type, stack_size, heap_block_size> &
 TextStreamBase<char_type, stack_size, heap_block_size>::operator<<(wchar_t v)
 {
-	// IMPROVEME add utf8/wide conversion, if v is from surrogate pair we can skip it
-	buffer.append(static_cast<char_type>(v));
+	if constexpr (sizeof(char_type) == sizeof(wchar_t) )
+	{
+		buffer.append(v);
+	}
+	else
+	{
+		pt::int_to_utf8(static_cast<int>(v), *this);
+	}
 
 return *this;
 }
diff --git a/src/utf8/utf8_stream.h b/src/utf8/utf8_stream.h
index 3adf848..565381d 100644
--- a/src/utf8/utf8_stream.h
+++ b/src/utf8/utf8_stream.h
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2021, Tomasz Sowa
+ * Copyright (c) 2021-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -60,7 +60,7 @@ namespace pt
 template<typename StreamIteratorType>
 size_t utf8_to_int(
 		StreamIteratorType & iterator_in,
-		StreamIteratorType & iterator_end,
+		const StreamIteratorType & iterator_end,
 		int & res,
 		bool & correct)
 {
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index a9228ca..8685629 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -19,7 +19,6 @@
 ./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
-./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
 ./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h
@@ -35,3 +34,4 @@
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
 ./mainoptionsparser.o: ../src/convert/double.h
+./test.o: test.h

From 3b9b464bb7aa8fad5ce39059acc5b02de6e56b81 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Thu, 3 Feb 2022 19:21:22 +0100
Subject: [PATCH 27/37] fix: add typename keyword in TextStreamBase<> in some
 places

---
 src/textstream/textstream.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/textstream/textstream.h b/src/textstream/textstream.h
index 6f0053d..fdbbb36 100644
--- a/src/textstream/textstream.h
+++ b/src/textstream/textstream.h
@@ -247,7 +247,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const
 
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const iterator & i)
 {
 	membuffer_iterator = i;
@@ -261,7 +261,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::iterator::iterator(const
 
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator=(const typename buffer_type::iterator & i)
 {
 	membuffer_iterator = i;
@@ -283,7 +283,7 @@ bool TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator!
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++()
 {
 	++membuffer_iterator;
@@ -291,7 +291,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++()
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++(int)
 {
 	const_iterator old(*this);
@@ -300,7 +300,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator++(int
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--()
 {
 	--membuffer_iterator;
@@ -308,7 +308,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--()
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::iterator
+typename TextStreamBase<char_type, stack_size, heap_block_size>::iterator
 TextStreamBase<char_type, stack_size, heap_block_size>::iterator::operator--(int)
 {
 	const_iterator old(*this);
@@ -370,7 +370,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_it
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const const_iterator & i)
 {
 	membuffer_const_iterator = i.membuffer_const_iterator;
@@ -378,7 +378,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const iterator & i)
 {
 	membuffer_const_iterator = i.membuffer_iterator;
@@ -398,7 +398,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::const_it
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::const_iterator & i)
 {
 	 membuffer_const_iterator = i;
@@ -406,7 +406,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator=(const typename buffer_type::iterator & i)
 {
 	 membuffer_const_iterator = i;
@@ -428,7 +428,7 @@ bool TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::ope
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++()
 {
 	++membuffer_const_iterator;
@@ -436,7 +436,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator++(int)
 {
 	const_iterator old(*this);
@@ -445,7 +445,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator &
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--()
 {
 	--membuffer_const_iterator;
@@ -453,7 +453,7 @@ TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator
 }
 
 template<typename char_type, size_t stack_size, size_t heap_block_size>
-TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
+typename TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator
 TextStreamBase<char_type, stack_size, heap_block_size>::const_iterator::operator--(int)
 {
 	const_iterator old(*this);

From ac3c59323be3be15119152defa8481f7be5788b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Fri, 4 Feb 2022 14:19:54 +0100
Subject: [PATCH 28/37] add methods: try_esc_to_json(wchar_t val, stream)
 try_esc_to_xml(...) try_esc_to_csv(...)

These functions return true if the val character was handled: either
it was escaped and put to the out stream, or it is invalid for such
a stream and was dropped (nothing is put to the stream). They return
false if val needs no escaping; the caller then puts it unchanged.
---
 src/convert/misc.cpp | 144 +++++++++++++++++++++++++++++++------------
 src/convert/misc.h   |   3 +
 2 files changed, 108 insertions(+), 39 deletions(-)

diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp
index ffa757e..3d78ae7 100644
--- a/src/convert/misc.cpp
+++ b/src/convert/misc.cpp
@@ -73,48 +73,70 @@ void esc_to_json_uformat(wchar_t val, Stream & out)
 }
 
 
-void esc_to_json(wchar_t val, Stream & out)
+/*
+ * return true if the val character was escaped and put to the out stream
+ * if the character is invalid for such a stream then only return true
+ * but not put it to the stream
+ */
+bool try_esc_to_json(wchar_t val, Stream & out)
 {
+	bool status = false;
+
 	if( val == '\r' )
 	{
 		out << '\\' << 'r';
+		status = true;
 	}
 	else
 	if( val == '\n' )
 	{
 		out << '\\' << 'n';
+		status = true;
 	}
 	else
 	if( val == '\t' )
 	{
 		out << '\\' << 't';
+		status = true;
 	}
 	else
 	if( val == 0x08 )
 	{
 		out << '\\' << 'b';
+		status = true;
 	}
 	else
 	if( val == 0x0c )
 	{
 		out << '\\' << 'f';
+		status = true;
 	}
 	else
 	if( val == '\\' )
 	{
 		out << '\\' << '\\';
+		status = true;
 	}
 	else
 	if( val == '"' )
 	{
 		out << '\\' << '\"';
+		status = true;
 	}
 	else
 	if( val < 32 )
 	{
 		esc_to_json_uformat(val, out);
+		status = true;
 	}
-	else
+
+	return status;
+}
+
+
+void esc_to_json(wchar_t val, Stream & out)
+{
+	if( !try_esc_to_json(val, out) )
 	{
 		out << val;
 	}
@@ -123,7 +145,10 @@ void esc_to_json(wchar_t val, Stream & out)
 
 void esc_to_json(char val, Stream & out)
 {
-	esc_to_json((wchar_t)(unsigned char)val, out);
+	if( !try_esc_to_json((wchar_t)(unsigned char)val, out) )
+	{
+		out << val;
+	}
 }
 
 
@@ -177,44 +202,66 @@ void esc_to_json(const std::wstring & in, Stream & out)
 
 
 
+/*
+ * return true if the val character was escaped and put to the out stream
+ * if the character is invalid for such a stream then only return true
+ * but not put it to the stream
+ */
+bool try_esc_to_xml(wchar_t val, Stream & out)
+{
+	bool status = false;
 
+	if( val == 0 )
+	{
+		// null character is invalid in XML 1.0 and 1.1
+		// https://en.wikipedia.org/wiki/Valid_characters_in_XML
+		// return true but not put the char to the out stream
+		status = true;
+	}
+	else
+	if( val == '<')
+	{
+		out << "&lt;";
+		status = true;
+	}
+	else
+	if( val == '>')
+	{
+		out << "&gt;";
+		status = true;
+	}
+	else
+	if( val == '&')
+	{
+		out << "&amp;";
+		status = true;
+	}
+	else
+	if( val == '"')
+	{
+		out << "&quot;";
+		status = true;
+	}
+
+	return status;
+}
 
 
 void esc_to_xml(wchar_t val, Stream & out)
 {
-	switch(val)
+	if( !try_esc_to_xml(val, out) )
 	{
-	case 0:
-		// null character is invalid in XML 1.0 and 1.1
-		// https://en.wikipedia.org/wiki/Valid_characters_in_XML
-		break;
-
-	case '<':
-		out << "&lt;";
-		break;
-
-	case '>':
-		out << "&gt;";
-		break;
-
-	case '&':
-		out << "&amp;";
-		break;
-
-	case '"':
-		out << "&quot;";
-		break;
-
-	default:
 		out << val;
-		break;
 	}
 }
 
 
 void esc_to_xml(char val, Stream & out)
 {
-	esc_to_xml((wchar_t)(unsigned char)val, out);
+	if( !try_esc_to_xml((wchar_t)(unsigned char)val, out) )
+	{
+		out << val;
+	}
 }
 
 
@@ -269,28 +316,47 @@ void esc_to_xml(const std::wstring & in, Stream & out)
 
 
 
-void esc_to_csv(wchar_t c, pt::Stream & out)
+/*
+ * return true if the val character was escaped and put to the out stream
+ * if the character is invalid for such a stream then only return true
+ * but not put it to the stream
+ */
+bool try_esc_to_csv(wchar_t val, pt::Stream & out)
 {
-	switch(c)
+	bool status = false;
+
+	if( val == 0 )
 	{
-	case 0:
 		// null characters are invalid in text files
-		break;
-
-	case '"':
+		// return true but not put to the out stream
+		status = true;
+	}
+	else
+	if( val == '"' )
+	{
 		out << "\"\"";
-		break;
+		status = true;
+	}
 
-	default:
-		out << c;
-		break;
+	return status;
+}
+
+
+void esc_to_csv(wchar_t val, pt::Stream & out)
+{
+	if( !try_esc_to_csv(val, out) )
+	{
+		out << val;
 	}
 }
 
 
 void esc_to_csv(char val, Stream & out)
 {
-	esc_to_csv((wchar_t)(unsigned char)val, out);
+	if( !try_esc_to_csv((wchar_t)(unsigned char)val, out) )
+	{
+		out << val;
+	}
 }
 
 
diff --git a/src/convert/misc.h b/src/convert/misc.h
index 5070655..e8b10bf 100644
--- a/src/convert/misc.h
+++ b/src/convert/misc.h
@@ -50,6 +50,7 @@ namespace pt
 
 void SetOverflow(bool * was_overflow, bool val);
 
+bool try_esc_to_json(wchar_t val, Stream & out);
 void esc_to_json(wchar_t val, Stream & out);
 void esc_to_json(char val, Stream & out);
 void esc_to_json(const char * c, pt::Stream & out);
@@ -59,6 +60,7 @@ void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_json(const std::string & in, Stream & out);
 void esc_to_json(const std::wstring & in, Stream & out);
 
+bool try_esc_to_xml(wchar_t val, Stream & out);
 void esc_to_xml(wchar_t c, pt::Stream & out);
 void esc_to_xml(char c, pt::Stream & out);
 void esc_to_xml(const char * c, pt::Stream & out);
@@ -68,6 +70,7 @@ void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_xml(const std::string & in, Stream & out);
 void esc_to_xml(const std::wstring & in, Stream & out);
 
+bool try_esc_to_csv(wchar_t val, pt::Stream & out);
 void esc_to_csv(wchar_t val, Stream & out);
 void esc_to_csv(char c, pt::Stream & out);
 void esc_to_csv(const char * c, std::size_t len, Stream & out);

From 0100c7e453b1b140ec5b31c726d65de25812d620 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 8 Feb 2022 14:52:50 +0100
Subject: [PATCH 29/37] fix: check correctly for new lines when filtering html

---
 src/html/htmlparser.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 4186445..57f8d00 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -1413,6 +1413,7 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 // reading text between html tags
 void HTMLParser::ReadText(bool is_cdata)
 {
+	new_item_has_new_line_before = false;
 	bool was_white_char = false;
 	bool was_new_line = false;
 
@@ -1448,6 +1449,9 @@ void HTMLParser::ReadText(bool is_cdata)
 		tmp_text.clear();
 		was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
 
+		if( lastc == -1 || was_closing_tag )
+			new_item_has_new_line_before = was_new_line;
+
 		if( !tmp_text.empty() )
 		{
 			allow_put_new_line = false;
@@ -1464,6 +1468,7 @@ void HTMLParser::ReadText(bool is_cdata)
 			{
 				SkipWhiteLines(text_space_wstr);
 				PutNonBreakingSpace();
+				was_new_line = false;
 			}
 		}
 		else
@@ -1502,7 +1507,6 @@ void HTMLParser::ReadText(bool is_cdata)
 	}
 
 	text_space_tmp.clear();
-	new_item_has_new_line_before = was_new_line;
 }
 
 

From 5253963c84ece446ab502e22ef32cf2e9ea30b3c Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 8 Feb 2022 16:34:54 +0100
Subject: [PATCH 30/37] fix: put a white char before an opening tag in tree
 mode if it was in the source html

---
 src/html/htmlparser.cpp | 49 +++++++++++++++++++++++++++--------------
 src/html/htmlparser.h   | 13 +++++++----
 2 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 57f8d00..c5e37cf 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2008-2021, Tomasz Sowa
+ * Copyright (c) 2008-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -71,16 +71,17 @@ void HTMLParser::clear_input_flags()
 void HTMLParser::Item::Clear()
 {
 	name.clear();
-	type            = none;
-	is_commentary   = false;
-	is_cdata		= false;
-	porphans        = nullptr;
-	new_line_before = false;
-	new_line        = false;
+	type				= none;
+	is_commentary		= false;
+	is_cdata			= false;
+	porphans			= nullptr;
+	new_line_before		= false;
+	new_line_after		= false;
 	new_line_in_the_middle = false;
-	has_body_tag    = false;
-	tree_index      = 0;
-	space           = nullptr;
+	white_char_before	= false;
+	has_body_tag		= false;
+	tree_index			= 0;
+	space				= nullptr;
 }
 
 
@@ -1175,10 +1176,18 @@ bool HTMLParser::PutOpeningTag()
 		return false;
 	}
 
-	if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
+	if( current_white_char_mode() == WHITE_MODE_TREE )
 	{
-		Put(10);
-		PutTabs(LastItem().tree_index);
+		if( LastItem().new_line_before )
+		{
+			Put(10);
+			PutTabs(LastItem().tree_index);
+		}
+		else
+		if( LastItem().white_char_before )
+		{
+			Put(' ');
+		}
 	}
 
 	PutOpeningTagMark();
@@ -1414,6 +1423,8 @@ void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
 void HTMLParser::ReadText(bool is_cdata)
 {
 	new_item_has_new_line_before = false;
+	new_item_has_white_char_before = false;
+
 	bool was_white_char = false;
 	bool was_new_line = false;
 
@@ -1424,7 +1435,7 @@ void HTMLParser::ReadText(bool is_cdata)
 
 	if( current_white_char_mode() == WHITE_MODE_TREE )
 	{
-		if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
+		if( LastItem().new_line_after || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
 		{
 			allow_put_new_line = true;
 		}
@@ -1450,7 +1461,10 @@ void HTMLParser::ReadText(bool is_cdata)
 		was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
 
 		if( lastc == -1 || was_closing_tag )
+		{
 			new_item_has_new_line_before = was_new_line;
+			new_item_has_white_char_before = was_white_char;
+		}
 
 		if( !tmp_text.empty() )
 		{
@@ -1486,7 +1500,7 @@ void HTMLParser::ReadText(bool is_cdata)
 					LastItem().new_line_in_the_middle = true;
 
 					if( !was_non_white_text )
-						LastItem().new_line = true;
+						LastItem().new_line_after = true;
 				}
 				else
 				{
@@ -1758,6 +1772,7 @@ bool HTMLParser::ReadItem()
 		return false;
 
 	LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
+	LastItem().white_char_before = new_item_has_white_char_before; // new_item_has_white_char_before is set by ReadText() method
 
 	if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
 		LastItem().tree_index += 1;
@@ -1971,7 +1986,7 @@ void HTMLParser::CheckStackPrintRest()
 {
 	while( stack_len-- > 0 )
 	{
-		if( stack_len==0 || pstack[stack_len-1].new_line )
+		if( stack_len==0 || pstack[stack_len-1].new_line_after )
 		{
 			if( current_white_char_mode() == WHITE_MODE_TREE )
 			{
@@ -2030,7 +2045,7 @@ void HTMLParser::CheckClosingTags()
 
 		if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) )
 		{
-			if( pstack[z].new_line )
+			if( pstack[z].new_line_after )
 			{
 				if( current_white_char_mode() == WHITE_MODE_TREE )
 				{
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 7797b51..fb63809 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2008-2021, Tomasz Sowa
+ * Copyright (c) 2008-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -134,14 +134,18 @@ public:
 
 		bool is_cdata;
 
+		// is there a new line before this tag (or just a new line and some white characters)
 		bool new_line_before;
 
-		// is there a new line after this tag
-		bool new_line;
+		// is there a new line after this tag (or just some white characters and a new line)
+		bool new_line_after;
 
-		// is there a new
+		// is there a new line in the middle after this tag and before the next tag
 		bool new_line_in_the_middle;
 
+		// is there a white char (but not new line) before this tag
+		bool white_char_before;
+
 		// current orphans table
 		// (will be propagated)
 		Orphans * porphans;
@@ -448,6 +452,7 @@ protected:
 
 	//bool last_new_line;
 	bool new_item_has_new_line_before;
+	bool new_item_has_white_char_before;
 	int white_mode;
 	bool is_first_item;
 	size_t wrap_line;		// insert a new line character into long lines

From 3173042229f29d6c88549192a9a2c622819b8c97 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Tue, 26 Apr 2022 23:47:27 +0200
Subject: [PATCH 31/37] make depend

---
 src/Makefile.dep   | 26 +++++++++++++-------------
 tests/Makefile.dep | 13 +++++++------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/Makefile.dep b/src/Makefile.dep
index b037dab..bbf1f99 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -50,19 +50,6 @@
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
-./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
-./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
-./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
-./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
-./html/bbcodeparser.o: utf8/utf8_stream.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
-./html/htmlparser.o: textstream/textstream.h textstream/stream.h
-./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
-./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
-./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
-./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
@@ -74,3 +61,16 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
+./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
+./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
+./html/bbcodeparser.o: utf8/utf8_stream.h
diff --git a/tests/Makefile.dep b/tests/Makefile.dep
index 8685629..8cf551c 100644
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -8,8 +8,8 @@
 ./convert.o: ../src/textstream/stream.h ../src/utf8/utf8_templates.h
 ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
-./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
-./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./convert.o: ../src/utf8/utf8_stream.h ../src/convert/strtoint.h
+./convert.o: ../src/convert/text.h ../src/convert/misc.h
 ./convert.o: ../src/convert/double.h
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
@@ -17,8 +17,10 @@
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
 ./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
 ./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
-./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
+./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
+./csvparser.o: ../src/utf8/utf8_stream.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
+./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
 ./mainoptionsparser.o: ../src/space/space.h ../src/textstream/types.h
@@ -31,7 +33,6 @@
 ./mainoptionsparser.o: ../src/textstream/textstream.h
 ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
-./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
-./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./mainoptionsparser.o: ../src/utf8/utf8_stream.h ../src/convert/strtoint.h
+./mainoptionsparser.o: ../src/convert/text.h ../src/convert/misc.h
 ./mainoptionsparser.o: ../src/convert/double.h
-./test.o: test.h

From 72c10b20fb4dfe096bf00f5c61fb545ba39ba218 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 27 Apr 2022 22:07:58 +0200
Subject: [PATCH 32/37] flush logs when printing to stdout

---
 src/log/filelog.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/log/filelog.cpp b/src/log/filelog.cpp
index afb76f3..c3708c2 100644
--- a/src/log/filelog.cpp
+++ b/src/log/filelog.cpp
@@ -112,6 +112,7 @@ void FileLog::save_log(WTextStream * buffer)
 			if( log_stdout )
 			{
 				wide_stream_to_utf8(*buffer, std::cout);
+				std::cout.flush();
 			}
 
 			if( !log_file.empty() )

From 5d2788d0d83f7357fffa40ca056e3373adbaa398 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 25 May 2022 19:57:35 +0200
Subject: [PATCH 33/37] add Log::put_multiline() methods

---
 src/log/log.cpp | 48 ++++++++++++++++++++++++++++++++++-
 src/log/log.h   | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/src/log/log.cpp b/src/log/log.cpp
index 921061b..a52ad17 100644
--- a/src/log/log.cpp
+++ b/src/log/log.cpp
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2018-2021, Tomasz Sowa
+ * Copyright (c) 2018-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -684,5 +684,51 @@ void Log::save_log()
 }
 
 
+
+Log & Log::put_multiline(const char * prefix, const char * msg)
+{
+	put_multiline_generic(prefix, msg);
+	return *this;
+}
+
+
+Log & Log::put_multiline(const wchar_t * prefix, const wchar_t * msg)
+{
+	put_multiline_generic(prefix, msg);
+	return *this;
+}
+
+
+Log & Log::put_multiline(const char * prefix, const std::string & msg)
+{
+	put_multiline_generic(prefix, msg.c_str());
+	return *this;
+}
+
+
+Log & Log::put_multiline(const wchar_t * prefix, const std::wstring & msg)
+{
+	put_multiline_generic(prefix, msg.c_str());
+	return *this;
+}
+
+
+Log & Log::put_multiline(const std::string & prefix, const std::string & msg)
+{
+	put_multiline_generic(prefix.c_str(), msg.c_str());
+	return *this;
+}
+
+
+Log & Log::put_multiline(const std::wstring & prefix, const std::wstring & msg)
+{
+	put_multiline_generic(prefix.c_str(), msg.c_str());
+	return *this;
+}
+
+
+
+
+
 } // namespace
 
diff --git a/src/log/log.h b/src/log/log.h
index 9a19875..afe107f 100644
--- a/src/log/log.h
+++ b/src/log/log.h
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2018-2021, Tomasz Sowa
+ * Copyright (c) 2018-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -156,6 +156,17 @@ public:
 	virtual Log & put_binary_blob(const char * blob, size_t blob_len);
 	virtual Log & put_binary_blob(const std::string & blob);
 
+	/*
+	 * put multiline message
+	 * leading and trailing new line characters are trimmed, empty lines are skipped
+	 * at the beginning of each line a prefix is inserted
+	 */
+	virtual Log & put_multiline(const char * prefix, 		const char * msg);
+	virtual Log & put_multiline(const wchar_t * prefix, 	const wchar_t * msg);
+	virtual Log & put_multiline(const char * prefix, 		const std::string & msg);
+	virtual Log & put_multiline(const wchar_t * prefix, 	const std::wstring & msg);
+	virtual Log & put_multiline(const std::string & prefix, const std::string & msg);
+	virtual Log & put_multiline(const std::wstring & prefix, const std::wstring & msg);
 
 
 protected:
@@ -184,6 +195,10 @@ protected:
 	virtual void save_log();
 	virtual void save_log_and_clear();
 
+	template<typename CharType>
+	void put_multiline_generic(const CharType * prefix, const CharType * msg);
+
+
 };
 
 
@@ -221,11 +236,61 @@ Log & Log::log_string_generic(const StringType & value, size_t max_size)
 
 
 
+template<typename CharType>
+void Log::put_multiline_generic(const CharType * prefix, const CharType * msg)
+{
+	bool put_prefix = true;
+	bool was_new_line = false;
+	bool was_something_printed = false;
+
+	while( *msg )
+	{
+		if( static_cast<CharType>(*msg) == static_cast<CharType>('\n') )
+		{
+			was_new_line = true;
+			put_prefix = true;
+		}
+		else
+		{
+			if( was_new_line )
+			{
+				if( was_something_printed )
+				{
+					operator<<(logend);
+				}
+
+				was_new_line = false;
+			}
+
+			if( put_prefix )
+			{
+				operator<<(prefix);
+				put_prefix = false;
+			}
+
+			operator<<(*msg);
+			was_something_printed = true;
+		}
+
+		msg += 1;
+	}
+
+	if( was_something_printed )
+	{
+		operator<<(logend);
+	}
+}
+
+
 
 
 } // namespace
 
 
+
+
+
+
 #endif
 
 

From c3b7ab57936fd6c07a3ea3eaa795657fd03b2d8e Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Sat, 28 May 2022 06:06:32 +0200
Subject: [PATCH 34/37] add min_width parameter to methods converting int to
 string

---
 src/convert/inttostr.cpp |  66 ++++++++++++------------
 src/convert/inttostr.h   | 105 +++++++++++++++++++++------------------
 2 files changed, 90 insertions(+), 81 deletions(-)

diff --git a/src/convert/inttostr.cpp b/src/convert/inttostr.cpp
index b9a8d6d..89d9272 100644
--- a/src/convert/inttostr.cpp
+++ b/src/convert/inttostr.cpp
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2021, Tomasz Sowa
+ * Copyright (c) 2021-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -41,114 +41,114 @@
 namespace pt
 {
 
-std::string to_str(unsigned long long value, int base)
+std::string to_str(unsigned long long value, int base, size_t min_width)
 {
 	std::string res;
-	Toa(value, res, false, base);
+	Toa(value, res, false, base, min_width);
 
 	return res;
 }
 
 
-std::string to_str(long long value, int base)
+std::string to_str(long long value, int base, size_t min_width)
 {
 	std::string res;
-	Toa(value, res, false, base);
+	Toa(value, res, false, base, min_width);
 
 	return res;
 }
 
 
-std::string to_str(unsigned long value, int base)
+std::string to_str(unsigned long value, int base, size_t min_width)
 {
-	return to_str(static_cast<unsigned long long>(value), base);
+	return to_str(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::string to_str(long value, int base)
+std::string to_str(long value, int base, size_t min_width)
 {
-	return to_str(static_cast<long long>(value), base);
+	return to_str(static_cast<long long>(value), base, min_width);
 }
 
 
-std::string to_str(unsigned int value, int base)
+std::string to_str(unsigned int value, int base, size_t min_width)
 {
-	return to_str(static_cast<unsigned long long>(value), base);
+	return to_str(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::string to_str(int value, int base)
+std::string to_str(int value, int base, size_t min_width)
 {
-	return to_str(static_cast<long long>(value), base);
+	return to_str(static_cast<long long>(value), base, min_width);
 }
 
 
-std::string to_str(unsigned short value, int base)
+std::string to_str(unsigned short value, int base, size_t min_width)
 {
-	return to_str(static_cast<unsigned long long>(value), base);
+	return to_str(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::string to_str(short value, int base)
+std::string to_str(short value, int base, size_t min_width)
 {
-	return to_str(static_cast<long long>(value), base);
+	return to_str(static_cast<long long>(value), base, min_width);
 }
 
 
 
 
 
-std::wstring to_wstr(unsigned long long value, int base)
+std::wstring to_wstr(unsigned long long value, int base, size_t min_width)
 {
 	std::wstring res;
-	Toa(value, res, false, base);
+	Toa(value, res, false, base, min_width);
 
 	return res;
 }
 
 
-std::wstring to_wstr(long long value, int base)
+std::wstring to_wstr(long long value, int base, size_t min_width)
 {
 	std::wstring res;
-	Toa(value, res, false, base);
+	Toa(value, res, false, base, min_width);
 
 	return res;
 }
 
 
-std::wstring to_wstr(unsigned long value, int base)
+std::wstring to_wstr(unsigned long value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<unsigned long long>(value), base);
+	return to_wstr(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::wstring to_wstr(long value, int base)
+std::wstring to_wstr(long value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<long long>(value), base);
+	return to_wstr(static_cast<long long>(value), base, min_width);
 }
 
 
-std::wstring to_wstr(unsigned int value, int base)
+std::wstring to_wstr(unsigned int value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<unsigned long long>(value), base);
+	return to_wstr(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::wstring to_wstr(int value, int base)
+std::wstring to_wstr(int value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<long long>(value), base);
+	return to_wstr(static_cast<long long>(value), base, min_width);
 }
 
 
-std::wstring to_wstr(unsigned short value, int base)
+std::wstring to_wstr(unsigned short value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<unsigned long long>(value), base);
+	return to_wstr(static_cast<unsigned long long>(value), base, min_width);
 }
 
 
-std::wstring to_wstr(short value, int base)
+std::wstring to_wstr(short value, int base, size_t min_width)
 {
-	return to_wstr(static_cast<long long>(value), base);
+	return to_wstr(static_cast<long long>(value), base, min_width);
 }
 
 
diff --git a/src/convert/inttostr.h b/src/convert/inttostr.h
index f134dc8..bac0f1d 100644
--- a/src/convert/inttostr.h
+++ b/src/convert/inttostr.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2012-2021, Tomasz Sowa
+ * Copyright (c) 2012-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -52,8 +52,9 @@ namespace pt
 
 // if the buffer is too small it will be terminated at the beginning (empty string)
 // and the function returns false
+// min_width - if greater than zero then it is used for zero padding
 template<class CharType>
-bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0)
 {
 size_t i1, i2;
 long rest;
@@ -77,6 +78,14 @@ long rest;
 	}
 	while(value != 0 && i2 < buf_len);
 
+	if( min_width > 0 )
+	{
+		for( ; i2 < min_width && i2 < buf_len ; ++i2)
+		{
+			buffer[i2] = '0';
+		}
+	}
+
 	if( i2 >= buf_len )
 	{
 		buffer[0] = 0; // ops, the buffer was too small
@@ -106,7 +115,7 @@ return true;
 // if the buffer is too small it will be terminated at the beginning (empty string)
 // and the function returns false
 template<class CharType>
-bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0)
 {
 	if( len_out )
 		*len_out = 0;
@@ -126,7 +135,7 @@ bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size
 		is_sign   = true;
 	}
 
-	bool res = Toa(static_cast<unsigned long long>(value), buf, buf_len, base, len_out);
+	bool res = Toa(static_cast<unsigned long long>(value), buf, buf_len, base, len_out, min_width);
 
 	if( res )
 	{
@@ -146,44 +155,44 @@ return res;
 
 
 template<class CharType>
-bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 template<class CharType>
-bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 
 
 template<class CharType>
-bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 
 template<class CharType>
-bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 
 template<class CharType>
-bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<unsigned long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 
 template<class CharType>
-bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
 {
-	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out);
+	return Toa(static_cast<long long>(value), buffer, buf_len, base, len_out, min_width);
 }
 
 
@@ -192,7 +201,7 @@ bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t *
 
 
 template<class StringType>
-void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
 	typename StringType::value_type buffer[50];
 	size_t buffer_len = sizeof(buffer) / sizeof(wchar_t);
@@ -204,13 +213,13 @@ void Toa(unsigned long long value, StringType & res, bool clear_string = true, i
 	 * the size of the buffer is sufficient so the status should always be true
 	 */
 	size_t len_out;
-	Toa(value, buffer, buffer_len, base, &len_out);
+	Toa(value, buffer, buffer_len, base, &len_out, min_width);
 	res.append(buffer, len_out);
 }
 
 
 template<class StringType>
-void Toa(long long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
 	typename StringType::value_type buffer[50];
 	size_t buffer_len = sizeof(buffer) / sizeof(wchar_t);
@@ -222,71 +231,71 @@ void Toa(long long value, StringType & res, bool clear_string = true, int base =
 	 * the size of the buffer is sufficient so the status should always be true
 	 */
 	size_t len_out;
-	Toa(value, buffer, buffer_len, base, &len_out);
+	Toa(value, buffer, buffer_len, base, &len_out, min_width);
 	res.append(buffer, len_out);
 }
 
 
 template<class StringType>
-void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<unsigned long long>(value), res, clear_string, base);
+	Toa(static_cast<unsigned long long>(value), res, clear_string, base, min_width);
 }
 
 
 template<class StringType>
-void Toa(long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<long long>(value), res, clear_string, base);
+	Toa(static_cast<long long>(value), res, clear_string, base, min_width);
 }
 
 
 template<class StringType>
-void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<unsigned long long>(value), res, clear_string, base);
+	Toa(static_cast<unsigned long long>(value), res, clear_string, base, min_width);
 }
 
 
 template<class StringType>
-void Toa(int value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<long long>(value), res, clear_string, base);
+	Toa(static_cast<long long>(value), res, clear_string, base, min_width);
 }
 
 
 template<class StringType>
-void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<unsigned long long>(value), res, clear_string, base);
+	Toa(static_cast<unsigned long long>(value), res, clear_string, base, min_width);
 }
 
 
 template<class StringType>
-void Toa(short value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
 {
-	Toa(static_cast<long long>(value), res, clear_string, base);
+	Toa(static_cast<long long>(value), res, clear_string, base, min_width);
 }
 
 
 
-std::string to_str(unsigned long long value, int base = 10);
-std::string to_str(long long value, int base = 10);
-std::string to_str(unsigned long value, int base = 10);
-std::string to_str(long value, int base = 10);
-std::string to_str(unsigned int value, int base = 10);
-std::string to_str(int value, int base = 10);
-std::string to_str(unsigned short value, int base = 10);
-std::string to_str(short value, int base = 10);
+std::string to_str(unsigned long long value, int base = 10, size_t min_width = 0);
+std::string to_str(long long value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned long value, int base = 10, size_t min_width = 0);
+std::string to_str(long value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned int value, int base = 10, size_t min_width = 0);
+std::string to_str(int value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned short value, int base = 10, size_t min_width = 0);
+std::string to_str(short value, int base = 10, size_t min_width = 0);
 
-std::wstring to_wstr(unsigned long long value, int base = 10);
-std::wstring to_wstr(long long value, int base = 10);
-std::wstring to_wstr(unsigned long value, int base = 10);
-std::wstring to_wstr(long value, int base = 10);
-std::wstring to_wstr(unsigned int value, int base = 10);
-std::wstring to_wstr(int value, int base = 10);
-std::wstring to_wstr(unsigned short value, int base = 10);
-std::wstring to_wstr(short value, int base = 10);
+std::wstring to_wstr(unsigned long long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(long long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned int value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(int value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned short value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(short value, int base = 10, size_t min_width = 0);
 
 
 

From a40bab0445139b128593e89afd32f5ebc519cc2d Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Mon, 30 May 2022 00:55:38 +0200
Subject: [PATCH 35/37] add Space::get_table_item() method

---
 src/space/space.cpp | 14 +++++++++++++-
 src/space/space.h   |  4 ++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/space/space.cpp b/src/space/space.cpp
index 9baa939..d8cc41e 100644
--- a/src/space/space.cpp
+++ b/src/space/space.cpp
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2008-2021, Tomasz Sowa
+ * Copyright (c) 2008-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -2577,6 +2577,18 @@ void Space::remove_value_table(bool only_clear)
 }
 
 
+Space * Space::get_table_item(size_t index)
+{
+	if( is_table() && index < table_size() )
+	{
+		return value.value_table[index];
+	}
+
+	return nullptr;
+}
+
+
+
 
 } // namespace
 
diff --git a/src/space/space.h b/src/space/space.h
index cb9ebbe..1332c62 100644
--- a/src/space/space.h
+++ b/src/space/space.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2010-2021, Tomasz Sowa
+ * Copyright (c) 2010-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -635,7 +635,7 @@ public:
 	bool has_value(const wchar_t * field, const std::wstring & val) const;
 
 
-
+	Space * get_table_item(size_t index);
 
 
 protected:

From 68fe25c8bf82b49562a30265265f065293bff353 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Mon, 30 May 2022 01:01:14 +0200
Subject: [PATCH 36/37] add limits when parsing a json/space format

while here:
- add column index error
- add parsing methods with pt::TextStream and pt::WTextStream arguments
---
 src/convert/baseparser.cpp |  39 ++---
 src/convert/baseparser.h   |   8 +-
 src/space/spaceparser.cpp  | 290 +++++++++++++++++++++++++++++++------
 src/space/spaceparser.h    |  81 ++++++++++-
 4 files changed, 354 insertions(+), 64 deletions(-)

diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp
index 37fbbbf..d4abca1 100644
--- a/src/convert/baseparser.cpp
+++ b/src/convert/baseparser.cpp
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2021, Tomasz Sowa
+ * Copyright (c) 2021-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -52,6 +52,7 @@ BaseParser::BaseParser()
 void BaseParser::clear_input_flags()
 {
 	line = 0;
+	column = 0;
 	reading_from_file = false;
 	pchar_ascii = nullptr;
 	pchar_unicode = nullptr;
@@ -69,6 +70,16 @@ void BaseParser::clear_input_flags()
 }
 
 
+void BaseParser::check_new_line()
+{
+	if( lastc == '\n' )
+	{
+		++line;
+		column = 0;
+	}
+}
+
+
 int BaseParser::read_utf8_char()
 {
 int c;
@@ -86,9 +97,7 @@ bool correct;
 	while( !correct );
 
 	lastc = c;
-
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 return lastc;
 }
@@ -97,9 +106,7 @@ return lastc;
 int BaseParser::read_ascii_char()
 {
 	lastc = file.get();
-
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 return lastc;
 }
@@ -112,8 +119,7 @@ int BaseParser::read_char_from_wchar_string()
 	else
 		lastc = *(pchar_unicode++);
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 return lastc;
 }
@@ -136,8 +142,7 @@ bool correct;
 	if( correct )
 		lastc = c;
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 return lastc;
 }
@@ -150,8 +155,7 @@ int BaseParser::read_char_from_ascii_string()
 	else
 		lastc = *(pchar_ascii++);
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 return lastc;
 }
@@ -169,8 +173,7 @@ int BaseParser::read_char_from_wtext_stream()
 		lastc = -1;
 	}
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 	return lastc;
 }
@@ -192,8 +195,7 @@ int BaseParser::read_char_from_utf8_text_stream()
 	if( correct )
 		lastc = c;
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 	return lastc;
 }
@@ -211,8 +213,7 @@ int BaseParser::read_char_from_ascii_text_stream()
 		lastc = -1;
 	}
 
-	if( lastc == '\n' )
-		++line;
+	check_new_line();
 
 	return lastc;
 }
diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h
index a8c648d..67721b1 100644
--- a/src/convert/baseparser.h
+++ b/src/convert/baseparser.h
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (c) 2021, Tomasz Sowa
+ * Copyright (c) 2021-2022, Tomasz Sowa
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -54,6 +54,7 @@ protected:
 
 	virtual void clear_input_flags();
 
+	virtual void check_new_line();
 	virtual int read_utf8_char();
 	virtual int read_ascii_char();
 	virtual int read_char_from_wchar_string();
@@ -72,6 +73,11 @@ protected:
 	*/
 	int line;
 
+	/*
+		the number of the column in which a syntax error occurred
+	*/
+	int column;
+
 
 	/*
 		true if parse() method was called
diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp
index 46f0aa4..ef041b3 100644
--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2012-2021, Tomasz Sowa
+ * Copyright (c) 2012-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,10 @@ SpaceParser::SpaceParser()
 	space_end		 = '}';
 	option_delimiter = ',';
 	input_as_utf8    = true;
+	object_items_limit          = 0;
+	table_items_limit          = 0;
+	all_items_limit          = 0;
+	nested_levels_limit          = 0;
 }
 
 
@@ -71,10 +75,73 @@ int SpaceParser::get_last_parsed_line()
 }
 
 
+int SpaceParser::get_last_parsed_column()
+{
+	return column; // plain accessor: hands back the column member unchanged
+}
+
+
+void SpaceParser::set_object_items_limit(size_t val)
+{
+	object_items_limit = val; // stored as given; the parser treats 0 as "no per-object limit"
+}
+
+
+void SpaceParser::set_table_items_limit(size_t val)
+{
+	table_items_limit = val; // stored as given; the parser treats 0 as "no per-table limit"
+}
+
+
+void SpaceParser::set_all_items_limit(size_t val)
+{
+	all_items_limit = val; // stored as given; the parser treats 0 as "no whole-tree limit"
+}
+
+
+void SpaceParser::set_nested_level_limit(size_t val)
+{
+	nested_levels_limit = val; // stored as given; the parser treats 0 as "no nesting limit"
+}
+
+
+size_t SpaceParser::get_object_items_limit()
+{
+	return object_items_limit; // limit of key/value pairs in one object; 0 means the check is skipped
+}
+
+
+size_t SpaceParser::get_table_items_limit()
+{
+	return table_items_limit; // limit of items in one table; 0 means the check is skipped
+}
+
+
+size_t SpaceParser::get_all_items_limit()
+{
+	return all_items_limit; // limit of items counted over the whole tree; 0 means the check is skipped
+}
+
+
+size_t SpaceParser::get_nested_level_limit()
+{
+	return nested_levels_limit; // limit of nested objects/tables; 0 means the check is skipped
+}
+
+
+
+void SpaceParser::prepare_to_parsing()
+{
+	clear_input_flags();
+	// every parse starts at depth zero with no items counted yet
+	current_nested_level = 0;
+	current_items_counter = 0;
+}
+
 
 SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	reading_from_file = true;
 	parsing_space = false;
@@ -126,7 +193,7 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,
 
 SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	reading_from_file = true;
 	parsing_space = true;
@@ -176,7 +243,7 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name
 
 SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	pchar_ascii               = str;
 	parsing_space             = false;
@@ -196,7 +263,7 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out
 
 SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	pchar_unicode             = str;
 	parsing_space             = false;
@@ -215,11 +282,48 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou
 
 
 
+SpaceParser::Status SpaceParser::parse_json(const pt::TextStream & str, Space & out_space, bool clear_space)
+{
+	prepare_to_parsing();
+	parsing_space = false; // this entry point parses JSON input
+	root_space    = &out_space;
+
+	// the members only store pointers to these locals, which stay alive for the parse_root_space() call below
+	pt::TextStream::const_iterator stream_pos = str.begin();
+	pt::TextStream::const_iterator stream_end = str.end();
+	text_stream_iterator     = &stream_pos;
+	text_stream_iterator_end = &stream_end;
+
+	parse_root_space(clear_space);
+	// NOTE(review): both iterator members still point at the locals above once we return - confirm they are reset before reuse
+	return status;
+}
+
+
+SpaceParser::Status SpaceParser::parse_json(const pt::WTextStream & str, Space & out_space, bool clear_space)
+{
+	prepare_to_parsing();
+	parsing_space = false; // this entry point parses JSON input
+	root_space    = &out_space;
+
+	// the members only store pointers to these locals, which stay alive for the parse_root_space() call below
+	pt::WTextStream::const_iterator stream_pos = str.begin();
+	pt::WTextStream::const_iterator stream_end = str.end();
+	wtext_stream_iterator     = &stream_pos;
+	wtext_stream_iterator_end = &stream_end;
+
+	parse_root_space(clear_space);
+	// NOTE(review): both iterator members still point at the locals above once we return - confirm they are reset before reuse
+	return status;
+}
+
+
+
 
 
 SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	pchar_ascii               = str;
 	parsing_space             = true;
@@ -239,7 +343,7 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou
 
 SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	clear_input_flags();
+	prepare_to_parsing();
 
 	pchar_unicode             = str;
 	parsing_space             = true;
@@ -257,6 +361,41 @@ SpaceParser::Status SpaceParser::parse_space(const std::wstring & str, Space & o
 }
 
 
+SpaceParser::Status SpaceParser::parse_space(const pt::TextStream & str, Space & out_space, bool clear_space)
+{
+	prepare_to_parsing();
+	parsing_space = true; // this entry point parses the Space format
+	root_space    = &out_space;
+
+	// the members only store pointers to these locals, which stay alive for the parse_root_space() call below
+	pt::TextStream::const_iterator stream_pos = str.begin();
+	pt::TextStream::const_iterator stream_end = str.end();
+	text_stream_iterator     = &stream_pos;
+	text_stream_iterator_end = &stream_end;
+
+	parse_root_space(clear_space);
+	// NOTE(review): both iterator members still point at the locals above once we return - confirm they are reset before reuse
+	return status;
+}
+
+
+SpaceParser::Status SpaceParser::parse_space(const pt::WTextStream & str, Space & out_space, bool clear_space)
+{
+	prepare_to_parsing();
+	parsing_space = true; // this entry point parses the Space format
+	root_space    = &out_space;
+
+	// the members only store pointers to these locals, which stay alive for the parse_root_space() call below
+	pt::WTextStream::const_iterator stream_pos = str.begin();
+	pt::WTextStream::const_iterator stream_end = str.end();
+	wtext_stream_iterator     = &stream_pos;
+	wtext_stream_iterator_end = &stream_end;
+
+	parse_root_space(clear_space);
+	// NOTE(review): both iterator members still point at the locals above once we return - confirm they are reset before reuse
+	return status;
+}
+
 
 
 
@@ -287,10 +426,13 @@ void SpaceParser::parse_root_space(bool clear_root_space)
 		parse(root_space, false, false);
 	}
 
-	skip_white();
+	if( status == ok )
+	{
+		skip_white();
 
-	if( lastc != -1 )
-		status = syntax_error;
+		if( lastc != -1 )
+			status = syntax_error;
+	}
 
 	token.clear();
 }
@@ -362,32 +504,45 @@ void SpaceParser::parse(Space * space, bool is_object_value, bool is_table_value
 
 void SpaceParser::parse_space(Space * space)
 {
-	/*
-	 * in Space format in global namespace the space start character is not required
-	 */
-	bool need_space_start_character = !parsing_space || space != root_space;
-
-	if( need_space_start_character )
+	if( nested_levels_limit == 0 || current_nested_level++ < nested_levels_limit ) // a limit of 0 short-circuits, so the depth counter is not incremented in that case
 	{
-		read_char(); // inserting a next character after the space_start char to lastc
-	}
+		/*
+		 * in Space format in global namespace the space start character is not required
+		 */
+		bool need_space_start_character = !parsing_space || space != root_space;
 
-	if( !space->is_object() )
-		space->set_empty_object();
-
-	parse_key_value_pairs(space);
-
-	if( need_space_start_character )
-	{
-		if( lastc == space_end )
+		if( need_space_start_character )
 		{
-			read_char();
+			read_char(); // inserting a next character after the space_start char to lastc
 		}
-		else
+
+		if( !space->is_object() )
+			space->set_empty_object();
+
+		parse_key_value_pairs(space);
+
+		if( status == ok ) // look for the closing character only while no error has been recorded
 		{
-			status = syntax_error;
+			if( need_space_start_character )
+			{
+				if( lastc == space_end )
+				{
+					read_char();
+				}
+				else
+				{
+					status = syntax_error;
+				}
+			}
 		}
 	}
+	else
+	{
+		status = limit_nested_level_exceeded; // the depth counter was already incremented by the failed check above
+	}
+
+	if( current_nested_level > 0 ) // undo the increment; the guard covers the limit == 0 path where nothing was incremented
+		current_nested_level -= 1;
 }
 
 
@@ -463,18 +618,31 @@ void SpaceParser::parse_floating_point_value(Space * space)
 
 void SpaceParser::parse_table(Space * space)
 {
-	read_char(); // inserting a next character after the table_start char to lastc
-	space->set_empty_table();
-	parse_values_list(space);
-
-	if( lastc == table_end )
+	if( nested_levels_limit == 0 || current_nested_level++ < nested_levels_limit ) // a limit of 0 short-circuits, so the depth counter is not incremented in that case
 	{
-		read_char();
+		read_char(); // inserting a next character after the table_start char to lastc
+		space->set_empty_table();
+		parse_values_list(space);
+
+		if( status == ok ) // look for the closing character only while no error has been recorded
+		{
+			if( lastc == table_end )
+			{
+				read_char();
+			}
+			else
+			{
+				status = syntax_error;
+			}
+		}
 	}
 	else
 	{
-		status = syntax_error;
+		status = limit_nested_level_exceeded; // the depth counter was already incremented by the failed check above
 	}
+
+	if( current_nested_level > 0 ) // undo the increment; the guard covers the limit == 0 path where nothing was incremented
+		current_nested_level -= 1;
 }
 
 
@@ -524,8 +692,23 @@ void SpaceParser::parse_key_value_pairs(Space * space)
 				{
 					read_char(); // inserting a next character after the separator to lastc
 
-					Space & new_space = space->add(token.c_str(), new Space());
-					parse(&new_space, true, false);
+					if( object_items_limit == 0 || !space->is_object() || (space->object_size() < object_items_limit) )
+					{
+						Space & new_space = space->add(token.c_str(), new Space());
+
+						if( all_items_limit == 0 || current_items_counter++ < all_items_limit )
+						{
+							parse(&new_space, true, false);
+						}
+						else
+						{
+							status = limit_all_items_exceeded;
+						}
+					}
+					else
+					{
+						status = limit_object_items_exceeded;
+					}
 				}
 				else
 				{
@@ -535,7 +718,11 @@ void SpaceParser::parse_key_value_pairs(Space * space)
 		}
 
 		is_first = false;
-		skip_white();
+
+		if( status == ok )
+		{
+			skip_white();
+		}
 	}
 }
 
@@ -576,12 +763,31 @@ void SpaceParser::parse_values_list(Space * space)
 
 		if( status == ok )
 		{
-			Space * new_space = &space->add(new Space());
-			parse(new_space, false, true);
+			if( table_items_limit == 0 || !space->is_table() || (space->table_size() < table_items_limit) )
+			{
+				Space * new_space = &space->add(new Space());
+
+				if( all_items_limit == 0 || current_items_counter++ < all_items_limit )
+				{
+					parse(new_space, false, true);
+				}
+				else
+				{
+					status = limit_all_items_exceeded;
+				}
+			}
+			else
+			{
+				status = limit_table_items_exceeded;
+			}
 		}
 
 		is_first = false;
-		skip_white();
+
+		if( status == ok )
+		{
+			skip_white();
+		}
 	}
 }
 
diff --git a/src/space/spaceparser.h b/src/space/spaceparser.h
index 6805b88..61aad5b 100644
--- a/src/space/spaceparser.h
+++ b/src/space/spaceparser.h
@@ -5,7 +5,7 @@
  */
 
 /* 
- * Copyright (c) 2012-2021, Tomasz Sowa
+ * Copyright (c) 2012-2022, Tomasz Sowa
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -63,8 +63,23 @@ public:
 
 	/*
 		status of parsing
+		ok - input stream has been parsed correctly
+		cant_open_file - the file cannot be opened (returned only when parsing a file)
+		syntax_error - a syntax error in the input stream
+		limit_object_items_exceeded - limit of object items has been exceeded
+		limit_table_items_exceeded - limit of table items has been exceeded
+		limit_all_items_exceeded - limit of items (key/value pairs of objects or table items) throughout the whole tree has been exceeded
+		limit_nested_level_exceeded - limit of nested objects/tables has been exceeded
 	*/
-	enum Status { ok, cant_open_file, syntax_error };
+	enum Status {
+		ok,
+		cant_open_file,
+		syntax_error,
+		limit_object_items_exceeded,
+		limit_table_items_exceeded,
+		limit_all_items_exceeded,
+		limit_nested_level_exceeded
+	};
 
 
 	/*
@@ -109,6 +124,8 @@ public:
 	Status parse_json(const wchar_t * str,      Space & out_space, bool clear_space = true);
 	Status parse_json(const std::wstring & str, Space & out_space, bool clear_space = true);
 
+	Status parse_json(const pt::TextStream & str, Space & out_space, bool clear_space = true);
+	Status parse_json(const pt::WTextStream & str, Space & out_space, bool clear_space = true);
 
 
 	Status parse_space(const char * str,         Space & out_space, bool clear_space = true);
@@ -116,6 +133,8 @@ public:
 	Status parse_space(const wchar_t * str,      Space & out_space, bool clear_space = true);
 	Status parse_space(const std::wstring & str, Space & out_space, bool clear_space = true);
 
+	Status parse_space(const pt::TextStream & str, Space & out_space, bool clear_space = true);
+	Status parse_space(const pt::WTextStream & str, Space & out_space, bool clear_space = true);
 
 
 	/*
@@ -144,6 +163,45 @@ public:
 	 *
 	 */
 	int get_last_parsed_line();
+	int get_last_parsed_column();
+
+
+	/*
+	 * get/set limit of object items in one object
+	 * default: 0 (disabled)
+	 */
+	void set_object_items_limit(size_t val);
+	size_t get_object_items_limit();
+
+
+	/*
+	 * get/set limit of items in one table
+	 * default: 0 (disabled)
+	 *
+	 */
+	void set_table_items_limit(size_t val);
+	size_t get_table_items_limit();
+
+
+	/*
+	 * get/set limit of all items (object items and table items) throughout the whole tree
+	 * default: 0 (disabled)
+	 *
+	 */
+	void set_all_items_limit(size_t val);
+	size_t get_all_items_limit();
+
+
+	/*
+	 * get/set nested level limit
+	 * limit of nested objects and tables
+	 * default: 0 (disabled)
+	 *
+	 */
+	void set_nested_level_limit(size_t val);
+	size_t get_nested_level_limit();
+
+
 
 
 private:
@@ -211,6 +269,23 @@ private:
 	bool parsing_space;
 
 
+	/*
+	 * object_items_limit - limit of key/value pairs of one object
+	 * table_items_limit - limit of items of one table
+	 * all_items_limit - limit of all items of all objects and all tables
+	 * nested_levels_limit - limit of nested objects/tables
+	 */
+	size_t object_items_limit;
+	size_t table_items_limit;
+	size_t all_items_limit;
+	size_t nested_levels_limit;
+
+	/*
+	 * current_items_counter - how many items (key/value pairs of objects or table items) throughout the whole tree
+	 * current_nested_level - current nested level of objects and tables
+	 */
+	size_t current_items_counter;
+	size_t current_nested_level;
 
 	void parse_root_space(bool clear_root_space);
 	void parse(Space * space, bool is_object_value, bool is_table_value);
@@ -252,6 +327,8 @@ private:
 	void read_unicode_floating_format();
 	void read_unicode_code_point();
 
+	void prepare_to_parsing();
+
 };
 
 

From 44bda888b5a24128eb42dab2495d279941b0845c Mon Sep 17 00:00:00 2001
From: Tomasz Sowa <t.sowa@ttmath.org>
Date: Wed, 1 Jun 2022 05:17:30 +0200
Subject: [PATCH 37/37] fix: do not unescape xml sequences in filter mode

---
 src/html/htmlparser.cpp | 7 +++++--
 src/html/htmlparser.h   | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index c5e37cf..f4b158e 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -63,6 +63,7 @@ void HTMLParser::clear_input_flags()
 	char_was_escaped = false;
 	escaped_chars_buffer.clear();
 	escaped_char_index = 0;
+	filter_mode      = false;
 }
 
 
@@ -173,6 +174,7 @@ void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_s
 
 	pchar_unicode = in;
 	out_string    = &out;
+	filter_mode   = true;
 
 	if( clear_out_string )
 		out_string->clear();
@@ -209,8 +211,8 @@ void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_str
 
 	wtext_stream_iterator = &begin;
 	wtext_stream_iterator_end = &end;
-
 	out_stream = &out;
+	filter_mode = true;
 
 	if( clear_out_stream )
 		out_stream->clear();
@@ -232,6 +234,7 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 
 	out_string = &out;
+	filter_mode = true;
 
 	if( clear_out_stream )
 		out_string->clear();
@@ -2383,7 +2386,7 @@ int HTMLParser::read_char()
 	{
 		read_char_no_escape();
 
-		if( lastc == '&' )
+		if( !filter_mode && lastc == '&' )
 		{
 			read_xml_entity();
 
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index fb63809..15dd8db 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -310,6 +310,12 @@ protected:
 	std::wstring escaped_chars_buffer;
 	size_t escaped_char_index;
 
+	/*
+	 * filter mode, a method filter(...) was called
+	 * in filter mode we do not unescape xml sequences such as &lt; &gt; ...
+	 */
+	bool filter_mode;
+
 
 	void clear_input_flags();