- added some converting methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h)

- BaseParser: added possibility to read from TextStream and WTextStream - HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method - added utf8_stream.h with one method: template<typename StreamIteratorType> size_t utf8_to_int( StreamIteratorType & iterator_in, StreamIteratorType & iterator_end, int & res, bool & correct)
2021-10-12 19:53:11 +02:00
parent 4902eb6037
commit 17d2c0fb25
13 changed files with 807 additions and 128 deletions
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -1,16 +1,24 @@
 # DO NOT DELETE
 ./convert/inttostr.o: ./convert/inttostr.h
-./convert/misc.o: ./convert/misc.h ./convert/text.h
+./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
 ./convert/misc.o: textstream/types.h utf8/utf8_stream.h
 ./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
 ./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
 ./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
 ./convert/misc.o: textstream/types.h ./convert/inttostr.h
 ./convert/text.o: ./convert/text.h ./convert/text_private.h
 ./convert/double.o: ./convert/double.h textstream/textstream.h
 ./convert/double.o: textstream/stream.h space/space.h textstream/types.h
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
-./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
+./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
-./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
+./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
-./convert/baseparser.o: utf8/utf8_private.h
+./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
 ./convert/baseparser.o: utf8/utf8_stream.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -28,29 +36,39 @@
 ./space/space.o: convert/patternreplacer.h textstream/textstream.h
 ./space/space.o: textstream/stream.h space/space.h date/date.h
 ./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
-./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
+./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
 ./space/space.o: ./convert/double.h
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
 ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
-./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: textstream/textstream.h textstream/stream.h
 ./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
 ./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
 ./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
-./csv/csvparser.o: convert/baseparser.h
+./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
 ./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
 ./csv/csvparser.o: textstream/types.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
-./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
-./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
-./html/htmlparser.o: utf8/utf8_private.h convert/text.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
 ./html/htmlparser.o: textstream/types.h convert/text.h
 ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
 ./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
 ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
--- a/src/convert/baseparser.cpp
+++ b/src/convert/baseparser.cpp
@@ -37,7 +37,7 @@
 #include "baseparser.h"
 #include "utf8/utf8.h"
-
+#include "utf8/utf8_stream.h"
 namespace pt
@@ -45,19 +45,27 @@ namespace pt
 BaseParser::BaseParser()
 {
-	clear();
+	clear_input_flags();
 }
-void BaseParser::clear()
+void BaseParser::clear_input_flags()
 {
 	line = 0;
 	reading_from_file = false;
 	pchar_ascii = nullptr;
 	pchar_unicode = nullptr;
-	reading_from_wchar_string = false;
+	wtext_stream_iterator = nullptr;
 	wtext_stream_iterator_end = nullptr;
 	text_stream_iterator = nullptr;
 	text_stream_iterator_end = nullptr;
 	lastc = -1;
 	input_as_utf8 = true;
 	if( file.is_open() )
 		file.close();
 	file.clear();
 }
@@ -132,7 +140,6 @@ bool correct;
 		++line;
 return lastc;
 }
@@ -150,6 +157,67 @@ return lastc;
 }
 /*
 	reads the next wide character from the WTextStream input

 	the current iterator is advanced by one position, the character is stored
 	in lastc and returned; -1 is stored/returned when the current iterator
 	is equal to the end iterator; the line counter is incremented on '\n'
 */
 int BaseParser::read_char_from_wtext_stream()
 {
 	// assume the end of the input until a character is actually read
 	lastc = -1;
 	auto & position = *wtext_stream_iterator;
 	if( position != (*wtext_stream_iterator_end) )
 	{
 		lastc = *position;
 		++position;
 	}
 	if( lastc == '\n' )
 		++line;
 	return lastc;
 }
 /*
 	reads the next UTF-8 encoded character from the TextStream input

 	utf8_to_int() is called repeatedly until it reports a correct character
 	or the current iterator reaches the end iterator (incorrect sequences
 	are skipped); the decoded character is stored in lastc and returned,
 	-1 is stored/returned if no correct character was read;
 	the line counter is incremented on '\n'
 */
 int BaseParser::read_char_from_utf8_text_stream()
 {
 	int decoded;
 	bool is_valid;
 	for(;;)
 	{
 		utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, decoded, is_valid);
 		// stop on the first correct character or when there is nothing more to read
 		if( is_valid || !((*text_stream_iterator) != (*text_stream_iterator_end)) )
 			break;
 	}
 	lastc = is_valid ? decoded : -1;
 	if( lastc == '\n' )
 		++line;
 	return lastc;
 }
 /*
 	reads the next character from the TextStream input without any UTF-8 decoding

 	the current iterator is advanced by one position, the character is stored
 	in lastc and returned; -1 is stored/returned when the current iterator
 	is equal to the end iterator; the line counter is incremented on '\n'
 */
 int BaseParser::read_char_from_ascii_text_stream()
 {
 	// assume the end of the input until a character is actually read
 	lastc = -1;
 	auto & position = *text_stream_iterator;
 	if( position != (*text_stream_iterator_end) )
 	{
 		lastc = *position;
 		++position;
 	}
 	if( lastc == '\n' )
 		++line;
 	return lastc;
 }
 int BaseParser::read_char_no_escape()
 {
 	if( reading_from_file )
@@ -161,17 +229,33 @@ int BaseParser::read_char_no_escape()
 	}
 	else
 	{
-		if( reading_from_wchar_string )
+		if( pchar_ascii )
 		{
 			return read_char_from_wchar_string();
 		}
 		else
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_string();
 			else
 				return read_char_from_ascii_string();
 		}
 		else if( pchar_unicode )
 		{
 			return read_char_from_wchar_string();
 		}
 		else if( wtext_stream_iterator && wtext_stream_iterator_end )
 		{
 			return read_char_from_wtext_stream();
 		}
 		else if( text_stream_iterator && text_stream_iterator_end )
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_text_stream();
 			else
 				return read_char_from_ascii_text_stream();
 		}
 		else
 		{
 			lastc = -1;
 			return lastc;
 		}
 	}
 }
--- a/src/convert/baseparser.h
+++ b/src/convert/baseparser.h
@@ -40,6 +40,7 @@
 #include <string>
 #include <fstream>
 #include "textstream/textstream.h"
 namespace pt
@@ -51,15 +52,18 @@ protected:
 	BaseParser();
-	void clear();
+	virtual void clear_input_flags();
-	int read_utf8_char();
+	virtual int read_utf8_char();
-	int read_ascii_char();
+	virtual int read_ascii_char();
-	int read_char_from_wchar_string();
+	virtual int read_char_from_wchar_string();
-	int read_char_from_utf8_string();
+	virtual int read_char_from_utf8_string();
-	int read_char_from_ascii_string();
+	virtual int read_char_from_ascii_string();
-	int read_char_no_escape();
+	virtual int read_char_from_wtext_stream();
-	int read_char();
+	virtual int read_char_from_utf8_text_stream();
 	virtual int read_char_from_ascii_text_stream();
 	virtual int read_char_no_escape();
 	virtual int read_char();
@@ -75,6 +79,7 @@ protected:
 	*/
 	bool reading_from_file;
 	/*
 		pointers to the current character
 		if ParseString() is in use
@@ -84,9 +89,20 @@ protected:
 	/*
-		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
+		pointers to WTextStream iterators
-	*/
+		if set then both of them should be set
-	bool reading_from_wchar_string;
+	 */
 	WTextStream::const_iterator * wtext_stream_iterator;
 	WTextStream::const_iterator * wtext_stream_iterator_end;
 	/*
 		pointers to TextStream iterators
 		if set then both of them should be set
 	 */
 	TextStream::const_iterator * text_stream_iterator;
 	TextStream::const_iterator * text_stream_iterator_end;
 	/*
 		last read char
@@ -112,7 +128,6 @@ protected:
 };
 }
--- a/src/convert/misc.cpp
+++ b/src/convert/misc.cpp
@@ -5,7 +5,7 @@
 */
 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,8 @@
 */
 #include "misc.h"
 #include "inttostr.h"
 #include "utf8/utf8.h"
 namespace pt
@@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val)
 }
 /*
 	writes one character to 'out' escaped for use inside a JSON string

 	- characters below 32 (control characters, including \r \n \t \b \f and zero)
 	  are written as \u followed by the hexadecimal value left-padded with zeroes to four digits
 	- backslash and double quote are preceded by a backslash
 	- any other character is written unchanged

 	the previous version also had switch cases for the short escapes (\r, \n, \t, \b, \f)
 	and for zero, but they were placed in the branch for characters >= 32 so they could
 	never be reached; they have been removed, the produced output is the same as before
 	(the unreachable case for zero would have produced \0 which is not a valid JSON escape)
 */
 void esc_to_json(char val, Stream & out)
 {
 	if( static_cast<unsigned char>(val) < 32 )
 	{
 		char buf[10];
 		size_t len;
 		Toa(static_cast<unsigned char>(val), buf, sizeof(buf)/sizeof(char), 16, &len);
 		out << "\\u";
 		// pad with leading zeroes up to four hexadecimal digits
 		for(size_t i = len ; i < 4 ; ++i)
 		{
 			out << '0';
 		}
 		out << buf;
 		return;
 	}
 	switch( val )
 	{
 	case '\\':	out << '\\';	out << '\\';		break;
 	case '"':	out << '\\';	out << '\"';		break;
 	default:
 		out << val;
 	}
 }
 /*
 	converts a wide character with int_to_utf8() and escapes each
 	resulting byte with esc_to_json(char, Stream &)
 */
 void esc_to_json(wchar_t val, Stream & out)
 {
 	char encoded[10];
 	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
 	const size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
 	for(const char * p = encoded ; p != encoded + encoded_len ; ++p)
 	{
 		esc_to_json(*p, out);
 	}
 }
 /*
 	escapes a null terminated string
 */
 void esc_to_json(const char * c, pt::Stream & out)
 {
 	for(const char * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_json(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' characters starting at 'c' (zero characters are not treated as a terminator)
 */
 void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
 {
 	const char * const stop = c + len;
 	for(const char * p = c ; p != stop ; ++p)
 	{
 		esc_to_json(*p, out);
 	}
 }
 /*
 	escapes a null terminated wide string
 */
 void esc_to_json(const wchar_t * c, pt::Stream & out)
 {
 	for(const wchar_t * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_json(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' wide characters starting at 'c'
 */
 void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
 {
 	const wchar_t * const stop = c + len;
 	for(const wchar_t * p = c ; p != stop ; ++p)
 	{
 		esc_to_json(*p, out);
 	}
 }
 /*
 	escapes the whole content of a std::string
 */
 void esc_to_json(const std::string & in, Stream & out)
 {
 	const char * text = in.c_str();
 	esc_to_json(text, in.size(), out);
 }
 /*
 	escapes the whole content of a std::wstring
 */
 void esc_to_json(const std::wstring & in, Stream & out)
 {
 	const wchar_t * text = in.c_str();
 	esc_to_json(text, in.size(), out);
 }
 /*
 	writes one character to 'out' escaped for XML:
 	< > & " are replaced with &lt; &gt; &amp; &quot;
 	any other character (the zero character too) is written unchanged
 */
 void esc_to_xml(char val, Stream & out)
 {
 	if( val == '<' )
 	{
 		out << "&lt;";
 	}
 	else
 	if( val == '>' )
 	{
 		out << "&gt;";
 	}
 	else
 	if( val == '&' )
 	{
 		out << "&amp;";
 	}
 	else
 	if( val == '"' )
 	{
 		out << "&quot;";
 	}
 	else
 	{
 		// what about zero (null) character?
 		out << val;
 	}
 }
 /*
 	converts a wide character with int_to_utf8() and escapes each
 	resulting byte with esc_to_xml(char, Stream &)
 */
 void esc_to_xml(wchar_t val, Stream & out)
 {
 	char encoded[10];
 	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
 	const size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
 	for(const char * p = encoded ; p != encoded + encoded_len ; ++p)
 	{
 		esc_to_xml(*p, out);
 	}
 }
 /*
 	escapes a null terminated string
 */
 void esc_to_xml(const char * c, pt::Stream & out)
 {
 	for(const char * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_xml(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' characters starting at 'c' (zero characters are not treated as a terminator)
 */
 void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
 {
 	const char * const stop = c + len;
 	for(const char * p = c ; p != stop ; ++p)
 	{
 		esc_to_xml(*p, out);
 	}
 }
 /*
 	escapes a null terminated wide string
 */
 void esc_to_xml(const wchar_t * c, pt::Stream & out)
 {
 	for(const wchar_t * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_xml(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' wide characters starting at 'c'
 */
 void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
 {
 	const wchar_t * const stop = c + len;
 	for(const wchar_t * p = c ; p != stop ; ++p)
 	{
 		esc_to_xml(*p, out);
 	}
 }
 /*
 	escapes the whole content of a std::string
 */
 void esc_to_xml(const std::string & in, Stream & out)
 {
 	const char * text = in.c_str();
 	esc_to_xml(text, in.size(), out);
 }
 /*
 	escapes the whole content of a std::wstring
 */
 void esc_to_xml(const std::wstring & in, Stream & out)
 {
 	const wchar_t * text = in.c_str();
 	esc_to_xml(text, in.size(), out);
 }
 /*
 	writes one character to 'out' escaped for a CSV field:
 	a double quote is written twice, any other character
 	(the zero character too) is written unchanged
 */
 void esc_to_csv(char c, pt::Stream & out)
 {
 	if( c == '"' )
 	{
 		out << "\"\"";
 	}
 	else
 	{
 		// what about zero (null) character?
 		out << c;
 	}
 }
 /*
 	converts a wide character with int_to_utf8() and escapes each
 	resulting byte with esc_to_csv(char, Stream &)
 */
 void esc_to_csv(wchar_t val, Stream & out)
 {
 	char encoded[10];
 	std::size_t encoded_capacity = sizeof(encoded) / sizeof(char);
 	const size_t encoded_len = int_to_utf8(static_cast<int>(val), encoded, encoded_capacity);
 	for(const char * p = encoded ; p != encoded + encoded_len ; ++p)
 	{
 		esc_to_csv(*p, out);
 	}
 }
 /*
 	escapes a null terminated string
 */
 void esc_to_csv(const char * c, pt::Stream & out)
 {
 	for(const char * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_csv(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' characters starting at 'c' (zero characters are not treated as a terminator)
 */
 void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
 {
 	const char * const stop = c + len;
 	for(const char * p = c ; p != stop ; ++p)
 	{
 		esc_to_csv(*p, out);
 	}
 }
 /*
 	escapes a null terminated wide string
 */
 void esc_to_csv(const wchar_t * c, pt::Stream & out)
 {
 	for(const wchar_t * p = c ; *p != 0 ; ++p)
 	{
 		esc_to_csv(*p, out);
 	}
 }
 /*
 	escapes exactly 'len' wide characters starting at 'c'
 */
 void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
 {
 	const wchar_t * const stop = c + len;
 	for(const wchar_t * p = c ; p != stop ; ++p)
 	{
 		esc_to_csv(*p, out);
 	}
 }
 /*
 	escapes the whole content of a std::string
 */
 void esc_to_csv(const std::string & in, Stream & out)
 {
 	const char * text = in.c_str();
 	esc_to_csv(text, in.size(), out);
 }
 /*
 	escapes the whole content of a std::wstring
 */
 void esc_to_csv(const std::wstring & in, Stream & out)
 {
 	const wchar_t * text = in.c_str();
 	esc_to_csv(text, in.size(), out);
 }
 }
--- a/src/convert/misc.h
+++ b/src/convert/misc.h
@@ -5,7 +5,7 @@
 */
 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,9 @@
 #include <limits>
 #include "text.h"
 #include "textstream/stream.h"
 #include "textstream/types.h"
 #include "utf8/utf8_stream.h"
 namespace pt
@@ -47,6 +50,138 @@ namespace pt
 void SetOverflow(bool * was_overflow, bool val);
 void esc_to_json(char val, Stream & out);
 void esc_to_json(wchar_t val, Stream & out);
 void esc_to_json(const char * c, pt::Stream & out);
 void esc_to_json(const char * c, std::size_t len, Stream & out);
 void esc_to_json(const wchar_t * c, Stream & out);
 void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_json(const std::string & in, Stream & out);
 void esc_to_json(const std::wstring & in, Stream & out);
 void esc_to_xml(char c, pt::Stream & out);
 void esc_to_xml(wchar_t c, pt::Stream & out);
 void esc_to_xml(const char * c, pt::Stream & out);
 void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
 void esc_to_xml(const wchar_t * c, pt::Stream & out);
 void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_xml(const std::string & in, Stream & out);
 void esc_to_xml(const std::wstring & in, Stream & out);
 /*
 	CSV escaping: a double quote is written twice, other characters are written unchanged;
 	the wide character versions convert the input to UTF-8 first

 	the (const char *, std::size_t, Stream &) overload used to be declared twice and
 	the std::wstring overload (defined in misc.cpp) had no declaration at all
 */
 void esc_to_csv(char c, pt::Stream & out);
 void esc_to_csv(wchar_t val, Stream & out);
 void esc_to_csv(const char * c, pt::Stream & out);
 void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
 void esc_to_csv(const wchar_t * c, pt::Stream & out);
 void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
 void esc_to_csv(const std::string & in, Stream & out);
 void esc_to_csv(const std::wstring & in, Stream & out);
 /*
 	escapes the whole content of the 'in' stream for JSON and writes it to 'out'

 	StreamType must provide const_iterator, begin(), end(), is_char_stream() and is_wchar_stream()

 	- wide input, char output: each wide character is converted with int_to_utf8()
 	  and the resulting bytes are escaped
 	- char input, wide output: the input is decoded with utf8_to_int(),
 	  incorrect sequences are skipped without any output
 	- otherwise each item is cast to wchar_t and escaped with esc_to_json(wchar_t, Stream &)

 	the iterator is advanced explicitly in each branch: utf8_to_int() takes the iterator
 	by reference and moves it past the bytes it has consumed, so an additional ++i in the
 	loop header (as in the previous version) skipped input and could step past 'end'
 */
 template<typename StreamType>
 void esc_to_json(const StreamType & in, Stream & out)
 {
 	char utf8_buf[10];
 	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
 	int res;
 	bool correct;
 	while( i != end )
 	{
 		if( in.is_wchar_stream() && out.is_char_stream() )
 		{
 			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
 			esc_to_json(utf8_buf, len, out);
 			++i;
 		}
 		else
 		if( in.is_char_stream() && out.is_wchar_stream() )
 		{
 			// NOTE(review): relies on utf8_to_int() always consuming at least one item when i != end - confirm in utf8_stream.h
 			utf8_to_int(i, end, res, correct);
 			if( correct )
 				esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
 			// put replacement char if not correct?
 		}
 		else
 		{
 			esc_to_json(static_cast<wchar_t>(*i), out);
 			++i;
 		}
 	}
 }
 /*
 	escapes the whole content of the 'in' stream for XML and writes it to 'out'

 	StreamType must provide const_iterator, begin(), end(), is_char_stream() and is_wchar_stream()

 	- wide input, char output: each wide character is converted with int_to_utf8()
 	  and the resulting bytes are escaped
 	- char input, wide output: the input is decoded with utf8_to_int(),
 	  incorrect sequences are skipped without any output
 	- otherwise each item is cast to wchar_t and escaped with esc_to_xml(wchar_t, Stream &)

 	the iterator is advanced explicitly in each branch: utf8_to_int() takes the iterator
 	by reference and moves it past the bytes it has consumed, so an additional ++i in the
 	loop header (as in the previous version) skipped input and could step past 'end'
 */
 template<typename StreamType>
 void esc_to_xml(const StreamType & in, Stream & out)
 {
 	char utf8_buf[10];
 	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
 	int res;
 	bool correct;
 	while( i != end )
 	{
 		if( in.is_wchar_stream() && out.is_char_stream() )
 		{
 			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
 			esc_to_xml(utf8_buf, len, out);
 			++i;
 		}
 		else
 		if( in.is_char_stream() && out.is_wchar_stream() )
 		{
 			// NOTE(review): relies on utf8_to_int() always consuming at least one item when i != end - confirm in utf8_stream.h
 			utf8_to_int(i, end, res, correct);
 			if( correct )
 				esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
 			// put replacement char if not correct?
 		}
 		else
 		{
 			esc_to_xml(static_cast<wchar_t>(*i), out);
 			++i;
 		}
 	}
 }
 /*
 	escapes the whole content of the 'in' stream for CSV and writes it to 'out'

 	StreamType must provide const_iterator, begin(), end(), is_char_stream() and is_wchar_stream()

 	- wide input, char output: each wide character is converted with int_to_utf8()
 	  and the resulting bytes are escaped
 	- char input, wide output: the input is decoded with utf8_to_int(),
 	  incorrect sequences are skipped without any output
 	- otherwise each item is cast to wchar_t and escaped with esc_to_csv(wchar_t, Stream &)

 	the iterator is advanced explicitly in each branch: utf8_to_int() takes the iterator
 	by reference and moves it past the bytes it has consumed, so an additional ++i in the
 	loop header (as in the previous version) skipped input and could step past 'end'
 */
 template<typename StreamType>
 void esc_to_csv(const StreamType & in, Stream & out)
 {
 	char utf8_buf[10];
 	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
 	typename StreamType::const_iterator i = in.begin();
 	typename StreamType::const_iterator end = in.end();
 	int res;
 	bool correct;
 	while( i != end )
 	{
 		if( in.is_wchar_stream() && out.is_char_stream() )
 		{
 			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
 			esc_to_csv(utf8_buf, len, out);
 			++i;
 		}
 		else
 		if( in.is_char_stream() && out.is_wchar_stream() )
 		{
 			// NOTE(review): relies on utf8_to_int() always consuming at least one item when i != end - confirm in utf8_stream.h
 			utf8_to_int(i, end, res, correct);
 			if( correct )
 				esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
 			// put replacement char if not correct?
 		}
 		else
 		{
 			esc_to_csv(static_cast<wchar_t>(*i), out);
 			++i;
 		}
 	}
 }
 }
--- a/src/csv/csvparser.cpp
+++ b/src/csv/csvparser.cpp
@@ -53,6 +53,8 @@ CSVParser::CSVParser()
 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
 {
 	clear_input_flags();
 	reading_from_file = true;
 	space = &out_space;
@@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &
 CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = false;
+
-	pchar_ascii               = str;
+	pchar_ascii = str;
-	pchar_unicode             = 0;
+	space       = &out_space;
 	space                     = &out_space;
 	parse();
@@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
 CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = true;
+
-	pchar_unicode             = str;
+	pchar_unicode = str;
-	pchar_ascii               = 0;
+	space         = &out_space;
 	space                     = &out_space;
 	parse();
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;
 /*
 	resets the parser state before a new parse/filter call:
 	first the input state kept by BaseParser, then the HTMLParser
 	specific options, counters and output pointers
 */
 void HTMLParser::clear_input_flags()
 {
 	BaseParser::clear_input_flags();

 	// no output target is selected
 	out_string = nullptr;
 	out_stream = nullptr;
 	out_space  = nullptr;

 	// default parsing options
 	parsing_html     = true;
 	xml_compact_mode = true;

 	// status and counters
 	status    = ok;
 	line      = 1;
 	stack_len = 0;
 	line_len  = 0;
 }
 void HTMLParser::Item::Clear()
 {
 	name.clear();
@@ -71,21 +89,11 @@ HTMLParser::Item::Item()
 void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
 {
-	parsing_html              = true;
+	clear_input_flags();
 	reading_from_file         = false;
 	reading_from_wchar_string = true;
 	pchar_unicode             = in;
 	pchar_ascii               = 0;
 	xml_compact_mode          = compact_mode;
-	status = ok;
+	pchar_unicode    = in;
-	line = 1;
+	xml_compact_mode = compact_mode;
-
+	out_space = &space;
 	stack_len     = 0;
 	out_string    = nullptr;
 	out_space     = &space;
 	//last_new_line = false;
 	line_len      = 0;
 	out_space->clear();
 	Init();
@@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
 {
 	clear_input_flags();
 	parsing_html = false;
 	reading_from_file = true;
-	xml_compact_mode          = compact_mode;
+	xml_compact_mode = compact_mode;
 	status = ok;
 	line = 1;
 	stack_len     = 0;
 	out_string    = nullptr;
 	line_len      = 0;
 	this->out_space = &out_space;
 	if( clear_space )
@@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp
-void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
+void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
 {
-	parsing_html              = true;
+	clear_input_flags();
 	reading_from_file         = false;
 	reading_from_wchar_string = true;
 	pchar_unicode             = in;
 	pchar_ascii               = 0;
-	stack_len     = 0;
+	pchar_unicode = in;
 	out_string    = &out;
-	out_space     = nullptr;
+
-	//last_new_line = false;
+	if( clear_out_string )
-	line_len      = 0;
+		out_string->clear();
 	out_string->clear();
 	Init();
 	Read();
@@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 }
-void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
+void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
 {
 	if( &in == &out )
 	{
@@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 	if( out.capacity() < out_projected_len )
 		out.reserve(out_projected_len);
-	Filter(in.c_str(), out);
+	filter(in.c_str(), out, clear_out_string);
 }
-
+void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
 HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
 {
-	parsing_html      = true;
+	clear_input_flags();
 	WTextStream::const_iterator begin = in.begin();
 	WTextStream::const_iterator end = in.end();
 	wtext_stream_iterator = &begin;
 	wtext_stream_iterator_end = &end;
 	out_stream = &out;
 	if( clear_out_stream )
 		out_stream->clear();
 	Init();
 	Read();
 	Uninit();
 }
 HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
 {
 	clear_input_flags();
 	reading_from_file = true;
 	// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
 	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
-	status        = ok;
+	out_string = &out;
-	line          = 1;
+
-	stack_len     = 0;
+	if( clear_out_stream )
-	out_string    = &out;
+		out_string->clear();
 	out_space     = nullptr;
 	line_len      = 0;
 	out_string->clear();
 	if( file )
 	{
@@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
 }
-HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
 {
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }
-HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
 {
 	std::string file_name_utf8;
 	pt::wide_to_utf8(file_name, file_name_utf8);
-	return filter_file(file_name_utf8, out);
+	return filter_file(file_name_utf8, out, clear_out_stream);
 }
-HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
 {
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }
@@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
 	if( out_string )
 		(*out_string) += c;
 	if( out_stream )
 		(*out_stream) << c;
 	CheckChar(c);
 }
@@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 	if( out_string )
 		out_string->append(str, len);
 	if( out_stream )
 		out_stream->write(str, len);
 	for( ; str < end ; ++str)
 		CheckChar(*str);
 }
@@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
 		if( out_string )
 			out_string->append(str);
 		if( out_stream )
 			out_stream->write(str.c_str(), str.size());
 		for(size_t i=0 ; i < str.size() ; ++i)
 			CheckChar(str[i]);
 	}
@@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
 	{
 		if( out_string )
 			(*out_string) += ' '; // we do not add them to 'line_len'
 		if( out_stream )
 			(*out_stream) << ' ';
 	}
 }
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -44,6 +44,7 @@
 #include <algorithm>
 #include "convert/baseparser.h"
 #include "space/space.h"
 #include "textstream/stream.h"
 namespace pt
@@ -130,14 +131,15 @@ public:
 	// main methods used for filtering
-	void Filter(const wchar_t * in, std::wstring & out);
+	void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
-	void Filter(const std::wstring & in, std::wstring & out);
+	void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
 	void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
-	HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
-	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
-	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
-	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
 	/*
@@ -278,7 +280,7 @@ protected:
-
+	void clear_input_flags();
 	/*
@@ -403,6 +405,7 @@ protected:
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
 	Stream * out_stream;
 	Space * out_space;
 	Space text_space_tmp;
--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line()
 SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
 {
 	clear_input_flags();
 	reading_from_file = true;
 	parsing_space = false;
 	root_space = &out_space;
 	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 	if( file )
@@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,
 SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
 {
 	clear_input_flags();
 	reading_from_file = true;
 	parsing_space = true;
 	root_space = &out_space;
 	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 	if( file )
@@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name
 SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = false;
+
 	pchar_ascii               = str;
 	pchar_unicode             = 0;
 	parsing_space             = false;
 	root_space                = &out_space;
@@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out
 SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = true;
+
 	pchar_unicode             = str;
 	pchar_ascii               = 0;
 	parsing_space             = false;
 	root_space                = &out_space;
@@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou
 SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = false;
+
 	pchar_ascii               = str;
 	pchar_unicode             = 0;
 	parsing_space             = true;
 	root_space                = &out_space;
@@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou
 SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
+	clear_input_flags();
-	reading_from_wchar_string = true;
+
 	pchar_unicode             = str;
 	pchar_ascii               = 0;
 	parsing_space             = true;
 	root_space                = &out_space;
--- a/src/utf8/utf8.h
+++ b/src/utf8/utf8.h
@@ -45,6 +45,12 @@
 namespace pt
 {
 /*
 * public methods are also defined in utf8_stream.h
 *
 */
 /*!
 	UTF-8, a transformation format of ISO 10646
 	http://tools.ietf.org/html/rfc3629
@@ -213,9 +219,7 @@ template<typename StreamType>
 bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);
 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used
 } // namespace
--- a/src/utf8/utf8_stream.h
+++ b/src/utf8/utf8_stream.h
@@ -0,0 +1,104 @@
 /*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */
 /*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef headerfile_picotools_utf8_utf8_stream
 #define headerfile_picotools_utf8_utf8_stream
 #include "textstream/textstream.h"
 namespace pt
 {
 /*!
 	this function converts one UTF-8 character into one wide-character
 	input:
 		iterator_in - a TextStream iterator for reading from,
 		              on return it has been advanced by exactly as many octets
 		              as the function reports in its return value
 		iterator_end - an end iterator (can be returned by end() method from TextStream)
 	output:
 		res - an output character
 		correct - true if it is a correct character
 		the function returns how many characters have been used from the input stream
 	an octet rejected by utf8_to_int_add_next_octet() is only peeked, it is left
 	in the stream so the caller can decode it as the beginning of the next character
 	(the first octet is always consumed so repeated calls always make progress)
 */
 template<typename StreamIteratorType>
 size_t utf8_to_int(
 		StreamIteratorType & iterator_in,
 		StreamIteratorType & iterator_end,
 		int & res,
 		bool & correct)
 {
 size_t i, len;
 unsigned char uz;
 	res = 0;
 	correct = false;
 	// nothing to read, nothing has been consumed
 	if( iterator_in == iterator_end )
 		return 0;
 	uz = *iterator_in;
 	++iterator_in;
 	// the first octet was rejected: only this one octet has been consumed
 	if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
 		return 1;
 	for(i=1 ; i<len ; ++i)
 	{
 		// the input ended in the middle of a sequence: i octets have been consumed so far
 		if( iterator_in == iterator_end )
 			return i;
 		// peek at the next octet, the iterator is advanced only when the octet is accepted,
 		// thanks to that the returned value always matches the iterator position
 		uz = *iterator_in;
 		if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
 			return i;
 		++iterator_in;
 	}
 	// all len octets have been read, correct is set only if utf8_check_range() accepts the value
 	if( utf8_check_range(res, len) )
 		correct = true;
 return len;
 }
 }
 #endif
--- a/src/utf8/utf8_templates.h
+++ b/src/utf8/utf8_templates.h
@@ -47,6 +47,7 @@ namespace pt
 {
 template<typename StreamType>
 void int_to_wide(int c, StreamType & res)
 {
@@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res)
 /*!
 	converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
 	(need to be tested)
@@ -376,8 +378,11 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i
 // not tested
 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
 {
 	if( clear )
 		utf8.clear();
 	private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
 		utf8.write(utf8_buffer, buffer_len);
 	});
@@ -385,8 +390,6 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
 } // namespace pt
 #endif
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -9,12 +9,15 @@
 ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
-./convert.o: ../src/convert/misc.h ../src/convert/double.h
+./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
 ./convert.o: ../src/convert/double.h
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
-./csvparser.o: ../src/convert/baseparser.h test.h
+./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
 ./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
@@ -30,4 +33,5 @@
 ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
-./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
+./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
 ./mainoptionsparser.o: ../src/convert/double.h