- added some converting methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h)

- BaseParser: added possibility to read from TextStream and WTextStream - HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method - added utf8_stream.h with one method: template<typename StreamIteratorType> size_t utf8_to_int( StreamIteratorType & iterator_in, StreamIteratorType & iterator_end, int & res, bool & correct)
2021-10-12 19:53:11 +02:00
parent 4902eb6037
commit 17d2c0fb25
13 changed files with 807 additions and 128 deletions
@@ -1,16 +1,24 @@
 # DO NOT DELETE

 ./convert/inttostr.o: ./convert/inttostr.h
-./convert/misc.o: ./convert/misc.h ./convert/text.h
+./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
+./convert/misc.o: textstream/types.h utf8/utf8_stream.h
+./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
+./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
+./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./convert/misc.o: textstream/types.h ./convert/inttostr.h
 ./convert/text.o: ./convert/text.h ./convert/text_private.h
 ./convert/double.o: ./convert/double.h textstream/textstream.h
 ./convert/double.o: textstream/stream.h space/space.h textstream/types.h
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
-./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
-./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
-./convert/baseparser.o: utf8/utf8_private.h
+./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
+./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
+./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
+./convert/baseparser.o: utf8/utf8_stream.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -28,29 +36,39 @@
 ./space/space.o: convert/patternreplacer.h textstream/textstream.h
 ./space/space.o: textstream/stream.h space/space.h date/date.h
 ./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
-./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
+./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h
+./space/space.o: ./convert/double.h
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
 ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
-./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: textstream/textstream.h textstream/stream.h
+./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
+./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h
+./space/spaceparser.o: ./convert/misc.h utf8/utf8_stream.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
-./csv/csvparser.o: convert/baseparser.h
+./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
+./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
+./csv/csvparser.o: textstream/types.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
-./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h
-./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
-./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h
-./html/htmlparser.o: utf8/utf8_private.h convert/text.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./html/htmlparser.o: textstream/types.h convert/text.h
 ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
-./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
+./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
 ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
@@ -37,7 +37,7 @@

 #include "baseparser.h"
 #include "utf8/utf8.h"
-
+#include "utf8/utf8_stream.h"


 namespace pt
@@ -45,19 +45,27 @@ namespace pt

 BaseParser::BaseParser()
 {
+	// clear_input_flags() is virtual, but a call made from a constructor always
+	// runs the BaseParser version, an override from a derived class is not used here
-	clear();
+	clear_input_flags();
 }


-void BaseParser::clear()
+void BaseParser::clear_input_flags()
 {
+	/*
+		puts the parser back into the "no input selected" state:
+		no file, no raw string pointers, no stream iterators
+	 */
 	line = 0;
 	reading_from_file = false;
 	pchar_ascii = nullptr;
 	pchar_unicode = nullptr;
-	reading_from_wchar_string = false;
+	wtext_stream_iterator = nullptr;
+	wtext_stream_iterator_end = nullptr;
+	text_stream_iterator = nullptr;
+	text_stream_iterator_end = nullptr;
+	// -1 is the value the read_char_from_*_stream() methods store when the input is exhausted
 	lastc = -1;
 	input_as_utf8 = true;
+
+	// close the file if it is still open
+	if( file.is_open() )
+		file.close();
+
+	// reset the error state flags of the file stream
+	file.clear();
 }


@@ -132,7 +140,6 @@ bool correct;
 		++line;

 return lastc;
-
 }


@@ -150,6 +157,67 @@ return lastc;
 }


+/*
+	reads the next character from the WTextStream iterator pair,
+	stores it in 'lastc' (or -1 if the iterator reached the end) and returns it,
+	'line' is incremented when a new line character was read
+ */
+int BaseParser::read_char_from_wtext_stream()
+{
+	WTextStream::const_iterator & current = *wtext_stream_iterator;
+
+	lastc = -1;
+
+	if( current != (*wtext_stream_iterator_end) )
+	{
+		lastc = *current;
+		++current;
+
+		if( lastc == '\n' )
+			++line;
+	}
+
+	return lastc;
+}
+
+
+/*
+	decodes the next UTF-8 sequence from the TextStream iterator pair,
+	incorrect sequences are skipped until a correct one is found or the end is reached
+
+	the decoded value (or -1 if nothing correct was decoded) is stored in 'lastc' and returned,
+	'line' is incremented when a new line character was read
+ */
+int BaseParser::read_char_from_utf8_text_stream()
+{
+	int code_point;
+	bool decoded;
+	bool keep_reading = true;
+
+	while( keep_reading )
+	{
+		utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, code_point, decoded);
+		keep_reading = !decoded && (*text_stream_iterator) != (*text_stream_iterator_end);
+	}
+
+	lastc = decoded ? code_point : -1;
+
+	if( lastc == '\n' )
+		++line;
+
+	return lastc;
+}
+
+
+/*
+	reads the next character from the TextStream iterator pair without any decoding,
+	stores it in 'lastc' (or -1 if the iterator reached the end) and returns it,
+	'line' is incremented when a new line character was read
+
+	NOTE(review): if the iterator yields a signed char then bytes above 127 become
+	negative here and the byte 0xFF gives -1, the same value as the end marker - to be confirmed
+ */
+int BaseParser::read_char_from_ascii_text_stream()
+{
+	TextStream::const_iterator & current = *text_stream_iterator;
+
+	lastc = -1;
+
+	if( current != (*text_stream_iterator_end) )
+	{
+		lastc = *current;
+		++current;
+
+		if( lastc == '\n' )
+			++line;
+	}
+
+	return lastc;
+}
+
+
 int BaseParser::read_char_no_escape()
 {
 	if( reading_from_file )
@@ -161,17 +229,33 @@ int BaseParser::read_char_no_escape()
 	}
 	else
 	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
+		if( pchar_ascii )
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_string();
 			else
 				return read_char_from_ascii_string();
 		}
+		else if( pchar_unicode )
+		{
+			return read_char_from_wchar_string();
+		}
+		else if( wtext_stream_iterator && wtext_stream_iterator_end )
+		{
+			return read_char_from_wtext_stream();
+		}
+		else if( text_stream_iterator && text_stream_iterator_end )
+		{
+			if( input_as_utf8 )
+				return read_char_from_utf8_text_stream();
+			else
+				return read_char_from_ascii_text_stream();
+		}
+		else
+		{
+			lastc = -1;
+			return lastc;
+		}
 	}
 }

@@ -40,6 +40,7 @@

 #include <string>
 #include <fstream>
+#include "textstream/textstream.h"


 namespace pt
@@ -51,15 +52,18 @@ protected:

 	BaseParser();

-	void clear();
+	virtual void clear_input_flags();

-	int read_utf8_char();
-	int read_ascii_char();
-	int read_char_from_wchar_string();
-	int read_char_from_utf8_string();
-	int read_char_from_ascii_string();
-	int read_char_no_escape();
-	int read_char();
+	virtual int read_utf8_char();
+	virtual int read_ascii_char();
+	virtual int read_char_from_wchar_string();
+	virtual int read_char_from_utf8_string();
+	virtual int read_char_from_ascii_string();
+	virtual int read_char_from_wtext_stream();
+	virtual int read_char_from_utf8_text_stream();
+	virtual int read_char_from_ascii_text_stream();
+	virtual int read_char_no_escape();
+	virtual int read_char();



@@ -75,6 +79,7 @@ protected:
 	*/
 	bool reading_from_file;

+
 	/*
 		pointers to the current character
 		if ParseString() is in used
@@ -84,9 +89,20 @@ protected:


 	/*
-		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
+		pointers to WTextStream iterators
+		if set then both of them should be set
+	 */
+	WTextStream::const_iterator * wtext_stream_iterator;
+	WTextStream::const_iterator * wtext_stream_iterator_end;
+
+
+	/*
+		pointers to TextStream iterators
+		if set then both of them should be set
+	 */
+	TextStream::const_iterator * text_stream_iterator;
+	TextStream::const_iterator * text_stream_iterator_end;
+

 	/*
 		last read char
@@ -112,7 +128,6 @@ protected:



-
 };

 }
@@ -5,7 +5,7 @@
 */

 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,8 @@
 */

 #include "misc.h"
+#include "inttostr.h"
+#include "utf8/utf8.h"


 namespace pt
@@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val)
 }


+/*
+	writes one byte to 'out' escaped for use inside a JSON string
+
+	carriage return, new line, tab, backspace, form feed, the backslash and the
+	double quote get their two-character escapes,
+	any other control character (below 32, the zero character included) is written
+	in the six-character form: backslash, 'u' and four hexadecimal digits,
+	every other byte is copied unchanged
+
+	the short escapes have to be tested first: all of them except the backslash
+	and the double quote are below 32
+ */
+void esc_to_json(char val, Stream & out)
+{
+	switch( val )
+	{
+	case '\r':	out << '\\';	out << 'r';		return;
+	case '\n':	out << '\\';	out << 'n';		return;
+	case '\t':	out << '\\';	out << 't';		return;
+	case 0x08:	out << '\\';	out << 'b';		return;
+	case 0x0c:	out << '\\';	out << 'f';		return;
+	case '\\':	out << '\\';	out << '\\';		return;
+	case '"':	out << '\\';	out << '\"';		return;
+	default:
+		break;
+	}
+
+	if( static_cast<unsigned char>(val) < 32 )
+	{
+		char buf[10];
+		size_t len;
+		Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len);
+
+		out << "\\u";
+
+		// pad the hexadecimal value with zeroes up to four digits
+		for(size_t i = len ; i < 4 ; ++i)
+		{
+			out << '0';
+		}
+
+		out << buf;
+	}
+	else
+	{
+		out << val;
+	}
+}
+
+
+/*
+	converts one wide character to UTF-8 and writes every resulting byte
+	through esc_to_json(char, Stream &)
+ */
+void esc_to_json(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_capacity = sizeof(bytes) / sizeof(char);
+	size_t bytes_used = int_to_utf8(static_cast<int>(val), bytes, bytes_capacity);
+	size_t pos = 0;
+
+	while( pos < bytes_used )
+	{
+		esc_to_json(bytes[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	zero terminated string: every character goes through esc_to_json(char, Stream &)
+ */
+void esc_to_json(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_json(*c, out);
+}
+
+
+/*
+	exactly 'len' characters are escaped, zero characters included
+ */
+void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_json(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	zero terminated wide string: every character goes through esc_to_json(wchar_t, Stream &)
+ */
+void esc_to_json(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_json(*c, out);
+}
+
+
+/*
+	exactly 'len' wide characters are escaped, zero characters included
+ */
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_json(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	the whole string (in.size() characters) is escaped
+ */
+void esc_to_json(const std::string & in, Stream & out)
+{
+	const char * first = in.c_str();
+	esc_to_json(first, in.size(), out);
+}
+
+
+/*
+	the whole wide string (in.size() characters) is escaped
+ */
+void esc_to_json(const std::wstring & in, Stream & out)
+{
+	const wchar_t * first = in.c_str();
+	esc_to_json(first, in.size(), out);
+}
+
+
+
+
+
+
+/*
+	writes one byte to 'out' escaped for XML:
+	the characters < > & and the double quote are replaced with their entities,
+	every other byte is copied unchanged
+
+	what about zero (null) character?
+ */
+void esc_to_xml(char val, Stream & out)
+{
+	const char * entity = nullptr;
+
+	if( val == '<' )
+		entity = "&lt;";
+	else
+	if( val == '>' )
+		entity = "&gt;";
+	else
+	if( val == '&' )
+		entity = "&amp;";
+	else
+	if( val == '"' )
+		entity = "&quot;";
+
+	if( entity )
+		out << entity;
+	else
+		out << val;
+}
+
+/*
+	converts one wide character to UTF-8 and writes every resulting byte
+	through esc_to_xml(char, Stream &)
+ */
+void esc_to_xml(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_capacity = sizeof(bytes) / sizeof(char);
+	size_t bytes_used = int_to_utf8(static_cast<int>(val), bytes, bytes_capacity);
+	size_t pos = 0;
+
+	while( pos < bytes_used )
+	{
+		esc_to_xml(bytes[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	zero terminated string: every character goes through esc_to_xml(char, Stream &)
+ */
+void esc_to_xml(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_xml(*c, out);
+}
+
+
+/*
+	exactly 'len' characters are escaped, zero characters included
+ */
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_xml(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	zero terminated wide string: every character goes through esc_to_xml(wchar_t, Stream &)
+ */
+void esc_to_xml(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_xml(*c, out);
+}
+
+
+/*
+	exactly 'len' wide characters are escaped, zero characters included
+ */
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_xml(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	the whole string (in.size() characters) is escaped
+ */
+void esc_to_xml(const std::string & in, Stream & out)
+{
+	const char * first = in.c_str();
+	esc_to_xml(first, in.size(), out);
+}
+
+
+/*
+	the whole wide string (in.size() characters) is escaped
+ */
+void esc_to_xml(const std::wstring & in, Stream & out)
+{
+	const wchar_t * first = in.c_str();
+	esc_to_xml(first, in.size(), out);
+}
+
+
+
+
+
+/*
+	writes one byte to 'out' escaped for CSV:
+	a double quote is written twice, every other byte is copied unchanged
+
+	what about zero (null) character?
+ */
+void esc_to_csv(char c, pt::Stream & out)
+{
+	if( c == '"' )
+		out << "\"\"";
+	else
+		out << c;
+}
+
+
+/*
+	converts one wide character to UTF-8 and writes every resulting byte
+	through esc_to_csv(char, Stream &)
+ */
+void esc_to_csv(wchar_t val, Stream & out)
+{
+	char bytes[10];
+	std::size_t bytes_capacity = sizeof(bytes) / sizeof(char);
+	size_t bytes_used = int_to_utf8(static_cast<int>(val), bytes, bytes_capacity);
+	size_t pos = 0;
+
+	while( pos < bytes_used )
+	{
+		esc_to_csv(bytes[pos], out);
+		++pos;
+	}
+}
+
+
+
+/*
+	zero terminated string: every character goes through esc_to_csv(char, Stream &)
+ */
+void esc_to_csv(const char * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_csv(*c, out);
+}
+
+
+/*
+	exactly 'len' characters are escaped, zero characters included
+ */
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_csv(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	zero terminated wide string: every character goes through esc_to_csv(wchar_t, Stream &)
+ */
+void esc_to_csv(const wchar_t * c, pt::Stream & out)
+{
+	for( ; *c != 0 ; ++c)
+		esc_to_csv(*c, out);
+}
+
+
+/*
+	exactly 'len' wide characters are escaped, zero characters included
+ */
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
+{
+	size_t pos = 0;
+
+	while( pos < len )
+	{
+		esc_to_csv(c[pos], out);
+		++pos;
+	}
+}
+
+
+/*
+	the whole string (in.size() characters) is escaped
+ */
+void esc_to_csv(const std::string & in, Stream & out)
+{
+	const char * first = in.c_str();
+	esc_to_csv(first, in.size(), out);
+}
+
+
+/*
+	the whole wide string (in.size() characters) is escaped
+ */
+void esc_to_csv(const std::wstring & in, Stream & out)
+{
+	const wchar_t * first = in.c_str();
+	esc_to_csv(first, in.size(), out);
+}
+
+


 }
@@ -5,7 +5,7 @@
 */

 /*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,9 @@

 #include <limits>
 #include "text.h"
+#include "textstream/stream.h"
+#include "textstream/types.h"
+#include "utf8/utf8_stream.h"


 namespace pt
@@ -47,6 +50,138 @@ namespace pt

 void SetOverflow(bool * was_overflow, bool val);

+/*
+	esc_to_json(...) write the input to 'out' escaped for use inside a JSON string
+
+	wide characters are converted to UTF-8 first and then escaped byte by byte,
+	the overloads with 'len' escape exactly 'len' characters,
+	the overloads without it stop at the terminating zero
+ */
+void esc_to_json(char val, Stream & out);
+void esc_to_json(wchar_t val, Stream & out);
+void esc_to_json(const char * c, pt::Stream & out);
+void esc_to_json(const char * c, std::size_t len, Stream & out);
+void esc_to_json(const wchar_t * c, Stream & out);
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_json(const std::string & in, Stream & out);
+void esc_to_json(const std::wstring & in, Stream & out);
+
+/*
+	esc_to_xml(...) write the input to 'out' with the characters < > & and
+	the double quote replaced with XML entities
+
+	wide characters are converted to UTF-8 first and then escaped byte by byte,
+	the overloads with 'len' escape exactly 'len' characters,
+	the overloads without it stop at the terminating zero
+ */
+void esc_to_xml(char c, pt::Stream & out);
+void esc_to_xml(wchar_t c, pt::Stream & out);
+void esc_to_xml(const char * c, pt::Stream & out);
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_xml(const std::string & in, Stream & out);
+void esc_to_xml(const std::wstring & in, Stream & out);
+
+/*
+	esc_to_csv(...) write the input to 'out' with every double quote doubled
+
+	wide characters are converted to UTF-8 first and then escaped byte by byte,
+	the overloads with 'len' escape exactly 'len' characters,
+	the overloads without it stop at the terminating zero
+
+	the std::wstring overload has to be declared here: without the declaration
+	a call with a std::wstring would select the stream template below
+ */
+void esc_to_csv(char c, pt::Stream & out);
+void esc_to_csv(wchar_t val, Stream & out);
+void esc_to_csv(const char * c, pt::Stream & out);
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_csv(const std::string & in, Stream & out);
+void esc_to_csv(const std::wstring & in, Stream & out);
+
+
+
+/*
+	writes the content of the stream 'in' to 'out' escaped for use inside a JSON string
+
+	- wide input, char output: every character is converted to UTF-8 and then escaped
+	- char input, wide output: the input is decoded as UTF-8, incorrect sequences are dropped
+	- in the other cases the characters are passed one by one to esc_to_json(wchar_t, Stream &)
+
+	the iterator is advanced inside each branch: utf8_to_int() moves it past
+	the bytes it has read, so incrementing it once more in the loop header
+	would lose the next character and could step beyond 'end'
+ */
+template<typename StreamType>
+void esc_to_json(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_json(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			// 'i' is advanced by utf8_to_int()
+			utf8_to_int(i, end, res, correct);
+
+			if( correct )
+				esc_to_json(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_json(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}
+
+
+/*
+	writes the content of the stream 'in' to 'out' escaped for XML
+
+	- wide input, char output: every character is converted to UTF-8 and then escaped
+	- char input, wide output: the input is decoded as UTF-8, incorrect sequences are dropped
+	- in the other cases the characters are passed one by one to esc_to_xml(wchar_t, Stream &)
+
+	the iterator is advanced inside each branch: utf8_to_int() moves it past
+	the bytes it has read, so incrementing it once more in the loop header
+	would lose the next character and could step beyond 'end'
+ */
+template<typename StreamType>
+void esc_to_xml(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_xml(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			// 'i' is advanced by utf8_to_int()
+			utf8_to_int(i, end, res, correct);
+
+			if( correct )
+				esc_to_xml(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_xml(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}
+
+
+/*
+	writes the content of the stream 'in' to 'out' escaped for CSV
+
+	- wide input, char output: every character is converted to UTF-8 and then escaped
+	- char input, wide output: the input is decoded as UTF-8, incorrect sequences are dropped
+	- in the other cases the characters are passed one by one to esc_to_csv(wchar_t, Stream &)
+
+	the iterator is advanced inside each branch: utf8_to_int() moves it past
+	the bytes it has read, so incrementing it once more in the loop header
+	would lose the next character and could step beyond 'end'
+ */
+template<typename StreamType>
+void esc_to_csv(const StreamType & in, Stream & out)
+{
+	char utf8_buf[10];
+	std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char);
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+	int res;
+	bool correct;
+
+	while( i != end )
+	{
+		if( in.is_wchar_stream() && out.is_char_stream() )
+		{
+			std::size_t len = int_to_utf8(static_cast<int>(*i), utf8_buf, utf8_buf_len);
+			esc_to_csv(utf8_buf, len, out);
+			++i;
+		}
+		else
+		if( in.is_char_stream() && out.is_wchar_stream() )
+		{
+			// 'i' is advanced by utf8_to_int()
+			utf8_to_int(i, end, res, correct);
+
+			if( correct )
+				esc_to_csv(static_cast<wchar_t>(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2)
+
+			// put replacement char if not correct?
+		}
+		else
+		{
+			esc_to_csv(static_cast<wchar_t>(*i), out);
+			++i;
+		}
+	}
+}

 }

@@ -53,6 +53,8 @@ CSVParser::CSVParser()

 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	space = &out_space;

@@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &

 CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
-	pchar_ascii               = str;
-	pchar_unicode             = 0;
-	space                     = &out_space;
+	clear_input_flags();
+
+	pchar_ascii = str;
+	space       = &out_space;

 	parse();

@@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)

 CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = str;
-	pchar_ascii               = 0;
-	space                     = &out_space;
+	clear_input_flags();
+
+	pchar_unicode = str;
+	space         = &out_space;

 	parse();

@@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE;



+/*
+	resets the input state of the base class and then the state of the html parser,
+	the parse and filter methods call it first and afterwards set only what they need
+ */
+void HTMLParser::clear_input_flags()
+{
+	// the base class has to go first: it sets 'line' to zero
+	BaseParser::clear_input_flags();
+
+	// default mode
+	status = ok;
+	parsing_html = true;
+	xml_compact_mode = true;
+
+	// counters, here the lines are numbered from one
+	line = 1;
+	line_len = 0;
+	stack_len = 0;
+
+	// no output selected
+	out_space = nullptr;
+	out_stream = nullptr;
+	out_string = nullptr;
+}
+
+
+
+
 void HTMLParser::Item::Clear()
 {
 	name.clear();
@@ -71,21 +89,11 @@ HTMLParser::Item::Item()

 void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
 {
-	parsing_html              = true;
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = in;
-	pchar_ascii               = 0;
-	xml_compact_mode          = compact_mode;
+	clear_input_flags();

-	status = ok;
-	line = 1;
-
-	stack_len     = 0;
-	out_string    = nullptr;
-	out_space     = &space;
-	//last_new_line = false;
-	line_len      = 0;
+	pchar_unicode    = in;
+	xml_compact_mode = compact_mode;
+	out_space = &space;
 	out_space->clear();

 	Init();
@@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode

 HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
 {
+	clear_input_flags();
+
 	parsing_html = false;
 	reading_from_file = true;
-	xml_compact_mode          = compact_mode;
-
-	status = ok;
-	line = 1;
-	stack_len     = 0;
-	out_string    = nullptr;
-	line_len      = 0;
-
+	xml_compact_mode = compact_mode;
 	this->out_space = &out_space;

 	if( clear_space )
@@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp



-void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
+void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
 {
-	parsing_html              = true;
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
-	pchar_unicode             = in;
-	pchar_ascii               = 0;
+	clear_input_flags();

-	stack_len     = 0;
+	pchar_unicode = in;
 	out_string    = &out;
-	out_space     = nullptr;
-	//last_new_line = false;
-	line_len      = 0;
-	out_string->clear();
+
+	if( clear_out_string )
+		out_string->clear();

 	Init();
 	Read();
@@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 }


-void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
+void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
 {
 	if( &in == &out )
 	{
@@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 	if( out.capacity() < out_projected_len )
 		out.reserve(out_projected_len);

-	Filter(in.c_str(), out);
+	filter(in.c_str(), out, clear_out_string);
 }


-
-HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out)
+void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
 {
-	parsing_html      = true;
+	/*
+		filters the content of the stream 'in' and writes the result to 'out',
+		'out' is cleared first if clear_out_stream is true
+	 */
+	clear_input_flags();
+
+	WTextStream::const_iterator begin = in.begin();
+	WTextStream::const_iterator end = in.end();
+
+	// the parser reads through pointers to these two local iterators
+	wtext_stream_iterator = &begin;
+	wtext_stream_iterator_end = &end;
+
+	out_stream = &out;
+
+	if( clear_out_stream )
+		out_stream->clear();
+
+	Init();
+	Read();
+	Uninit();
+
+	// 'begin' and 'end' are destroyed when this method returns,
+	// do not leave pointers to them in the object
+	wtext_stream_iterator = nullptr;
+	wtext_stream_iterator_end = nullptr;
+}
+
+
+HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
+{
+	clear_input_flags();
+
 	reading_from_file = true;

 	// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
 	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);

-	status        = ok;
-	line          = 1;
-	stack_len     = 0;
-	out_string    = &out;
-	out_space     = nullptr;
-	line_len      = 0;
-	out_string->clear();
+	out_string = &out;
+
+	if( clear_out_stream )
+		out_string->clear();

 	if( file )
 	{
@@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring
 }


-HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
 {
+	// forwards to the const char * overload
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }


-HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
 {
+	// the wide file name is converted to UTF-8 and passed to the std::string overload
 	std::string file_name_utf8;
 	pt::wide_to_utf8(file_name, file_name_utf8);

-	return filter_file(file_name_utf8, out);
+	return filter_file(file_name_utf8, out, clear_out_stream);
 }


-HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out)
+HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
 {
+	// forwards to the const wchar_t * overload
-	return filter_file(file_name.c_str(), out);
+	return filter_file(file_name.c_str(), out, clear_out_stream);
 }


@@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c)
 	if( out_string )
 		(*out_string) += c;

+	if( out_stream )
+		(*out_stream) << c;
+
 	CheckChar(c);
 }

@@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 	if( out_string )
 		out_string->append(str, len);

+	if( out_stream )
+		out_stream->write(str, len);
+
 	for( ; str < end ; ++str)
 		CheckChar(*str);
 }
@@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str)
 		if( out_string )
 			out_string->append(str);

+		if( out_stream )
+			out_stream->write(str.c_str(), str.size());
+
 		for(size_t i=0 ; i < str.size() ; ++i)
 			CheckChar(str[i]);
 	}
@@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len)
 	{
 		if( out_string )
 			(*out_string) += ' '; // we do not add them to 'line_len'
+
+		if( out_stream )
+			(*out_stream) << ' ';
 	}
 }

@@ -44,6 +44,7 @@
 #include <algorithm>
 #include "convert/baseparser.h"
 #include "space/space.h"
+#include "textstream/stream.h"


 namespace pt
@@ -130,14 +131,15 @@ public:


 	// main methods used for filtering
-	void Filter(const wchar_t * in, std::wstring & out);
-	void Filter(const std::wstring & in, std::wstring & out);
+	void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
+	void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);

+	void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);

-	HTMLParser::Status filter_file(const char * file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out);
-	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out);
+	HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
+	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);


 	/*
@@ -278,7 +280,7 @@ protected:



-
+	void clear_input_flags();


 	/*
@@ -403,6 +405,7 @@ protected:
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
+	Stream * out_stream;
 	Space * out_space;
 	Space text_space_tmp;

@@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line()

 SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	parsing_space = false;
 	root_space = &out_space;

-	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);
 	
 	if( file )
@@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name,

 SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
 {
+	clear_input_flags();
+
 	reading_from_file = true;
 	parsing_space = true;
 	root_space = &out_space;

-	file.clear();
 	file.open(file_name, std::ios_base::binary | std::ios_base::in);

 	if( file )
@@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name

 SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
+	clear_input_flags();
+
 	pchar_ascii               = str;
-	pchar_unicode             = 0;
 	parsing_space             = false;
 	root_space                = &out_space;

@@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out

 SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
+	clear_input_flags();
+
 	pchar_unicode             = str;
-	pchar_ascii               = 0;
 	parsing_space             = false;
 	root_space                = &out_space;

@@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou

 SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = false;
+	clear_input_flags();
+
 	pchar_ascii               = str;
-	pchar_unicode             = 0;
 	parsing_space             = true;
 	root_space                = &out_space;

@@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou

 SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
 {
-	reading_from_file         = false;
-	reading_from_wchar_string = true;
+	clear_input_flags();
+
 	pchar_unicode             = str;
-	pchar_ascii               = 0;
 	parsing_space             = true;
 	root_space                = &out_space;

@@ -45,6 +45,12 @@
 namespace pt
 {

+/*
+ * public methods are also defined in utf8_stream.h
+ *
+ */
+
+
 /*!
 	UTF-8, a transformation format of ISO 10646
 	http://tools.ietf.org/html/rfc3629
@@ -213,9 +219,7 @@ template<typename StreamType>
 bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1);

 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used
-
-
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used


 } // namespace
@@ -0,0 +1,104 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_utf8_utf8_stream
+#define headerfile_picotools_utf8_utf8_stream
+
+#include "textstream/textstream.h"
+
+namespace pt
+{
+
+
+/*!
+	this function reads one UTF-8 encoded character from a stream iterator and decodes it into res
+
+	input:
+		iterator_in - a stream iterator to read octets from (e.g. a TextStream iterator), it is advanced past every octet read
+		iterator_end - an end iterator (can be returned by end() method from TextStream)
+
+	output:
+		res - an output character (reset to zero before decoding starts)
+		correct - true only if the whole sequence has been read and utf8_check_range(res, len) accepted it
+
+		the function returns how many octets have been used from the input stream (zero if the input was already at the end)
+*/
+template<typename StreamIteratorType>
+size_t utf8_to_int(
+		StreamIteratorType & iterator_in,
+		StreamIteratorType & iterator_end,
+		int & res,
+		bool & correct)
+{
+size_t i, len;
+unsigned char uz;
+
+	res = 0;
+	correct = false;
+
+	if( iterator_in == iterator_end )
+		return 0; // nothing to read: res stays zero and correct stays false
+
+	uz = *iterator_in;
+	++iterator_in;
+
+	if( !private_namespace::utf8_to_int_first_octet(uz, len, res) )
+		return 1; // the first octet was rejected but it has already been consumed
+
+	for(i=1 ; i<len ; ++i) // len comes from utf8_to_int_first_octet(), presumably the sequence length in octets
+	{
+		if( iterator_in == iterator_end )
+			return i; // the input ended in the middle of the sequence, i octets have been consumed so far
+
+		uz = *iterator_in;
+		++iterator_in;
+
+		if( !private_namespace::utf8_to_int_add_next_octet(uz, res) )
+			return i; // NOTE(review): the rejected octet was already consumed above, so i+1 octets were read but i is reported -- confirm this is intended
+	}
+
+	if( utf8_check_range(res, len) )
+		correct = true;
+
+return len;
+}
+
+
+
+}
+
+#endif
@@ -47,6 +47,7 @@ namespace pt
 {


+
 template<typename StreamType>
 void int_to_wide(int c, StreamType & res)
 {
@@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res)



+
 /*!
 	converting UTF-8 string to a TextStreamBase<wchar_t,...> stream
 	(need to be tested)
@@ -376,8 +378,11 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i

 // not tested
 template<typename StreamTypeIn, typename StreamTypeOut>
-void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)
+void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode)
 {
+	if( clear )
+		utf8.clear();
+
 	private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){
 		utf8.write(utf8_buffer, buffer_len);
 	});
@@ -385,8 +390,6 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode)



-
-
 } // namespace pt

 #endif
@@ -9,12 +9,15 @@
 ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
-./convert.o: ../src/convert/misc.h ../src/convert/double.h
+./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./convert.o: ../src/convert/double.h
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
 ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
-./csvparser.o: ../src/convert/baseparser.h test.h
+./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h
+./csvparser.o: ../src/textstream/stream.h ../src/date/date.h
+./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
@@ -30,4 +33,5 @@
 ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
-./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
+./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h
+./mainoptionsparser.o: ../src/convert/double.h