diff --git a/src/Makefile.dep b/src/Makefile.dep
index 683e3cf..bbf1f99 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -1,44 +1,76 @@
# DO NOT DELETE
./convert/inttostr.o: ./convert/inttostr.h
-./convert/misc.o: ./convert/misc.h ./convert/text.h
+./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h
+./convert/misc.o: textstream/types.h utf8/utf8_stream.h
+./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h
+./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h
+./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./convert/misc.o: textstream/types.h ./convert/inttostr.h
./convert/text.o: ./convert/text.h ./convert/text_private.h
./convert/double.o: ./convert/double.h textstream/textstream.h
./convert/double.o: textstream/stream.h space/space.h textstream/types.h
./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
./convert/double.o: membuffer/membuffer.h textstream/types.h
+./convert/double.o: utf8/utf8_stream.h
+./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h
+./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h
+./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./convert/baseparser.o: membuffer/membuffer.h textstream/types.h
+./convert/baseparser.o: utf8/utf8_stream.h
./date/date.o: ./date/date.h convert/inttostr.h
./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
-./log/filelog.o: textstream/types.h
+./log/filelog.o: textstream/types.h utf8/utf8_stream.h
./log/log.o: ./log/log.h textstream/textstream.h textstream/stream.h
./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h
./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h
./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h
-./log/log.o: ./log/filelog.h
+./log/log.o: utf8/utf8_stream.h ./log/filelog.h
./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h
./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h
./space/space.o: convert/patternreplacer.h textstream/textstream.h
./space/space.o: textstream/stream.h space/space.h date/date.h
-./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h
-./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h
+./space/space.o: membuffer/membuffer.h textstream/types.h utf8/utf8_stream.h
+./space/space.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
+./space/space.o: ./convert/double.h
./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
-./space/spaceparser.o: utf8/utf8_private.h convert/strtoint.h
-./space/spaceparser.o: ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
+./space/spaceparser.o: textstream/textstream.h textstream/stream.h
+./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h
+./space/spaceparser.o: textstream/types.h utf8/utf8_stream.h
+./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
./utf8/utf8.o: utf8/utf8_private.h
./utf8/utf8_private.o: utf8/utf8_private.h
./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./csv/csvparser.o: convert/baseparser.h textstream/textstream.h
+./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h
+./csv/csvparser.o: textstream/types.h utf8/utf8_stream.h
./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h
+./html/htmlparser.o: textstream/textstream.h textstream/stream.h
+./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h
+./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
+./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h
+./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h
+./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h
+./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h
+./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h
+./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
+./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
+./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h
+./html/bbcodeparser.o: utf8/utf8_stream.h
diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp
new file mode 100644
index 0000000..d4abca1
--- /dev/null
+++ b/src/convert/baseparser.cpp
@@ -0,0 +1,273 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2021-2022, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "baseparser.h"
+#include "utf8/utf8.h"
+#include "utf8/utf8_stream.h"
+
+
+namespace pt
+{
+// Starts with no input attached; the call is qualified because an override cannot be reached from a base constructor.
+BaseParser::BaseParser()
+{
+    BaseParser::clear_input_flags();
+}
+
+
+// Detach every input source and put the position and decoding state back to its initial values.
+void BaseParser::clear_input_flags()
+{
+    line = column = 0;
+    lastc = -1;
+    input_as_utf8 = true;
+
+    // string and stream sources: a null pointer means that no such source is attached
+    pchar_ascii = nullptr;
+    pchar_unicode = nullptr;
+    text_stream_iterator = text_stream_iterator_end = nullptr;
+    wtext_stream_iterator = wtext_stream_iterator_end = nullptr;
+
+    // file source: close an open file, then reset the stream state flags
+    reading_from_file = false;
+    if( file.is_open() )
+        file.close();
+    file.clear();
+}
+
+
+// Line bookkeeping: a line feed in lastc starts a new line and rewinds the column.
+void BaseParser::check_new_line()
+{
+    if( lastc != '\n' )
+        return;
+    line += 1;
+    column = 0;
+}
+
+
+// Decode the next code point from the file with utf8_to_int, repeating until a result is flagged correct.
+// Returns the code point (also kept in lastc), or -1 as soon as the stream is in a failed state.
+int BaseParser::read_utf8_char()
+{
+    int code_point;
+    bool is_valid = false;
+
+    lastc = -1;
+
+    while( !is_valid )
+    {
+        utf8_to_int(file, code_point, is_valid);
+
+        if( !file )
+            return lastc;    // still -1 here
+    }
+
+    lastc = code_point;
+    check_new_line();
+    return lastc;
+}
+
+// Fetch one raw byte from the file; lastc receives exactly what std::ifstream::get() returned.
+int BaseParser::read_ascii_char()
+{
+    const int fetched = file.get();
+    lastc = fetched;
+    check_new_line();
+    return lastc;
+}
+
+
+// Take the next wide character from the zero-terminated string at pchar_unicode.
+// Returns it (also kept in lastc), or -1 at the terminator.
+int BaseParser::read_char_from_wchar_string()
+{
+    if( *pchar_unicode != 0 )
+        lastc = *pchar_unicode++;
+    else
+        lastc = -1;    // the pointer stays on the terminator
+    check_new_line();
+    return lastc;
+}
+
+
+// Decode the next code point from the zero-terminated UTF-8 string at pchar_ascii.
+// The pointer advances by the length utf8_to_int reports; decoding is repeated while the result
+// is not flagged correct and the terminator has not been reached. Without a correct result -1 is returned.
+int BaseParser::read_char_from_utf8_string()
+{
+    int code_point;
+    bool is_valid;
+
+    for(;;)
+    {
+        pchar_ascii += utf8_to_int(pchar_ascii, code_point, is_valid);
+
+        if( is_valid || *pchar_ascii == 0 )
+            break;
+    }
+
+    lastc = is_valid ? code_point : -1;
+    check_new_line();
+
+    return lastc;
+}
+
+
+// Take the next byte from the zero-terminated string at pchar_ascii.
+// Returns the byte as a value in 0..255 (also kept in lastc), or -1 at the terminator.
+int BaseParser::read_char_from_ascii_string()
+{
+    if( *pchar_ascii == 0 )
+        lastc = -1;
+    else    // through unsigned char: a plain (signed) char 0xff would otherwise read as the -1 end marker
+        lastc = static_cast<unsigned char>(*pchar_ascii++);
+    check_new_line();
+    return lastc;
+}
+
+
+// Take the next character from the WTextStream range described by the two iterator pointers.
+// Returns it (also kept in lastc), or -1 when the current iterator is not different from the end iterator.
+int BaseParser::read_char_from_wtext_stream()
+{
+    WTextStream::const_iterator & current = *wtext_stream_iterator;
+
+    if( current != (*wtext_stream_iterator_end) )
+    {
+        lastc = *current;
+        ++current;
+    }
+    else
+        lastc = -1;
+    check_new_line();
+    return lastc;
+}
+
+
+// Decode the next code point from the UTF-8 TextStream range described by the two iterator pointers.
+// utf8_to_int is called on the dereferenced iterators until it flags a correct result or the
+// current iterator is no longer different from the end; without a correct result -1 is returned.
+int BaseParser::read_char_from_utf8_text_stream()
+{
+    int code_point;
+    bool is_valid;
+
+    for(;;)
+    {
+        utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, code_point, is_valid);
+
+        if( is_valid || !((*text_stream_iterator) != (*text_stream_iterator_end)) )
+            break;
+    }
+
+    lastc = is_valid ? code_point : -1;
+    check_new_line();
+    return lastc;
+}
+
+
+// Take the next byte from the TextStream range described by the two iterator pointers.
+// Returns the byte as a value in 0..255 (also kept in lastc), or -1 when the range is exhausted.
+int BaseParser::read_char_from_ascii_text_stream()
+{
+    if( (*text_stream_iterator) != (*text_stream_iterator_end) )
+    {
+        // through unsigned char: a plain (signed) char 0xff would otherwise read as the -1 end marker
+        lastc = static_cast<unsigned char>(*(*text_stream_iterator));
+        ++(*text_stream_iterator);
+    }
+    else
+        lastc = -1;
+
+    check_new_line();
+    return lastc;
+}
+
+
+// Read one character from whichever input source is currently attached.
+// The sources are checked in a fixed order and the first one that is set wins;
+// input_as_utf8 chooses between the UTF-8 and the raw byte reader for narrow input.
+// With no source attached lastc is set to -1 and returned.
+int BaseParser::read_char_no_escape()
+{
+    // 1. the file
+    if( reading_from_file )
+    {
+        return input_as_utf8 ? read_utf8_char() : read_ascii_char();
+    }
+
+    // 2. a zero-terminated narrow string
+    if( pchar_ascii )
+    {
+        return input_as_utf8 ? read_char_from_utf8_string() : read_char_from_ascii_string();
+    }
+
+    // 3. a zero-terminated wide string
+    if( pchar_unicode )
+    {
+        return read_char_from_wchar_string();
+    }
+
+    // 4. a WTextStream iterator pair
+    if( wtext_stream_iterator && wtext_stream_iterator_end )
+    {
+        return read_char_from_wtext_stream();
+    }
+
+    // 5. a TextStream iterator pair
+    if( text_stream_iterator && text_stream_iterator_end )
+    {
+        return input_as_utf8 ? read_char_from_utf8_text_stream() : read_char_from_ascii_text_stream();
+    }
+
+    // nothing to read from
+    lastc = -1;
+    return lastc;
+}
+
+// Character reader used by default: forwards to read_char_no_escape() unchanged.
+int BaseParser::read_char()
+{
+    return this->read_char_no_escape();
+}
+
+
+
+
+}
+
diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h
new file mode 100644
index 0000000..67721b1
--- /dev/null
+++ b/src/convert/baseparser.h
@@ -0,0 +1,141 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2021-2022, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_convert_baseparser
+#define headerfile_picotools_convert_baseparser
+
+#include <fstream>
+#include <string>
+#include "textstream/textstream.h"
+
+
+namespace pt
+{
+
+class BaseParser
+{
+public:
+
+    // polymorphic base: destroying a derived parser through a BaseParser pointer has to be well defined
+    virtual ~BaseParser() = default;
+
+protected:
+
+    // only derived parsers can be created; the object starts with no input attached
+    BaseParser();
+
+    // detach every input source and reset line, column and lastc
+    virtual void clear_input_flags();
+
+    // update line and column after a character has been stored in lastc
+    virtual void check_new_line();
+
+    // readers of a single character, one per input source and encoding; each stores its result in lastc and returns it
+    virtual int read_utf8_char();
+    virtual int read_ascii_char();
+    virtual int read_char_from_wchar_string();
+    virtual int read_char_from_utf8_string();
+    virtual int read_char_from_ascii_string();
+    virtual int read_char_from_wtext_stream();
+    virtual int read_char_from_utf8_text_stream();
+    virtual int read_char_from_ascii_text_stream();
+
+    // read_char_no_escape() calls the reader matching the attached source, read_char() forwards to it
+    virtual int read_char_no_escape();
+    virtual int read_char();
+
+    /*
+        number of the current line, meant for reporting where a syntax error is
+    */
+    int line;
+
+    /*
+        number of the current column, meant for reporting where a syntax error is
+    */
+    int column;
+
+    /*
+        true if the input is read from the file member,
+        false if it is read from a string or from text stream iterators
+    */
+    bool reading_from_file;
+
+    /*
+        pointers to the current character,
+        used when a zero-terminated string is being parsed
+    */
+    const char * pchar_ascii;
+    const wchar_t * pchar_unicode;
+
+    /*
+        pointers to WTextStream iterators (current position and end),
+        if set then both of them should be set
+    */
+    WTextStream::const_iterator * wtext_stream_iterator;
+    WTextStream::const_iterator * wtext_stream_iterator_end;
+
+    /*
+        pointers to TextStream iterators (current position and end),
+        if set then both of them should be set
+    */
+    TextStream::const_iterator * text_stream_iterator;
+    TextStream::const_iterator * text_stream_iterator_end;
+
+    /*
+        last read char
+        or -1 if the end of the input has been reached
+    */
+    int lastc;
+
+    /*
+        current file
+
+        maybe it would be better to make it a pointer?
+        if we parse only a string then there is no sense to have such an object
+    */
+    std::ifstream file;
+
+    /*
+        narrow input (file, char string, TextStream) is treated as UTF-8
+        default: true
+    */
+    bool input_as_utf8;
+};
+
+}
+
+#endif
diff --git a/src/convert/inttostr.cpp b/src/convert/inttostr.cpp
index b9a8d6d..89d9272 100644
--- a/src/convert/inttostr.cpp
+++ b/src/convert/inttostr.cpp
@@ -5,7 +5,7 @@
*/
/*
- * Copyright (c) 2021, Tomasz Sowa
+ * Copyright (c) 2021-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,114 +41,114 @@
namespace pt
{
-std::string to_str(unsigned long long value, int base)
+std::string to_str(unsigned long long value, int base, size_t min_width)
{
std::string res;
- Toa(value, res, false, base);
+ Toa(value, res, false, base, min_width);
return res;
}
-std::string to_str(long long value, int base)
+std::string to_str(long long value, int base, size_t min_width)
{
std::string res;
- Toa(value, res, false, base);
+ Toa(value, res, false, base, min_width);
return res;
}
-std::string to_str(unsigned long value, int base)
+std::string to_str(unsigned long value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::string to_str(long value, int base)
+std::string to_str(long value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::string to_str(unsigned int value, int base)
+std::string to_str(unsigned int value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::string to_str(int value, int base)
+std::string to_str(int value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::string to_str(unsigned short value, int base)
+std::string to_str(unsigned short value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::string to_str(short value, int base)
+std::string to_str(short value, int base, size_t min_width)
{
- return to_str(static_cast(value), base);
+ return to_str(static_cast(value), base, min_width);
}
-std::wstring to_wstr(unsigned long long value, int base)
+std::wstring to_wstr(unsigned long long value, int base, size_t min_width)
{
std::wstring res;
- Toa(value, res, false, base);
+ Toa(value, res, false, base, min_width);
return res;
}
-std::wstring to_wstr(long long value, int base)
+std::wstring to_wstr(long long value, int base, size_t min_width)
{
std::wstring res;
- Toa(value, res, false, base);
+ Toa(value, res, false, base, min_width);
return res;
}
-std::wstring to_wstr(unsigned long value, int base)
+std::wstring to_wstr(unsigned long value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
-std::wstring to_wstr(long value, int base)
+std::wstring to_wstr(long value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
-std::wstring to_wstr(unsigned int value, int base)
+std::wstring to_wstr(unsigned int value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
-std::wstring to_wstr(int value, int base)
+std::wstring to_wstr(int value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
-std::wstring to_wstr(unsigned short value, int base)
+std::wstring to_wstr(unsigned short value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
-std::wstring to_wstr(short value, int base)
+std::wstring to_wstr(short value, int base, size_t min_width)
{
- return to_wstr(static_cast(value), base);
+ return to_wstr(static_cast(value), base, min_width);
}
diff --git a/src/convert/inttostr.h b/src/convert/inttostr.h
index f134dc8..bac0f1d 100644
--- a/src/convert/inttostr.h
+++ b/src/convert/inttostr.h
@@ -5,7 +5,7 @@
*/
/*
- * Copyright (c) 2012-2021, Tomasz Sowa
+ * Copyright (c) 2012-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -52,8 +52,9 @@ namespace pt
// if the buffer is too small it will be terminated at the beginning (empty string)
// and the function returns false
+// min_width - if greater than zero then it is used for zero padding
template
-bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0)
{
size_t i1, i2;
long rest;
@@ -77,6 +78,14 @@ long rest;
}
while(value != 0 && i2 < buf_len);
+ if( min_width > 0 )
+ {
+ for( ; i2 < min_width && i2 < buf_len ; ++i2)
+ {
+ buffer[i2] = '0';
+ }
+ }
+
if( i2 >= buf_len )
{
buffer[0] = 0; // ops, the buffer was too small
@@ -106,7 +115,7 @@ return true;
// if the buffer is too small it will be terminated at the beginning (empty string)
// and the function returns false
template
-bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0)
{
if( len_out )
*len_out = 0;
@@ -126,7 +135,7 @@ bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size
is_sign = true;
}
- bool res = Toa(static_cast(value), buf, buf_len, base, len_out);
+ bool res = Toa(static_cast(value), buf, buf_len, base, len_out, min_width);
if( res )
{
@@ -146,44 +155,44 @@ return res;
template
-bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
template
-bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
template
-bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
template
-bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
template
-bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
template
-bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0)
+bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0)
{
- return Toa(static_cast(value), buffer, buf_len, base, len_out);
+ return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width);
}
@@ -192,7 +201,7 @@ bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t *
template
-void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
typename StringType::value_type buffer[50];
size_t buffer_len = sizeof(buffer) / sizeof(wchar_t);
@@ -204,13 +213,13 @@ void Toa(unsigned long long value, StringType & res, bool clear_string = true, i
* the size of the buffer is sufficient so the status should always be true
*/
size_t len_out;
- Toa(value, buffer, buffer_len, base, &len_out);
+ Toa(value, buffer, buffer_len, base, &len_out, min_width);
res.append(buffer, len_out);
}
template
-void Toa(long long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
typename StringType::value_type buffer[50];
size_t buffer_len = sizeof(buffer) / sizeof(wchar_t);
@@ -222,71 +231,71 @@ void Toa(long long value, StringType & res, bool clear_string = true, int base =
* the size of the buffer is sufficient so the status should always be true
*/
size_t len_out;
- Toa(value, buffer, buffer_len, base, &len_out);
+ Toa(value, buffer, buffer_len, base, &len_out, min_width);
res.append(buffer, len_out);
}
template
-void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
template
-void Toa(long value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
template
-void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
template
-void Toa(int value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
template
-void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
template
-void Toa(short value, StringType & res, bool clear_string = true, int base = 10)
+void Toa(short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0)
{
- Toa(static_cast(value), res, clear_string, base);
+ Toa(static_cast(value), res, clear_string, base, min_width);
}
-std::string to_str(unsigned long long value, int base = 10);
-std::string to_str(long long value, int base = 10);
-std::string to_str(unsigned long value, int base = 10);
-std::string to_str(long value, int base = 10);
-std::string to_str(unsigned int value, int base = 10);
-std::string to_str(int value, int base = 10);
-std::string to_str(unsigned short value, int base = 10);
-std::string to_str(short value, int base = 10);
+std::string to_str(unsigned long long value, int base = 10, size_t min_width = 0);
+std::string to_str(long long value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned long value, int base = 10, size_t min_width = 0);
+std::string to_str(long value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned int value, int base = 10, size_t min_width = 0);
+std::string to_str(int value, int base = 10, size_t min_width = 0);
+std::string to_str(unsigned short value, int base = 10, size_t min_width = 0);
+std::string to_str(short value, int base = 10, size_t min_width = 0);
-std::wstring to_wstr(unsigned long long value, int base = 10);
-std::wstring to_wstr(long long value, int base = 10);
-std::wstring to_wstr(unsigned long value, int base = 10);
-std::wstring to_wstr(long value, int base = 10);
-std::wstring to_wstr(unsigned int value, int base = 10);
-std::wstring to_wstr(int value, int base = 10);
-std::wstring to_wstr(unsigned short value, int base = 10);
-std::wstring to_wstr(short value, int base = 10);
+std::wstring to_wstr(unsigned long long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(long long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(long value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned int value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(int value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(unsigned short value, int base = 10, size_t min_width = 0);
+std::wstring to_wstr(short value, int base = 10, size_t min_width = 0);
diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp
index 978cce7..3d78ae7 100644
--- a/src/convert/misc.cpp
+++ b/src/convert/misc.cpp
@@ -5,7 +5,7 @@
*/
/*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,8 @@
*/
#include "misc.h"
+#include "inttostr.h"
+#include "utf8/utf8.h"
namespace pt
@@ -50,6 +52,363 @@ void SetOverflow(bool * was_overflow, bool val)
+// Write val to out as a JSON "\uXXXX" escape: the value in hexadecimal,
+// zero padded to at least four digits.
+// Intended for values up to 0xffff (try_esc_to_json calls it only for values below 32);
+// a larger value would need a surrogate pair to be valid JSON, which is not produced here.
+void esc_to_json_uformat(wchar_t val, Stream & out)
+{
+    // two hex digits per byte plus the terminating zero,
+    // so the conversion below cannot run out of space,
+    // whatever val is (a negative wchar_t widens to a large unsigned value)
+    char buf[2 * sizeof(unsigned long) + 1];
+    const size_t buf_len = sizeof(buf) / sizeof(buf[0]);
+
+    // base 16; the last argument is Toa's min_width, which makes Toa itself do the zero padding;
+    // the length of the result is not needed here, hence the null len_out
+    Toa(static_cast<unsigned long>(val), buf, buf_len, 16, nullptr, 4);
+
+    out << "\\u";
+    out << buf;
+}
+
+
+/*
+ * Escape a single character for a JSON string.
+ *
+ * Returns true if val was escaped and the escape sequence was put to the out stream.
+ * Returns false if val needs no escaping; nothing is put to the stream then
+ * and it is up to the caller to output val.
+ */
+bool try_esc_to_json(wchar_t val, Stream & out)
+{
+    switch( val )
+    {
+    // carriage return
+    case '\r':
+        out << '\\' << 'r';
+        return true;
+
+    // line feed
+    case '\n':
+        out << '\\' << 'n';
+        return true;
+
+    // horizontal tab
+    case '\t':
+        out << '\\' << 't';
+        return true;
+
+    // backspace
+    case 0x08:
+        out << '\\' << 'b';
+        return true;
+
+    // form feed
+    case 0x0c:
+        out << '\\' << 'f';
+        return true;
+
+    // the escape character itself
+    case '\\':
+        out << '\\' << '\\';
+        return true;
+
+    // the string delimiter
+    case '"':
+        out << '\\' << '\"';
+        return true;
+
+    default:
+        break;
+    }
+
+    // every other value below 32 has no short form and gets the \uXXXX one
+    if( val < 32 )
+    {
+        esc_to_json_uformat(val, out);
+        return true;
+    }
+
+    return false;
+}
+
+
+// Put val to out escaped for JSON; when try_esc_to_json reports no escaping, val itself is put to out.
+void esc_to_json(wchar_t val, Stream & out)
+{
+    const bool was_handled = try_esc_to_json(val, out);
+
+    if( !was_handled )
+        out << val;
+}
+
+
+// Narrow-character variant: val is widened through unsigned char before it is examined.
+void esc_to_json(char val, Stream & out)
+{
+    const wchar_t widened = static_cast<wchar_t>(static_cast<unsigned char>(val));
+
+    if( !try_esc_to_json(widened, out) )
+        out << val;
+}
+
+
+
+// Escape a zero-terminated narrow string, one character at a time.
+void esc_to_json(const char * c, pt::Stream & out)
+{
+    for(const char * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_json(*cursor, out);
+}
+
+
+// Escape exactly len narrow characters starting at c; no zero terminator is looked for.
+void esc_to_json(const char * c, std::size_t len, pt::Stream & out)
+{
+    for(const char * const last = c + len ; c != last ; ++c)
+        esc_to_json(*c, out);
+}
+
+
+// Escape a zero-terminated wide string, one character at a time.
+void esc_to_json(const wchar_t * c, pt::Stream & out)
+{
+    for(const wchar_t * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_json(*cursor, out);
+}
+
+
+// Escape exactly len wide characters starting at c; no zero terminator is looked for.
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out)
+{
+    for(const wchar_t * const last = c + len ; c != last ; ++c)
+        esc_to_json(*c, out);
+}
+
+
+// Escape all in.size() characters of the std::string.
+void esc_to_json(const std::string & in, Stream & out)
+{
+    esc_to_json(in.data(), in.size(), out);
+}
+
+
+// Escape all in.size() characters of the std::wstring.
+void esc_to_json(const std::wstring & in, Stream & out)
+{
+    esc_to_json(in.data(), in.size(), out);
+}
+
+
+
+/*
+ * Escape a single character for XML.
+ *
+ * The characters escaped are: < > & and the double quote.
+ *
+ * Returns true if val was handled here: either its entity reference was put
+ * to the out stream, or val is the null character, which is invalid in XML
+ * and is dropped (true is returned but nothing is put to the stream).
+ * Returns false if val needs no escaping; nothing is put to the stream then.
+ */
+bool try_esc_to_xml(wchar_t val, Stream & out)
+{
+    switch( val )
+    {
+    case 0:
+        // null character is invalid in XML 1.0 and 1.1
+        // https://en.wikipedia.org/wiki/Valid_characters_in_XML
+        return true;
+
+    case '<':
+        out << "&lt;";
+        return true;
+
+    case '>':
+        out << "&gt;";
+        return true;
+
+    case '&':
+        // must be escaped as well, otherwise it would start an entity reference in the output
+        out << "&amp;";
+        return true;
+
+    case '"':
+        // needed when the text ends up in a double-quoted attribute value
+        out << "&quot;";
+        return true;
+
+    default:
+        break;
+    }
+
+    return false;
+}
+
+
+// Put val to out escaped for XML; when try_esc_to_xml reports it did not handle val, val itself is put to out.
+void esc_to_xml(wchar_t val, Stream & out)
+{
+    const bool was_handled = try_esc_to_xml(val, out);
+
+    if( !was_handled )
+        out << val;
+}
+
+
+// Narrow-character variant: val is widened through unsigned char before it is examined.
+void esc_to_xml(char val, Stream & out)
+{
+    const wchar_t widened = static_cast<wchar_t>(static_cast<unsigned char>(val));
+
+    if( !try_esc_to_xml(widened, out) )
+        out << val;
+}
+
+
+// Escape a zero-terminated narrow string, one character at a time.
+void esc_to_xml(const char * c, pt::Stream & out)
+{
+    for(const char * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_xml(*cursor, out);
+}
+
+
+// Escape exactly len narrow characters starting at c; no zero terminator is looked for.
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out)
+{
+    for(const char * const last = c + len ; c != last ; ++c)
+        esc_to_xml(*c, out);
+}
+
+
+// Escape a zero-terminated wide string, one character at a time.
+void esc_to_xml(const wchar_t * c, pt::Stream & out)
+{
+    for(const wchar_t * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_xml(*cursor, out);
+}
+
+
+// Escape exactly len wide characters starting at c; no zero terminator is looked for.
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out)
+{
+    for(const wchar_t * const last = c + len ; c != last ; ++c)
+        esc_to_xml(*c, out);
+}
+
+
+// Escape all in.size() characters of the std::string.
+void esc_to_xml(const std::string & in, Stream & out)
+{
+    esc_to_xml(in.data(), in.size(), out);
+}
+
+
+// Escape all in.size() characters of the std::wstring.
+void esc_to_xml(const std::wstring & in, Stream & out)
+{
+    esc_to_xml(in.data(), in.size(), out);
+}
+
+
+
+
+
+/*
+ * Escape a single character for CSV.
+ *
+ * Returns true if val was handled here: a double quote is put to the out stream
+ * doubled, the null character is dropped (true is returned but nothing is put
+ * to the stream).
+ * Returns false, with nothing put to the stream, for every other character.
+ */
+bool try_esc_to_csv(wchar_t val, pt::Stream & out)
+{
+    // null characters are invalid in text files
+    if( val == 0 )
+        return true;
+
+    // a quote character is written twice
+    if( val == '"' )
+    {
+        out << "\"\"";
+        return true;
+    }
+
+    // anything else is left to the caller
+    return false;
+}
+
+
+// Put val to out escaped for CSV; when try_esc_to_csv reports it did not handle val, val itself is put to out.
+void esc_to_csv(wchar_t val, pt::Stream & out)
+{
+    const bool was_handled = try_esc_to_csv(val, out);
+
+    if( !was_handled )
+        out << val;
+}
+
+
+// Narrow-character variant: val is widened through unsigned char before it is examined.
+void esc_to_csv(char val, Stream & out)
+{
+    const wchar_t widened = static_cast<wchar_t>(static_cast<unsigned char>(val));
+
+    if( !try_esc_to_csv(widened, out) )
+        out << val;
+}
+
+
+// Escape a zero-terminated narrow string, one character at a time.
+void esc_to_csv(const char * c, pt::Stream & out)
+{
+    for(const char * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_csv(*cursor, out);
+}
+
+
+// Escape exactly len narrow characters starting at c; no zero terminator is looked for.
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out)
+{
+    for(const char * const last = c + len ; c != last ; ++c)
+        esc_to_csv(*c, out);
+}
+
+
+// Escape a zero-terminated wide string, one character at a time.
+void esc_to_csv(const wchar_t * c, pt::Stream & out)
+{
+    for(const wchar_t * cursor = c ; *cursor != 0 ; ++cursor)
+        esc_to_csv(*cursor, out);
+}
+
+
+// Escape exactly len wide characters starting at c; no zero terminator is looked for.
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out)
+{
+    for(const wchar_t * const last = c + len ; c != last ; ++c)
+        esc_to_csv(*c, out);
+}
+
+
+// Escape all in.size() characters of the std::string.
+void esc_to_csv(const std::string & in, Stream & out)
+{
+    esc_to_csv(in.data(), in.size(), out);
+}
+
+
+// Escape all in.size() characters of the std::wstring.
+void esc_to_csv(const std::wstring & in, Stream & out)
+{
+    esc_to_csv(in.data(), in.size(), out);
+}
+
+
+
}
diff --git a/src/convert/misc.h b/src/convert/misc.h
index 7dbb128..e8b10bf 100644
--- a/src/convert/misc.h
+++ b/src/convert/misc.h
@@ -5,7 +5,7 @@
*/
/*
- * Copyright (c) 2017, Tomasz Sowa
+ * Copyright (c) 2017-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,9 @@
#include
#include "text.h"
+#include "textstream/stream.h"
+#include "textstream/types.h"
+#include "utf8/utf8_stream.h"
namespace pt
@@ -47,6 +50,78 @@ namespace pt
void SetOverflow(bool * was_overflow, bool val);
+/*
+ * escaping for json
+ * overloads: single character, string without a length, buffer with an explicit
+ * length, std::string and std::wstring
+ * (presumably try_esc_to_json follows the same contract as try_esc_to_csv below - confirm in misc.cpp)
+ */
+bool try_esc_to_json(wchar_t val, Stream & out);
+void esc_to_json(wchar_t val, Stream & out);
+void esc_to_json(char val, Stream & out);
+void esc_to_json(const char * c, pt::Stream & out);
+void esc_to_json(const char * c, std::size_t len, Stream & out);
+void esc_to_json(const wchar_t * c, Stream & out);
+void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_json(const std::string & in, Stream & out);
+void esc_to_json(const std::wstring & in, Stream & out);
+
+/*
+ * escaping for xml, the same set of overloads as for json
+ */
+bool try_esc_to_xml(wchar_t val, Stream & out);
+void esc_to_xml(wchar_t c, pt::Stream & out);
+void esc_to_xml(char c, pt::Stream & out);
+void esc_to_xml(const char * c, pt::Stream & out);
+void esc_to_xml(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, pt::Stream & out);
+void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_xml(const std::string & in, Stream & out);
+void esc_to_xml(const std::wstring & in, Stream & out);
+
+/*
+ * escaping for csv values
+ * try_esc_to_csv returns true if the character was handled (a double quote is
+ * printed as two double quotes, a null character is dropped), false if the caller
+ * should print the character by itself
+ *
+ * the std::wstring overload must be declared here: without it a call with
+ * a std::wstring selects the StreamType template below, which requires
+ * an iterator with get_unicode_and_advance()
+ */
+bool try_esc_to_csv(wchar_t val, pt::Stream & out);
+void esc_to_csv(wchar_t val, Stream & out);
+void esc_to_csv(char c, pt::Stream & out);
+void esc_to_csv(const char * c, pt::Stream & out);
+void esc_to_csv(const char * c, std::size_t len, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, pt::Stream & out);
+void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out);
+void esc_to_csv(const std::string & in, Stream & out);
+void esc_to_csv(const std::wstring & in, Stream & out);
+
+
+
+/*
+ * escape the whole content of a stream for json
+ *
+ * StreamType has to provide begin(), end() and a const_iterator type with
+ * a get_unicode_and_advance(end) method which returns the next character
+ * and moves the iterator forward
+ * each returned character is passed to esc_to_json(wchar_t, Stream&)
+ */
+template<typename StreamType>
+void esc_to_json(const StreamType & in, Stream & out)
+{
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+
+	while( i != end )
+	{
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_json(c, out);
+	}
+}
+
+
+/*
+ * escape the whole content of a stream for xml
+ *
+ * StreamType has to provide begin(), end() and a const_iterator type with
+ * a get_unicode_and_advance(end) method which returns the next character
+ * and moves the iterator forward
+ * each returned character is passed to esc_to_xml(wchar_t, Stream&)
+ */
+template<typename StreamType>
+void esc_to_xml(const StreamType & in, Stream & out)
+{
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+
+	while( i != end )
+	{
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_xml(c, out);
+	}
+}
+
+
+/*
+ * escape the whole content of a stream for csv
+ *
+ * StreamType has to provide begin(), end() and a const_iterator type with
+ * a get_unicode_and_advance(end) method which returns the next character
+ * and moves the iterator forward
+ * each returned character is passed to esc_to_csv(wchar_t, Stream&)
+ */
+template<typename StreamType>
+void esc_to_csv(const StreamType & in, Stream & out)
+{
+	typename StreamType::const_iterator i = in.begin();
+	typename StreamType::const_iterator end = in.end();
+
+	while( i != end )
+	{
+		wchar_t c = i.get_unicode_and_advance(end);
+		esc_to_csv(c, out);
+	}
+}
}
diff --git a/src/csv/csvparser.cpp b/src/csv/csvparser.cpp
index 4ab1480..0a83e92 100644
--- a/src/csv/csvparser.cpp
+++ b/src/csv/csvparser.cpp
@@ -44,10 +44,17 @@ namespace pt
{
+// by default the input is treated as UTF-8
+CSVParser::CSVParser()
+{
+ input_as_utf8 = true;
+}
+
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
+ clear_input_flags();
+
reading_from_file = true;
space = &out_space;
@@ -98,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space &
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
- reading_from_file = false;
- reading_from_wchar_string = false;
- pchar_ascii = str;
- pchar_unicode = 0;
- space = &out_space;
+ clear_input_flags();
+
+ pchar_ascii = str;
+ space = &out_space;
parse();
@@ -119,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
- reading_from_file = false;
- reading_from_wchar_string = true;
- pchar_unicode = str;
- pchar_ascii = 0;
- space = &out_space;
+ clear_input_flags();
+
+ pchar_unicode = str;
+ space = &out_space;
parse();
@@ -285,132 +290,6 @@ bool CSVParser::read_non_quoted_value_to(std::wstring & value)
-
-int CSVParser::read_utf8_char()
-{
-int c;
-bool correct;
-
- lastc = -1;
-
- do
- {
- utf8_to_int(file, c, correct);
-
- if( !file )
- return lastc;
- }
- while( !correct );
-
- lastc = c;
-
- if( lastc == '\n' )
- ++line;
-
-return lastc;
-}
-
-
-
-int CSVParser::read_ascii_char()
-{
- lastc = file.get();
-
- if( lastc == '\n' )
- ++line;
-
-return lastc;
-}
-
-
-
-
-int CSVParser::read_char_from_wchar_string()
-{
- if( *pchar_unicode == 0 )
- lastc = -1;
- else
- lastc = *(pchar_unicode++);
-
- if( lastc == '\n' )
- ++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_utf8_string()
-{
-int c;
-bool correct;
-
- lastc = -1;
-
- do
- {
- size_t len = utf8_to_int(pchar_ascii, c, correct);
- pchar_ascii += len;
- }
- while( *pchar_ascii && !correct );
-
- if( correct )
- lastc = c;
-
- if( lastc == '\n' )
- ++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_ascii_string()
-{
- if( *pchar_ascii == 0 )
- lastc = -1;
- else
- lastc = *(pchar_ascii++);
-
- if( lastc == '\n' )
- ++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_no_escape()
-{
- if( reading_from_file )
- {
- if( input_as_utf8 )
- return read_utf8_char();
- else
- return read_ascii_char();
- }
- else
- {
- if( reading_from_wchar_string )
- {
- return read_char_from_wchar_string();
- }
- else
- {
- if( input_as_utf8 )
- return read_char_from_utf8_string();
- else
- return read_char_from_ascii_string();
- }
- }
-}
-
-
-
-
-int CSVParser::read_char()
-{
- return read_char_no_escape();
-}
-
-
}
diff --git a/src/csv/csvparser.h b/src/csv/csvparser.h
index 8370867..c549fa5 100644
--- a/src/csv/csvparser.h
+++ b/src/csv/csvparser.h
@@ -38,9 +38,11 @@
#ifndef headerfile_picotools_csv_csvparser
#define headerfile_picotools_csv_csvparser
-#include "space/space.h"
#include
#include
+#include "space/space.h"
+#include "convert/baseparser.h"
+
namespace pt
@@ -51,10 +53,12 @@ namespace pt
* https://datatracker.ietf.org/doc/html/rfc4180
*
*/
-class CSVParser
+class CSVParser : public BaseParser
{
public:
+ CSVParser();
+
enum Status
{
ok,
@@ -85,53 +89,6 @@ protected:
Space * space;
- /*
- true if parse_file() method was called
- false if parse() was called
- */
- bool reading_from_file;
-
- /*
- true if parse(wchar_t *) or parse(std::wstring&) was called
- */
- bool reading_from_wchar_string;
-
- /*
- pointers to the current character
- if parse() is being used
- */
- const char * pchar_ascii;
- const wchar_t * pchar_unicode;
-
-
- /*
- last read char
- or -1 if the end
- */
- int lastc;
-
-
-
- /*
- a number of a line in which there is a syntax_error
- */
- int line;
-
- /*
- current file
-
- may it would be better to make a pointer?
- if we parse only a string then there is no sense to have such an object
- */
- std::ifstream file;
-
- /*
- input file is in UTF-8
- default: true
- */
- bool input_as_utf8;
-
-
void parse();
@@ -142,19 +99,6 @@ protected:
bool read_non_quoted_value_to(std::wstring & value);
-
- /*
- * copied from SpaceParser
- * may it would be better to have a class with those methods and inherit from it?
- */
- int read_utf8_char();
- int read_ascii_char();
- int read_char_from_wchar_string();
- int read_char_from_utf8_string();
- int read_char_from_ascii_string();
- int read_char_no_escape();
-
- int read_char();
};
}
diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp
new file mode 100644
index 0000000..254de60
--- /dev/null
+++ b/src/html/bbcodeparser.cpp
@@ -0,0 +1,645 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bbcodeparser.h"
+
+
+namespace pt
+{
+
+
+
+
+/*
+ * compare two null terminated wide strings (case sensitive)
+ * returns true only if both strings have the same characters and the same length
+ */
+bool BBCODEParser::Equal(const wchar_t * str1, const wchar_t * str2)
+{
+	const wchar_t * a = str1;
+	const wchar_t * b = str2;
+
+	for( ; *a != 0 && *a == *b ; ++a, ++b)
+	{
+	}
+
+	// both pointers stand either on the first difference or on the terminating zeros
+	return *a == *b;
+}
+
+
+
+
+/*
+ * a bbcode tag name can consist of ascii letters, '*' and '_'
+ */
+bool BBCODEParser::IsValidCharForName(int c)
+{
+	const bool is_lower   = (c >= 'a' && c <= 'z');
+	const bool is_upper   = (c >= 'A' && c <= 'Z');
+	const bool is_special = (c == '*' || c == '_');
+
+	return is_lower || is_upper || is_special;
+}
+
+
+// in bbcode a tag starts with '['
+bool BBCODEParser::IsOpeningTagMark(wchar_t c)
+{
+ return (c == '[');
+}
+
+
+// in bbcode a tag ends with ']'
+bool BBCODEParser::IsClosingTagMark(wchar_t c)
+{
+ return (c == ']');
+}
+
+// always false: no character is reported as a closing mark of a simple xml tag, c is not used
+bool BBCODEParser::IsClosingXmlSimpleTagMark(wchar_t c)
+{
+ return false;
+}
+
+
+
+// there are no commentaries (comments) in bbcode
+// so no text is ever recognized as an opening commentary mark
+bool BBCODEParser::IsOpeningCommentaryTagMark(const wchar_t *)
+{
+ return false;
+}
+
+
+// the size of the opening commentary mark is always reported as zero
+size_t BBCODEParser::OpeningCommentaryTagMarkSize()
+{
+ return 0;
+}
+
+
+
+// nothing is skipped, always returns false
+bool BBCODEParser::SkipCommentaryTagIfExists()
+{
+ return false;
+}
+
+
+
+
+
+
+
+
+
+
+// one enter will generate one <br>
+// two enters or more will generate only two br (<br><br>)
+void BBCODEParser::PutNormalText(const wchar_t * str, const wchar_t * end)
+{
+int br_len;
+
+ if( lastc != -1 )
+ {
+ // trimming last white characters at end of the user text
+ while( str\n";
+ }
+ }
+ else
+ {
+ PrintEscape(*str);
+ ++str;
+ }
+ }
+}
+
+
+
+// empty override of the HTMLParser virtual method
+// start and last_non_white are left untouched
+void BBCODEParser::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
+{
+}
+
+
+/*
+ * when the two topmost items on the stack are both opening [*] tags
+ * the topmost one is removed from the stack
+ */
+void BBCODEParser::CheckExceptions()
+{
+	if( stack_len < 2 )
+		return;
+
+	Item & top   = pstack[stack_len-1];
+	Item & below = pstack[stack_len-2];
+
+	if( top.type != Item::opening || below.type != Item::opening )
+		return;
+
+	if( IsNameEqual(L"*", top.name) && IsNameEqual(L"*", below.name) )
+	{
+		// removing the last [*] from the stack
+		// was put automatically
+		PopStack();
+	}
+}
+
+
+
+
+/*
+ bbcode format:
+ [bbcodetag=value]some text[/bbcodetag]
+ the value can be quoted, e.g.
+ [bbcodetag="value"]some text[/bbcodetag], or
+ [bbcodetag='value']some text[/bbcodetag]
+
+ the third string below (in tags table) is 'html_argument' from Tags,
+ it can contain a special character % followed by a string which means:
+ %1 - "value" escaped as for html
+ %2 - "some text" escaped as for html
+ %u1 - "value" trimmed and escaped as for url-es
+ %u2 - "some text" trimmed and escaped as for url-es
+ %% - one %
+
+ if you are using %2 or %u2 then "some text" is not treated as bbcode, e.g.
+ [bbcodetag=value]some [b]text[/b][/bbcodetag] will produce:
+ some [b]text[/b] (the inner tags [b][/b] were not parsed)
+
+ also when using %2 or %u2 the closing bbcode tag is skipped
+ (if you want this tag then you can put it in 'html_argument')
+
+ and when using u (%u1 or %u2) the argument is trimmed from whitespaces and new lines
+ at the beginning and at the end
+ (because otherwise a space would be changed to %20 and this were probably not what you really wanted)
+*/
+const BBCODEParser::Tags * BBCODEParser::FindTag(const wchar_t * tag)
+{
+ static Tags tags[] = {
+ {L"*", L"li", L">", false},
+ {L"b", L"em", L">", true},
+ {L"i", L"span", L" class=\"bbitalic\">", true},
+ {L"u", L"span", L" class=\"bbunderline\">", true},
+ {L"s", L"span", L" class=\"bbstrike\">", true},
+ {L"code", L"code", L" class=\"bbcode\">", false},
+ {L"list", L"ul", L" class=\"bblist\">", false},
+ {L"color", L"span", L" class=\"bbcol%1\">", true},
+ {L"url", L"a", L" href=\"%u1\">", true},
+ {L"img", L"img", L" alt=\"%1\" src=\"%u2\">", true},
+ {L"quote", L"div", L" class=\"bbquote\">\n%1
\n", false},
+ };
+
+ size_t i;
+ size_t len = sizeof(tags) / sizeof(Tags);
+
+ for(i=0 ; i='a' && c<='z') ||
+ (c>='A' && c<='Z') ||
+ (c>='0' && c<='9') ||
+ (c=='_' || c=='?' || c=='.' || c==',' || c=='/' || c=='-' ||
+ c=='+' || c=='*' || c=='(' || c==')' || c=='=' || c==':')
+ )
+ {
+ (*out_string) += c;
+ }
+ else
+ {
+ wchar_t buffer[20];
+ swprintf(buffer, 20, L"%02X", c);
+
+ (*out_string) += '%';
+ (*out_string) += buffer;
+ }
+}
+
+
+/*
+ * append one character to out_string escaped as for html
+ *
+ * '<', '>' and '&' are always replaced with their entities,
+ * a double quote is replaced with &quot; only if change_quote is true
+ * (needed when the text is put inside an attribute value),
+ * every other character is appended unchanged
+ */
+void BBCODEParser::PrintEscape(int c, bool change_quote)
+{
+	if( c == '<' )
+	{
+		(*out_string) += L"&lt;";
+	}
+	else
+	if( c == '>' )
+	{
+		(*out_string) += L"&gt;";
+	}
+	else
+	if( c == '&' )
+	{
+		(*out_string) += L"&amp;";
+	}
+	else
+	if( c == '\"' && change_quote )
+	{
+		(*out_string) += L"&quot;";
+	}
+	else
+	{
+		(*out_string) += static_cast<wchar_t>(c);
+	}
+}
+
+
+void BBCODEParser::PrintArgumentEncode(const wchar_t * start, const wchar_t * end)
+{
+ PrintArgumentCheckQuotes(start, end);
+ TrimWhiteWithNewLines(start, end);
+
+ for( ; starthtml_tag, tag_name) )
+ {
+ if( condition )
+ {
+ PutClosingTag(tag);
+ (*out_string) += '\n';
+ }
+
+ condition = true;
+ }
+}
+
+
+/*
+ * update the li/ul/ol state before an opening tag is printed
+ *
+ * the three argument overload is called for "li", "ul" and "ol" with the
+ * matching has_open_*_tag flag
+ * if a list item has been opened while no list (ul or ol) was open before
+ * this call, an <ul> tag is printed first and remembered as open
+ */
+void BBCODEParser::CheckOpeningTag(const Tags * tag)
+{
+	// the state from before the calls below, they can change the flags
+	bool has_list_tag = has_open_ul_tag || has_open_ol_tag;
+
+	CheckOpeningTag(tag, L"li", has_open_li_tag);
+	CheckOpeningTag(tag, L"ul", has_open_ul_tag);
+	CheckOpeningTag(tag, L"ol", has_open_ol_tag);
+
+	if( has_open_li_tag && !has_list_tag )
+	{
+		// the flag below says that an ul tag is open so the tag has to be printed as well
+		(*out_string) += L"<ul>\n";
+		has_open_ul_tag = true;
+	}
+}
+
+
+
+
+
+/*
+ * print all characters from the range [start, end) through PrintEscape(int, bool)
+ */
+void BBCODEParser::PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote)
+{
+	const wchar_t * p = start;
+
+	while( p < end )
+	{
+		PrintEscape(*p, change_quote);
+		++p;
+	}
+}
+
+
+
+/*
+ * print all characters from the range [start, end) through PrintEncode(int)
+ */
+void BBCODEParser::PrintEncode(const wchar_t * start, const wchar_t * end)
+{
+	const wchar_t * p = start;
+
+	while( p < end )
+	{
+		PrintEncode(*p);
+		++p;
+	}
+}
+
+
+
+/*
+ * called by PutOpeningTag() when the name of the last item is not a known bbcode tag
+ * prints '[' and the name of the last item to out_string
+ *
+ * the rest of the tag (up to and including ']') is currently NOT copied,
+ * the code doing it is disabled below (FIXME)
+ */
+void BBCODEParser::PutOpeningTagFromEzc()
+{
+ // this can be a tag from Ezc templates system
+ (*out_string) += '[';
+ (*out_string) += LastItem().name;
+
+
+// FIXME
+// const wchar_t * start = pchar;
+//
+// while( *pchar && *pchar!=']' )
+// ++pchar;
+//
+// if( *pchar == ']' )
+// ++pchar;
+//
+// Put(start, pchar);
+}
+
+
+
+
+
+/*
+ * print the tag value (%1 or %u1 from a html_argument pattern)
+ * has_u == true  - the value is printed encoded (PrintArgumentEncode)
+ * has_u == false - the value is printed escaped (PrintArgumentEscape)
+ */
+void BBCODEParser::PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u)
+{
+	if( !has_u )
+	{
+		PrintArgumentEscape(arg_start, arg_end);
+		return;
+	}
+
+	PrintArgumentEncode(arg_start, arg_end);
+}
+
+
+
+/*
+ * narrow the range [start, end) so that it neither begins nor ends with
+ * a white character (IsWhite) or a new line character (10)
+ * both pointers are modified in place, the range can become empty
+ */
+void BBCODEParser::TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end)
+{
+	for( ; start < end ; ++start)
+	{
+		if( !IsWhite(*start) && *start != 10 )
+			break;
+	}
+
+	for( ; start < end ; --end)
+	{
+		if( !IsWhite(*(end-1)) && *(end-1) != 10 )
+			break;
+	}
+}
+
+
+
+/*
+ * handle %2 / %u2 from a html_argument pattern ("some text" between the tags)
+ *
+ * in the current state the method only consumes the input up to the end
+ * and removes the opening tag from the stack: detecting the closing tag and
+ * printing the collected text are disabled (see the FIXME notes below)
+ *
+ * tag and has_u are not used until the disabled code is restored
+ */
+void BBCODEParser::PutHtmlArgument2(const Tags * tag, bool has_u)
+{
+//const wchar_t * start = pchar;
+//const wchar_t * end = pchar;
+bool first_tag_removed = false;
+
+	while( lastc != -1 )
+	{
+		if( IsOpeningTagMark(lastc) )
+		{
+			// FIXME
+//			if( IsClosingTagForLastItem() )
+//			{
+//				// the last tag is skipped when using patterns with %2 or %u2
+//
+//				PopStack(); // removing opening tag from the stack
+//				first_tag_removed = true;
+//				break;
+//			}
+		}
+
+		// the current character has to be consumed in every case:
+		// as long as the closing tag check above is disabled, leaving '['
+		// unread would make this loop spin forever
+		read_char();
+		//end = pchar;
+	}
+
+	if( !first_tag_removed )
+		PopStack(); // user has forgotten to close the tag
+
+	if( has_u )
+	{
+// FIXME
+//		TrimWhiteWithNewLines(start, end);
+//		PrintEncode(start, end);
+	}
+	else
+	{
+		// FIXME
+//		PrintEscape(start, end);
+	}
+}
+
+
+
+/*
+ * print tag->html_argument to out_string interpreting the % sequences:
+ *   %1, %u1 - PutHtmlArgument1() is called with the range [arg_start, arg_end)
+ *   %2, %u2 - PutHtmlArgument2() is called
+ *   %%      - one % is printed
+ * all other characters of the pattern are copied as they are
+ */
+void BBCODEParser::PutHtmlArgument(const Tags * tag, const wchar_t * arg_start, const wchar_t * arg_end)
+{
+const wchar_t * pattern = tag->html_argument;
+bool has_u;
+
+ while( *pattern )
+ {
+ if( *pattern == '%' )
+ {
+ ++pattern;
+ has_u = false;
+
+ if( *pattern == 'u' )
+ {
+ ++pattern;
+ has_u = true;
+ }
+
+ if( *pattern == '1' )
+ {
+ ++pattern;
+ PutHtmlArgument1(arg_start, arg_end, has_u);
+ }
+ else
+ if( *pattern == '2' )
+ {
+ ++pattern;
+ PutHtmlArgument2(tag, has_u);
+ }
+ else
+ if( *pattern == '%' )
+ {
+ (*out_string) += '%';
+ ++pattern;
+ }
+ // else unrecognized: the '%' (and an optional 'u') has already been skipped and is not printed,
+ // the following character will be printed next time as a normal character
+ }
+ else
+ {
+ (*out_string) += *pattern;
+ ++pattern;
+ }
+ }
+}
+
+
+/*
+ * print the html opening tag for a known bbcode tag
+ *
+ * the li/ul/ol state is updated first (CheckOpeningTag), then the opening
+ * mark and tag->html_tag are printed
+ * for a non inline tag a new line character (10) is put and SkipWhiteLines() is called
+ *
+ * printing of tag->html_argument (PutHtmlArgument) is currently disabled (FIXME below)
+ */
+void BBCODEParser::PutOpeningTagFromBBCode(const Tags * tag)
+{
+ CheckOpeningTag(tag);
+ PutOpeningTagMark();
+ Put(tag->html_tag);
+
+// FIXME
+// const wchar_t * start = pchar;
+//
+// while( *pchar && *pchar != ']' )
+// ++pchar;
+//
+// PutHtmlArgument(tag, start, pchar);
+//
+// if( *pchar == ']' )
+// ++pchar;
+
+ if( !tag->inline_tag )
+ {
+ Put(10);
+ SkipWhiteLines();
+ }
+}
+
+
+/*
+ * print the opening tag for the last item on the stack
+ * a name found in the bbcode table is printed as html,
+ * an unknown name is treated as a tag from Ezc templates
+ * always returns false
+ */
+bool BBCODEParser::PutOpeningTag()
+{
+	const Tags * found = FindTag(LastItem().name);
+
+	if( found )
+		PutOpeningTagFromBBCode(found);
+	else
+		PutOpeningTagFromEzc();
+
+	return false;
+}
+
+
+/*
+ * print the html closing tag for the given table entry
+ * a null tag is skipped (nothing is printed)
+ * after a non inline tag a new line is printed and SkipWhiteLines() is called
+ * closing li, ol or ul clears the matching has_open_*_tag flag
+ */
+void BBCODEParser::PutClosingTag(const Tags * tag)
+{
+	if( tag == nullptr )
+		return; // skipping the tag
+
+	const wchar_t * html_name = tag->html_tag;
+
+	PutOpeningTagMark();
+	(*out_string) += '/';
+	(*out_string) += html_name;
+	PutClosingTagMark();
+
+	if( !tag->inline_tag )
+	{
+		(*out_string) += L"\n";
+		SkipWhiteLines();
+	}
+
+	// the three names are different so at most one flag is cleared
+	if( Equal(html_name, L"li") )
+		has_open_li_tag = false;
+	else
+	if( Equal(html_name, L"ol") )
+		has_open_ol_tag = false;
+	else
+	if( Equal(html_name, L"ul") )
+		has_open_ul_tag = false;
+}
+
+
+/*
+ * print the html closing tag for a bbcode tag name
+ * an unknown name gives a null entry which is skipped by PutClosingTag(const Tags *)
+ */
+void BBCODEParser::PutClosingTag(const wchar_t * tag_name)
+{
+	PutClosingTag(FindTag(tag_name));
+}
+
+
+
+/*
+ * called before reading the input (virtual from HTMLParser)
+ * no li, ol or ul tag is open at the beginning
+ * SkipWhiteLines() is then called on the beginning of the input
+ */
+void BBCODEParser::Init()
+{
+ has_open_li_tag = false;
+ has_open_ol_tag = false;
+ has_open_ul_tag = false;
+
+ SkipWhiteLines();
+}
+
+
+/*
+ * called after the whole input has been read (virtual from HTMLParser)
+ * closes the list tags which are still open, in the order: li, ol, ul
+ */
+void BBCODEParser::Uninit()
+{
+	if( has_open_li_tag )
+		(*out_string) += L"</li>\n";
+
+	if( has_open_ol_tag )
+		(*out_string) += L"</ol>\n";
+
+	if( has_open_ul_tag )
+		(*out_string) += L"</ul>\n";
+}
+
+
+
+}
+
diff --git a/src/html/bbcodeparser.h b/src/html/bbcodeparser.h
new file mode 100644
index 0000000..a2e2e7f
--- /dev/null
+++ b/src/html/bbcodeparser.h
@@ -0,0 +1,128 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_winix_core_bbcodeparser
+#define headerfile_winix_core_bbcodeparser
+
+#include "htmlparser.h"
+
+namespace pt
+{
+
+
+/*
+ * a parser changing bbcode ([b]text[/b], [url=...]...[/url] etc.) to html
+ * built on top of HTMLParser: the tag marks are '[' and ']' and the known
+ * bbcode tags are mapped to html tags through the Tags table (see FindTag)
+ */
+class BBCODEParser : public HTMLParser
+{
+
+ // one entry of the bbcode -> html table
+ struct Tags
+ {
+ const wchar_t * bbcode; // name of the bbcode tag
+ const wchar_t * html_tag; // name of the html tag printed instead
+ const wchar_t * html_argument; // with closing '>'
+ bool inline_tag; // if false a new line is put after the tag
+ };
+
+
+ /*
+ virtual methods
+ (from HTMLParser class)
+ */
+ virtual void Init();
+ virtual void Uninit();
+
+ virtual bool IsOpeningTagMark(wchar_t c);
+ virtual bool IsClosingTagMark(wchar_t c);
+ virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
+
+ virtual bool IsOpeningCommentaryTagMark(const wchar_t *);
+ virtual size_t OpeningCommentaryTagMarkSize();
+
+ virtual bool IsValidCharForName(int c);
+ virtual void CheckExceptions();
+ virtual bool SkipCommentaryTagIfExists();
+
+ virtual bool PutOpeningTag();
+ virtual void PutClosingTag(const wchar_t * tag);
+
+ virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
+ virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
+
+
+
+ /*
+ others
+ */
+ bool Equal(const wchar_t * str1, const wchar_t * str2);
+
+ void PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u);
+ void PutHtmlArgument2(const Tags * tag, bool has_u);
+ void PutHtmlArgument(const Tags * tag, const wchar_t * arg_start, const wchar_t * arg_end);
+
+ void PutOpeningTagFromEzc();
+ void PutOpeningTagFromBBCode(const Tags * tag);
+
+ const Tags * FindTag(const wchar_t * tag);
+ const Tags * FindTag(const std::wstring & tag);
+ void PrintArgumentCheckQuotes(const wchar_t * & start, const wchar_t * & end);
+
+ void PrintEscape(int c, bool change_quote = false);
+ void PrintEncode(int c);
+
+ void PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote = false);
+ void PrintEncode(const wchar_t * start, const wchar_t * end);
+
+ void PrintArgumentEncode(const wchar_t * start, const wchar_t * end);
+ void PrintArgumentEscape(const wchar_t * start, const wchar_t * end);
+
+ void PutClosingTag(const Tags * tag);
+
+ void CheckOpeningTag(const Tags * tag, const wchar_t * tag_name, bool & condition);
+ void CheckOpeningTag(const Tags * tag);
+
+ void TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end);
+
+
+
+ bool has_open_ol_tag; // has open html <ol> tag
+ bool has_open_ul_tag; // has open html <ul> tag
+ bool has_open_li_tag; // has open html <li> tag
+};
+
+
+}
+
+
+#endif
diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
new file mode 100644
index 0000000..f4b158e
--- /dev/null
+++ b/src/html/htmlparser.cpp
@@ -0,0 +1,2434 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2022, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "htmlparser.h"
+#include "convert/text.h"
+
+
+namespace pt
+{
+const int HTMLParser::WHITE_MODE_ORIGIN;
+const int HTMLParser::WHITE_MODE_SINGLE_LINE;
+const int HTMLParser::WHITE_MODE_TREE;
+
+
+
+/*
+ * reset the state of the parser, called at the beginning of each parse/filter method
+ *
+ * BaseParser::clear_input_flags() is called first, then the HTMLParser fields
+ * are restored: html parsing with the xml compact mode, status ok, line 1,
+ * an empty stack, no output target (string, stream or space), an empty buffer
+ * of escaped characters and the filter mode turned off
+ * the caller sets the input and output fields afterwards
+ */
+void HTMLParser::clear_input_flags()
+{
+ BaseParser::clear_input_flags();
+
+ parsing_html = true;
+ xml_compact_mode = true;
+ status = ok;
+ line = 1;
+ stack_len = 0;
+ out_string = nullptr;
+ out_stream = nullptr;
+ out_space = nullptr;
+ line_len = 0;
+ char_was_escaped = false;
+ escaped_chars_buffer.clear();
+ escaped_char_index = 0;
+ filter_mode = false;
+}
+
+
+
+
+/*
+ * bring the stack item back to the empty state:
+ * no name, type none, all flags false, tree_index zero
+ * and the porphans and space pointers set to null
+ */
+void HTMLParser::Item::Clear()
+{
+ name.clear();
+ type = none;
+ is_commentary = false;
+ is_cdata = false;
+ porphans = nullptr;
+ new_line_before = false;
+ new_line_after = false;
+ new_line_in_the_middle = false;
+ white_char_before = false;
+ has_body_tag = false;
+ tree_index = 0;
+ space = nullptr;
+}
+
+
+HTMLParser::Item::Item()
+{
+ Clear();
+}
+
+
+/*
+ * parse html from a wide string (in) into the space object
+ *
+ * the space is always cleared before parsing
+ * compact_mode is stored in xml_compact_mode
+ * the parser stays in the html mode (parsing_html is set by clear_input_flags)
+ */
+void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
+{
+ clear_input_flags();
+
+ pchar_unicode = in;
+ xml_compact_mode = compact_mode;
+ out_space = &space;
+ out_space->clear();
+
+ Init();
+ Read();
+ Uninit();
+}
+
+
+void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener)
+{
+ item_parsed_listener = listener;
+}
+
+
+
+/*
+ * parse an xml file into out_space
+ *
+ * file_name    - path of the file, opened in the binary mode
+ * compact_mode - stored in xml_compact_mode
+ * clear_space  - if true out_space is cleared before parsing
+ *
+ * returns cant_open_file if the file cannot be opened,
+ * otherwise the status left by the parser
+ */
+HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	clear_input_flags();
+
+	reading_from_file = true;
+	parsing_html = false;
+	xml_compact_mode = compact_mode;
+	this->out_space = &out_space;
+
+	if( clear_space )
+		out_space.clear();
+
+	file.clear();
+	file.open(file_name, std::ios_base::binary | std::ios_base::in);
+
+	if( !file )
+	{
+		status = cant_open_file;
+		return status;
+	}
+
+	Init();
+	Read();
+	Uninit();
+
+	file.close();
+
+	return status;
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+ return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+/*
+ * parse an xml file, the wide file name is converted to utf-8 first
+ */
+HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+	std::string utf8_name;
+	wide_to_utf8(file_name, utf8_name);
+
+	const char * utf8_name_str = utf8_name.c_str();
+
+	return parse_xml_file(utf8_name_str, out_space, compact_mode, clear_space);
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{
+ return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+
+/*
+ * filter html from a wide string (in), the result is appended to out
+ *
+ * clear_out_string - if true out is cleared first
+ * the filter mode is turned on for this run
+ */
+void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
+{
+ clear_input_flags();
+
+ pchar_unicode = in;
+ out_string = &out;
+ filter_mode = true;
+
+ if( clear_out_string )
+ out_string->clear();
+
+ Init();
+ Read();
+ Uninit();
+}
+
+
+/*
+ * filter html from in to out
+ *
+ * in and out have to be different objects, if they are the same string
+ * nothing is done
+ * memory for about twice the input size is reserved in out before filtering
+ */
+void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
+{
+	const bool same_string = (&in == &out);
+
+	if( same_string )
+	{
+		// out cannot be the same string as in
+		return;
+	}
+
+	const size_t wanted_capacity = in.size() * 2 + 1;
+
+	if( wanted_capacity > out.capacity() )
+		out.reserve(wanted_capacity);
+
+	filter(in.c_str(), out, clear_out_string);
+}
+
+
+/*
+ * filter html from a WTextStream (in) to the out stream
+ *
+ * clear_out_stream - if true out is cleared first
+ * the input is read through two iterators living on the stack of this
+ * method, the parser only keeps pointers to them for the time of reading
+ */
+void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
+{
+	clear_input_flags();
+
+	WTextStream::const_iterator begin = in.begin();
+	WTextStream::const_iterator end = in.end();
+
+	wtext_stream_iterator = &begin;
+	wtext_stream_iterator_end = &end;
+	out_stream = &out;
+	filter_mode = true;
+
+	if( clear_out_stream )
+		out_stream->clear();
+
+	Init();
+	Read();
+	Uninit();
+
+	// begin and end are destroyed when this method returns,
+	// don't leave dangling pointers to them in the object
+	wtext_stream_iterator = nullptr;
+	wtext_stream_iterator_end = nullptr;
+}
+
+
+/*
+ * filter html read from a file, the result is appended to out
+ *
+ * file_name        - path of the file, opened in the binary mode
+ * clear_out_stream - if true out is cleared (also when the file cannot be opened)
+ *
+ * returns cant_open_file if the file cannot be opened,
+ * otherwise the status left by the parser
+ */
+HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
+{
+	clear_input_flags();
+
+	reading_from_file = true;
+
+	// open the file before clearing 'out' string, 'out' string can be the same string as the file_name
+	file.clear();
+	file.open(file_name, std::ios_base::binary | std::ios_base::in);
+
+	filter_mode = true;
+	out_string = &out;
+
+	if( clear_out_stream )
+		out.clear();
+
+	if( !file )
+	{
+		status = cant_open_file;
+		return status;
+	}
+
+	Init();
+	Read();
+	Uninit();
+
+	file.close();
+
+	return status;
+}
+
+
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
+{
+ return filter_file(file_name.c_str(), out, clear_out_stream);
+}
+
+
+/*
+ * filter html read from a file, the wide file name is converted to utf-8 first
+ */
+HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
+{
+	std::string utf8_name;
+	pt::wide_to_utf8(file_name, utf8_name);
+
+	const std::string & utf8_name_ref = utf8_name;
+
+	return filter_file(utf8_name_ref, out, clear_out_stream);
+}
+
+
+HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
+{
+ return filter_file(file_name.c_str(), out, clear_out_stream);
+}
+
+
+
+
+void HTMLParser::Init()
+{
+}
+
+
+void HTMLParser::Uninit()
+{
+}
+
+
+
+
+
+int HTMLParser::get_last_parsed_line()
+{
+ return line;
+}
+
+
+/*
+ * set the default configuration, called from the constructors:
+ * white characters are left as in the origin, tab size 2, wrap_line zero,
+ * orphan_mode set to orphan_nbsp, all the safe/skip/analyze options turned off
+ * and no item parsed listener
+ */
+void HTMLParser::SetSomeDefaults()
+{
+ white_mode = WHITE_MODE_ORIGIN;
+
+ tab_size = 2;
+ wrap_line = 0;
+ orphan_mode = orphan_nbsp;
+ safe_mode = false;
+ skip_tags = false;
+ skip_commentaries = false;
+ skip_entities = false;
+ analyze_entities = false;
+ item_parsed_listener = nullptr;
+}
+
+
+/*
+ * allocate the item stack (WINIX_HTMLFILTER_STACK_MAXLEN items) and the buffer
+ * (WINIX_HTMLFILTER_BUFFER_MAXLEN characters) and set the default configuration
+ * both arrays are released in the destructor
+ */
+HTMLParser::HTMLParser()
+{
+ pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
+ buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
+
+ SetSomeDefaults();
+}
+
+
+/*
+ * copy constructor
+ * nothing is read from f: the new object gets its own fresh stack and buffer
+ * and the default configuration (the same as after the default constructor)
+ */
+HTMLParser::HTMLParser(const HTMLParser & f)
+{
+ // don't need to copy the stack
+ pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
+ buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
+
+ SetSomeDefaults();
+}
+
+
+/*
+ * copy assignment
+ *
+ * the stack and the buffer are working memory only, their content is not copied
+ * currently no field is copied from f, the object keeps its own configuration
+ */
+HTMLParser & HTMLParser::operator=(const HTMLParser & f)
+{
+	// pstack and buffer have been allocated by a constructor of this object
+	// and always have the same size, so they are simply reused here
+	// (allocating them again would leak the previous arrays on every assignment)
+
+	// we can copy some fields from f
+
+	return *this;
+}
+
+
+HTMLParser::~HTMLParser()
+{
+ delete [] pstack;
+ delete [] buffer;
+}
+
+
+
+/*
+ * set how white characters are treated
+ * only values from WHITE_MODE_ORIGIN to WHITE_MODE_TREE are accepted,
+ * any other value is ignored and the current mode is kept
+ */
+void HTMLParser::white_chars_mode(int mode)
+{
+	const bool in_range = (mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE);
+
+	if( !in_range )
+		return;
+
+	white_mode = mode;
+}
+
+
+
+
+/*
+ * set the wrap_line option
+ * values greater than 10000 are limited to 10000
+ */
+void HTMLParser::WrapLine(size_t wrap_line_)
+{
+ wrap_line = wrap_line_;
+
+ if( wrap_line > 10000 )
+ wrap_line = 10000;
+}
+
+
+
+/*
+ * set the tab_size option
+ * values greater than 1000 are limited to 1000
+ */
+void HTMLParser::InsertTabs(size_t tabsize)
+{
+ tab_size = tabsize;
+
+ if( tab_size > 1000 )
+ tab_size = 1000;
+}
+
+
+/*
+ * return the last mode from white_char_mode_tab
+ * or WHITE_MODE_ORIGIN if the table is empty
+ */
+int HTMLParser::current_white_char_mode()
+{
+	if( white_char_mode_tab.empty() )
+		return WHITE_MODE_ORIGIN;
+
+	return white_char_mode_tab.back();
+}
+
+
+void HTMLParser::CalcOrphansMaxLen(Orphans & orphans)
+{
+size_t i;
+
+ orphans.max_len = 0;
+
+ for(i=0 ; i orphans.max_len )
+ orphans.max_len = orphans.tab[i].size();
+ }
+}
+
+
+void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector & otab)
+{
+ lang_code_lower = lang_code;
+ ToLower(lang_code_lower);
+
+ orphans_temp.tab = otab;
+ std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
+ CalcOrphansMaxLen(orphans_temp);
+
+ orphans_tab[lang_code_lower] = orphans_temp;
+}
+
+
+
+void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector & otab)
+{
+ AssignOrphans(lang_code.c_str(), otab);
+}
+
+
+void HTMLParser::ClearOrphans()
+{
+ orphans_tab.clear();
+}
+
+
+
+
+/*
+ * set the orphan mode from a string
+ * L"160" selects orphan_160space, any other string selects orphan_nbsp
+ */
+void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str)
+{
+	const bool use_160 = (orphan_mode_str == L"160");
+
+	orphan_mode = use_160 ? orphan_160space : orphan_nbsp;
+}
+
+
+// Turns the safe mode on or off.
+void HTMLParser::SafeMode(bool safe_mode_)
+{
+    this->safe_mode = safe_mode_;
+}
+
+
+// Sets the skip_tags flag.
+void HTMLParser::SkipTags(bool skip_tags)
+{
+    this->skip_tags = skip_tags;
+}
+
+// Sets the skip_commentaries flag.
+void HTMLParser::SkipCommentaries(bool skip_commentaries)
+{
+    this->skip_commentaries = skip_commentaries;
+}
+
+
+// Sets the skip_entities flag.
+// Turning it on turns analyze_entities on as well;
+// turning it off leaves analyze_entities unchanged.
+void HTMLParser::SkipEntities(bool skip_entities)
+{
+    this->skip_entities = skip_entities;
+
+    if( skip_entities )
+        this->analyze_entities = true;
+}
+
+
+// Sets the analyze_entities flag.
+void HTMLParser::AnalyzeEntities(bool analyze_entities)
+{
+    this->analyze_entities = analyze_entities;
+}
+
+
+// Remembers the name of the "no filter" tag.
+void HTMLParser::SetNoFilterTag(const std::wstring & tag_name)
+{
+    this->no_filter_tag = tag_name;
+}
+
+
+
+
+// Returns the i-th item from the stack.
+// For an index outside the stack the 'empty' member is cleared and returned.
+HTMLParser::Item & HTMLParser::GetItem(size_t i)
+{
+    if( i < stack_len )
+        return pstack[i];
+
+    empty.Clear();
+    return empty;
+}
+
+
+// Returns the item from the top of the stack.
+// When the stack is empty the 'empty' member is cleared and returned.
+HTMLParser::Item & HTMLParser::LastItem()
+{
+    if( stack_len != 0 )
+        return pstack[stack_len-1];
+
+    empty.Clear();
+    return empty;
+}
+
+
+// Puts a new, cleared item on the top of the stack.
+// Returns false (and changes nothing) when the stack already has
+// WINIX_HTMLFILTER_STACK_MAXLEN items.
+bool HTMLParser::PushStack()
+{
+    if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
+    {
+        // oops, too many items
+        return false;
+    }
+
+    Item & new_item = pstack[stack_len];
+    new_item.Clear();
+
+    if( stack_len > 0 )
+    {
+        // 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated
+        const Item & parent = pstack[stack_len-1];
+
+        new_item.porphans     = parent.porphans;
+        new_item.has_body_tag = parent.has_body_tag;
+        new_item.tree_index   = parent.tree_index;
+    }
+
+    stack_len += 1;
+
+    return true;
+}
+
+
+
+// Removes the item from the top of the stack and clears its slot.
+// Does nothing when the stack is empty.
+void HTMLParser::PopStack()
+{
+    if( stack_len > 0 )
+    {
+        stack_len -= 1;
+        pstack[stack_len].Clear();
+    }
+}
+
+
+// Returns true for a space, a tab, the character 13 and the character 160.
+// The new line character (10) is deliberately not treated as white here.
+bool HTMLParser::IsWhite(int c)
+{
+    // dont use c==10 here
+    return c==' ' || c=='\t' || c==13 || c==160;
+}
+
+
+// Reads characters as long as IsWhite() accepts them.
+// When out_string is not null the skipped characters are appended to it.
+void HTMLParser::SkipWhite(std::wstring * out_string)
+{
+    for( ; IsWhite(lastc) ; read_char() )
+    {
+        if( out_string )
+            (*out_string) += lastc;
+    }
+}
+
+
+// Reads characters as long as they are new lines (10) or are accepted by IsWhite().
+// When out_string is not null the skipped characters are appended to it.
+void HTMLParser::SkipWhiteLines(std::wstring * out_string)
+{
+    for( ; lastc==10 || IsWhite(lastc) ; read_char() )
+    {
+        if( out_string )
+            (*out_string) += lastc;
+    }
+}
+
+
+// Skips white characters, then at most one new line character
+// and the white characters which follow it.
+void HTMLParser::SkipWhiteWithFirstNewLine()
+{
+    SkipWhite();
+
+    if( lastc != 10 )
+        return;
+
+    read_char();
+    SkipWhite();
+}
+
+
+//void HTMLParser::CheckNewLine()
+//{
+// if( white_mode == WHITE_MODE_TREE )
+// {
+// SkipWhite();
+// }
+//
+// last_new_line = (lastc==10);
+//}
+
+
+
+
+
+
+// Reads characters up to and including the closing tag mark of the current tag.
+// Text inside single or double quotes is skipped without being interpreted.
+// If the xml simple tag mark ('/') is met outside quotes and the last item
+// is an opening one then its type is changed to Item::simple.
+// When remember_text is not null the characters read before the closing
+// tag mark are appended to it (the closing tag mark itself is not).
+void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
+{
+ bool is_quoted = false;
+ wchar_t quote_char = 0;
+
+ while( lastc != -1 )
+ {
+ // an escaped quote character neither opens nor closes a quotation
+ if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
+ {
+ if( is_quoted )
+ {
+ // only the same quote character which opened the quotation closes it
+ if( lastc == quote_char )
+ {
+ is_quoted = false;
+ }
+ }
+ else
+ {
+ is_quoted = true;
+ quote_char = lastc;
+ }
+ }
+ else
+ if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/'
+ {
+ LastItem().type = Item::simple;
+ }
+ else
+ if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
+ {
+ // the closing tag mark is consumed and not remembered
+ read_char();
+ break;
+ }
+
+ if( remember_text )
+ (*remember_text) += lastc;
+
+ read_char();
+ }
+}
+
+
+
+// Returns true when 'c' can be a part of a tag name:
+// an ASCII letter, a digit or one of: - ! : _ [
+bool HTMLParser::IsValidCharForName(int c)
+{
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+
+    // : is for a namespace character, - is for a commentary, [ is for CDATA
+    const bool is_extra  = (c=='-' || c=='!' || c==':' || c=='_' || c=='[');
+
+    return is_letter || is_digit || is_extra;
+}
+
+
+// Returns true when 'c' can be a part of an attribute name:
+// an ASCII letter, a digit or one of: - : _
+bool HTMLParser::IsValidCharForAttrName(int c)
+{
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+    const bool is_extra  = (c=='-' || c==':' || c=='_');
+
+    return is_letter || is_digit || is_extra;
+}
+
+
+// Returns true when 'c' can be a part of an entity name:
+// an ASCII letter, a digit or the '#' character.
+bool HTMLParser::IsValidCharForEntityName(int c)
+{
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+
+    return is_letter || is_digit || c=='#';
+}
+
+
+// Reads a tag name into 'name' ('name' is cleared first when clear_name is true).
+// Characters are consumed as long as IsValidCharForName() accepts them but
+// only the first WINIX_HTMLFILTER_ITEM_NAME_MAXLEN of them are stored.
+// For an item of the 'special' type the reading stops as soon as the name
+// becomes "!--" (is_commentary is set) or "![CDATA[" (is_cdata is set).
+void HTMLParser::ReadItemName(std::wstring & name, bool clear_name)
+{
+    if( clear_name )
+        name.clear();
+
+    size_t chars_read = 0;
+
+    while( IsValidCharForName(lastc) )
+    {
+        if( chars_read < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
+        {
+            name += lastc;
+
+            if( LastItem().type == Item::special && name == L"!--" )
+            {
+                LastItem().is_commentary = true;
+                read_char();
+                break;
+            }
+
+            if( LastItem().type == Item::special && name == L"![CDATA[" )
+            {
+                LastItem().is_cdata = true;
+                read_char();
+                break;
+            }
+        }
+
+        read_char();
+        chars_read += 1;
+    }
+}
+
+
+
+// Reads an attribute name into attr_name.
+// All characters accepted by IsValidCharForAttrName() are consumed but only
+// the first WINIX_HTMLFILTER_ATTR_NAME_MAXLEN of them are stored.
+void HTMLParser::ReadItemAttrName()
+{
+    attr_name.clear();
+
+    size_t chars_read = 0;
+
+    while( lastc != -1 && IsValidCharForAttrName(lastc) )
+    {
+        if( chars_read < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
+            attr_name += lastc;
+
+        read_char();
+        chars_read += 1;
+    }
+}
+
+
+
+// Appends one value to the attr_value table.
+// With analyze_entities set the text goes through AnalyzeEntitiesAndPut()
+// into the new entry, otherwise it is stored unchanged.
+void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str)
+{
+    if( !analyze_entities )
+    {
+        attr_value.push_back(str);
+        return;
+    }
+
+    attr_value.push_back(std::wstring());
+    AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back());
+}
+
+
+// Reads an attribute value (html mode) and splits it on white characters
+// and new lines; each non-empty part is added to attr_value with ReadItemAttrValueAdd().
+// has_quote  - true when the value is surrounded by quote_char,
+//              the reading then stops at a not escaped quote_char
+// without a quote the reading stops at a not escaped closing tag mark,
+// a new line or a white character.
+// The terminating character is not consumed here.
+void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
+{
+ attr_value.clear();
+ tmp_text.clear();
+
+ while( lastc != -1 )
+ {
+ if( !char_was_escaped )
+ {
+ if( has_quote )
+ {
+ if( lastc == quote_char )
+ break;
+ }
+ else
+ {
+ if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
+ break;
+ }
+ }
+
+ if( lastc==10 || IsWhite(lastc) )
+ {
+ // a separator: the part collected so far is added (if it is not too long)
+ if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+ ReadItemAttrValueAdd(tmp_text);
+
+ tmp_text.clear();
+ }
+ else
+ {
+ // NOTE(review): when the part gets longer than the limit it is cleared
+ // and the collecting goes on, so the tail of an over-long part can
+ // still be added later - confirm this is intended
+ if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+ tmp_text.clear();
+
+ tmp_text += lastc;
+ }
+
+ read_char();
+ }
+
+ // the last part (there was no separator after it)
+ if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+ ReadItemAttrValueAdd(tmp_text);
+}
+
+
+// Reads an attribute value (xml mode) into tmp_text as one string.
+// With has_quote the reading stops at a not escaped quote_char, without it
+// at a not escaped closing tag mark, a new line or a white character.
+// The terminating character is not consumed. Characters are stored only
+// while tmp_text is not longer than WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN.
+void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
+{
+    attr_value.clear();
+    tmp_text.clear();
+
+    for( ; lastc != -1 ; read_char() )
+    {
+        if( !char_was_escaped )
+        {
+            if( has_quote && lastc == quote_char )
+                break;
+
+            if( !has_quote && (IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc)) )
+                break;
+        }
+
+        // IMPROVEME add support for analyze_entities?
+        if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+            tmp_text += lastc;
+    }
+}
+
+
+
+// Updates line_len for a character which has been written:
+// a new line (10) resets it to zero, any other character increments it.
+void HTMLParser::CheckChar(wchar_t c)
+{
+    if( c != 10 )
+    {
+        line_len += 1;
+        return;
+    }
+
+    line_len = 0;
+}
+
+
+// Writes one character to out_string and to out_stream (to each of them
+// which is set) and updates line_len.
+void HTMLParser::Put(wchar_t c)
+{
+    if( out_string )
+        out_string->operator+=(c);
+
+    if( out_stream )
+        (*out_stream) << c;
+
+    CheckChar(c);
+}
+
+
+// Writes the characters from the range [str, end) to out_string and to
+// out_stream (to each of them which is set) and updates line_len.
+// Nothing is done for an empty range.
+void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
+{
+    if( str >= end )
+        return;
+
+    const size_t len = end - str;
+
+    if( out_string )
+        out_string->append(str, len);
+
+    if( out_stream )
+        out_stream->write(str, len);
+
+    while( str < end )
+    {
+        CheckChar(*str);
+        ++str;
+    }
+}
+
+
+
+// Writes a string to out_string and to out_stream (to each of them which
+// is set) and updates line_len. Nothing is done for an empty string.
+void HTMLParser::Put(const std::wstring & str)
+{
+    if( str.empty() )
+        return;
+
+    if( out_string )
+        out_string->append(str);
+
+    if( out_stream )
+        out_stream->write(str.c_str(), str.size());
+
+    for(std::wstring::const_iterator it = str.begin() ; it != str.end() ; ++it)
+        CheckChar(*it);
+}
+
+
+// Copies the text from the range [str, end) to 'out' or, when 'out' is
+// null, to the parser output through Put().
+// Each entity of the form &name; (the name has from one to 'epsilon'
+// characters accepted by IsValidCharForEntityName()) is reported with
+// EntityFound() and is left out of the copy when skip_entities is set.
+// Anything which does not form such an entity is copied unchanged.
+//
+// out can be null
+void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
+{
+    const size_t epsilon = 8; // !! IMPROVE ME put as a constant
+    const wchar_t * old_str = str;
+
+    while( str < end )
+    {
+        if( IsStartingEntityMark(*str) )
+        {
+            const wchar_t * entity_start = str;
+            str += 1; // skip &
+
+            // the scan is limited by 'end', it does not rely on a terminating zero
+            for(size_t i=0 ; str < end && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
+            {
+            }
+
+            if( str < end && IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
+            {
+                // the text before the entity
+                if( out )
+                    out->append(old_str, entity_start);
+                else
+                    Put(old_str, entity_start);
+
+                str += 1; // skip ;
+
+                if( !skip_entities )
+                {
+                    if( out )
+                        out->append(entity_start, str);
+                    else
+                        Put(entity_start, str);
+                }
+
+                EntityFound(entity_start + 1, str - 1); // without & and ;
+                old_str = str;
+            }
+        }
+        else
+        {
+            str += 1;
+        }
+    }
+
+    // the rest of the text after the last entity
+    if( out )
+        out->append(old_str, end);
+    else
+        Put(old_str, end);
+}
+
+
+
+
+int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
+{
+size_t res;
+
+ const wchar_t * orphan = orphan_str.c_str();
+
+ for( ; str & table)
+{
+int res;
+
+ if( table.empty() )
+ return false;
+
+ size_t o1 = 0;
+ size_t o2 = table.size() - 1;
+
+ res = CheckOrphan(str, end, table[o1]);
+
+ if( res == 0 )
+ return true;
+
+ if( res < 0 )
+ return false;
+
+ res = CheckOrphan(str, end, table[o2]);
+
+ if( res == 0 )
+ return true;
+
+ if( res > 0 )
+ return false;
+
+
+ while( o1 + 1 < o2 )
+ {
+ size_t o = (o1 + o2) / 2;
+ res = CheckOrphan(str, end, table[o]);
+
+ if( res == 0 )
+ return true;
+
+ if( res < 0 )
+ o2 = o;
+ else
+ o1 = o;
+ }
+
+return false;
+}
+
+
+// Checks the word [str, end) against the orphans table of the last item.
+// Returns false without searching when the word is empty, when the last
+// item has no body tag or no orphans table, or when the word is longer
+// than the longest orphan from the table.
+bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end)
+{
+    if( str == end )
+        return false;
+
+    if( !LastItem().has_body_tag || !LastItem().porphans )
+        return false;
+
+    const size_t word_len = end - str;
+
+    if( word_len > LastItem().porphans->max_len )
+        return false;
+
+    return CheckOrphan(str, end, LastItem().porphans->tab);
+}
+
+
+// Reads one run of non white characters into 'str' and writes it to the output.
+// The reading stops at the end of input, at a new line or a white character
+// and also:
+//  - in a CDATA section (is_cdata is true): at the "]]" sequence followed by the closing tag mark (it is consumed),
+//  - otherwise: at a not escaped opening tag mark (it is not consumed).
+// When some text was read it is preceded by a new line with tabs
+// (allow_put_new_line) or by a single space (allow_put_space).
+// Returns true when the reading was stopped by one of the two tag conditions above.
+bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
+{
+ bool was_closing_tag = false;
+
+ while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
+ {
+ if( is_cdata )
+ {
+ if( lastc == ']' )
+ {
+ read_char();
+
+ if( lastc == ']' )
+ {
+ read_char();
+
+ if( IsClosingTagMark(lastc) )
+ {
+ read_char();
+ was_closing_tag = true;
+ break;
+ }
+ else
+ {
+ // it was not the end of the CDATA section, both brackets are a normal text
+ str += ']';
+ str += ']';
+ }
+ }
+ else
+ {
+ str += ']';
+ }
+ }
+ }
+ else
+ {
+ if( !char_was_escaped && IsOpeningTagMark(lastc) )
+ {
+ was_closing_tag = true;
+ break;
+ }
+ }
+
+ // NOTE(review): on the CDATA path lastc was read again above, so here it
+ // can already be -1, a new line or a white character - confirm this is intended
+ str += lastc;
+ read_char();
+ }
+
+ if( !str.empty() )
+ {
+ if( allow_put_new_line )
+ {
+ Put(10);
+ PutTabs(LastItem().tree_index + 1);
+ }
+ else
+ if( allow_put_space )
+ {
+ Put(' ');
+ }
+ }
+
+ if( analyze_entities )
+ AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
+ else
+ Put(str);
+
+ return was_closing_tag;
+}
+
+
+// Reads a run of white characters and new lines.
+// was_white_char - set to true when a white character other than a new line was read
+// was_new_line   - set to true when a new line (10) was read
+// result_text    - when not null the characters read are appended to it
+// In WHITE_MODE_ORIGIN the characters are written as they are,
+// in WHITE_MODE_SINGLE_LINE a single space is written for the whole run.
+void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
+{
+    was_white_char = false;
+    was_new_line   = false;
+
+    for( ; lastc == 10 || IsWhite(lastc) ; read_char() )
+    {
+        if( lastc == 10 )
+            was_new_line = true;
+        else
+            was_white_char = true;
+
+        if( result_text )
+            (*result_text) += lastc;
+
+        if( current_white_char_mode() == WHITE_MODE_ORIGIN )
+            Put(lastc);
+    }
+
+    const bool was_anything = (was_white_char || was_new_line);
+
+    if( current_white_char_mode() == WHITE_MODE_SINGLE_LINE && was_anything )
+        Put(' ');
+
+    // in WHITE_MODE_TREE nothing is written here,
+    // white characters are written later, before the next tag or text
+}
+
+
+
+// Writes the '<' character.
+void HTMLParser::PutOpeningTagMark()
+{
+    Put('<');
+}
+
+
+// Writes the '>' character.
+void HTMLParser::PutClosingTagMark()
+{
+    Put('>');
+}
+
+
+
+
+// !! IMPROVE ME change to a better name
+// this functions does not return true when the tag is safe
+// (when safe_mode is off every tag is reported as safe,
+// the no_filter_tag is reported as not safe)
+bool HTMLParser::IsTagSafe(const wchar_t * tag)
+{
+ if( !safe_mode )
+ return true;
+
+ if( IsNameEqual(tag, no_filter_tag.c_str()) )
+ return false;
+
+ // every entry must be followed by a comma: two adjacent string literals
+ // are joined by the compiler into one string, so a missing comma after
+ // L"param" produced the single entry L"paramobject" and neither
+ // 'param' nor 'object' was treated as unsafe
+ static const wchar_t * unsafe_tags[] = {
+ L"applet", L"base", L"body",
+ L"embed", L"head", L"html",
+ L"frame", L"frameset",L"iframe",
+ L"link", L"meta", L"param",
+ L"object", L"script"
+ };
+
+ size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
+ size_t i;
+
+ // NOTE(review): the next line looks truncated in this copy of the patch
+ // (the end of IsTagSafe() and the beginning of the following code seem
+ // to be missing) - it is left untouched, restore it from the original source
+ for(i=0 ; i 30 )
+ len = 30;
+
+ for(size_t i=0 ; i < (len*tab_size) ; ++i)
+ {
+ if( out_string )
+ (*out_string) += ' '; // we do not add them to 'line_len'
+
+ if( out_stream )
+ (*out_stream) << ' ';
+ }
+}
+
+
+// Writes a non-breaking space in the form selected with OrphansMode():
+// the "&nbsp;" entity when orphan_mode is orphan_nbsp,
+// otherwise the single character with the code 160.
+void HTMLParser::PutNonBreakingSpace()
+{
+    if( orphan_mode == orphan_nbsp )
+    {
+        // the entity has to be written literally, an ordinary space
+        // here would not prevent the line from being broken
+        Put(L"&nbsp;");
+    }
+    else
+    {
+        Put(160);
+    }
+}
+
+
+
+// Returns true for the character which opens a tag: '<'
+// (we assume the size of the opening mark to be one)
+bool HTMLParser::IsOpeningTagMark(wchar_t c)
+{
+    return c == '<';
+}
+
+
+// Returns true for the character which closes a tag: '>'
+// (we assume the size of the closing mark to be one)
+bool HTMLParser::IsClosingTagMark(wchar_t c)
+{
+    return c == '>';
+}
+
+
+// Returns true for the slash which marks a closing tag,
+// e.g. the '/' in "</p>"
+bool HTMLParser::IsClosingTagIndicator(wchar_t c)
+{
+    return c == '/';
+}
+
+
+// Returns true for the exclamation mark which begins a special tag,
+// e.g. the '!' in "<!--" or in "<![CDATA["
+bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
+{
+    return c == '!';
+}
+
+// Returns true for the question mark ('?'),
+// a tag beginning with it is read as a special item
+bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
+{
+    return c == '?';
+}
+
+// Returns true for the '=' character which separates an attribute name
+// from its value, e.g. class="value"
+bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
+{
+    return c == '=';
+}
+
+
+
+// Returns true for the slash put at the end of a simple (self closing) tag,
+// the '>' character is not a part of the mark
+// (we assume the size of the mark to be one)
+bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
+{
+    return c == '/';
+}
+
+
+// Returns true for the character which begins an entity: '&'
+bool HTMLParser::IsStartingEntityMark(wchar_t c)
+{
+    return c == '&';
+}
+
+
+// Returns true for the character which ends an entity: ';'
+bool HTMLParser::IsEndingEntityMark(wchar_t c)
+{
+    return c == ';';
+}
+
+
+
+// Copies the input to the output up to and including the end of
+// a commentary: two '-' characters followed by a not escaped closing tag mark.
+// The text is written unchanged, it is not parsed in any way.
+// The reading stops also at the end of the input.
+void HTMLParser::ReadTextUntilClosingCommentary()
+{
+ while( lastc != -1 )
+ {
+ if( lastc == '-' )
+ {
+ // tmp_text collects the characters which can form the end of the commentary
+ tmp_text.clear();
+ tmp_text += lastc;
+ read_char();
+
+ if( lastc == '-' )
+ {
+ tmp_text += lastc;
+ read_char();
+
+ if( !char_was_escaped && IsClosingTagMark(lastc) )
+ {
+ tmp_text += lastc;
+ read_char();
+ Put(tmp_text);
+
+ break;
+ }
+ }
+
+ // it was not the end of the commentary, the collected characters are a normal text
+ Put(tmp_text);
+ }
+ else
+ {
+ Put(lastc);
+ read_char();
+ }
+ }
+}
+
+
+
+// Called when lastc is an opening tag mark: checks whether the input
+// contains the closing tag of the last item from the stack.
+// If so the whole tag is consumed, it is written as "</name>" when
+// put_closing_tag_as_well is true, and true is returned.
+// Otherwise the characters read so far are written to the output as
+// a normal text and false is returned.
+bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
+{
+ tmp_text.clear();
+ tmp_text += lastc; // opening tag mark
+ read_char();
+
+ SkipWhiteLines(&tmp_text);
+
+ if( IsClosingTagIndicator(lastc) )
+ {
+ tmp_text += lastc;
+ read_char();
+ SkipWhiteLines(&tmp_text);
+ ReadItemName(tmp_name);
+
+ if( IsNameEqual(tmp_name, LastItem().name) )
+ {
+ // the rest of the tag, up to the closing tag mark, is skipped
+ SkipAndCheckClosingTag();
+
+ if( put_closing_tag_as_well )
+ {
+ Put('<');
+ Put('/');
+ Put(tmp_name);
+ Put('>');
+ }
+
+ return true;
+ }
+ else
+ {
+ // a closing tag of a different item, it is treated as a text
+ Put(tmp_text);
+ Put(tmp_name);
+ }
+ }
+ else
+ {
+ // not a closing tag at all
+ Put(tmp_text);
+ }
+
+return false;
+}
+
+
+
+
+// Copies the input to the output without parsing it, until the closing
+// tag of the last item from the stack is found (or the input ends).
+// put_closing_tag_as_well is passed to IsClosingTagForLastItem().
+// used for such tags as: script, pre, textarea
+void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
+{
+    while( lastc != -1 )
+    {
+        const bool tag_begins = !char_was_escaped && IsOpeningTagMark(lastc);
+
+        if( !tag_begins )
+        {
+            Put(lastc);
+            read_char();
+            continue;
+        }
+
+        if( IsClosingTagForLastItem(put_closing_tag_as_well) )
+            break;
+    }
+}
+
+
+
+
+// reading text between html tags
+//
+// The text is read word by word: PutNormalNonWhite() reads and writes a run
+// of non white characters, then the white characters after it are either
+// replaced with a non-breaking space (when the word is an orphan) or handled
+// by PutNormalWhite(). The reading stops at the end of the input or when
+// PutNormalNonWhite() reports a tag.
+// is_cdata is passed to PutNormalNonWhite().
+// On exit new_item_has_new_line_before and new_item_has_white_char_before
+// describe the white characters read just before the place where the reading stopped.
+// When out_space is set the text is also collected and added to the Space tree.
+void HTMLParser::ReadText(bool is_cdata)
+{
+ new_item_has_new_line_before = false;
+ new_item_has_white_char_before = false;
+
+ bool was_white_char = false;
+ bool was_new_line = false;
+
+ bool was_non_white_text = false;
+
+ // what should be written before the next word (used in WHITE_MODE_TREE)
+ bool allow_put_new_line = false;
+ bool allow_put_space = false;
+
+ if( current_white_char_mode() == WHITE_MODE_TREE )
+ {
+ if( LastItem().new_line_after || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
+ {
+ allow_put_new_line = true;
+ }
+ }
+
+ Space * text_space = nullptr;
+ std::wstring * text_space_wstr = nullptr;
+
+ if( out_space )
+ {
+ // a temporary Space with an empty "name" and with the "text" being collected
+ text_space = &text_space_tmp;
+ text_space->clear();
+ text_space->add(L"name", L"");
+ Space & wstr_space = text_space->add(L"text", L"");
+ text_space_wstr = &wstr_space.value.value_wstring;
+ }
+
+ bool was_closing_tag = false;
+
+ while( lastc != -1 && !was_closing_tag )
+ {
+ tmp_text.clear();
+ was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
+
+ if( lastc == -1 || was_closing_tag )
+ {
+ // was_new_line and was_white_char still hold the values from the previous run of white characters
+ new_item_has_new_line_before = was_new_line;
+ new_item_has_white_char_before = was_white_char;
+ }
+
+ if( !tmp_text.empty() )
+ {
+ allow_put_new_line = false;
+ allow_put_space = false;
+ was_non_white_text = true;
+
+ if( text_space_wstr )
+ (*text_space_wstr) += tmp_text;
+ }
+
+ if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
+ {
+ // the word is an orphan: white characters after it are replaced with a non-breaking space
+ if( lastc == 10 || IsWhite(lastc) )
+ {
+ SkipWhiteLines(text_space_wstr);
+ PutNonBreakingSpace();
+ was_new_line = false;
+ }
+ }
+ else
+ {
+ PutNormalWhite(was_white_char, was_new_line, text_space_wstr);
+
+ if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
+ {
+ allow_put_new_line = false;
+ allow_put_space = false;
+
+ if( was_new_line )
+ {
+ allow_put_new_line = true;
+ LastItem().new_line_in_the_middle = true;
+
+ // a new line met before any text
+ if( !was_non_white_text )
+ LastItem().new_line_after = true;
+ }
+ else
+ {
+ allow_put_space = true;
+ }
+
+ // the line is already too long, the next word goes to a new line
+ if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line )
+ {
+ allow_put_new_line = true;
+ }
+ }
+ }
+ }
+
+ // only a text with at least one non white character is added to the tree
+ if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
+ {
+ AddSpaceToSpaceTree(*text_space);
+ }
+
+ text_space_tmp.clear();
+}
+
+
+
+// Writes the opening part of the last item with PutOpeningTag()
+// and returns its result. Nothing is written (and true is returned)
+// when tags are skipped or the item is the no_filter_tag.
+bool HTMLParser::PrintOpeningItem()
+{
+    const bool nothing_to_print = skip_tags || IsNameEqual(no_filter_tag, LastItem().name);
+
+    if( nothing_to_print )
+        return true;
+
+    return PutOpeningTag();
+}
+
+
+
+
+
+// Reads one attribute of a tag: its name and an optional value.
+// The name is stored in attr_name, attr_has_value tells whether there was
+// the '=' mark; the value is read with ReadItemAttrValue() when parsing
+// html or with ReadXMLItemAttrValue() otherwise.
+// Returns false when there is no attribute name at the current position.
+bool HTMLParser::ReadItemAttr()
+{
+ attr_has_value = false;
+ attr_name.clear();
+ attr_value.clear();
+
+ SkipWhiteLines();
+ ReadItemAttrName();
+
+ if( attr_name.empty() )
+ return false;
+
+ SkipWhiteLines();
+
+ // an attribute without a value
+ if( !IsAttributeAssignmentMark(lastc) ) // '='
+ return true;
+
+ attr_has_value = true;
+ read_char(); // skipping '='
+ SkipWhiteLines();
+
+ // a value can be put in single or double quotes or without quotes at all
+ bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
+ wchar_t quote_char = lastc;
+
+ if( has_quote )
+ read_char(); // skipping the first quote mark
+
+ // IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
+ if( parsing_html )
+ ReadItemAttrValue(has_quote, quote_char);
+ else
+ ReadXMLItemAttrValue(has_quote, quote_char);
+
+ if( has_quote && !char_was_escaped && lastc == quote_char )
+ read_char(); // skipping the last quote mark
+
+return true;
+}
+
+
+
+// Handles the 'lang' attribute (with a value) of the last item:
+// the orphans table of the item is reset and, if a table was assigned
+// for the given language (the first value, lowercased), it is set
+// as the orphans table of the item.
+void HTMLParser::CheckItemLangAttr()
+{
+    if( !attr_has_value || !IsNameEqual(L"lang", attr_name) )
+        return;
+
+    LastItem().porphans = nullptr;
+
+    if( attr_value.empty() )
+        return;
+
+    // we are taking the first value only
+    attr_value_lower = attr_value[0];
+    ToLower(attr_value_lower);
+
+    OrphansTab::iterator found = orphans_tab.find(attr_value_lower);
+
+    if( found != orphans_tab.end() )
+        LastItem().porphans = &found->second;
+}
+
+
+void HTMLParser::PrintItemAttr()
+{
+size_t i;
+
+ if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
+ return;
+
+ Put(' ');
+ Put(attr_name);
+
+ if( attr_has_value )
+ {
+ Put(L"=\"");
+
+ for(i=0 ; iget_add_space(L"attr");
+ Space & attr = attr_tab.add_empty_space(attr_name);
+
+ if( attr_has_value )
+ {
+ if( parsing_html )
+ {
+ attr.set_empty_table();
+
+ for(size_t i=0 ; i < attr_value.size() ; ++i)
+ {
+ attr.add(attr_value[i]);
+ }
+ }
+ else
+ {
+ attr.set(tmp_text);
+ }
+ }
+ }
+}
+
+
+// Reads a closing tag: the name goes to the last item, the item gets
+// the Item::closing type and the rest of the tag is skipped.
+// Nothing is written here, closing tags are printed later.
+void HTMLParser::ReadItemClosing()
+{
+    // skipping '/'
+    read_char();
+    SkipWhiteLines();
+
+    ReadItemName(LastItem().name);
+    LastItem().type = Item::closing;
+
+    SkipAndCheckClosingTag();
+}
+
+
+// Reads a special tag, it means a tag beginning with '!' or '?'
+// The last item gets the Item::special type and the name beginning with that character.
+// With skip_tags set the tag is only skipped. Otherwise:
+//  - for a commentary only the opening mark and the name are written here,
+//  - for a CDATA section only the opening mark is written here,
+//  - any other tag is read to its closing mark and written as a whole.
+void HTMLParser::ReadItemSpecial()
+{
+ LastItem().type = Item::special;
+
+ if( !skip_tags )
+ {
+ if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
+ {
+ Put(10);
+ PutTabs(LastItem().tree_index);
+ }
+
+ PutOpeningTagMark();
+ }
+
+ LastItem().name = lastc;
+ read_char(); // skipping '!' or '?'
+ // the name is not cleared, the first character is already there
+ ReadItemName(LastItem().name, false);
+
+ if( skip_tags )
+ {
+ SkipAndCheckClosingTag();
+ }
+ else
+ {
+ if( LastItem().is_commentary )
+ {
+ Put(LastItem().name);
+ }
+ else
+ if( LastItem().is_cdata )
+ {
+ // do nothing
+ }
+ else
+ {
+ tmp_text.clear();
+ SkipWhiteLines();
+ SkipAndCheckClosingTag(&tmp_text);
+ Put(LastItem().name);
+ Put(' ');
+ Put(tmp_text);
+ Put('>');
+
+ // two new lines are put after a doctype which is the first item
+ if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
+ {
+ Put(10);
+ Put(10);
+ SkipWhiteLines();
+ }
+ }
+ }
+}
+
+
+// Reads an opening tag: its name and all attributes.
+// The item is added to the Space tree with AddItemToSpace(), each attribute
+// is checked for 'lang', printed and put to the Space of the item.
+// The tag is written to the output unless tags are skipped
+// or the item is the no_filter_tag.
+void HTMLParser::ReadItemOpening()
+{
+    LastItem().type = Item::opening;
+    ReadItemName(LastItem().name);
+    AddItemToSpace();
+
+    Space * item_space = LastItem().space;
+
+    if( !xml_compact_mode && item_space )
+        item_space->add(L"name", LastItem().name);
+
+    if( !PrintOpeningItem() )
+        return;
+
+    while( ReadItemAttr() )
+    {
+        CheckItemLangAttr();
+        PrintItemAttr();
+        PutItemAttrToSpace();
+    }
+
+    // here LastItem().type can be changed to 'simple'
+    SkipAndCheckClosingTag();
+
+    if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
+        return;
+
+    if( LastItem().type == Item::simple )
+        Put(L" /");
+
+    PutClosingTagMark();
+}
+
+
+// Called by ReadItem() after each item has been read.
+// This implementation does nothing.
+// NOTE(review): presumably a hook to be overridden in a derived class - confirm in the header.
+void HTMLParser::ItemFound()
+{
+}
+
+// Called by AnalyzeEntitiesAndPut() for each entity found,
+// [str, end) is the entity name without the '&' and ';' characters.
+// This implementation does nothing.
+// NOTE(review): presumably a hook to be overridden in a derived class - confirm in the header.
+void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end)
+{
+}
+
+
+// Reads one tag (lastc should be the opening tag mark) and puts it
+// as a new item on the stack. Depending on the first character after
+// the mark the tag is read as a special, a closing or an opening one.
+// Returns false at the end of the input or when there is no more
+// space on the stack.
+bool HTMLParser::ReadItem()
+{
+ if( lastc == -1 )
+ return false;
+
+ if( !PushStack() )
+ return false;
+
+ LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
+ LastItem().white_char_before = new_item_has_white_char_before; // new_item_has_white_char_before is set by ReadText() method
+
+ // the item is indented one level deeper when its parent had a new line inside its text
+ if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
+ LastItem().tree_index += 1;
+
+ read_char(); // skipping the first opening tag mark '<'
+ SkipWhiteLines();
+
+ if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
+ ReadItemSpecial();
+ else
+ if( IsClosingTagIndicator(lastc) )
+ ReadItemClosing();
+ else
+ ReadItemOpening();
+
+ // IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
+ ItemFound();
+
+return true;
+}
+
+
+
+// Converts an ASCII capital letter to a small one,
+// any other character is returned unchanged.
+wchar_t HTMLParser::ToLower(wchar_t c)
+{
+    const bool is_capital = (c>='A' && c<='Z');
+
+    if( !is_capital )
+        return c;
+
+    return c - 'A' + 'a';
+}
+
+
+void HTMLParser::ToLower(std::wstring & str)
+{
+size_t i;
+
+ for(i=0 ; i0 ; ++name1, ++name2, --len )
+ if( ToLower(*name1) != ToLower(*name2) )
+ return false;
+
+ if( len == 0 )
+ return true;
+
+return false;
+}
+
+
+
+// Overload for a C string and a std::wstring,
+// forwards to the version taking two C strings.
+bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
+{
+    const wchar_t * second = name2.c_str();
+    return IsNameEqual(name1, second, len);
+}
+
+
+// Overload for a std::wstring and a C string,
+// forwards to the version taking two C strings.
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
+{
+    const wchar_t * first = name1.c_str();
+    return IsNameEqual(first, name2, len);
+}
+
+
+// Overload for two std::wstring objects,
+// forwards to the version taking two C strings.
+bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
+{
+    const wchar_t * first  = name1.c_str();
+    const wchar_t * second = name2.c_str();
+
+    return IsNameEqual(first, second, len);
+}
+
+
+
+
+
+// Returns true when the last item from the stack has the given name
+// (the names are compared with IsNameEqual()).
+bool HTMLParser::IsLastTag(const wchar_t * name)
+{
+    const bool same_name = IsNameEqual(name, LastItem().name);
+    return same_name;
+}
+
+
+// Returns true when the last item from the stack has the given name
+// (the names are compared with IsNameEqual()).
+bool HTMLParser::IsLastTag(const std::wstring & name)
+{
+    const bool same_name = IsNameEqual(name, LastItem().name);
+    return same_name;
+}
+
+
+// checking exceptions for opening tags
+//
+// Html tags which never have a closing tag (meta, input, br, hr, img, link,
+// param, col, area) are changed to the 'simple' type and removed from the stack.
+// For the 'body' tag the has_body_tag flag of the item is set.
+void HTMLParser::CheckSingleItemExceptions()
+{
+    static const wchar_t * const single_tags[] = {
+        L"meta", L"input", L"br",
+        L"hr",   L"img",   L"link",
+        L"param", L"col",  L"area"
+    };
+
+    const size_t single_tags_len = sizeof(single_tags) / sizeof(single_tags[0]);
+
+    for(size_t i = 0 ; i < single_tags_len ; ++i)
+    {
+        if( IsLastTag(single_tags[i]) )
+        {
+            LastItem().type = Item::simple;
+            PopStack();
+            return;
+        }
+    }
+
+    // move me to a better place
+    if( IsLastTag(L"body") )
+        LastItem().has_body_tag = true;
+}
+
+
+// Changes the current white characters mode for the 'pre' tag and for
+// the no_filter_tag: for an item of the 'opening' type WHITE_MODE_ORIGIN
+// is pushed on white_char_mode_tab, for an item of any other type the
+// last mode is removed from the table (if there is one).
+// Items with other names do not change anything.
+void HTMLParser::CheckWhiteCharsExceptions(Item & item)
+{
+    const bool is_pre       = IsNameEqual(item.name, L"pre");
+    const bool is_no_filter = IsNameEqual(item.name, no_filter_tag); // move to CheckDifferentContentExceptions?
+
+    if( !is_pre && !is_no_filter )
+        return;
+
+    if( item.type == Item::opening )
+    {
+        white_char_mode_tab.push_back(WHITE_MODE_ORIGIN);
+        return;
+    }
+
+    if( !white_char_mode_tab.empty() )
+        white_char_mode_tab.pop_back();
+}
+
+
+
+// Handles tags whose content is not parsed: 'script' (only when safe_mode
+// is off) and 'textarea'. The content is copied to the output together with
+// the closing tag by ReadTextUntilClosingTag() and the item is removed from the stack.
+void HTMLParser::CheckDifferentContentExceptions(Item & item)
+{
+ if( !safe_mode && IsNameEqual(item.name, L"script") )
+ {
+ ReadTextUntilClosingTag(true);
+ PopStack();
+ }
+
+ // NOTE(review): when the branch above was taken PopStack() has already been
+ // called, so 'item' may refer to a slot which is no longer on the stack
+ if( IsNameEqual(item.name, L"textarea") )
+ {
+ ReadTextUntilClosingTag(true);
+ PopStack();
+ }
+}
+
+
+
+
+
+
+// Writes closing tags for all items which are still on the stack, from the
+// top of the stack to its bottom, and leaves the stack empty.
+//
+// A closing tag is preceded by a new line with tabs (in WHITE_MODE_TREE)
+// or by a space (in other modes) when the item is the first one on the
+// stack or when its parent has the new_line_after flag set.
+void HTMLParser::CheckStackPrintRest()
+{
+    // stack_len is decremented inside the loop so it is exactly zero on
+    // exit (a post-decrement in the condition wrapped it around)
+    while( stack_len > 0 )
+    {
+        stack_len -= 1;
+
+        // pstack[stack_len] is the item being closed now,
+        // the first item from the stack has no parent
+        const bool has_parent = (stack_len > 0);
+
+        if( !has_parent || pstack[stack_len-1].new_line_after )
+        {
+            if( current_white_char_mode() == WHITE_MODE_TREE )
+            {
+                Put(10);
+
+                // for the first item its own tree_index is used,
+                // pstack[stack_len-1] would be outside of the table
+                PutTabs(has_parent ? pstack[stack_len-1].tree_index : pstack[0].tree_index);
+            }
+            else
+            {
+                Put(' ');
+            }
+        }
+
+        PutClosingTag(pstack[stack_len]);
+    }
+}
+
+
+// Called when the item on the top of the stack is a closing tag.
+// The matching opening tag is looked for below it: when it is found closing
+// tags are written for it and for all items lying above it, and all of them
+// (together with the closing tag) are removed from the stack. When there is
+// no matching opening tag only the closing tag is removed and nothing is written.
+void HTMLParser::CheckClosingTags()
+{
+ int i;
+
+ if( stack_len == 0 )
+ return;
+
+ // on the stack we have only opening tags
+ // but only the last tag is a closing tag
+
+ if( stack_len == 1 )
+ {
+ PopStack();
+ return;
+ }
+
+ // looking whether there is a matching opening tag
+ for(i=int(stack_len)-2 ; i >= 0 ; --i)
+ if( (pstack[i].is_commentary && pstack[stack_len-1].is_commentary) || IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
+ break;
+
+ if( i < 0 )
+ {
+ // oops, there is no such an opening tag on the stack
+ // we don't print the closing and the missing opening tag
+ PopStack();
+ return;
+ }
+
+ // CHECK ME
+ // NOTE(review): the listener is asked about the item just below the closing
+ // tag (stack_len - 2) but the removing is called with the index of the
+ // matching opening tag (i) - these differ when some tags were left unclosed
+ if( RemoveIfNeeded(stack_len - 2) )
+ {
+ RemoveLastSpace(i);
+ }
+
+ for(int z=(int)stack_len-2 ; z >= i ; --z)
+ {
+ // NOTE(review): pstack[z] are items read as opening tags, for such items
+ // CheckWhiteCharsExceptions() pushes a mode instead of removing it - verify
+ CheckWhiteCharsExceptions(pstack[z]);
+
+ // the conditions below test the closing tag (LastItem()), not pstack[z]
+ if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) )
+ {
+ if( pstack[z].new_line_after )
+ {
+ if( current_white_char_mode() == WHITE_MODE_TREE )
+ {
+ Put(10);
+ PutTabs(pstack[z].tree_index);
+ }
+ }
+
+ // IMPROVEME
+ // in PutClosingTag we test IsTagSafe() and no_filter_tag too
+ PutClosingTag(pstack[z]);
+ pstack[z].Clear();
+ }
+ }
+
+ // invalidate items on the stack
+ stack_len = i;
+}
+
+
+// Copies the rest of the input to the output without any filtering.
+// Returns true when at least one character was copied.
+// In safe mode and when tags are skipped nothing is read or written
+// and false is returned.
+bool HTMLParser::PrintRest()
+{
+    // in safe mode we do not print the rest html code
+    if( safe_mode || skip_tags )
+        return false;
+
+    bool was_chars = false;
+
+    for( ; lastc != -1 ; read_char() )
+    {
+        Put(lastc);
+        was_chars = true;
+    }
+
+    return was_chars;
+}
+
+
+
+// Creates a Space for the item from the top of the stack and remembers
+// a pointer to it in the item. The parent is the Space of the previous item
+// from the stack or out_space for the first item.
+// Nothing is done when out_space is not set or the stack is empty.
+void HTMLParser::AddItemToSpace()
+{
+ if( out_space && stack_len > 0 )
+ {
+ Space * parent = out_space;
+
+ if( stack_len > 1 )
+ {
+ // NOTE(review): the Space of the previous item is used without a null check
+ parent = pstack[stack_len-2].space;
+ }
+
+ if( xml_compact_mode )
+ {
+ // in the compact mode a child is stored in the parent under the name of the tag
+ Space * space = parent->get_space(pstack[stack_len-1].name);
+
+ if( space )
+ {
+ // there is already a child with such a name
+ if( space->is_table() )
+ {
+ Space & child = space->add_empty_space();
+ pstack[stack_len-1].space = &child;
+ }
+ else
+ {
+ // the existing child and the new one are put to a new table
+ // which replaces the existing child in the parent
+ Space * tab = new Space();
+ tab->add(space);
+ Space & child = tab->add_empty_space();
+
+ parent->value.value_object[pstack[stack_len-1].name] = tab;
+ pstack[stack_len-1].space = &child;
+ }
+ }
+ else
+ {
+ Space & space = parent->add_empty_space(pstack[stack_len-1].name);
+ pstack[stack_len-1].space = &space;
+ }
+ }
+ else
+ {
+ // in the normal mode children are appended to the "childs" table of the parent
+ Space & childs_tab = parent->get_add_space(L"childs");
+ Space & child = childs_tab.add_empty_space();
+ pstack[stack_len-1].space = &child;
+ }
+
+ }
+}
+
+
+
+// Removes from the Space tree the Space of the item pstack[stack_len-2],
+// provided it is the last entry of the "childs" table of the parent.
+// index - the parent is the Space of pstack[index-1] or out_space when index is zero
+//
+// Nothing is done when out_space is not set, when there are fewer than two
+// items on the stack, when the parent has no "childs" table or when the
+// last child is not the Space of that item. In xml_compact_mode the
+// removing is not implemented yet.
+void HTMLParser::RemoveLastSpace(size_t index)
+{
+    // pstack[stack_len-2] is read below, it requires at least two items
+    if( !out_space || stack_len < 2 )
+        return;
+
+    Space * parent = out_space;
+
+    if( index > 0 )
+    {
+        parent = pstack[index - 1].space;
+    }
+
+    if( !parent )
+        return;
+
+    if( xml_compact_mode )
+    {
+        // IMPLEMENT ME
+    }
+    else
+    {
+        Space * childs_tab = parent->get_space(L"childs");
+
+        // get_space() can return a null pointer so the table
+        // must not be used before this check
+        if( childs_tab && childs_tab->is_table() )
+        {
+            size_t len = childs_tab->table_size();
+
+            if( len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space )
+            {
+                childs_tab->remove(len - 1);
+                pstack[stack_len-2].space = nullptr;
+            }
+        }
+    }
+}
+
+// Adds a text node to the Space of the item from the top of the stack.
+// 'space' should have the "text" field, without it nothing is done.
+// Nothing is done also when out_space is not set or the stack is empty.
+void HTMLParser::AddSpaceToSpaceTree(const Space & space)
+{
+ const std::wstring * text = space.get_wstr(L"text");
+
+ if( out_space && stack_len > 0 && text )
+ {
+ if( xml_compact_mode )
+ {
+ // in the compact mode only the text itself is stored, under the "text" name
+ // NOTE(review): LastItem().space is used without a null check
+ Space * child_text = LastItem().space->get_space(L"text");
+
+ if( child_text )
+ {
+ if( child_text->is_table() )
+ {
+ child_text->add(*text);
+ }
+ else
+ {
+ // the second text: the existing one and the new one are put to a new table
+ // NOTE(review): the old "text" pointer is overwritten below - verify who releases it
+ Space * tab = new Space();
+ tab->add(*child_text);
+ tab->add(*text);
+ LastItem().space->value.value_object[L"text"] = tab;
+ }
+ }
+ else
+ {
+ LastItem().space->add(L"text", *text);
+ }
+ }
+ else
+ {
+ // in the normal mode the whole 'space' is appended to the "childs" table
+ Space & childs_tab = LastItem().space->get_add_space(L"childs");
+ childs_tab.add(space);
+ }
+ }
+}
+
+
+
+
+// Asks the listener (if there is one) about the item pstack[index].
+// Returns true when the listener's item_parsed() returned false,
+// in any other case (also without a listener) false is returned.
+bool HTMLParser::RemoveIfNeeded(size_t index)
+{
+    if( !item_parsed_listener )
+        return false;
+
+    if( item_parsed_listener->item_parsed(pstack[index]) )
+        return false;
+
+    return true;
+}
+
+
+
+// The main loop of the parser: reads a tag with ReadItem(), handles it
+// according to its type and then reads the text which follows the tag.
+// The loop ends when the status is different from 'ok' or when ReadItem()
+// returns false.
+void HTMLParser::ReadLoop()
+{
+    while( status == ok && ReadItem() )
+    {
+        bool was_cdata = false;
+
+        if( LastItem().type == Item::opening )
+        {
+            const auto stack_len_before = stack_len;
+
+            if( parsing_html )
+            {
+                CheckSingleItemExceptions();
+            }
+
+            // CheckSingleItemExceptions() removes tags such as <br> from the
+            // stack, LastItem() is then the parent of the tag just read and
+            // the checks below must not be made for it (e.g. each <br> inside
+            // <pre> would push one more white characters mode)
+            if( stack_len == stack_len_before )
+            {
+                CheckWhiteCharsExceptions(LastItem());
+                CheckDifferentContentExceptions(LastItem());
+            }
+        }
+        else
+        if( LastItem().type == Item::special )
+        {
+            if( LastItem().is_commentary )
+                ReadTextUntilClosingCommentary();
+
+            // the content of a CDATA section is read by ReadText() below
+            if( LastItem().is_cdata )
+                was_cdata = true;
+
+            PopStack();
+        }
+        else
+        if( LastItem().type == Item::simple )
+        {
+            if( stack_len > 0 )
+            {
+                if( RemoveIfNeeded(stack_len - 1) )
+                    RemoveLastSpace(stack_len - 1);
+            }
+
+            PopStack();
+        }
+        else
+        if( LastItem().type == Item::closing )
+        {
+            CheckClosingTags();
+        }
+        else
+        {
+            PopStack();
+        }
+
+        if( status == ok )
+        {
+            ReadText(was_cdata);
+        }
+
+        is_first_item = false;
+    }
+}
+
+
+// Takes the next character from escaped_chars_buffer and puts it to lastc.
+// After the last character has been taken the buffer is cleared.
+// When there is nothing to take lastc is set to -1.
+void HTMLParser::read_char_from_entity_buffer()
+{
+    if( escaped_char_index >= escaped_chars_buffer.size() )
+    {
+        lastc = -1;
+        return;
+    }
+
+    lastc = escaped_chars_buffer[escaped_char_index];
+    escaped_char_index += 1;
+
+    if( escaped_char_index >= escaped_chars_buffer.size() )
+    {
+        escaped_chars_buffer.clear();
+        escaped_char_index = 0;
+    }
+}
+
+
+// Called after the '&' character has been read: collects in
+// escaped_chars_buffer the '&' and the characters which follow it,
+// up to and including the ';' character, but no more than
+// max_entity_length characters in total. The reading stops also
+// at the end of the input.
+void HTMLParser::read_xml_entity()
+{
+    const size_t max_entity_length = 6; // length of "&apos;" string
+
+    escaped_chars_buffer.clear();
+    escaped_char_index = 0;
+    escaped_chars_buffer += '&';
+
+    do
+    {
+        read_char_no_escape();
+
+        if( lastc == -1 )
+            break;
+
+        escaped_chars_buffer += lastc;
+    }
+    while( escaped_chars_buffer.size() < max_entity_length && lastc != ';' );
+}
+
+
+// Checks whether escaped_chars_buffer holds one of the five predefined
+// xml entities: &amp; &lt; &gt; &quot; &apos;
+// If so lastc is set to the character represented by the entity,
+// char_was_escaped is set to true and the buffer is cleared.
+// Returns the value of char_was_escaped (it is not reset to false here).
+bool HTMLParser::check_escape_sequentions()
+{
+    if( escaped_chars_buffer == L"&amp;" )
+    {
+        lastc = '&';
+        char_was_escaped = true;
+    }
+    else
+    if( escaped_chars_buffer == L"&lt;" )
+    {
+        lastc = '<';
+        char_was_escaped = true;
+    }
+    else
+    if( escaped_chars_buffer == L"&gt;" )
+    {
+        lastc = '>';
+        char_was_escaped = true;
+    }
+    else
+    if( escaped_chars_buffer == L"&quot;" )
+    {
+        lastc = '"';
+        char_was_escaped = true;
+    }
+    else
+    if( escaped_chars_buffer == L"&apos;" )
+    {
+        lastc = '\'';
+        char_was_escaped = true;
+    }
+
+    if( char_was_escaped )
+    {
+        // the entity has been replaced with one character,
+        // there is nothing more to read from the buffer
+        escaped_chars_buffer.clear();
+        escaped_char_index = 0;
+    }
+
+    return char_was_escaped;
+}
+
+
+
+// Reads the next character to lastc and returns it.
+// Characters waiting in escaped_chars_buffer are returned first.
+// Otherwise a character is read from the input and, when filter_mode is
+// off and the character is '&', an entity is read: one of the entities
+// recognized by check_escape_sequentions() is returned as a single
+// character (char_was_escaped is then true), any other sequence is
+// returned character by character from the buffer.
+int HTMLParser::read_char()
+{
+    char_was_escaped = false;
+
+    if( escaped_char_index < escaped_chars_buffer.size() )
+    {
+        read_char_from_entity_buffer();
+        return lastc;
+    }
+
+    read_char_no_escape();
+
+    if( filter_mode || lastc != '&' )
+        return lastc;
+
+    read_xml_entity();
+
+    if( !check_escape_sequentions() )
+        read_char_from_entity_buffer();
+
+    return lastc;
+}
+
+
+
+// Reads and processes the whole input: the text before the first tag,
+// then all tags with the text between them (ReadLoop()) and at the end
+// either the not parsed rest of the input (PrintRest()) or, when nothing
+// was printed by it, the closing tags for the items left on the stack.
+void HTMLParser::Read()
+{
+    // put first character to lastc
+    read_char();
+    is_first_item = true;
+
+    white_char_mode_tab.clear();
+    white_char_mode_tab.push_back(white_mode);
+
+    const bool keep_leading_white = (current_white_char_mode() == WHITE_MODE_ORIGIN);
+
+    if( !keep_leading_white )
+        SkipWhiteLines();
+
+    // it can be some text or white lines before the first html tag (we print it if using filtering)
+    // but they are not added to the Space tree
+    ReadText(false);
+
+    // reading the whole html source
+    ReadLoop();
+
+    // sometimes there can remain some html source (when there is no space on the stack)
+    // we print the rest html without filtering (only if safe_mode is false)
+    const bool rest_printed = PrintRest();
+
+    if( !rest_printed )
+        CheckStackPrintRest();
+}
+
+
+
+
+
+}
+
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
new file mode 100644
index 0000000..15dd8db
--- /dev/null
+++ b/src/html/htmlparser.h
@@ -0,0 +1,490 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2022, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_html_htmlfilter
+#define headerfile_picotools_html_htmlfilter
+
+#include
+#include