diff --git a/src/Makefile.dep b/src/Makefile.dep index 683e3cf..bbf1f99 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -1,44 +1,76 @@ # DO NOT DELETE ./convert/inttostr.o: ./convert/inttostr.h -./convert/misc.o: ./convert/misc.h ./convert/text.h +./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h +./convert/misc.o: textstream/types.h utf8/utf8_stream.h +./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h +./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h +./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./convert/misc.o: textstream/types.h ./convert/inttostr.h ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: textstream/stream.h space/space.h textstream/types.h ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: membuffer/membuffer.h textstream/types.h +./convert/double.o: utf8/utf8_stream.h +./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h +./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h +./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h +./convert/baseparser.o: membuffer/membuffer.h textstream/types.h +./convert/baseparser.o: utf8/utf8_stream.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h ./log/filelog.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./log/filelog.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h -./log/filelog.o: textstream/types.h +./log/filelog.o: textstream/types.h utf8/utf8_stream.h ./log/log.o: ./log/log.h 
textstream/textstream.h textstream/stream.h ./log/log.o: space/space.h textstream/types.h convert/inttostr.h utf8/utf8.h ./log/log.o: textstream/stream.h utf8/utf8_templates.h utf8/utf8_private.h ./log/log.o: date/date.h membuffer/membuffer.h textstream/types.h -./log/log.o: ./log/filelog.h +./log/log.o: utf8/utf8_stream.h ./log/filelog.h ./space/space.o: ./space/space.h textstream/types.h convert/inttostr.h ./space/space.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./space/space.o: utf8/utf8_private.h convert/convert.h ./convert/inttostr.h ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h -./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h -./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h +./space/space.o: membuffer/membuffer.h textstream/types.h utf8/utf8_stream.h +./space/space.o: convert/strtoint.h ./convert/text.h ./convert/misc.h +./space/space.o: ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h -./space/spaceparser.o: utf8/utf8_private.h convert/strtoint.h -./space/spaceparser.o: ./convert/text.h ./convert/misc.h +./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h +./space/spaceparser.o: textstream/textstream.h textstream/stream.h +./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h +./space/spaceparser.o: textstream/types.h utf8/utf8_stream.h +./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./csv/csvparser.o: 
utf8/utf8_templates.h utf8/utf8_private.h +./csv/csvparser.o: convert/baseparser.h textstream/textstream.h +./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h +./csv/csvparser.o: textstream/types.h utf8/utf8_stream.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h +./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h +./html/htmlparser.o: textstream/textstream.h textstream/stream.h +./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h +./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./html/htmlparser.o: textstream/types.h utf8/utf8_stream.h convert/text.h +./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h +./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h +./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h +./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h +./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h +./html/bbcodeparser.o: utf8/utf8_stream.h diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp new file mode 100644 index 0000000..d4abca1 --- /dev/null +++ b/src/convert/baseparser.cpp @@ -0,0 +1,273 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021-2022, Tomasz Sowa + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "baseparser.h" +#include "utf8/utf8.h" +#include "utf8/utf8_stream.h" + + +namespace pt +{ + +BaseParser::BaseParser() +{ + clear_input_flags(); +} + + +void BaseParser::clear_input_flags() +{ + line = 0; + column = 0; + reading_from_file = false; + pchar_ascii = nullptr; + pchar_unicode = nullptr; + wtext_stream_iterator = nullptr; + wtext_stream_iterator_end = nullptr; + text_stream_iterator = nullptr; + text_stream_iterator_end = nullptr; + lastc = -1; + input_as_utf8 = true; + + if( file.is_open() ) + file.close(); + + file.clear(); +} + + +void BaseParser::check_new_line() +{ + if( lastc == '\n' ) + { + ++line; + column = 0; + } +} + + +int BaseParser::read_utf8_char() +{ +int c; +bool correct; + + lastc = -1; + + do + { + utf8_to_int(file, c, correct); + + if( !file ) + return lastc; + } + while( !correct ); + + lastc = c; + check_new_line(); + +return lastc; +} + + +int BaseParser::read_ascii_char() +{ + lastc = file.get(); + check_new_line(); + +return lastc; +} + + +int BaseParser::read_char_from_wchar_string() +{ + if( *pchar_unicode == 0 ) + lastc = -1; + else + lastc = *(pchar_unicode++); + + check_new_line(); + +return lastc; +} + + +int BaseParser::read_char_from_utf8_string() +{ +int c; +bool correct; + + lastc = -1; + + do + { + size_t len = utf8_to_int(pchar_ascii, c, correct); + pchar_ascii += len; + } + while( *pchar_ascii && !correct ); + + if( correct ) + lastc = c; + + check_new_line(); + +return lastc; +} + + +int BaseParser::read_char_from_ascii_string() +{ + if( *pchar_ascii == 0 ) + lastc = -1; + else + lastc = *(pchar_ascii++); + + check_new_line(); + +return lastc; +} + + +int BaseParser::read_char_from_wtext_stream() +{ + if( (*wtext_stream_iterator) != (*wtext_stream_iterator_end) ) + { + lastc = *(*wtext_stream_iterator); + ++(*wtext_stream_iterator); + } + else + { + lastc = -1; + } + + check_new_line(); + + return lastc; +} + + +int BaseParser::read_char_from_utf8_text_stream() +{ + int c; + bool correct; + + lastc 
= -1; + + do + { + utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, c, correct); + } + while( !correct && (*text_stream_iterator) != (*text_stream_iterator_end) ); + + if( correct ) + lastc = c; + + check_new_line(); + + return lastc; +} + + +int BaseParser::read_char_from_ascii_text_stream() +{ + if( (*text_stream_iterator) != (*text_stream_iterator_end) ) + { + lastc = *(*text_stream_iterator); + ++(*text_stream_iterator); + } + else + { + lastc = -1; + } + + check_new_line(); + + return lastc; +} + + +int BaseParser::read_char_no_escape() +{ + if( reading_from_file ) + { + if( input_as_utf8 ) + return read_utf8_char(); + else + return read_ascii_char(); + } + else + { + if( pchar_ascii ) + { + if( input_as_utf8 ) + return read_char_from_utf8_string(); + else + return read_char_from_ascii_string(); + } + else if( pchar_unicode ) + { + return read_char_from_wchar_string(); + } + else if( wtext_stream_iterator && wtext_stream_iterator_end ) + { + return read_char_from_wtext_stream(); + } + else if( text_stream_iterator && text_stream_iterator_end ) + { + if( input_as_utf8 ) + return read_char_from_utf8_text_stream(); + else + return read_char_from_ascii_text_stream(); + } + else + { + lastc = -1; + return lastc; + } + } +} + + +int BaseParser::read_char() +{ + return read_char_no_escape(); +} + + + + +} + diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h new file mode 100644 index 0000000..67721b1 --- /dev/null +++ b/src/convert/baseparser.h @@ -0,0 +1,141 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021-2022, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef headerfile_picotools_convert_baseparser +#define headerfile_picotools_convert_baseparser + +#include +#include +#include "textstream/textstream.h" + + +namespace pt +{ + +class BaseParser +{ +protected: + + BaseParser(); + + virtual void clear_input_flags(); + + virtual void check_new_line(); + virtual int read_utf8_char(); + virtual int read_ascii_char(); + virtual int read_char_from_wchar_string(); + virtual int read_char_from_utf8_string(); + virtual int read_char_from_ascii_string(); + virtual int read_char_from_wtext_stream(); + virtual int read_char_from_utf8_text_stream(); + virtual int read_char_from_ascii_text_stream(); + virtual int read_char_no_escape(); + virtual int read_char(); + + + + /* + a number of a line in which there is a syntax_error + */ + int line; + + /* + a number of a column in which there is a syntax_error + */ + int column; + + + /* + true if parse() method was called + false if ParseString() was called + */ + bool reading_from_file; + + + /* + pointers to the current character + if ParseString() is in use + */ + const char * pchar_ascii; + const wchar_t * pchar_unicode; + + + /* + pointers to WTextStream iterators + if set then both of them should be set + */ + WTextStream::const_iterator * wtext_stream_iterator; + WTextStream::const_iterator * wtext_stream_iterator_end; + + + /* + pointers to TextStream iterators + if set then both of them should be set + */ + TextStream::const_iterator * text_stream_iterator; + TextStream::const_iterator * text_stream_iterator_end; + + + /* + last read char + or -1 if the end + */ + int lastc; + + + /* + current file + + maybe it would be better to make a pointer? 
+ if we parse only a string then there is no sense to have such an object + */ + std::ifstream file; + + + /* + input file is in UTF-8 + default: true + */ + bool input_as_utf8; + + + +}; + +} + +#endif diff --git a/src/convert/inttostr.cpp b/src/convert/inttostr.cpp index b9a8d6d..89d9272 100644 --- a/src/convert/inttostr.cpp +++ b/src/convert/inttostr.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2021, Tomasz Sowa + * Copyright (c) 2021-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -41,114 +41,114 @@ namespace pt { -std::string to_str(unsigned long long value, int base) +std::string to_str(unsigned long long value, int base, size_t min_width) { std::string res; - Toa(value, res, false, base); + Toa(value, res, false, base, min_width); return res; } -std::string to_str(long long value, int base) +std::string to_str(long long value, int base, size_t min_width) { std::string res; - Toa(value, res, false, base); + Toa(value, res, false, base, min_width); return res; } -std::string to_str(unsigned long value, int base) +std::string to_str(unsigned long value, int base, size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::string to_str(long value, int base) +std::string to_str(long value, int base, size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::string to_str(unsigned int value, int base) +std::string to_str(unsigned int value, int base, size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::string to_str(int value, int base) +std::string to_str(int value, int base, size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::string to_str(unsigned short value, int base) +std::string to_str(unsigned short value, int base, 
size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::string to_str(short value, int base) +std::string to_str(short value, int base, size_t min_width) { - return to_str(static_cast(value), base); + return to_str(static_cast(value), base, min_width); } -std::wstring to_wstr(unsigned long long value, int base) +std::wstring to_wstr(unsigned long long value, int base, size_t min_width) { std::wstring res; - Toa(value, res, false, base); + Toa(value, res, false, base, min_width); return res; } -std::wstring to_wstr(long long value, int base) +std::wstring to_wstr(long long value, int base, size_t min_width) { std::wstring res; - Toa(value, res, false, base); + Toa(value, res, false, base, min_width); return res; } -std::wstring to_wstr(unsigned long value, int base) +std::wstring to_wstr(unsigned long value, int base, size_t min_width) { - return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } -std::wstring to_wstr(long value, int base) +std::wstring to_wstr(long value, int base, size_t min_width) { - return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } -std::wstring to_wstr(unsigned int value, int base) +std::wstring to_wstr(unsigned int value, int base, size_t min_width) { - return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } -std::wstring to_wstr(int value, int base) +std::wstring to_wstr(int value, int base, size_t min_width) { - return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } -std::wstring to_wstr(unsigned short value, int base) +std::wstring to_wstr(unsigned short value, int base, size_t min_width) { - return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } -std::wstring to_wstr(short value, int base) +std::wstring to_wstr(short value, int base, size_t min_width) { 
- return to_wstr(static_cast(value), base); + return to_wstr(static_cast(value), base, min_width); } diff --git a/src/convert/inttostr.h b/src/convert/inttostr.h index f134dc8..bac0f1d 100644 --- a/src/convert/inttostr.h +++ b/src/convert/inttostr.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2012-2021, Tomasz Sowa + * Copyright (c) 2012-2022, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -52,8 +52,9 @@ namespace pt // if the buffer is too small it will be terminated at the beginning (empty string) // and the function returns false +// min_width - if greater than zero then it is used for zero padding template -bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(unsigned long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0) { size_t i1, i2; long rest; @@ -77,6 +78,14 @@ long rest; } while(value != 0 && i2 < buf_len); + if( min_width > 0 ) + { + for( ; i2 < min_width && i2 < buf_len ; ++i2) + { + buffer[i2] = '0'; + } + } + if( i2 >= buf_len ) { buffer[0] = 0; // ops, the buffer was too small @@ -106,7 +115,7 @@ return true; // if the buffer is too small it will be terminated at the beginning (empty string) // and the function returns false template -bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = nullptr, size_t min_width = 0) { if( len_out ) *len_out = 0; @@ -126,7 +135,7 @@ bool Toa(long long value, CharType * buffer, size_t buf_len, int base = 10, size is_sign = true; } - bool res = Toa(static_cast(value), buf, buf_len, base, len_out); + bool res = Toa(static_cast(value), buf, buf_len, base, len_out, min_width); if( res ) { @@ -146,44 +155,44 @@ return res; template -bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int 
base = 10, size_t * len_out = 0) +bool Toa(unsigned long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } template -bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(long value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } template -bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(unsigned int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } template -bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(int value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } template -bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(unsigned short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } template -bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 0) +bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * len_out = 
0, size_t min_width = 0) { - return Toa(static_cast(value), buffer, buf_len, base, len_out); + return Toa(static_cast(value), buffer, buf_len, base, len_out, min_width); } @@ -192,7 +201,7 @@ bool Toa(short value, CharType * buffer, size_t buf_len, int base = 10, size_t * template -void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10) +void Toa(unsigned long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { typename StringType::value_type buffer[50]; size_t buffer_len = sizeof(buffer) / sizeof(wchar_t); @@ -204,13 +213,13 @@ void Toa(unsigned long long value, StringType & res, bool clear_string = true, i * the size of the buffer is sufficient so the status should always be true */ size_t len_out; - Toa(value, buffer, buffer_len, base, &len_out); + Toa(value, buffer, buffer_len, base, &len_out, min_width); res.append(buffer, len_out); } template -void Toa(long long value, StringType & res, bool clear_string = true, int base = 10) +void Toa(long long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { typename StringType::value_type buffer[50]; size_t buffer_len = sizeof(buffer) / sizeof(wchar_t); @@ -222,71 +231,71 @@ void Toa(long long value, StringType & res, bool clear_string = true, int base = * the size of the buffer is sufficient so the status should always be true */ size_t len_out; - Toa(value, buffer, buffer_len, base, &len_out); + Toa(value, buffer, buffer_len, base, &len_out, min_width); res.append(buffer, len_out); } template -void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10) +void Toa(unsigned long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } template -void Toa(long value, StringType & res, bool clear_string = true, int base = 10) 
+void Toa(long value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } template -void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10) +void Toa(unsigned int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } template -void Toa(int value, StringType & res, bool clear_string = true, int base = 10) +void Toa(int value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } template -void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10) +void Toa(unsigned short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } template -void Toa(short value, StringType & res, bool clear_string = true, int base = 10) +void Toa(short value, StringType & res, bool clear_string = true, int base = 10, size_t min_width = 0) { - Toa(static_cast(value), res, clear_string, base); + Toa(static_cast(value), res, clear_string, base, min_width); } -std::string to_str(unsigned long long value, int base = 10); -std::string to_str(long long value, int base = 10); -std::string to_str(unsigned long value, int base = 10); -std::string to_str(long value, int base = 10); -std::string to_str(unsigned int value, int base = 10); -std::string to_str(int value, int base = 10); -std::string to_str(unsigned short value, int base = 10); -std::string to_str(short value, int base = 10); +std::string to_str(unsigned long long value, int base = 10, 
size_t min_width = 0); +std::string to_str(long long value, int base = 10, size_t min_width = 0); +std::string to_str(unsigned long value, int base = 10, size_t min_width = 0); +std::string to_str(long value, int base = 10, size_t min_width = 0); +std::string to_str(unsigned int value, int base = 10, size_t min_width = 0); +std::string to_str(int value, int base = 10, size_t min_width = 0); +std::string to_str(unsigned short value, int base = 10, size_t min_width = 0); +std::string to_str(short value, int base = 10, size_t min_width = 0); -std::wstring to_wstr(unsigned long long value, int base = 10); -std::wstring to_wstr(long long value, int base = 10); -std::wstring to_wstr(unsigned long value, int base = 10); -std::wstring to_wstr(long value, int base = 10); -std::wstring to_wstr(unsigned int value, int base = 10); -std::wstring to_wstr(int value, int base = 10); -std::wstring to_wstr(unsigned short value, int base = 10); -std::wstring to_wstr(short value, int base = 10); +std::wstring to_wstr(unsigned long long value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(long long value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(unsigned long value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(long value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(unsigned int value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(int value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(unsigned short value, int base = 10, size_t min_width = 0); +std::wstring to_wstr(short value, int base = 10, size_t min_width = 0); diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp index 978cce7..3d78ae7 100644 --- a/src/convert/misc.cpp +++ b/src/convert/misc.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017, Tomasz Sowa + * Copyright (c) 2017-2022, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,8 @@ */ #include "misc.h" +#include "inttostr.h" +#include "utf8/utf8.h" namespace pt @@ -50,6 +52,363 @@ void SetOverflow(bool * was_overflow, bool val) +void esc_to_json_uformat(wchar_t val, Stream & out) +{ + char buf[10]; + size_t len; + + Toa((unsigned long)val, buf, sizeof(buf)/sizeof(char), 16, &len); + + out << "\\u"; + + if( len < 4 ) + { + for(size_t i=0 ; i < (4-len) ; ++i) + { + out << '0'; + } + } + + out << buf; +} + + +/* + * return true if the val character was escaped and put to the out stream + * if the character is invalid for such a stream then only return true + * but not put it to the stream + */ +bool try_esc_to_json(wchar_t val, Stream & out) +{ + bool status = false; + + if( val == '\r' ) + { + out << '\\' << 'r'; + status = true; + } + else + if( val == '\n' ) + { + out << '\\' << 'n'; + status = true; + } + else + if( val == '\t' ) + { + out << '\\' << 't'; + status = true; + } + else + if( val == 0x08 ) + { + out << '\\' << 'b'; + status = true; + } + else + if( val == 0x0c ) + { + out << '\\' << 'f'; + status = true; + } + else + if( val == '\\' ) + { + out << '\\' << '\\'; + status = true; + } + else + if( val == '"' ) + { + out << '\\' << '\"'; + status = true; + } + else + if( val < 32 ) + { + esc_to_json_uformat(val, out); + status = true; + } + + return status; +} + + +void esc_to_json(wchar_t val, Stream & out) +{ + if( !try_esc_to_json(val, out) ) + { + out << val; + } +} + + +void esc_to_json(char val, Stream & out) +{ + if( !try_esc_to_json((wchar_t)(unsigned char)val, out) ) + { + out << val; + } +} + + + +void esc_to_json(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const wchar_t * c, pt::Stream & out) +{ + 
for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const std::string & in, Stream & out) +{ + esc_to_json(in.c_str(), in.size(), out); +} + + +void esc_to_json(const std::wstring & in, Stream & out) +{ + esc_to_json(in.c_str(), in.size(), out); +} + + + +/* + * return true if the val character was escaped and put to the out stream + * if the character is invalid for such a stream then only return true + * but not put it to the stream + */ +bool try_esc_to_xml(wchar_t val, Stream & out) +{ + bool status = false; + + if( val == 0 ) + { + // null character is invalid in XML 1.0 and 1.1 + // https://en.wikipedia.org/wiki/Valid_characters_in_XML + // return true but not put the char to the out stream + status = true; + } + else + if( val == '<') + { + out << "&lt;"; + status = true; + } + else + if( val == '>') + { + out << "&gt;"; + status = true; + } + else + if( val == '&') + { + out << "&amp;"; + status = true; + } + else + if( val == '"') + { + out << "&quot;"; + status = true; + } + + return status; +} + + +void esc_to_xml(wchar_t val, Stream & out) +{ + if( !try_esc_to_xml(val, out) ) + { + out << val; + } +} + + +void esc_to_xml(char val, Stream & out) +{ + if( !try_esc_to_xml((wchar_t)(unsigned char)val, out) ) + { + out << val; + } +} + + +void esc_to_xml(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const wchar_t * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_xml(c[i], out); + } +} + + 
+void esc_to_xml(const std::string & in, Stream & out) +{ + esc_to_xml(in.c_str(), in.size(), out); +} + + +void esc_to_xml(const std::wstring & in, Stream & out) +{ + esc_to_xml(in.c_str(), in.size(), out); +} + + + + + +/* + * return true if the val character was escaped and put to the out stream + * if the character is invalid for such a stream then only return true + * but not put it to the stream + */ +bool try_esc_to_csv(wchar_t val, pt::Stream & out) +{ + bool status = false; + + if( val == 0 ) + { + // null characters are invalid in text files + // return true but not put to the out stream + status = true; + } + else + if( val == '"' ) + { + out << "\"\""; + status = true; + } + + return status; +} + + +void esc_to_csv(wchar_t val, pt::Stream & out) +{ + if( !try_esc_to_csv(val, out) ) + { + out << val; + } +} + + +void esc_to_csv(char val, Stream & out) +{ + if( !try_esc_to_csv((wchar_t)(unsigned char)val, out) ) + { + out << val; + } +} + + +void esc_to_csv(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const wchar_t * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const std::string & in, Stream & out) +{ + esc_to_csv(in.c_str(), in.size(), out); +} + + +void esc_to_csv(const std::wstring & in, Stream & out) +{ + esc_to_csv(in.c_str(), in.size(), out); +} + + + } diff --git a/src/convert/misc.h b/src/convert/misc.h index 7dbb128..e8b10bf 100644 --- a/src/convert/misc.h +++ b/src/convert/misc.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017, Tomasz Sowa + * Copyright (c) 2017-2022, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -40,6 +40,9 @@ #include #include "text.h" +#include "textstream/stream.h" +#include "textstream/types.h" +#include "utf8/utf8_stream.h" namespace pt @@ -47,6 +50,78 @@ namespace pt void SetOverflow(bool * was_overflow, bool val); +bool try_esc_to_json(wchar_t val, Stream & out); +void esc_to_json(wchar_t val, Stream & out); +void esc_to_json(char val, Stream & out); +void esc_to_json(const char * c, pt::Stream & out); +void esc_to_json(const char * c, std::size_t len, Stream & out); +void esc_to_json(const wchar_t * c, Stream & out); +void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_json(const std::string & in, Stream & out); +void esc_to_json(const std::wstring & in, Stream & out); + +bool try_esc_to_xml(wchar_t val, Stream & out); +void esc_to_xml(wchar_t c, pt::Stream & out); +void esc_to_xml(char c, pt::Stream & out); +void esc_to_xml(const char * c, pt::Stream & out); +void esc_to_xml(const char * c, std::size_t len, pt::Stream & out); +void esc_to_xml(const wchar_t * c, pt::Stream & out); +void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_xml(const std::string & in, Stream & out); +void esc_to_xml(const std::wstring & in, Stream & out); + +bool try_esc_to_csv(wchar_t val, pt::Stream & out); +void esc_to_csv(wchar_t val, Stream & out); +void esc_to_csv(char c, pt::Stream & out); +void esc_to_csv(const char * c, std::size_t len, Stream & out); +void esc_to_csv(const char * c, pt::Stream & out); +void esc_to_csv(const char * c, std::size_t len, pt::Stream & out); +void esc_to_csv(const wchar_t * c, pt::Stream & out); +void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_csv(const std::string & in, Stream & out); + + + +template +void esc_to_json(const StreamType & in, Stream & out) +{ + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + + while( i 
!= end ) + { + wchar_t c = i.get_unicode_and_advance(end); + esc_to_json(c, out); + } +} + + +template +void esc_to_xml(const StreamType & in, Stream & out) +{ + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + + while( i != end ) + { + wchar_t c = i.get_unicode_and_advance(end); + esc_to_xml(c, out); + } +} + + +template +void esc_to_csv(const StreamType & in, Stream & out) +{ + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + + while( i != end ) + { + wchar_t c = i.get_unicode_and_advance(end); + esc_to_csv(c, out); + } +} } diff --git a/src/csv/csvparser.cpp b/src/csv/csvparser.cpp index 4ab1480..0a83e92 100644 --- a/src/csv/csvparser.cpp +++ b/src/csv/csvparser.cpp @@ -44,10 +44,17 @@ namespace pt { +CSVParser::CSVParser() +{ + input_as_utf8 = true; +} + CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space) { + clear_input_flags(); + reading_from_file = true; space = &out_space; @@ -98,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & CSVParser::Status CSVParser::parse(const char * str, Space & out_space) { - reading_from_file = false; - reading_from_wchar_string = false; - pchar_ascii = str; - pchar_unicode = 0; - space = &out_space; + clear_input_flags(); + + pchar_ascii = str; + space = &out_space; parse(); @@ -119,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space) CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space) { - reading_from_file = false; - reading_from_wchar_string = true; - pchar_unicode = str; - pchar_ascii = 0; - space = &out_space; + clear_input_flags(); + + pchar_unicode = str; + space = &out_space; parse(); @@ -285,132 +290,6 @@ bool CSVParser::read_non_quoted_value_to(std::wstring & value) - -int CSVParser::read_utf8_char() -{ -int c; -bool correct; - - lastc = -1; - - do - { - 
utf8_to_int(file, c, correct); - - if( !file ) - return lastc; - } - while( !correct ); - - lastc = c; - - if( lastc == '\n' ) - ++line; - -return lastc; -} - - - -int CSVParser::read_ascii_char() -{ - lastc = file.get(); - - if( lastc == '\n' ) - ++line; - -return lastc; -} - - - - -int CSVParser::read_char_from_wchar_string() -{ - if( *pchar_unicode == 0 ) - lastc = -1; - else - lastc = *(pchar_unicode++); - - if( lastc == '\n' ) - ++line; - -return lastc; -} - - -int CSVParser::read_char_from_utf8_string() -{ -int c; -bool correct; - - lastc = -1; - - do - { - size_t len = utf8_to_int(pchar_ascii, c, correct); - pchar_ascii += len; - } - while( *pchar_ascii && !correct ); - - if( correct ) - lastc = c; - - if( lastc == '\n' ) - ++line; - -return lastc; -} - - -int CSVParser::read_char_from_ascii_string() -{ - if( *pchar_ascii == 0 ) - lastc = -1; - else - lastc = *(pchar_ascii++); - - if( lastc == '\n' ) - ++line; - -return lastc; -} - - -int CSVParser::read_char_no_escape() -{ - if( reading_from_file ) - { - if( input_as_utf8 ) - return read_utf8_char(); - else - return read_ascii_char(); - } - else - { - if( reading_from_wchar_string ) - { - return read_char_from_wchar_string(); - } - else - { - if( input_as_utf8 ) - return read_char_from_utf8_string(); - else - return read_char_from_ascii_string(); - } - } -} - - - - -int CSVParser::read_char() -{ - return read_char_no_escape(); -} - - } diff --git a/src/csv/csvparser.h b/src/csv/csvparser.h index 8370867..c549fa5 100644 --- a/src/csv/csvparser.h +++ b/src/csv/csvparser.h @@ -38,9 +38,11 @@ #ifndef headerfile_picotools_csv_csvparser #define headerfile_picotools_csv_csvparser -#include "space/space.h" #include #include +#include "space/space.h" +#include "convert/baseparser.h" + namespace pt @@ -51,10 +53,12 @@ namespace pt * https://datatracker.ietf.org/doc/html/rfc4180 * */ -class CSVParser +class CSVParser : public BaseParser { public: + CSVParser(); + enum Status { ok, @@ -85,53 +89,6 @@ protected: Space * 
space; - /* - true if parse_file() method was called - false if parse() was called - */ - bool reading_from_file; - - /* - true if parse(wchar_t *) or parse(std::wstring&) was called - */ - bool reading_from_wchar_string; - - /* - pointers to the current character - if parse() is being used - */ - const char * pchar_ascii; - const wchar_t * pchar_unicode; - - - /* - last read char - or -1 if the end - */ - int lastc; - - - - /* - a number of a line in which there is a syntax_error - */ - int line; - - /* - current file - - may it would be better to make a pointer? - if we parse only a string then there is no sense to have such an object - */ - std::ifstream file; - - /* - input file is in UTF-8 - default: true - */ - bool input_as_utf8; - - void parse(); @@ -142,19 +99,6 @@ protected: bool read_non_quoted_value_to(std::wstring & value); - - /* - * copied from SpaceParser - * may it would be better to have a class with those methods and inherit from it? - */ - int read_utf8_char(); - int read_ascii_char(); - int read_char_from_wchar_string(); - int read_char_from_utf8_string(); - int read_char_from_ascii_string(); - int read_char_no_escape(); - - int read_char(); }; } diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp new file mode 100644 index 0000000..254de60 --- /dev/null +++ b/src/html/bbcodeparser.cpp @@ -0,0 +1,645 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bbcodeparser.h" + + +namespace pt +{ + + + + +bool BBCODEParser::Equal(const wchar_t * str1, const wchar_t * str2) +{ + while( *str1 == *str2 && *str1 != 0 ) + { + str1 += 1; + str2 += 1; + } + +return *str1 == *str2; +} + + + + +bool BBCODEParser::IsValidCharForName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + c=='*' || c=='_') + return true; + +return false; +} + + +bool BBCODEParser::IsOpeningTagMark(wchar_t c) +{ + return (c == '['); +} + + +bool BBCODEParser::IsClosingTagMark(wchar_t c) +{ + return (c == ']'); +} + +bool BBCODEParser::IsClosingXmlSimpleTagMark(wchar_t c) +{ + return false; +} + + + +// there are no commentaries in bbcode +bool BBCODEParser::IsOpeningCommentaryTagMark(const wchar_t *) +{ + return false; +} + + +size_t BBCODEParser::OpeningCommentaryTagMarkSize() +{ + return 0; +} + + + +bool BBCODEParser::SkipCommentaryTagIfExists() +{ + return false; +} + + + + + + + + + + +// one enter will generate one
+// two enters or more will generate only two br (

) +void BBCODEParser::PutNormalText(const wchar_t * str, const wchar_t * end) +{ +int br_len; + + if( lastc != -1 ) + { + // trimming last white characters at end of the user text + while( str\n"; + } + } + else + { + PrintEscape(*str); + ++str; + } + } +} + + + +void BBCODEParser::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white) +{ +} + + +void BBCODEParser::CheckExceptions() +{ + if( stack_len >= 2 ) + { + if( pstack[stack_len-1].type == Item::opening && + pstack[stack_len-2].type == Item::opening && + IsNameEqual(L"*", pstack[stack_len-1].name) && + IsNameEqual(L"*", pstack[stack_len-2].name) ) + { + // removing the last [*] from the stack + // was put automatically + PopStack(); + } + } +} + + + + +/* + bbcode format: + [bbcodetag=value]some text[/bbcodetag] + the value can be quoted, e.g. + [bbcodetag="value"]some text[/bbcodetag], or + [bbcodetag='value']some text[/bbcodetag] + + the third string below (in tags table) is 'html_argument' from Tags, + it can contain a special character % followed by a string which means: + %1 - "value" escaped as for html + %2 - "some text" escaped as for html + %u1 - "value" trimmed and escaped as for url-es + %u2 - "some text" trimmed and escaped as for url-es + %% - one % + + if you are using %2 or %u2 then "some text" is not treated as bbcode, e.g. 
+ [bbcodetag=value]some [b]text[/b][/bbcodetag] will produce: + some [b]text[/b] (the inner tags [b][/b] were not parsed) + + also when using %2 or %u2 the closing bbcode tag is skipped + (if you want this tag then you can put it in 'html_argument') + + and when using u (%u1 or %u2) the argument is trimmed from whitespaces and new lines + at the beginning and at the end + (because otherwise a space would be changed to %20 and this were probably not what you really wanted) +*/ +const BBCODEParser::Tags * BBCODEParser::FindTag(const wchar_t * tag) +{ + static Tags tags[] = { + {L"*", L"li", L">", false}, + {L"b", L"em", L">", true}, + {L"i", L"span", L" class=\"bbitalic\">", true}, + {L"u", L"span", L" class=\"bbunderline\">", true}, + {L"s", L"span", L" class=\"bbstrike\">", true}, + {L"code", L"code", L" class=\"bbcode\">", false}, + {L"list", L"ul", L" class=\"bblist\">", false}, + {L"color", L"span", L" class=\"bbcol%1\">", true}, + {L"url", L"a", L" href=\"%u1\">", true}, + {L"img", L"img", L" alt=\"%1\" src=\"%u2\">", true}, + {L"quote", L"div", L" class=\"bbquote\">\n%1
\n", false}, + }; + + size_t i; + size_t len = sizeof(tags) / sizeof(Tags); + + for(i=0 ; i='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + (c=='_' || c=='?' || c=='.' || c==',' || c=='/' || c=='-' || + c=='+' || c=='*' || c=='(' || c==')' || c=='=' || c==':') + ) + { + (*out_string) += c; + } + else + { + wchar_t buffer[20]; + swprintf(buffer, 20, L"%02X", c); + + (*out_string) += '%'; + (*out_string) += buffer; + } +} + + +void BBCODEParser::PrintEscape(int c, bool change_quote) +{ + if( c == '<' ) + { + (*out_string) += L"<"; + } + else + if( c == '>' ) + { + (*out_string) += L">"; + } + else + if( c == '&' ) + { + (*out_string) += L"&"; + } + else + if( c == '\"' && change_quote ) + { + (*out_string) += L"""; + } + else + { + (*out_string) += c; + } +} + + +void BBCODEParser::PrintArgumentEncode(const wchar_t * start, const wchar_t * end) +{ + PrintArgumentCheckQuotes(start, end); + TrimWhiteWithNewLines(start, end); + + for( ; starthtml_tag, tag_name) ) + { + if( condition ) + { + PutClosingTag(tag); + (*out_string) += '\n'; + } + + condition = true; + } +} + + +void BBCODEParser::CheckOpeningTag(const Tags * tag) +{ + bool has_list_tag = has_open_ul_tag || has_open_ol_tag; + + CheckOpeningTag(tag, L"li", has_open_li_tag); + CheckOpeningTag(tag, L"ul", has_open_ul_tag); + CheckOpeningTag(tag, L"ol", has_open_ol_tag); + + if( has_open_li_tag && !has_list_tag ) + { + (*out_string) += L"
    \n"; + has_open_ul_tag = true; + } +} + + + + + +void BBCODEParser::PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote) +{ + for( ; start < end ; ++start) + PrintEscape(*start, change_quote); +} + + + +void BBCODEParser::PrintEncode(const wchar_t * start, const wchar_t * end) +{ + for( ; start < end ; ++start) + PrintEncode(*start); +} + + + +void BBCODEParser::PutOpeningTagFromEzc() +{ + // this can be a tag from Ezc templates system + (*out_string) += '['; + (*out_string) += LastItem().name; + + +// FIXME +// const wchar_t * start = pchar; +// +// while( *pchar && *pchar!=']' ) +// ++pchar; +// +// if( *pchar == ']' ) +// ++pchar; +// +// Put(start, pchar); +} + + + + + +void BBCODEParser::PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u) +{ + if( has_u ) + PrintArgumentEncode(arg_start, arg_end); + else + PrintArgumentEscape(arg_start, arg_end); +} + + + +void BBCODEParser::TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end) +{ + while( start < end && (IsWhite(*start) || *start==10) ) + ++start; + + while( start < end && (IsWhite(*(end-1)) || *(end-1)==10) ) + --end; +} + + + +void BBCODEParser::PutHtmlArgument2(const Tags * tag, bool has_u) +{ +//const wchar_t * start = pchar; +//const wchar_t * end = pchar; +bool first_tag_removed = false; + + while( lastc != -1 ) + { + if( IsOpeningTagMark(lastc) ) + { + // FIXME +// if( IsClosingTagForLastItem() ) +// { +// // the last tag is skipped when using patterns with %2 or %u2 +// +// PopStack(); // removing opening tag from the stack +// first_tag_removed = true; +// break; +// } + } + else + { + read_char(); + //end = pchar; + } + } + + if( !first_tag_removed ) + PopStack(); // user has forgotten to close the tag + + if( has_u ) + { +// FIXME +// TrimWhiteWithNewLines(start, end); +// PrintEncode(start, end); + } + else + { + // FIXME +// PrintEscape(start, end); + } +} + + + +void BBCODEParser::PutHtmlArgument(const Tags * tag, const 
wchar_t * arg_start, const wchar_t * arg_end) +{ +const wchar_t * pattern = tag->html_argument; +bool has_u; + + while( *pattern ) + { + if( *pattern == '%' ) + { + ++pattern; + has_u = false; + + if( *pattern == 'u' ) + { + ++pattern; + has_u = true; + } + + if( *pattern == '1' ) + { + ++pattern; + PutHtmlArgument1(arg_start, arg_end, has_u); + } + else + if( *pattern == '2' ) + { + ++pattern; + PutHtmlArgument2(tag, has_u); + } + else + if( *pattern == '%' ) + { + (*out_string) += '%'; + ++pattern; + } + // else unrecognized, will be printed next time as a normal character + } + else + { + (*out_string) += *pattern; + ++pattern; + } + } +} + + +void BBCODEParser::PutOpeningTagFromBBCode(const Tags * tag) +{ + CheckOpeningTag(tag); + PutOpeningTagMark(); + Put(tag->html_tag); + +// FIXME +// const wchar_t * start = pchar; +// +// while( *pchar && *pchar != ']' ) +// ++pchar; +// +// PutHtmlArgument(tag, start, pchar); +// +// if( *pchar == ']' ) +// ++pchar; + + if( !tag->inline_tag ) + { + Put(10); + SkipWhiteLines(); + } +} + + +bool BBCODEParser::PutOpeningTag() +{ + const Tags * tag = FindTag(LastItem().name); + + if( !tag ) + PutOpeningTagFromEzc(); + else + PutOpeningTagFromBBCode(tag); + +return false; +} + + +void BBCODEParser::PutClosingTag(const Tags * tag) +{ + if( !tag ) + return; // skipping the tag + + PutOpeningTagMark(); + (*out_string) += '/'; + (*out_string) += tag->html_tag; + PutClosingTagMark(); + + if( !tag->inline_tag ) + { + (*out_string) += L"\n"; + SkipWhiteLines(); + } + + if( Equal(tag->html_tag, L"li") ) + has_open_li_tag = false; + + if( Equal(tag->html_tag, L"ol") ) + has_open_ol_tag = false; + + if( Equal(tag->html_tag, L"ul") ) + has_open_ul_tag = false; +} + + +void BBCODEParser::PutClosingTag(const wchar_t * tag_name) +{ + const Tags * tag = FindTag(tag_name); + PutClosingTag(tag); +} + + + +void BBCODEParser::Init() +{ + has_open_li_tag = false; + has_open_ol_tag = false; + has_open_ul_tag = false; + + SkipWhiteLines(); +} + + 
+void BBCODEParser::Uninit() +{ + if( has_open_li_tag ) + (*out_string) += L"\n"; + + if( has_open_ol_tag ) + (*out_string) += L"\n"; + + if( has_open_ul_tag ) + (*out_string) += L"
\n"; +} + + + +} + diff --git a/src/html/bbcodeparser.h b/src/html/bbcodeparser.h new file mode 100644 index 0000000..a2e2e7f --- /dev/null +++ b/src/html/bbcodeparser.h @@ -0,0 +1,128 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef headerfile_winix_core_bbcodeparser +#define headerfile_winix_core_bbcodeparser + +#include "htmlparser.h" + +namespace pt +{ + + +class BBCODEParser : public HTMLParser +{ + + struct Tags + { + const wchar_t * bbcode; + const wchar_t * html_tag; + const wchar_t * html_argument; // with closing '>' + bool inline_tag; + }; + + + /* + virtual methods + (from HTMLParser class) + */ + virtual void Init(); + virtual void Uninit(); + + virtual bool IsOpeningTagMark(wchar_t c); + virtual bool IsClosingTagMark(wchar_t c); + virtual bool IsClosingXmlSimpleTagMark(wchar_t c); + + virtual bool IsOpeningCommentaryTagMark(const wchar_t *); + virtual size_t OpeningCommentaryTagMarkSize(); + + virtual bool IsValidCharForName(int c); + virtual void CheckExceptions(); + virtual bool SkipCommentaryTagIfExists(); + + virtual bool PutOpeningTag(); + virtual void PutClosingTag(const wchar_t * tag); + + virtual void PutNormalText(const wchar_t * str, const wchar_t * end); + virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white); + + + + /* + others + */ + bool Equal(const wchar_t * str1, const wchar_t * str2); + + void PutHtmlArgument1(const wchar_t * arg_start, const wchar_t * arg_end, bool has_u); + void PutHtmlArgument2(const Tags * tag, bool has_u); + void PutHtmlArgument(const Tags * tag, const wchar_t * arg_start, const wchar_t * arg_end); + + void PutOpeningTagFromEzc(); + void PutOpeningTagFromBBCode(const Tags * tag); + + const Tags * FindTag(const wchar_t * tag); + const Tags * FindTag(const std::wstring & tag); + void PrintArgumentCheckQuotes(const wchar_t * & start, const wchar_t * & end); + + void PrintEscape(int c, bool change_quote = false); + void PrintEncode(int c); + + void PrintEscape(const wchar_t * start, const wchar_t * end, bool change_quote = false); + void PrintEncode(const wchar_t * start, const wchar_t * end); + + void PrintArgumentEncode(const wchar_t * start, const wchar_t * end); + void 
PrintArgumentEscape(const wchar_t * start, const wchar_t * end); + + void PutClosingTag(const Tags * tag); + + void CheckOpeningTag(const Tags * tag, const wchar_t * tag_name, bool & condition); + void CheckOpeningTag(const Tags * tag); + + void TrimWhiteWithNewLines(const wchar_t * & start, const wchar_t * & end); + + + + bool has_open_ol_tag; // has open html
    tag + bool has_open_ul_tag; // has open html
      tag + bool has_open_li_tag; // has open html
    • tag +}; + + +} + + +#endif diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp new file mode 100644 index 0000000..f4b158e --- /dev/null +++ b/src/html/htmlparser.cpp @@ -0,0 +1,2434 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2022, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "htmlparser.h" +#include "convert/text.h" + + +namespace pt +{ +const int HTMLParser::WHITE_MODE_ORIGIN; +const int HTMLParser::WHITE_MODE_SINGLE_LINE; +const int HTMLParser::WHITE_MODE_TREE; + + + +void HTMLParser::clear_input_flags() +{ + BaseParser::clear_input_flags(); + + parsing_html = true; + xml_compact_mode = true; + status = ok; + line = 1; + stack_len = 0; + out_string = nullptr; + out_stream = nullptr; + out_space = nullptr; + line_len = 0; + char_was_escaped = false; + escaped_chars_buffer.clear(); + escaped_char_index = 0; + filter_mode = false; +} + + + + +void HTMLParser::Item::Clear() +{ + name.clear(); + type = none; + is_commentary = false; + is_cdata = false; + porphans = nullptr; + new_line_before = false; + new_line_after = false; + new_line_in_the_middle = false; + white_char_before = false; + has_body_tag = false; + tree_index = 0; + space = nullptr; +} + + +HTMLParser::Item::Item() +{ + Clear(); +} + + +void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode) +{ + clear_input_flags(); + + pchar_unicode = in; + xml_compact_mode = compact_mode; + out_space = &space; + out_space->clear(); + + Init(); + Read(); + Uninit(); +} + + +void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener) +{ + item_parsed_listener = listener; +} + + + +HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + clear_input_flags(); + + parsing_html = false; + reading_from_file = true; + xml_compact_mode = compact_mode; + this->out_space = &out_space; + + if( clear_space ) + this->out_space->clear(); + + file.clear(); + file.open(file_name, std::ios_base::binary | std::ios_base::in); + + if( file ) + { + Init(); + Read(); + Uninit(); + + file.close(); + } + else + { + status = cant_open_file; + } + + return status; +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, 
bool clear_space) +{ + return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + std::string file_name_utf8; + wide_to_utf8(file_name, file_name_utf8); + + return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space); +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); +} + + + +void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string) +{ + clear_input_flags(); + + pchar_unicode = in; + out_string = &out; + filter_mode = true; + + if( clear_out_string ) + out_string->clear(); + + Init(); + Read(); + Uninit(); +} + + +void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string) +{ + if( &in == &out ) + { + // out cannot be the same string as in + return; + } + + size_t out_projected_len = in.size() * 2 + 1; + + if( out.capacity() < out_projected_len ) + out.reserve(out_projected_len); + + filter(in.c_str(), out, clear_out_string); +} + + +void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream) +{ + clear_input_flags(); + + WTextStream::const_iterator begin = in.begin(); + WTextStream::const_iterator end = in.end(); + + wtext_stream_iterator = &begin; + wtext_stream_iterator_end = &end; + out_stream = &out; + filter_mode = true; + + if( clear_out_stream ) + out_stream->clear(); + + Init(); + Read(); + Uninit(); +} + + +HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream) +{ + clear_input_flags(); + + reading_from_file = true; + + // open the file before clearing 'out' string, 'out' string can be the same string as the file_name + file.clear(); + file.open(file_name, 
std::ios_base::binary | std::ios_base::in); + + out_string = &out; + filter_mode = true; + + if( clear_out_stream ) + out_string->clear(); + + if( file ) + { + Init(); + Read(); + Uninit(); + + file.close(); + } + else + { + status = cant_open_file; + } + + return status; +} + + +HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream) +{ + return filter_file(file_name.c_str(), out, clear_out_stream); +} + + +HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream) +{ + std::string file_name_utf8; + pt::wide_to_utf8(file_name, file_name_utf8); + + return filter_file(file_name_utf8, out, clear_out_stream); +} + + +HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream) +{ + return filter_file(file_name.c_str(), out, clear_out_stream); +} + + + + +void HTMLParser::Init() +{ +} + + +void HTMLParser::Uninit() +{ +} + + + + + +int HTMLParser::get_last_parsed_line() +{ + return line; +} + + +void HTMLParser::SetSomeDefaults() +{ + white_mode = WHITE_MODE_ORIGIN; + + tab_size = 2; + wrap_line = 0; + orphan_mode = orphan_nbsp; + safe_mode = false; + skip_tags = false; + skip_commentaries = false; + skip_entities = false; + analyze_entities = false; + item_parsed_listener = nullptr; +} + + +HTMLParser::HTMLParser() +{ + pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; + buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; + + SetSomeDefaults(); +} + + +HTMLParser::HTMLParser(const HTMLParser & f) +{ + // don't need to copy the stack + pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; + buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; + + SetSomeDefaults(); +} + + +HTMLParser & HTMLParser::operator=(const HTMLParser & f) +{ + // don't need to copy the stack + pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; + buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; + + // we can copy some fields 
from f + +return *this; +} + + +HTMLParser::~HTMLParser() +{ + delete [] pstack; + delete [] buffer; +} + + + +void HTMLParser::white_chars_mode(int mode) +{ + if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE ) + white_mode = mode; +} + + + + +void HTMLParser::WrapLine(size_t wrap_line_) +{ + wrap_line = wrap_line_; + + if( wrap_line > 10000 ) + wrap_line = 10000; +} + + + +void HTMLParser::InsertTabs(size_t tabsize) +{ + tab_size = tabsize; + + if( tab_size > 1000 ) + tab_size = 1000; +} + + +int HTMLParser::current_white_char_mode() +{ + if( !white_char_mode_tab.empty() ) + return white_char_mode_tab.back(); + + return WHITE_MODE_ORIGIN; +} + + +void HTMLParser::CalcOrphansMaxLen(Orphans & orphans) +{ +size_t i; + + orphans.max_len = 0; + + for(i=0 ; i orphans.max_len ) + orphans.max_len = orphans.tab[i].size(); + } +} + + +void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector & otab) +{ + lang_code_lower = lang_code; + ToLower(lang_code_lower); + + orphans_temp.tab = otab; + std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end()); + CalcOrphansMaxLen(orphans_temp); + + orphans_tab[lang_code_lower] = orphans_temp; +} + + + +void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector & otab) +{ + AssignOrphans(lang_code.c_str(), otab); +} + + +void HTMLParser::ClearOrphans() +{ + orphans_tab.clear(); +} + + + + +void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str) +{ + if( orphan_mode_str == L"160" ) + orphan_mode = orphan_160space; + else + orphan_mode = orphan_nbsp; +} + + +void HTMLParser::SafeMode(bool safe_mode_) +{ + safe_mode = safe_mode_; +} + + +void HTMLParser::SkipTags(bool skip_tags) +{ + this->skip_tags = skip_tags; +} + +void HTMLParser::SkipCommentaries(bool skip_commentaries) +{ + this->skip_commentaries = skip_commentaries; +} + + +void HTMLParser::SkipEntities(bool skip_entities) +{ + this->skip_entities = skip_entities; + + if( this->skip_entities ) + { + this->analyze_entities 
= true; + } +} + + +void HTMLParser::AnalyzeEntities(bool analyze_entities) +{ + this->analyze_entities = analyze_entities; +} + + +void HTMLParser::SetNoFilterTag(const std::wstring & tag_name) +{ + no_filter_tag = tag_name; +} + + + + +HTMLParser::Item & HTMLParser::GetItem(size_t i) +{ + if( i >= stack_len ) + { + empty.Clear(); + return empty; + } + +return pstack[i]; +} + + +HTMLParser::Item & HTMLParser::LastItem() +{ + if( stack_len == 0 ) + { + empty.Clear(); + return empty; + } + +return pstack[stack_len-1]; +} + + +bool HTMLParser::PushStack() +{ + if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN ) + // oops, too many items + return false; + + pstack[stack_len].Clear(); + + if( stack_len > 0 ) + { + // 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated + pstack[stack_len].porphans = pstack[stack_len-1].porphans; + pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag; + pstack[stack_len].tree_index = pstack[stack_len-1].tree_index; + } + + stack_len += 1; + +return true; +} + + + +void HTMLParser::PopStack() +{ + if( stack_len == 0 ) + // oops + return; + + stack_len -= 1; + pstack[stack_len].Clear(); +} + + +bool HTMLParser::IsWhite(int c) +{ + // dont use c==10 here + + if( c==' ' || c=='\t' || c==13 || c==160 ) + return true; + +return false; +} + + +void HTMLParser::SkipWhite(std::wstring * out_string) +{ + while( IsWhite(lastc) ) + { + if( out_string ) + (*out_string) += lastc; + + read_char(); + } +} + + +void HTMLParser::SkipWhiteLines(std::wstring * out_string) +{ + while( lastc==10 || IsWhite(lastc) ) + { + if( out_string ) + (*out_string) += lastc; + + read_char(); + } +} + + +void HTMLParser::SkipWhiteWithFirstNewLine() +{ + SkipWhite(); + + if( lastc == 10 ) + { + read_char(); + SkipWhite(); + } +} + + +//void HTMLParser::CheckNewLine() +//{ +// if( white_mode == WHITE_MODE_TREE ) +// { +// SkipWhite(); +// } +// +// last_new_line = (lastc==10); +//} + + + + + + +void HTMLParser::SkipAndCheckClosingTag(std::wstring 
* remember_text) +{ + bool is_quoted = false; + wchar_t quote_char = 0; + + while( lastc != -1 ) + { + if( !char_was_escaped && (lastc == '"' || lastc == '\'') ) + { + if( is_quoted ) + { + if( lastc == quote_char ) + { + is_quoted = false; + } + } + else + { + is_quoted = true; + quote_char = lastc; + } + } + else + if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/' + { + LastItem().type = Item::simple; + } + else + if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) ) + { + read_char(); + break; + } + + if( remember_text ) + (*remember_text) += lastc; + + read_char(); + } +} + + + +bool HTMLParser::IsValidCharForName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='-' || c=='!' || c==':' || c=='-' || c=='_' || c=='[') // : is for a namespace character, - is for a commentary, [ is for CDATA + return true; + +return false; +} + + +bool HTMLParser::IsValidCharForAttrName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='-' || c==':' || c=='_') + return true; + +return false; +} + + +bool HTMLParser::IsValidCharForEntityName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='#' ) + return true; + +return false; +} + + +void HTMLParser::ReadItemName(std::wstring & name, bool clear_name) +{ +size_t i; + + if( clear_name ) + name.clear(); + + for(i=0 ; IsValidCharForName(lastc) ; ++i) + { + if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN ) + { + name += lastc; + + if( LastItem().type == Item::special && name == L"!--" ) + { + LastItem().is_commentary = true; + read_char(); + break; + } + + if( LastItem().type == Item::special && name == L"![CDATA[" ) + { + LastItem().is_cdata = true; + read_char(); + break; + } + } + + read_char(); + } +} + + + +void HTMLParser::ReadItemAttrName() +{ +size_t i; + + attr_name.clear(); + + for( i=0 ; lastc != -1 && 
IsValidCharForAttrName(lastc) ; ++i ) + { + if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN ) + attr_name += lastc; + + read_char(); + } +} + + + +void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str) +{ + if( analyze_entities ) + { + attr_value.push_back(std::wstring()); + AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back()); + } + else + { + attr_value.push_back(str); + } +} + + +void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char) +{ + attr_value.clear(); + tmp_text.clear(); + + while( lastc != -1 ) + { + if( !char_was_escaped ) + { + if( has_quote ) + { + if( lastc == quote_char ) + break; + } + else + { + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) + break; + } + } + + if( lastc==10 || IsWhite(lastc) ) + { + if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(tmp_text); + + tmp_text.clear(); + } + else + { + if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + tmp_text.clear(); + + tmp_text += lastc; + } + + read_char(); + } + + if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(tmp_text); +} + + +void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char) +{ + attr_value.clear(); + tmp_text.clear(); + + while( lastc != -1 ) + { + if( !char_was_escaped ) + { + if( has_quote ) + { + if( lastc == quote_char ) + break; + } + else + { + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) + break; + } + } + + // IMPROVEME add support for analyze_entities? 
+ if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + tmp_text += lastc; + + read_char(); + } +} + + + +void HTMLParser::CheckChar(wchar_t c) +{ + if( c == 10 ) + line_len = 0; + else + line_len += 1; +} + + +void HTMLParser::Put(wchar_t c) +{ + if( out_string ) + (*out_string) += c; + + if( out_stream ) + (*out_stream) << c; + + CheckChar(c); +} + + +void HTMLParser::Put(const wchar_t * str, const wchar_t * end) +{ + if( str >= end ) + return; + + size_t len = end - str; + + if( out_string ) + out_string->append(str, len); + + if( out_stream ) + out_stream->write(str, len); + + for( ; str < end ; ++str) + CheckChar(*str); +} + + + +void HTMLParser::Put(const std::wstring & str) +{ + if( !str.empty() ) + { + if( out_string ) + out_string->append(str); + + if( out_stream ) + out_stream->write(str.c_str(), str.size()); + + for(size_t i=0 ; i < str.size() ; ++i) + CheckChar(str[i]); + } +} + + +// out can be null +void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out) +{ + size_t epsilon = 8; // !! 
IMPROVE ME put as a constant + const wchar_t * old_str = str; + + while( str < end ) + { + if( IsStartingEntityMark(*str) ) + { + const wchar_t * entity_start = str; + str += 1; // skip & + + for(size_t i=0 ; *str && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str) + { + } + + if( IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name + { + if( out ) + out->append(old_str, entity_start); + else + Put(old_str, entity_start); + + str += 1; // skip ; + + if( !skip_entities ) + { + if( out ) + out->append(entity_start, str); + else + Put(entity_start, str); + } + + EntityFound(entity_start + 1, str - 1); // without & and ; + old_str = str; + } + } + else + { + str += 1; + } + } + + if( out ) + out->append(old_str, end); + else + Put(old_str, end); +} + + + + +int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str) +{ +size_t res; + + const wchar_t * orphan = orphan_str.c_str(); + + for( ; str & table) +{ +int res; + + if( table.empty() ) + return false; + + size_t o1 = 0; + size_t o2 = table.size() - 1; + + res = CheckOrphan(str, end, table[o1]); + + if( res == 0 ) + return true; + + if( res < 0 ) + return false; + + res = CheckOrphan(str, end, table[o2]); + + if( res == 0 ) + return true; + + if( res > 0 ) + return false; + + + while( o1 + 1 < o2 ) + { + size_t o = (o1 + o2) / 2; + res = CheckOrphan(str, end, table[o]); + + if( res == 0 ) + return true; + + if( res < 0 ) + o2 = o; + else + o1 = o; + } + +return false; +} + + +bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end) +{ + if( str==end || !LastItem().has_body_tag || !LastItem().porphans ) + return false; + + size_t len = end - str; + + if( len > LastItem().porphans->max_len ) + return false; + +return CheckOrphan(str, end, LastItem().porphans->tab); +} + + +bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata) +{ + bool 
was_closing_tag = false; + + while( lastc != -1 && lastc != 10 && !IsWhite(lastc) ) + { + if( is_cdata ) + { + if( lastc == ']' ) + { + read_char(); + + if( lastc == ']' ) + { + read_char(); + + if( IsClosingTagMark(lastc) ) + { + read_char(); + was_closing_tag = true; + break; + } + else + { + str += ']'; + str += ']'; + } + } + else + { + str += ']'; + } + } + } + else + { + if( !char_was_escaped && IsOpeningTagMark(lastc) ) + { + was_closing_tag = true; + break; + } + } + + str += lastc; + read_char(); + } + + if( !str.empty() ) + { + if( allow_put_new_line ) + { + Put(10); + PutTabs(LastItem().tree_index + 1); + } + else + if( allow_put_space ) + { + Put(' '); + } + } + + if( analyze_entities ) + AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr); + else + Put(str); + + return was_closing_tag; +} + + +void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text) +{ + was_white_char = false; + was_new_line = false; + + while( lastc == 10 || IsWhite(lastc) ) + { + if( lastc == 10 ) + was_new_line = true; + else + was_white_char = true; + + if( result_text ) + (*result_text) += lastc; + + if( current_white_char_mode() == WHITE_MODE_ORIGIN ) + { + Put(lastc); + } + + read_char(); + } + + if( current_white_char_mode() == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) ) + { + Put(' '); + } + + // in WHITE_MODE_TREE white characters are written at the beginning of a or text +} + + + +void HTMLParser::PutOpeningTagMark() +{ + Put('<'); +} + + +void HTMLParser::PutClosingTagMark() +{ + Put('>'); +} + + + + +// !! 
IMPROVE ME change to a better name +// this functions does not return true when the tag is safe +bool HTMLParser::IsTagSafe(const wchar_t * tag) +{ + if( !safe_mode ) + return true; + + if( IsNameEqual(tag, no_filter_tag.c_str()) ) + return false; + + // every entry has to be followed by a comma: adjacent string literals are concatenated by the compiler, + // so a missing comma silently merges two tag names into one name which never matches + static const wchar_t * unsafe_tags[] = { + L"applet", L"base", L"body", + L"embed", L"head", L"html", + L"frame", L"frameset",L"iframe", + L"link", L"meta", L"param", + L"object", L"script" + }; + + size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*); + size_t i; + + for(i=0 ; i 30 ) + len = 30; + + for(size_t i=0 ; i < (len*tab_size) ; ++i) + { + if( out_string ) + (*out_string) += ' '; // we do not add them to 'line_len' + + if( out_stream ) + (*out_stream) << ' '; + } +} + + +void HTMLParser::PutNonBreakingSpace() +{ + if( orphan_mode == orphan_nbsp ) + { + Put(L" "); + } + else + { + Put(160); + } +} + + + +// we assume the size of the opening mark to be one +bool HTMLParser::IsOpeningTagMark(wchar_t c) +{ + return (c == '<'); +} + + +// we assume the size of the closing mark to be one +bool HTMLParser::IsClosingTagMark(wchar_t c) +{ + return (c == '>'); +} + + +// the slash in the closing tag mark e.g.

      +bool HTMLParser::IsClosingTagIndicator(wchar_t c) +{ + return (c == '/'); +} + + +// the exclamation mark which starts a special tag (e.g. a doctype declaration, a commentary or CDATA)

      +bool HTMLParser::IsSpecialTagIndicator(wchar_t c) +{ + return (c == '!'); +} + +bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c) +{ + return (c == '?'); +} + +// the '=' operator e.g. class="value" +bool HTMLParser::IsAttributeAssignmentMark(wchar_t c) +{ + return (c == '='); +} + + + +// the slash at the end (without '>' character) +// we assume the size of the mark to be one +bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c) +{ + return (c == '/'); +} + + +bool HTMLParser::IsStartingEntityMark(wchar_t c) +{ + return (c == '&'); +} + + +bool HTMLParser::IsEndingEntityMark(wchar_t c) +{ + return (c == ';'); +} + + + +// used for such tags as: script, pre, textarea +void HTMLParser::ReadTextUntilClosingCommentary() +{ + while( lastc != -1 ) + { + if( lastc == '-' ) + { + tmp_text.clear(); + tmp_text += lastc; + read_char(); + + if( lastc == '-' ) + { + tmp_text += lastc; + read_char(); + + if( !char_was_escaped && IsClosingTagMark(lastc) ) + { + tmp_text += lastc; + read_char(); + Put(tmp_text); + + break; + } + } + + Put(tmp_text); + } + else + { + Put(lastc); + read_char(); + } + } +} + + + +bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well) +{ + tmp_text.clear(); + tmp_text += lastc; // opening tag mark + read_char(); + + SkipWhiteLines(&tmp_text); + + if( IsClosingTagIndicator(lastc) ) + { + tmp_text += lastc; + read_char(); + SkipWhiteLines(&tmp_text); + ReadItemName(tmp_name); + + if( IsNameEqual(tmp_name, LastItem().name) ) + { + SkipAndCheckClosingTag(); + + if( put_closing_tag_as_well ) + { + Put('<'); + Put('/'); + Put(tmp_name); + Put('>'); + } + + return true; + } + else + { + Put(tmp_text); + Put(tmp_name); + } + } + else + { + Put(tmp_text); + } + +return false; +} + + + + +// used for such tags as: script, pre, textarea +void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well) +{ + while( lastc != -1 ) + { + if( !char_was_escaped && IsOpeningTagMark(lastc) ) + { + if( 
IsClosingTagForLastItem(put_closing_tag_as_well) ) + { + //CheckNewLine(); + break; + } + } + else + { + Put(lastc); + read_char(); + } + } +} + + + + +// reading text between html tags +void HTMLParser::ReadText(bool is_cdata) +{ + new_item_has_new_line_before = false; + new_item_has_white_char_before = false; + + bool was_white_char = false; + bool was_new_line = false; + + bool was_non_white_text = false; + + bool allow_put_new_line = false; + bool allow_put_space = false; + + if( current_white_char_mode() == WHITE_MODE_TREE ) + { + if( LastItem().new_line_after || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) ) + { + allow_put_new_line = true; + } + } + + Space * text_space = nullptr; + std::wstring * text_space_wstr = nullptr; + + if( out_space ) + { + text_space = &text_space_tmp; + text_space->clear(); + text_space->add(L"name", L""); + Space & wstr_space = text_space->add(L"text", L""); + text_space_wstr = &wstr_space.value.value_wstring; + } + + bool was_closing_tag = false; + + while( lastc != -1 && !was_closing_tag ) + { + tmp_text.clear(); + was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata); + + if( lastc == -1 || was_closing_tag ) + { + new_item_has_new_line_before = was_new_line; + new_item_has_white_char_before = was_white_char; + } + + if( !tmp_text.empty() ) + { + allow_put_new_line = false; + allow_put_space = false; + was_non_white_text = true; + + if( text_space_wstr ) + (*text_space_wstr) += tmp_text; + } + + if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) ) + { + if( lastc == 10 || IsWhite(lastc) ) + { + SkipWhiteLines(text_space_wstr); + PutNonBreakingSpace(); + was_new_line = false; + } + } + else + { + PutNormalWhite(was_white_char, was_new_line, text_space_wstr); + + if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE ) + { + allow_put_new_line = false; + allow_put_space = false; + + if( was_new_line ) + { + 
allow_put_new_line = true; + LastItem().new_line_in_the_middle = true; + + if( !was_non_white_text ) + LastItem().new_line_after = true; + } + else + { + allow_put_space = true; + } + + if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line ) + { + allow_put_new_line = true; + } + } + } + } + + if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text ) + { + AddSpaceToSpaceTree(*text_space); + } + + text_space_tmp.clear(); +} + + + +bool HTMLParser::PrintOpeningItem() +{ + if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) + return true; + + return PutOpeningTag(); +} + + + + + +bool HTMLParser::ReadItemAttr() +{ + attr_has_value = false; + attr_name.clear(); + attr_value.clear(); + + SkipWhiteLines(); + ReadItemAttrName(); + + if( attr_name.empty() ) + return false; + + SkipWhiteLines(); + + if( !IsAttributeAssignmentMark(lastc) ) // '=' + return true; + + attr_has_value = true; + read_char(); // skipping '=' + SkipWhiteLines(); + + bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\''); + wchar_t quote_char = lastc; + + if( has_quote ) + read_char(); // skipping the first quote mark + + // IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table... 
+ if( parsing_html ) + ReadItemAttrValue(has_quote, quote_char); + else + ReadXMLItemAttrValue(has_quote, quote_char); + + if( has_quote && !char_was_escaped && lastc == quote_char ) + read_char(); // skipping the last quote mark + +return true; +} + + + +void HTMLParser::CheckItemLangAttr() +{ + if( attr_has_value && IsNameEqual(L"lang", attr_name) ) + { + LastItem().porphans = nullptr; + + if( !attr_value.empty() ) + { + // we are taking the first value only + attr_value_lower = attr_value[0]; + ToLower(attr_value_lower); + + OrphansTab::iterator i = orphans_tab.find(attr_value_lower); + + if( i != orphans_tab.end() ) + LastItem().porphans = &i->second; + } + } +} + + +void HTMLParser::PrintItemAttr() +{ +size_t i; + + if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) + return; + + Put(' '); + Put(attr_name); + + if( attr_has_value ) + { + Put(L"=\""); + + for(i=0 ; iget_add_space(L"attr"); + Space & attr = attr_tab.add_empty_space(attr_name); + + if( attr_has_value ) + { + if( parsing_html ) + { + attr.set_empty_table(); + + for(size_t i=0 ; i < attr_value.size() ; ++i) + { + attr.add(attr_value[i]); + } + } + else + { + attr.set(tmp_text); + } + } + } +} + + +void HTMLParser::ReadItemClosing() +{ + read_char(); // skipping '/' + SkipWhiteLines(); + ReadItemName(LastItem().name); + LastItem().type = Item::closing; + SkipAndCheckClosingTag(); + + // closing tags are printed later +} + + +void HTMLParser::ReadItemSpecial() +{ + LastItem().type = Item::special; + + if( !skip_tags ) + { + if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before ) + { + Put(10); + PutTabs(LastItem().tree_index); + } + + PutOpeningTagMark(); + } + + LastItem().name = lastc; + read_char(); // skipping '!' or '?' 
+ ReadItemName(LastItem().name, false); + + if( skip_tags ) + { + SkipAndCheckClosingTag(); + } + else + { + if( LastItem().is_commentary ) + { + Put(LastItem().name); + } + else + if( LastItem().is_cdata ) + { + // do nothing + } + else + { + tmp_text.clear(); + SkipWhiteLines(); + SkipAndCheckClosingTag(&tmp_text); + Put(LastItem().name); + Put(' '); + Put(tmp_text); + Put('>'); + + if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") ) + { + Put(10); + Put(10); + SkipWhiteLines(); + } + } + } +} + + +void HTMLParser::ReadItemOpening() +{ + LastItem().type = Item::opening; + ReadItemName(LastItem().name); + AddItemToSpace(); + Space * space = LastItem().space; + + if( !xml_compact_mode && space ) + space->add(L"name", LastItem().name); + + if( PrintOpeningItem() ) + { + while( ReadItemAttr() ) + { + CheckItemLangAttr(); + PrintItemAttr(); + PutItemAttrToSpace(); + } + + SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' + + if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) ) + { + if( LastItem().type == Item::simple ) + Put(L" /"); + + PutClosingTagMark(); + } + } +} + + +void HTMLParser::ItemFound() +{ +} + +void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end) +{ +} + + +bool HTMLParser::ReadItem() +{ + if( lastc == -1 ) + return false; + + if( !PushStack() ) + return false; + + LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method + LastItem().white_char_before = new_item_has_white_char_before; // new_item_has_white_char_before is set by ReadText() method + + if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) + LastItem().tree_index += 1; + + read_char(); // skipping the first opening tag mark '<' + SkipWhiteLines(); + + if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) ) + ReadItemSpecial(); + else + if( IsClosingTagIndicator(lastc) ) + 
ReadItemClosing(); + else + ReadItemOpening(); + + // IMPROVE ME later CheckSingleItemExceptions() can change opening to single type + ItemFound(); + +return true; +} + + + +wchar_t HTMLParser::ToLower(wchar_t c) +{ + if( c>='A' && c<='Z' ) + return c - 'A' + 'a'; + +return c; +} + + +void HTMLParser::ToLower(std::wstring & str) +{ +size_t i; + + for(i=0 ; i0 ; ++name1, ++name2, --len ) + if( ToLower(*name1) != ToLower(*name2) ) + return false; + + if( len == 0 ) + return true; + +return false; +} + + + +bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len) +{ + return IsNameEqual(name1, name2.c_str(), len); +} + + +bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len) +{ + return IsNameEqual(name1.c_str(), name2, len); +} + + +bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len) +{ + return IsNameEqual(name1.c_str(), name2.c_str(), len); +} + + + + + +bool HTMLParser::IsLastTag(const wchar_t * name) +{ + return IsNameEqual(name, LastItem().name); +} + + +bool HTMLParser::IsLastTag(const std::wstring & name) +{ + return IsNameEqual(name, LastItem().name); +} + + +// checking exceptions for opening tags +void HTMLParser::CheckSingleItemExceptions() +{ + if( IsLastTag(L"meta") || + IsLastTag(L"input") || + IsLastTag(L"br") || + IsLastTag(L"hr") || + IsLastTag(L"img") || + IsLastTag(L"link") || + IsLastTag(L"param") || + IsLastTag(L"col") || + IsLastTag(L"area") ) + { + LastItem().type = Item::simple; + PopStack(); + return; + } + + // move me to a better place + if( IsLastTag(L"body") ) + LastItem().has_body_tag = true; +} + + +void HTMLParser::CheckWhiteCharsExceptions(Item & item) +{ + bool change_white_mode = false; + + // in safe_mode the script tag is ignored +// if( !safe_mode && IsNameEqual(item.name, L"script") ) +// { +// change_white_mode = true; +// } + +// if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") ) +// { 
+// change_white_mode = true; +// } + + if( IsNameEqual(item.name, L"pre") ) + { + change_white_mode = true; + } + + // move to CheckDifferentContentExceptions? + if( IsNameEqual(item.name, no_filter_tag) ) + { + change_white_mode = true; + } + + if( change_white_mode ) + { + if( item.type == Item::opening ) + { + white_char_mode_tab.push_back(WHITE_MODE_ORIGIN); + } + else + { + if( !white_char_mode_tab.empty() ) + white_char_mode_tab.pop_back(); + } + } +} + + + +void HTMLParser::CheckDifferentContentExceptions(Item & item) +{ + if( !safe_mode && IsNameEqual(item.name, L"script") ) + { + ReadTextUntilClosingTag(true); + PopStack(); + } + + if( IsNameEqual(item.name, L"textarea") ) + { + ReadTextUntilClosingTag(true); + PopStack(); + } +} + + + + + + +// prints closing tags for all items which are still on the stack +// (from the innermost to the outermost one) and leaves the stack empty +void HTMLParser::CheckStackPrintRest() +{ + while( stack_len > 0 ) + { + // decrementing inside the loop (not in the loop condition) + // so that stack_len is exactly zero when the loop finishes + stack_len -= 1; + + if( stack_len==0 || pstack[stack_len-1].new_line_after ) + { + if( current_white_char_mode() == WHITE_MODE_TREE ) + { + Put(10); + + // the outermost item has no parent on the stack so pstack[stack_len-1] cannot be read for it, + // its own tree_index is used instead + PutTabs(stack_len==0 ? pstack[0].tree_index : pstack[stack_len-1].tree_index); + } + else + { + Put(' '); + } + } + + PutClosingTag(pstack[stack_len]); + } +} + + +void HTMLParser::CheckClosingTags() +{ + int i; + + if( stack_len == 0 ) + return; + + // on the stack we have only opening tags + // but only the last tag is a closing tag + + if( stack_len == 1 ) + { + PopStack(); + return; + } + + // looking whether there is a matching opening tag + for(i=int(stack_len)-2 ; i >= 0 ; --i) + if( (pstack[i].is_commentary && pstack[stack_len-1].is_commentary) || IsNameEqual(pstack[i].name, pstack[stack_len-1].name) ) + break; + + if( i < 0 ) + { + // oops, there is no such an opening tag on the stack + // we don't print the closing and the missing opening tag + PopStack(); + return; + } + + // CHECK ME + if( RemoveIfNeeded(stack_len - 2) ) + { + RemoveLastSpace(i); + } + + for(int z=(int)stack_len-2 ; z >= i ; --z) + { + CheckWhiteCharsExceptions(pstack[z]); + + if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) ) + { 
+ if( pstack[z].new_line_after ) + { + if( current_white_char_mode() == WHITE_MODE_TREE ) + { + Put(10); + PutTabs(pstack[z].tree_index); + } + } + + // IMPROVEME + // in PutClosingTag we test IsTagSafe() and no_filter_tag too + PutClosingTag(pstack[z]); + pstack[z].Clear(); + } + } + + // invalidate items on the stack + stack_len = i; +} + + +bool HTMLParser::PrintRest() +{ +//const wchar_t * start = pchar; + + // in safe mode we do not print the rest html code + if( safe_mode || skip_tags ) + return false; + + bool was_chars = false; + + while( lastc != -1 ) + { + Put(lastc); + read_char(); + was_chars = true; + } + + return was_chars; + +// if( pchar > start ) +// { +// Put(start, pchar); +// return true; +// } + +//return false; +} + + + +void HTMLParser::AddItemToSpace() +{ + if( out_space && stack_len > 0 ) + { + Space * parent = out_space; + + if( stack_len > 1 ) + { + parent = pstack[stack_len-2].space; + } + + if( xml_compact_mode ) + { + Space * space = parent->get_space(pstack[stack_len-1].name); + + if( space ) + { + if( space->is_table() ) + { + Space & child = space->add_empty_space(); + pstack[stack_len-1].space = &child; + } + else + { + Space * tab = new Space(); + tab->add(space); + Space & child = tab->add_empty_space(); + + parent->value.value_object[pstack[stack_len-1].name] = tab; + pstack[stack_len-1].space = &child; + } + } + else + { + Space & space = parent->add_empty_space(pstack[stack_len-1].name); + pstack[stack_len-1].space = &space; + } + } + else + { + Space & childs_tab = parent->get_add_space(L"childs"); + Space & child = childs_tab.add_empty_space(); + pstack[stack_len-1].space = &child; + } + + } +} + + + +// removes the last child from the "childs" table of the parent of the item at 'index' +// (the parent is out_space when index is zero), only when that child is the space of pstack[stack_len-2] +void HTMLParser::RemoveLastSpace(size_t index) +{ + if( out_space ) + { + Space * parent = out_space; + + if( index > 0 ) + { + parent = pstack[index - 1].space; + } + + if( xml_compact_mode ) + { + // IMPLEMENT ME + } + else + { + Space * childs_tab = parent->get_space(L"childs"); + + // get_space() can return a null pointer (it is tested below) + // so the size can be read only when the table exists + size_t len = (childs_tab ? childs_tab->table_size() : 0); + + 
if( childs_tab && childs_tab->is_table() && len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space ) + { + childs_tab->remove(len - 1); + pstack[stack_len-2].space = nullptr; + } + } + } +} + +void HTMLParser::AddSpaceToSpaceTree(const Space & space) +{ + const std::wstring * text = space.get_wstr(L"text"); + + if( out_space && stack_len > 0 && text ) + { + if( xml_compact_mode ) + { + Space * child_text = LastItem().space->get_space(L"text"); + + if( child_text ) + { + if( child_text->is_table() ) + { + child_text->add(*text); + } + else + { + Space * tab = new Space(); + tab->add(*child_text); + tab->add(*text); + LastItem().space->value.value_object[L"text"] = tab; + } + } + else + { + LastItem().space->add(L"text", *text); + } + } + else + { + Space & childs_tab = LastItem().space->get_add_space(L"childs"); + childs_tab.add(space); + } + } +} + + + + +bool HTMLParser::RemoveIfNeeded(size_t index) +{ + if( item_parsed_listener ) + { + if( !item_parsed_listener->item_parsed(pstack[index]) ) + { + return true; + } + } + + return false; +} + + + +void HTMLParser::ReadLoop() +{ + while( status == ok && ReadItem() ) + { + bool was_cdata = false; + + if( LastItem().type == Item::opening ) + { + if( parsing_html ) + { + CheckSingleItemExceptions(); + } + + CheckWhiteCharsExceptions(LastItem()); + CheckDifferentContentExceptions(LastItem()); + } + else + if( LastItem().type == Item::special ) + { + if( LastItem().is_commentary ) + ReadTextUntilClosingCommentary(); + + if( LastItem().is_cdata ) + was_cdata = true; + + PopStack(); + } + else + if( LastItem().type == Item::simple ) + { + if( stack_len > 0 ) + { + if( RemoveIfNeeded(stack_len - 1) ) + RemoveLastSpace(stack_len - 1); + } + + PopStack(); + } + else + if( LastItem().type == Item::closing ) + { + CheckClosingTags(); + } + else + { + PopStack(); + } + + if( status == ok ) + { + ReadText(was_cdata); + } + + is_first_item = false; + } +} + + +void HTMLParser::read_char_from_entity_buffer() +{ 
+ if( escaped_char_index < escaped_chars_buffer.size() ) + { + lastc = escaped_chars_buffer[escaped_char_index]; + escaped_char_index += 1; + + if( escaped_char_index >= escaped_chars_buffer.size() ) + { + escaped_chars_buffer.clear(); + escaped_char_index = 0; + } + } + else + { + lastc = -1; + } +} + + +void HTMLParser::read_xml_entity() +{ + const size_t max_entity_length = 6; // length of "&apos;" string + escaped_chars_buffer.clear(); + escaped_char_index = 0; + escaped_chars_buffer += '&'; + + do + { + read_char_no_escape(); + + if( lastc != -1 ) + { + escaped_chars_buffer += lastc; + } + } + while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' ); +} + + +// checks whether escaped_chars_buffer holds one of the five predefined xml entities, +// if so then the character the entity stands for is put to lastc, char_was_escaped is set +// and the buffer is cleared, otherwise the buffer is left untouched +// returns char_was_escaped +bool HTMLParser::check_escape_sequentions() +{ + // the buffer is filled by read_xml_entity() so it always starts with '&' + // and a complete entity ends with ';' + if( escaped_chars_buffer == L"&amp;" ) + { + lastc = '&'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&lt;" ) + { + lastc = '<'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&gt;" ) + { + lastc = '>'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&quot;" ) + { + lastc = '"'; + char_was_escaped = true; + } + else + if( escaped_chars_buffer == L"&apos;" ) + { + lastc = '\''; + char_was_escaped = true; + } + + if( char_was_escaped ) + { + escaped_chars_buffer.clear(); + escaped_char_index = 0; + } + + return char_was_escaped; +} + + + +int HTMLParser::read_char() +{ + char_was_escaped = false; + + if( escaped_char_index < escaped_chars_buffer.size() ) + { + read_char_from_entity_buffer(); + } + else + { + read_char_no_escape(); + + if( !filter_mode && lastc == '&' ) + { + read_xml_entity(); + + if( !check_escape_sequentions() ) + { + read_char_from_entity_buffer(); + } + } + } + + return lastc; +} + + + +void HTMLParser::Read() +{ + read_char(); // put first character to lastc + is_first_item = true; + + white_char_mode_tab.clear(); + white_char_mode_tab.push_back(white_mode); + + if( current_white_char_mode() != WHITE_MODE_ORIGIN ) + SkipWhiteLines(); + + // it can be some text or 
white lines before the first html tag (we print it if using filtering) + // but they are not added to the Space tree + ReadText(false); + + // reading the whole html source + ReadLoop(); + + // sometimes there can remain some html source (when there is no space on the stack) + // we print the rest html without filtering (only if safe_mode is false) + if( !PrintRest() ) + CheckStackPrintRest(); +} + + + + + +} + diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h new file mode 100644 index 0000000..15dd8db --- /dev/null +++ b/src/html/htmlparser.h @@ -0,0 +1,490 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2022, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef headerfile_picotools_html_htmlfilter +#define headerfile_picotools_html_htmlfilter + +#include +#include +#include +#include +#include "convert/baseparser.h" +#include "space/space.h" +#include "textstream/stream.h" + + +namespace pt +{ + + + +// max length of a name of a html tag (with terminating null) +#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30 + +// max length of a html lang attribute (e.g. "en", "pl") +#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10 + + +#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40 + + +#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500 + + +// depth of the html tree +#define WINIX_HTMLFILTER_STACK_MAXLEN 100 + +// length of a buffer used for printing +// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3 +#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048 + + + + +/*! + very lightweight filter for html + (without using any dynamic memory - some memory is allocated only at the beginning - in ctors) + this filter has O(n) complexity over the whole html string + + such tags as: ) are untouched + + if the filter finds that there are not closed tags it will close them, + if the filter finds a closing tag which doesn't have an opening tag - it will skip it + + tags which don't need to be closed: meta, input, br, img, link + look at CheckExceptions() method + + the filter recognizes xml simple tags (with / at the end) such as:
      +*/ +class HTMLParser : public BaseParser +{ +public: + + + /* + status of parsing + */ + enum Status { ok, cant_open_file, syntax_error }; + + + enum OrphanMode + { + orphan_nbsp, // putting " " string + orphan_160space // putting 160 ascii code + }; + + + // orphans for one language + struct Orphans + { + std::vector tab; + size_t max_len; + }; + + + struct Item + { + std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN + + enum Type + { + opening, /* sample:

      */ + closing, /* sample:

      */ + simple, /* sample:
      */ + special, /* sample: */ + none + } type; + + bool is_commentary; + + bool is_cdata; + + // is a new line before this tag (or just a new line and some white characters) + bool new_line_before; + + // is there a new line after this tag (or just some white characters and a new line) + bool new_line_after; + + // is there a new line in the middle after this tag and before the next tag + bool new_line_in_the_middle; + + // is there a white char (but not new line) before this tag + bool white_char_before; + + // current orphans table + // (will be propagated) + Orphans * porphans; + + // this item or one from its parents is a 'body' html tag + // (will be propagated) + bool has_body_tag; + + size_t tree_index; + + Space * space; + + void Clear(); + Item(); + }; + + + class ItemParsedListener + { + public: + + ItemParsedListener() {} + + virtual bool item_parsed(const Item & item) { return true; } + virtual ~ItemParsedListener() {} + + }; + + + /* + the last status of parsing, set by parse() methods + */ + Status status; + + HTMLParser(); + HTMLParser(const HTMLParser & f); + HTMLParser & operator=(const HTMLParser & f); + virtual ~HTMLParser(); + + void set_item_parsed_listener(ItemParsedListener * listener); + + + void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); + + Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + + + + // main methods used for filtering + void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true); + void filter(const std::wstring & in, std::wstring & 
out, bool clear_out_string = true); + + void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true); + + HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true); + + + /* + * + * returns a number of a last parsed line + * can be used to obtain the line in which there was a syntax error + * + */ + int get_last_parsed_line(); + + + + + const static int WHITE_MODE_ORIGIN = 0; + const static int WHITE_MODE_SINGLE_LINE = 1; + const static int WHITE_MODE_TREE = 2; + + + // white chars mode + // + void white_chars_mode(int mode); + + // if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char) + // (only between html tags and only in subtree) + // skipped in such tags: script, pre, textarea + // 0 - off + // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) + void WrapLine(size_t wrap_line_); + + // first tabs in a tree + // default: 2 (spaces) + // set 0 to turn off + void InsertTabs(size_t tabsize); + + // set a name of a html tag which will be used as 'nofilter' tag + // elements between such tags are not filtered (similarly as in
       and