From 17d2c0fb25a85ed44ed7cbe80f5554cfd8c747c4 Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Tue, 12 Oct 2021 19:53:11 +0200 Subject: [PATCH] - added some converting methods: esc_to_json(...), esc_to_xml(...), esc_to_csv() (convert/misc.h) - BaseParser: added possibility to read from TextStream and WTextStream - HTMLParser: added filter(const WTextStream & in, Stream & out, ...) method - added utf8_stream.h with one method: template size_t utf8_to_int( StreamIteratorType & iterator_in, StreamIteratorType & iterator_end, int & res, bool & correct) --- src/Makefile.dep | 44 ++++-- src/convert/baseparser.cpp | 104 ++++++++++++-- src/convert/baseparser.h | 39 +++-- src/convert/misc.cpp | 285 ++++++++++++++++++++++++++++++++++++- src/convert/misc.h | 137 +++++++++++++++++- src/csv/csvparser.cpp | 20 +-- src/html/htmlparser.cpp | 130 ++++++++++------- src/html/htmlparser.h | 17 ++- src/space/spaceparser.cpp | 26 ++-- src/utf8/utf8.h | 10 +- src/utf8/utf8_stream.h | 104 ++++++++++++++ src/utf8/utf8_templates.h | 9 +- tests/Makefile.dep | 10 +- 13 files changed, 807 insertions(+), 128 deletions(-) create mode 100644 src/utf8/utf8_stream.h diff --git a/src/Makefile.dep b/src/Makefile.dep index a2e4d54..7dbbb8e 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -1,16 +1,24 @@ # DO NOT DELETE ./convert/inttostr.o: ./convert/inttostr.h -./convert/misc.o: ./convert/misc.h ./convert/text.h +./convert/misc.o: ./convert/misc.h ./convert/text.h textstream/stream.h +./convert/misc.o: textstream/types.h utf8/utf8_stream.h +./convert/misc.o: textstream/textstream.h textstream/stream.h space/space.h +./convert/misc.o: convert/inttostr.h utf8/utf8.h utf8/utf8_templates.h +./convert/misc.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./convert/misc.o: textstream/types.h ./convert/inttostr.h ./convert/text.o: ./convert/text.h ./convert/text_private.h ./convert/double.o: ./convert/double.h textstream/textstream.h ./convert/double.o: textstream/stream.h space/space.h 
textstream/types.h ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h ./convert/double.o: membuffer/membuffer.h textstream/types.h -./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h -./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h -./convert/baseparser.o: utf8/utf8_private.h +./convert/baseparser.o: ./convert/baseparser.h textstream/textstream.h +./convert/baseparser.o: textstream/stream.h space/space.h textstream/types.h +./convert/baseparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h +./convert/baseparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h +./convert/baseparser.o: membuffer/membuffer.h textstream/types.h +./convert/baseparser.o: utf8/utf8_stream.h ./date/date.o: ./date/date.h convert/inttostr.h ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h @@ -28,29 +36,39 @@ ./space/space.o: convert/patternreplacer.h textstream/textstream.h ./space/space.o: textstream/stream.h space/space.h date/date.h ./space/space.o: membuffer/membuffer.h textstream/types.h convert/strtoint.h -./space/space.o: ./convert/text.h ./convert/misc.h ./convert/double.h +./space/space.o: ./convert/text.h ./convert/misc.h utf8/utf8_stream.h +./space/space.o: ./convert/double.h ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h ./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h -./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h +./space/spaceparser.o: textstream/textstream.h textstream/stream.h +./space/spaceparser.o: space/space.h date/date.h membuffer/membuffer.h +./space/spaceparser.o: textstream/types.h convert/strtoint.h ./convert/text.h +./space/spaceparser.o: 
./convert/misc.h utf8/utf8_stream.h ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h ./utf8/utf8.o: utf8/utf8_private.h ./utf8/utf8_private.o: utf8/utf8_private.h ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h -./csv/csvparser.o: convert/baseparser.h +./csv/csvparser.o: convert/baseparser.h textstream/textstream.h +./csv/csvparser.o: textstream/stream.h date/date.h membuffer/membuffer.h +./csv/csvparser.o: textstream/types.h ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h -./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h space/space.h -./html/htmlparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h -./html/htmlparser.o: textstream/stream.h utf8/utf8_templates.h -./html/htmlparser.o: utf8/utf8_private.h convert/text.h +./html/htmlparser.o: ./html/htmlparser.h convert/baseparser.h +./html/htmlparser.o: textstream/textstream.h textstream/stream.h +./html/htmlparser.o: space/space.h textstream/types.h convert/inttostr.h +./html/htmlparser.o: utf8/utf8.h textstream/stream.h utf8/utf8_templates.h +./html/htmlparser.o: utf8/utf8_private.h date/date.h membuffer/membuffer.h +./html/htmlparser.o: textstream/types.h convert/text.h ./html/bbcodeparser.o: ./html/bbcodeparser.h ./html/htmlparser.h -./html/bbcodeparser.o: convert/baseparser.h space/space.h textstream/types.h +./html/bbcodeparser.o: convert/baseparser.h textstream/textstream.h +./html/bbcodeparser.o: textstream/stream.h space/space.h textstream/types.h ./html/bbcodeparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h 
-./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h +./html/bbcodeparser.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h +./html/bbcodeparser.o: membuffer/membuffer.h textstream/types.h diff --git a/src/convert/baseparser.cpp b/src/convert/baseparser.cpp index b95933d..37fbbbf 100644 --- a/src/convert/baseparser.cpp +++ b/src/convert/baseparser.cpp @@ -37,7 +37,7 @@ #include "baseparser.h" #include "utf8/utf8.h" - +#include "utf8/utf8_stream.h" namespace pt @@ -45,19 +45,27 @@ namespace pt BaseParser::BaseParser() { - clear(); + clear_input_flags(); } -void BaseParser::clear() +void BaseParser::clear_input_flags() { line = 0; reading_from_file = false; pchar_ascii = nullptr; pchar_unicode = nullptr; - reading_from_wchar_string = false; + wtext_stream_iterator = nullptr; + wtext_stream_iterator_end = nullptr; + text_stream_iterator = nullptr; + text_stream_iterator_end = nullptr; lastc = -1; input_as_utf8 = true; + + if( file.is_open() ) + file.close(); + + file.clear(); } @@ -132,7 +140,6 @@ bool correct; ++line; return lastc; - } @@ -150,6 +157,67 @@ return lastc; } +int BaseParser::read_char_from_wtext_stream() +{ + if( (*wtext_stream_iterator) != (*wtext_stream_iterator_end) ) + { + lastc = *(*wtext_stream_iterator); + ++(*wtext_stream_iterator); + } + else + { + lastc = -1; + } + + if( lastc == '\n' ) + ++line; + + return lastc; +} + + +int BaseParser::read_char_from_utf8_text_stream() +{ + int c; + bool correct; + + lastc = -1; + + do + { + utf8_to_int(*text_stream_iterator, *text_stream_iterator_end, c, correct); + } + while( !correct && (*text_stream_iterator) != (*text_stream_iterator_end) ); + + if( correct ) + lastc = c; + + if( lastc == '\n' ) + ++line; + + return lastc; +} + + +int BaseParser::read_char_from_ascii_text_stream() +{ + if( (*text_stream_iterator) != (*text_stream_iterator_end) ) + { + lastc = *(*text_stream_iterator); + ++(*text_stream_iterator); + } + else + { + lastc = -1; + } + + if( lastc == '\n' ) + ++line; + + 
return lastc; +} + + int BaseParser::read_char_no_escape() { if( reading_from_file ) @@ -161,17 +229,33 @@ int BaseParser::read_char_no_escape() } else { - if( reading_from_wchar_string ) - { - return read_char_from_wchar_string(); - } - else + if( pchar_ascii ) { if( input_as_utf8 ) return read_char_from_utf8_string(); else return read_char_from_ascii_string(); } + else if( pchar_unicode ) + { + return read_char_from_wchar_string(); + } + else if( wtext_stream_iterator && wtext_stream_iterator_end ) + { + return read_char_from_wtext_stream(); + } + else if( text_stream_iterator && text_stream_iterator_end ) + { + if( input_as_utf8 ) + return read_char_from_utf8_text_stream(); + else + return read_char_from_ascii_text_stream(); + } + else + { + lastc = -1; + return lastc; + } } } diff --git a/src/convert/baseparser.h b/src/convert/baseparser.h index 381568f..a8c648d 100644 --- a/src/convert/baseparser.h +++ b/src/convert/baseparser.h @@ -40,6 +40,7 @@ #include #include +#include "textstream/textstream.h" namespace pt @@ -51,15 +52,18 @@ protected: BaseParser(); - void clear(); + virtual void clear_input_flags(); - int read_utf8_char(); - int read_ascii_char(); - int read_char_from_wchar_string(); - int read_char_from_utf8_string(); - int read_char_from_ascii_string(); - int read_char_no_escape(); - int read_char(); + virtual int read_utf8_char(); + virtual int read_ascii_char(); + virtual int read_char_from_wchar_string(); + virtual int read_char_from_utf8_string(); + virtual int read_char_from_ascii_string(); + virtual int read_char_from_wtext_stream(); + virtual int read_char_from_utf8_text_stream(); + virtual int read_char_from_ascii_text_stream(); + virtual int read_char_no_escape(); + virtual int read_char(); @@ -75,6 +79,7 @@ protected: */ bool reading_from_file; + /* pointers to the current character if ParseString() is in used @@ -84,9 +89,20 @@ protected: /* - true if ParseString(wchar_t *) or ParseString(std::wstring&) was called - */ - bool 
reading_from_wchar_string; + pointers to WTextStream iterators + if set then both of them should be set + */ + WTextStream::const_iterator * wtext_stream_iterator; + WTextStream::const_iterator * wtext_stream_iterator_end; + + + /* + pointers to TextStream iterators + if set then both of them should be set + */ + TextStream::const_iterator * text_stream_iterator; + TextStream::const_iterator * text_stream_iterator_end; + /* last read char @@ -112,7 +128,6 @@ protected: - }; } diff --git a/src/convert/misc.cpp b/src/convert/misc.cpp index 978cce7..ffdf457 100644 --- a/src/convert/misc.cpp +++ b/src/convert/misc.cpp @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017, Tomasz Sowa + * Copyright (c) 2017-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,8 @@ */ #include "misc.h" +#include "inttostr.h" +#include "utf8/utf8.h" namespace pt @@ -49,6 +51,287 @@ void SetOverflow(bool * was_overflow, bool val) } +void esc_to_json(char val, Stream & out) +{ + // the two-character escapes are tested first: \r \n \t \b \f are control + // characters (< 32) and would otherwise never get past the \uXXXX branch + switch( val ) + { + case '\r': out << '\\'; out << 'r'; break; + case '\n': out << '\\'; out << 'n'; break; + case '\t': out << '\\'; out << 't'; break; + case 0x08: out << '\\'; out << 'b'; break; + case 0x0c: out << '\\'; out << 'f'; break; + case '\\': out << '\\'; out << '\\'; break; + case '"': out << '\\'; out << '\"'; break; + default: + if( (unsigned char)val < 32 ) + { + // remaining control characters (zero included) are written as \u00XX + char buf[10]; + size_t len; + Toa((unsigned char)val, buf, sizeof(buf)/sizeof(char), 16, &len); + + out << "\\u"; + + if( len < 4 ) + { + for(size_t i=0 ; i < (4-len) ; ++i) + { + out << '0'; + } + } + + out << buf; + } + else + { + out << val; + } + break; + } +}
+ + +void esc_to_json(wchar_t val, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + + size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); + + for(size_t a = 0 ; a < len ; ++a) + { + esc_to_json(utf8_buf[a], out); + } +} + + +void esc_to_json(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const wchar_t * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_json(c[i], out); + } +} + + +void esc_to_json(const std::string & in, Stream & out) +{ + esc_to_json(in.c_str(), in.size(), out); +} + + +void esc_to_json(const std::wstring & in, Stream & out) +{ + esc_to_json(in.c_str(), in.size(), out); +} + + + + + + +void esc_to_xml(char val, Stream & out) +{ + switch(val) + { + case '<': + out << "&lt;"; + break; + + case '>': + out << "&gt;"; + break; + + case '&': + out << "&amp;"; + break; + + case '"': + out << "&quot;"; + break; + + default: + out << val; + break; + + // what about zero (null) character?
+ } +} + +void esc_to_xml(wchar_t val, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + + size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); + + for(size_t a = 0 ; a < len ; ++a) + { + esc_to_xml(utf8_buf[a], out); + } +} + + +void esc_to_xml(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const wchar_t * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_xml(c[i], out); + } +} + + +void esc_to_xml(const std::string & in, Stream & out) +{ + esc_to_xml(in.c_str(), in.size(), out); +} + + +void esc_to_xml(const std::wstring & in, Stream & out) +{ + esc_to_xml(in.c_str(), in.size(), out); +} + + + + + +void esc_to_csv(char c, pt::Stream & out) +{ + switch(c) + { + case '"': + out << "\"\""; + break; + + default: + out << c; + break; + + // what about zero (null) character? 
+ } +} + + +void esc_to_csv(wchar_t val, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + + size_t len = int_to_utf8(static_cast(val), utf8_buf, utf8_buf_len); + + for(size_t a = 0 ; a < len ; ++a) + { + esc_to_csv(utf8_buf[a], out); + } +} + + + +void esc_to_csv(const char * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const char * c, std::size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const wchar_t * c, pt::Stream & out) +{ + for(size_t i = 0 ; c[i] != 0 ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out) +{ + for(size_t i = 0 ; i < len ; ++i) + { + esc_to_csv(c[i], out); + } +} + + +void esc_to_csv(const std::string & in, Stream & out) +{ + esc_to_csv(in.c_str(), in.size(), out); +} + + +void esc_to_csv(const std::wstring & in, Stream & out) +{ + esc_to_csv(in.c_str(), in.size(), out); +} + + } diff --git a/src/convert/misc.h b/src/convert/misc.h index 7dbb128..51f4159 100644 --- a/src/convert/misc.h +++ b/src/convert/misc.h @@ -5,7 +5,7 @@ */ /* - * Copyright (c) 2017, Tomasz Sowa + * Copyright (c) 2017-2021, Tomasz Sowa * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -40,6 +40,9 @@ #include #include "text.h" +#include "textstream/stream.h" +#include "textstream/types.h" +#include "utf8/utf8_stream.h" namespace pt @@ -47,6 +50,138 @@ namespace pt void SetOverflow(bool * was_overflow, bool val); +void esc_to_json(char val, Stream & out); +void esc_to_json(wchar_t val, Stream & out); +void esc_to_json(const char * c, pt::Stream & out); +void esc_to_json(const char * c, std::size_t len, Stream & out); +void esc_to_json(const wchar_t * c, Stream & out); +void esc_to_json(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_json(const std::string & in, Stream & out); +void esc_to_json(const std::wstring & in, Stream & out); + +void esc_to_xml(char c, pt::Stream & out); +void esc_to_xml(wchar_t c, pt::Stream & out); +void esc_to_xml(const char * c, pt::Stream & out); +void esc_to_xml(const char * c, std::size_t len, pt::Stream & out); +void esc_to_xml(const wchar_t * c, pt::Stream & out); +void esc_to_xml(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_xml(const std::string & in, Stream & out); +void esc_to_xml(const std::wstring & in, Stream & out); + +void esc_to_csv(char c, pt::Stream & out); +void esc_to_csv(wchar_t val, Stream & out); +void esc_to_csv(const char * c, pt::Stream & out); +void esc_to_csv(const char * c, std::size_t len, pt::Stream & out); +void esc_to_csv(const wchar_t * c, pt::Stream & out); +void esc_to_csv(const wchar_t * c, size_t len, pt::Stream & out); +void esc_to_csv(const std::string & in, Stream & out); +void esc_to_csv(const std::wstring & in, Stream & out); + + + +template +void esc_to_json(const StreamType & in, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + int res; + bool correct; + + for( ; i != end ; ) // no ++i here: utf8_to_int() below advances the iterator by itself + { + if(
in.is_wchar_stream() && out.is_char_stream() ) + { + std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); + esc_to_json(utf8_buf, len, out); + ++i; + } + else + if( in.is_char_stream() && out.is_wchar_stream() ) + { + utf8_to_int(i, end, res, correct); + if( correct ) + esc_to_json(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) + // put replacement char if not correct? + } + else + { + esc_to_json(static_cast(*i), out); + ++i; + } + } +} + + +template +void esc_to_xml(const StreamType & in, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + int res; + bool correct; + + for( ; i != end ; ) // no ++i here: utf8_to_int() below advances the iterator by itself + { + if( in.is_wchar_stream() && out.is_char_stream() ) + { + std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); + esc_to_xml(utf8_buf, len, out); + ++i; + } + else + if( in.is_char_stream() && out.is_wchar_stream() ) + { + utf8_to_int(i, end, res, correct); + if( correct ) + esc_to_xml(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) + // put replacement char if not correct?
+ } + else + { + esc_to_xml(static_cast(*i), out); + ++i; + } + } +} + + +template +void esc_to_csv(const StreamType & in, Stream & out) +{ + char utf8_buf[10]; + std::size_t utf8_buf_len = sizeof(utf8_buf) / sizeof(char); + typename StreamType::const_iterator i = in.begin(); + typename StreamType::const_iterator end = in.end(); + int res; + bool correct; + + for( ; i != end ; ) // no ++i here: utf8_to_int() below advances the iterator by itself + { + if( in.is_wchar_stream() && out.is_char_stream() ) + { + std::size_t len = int_to_utf8(static_cast(*i), utf8_buf, utf8_buf_len); + esc_to_csv(utf8_buf, len, out); + ++i; + } + else + if( in.is_char_stream() && out.is_wchar_stream() ) + { + utf8_to_int(i, end, res, correct); + if( correct ) + esc_to_csv(static_cast(res), out); // IMPROVEME no surrogate pair used here (if sizeof(wchar_t) == 2) + // put replacement char if not correct? + } + else + { + esc_to_csv(static_cast(*i), out); + ++i; + } + } +} } diff --git a/src/csv/csvparser.cpp b/src/csv/csvparser.cpp index 583eee3..0a83e92 100644 --- a/src/csv/csvparser.cpp +++ b/src/csv/csvparser.cpp @@ -53,6 +53,8 @@ CSVParser::CSVParser() CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space) { + clear_input_flags(); + reading_from_file = true; space = &out_space; @@ -103,11 +105,10 @@ CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & CSVParser::Status CSVParser::parse(const char * str, Space & out_space) { - reading_from_file = false; - reading_from_wchar_string = false; - pchar_ascii = str; - pchar_unicode = 0; - space = &out_space; + clear_input_flags(); + + pchar_ascii = str; + space = &out_space; parse(); @@ -124,11 +125,10 @@ CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space) CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space) { - reading_from_file = false; - reading_from_wchar_string = true; - pchar_unicode = str; - pchar_ascii = 0; - space = &out_space; + clear_input_flags(); + + pchar_unicode = str; + space = &out_space; parse(); diff
--git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index e35f181..9b61b1d 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -48,6 +48,24 @@ const int HTMLParser::WHITE_MODE_TREE; +void HTMLParser::clear_input_flags() +{ + BaseParser::clear_input_flags(); + + parsing_html = true; + xml_compact_mode = true; + status = ok; + line = 1; + stack_len = 0; + out_string = nullptr; + out_stream = nullptr; + out_space = nullptr; + line_len = 0; +} + + + + void HTMLParser::Item::Clear() { name.clear(); @@ -71,21 +89,11 @@ HTMLParser::Item::Item() void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode) { - parsing_html = true; - reading_from_file = false; - reading_from_wchar_string = true; - pchar_unicode = in; - pchar_ascii = 0; - xml_compact_mode = compact_mode; + clear_input_flags(); - status = ok; - line = 1; - - stack_len = 0; - out_string = nullptr; - out_space = &space; - //last_new_line = false; - line_len = 0; + pchar_unicode = in; + xml_compact_mode = compact_mode; + out_space = &space; out_space->clear(); Init(); @@ -96,16 +104,11 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) { + clear_input_flags(); + parsing_html = false; reading_from_file = true; - xml_compact_mode = compact_mode; - - status = ok; - line = 1; - stack_len = 0; - out_string = nullptr; - line_len = 0; - + xml_compact_mode = compact_mode; this->out_space = &out_space; if( clear_space ) @@ -153,20 +156,15 @@ HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Sp -void HTMLParser::Filter(const wchar_t * in, std::wstring & out) +void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string) { - parsing_html = true; - reading_from_file = false; - reading_from_wchar_string = true; - pchar_unicode = in; - pchar_ascii = 0; + clear_input_flags(); 
- stack_len = 0; + pchar_unicode = in; out_string = &out; - out_space = nullptr; - //last_new_line = false; - line_len = 0; - out_string->clear(); + + if( clear_out_string ) + out_string->clear(); Init(); Read(); @@ -174,7 +172,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out) } -void HTMLParser::Filter(const std::wstring & in, std::wstring & out) +void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string) { if( &in == &out ) { @@ -187,27 +185,45 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out) if( out.capacity() < out_projected_len ) out.reserve(out_projected_len); - Filter(in.c_str(), out); + filter(in.c_str(), out, clear_out_string); } - -HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out) +void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream) { - parsing_html = true; + clear_input_flags(); + + WTextStream::const_iterator begin = in.begin(); + WTextStream::const_iterator end = in.end(); + + wtext_stream_iterator = &begin; + wtext_stream_iterator_end = &end; + + out_stream = &out; + + if( clear_out_stream ) + out_stream->clear(); + + Init(); + Read(); + Uninit(); +} + + +HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream) +{ + clear_input_flags(); + reading_from_file = true; // open the file before clearing 'out' string, 'out' string can be the same string as the file_name file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); - status = ok; - line = 1; - stack_len = 0; - out_string = &out; - out_space = nullptr; - line_len = 0; - out_string->clear(); + out_string = &out; + + if( clear_out_stream ) + out_string->clear(); if( file ) { @@ -226,24 +242,24 @@ HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring } -HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out) 
+HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream) { - return filter_file(file_name.c_str(), out); + return filter_file(file_name.c_str(), out, clear_out_stream); } -HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out) +HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream) { std::string file_name_utf8; pt::wide_to_utf8(file_name, file_name_utf8); - return filter_file(file_name_utf8, out); + return filter_file(file_name_utf8, out, clear_out_stream); } -HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out) +HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream) { - return filter_file(file_name.c_str(), out); + return filter_file(file_name.c_str(), out, clear_out_stream); } @@ -792,6 +808,9 @@ void HTMLParser::Put(wchar_t c) if( out_string ) (*out_string) += c; + if( out_stream ) + (*out_stream) << c; + CheckChar(c); } @@ -806,6 +825,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end) if( out_string ) out_string->append(str, len); + if( out_stream ) + out_stream->write(str, len); + for( ; str < end ; ++str) CheckChar(*str); } @@ -819,6 +841,9 @@ void HTMLParser::Put(const std::wstring & str) if( out_string ) out_string->append(str); + if( out_stream ) + out_stream->write(str.c_str(), str.size()); + for(size_t i=0 ; i < str.size() ; ++i) CheckChar(str[i]); } @@ -1130,6 +1155,9 @@ void HTMLParser::PutTabs(size_t len) { if( out_string ) (*out_string) += ' '; // we do not add them to 'line_len' + + if( out_stream ) + (*out_stream) << ' '; } } diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index 50df603..caf5cf1 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -44,6 +44,7 @@ #include #include "convert/baseparser.h" #include "space/space.h" +#include 
"textstream/stream.h" namespace pt @@ -130,14 +131,15 @@ public: // main methods used for filtering - void Filter(const wchar_t * in, std::wstring & out); - void Filter(const std::wstring & in, std::wstring & out); + void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true); + void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true); + void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true); - HTMLParser::Status filter_file(const char * file_name, std::wstring & out); - HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out); - HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out); - HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out); + HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true); + HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true); /* @@ -278,7 +280,7 @@ protected: - + void clear_input_flags(); /* @@ -403,6 +405,7 @@ protected: size_t stack_len; // length of the stack wchar_t * buffer; // buffer used when printing std::wstring * out_string; + Stream * out_stream; Space * out_space; Space text_space_tmp; diff --git a/src/space/spaceparser.cpp b/src/space/spaceparser.cpp index 9c334a7..46f0aa4 100644 --- a/src/space/spaceparser.cpp +++ b/src/space/spaceparser.cpp @@ -74,11 +74,12 @@ int SpaceParser::get_last_parsed_line() SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space) { + clear_input_flags(); + reading_from_file = true; parsing_space = false; root_space = &out_space; - file.clear(); file.open(file_name, 
std::ios_base::binary | std::ios_base::in); if( file ) @@ -125,11 +126,12 @@ SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name, SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space) { + clear_input_flags(); + reading_from_file = true; parsing_space = true; root_space = &out_space; - file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) @@ -174,10 +176,9 @@ SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space) { - reading_from_file = false; - reading_from_wchar_string = false; + clear_input_flags(); + pchar_ascii = str; - pchar_unicode = 0; parsing_space = false; root_space = &out_space; @@ -195,10 +196,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space) { - reading_from_file = false; - reading_from_wchar_string = true; + clear_input_flags(); + pchar_unicode = str; - pchar_ascii = 0; parsing_space = false; root_space = &out_space; @@ -219,10 +219,9 @@ SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & ou SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space) { - reading_from_file = false; - reading_from_wchar_string = false; + clear_input_flags(); + pchar_ascii = str; - pchar_unicode = 0; parsing_space = true; root_space = &out_space; @@ -240,10 +239,9 @@ SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & ou SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space) { - reading_from_file = false; - reading_from_wchar_string = true; + clear_input_flags(); + pchar_unicode = str; - pchar_ascii = 0; parsing_space = true; root_space = &out_space; diff 
--git a/src/utf8/utf8.h b/src/utf8/utf8.h index bdf28f3..4857184 100644 --- a/src/utf8/utf8.h +++ b/src/utf8/utf8.h @@ -45,6 +45,12 @@ namespace pt { +/* + * public methods are also defined in utf8_stream.h + * + */ + + /*! UTF-8, a transformation format of ISO 10646 http://tools.ietf.org/html/rfc3629 @@ -213,9 +219,7 @@ template bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear = true, int mode = 1); template -void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode = 1); // not tested, IMPROVE ME add clear parameter, mode parameter is not used - - +void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear = true, int mode = 1); // not tested, IMPROVE ME mode parameter is not used } // namespace diff --git a/src/utf8/utf8_stream.h b/src/utf8/utf8_stream.h new file mode 100644 index 0000000..3adf848 --- /dev/null +++ b/src/utf8/utf8_stream.h @@ -0,0 +1,104 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef headerfile_picotools_utf8_utf8_stream +#define headerfile_picotools_utf8_utf8_stream + +#include "textstream/textstream.h" + +namespace pt +{ + + +/*! + this function converts one UTF-8 character into one wide-character + + input: + iterator_in - an TextStream iterator for reading from + iterator_end - an end iterator (can be returned by end() method from TextStream) + + output: + res - an output character + correct - true if it is a correct character + + the function returns how many characters have been used from the input stream +*/ +template +size_t utf8_to_int( + StreamIteratorType & iterator_in, + StreamIteratorType & iterator_end, + int & res, + bool & correct) +{ +size_t i, len; +unsigned char uz; + + res = 0; + correct = false; + + if( iterator_in == iterator_end ) + return 0; + + uz = *iterator_in; + ++iterator_in; + + if( !private_namespace::utf8_to_int_first_octet(uz, len, res) ) + return 1; + + for(i=1 ; i void int_to_wide(int c, StreamType & res) { @@ -65,6 +66,7 @@ void int_to_wide(int c, StreamType & res) + /*! 
converting UTF-8 string to a TextStreamBase stream (need to be tested) @@ -376,8 +378,11 @@ bool wide_stream_to_utf8(const Stream & stream, StreamType & utf8, bool clear, i // not tested template -void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode) +void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, bool clear, int mode) { + if( clear ) + utf8.clear(); + private_namespace::wide_to_utf8_generic(buffer, mode, [&utf8](const char * utf8_buffer, std::size_t buffer_len){ utf8.write(utf8_buffer, buffer_len); }); @@ -385,8 +390,6 @@ void wide_stream_to_utf8(StreamTypeIn & buffer, StreamTypeOut & utf8, int mode) - - } // namespace pt #endif diff --git a/tests/Makefile.dep b/tests/Makefile.dep index e83e777..a9228ca 100644 --- a/tests/Makefile.dep +++ b/tests/Makefile.dep @@ -9,12 +9,15 @@ ./convert.o: ../src/utf8/utf8_private.h ../src/date/date.h ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h -./convert.o: ../src/convert/misc.h ../src/convert/double.h +./convert.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h +./convert.o: ../src/convert/double.h ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h ./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h -./csvparser.o: ../src/convert/baseparser.h test.h +./csvparser.o: ../src/convert/baseparser.h ../src/textstream/textstream.h +./csvparser.o: ../src/textstream/stream.h ../src/date/date.h +./csvparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h test.h ./main.o: convert.h mainoptionsparser.h csvparser.h ./test.o: test.h ./mainoptionsparser.o: mainoptionsparser.h test.h @@ -30,4 +33,5 @@ ./mainoptionsparser.o: ../src/textstream/stream.h ../src/date/date.h ./mainoptionsparser.o: ../src/membuffer/membuffer.h 
../src/textstream/types.h ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h -./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h +./mainoptionsparser.o: ../src/convert/misc.h ../src/utf8/utf8_stream.h +./mainoptionsparser.o: ../src/convert/double.h