added a base class for parsers: BaseParser (convert/baseparser.h|cpp)

It contains the methods for reading from strings/files; those methods were moved there from SpaceParser and CSVParser. Fixed: CSVParser didn't set the input_as_utf8 flag.
2021-07-17 14:38:22 +02:00
parent 2a3f43c5c3
commit 7ce07c57f5
8 changed files with 329 additions and 363 deletions
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -8,6 +8,9 @@
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
 ./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
 ./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
 ./convert/baseparser.o: utf8/utf8_private.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -29,14 +32,15 @@
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
-./space/spaceparser.o: utf8/utf8_private.h convert/strtoint.h
+./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
-./space/spaceparser.o: ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
 ./csv/csvparser.o: convert/baseparser.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
--- a/src/convert/baseparser.cpp
+++ b/src/convert/baseparser.cpp
@@ -0,0 +1,188 @@
 /*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */
 /*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include "baseparser.h"
 #include "utf8/utf8.h"
 namespace pt
 {
 /*
  * creates the parser in its initial state,
  * the whole initialization is delegated to clear()
  */
 BaseParser::BaseParser()
 {
 	this->clear();
 }
 /*
  * resets the reader to its initial state:
  * no input source selected, line counter at zero,
  * no character read yet (lastc is -1) and UTF-8 input assumed
  *
  * the 'file' stream is not touched here
  */
 void BaseParser::clear()
 {
 	/* input source selection */
 	reading_from_file = false;
 	reading_from_wchar_string = false;
 	pchar_ascii = nullptr;
 	pchar_unicode = nullptr;
 	/* reading state */
 	lastc = -1;
 	line = 0;
 	/* input encoding */
 	input_as_utf8 = true;
 }
 /*
  * reads one UTF-8 encoded character from 'file'
  *
  * sequences reported as incorrect by utf8_to_int() are skipped,
  * the reading is repeated until a correct character is found
  * or the stream goes into a failed state
  *
  * returns the character (also stored in lastc)
  * or -1 if the stream failed before a correct character was read;
  * the line counter is incremented when a new line character was read
  */
 int BaseParser::read_utf8_char()
 {
 	int code_point = 0;
 	bool is_valid = false;
 	lastc = -1;
 	for(;;)
 	{
 		utf8_to_int(file, code_point, is_valid);
 		/* nothing more can be read, lastc is still -1 here */
 		if( file.fail() )
 			return lastc;
 		if( is_valid )
 			break;
 	}
 	lastc = code_point;
 	if( code_point == '\n' )
 		++line;
 	return lastc;
 }
 /*
  * reads one byte from 'file' without any decoding
  *
  * returns the value given by file.get() (also stored in lastc),
  * the line counter is incremented when a new line character was read
  */
 int BaseParser::read_ascii_char()
 {
 	const int ch = file.get();
 	if( ch == '\n' )
 		++line;
 	lastc = ch;
 	return ch;
 }
 /*
  * reads one character from the wide string pointed to by pchar_unicode
  *
  * returns the character (also stored in lastc) and moves the pointer forward
  * or returns -1 if the terminating zero was reached (the pointer is not moved then);
  * the line counter is incremented when a new line character was read
  */
 int BaseParser::read_char_from_wchar_string()
 {
 	const wchar_t current = *pchar_unicode;
 	if( current != 0 )
 	{
 		lastc = current;
 		++pchar_unicode;
 	}
 	else
 	{
 		lastc = -1;
 	}
 	if( lastc == '\n' )
 		++line;
 	return lastc;
 }
 /*
  * reads one UTF-8 encoded character from the string pointed to by pchar_ascii
  *
  * the pointer is moved forward by the number of bytes returned by utf8_to_int(),
  * incorrect sequences are skipped until a correct character is decoded
  * or the terminating zero is reached
  *
  * returns the character (also stored in lastc)
  * or -1 if no correct character was decoded;
  * the line counter is incremented when a new line character was read
  */
 int BaseParser::read_char_from_utf8_string()
 {
 	int decoded = 0;
 	bool is_valid = false;
 	lastc = -1;
 	for(;;)
 	{
 		const size_t consumed = utf8_to_int(pchar_ascii, decoded, is_valid);
 		pchar_ascii += consumed;
 		if( is_valid || *pchar_ascii == 0 )
 			break;
 	}
 	if( is_valid )
 	{
 		lastc = decoded;
 		if( decoded == '\n' )
 			++line;
 	}
 	return lastc;
 }
 /*
  * reads one byte from the string pointed to by pchar_ascii without any decoding
  *
  * returns the byte as a value from the range 1..255 (also stored in lastc)
  * and moves the pointer forward
  * or returns -1 if the terminating zero was reached (the pointer is not moved then);
  * the line counter is incremented when a new line character was read
  *
  * the byte is converted through unsigned char: where plain char is signed
  * a byte >= 0x80 would otherwise become a negative number and 0xFF would
  * become -1, which is the value reserved for the end of the input;
  * this also makes the result consistent with read_ascii_char()
  * where file.get() gives values from 0 to 255
  */
 int BaseParser::read_char_from_ascii_string()
 {
 	const unsigned char current = static_cast<unsigned char>(*pchar_ascii);
 	if( current == 0 )
 	{
 		lastc = -1;
 	}
 	else
 	{
 		lastc = current;
 		++pchar_ascii;
 	}
 	if( lastc == '\n' )
 		++line;
 	return lastc;
 }
 /*
  * reads one character from the current input source
  *
  * the reader is chosen in this order:
  *  - a file: UTF-8 or raw bytes, depending on input_as_utf8
  *  - a wide string
  *  - a char string: UTF-8 or raw bytes, depending on input_as_utf8
  *
  * returns whatever the chosen reader returns
  */
 int BaseParser::read_char_no_escape()
 {
 	if( reading_from_file )
 		return input_as_utf8 ? read_utf8_char() : read_ascii_char();
 	if( reading_from_wchar_string )
 		return read_char_from_wchar_string();
 	return input_as_utf8 ? read_char_from_utf8_string() : read_char_from_ascii_string();
 }
 /*
  * reads one character from the current input source,
  * at the moment it only forwards to read_char_no_escape()
  */
 int BaseParser::read_char()
 {
 	const int ch = read_char_no_escape();
 	return ch;
 }
 }
--- a/src/convert/baseparser.h
+++ b/src/convert/baseparser.h
@@ -0,0 +1,120 @@
 /*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */
 /*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef headerfile_picotools_convert_baseparser
 #define headerfile_picotools_convert_baseparser
 #include <string>
 #include <fstream>
 namespace pt
 {
 /*
  * a common base for parsers which read their input character by character
  * either from a file or from a string (char or wchar_t based)
  *
  * all members are protected, the class is meant to be inherited from
  */
 class BaseParser
 {
 protected:
 	/*
 		the constructor only calls clear()
 	*/
 	BaseParser();
 	/*
 		sets all the fields below (except 'file') to their initial values
 	*/
 	void clear();
 	/*
 		readers for a particular input source,
 		each of them reads one character, stores it in lastc,
 		increments 'line' if the character is a new line and returns lastc

 		read_utf8_char()              - from 'file', decoding UTF-8
 		read_ascii_char()             - from 'file', byte by byte
 		read_char_from_wchar_string() - from pchar_unicode
 		read_char_from_utf8_string()  - from pchar_ascii, decoding UTF-8
 		read_char_from_ascii_string() - from pchar_ascii, byte by byte
 	*/
 	int read_utf8_char();
 	int read_ascii_char();
 	int read_char_from_wchar_string();
 	int read_char_from_utf8_string();
 	int read_char_from_ascii_string();
 	/*
 		reads one character with the reader selected by
 		reading_from_file, reading_from_wchar_string and input_as_utf8
 	*/
 	int read_char_no_escape();
 	/*
 		at the moment the same as read_char_no_escape()
 	*/
 	int read_char();
 	/*
 		the line counter: set to zero by clear() and incremented
 		each time a new line character is read
 		(presumably used by parsers to report the line of a syntax error)
 	*/
 	int line;
 	/*
 		true if the input is read from 'file'
 		false if the input is read from a string
 	*/
 	bool reading_from_file;
 	/*
 		pointers to the current character
 		if the input is read from a string
 	*/
 	const char    * pchar_ascii;
 	const wchar_t * pchar_unicode;
 	/*
 		true if the input string is a wide string (pchar_unicode is used)
 		false if it is a char string (pchar_ascii is used),
 		only meaningful when reading_from_file is false
 	*/
 	bool reading_from_wchar_string;
 	/*
 		last read char
 		or -1 if the end of the input was reached
 	*/
 	int lastc;
 	/*
 		current file
 		maybe it would be better to make it a pointer?
 		if we parse only a string then there is no sense in having such an object
 	*/
 	std::ifstream file;
 	/*
 		true if the input (a file or a char string) is encoded in UTF-8
 		default: true
 	*/
 	bool input_as_utf8;
 };
 }
 #endif
--- a/src/csv/csvparser.cpp
+++ b/src/csv/csvparser.cpp
@@ -44,6 +44,11 @@ namespace pt
 {
 /*
  * sets the input_as_utf8 flag to true,
  * so the input is treated as UTF-8 unless the flag is changed later
  */
 CSVParser::CSVParser()
 {
 	input_as_utf8    = true;
 }
 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
@@ -285,132 +290,6 @@ bool CSVParser::read_non_quoted_value_to(std::wstring & value)
 int CSVParser::read_utf8_char()
 {
 int c;
 bool correct;
 	lastc = -1;
 	do
 	{
 		utf8_to_int(file, c, correct);
 		if( !file )
 			return lastc;
 	}
 	while( !correct );
 	lastc = c;
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int CSVParser::read_ascii_char()
 {
 	lastc = file.get();
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int CSVParser::read_char_from_wchar_string()
 {
 	if( *pchar_unicode == 0 )
 		lastc = -1;
 	else
 		lastc = *(pchar_unicode++);
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int CSVParser::read_char_from_utf8_string()
 {
 int c;
 bool correct;
 	lastc = -1;
 	do
 	{
 		size_t len = utf8_to_int(pchar_ascii, c, correct);
 		pchar_ascii += len;
 	}
 	while( *pchar_ascii && !correct );
 	if( correct )
 		lastc = c;
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int CSVParser::read_char_from_ascii_string()
 {
 	if( *pchar_ascii == 0 )
 		lastc = -1;
 	else
 		lastc = *(pchar_ascii++);
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int CSVParser::read_char_no_escape()
 {
 	if( reading_from_file )
 	{
 		if( input_as_utf8 )
 			return read_utf8_char();
 		else
 			return read_ascii_char();
 	}
 	else
 	{
 		if( reading_from_wchar_string )
 		{
 			return read_char_from_wchar_string();
 		}
 		else
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_string();
 			else
 				return read_char_from_ascii_string();
 		}
 	}
 }
 int CSVParser::read_char()
 {
 	return read_char_no_escape();
 }
 }
--- a/src/csv/csvparser.h
+++ b/src/csv/csvparser.h
@@ -38,9 +38,11 @@
 #ifndef headerfile_picotools_csv_csvparser
 #define headerfile_picotools_csv_csvparser
 #include "space/space.h"
 #include <string>
 #include <fstream>
 #include "space/space.h"
 #include "convert/baseparser.h"
 namespace pt
@@ -51,10 +53,12 @@ namespace pt
 * https://datatracker.ietf.org/doc/html/rfc4180
 *
 */
-class CSVParser
+class CSVParser : public BaseParser
 {
 public:
 	CSVParser();
 	enum Status
 	{
 		ok,
@@ -85,53 +89,6 @@ protected:
 	Space * space;
 	/*
 		true if parse_file() method was called
 		false if parse() was called
 	*/
 	bool reading_from_file;
 	/*
 		true if parse(wchar_t *) or parse(std::wstring&) was called
 	*/
 	bool reading_from_wchar_string;
 	/*
 		pointers to the current character
 		if parse() is being used
 	*/
 	const char    * pchar_ascii;
 	const wchar_t * pchar_unicode;
 	/*
 		last read char
 		or -1 if the end
 	*/
 	int lastc;
 	/*
 		a number of a line in which there is a syntax_error
 	*/
 	int line;
 	/*
 		current file
 		may it would be better to make a pointer?
 		if we parse only a string then there is no sense to have such an object
 	*/
 	std::ifstream file;
 	/*
 		input file is in UTF-8
 		default: true
 	*/
 	bool input_as_utf8;
 	void parse();
@@ -142,19 +99,6 @@ protected:
 	bool read_non_quoted_value_to(std::wstring & value);
 	/*
 	 * copied from SpaceParser
 	 * may it would be better to have a class with those methods and inherit from it?
 	 */
 	int read_utf8_char();
 	int read_ascii_char();
 	int read_char_from_wchar_string();
 	int read_char_from_utf8_string();
 	int read_char_from_ascii_string();
 	int read_char_no_escape();
 	int read_char();
 };
 }
--- a/src/space/spaceparser.cpp
+++ b/src/space/spaceparser.cpp
@@ -891,122 +891,6 @@ void SpaceParser::read_key()
 int SpaceParser::read_utf8_char()
 {
 int c;
 bool correct;
 	lastc = -1;
 	do
 	{
 		utf8_to_int(file, c, correct);
 		if( !file )
 			return lastc;
 	}
 	while( !correct );
 	lastc = c;
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int SpaceParser::read_ascii_char()
 {
 	lastc = file.get();
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int SpaceParser::read_char_from_wchar_string()
 {
 	if( *pchar_unicode == 0 )
 		lastc = -1;
 	else
 		lastc = *(pchar_unicode++);
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int SpaceParser::read_char_from_utf8_string()
 {
 int c;
 bool correct;
 	lastc = -1;
 	do
 	{
 		size_t len = utf8_to_int(pchar_ascii, c, correct);
 		pchar_ascii += len;
 	}
 	while( *pchar_ascii && !correct );
 	if( correct )
 		lastc = c;
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int SpaceParser::read_char_from_ascii_string()
 {
 	if( *pchar_ascii == 0 )
 		lastc = -1;
 	else
 		lastc = *(pchar_ascii++);
 	if( lastc == '\n' )
 		++line;
 return lastc;
 }
 int SpaceParser::read_char_no_escape()
 {
 	if( reading_from_file )
 	{
 		if( input_as_utf8 )
 			return read_utf8_char();
 		else
 			return read_ascii_char();
 	}
 	else
 	{
 		if( reading_from_wchar_string )
 		{
 			return read_char_from_wchar_string();
 		}
 		else
 		{
 			if( input_as_utf8 )
 				return read_char_from_utf8_string();
 			else
 				return read_char_from_ascii_string();
 		}
 	}
 }
 bool SpaceParser::is_hex_digit(wchar_t c)
 {
--- a/src/space/spaceparser.h
+++ b/src/space/spaceparser.h
@@ -40,6 +40,7 @@
 #include <fstream>
 #include "space.h"
 #include "convert/baseparser.h"
@@ -49,7 +50,7 @@ namespace pt
-class SpaceParser
+class SpaceParser : public BaseParser
 {
 public:
@@ -154,32 +155,6 @@ private:
 	Space * root_space;
 	/*
 		a number of a line in which there is a syntax_error
 	*/
 	int line;
 	/*
 		true if parse() method was called
 		false if ParseString() was called
 	*/
 	bool reading_from_file;
 	/*
 		pointers to the current character
 		if ParseString() is in used
 	*/
 	const char    * pchar_ascii;
 	const wchar_t * pchar_unicode;
 	/*
 		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
 	*/
 	bool reading_from_wchar_string;
 	/*
 		last read token
 	*/
@@ -222,13 +197,6 @@ private:
 	int option_delimiter;
 	/*
 		last read char
 		or -1 if the end
 	*/
 	int lastc;
 	/*
 		true if the lastc was escaped (with a backslash)
 		we have to know if the last sequence was \" or just "
@@ -236,22 +204,6 @@ private:
 	bool char_was_escaped;
 	/*
 		current file
 		may it would be better to make a pointer?
 		if we parse only a string then there is no sense to have such an object
 	*/
 	std::ifstream file;
 	/*
 		input file is in UTF-8
 		default: true
 	*/
 	bool input_as_utf8;
 	/*
 	 * if parsing_space is false then it means we are parsing JSON format
 	 *
@@ -287,12 +239,6 @@ private:
 	void read_token_quoted(std::wstring & token);
 	void read_multiline_token_quoted(std::wstring & token);
 	int  read_utf8_char();
 	int  read_ascii_char();
 	int  read_char_from_wchar_string();
 	int  read_char_from_utf8_string();
 	int  read_char_from_ascii_string();
 	int  read_char_no_escape();
 	int  read_char();
 	bool is_white(int c);
 	void skip_line();
--- a/tests/Makefile.dep
+++ b/tests/Makefile.dep
@@ -13,7 +13,8 @@
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
-./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
+./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
 ./csvparser.o: ../src/convert/baseparser.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h