added a base class for parsers: BaseParser (convert/baseparser.h|cpp)

It contains the methods for reading from strings/files; those methods were moved there from SpaceParser and CSVParser. Fixed: CSVParser didn't set the input_as_utf8 flag.
2021-07-17 14:38:22 +02:00
parent 2a3f43c5c3
commit 7ce07c57f5
8 changed files with 329 additions and 363 deletions
@@ -8,6 +8,9 @@
 ./convert/double.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./convert/double.o: utf8/utf8_templates.h utf8/utf8_private.h date/date.h
 ./convert/double.o: membuffer/membuffer.h textstream/types.h
+./convert/baseparser.o: ./convert/baseparser.h utf8/utf8.h
+./convert/baseparser.o: textstream/stream.h utf8/utf8_templates.h
+./convert/baseparser.o: utf8/utf8_private.h
 ./date/date.o: ./date/date.h convert/inttostr.h
 ./log/filelog.o: ./log/filelog.h textstream/textstream.h textstream/stream.h
 ./log/filelog.o: space/space.h textstream/types.h convert/inttostr.h
@@ -29,14 +32,15 @@
 ./space/spaceparser.o: ./space/spaceparser.h ./space/space.h
 ./space/spaceparser.o: textstream/types.h convert/inttostr.h utf8/utf8.h
 ./space/spaceparser.o: textstream/stream.h utf8/utf8_templates.h
-./space/spaceparser.o: utf8/utf8_private.h convert/strtoint.h
-./space/spaceparser.o: ./convert/text.h ./convert/misc.h
+./space/spaceparser.o: utf8/utf8_private.h convert/baseparser.h
+./space/spaceparser.o: convert/strtoint.h ./convert/text.h ./convert/misc.h
 ./utf8/utf8.o: ./utf8/utf8.h textstream/stream.h utf8/utf8_templates.h
 ./utf8/utf8.o: utf8/utf8_private.h
 ./utf8/utf8_private.o: utf8/utf8_private.h
 ./csv/csvparser.o: ./csv/csvparser.h space/space.h textstream/types.h
 ./csv/csvparser.o: convert/inttostr.h utf8/utf8.h textstream/stream.h
 ./csv/csvparser.o: utf8/utf8_templates.h utf8/utf8_private.h
+./csv/csvparser.o: convert/baseparser.h
 ./mainoptions/mainoptionsparser.o: ./mainoptions/mainoptionsparser.h
 ./mainoptions/mainoptionsparser.o: space/space.h textstream/types.h
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
@@ -0,0 +1,188 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "baseparser.h"
+#include "utf8/utf8.h"
+
+
+
+namespace pt
+{
+
+BaseParser::BaseParser() // every parser starts in the default state established by clear()
+{
+	clear();
+}
+
+
+void BaseParser::clear() // resets the reader state to its defaults; the `file` stream itself is not touched here
+{
+	line = 0; // no '\n' has been counted yet
+	reading_from_file = false;
+	pchar_ascii = nullptr; // no narrow string attached
+	pchar_unicode = nullptr; // no wide string attached
+	reading_from_wchar_string = false;
+	lastc = -1; // -1 is the "nothing read / end of input" marker
+	input_as_utf8 = true; // narrow input is decoded as UTF-8 unless a derived class changes this
+}
+
+
+int BaseParser::read_utf8_char() // reads one UTF-8 character from `file`; returns it (also stored in lastc) or -1 once the stream fails
+{
+int c;
+bool correct;
+
+	lastc = -1; // stays -1 if the stream goes bad before a character is accepted
+
+	do
+	{
+		utf8_to_int(file, c, correct);
+
+		if( !file )
+			return lastc;
+	}
+	while( !correct ); // try again while utf8_to_int() left `correct` false (presumably a malformed sequence)
+
+	lastc = c;
+
+	if( lastc == '\n' )
+		++line; // count line feeds
+
+return lastc;
+}
+
+
+int BaseParser::read_ascii_char() // reads one byte from `file` without any decoding; result is also stored in lastc
+{
+	lastc = file.get(); // std::ifstream::get(): the byte value, or the stream's eof value when nothing could be read
+
+	if( lastc == '\n' )
+		++line; // count line feeds
+
+return lastc;
+}
+
+
+int BaseParser::read_char_from_wchar_string() // next wchar_t of the wide string, or -1 at its terminating 0; result is also stored in lastc
+{
+	if( *pchar_unicode == 0 )
+		lastc = -1; // end of string: the pointer is not advanced past the terminator
+	else
+		lastc = *(pchar_unicode++);
+
+	if( lastc == '\n' )
+		++line; // count line feeds
+
+return lastc;
+}
+
+
+int BaseParser::read_char_from_utf8_string() // decodes one UTF-8 character at pchar_ascii; returns it (also in lastc) or -1 if none was decoded
+{
+int c;
+bool correct;
+
+	lastc = -1; // result when nothing valid is decoded before the terminating '\0'
+
+	do
+	{
+		size_t len = utf8_to_int(pchar_ascii, c, correct);
+		pchar_ascii += len; // advance by the length utf8_to_int() returned (presumably the bytes it consumed)
+	}
+	while( *pchar_ascii && !correct ); // retry on input reported as not correct, but never step past the '\0'
+
+	if( correct )
+		lastc = c;
+
+	if( lastc == '\n' )
+		++line; // count line feeds
+
+return lastc;
+
+}
+
+
+int BaseParser::read_char_from_ascii_string() // next byte of the narrow string as 0..255, or -1 at its terminating '\0'; result is also stored in lastc
+{
+	if( *pchar_ascii == 0 )
+		lastc = -1; // end of string: the pointer is not advanced past the terminator
+	else
+		lastc = static_cast<unsigned char>(*(pchar_ascii++)); // via unsigned char: with a signed char a 0xFF byte would read as the -1 end marker; this also matches file.get() in read_ascii_char()
+
+	if( lastc == '\n' )
+		++line; // count line feeds
+
+return lastc;
+}
+
+
+int BaseParser::read_char_no_escape() // reads one character using the reader that matches the current input source and encoding
+{
+	if( reading_from_file )
+	{
+		if( input_as_utf8 )
+			return read_utf8_char();
+		else
+			return read_ascii_char();
+	}
+	else
+	{
+		if( reading_from_wchar_string ) // wide strings are read as-is, input_as_utf8 is not consulted
+		{
+			return read_char_from_wchar_string();
+		}
+		else
+		{
+			if( input_as_utf8 )
+				return read_char_from_utf8_string();
+			else
+				return read_char_from_ascii_string();
+		}
+	}
+}
+
+
+int BaseParser::read_char() // in this base class simply forwards to read_char_no_escape()
+{
+	return read_char_no_escape();
+}
+
+
+
+
+}
+
@@ -0,0 +1,120 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/*
+ * Copyright (c) 2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_convert_baseparser
+#define headerfile_picotools_convert_baseparser
+
+#include <string>
+#include <fstream>
+
+
+namespace pt
+{
+
+class BaseParser // character-reading layer shared by CSVParser and SpaceParser (both derive from it)
+{
+protected:
+
+	BaseParser(); // calls clear()
+
+	void clear(); // resets the members below to their defaults
+
+	int read_utf8_char(); // from `file`, decoding UTF-8
+	int read_ascii_char(); // from `file`, one raw byte
+	int read_char_from_wchar_string(); // from pchar_unicode
+	int read_char_from_utf8_string(); // from pchar_ascii, decoding UTF-8
+	int read_char_from_ascii_string(); // from pchar_ascii, one raw byte
+	int read_char_no_escape(); // picks one of the five readers above
+	int read_char(); // forwards to read_char_no_escape()
+
+
+
+	/*
+		number of '\n' characters read so far (the line number to report on a syntax error)
+	*/
+	int line;
+
+
+	/*
+		true if the input is read from `file`
+		false if it is read from a string (pchar_ascii or pchar_unicode)
+	*/
+	bool reading_from_file;
+
+	/*
+		pointers to the current character
+		when a string (not a file) is being parsed
+	*/
+	const char    * pchar_ascii;
+	const wchar_t * pchar_unicode;
+
+
+	/*
+		true if the string being parsed is a wide one (pchar_unicode is used instead of pchar_ascii)
+	*/
+	bool reading_from_wchar_string;
+
+	/*
+		the last character read
+		or -1 at the end of the input
+	*/
+	int lastc;
+
+
+	/*
+		current file
+
+		maybe it would be better to make this a pointer?
+		if we parse only a string there is no point in having such an object
+	*/
+	std::ifstream file;
+
+
+	/*
+		narrow input (a file or a char string) is in UTF-8
+		default: true
+	*/
+	bool input_as_utf8;
+
+
+
+
+};
+
+}
+
+#endif
@@ -44,6 +44,11 @@ namespace pt
 {


+CSVParser::CSVParser() // the BaseParser constructor (which calls clear()) has already run at this point
+{
+	input_as_utf8    = true; // explicit default: narrow input is decoded as UTF-8 (BaseParser::clear() sets the same value)
+}
+


 CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
@@ -285,132 +290,6 @@ bool CSVParser::read_non_quoted_value_to(std::wstring & value)



-
-int CSVParser::read_utf8_char()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		utf8_to_int(file, c, correct);
-
-		if( !file )
-			return lastc;
-	}
-	while( !correct );
-
-	lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-
-int CSVParser::read_ascii_char()
-{
-	lastc = file.get();
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-
-
-int CSVParser::read_char_from_wchar_string()
-{
-	if( *pchar_unicode == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_unicode++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_utf8_string()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		size_t len = utf8_to_int(pchar_ascii, c, correct);
-		pchar_ascii += len;
-	}
-	while( *pchar_ascii && !correct );
-
-	if( correct )
-		lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_from_ascii_string()
-{
-	if( *pchar_ascii == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_ascii++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int CSVParser::read_char_no_escape()
-{
-	if( reading_from_file )
-	{
-		if( input_as_utf8 )
-			return read_utf8_char();
-		else
-			return read_ascii_char();
-	}
-	else
-	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
-		{
-			if( input_as_utf8 )
-				return read_char_from_utf8_string();
-			else
-				return read_char_from_ascii_string();
-		}
-	}
-}
-
-
-
-
-int CSVParser::read_char()
-{
-	return read_char_no_escape();
-}
-
-
 }


@@ -38,9 +38,11 @@
 #ifndef headerfile_picotools_csv_csvparser
 #define headerfile_picotools_csv_csvparser

-#include "space/space.h"
 #include <string>
 #include <fstream>
+#include "space/space.h"
+#include "convert/baseparser.h"
+


 namespace pt
@@ -51,10 +53,12 @@ namespace pt
 * https://datatracker.ietf.org/doc/html/rfc4180
 *
 */
-class CSVParser
+class CSVParser : public BaseParser
 {
 public:

+	CSVParser();
+
 	enum Status
 	{
 		ok,
@@ -85,53 +89,6 @@ protected:

 	Space * space;

-	/*
-		true if parse_file() method was called
-		false if parse() was called
-	*/
-	bool reading_from_file;
-
-	/*
-		true if parse(wchar_t *) or parse(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
-
-	/*
-		pointers to the current character
-		if parse() is being used
-	*/
-	const char    * pchar_ascii;
-	const wchar_t * pchar_unicode;
-
-
-	/*
-		last read char
-		or -1 if the end
-	*/
-	int lastc;
-
-
-
-	/*
-		a number of a line in which there is a syntax_error
-	*/
-	int line;
-
-	/*
-		current file
-
-		may it would be better to make a pointer?
-		if we parse only a string then there is no sense to have such an object
-	*/
-	std::ifstream file;
-
-	/*
-		input file is in UTF-8
-		default: true
-	*/
-	bool input_as_utf8;
-
-


 	void parse();
@@ -142,19 +99,6 @@ protected:
 	bool read_non_quoted_value_to(std::wstring & value);


-
-	/*
-	 * copied from SpaceParser
-	 * may it would be better to have a class with those methods and inherit from it?
-	 */
-	int read_utf8_char();
-	int read_ascii_char();
-	int read_char_from_wchar_string();
-	int read_char_from_utf8_string();
-	int read_char_from_ascii_string();
-	int read_char_no_escape();
-
-	int read_char();
 };

 }
@@ -891,122 +891,6 @@ void SpaceParser::read_key()



-int SpaceParser::read_utf8_char()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		utf8_to_int(file, c, correct);
-
-		if( !file )
-			return lastc;
-	}
-	while( !correct );
-
-	lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-}
-
-
-
-int SpaceParser::read_ascii_char()
-{
-	lastc = file.get();
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-}
-
-
-
-
-int SpaceParser::read_char_from_wchar_string()
-{
-	if( *pchar_unicode == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_unicode++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int SpaceParser::read_char_from_utf8_string()
-{
-int c;
-bool correct;
-
-	lastc = -1;
-
-	do
-	{
-		size_t len = utf8_to_int(pchar_ascii, c, correct);
-		pchar_ascii += len;
-	}
-	while( *pchar_ascii && !correct );
-
-	if( correct )
-		lastc = c;
-
-	if( lastc == '\n' )
-		++line;
-	
-return lastc;
-	
-}
-
-
-int SpaceParser::read_char_from_ascii_string()
-{
-	if( *pchar_ascii == 0 )
-		lastc = -1;
-	else
-		lastc = *(pchar_ascii++);
-
-	if( lastc == '\n' )
-		++line;
-
-return lastc;
-}
-
-
-int SpaceParser::read_char_no_escape()
-{
-	if( reading_from_file )
-	{
-		if( input_as_utf8 )
-			return read_utf8_char();
-		else
-			return read_ascii_char();
-	}
-	else
-	{
-		if( reading_from_wchar_string )
-		{
-			return read_char_from_wchar_string();
-		}
-		else
-		{
-			if( input_as_utf8 )
-				return read_char_from_utf8_string();
-			else
-				return read_char_from_ascii_string();
-		}
-	}
-}

 bool SpaceParser::is_hex_digit(wchar_t c)
 {
@@ -40,6 +40,7 @@

 #include <fstream>
 #include "space.h"
+#include "convert/baseparser.h"



@@ -49,7 +50,7 @@ namespace pt



-class SpaceParser
+class SpaceParser : public BaseParser
 {
 public:

@@ -154,32 +155,6 @@ private:
 	Space * root_space;


-	/*
-		a number of a line in which there is a syntax_error
-	*/
-	int line;
-
-	/*
-		true if parse() method was called
-		false if ParseString() was called
-	*/
-	bool reading_from_file;
-
-
-	/*
-		pointers to the current character
-		if ParseString() is in used
-	*/
-	const char    * pchar_ascii;
-	const wchar_t * pchar_unicode;
-
-
-	/*
-		true if ParseString(wchar_t *) or ParseString(std::wstring&) was called
-	*/
-	bool reading_from_wchar_string;
-
-
 	/*
 		last read token
 	*/
@@ -222,13 +197,6 @@ private:
 	int option_delimiter;


-	/*
-		last read char
-		or -1 if the end
-	*/
-	int lastc;
-
-
 	/*
 		true if the lastc was escaped (with a backslash)
 		we have to know if the last sequence was \" or just "
@@ -236,22 +204,6 @@ private:
 	bool char_was_escaped;


-	/*
-		current file
-
-		may it would be better to make a pointer?
-		if we parse only a string then there is no sense to have such an object
-	*/
-	std::ifstream file;
-
-
-	/*
-		input file is in UTF-8
-		default: true
-	*/
-	bool input_as_utf8;
-
-
 	/*
 	 * if parsing_space is false then it means we are parsing JSON format
 	 *
@@ -287,12 +239,6 @@ private:
 	void read_token_quoted(std::wstring & token);
 	void read_multiline_token_quoted(std::wstring & token);

-	int  read_utf8_char();
-	int  read_ascii_char();
-	int  read_char_from_wchar_string();
-	int  read_char_from_utf8_string();
-	int  read_char_from_ascii_string();
-	int  read_char_no_escape();
 	int  read_char();
 	bool is_white(int c);
 	void skip_line();
@@ -13,7 +13,8 @@
 ./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
 ./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
 ./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
-./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
+./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h
+./csvparser.o: ../src/convert/baseparser.h test.h
 ./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h