// pikotools/src/csv/csvparser.cpp

/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "csvparser.h"
#include "utf8/utf8.h"


namespace PT
{


/*
 * Parse the CSV file called file_name and put the result into out_space.
 *
 * The file is opened for reading in binary mode. If it cannot be opened
 * the status is set to cant_open_file and parse() is not called,
 * otherwise the status is the one left by parse().
 *
 * Returns the final status.
 */
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
	space = &out_space;
	reading_from_file = true;

	// drop any error flags left on the stream before it is opened again
	file.clear();
	file.open(file_name, std::ios_base::in | std::ios_base::binary);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	parse();
	file.close();

	return status;
}


/*
 * Overload taking the file name as a std::string.
 * Forwards to parse_file(const char *, Space &) and returns its status.
 */
CSVParser::Status CSVParser::parse_file(const std::string & file_name, Space & out_space)
{
	return parse_file(file_name.c_str(), out_space);
}


/*
 * Overload taking the file name as a wide C string.
 *
 * The name is converted with PT::WideToUTF8() and the resulting narrow
 * string is handed to the narrow-string parse_file().
 *
 * Returns the status reported by that call.
 */
CSVParser::Status CSVParser::parse_file(const wchar_t * file_name, Space & out_space)
{
	std::string narrow_name;
	PT::WideToUTF8(file_name, narrow_name);

	// the std::string overload simply passes narrow_name.c_str() on
	return parse_file(narrow_name, out_space);
}


/*
 * Overload taking the file name as a std::wstring.
 * Forwards to parse_file(const wchar_t *, Space &) and returns its status.
 */
CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & out_space)
{
	return parse_file(file_name.c_str(), out_space);
}


/*
 * Parse CSV data from the zero-terminated narrow string str
 * and put the result into out_space.
 *
 * The input_as_utf8 flag decides (in read_char_no_escape()) whether the
 * bytes are decoded as UTF-8 or taken one by one.
 *
 * Returns the status set by parse().
 */
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
	space = &out_space;

	// select the narrow string as the input source
	reading_from_file = false;
	reading_from_wchar_string = false;

	pchar_unicode = nullptr;
	pchar_ascii = str;

	parse();

	return status;
}


/*
 * Overload taking the CSV data as a std::string.
 * Forwards str.c_str() to parse(const char *, Space &), so the data
 * is read up to the first zero character.
 */
CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
{
	return parse(str.c_str(), out_space);
}


/*
 * Parse CSV data from the zero-terminated wide string str
 * and put the result into out_space.
 *
 * Returns the status set by parse().
 */
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
	space = &out_space;

	// select the wide string as the input source
	reading_from_file = false;
	reading_from_wchar_string = true;

	pchar_ascii = nullptr;
	pchar_unicode = str;

	parse();

	return status;
}


/*
 * Overload taking the CSV data as a std::wstring.
 * Forwards str.c_str() to parse(const wchar_t *, Space &), so the data
 * is read up to the first zero character.
 */
CSVParser::Status CSVParser::parse(const std::wstring & str, Space & out_space)
{
	return parse(str.c_str(), out_space);
}


void CSVParser::parse()
{
	line    = 1;
	status  = ok;

	space->set_empty_table();
	read_char(); // put first character to lastc

	if( lastc == -1 )
	{
		// an empty file/string, in such a case we return such a space struct (if would be serialized to json): [[]]
		Space row_space;
		row_space.set_empty_table();
		space->add(std::move(row_space));
	}

	while( lastc != -1 )
	{
		/*
		 * even if there is an error when parsing we continue to read the file/string
		 *
		 */

		Space row_space;
		row_space.set_empty_table();

		parse_row(row_space);
		space->add(std::move(row_space));
	}
}


/*
 * Read one CSV row into row_space.
 *
 * read_value_to() is called at least once and then again for as long
 * as it reports that the value just read was followed by a comma.
 */
void CSVParser::parse_row(Space & row_space)
{
	while( read_value_to(row_space) )
	{
		// the value ended with a comma, so another one follows in this row
	}
}


/*
 * Append a new empty wide-string value to row_space and fill it
 * with the next CSV field.
 *
 * A field starting with a double quote is read by read_quoted_value_to(),
 * any other field by read_non_quoted_value_to().
 *
 * Returns true if the field was followed by a comma.
 */
bool CSVParser::read_value_to(Space & row_space)
{
	Space & cell = row_space.add_empty_space();
	cell.set_empty_wstring();

	std::wstring & text = cell.value.value_wstring;

	return (lastc == '"') ? read_quoted_value_to(text)
	                      : read_non_quoted_value_to(text);
}


/*
 * Read a quoted field and append its characters to value.
 *
 * On entry lastc is expected to hold the opening quote (the first thing
 * the loop does is to read the character after it). Two consecutive
 * quotes inside the field stand for one quote character, a single quote
 * ends the field. Reading also stops at the end of the input.
 *
 * After the field one terminator is consumed if present: a comma,
 * a CR optionally followed by LF, or a LF. Any other character is left
 * in lastc untouched.
 *
 * Returns true if the terminator was a comma.
 */
bool CSVParser::read_quoted_value_to(std::wstring & value)
{
	bool inside_value = true;

	while( inside_value && lastc != -1 )
	{
		read_char();

		if( lastc != '"' )
		{
			// an ordinary character; the end-of-input marker is not stored
			if( lastc != -1 )
				value.push_back(static_cast<wchar_t>(lastc));

			continue;
		}

		// a quote: the next character tells whether it is escaped or closing
		read_char();

		if( lastc == '"' )
			value.push_back(static_cast<wchar_t>(lastc));
		else
			inside_value = false;
	}

	if( lastc == ',' )
	{
		read_char(); // skip the comma character
		return true;
	}

	if( lastc == 13 )
		read_char(); // skip CR character

	// either a lone LF or the LF of a CRLF pair
	if( lastc == 10 )
		read_char();

	return false;
}


/*
 * Read a non-quoted field and append its characters to value.
 *
 * Characters are taken until a comma, a LF or the end of the input.
 * The comma or the LF is consumed. If the field ends with LF and the
 * last stored character is CR, that CR is removed so CRLF line endings
 * do not leave a trailing CR in the value. A CR that is not directly
 * followed by LF stays in the value.
 *
 * Returns true if the field was ended by a comma.
 */
bool CSVParser::read_non_quoted_value_to(std::wstring & value)
{
	for( ; lastc != -1 && lastc != ',' && lastc != 10 ; read_char() )
	{
		value.push_back(static_cast<wchar_t>(lastc));
	}

	if( lastc == ',' )
	{
		read_char(); // skip the comma character
		return true;
	}

	if( lastc == 10 )
	{
		// check CRLF sequence
		if( !value.empty() && value.back() == 13 )
			value.pop_back();

		read_char(); // skip the new line character
	}

	return false;
}


/*
 * Read the next character from the input file, decoding it with
 * PT::UTF8ToInt(). Sequences reported as incorrect are skipped.
 *
 * The character is stored in lastc and returned; when the stream
 * is no longer in a good state lastc is -1. The line counter is
 * incremented on '\n'.
 */
int CSVParser::read_utf8_char()
{
	int code_point = 0;
	bool is_valid = false;

	lastc = -1;

	for(;;)
	{
		PT::UTF8ToInt(file, code_point, is_valid);

		if( !file )
			return lastc;

		if( is_valid )
			break;
	}

	lastc = code_point;

	if( lastc == '\n' )
		line += 1;

	return lastc;
}


/*
 * Read the next single byte from the input file (no UTF-8 decoding).
 *
 * The value returned by file.get() is stored in lastc and returned.
 * The line counter is incremented on '\n'.
 */
int CSVParser::read_ascii_char()
{
	lastc = file.get();

	if( lastc == '\n' )
	{
		line += 1;
	}

	return lastc;
}


/*
 * Read the next character from the wide input string.
 *
 * At the terminating zero lastc becomes -1 and the pointer is not moved,
 * otherwise the current character is taken and the pointer advanced.
 * The line counter is incremented on '\n'.
 *
 * Returns lastc.
 */
int CSVParser::read_char_from_wchar_string()
{
	if( *pchar_unicode != 0 )
	{
		lastc = *pchar_unicode;
		++pchar_unicode;
	}
	else
	{
		lastc = -1;
	}

	if( lastc == '\n' )
		line += 1;

	return lastc;
}


/*
 * Read the next character from the narrow input string, decoding it
 * with PT::UTF8ToInt().
 *
 * The pointer is advanced by the length reported by the decoder.
 * Decoding is repeated until a correct character is found or the
 * pointer reaches the terminating zero. lastc is the decoded character,
 * or -1 if no correct one was found. The line counter is incremented
 * on '\n'.
 *
 * Returns lastc.
 */
int CSVParser::read_char_from_utf8_string()
{
	int code_point = 0;
	bool is_valid = false;

	lastc = -1;

	for(;;)
	{
		const size_t consumed = PT::UTF8ToInt(pchar_ascii, code_point, is_valid);
		pchar_ascii += consumed;

		if( is_valid || *pchar_ascii == 0 )
			break;
	}

	if( is_valid )
		lastc = code_point;

	if( lastc == '\n' )
		line += 1;

	return lastc;
}


/*
 * Read the next single byte from the narrow input string
 * (no UTF-8 decoding).
 *
 * At the terminating zero lastc becomes -1 and the pointer is not moved,
 * otherwise the current byte is taken and the pointer advanced.
 * The line counter is incremented on '\n'.
 *
 * Returns lastc: a value in the range 0..255 or -1 at the end of the string.
 */
int CSVParser::read_char_from_ascii_string()
{
	if( *pchar_ascii == 0 )
	{
		lastc = -1;
	}
	else
	{
		// Go through unsigned char: where plain char is signed a byte >= 0x80
		// would otherwise be sign-extended to a negative int, and the byte 0xFF
		// would become -1, which is the end-of-input marker. This also gives
		// the same 0..255 range as file.get() used in read_ascii_char().
		lastc = static_cast<unsigned char>(*pchar_ascii);
		++pchar_ascii;
	}

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read the next character from the currently selected input source.
 *
 *  - file input:          read_utf8_char() or read_ascii_char()
 *  - wide string input:   read_char_from_wchar_string()
 *  - narrow string input: read_char_from_utf8_string()
 *                         or read_char_from_ascii_string()
 *
 * For a file and for a narrow string the choice depends on input_as_utf8.
 *
 * Returns the value returned by the selected reader.
 */
int CSVParser::read_char_no_escape()
{
	if( reading_from_file )
		return input_as_utf8 ? read_utf8_char() : read_ascii_char();

	if( reading_from_wchar_string )
		return read_char_from_wchar_string();

	return input_as_utf8 ? read_char_from_utf8_string() : read_char_from_ascii_string();
}


/*
 * Read the next input character.
 * Forwards to read_char_no_escape() and returns its result.
 */
int CSVParser::read_char()
{
	return read_char_no_escape();
}


}