2021-05-19 03:26:46 +02:00
|
|
|
/*
|
|
|
|
* This file is a part of PikoTools
|
|
|
|
* and is distributed under the (new) BSD licence.
|
|
|
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2021, Tomasz Sowa
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* * Neither the name Tomasz Sowa nor the names of contributors to this
|
|
|
|
* project may be used to endorse or promote products derived
|
|
|
|
* from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
|
|
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "csvparser.h"
|
|
|
|
#include "utf8/utf8.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
2021-05-20 16:11:12 +02:00
|
|
|
namespace pt
|
2021-05-19 03:26:46 +02:00
|
|
|
{
|
|
|
|
|
|
|
|
|
2021-07-17 14:38:22 +02:00
|
|
|
/*
 * Construct a parser with input_as_utf8 switched on.
 *
 * NOTE(review): presumably this flag makes char-based input be decoded
 * as UTF-8 by read_char() -- confirm where the flag is consumed.
 */
CSVParser::CSVParser()
{
	this->input_as_utf8 = true;
}
|
|
|
|
|
2021-05-19 03:26:46 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
 * Open file_name (binary, read-only) and parse its content into out_space.
 *
 * file_name - path of the file to read
 * out_space - destination Space; it is only written to by parse(),
 *             so it is left untouched when the file cannot be opened
 *
 * Returns the current status: whatever parse() left there when the file
 * was opened, or cant_open_file otherwise.
 */
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
	space = &out_space;
	reading_from_file = true;

	// drop any error flags left on the stream before it is opened again
	file.clear();
	file.open(file_name, std::ios_base::in | std::ios_base::binary);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	parse();
	file.close();

	return status;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: parse the file named by a std::string.
 * Forwards to the const char * overload and returns its status.
 */
CSVParser::Status CSVParser::parse_file(const std::string & file_name, Space & out_space)
{
	const char * raw_file_name = file_name.c_str();

	return parse_file(raw_file_name, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: parse the file named by a wide-character string.
 *
 * The name is converted with wide_to_utf8() and the resulting narrow
 * string is handed to the narrow-string parse_file() overloads.
 * Returns the status reported by that call.
 */
CSVParser::Status CSVParser::parse_file(const wchar_t * file_name, Space & out_space)
{
	std::string narrow_file_name;
	wide_to_utf8(file_name, narrow_file_name);

	// the std::string overload forwards to the const char * one
	return parse_file(narrow_file_name, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: parse the file named by a std::wstring.
 * Forwards to the const wchar_t * overload and returns its status.
 */
CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & out_space)
{
	const wchar_t * raw_file_name = file_name.c_str();

	return parse_file(raw_file_name, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Parse CSV data from a narrow character string into out_space.
 *
 * str       - input text; stored in pchar_ascii for the reader
 *             (NOTE(review): presumably must be zero-terminated and
 *             non-null -- confirm against read_char())
 * out_space - destination Space filled by parse()
 *
 * Returns the status set by parse().
 */
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
	space = &out_space;

	// select the narrow-string source: no file, no wide string
	reading_from_file = false;
	reading_from_wchar_string = false;
	pchar_unicode = nullptr;
	pchar_ascii = str;

	parse();

	return status;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: parse CSV data held in a std::string.
 * Forwards str.c_str() to the const char * overload and returns its status.
 */
CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
{
	const char * raw_text = str.c_str();

	return parse(raw_text, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Parse CSV data from a wide character string into out_space.
 *
 * str       - input text; stored in pchar_unicode for the reader
 *             (NOTE(review): presumably must be zero-terminated and
 *             non-null -- confirm against read_char())
 * out_space - destination Space filled by parse()
 *
 * Returns the status set by parse().
 */
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
	space = &out_space;

	// select the wide-string source: no file, no narrow string
	reading_from_file = false;
	reading_from_wchar_string = true;
	pchar_ascii = nullptr;
	pchar_unicode = str;

	parse();

	return status;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: parse CSV data held in a std::wstring.
 * Forwards str.c_str() to the const wchar_t * overload and returns its status.
 */
CSVParser::Status CSVParser::parse(const std::wstring & str, Space & out_space)
{
	const wchar_t * raw_text = str.c_str();

	return parse(raw_text, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void CSVParser::parse()
|
|
|
|
{
|
|
|
|
line = 1;
|
|
|
|
status = ok;
|
|
|
|
|
|
|
|
space->set_empty_table();
|
|
|
|
read_char(); // put first character to lastc
|
|
|
|
|
|
|
|
if( lastc == -1 )
|
|
|
|
{
|
|
|
|
// an empty file/string, in such a case we return such a space struct (if would be serialized to json): [[]]
|
2021-05-19 22:34:10 +02:00
|
|
|
Space row_space;
|
|
|
|
row_space.set_empty_table();
|
|
|
|
space->add(std::move(row_space));
|
2021-05-19 03:26:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
while( lastc != -1 )
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* even if there is an error when parsing we continue to read the file/string
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2021-05-19 22:31:18 +02:00
|
|
|
Space row_space;
|
|
|
|
row_space.set_empty_table();
|
2021-05-19 03:26:46 +02:00
|
|
|
|
|
|
|
parse_row(row_space);
|
2021-05-19 22:31:18 +02:00
|
|
|
space->add(std::move(row_space));
|
2021-05-19 03:26:46 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-05-19 22:31:18 +02:00
|
|
|
/*
 * Read one row: values are appended to row_space one after another
 * for as long as read_value_to() reports that the value just read
 * was followed by a comma. At least one value is always read.
 */
void CSVParser::parse_row(Space & row_space)
{
	while( read_value_to(row_space) )
	{
		// all the work is done in the loop condition
	}
}
|
|
|
|
|
|
|
|
|
2021-05-19 22:31:18 +02:00
|
|
|
/*
 * Append a new empty wide-string value to row_space and fill it with
 * the next value from the input.
 *
 * A value starting with a double quote is read by read_quoted_value_to(),
 * any other one by read_non_quoted_value_to().
 *
 * Returns what the chosen reader returns: true if the value was
 * terminated by a comma (more values follow in this row).
 */
bool CSVParser::read_value_to(Space & row_space)
{
	Space & cell = row_space.add_empty_space();
	cell.set_empty_wstring();

	std::wstring & cell_text = cell.value.value_wstring;

	return (lastc == '"') ? read_quoted_value_to(cell_text)
	                      : read_non_quoted_value_to(cell_text);
}
|
|
|
|
|
|
|
|
|
|
|
|
bool CSVParser::read_quoted_value_to(std::wstring & value)
|
|
|
|
{
|
|
|
|
bool is_comma = false;
|
|
|
|
bool is_value_character = true;
|
|
|
|
|
|
|
|
while( lastc != -1 && is_value_character )
|
|
|
|
{
|
|
|
|
read_char();
|
|
|
|
|
|
|
|
if( lastc == '"' )
|
|
|
|
{
|
|
|
|
read_char();
|
|
|
|
|
|
|
|
if( lastc == '"' )
|
|
|
|
{
|
|
|
|
value += lastc;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
is_value_character = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if( lastc != -1 )
|
|
|
|
{
|
|
|
|
value += lastc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if( lastc == ',' )
|
|
|
|
{
|
|
|
|
is_comma = true;
|
|
|
|
read_char(); // skip the comma character
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if( lastc == 13 )
|
|
|
|
{
|
|
|
|
read_char(); // skip CR character
|
|
|
|
|
|
|
|
if( lastc == 10 )
|
|
|
|
read_char();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if( lastc == 10 )
|
|
|
|
{
|
|
|
|
read_char(); // skip new line character
|
|
|
|
}
|
|
|
|
|
|
|
|
return is_comma;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool CSVParser::read_non_quoted_value_to(std::wstring & value)
|
|
|
|
{
|
|
|
|
while( lastc != -1 && lastc != ',' && lastc != 10 )
|
|
|
|
{
|
|
|
|
value += lastc;
|
|
|
|
read_char();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool is_comma = (lastc == ',');
|
|
|
|
|
|
|
|
if( is_comma )
|
|
|
|
{
|
|
|
|
read_char(); // skip the comma character
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bool is_new_line = (lastc == 10);
|
|
|
|
|
|
|
|
// check CRLF sequence
|
|
|
|
if( is_new_line && !value.empty() && value.back() == 13 )
|
|
|
|
{
|
|
|
|
value.erase(value.size() - 1, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if( is_new_line )
|
|
|
|
{
|
|
|
|
read_char(); // skip the new line character
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return is_comma;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|