You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
416 lines
7.0 KiB
416 lines
7.0 KiB
/* |
|
* This file is a part of PikoTools |
|
* and is distributed under the (new) BSD licence. |
|
* Author: Tomasz Sowa <t.sowa@ttmath.org> |
|
*/ |
|
|
|
/* |
|
* Copyright (c) 2021, Tomasz Sowa |
|
* All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* |
|
* * Redistributions of source code must retain the above copyright notice, |
|
* this list of conditions and the following disclaimer. |
|
* |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* |
|
* * Neither the name Tomasz Sowa nor the names of contributors to this |
|
* project may be used to endorse or promote products derived |
|
* from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
|
* THE POSSIBILITY OF SUCH DAMAGE. |
|
*/ |
|
|
|
#include "csvparser.h" |
|
#include "utf8/utf8.h" |
|
|
|
|
|
|
|
namespace pt |
|
{ |
|
|
|
|
|
|
|
|
|
/*
 * Open the file named file_name (binary, read-only) and parse its content
 * as CSV into out_space.
 *
 * Returns the parser status after the call. If the stream cannot be opened
 * the status is set to cant_open_file and parse() is not run.
 */
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
    space = &out_space;
    reading_from_file = true;

    // clear() is called before open() so the stream object can be reused
    // (presumably resets state flags left by a previous run - confirm the type of 'file')
    file.clear();
    file.open(file_name, std::ios_base::in | std::ios_base::binary);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    parse();
    file.close();

    return status;
}
|
|
|
|
|
|
|
/*
 * Overload taking the file name as a std::string;
 * it forwards to the const char * overload.
 */
CSVParser::Status CSVParser::parse_file(const std::string & file_name, Space & out_space)
{
    const char * raw_file_name = file_name.c_str();

    return parse_file(raw_file_name, out_space);
}
|
|
|
|
|
|
|
|
|
/*
 * Overload taking the file name as a wide C string.
 *
 * The name is passed through WideToUTF8() into a temporary std::string
 * (presumably a wide -> UTF-8 conversion, as the helper name suggests)
 * and the const char * overload does the actual work.
 */
CSVParser::Status CSVParser::parse_file(const wchar_t * file_name, Space & out_space)
{
    std::string narrow_name;
    WideToUTF8(file_name, narrow_name);

    const char * raw_file_name = narrow_name.c_str();

    return parse_file(raw_file_name, out_space);
}
|
|
|
|
|
|
|
/*
 * Overload taking the file name as a std::wstring;
 * it forwards to the const wchar_t * overload.
 */
CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & out_space)
{
    const wchar_t * wide_file_name = file_name.c_str();

    return parse_file(wide_file_name, out_space);
}
|
|
|
|
|
|
|
|
|
|
|
/*
 * Parse CSV text from a zero-terminated char string into out_space.
 *
 * The input source is switched to the narrow string (the wide pointer is
 * cleared and both "file" and "wchar string" modes are turned off),
 * then parse() is run. Returns the parser status.
 */
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
    space = &out_space;

    pchar_ascii   = str;
    pchar_unicode = 0;

    reading_from_wchar_string = false;
    reading_from_file         = false;

    parse();

    return status;
}
|
|
|
|
|
|
|
/*
 * Overload taking the CSV text as a std::string;
 * it forwards to the const char * overload.
 */
CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
{
    const char * raw_text = str.c_str();

    return parse(raw_text, out_space);
}
|
|
|
|
|
/*
 * Parse CSV text from a zero-terminated wide string into out_space.
 *
 * The input source is switched to the wide string (the narrow pointer is
 * cleared and "file" mode is turned off), then parse() is run.
 * Returns the parser status.
 */
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
    space = &out_space;

    pchar_unicode = str;
    pchar_ascii   = 0;

    reading_from_wchar_string = true;
    reading_from_file         = false;

    parse();

    return status;
}
|
|
|
|
|
/*
 * Overload taking the CSV text as a std::wstring;
 * it forwards to the const wchar_t * overload.
 */
CSVParser::Status CSVParser::parse(const std::wstring & str, Space & out_space)
{
    const wchar_t * wide_text = str.c_str();

    return parse(wide_text, out_space);
}
|
|
|
|
|
|
|
void CSVParser::parse() |
|
{ |
|
line = 1; |
|
status = ok; |
|
|
|
space->set_empty_table(); |
|
read_char(); // put first character to lastc |
|
|
|
if( lastc == -1 ) |
|
{ |
|
// an empty file/string, in such a case we return such a space struct (if would be serialized to json): [[]] |
|
Space row_space; |
|
row_space.set_empty_table(); |
|
space->add(std::move(row_space)); |
|
} |
|
|
|
while( lastc != -1 ) |
|
{ |
|
/* |
|
* even if there is an error when parsing we continue to read the file/string |
|
* |
|
*/ |
|
|
|
Space row_space; |
|
row_space.set_empty_table(); |
|
|
|
parse_row(row_space); |
|
space->add(std::move(row_space)); |
|
} |
|
} |
|
|
|
|
|
/*
 * Read a single row: values are appended to row_space one by one
 * for as long as read_value_to() reports that another value follows.
 * At least one value is always read.
 */
void CSVParser::parse_row(Space & row_space)
{
    while( read_value_to(row_space) )
    {
        // the whole work is done in the condition
    }
}
|
|
|
|
|
/*
 * Append a new empty wide-string value to row_space and fill it with
 * the next field from the input.
 *
 * A field whose first character is a double quote is read by
 * read_quoted_value_to(), any other by read_non_quoted_value_to().
 * The result of the chosen reader is returned unchanged.
 */
bool CSVParser::read_value_to(Space & row_space)
{
    Space & cell = row_space.add_empty_space();
    cell.set_empty_wstring();

    if( lastc != '"' )
    {
        return read_non_quoted_value_to(cell.value.value_wstring);
    }

    return read_quoted_value_to(cell.value.value_wstring);
}
|
|
|
|
|
bool CSVParser::read_quoted_value_to(std::wstring & value) |
|
{ |
|
bool is_comma = false; |
|
bool is_value_character = true; |
|
|
|
while( lastc != -1 && is_value_character ) |
|
{ |
|
read_char(); |
|
|
|
if( lastc == '"' ) |
|
{ |
|
read_char(); |
|
|
|
if( lastc == '"' ) |
|
{ |
|
value += lastc; |
|
} |
|
else |
|
{ |
|
is_value_character = false; |
|
} |
|
} |
|
else |
|
if( lastc != -1 ) |
|
{ |
|
value += lastc; |
|
} |
|
} |
|
|
|
if( lastc == ',' ) |
|
{ |
|
is_comma = true; |
|
read_char(); // skip the comma character |
|
} |
|
else |
|
if( lastc == 13 ) |
|
{ |
|
read_char(); // skip CR character |
|
|
|
if( lastc == 10 ) |
|
read_char(); |
|
} |
|
else |
|
if( lastc == 10 ) |
|
{ |
|
read_char(); // skip new line character |
|
} |
|
|
|
return is_comma; |
|
} |
|
|
|
|
|
bool CSVParser::read_non_quoted_value_to(std::wstring & value) |
|
{ |
|
while( lastc != -1 && lastc != ',' && lastc != 10 ) |
|
{ |
|
value += lastc; |
|
read_char(); |
|
} |
|
|
|
bool is_comma = (lastc == ','); |
|
|
|
if( is_comma ) |
|
{ |
|
read_char(); // skip the comma character |
|
} |
|
else |
|
{ |
|
bool is_new_line = (lastc == 10); |
|
|
|
// check CRLF sequence |
|
if( is_new_line && !value.empty() && value.back() == 13 ) |
|
{ |
|
value.erase(value.size() - 1, 1); |
|
} |
|
|
|
if( is_new_line ) |
|
{ |
|
read_char(); // skip the new line character |
|
} |
|
} |
|
|
|
return is_comma; |
|
} |
|
|
|
|
|
|
|
|
|
int CSVParser::read_utf8_char() |
|
{ |
|
int c; |
|
bool correct; |
|
|
|
lastc = -1; |
|
|
|
do |
|
{ |
|
UTF8ToInt(file, c, correct); |
|
|
|
if( !file ) |
|
return lastc; |
|
} |
|
while( !correct ); |
|
|
|
lastc = c; |
|
|
|
if( lastc == '\n' ) |
|
++line; |
|
|
|
return lastc; |
|
} |
|
|
|
|
|
|
|
int CSVParser::read_ascii_char() |
|
{ |
|
lastc = file.get(); |
|
|
|
if( lastc == '\n' ) |
|
++line; |
|
|
|
return lastc; |
|
} |
|
|
|
|
|
|
|
|
|
int CSVParser::read_char_from_wchar_string() |
|
{ |
|
if( *pchar_unicode == 0 ) |
|
lastc = -1; |
|
else |
|
lastc = *(pchar_unicode++); |
|
|
|
if( lastc == '\n' ) |
|
++line; |
|
|
|
return lastc; |
|
} |
|
|
|
|
|
int CSVParser::read_char_from_utf8_string() |
|
{ |
|
int c; |
|
bool correct; |
|
|
|
lastc = -1; |
|
|
|
do |
|
{ |
|
size_t len = UTF8ToInt(pchar_ascii, c, correct); |
|
pchar_ascii += len; |
|
} |
|
while( *pchar_ascii && !correct ); |
|
|
|
if( correct ) |
|
lastc = c; |
|
|
|
if( lastc == '\n' ) |
|
++line; |
|
|
|
return lastc; |
|
} |
|
|
|
|
|
int CSVParser::read_char_from_ascii_string() |
|
{ |
|
if( *pchar_ascii == 0 ) |
|
lastc = -1; |
|
else |
|
lastc = *(pchar_ascii++); |
|
|
|
if( lastc == '\n' ) |
|
++line; |
|
|
|
return lastc; |
|
} |
|
|
|
|
|
int CSVParser::read_char_no_escape() |
|
{ |
|
if( reading_from_file ) |
|
{ |
|
if( input_as_utf8 ) |
|
return read_utf8_char(); |
|
else |
|
return read_ascii_char(); |
|
} |
|
else |
|
{ |
|
if( reading_from_wchar_string ) |
|
{ |
|
return read_char_from_wchar_string(); |
|
} |
|
else |
|
{ |
|
if( input_as_utf8 ) |
|
return read_char_from_utf8_string(); |
|
else |
|
return read_char_from_ascii_string(); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
int CSVParser::read_char() |
|
{ |
|
return read_char_no_escape(); |
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|