/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa
 */

/*
 * Copyright (c) 2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "csvparser.h"
#include "utf8/utf8.h"


namespace PT
{


/*
 * Open file_name (binary mode, read only) and parse its content as CSV
 * into out_space.
 *
 * Returns cant_open_file when the file cannot be opened,
 * otherwise the status left by parse().
 */
CSVParser::Status CSVParser::parse_file(const char * file_name, Space & out_space)
{
	reading_from_file = true;
	space = &out_space;

	// clear error flags which could have been left by a previous parsing
	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	if( file )
	{
		parse();
		file.close();
	}
	else
	{
		status = cant_open_file;
	}

	return status;
}


/*
 * The same as parse_file(const char *, Space &).
 */
CSVParser::Status CSVParser::parse_file(const std::string & file_name, Space & out_space)
{
	return parse_file(file_name.c_str(), out_space);
}


/*
 * The same as parse_file(const char *, Space &) but the file name is given
 * as a wide string, it is converted to UTF-8 before opening the file.
 */
CSVParser::Status CSVParser::parse_file(const wchar_t * file_name, Space & out_space)
{
	std::string file_name_utf8;

	PT::WideToUTF8(file_name, file_name_utf8);
	return parse_file(file_name_utf8.c_str(), out_space);
}


/*
 * The same as parse_file(const wchar_t *, Space &).
 */
CSVParser::Status CSVParser::parse_file(const std::wstring & file_name, Space & out_space)
{
	return parse_file(file_name.c_str(), out_space);
}


/*
 * Parse a null terminated string as CSV into out_space.
 * Whether the string is read as UTF-8 or byte by byte depends on input_as_utf8.
 */
CSVParser::Status CSVParser::parse(const char * str, Space & out_space)
{
	reading_from_file = false;
	reading_from_wchar_string = false;
	pchar_ascii = str;
	pchar_unicode = nullptr;
	space = &out_space;

	parse();

	return status;
}


/*
 * The same as parse(const char *, Space &).
 */
CSVParser::Status CSVParser::parse(const std::string & str, Space & out_space)
{
	return parse(str.c_str(), out_space);
}


/*
 * Parse a null terminated wide string as CSV into out_space.
 */
CSVParser::Status CSVParser::parse(const wchar_t * str, Space & out_space)
{
	reading_from_file = false;
	reading_from_wchar_string = true;
	pchar_unicode = str;
	pchar_ascii = nullptr;
	space = &out_space;

	parse();

	return status;
}


/*
 * The same as parse(const wchar_t *, Space &).
 */
CSVParser::Status CSVParser::parse(const std::wstring & str, Space & out_space)
{
	return parse(str.c_str(), out_space);
}


/*
 * The main parsing routine, the input source must have been already selected
 * by one of the public parse()/parse_file() methods.
 *
 * The output space is set to an empty table and one table is added to it
 * for each row read from the input.
 */
void CSVParser::parse()
{
	line = 1;
	status = ok;
	space->set_empty_table();
	read_char(); // put first character to lastc

	if( lastc == -1 )
	{
		// an empty file/string, in such a case we return such a space struct (if would be serialized to json): [[]]
		// the row is a local object moved to the output (the same way as in the loop below)
		// so there is no raw allocation which someone would have to release
		Space row_space;
		row_space.set_empty_table();
		space->add(std::move(row_space));
	}

	while( lastc != -1 )
	{
		/*
		 * even if there is an error when parsing we continue to read the file/string
		 */
		Space row_space;
		row_space.set_empty_table();

		parse_row(row_space);
		space->add(std::move(row_space));
	}
}


/*
 * Read values to row_space until a value is not followed by a comma.
 */
void CSVParser::parse_row(Space & row_space)
{
	bool continue_reading;

	do
	{
		continue_reading = read_value_to(row_space);
	}
	while( continue_reading );
}


/*
 * Add a new empty wide string value to row_space and fill it with the next field.
 * A field beginning with a quote character is read as a quoted value.
 *
 * Returns true if the field was followed by a comma (there are more fields in the row).
 */
bool CSVParser::read_value_to(Space & row_space)
{
	Space & space_value = row_space.add_empty_space();
	space_value.set_empty_wstring();

	if( lastc == '"' )
	{
		return read_quoted_value_to(space_value.value.value_wstring);
	}
	else
	{
		return read_non_quoted_value_to(space_value.value.value_wstring);
	}
}


/*
 * Read a quoted field, on entry lastc is the opening quote character.
 * Two quote characters inside the field are read as one quote character.
 *
 * After the closing quote a following comma, CR LF, single CR or single LF is skipped,
 * any other character is left in lastc.
 *
 * Returns true if the field was followed by a comma.
 */
bool CSVParser::read_quoted_value_to(std::wstring & value)
{
	bool is_comma = false;
	bool is_value_character = true;

	while( lastc != -1 && is_value_character )
	{
		// the first call skips the opening quote
		read_char();

		if( lastc == '"' )
		{
			read_char();

			if( lastc == '"' )
			{
				// a doubled quote is an escaped quote character
				value += lastc;
			}
			else
			{
				// it was the closing quote, lastc is the first character after it
				is_value_character = false;
			}
		}
		else
		if( lastc != -1 )
		{
			value += lastc;
		}
	}

	if( lastc == ',' )
	{
		is_comma = true;
		read_char(); // skip the comma character
	}
	else
	if( lastc == 13 )
	{
		read_char(); // skip CR character

		if( lastc == 10 )
			read_char(); // skip LF character from the CRLF sequence
	}
	else
	if( lastc == 10 )
	{
		read_char(); // skip new line character
	}

	return is_comma;
}


/*
 * Read a non quoted field: all characters up to a comma, an LF character
 * or the end of the input.
 *
 * A CR character directly before LF is removed from the value,
 * a CR character in any other place is left as a part of the value.
 *
 * Returns true if the field was followed by a comma.
 */
bool CSVParser::read_non_quoted_value_to(std::wstring & value)
{
	while( lastc != -1 && lastc != ',' && lastc != 10 )
	{
		value += lastc;
		read_char();
	}

	bool is_comma = (lastc == ',');

	if( is_comma )
	{
		read_char(); // skip the comma character
	}
	else
	{
		bool is_new_line = (lastc == 10);

		// check CRLF sequence
		if( is_new_line && !value.empty() && value.back() == 13 )
		{
			value.erase(value.size() - 1, 1);
		}

		if( is_new_line )
		{
			read_char(); // skip the new line character
		}
	}

	return is_comma;
}


/*
 * Read one UTF-8 character from the file to lastc.
 * Incorrect sequences are skipped, lastc is -1 when the stream fails
 * (e.g. the end of the file).
 */
int CSVParser::read_utf8_char()
{
	int c;
	bool correct;

	lastc = -1;

	do
	{
		PT::UTF8ToInt(file, c, correct);

		if( !file )
			return lastc;
	}
	while( !correct );

	lastc = c;

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read one byte from the file to lastc (whatever file.get() returns).
 */
int CSVParser::read_ascii_char()
{
	lastc = file.get();

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read one character from the wide input string to lastc,
 * lastc is -1 at the terminating null character.
 */
int CSVParser::read_char_from_wchar_string()
{
	if( *pchar_unicode == 0 )
		lastc = -1;
	else
		lastc = *(pchar_unicode++);

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read one UTF-8 character from the input string to lastc.
 * Incorrect sequences are skipped as long as there is more input,
 * lastc is -1 if no correct character was read.
 *
 * NOTE(review): this relies on PT::UTF8ToInt() reporting correct == false
 * (and presumably a zero length) at the terminating null character - confirm in utf8.h
 */
int CSVParser::read_char_from_utf8_string()
{
	int c;
	bool correct;

	lastc = -1;

	do
	{
		size_t len = PT::UTF8ToInt(pchar_ascii, c, correct);
		pchar_ascii += len;
	}
	while( *pchar_ascii && !correct );

	if( correct )
		lastc = c;

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read one byte from the input string to lastc,
 * lastc is -1 at the terminating null character.
 */
int CSVParser::read_char_from_ascii_string()
{
	if( *pchar_ascii == 0 )
	{
		lastc = -1;
	}
	else
	{
		// the byte is converted through unsigned char: where plain char is signed
		// bytes >= 0x80 would be negative and 0xFF would be equal to -1 (the end of input marker),
		// now the values are in the range 1..255, the same as those from read_ascii_char()
		lastc = static_cast<unsigned char>(*(pchar_ascii++));
	}

	if( lastc == '\n' )
		++line;

	return lastc;
}


/*
 * Read one character to lastc from the currently selected input:
 * a file or a string, a wide string or UTF-8/one byte characters.
 */
int CSVParser::read_char_no_escape()
{
	if( reading_from_file )
	{
		if( input_as_utf8 )
			return read_utf8_char();
		else
			return read_ascii_char();
	}
	else
	{
		if( reading_from_wchar_string )
		{
			return read_char_from_wchar_string();
		}
		else
		{
			if( input_as_utf8 )
				return read_char_from_utf8_string();
			else
				return read_char_from_ascii_string();
		}
	}
}


/*
 * Read the next character to lastc and return it (-1 at the end of the input).
 */
int CSVParser::read_char()
{
	return read_char_no_escape();
}


}