/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2012-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "spaceparser.h" #include "utf8/utf8.h" #include "convert/strtoint.h" namespace pt { SpaceParser::SpaceParser() { root_space = nullptr; space_start = '{'; space_end = '}'; option_delimiter = ','; input_as_utf8 = true; } void SpaceParser::use_utf8(bool utf) { input_as_utf8 = utf; } int SpaceParser::get_last_parsed_line() { return line; } SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space) { reading_from_file = true; parsing_space = false; root_space = &out_space; file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) { parse_root_space(clear_space); file.close(); } else { status = cant_open_file; } return status; } SpaceParser::Status SpaceParser::parse_json_file(const std::string & file_name, Space & out_space, bool clear_space) { return parse_json_file(file_name.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_json_file(const wchar_t * file_name, Space & out_space, bool clear_space) { std::string file_name_utf8; wide_to_utf8(file_name, file_name_utf8); return parse_json_file(file_name_utf8.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name, Space & out_space, bool clear_space) { return parse_json_file(file_name.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space) { reading_from_file = true; parsing_space = true; root_space = &out_space; file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) { parse_root_space(clear_space); file.close(); } else { status = cant_open_file; } return status; } SpaceParser::Status SpaceParser::parse_space_file(const std::string & file_name, Space & out_space, bool clear_space) { return parse_space_file(file_name.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_space_file(const wchar_t * file_name, Space & out_space, bool clear_space) { std::string file_name_utf8; wide_to_utf8(file_name, file_name_utf8); return parse_space_file(file_name_utf8.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name, Space & out_space, bool clear_space) { return parse_space_file(file_name.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space) { reading_from_file = false; reading_from_wchar_string = false; pchar_ascii = str; pchar_unicode = 0; parsing_space = false; root_space = &out_space; parse_root_space(clear_space); return status; } SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out_space, bool clear_space) { return parse_json(str.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space) { reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = str; pchar_ascii = 0; parsing_space = false; root_space = &out_space; parse_root_space(clear_space); return status; } SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & out_space, bool clear_space) { return parse_json(str.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space) { reading_from_file = false; reading_from_wchar_string = false; pchar_ascii = str; pchar_unicode = 0; parsing_space = true; root_space = &out_space; parse_root_space(clear_space); return status; } SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & out_space, bool clear_space) { return parse_space(str.c_str(), out_space, clear_space); } SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space) { reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = str; pchar_ascii = 0; parsing_space = true; root_space = &out_space; parse_root_space(clear_space); return status; } SpaceParser::Status SpaceParser::parse_space(const std::wstring & str, Space & out_space, bool clear_space) { return parse_space(str.c_str(), out_space, clear_space); } void SpaceParser::parse_root_space(bool clear_root_space) { line = 1; status = ok; if( clear_root_space ) { root_space->set_empty_object(); } read_char(); // put first character to lastc if( parsing_space ) { separator = '='; table_start = '('; table_end = ')'; parse_space(root_space); } else { separator = ':'; table_start = '['; table_end = ']'; parse(root_space, false, false); } skip_white(); if( lastc != -1 ) status = syntax_error; token.clear(); } void SpaceParser::parse(Space * space, bool is_object_value, bool is_table_value) { skip_white(); if( lastc == space_start ) { parse_space(space); } else if( lastc == table_start ) { parse_table(space); } else if( lastc == '"' ) // IMPROVEME define a variable { parse_text_value(space); } else { read_string_value(token, is_object_value, is_table_value); if( token == L"null" ) { space->set_null(); } else if( token == L"true" ) { space->set(true); } else if( token == L"false" ) { space->set(false); } else if( is_integer_token() ) { parse_integer_value(space); } else if( is_floating_point_token() ) { parse_floating_point_value(space); } else { if( parsing_space ) { space->set(token); } else { status = syntax_error; } } } } void SpaceParser::parse_space(Space * space) { /* * in Space format in global namespace the space start character is not required */ bool need_space_start_character = !parsing_space || space != root_space; if( need_space_start_character ) { read_char(); // inserting a next character after the space_start char to lastc } if( !space->is_object() ) space->set_empty_object(); parse_key_value_pairs(space); if( need_space_start_character ) { if( lastc == space_end ) { read_char(); } else { status = syntax_error; } } } void SpaceParser::parse_text_value(Space * space) { space->set_empty_wstring(); std::wstring * str = space->get_wstr(); if( parsing_space ) read_multiline_token_quoted(*str); else read_token_quoted(*str); } void SpaceParser::parse_integer_value(Space * space) { const wchar_t * after_str = nullptr; bool was_overflow = false; int base = 10; if( parsing_space ) { // in Space format when the integer value begins with a zero it means // this is an octal number if( !token.empty() && token[0] == '0' ) base = 8; } long long val = Toll(token.c_str(), base, &after_str, &was_overflow, false); if( was_overflow ) { status = syntax_error; } else if( size_t(after_str - token.c_str()) != token.size() ) { status = syntax_error; } else { space->set(val); } } void SpaceParser::parse_floating_point_value(Space * space) { wchar_t * after_str = nullptr; double val = wcstod(token.c_str(), &after_str); if( errno == ERANGE ) { status = syntax_error; } else if( size_t(after_str - token.c_str()) != token.size() ) { status = syntax_error; } else { space->set(val); } } void SpaceParser::parse_table(Space * space) { read_char(); // inserting a next character after the table_start char to lastc space->set_empty_table(); parse_values_list(space); if( lastc == table_end ) { read_char(); } else { status = syntax_error; } } void SpaceParser::parse_key_value_pairs(Space * space) { bool is_first = true; skip_white(); while( status == ok && lastc != space_end && lastc != -1 ) { if( !is_first ) { skip_white(); if( lastc == option_delimiter ) { read_char(); // inserting a next character after the option_delimiter to lastc if( parsing_space ) { // in space format a space_end character is allowed to be after the option_delimiter skip_white(); if( lastc == space_end ) break; } } else if( !parsing_space ) { // in json format the option_delimiter is required status = syntax_error; } } if( status == ok ) { read_key(); if( status == ok ) { skip_white(); if( lastc == separator ) { read_char(); // inserting a next character after the separator to lastc Space & new_space = space->add(token.c_str(), new Space()); parse(&new_space, true, false); } else { status = syntax_error; } } } is_first = false; skip_white(); } } void SpaceParser::parse_values_list(Space * space) { bool is_first = true; skip_white(); while( status == ok && lastc != table_end && lastc != -1 ) { if( !is_first ) { skip_white(); if( lastc == option_delimiter ) // may add a new delimiter for tables? default the same as for objects... { read_char(); // inserting a next character after the delimiter if( parsing_space ) { // in space format a table_end character is allowed to be after the last table item skip_white(); if( lastc == table_end ) break; } } else if( !parsing_space ) { // in json format the option_delimiter is required status = syntax_error; } } if( status == ok ) { Space * new_space = &space->add(new Space()); parse(new_space, false, true); } is_first = false; skip_white(); } } bool SpaceParser::is_integer_token() { if( token.empty() ) return false; size_t i = 0; if( token[i] == '-' ) { i += 1; if( token.size() == 1 ) return false; } for( ; i < token.size() ; ++i) { if( token[i] < '0' || token[i] > '9' ) { return false; } } return true; } bool SpaceParser::is_floating_point_token() { bool was_dot = false; bool was_exponential = false; bool was_plus_minus_sign = false; if( token.empty() ) return false; size_t i = 0; if( token[i] == '-' ) { i += 1; if( token.size() == 1 ) return false; } for( ; i < token.size() ; ++i) { if( token[i] == '.' ) { if( was_dot || was_exponential ) return false; was_dot = true; } else if( token[i] == 'e' || token[i]=='E' ) { if( was_exponential ) return false; was_exponential = true; // the exponential character cannot be the last character if( i + 1 == token.size() ) return false; } else if( token[i] == '+' || token[i] == '-' ) { if( was_plus_minus_sign || !was_exponential ) return false; // the plus or minus should be after the exponential character if( i > 0 && (token[i-1] != 'e' && token[i-1] != 'E') ) return false; was_plus_minus_sign = true; } else if( token[i] < '0' || token[i] > '9' ) { return false; } } return true; } bool SpaceParser::is_white(int c) { // 13 (\r) is at the end of a line in a dos file \r\n // 160 is an unbreakable space if( c==' ' || c=='\t' || c==13 || c==160 || c==10 ) return true; return false; } bool SpaceParser::is_alfa_numeric_char(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' || c=='-' || c=='+'; } void SpaceParser::skip_line() { while( lastc != -1 && (char_was_escaped || lastc != '\n') ) read_char(); } void SpaceParser::skip_white() { if( parsing_space ) { while( is_white(lastc) || (!char_was_escaped && lastc == '#') ) { if( lastc == '#' ) skip_line(); else read_char(); } } else { while( is_white(lastc) ) { read_char(); } } } void SpaceParser::trim_last_white(std::wstring & s) { std::wstring::size_type i; for(i=s.size() ; i>0 && is_white(s[i-1]) ; --i) { } if( i < s.size() ) { s.erase(i, std::wstring::npos); } } void SpaceParser::read_token_until_delimiter(std::wstring & token, int delimiter1, int delimiter2) { token.clear(); while( lastc != -1 && (char_was_escaped || (lastc != '\n' && lastc != '#' && lastc != delimiter1 && lastc != delimiter2)) ) { token += static_cast(lastc); read_char(); } trim_last_white(token); } void SpaceParser::read_alfa_numeric_token(std::wstring & token) { token.clear(); while( is_alfa_numeric_char(lastc) ) { token += static_cast(lastc); read_char(); } } void SpaceParser::read_string_value(std::wstring & token, bool is_object_value, bool is_table_value) { if( parsing_space ) { if( is_object_value ) { read_token_until_delimiter(token, space_end, -1); } else if( is_table_value ) { read_token_until_delimiter(token, table_end, option_delimiter); } else { read_token_until_delimiter(token, -1, -1); } } else { read_alfa_numeric_token(token); } } void SpaceParser::read_space_field_token(std::wstring & token) { token.clear(); while( lastc != -1 && (char_was_escaped || (lastc != separator && lastc != 10 && lastc != space_start && lastc != '#' )) ) { token += static_cast(lastc); read_char(); } trim_last_white(token); } // IMPROVEME in JSON we should not allow non-escaped a new line character void SpaceParser::read_token_quoted(std::wstring & token) { token.clear(); read_char(); // skipping the first quotation mark while( lastc != -1 && (char_was_escaped || (lastc != '"' && lastc != 10)) ) { token += static_cast(lastc); read_char(); } if( !char_was_escaped && lastc == '"' ) { read_char(); // skipping the last quotation mark } else { status = syntax_error; } } void SpaceParser::read_multiline_token_quoted(std::wstring & token) { token.clear(); read_char(); // skipping the first quotation mark while( lastc != -1 && (char_was_escaped || lastc != '"') ) { token += static_cast(lastc); read_char(); } if( !char_was_escaped && lastc == '"' ) { read_char(); // skipping the last quotation mark } else { status = syntax_error; } } /* * this method is used to read the field name (key) in an object * or to read the space child name (used in Space format) */ void SpaceParser::read_key() { skip_white(); if( parsing_space ) { if( lastc == '"' ) { read_multiline_token_quoted(token); } else { read_space_field_token(token); } } else { if( lastc == '"' ) { read_token_quoted(token); } else { status = syntax_error; } } } bool SpaceParser::is_hex_digit(wchar_t c) { return ((c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') ); } int SpaceParser::hex_to_int(wchar_t c) { if( c>='0' && c<='9' ) return c - '0'; if( c>='a' && c<='f' ) return c - 'a' + 10; if( c>='A' && c<='F' ) return c - 'A' + 10; return 0; } /* * format: \uHHHH where H is a hex digit 0-F */ bool SpaceParser::read_unicode_four_digit_format(bool has_first_byte, int first_byte) { int c; int value = 0; for(int i=0 ; i<4 ; ++i) { if( i == 0 && has_first_byte ) { c = first_byte; } else { c = read_char_no_escape(); } if( !is_hex_digit(c) ) { return false; } value = (value << 4) | hex_to_int(c); } lastc = static_cast(value); return true; } /* * format: \uHHHH and optionally following by \uHHHH * */ void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte) { bool ok = read_unicode_four_digit_format(has_first_byte, first_byte); if( ok && pt::is_first_surrogate_char(lastc) ) { int c1 = lastc; int c = read_char_no_escape(); ok = ok && (c == '\\'); if( ok ) { c = read_char_no_escape(); ok = ok && (c == 'u'); ok = ok && read_unicode_four_digit_format(false, 0); if( ok && pt::is_second_surrogate_char(lastc) ) { int c2 = lastc; ok = ok && pt::surrogate_pair_to_int(c1, c2, lastc); } } } if( !ok || !pt::utf8_check_range(lastc) ) { lastc = 0xFFFD; // U+FFFD "replacement character"; } } /* * format: \u{H...} where H is a hex digit 0-F, minimum digits: 1, maximum digits: 6 */ void SpaceParser::read_unicode_floating_format() { int c; int value = 0; int i; // max 6 hex digits + '}' for(i=0 ; i<7 ; ++i) { c = read_char_no_escape(); if( !is_hex_digit(c) ) { break; } value = (value << 4) | hex_to_int(c); } if( i > 0 && c == '}' && pt::utf8_check_range(value) ) { lastc = static_cast(value); } else { lastc = 0xFFFD; // U+FFFD "replacement character"; } } void SpaceParser::read_unicode_code_point() { if( parsing_space ) { int c = read_char_no_escape(); if( c == '{' ) { read_unicode_floating_format(); } else { read_unicode_json_format(true, c); } } else { read_unicode_json_format(false, 0); } } int SpaceParser::read_char() { char_was_escaped = false; read_char_no_escape(); if( lastc == '\\' ) { char_was_escaped = true; read_char_no_escape(); switch(lastc) { case '0': lastc = 0; break; case 't': lastc = '\t'; break; case 'r': lastc = '\r'; break; case 'n': lastc = '\n'; break; case 'b': lastc = 0x08; break; case 'f': lastc = 0x0c; break; case 'u': read_unicode_code_point(); break; // "in other cases we return the last character, so two \\ returns one \ " } } return lastc; } } // namespace