pikotools/src/space/spaceparser.cpp

1283 lines
22 KiB
C++

/*
* This file is a part of PikoTools
* and is distributed under the 2-Clause BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2012-2022, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <cstdlib>
#include <wchar.h>
#include "spaceparser.h"
#include "utf8/utf8.h"
#include "convert/strtoint.h"
namespace pt
{
/*
 * sets the default configuration: '{' and '}' as the space delimiters,
 * ',' as the option delimiter, the input_as_utf8 flag switched on,
 * no output space attached and all four limits set to zero
 * (the parsing code treats a zero limit as "no limit")
 */
SpaceParser::SpaceParser()
{
    // nothing to write to until one of the parse methods is called
    this->root_space = nullptr;

    // syntax characters
    this->space_start = '{';
    this->space_end = '}';
    this->option_delimiter = ',';

    this->input_as_utf8 = true;

    // limits
    this->nested_levels_limit = 0;
    this->all_items_limit = 0;
    this->table_items_limit = 0;
    this->object_items_limit = 0;
}
void SpaceParser::use_utf8(bool utf)
{
input_as_utf8 = utf;
}
int SpaceParser::get_last_parsed_line()
{
return line;
}
int SpaceParser::get_last_parsed_column()
{
return column;
}
void SpaceParser::set_object_items_limit(size_t val)
{
this->object_items_limit = val;
}
void SpaceParser::set_table_items_limit(size_t val)
{
this->table_items_limit = val;
}
void SpaceParser::set_all_items_limit(size_t val)
{
this->all_items_limit = val;
}
void SpaceParser::set_nested_level_limit(size_t val)
{
this->nested_levels_limit = val;
}
size_t SpaceParser::get_object_items_limit()
{
return object_items_limit;
}
size_t SpaceParser::get_table_items_limit()
{
return table_items_limit;
}
size_t SpaceParser::get_all_items_limit()
{
return all_items_limit;
}
size_t SpaceParser::get_nested_level_limit()
{
return nested_levels_limit;
}
void SpaceParser::prepare_to_parsing()
{
clear_input_flags();
current_items_counter = 0;
current_nested_level = 0;
}
/*
 * parses a file in JSON format into out_space
 *
 * file_name   - path of the file, opened in binary mode
 * out_space   - the root space which receives the parsed content
 * clear_space - passed to parse_root_space(), if true the root space
 *               is set to an empty object before parsing
 *
 * returns the parsing status or cant_open_file if the file cannot be opened
 */
SpaceParser::Status SpaceParser::parse_json_file(const char * file_name, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = false;
    reading_from_file = true;

    file.open(file_name, std::ios_base::binary | std::ios_base::in);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    parse_root_space(clear_space);
    file.close();

    return status;
}
// std::string overload, forwards to the const char * version
SpaceParser::Status SpaceParser::parse_json_file(const std::string & file_name, Space & out_space, bool clear_space)
{
    const char * raw_name = file_name.c_str();

    return parse_json_file(raw_name, out_space, clear_space);
}
// wide string overload: the file name is converted with wide_to_utf8()
// and passed to the const char * version
SpaceParser::Status SpaceParser::parse_json_file(const wchar_t * file_name, Space & out_space, bool clear_space)
{
    std::string narrow_name;
    wide_to_utf8(file_name, narrow_name);

    const char * raw_name = narrow_name.c_str();

    return parse_json_file(raw_name, out_space, clear_space);
}
// std::wstring overload, forwards to the const wchar_t * version
SpaceParser::Status SpaceParser::parse_json_file(const std::wstring & file_name, Space & out_space, bool clear_space)
{
    const wchar_t * raw_name = file_name.c_str();

    return parse_json_file(raw_name, out_space, clear_space);
}
/*
 * parses a file in Space format into out_space
 *
 * file_name   - path of the file, opened in binary mode
 * out_space   - the root space which receives the parsed content
 * clear_space - passed to parse_root_space(), if true the root space
 *               is set to an empty object before parsing
 *
 * returns the parsing status or cant_open_file if the file cannot be opened
 */
SpaceParser::Status SpaceParser::parse_space_file(const char * file_name, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = true;
    reading_from_file = true;

    file.open(file_name, std::ios_base::binary | std::ios_base::in);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    parse_root_space(clear_space);
    file.close();

    return status;
}
// std::string overload, forwards to the const char * version
SpaceParser::Status SpaceParser::parse_space_file(const std::string & file_name, Space & out_space, bool clear_space)
{
    const char * raw_name = file_name.c_str();

    return parse_space_file(raw_name, out_space, clear_space);
}
// wide string overload: the file name is converted with wide_to_utf8()
// and passed to the const char * version
SpaceParser::Status SpaceParser::parse_space_file(const wchar_t * file_name, Space & out_space, bool clear_space)
{
    std::string narrow_name;
    wide_to_utf8(file_name, narrow_name);

    const char * raw_name = narrow_name.c_str();

    return parse_space_file(raw_name, out_space, clear_space);
}
// std::wstring overload, forwards to the const wchar_t * version
SpaceParser::Status SpaceParser::parse_space_file(const std::wstring & file_name, Space & out_space, bool clear_space)
{
    const wchar_t * raw_name = file_name.c_str();

    return parse_space_file(raw_name, out_space, clear_space);
}
/*
 * parses a JSON text given as a char string (stored in pchar_ascii)
 * into out_space and returns the parsing status
 */
SpaceParser::Status SpaceParser::parse_json(const char * str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = false;
    pchar_ascii = str;

    parse_root_space(clear_space);

    return status;
}
// std::string overload, forwards to the const char * version
SpaceParser::Status SpaceParser::parse_json(const std::string & str, Space & out_space, bool clear_space)
{
    const char * raw_text = str.c_str();

    return parse_json(raw_text, out_space, clear_space);
}
/*
 * parses a JSON text given as a wide string (stored in pchar_unicode)
 * into out_space and returns the parsing status
 */
SpaceParser::Status SpaceParser::parse_json(const wchar_t * str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = false;
    pchar_unicode = str;

    parse_root_space(clear_space);

    return status;
}
// std::wstring overload, forwards to the const wchar_t * version
SpaceParser::Status SpaceParser::parse_json(const std::wstring & str, Space & out_space, bool clear_space)
{
    const wchar_t * raw_text = str.c_str();

    return parse_json(raw_text, out_space, clear_space);
}
/*
 * parses a JSON text taken from a TextStream into out_space
 * and returns the parsing status
 *
 * the input is read through pointers to two local iterators (begin and end
 * of the stream); these pointers are cleared before returning so that the
 * parser never keeps addresses of objects which no longer exist
 */
SpaceParser::Status SpaceParser::parse_json(const pt::TextStream & str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    pt::TextStream::const_iterator start = str.begin();
    pt::TextStream::const_iterator end = str.end();

    text_stream_iterator = &start;
    text_stream_iterator_end = &end;
    parsing_space = false;
    root_space = &out_space;

    parse_root_space(clear_space);

    // start and end are destroyed when this method returns
    text_stream_iterator = nullptr;
    text_stream_iterator_end = nullptr;

    return status;
}
/*
 * parses a JSON text taken from a WTextStream into out_space
 * and returns the parsing status
 *
 * the input is read through pointers to two local iterators (begin and end
 * of the stream); these pointers are cleared before returning so that the
 * parser never keeps addresses of objects which no longer exist
 */
SpaceParser::Status SpaceParser::parse_json(const pt::WTextStream & str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    pt::WTextStream::const_iterator start = str.begin();
    pt::WTextStream::const_iterator end = str.end();

    wtext_stream_iterator = &start;
    wtext_stream_iterator_end = &end;
    parsing_space = false;
    root_space = &out_space;

    parse_root_space(clear_space);

    // start and end are destroyed when this method returns
    wtext_stream_iterator = nullptr;
    wtext_stream_iterator_end = nullptr;

    return status;
}
/*
 * parses a text in Space format given as a char string (stored in pchar_ascii)
 * into out_space and returns the parsing status
 */
SpaceParser::Status SpaceParser::parse_space(const char * str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = true;
    pchar_ascii = str;

    parse_root_space(clear_space);

    return status;
}
// std::string overload, forwards to the const char * version
SpaceParser::Status SpaceParser::parse_space(const std::string & str, Space & out_space, bool clear_space)
{
    const char * raw_text = str.c_str();

    return parse_space(raw_text, out_space, clear_space);
}
/*
 * parses a text in Space format given as a wide string (stored in pchar_unicode)
 * into out_space and returns the parsing status
 */
SpaceParser::Status SpaceParser::parse_space(const wchar_t * str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    root_space = &out_space;
    parsing_space = true;
    pchar_unicode = str;

    parse_root_space(clear_space);

    return status;
}
// std::wstring overload, forwards to the const wchar_t * version
SpaceParser::Status SpaceParser::parse_space(const std::wstring & str, Space & out_space, bool clear_space)
{
    const wchar_t * raw_text = str.c_str();

    return parse_space(raw_text, out_space, clear_space);
}
/*
 * parses a text in Space format taken from a TextStream into out_space
 * and returns the parsing status
 *
 * the input is read through pointers to two local iterators (begin and end
 * of the stream); these pointers are cleared before returning so that the
 * parser never keeps addresses of objects which no longer exist
 */
SpaceParser::Status SpaceParser::parse_space(const pt::TextStream & str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    pt::TextStream::const_iterator start = str.begin();
    pt::TextStream::const_iterator end = str.end();

    text_stream_iterator = &start;
    text_stream_iterator_end = &end;
    parsing_space = true;
    root_space = &out_space;

    parse_root_space(clear_space);

    // start and end are destroyed when this method returns
    text_stream_iterator = nullptr;
    text_stream_iterator_end = nullptr;

    return status;
}
/*
 * parses a text in Space format taken from a WTextStream into out_space
 * and returns the parsing status
 *
 * the input is read through pointers to two local iterators (begin and end
 * of the stream); these pointers are cleared before returning so that the
 * parser never keeps addresses of objects which no longer exist
 */
SpaceParser::Status SpaceParser::parse_space(const pt::WTextStream & str, Space & out_space, bool clear_space)
{
    prepare_to_parsing();

    pt::WTextStream::const_iterator start = str.begin();
    pt::WTextStream::const_iterator end = str.end();

    wtext_stream_iterator = &start;
    wtext_stream_iterator_end = &end;
    parsing_space = true;
    root_space = &out_space;

    parse_root_space(clear_space);

    // start and end are destroyed when this method returns
    wtext_stream_iterator = nullptr;
    wtext_stream_iterator_end = nullptr;

    return status;
}
/*
 * parses the whole input into root_space
 *
 * the format is selected by parsing_space:
 *   Space format: '=' as the key/value separator, tables in ( )
 *   JSON format:  ':' as the key/value separator, tables in [ ]
 *
 * after the root value only white characters are allowed,
 * anything else sets status to syntax_error
 */
void SpaceParser::parse_root_space(bool clear_root_space)
{
    status = ok;
    line = 1;

    if( clear_root_space )
        root_space->set_empty_object();

    read_char(); // the first input character goes to lastc

    separator   = parsing_space ? '=' : ':';
    table_start = parsing_space ? '(' : '[';
    table_end   = parsing_space ? ')' : ']';

    if( parsing_space )
        parse_space(root_space);
    else
        parse(root_space, false, false);

    if( status == ok )
    {
        skip_white();

        // there should be nothing more in the input
        if( lastc != -1 )
            status = syntax_error;
    }

    token.clear();
}
/*
 * parses one value and stores it in the given space
 *
 * depending on the first non-white character the value is an object,
 * a table or a quoted text; otherwise a bare token is read and interpreted
 * as null, true, false, an integer or a floating point number.
 * Any other bare token is a string in Space format and a syntax error in JSON.
 *
 * is_object_value / is_table_value are passed to read_string_value()
 * which uses them to select the characters terminating a bare token
 */
void SpaceParser::parse(Space * space, bool is_object_value, bool is_table_value)
{
    skip_white();

    if( lastc == space_start )
    {
        parse_space(space);
        return;
    }

    if( lastc == table_start )
    {
        parse_table(space);
        return;
    }

    if( lastc == '"' ) // IMPROVEME define a variable
    {
        parse_text_value(space);
        return;
    }

    read_string_value(token, is_object_value, is_table_value);

    if( token == L"null" )
    {
        space->set_null();
        return;
    }

    if( token == L"true" )
    {
        space->set(true);
        return;
    }

    if( token == L"false" )
    {
        space->set(false);
        return;
    }

    if( is_integer_token() )
    {
        parse_integer_value(space);
        return;
    }

    if( is_floating_point_token() )
    {
        parse_floating_point_value(space);
        return;
    }

    // a bare string: allowed only in Space format
    if( parsing_space )
        space->set(token);
    else
        status = syntax_error;
}
/*
 * parses an object (a list of key/value pairs) into the given space
 *
 * sets limit_nested_level_exceeded when the nesting limit is reached
 * and syntax_error when a required closing space_end character is missing
 */
void SpaceParser::parse_space(Space * space)
{
    // the level counter is incremented only when the limit is in use
    const bool level_allowed = (nested_levels_limit == 0) || (current_nested_level++ < nested_levels_limit);

    if( !level_allowed )
    {
        status = limit_nested_level_exceeded;
    }
    else
    {
        /*
         * in Space format the root space is not enclosed
         * in the space_start and space_end characters
         */
        const bool is_enclosed = !parsing_space || space != root_space;

        if( is_enclosed )
            read_char(); // lastc: the character after space_start

        if( !space->is_object() )
            space->set_empty_object();

        parse_key_value_pairs(space);

        if( status == ok && is_enclosed )
        {
            if( lastc == space_end )
                read_char();
            else
                status = syntax_error;
        }
    }

    if( current_nested_level > 0 )
        current_nested_level -= 1;
}
/*
 * reads a quoted text directly into the wide string of the given space;
 * the multiline reader is used in Space format, the single line reader in JSON
 */
void SpaceParser::parse_text_value(Space * space)
{
    space->set_empty_wstring();
    std::wstring & text = *space->get_wstr();

    if( !parsing_space )
        read_token_quoted(text);
    else
        read_multiline_token_quoted(text);
}
/*
 * converts the current token to a long long and stores it in the given space
 *
 * status is set to syntax_error on an overflow
 * or when the whole token was not consumed by the conversion
 */
void SpaceParser::parse_integer_value(Space * space)
{
    int base = 10;

    // in Space format an integer beginning with a zero is an octal number
    if( parsing_space && !token.empty() && token[0] == '0' )
        base = 8;

    const wchar_t * parsed_end = nullptr;
    bool overflow = false;
    long long value = Toll(token.c_str(), base, &parsed_end, &overflow, false);

    if( overflow || size_t(parsed_end - token.c_str()) != token.size() )
    {
        status = syntax_error;
        return;
    }

    space->set(value);
}
/*
 * converts the current token to a double and stores it in the given space
 *
 * status is set to syntax_error when wcstod() reports a range error
 * or when the whole token was not consumed by the conversion
 */
void SpaceParser::parse_floating_point_value(Space * space)
{
    wchar_t * after_str = nullptr;

    // wcstod() never clears errno, so without this reset an ERANGE left by
    // an earlier library call would make a correct number look out of range
    errno = 0;
    double val = wcstod(token.c_str(), &after_str);

    if( errno == ERANGE )
    {
        status = syntax_error;
    }
    else
    if( size_t(after_str - token.c_str()) != token.size() )
    {
        status = syntax_error;
    }
    else
    {
        space->set(val);
    }
}
/*
 * parses a table (a list of values) into the given space
 *
 * sets limit_nested_level_exceeded when the nesting limit is reached
 * and syntax_error when the closing table_end character is missing
 */
void SpaceParser::parse_table(Space * space)
{
    // the level counter is incremented only when the limit is in use
    const bool level_allowed = (nested_levels_limit == 0) || (current_nested_level++ < nested_levels_limit);

    if( !level_allowed )
    {
        status = limit_nested_level_exceeded;
    }
    else
    {
        read_char(); // lastc: the character after table_start
        space->set_empty_table();
        parse_values_list(space);

        if( status == ok )
        {
            if( lastc == table_end )
                read_char();
            else
                status = syntax_error;
        }
    }

    if( current_nested_level > 0 )
        current_nested_level -= 1;
}
/*
 * reads consecutive "key separator value" items and adds them to the given space
 *
 * the loop ends on the space_end character, on the end of the input (lastc == -1)
 * or when status becomes different from ok
 *
 * items are delimited with option_delimiter: the delimiter is required in JSON
 * and optional in Space format
 *
 * status can be set here to: syntax_error, limit_object_items_exceeded
 * or limit_all_items_exceeded
 */
void SpaceParser::parse_key_value_pairs(Space * space)
{
bool is_first = true;
skip_white();
while( status == ok && lastc != space_end && lastc != -1 )
{
// there is no delimiter before the first item
if( !is_first )
{
skip_white();
if( lastc == option_delimiter )
{
read_char(); // inserting a next character after the option_delimiter to lastc
if( parsing_space )
{
// in space format a space_end character is allowed to be after the option_delimiter
skip_white();
if( lastc == space_end )
break;
}
}
else
if( !parsing_space )
{
// in json format the option_delimiter is required
status = syntax_error;
}
}
if( status == ok )
{
// the key is read to the token member
read_key();
if( status == ok )
{
skip_white();
if( lastc == separator )
{
read_char(); // inserting a next character after the separator to lastc
// a zero limit means there is no limit
if( object_items_limit == 0 || !space->is_object() || (space->object_size() < object_items_limit) )
{
// the child is added before the all_items_limit test, so when that limit
// is exceeded an unparsed child is left in the space
// NOTE(review): presumably add() takes the ownership of the new Space - verify
Space & new_space = space->add(token.c_str(), new Space());
// the items counter is incremented only when the limit is in use
if( all_items_limit == 0 || current_items_counter++ < all_items_limit )
{
parse(&new_space, true, false);
}
else
{
status = limit_all_items_exceeded;
}
}
else
{
status = limit_object_items_exceeded;
}
}
else
{
status = syntax_error;
}
}
}
is_first = false;
if( status == ok )
{
skip_white();
}
}
}
/*
 * reads consecutive values and adds them as items of the given table
 *
 * the loop ends on the table_end character, on the end of the input (lastc == -1)
 * or when status becomes different from ok
 *
 * items are delimited with option_delimiter: the delimiter is required in JSON
 * and optional in Space format
 *
 * status can be set here to: syntax_error, limit_table_items_exceeded
 * or limit_all_items_exceeded
 */
void SpaceParser::parse_values_list(Space * space)
{
bool is_first = true;
skip_white();
while( status == ok && lastc != table_end && lastc != -1 )
{
// there is no delimiter before the first item
if( !is_first )
{
skip_white();
if( lastc == option_delimiter ) // may add a new delimiter for tables? default the same as for objects...
{
read_char(); // inserting a next character after the delimiter
if( parsing_space )
{
// in space format a table_end character is allowed to be after the last table item
skip_white();
if( lastc == table_end )
break;
}
}
else
if( !parsing_space )
{
// in json format the option_delimiter is required
status = syntax_error;
}
}
if( status == ok )
{
// a zero limit means there is no limit
if( table_items_limit == 0 || !space->is_table() || (space->table_size() < table_items_limit) )
{
// the item is added before the all_items_limit test, so when that limit
// is exceeded an unparsed item is left in the table
// NOTE(review): presumably add() takes the ownership of the new Space - verify
Space * new_space = &space->add(new Space());
// the items counter is incremented only when the limit is in use
if( all_items_limit == 0 || current_items_counter++ < all_items_limit )
{
parse(new_space, false, true);
}
else
{
status = limit_all_items_exceeded;
}
}
else
{
status = limit_table_items_exceeded;
}
}
is_first = false;
if( status == ok )
{
skip_white();
}
}
}
/*
 * returns true if the current token consists of an optional leading minus sign
 * followed by at least one decimal digit and nothing else
 */
bool SpaceParser::is_integer_token()
{
    const size_t len = token.size();

    if( len == 0 )
        return false;

    size_t pos = 0;

    if( token[0] == '-' )
    {
        // the minus sign alone is not a number
        if( len == 1 )
            return false;

        pos = 1;
    }

    while( pos < len )
    {
        const wchar_t c = token[pos];

        if( c < '0' || c > '9' )
            return false;

        pos += 1;
    }

    return true;
}
/*
 * returns true if the current token looks like a floating point number:
 *
 *   [-] digits-with-at-most-one-dot [ (e|E) [+|-] digits ]
 *
 * at least one digit is required before the exponent character
 * (the dot can be on either side of the digits, e.g. "1." or ".5")
 * and, if there is an exponent, at least one digit is required in it.
 *
 * Tokens without any digit in a required place (".", "e5", "-.", "1e+")
 * are not numbers: wcstod() is not able to convert them completely, so
 * reporting them as floating point would end in a syntax error instead
 * of letting the caller treat them as ordinary strings in Space format.
 */
bool SpaceParser::is_floating_point_token()
{
    bool was_dot = false;
    bool was_exponential = false;
    bool was_plus_minus_sign = false;
    bool was_mantissa_digit = false;
    bool was_exponent_digit = false;

    if( token.empty() )
        return false;

    size_t i = 0;

    if( token[i] == '-' )
    {
        i += 1;

        if( token.size() == 1 )
            return false;
    }

    for( ; i < token.size() ; ++i)
    {
        const wchar_t c = token[i];

        if( c == '.' )
        {
            // only one dot and only in the mantissa
            if( was_dot || was_exponential )
                return false;

            was_dot = true;
        }
        else
        if( c == 'e' || c == 'E' )
        {
            // only one exponent and only after at least one mantissa digit
            if( was_exponential || !was_mantissa_digit )
                return false;

            was_exponential = true;
        }
        else
        if( c == '+' || c == '-' )
        {
            if( was_plus_minus_sign || !was_exponential )
                return false;

            // the sign is allowed only directly after the exponent character
            // (i > 0 here because was_exponential is set)
            if( token[i-1] != 'e' && token[i-1] != 'E' )
                return false;

            was_plus_minus_sign = true;
        }
        else
        if( c >= '0' && c <= '9' )
        {
            if( was_exponential )
                was_exponent_digit = true;
            else
                was_mantissa_digit = true;
        }
        else
        {
            return false;
        }
    }

    return was_mantissa_digit && (!was_exponential || was_exponent_digit);
}
/*
 * returns true for a white character: a space, a tab, a line feed (10),
 * a carriage return (13, at the end of a line in a dos file \r\n)
 * or an unbreakable space (160)
 */
bool SpaceParser::is_white(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 10:
    case 13:
    case 160:
        return true;

    default:
        return false;
    }
}
/*
 * returns true for characters allowed in a bare JSON token:
 * ASCII letters, decimal digits and the '.', '-' and '+' characters
 */
bool SpaceParser::is_alfa_numeric_char(int c)
{
    if( c >= '0' && c <= '9' )
        return true;

    if( c >= 'a' && c <= 'z' )
        return true;

    if( c >= 'A' && c <= 'Z' )
        return true;

    return c == '+' || c == '-' || c == '.';
}
/*
 * reads characters until a non-escaped new line character
 * or the end of the input is in lastc
 */
void SpaceParser::skip_line()
{
    for( ; lastc != -1 ; read_char() )
    {
        if( !char_was_escaped && lastc == '\n' )
            break;
    }
}
/*
 * skips white characters
 * in Space format it additionally skips comments: from a non-escaped '#'
 * character to the end of the line
 */
void SpaceParser::skip_white()
{
    if( !parsing_space )
    {
        while( is_white(lastc) )
            read_char();

        return;
    }

    for(;;)
    {
        if( !char_was_escaped && lastc == '#' )
            skip_line();
        else
        if( is_white(lastc) )
            read_char();
        else
            break;
    }
}
// removes white characters from the end of the given string
void SpaceParser::trim_last_white(std::wstring & s)
{
    std::wstring::size_type keep = s.size();

    while( keep > 0 && is_white(s[keep - 1]) )
        keep -= 1;

    if( keep < s.size() )
        s.erase(keep);
}
/*
 * reads a bare token (Space format) to the given string
 *
 * reading stops at the end of the input or at a non-escaped: new line,
 * '#', delimiter1 or delimiter2 character (the stop character is left in lastc)
 * trailing white characters are removed from the token
 */
void SpaceParser::read_token_until_delimiter(std::wstring & token, int delimiter1, int delimiter2)
{
    token.clear();

    while( lastc != -1 )
    {
        if( !char_was_escaped )
        {
            const bool is_stop_char = (lastc == '\n' || lastc == '#' || lastc == delimiter1 || lastc == delimiter2);

            if( is_stop_char )
                break;
        }

        token.push_back(static_cast<wchar_t>(lastc));
        read_char();
    }

    trim_last_white(token);
}
// reads to the given string all consecutive characters
// accepted by is_alfa_numeric_char()
void SpaceParser::read_alfa_numeric_token(std::wstring & token)
{
    for( token.clear() ; is_alfa_numeric_char(lastc) ; read_char() )
    {
        token.push_back(static_cast<wchar_t>(lastc));
    }
}
/*
 * reads a bare (non-quoted) value to the given string
 *
 * JSON: only alfa-numeric characters are read
 * Space format: the token additionally ends at
 *   space_end                       - for an object value
 *   table_end or option_delimiter   - for a table value
 *   (no additional characters)      - in other cases
 */
void SpaceParser::read_string_value(std::wstring & token, bool is_object_value, bool is_table_value)
{
    if( !parsing_space )
    {
        read_alfa_numeric_token(token);
        return;
    }

    if( is_object_value )
    {
        read_token_until_delimiter(token, space_end, -1);
        return;
    }

    if( is_table_value )
    {
        read_token_until_delimiter(token, table_end, option_delimiter);
        return;
    }

    read_token_until_delimiter(token, -1, -1);
}
/*
 * reads a bare key (Space format) to the given string
 *
 * reading stops at the end of the input or at a non-escaped: separator,
 * new line (10), space_start or '#' character
 * trailing white characters are removed from the token
 */
void SpaceParser::read_space_field_token(std::wstring & token)
{
    token.clear();

    while( lastc != -1 )
    {
        if( !char_was_escaped )
        {
            const bool is_stop_char = (lastc == separator || lastc == 10 || lastc == space_start || lastc == '#');

            if( is_stop_char )
                break;
        }

        token.push_back(static_cast<wchar_t>(lastc));
        read_char();
    }

    trim_last_white(token);
}
/*
 * reads a quoted single line text to the given string (without the quotation marks)
 *
 * reading stops at a non-escaped quotation mark or a non-escaped new line (10);
 * if the text is not closed with a non-escaped quotation mark
 * status is set to syntax_error
 *
 * IMPROVEME in JSON we should not allow a non-escaped new line character
 */
void SpaceParser::read_token_quoted(std::wstring & token)
{
    token.clear();
    read_char(); // step over the opening quotation mark

    for( ; lastc != -1 ; read_char() )
    {
        if( !char_was_escaped && (lastc == '"' || lastc == 10) )
            break;

        token.push_back(static_cast<wchar_t>(lastc));
    }

    const bool is_closed = !char_was_escaped && lastc == '"';

    if( is_closed )
        read_char(); // step over the closing quotation mark
    else
        status = syntax_error;
}
/*
 * reads a quoted text to the given string (without the quotation marks),
 * new line characters are allowed inside the text
 *
 * if the text is not closed with a non-escaped quotation mark
 * status is set to syntax_error
 */
void SpaceParser::read_multiline_token_quoted(std::wstring & token)
{
    token.clear();
    read_char(); // step over the opening quotation mark

    for( ; lastc != -1 ; read_char() )
    {
        if( !char_was_escaped && lastc == '"' )
            break;

        token.push_back(static_cast<wchar_t>(lastc));
    }

    const bool is_closed = !char_was_escaped && lastc == '"';

    if( is_closed )
        read_char(); // step over the closing quotation mark
    else
        status = syntax_error;
}
/*
 * reads the name of a field (a key) of an object
 * or the name of a child space (Space format) to the token member
 *
 * Space format: the key can be quoted (multiline) or bare
 * JSON: the key must be quoted, otherwise status is set to syntax_error
 */
void SpaceParser::read_key()
{
    skip_white();

    const bool is_quoted = (lastc == '"');

    if( is_quoted )
    {
        if( parsing_space )
            read_multiline_token_quoted(token);
        else
            read_token_quoted(token);
    }
    else
    if( parsing_space )
    {
        read_space_field_token(token);
    }
    else
    {
        status = syntax_error;
    }
}
// returns true if c is one of: 0-9, a-f, A-F
bool SpaceParser::is_hex_digit(wchar_t c)
{
    const bool is_decimal = (c>='0' && c<='9');
    const bool is_lower = (c>='a' && c<='f');
    const bool is_upper = (c>='A' && c<='F');

    return is_decimal || is_lower || is_upper;
}
// returns the value (0-15) of a hex digit, for other characters returns zero
int SpaceParser::hex_to_int(wchar_t c)
{
    int digit = 0;

    if( c>='0' && c<='9' )
        digit = c - '0';
    else
    if( c>='a' && c<='f' )
        digit = c - 'a' + 10;
    else
    if( c>='A' && c<='F' )
        digit = c - 'A' + 10;

    return digit;
}
/*
 * format: \uHHHH where H is a hex digit 0-F
 *
 * reads exactly four hex digits and puts their value to lastc
 * if has_first_byte is true then first_byte is used as the first digit
 * (it was already read by the caller)
 *
 * returns false at the first character which is not a hex digit
 * (lastc is not set by this method in that case)
 */
bool SpaceParser::read_unicode_four_digit_format(bool has_first_byte, int first_byte)
{
    int code = 0;

    for(int n=0 ; n<4 ; ++n)
    {
        const int c = (n == 0 && has_first_byte) ? first_byte : read_char_no_escape();

        if( !is_hex_digit(c) )
            return false;

        code = (code << 4) | hex_to_int(c);
    }

    lastc = static_cast<wchar_t>(code);

    return true;
}
/*
 * format: \uHHHH optionally followed by a second \uHHHH
 *
 * the decoded code point is put to lastc
 * if the first value is a first surrogate then the next \uHHHH sequence is read
 * and, when it is a second surrogate, both values are combined into one code point
 *
 * if has_first_byte is true then first_byte is the first hex digit
 * which was already read by the caller
 *
 * on any error, or if the result is rejected by pt::utf8_check_range(),
 * lastc is set to U+FFFD
 */
void SpaceParser::read_unicode_json_format(bool has_first_byte, int first_byte)
{
bool ok = read_unicode_four_digit_format(has_first_byte, first_byte);
if( ok && pt::is_first_surrogate_char(lastc) )
{
int c1 = lastc;
// the second part has to start with the \u characters
int c = read_char_no_escape();
ok = ok && (c == '\\');
if( ok )
{
c = read_char_no_escape();
ok = ok && (c == 'u');
ok = ok && read_unicode_four_digit_format(false, 0);
// NOTE(review): if the second value is correctly read but is not a second surrogate
// then ok stays true and lastc holds only the second value (c1 is dropped),
// the final result depends on pt::utf8_check_range() below - confirm this is intended
if( ok && pt::is_second_surrogate_char(lastc) )
{
int c2 = lastc;
ok = ok && pt::surrogate_pair_to_int(c1, c2, lastc);
}
}
}
if( !ok || !pt::utf8_check_range(lastc) )
{
lastc = 0xFFFD; // U+FFFD "replacement character";
}
}
/*
 * format: \u{H...} where H is a hex digit 0-F, minimum digits: 1, maximum digits: 6
 *
 * called when the \u{ characters were already read
 * the decoded code point is put to lastc; if the format is incorrect or the value
 * is rejected by pt::utf8_check_range() then lastc is set to U+FFFD
 */
void SpaceParser::read_unicode_floating_format()
{
    int c = 0;
    int code = 0;
    int digits = 0;

    // at most 7 characters are read: max 6 hex digits + '}'
    while( digits < 7 )
    {
        c = read_char_no_escape();

        if( !is_hex_digit(c) )
            break;

        code = (code << 4) | hex_to_int(c);
        digits += 1;
    }

    const bool is_correct = digits > 0 && c == '}' && pt::utf8_check_range(code);

    if( is_correct )
        lastc = static_cast<wchar_t>(code);
    else
        lastc = 0xFFFD; // U+FFFD "replacement character";
}
/*
 * reads a unicode escape sequence, called when the \u characters were already read
 *
 * JSON: only the \uHHHH format is used
 * Space format: if the next character is '{' then the \u{H...} format is used,
 * otherwise that character is passed as the first digit of the \uHHHH format
 */
void SpaceParser::read_unicode_code_point()
{
    if( !parsing_space )
    {
        read_unicode_json_format(false, 0);
        return;
    }

    const int next = read_char_no_escape();

    if( next == '{' )
        read_unicode_floating_format();
    else
        read_unicode_json_format(true, next);
}
/*
 * reads the next character to lastc and returns it
 *
 * a backslash starts an escape sequence: char_was_escaped is set to true
 * and the following character is translated:
 *   \0 \t \r \n \b \f   - control characters
 *   \u                  - a unicode code point
 * any other character after a backslash is returned as it is,
 * so two \\ return one \
 */
int SpaceParser::read_char()
{
    char_was_escaped = false;
    read_char_no_escape();

    if( lastc != '\\' )
        return lastc;

    char_was_escaped = true;
    read_char_no_escape();

    switch(lastc)
    {
    case '0':
        lastc = 0;
        break;

    case 't':
        lastc = '\t';
        break;

    case 'r':
        lastc = '\r';
        break;

    case 'n':
        lastc = '\n';
        break;

    case 'b':
        lastc = 0x08;
        break;

    case 'f':
        lastc = 0x0c;
        break;

    case 'u':
        read_unicode_code_point();
        break;
    }

    return lastc;
}
} // namespace