pikotools/src/space/spaceparser.cpp

1162 lines
19 KiB
C++

/*
* This file is a part of PikoTools
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2012-2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cerrno>
#include <cstdlib>
#include <wchar.h>
#include "spaceparser.h"
#include "utf8/utf8.h"
#include "convert/strtoint.h"
namespace PT
{
/*
 * Creates a parser which is not bound to any Space object yet.
 *
 * All state members used by the reading routines and by the public getters
 * get a defined value here, so that e.g. get_last_parsed_line() does not
 * return an indeterminate value when it is called before the first parse.
 * The user-visible options are set by SetDefault().
 */
SpaceParser::SpaceParser()
{
    root_space = nullptr;
    pchar_ascii = nullptr;
    pchar_unicode = nullptr;
    reading_from_file = false;
    reading_from_wchar_string = false;
    parsing_space = false;
    char_was_escaped = false;
    lastc = -1;   // -1 is the "no more input" marker used by the readers
    line = 0;     // nothing has been parsed yet
    skipped = 0;
    status = ok;
    SetDefault();
}
/*
 * Selects (by pointer) the Space object which will be filled in by the next
 * parse call. The pointer is stored as is; a null pointer makes
 * ParseRootSpace() finish with the no_space status.
 */
void SpaceParser::SetSpace(Space * pspace)
{
    this->root_space = pspace;
}
/*
 * Selects (by reference) the Space object which will be filled in by the
 * next parse call. Only the address of the object is remembered.
 */
void SpaceParser::SetSpace(Space & pspace)
{
    Space * target = &pspace;
    root_space = target;
}
/*
 * Restores the default delimiters and options.
 *
 * The delimiters should not be white characters (as recognized by IsWhite())
 * nor the new line character ('\n').
 * Note that ParseRootSpace() assigns separator, table_start and table_end
 * again on every parse, depending on the format being read.
 */
void SpaceParser::SetDefault()
{
    // options
    input_as_utf8   = true;
    use_escape_char = true;
    skip_empty      = false;

    // delimiters
    option_delimiter = ',';
    separator        = ':';
    table_start      = '[';
    table_end        = ']';
    space_start      = '{';
    space_end        = '}';
}
void SpaceParser::SkipEmpty(bool skip)
{
skip_empty = skip;
}
void SpaceParser::UseEscapeChar(bool escape)
{
use_escape_char = escape;
}
void SpaceParser::UTF8(bool utf)
{
input_as_utf8 = utf;
}
/*
 * Returns the current value of the line counter: it starts at 1 in
 * ParseRootSpace() and is increased for every '\n' read from the input.
 */
int SpaceParser::get_last_parsed_line()
{
    const int current_line = line;
    return current_line;
}
/*
 * Parses a file in the JSON format into the Space set with SetSpace().
 *
 * file_name - path of the file, it is opened in binary mode
 * returns the parsing status or cant_open_file if the file cannot be opened
 */
SpaceParser::Status SpaceParser::ParseJSONFile(const char * file_name)
{
    parsing_space = false;
    reading_from_file = true;

    file.clear();
    file.open(file_name, std::ios_base::in | std::ios_base::binary);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    ParseRootSpace();
    file.close();

    return status;
}
/*
 * Overload taking the file name as std::string,
 * it forwards to the const char * version.
 */
SpaceParser::Status SpaceParser::ParseJSONFile(const std::string & file_name)
{
    const char * raw_name = file_name.c_str();
    return ParseJSONFile(raw_name);
}
/*
 * Overload taking the file name as a wide string: the name is converted
 * to UTF-8 with PT::WideToUTF8() and then the char version does the work.
 */
SpaceParser::Status SpaceParser::ParseJSONFile(const wchar_t * file_name)
{
    std::string utf8_name;
    PT::WideToUTF8(file_name, utf8_name);

    const char * raw_name = utf8_name.c_str();
    return ParseJSONFile(raw_name);
}
/*
 * Overload taking the file name as std::wstring,
 * it forwards to the const wchar_t * version.
 */
SpaceParser::Status SpaceParser::ParseJSONFile(const std::wstring & file_name)
{
    const wchar_t * raw_name = file_name.c_str();
    return ParseJSONFile(raw_name);
}
/*
 * Parses a file in the Space format into the Space set with SetSpace().
 *
 * file_name - path of the file, it is opened in binary mode
 * returns the parsing status or cant_open_file if the file cannot be opened
 */
SpaceParser::Status SpaceParser::ParseSpaceFile(const char * file_name)
{
    parsing_space = true;
    reading_from_file = true;

    file.clear();
    file.open(file_name, std::ios_base::in | std::ios_base::binary);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    ParseRootSpace();
    file.close();

    return status;
}
/*
 * Overload taking the file name as std::string,
 * it forwards to the const char * version.
 */
SpaceParser::Status SpaceParser::ParseSpaceFile(const std::string & file_name)
{
    const char * raw_name = file_name.c_str();
    return ParseSpaceFile(raw_name);
}
/*
 * Overload taking the file name as a wide string: the name is converted
 * to UTF-8 with PT::WideToUTF8() and then the char version does the work.
 */
SpaceParser::Status SpaceParser::ParseSpaceFile(const wchar_t * file_name)
{
    std::string utf8_name;
    PT::WideToUTF8(file_name, utf8_name);

    const char * raw_name = utf8_name.c_str();
    return ParseSpaceFile(raw_name);
}
/*
 * Overload taking the file name as std::wstring,
 * it forwards to the const wchar_t * version.
 */
SpaceParser::Status SpaceParser::ParseSpaceFile(const std::wstring & file_name)
{
    const wchar_t * raw_name = file_name.c_str();
    return ParseSpaceFile(raw_name);
}
/*
 * Parses a null terminated char string in the JSON format.
 * The string is decoded as UTF-8 or byte by byte, depending on the
 * UTF8() option (see ReadCharNoEscape()).
 */
SpaceParser::Status SpaceParser::ParseJSON(const char * str)
{
    parsing_space = false;

    // the input source: a char string
    reading_from_file = false;
    reading_from_wchar_string = false;
    pchar_unicode = 0;
    pchar_ascii = str;

    ParseRootSpace();

    return status;
}
/*
 * Overload taking the JSON text as std::string,
 * it forwards to the const char * version.
 */
SpaceParser::Status SpaceParser::ParseJSON(const std::string & str)
{
    const char * text = str.c_str();
    return ParseJSON(text);
}
/*
 * Parses a null terminated wide string in the JSON format.
 * Each wchar_t is taken as one input character (see ReadCharFromWcharString()).
 */
SpaceParser::Status SpaceParser::ParseJSON(const wchar_t * str)
{
    parsing_space = false;

    // the input source: a wide string
    reading_from_file = false;
    reading_from_wchar_string = true;
    pchar_ascii = 0;
    pchar_unicode = str;

    ParseRootSpace();

    return status;
}
/*
 * Overload taking the JSON text as std::wstring,
 * it forwards to the const wchar_t * version.
 */
SpaceParser::Status SpaceParser::ParseJSON(const std::wstring & str)
{
    const wchar_t * text = str.c_str();
    return ParseJSON(text);
}
/*
 * Parses a null terminated char string in the Space format.
 * The string is decoded as UTF-8 or byte by byte, depending on the
 * UTF8() option (see ReadCharNoEscape()).
 */
SpaceParser::Status SpaceParser::ParseSpace(const char * str)
{
    parsing_space = true;

    // the input source: a char string
    reading_from_file = false;
    reading_from_wchar_string = false;
    pchar_unicode = 0;
    pchar_ascii = str;

    ParseRootSpace();

    return status;
}
/*
 * Overload taking the Space text as std::string,
 * it forwards to the const char * version.
 */
SpaceParser::Status SpaceParser::ParseSpace(const std::string & str)
{
    const char * text = str.c_str();
    return ParseSpace(text);
}
/*
 * Parses a null terminated wide string in the Space format.
 * Each wchar_t is taken as one input character (see ReadCharFromWcharString()).
 */
SpaceParser::Status SpaceParser::ParseSpace(const wchar_t * str)
{
    parsing_space = true;

    // the input source: a wide string
    reading_from_file = false;
    reading_from_wchar_string = true;
    pchar_ascii = 0;
    pchar_unicode = str;

    ParseRootSpace();

    return status;
}
/*
 * Overload taking the Space text as std::wstring,
 * it forwards to the const wchar_t * version.
 */
SpaceParser::Status SpaceParser::ParseSpace(const std::wstring & str)
{
    const wchar_t * text = str.c_str();
    return ParseSpace(text);
}
/*
 * The common entry point of all public Parse...() methods.
 *
 * It resets the line counter and the status, chooses the separator and the
 * table brackets for the selected format ('=' and '(' ')' for Space,
 * ':' and '[' ']' for JSON), parses the whole input into root_space
 * and reports syntax_error if something other than white characters
 * is left after the root value.
 */
void SpaceParser::ParseRootSpace()
{
    status = ok;
    line = 1;
    skipped = 0;

    if( root_space == 0 )
    {
        status = no_space;
        return;
    }

    ReadChar(); // lastc holds the first character of the input now

    if( !parsing_space )
    {
        // JSON
        separator = ':';
        table_start = '[';
        table_end = ']';

        Parse(root_space, false, false);
    }
    else
    {
        // Space
        separator = '=';
        table_start = '(';
        table_end = ')';

        ParseSpace(root_space);
    }

    SkipWhite();

    // the whole input should have been consumed
    if( lastc != -1 )
    {
        status = syntax_error;
    }

    token.clear();
}
/*
 * Parses one value and stores it in 'space'.
 *
 * The first non-white character selects the kind of the value: an object
 * (space_start), a table (table_start) or a quoted text ('"').
 * Anything else is read as a bare token and interpreted as null, true,
 * false, an integer or a floating point number. An unrecognized token is
 * stored as a text in the Space format and is a syntax error in JSON.
 *
 * is_object_value, is_table_value - only passed to ReadStringValue()
 *                                   to select the token delimiters
 */
void SpaceParser::Parse(Space * space, bool is_object_value, bool is_table_value)
{
    SkipWhite();

    if( lastc == space_start )
    {
        ParseSpace(space);
        return;
    }

    if( lastc == table_start )
    {
        ParseTable(space);
        return;
    }

    if( lastc == '"' ) // IMPROVEME define a variable
    {
        ParseTextValue(space);
        return;
    }

    // a bare (not quoted) token
    ReadStringValue(token, is_object_value, is_table_value);

    if( token == L"null" )
    {
        space->set_null();
        return;
    }

    if( token == L"true" )
    {
        space->set(true);
        return;
    }

    if( token == L"false" )
    {
        space->set(false);
        return;
    }

    if( is_integer_token() )
    {
        ParseIntegerValue(space);
        return;
    }

    if( is_floating_point_token() )
    {
        ParseFloatingPointValue(space);
        return;
    }

    // only the Space format allows a text without quotation marks
    if( parsing_space )
        space->set(token);
    else
        status = syntax_error;
}
/*
 * Parses an object (a list of key/value pairs) into 'space'.
 *
 * In the Space format the root space is not surrounded by the
 * space_start / space_end characters. In every other case lastc is
 * expected to be space_start on entry and the matching space_end
 * is required at the end.
 */
void SpaceParser::ParseSpace(Space * space)
{
    const bool is_bare_root = parsing_space && space == root_space;

    if( !is_bare_root )
    {
        ReadChar(); // moving lastc behind the space_start character
    }

    space->set_empty_object();
    ParseKeyValuePairs(space);

    if( is_bare_root )
        return;

    if( lastc != space_end )
    {
        status = syntax_error;
        return;
    }

    ReadChar(); // moving lastc behind the space_end character
}
/*
 * Parses a quoted text into 'space' (lastc is the opening quotation mark).
 * The text is read directly into the wide string owned by 'space'.
 * Only the Space format uses the multiline reader.
 */
void SpaceParser::ParseTextValue(Space * space)
{
    space->set_empty_wstring();
    std::wstring & text = *space->get_wstr();

    if( !parsing_space )
    {
        ReadTokenQuoted(text);
        return;
    }

    ReadMultilineTokenQuoted(text);
}
/*
 * Converts the current token to a long long and stores it in 'space'.
 *
 * In the Space format a token beginning with a zero is read as an octal
 * number. An overflow, or a token which is not converted up to its last
 * character, gives syntax_error.
 */
void SpaceParser::ParseIntegerValue(Space * space)
{
    const bool leading_zero = !token.empty() && token[0] == '0';
    int base = (parsing_space && leading_zero) ? 8 : 10;

    bool was_overflow = false;
    const wchar_t * after_str = nullptr;

    long long val = Toll(token.c_str(), base, &after_str, &was_overflow, false);

    // after_str is inspected only when there was no overflow
    if( was_overflow || size_t(after_str - token.c_str()) != token.size() )
    {
        status = syntax_error;
        return;
    }

    space->set(val);
}
/*
 * Converts the current token to a double with wcstod() and stores it
 * in 'space'.
 *
 * A value out of range (wcstod() sets errno to ERANGE), or a token which
 * is not converted up to its last character, gives syntax_error.
 */
void SpaceParser::ParseFloatingPointValue(Space * space)
{
    wchar_t * after_str = nullptr;

    // wcstod() only sets errno on failure and never clears it, so it has to be
    // cleared here, otherwise an ERANGE left by an earlier call (from this
    // parser or from any other code) would make a correct number be rejected
    errno = 0;
    double val = wcstod(token.c_str(), &after_str);

    if( errno == ERANGE )
    {
        status = syntax_error;
    }
    else
    if( size_t(after_str - token.c_str()) != token.size() )
    {
        // some characters at the end of the token were not converted
        status = syntax_error;
    }
    else
    {
        space->set(val);
    }
}
/*
 * Parses a table (a list of values) into 'space'.
 *
 * On entry lastc is the table_start character. The matching table_end
 * character is required after the list, otherwise the status is set
 * to syntax_error.
 */
void SpaceParser::ParseTable(Space * space)
{
    // one call is sufficient: nothing between here and ParseValuesList() touches 'space'
    space->set_empty_table();

    ReadChar(); // inserting a next character after the table_start char to lastc
    ParseValuesList(space);

    if( lastc == table_end )
    {
        ReadChar(); // skipping the table_end character
    }
    else
    {
        status = syntax_error;
    }
}
/*
 * Parses the key/value pairs of an object, up to the space_end character,
 * the end of the input or the first error.
 *
 * Between two pairs JSON requires the option_delimiter; in the Space
 * format the delimiter is optional and it may also be followed directly
 * by the space_end character.
 * A key followed by the separator starts a value (parsed with Parse());
 * in the Space format a key followed by space_start starts a child space.
 */
void SpaceParser::ParseKeyValuePairs(Space * space)
{
    bool first_pair = true;
    SkipWhite();

    while( status == ok && lastc != space_end && lastc != -1 )
    {
        if( !first_pair )
        {
            SkipWhite();

            if( lastc != option_delimiter )
            {
                // only the Space format lets the delimiter be omitted
                if( !parsing_space )
                    status = syntax_error;
            }
            else
            {
                ReadChar(); // moving lastc behind the option_delimiter

                if( parsing_space )
                {
                    // in the Space format the delimiter may be put after the last pair
                    SkipWhite();

                    if( lastc == space_end )
                        break;
                }
            }
        }

        if( status == ok )
            ReadKey();

        if( status == ok )
        {
            SkipWhite();

            if( lastc == separator )
            {
                ReadChar(); // moving lastc behind the separator
                Space & value = space->add(token.c_str(), new Space());
                Parse(&value, true, false);
            }
            else
            if( parsing_space && lastc == space_start )
            {
                Space & child = space->add_child_space(token.c_str());
                ParseSpace(&child);
            }
            else
            {
                status = syntax_error;
            }
        }

        first_pair = false;
        SkipWhite();
    }
}
/*
 * Parses the items of a table, up to the table_end character,
 * the end of the input or the first error.
 *
 * Between two items JSON requires the option_delimiter; in the Space
 * format the delimiter is optional and it may also be followed directly
 * by the table_end character.
 * Each item is added to 'space' as a new Space and parsed with Parse().
 */
void SpaceParser::ParseValuesList(Space * space)
{
    bool first_item = true;
    SkipWhite();

    while( status == ok && lastc != table_end && lastc != -1 )
    {
        if( !first_item )
        {
            SkipWhite();

            // tables use the same delimiter as objects
            if( lastc != option_delimiter )
            {
                // only the Space format lets the delimiter be omitted
                if( !parsing_space )
                    status = syntax_error;
            }
            else
            {
                ReadChar(); // moving lastc behind the delimiter

                if( parsing_space )
                {
                    // in the Space format the delimiter may be put after the last item
                    SkipWhite();

                    if( lastc == table_end )
                        break;
                }
            }
        }

        if( status == ok )
        {
            Space & item = space->add(new Space());
            Parse(&item, false, true);
        }

        first_item = false;
        SkipWhite();
    }
}
/*
 * Returns true if the current token is an optional minus sign
 * followed by at least one decimal digit and nothing else.
 */
bool SpaceParser::is_integer_token()
{
    const size_t len = token.size();
    size_t pos = (len > 0 && token[0] == '-') ? 1 : 0;

    // an empty token or a single minus sign
    if( pos == len )
        return false;

    while( pos < len )
    {
        const bool is_digit = (token[pos] >= '0' && token[pos] <= '9');

        if( !is_digit )
            return false;

        pos += 1;
    }

    return true;
}
/*
 * Returns true if the current token looks like a decimal floating point
 * number:
 *
 *   [-] digits-with-at-most-one-dot [ (e|E) [+|-] digits ]
 *
 * At least one digit is required in the mantissa and, if the exponent
 * character is present, at least one digit is required in the exponent.
 * Without these requirements tokens such as ".", "e5" or "1e+" would be
 * classified as numbers and then rejected by ParseFloatingPointValue(),
 * so in the Space format a plain text value like e5 would end with
 * a syntax error instead of being stored as a text.
 *
 * Integer tokens are accepted here as well; Parse() tests
 * is_integer_token() first.
 */
bool SpaceParser::is_floating_point_token()
{
    bool was_dot = false;
    bool was_exponential = false;
    bool was_mantissa_digit = false;
    bool was_exponent_digit = false;
    size_t i = 0;

    // an optional minus sign at the beginning
    if( !token.empty() && token[0] == '-' )
        i = 1;

    for( ; i < token.size() ; ++i)
    {
        const wchar_t c = token[i];

        if( c >= '0' && c <= '9' )
        {
            if( was_exponential )
                was_exponent_digit = true;
            else
                was_mantissa_digit = true;
        }
        else
        if( c == '.' )
        {
            // only one dot and only in the mantissa
            if( was_dot || was_exponential )
                return false;

            was_dot = true;
        }
        else
        if( c == 'e' || c == 'E' )
        {
            // only one exponent and there must be something to scale
            if( was_exponential || !was_mantissa_digit )
                return false;

            was_exponential = true;
        }
        else
        if( c == '+' || c == '-' )
        {
            // a sign is allowed only directly after the exponent character
            // (was_exponential guarantees that i is greater than zero here)
            if( !was_exponential || (token[i-1] != 'e' && token[i-1] != 'E') )
                return false;
        }
        else
        {
            return false;
        }
    }

    return was_mantissa_digit && (!was_exponential || was_exponent_digit);
}
/*
 * Returns true for a white character: a space, a tab, a line feed (10),
 * a carriage return (13, the first character of a dos line ending \r\n)
 * and 160 (an unbreakable space).
 */
bool SpaceParser::IsWhite(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 10:
    case 13:
    case 160:
        return true;

    default:
        return false;
    }
}
/*
 * Returns true for characters which can be a part of a bare JSON token:
 * ASCII letters, decimal digits and the characters '.', '-' and '+'.
 */
bool SpaceParser::is_alfa_numeric_char(int c)
{
    if( c >= '0' && c <= '9' )
        return true;

    if( c >= 'a' && c <= 'z' )
        return true;

    if( c >= 'A' && c <= 'Z' )
        return true;

    return c == '.' || c == '-' || c == '+';
}
/*
 * Reads characters until lastc is a not escaped new line character
 * or the end of the input. The new line itself is not consumed.
 */
void SpaceParser::SkipLine()
{
    for(;;)
    {
        if( lastc == -1 )
            return;

        if( !char_was_escaped && lastc == '\n' )
            return;

        ReadChar();
    }
}
/*
 * Skips white characters beginning with lastc.
 * In the Space format it also skips comments: from a not escaped '#'
 * up to the end of the line.
 */
void SpaceParser::SkipWhite()
{
    if( !parsing_space )
    {
        while( IsWhite(lastc) )
            ReadChar();

        return;
    }

    for(;;)
    {
        if( IsWhite(lastc) )
        {
            ReadChar();
        }
        else
        if( !char_was_escaped && lastc == '#' )
        {
            SkipLine();
        }
        else
        {
            break;
        }
    }
}
/*
 * Removes white characters (as recognized by IsWhite()) from the end of 's'.
 */
void SpaceParser::TrimLastWhite(std::wstring & s)
{
    std::wstring::size_type keep = s.size();

    while( keep > 0 && IsWhite(s[keep - 1]) )
        keep -= 1;

    if( keep < s.size() )
        s.erase(keep, std::wstring::npos);
}
/*
void SpaceParser::Trim(std::wstring & s)
{
std::wstring::size_type i;
if( s.empty() )
return;
// looking for white characters at the end
for(i=s.size()-1 ; i>0 && IsWhite(s[i]) ; --i);
if( i==0 && IsWhite(s[i]) )
{
// the whole string consists of white characters
s.clear();
return;
}
// deleting white characters at the end
if( i != s.size() - 1 )
s.erase(i+1, std::wstring::npos);
// looking for white characters at the beginning
for(i=0 ; i<s.size() && IsWhite(s[i]) ; ++i);
// deleting white characters at the beginning
if( i != 0 )
s.erase(0, i);
}
*/
/*
 * Reads a bare token: characters are collected until a not escaped
 * new line, '#', delimiter1 or delimiter2 is met, or until the end of
 * the input. Escaped characters are always collected.
 * White characters at the end of the token are removed.
 *
 * delimiter1, delimiter2 - additional stop characters
 *                          (callers pass -1 when one is not needed)
 */
void SpaceParser::ReadTokenUntilDelimiter(std::wstring & token, int delimiter1, int delimiter2)
{
    token.clear();

    while( lastc != -1 )
    {
        if( !char_was_escaped )
        {
            const bool is_end_of_token = lastc == '\n' || lastc == '#' ||
                                         lastc == delimiter1 || lastc == delimiter2;

            if( is_end_of_token )
                break;
        }

        token += static_cast<wchar_t>(lastc);
        ReadChar();
    }

    TrimLastWhite(token);
}
/*
 * Reads a bare JSON token: the longest run of characters accepted
 * by is_alfa_numeric_char(), beginning with lastc.
 */
void SpaceParser::ReadAlfaNumericToken(std::wstring & token)
{
    for( token.clear() ; is_alfa_numeric_char(lastc) ; ReadChar() )
    {
        token.push_back(static_cast<wchar_t>(lastc));
    }
}
/*
 * Reads a not quoted value into 'token'.
 *
 * In JSON only alfa-numeric characters are read. In the Space format the
 * token ends at the end of a line or a comment and additionally at:
 *  - space_end for a value of an object field,
 *  - table_end or option_delimiter for a table item.
 */
void SpaceParser::ReadStringValue(std::wstring & token, bool is_object_value, bool is_table_value)
{
    if( !parsing_space )
    {
        ReadAlfaNumericToken(token);
        return;
    }

    if( is_object_value )
    {
        ReadTokenUntilDelimiter(token, space_end, -1);
        return;
    }

    if( is_table_value )
    {
        ReadTokenUntilDelimiter(token, table_end, option_delimiter);
        return;
    }

    ReadTokenUntilDelimiter(token, -1, -1);
}
/*
 * Reads a not quoted field name (or a child space name) in the Space format:
 * characters are collected until a not escaped separator, line feed (10),
 * space_start or '#' is met, or until the end of the input.
 * White characters at the end of the name are removed.
 */
void SpaceParser::ReadSpaceFieldToken(std::wstring & token)
{
    token.clear();

    while( lastc != -1 )
    {
        if( !char_was_escaped )
        {
            const bool is_end_of_name = lastc == separator || lastc == 10 ||
                                        lastc == space_start || lastc == '#';

            if( is_end_of_name )
                break;
        }

        token += static_cast<wchar_t>(lastc);
        ReadChar();
    }

    TrimLastWhite(token);
}
/*
 * Reads a quoted text which has to end in the same line
 * (lastc is the opening quotation mark on entry).
 *
 * Reading stops at a not escaped quotation mark or a not escaped
 * line feed (10). If the text is not closed with a quotation mark
 * the status is set to syntax_error.
 */
void SpaceParser::ReadTokenQuoted(std::wstring & token)
{
    token.clear();
    ReadChar(); // moving lastc behind the opening quotation mark

    while( lastc != -1 )
    {
        if( !char_was_escaped && (lastc == '"' || lastc == 10) )
            break;

        token += static_cast<wchar_t>(lastc);
        ReadChar();
    }

    const bool is_closed = !char_was_escaped && lastc == '"';

    if( !is_closed )
    {
        status = syntax_error;
        return;
    }

    ReadChar(); // moving lastc behind the closing quotation mark
}
/*
 * Reads a quoted text which may span several lines
 * (lastc is the opening quotation mark on entry).
 *
 * Reading stops only at a not escaped quotation mark. If the input ends
 * before the closing quotation mark the status is set to syntax_error.
 */
void SpaceParser::ReadMultilineTokenQuoted(std::wstring & token)
{
    token.clear();
    ReadChar(); // moving lastc behind the opening quotation mark

    while( lastc != -1 )
    {
        if( !char_was_escaped && lastc == '"' )
            break;

        token += static_cast<wchar_t>(lastc);
        ReadChar();
    }

    const bool is_closed = !char_was_escaped && lastc == '"';

    if( !is_closed )
    {
        status = syntax_error;
        return;
    }

    ReadChar(); // moving lastc behind the closing quotation mark
}
/*
 * Reads the name of an object field (a key) or, in the Space format,
 * the name of a child space. The name is put into 'token'.
 *
 * A name beginning with a quotation mark is read as a quoted text.
 * A not quoted name is allowed only in the Space format,
 * in JSON it is a syntax error.
 */
void SpaceParser::ReadKey()
{
    SkipWhite();

    const bool is_quoted = (lastc == '"');

    if( is_quoted )
    {
        if( parsing_space )
            ReadMultilineTokenQuoted(token);
        else
            ReadTokenQuoted(token);

        return;
    }

    if( parsing_space )
        ReadSpaceFieldToken(token);
    else
        status = syntax_error;
}
/*
 * Reads one UTF-8 encoded character from the file into lastc.
 *
 * Sequences reported by PT::UTF8ToInt() as incorrect are skipped.
 * When the stream goes into a failed state lastc is -1.
 * The line counter is increased when a new line character is read.
 */
int SpaceParser::ReadUTF8Char()
{
    int code;
    bool is_valid;

    lastc = -1;

    for(;;)
    {
        PT::UTF8ToInt(file, code, is_valid);

        if( !file )
            return lastc;

        if( is_valid )
            break;
    }

    lastc = code;

    if( lastc == '\n' )
        line += 1;

    return lastc;
}
/*
 * Reads one byte from the file into lastc (the result of file.get()).
 * The line counter is increased when a new line character is read.
 */
int SpaceParser::ReadASCIIChar()
{
    lastc = file.get();

    const bool is_new_line = (lastc == '\n');

    if( is_new_line )
        line += 1;

    return lastc;
}
/*
 * Reads one character from the wide input string into lastc.
 * At the terminating zero lastc is -1 and the pointer is not moved.
 * The line counter is increased when a new line character is read.
 */
int SpaceParser::ReadCharFromWcharString()
{
    if( *pchar_unicode != 0 )
    {
        lastc = *pchar_unicode;
        ++pchar_unicode;
    }
    else
    {
        lastc = -1;
    }

    if( lastc == '\n' )
        line += 1;

    return lastc;
}
/*
 * Reads one UTF-8 encoded character from the char input string into lastc.
 *
 * The pointer is moved by the number of bytes returned by PT::UTF8ToInt().
 * Sequences reported as incorrect are skipped as long as the pointer
 * is not at the terminating zero. If no correct character has been read
 * lastc is -1.
 * The line counter is increased when a new line character is read.
 */
int SpaceParser::ReadCharFromUTF8String()
{
    int code;
    bool is_valid;

    lastc = -1;

    do
    {
        const size_t consumed = PT::UTF8ToInt(pchar_ascii, code, is_valid);
        pchar_ascii += consumed;
    }
    while( !is_valid && *pchar_ascii );

    if( is_valid )
        lastc = code;

    if( lastc == '\n' )
        line += 1;

    return lastc;
}
/*
 * Reads one byte from the char input string into lastc
 * (used when the input is not treated as UTF-8).
 * At the terminating zero lastc is -1 and the pointer is not moved.
 * The line counter is increased when a new line character is read.
 */
int SpaceParser::ReadCharFromAsciiString()
{
    if( *pchar_ascii == 0 )
    {
        lastc = -1;
    }
    else
    {
        // the byte is converted through unsigned char: where a plain char is signed
        // bytes above 127 would otherwise give negative values and the byte 0xFF
        // would give -1 which means the end of the input;
        // now the values are in the range 1..255, the same as ReadASCIIChar() gives for a file
        lastc = static_cast<unsigned char>(*(pchar_ascii++));
    }

    if( lastc == '\n' )
        ++line;

    return lastc;
}
/*
 * Reads the next raw character (without interpreting the escape character)
 * from the current input source: a file, a wide string or a char string.
 * A file and a char string are decoded as UTF-8 when input_as_utf8 is set.
 * Returns the character read (it is also stored in lastc by the readers).
 */
int SpaceParser::ReadCharNoEscape()
{
    if( reading_from_file )
    {
        return input_as_utf8 ? ReadUTF8Char() : ReadASCIIChar();
    }

    if( reading_from_wchar_string )
    {
        return ReadCharFromWcharString();
    }

    return input_as_utf8 ? ReadCharFromUTF8String() : ReadCharFromAsciiString();
}
/*
 * Returns true if 'c' is a hexadecimal digit: 0-9, a-f or A-F.
 */
bool SpaceParser::IsHexDigit(wchar_t c)
{
    if( c >= '0' && c <= '9' )
        return true;

    if( c >= 'a' && c <= 'f' )
        return true;

    return c >= 'A' && c <= 'F';
}
/*
 * Returns the value (0-15) of a hexadecimal digit.
 * For a character which is not a hexadecimal digit zero is returned.
 */
int SpaceParser::HexToInt(wchar_t c)
{
    int value = 0;

    if( c >= '0' && c <= '9' )
    {
        value = c - '0';
    }
    else
    if( c >= 'a' && c <= 'f' )
    {
        value = c - 'a' + 10;
    }
    else
    if( c >= 'A' && c <= 'F' )
    {
        value = c - 'A' + 10;
    }

    return value;
}
/*
 * Reads the four hexadecimal digits of a \uXXXX escape sequence
 * and puts the resulting value into lastc.
 * If one of the four characters is not a hexadecimal digit the status
 * is set to syntax_error and reading stops at that character.
 */
void SpaceParser::ReadUnicodeCodePoint()
{
    int code_point = 0;
    int digits_left = 4;

    while( digits_left > 0 )
    {
        wchar_t digit;
        digit = ReadCharNoEscape();

        if( !IsHexDigit(digit) )
        {
            status = syntax_error;
            return;
        }

        code_point = (code_point << 4) | HexToInt(digit);
        digits_left -= 1;
    }

    lastc = static_cast<wchar_t>(code_point);
}
/*
 * Reads the next character into lastc and returns it.
 *
 * When the escape character is enabled and a backslash is read, the
 * character after it is read as well, char_was_escaped is set and the
 * sequences \0 \t \r \n \b \f and \uXXXX are replaced with the characters
 * they stand for. Any other escaped character is returned unchanged,
 * so two backslashes give one backslash.
 */
int SpaceParser::ReadChar()
{
    char_was_escaped = false;
    ReadCharNoEscape();

    if( !use_escape_char || lastc != '\\' )
        return lastc;

    char_was_escaped = true;
    ReadCharNoEscape();

    switch( lastc )
    {
    case 'u':
        ReadUnicodeCodePoint();
        break;

    case 'n':
        lastc = '\n';
        break;

    case 'r':
        lastc = '\r';
        break;

    case 't':
        lastc = '\t';
        break;

    case 'b':
        lastc = 0x08;
        break;

    case 'f':
        lastc = 0x0c;
        break;

    case '0':
        lastc = 0;
        break;

    default:
        // the character after the backslash is taken as it is
        break;
    }

    return lastc;
}
} // namespace