/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2012-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "spaceparser.h" #include "utf8/utf8.h" #include "convert/strtoint.h" namespace pt { SpaceParser::SpaceParser() { root_space = 0; SetDefault(); } void SpaceParser::SetSpace(Space * pspace) { root_space = pspace; } void SpaceParser::SetSpace(Space & pspace) { root_space = &pspace; } void SpaceParser::SetDefault() { // you can change this separators to what you want // you shoud not use only white characters here (as expected by IsWhite() method) // and new line characters ('\n') separator = ':'; space_start = '{'; space_end = '}'; table_start = '['; table_end = ']'; option_delimiter = ','; skip_empty = false; use_escape_char = true; input_as_utf8 = true; } void SpaceParser::SkipEmpty(bool skip) { skip_empty = skip; } void SpaceParser::UseEscapeChar(bool escape) { use_escape_char = escape; } void SpaceParser::UTF8(bool utf) { input_as_utf8 = utf; } int SpaceParser::get_last_parsed_line() { return line; } SpaceParser::Status SpaceParser::ParseJSONFile(const char * file_name) { reading_from_file = true; parsing_space = false; file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) { ParseRootSpace(); file.close(); } else { status = cant_open_file; } return status; } SpaceParser::Status SpaceParser::ParseJSONFile(const std::string & file_name) { return ParseJSONFile(file_name.c_str()); } SpaceParser::Status SpaceParser::ParseJSONFile(const wchar_t * file_name) { std::string file_name_utf8; WideToUTF8(file_name, file_name_utf8); return ParseJSONFile(file_name_utf8.c_str()); } SpaceParser::Status SpaceParser::ParseJSONFile(const std::wstring & file_name) { return ParseJSONFile(file_name.c_str()); } SpaceParser::Status SpaceParser::ParseSpaceFile(const char * file_name) { reading_from_file = true; parsing_space = true; file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) { ParseRootSpace(); file.close(); } else { status = cant_open_file; } return status; } SpaceParser::Status SpaceParser::ParseSpaceFile(const std::string & file_name) { return ParseSpaceFile(file_name.c_str()); } SpaceParser::Status SpaceParser::ParseSpaceFile(const wchar_t * file_name) { std::string file_name_utf8; WideToUTF8(file_name, file_name_utf8); return ParseSpaceFile(file_name_utf8.c_str()); } SpaceParser::Status SpaceParser::ParseSpaceFile(const std::wstring & file_name) { return ParseSpaceFile(file_name.c_str()); } SpaceParser::Status SpaceParser::ParseJSON(const char * str) { reading_from_file = false; reading_from_wchar_string = false; pchar_ascii = str; pchar_unicode = 0; parsing_space = false; ParseRootSpace(); return status; } SpaceParser::Status SpaceParser::ParseJSON(const std::string & str) { return ParseJSON(str.c_str()); } SpaceParser::Status SpaceParser::ParseJSON(const wchar_t * str) { reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = str; pchar_ascii = 0; parsing_space = false; ParseRootSpace(); return status; } SpaceParser::Status SpaceParser::ParseJSON(const std::wstring & str) { return ParseJSON(str.c_str()); } SpaceParser::Status SpaceParser::ParseSpace(const char * str) { reading_from_file = false; reading_from_wchar_string = false; pchar_ascii = str; pchar_unicode = 0; parsing_space = true; ParseRootSpace(); return status; } SpaceParser::Status SpaceParser::ParseSpace(const std::string & str) { return ParseSpace(str.c_str()); } SpaceParser::Status SpaceParser::ParseSpace(const wchar_t * str) { reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = str; pchar_ascii = 0; parsing_space = true; ParseRootSpace(); return status; } SpaceParser::Status SpaceParser::ParseSpace(const std::wstring & str) { return ParseSpace(str.c_str()); } void SpaceParser::ParseRootSpace() { line = 1; status = ok; if( !root_space ) { status = no_space; return; } ReadChar(); // put first character to lastc if( parsing_space ) { separator = '='; table_start = '('; table_end = ')'; ParseSpace(root_space); } else { separator = ':'; table_start = '['; table_end = ']'; Parse(root_space, false, false); } SkipWhite(); if( lastc != -1 ) status = syntax_error; token.clear(); } void SpaceParser::Parse(Space * space, bool is_object_value, bool is_table_value) { SkipWhite(); if( lastc == space_start ) { ParseSpace(space); } else if( lastc == table_start ) { ParseTable(space); } else if( lastc == '"' ) // IMPROVEME define a variable { ParseTextValue(space); } else { ReadStringValue(token, is_object_value, is_table_value); if( token == L"null" ) { space->set_null(); } else if( token == L"true" ) { space->set(true); } else if( token == L"false" ) { space->set(false); } else if( is_integer_token() ) { ParseIntegerValue(space); } else if( is_floating_point_token() ) { ParseFloatingPointValue(space); } else { if( parsing_space ) { space->set(token); } else { status = syntax_error; } } } } void SpaceParser::ParseSpace(Space * space) { /* * in Space format in global namespace the space start character is not required */ bool need_space_start_character = !parsing_space || space != root_space; if( need_space_start_character ) { ReadChar(); // inserting a next character after the space_start char to lastc } space->set_empty_object(); ParseKeyValuePairs(space); if( need_space_start_character ) { if( lastc == space_end ) { ReadChar(); } else { status = syntax_error; } } } void SpaceParser::ParseTextValue(Space * space) { space->set_empty_wstring(); std::wstring * str = space->get_wstr(); if( parsing_space ) ReadMultilineTokenQuoted(*str); else ReadTokenQuoted(*str); } void SpaceParser::ParseIntegerValue(Space * space) { const wchar_t * after_str = nullptr; bool was_overflow = false; int base = 10; if( parsing_space ) { // in Space format when the integer value begins with a zero it means // this is an octal number if( !token.empty() && token[0] == '0' ) base = 8; } long long val = Toll(token.c_str(), base, &after_str, &was_overflow, false); if( was_overflow ) { status = syntax_error; } else if( size_t(after_str - token.c_str()) != token.size() ) { status = syntax_error; } else { space->set(val); } } void SpaceParser::ParseFloatingPointValue(Space * space) { wchar_t * after_str = nullptr; double val = wcstod(token.c_str(), &after_str); if( errno == ERANGE ) { status = syntax_error; } else if( size_t(after_str - token.c_str()) != token.size() ) { status = syntax_error; } else { space->set(val); } } void SpaceParser::ParseTable(Space * space) { space->set_empty_table(); ReadChar(); // inserting a next character after the table_start char to lastc space->set_empty_table(); ParseValuesList(space); if( lastc == table_end ) { ReadChar(); } else { status = syntax_error; } } void SpaceParser::ParseKeyValuePairs(Space * space) { bool is_first = true; SkipWhite(); while( status == ok && lastc != space_end && lastc != -1 ) { if( !is_first ) { SkipWhite(); if( lastc == option_delimiter ) { ReadChar(); // inserting a next character after the option_delimiter to lastc if( parsing_space ) { // in space format a space_end character is allowed to be after the last table item SkipWhite(); if( lastc == space_end ) break; } } else if( !parsing_space ) { // in json format the option_delimiter is required status = syntax_error; } } if( status == ok ) { ReadKey(); if( status == ok ) { SkipWhite(); if( lastc == separator ) { ReadChar(); // inserting a next character after the separator to lastc Space & new_space = space->add(token.c_str(), new Space()); Parse(&new_space, true, false); } else if( parsing_space && lastc == space_start ) { Space & new_space = space->add_child_space(token.c_str()); ParseSpace(&new_space); } else { status = syntax_error; } } } is_first = false; SkipWhite(); } } void SpaceParser::ParseValuesList(Space * space) { bool is_first = true; SkipWhite(); while( status == ok && lastc != table_end && lastc != -1 ) { if( !is_first ) { SkipWhite(); if( lastc == option_delimiter ) // may add a new delimiter for tables? default the same as for objects... { ReadChar(); // inserting a next character after the delimiter if( parsing_space ) { // in space format a table_end character is allowed to be after the last table item SkipWhite(); if( lastc == table_end ) break; } } else if( !parsing_space ) { // in json format the option_delimiter is required status = syntax_error; } } if( status == ok ) { Space * new_space = &space->add(new Space()); Parse(new_space, false, true); } is_first = false; SkipWhite(); } } bool SpaceParser::is_integer_token() { if( token.empty() ) return false; size_t i = 0; if( token[i] == '-' ) { i += 1; if( token.size() == 1 ) return false; } for( ; i < token.size() ; ++i) { if( token[i] < '0' || token[i] > '9' ) { return false; } } return true; } bool SpaceParser::is_floating_point_token() { bool was_dot = false; bool was_exponential = false; bool was_plus_minus_sign = false; if( token.empty() ) return false; size_t i = 0; if( token[i] == '-' ) { i += 1; if( token.size() == 1 ) return false; } for( ; i < token.size() ; ++i) { if( token[i] == '.' ) { if( was_dot || was_exponential ) return false; was_dot = true; } else if( token[i] == 'e' || token[i]=='E' ) { if( was_exponential ) return false; was_exponential = true; // the exponential character cannot be the last character if( i + 1 == token.size() ) return false; } else if( token[i] == '+' || token[i] == '-' ) { if( was_plus_minus_sign || !was_exponential ) return false; // the plus or minus should be after the exponential character if( i > 0 && (token[i-1] != 'e' && token[i-1] != 'E') ) return false; was_plus_minus_sign = true; } else if( token[i] < '0' || token[i] > '9' ) { return false; } } return true; } bool SpaceParser::IsWhite(int c) { // 13 (\r) is at the end of a line in a dos file \r\n // 160 is an unbreakable space if( c==' ' || c=='\t' || c==13 || c==160 || c==10 ) return true; return false; } bool SpaceParser::is_alfa_numeric_char(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' || c=='-' || c=='+'; } void SpaceParser::SkipLine() { while( lastc != -1 && (char_was_escaped || lastc != '\n') ) ReadChar(); } void SpaceParser::SkipWhite() { if( parsing_space ) { while( IsWhite(lastc) || (!char_was_escaped && lastc == '#') ) { if( lastc == '#' ) SkipLine(); else ReadChar(); } } else { while( IsWhite(lastc) ) { ReadChar(); } } } void SpaceParser::TrimLastWhite(std::wstring & s) { std::wstring::size_type i; for(i=s.size() ; i>0 && IsWhite(s[i-1]) ; --i) { } if( i < s.size() ) { s.erase(i, std::wstring::npos); } } /* void SpaceParser::Trim(std::wstring & s) { std::wstring::size_type i; if( s.empty() ) return; // looking for white characters at the end for(i=s.size()-1 ; i>0 && IsWhite(s[i]) ; --i); if( i==0 && IsWhite(s[i]) ) { // the whole string consists of white characters s.clear(); return; } // deleting white characters at the end if( i != s.size() - 1 ) s.erase(i+1, std::wstring::npos); // looking for white characters at the beginning for(i=0 ; i(lastc); ReadChar(); } TrimLastWhite(token); } void SpaceParser::ReadAlfaNumericToken(std::wstring & token) { token.clear(); while( is_alfa_numeric_char(lastc) ) { token += static_cast(lastc); ReadChar(); } } void SpaceParser::ReadStringValue(std::wstring & token, bool is_object_value, bool is_table_value) { if( parsing_space ) { if( is_object_value ) { ReadTokenUntilDelimiter(token, space_end, -1); } else if( is_table_value ) { ReadTokenUntilDelimiter(token, table_end, option_delimiter); } else { ReadTokenUntilDelimiter(token, -1, -1); } } else { ReadAlfaNumericToken(token); } } // rename to something like ReadSpaceFieldToken??? void SpaceParser::ReadSpaceFieldToken(std::wstring & token) { token.clear(); while( lastc != -1 && (char_was_escaped || (lastc != separator && lastc != 10 && lastc != space_start && lastc != '#' )) ) { token += static_cast(lastc); ReadChar(); } TrimLastWhite(token); } // IMPROVEME in JSON we should not allow non-escaped a new line character void SpaceParser::ReadTokenQuoted(std::wstring & token) { token.clear(); ReadChar(); // skipping the first quotation mark while( lastc != -1 && (char_was_escaped || (lastc != '"' && lastc != 10)) ) { token += static_cast(lastc); ReadChar(); } if( !char_was_escaped && lastc == '"' ) { ReadChar(); // skipping the last quotation mark } else { status = syntax_error; } } void SpaceParser::ReadMultilineTokenQuoted(std::wstring & token) { token.clear(); ReadChar(); // skipping the first quotation mark while( lastc != -1 && (char_was_escaped || lastc != '"') ) { token += static_cast(lastc); ReadChar(); } if( !char_was_escaped && lastc == '"' ) { ReadChar(); // skipping the last quotation mark } else { status = syntax_error; } } /* * this method is used to read the field name (key) in an object * or to read the space child name (used in Space format) */ void SpaceParser::ReadKey() { SkipWhite(); if( parsing_space ) { if( lastc == '"' ) { ReadMultilineTokenQuoted(token); } else { ReadSpaceFieldToken(token); } } else { if( lastc == '"' ) { ReadTokenQuoted(token); } else { status = syntax_error; } } } int SpaceParser::ReadUTF8Char() { int c; bool correct; lastc = -1; do { UTF8ToInt(file, c, correct); if( !file ) return lastc; } while( !correct ); lastc = c; if( lastc == '\n' ) ++line; return lastc; } int SpaceParser::ReadASCIIChar() { lastc = file.get(); if( lastc == '\n' ) ++line; return lastc; } int SpaceParser::ReadCharFromWcharString() { if( *pchar_unicode == 0 ) lastc = -1; else lastc = *(pchar_unicode++); if( lastc == '\n' ) ++line; return lastc; } int SpaceParser::ReadCharFromUTF8String() { int c; bool correct; lastc = -1; do { size_t len = UTF8ToInt(pchar_ascii, c, correct); pchar_ascii += len; } while( *pchar_ascii && !correct ); if( correct ) lastc = c; if( lastc == '\n' ) ++line; return lastc; } int SpaceParser::ReadCharFromAsciiString() { if( *pchar_ascii == 0 ) lastc = -1; else lastc = *(pchar_ascii++); if( lastc == '\n' ) ++line; return lastc; } int SpaceParser::ReadCharNoEscape() { if( reading_from_file ) { if( input_as_utf8 ) return ReadUTF8Char(); else return ReadASCIIChar(); } else { if( reading_from_wchar_string ) { return ReadCharFromWcharString(); } else { if( input_as_utf8 ) return ReadCharFromUTF8String(); else return ReadCharFromAsciiString(); } } } bool SpaceParser::IsHexDigit(wchar_t c) { return ((c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') ); } int SpaceParser::HexToInt(wchar_t c) { if( c>='0' && c<='9' ) return c - '0'; if( c>='a' && c<='f' ) return c - 'a' + 10; if( c>='A' && c<='F' ) return c - 'A' + 10; return 0; } void SpaceParser::ReadUnicodeCodePoint() { wchar_t c; int value = 0; for(int i=0 ; i<4 ; ++i) { c = ReadCharNoEscape(); if( !IsHexDigit(c) ) { status = syntax_error; return; } value = (value << 4) | HexToInt(c); } lastc = (wchar_t)value; } int SpaceParser::ReadChar() { char_was_escaped = false; ReadCharNoEscape(); if( use_escape_char && lastc == '\\' ) { char_was_escaped = true; ReadCharNoEscape(); switch(lastc) { case '0': lastc = 0; break; case 't': lastc = '\t'; break; case 'r': lastc = '\r'; break; case 'n': lastc = '\n'; break; case 'b': lastc = 0x08; break; case 'f': lastc = 0x0c; break; case 'u': ReadUnicodeCodePoint(); break; // "in other cases we return the last character, so two \\ returns one \ " } } return lastc; } } // namespace