diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp index 63dcc61..cf703cc 100644 --- a/src/html/htmlparser.cpp +++ b/src/html/htmlparser.cpp @@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode pchar_ascii = 0; xml_compact_mode = compact_mode; + status = ok; + line = 1; + stack_len = 0; out_string = nullptr; out_space = &space; @@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode } +HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + parsing_html = false; + reading_from_file = true; + xml_compact_mode = compact_mode; + + status = ok; + line = 1; + stack_len = 0; + out_string = nullptr; + line_len = 0; + + this->out_space = &out_space; + + if( clear_space ) + this->out_space->clear(); + + file.clear(); + file.open(file_name, std::ios_base::binary | std::ios_base::in); + + if( file ) + { + Init(); + Read(); + Uninit(); + + file.close(); + } + else + { + status = cant_open_file; + } + + return status; +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + std::string file_name_utf8; + + wide_to_utf8(file_name, file_name_utf8); + return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space); +} + + +HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space) +{ + return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); +} + + void HTMLParser::Filter(const wchar_t * in, std::wstring & out) { @@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out) } +int HTMLParser::get_last_parsed_line() +{ + return line; +} + + void HTMLParser::SetSomeDefaults() { white_mode = WHITE_MODE_ORIGIN; @@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c) if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || - c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary + c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary return true; return false; @@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c) if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || - c=='-' || c==':' ) + c=='-' || c==':' || c=='_') return true; return false; @@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char) } +void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char) +{ + attr_value.clear(); + tmp_text.clear(); + + while( lastc != -1 ) + { + if( has_quote ) + { + if( lastc == quote_char ) + break; + } + else + { + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) + break; + } + + // IMPROVEME add support for analyze_entities? + if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + tmp_text += lastc; + + read_char(); + } +} + + + void HTMLParser::CheckChar(wchar_t c) { if( c == 10 ) @@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c) return (c == '!'); } +bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c) +{ + return (c == '?'); +} // the '=' operator e.g. class="value" bool HTMLParser::IsAttributeAssignmentMark(wchar_t c) @@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr() if( has_quote ) read_char(); // skipping the first quote mark - ReadItemAttrValue(has_quote, quote_char); + // IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table... + if( parsing_html ) + ReadItemAttrValue(has_quote, quote_char); + else + ReadXMLItemAttrValue(has_quote, quote_char); if( has_quote && lastc == quote_char ) read_char(); // skipping the last quote mark @@ -1361,11 +1464,18 @@ void HTMLParser::PutItemAttrToSpace() if( attr_has_value ) { - attr.set_empty_table(); - - for(size_t i=0 ; i < attr_value.size() ; ++i) + if( parsing_html ) { - attr.add(attr_value[i]); + attr.set_empty_table(); + + for(size_t i=0 ; i < attr_value.size() ; ++i) + { + attr.add(attr_value[i]); + } + } + else + { + attr.set(tmp_text); } } } @@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial() PutOpeningTagMark(); } - read_char(); // skipping '!' - LastItem().name = '!'; + LastItem().name = lastc; + read_char(); // skipping '!' or '?' ReadItemName(LastItem().name, false); if( skip_tags ) @@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem() read_char(); // skipping the first opening tag mark '<' SkipWhiteLines(); - if( IsSpecialTagIndicator(lastc) ) + if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) ) ReadItemSpecial(); else if( IsClosingTagIndicator(lastc) ) @@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space) void HTMLParser::ReadLoop() { - while( ReadItem() ) + while( status == ok && ReadItem() ) { if( LastItem().type == Item::opening ) { @@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop() CheckSingleItemExceptions(); } - CheckWhiteCharsExceptions(LastItem()); CheckDifferentContentExceptions(LastItem()); } @@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop() PopStack(); } - ReadText(); + if( status == ok ) + { + ReadText(); + } is_first_item = false; } diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h index c90a3cc..940eb39 100644 --- a/src/html/htmlparser.h +++ b/src/html/htmlparser.h @@ -95,12 +95,25 @@ class HTMLParser : public BaseParser { public: + + /* + status of parsing + */ + enum Status { ok, cant_open_file, syntax_error }; + + enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; + + /* + the last status of parsing, set by parse() methods + */ + Status status; + HTMLParser(); HTMLParser(const HTMLParser & f); HTMLParser & operator=(const HTMLParser & f); @@ -109,12 +122,30 @@ public: void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); + Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); + + // main methods used for filtering void Filter(const wchar_t * in, std::wstring & out); void Filter(const std::wstring & in, std::wstring & out); + + /* + * + * returns a number of a last parsed line + * can be used to obtain the line in which there was a syntax error + * + */ + int get_last_parsed_line(); + + + + const static int WHITE_MODE_ORIGIN = 0; const static int WHITE_MODE_SINGLE_LINE = 1; const static int WHITE_MODE_TREE = 2; @@ -255,6 +286,7 @@ protected: virtual bool IsClosingTagMark(wchar_t c); virtual bool IsClosingTagIndicator(wchar_t c); virtual bool IsSpecialTagIndicator(wchar_t c); + virtual bool IsXMLSpecialTagIndicator(wchar_t c); virtual bool IsAttributeAssignmentMark(wchar_t c); virtual bool IsClosingXmlSimpleTagMark(wchar_t c); virtual bool IsStartingEntityMark(wchar_t c); @@ -335,6 +367,7 @@ protected: void ReadItemAttrName(); void ReadItemAttrValueAdd(const std::wstring & str); void ReadItemAttrValue(bool has_quote, wchar_t quote_char); + void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char); bool ReadItemAttr(); void CheckItemLangAttr();