/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2008-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "htmlparser.h" #include "convert/text.h" namespace pt { const int HTMLParser::WHITE_MODE_ORIGIN; const int HTMLParser::WHITE_MODE_SINGLE_LINE; const int HTMLParser::WHITE_MODE_TREE; void HTMLParser::Item::Clear() { name.clear(); type = none; is_commentary = false; porphans = nullptr; new_line_before = false; new_line = false; new_line_in_the_middle = false; has_body_tag = false; tree_index = 0; space = nullptr; } HTMLParser::Item::Item() { Clear(); } void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode) { parsing_html = true; reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = in; pchar_ascii = 0; xml_compact_mode = compact_mode; status = ok; line = 1; stack_len = 0; out_string = nullptr; out_space = &space; //last_new_line = false; line_len = 0; out_space->clear(); Init(); Read(); Uninit(); } HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space) { parsing_html = false; reading_from_file = true; xml_compact_mode = compact_mode; status = ok; line = 1; stack_len = 0; out_string = nullptr; line_len = 0; this->out_space = &out_space; if( clear_space ) this->out_space->clear(); file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); if( file ) { Init(); Read(); Uninit(); file.close(); } else { status = cant_open_file; } return status; } HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space) { return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); } HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space) { std::string file_name_utf8; wide_to_utf8(file_name, file_name_utf8); return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space); } HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space 
& out_space, bool compact_mode, bool clear_space) { return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space); } void HTMLParser::Filter(const wchar_t * in, std::wstring & out) { parsing_html = true; reading_from_file = false; reading_from_wchar_string = true; pchar_unicode = in; pchar_ascii = 0; stack_len = 0; out_string = &out; out_space = nullptr; //last_new_line = false; line_len = 0; out_string->clear(); Init(); Read(); Uninit(); } void HTMLParser::Filter(const std::wstring & in, std::wstring & out) { if( &in == &out ) { // out cannot be the same string as in return; } size_t out_projected_len = in.size() * 2 + 1; if( out.capacity() < out_projected_len ) out.reserve(out_projected_len); Filter(in.c_str(), out); } HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out) { parsing_html = true; reading_from_file = true; // open the file before clearing 'out' string, 'out' string can be the same string as the file_name file.clear(); file.open(file_name, std::ios_base::binary | std::ios_base::in); status = ok; line = 1; stack_len = 0; out_string = &out; out_space = nullptr; line_len = 0; out_string->clear(); if( file ) { Init(); Read(); Uninit(); file.close(); } else { status = cant_open_file; } return status; } HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out) { return filter_file(file_name.c_str(), out); } HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out) { std::string file_name_utf8; pt::wide_to_utf8(file_name, file_name_utf8); return filter_file(file_name_utf8, out); } HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out) { return filter_file(file_name.c_str(), out); } void HTMLParser::Init() { } void HTMLParser::Uninit() { } int HTMLParser::get_last_parsed_line() { return line; } void HTMLParser::SetSomeDefaults() { white_mode = WHITE_MODE_ORIGIN; tab_size = 2; wrap_line = 0; orphan_mode 
= orphan_nbsp; safe_mode = false; skip_tags = false; skip_commentaries = false; skip_entities = false; analyze_entities = false; } HTMLParser::HTMLParser() { pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; SetSomeDefaults(); } HTMLParser::HTMLParser(const HTMLParser & f) { // don't need to copy the stack pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; SetSomeDefaults(); } HTMLParser & HTMLParser::operator=(const HTMLParser & f) { // don't need to copy the stack pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; // we can copy some fields from f return *this; } HTMLParser::~HTMLParser() { delete [] pstack; delete [] buffer; } void HTMLParser::white_chars_mode(int mode) { if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE ) white_mode = mode; } void HTMLParser::WrapLine(size_t wrap_line_) { wrap_line = wrap_line_; if( wrap_line > 10000 ) wrap_line = 10000; } void HTMLParser::InsertTabs(size_t tabsize) { tab_size = tabsize; if( tab_size > 1000 ) tab_size = 1000; } int HTMLParser::current_white_char_mode() { if( !white_char_mode_tab.empty() ) return white_char_mode_tab.back(); return WHITE_MODE_ORIGIN; } void HTMLParser::CalcOrphansMaxLen(Orphans & orphans) { size_t i; orphans.max_len = 0; for(i=0 ; i orphans.max_len ) orphans.max_len = orphans.tab[i].size(); } } void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector & otab) { lang_code_lower = lang_code; ToLower(lang_code_lower); orphans_temp.tab = otab; std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end()); CalcOrphansMaxLen(orphans_temp); orphans_tab[lang_code_lower] = orphans_temp; } void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector & otab) { AssignOrphans(lang_code.c_str(), otab); } void HTMLParser::ClearOrphans() { orphans_tab.clear(); } void HTMLParser::OrphansMode(const std::wstring & 
orphan_mode_str) { if( orphan_mode_str == L"160" ) orphan_mode = orphan_160space; else orphan_mode = orphan_nbsp; } void HTMLParser::SafeMode(bool safe_mode_) { safe_mode = safe_mode_; } void HTMLParser::SkipTags(bool skip_tags) { this->skip_tags = skip_tags; } void HTMLParser::SkipCommentaries(bool skip_commentaries) { this->skip_commentaries = skip_commentaries; } void HTMLParser::SkipEntities(bool skip_entities) { this->skip_entities = skip_entities; if( this->skip_entities ) { this->analyze_entities = true; } } void HTMLParser::AnalyzeEntities(bool analyze_entities) { this->analyze_entities = analyze_entities; } void HTMLParser::SetNoFilterTag(const std::wstring & tag_name) { no_filter_tag = tag_name; } HTMLParser::Item & HTMLParser::GetItem(size_t i) { if( i >= stack_len ) { empty.Clear(); return empty; } return pstack[i]; } HTMLParser::Item & HTMLParser::LastItem() { if( stack_len == 0 ) { empty.Clear(); return empty; } return pstack[stack_len-1]; } bool HTMLParser::PushStack() { if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN ) // oops, too many items return false; pstack[stack_len].Clear(); if( stack_len > 0 ) { // 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated pstack[stack_len].porphans = pstack[stack_len-1].porphans; pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag; pstack[stack_len].tree_index = pstack[stack_len-1].tree_index; } stack_len += 1; return true; } void HTMLParser::PopStack() { if( stack_len == 0 ) // oops return; stack_len -= 1; pstack[stack_len].Clear(); } bool HTMLParser::IsWhite(int c) { // dont use c==10 here if( c==' ' || c=='\t' || c==13 || c==160 ) return true; return false; } void HTMLParser::SkipWhite(std::wstring * out_string) { while( IsWhite(lastc) ) { if( out_string ) (*out_string) += lastc; read_char(); } } void HTMLParser::SkipWhiteLines(std::wstring * out_string) { while( lastc==10 || IsWhite(lastc) ) { if( out_string ) (*out_string) += lastc; read_char(); } } void 
HTMLParser::SkipWhiteWithFirstNewLine() { SkipWhite(); if( lastc == 10 ) { read_char(); SkipWhite(); } } //void HTMLParser::CheckNewLine() //{ // if( white_mode == WHITE_MODE_TREE ) // { // SkipWhite(); // } // // last_new_line = (lastc==10); //} void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text) { bool is_quoted = false; wchar_t quote_char = 0; while( lastc != -1 ) { if( lastc == '"' || lastc == '\'' ) { if( is_quoted ) { if( lastc == quote_char ) { is_quoted = false; } } else { is_quoted = true; quote_char = lastc; } } else if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/' { LastItem().type = Item::simple; } else if( !is_quoted && IsClosingTagMark(lastc) ) { read_char(); break; } if( remember_text ) (*remember_text) += lastc; read_char(); } } bool HTMLParser::IsValidCharForName(int c) { if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary return true; return false; } bool HTMLParser::IsValidCharForAttrName(int c) { if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || c=='-' || c==':' || c=='_') return true; return false; } bool HTMLParser::IsValidCharForEntityName(int c) { if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || c=='#' ) return true; return false; } void HTMLParser::ReadItemName(std::wstring & name, bool clear_name) { size_t i; if( clear_name ) name.clear(); for(i=0 ; IsValidCharForName(lastc) ; ++i) { if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN ) { name += lastc; if( LastItem().type == Item::special && name == L"!--" ) { LastItem().is_commentary = true; read_char(); break; } } read_char(); } } void HTMLParser::ReadItemAttrName() { size_t i; attr_name.clear(); for( i=0 ; lastc != -1 && IsValidCharForAttrName(lastc) ; ++i ) { if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN ) attr_name += lastc; read_char(); } } void 
HTMLParser::ReadItemAttrValueAdd(const std::wstring & str) { if( analyze_entities ) { attr_value.push_back(std::wstring()); AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back()); } else { attr_value.push_back(str); } } void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char) { attr_value.clear(); tmp_text.clear(); while( lastc != -1 ) { if( has_quote ) { if( lastc == quote_char ) break; } else { if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) break; } if( lastc==10 || IsWhite(lastc) ) { if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) ReadItemAttrValueAdd(tmp_text); tmp_text.clear(); } else { if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) tmp_text.clear(); tmp_text += lastc; } read_char(); } if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) ReadItemAttrValueAdd(tmp_text); } void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char) { attr_value.clear(); tmp_text.clear(); while( lastc != -1 ) { if( has_quote ) { if( lastc == quote_char ) break; } else { if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) break; } // IMPROVEME add support for analyze_entities? 
if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) tmp_text += lastc; read_char(); } } void HTMLParser::CheckChar(wchar_t c) { if( c == 10 ) line_len = 0; else line_len += 1; } void HTMLParser::Put(wchar_t c) { if( out_string ) (*out_string) += c; CheckChar(c); } void HTMLParser::Put(const wchar_t * str, const wchar_t * end) { if( str >= end ) return; size_t len = end - str; if( out_string ) out_string->append(str, len); for( ; str < end ; ++str) CheckChar(*str); } void HTMLParser::Put(const std::wstring & str) { if( !str.empty() ) { if( out_string ) out_string->append(str); for(size_t i=0 ; i < str.size() ; ++i) CheckChar(str[i]); } } // out can be null void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out) { size_t epsilon = 8; // !! IMPROVE ME put as a constant const wchar_t * old_str = str; while( str < end ) { if( IsStartingEntityMark(*str) ) { const wchar_t * entity_start = str; str += 1; // skip & for(size_t i=0 ; *str && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str) { } if( IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name { if( out ) out->append(old_str, entity_start); else Put(old_str, entity_start); str += 1; // skip ; if( !skip_entities ) { if( out ) out->append(entity_start, str); else Put(entity_start, str); } EntityFound(entity_start + 1, str - 1); // without & and ; old_str = str; } } else { str += 1; } } if( out ) out->append(old_str, end); else Put(old_str, end); } int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str) { size_t res; const wchar_t * orphan = orphan_str.c_str(); for( ; str & table) { int res; if( table.empty() ) return false; size_t o1 = 0; size_t o2 = table.size() - 1; res = CheckOrphan(str, end, table[o1]); if( res == 0 ) return true; if( res < 0 ) return false; res = CheckOrphan(str, end, table[o2]); if( res == 0 ) return true; if( res > 0 ) return false; while( o1 + 1 < 
o2 ) { size_t o = (o1 + o2) / 2; res = CheckOrphan(str, end, table[o]); if( res == 0 ) return true; if( res < 0 ) o2 = o; else o1 = o; } return false; } bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end) { if( str==end || !LastItem().has_body_tag || !LastItem().porphans ) return false; size_t len = end - str; if( len > LastItem().porphans->max_len ) return false; return CheckOrphan(str, end, LastItem().porphans->tab); } void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space) { while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) ) { str += lastc; read_char(); } if( !str.empty() ) { if( allow_put_new_line ) { Put(10); PutTabs(LastItem().tree_index + 1); } else if( allow_put_space ) { Put(' '); } } if( analyze_entities ) AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr); else Put(str); } void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text) { was_white_char = false; was_new_line = false; while( lastc == 10 || IsWhite(lastc) ) { if( lastc == 10 ) was_new_line = true; else was_white_char = true; if( result_text ) (*result_text) += lastc; if( current_white_char_mode() == WHITE_MODE_ORIGIN ) { Put(lastc); } read_char(); } if( current_white_char_mode() == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) ) { Put(' '); } // in WHITE_MODE_TREE white characters are written at the beginning of a or text } void HTMLParser::PutOpeningTagMark() { Put('<'); } void HTMLParser::PutClosingTagMark() { Put('>'); } // !! 
IMPROVE ME change to a better name // this functions does not return true when the tag is safe bool HTMLParser::IsTagSafe(const wchar_t * tag) { if( !safe_mode ) return true; if( IsNameEqual(tag, no_filter_tag.c_str()) ) return false; static const wchar_t * unsafe_tags[] = { L"applet", L"base", L"body", L"embed", L"head", L"html", L"frame", L"frameset",L"iframe", L"link", L"meta", L"param" L"object", L"script" }; size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*); size_t i; for(i=0 ; i 30 ) len = 30; for(size_t i=0 ; i < (len*tab_size) ; ++i) { if( out_string ) (*out_string) += ' '; // we do not add them to 'line_len' } } void HTMLParser::PutNonBreakingSpace() { if( orphan_mode == orphan_nbsp ) { Put(L" "); } else { Put(160); } } // we assume the size of the opening mark to be one bool HTMLParser::IsOpeningTagMark(wchar_t c) { return (c == '<'); } // we assume the size of the closing mark to be one bool HTMLParser::IsClosingTagMark(wchar_t c) { return (c == '>'); } // the slash in the closing tag mark e.g.

// Tells whether 'c' is the closing tag indicator, i.e. the slash character.
// Only this single character is compared, nothing else is read or modified.
//
// c - the character to examine
// returns true when 'c' is equal to '/', false for any other character
bool HTMLParser::IsClosingTagIndicator(wchar_t c)
{
	const wchar_t closing_indicator = L'/';

	if( c == closing_indicator )
		return true;

	return false;
}


// the exclamation mark which starts a special tag, e.g. a doctype declaration or a commentary

bool HTMLParser::IsSpecialTagIndicator(wchar_t c) { return (c == '!'); } bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c) { return (c == '?'); } // the '=' operator e.g. class="value" bool HTMLParser::IsAttributeAssignmentMark(wchar_t c) { return (c == '='); } // the slash at the end (without '>' character) // we assume the size of the mark to be one bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c) { return (c == '/'); } bool HTMLParser::IsStartingEntityMark(wchar_t c) { return (c == '&'); } bool HTMLParser::IsEndingEntityMark(wchar_t c) { return (c == ';'); } // used for such tags as: script, pre, textarea void HTMLParser::ReadTextUntilClosingCommentary() { while( lastc != -1 ) { if( lastc == '-' ) { tmp_text.clear(); tmp_text += lastc; read_char(); if( lastc == '-' ) { tmp_text += lastc; read_char(); if( IsClosingTagMark(lastc) ) { tmp_text += lastc; read_char(); Put(tmp_text); break; } } Put(tmp_text); } else { Put(lastc); read_char(); } } } bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well) { tmp_text.clear(); tmp_text += lastc; // opening tag mark read_char(); SkipWhiteLines(&tmp_text); if( IsClosingTagIndicator(lastc) ) { tmp_text += lastc; read_char(); SkipWhiteLines(&tmp_text); ReadItemName(tmp_name); if( IsNameEqual(tmp_name, LastItem().name) ) { SkipAndCheckClosingTag(); if( put_closing_tag_as_well ) { Put('<'); Put('/'); Put(tmp_name); Put('>'); } return true; } else { Put(tmp_text); Put(tmp_name); } } else { Put(tmp_text); } return false; } // used for such tags as: script, pre, textarea void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well) { while( lastc != -1 ) { if( IsOpeningTagMark(lastc) ) { if( IsClosingTagForLastItem(put_closing_tag_as_well) ) { //CheckNewLine(); break; } } else { Put(lastc); read_char(); } } } // reading text between html tags void HTMLParser::ReadText() { bool was_white_char = false; bool was_new_line = false; bool was_non_white_text = false; bool allow_put_new_line = false; bool 
allow_put_space = false; if( current_white_char_mode() == WHITE_MODE_TREE ) { if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) ) { allow_put_new_line = true; } } Space * text_space = nullptr; std::wstring * text_space_wstr = nullptr; if( out_space ) { text_space = &text_space_tmp; text_space->clear(); text_space->add(L"name", L""); Space & wstr_space = text_space->add(L"text", L""); text_space_wstr = &wstr_space.value.value_wstring; } while( lastc != -1 && !IsOpeningTagMark(lastc) ) { tmp_text.clear(); PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space); if( !tmp_text.empty() ) { allow_put_new_line = false; allow_put_space = false; was_non_white_text = true; if( text_space_wstr ) (*text_space_wstr) += tmp_text; } if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) ) { if( lastc == 10 || IsWhite(lastc) ) { SkipWhiteLines(text_space_wstr); PutNonBreakingSpace(); } } else { PutNormalWhite(was_white_char, was_new_line, text_space_wstr); if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE ) { allow_put_new_line = false; allow_put_space = false; if( was_new_line ) { allow_put_new_line = true; LastItem().new_line_in_the_middle = true; if( !was_non_white_text ) LastItem().new_line = true; } else { allow_put_space = true; } if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line ) { allow_put_new_line = true; } } } } if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text ) { AddSpaceToSpaceTree(*text_space); } text_space_tmp.clear(); new_item_has_new_line_before = was_new_line; } bool HTMLParser::PrintOpeningItem() { if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) return true; return PutOpeningTag(); } bool HTMLParser::ReadItemAttr() { attr_has_value = false; attr_name.clear(); attr_value.clear(); SkipWhiteLines(); ReadItemAttrName(); if( attr_name.empty() ) return false; SkipWhiteLines(); if( 
!IsAttributeAssignmentMark(lastc) ) // '=' return true; attr_has_value = true; read_char(); // skipping '=' SkipWhiteLines(); bool has_quote = (lastc == '\"' || lastc == '\''); wchar_t quote_char = lastc; if( has_quote ) read_char(); // skipping the first quote mark // IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table... if( parsing_html ) ReadItemAttrValue(has_quote, quote_char); else ReadXMLItemAttrValue(has_quote, quote_char); if( has_quote && lastc == quote_char ) read_char(); // skipping the last quote mark return true; } void HTMLParser::CheckItemLangAttr() { if( attr_has_value && IsNameEqual(L"lang", attr_name) ) { LastItem().porphans = nullptr; if( !attr_value.empty() ) { // we are taking the first value only attr_value_lower = attr_value[0]; ToLower(attr_value_lower); OrphansTab::iterator i = orphans_tab.find(attr_value_lower); if( i != orphans_tab.end() ) LastItem().porphans = &i->second; } } } void HTMLParser::PrintItemAttr() { size_t i; if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) return; Put(' '); Put(attr_name); if( attr_has_value ) { Put(L"=\""); for(i=0 ; iget_add_space(L"attr"); Space & attr = attr_tab.add_empty_space(attr_name); if( attr_has_value ) { if( parsing_html ) { attr.set_empty_table(); for(size_t i=0 ; i < attr_value.size() ; ++i) { attr.add(attr_value[i]); } } else { attr.set(tmp_text); } } } } void HTMLParser::ReadItemClosing() { read_char(); // skipping '/' SkipWhiteLines(); ReadItemName(LastItem().name); LastItem().type = Item::closing; SkipAndCheckClosingTag(); // closing tags are printed later } void HTMLParser::ReadItemSpecial() { LastItem().type = Item::special; if( !skip_tags ) { if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before ) { Put(10); PutTabs(LastItem().tree_index); } PutOpeningTagMark(); } LastItem().name = lastc; read_char(); // skipping '!' or '?' 
ReadItemName(LastItem().name, false); if( skip_tags ) { SkipAndCheckClosingTag(); } else { if( LastItem().is_commentary ) { Put(LastItem().name); } else { tmp_text.clear(); SkipWhiteLines(); SkipAndCheckClosingTag(&tmp_text); Put(LastItem().name); Put(' '); Put(tmp_text); Put('>'); if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") ) { Put(10); Put(10); SkipWhiteLines(); } } } } void HTMLParser::ReadItemOpening() { LastItem().type = Item::opening; ReadItemName(LastItem().name); AddItemToSpace(); Space * space = LastItem().space; if( !xml_compact_mode && space ) space->add(L"name", LastItem().name); if( PrintOpeningItem() ) { while( ReadItemAttr() ) { CheckItemLangAttr(); PrintItemAttr(); PutItemAttrToSpace(); } SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) ) { if( LastItem().type == Item::simple ) Put(L" /"); PutClosingTagMark(); } } } void HTMLParser::ItemFound() { } void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end) { } bool HTMLParser::ReadItem() { if( lastc == -1 ) return false; if( !PushStack() ) return false; LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) LastItem().tree_index += 1; read_char(); // skipping the first opening tag mark '<' SkipWhiteLines(); if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) ) ReadItemSpecial(); else if( IsClosingTagIndicator(lastc) ) ReadItemClosing(); else ReadItemOpening(); // IMPROVE ME later CheckSingleItemExceptions() can change opening to single type ItemFound(); return true; } wchar_t HTMLParser::ToLower(wchar_t c) { if( c>='A' && c<='Z' ) return c - 'A' + 'a'; return c; } void HTMLParser::ToLower(std::wstring & str) { size_t i; for(i=0 ; i0 ; ++name1, ++name2, --len ) if( ToLower(*name1) 
!= ToLower(*name2) ) return false; if( len == 0 ) return true; return false; } bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len) { return IsNameEqual(name1, name2.c_str(), len); } bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len) { return IsNameEqual(name1.c_str(), name2, len); } bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len) { return IsNameEqual(name1.c_str(), name2.c_str(), len); } bool HTMLParser::IsLastTag(const wchar_t * name) { return IsNameEqual(name, LastItem().name); } bool HTMLParser::IsLastTag(const std::wstring & name) { return IsNameEqual(name, LastItem().name); } // checking exceptions for opening tags void HTMLParser::CheckSingleItemExceptions() { if( IsLastTag(L"meta") || IsLastTag(L"input") || IsLastTag(L"br") || IsLastTag(L"hr") || IsLastTag(L"img") || IsLastTag(L"link") || IsLastTag(L"param") || IsLastTag(L"col") || IsLastTag(L"area") ) { LastItem().type = Item::simple; PopStack(); return; } // move me to a better place if( IsLastTag(L"body") ) LastItem().has_body_tag = true; } void HTMLParser::CheckWhiteCharsExceptions(Item & item) { bool change_white_mode = false; // in safe_mode the script tag is ignored // if( !safe_mode && IsNameEqual(item.name, L"script") ) // { // change_white_mode = true; // } // if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") ) // { // change_white_mode = true; // } if( IsNameEqual(item.name, L"pre") ) { change_white_mode = true; } // move to CheckDifferentContentExceptions? 
if( IsNameEqual(item.name, no_filter_tag) ) { change_white_mode = true; } if( change_white_mode ) { if( item.type == Item::opening ) { white_char_mode_tab.push_back(WHITE_MODE_ORIGIN); } else { if( !white_char_mode_tab.empty() ) white_char_mode_tab.pop_back(); } } } void HTMLParser::CheckDifferentContentExceptions(Item & item) { if( !safe_mode && IsNameEqual(item.name, L"script") ) { ReadTextUntilClosingTag(true); PopStack(); } if( IsNameEqual(item.name, L"textarea") ) { ReadTextUntilClosingTag(true); PopStack(); } } void HTMLParser::AddForgottenTags() { int i; if( stack_len < 3 ) { PopStack(); return; } // we have forgotten to close some tags // looking whether there is a matching opening tag for(i=int(stack_len)-3 ; i>=0 ; --i) if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) ) break; if( i < 0 ) { // oops, there is no such a tag // we don't print the closing and the missing opening tag PopStack(); return; } for(int z=(int)stack_len-2 ; z>=i ; --z) { CheckWhiteCharsExceptions(pstack[z]); if( !skip_tags && pstack[z].new_line ) { if( current_white_char_mode() == WHITE_MODE_TREE ) { Put(10); PutTabs(pstack[z].tree_index); } } PutClosingTag(pstack[z]); pstack[z].Clear(); } //last_new_line = pstack[stack_len-1].new_line; // invalidate tags stack_len = i; } void HTMLParser::CheckStackPrintRest() { while( stack_len-- > 0 ) { if( stack_len==0 || pstack[stack_len-1].new_line ) { if( current_white_char_mode() == WHITE_MODE_TREE ) { Put(10); PutTabs(pstack[stack_len-1].tree_index); } else { Put(' '); } } PutClosingTag(pstack[stack_len]); } } void HTMLParser::CheckClosingTags() { if( stack_len == 0 ) return; // on the stack we have only opening tags // but only the last tag is a closing tag if( stack_len == 1 ) { // there is only last closing tag // we dont print it PopStack(); return; } // there are more than one tag if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) { 
CheckWhiteCharsExceptions(pstack[stack_len-1]); // last closing tag is from the previous one if( !skip_tags && pstack[stack_len-2].new_line ) { if( current_white_char_mode() == WHITE_MODE_TREE ) { Put(10); PutTabs(pstack[stack_len-2].tree_index); } } PutClosingTag(pstack[stack_len-1]); //last_new_line = pstack[stack_len-1].new_line; PopStack(); PopStack(); } else { AddForgottenTags(); } } bool HTMLParser::PrintRest() { //const wchar_t * start = pchar; // in safe mode we do not print the rest html code if( safe_mode || skip_tags ) return false; bool was_chars = false; while( lastc != -1 ) { Put(lastc); read_char(); was_chars = true; } return was_chars; // if( pchar > start ) // { // Put(start, pchar); // return true; // } //return false; } void HTMLParser::AddItemToSpace() { if( out_space && stack_len > 0 ) { Space * parent = out_space; if( stack_len > 1 ) { parent = pstack[stack_len-2].space; } if( xml_compact_mode ) { Space * space = parent->get_space(pstack[stack_len-1].name); if( space ) { if( space->is_table() ) { Space & child = space->add_empty_space(); pstack[stack_len-1].space = &child; } else { Space * tab = new Space(); tab->add(space); Space & child = tab->add_empty_space(); parent->value.value_object[pstack[stack_len-1].name] = tab; pstack[stack_len-1].space = &child; } } else { Space & space = parent->add_empty_space(pstack[stack_len-1].name); pstack[stack_len-1].space = &space; } } else { Space & childs_tab = parent->get_add_space(L"childs"); Space & child = childs_tab.add_empty_space(); pstack[stack_len-1].space = &child; } } } void HTMLParser::AddSpaceToSpaceTree(const Space & space) { const std::wstring * text = space.get_wstr(L"text"); if( out_space && stack_len > 0 && text ) { if( xml_compact_mode ) { Space * child_text = LastItem().space->get_space(L"text"); if( child_text ) { if( child_text->is_table() ) { child_text->add(*text); } else { Space * tab = new Space(); tab->add(*child_text); tab->add(*text); 
LastItem().space->value.value_object[L"text"] = tab; } } else { LastItem().space->add(L"text", *text); } } else { Space & childs_tab = LastItem().space->get_add_space(L"childs"); childs_tab.add(space); } } } void HTMLParser::ReadLoop() { while( status == ok && ReadItem() ) { if( LastItem().type == Item::opening ) { if( parsing_html ) { CheckSingleItemExceptions(); } CheckWhiteCharsExceptions(LastItem()); CheckDifferentContentExceptions(LastItem()); } else if( LastItem().type == Item::special ) { if( LastItem().is_commentary ) ReadTextUntilClosingCommentary(); PopStack(); } else if( LastItem().type == Item::simple ) { PopStack(); } else if( LastItem().type == Item::closing ) { CheckClosingTags(); } else { PopStack(); } if( status == ok ) { ReadText(); } is_first_item = false; } } void HTMLParser::Read() { read_char(); // put first character to lastc is_first_item = true; white_char_mode_tab.clear(); white_char_mode_tab.push_back(white_mode); if( current_white_char_mode() != WHITE_MODE_ORIGIN ) SkipWhiteLines(); // it can be some text or white lines before the first html tag (we print it if using filtering) // but they are not added to the Space tree ReadText(); // reading the whole html source ReadLoop(); // sometimes there can remain some html source (when there is no space on the stack) // we print the rest html without filtering (only if safe_mode is false) if( !PrintRest() ) CheckStackPrintRest(); } }