/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa
 */

/*
 * Copyright (c) 2008-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "htmlfilter.h"

#include <algorithm> // std::sort (used by AssignOrphans)


namespace pt
{


// ---------------------------------------------------------------------------
// HTMLFilter::Item
// ---------------------------------------------------------------------------

// Resets the item: empty name, type 'none', no orphans table,
// no trailing new line and not inside a <body> tag.
void HTMLFilter::Item::Clear()
{
    name.clear();
    type         = none;
    porphans     = nullptr;
    new_line     = false;
    has_body_tag = false;
}


HTMLFilter::Item::Item()
{
    Clear();
}


// ---------------------------------------------------------------------------
// entry points
// ---------------------------------------------------------------------------

// Filters the null-terminated html source 'in' and writes the result to 'out'
// ('out' is cleared first). The per-run state (input position, stack length,
// line length, last_new_line) is reset before reading.
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
{
    pchar         = in;
    stack_len     = 0;
    out_string    = &out;
    last_new_line = false;
    line_len      = 0;
    out_string->clear();

    Init();
    Read();
    Uninit();
}


// Hook called by Filter() before reading; does nothing here.
void HTMLFilter::Init()
{
}


// Hook called by Filter() after reading; does nothing here.
void HTMLFilter::Uninit()
{
}


// std::wstring variant of Filter().
// 'out' must be a different object than 'in'; when they are the same
// string nothing is done. Capacity of 'out' is reserved up front
// (twice the input size plus one) before the pointer variant is called.
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
    if( &in == &out )
    {
        // out cannot be the same string as in
        return;
    }

    const size_t out_projected_len = in.size() * 2 + 1;

    if( out.capacity() < out_projected_len )
        out.reserve(out_projected_len);

    Filter(in.c_str(), out);
}


// ---------------------------------------------------------------------------
// construction / configuration
// ---------------------------------------------------------------------------

// Sets the default values of all filter options.
void HTMLFilter::SetSomeDefaults()
{
    tab_size          = 2;
    trim_white        = false;
    break_after       = 0;
    wrap_line         = 0;
    orphan_mode       = orphan_nbsp;
    safe_mode         = false;
    skip_tags         = false;
    skip_commentaries = false;
    skip_entities     = false;
    analyze_entities  = false;
}


// Allocates the item stack and the helper buffer and sets default options.
HTMLFilter::HTMLFilter()
{
    pstack    = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer    = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
    stack_len = 0; // LastItem()/GetItem() read it, so it must not be left uninitialized

    SetSomeDefaults();
}


// Copy constructor: the stack and the buffer are not copied, fresh ones are
// allocated, and the options are reset to the defaults (not taken from 'f').
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
    (void)f; // nothing is taken from f (see above)

    // don't need to copy the stack
    pstack    = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer    = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
    stack_len = 0;

    SetSomeDefaults();
}


// Assignment operator.
// Every constructor already allocates 'pstack' and 'buffer' and their content
// does not need to be copied, so the arrays owned by *this are simply kept.
// (Allocating new arrays here without deleting the old ones leaked memory
// on every assignment.)
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
    (void)f;

    // don't need to copy the stack
    // TODO: we can copy some fields (options) from f

    return *this;
}


HTMLFilter::~HTMLFilter()
{
    delete [] pstack;
    delete [] buffer;
}


// Sets after how many non white characters a word is broken
// (0 turns it off, see PutNormalNonWhite()); values above 10000 are clamped.
void HTMLFilter::BreakWord(size_t break_after_)
{
    break_after = break_after_;

    if( break_after > 10000 )
        break_after = 10000;
}


// Sets the line length after which a line is wrapped
// (0 turns it off, see CheckLineWrap()); values above 10000 are clamped.
void HTMLFilter::WrapLine(size_t wrap_line_)
{
    wrap_line = wrap_line_;

    if( wrap_line > 10000 )
        wrap_line = 10000;
}


// Turns trimming of white characters on or off.
void HTMLFilter::TrimWhite(bool trim)
{
    trim_white = trim;
}


// Sets how many spaces make one indentation level (see PutTabs());
// values above 1000 are clamped.
void HTMLFilter::InsertTabs(size_t tabsize)
{
    tab_size = tabsize;

    if( tab_size > 1000 )
        tab_size = 1000;
}


// Stores in orphans.max_len the length of the longest entry of orphans.tab
// (0 for an empty table).
// NOTE(review): the loop header was damaged in the source text and has been
// reconstructed from the visible body.
void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
{
    orphans.max_len = 0;

    for(size_t i = 0 ; i < orphans.tab.size() ; ++i)
    {
        if( orphans.tab[i].size() > orphans.max_len )
            orphans.max_len = orphans.tab[i].size();
    }
}


// Registers the orphans table 'otab' for the language 'lang_code'
// (the code is stored lower-cased, see CheckItemAttr()).
//
// The table is sorted with the very same comparison CheckOrphan() uses for
// its binary search. Sorting with the default (case-sensitive) operator<
// gave an order the case-insensitive lookup could not rely on when an entry
// contained upper-case letters.
//
// NOTE(review): the element type of the vector was stripped from the source
// text; std::wstring is assumed (entries are passed to CheckOrphan() as
// const std::wstring &) -- confirm against htmlfilter.h.
void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
{
    lang_code_lower = lang_code;
    ToLower(lang_code_lower);

    orphans_temp.tab = otab;

    std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end(),
        [this](const std::wstring & a, const std::wstring & b)
        {
            return CheckOrphan(a.c_str(), a.c_str() + a.size(), b) < 0;
        });

    CalcOrphansMaxLen(orphans_temp);

    orphans_tab[lang_code_lower] = orphans_temp;
}


void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
    AssignOrphans(lang_code.c_str(), otab);
}


// Removes all registered orphans tables.
void HTMLFilter::ClearOrphans()
{
    orphans_tab.clear();
}


// Selects what is written after an orphan: the string L"160" selects
// orphan_160space, any other value selects orphan_nbsp
// (see PutNonBreakingSpace()).
void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
{
    if( orphan_mode_str == L"160" )
        orphan_mode = orphan_160space;
    else
        orphan_mode = orphan_nbsp;
}


void HTMLFilter::SafeMode(bool safe_mode_)
{
    safe_mode = safe_mode_;
}


void HTMLFilter::SkipTags(bool skip_tags)
{
    this->skip_tags = skip_tags;
}


void HTMLFilter::SkipCommentaries(bool skip_commentaries)
{
    this->skip_commentaries = skip_commentaries;
}


// Skipping entities needs them to be recognized first,
// so turning it on turns analyze_entities on as well.
void HTMLFilter::SkipEntities(bool skip_entities)
{
    this->skip_entities = skip_entities;

    if( this->skip_entities )
    {
        this->analyze_entities = true;
    }
}


void HTMLFilter::AnalyzeEntities(bool analyze_entities)
{
    this->analyze_entities = analyze_entities;
}


// Sets the name of the tag whose content is written without filtering
// (see CheckExceptions()).
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{
    no_filter_tag = tag_name;
}


// ---------------------------------------------------------------------------
// stack of items
// ---------------------------------------------------------------------------

// Returns the i-th item from the stack,
// or the (cleared) 'empty' item when i is out of range.
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
    if( i >= stack_len )
    {
        empty.Clear();
        return empty;
    }

    return pstack[i];
}


// Returns the item from the top of the stack,
// or the (cleared) 'empty' item when the stack is empty.
HTMLFilter::Item & HTMLFilter::LastItem()
{
    if( stack_len == 0 )
    {
        empty.Clear();
        return empty;
    }

    return pstack[stack_len-1];
}


// Pushes a cleared item on the stack.
// Returns false when the stack is full.
bool HTMLFilter::PushStack()
{
    if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
        // oops, too many items
        return false;

    pstack[stack_len].Clear();

    if( stack_len > 0 )
    {
        // 'porphans' and 'has_body_tag' attributes are propagated
        pstack[stack_len].porphans     = pstack[stack_len-1].porphans;
        pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
    }

    stack_len += 1;

    return true;
}


// Removes (and clears) the item from the top of the stack;
// does nothing when the stack is empty.
void HTMLFilter::PopStack()
{
    if( stack_len == 0 )
        // oops
        return;

    stack_len -= 1;
    pstack[stack_len].Clear();
}


// ---------------------------------------------------------------------------
// white characters
// ---------------------------------------------------------------------------

// A space, a tab, CR (13) and the character 160 are white characters;
// the new line character (10) is deliberately not one of them.
bool HTMLFilter::IsWhite(int c)
{
    // don't use c==10 here
    return c==' ' || c=='\t' || c==13 || c==160;
}


// Skips white characters (without new lines).
void HTMLFilter::SkipWhite()
{
    while( IsWhite(*pchar) )
        ++pchar;
}


// Skips white characters and new lines.
void HTMLFilter::SkipWhiteLines()
{
    while( *pchar==10 || IsWhite(*pchar) )
        ++pchar;
}


// Skips white characters, at most one new line and the white characters after it.
void HTMLFilter::SkipWhiteWithFirstNewLine()
{
    SkipWhite();

    if( *pchar == 10 )
    {
        pchar += 1;
        SkipWhite();
    }
}


// Skips white characters and new lines in the range [str, end); 'str' is advanced.
void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
{
    while( str < end && (*str==10 || IsWhite(*str)) )
        ++str;
}


// Sets last_new_line to true when only white characters separate the current
// position from a new line character. The input position is not changed.
void HTMLFilter::CheckNewLine()
{
    const wchar_t * start = pchar;

    SkipWhite();
    last_new_line = (*pchar==10);

    pchar = start;
}


// ---------------------------------------------------------------------------
// reading of tags
// ---------------------------------------------------------------------------

// pchar should point at an opening tag mark.
// Returns true when a closing tag for the last item starts there; pchar is then
// set just after the closing tag. When false is returned pchar has still been
// advanced (at least past the opening mark) -- PutEverythingUntilClosingTag()
// depends on this to make progress.
bool HTMLFilter::IsClosingTagForLastItem()
{
    pchar += 1;
    SkipWhite();

    if( *pchar != '/' )
        return false;

    pchar += 1;
    SkipWhite();

    if( !IsNameEqual(pchar, LastItem().name, LastItem().name.size()) )
        return false;

    pchar += LastItem().name.size();
    SkipWhite();

    if( !IsClosingTagMark(*pchar) )
        return false;

    pchar += 1;

    return true;
}


// used for such tags as: script, pre, textarea
//
// Writes the input unchanged until the closing tag of the last item is found
// (or until the end of the input). The closing tag itself is written only when
// put_closing_tag_as_well is true. When the closing tag is found the item is
// removed from the stack.
void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
{
    const wchar_t * start = pchar;
    const wchar_t * end   = pchar;

    while( *pchar != 0 )
    {
        if( IsOpeningTagMark(*pchar) )
        {
            if( IsClosingTagForLastItem() )
            {
                if( put_closing_tag_as_well )
                    end = pchar;

                PopStack();
                CheckNewLine();
                break;
            }
        }
        else
        {
            pchar += 1;
            end = pchar;
        }
    }

    Put(start, end);
}


// Skips the rest of a tag up to and including its closing mark; a closing mark
// inside a quoted string ("..." or '...') is ignored.
// When the xml simple-tag mark ('/') is met outside quotes and the last item is
// an opening tag, the type of the item is changed to 'simple'.
void HTMLFilter::SkipAndCheckClosingTag()
{
    bool    is_quoted  = false;
    wchar_t quote_char = 0;

    for( ; *pchar ; ++pchar )
    {
        if( *pchar == '"' || *pchar == '\'' )
        {
            if( !is_quoted )
            {
                is_quoted  = true;
                quote_char = *pchar;
            }
            else
            if( *pchar == quote_char )
            {
                is_quoted = false;
            }
        }
        else
        if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/'
        {
            LastItem().type = Item::simple;
        }
        else
        if( !is_quoted && IsClosingTagMark(*pchar) )
        {
            ++pchar;
            break;
        }
    }
}


bool HTMLFilter::IsValidCharForName(int c)
{
    return (c>='a' && c<='z') ||
           (c>='A' && c<='Z') ||
           (c>='0' && c<='9') ||
           c=='-' || c=='!' || c==':'; // : for namespace character
}


bool HTMLFilter::IsValidCharForAttrName(int c)
{
    return (c>='a' && c<='z') ||
           (c>='A' && c<='Z') ||
           (c>='0' && c<='9') ||
           c=='-' || c==':';
}


bool HTMLFilter::IsValidCharForEntityName(int c)
{
    return (c>='a' && c<='z') ||
           (c>='A' && c<='Z') ||
           (c>='0' && c<='9') ||
           c=='#';
}


// Reads the name of the tag and appends it to the name of the last item.
// All valid characters are consumed but only the first
// WINIX_HTMLFILTER_ITEM_NAME_MAXLEN of them are stored.
void HTMLFilter::ReadItemName()
{
    for(size_t i = 0 ; IsValidCharForName(*pchar) ; ++i, ++pchar)
    {
        if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
            LastItem().name += *pchar;
    }
}


// Reads the name of an attribute to attr_name.
// All valid characters are consumed but only the first
// WINIX_HTMLFILTER_ATTR_NAME_MAXLEN of them are stored.
void HTMLFilter::ReadItemAttrName()
{
    attr_name.clear();

    for(size_t i = 0 ; *pchar && IsValidCharForAttrName(*pchar) ; ++i, ++pchar)
    {
        if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
            attr_name += *pchar;
    }
}


// Appends [value_start, value_end) as a new element of attr_value
// (entities are analyzed first when analyze_entities is set).
void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
{
    attr_value.push_back(std::wstring());

    if( analyze_entities )
        AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back());
    else
        attr_value.back().append(value_start, value_end);
}


// Reads the value of an attribute to attr_value, one element per
// white-separated word. A quoted value ends at 'quote_char', an unquoted one at
// the closing tag mark or at the first white character or new line.
// Words longer than WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN are dropped.
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
{
    attr_value.clear();

    const wchar_t * value_start = pchar;
    size_t value_len = 0; // how many non white characters

    for( ; *pchar ; ++pchar )
    {
        const bool is_white = (*pchar == 10 || IsWhite(*pchar));

        if( has_quote )
        {
            if( *pchar == quote_char )
                break;
        }
        else
        {
            if( IsClosingTagMark(*pchar) || is_white )
                break;
        }

        if( is_white )
        {
            if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
                ReadItemAttrValueAdd(value_start, pchar);

            value_len = 0;
        }
        else
        {
            if( value_len == 0 )
                value_start = pchar;

            value_len += 1;
        }
    }

    if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
        ReadItemAttrValueAdd(value_start, pchar);
}


// ---------------------------------------------------------------------------
// output
// ---------------------------------------------------------------------------

// Updates line_len for a character which has just been written.
void HTMLFilter::CheckChar(wchar_t c)
{
    if( c == 10 )
        line_len = 0;
    else
        line_len += 1;
}


void HTMLFilter::Put(wchar_t c)
{
    (*out_string) += c;
    CheckChar(c);
}


void HTMLFilter::Put(const wchar_t * str)
{
    out_string->append(str);

    for( ; *str ; ++str)
        CheckChar(*str);
}


// Writes the range [str, end); nothing is written for an empty range.
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
{
    if( str >= end )
        return;

    const size_t len = end - str;
    out_string->append(str, len);

    for( ; str < end ; ++str)
        CheckChar(*str);
}


// NOTE(review): the loop of this method was damaged in the source text and has
// been reconstructed by analogy with the other Put() overloads.
void HTMLFilter::Put(const std::wstring & str)
{
    out_string->append(str);

    for(size_t i = 0 ; i < str.size() ; ++i)
        CheckChar(str[i]);
}


// Writes [str, end) looking for entities (&name;) in it.
// The text is appended to 'out' or, when 'out' is null, written with Put().
// Every entity is reported with EntityFound() (name without '&' and ';') and
// is left out of the output when skip_entities is set.
//
// NOTE(review): the beginning of this method (signature, local variables and
// the scan of the entity name) was lost in the source text and has been
// reconstructed from the visible tail and from the two call sites. Any limit
// on the length of an entity name the original may have had is not reproduced
// here -- confirm against the upstream file.
void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
{
    const wchar_t * old_str = str; // beginning of the text which has not been written yet

    while( str < end )
    {
        if( IsStartingEntityMark(*str) )
        {
            const wchar_t * entity_start = str;
            str += 1; // skip &

            while( str < end && IsValidCharForEntityName(*str) )
                str += 1;

            if( str < end && IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
            {
                if( out )
                    out->append(old_str, entity_start);
                else
                    Put(old_str, entity_start);

                str += 1; // skip ;

                if( !skip_entities )
                {
                    if( out )
                        out->append(entity_start, str);
                    else
                        Put(entity_start, str);
                }

                EntityFound(entity_start + 1, str - 1); // without & and ;
                old_str = str;
            }
        }
        else
        {
            str += 1;
        }
    }

    if( out )
        out->append(old_str, end);
    else
        Put(old_str, end);
}


// ---------------------------------------------------------------------------
// orphans
// ---------------------------------------------------------------------------

// Compares (case-insensitively for A-Z) the word [str, end) with orphan_str.
// Returns zero when they are equal, a negative value when the word is ordered
// before orphan_str and a positive value otherwise.
//
// NOTE(review): the body of this method was lost in the source text and has
// been reconstructed from the way the result is used by the binary search.
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
{
    const wchar_t * orphan = orphan_str.c_str();

    for( ; str < end && *orphan != 0 ; ++str, ++orphan )
    {
        const int res = static_cast<int>(ToLower(*str)) - static_cast<int>(ToLower(*orphan));

        if( res != 0 )
            return res;
    }

    if( str < end )
        return static_cast<int>(ToLower(*str)); // the word is longer

    return -static_cast<int>(ToLower(*orphan)); // zero when both have ended
}


// Binary search of the word [str, end) in the table
// (the table must be sorted, see AssignOrphans()).
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
{
    if( table.empty() )
        return false;

    size_t o1 = 0;
    size_t o2 = table.size() - 1;

    int res = CheckOrphan(str, end, table[o1]);

    if( res == 0 )
        return true;

    if( res < 0 )
        return false; // before the first entry

    res = CheckOrphan(str, end, table[o2]);

    if( res == 0 )
        return true;

    if( res > 0 )
        return false; // after the last entry

    while( o1 + 1 < o2 )
    {
        const size_t o = (o1 + o2) / 2;
        res = CheckOrphan(str, end, table[o]);

        if( res == 0 )
            return true;

        if( res < 0 )
            o2 = o;
        else
            o1 = o;
    }

    return false;
}


// Returns true when the word [str, end) is an orphan in the language of the
// last item. Orphans are looked for only inside a <body> tag.
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
{
    if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
        return false;

    const size_t len = end - str;

    if( len > LastItem().porphans->max_len )
        return false;

    return CheckOrphan(str, end, LastItem().porphans->tab);
}


// ---------------------------------------------------------------------------
// normal text
// ---------------------------------------------------------------------------

// if there is a semicolon nearby then we break the line after it
// (useful in html entities)
// TODO: also check whether the rest of the string (end-str) is short,
// and in such a case do not add any character either
//
// NOTE(review): the end of this method was lost in the source text and has been
// reconstructed (the ending mark is looked for in at most 'epsilon' characters).
bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
{
    const size_t epsilon = 8; // !! IMPROVE ME put as a constant

    for(size_t i = 0 ; str < end && i < epsilon ; ++i, ++str)
    {
        if( IsEndingEntityMark(*str) )
            return true;
    }

    return false;
}


// Starts a new (indented) line when the current line is longer than wrap_line.
//
// NOTE(review): the condition was partly lost in the source text (only
// "... wrap_line )" is visible). It has been reconstructed; the has_body_tag
// part in particular is an assumption -- confirm against the upstream file.
void HTMLFilter::CheckLineWrap()
{
    if( wrap_line != 0 && LastItem().has_body_tag && line_len > wrap_line )
    {
        Put(static_cast<wchar_t>(10));
        PutTabs(stack_len);
    }
}


// Writes one word (characters up to a white character, a new line or 'end');
// 'str' is advanced to the end of the word.
// When break_after is set, a space is inserted after every break_after
// characters, but not in the middle of what looks like an entity.
void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
{
    const wchar_t * word = str;
    size_t non_whites    = 0;
    bool was_entity_end  = false;

    for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
    {
        if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
        {
            Put(word, str);
            word       = str;
            non_whites = 0;
            Put(' ');
            CheckLineWrap();
        }

        was_entity_end = IsEndingEntityMark(*str);
    }

    if( analyze_entities )
        AnalyzeEntitiesAndPut(word, str, nullptr);
    else
        Put(word, str);
}


// Writes the white characters 'str' points at; 'str' is advanced past them.
// With trim_white a single space is written instead of them, otherwise they are
// copied and every new line is followed by the indentation.
// Note: with trim_white the space is written even when 'str' does not point at
// a white character.
void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
{
    if( str >= end )
        return;

    if( trim_white )
    {
        Put(' ');
        SkipWhiteLines(str, end);
        return;
    }

    for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str )
    {
        Put(*str);

        if( *str == 10 )
            PutTabs(stack_len);
    }
}


// Writes the text [str, end) word by word: lines are wrapped, long words are
// broken and white characters after an orphan are replaced with
// a non-breaking space.
void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
{
    if( str < end )
        CheckLineWrap();

    while( str < end )
    {
        const wchar_t * word = str;
        PutNormalNonWhite(str, end);

        if( CheckOrphan(word, str) )
        {
            const wchar_t * white = str;
            SkipWhiteLines(str, end);

            if( white < str )
                PutNonBreakingSpace();
        }
        else
        {
            PutNormalWhite(str, end);

            if( str < end ) // TODO: or maybe this should be done in the same way as at the top of the method?
                CheckLineWrap();
        }

        // for safety (if str was not incremented then there is an infinite loop)
        if( word == str )
            break;
    }
}


void HTMLFilter::PutOpeningTagMark()
{
    Put('<');
}


void HTMLFilter::PutClosingTagMark()
{
    Put('>');
}


// ---------------------------------------------------------------------------
// safe mode and printing of tags
// ---------------------------------------------------------------------------

// !! IMPROVE ME change to a better name
// Returns true when the tag may be written to the output: always when
// safe_mode is off, otherwise only when the tag is neither the no_filter_tag
// nor one of the unsafe tags listed below.
//
// A comma was missing between L"param" and L"object": the two literals were
// concatenated to L"paramobject" and neither tag was treated as unsafe.
//
// NOTE(review): the loop over the table was lost in the source text
// and has been reconstructed.
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
{
    if( !safe_mode )
        return true;

    if( IsNameEqual(tag, no_filter_tag.c_str()) )
        return false;

    static const wchar_t * unsafe_tags[] = {
        L"applet", L"base",     L"body",
        L"embed",  L"head",     L"html",
        L"frame",  L"frameset", L"iframe",
        L"link",   L"meta",     L"param",
        L"object", L"script"
    };

    const size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);

    for(size_t i = 0 ; i < len ; ++i)
    {
        if( IsNameEqual(tag, unsafe_tags[i]) )
            return false;
    }

    return true;
}


// NOTE(review): this overload is not visible in the (damaged) source text;
// it is reconstructed on the assumption that htmlfilter.h declares it.
bool HTMLFilter::IsTagSafe(const std::wstring & tag)
{
    return IsTagSafe(tag.c_str());
}


// Writes the opening mark and the name of the last item.
// When the tag is not safe nothing is written, the whole tag is skipped in the
// input and false is returned (ReadItemOpening() then does not read attributes).
//
// NOTE(review): this method was lost in the source text and has been
// reconstructed from its callers (PrintOpeningItem(), ReadItemOpening()).
bool HTMLFilter::PutOpeningTag()
{
    if( !IsTagSafe(LastItem().name.c_str()) )
    {
        SkipAndCheckClosingTag();
        return false;
    }

    PutOpeningTagMark();
    Put(LastItem().name);

    return true;
}


// Writes a closing tag (</tag>); nothing is written when tags are skipped
// or the tag is not safe.
//
// NOTE(review): this method was lost in the source text and has been
// reconstructed from its callers, which pass only the name of the tag.
void HTMLFilter::PutClosingTag(const wchar_t * tag)
{
    if( skip_tags || !IsTagSafe(tag) )
        return;

    PutOpeningTagMark();
    Put('/');
    Put(tag);
    PutClosingTagMark();
}


// Writes the indentation for the level 'len' (at most 30 levels),
// tab_size spaces for each level.
// NOTE(review): the signature was lost in the source text; reconstructed from the callers.
void HTMLFilter::PutTabs(size_t len)
{
    if( len > 30 )
        len = 30;

    for(size_t i = 0 ; i < (len*tab_size) ; ++i)
        (*out_string) += ' '; // we do not add them to 'line_len'
}


// Writes a non-breaking space: as an entity in the orphan_nbsp mode,
// as the character 160 otherwise.
// NOTE(review): the entity literal was decoded to a plain character in the
// source text; it is restored as "&nbsp;" here (a raw character would make
// both branches identical).
void HTMLFilter::PutNonBreakingSpace()
{
    if( orphan_mode == orphan_nbsp )
        Put(L"&nbsp;");
    else
        Put(static_cast<wchar_t>(160));
}


// Writes a new line character (through the helper buffer) and resets line_len.
void HTMLFilter::PutNewLine()
{
    buffer[0] = 10;
    Put(buffer, buffer+1);
    line_len = 0;
}


// ---------------------------------------------------------------------------
// marks
// ---------------------------------------------------------------------------

// we assume the size of the opening mark to be one
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
{
    return (c == '<');
}


// we assume the size of the closing mark to be one
bool HTMLFilter::IsClosingTagMark(wchar_t c)
{
    return (c == '>');
}


// the slash at the end <img src=".." /> (without '>' character)
// we assume the size of the mark to be one
bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
{
    return (c == '/');
}


// Returns true when an opening commentary mark starts at 'str'.
// NOTE(review): the literal and the rest of this method were lost in the source
// text (it was taken for an html commentary) and have been reconstructed.
bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
{
    static const wchar_t comm_open[] = L"<!--";
    const size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;

    return IsNameEqual(str, comm_open, comm_open_len);
}


// Length of the opening commentary mark.
// NOTE(review): reconstructed (not visible in the damaged source text);
// the return type must match the declaration in htmlfilter.h.
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
    return 4;
}


// NOTE(review): IsStartingEntityMark() and IsEndingEntityMark() are called in
// this file but their definitions are not visible in the damaged source text.
// They are reconstructed here on the assumption that they are defined
// out-of-line like the other mark predicates; remove them if htmlfilter.h
// already defines them.
bool HTMLFilter::IsStartingEntityMark(wchar_t c)
{
    return (c == '&');
}


bool HTMLFilter::IsEndingEntityMark(wchar_t c)
{
    return (c == ';');
}


// Skips a commentary when one starts at the current position: pchar is set
// after the closing mark of the commentary (or at the end of the input when
// there is no closing mark). Returns false when there is no commentary.
// NOTE(review): the head of this method was lost in the source text and has
// been reconstructed.
bool HTMLFilter::SkipCommentaryTagIfExists()
{
    static const wchar_t comm_close[] = L"-->";
    const size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;

    if( !IsOpeningCommentaryTagMark(pchar) )
        return false;

    pchar += OpeningCommentaryTagMarkSize();

    // looking for the closing mark of the commentary
    while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
        ++pchar;

    if( *pchar != 0 )
        pchar += comm_close_len;

    CheckNewLine();

    return true;
}


// ---------------------------------------------------------------------------
// reading of normal text
// ---------------------------------------------------------------------------

// Called when the previous tag was followed by a new line: skips the white
// characters at the beginning of the text, sets 'start' to the first character
// left and writes a new line with the indentation unless a tag (other than
// a commentary) follows.
void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
{
    if( trim_white )
    {
        // skipping all white chars (with new lines)
        // but with remembering the last new line character
        for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
        {
            if( *pchar == 10 )
                last_non_white = pchar;
        }
    }
    else
    {
        // skipping first white chars with only one line between them
        SkipWhite();
        last_non_white = pchar;

        if( *pchar == 10 )
        {
            ++pchar;
            SkipWhite();
        }
    }

    start = pchar;

    // exception for the commentary tag
    if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
    {
        PutNewLine();
        PutTabs(stack_len);
    }
}


// reading text between html tags
// (commentaries are part of the text; they are left out
// when skip_commentaries is set)
void HTMLFilter::ReadNormalText()
{
    const wchar_t * start          = pchar;
    const wchar_t * last_non_white = pchar;

    if( last_new_line )
        ReadNormalTextSkipWhite(start, last_non_white);

    while( *pchar != 0 )
    {
        const wchar_t * commentary_start = pchar;

        if( SkipCommentaryTagIfExists() )
        {
            last_non_white = pchar - 1; // pointing at the last '>' from a commentary
            PutNormalText(start, commentary_start);

            if( !skip_commentaries )
                PutNormalText(commentary_start, pchar);

            start = pchar;
        }
        else
        {
            if( IsOpeningTagMark(*pchar) )
                break;

            if( !IsWhite(*pchar) )
                last_non_white = pchar;

            pchar += 1;
        }
    }

    // a new line character is not a white character for IsWhite()
    // so last_non_white can point at it
    last_new_line = (*last_non_white == 10);
    PutNormalText(start, pchar);
}


// ---------------------------------------------------------------------------
// reading of items
// ---------------------------------------------------------------------------

// Writes the beginning of an opening tag (the mark and the name).
// Returns false when the tag has been rejected (and skipped) by PutOpeningTag().
// Nothing is written when tags are skipped or the tag is the no_filter_tag.
bool HTMLFilter::PrintOpeningItem()
{
    if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
        return true;

    if( last_new_line )
    {
        PutNewLine();

        if( stack_len > 1 )
            PutTabs(stack_len-1);
    }

    return PutOpeningTag();
}


// Reads one attribute (name and optional value) of the current tag to
// attr_name, attr_value and attr_has_value.
// Returns false when there are no more attributes.
bool HTMLFilter::ReadItemAttr()
{
    attr_has_value = false;
    attr_name.clear();
    attr_value.clear();

    SkipWhiteLines();
    ReadItemAttrName();

    if( attr_name.empty() )
        return false;

    SkipWhiteLines();

    if( *pchar != '=' )
        return true; // an attribute without a value

    attr_has_value = true;
    pchar += 1; // skipping '='
    SkipWhiteLines();

    const bool    has_quote  = (*pchar == '\"' || *pchar == '\'');
    const wchar_t quote_char = *pchar;

    if( has_quote )
        pchar += 1; // skipping the first quote mark

    ReadItemAttrValue(has_quote, quote_char);

    if( has_quote && *pchar == quote_char )
        pchar += 1; // skipping the last quote mark

    return true;
}


// Called for every attribute which has been read; for the 'lang' attribute
// selects the orphans table of the last item. Always returns true
// (the attribute should be printed).
bool HTMLFilter::CheckItemAttr()
{
    if( attr_has_value && IsNameEqual(L"lang", attr_name) )
    {
        LastItem().porphans = nullptr;

        if( !attr_value.empty() )
        {
            // we are taking the first value only
            attr_value_lower = attr_value[0];
            ToLower(attr_value_lower);

            auto i = orphans_tab.find(attr_value_lower);

            if( i != orphans_tab.end() )
                LastItem().porphans = &i->second;
        }
    }

    return true;
}


// Writes the attribute which has just been read: name="value1 value2 ...".
// Nothing is written when tags are skipped or the tag is the no_filter_tag.
// NOTE(review): the part after the opening quote was lost in the source text
// and has been reconstructed (values separated with single spaces).
void HTMLFilter::PrintItemAttr()
{
    if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
        return;

    Put(' ');
    Put(attr_name);

    if( attr_has_value )
    {
        Put(L"=\"");

        for(size_t i = 0 ; i < attr_value.size() ; ++i)
        {
            if( i != 0 )
                Put(' ');

            Put(attr_value[i]);
        }

        Put('\"');
    }
}


// Reads a closing tag; pchar points at '/'.
// NOTE(review): this method was lost in the source text and has been
// reconstructed (ReadLoop() expects the type to be Item::closing).
void HTMLFilter::ReadItemClosing()
{
    pchar += 1; // skipping '/'
    SkipWhiteLines();
    ReadItemName();
    LastItem().type = Item::closing;
    SkipAndCheckClosingTag();

    // closing tags are printed later
}


// Reads a special tag (for example DOCTYPE); pchar points at '!'.
// The tag is written as it is in the source.
// NOTE(review): the head of this method was lost in the source text and has
// been reconstructed; in particular the skip_tags checks are an assumption
// (only "... start ) Put(start, pchar);" is visible).
void HTMLFilter::ReadItemSpecial()
{
    LastItem().type = Item::special;

    if( !skip_tags )
        PutOpeningTagMark();

    const wchar_t * start = pchar;

    pchar += 1; // skipping '!'
    ReadItemName();
    SkipAndCheckClosingTag();

    if( !skip_tags && pchar > start )
        Put(start, pchar); // closing tag mark is printed directly from the source
}


// Reads an opening tag with its attributes and writes it.
void HTMLFilter::ReadItemOpening()
{
    LastItem().type = Item::opening;
    ReadItemName();

    if( !PrintOpeningItem() )
        return; // the tag has been rejected and skipped

    while( ReadItemAttr() )
    {
        if( CheckItemAttr() )
            PrintItemAttr();
    }

    SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'

    if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
    {
        if( LastItem().type == Item::simple )
            Put(L" /");

        PutClosingTagMark();
    }
}


// Hook called after a tag has been read; does nothing here.
void HTMLFilter::ItemFound()
{
}


// Hook called for every entity found by AnalyzeEntitiesAndPut()
// ([str, end) is the name without '&' and ';'); does nothing here.
void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
{
    (void)str;
    (void)end;
}


// Reads one tag (pchar points at its opening mark) and pushes it on the stack.
// Returns false at the end of the input or when the stack is full.
bool HTMLFilter::ReadItem()
{
    if( *pchar == 0 )
        return false;

    if( !PushStack() )
        return false;

    pchar += 1; // skipping the first '<'
    SkipWhiteLines();

    if( *pchar == '!' )
        ReadItemSpecial();
    else
    if( *pchar == '/' ) // we have a closing tag (TODO: add as a virtual method)
        ReadItemClosing();
    else
        ReadItemOpening();

    CheckNewLine();
    LastItem().new_line = last_new_line;

    ItemFound();

    return true;
}


// ---------------------------------------------------------------------------
// names
// ---------------------------------------------------------------------------

// Only the letters A-Z are converted.
wchar_t HTMLFilter::ToLower(wchar_t c)
{
    if( c>='A' && c<='Z' )
        return c - 'A' + 'a';

    return c;
}


// NOTE(review): the loop of this method was lost in the source text
// and has been reconstructed.
void HTMLFilter::ToLower(std::wstring & str)
{
    for(size_t i = 0 ; i < str.size() ; ++i)
        str[i] = ToLower(str[i]);
}


// Case-insensitive (A-Z) comparison of two whole names.
// NOTE(review): the two-argument IsNameEqual() overloads are used in this file
// but their definitions were lost in the source text; they are reconstructed
// here by analogy with the three-argument overloads -- confirm the set of
// overloads against htmlfilter.h.
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
{
    for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
    {
        if( ToLower(*name1) != ToLower(*name2) )
            return false;
    }

    return *name1==0 && *name2==0;
}


bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
{
    return IsNameEqual(name1, name2.c_str());
}


bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
{
    return IsNameEqual(name1.c_str(), name2);
}


bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
{
    return IsNameEqual(name1.c_str(), name2.c_str());
}


// Case-insensitive (A-Z) comparison of the first 'len' characters:
// returns true only when 'len' characters of both names are equal
// (false when one of the names ends earlier).
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
{
    for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
    {
        if( ToLower(*name1) != ToLower(*name2) )
            return false;
    }

    return len == 0;
}


bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
{
    return IsNameEqual(name1, name2.c_str(), len);
}


bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
{
    return IsNameEqual(name1.c_str(), name2, len);
}


bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
{
    return IsNameEqual(name1.c_str(), name2.c_str(), len);
}


bool HTMLFilter::IsLastTag(const wchar_t * name)
{
    return IsNameEqual(name, LastItem().name);
}


bool HTMLFilter::IsLastTag(const std::wstring & name)
{
    return IsNameEqual(name, LastItem().name);
}


// ---------------------------------------------------------------------------
// the tree of tags
// ---------------------------------------------------------------------------

// checking exceptions for opening tags
//
// The checks are mutually exclusive: exactly one of them handles the tag which
// has just been read. PutEverythingUntilClosingTag() can pop that tag from the
// stack, so a later check would look at the parent item or, with an empty
// stack, at the empty item. The empty item matched an empty no_filter_tag, and
// the rest of the input was then written without filtering, even in safe_mode.
// For the same reason an empty no_filter_tag is ignored here.
void HTMLFilter::CheckExceptions()
{
    if( IsLastTag(L"meta")  ||
        IsLastTag(L"input") ||
        IsLastTag(L"br")    ||
        IsLastTag(L"hr")    ||
        IsLastTag(L"img")   ||
        IsLastTag(L"link")  ||
        IsLastTag(L"param") ||
        IsLastTag(L"col")   ||
        IsLastTag(L"area") )
    {
        // tags without a body
        LastItem().type = Item::simple;
        PopStack();
        return;
    }

    if( !safe_mode && IsLastTag(L"script") )
    {
        // in safe_mode the script tag is ignored
        PutEverythingUntilClosingTag(!skip_tags);
    }
    else
    if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
    {
        PutEverythingUntilClosingTag(!skip_tags);
    }
    else
    if( !no_filter_tag.empty() && IsLastTag(no_filter_tag) )
    {
        PutEverythingUntilClosingTag(false);
    }
    else
    if( IsLastTag(L"body") )
    {
        LastItem().has_body_tag = true;
    }
}


// Called when the closing tag from the top of the stack does not match the tag
// below it: looks for a matching opening tag deeper in the stack and closes all
// the tags which are above it (they have been forgotten to be closed).
// When there is no matching opening tag the closing tag is dropped.
void HTMLFilter::AddForgottenTags()
{
    int i = -1;

    if( stack_len >= 3 )
    {
        // we have forgotten to close some tags
        // looking whether there is a matching opening tag
        for(i = int(stack_len)-3 ; i >= 0 ; --i)
        {
            if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
                break;
        }
    }

    if( i < 0 )
    {
        // oops, there is no such a tag
        // we don't print the closing and the missing opening tag
        // (this also covers a stack with fewer than three items -- the closing
        // tag used to be left on the stack in that case and was printed later)
        PopStack();
        return;
    }

    for(int z = int(stack_len)-2 ; z >= i ; --z)
    {
        if( !skip_tags && pstack[z].new_line )
        {
            PutNewLine();
            PutTabs(z);
        }

        PutClosingTag(pstack[z].name.c_str());
        pstack[z].Clear();
    }

    last_new_line = pstack[stack_len-1].new_line;

    // invalidate tags
    stack_len = i;
}


// Closes all the tags which are still on the stack (from the top).
// The stack is empty afterwards (stack_len is zero -- the previous form of the
// loop, 'while( stack_len-- > 0 )', left the unsigned counter wrapped around).
void HTMLFilter::CheckStackPrintRest()
{
    while( stack_len > 0 )
    {
        stack_len -= 1;

        if( stack_len==0 || pstack[stack_len-1].new_line )
            PutNewLine();

        PutTabs(stack_len);
        PutClosingTag(pstack[stack_len].name.c_str());
    }
}


// Called when a closing tag has been read (it is on the top of the stack).
void HTMLFilter::CheckClosingTags()
{
    if( stack_len == 0 )
        return;

    // on the stack we have only opening tags
    // but only the last tag is a closing tag

    if( stack_len == 1 )
    {
        // there is only last closing tag
        // we don't print it
        PopStack();
        return;
    }

    // there are more than one tag
    if( !IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
    {
        AddForgottenTags();
        return;
    }

    // last closing tag is from the previous one
    if( !skip_tags && pstack[stack_len-2].new_line )
    {
        PutNewLine();
        PutTabs(stack_len-2);
    }

    PutClosingTag(pstack[stack_len-1].name.c_str());
    last_new_line = pstack[stack_len-1].new_line;
    PopStack();
    PopStack();
}


// Writes the rest of the input without filtering.
// Returns true when something has been written; nothing is written
// (and false is returned) in safe_mode or when tags are skipped.
bool HTMLFilter::PrintRest()
{
    const wchar_t * start = pchar;

    // in safe mode we do not print the rest html code
    if( safe_mode || skip_tags )
        return false;

    while( *pchar )
        ++pchar;

    if( pchar > start )
    {
        Put(start, pchar);
        return true;
    }

    return false;
}


// Reads tags (each followed by normal text) until the end of the input
// or until there is no more space on the stack.
void HTMLFilter::ReadLoop()
{
    while( ReadItem() )
    {
        if( LastItem().type == Item::opening )
        {
            CheckExceptions();
        }
        else
        if( LastItem().type == Item::special || LastItem().type == Item::simple )
        {
            if( stack_len > 1 )
            {
                //pstack[stack_len-2].new_line = LastItem().new_line;
            }
            else
            if( trim_white )
            {
                // one new line after a simple or special tag
                // (if the tag has level 0 in the tree - it not means that this is a first tag)
                // for example can be DOCTYPE
                PutNewLine();
            }

            PopStack();
        }
        else
        if( LastItem().type == Item::closing )
        {
            CheckClosingTags();
        }
        else
        {
            PopStack();
        }

        ReadNormalText();
    }
}


// Reads and filters the whole input.
void HTMLFilter::Read()
{
    if( trim_white )
        SkipWhiteLines();

    // it can be some text or white lines before the first html tag (we print it)
    ReadNormalText();

    // reading the whole html source
    ReadLoop();

    // sometimes there can remain some html source (when there is no space on the stack)
    // we print the rest html without filtering (only if safe_mode is false)
    if( !PrintRest() )
        CheckStackPrintRest();
}


} // namespace pt