diff --git a/src/Makefile.dep b/src/Makefile.dep index 683e3cf..2a8cf37 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -42,3 +42,4 @@ ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h +./html/htmlfilter.o: ./html/htmlfilter.h diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp new file mode 100644 index 0000000..d103b9e --- /dev/null +++ b/src/html/htmlfilter.cpp @@ -0,0 +1,1711 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "htmlfilter.h" + + + +namespace pt +{ + + + + +void HTMLFilter::Item::Clear() +{ + name.clear(); + type = none; + porphans = 0; + new_line = false; + has_body_tag = false; +} + + +HTMLFilter::Item::Item() +{ + Clear(); +} + + + +void HTMLFilter::Filter(const wchar_t * in, std::wstring & out) +{ + pchar = in; + stack_len = 0; + out_string = &out; + last_new_line = false; + line_len = 0; + out_string->clear(); + + Init(); + Read(); + Uninit(); +} + + + +void HTMLFilter::Init() +{ +} + + +void HTMLFilter::Uninit() +{ +} + + + +void HTMLFilter::Filter(const std::wstring & in, std::wstring & out) +{ + if( &in == &out ) + { + // out cannot be the same string as in + return; + } + + size_t out_projected_len = in.size() * 2 + 1; + + if( out.capacity() < out_projected_len ) + out.reserve(out_projected_len); + + Filter(in.c_str(), out); +} + + +void HTMLFilter::SetSomeDefaults() +{ + tab_size = 2; + trim_white = false; + break_after = 0; + wrap_line = 0; + orphan_mode = orphan_nbsp; + safe_mode = false; + skip_tags = false; + skip_commentaries = false; + skip_entities = false; + analyze_entities = false; +} + + +HTMLFilter::HTMLFilter() +{ + pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; + buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; + + SetSomeDefaults(); +} + + +HTMLFilter::HTMLFilter(const HTMLFilter & f) +{ + // don't need to copy the stack + pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; + buffer = new 
wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN]; + + SetSomeDefaults(); +} + + +// assignment operator +// the object on the left side already owns pstack and buffer (allocated in a constructor) +// so they are kept as they are, allocating new arrays here would leak the old ones +// the stack is not copied and (as before) no settings are taken from f +HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f) +{ + // pstack and buffer are reused, nothing to allocate or release + + // we can copy some fields from f + +return *this; +} + + +HTMLFilter::~HTMLFilter() +{ + delete [] pstack; + delete [] buffer; +} + + + + +void HTMLFilter::BreakWord(size_t break_after_) +{ + break_after = break_after_; + + if( break_after > 10000 ) + break_after = 10000; +} + + +void HTMLFilter::WrapLine(size_t wrap_line_) +{ + wrap_line = wrap_line_; + + if( wrap_line > 10000 ) + wrap_line = 10000; +} + + + +void HTMLFilter::TrimWhite(bool trim) +{ + trim_white = trim; +} + + +void HTMLFilter::InsertTabs(size_t tabsize) +{ + tab_size = tabsize; + + if( tab_size > 1000 ) + tab_size = 1000; +} + + +void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans) +{ +size_t i; + + orphans.max_len = 0; + + for(i=0 ; i orphans.max_len ) + orphans.max_len = orphans.tab[i].size(); + } +} + + +void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector & otab) +{ + lang_code_lower = lang_code; + ToLower(lang_code_lower); + + orphans_temp.tab = otab; + std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end()); + CalcOrphansMaxLen(orphans_temp); + + orphans_tab[lang_code_lower] = orphans_temp; +} + + + +void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector & otab) +{ + AssignOrphans(lang_code.c_str(), otab); +} + + +void HTMLFilter::ClearOrphans() +{ + orphans_tab.clear(); +} + + + + +void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str) +{ + if( orphan_mode_str == L"160" ) + orphan_mode = orphan_160space; + else + orphan_mode = orphan_nbsp; +} + + +void HTMLFilter::SafeMode(bool safe_mode_) +{ + safe_mode = safe_mode_; +} + + +void HTMLFilter::SkipTags(bool skip_tags) +{ + this->skip_tags = skip_tags; +} + +void HTMLFilter::SkipCommentaries(bool 
skip_commentaries) +{ + this->skip_commentaries = skip_commentaries; +} + + +void HTMLFilter::SkipEntities(bool skip_entities) +{ + this->skip_entities = skip_entities; + + if( this->skip_entities ) + { + this->analyze_entities = true; + } +} + + +void HTMLFilter::AnalyzeEntities(bool analyze_entities) +{ + this->analyze_entities = analyze_entities; +} + + +void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name) +{ + no_filter_tag = tag_name; +} + + + + +HTMLFilter::Item & HTMLFilter::GetItem(size_t i) +{ + if( i >= stack_len ) + { + empty.Clear(); + return empty; + } + +return pstack[i]; +} + + +HTMLFilter::Item & HTMLFilter::LastItem() +{ + if( stack_len == 0 ) + { + empty.Clear(); + return empty; + } + +return pstack[stack_len-1]; +} + + +bool HTMLFilter::PushStack() +{ + if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN ) + // oops, too many items + return false; + + pstack[stack_len].Clear(); + + if( stack_len > 0 ) + { + // 'porphans' and 'has_body_tag' attributes are propagated + pstack[stack_len].porphans = pstack[stack_len-1].porphans; + pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag; + } + + stack_len += 1; + +return true; +} + +void HTMLFilter::PopStack() +{ + if( stack_len == 0 ) + // oops + return; + + stack_len -= 1; + pstack[stack_len].Clear(); +} + + +bool HTMLFilter::IsWhite(int c) +{ + // dont use c==10 here + + if( c==' ' || c=='\t' || c==13 || c==160 ) + return true; + +return false; +} + + +void HTMLFilter::SkipWhite() +{ + while( IsWhite(*pchar) ) + ++pchar; +} + + +void HTMLFilter::SkipWhiteLines() +{ + while( *pchar==10 || IsWhite(*pchar) ) + ++pchar; +} + + +void HTMLFilter::SkipWhiteWithFirstNewLine() +{ + SkipWhite(); + + if( *pchar == 10 ) + { + pchar += 1; + SkipWhite(); + } +} + + +void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end) +{ + while( str < end && (*str==10 || IsWhite(*str)) ) + ++str; +} + + +void HTMLFilter::CheckNewLine() +{ +const wchar_t * start = pchar; + + SkipWhite(); + 
last_new_line = (*pchar==10); + + pchar = start; +} + + + + +bool HTMLFilter::IsClosingTagForLastItem() +{ + pchar += 1; + SkipWhite(); + + if( *pchar == '/' ) + { + pchar += 1; + SkipWhite(); + + if( IsNameEqual(pchar, LastItem().name, LastItem().name.size()) ) + { + pchar += LastItem().name.size(); + SkipWhite(); + + if( IsClosingTagMark(*pchar) ) + { + pchar += 1; + return true; + } + } + } + +return false; +} + + + + +// used for such tags as: script, pre, textarea +void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well) +{ +const wchar_t * start = pchar; +const wchar_t * end = pchar; + + while( *pchar != 0 ) + { + if( IsOpeningTagMark(*pchar) ) + { + if( IsClosingTagForLastItem() ) + { + if( put_closing_tag_as_well ) + end = pchar; + + PopStack(); + CheckNewLine(); + break; + } + } + else + { + pchar += 1; + end = pchar; + } + } + + Put(start, end); +} + + + + +void HTMLFilter::SkipAndCheckClosingTag() +{ + bool is_quoted = false; + wchar_t quote_char = 0; + + for( ; *pchar ; ++pchar ) + { + if( *pchar == '"' || *pchar == '\'' ) + { + if( is_quoted ) + { + if( *pchar == quote_char ) + { + is_quoted = false; + } + } + else + { + is_quoted = true; + quote_char = *pchar; + } + } + else + if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/' + { + LastItem().type = Item::simple; + } + else + if( !is_quoted && IsClosingTagMark(*pchar) ) + { + ++pchar; + break; + } + } +} + + + +bool HTMLFilter::IsValidCharForName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='-' || c=='!' 
|| c==':') // : for namespace character + return true; + +return false; +} + + +bool HTMLFilter::IsValidCharForAttrName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='-' || c==':' ) + return true; + +return false; +} + + +bool HTMLFilter::IsValidCharForEntityName(int c) +{ + if( (c>='a' && c<='z') || + (c>='A' && c<='Z') || + (c>='0' && c<='9') || + c=='#' ) + return true; + +return false; +} + + +void HTMLFilter::ReadItemName() +{ +size_t i; + + for( i=0 ; IsValidCharForName(*pchar) ; ++i ) + { + if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN ) + LastItem().name += *pchar; + + ++pchar; + } +} + + + +void HTMLFilter::ReadItemAttrName() +{ +size_t i; + + attr_name.clear(); + + for( i=0 ; *pchar && IsValidCharForAttrName(*pchar) ; ++i ) + { + if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN ) + attr_name += *pchar; + + ++pchar; + } +} + + + +void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end) +{ + attr_value.push_back(std::wstring()); + + if( analyze_entities ) + { + AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back()); + } + else + { + attr_value.back().append(value_start, value_end); + } +} + + +void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char) +{ +size_t i; + + attr_value.clear(); + const wchar_t * value_start = pchar; + size_t value_len = 0; // how many non white characters + + for(i=0 ; *pchar ; ++i, ++pchar ) + { + if( has_quote ) + { + if( *pchar == quote_char ) + break; + } + else + { + if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) ) + break; + } + + if( *pchar==10 || IsWhite(*pchar) ) + { + if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(value_start, pchar); + + value_len = 0; + } + else + { + if( value_len == 0 ) + value_start = pchar; + + value_len += 1; + } + } + + if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(value_start, pchar); +} + + 
+void HTMLFilter::CheckChar(wchar_t c) +{ + if( c == 10 ) + line_len = 0; + else + line_len += 1; +} + + +void HTMLFilter::Put(wchar_t c) +{ + (*out_string) += c; + CheckChar(c); +} + + +void HTMLFilter::Put(const wchar_t * str) +{ + out_string->append(str); + + for( ; *str ; ++str) + CheckChar(*str); +} + + +void HTMLFilter::Put(const wchar_t * str, const wchar_t * end) +{ + if( str >= end ) + return; + + size_t len = end - str; + out_string->append(str, len); + + for( ; str < end ; ++str) + CheckChar(*str); +} + + +void HTMLFilter::Put(const std::wstring & str) +{ + out_string->append(str); + + for(size_t i=0 ; i 1 ) // at least one character in entity name + { + if( out ) + out->append(old_str, entity_start); + else + Put(old_str, entity_start); + + str += 1; // skip ; + + if( !skip_entities ) + { + if( out ) + out->append(entity_start, str); + else + Put(entity_start, str); + } + + EntityFound(entity_start + 1, str - 1); // without & and ; + old_str = str; + } + } + else + { + str += 1; + } + } + + if( out ) + out->append(old_str, end); + else + Put(old_str, end); +} + + + + +int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str) +{ +size_t res; + + const wchar_t * orphan = orphan_str.c_str(); + + for( ; str & table) +{ +int res; + + if( table.empty() ) + return false; + + size_t o1 = 0; + size_t o2 = table.size() - 1; + + res = CheckOrphan(str, end, table[o1]); + + if( res == 0 ) + return true; + + if( res < 0 ) + return false; + + res = CheckOrphan(str, end, table[o2]); + + if( res == 0 ) + return true; + + if( res > 0 ) + return false; + + + while( o1 + 1 < o2 ) + { + size_t o = (o1 + o2) / 2; + res = CheckOrphan(str, end, table[o]); + + if( res == 0 ) + return true; + + if( res < 0 ) + o2 = o; + else + o1 = o; + } + +return false; +} + + +bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end) +{ + if( str==end || !LastItem().has_body_tag || !LastItem().porphans ) + return false; + + size_t len 
= end - str; + + if( len > LastItem().porphans->max_len ) + return false; + +return CheckOrphan(str, end, LastItem().porphans->tab); +} + + +// if there is a semicolon nearby then we break the line after it +// (useful in html entities) +// !! dodac sprawdzanie czy dlugosc stringu nie jest mala tez (end-str) +// i wtedy tez nie dodajemy zadnego znaku +bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end) +{ +size_t i, epsilon = 8;// !! IMPROVE ME put as a constant + + for(i=0 ; str < end && i wrap_line ) + { + Put(10); + PutTabs(stack_len); + } +} + + +void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end) +{ +const wchar_t * word = str; +size_t non_whites = 0; +bool was_entity_end = false; + + for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites ) + { + if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) ) + { + Put(word, str); + word = str; + non_whites = 0; + Put(' '); + CheckLineWrap(); + } + + was_entity_end = (IsEndingEntityMark(*str)); + } + + if( analyze_entities ) + AnalyzeEntitiesAndPut(word, str, nullptr); + else + Put(word, str); +} + + +void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end) +{ + if( str < end ) + { + if( trim_white ) + { + Put(' '); + SkipWhiteLines(str, end); + } + else + { + while( str < end && (*str==10 || IsWhite(*str)) ) + { + Put(*str); + + if( *str == 10 ) + PutTabs(stack_len); + + ++str; + } + } + } +} + + +void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end) +{ +const wchar_t * word, * white; + + if( str < end ) + CheckLineWrap(); + + while( str < end ) + { + word = str; + PutNormalNonWhite(str, end); + + if( CheckOrphan(word, str) ) + { + white = str; + SkipWhiteLines(str, end); + + if( white < str ) + PutNonBreakingSpace(); + } + else + { + PutNormalWhite(str, end); + + if( str < end ) // !! lub moze podobnie jak jest na gorze tutaj? 
juz nie mam sily myslec :( + CheckLineWrap(); + } + + // for safety (if str was not incremented then there is an infinite loop) + if( word == str ) + break; + } +} + + + + +void HTMLFilter::PutOpeningTagMark() +{ + Put('<'); +} + + +void HTMLFilter::PutClosingTagMark() +{ + Put('>'); +} + + + + +// !! IMPROVE ME change to a better name +// this functions does not return true when the tag is safe +bool HTMLFilter::IsTagSafe(const wchar_t * tag) +{ + if( !safe_mode ) + return true; + + if( IsNameEqual(tag, no_filter_tag.c_str()) ) + return false; + + // every entry needs its own comma: adjacent string literals are concatenated by the compiler + // (without the comma after L"param" the table had one entry L"paramobject" instead of two) + static const wchar_t * unsafe_tags[] = { + L"applet", L"base", L"body", + L"embed", L"head", L"html", + L"frame", L"frameset",L"iframe", + L"link", L"meta", L"param", + L"object", L"script" + }; + + size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*); + size_t i; + + for(i=0 ; i 30 ) + len = 30; + + for(size_t i=0 ; i < (len*tab_size) ; ++i) + (*out_string) += ' '; // we do not add them to 'line_len' +} + + +void HTMLFilter::PutNonBreakingSpace() +{ + if( orphan_mode == orphan_nbsp ) + { + // the html entity as a string (orphan_160space puts the raw 160 character instead) + Put(L"&nbsp;"); + } + else + { + Put(160); + } +} + + + + +void HTMLFilter::PutNewLine() +{ + buffer[0] = 10; + Put(buffer, buffer+1); + line_len = 0; +} + + +// we assume the size of the opening mark to be one +bool HTMLFilter::IsOpeningTagMark(wchar_t c) +{ + return (c == '<'); +} + + +// we assume the size of the closing mark to be one +bool HTMLFilter::IsClosingTagMark(wchar_t c) +{ + return (c == '>'); +} + + +// the slash at the end (without '>' character) +// we assume the size of the mark to be one +bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c) +{ + return (c == '/'); +} + + +bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str) +{ +static wchar_t comm_open[] = L""; +size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1; + + if( !IsOpeningCommentaryTagMark(pchar) ) + return false; + + pchar += OpeningCommentaryTagMarkSize(); + + // looking for "-->" + while( *pchar!=0 && !IsNameEqual(pchar, 
comm_close, comm_close_len) ) + ++pchar; + + if( *pchar!= 0 ) + pchar += comm_close_len; + + CheckNewLine(); + +return true; +} + + +void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white) +{ + if( trim_white ) + { + // skipping all white chars (with new lines) + // but with remembering the last non white character + for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar) + if( *pchar == 10 ) + last_non_white = pchar; + } + else + { + // skipping first white chars with only one line between them + SkipWhite(); + last_non_white = pchar; + + if( *pchar == 10 ) + { + ++pchar; + SkipWhite(); + } + } + + start = pchar; + + // exception for the commentary tag + if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) ) + { + PutNewLine(); + PutTabs(stack_len); + } +} + + + +// reading text between html tags +void HTMLFilter::ReadNormalText() +{ +const wchar_t * start = pchar; +const wchar_t * last_non_white = pchar; + + if( last_new_line ) + ReadNormalTextSkipWhite(start, last_non_white); + + + while( *pchar != 0 ) + { + const wchar_t * commentary_start = pchar; + + if( SkipCommentaryTagIfExists() ) + { + last_non_white = pchar - 1; // pointing at the last '>' from a commentary + PutNormalText(start, commentary_start); + + if( !skip_commentaries ) + { + PutNormalText(commentary_start, pchar); + } + + start = pchar; + } + else + { + if( IsOpeningTagMark(*pchar) ) + break; + + if( !IsWhite(*pchar) ) + last_non_white = pchar; + + pchar += 1; + } + } + + last_new_line = (*last_non_white == 10); + PutNormalText(start, pchar); +} + + + +bool HTMLFilter::PrintOpeningItem() +{ + if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) + return true; + + if( last_new_line ) + { + PutNewLine(); + + if( stack_len > 1 ) + PutTabs(stack_len-1); + } + +return PutOpeningTag(); +} + + + + + +bool HTMLFilter::ReadItemAttr() +{ + attr_has_value = false; + attr_name.clear(); + attr_value.clear(); + + SkipWhiteLines(); + 
ReadItemAttrName(); + + if( attr_name.empty() ) + return false; + + SkipWhiteLines(); + + if( *pchar != '=' ) + return true; + + attr_has_value = true; + pchar += 1; // skipping '=' + SkipWhiteLines(); + + bool has_quote = (*pchar == '\"' || *pchar == '\''); + wchar_t quote_char = *pchar; + + if( has_quote ) + pchar += 1; // skipping the first quote mark + + ReadItemAttrValue(has_quote, quote_char); + + if( has_quote && *pchar == quote_char ) + pchar += 1; // skipping the last quote mark + +return true; +} + + + +bool HTMLFilter::CheckItemAttr() +{ + if( attr_has_value && IsNameEqual(L"lang", attr_name) ) + { + LastItem().porphans = 0; + + if( !attr_value.empty() ) + { + // we are taking the first value only + attr_value_lower = attr_value[0]; + ToLower(attr_value_lower); + + OrphansTab::iterator i = orphans_tab.find(attr_value_lower); + + if( i != orphans_tab.end() ) + LastItem().porphans = &i->second; + } + } + +return true; +} + + +void HTMLFilter::PrintItemAttr() +{ +size_t i; + + if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) + return; + + Put(' '); + Put(attr_name); + + if( attr_has_value ) + { + Put(L"=\""); + + for(i=0 ; i start ) + Put(start, pchar); + + // closing tag mark is printed directly from the source +} + + +void HTMLFilter::ReadItemOpening() +{ + LastItem().type = Item::opening; + ReadItemName(); + + if( PrintOpeningItem() ) + { + while( ReadItemAttr() ) + { + if( CheckItemAttr() ) + PrintItemAttr(); + } + + SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' + + if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) ) + { + if( LastItem().type == Item::simple ) + Put(L" /"); + + PutClosingTagMark(); + } + } +} + + +void HTMLFilter::ItemFound() +{ +} + +void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end) +{ +} + + +bool HTMLFilter::ReadItem() +{ + if( *pchar == 0 ) + return false; + + if( !PushStack() ) + return false; + + pchar += 1; // skipping the first '<' + SkipWhiteLines(); 
+ + if( *pchar == '!' ) + ReadItemSpecial(); + else + if( *pchar == '/' ) // we have a closing tag (dodac jako metode wirtualna) !! + ReadItemClosing(); + else + ReadItemOpening(); + + CheckNewLine(); + LastItem().new_line = last_new_line; + + ItemFound(); + +return true; +} + + + +wchar_t HTMLFilter::ToLower(wchar_t c) +{ + if( c>='A' && c<='Z' ) + return c - 'A' + 'a'; + +return c; +} + + +void HTMLFilter::ToLower(std::wstring & str) +{ +size_t i; + + for(i=0 ; i0 ; ++name1, ++name2, --len ) + if( ToLower(*name1) != ToLower(*name2) ) + return false; + + if( len == 0 ) + return true; + +return false; +} + + + +bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len) +{ + return IsNameEqual(name1, name2.c_str(), len); +} + + +bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len) +{ + return IsNameEqual(name1.c_str(), name2, len); +} + + +bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len) +{ + return IsNameEqual(name1.c_str(), name2.c_str(), len); +} + + + + + +bool HTMLFilter::IsLastTag(const wchar_t * name) +{ + return IsNameEqual(name, LastItem().name); +} + + +bool HTMLFilter::IsLastTag(const std::wstring & name) +{ + return IsNameEqual(name, LastItem().name); +} + + +// checking exceptions for opening tags +void HTMLFilter::CheckExceptions() +{ + if( IsLastTag(L"meta") || + IsLastTag(L"input") || + IsLastTag(L"br") || + IsLastTag(L"hr") || + IsLastTag(L"img") || + IsLastTag(L"link") || + IsLastTag(L"param") || + IsLastTag(L"col") || + IsLastTag(L"area") ) + { + LastItem().type = Item::simple; + PopStack(); + return; + } + + // in safe_mode the script tag is ignored + if( !safe_mode && IsLastTag(L"script") ) + PutEverythingUntilClosingTag(!skip_tags); + + if( IsLastTag(L"pre") || IsLastTag(L"textarea") ) + PutEverythingUntilClosingTag(!skip_tags); + + if( IsLastTag(no_filter_tag) ) + PutEverythingUntilClosingTag(false); + + if( 
IsLastTag(L"body") ) + LastItem().has_body_tag = true; +} + + + + +void HTMLFilter::AddForgottenTags() +{ +int i; + + if( stack_len < 3 ) + return; + + // we have forgotten to close some tags + + // looking whether there is a matching opening tag + for(i=int(stack_len)-3 ; i>=0 ; --i) + if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) ) + break; + + if( i < 0 ) + { + // oops, there is no such a tag + // we don't print the closing and the missing opening tag + PopStack(); + return; + } + + for(int z=(int)stack_len-2 ; z>=i ; --z) + { + if( !skip_tags && pstack[z].new_line ) + { + PutNewLine(); + PutTabs(z); + } + + PutClosingTag(pstack[z].name.c_str()); + pstack[z].Clear(); + } + + last_new_line = pstack[stack_len-1].new_line; + + // invalidate tags + stack_len = i; +} + + +// calls PutClosingTag() for every item still left on the stack (from the top down) +// when the method returns the stack is empty (stack_len == 0) +void HTMLFilter::CheckStackPrintRest() +{ + // stack_len is decremented inside the loop so it stops at zero + // (a post-decrement in the loop condition would wrap the unsigned counter around after the last pass) + while( stack_len > 0 ) + { + stack_len -= 1; + + if( stack_len==0 || pstack[stack_len-1].new_line ) + PutNewLine(); + + PutTabs(stack_len); + PutClosingTag(pstack[stack_len].name.c_str()); + } +} + + +void HTMLFilter::CheckClosingTags() +{ + if( stack_len == 0 ) + return; + + // on the stack we have only opening tags + // but only the last tag is a closing tag + + if( stack_len == 1 ) + { + // there is only last closing tag + // we dont print it + PopStack(); + return; + } + + // there are more than one tag + if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) + { + // last closing tag is from the previous one + if( !skip_tags && pstack[stack_len-2].new_line ) + { + PutNewLine(); + PutTabs(stack_len-2); + } + + PutClosingTag(pstack[stack_len-1].name.c_str()); + last_new_line = pstack[stack_len-1].new_line; + PopStack(); + PopStack(); + } + else + { + AddForgottenTags(); + } +} + + +bool HTMLFilter::PrintRest() +{ +const wchar_t * start = pchar; + + // in safe mode we do not print the rest html code + if( safe_mode || skip_tags ) + return false; + + while( *pchar ) + ++pchar; + + if( pchar > start ) + { + Put(start, pchar); + return true; + } + 
+return false; +} + + + +void HTMLFilter::ReadLoop() +{ + while( ReadItem() ) + { + if( LastItem().type == Item::opening ) + { + CheckExceptions(); + } + else + if( LastItem().type == Item::special || LastItem().type == Item::simple ) + { + if( stack_len > 1 ) + { + //pstack[stack_len-2].new_line = LastItem().new_line; + } + else + if( trim_white ) + { + // one new line after a simple or special tag + // (if the tag has level 0 in the tree - it not means that this is a first tag) + // for example can be DOCTYPE + PutNewLine(); + } + + PopStack(); + } + else + if( LastItem().type == Item::closing ) + { + CheckClosingTags(); + } + else + { + PopStack(); + } + + ReadNormalText(); + } +} + + + +void HTMLFilter::Read() +{ + if( trim_white ) + SkipWhiteLines(); + + // it can be some text or white lines before the first html tag (we print it) + ReadNormalText(); + + // reading the whole html source + ReadLoop(); + + // sometimes there can remain some html source (when there is no space on the stack) + // we print the rest html without filtering (only if safe_mode is false) + if( !PrintRest() ) + CheckStackPrintRest(); +} + + + + + +} + diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h new file mode 100644 index 0000000..35710d3 --- /dev/null +++ b/src/html/htmlfilter.h @@ -0,0 +1,376 @@ +/* + * This file is a part of PikoTools + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2008-2021, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef headerfile_picotools_html_htmlfilter +#define headerfile_picotools_html_htmlfilter + +#include +#include +#include +#include + + + +namespace pt +{ + + + + +// max length of a name of a html tag (with terminating null) +#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30 + +// max length of a html lang attribute (e.g. "en", "pl") +#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10 + + +#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40 + + +#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500 + + +// depth of the html tree +#define WINIX_HTMLFILTER_STACK_MAXLEN 100 + +// length of a buffer used for printing +// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3 +#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048 + + + + +/*! 
+ very lightweight filter for html + (without using any dynamic memory - some memory is allocated only at the beginning - in ctors) + this filter has O(n) complexity over the whole html string + + such tags as: ) are untouched + + if the filter finds that there are not closed tags it will close them, + if the filter finds a closing tag which doesn't have an opening tag - it will skip it + + tags which don't need to be closed: meta, input, br, img, link + look at CheckExceptions() method + + the filter recognizes xml simple tags (with / at the end) such as:
+*/ +class HTMLFilter +{ +public: + + enum OrphanMode + { + orphan_nbsp, // putting " " string + orphan_160space // putting 160 ascii code + }; + + HTMLFilter(); + HTMLFilter(const HTMLFilter & f); + HTMLFilter & operator=(const HTMLFilter & f); + virtual ~HTMLFilter(); + + + // main methods used for filtering + void Filter(const wchar_t * in, std::wstring & out); + void Filter(const std::wstring & in, std::wstring & out); + + + // insert a white space into long words + // (only between html tags) + // skipped in such tags: script, pre, textarea + // break_after - after how many characters insert a space (0 - off) + void BreakWord(size_t break_after_); + + // insert a new line character into long lines + // (only between html tags) + // skipped in such tags: script, pre, textarea + // wrap_line - after how many characters wrap a line (0 - off) + // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) + void WrapLine(size_t wrap_line_); + + // trimming white characters (with new lines) + // at the beginning, at the end and in the middle of a string + // only between html tags + // at the beginning and at the end only one space is left + // skipped in such tags: script, pre, textarea + // false by default + void TrimWhite(bool trim); + + // first tabs in a tree + // default: 2 (spaces) + // set 0 to turn off + void InsertTabs(size_t tabsize); + + // set a name of a html tag which will be used as 'nofilter' tag + // elements between such tags are not filtered (similarly as in
 and