From bdb2616f32c4a2e9fa2f56abb7877583e9a76664 Mon Sep 17 00:00:00 2001
From: Tomasz Sowa
Date: Sat, 17 Jul 2021 13:35:10 +0200
Subject: [PATCH 01/37] added: HTMLFilter (html/htmlfilter.h|cpp) - copied from
winix project
---
src/Makefile.dep | 1 +
src/html/htmlfilter.cpp | 1711 +++++++++++++++++++++++++++++++++++++++
src/html/htmlfilter.h | 376 +++++++++
tests/Makefile.dep | 10 +-
4 files changed, 2093 insertions(+), 5 deletions(-)
create mode 100644 src/html/htmlfilter.cpp
create mode 100644 src/html/htmlfilter.h
diff --git a/src/Makefile.dep b/src/Makefile.dep
index 683e3cf..2a8cf37 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -42,3 +42,4 @@
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlfilter.o: ./html/htmlfilter.h
diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp
new file mode 100644
index 0000000..d103b9e
--- /dev/null
+++ b/src/html/htmlfilter.cpp
@@ -0,0 +1,1711 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "htmlfilter.h"
+
+
+
+namespace pt
+{
+
+
+
+
+void HTMLFilter::Item::Clear()
+{
+    // bring every field of the item back to its initial value
+    has_body_tag = false;
+    new_line     = false;
+    porphans     = 0;
+    type         = none;
+    name.clear();
+}
+
+
+HTMLFilter::Item::Item()
+{
+    // a freshly constructed item has the same state as a cleared one
+    this->Clear();
+}
+
+
+
+void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
+{
+    // every run starts from a clean state: empty output and empty stack
+    out.clear();
+    out_string    = &out;
+    pchar         = in;
+    stack_len     = 0;
+    line_len      = 0;
+    last_new_line = false;
+
+    // Init() and Uninit() bracket the actual parsing done by Read()
+    Init();
+    Read();
+    Uninit();
+}
+
+
+
+void HTMLFilter::Init()
+{
+    // nothing to prepare here; called by Filter() just before Read()
+}
+
+
+void HTMLFilter::Uninit()
+{
+    // nothing to clean up here; called by Filter() right after Read()
+}
+
+
+
+void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
+{
+    // filtering in place is not supported: when 'out' is the very same
+    // object as 'in' nothing is done
+    if( &in != &out )
+    {
+        // make room in advance, the output can be longer than the input
+        const size_t wanted_capacity = in.size() * 2 + 1;
+
+        if( wanted_capacity > out.capacity() )
+            out.reserve(wanted_capacity);
+
+        Filter(in.c_str(), out);
+    }
+}
+
+
+void HTMLFilter::SetSomeDefaults()
+{
+    // switches: everything is turned off by default
+    analyze_entities  = false;
+    skip_entities     = false;
+    skip_commentaries = false;
+    skip_tags         = false;
+    safe_mode         = false;
+    trim_white        = false;
+
+    // formatting: zero means 'no word breaking' / 'no line wrapping'
+    orphan_mode = orphan_nbsp;
+    wrap_line   = 0;
+    break_after = 0;
+    tab_size    = 2;
+}
+
+
+HTMLFilter::HTMLFilter()
+{
+    // both arrays are released in the destructor
+    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
+    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
+
+    this->SetSomeDefaults();
+}
+
+
+HTMLFilter::HTMLFilter(const HTMLFilter & f)
+{
+    // the copy gets its own, fresh stack and buffer - the content
+    // of the stack of 'f' is not needed
+    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
+    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
+
+    // the settings are set to the defaults (they are not taken from 'f')
+    this->SetSomeDefaults();
+}
+
+
+/*
+ * Assignment operator.
+ *
+ * The object being assigned to already owns a stack and a buffer (both
+ * constructors allocate them), so nothing is allocated here: allocating new
+ * arrays would overwrite the owned pointers and leak the previous arrays.
+ * The content of the stack of 'f' is intentionally not copied.
+ *
+ * Returns a reference to this object.
+ */
+HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
+{
+    // don't need to copy the stack - pstack and buffer stay as they are
+
+    // we can copy some fields from f
+
+    return *this;
+}
+
+
+HTMLFilter::~HTMLFilter()
+{
+    // release the arrays allocated in the constructors
+    delete [] buffer;
+    delete [] pstack;
+}
+
+
+
+
+void HTMLFilter::BreakWord(size_t break_after_)
+{
+    // the value is limited to 10000 characters
+    const size_t limit = 10000;
+
+    break_after = (break_after_ > limit) ? limit : break_after_;
+}
+
+
+void HTMLFilter::WrapLine(size_t wrap_line_)
+{
+    // the value is limited to 10000 characters
+    const size_t limit = 10000;
+
+    wrap_line = (wrap_line_ > limit) ? limit : wrap_line_;
+}
+
+
+
+void HTMLFilter::TrimWhite(bool trim)
+{
+    // turns trimming of white characters on or off
+    this->trim_white = trim;
+}
+
+
+void HTMLFilter::InsertTabs(size_t tabsize)
+{
+    // the indentation step is limited to 1000
+    const size_t limit = 1000;
+
+    tab_size = (tabsize > limit) ? limit : tabsize;
+}
+
+
+void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
+{
+size_t i;
+
+ orphans.max_len = 0;
+
+ for(i=0 ; i orphans.max_len )
+ orphans.max_len = orphans.tab[i].size();
+ }
+}
+
+/*
+ * Registers a table of orphans for the given language code.
+ *
+ * The language code is lower-cased and used as the key in orphans_tab;
+ * a sorted copy of otab (with max_len recalculated) is stored under that
+ * key, replacing a table assigned to the same code before.
+ *
+ * NOTE(review): the element type of std::vector is missing in the signature
+ * in this copy of the patch - it was probably lost when the text was
+ * extracted; to be confirmed against the repository.
+ */
+void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector & otab)
+{
+ lang_code_lower = lang_code;
+ ToLower(lang_code_lower);
+
+ orphans_temp.tab = otab;
+ std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end()); // the table is kept sorted
+ CalcOrphansMaxLen(orphans_temp);
+
+ orphans_tab[lang_code_lower] = orphans_temp;
+}
+
+
+/*
+ * Convenience overload: forwards to the version taking
+ * the language code as a pointer to a zero terminated string.
+ */
+void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector & otab)
+{
+ AssignOrphans(lang_code.c_str(), otab);
+}
+
+
+void HTMLFilter::ClearOrphans()
+{
+    // forget the orphan tables of all languages
+    this->orphans_tab.clear();
+}
+
+
+
+
+void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
+{
+    // "160" selects the 160 character, anything else selects the nbsp mode
+    const bool use_160 = (orphan_mode_str == L"160");
+
+    orphan_mode = use_160 ? orphan_160space : orphan_nbsp;
+}
+
+
+void HTMLFilter::SafeMode(bool safe_mode_)
+{
+    // turns the safe mode on or off
+    this->safe_mode = safe_mode_;
+}
+
+
+void HTMLFilter::SkipTags(bool skip_tags)
+{
+    // the parameter hides the member, hence the explicit 'this'
+    (*this).skip_tags = skip_tags;
+}
+
+void HTMLFilter::SkipCommentaries(bool skip_commentaries)
+{
+    // the parameter hides the member, hence the explicit 'this'
+    (*this).skip_commentaries = skip_commentaries;
+}
+
+
+void HTMLFilter::SkipEntities(bool skip_entities)
+{
+    this->skip_entities = skip_entities;
+
+    // turning the skipping on turns the analyzing of entities on as well
+    // (turning the skipping off leaves analyze_entities untouched)
+    if( skip_entities )
+        this->analyze_entities = true;
+}
+
+
+void HTMLFilter::AnalyzeEntities(bool analyze_entities)
+{
+    // the parameter hides the member, hence the explicit 'this'
+    (*this).analyze_entities = analyze_entities;
+}
+
+
+void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
+{
+    // remember the name of the tag the content of which is not filtered
+    this->no_filter_tag = tag_name;
+}
+
+
+
+
+HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
+{
+    if( i < stack_len )
+        return pstack[i];
+
+    // there is no such an item: a cleared placeholder is returned instead
+    empty.Clear();
+
+    return empty;
+}
+
+
+HTMLFilter::Item & HTMLFilter::LastItem()
+{
+    if( stack_len != 0 )
+        return pstack[stack_len-1];
+
+    // the stack is empty: a cleared placeholder is returned instead
+    empty.Clear();
+
+    return empty;
+}
+
+
+bool HTMLFilter::PushStack()
+{
+    // false means: there is no room for one more item
+    if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
+        return false;
+
+    Item & fresh = pstack[stack_len];
+    fresh.Clear();
+
+    if( stack_len > 0 )
+    {
+        // 'porphans' and 'has_body_tag' are inherited from the parent item
+        Item & parent = pstack[stack_len-1];
+
+        fresh.porphans     = parent.porphans;
+        fresh.has_body_tag = parent.has_body_tag;
+    }
+
+    ++stack_len;
+
+    return true;
+}
+
+void HTMLFilter::PopStack()
+{
+    // popping from an empty stack is silently ignored
+    if( stack_len > 0 )
+    {
+        --stack_len;
+        pstack[stack_len].Clear();
+    }
+}
+
+
+bool HTMLFilter::IsWhite(int c)
+{
+    // a space, a tab, the 13 character and the 160 character are white;
+    // the new line character (10) is deliberately not treated as white here
+    return c==' ' || c=='\t' || c==13 || c==160;
+}
+
+
+void HTMLFilter::SkipWhite()
+{
+    // moves pchar over white characters (a new line stops the skipping)
+    for( ; IsWhite(*pchar) ; pchar += 1 )
+    {
+    }
+}
+
+
+void HTMLFilter::SkipWhiteLines()
+{
+    // moves pchar over white characters and new lines
+    for( ; *pchar==10 || IsWhite(*pchar) ; pchar += 1 )
+    {
+    }
+}
+
+
+void HTMLFilter::SkipWhiteWithFirstNewLine()
+{
+    // white characters, then at most one new line, then white characters again
+    SkipWhite();
+
+    if( *pchar != 10 )
+        return;
+
+    ++pchar;
+    SkipWhite();
+}
+
+
+void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
+{
+    // moves 'str' over white characters and new lines, never past 'end'
+    for( ; str < end ; ++str )
+    {
+        if( *str != 10 && !IsWhite(*str) )
+            break;
+    }
+}
+
+
+void HTMLFilter::CheckNewLine()
+{
+    // looks ahead: is a new line the first thing after the white characters?
+    // (the position in the input is restored afterwards)
+    const wchar_t * const saved_position = pchar;
+
+    SkipWhite();
+    last_new_line = (*pchar == 10);
+    pchar = saved_position;
+}
+
+
+
+
+bool HTMLFilter::IsClosingTagForLastItem()
+{
+    // pchar is expected to point at the opening tag mark;
+    // the input is consumed even if false is returned at the end
+    pchar += 1;
+    SkipWhite();
+
+    if( *pchar != '/' )
+        return false;
+
+    pchar += 1;
+    SkipWhite();
+
+    // the name from the input must match the name of the last item
+    const std::wstring & wanted_name = LastItem().name;
+
+    if( !IsNameEqual(pchar, wanted_name, wanted_name.size()) )
+        return false;
+
+    pchar += wanted_name.size();
+    SkipWhite();
+
+    if( !IsClosingTagMark(*pchar) )
+        return false;
+
+    pchar += 1;
+
+    return true;
+}
+
+
+
+
+/*
+ * used for such tags as: script, pre, textarea
+ *
+ * Copies the input verbatim (through Put(start, end), without filtering),
+ * starting at pchar, until the closing tag of the item from the top of the
+ * stack is found or the input ends. When the closing tag is found the item
+ * is popped from the stack and last_new_line is refreshed.
+ *
+ * put_closing_tag_as_well - if true the closing tag itself is copied too
+ *
+ * NOTE(review): a '<' which does not start the matching closing tag is
+ * consumed by IsClosingTagForLastItem() but 'end' is moved only by the next
+ * ordinary character, so such a '<' placed directly before the end of the
+ * input (or directly before the closing tag when put_closing_tag_as_well is
+ * false) seems to be left out of the output - to be confirmed.
+ */
+void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
+{
+const wchar_t * start = pchar;
+const wchar_t * end = pchar; // one past the last character which will be copied
+
+ while( *pchar != 0 )
+ {
+ if( IsOpeningTagMark(*pchar) )
+ {
+ if( IsClosingTagForLastItem() ) // moves pchar also when it returns false
+ {
+ if( put_closing_tag_as_well )
+ end = pchar;
+
+ PopStack();
+ CheckNewLine();
+ break;
+ }
+ }
+ else
+ {
+ pchar += 1;
+ end = pchar;
+ }
+ }
+
+ Put(start, end);
+}
+
+
+
+
+void HTMLFilter::SkipAndCheckClosingTag()
+{
+    // skips the rest of a tag, up to and including the closing tag mark;
+    // marks placed inside a quoted text are ignored
+    wchar_t open_quote = 0; // zero means: we are not inside a quoted text
+
+    for( ; *pchar ; ++pchar )
+    {
+        const wchar_t c = *pchar;
+
+        if( c == '"' || c == '\'' )
+        {
+            if( open_quote == 0 )
+                open_quote = c;
+            else
+            if( open_quote == c )
+                open_quote = 0;
+
+            continue;
+        }
+
+        if( open_quote != 0 )
+            continue;
+
+        if( LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(c) ) // closing xml tag: default '/'
+        {
+            LastItem().type = Item::simple;
+            continue;
+        }
+
+        if( IsClosingTagMark(c) )
+        {
+            ++pchar;
+            break;
+        }
+    }
+}
+
+
+
+bool HTMLFilter::IsValidCharForName(int c)
+{
+    // ascii letters and digits plus '-', '!' and ':' (the namespace character)
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+
+    return is_letter || is_digit || c=='-' || c=='!' || c==':';
+}
+
+
+bool HTMLFilter::IsValidCharForAttrName(int c)
+{
+    // ascii letters and digits plus '-' and ':'
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+
+    return is_letter || is_digit || c=='-' || c==':';
+}
+
+
+bool HTMLFilter::IsValidCharForEntityName(int c)
+{
+    // ascii letters and digits plus '#'
+    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
+    const bool is_digit  = (c>='0' && c<='9');
+
+    return is_letter || is_digit || c=='#';
+}
+
+
+void HTMLFilter::ReadItemName()
+{
+    // the whole name is consumed from the input but only the first
+    // WINIX_HTMLFILTER_ITEM_NAME_MAXLEN characters are appended to the item's name
+    size_t seen = 0;
+
+    while( IsValidCharForName(*pchar) )
+    {
+        if( seen < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
+            LastItem().name += *pchar;
+
+        ++seen;
+        ++pchar;
+    }
+}
+
+
+
+void HTMLFilter::ReadItemAttrName()
+{
+    // the whole name is consumed from the input but only the first
+    // WINIX_HTMLFILTER_ATTR_NAME_MAXLEN characters are stored in attr_name
+    size_t seen = 0;
+
+    attr_name.clear();
+
+    while( *pchar && IsValidCharForAttrName(*pchar) )
+    {
+        if( seen < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
+            attr_name += *pchar;
+
+        ++seen;
+        ++pchar;
+    }
+}
+
+
+
+void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
+{
+    // the text [value_start, value_end) becomes a new, last entry of attr_value
+    attr_value.push_back(std::wstring());
+
+    if( !analyze_entities )
+    {
+        attr_value.back().append(value_start, value_end);
+        return;
+    }
+
+    // entities are analyzed while the text is being copied
+    AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back());
+}
+
+/*
+ * Reads the value of an attribute starting at pchar and splits it on white
+ * characters and new lines: each word becomes one entry of attr_value.
+ *
+ * has_quote  - the value is enclosed in quotes and ends at quote_char,
+ *              otherwise it ends at the first white character, new line
+ *              or closing tag mark
+ * quote_char - the quote character which ends the value (if has_quote)
+ *
+ * Words longer than WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN are not stored.
+ * On return pchar points at the character which ended the value
+ * (or at the terminating zero of the input).
+ */
+void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
+{
+size_t i;
+
+ attr_value.clear();
+ const wchar_t * value_start = pchar;
+ size_t value_len = 0; // how many non white characters
+
+ for(i=0 ; *pchar ; ++i, ++pchar )
+ {
+ if( has_quote )
+ {
+ if( *pchar == quote_char )
+ break;
+ }
+ else
+ {
+ if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) )
+ break;
+ }
+
+ if( *pchar==10 || IsWhite(*pchar) )
+ {
+ if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+ ReadItemAttrValueAdd(value_start, pchar); // a word has just ended
+
+ value_len = 0;
+ }
+ else
+ {
+ if( value_len == 0 )
+ value_start = pchar; // the first character of a new word
+
+ value_len += 1;
+ }
+ }
+
+ if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
+ ReadItemAttrValueAdd(value_start, pchar); // the last word (not followed by a white character)
+}
+
+
+void HTMLFilter::CheckChar(wchar_t c)
+{
+    // keeps track of the length of the current output line:
+    // a new line starts the counting again
+    line_len = (c == 10) ? 0 : line_len + 1;
+}
+
+
+void HTMLFilter::Put(wchar_t c)
+{
+    // one character to the output, the line length is updated
+    out_string->push_back(c);
+    CheckChar(c);
+}
+
+
+void HTMLFilter::Put(const wchar_t * str)
+{
+    // a zero terminated string to the output, the line length is updated
+    out_string->append(str);
+
+    while( *str != 0 )
+    {
+        CheckChar(*str);
+        ++str;
+    }
+}
+
+
+void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
+{
+    // the range [str, end) to the output, the line length is updated;
+    // an empty (or reversed) range is ignored
+    if( str < end )
+    {
+        const size_t how_many = end - str;
+        out_string->append(str, how_many);
+
+        while( str < end )
+        {
+            CheckChar(*str);
+            ++str;
+        }
+    }
+}
+
+
+void HTMLFilter::Put(const std::wstring & str)
+{
+ out_string->append(str);
+
+ for(size_t i=0 ; i 1 ) // at least one character in entity name
+ {
+ if( out )
+ out->append(old_str, entity_start);
+ else
+ Put(old_str, entity_start);
+
+ str += 1; // skip ;
+
+ if( !skip_entities )
+ {
+ if( out )
+ out->append(entity_start, str);
+ else
+ Put(entity_start, str);
+ }
+
+ EntityFound(entity_start + 1, str - 1); // without & and ;
+ old_str = str;
+ }
+ }
+ else
+ {
+ str += 1;
+ }
+ }
+
+ if( out )
+ out->append(old_str, end);
+ else
+ Put(old_str, end);
+}
+
+
+
+
+int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
+{
+size_t res;
+
+ const wchar_t * orphan = orphan_str.c_str();
+
+ for( ; str & table)
+{
+int res;
+
+ if( table.empty() )
+ return false;
+
+ size_t o1 = 0;
+ size_t o2 = table.size() - 1;
+
+ res = CheckOrphan(str, end, table[o1]);
+
+ if( res == 0 )
+ return true;
+
+ if( res < 0 )
+ return false;
+
+ res = CheckOrphan(str, end, table[o2]);
+
+ if( res == 0 )
+ return true;
+
+ if( res > 0 )
+ return false;
+
+
+ while( o1 + 1 < o2 )
+ {
+ size_t o = (o1 + o2) / 2;
+ res = CheckOrphan(str, end, table[o]);
+
+ if( res == 0 )
+ return true;
+
+ if( res < 0 )
+ o2 = o;
+ else
+ o1 = o;
+ }
+
+return false;
+}
+
+
+bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
+{
+    // an empty word is never an orphan
+    if( str == end )
+        return false;
+
+    // orphans are recognized only inside the body tag
+    // and only if the current item has a table of orphans
+    Item & top = LastItem();
+
+    if( !top.has_body_tag || !top.porphans )
+        return false;
+
+    // a word longer than the longest orphan cannot be in the table
+    const size_t word_len = end - str;
+
+    if( word_len > top.porphans->max_len )
+        return false;
+
+    return CheckOrphan(str, end, top.porphans->tab);
+}
+
+
+// if there is a semicolon nearby then we break the line after it
+// (useful in html entities)
+// !! add a check whether the string (end-str) is short as well
+// and in that case do not add any character either
+bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
+{
+size_t i, epsilon = 8;// !! IMPROVE ME put as a constant
+
+ for(i=0 ; str < end && i wrap_line )
+ {
+ Put(10);
+ PutTabs(stack_len);
+ }
+}
+
+/*
+ * Puts one run of non-white characters to the output.
+ *
+ * str - in: the beginning of the run, out: the first white character or
+ *       new line after the run (or 'end')
+ * end - one past the last character which can be read
+ *
+ * If break_after is not zero a space is inserted (and the line wrapping is
+ * checked) each time the run gets break_after characters long; the breaking
+ * is postponed while HasEntityEndAround() returns true, unless the previous
+ * character was an ending entity mark.
+ */
+void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
+{
+const wchar_t * word = str; // the beginning of the part not printed yet
+size_t non_whites = 0;
+bool was_entity_end = false;
+
+ for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
+ {
+ if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
+ {
+ Put(word, str);
+ word = str;
+ non_whites = 0;
+ Put(' ');
+ CheckLineWrap();
+ }
+
+ was_entity_end = (IsEndingEntityMark(*str));
+ }
+
+ if( analyze_entities )
+ AnalyzeEntitiesAndPut(word, str, nullptr); // nullptr: no output string is passed here
+ else
+ Put(word, str);
+}
+
+
+void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
+{
+    if( str >= end )
+        return;
+
+    if( trim_white )
+    {
+        // a single space is printed and the white characters are skipped
+        Put(' ');
+        SkipWhiteLines(str, end);
+        return;
+    }
+
+    // white characters are copied as they are,
+    // each new line is followed by the indentation
+    for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str )
+    {
+        Put(*str);
+
+        if( *str == 10 )
+            PutTabs(stack_len);
+    }
+}
+
+/*
+ * Puts the text [str, end) to the output word by word.
+ *
+ * Each run of non-white characters is printed with PutNormalNonWhite().
+ * If the run is an orphan (CheckOrphan) the white characters after it are
+ * replaced with a non-breaking space, otherwise they are printed with
+ * PutNormalWhite() and the line wrapping is checked.
+ */
+void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
+{
+const wchar_t * word, * white;
+
+ if( str < end )
+ CheckLineWrap();
+
+ while( str < end )
+ {
+ word = str;
+ PutNormalNonWhite(str, end);
+
+ if( CheckOrphan(word, str) )
+ {
+ white = str;
+ SkipWhiteLines(str, end);
+
+ if( white < str )
+ PutNonBreakingSpace();
+ }
+ else
+ {
+ PutNormalWhite(str, end);
+
+ if( str < end ) // !! or maybe this should be done like the code above? (author's note: too tired to think it through)
+ CheckLineWrap();
+ }
+
+ // for safety (if str was not incremented then there is an infinite loop)
+ if( word == str )
+ break;
+ }
+}
+
+
+
+
+void HTMLFilter::PutOpeningTagMark()
+{
+    // the mark which begins a tag
+    const wchar_t mark = '<';
+
+    Put(mark);
+}
+
+
+void HTMLFilter::PutClosingTagMark()
+{
+    // the mark which ends a tag
+    const wchar_t mark = '>';
+
+    Put(mark);
+}
+
+
+
+
+// !! IMPROVE ME change to a better name
+// this functions does not return true when the tag is safe
+bool HTMLFilter::IsTagSafe(const wchar_t * tag)
+{
+ if( !safe_mode )
+ return true;
+
+ if( IsNameEqual(tag, no_filter_tag.c_str()) )
+ return false;
+
+ // names of the tags which are not allowed in the safe mode
+ // (each literal needs its own comma: adjacent string literals are
+ // concatenated by the compiler, a missing comma after L"param" used to
+ // turn "param" and "object" into a single "paramobject" entry)
+ static const wchar_t * unsafe_tags[] = {
+ L"applet", L"base", L"body",
+ L"embed", L"head", L"html",
+ L"frame", L"frameset",L"iframe",
+ L"link", L"meta", L"param",
+ L"object", L"script"
+ };
+
+ size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
+ size_t i;
+
+ for(i=0 ; i 30 )
+ len = 30;
+
+ for(size_t i=0 ; i < (len*tab_size) ; ++i)
+ (*out_string) += ' '; // we do not add them to 'line_len'
+}
+
+
+/*
+ * Puts a non-breaking space to the output (used after an orphan).
+ *
+ * In the orphan_nbsp mode the html entity is printed, in the other mode
+ * the character with code 160 is printed directly. The entity has to be
+ * spelled out in the literal - a bare white character there would make
+ * the two modes indistinguishable.
+ */
+void HTMLFilter::PutNonBreakingSpace()
+{
+    if( orphan_mode == orphan_nbsp )
+    {
+        Put(L"&nbsp;");
+    }
+    else
+    {
+        Put(160);
+    }
+}
+
+
+
+
+void HTMLFilter::PutNewLine()
+{
+    // the new line character goes through the internal buffer
+    wchar_t * const first = buffer;
+
+    *first = 10;
+    Put(first, first + 1);
+
+    // a new output line has just begun
+    line_len = 0;
+}
+
+
+// the mark which begins a tag
+// (the rest of the code assumes the mark to be one character long)
+bool HTMLFilter::IsOpeningTagMark(wchar_t c)
+{
+    const wchar_t mark = '<';
+
+    return c == mark;
+}
+
+
+// the mark which ends a tag
+// (the rest of the code assumes the mark to be one character long)
+bool HTMLFilter::IsClosingTagMark(wchar_t c)
+{
+    const wchar_t mark = '>';
+
+    return c == mark;
+}
+
+
+// the slash placed at the end of a simple xml tag (the '>' character
+// is not a part of the mark), the mark is assumed to be one character long
+bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
+{
+    const wchar_t mark = '/';
+
+    return c == mark;
+}
+
+
+bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+{
+static wchar_t comm_open[] = L"";
+size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
+
+ if( !IsOpeningCommentaryTagMark(pchar) )
+ return false;
+
+ pchar += OpeningCommentaryTagMarkSize();
+
+ // looking for "-->"
+ while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
+ ++pchar;
+
+ if( *pchar!= 0 )
+ pchar += comm_close_len;
+
+ CheckNewLine();
+
+return true;
+}
+
+/*
+ * Skips the white characters at the beginning of a normal text
+ * (called from ReadNormalText() when last_new_line is set).
+ *
+ * start          - out: set to the position right after the skipped characters
+ * last_non_white - out: updated while skipping (see the comments below)
+ *
+ * When the text which follows is not a tag (or when it is a commentary)
+ * a new line and the indentation are put to the output.
+ */
+void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
+{
+ if( trim_white )
+ {
+ // skipping all white chars (with new lines)
+ // but with remembering the last non white character
+ for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
+ if( *pchar == 10 )
+ last_non_white = pchar; // only positions of new lines are remembered in this loop
+ }
+ else
+ {
+ // skipping first white chars with only one line between them
+ SkipWhite();
+ last_non_white = pchar;
+
+ if( *pchar == 10 )
+ {
+ ++pchar;
+ SkipWhite();
+ }
+ }
+
+ start = pchar;
+
+ // exception for the commentary tag
+ if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
+ {
+ PutNewLine();
+ PutTabs(stack_len);
+ }
+}
+
+
+
+/*
+ * reading text between html tags
+ *
+ * Reads the input from pchar up to the next opening tag mark (or the end
+ * of the input) and prints it with PutNormalText(). Commentaries found on
+ * the way are printed as well unless skip_commentaries is set.
+ * At the end last_new_line tells whether the last remembered non white
+ * character was a new line (IsWhite() does not treat 10 as white).
+ */
+void HTMLFilter::ReadNormalText()
+{
+const wchar_t * start = pchar; // the beginning of the text not printed yet
+const wchar_t * last_non_white = pchar;
+
+ if( last_new_line )
+ ReadNormalTextSkipWhite(start, last_non_white);
+
+
+ while( *pchar != 0 )
+ {
+ const wchar_t * commentary_start = pchar;
+
+ if( SkipCommentaryTagIfExists() )
+ {
+ last_non_white = pchar - 1; // pointing at the last '>' from a commentary
+ PutNormalText(start, commentary_start); // the text before the commentary
+
+ if( !skip_commentaries )
+ {
+ PutNormalText(commentary_start, pchar);
+ }
+
+ start = pchar;
+ }
+ else
+ {
+ if( IsOpeningTagMark(*pchar) )
+ break;
+
+ if( !IsWhite(*pchar) )
+ last_non_white = pchar;
+
+ pchar += 1;
+ }
+ }
+
+ last_new_line = (*last_non_white == 10);
+ PutNormalText(start, pchar);
+}
+
+
+
+bool HTMLFilter::PrintOpeningItem()
+{
+    // nothing is printed when tags are skipped or when this is the no-filter tag
+    // (true is returned in such a case as well)
+    const bool tag_is_hidden = skip_tags || IsNameEqual(no_filter_tag, LastItem().name);
+
+    if( !tag_is_hidden )
+    {
+        if( last_new_line )
+        {
+            // the tag begins a new, indented line
+            PutNewLine();
+
+            if( stack_len > 1 )
+                PutTabs(stack_len-1);
+        }
+
+        return PutOpeningTag();
+    }
+
+    return true;
+}
+
+
+
+
+/*
+ * Reads one attribute of a tag (a name and an optional value) from pchar.
+ *
+ * The result is left in attr_name, attr_has_value and attr_value.
+ * Returns false if there is no attribute name at the current position,
+ * true otherwise (also for an attribute without a value).
+ */
+bool HTMLFilter::ReadItemAttr()
+{
+ attr_has_value = false;
+ attr_name.clear();
+ attr_value.clear();
+
+ SkipWhiteLines();
+ ReadItemAttrName();
+
+ if( attr_name.empty() )
+ return false;
+
+ SkipWhiteLines();
+
+ if( *pchar != '=' )
+ return true; // an attribute without a value
+
+ attr_has_value = true;
+ pchar += 1; // skipping '='
+ SkipWhiteLines();
+
+ bool has_quote = (*pchar == '\"' || *pchar == '\'');
+ wchar_t quote_char = *pchar; // meaningful only if has_quote is true
+
+ if( has_quote )
+ pchar += 1; // skipping the first quote mark
+
+ ReadItemAttrValue(has_quote, quote_char);
+
+ if( has_quote && *pchar == quote_char )
+ pchar += 1; // skipping the last quote mark
+
+return true;
+}
+
+
+
+bool HTMLFilter::CheckItemAttr()
+{
+    // only the 'lang' attribute (with a value) is interesting here:
+    // it selects the table of orphans for the current item;
+    // true is returned in every case
+    const bool is_lang_attr = attr_has_value && IsNameEqual(L"lang", attr_name);
+
+    if( !is_lang_attr )
+        return true;
+
+    LastItem().porphans = 0;
+
+    if( attr_value.empty() )
+        return true;
+
+    // we are taking the first value only
+    attr_value_lower = attr_value[0];
+    ToLower(attr_value_lower);
+
+    OrphansTab::iterator found = orphans_tab.find(attr_value_lower);
+
+    if( found != orphans_tab.end() )
+        LastItem().porphans = &found->second;
+
+    return true;
+}
+
+
+void HTMLFilter::PrintItemAttr()
+{
+size_t i;
+
+ if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
+ return;
+
+ Put(' ');
+ Put(attr_name);
+
+ if( attr_has_value )
+ {
+ Put(L"=\"");
+
+ for(i=0 ; i start )
+ Put(start, pchar);
+
+ // closing tag mark is printed directly from the source
+}
+
+
+void HTMLFilter::ReadItemOpening()
+{
+    // reads (and prints) an opening tag: the name, the attributes
+    // and the closing tag mark
+    LastItem().type = Item::opening;
+    ReadItemName();
+
+    if( !PrintOpeningItem() )
+        return;
+
+    while( ReadItemAttr() )
+    {
+        if( CheckItemAttr() )
+            PrintItemAttr();
+    }
+
+    SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
+
+    // the end of the tag is not printed when tags are skipped
+    // or when this is the no-filter tag
+    const bool tag_is_hidden = skip_tags || IsNameEqual(no_filter_tag, LastItem().name);
+
+    if( tag_is_hidden )
+        return;
+
+    if( LastItem().type == Item::simple )
+        Put(L" /");
+
+    PutClosingTagMark();
+}
+
+
+void HTMLFilter::ItemFound()
+{
+    // intentionally empty; called by ReadItem() after a tag has been read
+}
+
+void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
+{
+    // intentionally empty; [str, end) is the name of an entity
+    // (without the leading '&' and the trailing ';')
+}
+
+/*
+ * Reads one tag from the input (pchar should point at the opening tag mark).
+ *
+ * A new item is pushed on the stack and filled by one of the ReadItem...
+ * methods, then new_line of the item is set and ItemFound() is called.
+ * Returns false at the end of the input or when there is no room
+ * on the stack, true otherwise.
+ */
+bool HTMLFilter::ReadItem()
+{
+ if( *pchar == 0 )
+ return false;
+
+ if( !PushStack() )
+ return false;
+
+ pchar += 1; // skipping the first '<'
+ SkipWhiteLines();
+
+ if( *pchar == '!' )
+ ReadItemSpecial();
+ else
+ if( *pchar == '/' ) // we have a closing tag (add it as a virtual method) !!
+ ReadItemClosing();
+ else
+ ReadItemOpening();
+
+ CheckNewLine();
+ LastItem().new_line = last_new_line;
+
+ ItemFound();
+
+return true;
+}
+
+
+
+wchar_t HTMLFilter::ToLower(wchar_t c)
+{
+    // only the ascii letters A-Z are changed
+    const bool is_ascii_upper = (c >= 'A' && c <= 'Z');
+
+    return is_ascii_upper ? static_cast<wchar_t>(c - 'A' + 'a') : c;
+}
+
+
+void HTMLFilter::ToLower(std::wstring & str)
+{
+size_t i;
+
+ for(i=0 ; i0 ; ++name1, ++name2, --len )
+ if( ToLower(*name1) != ToLower(*name2) )
+ return false;
+
+ if( len == 0 )
+ return true;
+
+return false;
+}
+
+
+
+bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
+{
+    // forwards to the version taking two pointers
+    const wchar_t * second = name2.c_str();
+
+    return IsNameEqual(name1, second, len);
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
+{
+    // forwards to the version taking two pointers
+    const wchar_t * first = name1.c_str();
+
+    return IsNameEqual(first, name2, len);
+}
+
+
+bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
+{
+    // forwards to the version taking two pointers
+    const wchar_t * first  = name1.c_str();
+    const wchar_t * second = name2.c_str();
+
+    return IsNameEqual(first, second, len);
+}
+
+
+
+
+
+bool HTMLFilter::IsLastTag(const wchar_t * name)
+{
+    // compares 'name' with the name of the item from the top of the stack
+    const bool the_same = IsNameEqual(name, LastItem().name);
+
+    return the_same;
+}
+
+
+bool HTMLFilter::IsLastTag(const std::wstring & name)
+{
+    // compares 'name' with the name of the item from the top of the stack
+    const bool the_same = IsNameEqual(name, LastItem().name);
+
+    return the_same;
+}
+
+
+/*
+ * checking exceptions for opening tags
+ *
+ * Tags from the first list never have a closing tag: they are marked as
+ * simple and popped from the stack at once. The content of script (outside
+ * of the safe mode), pre, textarea and of the no-filter tag is copied
+ * without filtering. The body tag sets has_body_tag.
+ *
+ * NOTE(review): PutEverythingUntilClosingTag() can pop the item from the
+ * stack, so the IsLastTag() checks placed after such a call seem to look at
+ * a different item (the parent, or the empty placeholder) - to be confirmed.
+ */
+void HTMLFilter::CheckExceptions()
+{
+ if( IsLastTag(L"meta") ||
+ IsLastTag(L"input") ||
+ IsLastTag(L"br") ||
+ IsLastTag(L"hr") ||
+ IsLastTag(L"img") ||
+ IsLastTag(L"link") ||
+ IsLastTag(L"param") ||
+ IsLastTag(L"col") ||
+ IsLastTag(L"area") )
+ {
+ LastItem().type = Item::simple;
+ PopStack();
+ return;
+ }
+
+ // in safe_mode the script tag is ignored
+ if( !safe_mode && IsLastTag(L"script") )
+ PutEverythingUntilClosingTag(!skip_tags);
+
+ if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
+ PutEverythingUntilClosingTag(!skip_tags);
+
+ if( IsLastTag(no_filter_tag) )
+ PutEverythingUntilClosingTag(false); // the closing tag of the no-filter tag is never printed
+
+ if( IsLastTag(L"body") )
+ LastItem().has_body_tag = true;
+}
+
+
+
+
+/*
+ * Called when the closing tag from the top of the stack does not match the
+ * item directly below it.
+ *
+ * Items further down the stack are searched for an opening tag with the
+ * same name. If one is found, closing tags are printed for every item from
+ * the one below the top down to the found one and all of them are removed
+ * from the stack. If there is no such opening tag the closing tag is simply
+ * dropped (nothing is printed).
+ */
+void HTMLFilter::AddForgottenTags()
+{
+    int i;
+
+    if( stack_len < 3 )
+    {
+        // there are no items further down the stack to search in, so the
+        // closing tag has no matching opening tag: it is dropped in the same
+        // way as in the 'no such a tag' case below (returning without popping
+        // would leave the closing tag on the stack)
+        PopStack();
+        return;
+    }
+
+    // we have forgotten to close some tags
+
+    // looking whether there is a matching opening tag
+    for(i=int(stack_len)-3 ; i>=0 ; --i)
+        if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
+            break;
+
+    if( i < 0 )
+    {
+        // oops, there is no such a tag
+        // we don't print the closing and the missing opening tag
+        PopStack();
+        return;
+    }
+
+    // closing all items between the closing tag and the matching opening tag
+    // (the matching one is closed as well)
+    for(int z=(int)stack_len-2 ; z>=i ; --z)
+    {
+        if( !skip_tags && pstack[z].new_line )
+        {
+            PutNewLine();
+            PutTabs(z);
+        }
+
+        PutClosingTag(pstack[z].name.c_str());
+        pstack[z].Clear();
+    }
+
+    last_new_line = pstack[stack_len-1].new_line;
+
+    // invalidate tags
+    stack_len = i;
+}
+
+
+/*
+ * Prints closing tags for all items left on the stack (from the top to the
+ * bottom) and leaves the stack empty.
+ *
+ * The counter is decremented inside the loop: with 'stack_len-- > 0' in the
+ * condition the unsigned counter would wrap around after the last test and
+ * would not be zero when the method returns.
+ */
+void HTMLFilter::CheckStackPrintRest()
+{
+    while( stack_len > 0 )
+    {
+        stack_len -= 1;
+
+        // the outermost tag is always closed in a new line
+        if( stack_len==0 || pstack[stack_len-1].new_line )
+            PutNewLine();
+
+        PutTabs(stack_len);
+        PutClosingTag(pstack[stack_len].name.c_str());
+    }
+}
+
+/*
+ * Handles a closing tag which has just been put on the top of the stack.
+ *
+ * A lone closing tag is dropped. If the closing tag matches the item
+ * directly below it, the closing tag is printed and both items are popped.
+ * Otherwise AddForgottenTags() is called.
+ */
+void HTMLFilter::CheckClosingTags()
+{
+ if( stack_len == 0 )
+ return;
+
+ // on the stack we have only opening tags
+ // but only the last tag is a closing tag
+
+ if( stack_len == 1 )
+ {
+ // there is only last closing tag
+ // we dont print it
+ PopStack();
+ return;
+ }
+
+ // there are more than one tag
+ if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
+ {
+ // last closing tag is from the previous one
+ if( !skip_tags && pstack[stack_len-2].new_line )
+ {
+ PutNewLine();
+ PutTabs(stack_len-2);
+ }
+
+ PutClosingTag(pstack[stack_len-1].name.c_str());
+ last_new_line = pstack[stack_len-1].new_line;
+ PopStack(); // the closing tag
+ PopStack(); // the matching opening tag
+ }
+ else
+ {
+ AddForgottenTags();
+ }
+}
+
+
+bool HTMLFilter::PrintRest()
+{
+    // in safe mode (and when tags are skipped) we do not print the rest html code
+    if( safe_mode || skip_tags )
+        return false;
+
+    // everything up to the end of the input is printed without filtering
+    const wchar_t * const first = pchar;
+
+    while( *pchar != 0 )
+        pchar += 1;
+
+    // false means: there was nothing to print
+    if( pchar == first )
+        return false;
+
+    Put(first, pchar);
+
+    return true;
+}
+
+
+/*
+ * The main loop: reads tag after tag (with the normal text after each of
+ * them) until ReadItem() returns false.
+ *
+ * What is done with the item depends on its type: opening tags are checked
+ * against the exceptions, special and simple tags are popped at once,
+ * closing tags are matched with the opening ones and items of any other
+ * type are popped.
+ */
+void HTMLFilter::ReadLoop()
+{
+ while( ReadItem() )
+ {
+ if( LastItem().type == Item::opening )
+ {
+ CheckExceptions();
+ }
+ else
+ if( LastItem().type == Item::special || LastItem().type == Item::simple )
+ {
+ if( stack_len > 1 )
+ {
+ //pstack[stack_len-2].new_line = LastItem().new_line;
+ }
+ else
+ if( trim_white )
+ {
+ // one new line after a simple or special tag
+ // (if the tag has level 0 in the tree - it not means that this is a first tag)
+ // for example can be DOCTYPE
+ PutNewLine();
+ }
+
+ PopStack();
+ }
+ else
+ if( LastItem().type == Item::closing )
+ {
+ CheckClosingTags();
+ }
+ else
+ {
+ PopStack();
+ }
+
+ ReadNormalText(); // the text between this tag and the next one
+ }
+}
+
+
+
+void HTMLFilter::Read()
+{
+    if( trim_white )
+        SkipWhiteLines();
+
+    // it can be some text or white lines before the first html tag (we print it)
+    ReadNormalText();
+
+    // reading the whole html source
+    ReadLoop();
+
+    // sometimes there can remain some html source (when there is no space on the stack)
+    // we print the rest html without filtering (only if safe_mode is false)
+    const bool rest_was_printed = PrintRest();
+
+    // if nothing was printed then the tags still left on the stack are closed
+    if( !rest_was_printed )
+        CheckStackPrintRest();
+}
+
+
+
+
+
+}
+
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
new file mode 100644
index 0000000..35710d3
--- /dev/null
+++ b/src/html/htmlfilter.h
@@ -0,0 +1,376 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa
+ */
+
+/*
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name Tomasz Sowa nor the names of contributors to this
+ * project may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_html_htmlfilter
+#define headerfile_picotools_html_htmlfilter
+
+#include
+#include
+bool HTMLFilter::IsSpecialTagIndicator(wchar_t c)
+{
+ return (c == '!');
+}
+
+
+// the '=' operator e.g. class="value"
+bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
+{
+ return (c == '=');
+}
+
+
+
// the slash at the end (without '>' character)
// we assume the size of the mark to be one
bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
@@ -1061,18 +1063,33 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
}
-bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
+//{
+//static wchar_t comm_open[] = L"";
+ size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
- return IsNameEqual(pchar, comm_open, comm_open_len);
-}
+ if( str.size() >= comm_end_len )
+ {
+ return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
+ }
-
-size_t HTMLFilter::OpeningCommentaryTagMarkSize()
-{
- return 4; // size of "";
+wchar_t comm_close[] = L"-->";
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
-
+/*
if( !IsOpeningCommentaryTagMark(pchar) )
return false;
@@ -1108,86 +1125,81 @@ size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
pchar += comm_close_len;
CheckNewLine();
+*/
+
return true;
}
-void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
-{
- if( trim_white )
- {
- // skipping all white chars (with new lines)
- // but with remembering the last non white character
- for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
- if( *pchar == 10 )
- last_non_white = pchar;
- }
- else
- {
- // skipping first white chars with only one line between them
- SkipWhite();
- last_non_white = pchar;
-
- if( *pchar == 10 )
- {
- ++pchar;
- SkipWhite();
- }
- }
-
- start = pchar;
-
- // exception for the commentary tag
- if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
- {
- PutNewLine();
- PutTabs(stack_len);
- }
-}
-
-
-
// reading text between html tags
void HTMLFilter::ReadNormalText()
{
-const wchar_t * start = pchar;
-const wchar_t * last_non_white = pchar;
+ bool was_non_white_text = false;
- if( last_new_line )
- ReadNormalTextSkipWhite(start, last_non_white);
+ was_ending_commentary = false;
+ bool allow_put_new_line = false;
+ bool allow_put_space = false;
- while( *pchar != 0 )
+ if( white_mode == WHITE_MODE_TREE )
{
- const wchar_t * commentary_start = pchar;
-
- if( SkipCommentaryTagIfExists() )
+ if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
{
- last_non_white = pchar - 1; // pointing at the last '>' from a commentary
- PutNormalText(start, commentary_start);
-
- if( !skip_commentaries )
- {
- PutNormalText(commentary_start, pchar);
- }
-
- start = pchar;
- }
- else
- {
- if( IsOpeningTagMark(*pchar) )
- break;
-
- if( !IsWhite(*pchar) )
- last_non_white = pchar;
-
- pchar += 1;
+ allow_put_new_line = true;
}
}
- last_new_line = (*last_non_white == 10);
- PutNormalText(start, pchar);
+ while( lastc != -1 && !IsOpeningTagMark(lastc) )
+ {
+ tmp_text.clear();
+ PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space);
+
+ if( !tmp_text.empty() )
+ {
+ allow_put_new_line = false;
+ allow_put_space = false;
+ was_non_white_text = true;
+ }
+
+ if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
+ {
+ if( lastc == 10 || IsWhite(lastc) )
+ {
+ SkipWhiteLines();
+ PutNonBreakingSpace();
+ }
+ }
+ else
+ {
+ if( was_ending_commentary )
+ break;
+
+ if( PutNormalWhite() && white_mode == WHITE_MODE_TREE )
+ {
+ if( last_new_line )
+ {
+ allow_put_new_line = true;
+ allow_put_space = false;
+
+ LastItem().new_line_in_the_middle = true;
+
+ if( !was_non_white_text )
+ LastItem().new_line = true;
+ }
+ else
+ {
+ allow_put_new_line = false;
+ allow_put_space = true;
+ }
+
+ if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line )
+ {
+ allow_put_new_line = true;
+ }
+ }
+ }
+ }
}
@@ -1197,15 +1209,7 @@ bool HTMLFilter::PrintOpeningItem()
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
return true;
- if( last_new_line )
- {
- PutNewLine();
-
- if( stack_len > 1 )
- PutTabs(stack_len-1);
- }
-
-return PutOpeningTag();
+ return PutOpeningTag();
}
@@ -1226,34 +1230,34 @@ bool HTMLFilter::ReadItemAttr()
SkipWhiteLines();
- if( *pchar != '=' )
+ if( !IsAttributeAssignmentMark(lastc) ) // '='
return true;
attr_has_value = true;
- pchar += 1; // skipping '='
+ read_char(); // skipping '='
SkipWhiteLines();
- bool has_quote = (*pchar == '\"' || *pchar == '\'');
- wchar_t quote_char = *pchar;
+ bool has_quote = (lastc == '\"' || lastc == '\'');
+ wchar_t quote_char = lastc;
if( has_quote )
- pchar += 1; // skipping the first quote mark
+ read_char(); // skipping the first quote mark
ReadItemAttrValue(has_quote, quote_char);
- if( has_quote && *pchar == quote_char )
- pchar += 1; // skipping the last quote mark
+ if( has_quote && lastc == quote_char )
+ read_char(); // skipping the last quote mark
return true;
}
-bool HTMLFilter::CheckItemAttr()
+void HTMLFilter::CheckItemLangAttr()
{
if( attr_has_value && IsNameEqual(L"lang", attr_name) )
{
- LastItem().porphans = 0;
+ LastItem().porphans = nullptr;
if( !attr_value.empty() )
{
@@ -1267,8 +1271,6 @@ bool HTMLFilter::CheckItemAttr()
LastItem().porphans = &i->second;
}
}
-
-return true;
}
@@ -1301,9 +1303,9 @@ size_t i;
void HTMLFilter::ReadItemClosing()
{
- pchar += 1; // skipping '/'
+ read_char(); // skipping '/'
SkipWhiteLines();
- ReadItemName();
+ ReadItemName(LastItem().name);
LastItem().type = Item::closing;
SkipAndCheckClosingTag();
@@ -1316,32 +1318,55 @@ void HTMLFilter::ReadItemSpecial()
LastItem().type = Item::special;
if( !skip_tags )
+ {
+ if( white_mode == WHITE_MODE_TREE && last_new_line )
+ {
+ Put(10);
+ PutTabs(LastItem().tree_index);
+ }
+
PutOpeningTagMark();
+ }
- const wchar_t * start = pchar;
- pchar += 1; // skipping '!'
+ read_char(); // skipping '!'
+ LastItem().name = '!';
+ ReadItemName(LastItem().name, false);
- ReadItemName();
- SkipAndCheckClosingTag();
-
- if( !skip_tags && pchar > start )
- Put(start, pchar);
-
- // closing tag mark is printed directly from the source
+ if( skip_tags )
+ {
+ SkipAndCheckClosingTag();
+ }
+ else
+ {
+ if( LastItem().is_commentary )
+ {
+ Put(LastItem().name);
+ }
+ else
+ {
+ tmp_text.clear();
+ SkipWhiteLines();
+ SkipAndCheckClosingTag(&tmp_text);
+ Put(LastItem().name);
+ Put(' ');
+ Put(tmp_text);
+ Put('>');
+ }
+ }
}
void HTMLFilter::ReadItemOpening()
{
LastItem().type = Item::opening;
- ReadItemName();
+ ReadItemName(LastItem().name);
if( PrintOpeningItem() )
{
while( ReadItemAttr() )
{
- if( CheckItemAttr() )
- PrintItemAttr();
+ CheckItemLangAttr();
+ PrintItemAttr();
}
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
@@ -1368,25 +1393,35 @@ void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
bool HTMLFilter::ReadItem()
{
- if( *pchar == 0 )
+ if( lastc == -1 )
return false;
if( !PushStack() )
return false;
- pchar += 1; // skipping the first '<'
- SkipWhiteLines();
+ if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
+ LastItem().tree_index += 1;
- if( *pchar == '!' )
- ReadItemSpecial();
+ if( was_ending_commentary )
+ {
+ LastItem().type = Item::closing;
+ LastItem().is_commentary = true;
+ LastItem().name = L"--";
+ was_ending_commentary = false;
+ }
else
- if( *pchar == '/' ) // we have a closing tag (dodac jako metode wirtualna) !!
- ReadItemClosing();
- else
- ReadItemOpening();
+ {
+ read_char(); // skipping the first opening tag mark '<'
+ SkipWhiteLines();
- CheckNewLine();
- LastItem().new_line = last_new_line;
+ if( IsSpecialTagIndicator(lastc) )
+ ReadItemSpecial();
+ else
+ if( IsClosingTagIndicator(lastc) )
+ ReadItemClosing();
+ else
+ ReadItemOpening();
+ }
ItemFound();
@@ -1556,11 +1591,14 @@ int i;
{
if( !skip_tags && pstack[z].new_line )
{
- PutNewLine();
- PutTabs(z);
+ if( white_mode == WHITE_MODE_TREE )
+ {
+ Put(10);
+ PutTabs(pstack[z].tree_index);
+ }
}
- PutClosingTag(pstack[z].name.c_str());
+ PutClosingTag(pstack[z]);
pstack[z].Clear();
}
@@ -1576,10 +1614,19 @@ void HTMLFilter::CheckStackPrintRest()
while( stack_len-- > 0 )
{
if( stack_len==0 || pstack[stack_len-1].new_line )
- PutNewLine();
+ {
+ if( white_mode == WHITE_MODE_TREE )
+ {
+ Put(10);
+ PutTabs(pstack[stack_len-1].tree_index);
+ }
+ else
+ {
+ Put(' ');
+ }
+ }
- PutTabs(stack_len);
- PutClosingTag(pstack[stack_len].name.c_str());
+ PutClosingTag(pstack[stack_len]);
}
}
@@ -1601,16 +1648,19 @@ void HTMLFilter::CheckClosingTags()
}
// there are more than one tag
- if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
+ if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
{
// last closing tag is from the previous one
if( !skip_tags && pstack[stack_len-2].new_line )
{
- PutNewLine();
- PutTabs(stack_len-2);
+ if( white_mode == WHITE_MODE_TREE )
+ {
+ Put(10);
+ PutTabs(pstack[stack_len-2].tree_index);
+ }
}
- PutClosingTag(pstack[stack_len-1].name.c_str());
+ PutClosingTag(pstack[stack_len-1]);
last_new_line = pstack[stack_len-1].new_line;
PopStack();
PopStack();
@@ -1624,22 +1674,30 @@ void HTMLFilter::CheckClosingTags()
bool HTMLFilter::PrintRest()
{
-const wchar_t * start = pchar;
+//const wchar_t * start = pchar;
// in safe mode we do not print the rest html code
if( safe_mode || skip_tags )
return false;
- while( *pchar )
- ++pchar;
+ bool was_chars = false;
- if( pchar > start )
+ while( lastc != -1 )
{
- Put(start, pchar);
- return true;
+ Put(lastc);
+ read_char();
+ was_chars = true;
}
-return false;
+ return was_chars;
+
+// if( pchar > start )
+// {
+// Put(start, pchar);
+// return true;
+// }
+
+//return false;
}
@@ -1660,15 +1718,18 @@ void HTMLFilter::ReadLoop()
//pstack[stack_len-2].new_line = LastItem().new_line;
}
else
- if( trim_white )
+ if( white_mode == WHITE_MODE_TREE )
{
// one new line after a simple or special tag
// (if the tag has level 0 in the tree - it not means that this is a first tag)
- // for example can be DOCTYPE
- PutNewLine();
+ // for example can be DOCTYPE
+
+ if( !LastItem().is_commentary )
+ Put(10);
}
- PopStack();
+ if( !LastItem().is_commentary )
+ PopStack();
}
else
if( LastItem().type == Item::closing )
@@ -1688,7 +1749,9 @@ void HTMLFilter::ReadLoop()
void HTMLFilter::Read()
{
- if( trim_white )
+ read_char(); // put first character to lastc
+
+ if( white_mode != WHITE_MODE_ORIGIN )
SkipWhiteLines();
// it can be some text or white lines before the first html tag (we print it)
diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h
index 35710d3..6407e0e 100644
--- a/src/html/htmlfilter.h
+++ b/src/html/htmlfilter.h
@@ -42,7 +42,7 @@
#include