2009-12-09 01:42:40 +01:00
|
|
|
/*
|
2010-02-28 01:08:10 +01:00
|
|
|
* This file is a part of Winix
|
2014-10-04 20:04:03 +02:00
|
|
|
* and is distributed under the 2-Clause BSD licence.
|
|
|
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2018-10-24 01:28:45 +02:00
|
|
|
* Copyright (c) 2008-2018, Tomasz Sowa
|
2009-12-09 01:42:40 +01:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
2014-10-04 20:04:03 +02:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*
|
2009-12-09 01:42:40 +01:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include "htmlfilter.h"
|
|
|
|
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
|
2014-02-12 17:30:49 +01:00
|
|
|
namespace Winix
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
void HTMLFilter::Item::Clear()
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
name.clear();
|
|
|
|
type = none;
|
|
|
|
porphans = 0;
|
|
|
|
new_line = false;
|
|
|
|
has_body_tag = false;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// A freshly constructed item is always in the cleared state.
HTMLFilter::Item::Item()
{
	this->Clear();
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
pchar = in;
|
|
|
|
stack_len = 0;
|
|
|
|
out_string = &out;
|
|
|
|
last_new_line = false;
|
2011-04-16 10:42:22 +02:00
|
|
|
line_len = 0;
|
2009-12-09 01:42:40 +01:00
|
|
|
out_string->clear();
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
Init();
|
2009-12-09 01:42:40 +01:00
|
|
|
Read();
|
2011-04-16 13:27:54 +02:00
|
|
|
Uninit();
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
// Called by Filter() just before the input is read.
// Intentionally does nothing here.
void HTMLFilter::Init()
{
}
|
|
|
|
|
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
// Called by Filter() right after the input has been read.
// Intentionally does nothing here.
void HTMLFilter::Uninit()
{
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
// Filters 'in' into 'out'.
// Does nothing when both arguments are the same object
// (out cannot be the same string as in).
// Before filtering, room for in.size()*2+1 characters is reserved in 'out'.
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
	if( &in != &out )
	{
		const size_t out_projected_len = in.size() * 2 + 1;

		if( out_projected_len > out.capacity() )
			out.reserve(out_projected_len);

		Filter(in.c_str(), out);
	}
}
|
|
|
|
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
void HTMLFilter::SetSomeDefaults()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
tab_size = 2;
|
|
|
|
trim_white = false;
|
|
|
|
break_after = 0;
|
2011-04-16 10:42:22 +02:00
|
|
|
wrap_line = 0;
|
2010-06-21 00:47:24 +02:00
|
|
|
orphan_mode = orphan_nbsp;
|
2010-06-30 20:42:50 +02:00
|
|
|
safe_mode = false;
|
2018-10-24 18:31:42 +02:00
|
|
|
skip_tags = false;
|
2018-10-24 01:28:45 +02:00
|
|
|
skip_commentaries = false;
|
2018-10-24 18:31:42 +02:00
|
|
|
skip_entities = false;
|
|
|
|
analyze_entities = false;
|
2018-10-24 01:28:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Allocates the tag stack and the character buffer (both released in the
// destructor) and sets the default options.
HTMLFilter::HTMLFilter()
{
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];

	SetSomeDefaults();
}
|
|
|
|
|
|
|
|
|
|
|
|
// Copy constructor.
// The new object gets its own stack and buffer (we don't need to copy the
// stack) and the default options; nothing is read from 'f'.
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];

	SetSomeDefaults();
}
|
|
|
|
|
|
|
|
|
|
|
|
// Assignment operator.
//
// The stack and the buffer are only working storage and do not need to be
// copied, so the arrays this object already owns are kept as they are.
// (Previously fresh arrays were allocated here without releasing the old
// ones, which leaked both arrays on every assignment.)
//
// No option is copied from 'f' -- same as before.
// we can copy some fields from f
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
	(void)f; // intentionally unused, see above

	return *this;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Releases the buffer and the tag stack allocated by the constructors.
HTMLFilter::~HTMLFilter()
{
	delete [] buffer;
	delete [] pstack;
}
|
|
|
|
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::BreakWord(size_t break_after_)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
break_after = break_after_;
|
|
|
|
|
|
|
|
if( break_after > 10000 )
|
|
|
|
break_after = 10000;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::WrapLine(size_t wrap_line_)
|
|
|
|
{
|
|
|
|
wrap_line = wrap_line_;
|
|
|
|
|
|
|
|
if( wrap_line > 10000 )
|
|
|
|
wrap_line = 10000;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
void HTMLFilter::TrimWhite(bool trim)
|
|
|
|
{
|
|
|
|
trim_white = trim;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::InsertTabs(size_t tabsize)
|
|
|
|
{
|
|
|
|
tab_size = tabsize;
|
|
|
|
|
|
|
|
if( tab_size > 1000 )
|
|
|
|
tab_size = 1000;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// Stores in orphans.max_len the length of the longest string from
// orphans.tab (zero when the table is empty).
void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
{
	orphans.max_len = 0;

	for( const auto & word : orphans.tab )
	{
		if( word.size() > orphans.max_len )
			orphans.max_len = word.size();
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
|
|
|
|
{
|
|
|
|
lang_code_lower = lang_code;
|
|
|
|
ToLower(lang_code_lower);
|
|
|
|
|
|
|
|
orphans_temp.tab = otab;
|
|
|
|
std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
|
|
|
|
CalcOrphansMaxLen(orphans_temp);
|
|
|
|
|
|
|
|
orphans_tab[lang_code_lower] = orphans_temp;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Convenience overload: forwards to the version taking a C string.
void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
	const wchar_t * code = lang_code.c_str();
	AssignOrphans(code, otab);
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ClearOrphans()
|
|
|
|
{
|
|
|
|
orphans_tab.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2018-11-21 18:51:15 +01:00
|
|
|
void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-11-21 18:51:15 +01:00
|
|
|
if( orphan_mode_str == L"160" )
|
|
|
|
orphan_mode = orphan_160space;
|
|
|
|
else
|
|
|
|
orphan_mode = orphan_nbsp;
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
void HTMLFilter::SafeMode(bool safe_mode_)
|
|
|
|
{
|
|
|
|
safe_mode = safe_mode_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
void HTMLFilter::SkipTags(bool skip_tags)
|
|
|
|
{
|
|
|
|
this->skip_tags = skip_tags;
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLFilter::SkipCommentaries(bool skip_commentaries)
|
2018-10-24 01:28:45 +02:00
|
|
|
{
|
|
|
|
this->skip_commentaries = skip_commentaries;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
void HTMLFilter::SkipEntities(bool skip_entities)
|
|
|
|
{
|
|
|
|
this->skip_entities = skip_entities;
|
|
|
|
|
|
|
|
if( this->skip_entities )
|
|
|
|
{
|
|
|
|
this->analyze_entities = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AnalyzeEntities(bool analyze_entities)
|
|
|
|
{
|
|
|
|
this->analyze_entities = analyze_entities;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
|
|
|
|
{
|
|
|
|
no_filter_tag = tag_name;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
// Returns the i-th item of the stack.
// For an index outside of the stack the shared 'empty' item is cleared
// and returned instead.
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
	if( i < stack_len )
		return pstack[i];

	empty.Clear();

	return empty;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Returns the item from the top of the stack.
// When the stack has no items the shared 'empty' item is cleared
// and returned instead.
HTMLFilter::Item & HTMLFilter::LastItem()
{
	if( stack_len != 0 )
		return pstack[stack_len-1];

	empty.Clear();

	return empty;
}
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::PushStack()
|
|
|
|
{
|
2010-02-11 21:56:52 +01:00
|
|
|
if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
|
2009-12-09 01:42:40 +01:00
|
|
|
// oops, too many items
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pstack[stack_len].Clear();
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
if( stack_len > 0 )
|
|
|
|
{
|
|
|
|
// 'porphans' and 'has_body_tag' attributes are propagated
|
|
|
|
pstack[stack_len].porphans = pstack[stack_len-1].porphans;
|
|
|
|
pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
|
|
|
|
}
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
stack_len += 1;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLFilter::PopStack()
|
|
|
|
{
|
|
|
|
if( stack_len == 0 )
|
|
|
|
// oops
|
|
|
|
return;
|
|
|
|
|
|
|
|
stack_len -= 1;
|
|
|
|
pstack[stack_len].Clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Returns true for a space, a tab, code 13 and code 160.
// The new line character (10) is not treated as white here
// (dont use c==10 here).
bool HTMLFilter::IsWhite(int c)
{
	switch( c )
	{
	case ' ':
	case '\t':
	case 13:
	case 160:
		return true;

	default:
		return false;
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipWhite()
|
|
|
|
{
|
|
|
|
while( IsWhite(*pchar) )
|
|
|
|
++pchar;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipWhiteLines()
|
|
|
|
{
|
|
|
|
while( *pchar==10 || IsWhite(*pchar) )
|
|
|
|
++pchar;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
void HTMLFilter::SkipWhiteWithFirstNewLine()
|
|
|
|
{
|
|
|
|
SkipWhite();
|
|
|
|
|
|
|
|
if( *pchar == 10 )
|
|
|
|
{
|
|
|
|
pchar += 1;
|
|
|
|
SkipWhite();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
|
|
|
|
{
|
|
|
|
while( str < end && (*str==10 || IsWhite(*str)) )
|
|
|
|
++str;
|
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckNewLine()
|
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
const wchar_t * start = pchar;
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
SkipWhite();
|
|
|
|
last_new_line = (*pchar==10);
|
|
|
|
|
|
|
|
pchar = start;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsClosingTagForLastItem()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
pchar += 1;
|
|
|
|
SkipWhite();
|
|
|
|
|
|
|
|
if( *pchar == '/' )
|
|
|
|
{
|
|
|
|
pchar += 1;
|
|
|
|
SkipWhite();
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( IsNameEqual(pchar, LastItem().name, LastItem().name.size()) )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
pchar += LastItem().name.size();
|
2009-12-09 01:42:40 +01:00
|
|
|
SkipWhite();
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsClosingTagMark(*pchar) )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
pchar += 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
// used for such tags as: script, pre, textarea
|
2018-10-24 01:28:45 +02:00
|
|
|
void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
const wchar_t * start = pchar;
|
2018-10-24 01:28:45 +02:00
|
|
|
const wchar_t * end = pchar;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
while( *pchar != 0 )
|
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsOpeningTagMark(*pchar) )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
if( IsClosingTagForLastItem() )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
if( put_closing_tag_as_well )
|
|
|
|
end = pchar;
|
2014-06-19 01:18:28 +02:00
|
|
|
|
|
|
|
PopStack();
|
|
|
|
CheckNewLine();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
pchar += 1;
|
|
|
|
end = pchar;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Put(start, end);
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::SkipAndCheckClosingTag()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
bool is_quoted = false;
|
|
|
|
wchar_t quote_char = 0;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for( ; *pchar ; ++pchar )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
if( *pchar == '"' || *pchar == '\'' )
|
|
|
|
{
|
|
|
|
if( is_quoted )
|
|
|
|
{
|
|
|
|
if( *pchar == quote_char )
|
|
|
|
{
|
|
|
|
is_quoted = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
is_quoted = true;
|
|
|
|
quote_char = *pchar;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/'
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
LastItem().type = Item::simple;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2018-10-24 01:28:45 +02:00
|
|
|
else
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !is_quoted && IsClosingTagMark(*pchar) )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
++pchar;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
// Returns true if 'c' can be a part of a tag name:
// an ascii letter, a digit, '-', '!' or ':' (: for namespace character).
bool HTMLFilter::IsValidCharForName(int c)
{
	const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
	const bool is_digit  = (c>='0' && c<='9');

	return is_letter || is_digit || c=='-' || c=='!' || c==':';
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// Returns true if 'c' can be a part of an attribute name:
// an ascii letter, a digit, '-' or ':'.
bool HTMLFilter::IsValidCharForAttrName(int c)
{
	const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
	const bool is_digit  = (c>='0' && c<='9');

	return is_letter || is_digit || c=='-' || c==':';
}
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
// Returns true if 'c' can be a part of an entity name:
// an ascii letter, a digit or '#'.
bool HTMLFilter::IsValidCharForEntityName(int c)
{
	const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
	const bool is_digit  = (c>='0' && c<='9');

	return is_letter || is_digit || c=='#';
}
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
void HTMLFilter::ReadItemName()
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for( i=0 ; IsValidCharForName(*pchar) ; ++i )
|
|
|
|
{
|
|
|
|
if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
|
|
|
|
LastItem().name += *pchar;
|
|
|
|
|
|
|
|
++pchar;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemAttrName()
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
attr_name.clear();
|
|
|
|
|
|
|
|
for( i=0 ; *pchar && IsValidCharForAttrName(*pchar) ; ++i )
|
|
|
|
{
|
|
|
|
if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
|
|
|
|
attr_name += *pchar;
|
|
|
|
|
|
|
|
++pchar;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end)
|
|
|
|
{
|
|
|
|
attr_value.push_back(std::wstring());
|
|
|
|
|
|
|
|
if( analyze_entities )
|
|
|
|
{
|
|
|
|
AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back());
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
attr_value.back().append(value_start, value_end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
2011-04-16 10:42:22 +02:00
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
attr_value.clear();
|
2018-10-24 18:31:42 +02:00
|
|
|
const wchar_t * value_start = pchar;
|
|
|
|
size_t value_len = 0; // how many non white characters
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
for(i=0 ; *pchar ; ++i, ++pchar )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
if( has_quote )
|
|
|
|
{
|
|
|
|
if( *pchar == quote_char )
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) )
|
2018-10-24 01:28:45 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
if( *pchar==10 || IsWhite(*pchar) )
|
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
|
|
ReadItemAttrValueAdd(value_start, pchar);
|
|
|
|
|
|
|
|
value_len = 0;
|
2014-06-19 01:18:28 +02:00
|
|
|
}
|
|
|
|
else
|
2018-10-24 01:28:45 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( value_len == 0 )
|
|
|
|
value_start = pchar;
|
|
|
|
|
|
|
|
value_len += 1;
|
2018-10-24 01:28:45 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2014-06-19 01:18:28 +02:00
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
|
|
ReadItemAttrValueAdd(value_start, pchar);
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckChar(wchar_t c)
|
|
|
|
{
|
|
|
|
if( c == 10 )
|
|
|
|
line_len = 0;
|
|
|
|
else
|
|
|
|
line_len += 1;
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
void HTMLFilter::Put(wchar_t c)
|
|
|
|
{
|
|
|
|
(*out_string) += c;
|
|
|
|
CheckChar(c);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::Put(const wchar_t * str)
|
|
|
|
{
|
|
|
|
out_string->append(str);
|
|
|
|
|
|
|
|
for( ; *str ; ++str)
|
|
|
|
CheckChar(*str);
|
|
|
|
}
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
if( str >= end )
|
2009-12-09 01:42:40 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
size_t len = end - str;
|
|
|
|
out_string->append(str, len);
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
for( ; str < end ; ++str)
|
|
|
|
CheckChar(*str);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::Put(const std::wstring & str)
|
|
|
|
{
|
|
|
|
out_string->append(str);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for(size_t i=0 ; i<str.size() ; ++i)
|
|
|
|
CheckChar(str[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
// out can be null
|
|
|
|
void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
|
|
|
|
{
|
|
|
|
size_t epsilon = 8; // !! IMPROVE ME put as a constant
|
|
|
|
const wchar_t * old_str = str;
|
|
|
|
|
|
|
|
while( str < end )
|
|
|
|
{
|
|
|
|
if( IsStartingEntityMark(*str) )
|
|
|
|
{
|
|
|
|
const wchar_t * entity_start = str;
|
|
|
|
str += 1; // skip &
|
|
|
|
|
|
|
|
for(size_t i=0 ; *str && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
if( IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
|
|
|
|
{
|
|
|
|
if( out )
|
|
|
|
out->append(old_str, entity_start);
|
|
|
|
else
|
|
|
|
Put(old_str, entity_start);
|
|
|
|
|
|
|
|
str += 1; // skip ;
|
|
|
|
|
|
|
|
if( !skip_entities )
|
|
|
|
{
|
|
|
|
if( out )
|
|
|
|
out->append(entity_start, str);
|
|
|
|
else
|
|
|
|
Put(entity_start, str);
|
|
|
|
}
|
|
|
|
|
|
|
|
EntityFound(entity_start + 1, str - 1); // without & and ;
|
|
|
|
old_str = str;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
str += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if( out )
|
|
|
|
out->append(old_str, end);
|
|
|
|
else
|
|
|
|
Put(old_str, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
size_t res;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
const wchar_t * orphan = orphan_str.c_str();
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
for( ; str<end && *orphan!=0 ; ++str, ++orphan )
|
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
res = ToLower(*str) - ToLower(*orphan);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
if( res != 0 )
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( str < end )
|
2010-11-21 01:19:17 +01:00
|
|
|
return ToLower(*str);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
return -int(ToLower(*orphan));
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
// binary search in table (table should be sorted)
|
|
|
|
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
|
|
|
int res;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( table.empty() )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
size_t o1 = 0;
|
|
|
|
size_t o2 = table.size() - 1;
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
res = CheckOrphan(str, end, table[o1]);
|
|
|
|
|
|
|
|
if( res == 0 )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if( res < 0 )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
res = CheckOrphan(str, end, table[o2]);
|
|
|
|
|
|
|
|
if( res == 0 )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if( res > 0 )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
while( o1 + 1 < o2 )
|
|
|
|
{
|
|
|
|
size_t o = (o1 + o2) / 2;
|
|
|
|
res = CheckOrphan(str, end, table[o]);
|
|
|
|
|
|
|
|
if( res == 0 )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if( res < 0 )
|
|
|
|
o2 = o;
|
|
|
|
else
|
|
|
|
o1 = o;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
|
|
|
|
return false;
|
2010-11-21 01:19:17 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
size_t len = end - str;
|
2010-11-21 01:19:17 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( len > LastItem().porphans->max_len )
|
|
|
|
return false;
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
return CheckOrphan(str, end, LastItem().porphans->tab);
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// if there is a semicolon nearby then we break the line after it
|
|
|
|
// (useful in html entities)
|
|
|
|
// !! dodac sprawdzanie czy dlugosc stringu nie jest mala tez (end-str)
|
|
|
|
// i wtedy tez nie dodajemy zadnego znaku
|
2018-10-24 18:31:42 +02:00
|
|
|
bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
size_t i, epsilon = 8;// !! IMPROVE ME put as a constant
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for(i=0 ; str < end && i<epsilon ; ++i, ++str)
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsEndingEntityMark(*str) )
|
2011-04-16 10:42:22 +02:00
|
|
|
return true;
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
return false;
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::CheckLineWrap()
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
if( wrap_line != 0 && LastItem().has_body_tag && line_len > wrap_line )
|
|
|
|
{
|
|
|
|
Put(10);
|
|
|
|
PutTabs(stack_len);
|
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
const wchar_t * word = str;
|
2010-06-21 00:47:24 +02:00
|
|
|
size_t non_whites = 0;
|
2018-10-24 18:31:42 +02:00
|
|
|
bool was_entity_end = false;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(word, str);
|
|
|
|
word = str;
|
|
|
|
non_whites = 0;
|
|
|
|
Put(' ');
|
|
|
|
CheckLineWrap();
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
was_entity_end = (IsEndingEntityMark(*str));
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( analyze_entities )
|
|
|
|
AnalyzeEntitiesAndPut(word, str, nullptr);
|
|
|
|
else
|
|
|
|
Put(word, str);
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
|
|
|
|
{
|
|
|
|
if( str < end )
|
|
|
|
{
|
|
|
|
if( trim_white )
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(' ');
|
|
|
|
SkipWhiteLines(str, end);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
while( str < end && (*str==10 || IsWhite(*str)) )
|
|
|
|
{
|
|
|
|
Put(*str);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( *str == 10 )
|
|
|
|
PutTabs(stack_len);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
++str;
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
const wchar_t * word, * white;
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( str < end )
|
|
|
|
CheckLineWrap();
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
while( str < end )
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
word = str;
|
|
|
|
PutNormalNonWhite(str, end);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( CheckOrphan(word, str) )
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
white = str;
|
|
|
|
SkipWhiteLines(str, end);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( white < str )
|
|
|
|
PutNonBreakingSpace();
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
PutNormalWhite(str, end);
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( str < end ) // !! lub moze podobnie jak jest na gorze tutaj? juz nie mam sily myslec :(
|
|
|
|
CheckLineWrap();
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// for safety (if str was not incremented then there is an infinite loop)
|
|
|
|
if( word == str )
|
|
|
|
break;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
void HTMLFilter::PutOpeningTagMark()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put('<');
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
void HTMLFilter::PutClosingTagMark()
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put('>');
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
// !! IMPROVE ME change to a better name
|
|
|
|
// this functions does not return true when the tag is safe
|
2010-11-21 01:19:17 +01:00
|
|
|
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
|
2010-06-30 20:42:50 +02:00
|
|
|
{
|
|
|
|
if( !safe_mode )
|
|
|
|
return true;
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
if( IsNameEqual(tag, no_filter_tag.c_str()) )
|
|
|
|
return false;
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
static const wchar_t * unsafe_tags[] = {
|
2014-06-19 01:18:28 +02:00
|
|
|
L"applet", L"base", L"body",
|
|
|
|
L"embed", L"head", L"html",
|
|
|
|
L"frame", L"frameset",L"iframe",
|
|
|
|
L"link", L"meta", L"param"
|
|
|
|
L"object", L"script"
|
2010-06-30 20:42:50 +02:00
|
|
|
};
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
|
2010-06-30 20:42:50 +02:00
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for(i=0 ; i<len ; ++i)
|
|
|
|
{
|
|
|
|
if( IsNameEqual(tag, unsafe_tags[i]) )
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// Convenience overload: forwards to the version taking a C string.
bool HTMLFilter::IsTagSafe(const std::wstring & tag)
{
	const wchar_t * name = tag.c_str();

	return IsTagSafe(name);
}
|
|
|
|
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
bool HTMLFilter::PutOpeningTag()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-30 20:42:50 +02:00
|
|
|
if( !IsTagSafe(LastItem().name) )
|
2018-10-24 01:28:45 +02:00
|
|
|
{
|
|
|
|
SkipAndCheckClosingTag();
|
2011-04-16 13:27:54 +02:00
|
|
|
return false;
|
2018-10-24 01:28:45 +02:00
|
|
|
}
|
2010-06-30 20:42:50 +02:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
PutOpeningTagMark();
|
2011-04-16 13:27:54 +02:00
|
|
|
Put(LastItem().name);
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
return true;
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
void HTMLFilter::PutClosingTag(const wchar_t * tag)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( skip_tags || !IsTagSafe(tag) )
|
2010-06-30 20:42:50 +02:00
|
|
|
return;
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
PutOpeningTagMark();
|
2011-04-16 10:42:22 +02:00
|
|
|
Put('/');
|
2011-04-16 13:27:54 +02:00
|
|
|
Put(tag);
|
2010-06-21 00:47:24 +02:00
|
|
|
PutClosingTagMark();
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::PutTabs(size_t len)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
if( len > 30 )
|
|
|
|
len = 30;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
for(size_t i=0 ; i < (len*tab_size) ; ++i)
|
|
|
|
(*out_string) += ' '; // we do not add them to 'line_len'
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::PutNonBreakingSpace()
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
|
|
|
if( orphan_mode == orphan_nbsp )
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(L" ");
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
else
|
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(160);
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
void HTMLFilter::PutNewLine()
|
|
|
|
{
|
2009-12-09 01:42:40 +01:00
|
|
|
buffer[0] = 10;
|
|
|
|
Put(buffer, buffer+1);
|
2011-04-16 10:42:22 +02:00
|
|
|
line_len = 0;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
// we assume the size of the opening mark to be one
|
2018-10-24 18:31:42 +02:00
|
|
|
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
return (c == '<');
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// we assume the size of the closing mark to be one
|
2018-10-24 18:31:42 +02:00
|
|
|
bool HTMLFilter::IsClosingTagMark(wchar_t c)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
return (c == '>');
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
// the slash at the end <img src=".." /> (without '>' character)
|
|
|
|
// we assume the size of the mark to be one
|
2018-10-24 18:31:42 +02:00
|
|
|
bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
return (c == '/');
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
static wchar_t comm_open[] = L"<!--";
|
|
|
|
size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
return IsNameEqual(pchar, comm_open, comm_open_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
// Returns the number of characters of the mark which opens a commentary.
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
    static const wchar_t opening_mark[] = L"<!--";

    // the terminating zero is not counted, so the result is 4
    return sizeof(opening_mark) / sizeof(opening_mark[0]) - 1;
}
|
|
|
|
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
// Returns true if 'c' is the character which starts an entity ('&').
bool HTMLFilter::IsStartingEntityMark(wchar_t c)
{
    const bool is_mark = (c == L'&');

    return is_mark;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Returns true if 'c' is the character which ends an entity (';').
bool HTMLFilter::IsEndingEntityMark(wchar_t c)
{
    const bool is_mark = (c == L';');

    return is_mark;
}
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
// skipping the commentary tag if exists
|
|
|
|
bool HTMLFilter::SkipCommentaryTagIfExists()
|
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
static wchar_t comm_close[] = L"-->";
|
|
|
|
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !IsOpeningCommentaryTagMark(pchar) )
|
2009-12-09 01:42:40 +01:00
|
|
|
return false;
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
pchar += OpeningCommentaryTagMarkSize();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
// looking for "-->"
|
|
|
|
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
|
|
|
|
++pchar;
|
|
|
|
|
|
|
|
if( *pchar!= 0 )
|
|
|
|
pchar += comm_close_len;
|
|
|
|
|
|
|
|
CheckNewLine();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
if( trim_white )
|
|
|
|
{
|
|
|
|
// skipping all white chars (with new lines)
|
|
|
|
// but with remembering the last non white character
|
|
|
|
for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
|
|
|
|
if( *pchar == 10 )
|
|
|
|
last_non_white = pchar;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// skipping first white chars with only one line between them
|
|
|
|
SkipWhite();
|
|
|
|
last_non_white = pchar;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
if( *pchar == 10 )
|
|
|
|
{
|
|
|
|
++pchar;
|
|
|
|
SkipWhite();
|
|
|
|
}
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
start = pchar;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
// exception for the commentary tag
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) )
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2009-12-09 01:42:40 +01:00
|
|
|
PutNewLine();
|
2010-06-21 00:47:24 +02:00
|
|
|
PutTabs(stack_len);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// reading text between html tags
|
|
|
|
void HTMLFilter::ReadNormalText()
|
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
const wchar_t * start = pchar;
|
|
|
|
const wchar_t * last_non_white = pchar;
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
if( last_new_line )
|
|
|
|
ReadNormalTextSkipWhite(start, last_non_white);
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
while( *pchar != 0 )
|
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
const wchar_t * commentary_start = pchar;
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
if( SkipCommentaryTagIfExists() )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
last_non_white = pchar - 1; // pointing at the last '>' from a commentary
|
2018-10-24 01:28:45 +02:00
|
|
|
PutNormalText(start, commentary_start);
|
|
|
|
|
|
|
|
if( !skip_commentaries )
|
|
|
|
{
|
|
|
|
PutNormalText(commentary_start, pchar);
|
|
|
|
}
|
|
|
|
|
|
|
|
start = pchar;
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( IsOpeningTagMark(*pchar) )
|
2009-12-09 01:42:40 +01:00
|
|
|
break;
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
if( !IsWhite(*pchar) )
|
|
|
|
last_non_white = pchar;
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
pchar += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
last_new_line = (*last_non_white == 10);
|
2011-04-16 10:42:22 +02:00
|
|
|
PutNormalText(start, pchar);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
bool HTMLFilter::PrintOpeningItem()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
2014-06-19 01:18:28 +02:00
|
|
|
return true;
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
if( last_new_line )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-30 20:42:50 +02:00
|
|
|
PutNewLine();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
if( stack_len > 1 )
|
|
|
|
PutTabs(stack_len-1);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
2010-06-30 20:42:50 +02:00
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
return PutOpeningTag();
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
bool HTMLFilter::ReadItemAttr()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2011-04-16 10:42:22 +02:00
|
|
|
attr_has_value = false;
|
|
|
|
attr_name.clear();
|
|
|
|
attr_value.clear();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
SkipWhiteLines();
|
|
|
|
ReadItemAttrName();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( attr_name.empty() )
|
2010-06-30 20:42:50 +02:00
|
|
|
return false;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
SkipWhiteLines();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( *pchar != '=' )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
attr_has_value = true;
|
|
|
|
pchar += 1; // skipping '='
|
|
|
|
SkipWhiteLines();
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
bool has_quote = (*pchar == '\"' || *pchar == '\'');
|
|
|
|
wchar_t quote_char = *pchar;
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
if( has_quote )
|
|
|
|
pchar += 1; // skipping the first quote mark
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
ReadItemAttrValue(has_quote, quote_char);
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
if( has_quote && *pchar == quote_char )
|
2011-04-16 10:42:22 +02:00
|
|
|
pchar += 1; // skipping the last quote mark
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::CheckItemAttr()
|
|
|
|
{
|
|
|
|
if( attr_has_value && IsNameEqual(L"lang", attr_name) )
|
|
|
|
{
|
|
|
|
LastItem().porphans = 0;
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
if( !attr_value.empty() )
|
|
|
|
{
|
|
|
|
// we are taking the first value only
|
|
|
|
attr_value_lower = attr_value[0];
|
|
|
|
ToLower(attr_value_lower);
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
OrphansTab::iterator i = orphans_tab.find(attr_value_lower);
|
|
|
|
|
|
|
|
if( i != orphans_tab.end() )
|
|
|
|
LastItem().porphans = &i->second;
|
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
void HTMLFilter::PrintItemAttr()
|
2011-04-16 10:42:22 +02:00
|
|
|
{
|
2014-06-19 01:18:28 +02:00
|
|
|
size_t i;
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
2014-06-19 01:18:28 +02:00
|
|
|
return;
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(' ');
|
|
|
|
Put(attr_name);
|
|
|
|
|
|
|
|
if( attr_has_value )
|
|
|
|
{
|
|
|
|
Put(L"=\"");
|
2014-06-19 01:18:28 +02:00
|
|
|
|
|
|
|
for(i=0 ; i<attr_value.size() ; ++i)
|
|
|
|
{
|
|
|
|
Put(attr_value[i]);
|
|
|
|
|
|
|
|
if( i + 1 < attr_value.size() )
|
|
|
|
Put(' ');
|
|
|
|
}
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
Put('\"');
|
2010-06-30 20:42:50 +02:00
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::ReadItemClosing()
|
|
|
|
{
|
|
|
|
pchar += 1; // skipping '/'
|
|
|
|
SkipWhiteLines();
|
2010-06-30 20:42:50 +02:00
|
|
|
ReadItemName();
|
2011-04-16 10:42:22 +02:00
|
|
|
LastItem().type = Item::closing;
|
|
|
|
SkipAndCheckClosingTag();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// closing tags are printed later
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::ReadItemSpecial()
|
|
|
|
{
|
|
|
|
LastItem().type = Item::special;
|
2018-10-24 01:28:45 +02:00
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !skip_tags )
|
2018-10-24 01:28:45 +02:00
|
|
|
PutOpeningTagMark();
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
const wchar_t * start = pchar;
|
2018-10-24 01:28:45 +02:00
|
|
|
pchar += 1; // skipping '!'
|
|
|
|
|
|
|
|
ReadItemName();
|
2011-04-16 10:42:22 +02:00
|
|
|
SkipAndCheckClosingTag();
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !skip_tags && pchar > start )
|
2011-04-16 10:42:22 +02:00
|
|
|
Put(start, pchar);
|
|
|
|
|
|
|
|
// closing tag mark is printed directly from the source
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemOpening()
|
|
|
|
{
|
|
|
|
LastItem().type = Item::opening;
|
|
|
|
ReadItemName();
|
2011-04-16 13:27:54 +02:00
|
|
|
|
|
|
|
if( PrintOpeningItem() )
|
2011-04-16 10:42:22 +02:00
|
|
|
{
|
2011-04-16 13:27:54 +02:00
|
|
|
while( ReadItemAttr() )
|
|
|
|
{
|
|
|
|
if( CheckItemAttr() )
|
2014-06-19 01:18:28 +02:00
|
|
|
PrintItemAttr();
|
2011-04-16 13:27:54 +02:00
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2011-04-16 13:27:54 +02:00
|
|
|
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
|
2014-06-19 01:18:28 +02:00
|
|
|
{
|
|
|
|
if( LastItem().type == Item::simple )
|
|
|
|
Put(L" /");
|
2011-04-16 10:42:22 +02:00
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
PutClosingTagMark();
|
|
|
|
}
|
2011-04-16 13:27:54 +02:00
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
// Called by ReadItem() each time an item (tag) has been read.
// This implementation intentionally does nothing.
// NOTE(review): presumably an extension point for derived classes -
// confirm against the class declaration.
void HTMLFilter::ItemFound()
{
}
|
|
|
|
|
2018-10-24 18:31:42 +02:00
|
|
|
// Notification about an entity delimited by 'str' and 'end'.
// This implementation intentionally does nothing and ignores both arguments.
// NOTE(review): the caller is not visible in this part of the file;
// presumably an extension point for derived classes - confirm.
void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
{
}
|
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
bool HTMLFilter::ReadItem()
|
|
|
|
{
|
|
|
|
if( *pchar == 0 )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if( !PushStack() )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pchar += 1; // skipping the first '<'
|
|
|
|
SkipWhiteLines();
|
|
|
|
|
|
|
|
if( *pchar == '!' )
|
|
|
|
ReadItemSpecial();
|
|
|
|
else
|
|
|
|
if( *pchar == '/' ) // we have a closing tag (dodac jako metode wirtualna) !!
|
|
|
|
ReadItemClosing();
|
|
|
|
else
|
|
|
|
ReadItemOpening();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
CheckNewLine();
|
|
|
|
LastItem().new_line = last_new_line;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2018-10-24 01:28:45 +02:00
|
|
|
ItemFound();
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
return true;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
// Converts an ASCII upper case letter ('A'-'Z') to lower case.
// Any other character is returned unchanged.
wchar_t HTMLFilter::ToLower(wchar_t c)
{
    const bool is_ascii_upper = (c >= 'A' && c <= 'Z');

    if (!is_ascii_upper)
        return c;

    return c - 'A' + 'a';
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::ToLower(std::wstring & str)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for(i=0 ; i<str.size() ; ++i)
|
|
|
|
str[i] = ToLower(str[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
|
|
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if( *name1==0 && *name2==0 )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// Overload: the second name is a std::wstring.
// Forwards to the version comparing two C strings.
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
{
    const wchar_t * second = name2.c_str();

    return IsNameEqual(name1, second);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Overload: the first name is a std::wstring.
// Forwards to the version comparing two C strings.
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
{
    const wchar_t * first = name1.c_str();

    return IsNameEqual(first, name2);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Overload: both names are std::wstring objects.
// Forwards to the version comparing two C strings.
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
{
    const wchar_t * first  = name1.c_str();
    const wchar_t * second = name2.c_str();

    return IsNameEqual(first, second);
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
// len characters from both strings must be equal
|
2018-10-24 18:31:42 +02:00
|
|
|
// IMPROVE ME change name to something like IsBeginningNameEqual
|
|
|
|
// and move to text.h (pikotools)
|
2010-11-21 01:19:17 +01:00
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
|
|
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if( len == 0 )
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// Overload: the second name is a std::wstring.
// Forwards to the version comparing the first 'len' characters of two C strings.
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
{
    const wchar_t * second = name2.c_str();

    return IsNameEqual(name1, second, len);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Overload: the first name is a std::wstring.
// Forwards to the version comparing the first 'len' characters of two C strings.
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
{
    const wchar_t * first = name1.c_str();

    return IsNameEqual(first, name2, len);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Overload: both names are std::wstring objects.
// Forwards to the version comparing the first 'len' characters of two C strings.
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
{
    const wchar_t * first  = name1.c_str();
    const wchar_t * second = name2.c_str();

    return IsNameEqual(first, second, len);
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsLastTag(const wchar_t * name)
|
|
|
|
{
|
|
|
|
return IsNameEqual(name, LastItem().name);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
bool HTMLFilter::IsLastTag(const std::wstring & name)
|
|
|
|
{
|
|
|
|
return IsNameEqual(name, LastItem().name);
|
|
|
|
}
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
// checking exceptions for opening tags
|
|
|
|
void HTMLFilter::CheckExceptions()
|
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
if( IsLastTag(L"meta") ||
|
|
|
|
IsLastTag(L"input") ||
|
|
|
|
IsLastTag(L"br") ||
|
|
|
|
IsLastTag(L"hr") ||
|
|
|
|
IsLastTag(L"img") ||
|
|
|
|
IsLastTag(L"link") ||
|
|
|
|
IsLastTag(L"param") ||
|
2011-12-04 20:34:39 +01:00
|
|
|
IsLastTag(L"col") ||
|
2010-11-21 01:19:17 +01:00
|
|
|
IsLastTag(L"area") )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
LastItem().type = Item::simple;
|
|
|
|
PopStack();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-06-30 20:42:50 +02:00
|
|
|
// in safe_mode the script tag is ignored
|
2010-11-21 01:19:17 +01:00
|
|
|
if( !safe_mode && IsLastTag(L"script") )
|
2018-10-24 18:31:42 +02:00
|
|
|
PutEverythingUntilClosingTag(!skip_tags);
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-11-21 01:19:17 +01:00
|
|
|
if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
|
2018-10-24 18:31:42 +02:00
|
|
|
PutEverythingUntilClosingTag(!skip_tags);
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2014-06-19 01:18:28 +02:00
|
|
|
if( IsLastTag(no_filter_tag) )
|
2018-10-24 01:28:45 +02:00
|
|
|
PutEverythingUntilClosingTag(false);
|
2014-06-19 01:18:28 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
if( IsLastTag(L"body") )
|
|
|
|
LastItem().has_body_tag = true;
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AddForgottenTags()
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if( stack_len < 3 )
|
|
|
|
return;
|
|
|
|
|
|
|
|
// we have forgotten to close some tags
|
|
|
|
|
|
|
|
// looking whether there is a matching opening tag
|
|
|
|
for(i=int(stack_len)-3 ; i>=0 ; --i)
|
|
|
|
if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
|
|
|
|
break;
|
|
|
|
|
|
|
|
if( i < 0 )
|
|
|
|
{
|
|
|
|
// oops, there is no such a tag
|
|
|
|
// we don't print the closing and the missing opening tag
|
|
|
|
PopStack();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for(int z=(int)stack_len-2 ; z>=i ; --z)
|
|
|
|
{
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !skip_tags && pstack[z].new_line )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
PutNewLine();
|
2010-06-21 00:47:24 +02:00
|
|
|
PutTabs(z);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
PutClosingTag(pstack[z].name.c_str());
|
2009-12-09 01:42:40 +01:00
|
|
|
pstack[z].Clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
last_new_line = pstack[stack_len-1].new_line;
|
|
|
|
|
|
|
|
// invalidate tags
|
|
|
|
stack_len = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
void HTMLFilter::CheckStackPrintRest()
|
|
|
|
{
|
|
|
|
while( stack_len-- > 0 )
|
|
|
|
{
|
|
|
|
if( stack_len==0 || pstack[stack_len-1].new_line )
|
|
|
|
PutNewLine();
|
|
|
|
|
|
|
|
PutTabs(stack_len);
|
2011-04-16 10:42:22 +02:00
|
|
|
PutClosingTag(pstack[stack_len].name.c_str());
|
2010-06-21 00:47:24 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
void HTMLFilter::CheckClosingTags()
|
|
|
|
{
|
|
|
|
if( stack_len == 0 )
|
|
|
|
return;
|
|
|
|
|
|
|
|
// on the stack we have only opening tags
|
|
|
|
// but only the last tag is a closing tag
|
|
|
|
|
|
|
|
if( stack_len == 1 )
|
|
|
|
{
|
|
|
|
// there is only last closing tag
|
|
|
|
// we dont print it
|
|
|
|
PopStack();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// there are more than one tag
|
|
|
|
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
|
|
|
|
{
|
|
|
|
// last closing tag is from the previous one
|
2018-10-24 18:31:42 +02:00
|
|
|
if( !skip_tags && pstack[stack_len-2].new_line )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
PutNewLine();
|
2010-06-21 00:47:24 +02:00
|
|
|
PutTabs(stack_len-2);
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
PutClosingTag(pstack[stack_len-1].name.c_str());
|
2009-12-09 01:42:40 +01:00
|
|
|
last_new_line = pstack[stack_len-1].new_line;
|
|
|
|
PopStack();
|
|
|
|
PopStack();
|
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
else
|
|
|
|
{
|
|
|
|
AddForgottenTags();
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
bool HTMLFilter::PrintRest()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-11-21 01:19:17 +01:00
|
|
|
const wchar_t * start = pchar;
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2010-08-14 19:56:07 +02:00
|
|
|
// in safe mode we do not print the rest html code
|
2018-10-24 18:31:42 +02:00
|
|
|
if( safe_mode || skip_tags )
|
2010-08-14 19:56:07 +02:00
|
|
|
return false;
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
while( *pchar )
|
|
|
|
++pchar;
|
|
|
|
|
|
|
|
if( pchar > start )
|
2010-06-21 00:47:24 +02:00
|
|
|
{
|
2009-12-09 01:42:40 +01:00
|
|
|
Put(start, pchar);
|
2010-06-21 00:47:24 +02:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-06-21 00:47:24 +02:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
void HTMLFilter::ReadLoop()
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
2010-06-21 00:47:24 +02:00
|
|
|
while( ReadItem() )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
if( LastItem().type == Item::opening )
|
|
|
|
{
|
|
|
|
CheckExceptions();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if( LastItem().type == Item::special || LastItem().type == Item::simple )
|
|
|
|
{
|
|
|
|
if( stack_len > 1 )
|
|
|
|
{
|
2018-10-24 01:28:45 +02:00
|
|
|
//pstack[stack_len-2].new_line = LastItem().new_line;
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
else
|
2010-06-21 00:47:24 +02:00
|
|
|
if( trim_white )
|
2009-12-09 01:42:40 +01:00
|
|
|
{
|
|
|
|
// one new line after a simple or special tag
|
|
|
|
// (if the tag has level 0 in the tree - it not means that this is a first tag)
|
|
|
|
// for example can be DOCTYPE
|
|
|
|
PutNewLine();
|
|
|
|
}
|
|
|
|
|
|
|
|
PopStack();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if( LastItem().type == Item::closing )
|
|
|
|
{
|
|
|
|
CheckClosingTags();
|
|
|
|
}
|
2018-10-24 01:28:45 +02:00
|
|
|
else
|
|
|
|
{
|
|
|
|
PopStack();
|
|
|
|
}
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
ReadNormalText();
|
|
|
|
}
|
2011-04-16 10:42:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Read()
|
|
|
|
{
|
|
|
|
if( trim_white )
|
|
|
|
SkipWhiteLines();
|
|
|
|
|
|
|
|
// it can be some text or white lines before the first html tag (we print it)
|
|
|
|
ReadNormalText();
|
|
|
|
|
|
|
|
// reading the whole html source
|
|
|
|
ReadLoop();
|
2009-12-09 01:42:40 +01:00
|
|
|
|
2011-04-16 10:42:22 +02:00
|
|
|
// sometimes there can remain some html source (when there is no space on the stack)
|
|
|
|
// we print the rest html without filtering (only if safe_mode is false)
|
2010-06-21 00:47:24 +02:00
|
|
|
if( !PrintRest() )
|
|
|
|
CheckStackPrintRest();
|
2009-12-09 01:42:40 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-02-12 17:30:49 +01:00
|
|
|
} // namespace Winix
|
2009-12-09 01:42:40 +01:00
|
|
|
|