// winix/winixd/core/htmlfilter.cpp
// (repository viewer listing: 1525 lines, 25 KiB, C++)

/*
* This file is a part of Winix
* and is distributed under the 2-Clause BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2008-2014, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include "htmlfilter.h"
namespace Winix
{
/*
 * Resets this stack item to its initial state:
 * empty name, type 'none', no orphans table attached
 * and both flags (new_line, has_body_tag) cleared.
 */
void HTMLFilter::Item::Clear()
{
    has_body_tag = false;
    new_line     = false;
    porphans     = 0;
    type         = none;
    name.clear();
}
/*
 * A freshly constructed item starts in the cleared state,
 * exactly as if Clear() had been called on it.
 */
HTMLFilter::Item::Item()
{
    this->Clear();
}
/*
 * Main entry point: filters the text pointed to by 'in' and stores
 * the result in 'out'.
 *
 * in  - input text, read through 'pchar' until a terminating 0
 * out - receives the result; its previous content is discarded
 *
 * The per-run state (read position, stack length, line length,
 * new-line flag) is reset first, then Init(), Read() and Uninit()
 * are called in this order.
 */
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
{
    // reset the state left by a previous run
    pchar         = in;
    stack_len     = 0;
    line_len      = 0;
    last_new_line = false;

    // all Put() methods write through out_string
    out.clear();
    out_string = &out;

    Init();
    Read();
    Uninit();
}
/*
 * Called by Filter() right before Read().
 * Empty in this implementation.
 * NOTE(review): presumably a hook for derived classes -- confirm in htmlfilter.h
 */
void HTMLFilter::Init()
{
}
/*
 * Called by Filter() right after Read().
 * Empty in this implementation.
 * NOTE(review): presumably a hook for derived classes -- confirm in htmlfilter.h
 */
void HTMLFilter::Uninit()
{
}
/*
 * Convenience overload taking the input as std::wstring.
 *
 * Before filtering, 'out' is given a capacity of at least
 * 2 * in.size() + 1 characters so that it does not have to grow
 * repeatedly while the result is being built.
 *
 * 'in' and 'out' should be different objects: the pointer overload
 * clears 'out' before the input is read.
 */
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
    const size_t wanted_capacity = 2 * in.size() + 1;

    if( wanted_capacity > out.capacity() )
        out.reserve(wanted_capacity);

    Filter(in.c_str(), out);
}
/*
 * Default constructor.
 *
 * Allocates the working storage (items stack and character buffer,
 * both released in the destructor) and sets the default options:
 * tab size 2, no white trimming, no word breaking, no line wrapping,
 * orphans mode 'orphan_nbsp', safe mode off.
 */
HTMLFilter::HTMLFilter()
{
    // working storage
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

    // default options
    safe_mode   = false;
    orphan_mode = orphan_nbsp;
    wrap_line   = 0;
    break_after = 0;
    trim_white  = false;
    tab_size    = 2;
}
/*
 * Copy constructor.
 *
 * The items stack and the character buffer are working storage used
 * only while Filter() is running, so their content is not copied;
 * the new object just gets its own freshly allocated storage.
 *
 * The configuration is copied from 'f'. Previously it was not, which
 * left the plain-typed options (tab_size, trim_white, break_after,
 * wrap_line, orphan_mode, safe_mode) uninitialized in the new object
 * and silently dropped no_filter_tag and the orphans tables.
 */
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
    // don't need to copy the stack
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

    // options
    tab_size      = f.tab_size;
    trim_white    = f.trim_white;
    break_after   = f.break_after;
    wrap_line     = f.wrap_line;
    orphan_mode   = f.orphan_mode;
    safe_mode     = f.safe_mode;
    no_filter_tag = f.no_filter_tag;
    orphans_tab   = f.orphans_tab;
}
/*
 * Copy assignment.
 *
 * This object already owns an items stack and a character buffer
 * (every constructor allocates them and they have a fixed size),
 * so they are kept as they are. Allocating new ones here, as was done
 * before, leaked the old arrays on every assignment.
 *
 * The stack content is working storage and is not copied;
 * only the configuration is taken from 'f'.
 */
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
    if( this != &f )
    {
        // options
        tab_size      = f.tab_size;
        trim_white    = f.trim_white;
        break_after   = f.break_after;
        wrap_line     = f.wrap_line;
        orphan_mode   = f.orphan_mode;
        safe_mode     = f.safe_mode;
        no_filter_tag = f.no_filter_tag;
        orphans_tab   = f.orphans_tab;
    }

    return *this;
}
/*
 * Releases the working storage allocated in the constructors:
 * the character buffer and the items stack.
 */
HTMLFilter::~HTMLFilter()
{
    delete [] buffer;
    delete [] pstack;
}
/*
 * Sets after how many non-white characters in a row a long word
 * is broken with a space (see PutNormalNonWhite()).
 *
 * break_after_ - number of characters; 0 turns word breaking off,
 *                values greater than 10000 are reduced to 10000
 */
void HTMLFilter::BreakWord(size_t break_after_)
{
    break_after = break_after_;

    if( break_after <= 10000 )
        return;

    // above the hard limit
    break_after = 10000;
}
/*
 * Sets the line length after which a new line is started
 * (see CheckLineWrap()).
 *
 * wrap_line_ - line length; 0 turns line wrapping off,
 *              values greater than 10000 are reduced to 10000
 */
void HTMLFilter::WrapLine(size_t wrap_line_)
{
    wrap_line = wrap_line_;

    if( wrap_line <= 10000 )
        return;

    // above the hard limit
    wrap_line = 10000;
}
void HTMLFilter::TrimWhite(bool trim)
{
trim_white = trim;
}
/*
 * Sets the tab size used for indenting the output.
 *
 * tabsize - values greater than 1000 are reduced to 1000
 *
 * NOTE(review): tab_size is presumably consumed by PutTabs(),
 * which is not defined in this part of the file -- confirm there.
 */
void HTMLFilter::InsertTabs(size_t tabsize)
{
    tab_size = tabsize;

    if( tab_size <= 1000 )
        return;

    // above the hard limit
    tab_size = 1000;
}
/*
 * Stores in orphans.max_len the length of the longest word
 * from orphans.tab (0 if the table is empty).
 */
void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
{
    size_t longest = 0;
    const size_t count = orphans.tab.size();

    for(size_t idx = 0 ; idx < count ; ++idx)
    {
        const size_t len = orphans.tab[idx].size();

        if( len > longest )
            longest = len;
    }

    orphans.max_len = longest;
}
/*
 * Registers a table of orphans for a given language.
 *
 * lang_code - language code; it is lower-cased before being used as the key
 * otab      - list of orphans; a sorted copy is stored (CheckOrphan()
 *             does a binary search) together with the length of the longest word
 *
 * A table already registered under the same code is replaced.
 * The members orphans_temp and lang_code_lower are used as scratch objects.
 */
void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
{
    // prepare the table: copy, sort, find the longest word
    orphans_temp.tab = otab;
    std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
    CalcOrphansMaxLen(orphans_temp);

    // prepare the key
    lang_code_lower = lang_code;
    ToLower(lang_code_lower);

    orphans_tab[lang_code_lower] = orphans_temp;
}
/*
 * The same as AssignOrphans(const wchar_t *, ...) but the language code
 * is given as std::wstring.
 */
void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
    const wchar_t * code = lang_code.c_str();
    AssignOrphans(code, otab);
}
void HTMLFilter::ClearOrphans()
{
orphans_tab.clear();
}
/*
 * Selects the orphans mode (the constructor sets orphan_nbsp).
 * NOTE(review): the mode is presumably read by PutNonBreakingSpace(),
 * which is not defined in this part of the file -- confirm there.
 */
void HTMLFilter::OrphansMode(HTMLFilter::OrphanMode mode)
{
    this->orphan_mode = mode;
}
void HTMLFilter::SafeMode(bool safe_mode_)
{
safe_mode = safe_mode_;
}
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
{
no_filter_tag = tag_name;
}
/*
 * Returns the stack item with index 'i'.
 * If 'i' is outside of the stack, the shared 'empty' item is cleared
 * and returned instead, so the call is always safe.
 */
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
    if( i < stack_len )
        return pstack[i];

    // out of range: hand out a clean dummy item
    empty.Clear();

    return empty;
}
/*
 * Returns the item from the top of the stack.
 * If the stack is empty, the shared 'empty' item is cleared
 * and returned instead, so the call is always safe.
 */
HTMLFilter::Item & HTMLFilter::LastItem()
{
    if( stack_len != 0 )
        return pstack[stack_len - 1];

    // nothing on the stack: hand out a clean dummy item
    empty.Clear();

    return empty;
}
bool HTMLFilter::PushStack()
{
if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
// oops, too many items
return false;
pstack[stack_len].Clear();
if( stack_len > 0 )
{
// 'porphans' and 'has_body_tag' attributes are propagated
pstack[stack_len].porphans = pstack[stack_len-1].porphans;
pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
}
stack_len += 1;
return true;
}
/*
 * Removes the item from the top of the stack and clears its slot.
 * Does nothing when the stack is empty.
 */
void HTMLFilter::PopStack()
{
    if( stack_len != 0 )
    {
        --stack_len;
        pstack[stack_len].Clear();
    }
}
/*
 * Returns true if 'c' is a white character: a space, a tab,
 * a carriage return (13) or a non-breaking space (160).
 *
 * The line feed (10) is deliberately not tested here (don't add it);
 * callers which want it compare with 10 on their own.
 */
bool HTMLFilter::IsWhite(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 13:
    case 160:
        return true;

    default:
        return false;
    }
}
/*
 * Moves the read position past white characters (see IsWhite()).
 * It stops at a line feed and at the terminating 0.
 */
void HTMLFilter::SkipWhite()
{
    for( ; IsWhite(*pchar) ; ++pchar )
    {
    }
}
/*
 * Moves the read position past white characters and line feeds (10).
 * It stops at the terminating 0.
 */
void HTMLFilter::SkipWhiteLines()
{
    for( ; IsWhite(*pchar) || *pchar == 10 ; ++pchar )
    {
    }
}
/*
 * Skips white characters, then at most one line feed (10),
 * then the white characters following that line feed.
 */
void HTMLFilter::SkipWhiteWithFirstNewLine()
{
    SkipWhite();

    if( *pchar != 10 )
        return;

    ++pchar;
    SkipWhite();
}
/*
 * Advances 'str' past white characters and line feeds (10),
 * never going beyond 'end'.
 *
 * str - in/out: position in the text, updated in place
 * end - one past the last character which may be read
 */
void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end)
{
    for( ; str < end ; ++str )
    {
        if( *str != 10 && !IsWhite(*str) )
            break;
    }
}
/*
 * Sets 'last_new_line' to true if there are only white characters
 * between the read position and the nearest line feed (10).
 * The read position itself is not changed.
 */
void HTMLFilter::CheckNewLine()
{
    const wchar_t * const saved = pchar;

    SkipWhite();
    const bool at_line_feed = (*pchar == 10);

    // look-ahead only: restore the position
    pchar = saved;
    last_new_line = at_line_feed;
}
bool HTMLFilter::IsClosingTagForLastItem()
{
pchar += 1;
SkipWhite();
if( *pchar == '/' )
{
pchar += 1;
SkipWhite();
if( IsNameEqual(pchar, LastItem().name, LastItem().name.size()) )
{
pchar += LastItem().name.size();
SkipWhite();
if( IsClosingTagMark() )
{
pchar += 1;
return true;
}
}
}
return false;
}
/*
 * Used for such tags as: script, pre, textarea.
 *
 * Copies the input to the output unchanged, starting at the read position
 * and ending just after the closing tag of the item from the top of
 * the stack (the closing tag itself is copied as well), or at the end
 * of the input if there is no such closing tag.
 *
 * When the closing tag is found the item is popped from the stack and
 * 'last_new_line' is refreshed with CheckNewLine().
 */
void HTMLFilter::PutLastTagWithClosingTag()
{
    const wchar_t * const begin = pchar;
    bool closed = false;

    while( !closed && *pchar != 0 )
    {
        if( !IsOpeningTagMark() )
        {
            ++pchar;
            continue;
        }

        // IsClosingTagForLastItem() always moves pchar forward,
        // also when it returns false
        if( IsClosingTagForLastItem() )
        {
            PopStack();
            CheckNewLine();
            closed = true;
        }
    }

    Put(begin, pchar);
}
/*
 * Used with <nofilter> </nofilter> tags.
 *
 * Copies the input to the output unchanged, starting at the read position
 * and ending just before the closing tag of the item from the top of
 * the stack. Unlike in PutLastTagWithClosingTag() the closing tag itself
 * is consumed but not copied.
 *
 * When the closing tag is found the item is popped from the stack and
 * 'last_new_line' is refreshed with CheckNewLine().
 */
void HTMLFilter::PutTextBetweenLastTagWithClosingTag()
{
    const wchar_t * const begin = pchar;
    const wchar_t * text_end = pchar;
    bool closed = false;

    while( !closed && *pchar != 0 )
    {
        if( IsOpeningTagMark() )
        {
            // text_end is not moved here so that a matched
            // closing tag stays outside of the copied range
            if( IsClosingTagForLastItem() )
            {
                PopStack();
                CheckNewLine();
                closed = true;
            }
        }
        else
        {
            ++pchar;
            text_end = pchar;
        }
    }

    Put(begin, text_end);
}
/*
 * Skips the rest of the current tag: moves the read position just after
 * the nearest closing tag mark (or to the end of the input).
 *
 * If on the way an xml 'simple tag' mark is met (default '/')
 * and the item from the top of the stack is an opening tag,
 * the type of the item is changed to Item::simple.
 */
void HTMLFilter::SkipAndCheckClosingTag()
{
    while( *pchar )
    {
        if( LastItem().type == Item::opening )
        {
            // closing xml tag: default '/'
            if( IsClosingXmlSimpleTagMark() )
                LastItem().type = Item::simple;
        }

        if( IsClosingTagMark() )
        {
            ++pchar;
            return;
        }

        ++pchar;
    }
}
/*
 * Returns true if 'c' can be a part of a tag name:
 * an ASCII letter, a digit, '-' or '!'.
 */
bool HTMLFilter::IsValidCharForName(int c)
{
    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
    const bool is_digit  = (c>='0' && c<='9');

    return is_letter || is_digit || c=='-' || c=='!';
}
/*
 * Returns true if 'c' can be a part of an attribute name:
 * an ASCII letter, a digit, '-' or ':'.
 */
bool HTMLFilter::IsValidCharForAttrName(int c)
{
    const bool is_letter = (c>='a' && c<='z') || (c>='A' && c<='Z');
    const bool is_digit  = (c>='0' && c<='9');

    return is_letter || is_digit || c=='-' || c==':';
}
/*
 * Reads a tag name from the read position and appends it to the name
 * of the item from the top of the stack.
 *
 * All characters valid for a name are consumed but only the first
 * WINIX_HTMLFILTER_ITEM_NAME_MAXLEN of them are stored.
 */
void HTMLFilter::ReadItemName()
{
    size_t consumed = 0;

    while( IsValidCharForName(*pchar) )
    {
        if( consumed < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
            LastItem().name += *pchar;

        ++pchar;
        ++consumed;
    }
}
/*
 * Reads an attribute name from the read position into 'attr_name'
 * (the previous content of 'attr_name' is discarded).
 *
 * All characters valid for an attribute name are consumed but only
 * the first WINIX_HTMLFILTER_ATTR_NAME_MAXLEN of them are stored.
 */
void HTMLFilter::ReadItemAttrName()
{
    size_t consumed = 0;

    attr_name.clear();

    while( *pchar && IsValidCharForAttrName(*pchar) )
    {
        if( consumed < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
            attr_name += *pchar;

        ++pchar;
        ++consumed;
    }
}
void HTMLFilter::ReadItemAttrValue(bool has_quote)
{
size_t i;
// sprawdzic to wszedzie bo teraz jest tablica
attr_value.clear();
attr_value_temp.clear();
// !! dodac obsluge pojedynczego cudzyslowu
for(i=0 ; *pchar && *pchar != '\"' && !IsClosingTagMark() && (has_quote || (*pchar!=10 && !IsWhite(*pchar)) ); ++i )
{
if( *pchar==10 || IsWhite(*pchar) )
{
if( !attr_value_temp.empty() )
{
attr_value.push_back(attr_value_temp);
attr_value_temp.clear();
}
}
else
if( i < WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
attr_value_temp += *pchar;
++pchar;
}
if( !attr_value_temp.empty() )
{
attr_value.push_back(attr_value_temp);
attr_value_temp.clear();
}
}
/*
 * Keeps 'line_len' (the length of the current output line) up to date:
 * a line feed (10) resets it to zero, any other character increments it.
 * Called for every character put to the output.
 */
void HTMLFilter::CheckChar(wchar_t c)
{
    line_len = (c == 10) ? 0 : line_len + 1;
}
/*
 * Appends one character to the output string
 * and updates the current line length.
 */
void HTMLFilter::Put(wchar_t c)
{
    std::wstring & out = *out_string;

    out += c;
    CheckChar(c);
}
/*
 * Appends a null-terminated string to the output string
 * and updates the current line length.
 */
void HTMLFilter::Put(const wchar_t * str)
{
    out_string->append(str);

    while( *str )
    {
        CheckChar(*str);
        ++str;
    }
}
/*
 * Appends the characters from the range [str, end) to the output string
 * and updates the current line length.
 * Does nothing when the range is empty or reversed (str >= end).
 */
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
{
    if( str < end )
    {
        out_string->append(str, size_t(end - str));

        while( str < end )
        {
            CheckChar(*str);
            ++str;
        }
    }
}
/*
 * Appends a string to the output string
 * and updates the current line length.
 */
void HTMLFilter::Put(const std::wstring & str)
{
    out_string->append(str);

    size_t idx = 0;

    while( idx < str.size() )
    {
        CheckChar(str[idx]);
        ++idx;
    }
}
/*
 * Case-insensitive, three-way comparison of the word [str, end)
 * with 'orphan_str' (both sides are passed through ToLower()).
 *
 * Returns:
 *   0   - the word and orphan_str are equal
 *   < 0 - the word is lower than orphan_str (or is its proper prefix)
 *   > 0 - the word is greater than orphan_str (or orphan_str is its proper prefix)
 *
 * The difference is kept in a signed int for the whole time. Previously
 * it went through an unsigned size_t and was narrowed back to int on return,
 * which gives the right sign only thanks to implementation-defined behaviour.
 */
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
{
    const wchar_t * orphan = orphan_str.c_str();

    for( ; str<end && *orphan!=0 ; ++str, ++orphan )
    {
        const int res = int(ToLower(*str)) - int(ToLower(*orphan));

        if( res != 0 )
            return res;
    }

    // orphan_str is exhausted but the word is not: the word is greater
    if( str < end )
        return int(ToLower(*str));

    // the word is exhausted: zero if orphan_str is exhausted too,
    // a negative value otherwise
    return -int(ToLower(*orphan));
}
/*
 * Binary search in a table (the table should be sorted).
 *
 * Returns true if the word [str, end) is equal (case-insensitively,
 * see the three-way CheckOrphan()) to one of the entries of 'table'.
 *
 * The first and the last entry are tested up front; the loop then
 * narrows the range (low, high) in which the word has to lie.
 */
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
{
    if( table.empty() )
        return false;

    size_t low  = 0;
    size_t high = table.size() - 1;

    // before the first entry (or equal to it)?
    int cmp = CheckOrphan(str, end, table[low]);

    if( cmp <= 0 )
        return cmp == 0;

    // after the last entry (or equal to it)?
    cmp = CheckOrphan(str, end, table[high]);

    if( cmp >= 0 )
        return cmp == 0;

    // here: table[low] < word < table[high]
    while( low + 1 < high )
    {
        const size_t mid = (low + high) / 2;
        cmp = CheckOrphan(str, end, table[mid]);

        if( cmp == 0 )
            return true;

        if( cmp < 0 )
            high = mid;
        else
            low = mid;
    }

    return false;
}
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
{
if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
return false;
size_t len = end - str;
if( len > LastItem().porphans->max_len )
return false;
return CheckOrphan(str, end, LastItem().porphans->tab);
}
/*
 * If there is a semicolon nearby then we break the line after it
 * (useful in html entities).
 *
 * Returns true if there is a ';' among the first 8 characters
 * of the range [str, end).
 *
 * TODO: also check whether the length of the string (end-str) is small
 *       and in such a case don't add any character either
 */
bool HTMLFilter::HasSemiloconAround(const wchar_t * str, const wchar_t * end)
{
    const size_t epsilon = 8;
    size_t checked = 0;

    while( str < end && checked < epsilon )
    {
        if( *str == ';' )
            return true;

        ++str;
        ++checked;
    }

    return false;
}
/*
 * Starts a new output line (a line feed followed by tabs for the current
 * stack depth) if all of the following hold:
 *  - line wrapping is on (wrap_line != 0),
 *  - the item from the top of the stack is inside the body tag,
 *  - the current line is longer than wrap_line.
 */
void HTMLFilter::CheckLineWrap()
{
    if( wrap_line == 0 )
        return;

    if( !LastItem().has_body_tag )
        return;

    if( line_len <= wrap_line )
        return;

    Put(10);
    PutTabs(stack_len);
}
/*
 * Puts one word to the output: the characters from 'str' up to the first
 * white character, line feed or 'end'.
 *
 * str - in/out: on return it points just after the word
 * end - one past the last character which may be read
 *
 * If word breaking is on (break_after != 0), a space is inserted after each
 * 'break_after' characters of the word (followed by CheckLineWrap()).
 * The break is postponed while a semicolon is nearby (HasSemiloconAround())
 * unless the previous character was a semicolon, so that html entities
 * are not split in the middle.
 */
void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end)
{
    const wchar_t * chunk = str;   // beginning of the not yet written part
    size_t run_len = 0;            // characters since the last break
    bool prev_semicolon = false;

    while( str < end && *str!=10 && !IsWhite(*str) )
    {
        const bool too_long = (break_after != 0 && run_len >= break_after);

        if( too_long && (prev_semicolon || !HasSemiloconAround(str, end)) )
        {
            Put(chunk, str);
            chunk   = str;
            run_len = 0;

            Put(' ');
            CheckLineWrap();
        }

        prev_semicolon = (*str == ';');

        ++str;
        ++run_len;
    }

    Put(chunk, str);
}
/*
 * Handles a run of white characters (line feeds included)
 * beginning at 'str'.
 *
 * str - in/out: on return it points just after the white characters
 * end - one past the last character which may be read
 *
 * With white trimming on, a single space is put to the output and the
 * white characters of the input are skipped (the space is put even if
 * there is no white character at 'str', provided that str < end).
 * With white trimming off, the white characters are copied as they are
 * and tabs for the current stack depth are put after every line feed.
 */
void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end)
{
    if( str >= end )
        return;

    if( trim_white )
    {
        Put(' ');
        SkipWhiteLines(str, end);
        return;
    }

    for( ; str < end && (*str==10 || IsWhite(*str)) ; ++str )
    {
        Put(*str);

        if( *str == 10 )
            PutTabs(stack_len);
    }
}
void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
{
const wchar_t * word, * white;
if( str < end )
CheckLineWrap();
while( str < end )
{
word = str;
PutNormalNonWhite(str, end);
if( CheckOrphan(word, str) )
{
white = str;
SkipWhiteLines(str, end);
if( white < str )
PutNonBreakingSpace();
}
else
{
PutNormalWhite(str, end);
if( str < end ) // !! lub moze podobnie jak jest na gorze tutaj? juz nie mam sily myslec :(
CheckLineWrap();