1228 lines
20 KiB
C++
Executable File
1228 lines
20 KiB
C++
Executable File
/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2010, Tomasz Sowa
 * All rights reserved.
 *
 */
|
||
|
||
#include "htmlfilter.h"
|
||
|
||
|
||
|
||
void HTMLFilter::Item::Clear()
|
||
{
|
||
name[0] = 0;
|
||
name_len = 0;
|
||
type = none;
|
||
new_line = false;
|
||
}
|
||
|
||
|
||
// A freshly constructed item starts in the same state Clear() produces.
HTMLFilter::Item::Item()
{
	this->Clear();
}
|
||
|
||
|
||
|
||
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
|
||
{
|
||
pchar = in;
|
||
stack_len = 0;
|
||
out_string = &out;
|
||
last_new_line = false;
|
||
out_string->clear();
|
||
|
||
Init();
|
||
Read();
|
||
Deinit();
|
||
}
|
||
|
||
|
||
|
||
// Called by Filter() just before Read(); currently does nothing.
void HTMLFilter::Init()
{
	// intentionally empty
}
|
||
|
||
|
||
// Called by Filter() right after Read(); currently does nothing.
void HTMLFilter::Deinit()
{
	// intentionally empty
}
|
||
|
||
|
||
|
||
// Convenience overload: filters the text held in 'in' and stores the result in 'out'.
// Space for twice the input size (plus one) is reserved in 'out' up front,
// then the work is delegated to the pointer based overload.
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
{
	const size_t expected_size = in.size() * 2 + 1;
	out.reserve(expected_size);

	const wchar_t * text = in.c_str();
	Filter(text, out);
}
|
||
|
||
|
||
// Default constructor: allocates the tag stack and the work buffer
// and sets the default configuration.
HTMLFilter::HTMLFilter()
{
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	// defaults
	safe_mode   = false;
	orphan_mode = orphan_nbsp;
	lang        = lang_none;   // orphans are not checked
	break_after = 0;           // long words are not broken
	trim_white  = false;
	tab_size    = 2;
}
|
||
|
||
|
||
// Copy constructor.
// The tag stack and the work buffer are not copied (each object gets
// its own freshly allocated ones), only the configuration is taken from 'f'.
HTMLFilter::HTMLFilter(const HTMLFilter & f)
{
	// don't need to copy the stack
	pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
	buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

	// the settings were previously left uninitialized in the copy
	tab_size    = f.tab_size;
	trim_white  = f.trim_white;
	break_after = f.break_after;
	lang        = f.lang;
	orphan_mode = f.orphan_mode;
	safe_mode   = f.safe_mode;
}
|
||
|
||
|
||
// Assignment operator.
// Only the configuration is taken from 'f'. The tag stack and the work buffer
// already exist (they are allocated with a fixed size by every constructor),
// so they are kept as they are; allocating new ones here, as was done before,
// leaked the previous arrays.
// Self assignment is harmless.
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
{
	// don't need to copy the stack

	tab_size    = f.tab_size;
	trim_white  = f.trim_white;
	break_after = f.break_after;
	lang        = f.lang;
	orphan_mode = f.orphan_mode;
	safe_mode   = f.safe_mode;

	return *this;
}
|
||
|
||
|
||
// Releases the work buffer and the tag stack allocated by the constructors.
HTMLFilter::~HTMLFilter()
{
	delete [] buffer;
	delete [] pstack;
}
|
||
|
||
|
||
void HTMLFilter::BreakLines(size_t break_after_)
|
||
{
|
||
break_after = break_after_;
|
||
|
||
if( break_after > 10000 )
|
||
break_after = 10000;
|
||
}
|
||
|
||
|
||
void HTMLFilter::TrimWhite(bool trim)
|
||
{
|
||
trim_white = trim;
|
||
}
|
||
|
||
|
||
void HTMLFilter::InsertTabs(size_t tabsize)
|
||
{
|
||
tab_size = tabsize;
|
||
|
||
if( tab_size > 1000 )
|
||
tab_size = 1000;
|
||
}
|
||
|
||
|
||
|
||
// Selects the language whose orphan table is used (lang_none turns the check off)
// and the way a non-breaking space is written to the output.
void HTMLFilter::CheckOrphans(HTMLFilter::Lang lang_, HTMLFilter::OrphanMode mode)
{
	orphan_mode = mode;
	lang        = lang_;
}
|
||
|
||
|
||
void HTMLFilter::SafeMode(bool safe_mode_)
|
||
{
|
||
safe_mode = safe_mode_;
|
||
}
|
||
|
||
|
||
|
||
// Returns the i-th item from the stack.
// If the index is out of range a cleared 'empty' item is returned instead.
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
{
	if( i < stack_len )
		return pstack[i];

	empty.Clear();

	return empty;
}
|
||
|
||
|
||
// Returns the item from the top of the stack.
// If the stack is empty a cleared 'empty' item is returned instead.
HTMLFilter::Item & HTMLFilter::LastItem()
{
	if( stack_len != 0 )
		return pstack[stack_len-1];

	empty.Clear();

	return empty;
}
|
||
|
||
|
||
bool HTMLFilter::PushStack()
|
||
{
|
||
if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
|
||
// oops, too many items
|
||
return false;
|
||
|
||
pstack[stack_len].Clear();
|
||
stack_len += 1;
|
||
|
||
return true;
|
||
}
|
||
|
||
void HTMLFilter::PopStack()
|
||
{
|
||
if( stack_len == 0 )
|
||
// oops
|
||
return;
|
||
|
||
stack_len -= 1;
|
||
pstack[stack_len].Clear();
|
||
}
|
||
|
||
|
||
// Returns true for a space, a tab, the character 13 and the character 160.
// The new line character (10) is deliberately not treated as white here,
// the callers test it separately.
bool HTMLFilter::IsWhite(int c)
{
	switch( c )
	{
	case ' ':
	case '\t':
	case 13:
	case 160:
		return true;

	default:
		return false;
	}
}
|
||
|
||
|
||
void HTMLFilter::SkipWhite()
|
||
{
|
||
while( IsWhite(*pchar) )
|
||
++pchar;
|
||
}
|
||
|
||
|
||
void HTMLFilter::SkipWhiteLines()
|
||
{
|
||
while( *pchar==10 || IsWhite(*pchar) )
|
||
++pchar;
|
||
}
|
||
|
||
|
||
void HTMLFilter::SkipWhiteWithFirstNewLine()
|
||
{
|
||
SkipWhite();
|
||
|
||
if( *pchar == 10 )
|
||
{
|
||
pchar += 1;
|
||
SkipWhite();
|
||
}
|
||
}
|
||
|
||
|
||
|
||
|
||
void HTMLFilter::CheckNewLine()
|
||
{
|
||
const wchar_t * start = pchar;
|
||
|
||
SkipWhite();
|
||
last_new_line = (*pchar==10);
|
||
|
||
pchar = start;
|
||
}
|
||
|
||
|
||
|
||
|
||
bool HTMLFilter::IsClosingTagForLastItem()
|
||
{
|
||
pchar += 1;
|
||
SkipWhite();
|
||
|
||
if( *pchar == '/' )
|
||
{
|
||
pchar += 1;
|
||
SkipWhite();
|
||
|
||
if( IsNameEqual(pchar, LastItem().name, LastItem().name_len) )
|
||
{
|
||
pchar += LastItem().name_len;
|
||
SkipWhite();
|
||
|
||
if( IsClosingTagMark() )
|
||
{
|
||
pchar += 1;
|
||
return true;
|
||
}
|
||
}
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
|
||
|
||
// used for such tags as: script, pre, textarea
|
||
void HTMLFilter::PutLastTagWithClosingTag()
|
||
{
|
||
const wchar_t * start = pchar;
|
||
|
||
|
||
while( *pchar != 0 )
|
||
{
|
||
if( IsOpeningTagMark() )
|
||
{
|
||
if( IsClosingTagForLastItem() )
|
||
{
|
||
PopStack();
|
||
CheckNewLine();
|
||
break;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
pchar += 1;
|
||
}
|
||
}
|
||
|
||
Put(start, pchar);
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
const wchar_t * HTMLFilter::SkipItemCheckXmlSimple()
|
||
{
|
||
const wchar_t * end = pchar;
|
||
|
||
|
||
while( *pchar!=0 )
|
||
{
|
||
while( *pchar!=0 && !IsClosingTagMark() && !IsClosingXmlSimpleTagMark())
|
||
++pchar;
|
||
|
||
if( IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/'
|
||
{
|
||
end = pchar;
|
||
++pchar;
|
||
SkipWhite();
|
||
|
||
if( IsClosingTagMark() )
|
||
{
|
||
++pchar;
|
||
LastItem().type = Item::simple;
|
||
break;
|
||
}
|
||
}
|
||
else
|
||
if( IsClosingTagMark() )
|
||
{
|
||
end = pchar;
|
||
++pchar;
|
||
break;
|
||
}
|
||
}
|
||
|
||
return end;
|
||
}
|
||
|
||
|
||
// Returns true if 'c' can be a part of a tag name:
// an ascii letter, a digit, '-' or '!'.
bool HTMLFilter::IsValidCharForName(int c)
{
	const bool lower_letter = (c>='a' && c<='z');
	const bool upper_letter = (c>='A' && c<='Z');
	const bool digit        = (c>='0' && c<='9');

	return lower_letter || upper_letter || digit || c=='-' || c=='!';
}
|
||
|
||
|
||
|
||
void HTMLFilter::ReadItemName()
|
||
{
|
||
size_t i;
|
||
|
||
for( i=0 ; IsValidCharForName(*pchar) && i<WINIX_HTMLFILTER_ITEM_MAXLEN-1 ; ++i )
|
||
{
|
||
LastItem().name[i] = *pchar;
|
||
++pchar;
|
||
}
|
||
|
||
LastItem().name[i] = 0;
|
||
LastItem().name_len = i;
|
||
}
|
||
|
||
|
||
|
||
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
if( str>=end )
|
||
return;
|
||
|
||
size_t len = end - str;
|
||
out_string->append(str, len);
|
||
}
|
||
|
||
|
||
|
||
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const wchar_t * orphan)
|
||
{
|
||
size_t res;
|
||
|
||
for( ; str<end && *orphan!=0 ; ++str, ++orphan )
|
||
{
|
||
res = ToLower(*str) - ToLower(*orphan);
|
||
|
||
if( res != 0 )
|
||
return res;
|
||
}
|
||
|
||
if( str < end )
|
||
return ToLower(*str);
|
||
|
||
return -int(ToLower(*orphan));
|
||
}
|
||
|
||
|
||
|
||
// binary search in table
|
||
// o1 - index of the first element
|
||
// o2 - index of the last element
|
||
bool HTMLFilter::CheckOrphanTable(const wchar_t * str, const wchar_t * end, const wchar_t ** table, size_t o1, size_t o2)
|
||
{
|
||
int res;
|
||
|
||
res = CheckOrphan(str, end, table[o1]);
|
||
|
||
if( res == 0 )
|
||
return true;
|
||
|
||
if( res < 0 )
|
||
return false;
|
||
|
||
res = CheckOrphan(str, end, table[o2]);
|
||
|
||
if( res == 0 )
|
||
return true;
|
||
|
||
if( res > 0 )
|
||
return false;
|
||
|
||
|
||
while( o1 + 1 < o2 )
|
||
{
|
||
size_t o = (o1 + o2) / 2;
|
||
res = CheckOrphan(str, end, table[o]);
|
||
|
||
if( res == 0 )
|
||
return true;
|
||
|
||
if( res < 0 )
|
||
o2 = o;
|
||
else
|
||
o1 = o;
|
||
}
|
||
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
bool HTMLFilter::CheckOrphanLangPl(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
// the table must be sorted in alphabetical order
|
||
// polish letters coded in iso-8859-2
|
||
// !! wymieni<6E> na unikode
|
||
|
||
/*
|
||
tak jak bylo oryginalnie (tylko bez L):
|
||
static const wchar_t * orphans[] = {
|
||
L"(np.", L"s.", L"a", L"ale", L"bo", L"by", L"co", L"czy", L"do", L"go", L"i",
|
||
L"ich", L"ja", L"je", L"jej", L"jest", L"ju<6A>", L"j<>", L"ku", L"li", L"mi", L"na",
|
||
L"nie", L"np.", L"nr", L"o", L"od", L"po", L"s<>", L"ta", L"to", L"tu", L"t<>",
|
||
L"t<>", L"u", L"w", L"we", L"wy", L"z", L"za", L"ze", L"<22>e", L"<22>w"
|
||
};
|
||
*/
|
||
|
||
static const wchar_t * orphans[] = {
|
||
L"(np.", L"s.", L"a", L"ale", L"bo", L"by", L"co", L"czy", L"do", L"go", L"i",
|
||
L"ich", L"ja", L"je", L"jej", L"jest", L"juz", L"ja", L"ku", L"li", L"mi", L"na",
|
||
L"nie", L"np.", L"nr", L"o", L"od", L"po", L"sa", L"ta", L"to", L"tu", L"ta",
|
||
L"te", L"u", L"w", L"we", L"wy", L"z", L"za", L"ze", L"ze", L"ow"
|
||
};
|
||
|
||
|
||
|
||
size_t o1 = 0;
|
||
size_t o2 = sizeof(orphans) / sizeof(const wchar_t*) - 1;
|
||
|
||
return CheckOrphanTable(str, end, orphans, o1, o2);
|
||
}
|
||
|
||
|
||
|
||
// SK i CZ
|
||
bool HTMLFilter::CheckOrphanLangCz(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
// the table must be sorted in alphabetical order
|
||
static const wchar_t * orphans[] = {
|
||
L"a", L"i", L"k", L"o", L"s", L"u", L"v", L"z"
|
||
};
|
||
|
||
size_t o1 = 0;
|
||
size_t o2 = sizeof(orphans) / sizeof(const wchar_t*) - 1;
|
||
|
||
return CheckOrphanTable(str, end, orphans, o1, o2);
|
||
}
|
||
|
||
|
||
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
if( str == end || lang == lang_none )
|
||
return false;
|
||
|
||
if( lang == lang_cz || lang == lang_sk )
|
||
return CheckOrphanLangCz(str, end);
|
||
|
||
return CheckOrphanLangPl(str, end);
|
||
}
|
||
|
||
|
||
|
||
size_t HTMLFilter::PutNormalTextFillBuffer(const wchar_t * & str, const wchar_t * & end)
|
||
{
|
||
const wchar_t * word = str; // pointing at the beginning of a word
|
||
size_t i = 0;
|
||
// some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
|
||
size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;
|
||
bool is_white;
|
||
bool was_white = true;
|
||
size_t non_whites = 0;
|
||
|
||
|
||
for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
|
||
{
|
||
is_white = (*str==10 || IsWhite(*str));
|
||
|
||
if( is_white && !was_white )
|
||
{
|
||
if( CheckOrphan(word, str) )
|
||
{
|
||
i += PutNonBreakSpaceToBuffer(i);
|
||
|
||
// here we have to skip the whole white string
|
||
for( ; (*str==10 || IsWhite(*str)) && str < end ; ++str );
|
||
|
||
if( str == end )
|
||
break;
|
||
|
||
is_white = false;
|
||
was_white = true;
|
||
}
|
||
}
|
||
|
||
|
||
// skipping the last new line character (if exists)
|
||
|
||
if( *str == 10 && str < end-1 )
|
||
{
|
||
buffer[i] = *str;
|
||
i += 1;
|
||
i += PutTabsToBuffer(i, stack_len);
|
||
}
|
||
else
|
||
if( *str != 10 )
|
||
{
|
||
buffer[i] = *str;
|
||
i += 1;
|
||
}
|
||
|
||
if( was_white && !is_white )
|
||
word = str;
|
||
|
||
if( !is_white )
|
||
non_whites += 1;
|
||
else
|
||
non_whites = 0;
|
||
|
||
if( break_after!=0 && non_whites>=break_after )
|
||
{
|
||
buffer[i] = ' ';
|
||
i += 1;
|
||
non_whites = 0;
|
||
}
|
||
|
||
was_white = is_white;
|
||
}
|
||
|
||
return i;
|
||
}
|
||
|
||
|
||
size_t HTMLFilter::PutNormalTextTrimFillBuffer(const wchar_t * & str, const wchar_t * & end)
|
||
{
|
||
const wchar_t * word = str; // pointint at the beginning of a word
|
||
size_t non_whites = 0;
|
||
size_t i = 0;
|
||
bool is_white;
|
||
// some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line
|
||
size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1;
|
||
|
||
|
||
for( ; str < end && i<WINIX_HTMLFILTER_BUFFER_MAXLEN-epsilon ; ++str )
|
||
{
|
||
is_white = (*str==10 || IsWhite(*str));
|
||
|
||
if( is_white )
|
||
{
|
||
if( CheckOrphan(word, str) )
|
||
{
|
||
i += PutNonBreakSpaceToBuffer(i);
|
||
is_white = false;
|
||
}
|
||
else
|
||
{
|
||
non_whites = 0;
|
||
}
|
||
|
||
// skipping the whole white string
|
||
for( ; (*str==10 || IsWhite(*str)) && str < end ; ++str );
|
||
|
||
word = str;
|
||
}
|
||
|
||
if( !is_white )
|
||
non_whites += 1;
|
||
else
|
||
non_whites = 0;
|
||
|
||
if( (break_after!=0 && non_whites>break_after) || is_white )
|
||
{
|
||
buffer[i] = ' ';
|
||
i += 1;
|
||
non_whites = 1;
|
||
}
|
||
|
||
if( str < end )
|
||
{
|
||
buffer[i] = *str;
|
||
i += 1;
|
||
}
|
||
}
|
||
|
||
return i;
|
||
}
|
||
|
||
|
||
|
||
void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
size_t buf_len;
|
||
|
||
while( str < end )
|
||
{
|
||
buf_len = PutNormalTextFillBuffer(str, end);
|
||
Put(buffer, buffer+buf_len);
|
||
}
|
||
}
|
||
|
||
|
||
void HTMLFilter::PutNormalTextTrim(const wchar_t * str, const wchar_t * end)
|
||
{
|
||
size_t buf_len;
|
||
|
||
while( str < end )
|
||
{
|
||
buf_len = PutNormalTextTrimFillBuffer(str, end);
|
||
Put(buffer, buffer+buf_len);
|
||
}
|
||
}
|
||
|
||
|
||
void HTMLFilter::PutOpeningTagMark()
|
||
{
|
||
(*out_string) += '<';
|
||
}
|
||
|
||
|
||
void HTMLFilter::PutClosingTagMark()
|
||
{
|
||
(*out_string) += '>';
|
||
}
|
||
|
||
|
||
|
||
|
||
void HTMLFilter::PutTagName(const wchar_t * name)
|
||
{
|
||
(*out_string) += name;
|
||
}
|
||
|
||
|
||
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
|
||
{
|
||
if( !safe_mode )
|
||
return true;
|
||
|
||
static const wchar_t * unsafe_tags[] = {
|
||
L"script", L"iframe", L"frame", L"frameset",
|
||
L"applet", L"head", L"meta", L"html", L"link", L"body"
|
||
};
|
||
|
||
size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
|
||
size_t i;
|
||
|
||
for(i=0 ; i<len ; ++i)
|
||
{
|
||
if( IsNameEqual(tag, unsafe_tags[i]) )
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
// start, end - arguments
|
||
void HTMLFilter::PutOpeningTag(const wchar_t * start, const wchar_t * end)
|
||
{
|
||
if( !IsTagSafe(LastItem().name) )
|
||
return;
|
||
|
||
PutOpeningTagMark();
|
||
PutTagName(LastItem().name);
|
||
|
||
if( start != end )
|
||
{
|
||
(*out_string) += ' ';
|
||
Put(start, end);
|
||
}
|
||
|
||
PutClosingTagMark();
|
||
}
|
||
|
||
|
||
|
||
void HTMLFilter::PutClosingTag(const wchar_t * tag)
|
||
{
|
||
if( !IsTagSafe(tag) )
|
||
return;
|
||
|
||
PutOpeningTagMark();
|
||
(*out_string) += '/';
|
||
PutTagName(tag);
|
||
PutClosingTagMark();
|
||
}
|
||
|
||
|
||
|
||
size_t HTMLFilter::PutTabsToBuffer(size_t index, size_t len)
|
||
{
|
||
if( len == 0 )
|
||
return 0;
|
||
|
||
if( len > 20 )
|
||
len = 20;
|
||
|
||
// how many spaces do you want
|
||
size_t spaces = len * tab_size;
|
||
size_t i = 0;
|
||
|
||
if( index+spaces < WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
|
||
{
|
||
for( ; i<spaces ; ++i )
|
||
buffer[index+i] = ' ';
|
||
}
|
||
|
||
return i;
|
||
}
|
||
|
||
|
||
size_t HTMLFilter::PutNonBreakSpaceToBuffer(size_t index)
|
||
{
|
||
size_t i = 0;
|
||
|
||
if( orphan_mode == orphan_nbsp )
|
||
{
|
||
static const wchar_t nb[] = L" ";
|
||
size_t len = sizeof(nb) / sizeof(wchar_t) - 1; // '0' at the end
|
||
|
||
if( index+len < WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
|
||
{
|
||
for( ; i<len ; ++i )
|
||
buffer[index+i] = nb[i];
|
||
}
|
||
}
|
||
else
|
||
{
|
||
if( index+1 < WINIX_HTMLFILTER_BUFFER_MAXLEN-1 )
|
||
{
|
||
i = 1;
|
||
buffer[index] = (wchar_t)160;
|
||
}
|
||
}
|
||
|
||
return i; // return i not len (can be zero)
|
||
}
|
||
|
||
|
||
void HTMLFilter::PutTabs(size_t len)
|
||
{
|
||
size_t i = PutTabsToBuffer(0, len);
|
||
Put(buffer, buffer+i);
|
||
}
|
||
|
||
|
||
void HTMLFilter::PutNewLine()
|
||
{
|
||
buffer[0] = 10;
|
||
Put(buffer, buffer+1);
|
||
}
|
||
|
||
|
||
// we assume the size of the opening mark to be one
|
||
bool HTMLFilter::IsOpeningTagMark()
|
||
{
|
||
return (*pchar == '<');
|
||
}
|
||
|
||
|
||
// we assume the size of the closing mark to be one
|
||
bool HTMLFilter::IsClosingTagMark()
|
||
{
|
||
return (*pchar == '>');
|
||
}
|
||
|
||
|
||
// the slash at the end <img src=".." /> (without '>' character)
|
||
// we assume the size of the mark to be one
|
||
bool HTMLFilter::IsClosingXmlSimpleTagMark()
|
||
{
|
||
return (*pchar == '/');
|
||
}
|
||
|
||
|
||
bool HTMLFilter::IsOpeningCommentaryTagMark()
|
||
{
|
||
static wchar_t comm_open[] = L"<!--";
|
||
size_t comm_open_len = sizeof(comm_open) / sizeof(wchar_t) - 1;
|
||
|
||
return IsNameEqual(pchar, comm_open, comm_open_len);
|
||
}
|
||
|
||
|
||
// Returns the length of the opening mark of a commentary.
size_t HTMLFilter::OpeningCommentaryTagMarkSize()
{
	const size_t mark_size = 4; // size of "<!--"

	return mark_size;
}
|
||
|
||
|
||
|
||
// skipping the commentary tag if exists
|
||
bool HTMLFilter::SkipCommentaryTagIfExists()
|
||
{
|
||
static wchar_t comm_close[] = L"-->";
|
||
size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1;
|
||
|
||
if( !IsOpeningCommentaryTagMark() )
|
||
return false;
|
||
|
||
pchar += OpeningCommentaryTagMarkSize();
|
||
|
||
// looking for "-->"
|
||
while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) )
|
||
++pchar;
|
||
|
||
if( *pchar!= 0 )
|
||
pchar += comm_close_len;
|
||
|
||
CheckNewLine();
|
||
|
||
return true;
|
||
}
|
||
|
||
|
||
void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white)
|
||
{
|
||
if( trim_white )
|
||
{
|
||
// skipping all white chars (with new lines)
|
||
// but with remembering the last non white character
|
||
for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar)
|
||
if( *pchar == 10 )
|
||
last_non_white = pchar;
|
||
}
|
||
else
|
||
{
|
||
// skipping first white chars with only one line between them
|
||
SkipWhite();
|
||
last_non_white = pchar;
|
||
|
||
if( *pchar == 10 )
|
||
{
|
||
++pchar;
|
||
SkipWhite();
|
||
}
|
||
}
|
||
|
||
start = pchar;
|
||
|
||
// exception for the commentary tag
|
||
if( IsOpeningCommentaryTagMark() || !IsOpeningTagMark() )
|
||
{
|
||
PutNewLine();
|
||
PutTabs(stack_len);
|
||
}
|
||
}
|
||
|
||
|
||
|
||
// reading text between html tags
|
||
void HTMLFilter::ReadNormalText()
|
||
{
|
||
const wchar_t * start = pchar;
|
||
const wchar_t * last_non_white = pchar;
|
||
|
||
if( last_new_line )
|
||
ReadNormalTextSkipWhite(start, last_non_white);
|
||
|
||
|
||
while( *pchar != 0 )
|
||
{
|
||
if( SkipCommentaryTagIfExists() )
|
||
{
|
||
last_non_white = pchar - 1; // pointing at the last '>' from a commentary
|
||
}
|
||
else
|
||
{
|
||
if( IsOpeningTagMark() )
|
||
break;
|
||
|
||
if( !IsWhite(*pchar) )
|
||
last_non_white = pchar;
|
||
|
||
pchar += 1;
|
||
}
|
||
}
|
||
|
||
|
||
last_new_line = (*last_non_white == 10);
|
||
|
||
if( trim_white )
|
||
PutNormalTextTrim(start, pchar);
|
||
else
|
||
PutNormalText(start, pchar);
|
||
}
|
||
|
||
|
||
|
||
// start, end - parameters to a tag
|
||
void HTMLFilter::PrintItem(const wchar_t * start, const wchar_t * end)
|
||
{
|
||
if( last_new_line )
|
||
{
|
||
PutNewLine();
|
||
|
||
if( stack_len > 1 )
|
||
PutTabs(stack_len-1);
|
||
}
|
||
|
||
PutOpeningTag(start, end);
|
||
}
|
||
|
||
|
||
|
||
|
||
bool HTMLFilter::ReadItem()
|
||
{
|
||
const wchar_t * start = pchar;
|
||
|
||
if( *pchar == 0 )
|
||
return false;
|
||
|
||
if( !PushStack() )
|
||
return false;
|
||
|
||
// we have '<'
|
||
pchar += 1;
|
||
SkipWhite();
|
||
|
||
if( *pchar == '/' ) // we have a closing tag
|
||
{
|
||
pchar += 1;
|
||
SkipWhite();
|
||
LastItem().type = Item::closing;
|
||
}
|
||
|
||
ReadItemName();
|
||
SkipWhite();
|
||
start = pchar; // arguments start here
|
||
|
||
if( LastItem().type != Item::closing )
|
||
LastItem().type = (LastItem().name[0] == '!') ? Item::special : Item::opening;
|
||
|
||
const wchar_t * end = SkipItemCheckXmlSimple();
|
||
|
||
if( LastItem().type != Item::closing )
|
||
PrintItem(start, end);
|
||
|
||
CheckNewLine();
|
||
LastItem().new_line = last_new_line;
|
||
|
||
return true;
|
||
}
|
||
|
||
|
||
|
||
// Changes an ascii upper case letter (A-Z) to a lower case one,
// other characters are returned without changes.
wchar_t HTMLFilter::ToLower(wchar_t c)
{
	const bool ascii_upper = (c>='A' && c<='Z');

	return ascii_upper ? wchar_t(c - 'A' + 'a') : c;
}
|
||
|
||
|
||
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
|
||
{
|
||
for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
|
||
if( ToLower(*name1) != ToLower(*name2) )
|
||
return false;
|
||
|
||
if( *name1==0 && *name2==0 )
|
||
return true;
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
|
||
// len characters from both strings must be equal
|
||
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
|
||
{
|
||
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
|
||
if( ToLower(*name1) != ToLower(*name2) )
|
||
return false;
|
||
|
||
if( len == 0 )
|
||
return true;
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
|
||
bool HTMLFilter::IsLastTag(const wchar_t * name)
|
||
{
|
||
const wchar_t * tag = LastItem().name;
|
||
|
||
return IsNameEqual(name, tag);
|
||
}
|
||
|
||
|
||
|
||
// checking exceptions for opening tags
|
||
void HTMLFilter::CheckExceptions()
|
||
{
|
||
if( IsLastTag(L"meta") ||
|
||
IsLastTag(L"input") ||
|
||
IsLastTag(L"br") ||
|
||
IsLastTag(L"hr") ||
|
||
IsLastTag(L"img") ||
|
||
IsLastTag(L"link") ||
|
||
IsLastTag(L"param") ||
|
||
IsLastTag(L"area") )
|
||
{
|
||
LastItem().type = Item::simple;
|
||
PopStack();
|
||
return;
|
||
}
|
||
|
||
// in safe_mode the script tag is ignored
|
||
if( !safe_mode && IsLastTag(L"script") )
|
||
PutLastTagWithClosingTag();
|
||
|
||
if( IsLastTag(L"pre") || IsLastTag(L"textarea") )
|
||
PutLastTagWithClosingTag();
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
void HTMLFilter::AddForgottenTags()
|
||
{
|
||
int i;
|
||
|
||
if( stack_len < 3 )
|
||
return;
|
||
|
||
// we have forgotten to close some tags
|
||
|
||
// looking whether there is a matching opening tag
|
||
for(i=int(stack_len)-3 ; i>=0 ; --i)
|
||
if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
|
||
break;
|
||
|
||
if( i < 0 )
|
||
{
|
||
// oops, there is no such a tag
|
||
// we don't print the closing and the missing opening tag
|
||
PopStack();
|
||
return;
|
||
}
|
||
|
||
for(int z=(int)stack_len-2 ; z>=i ; --z)
|
||
{
|
||
if( pstack[z].new_line )
|
||
{
|
||
PutNewLine();
|
||
PutTabs(z);
|
||
}
|
||
|
||
PutClosingTag(pstack[z].name);
|
||
pstack[z].Clear();
|
||
}
|
||
|
||
last_new_line = pstack[stack_len-1].new_line;
|
||
|
||
// invalidate tags
|
||
stack_len = i;
|
||
}
|
||
|
||
|
||
void HTMLFilter::CheckStackPrintRest()
|
||
{
|
||
while( stack_len-- > 0 )
|
||
{
|
||
if( stack_len==0 || pstack[stack_len-1].new_line )
|
||
PutNewLine();
|
||
|
||
PutTabs(stack_len);
|
||
PutClosingTag(pstack[stack_len].name);
|
||
}
|
||
}
|
||
|
||
|
||
void HTMLFilter::CheckClosingTags()
|
||
{
|
||
if( stack_len == 0 )
|
||
return;
|
||
|
||
// on the stack we have only opening tags
|
||
// but only the last tag is a closing tag
|
||
|
||
if( stack_len == 1 )
|
||
{
|
||
// there is only last closing tag
|
||
// we dont print it
|
||
PopStack();
|
||
return;
|
||
}
|
||
|
||
// there are more than one tag
|
||
if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) )
|
||
{
|
||
// last closing tag is from the previous one
|
||
if( pstack[stack_len-2].new_line )
|
||
{
|
||
PutNewLine();
|
||
PutTabs(stack_len-2);
|
||
}
|
||
|
||
PutClosingTag(pstack[stack_len-1].name);
|
||
last_new_line = pstack[stack_len-1].new_line;
|
||
PopStack();
|
||
PopStack();
|
||
return;
|
||
}
|
||
|
||
AddForgottenTags();
|
||
}
|
||
|
||
|
||
bool HTMLFilter::PrintRest()
|
||
{
|
||
const wchar_t * start = pchar;
|
||
|
||
// in safe mode we do not print the rest html code
|
||
if( safe_mode )
|
||
return false;
|
||
|
||
while( *pchar )
|
||
++pchar;
|
||
|
||
if( pchar > start )
|
||
{
|
||
Put(start, pchar);
|
||
return true;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
|
||
|
||
void HTMLFilter::Read()
|
||
{
|
||
if( trim_white )
|
||
SkipWhiteLines();
|
||
|
||
// it can be some text or white lines before the first html tag (we print it)
|
||
ReadNormalText();
|
||
|
||
while( ReadItem() )
|
||
{
|
||
if( LastItem().type == Item::opening )
|
||
{
|
||
CheckExceptions();
|
||
}
|
||
else
|
||
if( LastItem().type == Item::special || LastItem().type == Item::simple )
|
||
{
|
||
if( stack_len > 1 )
|
||
{
|
||
pstack[stack_len-2].new_line = LastItem().new_line;
|
||
}
|
||
else
|
||
if( trim_white )
|
||
{
|
||
// one new line after a simple or special tag
|
||
// (if the tag has level 0 in the tree - it not means that this is a first tag)
|
||
// for example can be DOCTYPE
|
||
PutNewLine();
|
||
}
|
||
|
||
PopStack();
|
||
}
|
||
else
|
||
if( LastItem().type == Item::closing )
|
||
{
|
||
CheckClosingTags();
|
||
}
|
||
|
||
ReadNormalText();
|
||
}
|
||
|
||
// sometimes ReadItem() can return a false (when there is no space on the stack)
|
||
// we print the rest html without filtering
|
||
if( !PrintRest() )
|
||
CheckStackPrintRest();
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
|