|
|
|
@@ -35,19 +35,20 @@
|
|
|
|
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "htmlfilter.h"
|
|
|
|
|
#include "htmlparser.h"
|
|
|
|
|
|
|
|
|
|
#include "convert/text.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace pt
|
|
|
|
|
{
|
|
|
|
|
const int HTMLFilter::WHITE_MODE_ORIGIN;
|
|
|
|
|
const int HTMLFilter::WHITE_MODE_SINGLE_LINE;
|
|
|
|
|
const int HTMLFilter::WHITE_MODE_TREE;
|
|
|
|
|
const int HTMLParser::WHITE_MODE_ORIGIN;
|
|
|
|
|
const int HTMLParser::WHITE_MODE_SINGLE_LINE;
|
|
|
|
|
const int HTMLParser::WHITE_MODE_TREE;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Item::Clear()
|
|
|
|
|
void HTMLParser::Item::Clear()
|
|
|
|
|
{
|
|
|
|
|
name.clear();
|
|
|
|
|
type = none;
|
|
|
|
@@ -61,14 +62,14 @@ void HTMLFilter::Item::Clear()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::Item::Item()
|
|
|
|
|
HTMLParser::Item::Item()
|
|
|
|
|
{
|
|
|
|
|
Clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
|
|
|
|
|
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
|
|
|
|
{
|
|
|
|
|
reading_from_file = false;
|
|
|
|
|
reading_from_wchar_string = true;
|
|
|
|
@@ -89,18 +90,18 @@ void HTMLFilter::Filter(const wchar_t * in, std::wstring & out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Init()
|
|
|
|
|
void HTMLParser::Init()
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Uninit()
|
|
|
|
|
void HTMLParser::Uninit()
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
|
|
|
|
|
void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
|
|
|
|
|
{
|
|
|
|
|
if( &in == &out )
|
|
|
|
|
{
|
|
|
|
@@ -117,7 +118,7 @@ void HTMLFilter::Filter(const std::wstring & in, std::wstring & out)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SetSomeDefaults()
|
|
|
|
|
void HTMLParser::SetSomeDefaults()
|
|
|
|
|
{
|
|
|
|
|
white_mode = WHITE_MODE_ORIGIN;
|
|
|
|
|
|
|
|
|
@@ -132,7 +133,7 @@ void HTMLFilter::SetSomeDefaults()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::HTMLFilter()
|
|
|
|
|
HTMLParser::HTMLParser()
|
|
|
|
|
{
|
|
|
|
|
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
|
|
|
|
buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];
|
|
|
|
@@ -141,7 +142,7 @@ HTMLFilter::HTMLFilter()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::HTMLFilter(const HTMLFilter & f)
|
|
|
|
|
HTMLParser::HTMLParser(const HTMLParser & f)
|
|
|
|
|
{
|
|
|
|
|
// don't need to copy the stack
|
|
|
|
|
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
|
|
|
@@ -151,7 +152,7 @@ HTMLFilter::HTMLFilter(const HTMLFilter & f)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f)
|
|
|
|
|
HTMLParser & HTMLParser::operator=(const HTMLParser & f)
|
|
|
|
|
{
|
|
|
|
|
// don't need to copy the stack
|
|
|
|
|
pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
|
|
|
|
@@ -163,7 +164,7 @@ return *this;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::~HTMLFilter()
|
|
|
|
|
HTMLParser::~HTMLParser()
|
|
|
|
|
{
|
|
|
|
|
delete [] pstack;
|
|
|
|
|
delete [] buffer;
|
|
|
|
@@ -171,7 +172,7 @@ HTMLFilter::~HTMLFilter()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::white_chars_mode(int mode)
|
|
|
|
|
void HTMLParser::white_chars_mode(int mode)
|
|
|
|
|
{
|
|
|
|
|
if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE )
|
|
|
|
|
white_mode = mode;
|
|
|
|
@@ -180,7 +181,7 @@ void HTMLFilter::white_chars_mode(int mode)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::WrapLine(size_t wrap_line_)
|
|
|
|
|
void HTMLParser::WrapLine(size_t wrap_line_)
|
|
|
|
|
{
|
|
|
|
|
wrap_line = wrap_line_;
|
|
|
|
|
|
|
|
|
@@ -190,7 +191,7 @@ void HTMLFilter::WrapLine(size_t wrap_line_)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::InsertTabs(size_t tabsize)
|
|
|
|
|
void HTMLParser::InsertTabs(size_t tabsize)
|
|
|
|
|
{
|
|
|
|
|
tab_size = tabsize;
|
|
|
|
|
|
|
|
|
@@ -199,7 +200,7 @@ void HTMLFilter::InsertTabs(size_t tabsize)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int HTMLFilter::current_white_char_mode()
|
|
|
|
|
int HTMLParser::current_white_char_mode()
|
|
|
|
|
{
|
|
|
|
|
if( !white_char_mode_tab.empty() )
|
|
|
|
|
return white_char_mode_tab.back();
|
|
|
|
@@ -208,7 +209,7 @@ int HTMLFilter::current_white_char_mode()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CalcOrphansMaxLen(Orphans & orphans)
|
|
|
|
|
void HTMLParser::CalcOrphansMaxLen(Orphans & orphans)
|
|
|
|
|
{
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
@@ -222,7 +223,7 @@ size_t i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
|
|
|
|
|
void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
|
|
|
|
|
{
|
|
|
|
|
lang_code_lower = lang_code;
|
|
|
|
|
ToLower(lang_code_lower);
|
|
|
|
@@ -236,13 +237,13 @@ void HTMLFilter::AssignOrphans(const wchar_t * lang_code, const std::vector<std:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
|
|
|
|
|
void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
|
|
|
|
|
{
|
|
|
|
|
AssignOrphans(lang_code.c_str(), otab);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ClearOrphans()
|
|
|
|
|
void HTMLParser::ClearOrphans()
|
|
|
|
|
{
|
|
|
|
|
orphans_tab.clear();
|
|
|
|
|
}
|
|
|
|
@@ -250,7 +251,7 @@ void HTMLFilter::ClearOrphans()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
|
|
|
|
|
void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str)
|
|
|
|
|
{
|
|
|
|
|
if( orphan_mode_str == L"160" )
|
|
|
|
|
orphan_mode = orphan_160space;
|
|
|
|
@@ -259,24 +260,24 @@ void HTMLFilter::OrphansMode(const std::wstring & orphan_mode_str)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SafeMode(bool safe_mode_)
|
|
|
|
|
void HTMLParser::SafeMode(bool safe_mode_)
|
|
|
|
|
{
|
|
|
|
|
safe_mode = safe_mode_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipTags(bool skip_tags)
|
|
|
|
|
void HTMLParser::SkipTags(bool skip_tags)
|
|
|
|
|
{
|
|
|
|
|
this->skip_tags = skip_tags;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipCommentaries(bool skip_commentaries)
|
|
|
|
|
void HTMLParser::SkipCommentaries(bool skip_commentaries)
|
|
|
|
|
{
|
|
|
|
|
this->skip_commentaries = skip_commentaries;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipEntities(bool skip_entities)
|
|
|
|
|
void HTMLParser::SkipEntities(bool skip_entities)
|
|
|
|
|
{
|
|
|
|
|
this->skip_entities = skip_entities;
|
|
|
|
|
|
|
|
|
@@ -287,13 +288,13 @@ void HTMLFilter::SkipEntities(bool skip_entities)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AnalyzeEntities(bool analyze_entities)
|
|
|
|
|
void HTMLParser::AnalyzeEntities(bool analyze_entities)
|
|
|
|
|
{
|
|
|
|
|
this->analyze_entities = analyze_entities;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
|
|
|
|
|
void HTMLParser::SetNoFilterTag(const std::wstring & tag_name)
|
|
|
|
|
{
|
|
|
|
|
no_filter_tag = tag_name;
|
|
|
|
|
}
|
|
|
|
@@ -301,7 +302,7 @@ void HTMLFilter::SetNoFilterTag(const std::wstring & tag_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::Item & HTMLFilter::GetItem(size_t i)
|
|
|
|
|
HTMLParser::Item & HTMLParser::GetItem(size_t i)
|
|
|
|
|
{
|
|
|
|
|
if( i >= stack_len )
|
|
|
|
|
{
|
|
|
|
@@ -313,7 +314,7 @@ return pstack[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HTMLFilter::Item & HTMLFilter::LastItem()
|
|
|
|
|
HTMLParser::Item & HTMLParser::LastItem()
|
|
|
|
|
{
|
|
|
|
|
if( stack_len == 0 )
|
|
|
|
|
{
|
|
|
|
@@ -325,7 +326,7 @@ return pstack[stack_len-1];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::PushStack()
|
|
|
|
|
bool HTMLParser::PushStack()
|
|
|
|
|
{
|
|
|
|
|
if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
|
|
|
|
|
// oops, too many items
|
|
|
|
@@ -346,7 +347,7 @@ bool HTMLFilter::PushStack()
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PopStack()
|
|
|
|
|
void HTMLParser::PopStack()
|
|
|
|
|
{
|
|
|
|
|
if( stack_len == 0 )
|
|
|
|
|
// oops
|
|
|
|
@@ -357,7 +358,7 @@ void HTMLFilter::PopStack()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsWhite(int c)
|
|
|
|
|
bool HTMLParser::IsWhite(int c)
|
|
|
|
|
{
|
|
|
|
|
// dont use c==10 here
|
|
|
|
|
|
|
|
|
@@ -368,21 +369,21 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipWhite()
|
|
|
|
|
void HTMLParser::SkipWhite()
|
|
|
|
|
{
|
|
|
|
|
while( IsWhite(lastc) )
|
|
|
|
|
read_char();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipWhiteLines()
|
|
|
|
|
void HTMLParser::SkipWhiteLines()
|
|
|
|
|
{
|
|
|
|
|
while( lastc==10 || IsWhite(lastc) )
|
|
|
|
|
read_char();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipWhiteWithFirstNewLine()
|
|
|
|
|
void HTMLParser::SkipWhiteWithFirstNewLine()
|
|
|
|
|
{
|
|
|
|
|
SkipWhite();
|
|
|
|
|
|
|
|
|
@@ -394,7 +395,7 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//void HTMLFilter::CheckNewLine()
|
|
|
|
|
//void HTMLParser::CheckNewLine()
|
|
|
|
|
//{
|
|
|
|
|
// if( white_mode == WHITE_MODE_TREE )
|
|
|
|
|
// {
|
|
|
|
@@ -407,7 +408,7 @@ void HTMLFilter::SkipWhiteWithFirstNewLine()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|
|
|
|
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|
|
|
|
{
|
|
|
|
|
bool is_quoted = false;
|
|
|
|
|
wchar_t quote_char = 0;
|
|
|
|
@@ -450,7 +451,7 @@ void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsValidCharForName(int c)
|
|
|
|
|
bool HTMLParser::IsValidCharForName(int c)
|
|
|
|
|
{
|
|
|
|
|
if( (c>='a' && c<='z') ||
|
|
|
|
|
(c>='A' && c<='Z') ||
|
|
|
|
@@ -462,7 +463,7 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsValidCharForAttrName(int c)
|
|
|
|
|
bool HTMLParser::IsValidCharForAttrName(int c)
|
|
|
|
|
{
|
|
|
|
|
if( (c>='a' && c<='z') ||
|
|
|
|
|
(c>='A' && c<='Z') ||
|
|
|
|
@@ -474,7 +475,7 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsValidCharForEntityName(int c)
|
|
|
|
|
bool HTMLParser::IsValidCharForEntityName(int c)
|
|
|
|
|
{
|
|
|
|
|
if( (c>='a' && c<='z') ||
|
|
|
|
|
(c>='A' && c<='Z') ||
|
|
|
|
@@ -486,7 +487,7 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemName(std::wstring & name, bool clear_name)
|
|
|
|
|
void HTMLParser::ReadItemName(std::wstring & name, bool clear_name)
|
|
|
|
|
{
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
@@ -513,7 +514,7 @@ size_t i;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemAttrName()
|
|
|
|
|
void HTMLParser::ReadItemAttrName()
|
|
|
|
|
{
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
@@ -530,7 +531,7 @@ size_t i;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str)
|
|
|
|
|
void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str)
|
|
|
|
|
{
|
|
|
|
|
if( analyze_entities )
|
|
|
|
|
{
|
|
|
|
@@ -544,7 +545,7 @@ void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
|
|
|
void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
|
|
|
{
|
|
|
|
|
attr_value.clear();
|
|
|
|
|
tmp_text.clear();
|
|
|
|
@@ -585,7 +586,7 @@ void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckChar(wchar_t c)
|
|
|
|
|
void HTMLParser::CheckChar(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
if( c == 10 )
|
|
|
|
|
line_len = 0;
|
|
|
|
@@ -594,14 +595,14 @@ void HTMLFilter::CheckChar(wchar_t c)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Put(wchar_t c)
|
|
|
|
|
void HTMLParser::Put(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
(*out_string) += c;
|
|
|
|
|
CheckChar(c);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
{
|
|
|
|
|
if( str >= end )
|
|
|
|
|
return;
|
|
|
|
@@ -615,7 +616,7 @@ void HTMLFilter::Put(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Put(const std::wstring & str)
|
|
|
|
|
void HTMLParser::Put(const std::wstring & str)
|
|
|
|
|
{
|
|
|
|
|
if( !str.empty() )
|
|
|
|
|
{
|
|
|
|
@@ -628,7 +629,7 @@ void HTMLFilter::Put(const std::wstring & str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// out can be null
|
|
|
|
|
void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
|
|
|
|
|
void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
|
|
|
|
|
{
|
|
|
|
|
size_t epsilon = 8; // !! IMPROVE ME put as a constant
|
|
|
|
|
const wchar_t * old_str = str;
|
|
|
|
@@ -680,7 +681,7 @@ void HTMLFilter::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
|
|
|
|
|
int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
|
|
|
|
|
{
|
|
|
|
|
size_t res;
|
|
|
|
|
|
|
|
|
@@ -704,7 +705,7 @@ return -int(ToLower(*orphan));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// binary search in table (table should be sorted)
|
|
|
|
|
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
|
|
|
|
|
bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
|
|
|
|
|
{
|
|
|
|
|
int res;
|
|
|
|
|
|
|
|
|
@@ -749,7 +750,7 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::CheckOrphan(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
{
|
|
|
|
|
if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
|
|
|
|
|
return false;
|
|
|
|
@@ -763,7 +764,7 @@ return CheckOrphan(str, end, LastItem().porphans->tab);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
|
|
|
|
void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space)
|
|
|
|
|
{
|
|
|
|
|
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) )
|
|
|
|
|
{
|
|
|
|
@@ -799,7 +800,7 @@ void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
|
|
|
|
|
void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
|
|
|
|
|
{
|
|
|
|
|
was_white_char = false;
|
|
|
|
|
was_new_line = false;
|
|
|
|
@@ -829,13 +830,13 @@ void HTMLFilter::PutNormalWhite(bool & was_white_char, bool & was_new_line)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutOpeningTagMark()
|
|
|
|
|
void HTMLParser::PutOpeningTagMark()
|
|
|
|
|
{
|
|
|
|
|
Put('<');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutClosingTagMark()
|
|
|
|
|
void HTMLParser::PutClosingTagMark()
|
|
|
|
|
{
|
|
|
|
|
Put('>');
|
|
|
|
|
}
|
|
|
|
@@ -845,7 +846,7 @@ void HTMLFilter::PutClosingTagMark()
|
|
|
|
|
|
|
|
|
|
// !! IMPROVE ME change to a better name
|
|
|
|
|
// this functions does not return true when the tag is safe
|
|
|
|
|
bool HTMLFilter::IsTagSafe(const wchar_t * tag)
|
|
|
|
|
bool HTMLParser::IsTagSafe(const wchar_t * tag)
|
|
|
|
|
{
|
|
|
|
|
if( !safe_mode )
|
|
|
|
|
return true;
|
|
|
|
@@ -874,7 +875,7 @@ return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsTagSafe(const std::wstring & tag)
|
|
|
|
|
bool HTMLParser::IsTagSafe(const std::wstring & tag)
|
|
|
|
|
{
|
|
|
|
|
return IsTagSafe(tag.c_str());
|
|
|
|
|
}
|
|
|
|
@@ -883,7 +884,7 @@ bool HTMLFilter::IsTagSafe(const std::wstring & tag)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::PutOpeningTag()
|
|
|
|
|
bool HTMLParser::PutOpeningTag()
|
|
|
|
|
{
|
|
|
|
|
if( !IsTagSafe(LastItem().name) )
|
|
|
|
|
{
|
|
|
|
@@ -905,7 +906,7 @@ return true;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutClosingTag(const Item & item)
|
|
|
|
|
void HTMLParser::PutClosingTag(const Item & item)
|
|
|
|
|
{
|
|
|
|
|
if( skip_tags || !IsTagSafe(item.name) )
|
|
|
|
|
return;
|
|
|
|
@@ -922,7 +923,7 @@ void HTMLFilter::PutClosingTag(const Item & item)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutTabs(size_t len)
|
|
|
|
|
void HTMLParser::PutTabs(size_t len)
|
|
|
|
|
{
|
|
|
|
|
if( len > 30 )
|
|
|
|
|
len = 30;
|
|
|
|
@@ -932,7 +933,7 @@ void HTMLFilter::PutTabs(size_t len)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PutNonBreakingSpace()
|
|
|
|
|
void HTMLParser::PutNonBreakingSpace()
|
|
|
|
|
{
|
|
|
|
|
if( orphan_mode == orphan_nbsp )
|
|
|
|
|
{
|
|
|
|
@@ -947,35 +948,35 @@ void HTMLFilter::PutNonBreakingSpace()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// we assume the size of the opening mark to be one
|
|
|
|
|
bool HTMLFilter::IsOpeningTagMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsOpeningTagMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '<');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// we assume the size of the closing mark to be one
|
|
|
|
|
bool HTMLFilter::IsClosingTagMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsClosingTagMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '>');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// the slash in the closing tag mark e.g. </p>
|
|
|
|
|
bool HTMLFilter::IsClosingTagIndicator(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsClosingTagIndicator(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '/');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// the slash in the closing tag mark e.g. </p>
|
|
|
|
|
bool HTMLFilter::IsSpecialTagIndicator(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '!');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// the '=' operator e.g. class="value"
|
|
|
|
|
bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '=');
|
|
|
|
|
}
|
|
|
|
@@ -984,13 +985,13 @@ bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c)
|
|
|
|
|
|
|
|
|
|
// the slash at the end <img src=".." /> (without '>' character)
|
|
|
|
|
// we assume the size of the mark to be one
|
|
|
|
|
bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '/');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
|
|
|
|
|
bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
|
|
|
|
|
{
|
|
|
|
|
static wchar_t comm_end[] = L"-->";
|
|
|
|
|
size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
|
|
|
|
@@ -1004,13 +1005,13 @@ bool HTMLFilter::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsStartingEntityMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsStartingEntityMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == '&');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsEndingEntityMark(wchar_t c)
|
|
|
|
|
bool HTMLParser::IsEndingEntityMark(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
return (c == ';');
|
|
|
|
|
}
|
|
|
|
@@ -1018,7 +1019,7 @@ bool HTMLFilter::IsEndingEntityMark(wchar_t c)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// reading text between html tags
|
|
|
|
|
void HTMLFilter::ReadText()
|
|
|
|
|
void HTMLParser::ReadText()
|
|
|
|
|
{
|
|
|
|
|
bool was_white_char = false;
|
|
|
|
|
bool was_new_line = false;
|
|
|
|
@@ -1096,7 +1097,7 @@ void HTMLFilter::ReadText()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::PrintOpeningItem()
|
|
|
|
|
bool HTMLParser::PrintOpeningItem()
|
|
|
|
|
{
|
|
|
|
|
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
|
|
|
|
return true;
|
|
|
|
@@ -1108,7 +1109,7 @@ bool HTMLFilter::PrintOpeningItem()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::ReadItemAttr()
|
|
|
|
|
bool HTMLParser::ReadItemAttr()
|
|
|
|
|
{
|
|
|
|
|
attr_has_value = false;
|
|
|
|
|
attr_name.clear();
|
|
|
|
@@ -1145,7 +1146,7 @@ return true;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckItemLangAttr()
|
|
|
|
|
void HTMLParser::CheckItemLangAttr()
|
|
|
|
|
{
|
|
|
|
|
if( attr_has_value && IsNameEqual(L"lang", attr_name) )
|
|
|
|
|
{
|
|
|
|
@@ -1166,7 +1167,7 @@ void HTMLFilter::CheckItemLangAttr()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::PrintItemAttr()
|
|
|
|
|
void HTMLParser::PrintItemAttr()
|
|
|
|
|
{
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
@@ -1193,7 +1194,7 @@ size_t i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemClosing()
|
|
|
|
|
void HTMLParser::ReadItemClosing()
|
|
|
|
|
{
|
|
|
|
|
read_char(); // skipping '/'
|
|
|
|
|
SkipWhiteLines();
|
|
|
|
@@ -1205,7 +1206,7 @@ void HTMLFilter::ReadItemClosing()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemSpecial()
|
|
|
|
|
void HTMLParser::ReadItemSpecial()
|
|
|
|
|
{
|
|
|
|
|
LastItem().type = Item::special;
|
|
|
|
|
|
|
|
|
@@ -1255,7 +1256,7 @@ void HTMLFilter::ReadItemSpecial()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadItemOpening()
|
|
|
|
|
void HTMLParser::ReadItemOpening()
|
|
|
|
|
{
|
|
|
|
|
LastItem().type = Item::opening;
|
|
|
|
|
ReadItemName(LastItem().name);
|
|
|
|
@@ -1281,16 +1282,16 @@ void HTMLFilter::ReadItemOpening()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ItemFound()
|
|
|
|
|
void HTMLParser::ItemFound()
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end)
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::ReadItem()
|
|
|
|
|
bool HTMLParser::ReadItem()
|
|
|
|
|
{
|
|
|
|
|
if( lastc == -1 )
|
|
|
|
|
return false;
|
|
|
|
@@ -1332,7 +1333,7 @@ return true;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wchar_t HTMLFilter::ToLower(wchar_t c)
|
|
|
|
|
wchar_t HTMLParser::ToLower(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
if( c>='A' && c<='Z' )
|
|
|
|
|
return c - 'A' + 'a';
|
|
|
|
@@ -1341,7 +1342,7 @@ return c;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ToLower(std::wstring & str)
|
|
|
|
|
void HTMLParser::ToLower(std::wstring & str)
|
|
|
|
|
{
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
@@ -1350,7 +1351,7 @@ size_t i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
|
|
|
|
|
{
|
|
|
|
|
for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
|
|
|
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
|
|
@@ -1363,19 +1364,19 @@ return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1, name2.c_str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1.c_str(), name2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1.c_str(), name2.c_str());
|
|
|
|
|
}
|
|
|
|
@@ -1385,7 +1386,7 @@ bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & na
|
|
|
|
|
// len characters from both strings must be equal
|
|
|
|
|
// IMPROVE ME change name to something like IsBeginningNameEqual
|
|
|
|
|
// and move to text.h (pikotools)
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
|
|
|
|
|
{
|
|
|
|
|
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
|
|
|
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
|
|
@@ -1399,19 +1400,19 @@ return false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1, name2.c_str(), len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1.c_str(), name2, len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
|
|
|
|
|
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name1.c_str(), name2.c_str(), len);
|
|
|
|
|
}
|
|
|
|
@@ -1420,20 +1421,20 @@ bool HTMLFilter::IsNameEqual(const std::wstring & name1, const std::wstring & na
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsLastTag(const wchar_t * name)
|
|
|
|
|
bool HTMLParser::IsLastTag(const wchar_t * name)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name, LastItem().name);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::IsLastTag(const std::wstring & name)
|
|
|
|
|
bool HTMLParser::IsLastTag(const std::wstring & name)
|
|
|
|
|
{
|
|
|
|
|
return IsNameEqual(name, LastItem().name);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// checking exceptions for opening tags
|
|
|
|
|
void HTMLFilter::CheckSingleItemExceptions()
|
|
|
|
|
void HTMLParser::CheckSingleItemExceptions()
|
|
|
|
|
{
|
|
|
|
|
if( IsLastTag(L"meta") ||
|
|
|
|
|
IsLastTag(L"input") ||
|
|
|
|
@@ -1456,7 +1457,7 @@ void HTMLFilter::CheckSingleItemExceptions()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckWhiteCharsExceptions(Item & item)
|
|
|
|
|
void HTMLParser::CheckWhiteCharsExceptions(Item & item)
|
|
|
|
|
{
|
|
|
|
|
bool change_white_mode = false;
|
|
|
|
|
|
|
|
|
@@ -1493,7 +1494,7 @@ void HTMLFilter::CheckWhiteCharsExceptions(Item & item)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::AddForgottenTags()
|
|
|
|
|
void HTMLParser::AddForgottenTags()
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
@@ -1539,7 +1540,7 @@ int i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckStackPrintRest()
|
|
|
|
|
void HTMLParser::CheckStackPrintRest()
|
|
|
|
|
{
|
|
|
|
|
while( stack_len-- > 0 )
|
|
|
|
|
{
|
|
|
|
@@ -1561,7 +1562,7 @@ void HTMLFilter::CheckStackPrintRest()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::CheckClosingTags()
|
|
|
|
|
void HTMLParser::CheckClosingTags()
|
|
|
|
|
{
|
|
|
|
|
if( stack_len == 0 )
|
|
|
|
|
return;
|
|
|
|
@@ -1604,7 +1605,7 @@ void HTMLFilter::CheckClosingTags()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLFilter::PrintRest()
|
|
|
|
|
bool HTMLParser::PrintRest()
|
|
|
|
|
{
|
|
|
|
|
//const wchar_t * start = pchar;
|
|
|
|
|
|
|
|
|
@@ -1634,7 +1635,7 @@ bool HTMLFilter::PrintRest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::ReadLoop()
|
|
|
|
|
void HTMLParser::ReadLoop()
|
|
|
|
|
{
|
|
|
|
|
while( ReadItem() )
|
|
|
|
|
{
|
|
|
|
@@ -1671,7 +1672,7 @@ void HTMLFilter::ReadLoop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLFilter::Read()
|
|
|
|
|
void HTMLParser::Read()
|
|
|
|
|
{
|
|
|
|
|
read_char(); // put first character to lastc
|
|
|
|
|
is_first_item = true;
|