// (listing metadata: 2554 lines, 44 KiB, C++)
/*
|
|
* This file is a part of PikoTools
|
|
* and is distributed under the 2-Clause BSD licence.
|
|
* Author: Tomasz Sowa <t.sowa@ttmath.org>
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2008-2024, Tomasz Sowa
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
*/
|
|
|
|
#include "htmlparser.h"
|
|
#include "convert/text.h"
|
|
|
|
|
|
namespace pt
|
|
{
|
|
// Out-of-class definitions of the static white-characters mode constants of HTMLParser.
// No initializer is given here (presumably the values are set at the in-class declaration -- confirm in htmlparser.h).
const int HTMLParser::WHITE_MODE_ORIGIN;
|
|
const int HTMLParser::WHITE_MODE_SINGLE_LINE;
|
|
const int HTMLParser::WHITE_MODE_TREE;
|
|
|
|
|
|
|
|
void HTMLParser::clear_input_flags()
|
|
{
|
|
BaseParser::clear_input_flags();
|
|
|
|
parsing_html = true;
|
|
xml_compact_mode = true;
|
|
status = ok;
|
|
line = 1;
|
|
stack_len = 0;
|
|
out_string = nullptr;
|
|
out_stream = nullptr;
|
|
out_space = nullptr;
|
|
line_len = 0;
|
|
char_was_escaped = false;
|
|
escaped_chars_buffer.clear();
|
|
escaped_char_index = 0;
|
|
filter_mode = false;
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLParser::Item::Clear()
|
|
{
|
|
name.clear();
|
|
type = none;
|
|
is_commentary = false;
|
|
is_cdata = false;
|
|
porphans = nullptr;
|
|
new_line_before = false;
|
|
new_line_after = false;
|
|
new_line_in_the_middle = false;
|
|
white_char_before = false;
|
|
has_body_tag = false;
|
|
tree_index = 0;
|
|
space = nullptr;
|
|
}
|
|
|
|
|
|
// A new item always starts in the cleared state.
HTMLParser::Item::Item()
{
    Clear();
}
|
|
|
|
|
|
void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode)
|
|
{
|
|
clear_input_flags();
|
|
|
|
pchar_unicode = in;
|
|
xml_compact_mode = compact_mode;
|
|
out_space = &space;
|
|
out_space->clear();
|
|
|
|
Init();
|
|
Read();
|
|
Uninit();
|
|
}
|
|
|
|
|
|
// Remembers the listener pointer (it can be null, SetSomeDefaults() sets it to null as well).
void HTMLParser::set_item_parsed_listener(Listener * listener)
{
    // the parameter hides the member, hence the explicit this->
    this->listener = listener;
}
|
|
|
|
|
|
// Common preparation for all parse_xml*() methods:
// resets the state, switches from html to xml parsing and sets the output Space.
// The output Space is cleared only if clear_space is true.
void HTMLParser::prepare_to_parse_xml(Space & out_space, bool compact_mode, bool clear_space)
{
    clear_input_flags();

    parsing_html = false;
    xml_compact_mode = compact_mode;

    // the parameter hides the member, hence the explicit this->
    this->out_space = &out_space;

    if( clear_space )
        out_space.clear();
}
|
|
|
|
|
|
// Parses xml from the file 'file_name' into 'out_space'.
// Returns the parsing status; cant_open_file is returned if the file cannot be opened.
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    prepare_to_parse_xml(out_space, compact_mode, clear_space);
    reading_from_file = true;

    // clear the error flags left by a previous run before opening
    file.clear();
    file.open(file_name, std::ios_base::binary | std::ios_base::in);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    Init();
    Read();
    Uninit();

    file.close();

    return status;
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const char *' version.
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    const char * c_file_name = file_name.c_str();

    return parse_xml_file(c_file_name, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
// Wide file name overload: the name is converted to UTF-8
// and then the 'const char *' version does the work.
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    std::string utf8_name;
    wide_to_utf8(file_name, utf8_name);

    return parse_xml_file(utf8_name.c_str(), out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const wchar_t *' version.
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    const wchar_t * wide_file_name = file_name.c_str();

    return parse_xml_file(wide_file_name, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
// Parses xml from the narrow string 'str' into 'out_space' and returns the parsing status.
HTMLParser::Status HTMLParser::parse_xml(const char * str, Space & out_space, bool compact_mode, bool clear_space)
{
    prepare_to_parse_xml(out_space, compact_mode, clear_space);

    // input: the narrow string
    pchar_ascii = str;

    Init();
    Read();
    Uninit();

    return status;
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const char *' version.
HTMLParser::Status HTMLParser::parse_xml(const std::string & str, Space & out_space, bool compact_mode, bool clear_space)
{
    const char * c_str = str.c_str();

    return parse_xml(c_str, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
// Parses xml from the wide string 'str' into 'out_space' and returns the parsing status.
HTMLParser::Status HTMLParser::parse_xml(const wchar_t * str, Space & out_space, bool compact_mode, bool clear_space)
{
    prepare_to_parse_xml(out_space, compact_mode, clear_space);

    // input: the wide string
    pchar_unicode = str;

    Init();
    Read();
    Uninit();

    return status;
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const wchar_t *' version.
HTMLParser::Status HTMLParser::parse_xml(const std::wstring & str, Space & out_space, bool compact_mode, bool clear_space)
{
    const wchar_t * wide_str = str.c_str();

    return parse_xml(wide_str, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
// Parses xml from a TextStream into 'out_space' and returns the parsing status.
// The input is read through two local iterators whose addresses are handed
// to the parser for the time of the run only.
HTMLParser::Status HTMLParser::parse_xml(const pt::TextStream & str, Space & out_space, bool compact_mode, bool clear_space)
{
    prepare_to_parse_xml(out_space, compact_mode, clear_space);

    pt::TextStream::const_iterator start = str.begin();
    pt::TextStream::const_iterator end = str.end();

    text_stream_iterator = &start;
    text_stream_iterator_end = &end;

    Init();
    Read();
    Uninit();

    // 'start' and 'end' die when this function returns,
    // so the members must not keep pointing at them
    text_stream_iterator = nullptr;
    text_stream_iterator_end = nullptr;

    return status;
}
|
|
|
|
|
|
// Parses xml from a WTextStream into 'out_space' and returns the parsing status.
// The input is read through two local iterators whose addresses are handed
// to the parser for the time of the run only.
HTMLParser::Status HTMLParser::parse_xml(const pt::WTextStream & str, Space & out_space, bool compact_mode, bool clear_space)
{
    prepare_to_parse_xml(out_space, compact_mode, clear_space);

    pt::WTextStream::const_iterator start = str.begin();
    pt::WTextStream::const_iterator end = str.end();

    wtext_stream_iterator = &start;
    wtext_stream_iterator_end = &end;

    Init();
    Read();
    Uninit();

    // 'start' and 'end' die when this function returns,
    // so the members must not keep pointing at them
    wtext_stream_iterator = nullptr;
    wtext_stream_iterator_end = nullptr;

    return status;
}
|
|
|
|
|
|
|
|
|
|
void HTMLParser::filter(const wchar_t * in, std::wstring & out, bool clear_out_string)
|
|
{
|
|
clear_input_flags();
|
|
|
|
pchar_unicode = in;
|
|
out_string = &out;
|
|
filter_mode = true;
|
|
|
|
if( clear_out_string )
|
|
out_string->clear();
|
|
|
|
Init();
|
|
Read();
|
|
Uninit();
|
|
}
|
|
|
|
|
|
// Filters html from 'in' to 'out'.
// Nothing is done when both references denote the same string object.
void HTMLParser::filter(const std::wstring & in, std::wstring & out, bool clear_out_string)
{
    // out cannot be the same string as in
    if( &in == &out )
        return;

    // make room in advance: twice the input size plus one
    const size_t wanted_capacity = in.size() * 2 + 1;

    if( wanted_capacity > out.capacity() )
        out.reserve(wanted_capacity);

    filter(in.c_str(), out, clear_out_string);
}
|
|
|
|
|
|
void HTMLParser::filter(const WTextStream & in, Stream & out, bool clear_out_stream)
|
|
{
|
|
clear_input_flags();
|
|
|
|
WTextStream::const_iterator begin = in.begin();
|
|
WTextStream::const_iterator end = in.end();
|
|
|
|
wtext_stream_iterator = &begin;
|
|
wtext_stream_iterator_end = &end;
|
|
out_stream = &out;
|
|
filter_mode = true;
|
|
|
|
if( clear_out_stream )
|
|
out_stream->clear();
|
|
|
|
Init();
|
|
Read();
|
|
Uninit();
|
|
}
|
|
|
|
|
|
// Filters html read from the file 'file_name' and appends the result to 'out'.
// Returns the status of the run; cant_open_file is returned if the file cannot be opened.
// Note: 'out' is cleared (if requested) even when the file cannot be opened.
HTMLParser::Status HTMLParser::filter_file(const char * file_name, std::wstring & out, bool clear_out_stream)
{
    clear_input_flags();
    reading_from_file = true;

    // the file is deliberately opened before 'out' is touched
    // (the original note: 'out' string can be the same string as the file_name)
    file.clear();
    file.open(file_name, std::ios_base::binary | std::ios_base::in);

    filter_mode = true;
    out_string = &out;

    if( clear_out_stream )
        out.clear();

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    Init();
    Read();
    Uninit();

    file.close();

    return status;
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const char *' version.
HTMLParser::Status HTMLParser::filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream)
{
    const char * c_file_name = file_name.c_str();

    return filter_file(c_file_name, out, clear_out_stream);
}
|
|
|
|
|
|
// Wide file name overload: the name is converted to UTF-8
// and then the std::string version does the work.
HTMLParser::Status HTMLParser::filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream)
{
    std::string utf8_name;
    pt::wide_to_utf8(file_name, utf8_name);

    return filter_file(utf8_name, out, clear_out_stream);
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const wchar_t *' version.
HTMLParser::Status HTMLParser::filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream)
{
    const wchar_t * wide_file_name = file_name.c_str();

    return filter_file(wide_file_name, out, clear_out_stream);
}
|
|
|
|
|
|
|
|
|
|
// Called by every entry point just before Read().
void HTMLParser::Init()
{
    // intentionally empty
}
|
|
|
|
|
|
// Called by every entry point just after Read().
void HTMLParser::Uninit()
{
    // intentionally empty
}
|
|
|
|
|
|
|
|
|
|
|
|
int HTMLParser::get_last_parsed_line()
|
|
{
|
|
return line;
|
|
}
|
|
|
|
|
|
int HTMLParser::get_last_parsed_column()
|
|
{
|
|
return column;
|
|
}
|
|
|
|
|
|
void HTMLParser::SetSomeDefaults()
|
|
{
|
|
white_mode = WHITE_MODE_ORIGIN;
|
|
|
|
tab_size = 2;
|
|
wrap_line = 0;
|
|
orphan_mode = orphan_nbsp;
|
|
safe_mode = false;
|
|
skip_tags = false;
|
|
skip_commentaries = false;
|
|
skip_entities = false;
|
|
analyze_entities = false;
|
|
listener = nullptr;
|
|
}
|
|
|
|
|
|
// Allocates the items stack and the work buffer (both are released in the destructor)
// and sets the default configuration.
HTMLParser::HTMLParser()
{
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

    SetSomeDefaults();
}
|
|
|
|
|
|
// Copy constructor.
// Nothing is taken from 'f': the new object gets its own fresh stack and buffer
// and the default configuration.
HTMLParser::HTMLParser(const HTMLParser & f)
{
    // the content of the stack does not need to be copied,
    // a freshly allocated one is enough
    pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN];
    buffer = new wchar_t[WINIX_HTMLFILTER_BUFFER_MAXLEN];

    SetSomeDefaults();
}
|
|
|
|
|
|
// Assignment operator.
// The items stack and the work buffer are not copied from 'f'; this object keeps using
// the arrays it allocated in its constructor. Currently no other field is copied either.
HTMLParser & HTMLParser::operator=(const HTMLParser & f)
{
    // Do not allocate 'pstack' and 'buffer' here: both constructors have already
    // allocated them and the destructor releases them exactly once. Overwriting
    // the pointers with new arrays leaked the previous ones on every assignment.
    // The existing stack can be reused as is: stack_len is reset by clear_input_flags()
    // and PushStack() clears an item before it is used.

    // we can copy some fields from f

    return *this;
}
|
|
|
|
|
|
// Releases the arrays allocated in the constructors.
HTMLParser::~HTMLParser()
{
    delete [] pstack;
    delete [] buffer;
}
|
|
|
|
|
|
|
|
void HTMLParser::white_chars_mode(int mode)
|
|
{
|
|
if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE )
|
|
white_mode = mode;
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLParser::WrapLine(size_t wrap_line_)
|
|
{
|
|
wrap_line = wrap_line_;
|
|
|
|
if( wrap_line > 10000 )
|
|
wrap_line = 10000;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::InsertTabs(size_t tabsize)
|
|
{
|
|
tab_size = tabsize;
|
|
|
|
if( tab_size > 1000 )
|
|
tab_size = 1000;
|
|
}
|
|
|
|
|
|
int HTMLParser::current_white_char_mode()
|
|
{
|
|
if( !white_char_mode_tab.empty() )
|
|
return white_char_mode_tab.back();
|
|
|
|
return WHITE_MODE_ORIGIN;
|
|
}
|
|
|
|
|
|
// Stores in orphans.max_len the length of the longest word from orphans.tab
// (zero for an empty table).
void HTMLParser::CalcOrphansMaxLen(Orphans & orphans)
{
    orphans.max_len = 0;

    for( const std::wstring & word : orphans.tab )
    {
        if( orphans.max_len < word.size() )
            orphans.max_len = word.size();
    }
}
|
|
|
|
|
|
void HTMLParser::AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab)
|
|
{
|
|
lang_code_lower = lang_code;
|
|
ToLower(lang_code_lower);
|
|
|
|
orphans_temp.tab = otab;
|
|
std::sort(orphans_temp.tab.begin(), orphans_temp.tab.end());
|
|
CalcOrphansMaxLen(orphans_temp);
|
|
|
|
orphans_tab[lang_code_lower] = orphans_temp;
|
|
}
|
|
|
|
|
|
|
|
// Convenience overload: forwards to the 'const wchar_t *' version.
void HTMLParser::AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab)
{
    const wchar_t * code = lang_code.c_str();

    AssignOrphans(code, otab);
}
|
|
|
|
|
|
void HTMLParser::ClearOrphans()
|
|
{
|
|
orphans_tab.clear();
|
|
}
|
|
|
|
|
|
|
|
|
|
void HTMLParser::OrphansMode(const std::wstring & orphan_mode_str)
|
|
{
|
|
if( orphan_mode_str == L"160" )
|
|
orphan_mode = orphan_160space;
|
|
else
|
|
orphan_mode = orphan_nbsp;
|
|
}
|
|
|
|
|
|
void HTMLParser::SafeMode(bool safe_mode_)
|
|
{
|
|
safe_mode = safe_mode_;
|
|
}
|
|
|
|
|
|
void HTMLParser::SkipTags(bool skip_tags)
|
|
{
|
|
this->skip_tags = skip_tags;
|
|
}
|
|
|
|
void HTMLParser::SkipCommentaries(bool skip_commentaries)
|
|
{
|
|
this->skip_commentaries = skip_commentaries;
|
|
}
|
|
|
|
|
|
void HTMLParser::SkipEntities(bool skip_entities)
|
|
{
|
|
this->skip_entities = skip_entities;
|
|
|
|
if( this->skip_entities )
|
|
{
|
|
this->analyze_entities = true;
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::AnalyzeEntities(bool analyze_entities)
|
|
{
|
|
this->analyze_entities = analyze_entities;
|
|
}
|
|
|
|
|
|
void HTMLParser::SetNoFilterTag(const std::wstring & tag_name)
|
|
{
|
|
no_filter_tag = tag_name;
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns the i-th item of the stack.
// For an index outside of the stack the shared 'empty' item is cleared and returned.
HTMLParser::Item & HTMLParser::GetItem(size_t i)
{
    if( i < stack_len )
        return pstack[i];

    empty.Clear();
    return empty;
}
|
|
|
|
|
|
// Returns the item from the top of the stack.
// For an empty stack the shared 'empty' item is cleared and returned.
HTMLParser::Item & HTMLParser::LastItem()
{
    if( stack_len > 0 )
        return pstack[stack_len - 1];

    empty.Clear();
    return empty;
}
|
|
|
|
|
|
bool HTMLParser::PushStack()
|
|
{
|
|
if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN )
|
|
// oops, too many items
|
|
return false;
|
|
|
|
pstack[stack_len].Clear();
|
|
|
|
if( stack_len > 0 )
|
|
{
|
|
// 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated
|
|
pstack[stack_len].porphans = pstack[stack_len-1].porphans;
|
|
pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag;
|
|
pstack[stack_len].tree_index = pstack[stack_len-1].tree_index;
|
|
}
|
|
|
|
stack_len += 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::PopStack()
|
|
{
|
|
if( stack_len == 0 )
|
|
// oops
|
|
return;
|
|
|
|
stack_len -= 1;
|
|
pstack[stack_len].Clear();
|
|
}
|
|
|
|
|
|
// Returns true for a space, a tab, a carriage return (13) and the character 160.
// The new line character (10) is deliberately not treated as white here,
// the callers test it separately.
bool HTMLParser::IsWhite(int c)
{
    switch( c )
    {
    case ' ':
    case '\t':
    case 13:
    case 160:
        return true;

    default:
        return false;
    }
}
|
|
|
|
|
|
void HTMLParser::SkipWhite(std::wstring * out_string)
|
|
{
|
|
while( IsWhite(lastc) )
|
|
{
|
|
if( out_string )
|
|
(*out_string) += lastc;
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::SkipWhiteLines(std::wstring * out_string)
|
|
{
|
|
while( lastc==10 || IsWhite(lastc) )
|
|
{
|
|
if( out_string )
|
|
(*out_string) += lastc;
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::SkipWhiteWithFirstNewLine()
|
|
{
|
|
SkipWhite();
|
|
|
|
if( lastc == 10 )
|
|
{
|
|
read_char();
|
|
SkipWhite();
|
|
}
|
|
}
|
|
|
|
|
|
//void HTMLParser::CheckNewLine()
|
|
//{
|
|
// if( white_mode == WHITE_MODE_TREE )
|
|
// {
|
|
// SkipWhite();
|
|
// }
|
|
//
|
|
// last_new_line = (lastc==10);
|
|
//}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
|
|
{
|
|
bool is_quoted = false;
|
|
wchar_t quote_char = 0;
|
|
|
|
while( lastc != -1 )
|
|
{
|
|
if( !char_was_escaped && (lastc == '"' || lastc == '\'') )
|
|
{
|
|
if( is_quoted )
|
|
{
|
|
if( lastc == quote_char )
|
|
{
|
|
is_quoted = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
is_quoted = true;
|
|
quote_char = lastc;
|
|
}
|
|
}
|
|
else
|
|
if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/'
|
|
{
|
|
LastItem().type = Item::simple;
|
|
}
|
|
else
|
|
if( !is_quoted && (!char_was_escaped && IsClosingTagMark(lastc)) )
|
|
{
|
|
read_char();
|
|
break;
|
|
}
|
|
|
|
if( remember_text )
|
|
(*remember_text) += lastc;
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Returns true if 'c' can be a part of an item name:
// an ascii letter, a digit or one of: - ! : _ [
bool HTMLParser::IsValidCharForName(int c)
{
    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool is_digit = (c >= '0' && c <= '9');

    if( is_letter || is_digit )
        return true;

    switch( c )
    {
    case '-':   // - is for a commentary
    case '!':
    case ':':   // : is for a namespace character
    case '_':
    case '[':   // [ is for CDATA
        return true;

    default:
        return false;
    }
}
|
|
|
|
|
|
// Returns true if 'c' can be a part of an attribute name:
// an ascii letter, a digit or one of: - : _
bool HTMLParser::IsValidCharForAttrName(int c)
{
    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool is_digit = (c >= '0' && c <= '9');
    const bool is_extra = (c == '-' || c == ':' || c == '_');

    return is_letter || is_digit || is_extra;
}
|
|
|
|
|
|
// Returns true if 'c' can be a part of an entity name:
// an ascii letter, a digit or the '#' character.
bool HTMLParser::IsValidCharForEntityName(int c)
{
    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool is_digit = (c >= '0' && c <= '9');

    return is_letter || is_digit || c == '#';
}
|
|
|
|
|
|
void HTMLParser::ReadItemName(std::wstring & name, bool clear_name)
|
|
{
|
|
size_t i;
|
|
|
|
if( clear_name )
|
|
name.clear();
|
|
|
|
for(i=0 ; IsValidCharForName(lastc) ; ++i)
|
|
{
|
|
if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN )
|
|
{
|
|
name += lastc;
|
|
|
|
if( LastItem().type == Item::special && name == L"!--" )
|
|
{
|
|
LastItem().is_commentary = true;
|
|
read_char();
|
|
break;
|
|
}
|
|
|
|
if( LastItem().type == Item::special && name == L"![CDATA[" )
|
|
{
|
|
LastItem().is_cdata = true;
|
|
read_char();
|
|
break;
|
|
}
|
|
}
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::ReadItemAttrName()
|
|
{
|
|
size_t i;
|
|
|
|
attr_name.clear();
|
|
|
|
for( i=0 ; lastc != -1 && IsValidCharForAttrName(lastc) ; ++i )
|
|
{
|
|
if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN )
|
|
attr_name += lastc;
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::ReadItemAttrValueAdd(const std::wstring & str)
|
|
{
|
|
if( analyze_entities )
|
|
{
|
|
attr_value.push_back(std::wstring());
|
|
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back());
|
|
}
|
|
else
|
|
{
|
|
attr_value.push_back(str);
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
{
|
|
attr_value.clear();
|
|
tmp_text.clear();
|
|
|
|
while( lastc != -1 )
|
|
{
|
|
if( !char_was_escaped )
|
|
{
|
|
if( has_quote )
|
|
{
|
|
if( lastc == quote_char )
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
|
break;
|
|
}
|
|
}
|
|
|
|
if( lastc==10 || IsWhite(lastc) )
|
|
{
|
|
if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
ReadItemAttrValueAdd(tmp_text);
|
|
|
|
tmp_text.clear();
|
|
}
|
|
else
|
|
{
|
|
if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
tmp_text.clear();
|
|
|
|
tmp_text += lastc;
|
|
}
|
|
|
|
read_char();
|
|
}
|
|
|
|
if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
ReadItemAttrValueAdd(tmp_text);
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
{
|
|
attr_value.clear();
|
|
tmp_text.clear();
|
|
|
|
while( lastc != -1 )
|
|
{
|
|
if( !char_was_escaped )
|
|
{
|
|
if( has_quote )
|
|
{
|
|
if( lastc == quote_char )
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
|
break;
|
|
}
|
|
}
|
|
|
|
// IMPROVEME add support for analyze_entities?
|
|
if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
tmp_text += lastc;
|
|
|
|
read_char();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::CheckChar(wchar_t c)
|
|
{
|
|
if( c == 10 )
|
|
line_len = 0;
|
|
else
|
|
line_len += 1;
|
|
}
|
|
|
|
|
|
void HTMLParser::Put(wchar_t c)
|
|
{
|
|
if( out_string )
|
|
(*out_string) += c;
|
|
|
|
if( out_stream )
|
|
(*out_stream) << c;
|
|
|
|
CheckChar(c);
|
|
}
|
|
|
|
|
|
void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
|
|
{
|
|
if( str >= end )
|
|
return;
|
|
|
|
size_t len = end - str;
|
|
|
|
if( out_string )
|
|
out_string->append(str, len);
|
|
|
|
if( out_stream )
|
|
out_stream->write(str, len);
|
|
|
|
for( ; str < end ; ++str)
|
|
CheckChar(*str);
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::Put(const std::wstring & str)
|
|
{
|
|
if( !str.empty() )
|
|
{
|
|
if( out_string )
|
|
out_string->append(str);
|
|
|
|
if( out_stream )
|
|
out_stream->write(str.c_str(), str.size());
|
|
|
|
for(size_t i=0 ; i < str.size() ; ++i)
|
|
CheckChar(str[i]);
|
|
}
|
|
}
|
|
|
|
|
|
// out can be null
|
|
void HTMLParser::AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out)
|
|
{
|
|
size_t epsilon = 8; // !! IMPROVE ME put as a constant
|
|
const wchar_t * old_str = str;
|
|
|
|
while( str < end )
|
|
{
|
|
if( IsStartingEntityMark(*str) )
|
|
{
|
|
const wchar_t * entity_start = str;
|
|
str += 1; // skip &
|
|
|
|
for(size_t i=0 ; *str && IsValidCharForEntityName(*str) && i < epsilon ; ++i, ++str)
|
|
{
|
|
}
|
|
|
|
if( IsEndingEntityMark(*str) && str - entity_start > 1 ) // at least one character in entity name
|
|
{
|
|
if( out )
|
|
out->append(old_str, entity_start);
|
|
else
|
|
Put(old_str, entity_start);
|
|
|
|
str += 1; // skip ;
|
|
|
|
if( !skip_entities )
|
|
{
|
|
if( out )
|
|
out->append(entity_start, str);
|
|
else
|
|
Put(entity_start, str);
|
|
}
|
|
|
|
EntityFound(entity_start + 1, str - 1); // without & and ;
|
|
old_str = str;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
str += 1;
|
|
}
|
|
}
|
|
|
|
if( out )
|
|
out->append(old_str, end);
|
|
else
|
|
Put(old_str, end);
|
|
}
|
|
|
|
|
|
|
|
|
|
int HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str)
|
|
{
|
|
size_t res;
|
|
|
|
const wchar_t * orphan = orphan_str.c_str();
|
|
|
|
for( ; str<end && *orphan!=0 ; ++str, ++orphan )
|
|
{
|
|
res = ToLower(*str) - ToLower(*orphan);
|
|
|
|
if( res != 0 )
|
|
return res;
|
|
}
|
|
|
|
if( str < end )
|
|
return ToLower(*str);
|
|
|
|
return -int(ToLower(*orphan));
|
|
}
|
|
|
|
|
|
|
|
|
|
// binary search in table (table should be sorted)
|
|
bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & table)
|
|
{
|
|
int res;
|
|
|
|
if( table.empty() )
|
|
return false;
|
|
|
|
size_t o1 = 0;
|
|
size_t o2 = table.size() - 1;
|
|
|
|
res = CheckOrphan(str, end, table[o1]);
|
|
|
|
if( res == 0 )
|
|
return true;
|
|
|
|
if( res < 0 )
|
|
return false;
|
|
|
|
res = CheckOrphan(str, end, table[o2]);
|
|
|
|
if( res == 0 )
|
|
return true;
|
|
|
|
if( res > 0 )
|
|
return false;
|
|
|
|
|
|
while( o1 + 1 < o2 )
|
|
{
|
|
size_t o = (o1 + o2) / 2;
|
|
res = CheckOrphan(str, end, table[o]);
|
|
|
|
if( res == 0 )
|
|
return true;
|
|
|
|
if( res < 0 )
|
|
o2 = o;
|
|
else
|
|
o1 = o;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool HTMLParser::CheckOrphan(const wchar_t * str, const wchar_t * end)
|
|
{
|
|
if( str==end || !LastItem().has_body_tag || !LastItem().porphans )
|
|
return false;
|
|
|
|
size_t len = end - str;
|
|
|
|
if( len > LastItem().porphans->max_len )
|
|
return false;
|
|
|
|
return CheckOrphan(str, end, LastItem().porphans->tab);
|
|
}
|
|
|
|
|
|
bool HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata)
|
|
{
|
|
bool was_closing_tag = false;
|
|
|
|
while( lastc != -1 && lastc != 10 && !IsWhite(lastc) )
|
|
{
|
|
if( is_cdata )
|
|
{
|
|
if( lastc == ']' )
|
|
{
|
|
read_char();
|
|
|
|
if( lastc == ']' )
|
|
{
|
|
read_char();
|
|
|
|
if( IsClosingTagMark(lastc) )
|
|
{
|
|
read_char();
|
|
was_closing_tag = true;
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
str += ']';
|
|
str += ']';
|
|
}
|
|
}
|
|
else
|
|
{
|
|
str += ']';
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
|
{
|
|
was_closing_tag = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
str += lastc;
|
|
read_char();
|
|
}
|
|
|
|
if( !str.empty() )
|
|
{
|
|
if( allow_put_new_line )
|
|
{
|
|
Put(10);
|
|
PutTabs(LastItem().tree_index + 1);
|
|
}
|
|
else
|
|
if( allow_put_space )
|
|
{
|
|
Put(' ');
|
|
}
|
|
}
|
|
|
|
if( analyze_entities )
|
|
AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr);
|
|
else
|
|
Put(str);
|
|
|
|
return was_closing_tag;
|
|
}
|
|
|
|
|
|
void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
|
|
{
|
|
was_white_char = false;
|
|
was_new_line = false;
|
|
|
|
while( lastc == 10 || IsWhite(lastc) )
|
|
{
|
|
if( lastc == 10 )
|
|
was_new_line = true;
|
|
else
|
|
was_white_char = true;
|
|
|
|
if( result_text )
|
|
(*result_text) += lastc;
|
|
|
|
if( current_white_char_mode() == WHITE_MODE_ORIGIN )
|
|
{
|
|
Put(lastc);
|
|
}
|
|
|
|
read_char();
|
|
}
|
|
|
|
if( current_white_char_mode() == WHITE_MODE_SINGLE_LINE && (was_white_char || was_new_line) )
|
|
{
|
|
Put(' ');
|
|
}
|
|
|
|
// in WHITE_MODE_TREE white characters are written at the beginning of a <tag> or text
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::PutOpeningTagMark()
|
|
{
|
|
Put('<');
|
|
}
|
|
|
|
|
|
void HTMLParser::PutClosingTagMark()
|
|
{
|
|
Put('>');
|
|
}
|
|
|
|
|
|
|
|
|
|
// !! IMPROVE ME change to a better name
|
|
// this functions does not return true when the tag is safe
|
|
bool HTMLParser::IsTagSafe(const wchar_t * tag)
|
|
{
|
|
if( !safe_mode )
|
|
return true;
|
|
|
|
if( IsNameEqual(tag, no_filter_tag.c_str()) )
|
|
return false;
|
|
|
|
static const wchar_t * unsafe_tags[] = {
|
|
L"applet", L"base", L"body",
|
|
L"embed", L"head", L"html",
|
|
L"frame", L"frameset",L"iframe",
|
|
L"link", L"meta", L"param"
|
|
L"object", L"script"
|
|
};
|
|
|
|
size_t len = sizeof(unsafe_tags) / sizeof(const wchar_t*);
|
|
size_t i;
|
|
|
|
for(i=0 ; i<len ; ++i)
|
|
{
|
|
if( IsNameEqual(tag, unsafe_tags[i]) )
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// Convenience overload: forwards to the 'const wchar_t *' version.
bool HTMLParser::IsTagSafe(const std::wstring & tag)
{
    const wchar_t * tag_name = tag.c_str();

    return IsTagSafe(tag_name);
}
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLParser::PutOpeningTag()
|
|
{
|
|
if( !IsTagSafe(LastItem().name) )
|
|
{
|
|
SkipAndCheckClosingTag();
|
|
return false;
|
|
}
|
|
|
|
if( current_white_char_mode() == WHITE_MODE_TREE )
|
|
{
|
|
if( LastItem().new_line_before )
|
|
{
|
|
Put(10);
|
|
PutTabs(LastItem().tree_index);
|
|
}
|
|
else
|
|
if( LastItem().white_char_before )
|
|
{
|
|
Put(' ');
|
|
}
|
|
}
|
|
|
|
PutOpeningTagMark();
|
|
Put(LastItem().name);
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::PutClosingTag(const Item & item)
|
|
{
|
|
if( skip_tags || !IsTagSafe(item.name) || IsNameEqual(no_filter_tag, LastItem().name) )
|
|
return;
|
|
|
|
if( item.is_commentary )
|
|
{
|
|
Put('-');
|
|
Put('-');
|
|
PutClosingTagMark();
|
|
}
|
|
else
|
|
{
|
|
PutOpeningTagMark();
|
|
Put('/');
|
|
Put(item.name);
|
|
PutClosingTagMark();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::PutTabs(size_t len)
|
|
{
|
|
if( len > 30 )
|
|
len = 30;
|
|
|
|
for(size_t i=0 ; i < (len*tab_size) ; ++i)
|
|
{
|
|
if( out_string )
|
|
(*out_string) += ' '; // we do not add them to 'line_len'
|
|
|
|
if( out_stream )
|
|
(*out_stream) << ' ';
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::PutNonBreakingSpace()
|
|
{
|
|
if( orphan_mode == orphan_nbsp )
|
|
{
|
|
Put(L" ");
|
|
}
|
|
else
|
|
{
|
|
Put(160);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// we assume the size of the opening mark to be one
|
|
bool HTMLParser::IsOpeningTagMark(wchar_t c)
|
|
{
|
|
return (c == '<');
|
|
}
|
|
|
|
|
|
// we assume the size of the closing mark to be one
|
|
bool HTMLParser::IsClosingTagMark(wchar_t c)
|
|
{
|
|
return (c == '>');
|
|
}
|
|
|
|
|
|
// the slash in the closing tag mark e.g. </p>
|
|
bool HTMLParser::IsClosingTagIndicator(wchar_t c)
|
|
{
|
|
return (c == '/');
|
|
}
|
|
|
|
|
|
// the slash in the closing tag mark e.g. </p>
|
|
bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
|
|
{
|
|
return (c == '!');
|
|
}
|
|
|
|
// Returns true for the question mark.
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
    return c == L'?';
}
|
|
|
|
// the '=' operator e.g. class="value"
|
|
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
|
{
|
|
return (c == '=');
|
|
}
|
|
|
|
|
|
|
|
// the slash at the end <img src=".." /> (without '>' character)
|
|
// we assume the size of the mark to be one
|
|
bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
|
|
{
|
|
return (c == '/');
|
|
}
|
|
|
|
|
|
// Returns true for the character which starts an entity.
bool HTMLParser::IsStartingEntityMark(wchar_t c)
{
    return c == L'&';
}
|
|
|
|
|
|
// Returns true for the character which ends an entity.
bool HTMLParser::IsEndingEntityMark(wchar_t c)
{
    return c == L';';
}
|
|
|
|
|
|
|
|
// used for such tags as: script, pre, textarea
|
|
void HTMLParser::ReadTextUntilClosingCommentary()
|
|
{
|
|
while( lastc != -1 )
|
|
{
|
|
if( lastc == '-' )
|
|
{
|
|
tmp_text.clear();
|
|
tmp_text += lastc;
|
|
read_char();
|
|
|
|
if( lastc == '-' )
|
|
{
|
|
tmp_text += lastc;
|
|
read_char();
|
|
|
|
if( !char_was_escaped && IsClosingTagMark(lastc) )
|
|
{
|
|
tmp_text += lastc;
|
|
read_char();
|
|
Put(tmp_text);
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
Put(tmp_text);
|
|
}
|
|
else
|
|
{
|
|
Put(lastc);
|
|
read_char();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
|
|
{
|
|
tmp_text.clear();
|
|
tmp_text += lastc; // opening tag mark
|
|
read_char();
|
|
|
|
SkipWhiteLines(&tmp_text);
|
|
|
|
if( IsClosingTagIndicator(lastc) )
|
|
{
|
|
tmp_text += lastc;
|
|
read_char();
|
|
SkipWhiteLines(&tmp_text);
|
|
ReadItemName(tmp_name);
|
|
|
|
if( IsNameEqual(tmp_name, LastItem().name) )
|
|
{
|
|
SkipAndCheckClosingTag();
|
|
|
|
if( put_closing_tag_as_well )
|
|
{
|
|
Put('<');
|
|
Put('/');
|
|
Put(tmp_name);
|
|
Put('>');
|
|
}
|
|
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
Put(tmp_text);
|
|
Put(tmp_name);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Put(tmp_text);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
|
|
|
|
// used for such tags as: script, pre, textarea
|
|
void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
|
|
{
|
|
while( lastc != -1 )
|
|
{
|
|
if( !char_was_escaped && IsOpeningTagMark(lastc) )
|
|
{
|
|
if( IsClosingTagForLastItem(put_closing_tag_as_well) )
|
|
{
|
|
//CheckNewLine();
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Put(lastc);
|
|
read_char();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
// reading text between html tags
|
|
void HTMLParser::ReadText(bool is_cdata)
|
|
{
|
|
new_item_has_new_line_before = false;
|
|
new_item_has_white_char_before = false;
|
|
|
|
bool was_white_char = false;
|
|
bool was_new_line = false;
|
|
|
|
bool was_non_white_text = false;
|
|
|
|
bool allow_put_new_line = false;
|
|
bool allow_put_space = false;
|
|
|
|
if( current_white_char_mode() == WHITE_MODE_TREE )
|
|
{
|
|
if( LastItem().new_line_after || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) )
|
|
{
|
|
allow_put_new_line = true;
|
|
}
|
|
}
|
|
|
|
Space * text_space = nullptr;
|
|
std::wstring * text_space_wstr = nullptr;
|
|
|
|
if( out_space )
|
|
{
|
|
text_space = &text_space_tmp;
|
|
text_space->clear();
|
|
text_space->add(L"name", L"");
|
|
Space & wstr_space = text_space->add(L"text", L"");
|
|
text_space_wstr = &wstr_space.value.value_wstring;
|
|
}
|
|
|
|
bool was_closing_tag = false;
|
|
|
|
while( lastc != -1 && !was_closing_tag )
|
|
{
|
|
tmp_text.clear();
|
|
was_closing_tag = PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space, is_cdata);
|
|
|
|
if( lastc == -1 || was_closing_tag )
|
|
{
|
|
new_item_has_new_line_before = was_new_line;
|
|
new_item_has_white_char_before = was_white_char;
|
|
}
|
|
|
|
if( !tmp_text.empty() )
|
|
{
|
|
allow_put_new_line = false;
|
|
allow_put_space = false;
|
|
was_non_white_text = true;
|
|
|
|
if( text_space_wstr )
|
|
(*text_space_wstr) += tmp_text;
|
|
}
|
|
|
|
if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
|
|
{
|
|
if( lastc == 10 || IsWhite(lastc) )
|
|
{
|
|
SkipWhiteLines(text_space_wstr);
|
|
PutNonBreakingSpace();
|
|
was_new_line = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
PutNormalWhite(was_white_char, was_new_line, text_space_wstr);
|
|
|
|
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
|
|
{
|
|
allow_put_new_line = false;
|
|
allow_put_space = false;
|
|
|
|
if( was_new_line )
|
|
{
|
|
allow_put_new_line = true;
|
|
LastItem().new_line_in_the_middle = true;
|
|
|
|
if( !was_non_white_text )
|
|
LastItem().new_line_after = true;
|
|
}
|
|
else
|
|
{
|
|
allow_put_space = true;
|
|
}
|
|
|
|
if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line )
|
|
{
|
|
allow_put_new_line = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
|
|
{
|
|
AddTextSpaceToSpaceTree(*text_space);
|
|
}
|
|
|
|
text_space_tmp.clear();
|
|
}
|
|
|
|
|
|
|
|
bool HTMLParser::PrintOpeningItem()
|
|
{
|
|
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
|
return true;
|
|
|
|
return PutOpeningTag();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLParser::ReadItemAttr()
|
|
{
|
|
attr_has_value = false;
|
|
attr_name.clear();
|
|
attr_value.clear();
|
|
|
|
SkipWhiteLines();
|
|
ReadItemAttrName();
|
|
|
|
if( attr_name.empty() )
|
|
return false;
|
|
|
|
SkipWhiteLines();
|
|
|
|
if( !IsAttributeAssignmentMark(lastc) ) // '='
|
|
return true;
|
|
|
|
attr_has_value = true;
|
|
read_char(); // skipping '='
|
|
SkipWhiteLines();
|
|
|
|
bool has_quote = !char_was_escaped && (lastc == '"' || lastc == '\'');
|
|
wchar_t quote_char = lastc;
|
|
|
|
if( has_quote )
|
|
read_char(); // skipping the first quote mark
|
|
|
|
// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
|
|
if( parsing_html )
|
|
ReadItemAttrValue(has_quote, quote_char);
|
|
else
|
|
ReadXMLItemAttrValue(has_quote, quote_char);
|
|
|
|
if( has_quote && !char_was_escaped && lastc == quote_char )
|
|
read_char(); // skipping the last quote mark
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::CheckItemLangAttr()
|
|
{
|
|
if( attr_has_value && IsNameEqual(L"lang", attr_name) )
|
|
{
|
|
LastItem().porphans = nullptr;
|
|
|
|
if( !attr_value.empty() )
|
|
{
|
|
// we are taking the first value only
|
|
attr_value_lower = attr_value[0];
|
|
ToLower(attr_value_lower);
|
|
|
|
OrphansTab::iterator i = orphans_tab.find(attr_value_lower);
|
|
|
|
if( i != orphans_tab.end() )
|
|
LastItem().porphans = &i->second;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::PrintItemAttr()
|
|
{
|
|
size_t i;
|
|
|
|
if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) )
|
|
return;
|
|
|
|
Put(' ');
|
|
Put(attr_name);
|
|
|
|
if( attr_has_value )
|
|
{
|
|
Put(L"=\"");
|
|
|
|
for(i=0 ; i<attr_value.size() ; ++i)
|
|
{
|
|
Put(attr_value[i]);
|
|
|
|
if( i + 1 < attr_value.size() )
|
|
Put(' ');
|
|
}
|
|
|
|
Put('\"');
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::PutItemAttrToSpace()
|
|
{
|
|
Space * space = LastItem().space;
|
|
|
|
if( space )
|
|
{
|
|
Space & attr_tab = space->get_add_space(L"attr");
|
|
Space & attr = attr_tab.add_empty_space(attr_name);
|
|
|
|
if( attr_has_value )
|
|
{
|
|
if( parsing_html )
|
|
{
|
|
attr.set_empty_table();
|
|
|
|
for(size_t i=0 ; i < attr_value.size() ; ++i)
|
|
{
|
|
attr.add(attr_value[i]);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
attr.set(tmp_text);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadItemClosing()
|
|
{
|
|
read_char(); // skipping '/'
|
|
SkipWhiteLines();
|
|
ReadItemName(LastItem().name);
|
|
LastItem().type = Item::closing;
|
|
SkipAndCheckClosingTag();
|
|
|
|
// closing tags are printed later
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadItemSpecial()
|
|
{
|
|
LastItem().type = Item::special;
|
|
|
|
if( !skip_tags )
|
|
{
|
|
if( current_white_char_mode() == WHITE_MODE_TREE && LastItem().new_line_before )
|
|
{
|
|
Put(10);
|
|
PutTabs(LastItem().tree_index);
|
|
}
|
|
|
|
PutOpeningTagMark();
|
|
}
|
|
|
|
LastItem().name = lastc;
|
|
read_char(); // skipping '!' or '?'
|
|
ReadItemName(LastItem().name, false);
|
|
|
|
if( skip_tags )
|
|
{
|
|
SkipAndCheckClosingTag();
|
|
}
|
|
else
|
|
{
|
|
if( LastItem().is_commentary )
|
|
{
|
|
Put(LastItem().name);
|
|
}
|
|
else
|
|
if( LastItem().is_cdata )
|
|
{
|
|
// do nothing
|
|
}
|
|
else
|
|
{
|
|
tmp_text.clear();
|
|
SkipWhiteLines();
|
|
SkipAndCheckClosingTag(&tmp_text);
|
|
Put(LastItem().name);
|
|
Put(' ');
|
|
Put(tmp_text);
|
|
Put('>');
|
|
|
|
if( is_first_item && current_white_char_mode() == WHITE_MODE_TREE && is_equal_nc(LastItem().name.c_str(), L"!doctype") )
|
|
{
|
|
Put(10);
|
|
Put(10);
|
|
SkipWhiteLines();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadItemOpening()
|
|
{
|
|
LastItem().type = Item::opening;
|
|
ReadItemName(LastItem().name);
|
|
AddItemToSpace();
|
|
Space * space = LastItem().space;
|
|
|
|
if( !xml_compact_mode && space )
|
|
space->add(L"name", LastItem().name);
|
|
|
|
if( PrintOpeningItem() )
|
|
{
|
|
while( ReadItemAttr() )
|
|
{
|
|
CheckItemLangAttr();
|
|
PrintItemAttr();
|
|
PutItemAttrToSpace();
|
|
}
|
|
|
|
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
|
|
|
|
if( !skip_tags && !IsNameEqual(no_filter_tag, LastItem().name) )
|
|
{
|
|
if( LastItem().type == Item::simple )
|
|
Put(L" /");
|
|
|
|
PutClosingTagMark();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
 * Called by ReadItem() after every item has been read.
 * The default implementation does nothing
 * (NOTE(review): presumably meant to be overridden - confirm in the header).
 */
void HTMLParser::ItemFound()
{
	// intentionally empty
}
|
|
|
|
/*
 * Hook receiving an entity given as the [str, end) character range.
 * The default implementation does nothing
 * (NOTE(review): presumably meant to be overridden - confirm in the header).
 */
void HTMLParser::EntityFound(const wchar_t * str, const wchar_t * end)
{
	// intentionally empty
}
|
|
|
|
|
|
bool HTMLParser::ReadItem()
|
|
{
|
|
if( lastc == -1 )
|
|
return false;
|
|
|
|
if( !PushStack() )
|
|
return false;
|
|
|
|
LastItem().new_line_before = new_item_has_new_line_before; // new_item_has_new_line_before is set by ReadText() method
|
|
LastItem().white_char_before = new_item_has_white_char_before; // new_item_has_white_char_before is set by ReadText() method
|
|
|
|
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
|
|
LastItem().tree_index += 1;
|
|
|
|
read_char(); // skipping the first opening tag mark '<'
|
|
SkipWhiteLines();
|
|
|
|
if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
|
|
ReadItemSpecial();
|
|
else
|
|
if( IsClosingTagIndicator(lastc) )
|
|
ReadItemClosing();
|
|
else
|
|
ReadItemOpening();
|
|
|
|
// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
|
|
ItemFound();
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
/*
 * Converts an ASCII upper case letter ('A'-'Z') to lower case.
 * Every other character is returned unchanged.
 */
wchar_t HTMLParser::ToLower(wchar_t c)
{
	const bool is_ascii_upper = (c >= 'A' && c <= 'Z');

	return is_ascii_upper ? static_cast<wchar_t>(c - 'A' + 'a') : c;
}
|
|
|
|
|
|
void HTMLParser::ToLower(std::wstring & str)
|
|
{
|
|
size_t i;
|
|
|
|
for(i=0 ; i<str.size() ; ++i)
|
|
str[i] = ToLower(str[i]);
|
|
}
|
|
|
|
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2)
|
|
{
|
|
for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 )
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
return false;
|
|
|
|
if( *name1==0 && *name2==0 )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) name comparison,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2)
{
	const wchar_t * second_name = name2.c_str();

	return IsNameEqual(name1, second_name);
}
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) name comparison,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2)
{
	const wchar_t * first_name = name1.c_str();

	return IsNameEqual(first_name, name2);
}
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) name comparison,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2)
{
	const wchar_t * first_name = name1.c_str();
	const wchar_t * second_name = name2.c_str();

	return IsNameEqual(first_name, second_name);
}
|
|
|
|
|
|
|
|
// len characters from both strings must be equal
|
|
// IMPROVE ME change name to something like IsBeginningNameEqual
|
|
// and move to text.h (pikotools)
|
|
bool HTMLParser::IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len)
|
|
{
|
|
for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len )
|
|
if( ToLower(*name1) != ToLower(*name2) )
|
|
return false;
|
|
|
|
if( len == 0 )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) comparison of the first len characters,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len)
{
	const wchar_t * second_name = name2.c_str();

	return IsNameEqual(name1, second_name, len);
}
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) comparison of the first len characters,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len)
{
	const wchar_t * first_name = name1.c_str();

	return IsNameEqual(first_name, name2, len);
}
|
|
|
|
|
|
/*
 * Case insensitive (ASCII) comparison of the first len characters,
 * forwards to the overload taking two null terminated strings.
 */
bool HTMLParser::IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len)
{
	const wchar_t * first_name = name1.c_str();
	const wchar_t * second_name = name2.c_str();

	return IsNameEqual(first_name, second_name, len);
}
|
|
|
|
|
|
|
|
|
|
|
|
bool HTMLParser::IsLastTag(const wchar_t * name)
|
|
{
|
|
return IsNameEqual(name, LastItem().name);
|
|
}
|
|
|
|
|
|
bool HTMLParser::IsLastTag(const std::wstring & name)
|
|
{
|
|
return IsNameEqual(name, LastItem().name);
|
|
}
|
|
|
|
|
|
// checking exceptions for opening tags
|
|
void HTMLParser::CheckSingleItemExceptions()
|
|
{
|
|
if( IsLastTag(L"meta") ||
|
|
IsLastTag(L"input") ||
|
|
IsLastTag(L"br") ||
|
|
IsLastTag(L"hr") ||
|
|
IsLastTag(L"img") ||
|
|
IsLastTag(L"link") ||
|
|
IsLastTag(L"param") ||
|
|
IsLastTag(L"col") ||
|
|
IsLastTag(L"area") )
|
|
{
|
|
LastItem().type = Item::simple;
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
// move me to a better place
|
|
if( IsLastTag(L"body") )
|
|
LastItem().has_body_tag = true;
|
|
}
|
|
|
|
|
|
/*
 * Switches the white characters mode for tags whose content must be
 * kept untouched: "pre" and the no_filter_tag.
 *
 * For such an item of type Item::opening WHITE_MODE_ORIGIN is pushed to
 * white_char_mode_tab, for any other type the last mode is removed
 * (if there is one).
 *
 * "script" and "textarea" used to be checked here as well (that code was
 * commented out), now only "pre" and the no_filter_tag are considered.
 *
 * NOTE(review): CheckClosingTags() calls this method with items taken
 * from the stack - confirm that their type is not Item::opening at that
 * point, otherwise a closing tag would push a mode instead of popping it.
 */
void HTMLParser::CheckWhiteCharsExceptions(Item & item)
{
	// the no_filter_tag test: move to CheckDifferentContentExceptions?
	const bool keeps_origin_white_chars = IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, no_filter_tag);

	if( !keeps_origin_white_chars )
		return;

	if( item.type == Item::opening )
	{
		white_char_mode_tab.push_back(WHITE_MODE_ORIGIN);
		return;
	}

	if( !white_char_mode_tab.empty() )
	{
		white_char_mode_tab.pop_back();
	}
}
|
|
|
|
|
|
|
|
/*
 * Handles tags whose content is read up to their closing tag with
 * ReadTextUntilClosingTag(): "textarea" always and "script" only when
 * safe_mode is off. After reading the content the item is removed
 * from the stack.
 */
void HTMLParser::CheckDifferentContentExceptions(Item & item)
{
	const bool is_readable_script = !safe_mode && IsNameEqual(item.name, L"script");

	if( is_readable_script || IsNameEqual(item.name, L"textarea") )
	{
		ReadTextUntilClosingTag(true);
		PopStack();
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::CheckStackPrintRest()
|
|
{
|
|
while( stack_len-- > 0 )
|
|
{
|
|
if( stack_len==0 || pstack[stack_len-1].new_line_after )
|
|
{
|
|
if( current_white_char_mode() == WHITE_MODE_TREE )
|
|
{
|
|
Put(10);
|
|
PutTabs(pstack[stack_len-1].tree_index);
|
|
}
|
|
else
|
|
{
|
|
Put(' ');
|
|
}
|
|
}
|
|
|
|
PutClosingTag(pstack[stack_len]);
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::CheckClosingTags()
|
|
{
|
|
int i;
|
|
|
|
if( stack_len == 0 )
|
|
return;
|
|
|
|
// on the stack we have only opening tags
|
|
// but only the last tag is a closing tag
|
|
|
|
if( stack_len == 1 )
|
|
{
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
// looking whether there is a matching opening tag
|
|
for(i=int(stack_len)-2 ; i >= 0 ; --i)
|
|
if( (pstack[i].is_commentary && pstack[stack_len-1].is_commentary) || IsNameEqual(pstack[i].name, pstack[stack_len-1].name) )
|
|
break;
|
|
|
|
if( i < 0 )
|
|
{
|
|
// oops, there is no such an opening tag on the stack
|
|
// we don't print the closing and the missing opening tag
|
|
PopStack();
|
|
return;
|
|
}
|
|
|
|
for(int z=(int)stack_len-2 ; z >= i ; --z)
|
|
{
|
|
CallListener(z); // space from the item can be set as null here (when a should_remove() callback returned true)
|
|
CheckWhiteCharsExceptions(pstack[z]);
|
|
|
|
if( !skip_tags && IsTagSafe(LastItem().name) && !IsNameEqual(no_filter_tag, LastItem().name) )
|
|
{
|
|
if( pstack[z].new_line_after )
|
|
{
|
|
if( current_white_char_mode() == WHITE_MODE_TREE )
|
|
{
|
|
Put(10);
|
|
PutTabs(pstack[z].tree_index);
|
|
}
|
|
}
|
|
|
|
// IMPROVEME
|
|
// in PutClosingTag we test IsTagSafe() and no_filter_tag too
|
|
PutClosingTag(pstack[z]);
|
|
pstack[z].Clear();
|
|
}
|
|
}
|
|
|
|
// invalidate items on the stack
|
|
stack_len = i;
|
|
}
|
|
|
|
|
|
bool HTMLParser::PrintRest()
|
|
{
|
|
//const wchar_t * start = pchar;
|
|
|
|
// in safe mode we do not print the rest html code
|
|
if( safe_mode || skip_tags )
|
|
return false;
|
|
|
|
bool was_chars = false;
|
|
|
|
while( lastc != -1 )
|
|
{
|
|
Put(lastc);
|
|
read_char();
|
|
was_chars = true;
|
|
}
|
|
|
|
return was_chars;
|
|
|
|
// if( pchar > start )
|
|
// {
|
|
// Put(start, pchar);
|
|
// return true;
|
|
// }
|
|
|
|
//return false;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::AddItemToSpace()
|
|
{
|
|
if( out_space && stack_len > 0 )
|
|
{
|
|
Space * parent = out_space;
|
|
|
|
if( stack_len > 1 )
|
|
{
|
|
parent = pstack[stack_len-2].space;
|
|
}
|
|
|
|
if( xml_compact_mode )
|
|
{
|
|
bool has_parent_object_name = false;
|
|
|
|
if( parent->is_object() )
|
|
{
|
|
Space::ObjectType::iterator i = parent->value.value_object.find(pstack[stack_len-1].name);
|
|
|
|
if( i != parent->value.value_object.end() )
|
|
{
|
|
has_parent_object_name = true;
|
|
|
|
if( i->second->is_table() )
|
|
{
|
|
Space & child = i->second->add_empty_space();
|
|
pstack[stack_len-1].space = &child;
|
|
}
|
|
else
|
|
{
|
|
Space * old_space = i->second;
|
|
i->second = new Space();
|
|
i->second->set_empty_table();
|
|
i->second->value.value_table.push_back(old_space);
|
|
Space & child = i->second->add_empty_space();
|
|
pstack[stack_len-1].space = &child;
|
|
}
|
|
}
|
|
}
|
|
|
|
if( !has_parent_object_name )
|
|
{
|
|
Space & space = parent->add_empty_space(pstack[stack_len-1].name);
|
|
pstack[stack_len-1].space = &space;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Space & childs_tab = parent->get_add_space(L"childs");
|
|
Space & child = childs_tab.add_empty_space();
|
|
pstack[stack_len-1].space = &child;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::RemoveLastSpace(size_t index)
|
|
{
|
|
if( out_space )
|
|
{
|
|
Space * parent = out_space;
|
|
|
|
if( index > 0 )
|
|
{
|
|
parent = pstack[index - 1].space;
|
|
}
|
|
|
|
if( xml_compact_mode )
|
|
{
|
|
if( parent->is_object() )
|
|
{
|
|
Space::ObjectType::iterator i = parent->value.value_object.find(pstack[index].name);
|
|
|
|
if( i != parent->value.value_object.end() )
|
|
{
|
|
if( i->second->is_table() )
|
|
{
|
|
size_t len = i->second->table_size();
|
|
|
|
if( len > 0 && i->second->value.value_table[len-1] == pstack[index].space )
|
|
{
|
|
i->second->remove(len - 1);
|
|
pstack[index].space = nullptr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if( i->second == pstack[index].space )
|
|
{
|
|
parent->remove(i);
|
|
pstack[index].space = nullptr;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Space * childs_tab = parent->get_space(L"childs");
|
|
|
|
if( childs_tab && childs_tab->is_table() )
|
|
{
|
|
size_t len = childs_tab->table_size();
|
|
|
|
if( len > 0 && childs_tab->value.value_table[len-1] == pstack[index].space )
|
|
{
|
|
childs_tab->remove(len - 1);
|
|
pstack[index].space = nullptr;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::AddTextSpaceToSpaceTree(const Space & space)
|
|
{
|
|
const std::wstring * text = space.get_wstr(L"text");
|
|
|
|
if( out_space && stack_len > 0 && text )
|
|
{
|
|
if( xml_compact_mode )
|
|
{
|
|
bool has_space_text = false;
|
|
|
|
if( LastItem().space->is_object() )
|
|
{
|
|
Space::ObjectType::iterator i = LastItem().space->value.value_object.find(L"text");
|
|
|
|
if( i != LastItem().space->value.value_object.end() )
|
|
{
|
|
has_space_text = true;
|
|
|
|
if( i->second->is_table() )
|
|
{
|
|
i->second->add(*text);
|
|
}
|
|
else
|
|
{
|
|
Space * old_space = i->second;
|
|
i->second = new Space();
|
|
i->second->set_empty_table();
|
|
i->second->value.value_table.push_back(old_space);
|
|
i->second->add(*text);
|
|
}
|
|
}
|
|
}
|
|
|
|
if( !has_space_text )
|
|
{
|
|
LastItem().space->add(L"text", *text);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
Space & childs_tab = LastItem().space->get_add_space(L"childs");
|
|
childs_tab.add(space);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::CallListener(size_t index)
|
|
{
|
|
if( listener )
|
|
{
|
|
listener->item_parsed(pstack[index]);
|
|
|
|
if( listener->should_remove(pstack[index]) )
|
|
{
|
|
RemoveLastSpace(index);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::ReadLoop()
|
|
{
|
|
while( status == ok && ReadItem() )
|
|
{
|
|
bool was_cdata = false;
|
|
|
|
if( LastItem().type == Item::opening )
|
|
{
|
|
if( parsing_html )
|
|
{
|
|
CheckSingleItemExceptions();
|
|
}
|
|
|
|
CheckWhiteCharsExceptions(LastItem());
|
|
CheckDifferentContentExceptions(LastItem());
|
|
}
|
|
else
|
|
if( LastItem().type == Item::special )
|
|
{
|
|
if( LastItem().is_commentary )
|
|
ReadTextUntilClosingCommentary();
|
|
|
|
if( LastItem().is_cdata )
|
|
was_cdata = true;
|
|
|
|
PopStack();
|
|
}
|
|
else
|
|
if( LastItem().type == Item::simple )
|
|
{
|
|
if( stack_len > 0 )
|
|
{
|
|
CallListener(stack_len - 1);
|
|
}
|
|
|
|
PopStack();
|
|
}
|
|
else
|
|
if( LastItem().type == Item::closing )
|
|
{
|
|
CheckClosingTags();
|
|
}
|
|
else
|
|
{
|
|
PopStack();
|
|
}
|
|
|
|
if( status == ok )
|
|
{
|
|
ReadText(was_cdata);
|
|
}
|
|
|
|
is_first_item = false;
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::read_char_from_entity_buffer()
|
|
{
|
|
if( escaped_char_index < escaped_chars_buffer.size() )
|
|
{
|
|
lastc = escaped_chars_buffer[escaped_char_index];
|
|
escaped_char_index += 1;
|
|
|
|
if( escaped_char_index >= escaped_chars_buffer.size() )
|
|
{
|
|
escaped_chars_buffer.clear();
|
|
escaped_char_index = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
lastc = -1;
|
|
}
|
|
}
|
|
|
|
|
|
void HTMLParser::read_xml_entity()
|
|
{
|
|
const size_t max_entity_length = 6; // length of "'" string
|
|
escaped_chars_buffer.clear();
|
|
escaped_char_index = 0;
|
|
escaped_chars_buffer += '&';
|
|
|
|
do
|
|
{
|
|
read_char_no_escape();
|
|
|
|
if( lastc != -1 )
|
|
{
|
|
escaped_chars_buffer += lastc;
|
|
}
|
|
}
|
|
while( escaped_chars_buffer.size() < max_entity_length && lastc != -1 && lastc != ';' );
|
|
}
|
|
|
|
|
|
bool HTMLParser::check_escape_sequentions()
|
|
{
|
|
if( escaped_chars_buffer == L"&" )
|
|
{
|
|
lastc = '&';
|
|
char_was_escaped = true;
|
|
}
|
|
else
|
|
if( escaped_chars_buffer == L"<" )
|
|
{
|
|
lastc = '<';
|
|
char_was_escaped = true;
|
|
}
|
|
else
|
|
if( escaped_chars_buffer == L">" )
|
|
{
|
|
lastc = '>';
|
|
char_was_escaped = true;
|
|
}
|
|
else
|
|
if( escaped_chars_buffer == L""" )
|
|
{
|
|
lastc = '"';
|
|
char_was_escaped = true;
|
|
}
|
|
else
|
|
if( escaped_chars_buffer == L"'" )
|
|
{
|
|
lastc = '\'';
|
|
char_was_escaped = true;
|
|
}
|
|
|
|
if( char_was_escaped )
|
|
{
|
|
escaped_chars_buffer.clear();
|
|
escaped_char_index = 0;
|
|
}
|
|
|
|
return char_was_escaped;
|
|
}
|
|
|
|
|
|
|
|
int HTMLParser::read_char()
|
|
{
|
|
char_was_escaped = false;
|
|
|
|
if( escaped_char_index < escaped_chars_buffer.size() )
|
|
{
|
|
read_char_from_entity_buffer();
|
|
}
|
|
else
|
|
{
|
|
read_char_no_escape();
|
|
|
|
if( !filter_mode && lastc == '&' )
|
|
{
|
|
read_xml_entity();
|
|
|
|
if( !check_escape_sequentions() )
|
|
{
|
|
read_char_from_entity_buffer();
|
|
}
|
|
}
|
|
}
|
|
|
|
return lastc;
|
|
}
|
|
|
|
|
|
|
|
void HTMLParser::Read()
|
|
{
|
|
read_char(); // put first character to lastc
|
|
is_first_item = true;
|
|
|
|
white_char_mode_tab.clear();
|
|
white_char_mode_tab.push_back(white_mode);
|
|
|
|
if( current_white_char_mode() != WHITE_MODE_ORIGIN )
|
|
SkipWhiteLines();
|
|
|
|
// it can be some text or white lines before the first html tag (we print it if using filtering)
|
|
// but they are not added to the Space tree
|
|
ReadText(false);
|
|
|
|
// reading the whole html source
|
|
ReadLoop();
|
|
|
|
// sometimes there can remain some html source (when there is no space on the stack)
|
|
// we print the rest html without filtering (only if safe_mode is false)
|
|
if( !PrintRest() )
|
|
CheckStackPrintRest();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|