diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 7b422f5..43e7d57 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -59,6 +59,7 @@ void HTMLParser::Item::Clear()
new_line_in_the_middle = false;
has_body_tag = false;
tree_index = 0;
+ space = nullptr;
}
@@ -68,6 +69,27 @@ HTMLParser::Item::Item()
}
+/*
+ * Parses the html source given as a wide character string and builds
+ * the result in 'space' (the previous content of 'space' is cleared first).
+ *
+ * No output string is used in this mode: out_string is set to nullptr
+ * and only out_space points to a valid object while reading.
+ */
+void HTMLParser::parse_html(const wchar_t * in, Space & space)
+{
+    // input: a wide character string (neither a file nor an ascii string)
+    pchar_unicode = in;
+    pchar_ascii = 0;
+    reading_from_wchar_string = true;
+    reading_from_file = false;
+    parsing_html = true;
+
+    // output: only the Space tree, starting from an empty space
+    out_space = &space;
+    out_string = nullptr;
+    space.clear();
+
+    // the parser starts with an empty stack at the beginning of a line
+    line_len = 0;
+    stack_len = 0;
+
+    Init();
+    Read();
+    Uninit();
+}
+
+
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
{
@@ -79,6 +101,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0;
out_string = &out;
+ out_space = nullptr;
//last_new_line = false;
line_len = 0;
out_string->clear();
@@ -347,6 +370,8 @@ bool HTMLParser::PushStack()
return true;
}
+
+
void HTMLParser::PopStack()
{
if( stack_len == 0 )
@@ -609,7 +634,9 @@ void HTMLParser::CheckChar(wchar_t c)
void HTMLParser::Put(wchar_t c)
{
- (*out_string) += c;
+ if( out_string )
+ (*out_string) += c;
+
CheckChar(c);
}
@@ -620,7 +647,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
return;
size_t len = end - str;
- out_string->append(str, len);
+
+ if( out_string )
+ out_string->append(str, len);
for( ; str < end ; ++str)
CheckChar(*str);
@@ -632,7 +661,8 @@ void HTMLParser::Put(const std::wstring & str)
{
if( !str.empty() )
{
- out_string->append(str);
+ if( out_string )
+ out_string->append(str);
for(size_t i=0 ; i < str.size() ; ++i)
CheckChar(str[i]);
@@ -805,7 +835,7 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
}
-void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
+void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
{
was_white_char = false;
was_new_line = false;
@@ -817,6 +847,9 @@ void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
else
was_white_char = true;
+ if( result_text )
+ (*result_text) += lastc;
+
if( current_white_char_mode() == WHITE_MODE_ORIGIN )
{
Put(lastc);
@@ -939,7 +972,10 @@ void HTMLParser::PutTabs(size_t len)
len = 30;
for(size_t i=0 ; i < (len*tab_size) ; ++i)
- (*out_string) += ' '; // we do not add them to 'line_len'
+ {
+ if( out_string )
+ (*out_string) += ' '; // we do not add them to 'line_len'
+ }
}
@@ -1140,6 +1176,18 @@ void HTMLParser::ReadText()
}
}
+ Space * text_space = nullptr;
+ std::wstring * text_space_wstr = nullptr;
+
+ if( out_space )
+ {
+ text_space = &text_space_tmp;
+ text_space->clear();
+ text_space->add(L"name", L"");
+ Space & wstr_space = text_space->add(L"text", L"");
+ text_space_wstr = &wstr_space.value.value_wstring;
+ }
+
while( lastc != -1 && !IsOpeningTagMark(lastc) )
{
tmp_text.clear();
@@ -1150,19 +1198,22 @@ void HTMLParser::ReadText()
allow_put_new_line = false;
allow_put_space = false;
was_non_white_text = true;
+
+ if( text_space_wstr )
+ (*text_space_wstr) += tmp_text;
}
if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
{
if( lastc == 10 || IsWhite(lastc) )
{
- SkipWhiteLines();
+ SkipWhiteLines(text_space_wstr);
PutNonBreakingSpace();
}
}
else
{
- PutNormalWhite(was_white_char, was_new_line);
+ PutNormalWhite(was_white_char, was_new_line, text_space_wstr);
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
{
@@ -1190,6 +1241,12 @@ void HTMLParser::ReadText()
}
}
+ if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
+ {
+ AddSpaceToSpaceTree(*text_space);
+ }
+
+ text_space_tmp.clear();
new_item_has_new_line_before = was_new_line;
}
@@ -1292,6 +1349,28 @@ size_t i;
}
+/*
+ * Stores the current attribute (attr_name, attr_value) in the Space
+ * attached to the item from the top of the stack.
+ *
+ * The attribute is added as an empty space named attr_name inside the "attr" space
+ * of the item. If the attribute has a value then the space is turned into a table
+ * and each element of attr_value is added to it.
+ * Nothing is done if the item has no Space attached.
+ */
+void HTMLParser::PutItemAttrToSpace()
+{
+    Space * item_space = LastItem().space;
+
+    if( !item_space )
+        return;
+
+    Space & attributes = item_space->get_add_space(L"attr");
+    Space & attribute = attributes.add_empty_space(attr_name);
+
+    // an attribute without a value stays as an empty space
+    if( !attr_has_value )
+        return;
+
+    attribute.set_empty_table();
+    size_t index = 0;
+
+    while( index < attr_value.size() )
+    {
+        attribute.add(attr_value[index]);
+        index += 1;
+    }
+}
+
+
+
+
void HTMLParser::ReadItemClosing()
{
read_char(); // skipping '/'
@@ -1358,13 +1437,19 @@ void HTMLParser::ReadItemOpening()
{
LastItem().type = Item::opening;
ReadItemName(LastItem().name);
+ AddItemToSpace();
+ Space * space = LastItem().space;
+ if( space )
+ space->add(L"name", LastItem().name);
+
if( PrintOpeningItem() )
{
while( ReadItemAttr() )
{
CheckItemLangAttr();
PrintItemAttr();
+ PutItemAttrToSpace();
}
SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
@@ -1748,6 +1833,36 @@ bool HTMLParser::PrintRest()
+/*
+ * Attaches a Space to the item from the top of the stack.
+ * Does nothing when no Space tree is being built (out_space is null)
+ * or when the stack is empty.
+ *
+ * The first item on the stack uses out_space directly, each nested item gets
+ * a new empty space appended to the "childs" space of its parent item.
+ *
+ * If the parent item has no Space attached (Item::Clear() resets the pointer
+ * to nullptr) then the current item is left without a Space as well,
+ * instead of dereferencing a null pointer.
+ */
+void HTMLParser::AddItemToSpace()
+{
+    if( !out_space || stack_len == 0 )
+        return;
+
+    Item & item = pstack[stack_len-1];
+
+    if( stack_len == 1 )
+    {
+        item.space = out_space;
+        return;
+    }
+
+    // stack_len > 1
+    Space * parent_space = pstack[stack_len-2].space;
+
+    if( !parent_space )
+    {
+        // the parent is not a part of the tree so this item cannot be attached either
+        item.space = nullptr;
+        return;
+    }
+
+    Space & childs_tab = parent_space->get_add_space(L"childs");
+
+    // NOTE(review): the stored pointer assumes that Space keeps its child nodes
+    // at stable addresses when more children are added later -- confirm
+    item.space = &childs_tab.add_empty_space();
+}
+
+
+/*
+ * Adds 'space' to the "childs" space of the item from the top of the stack.
+ *
+ * Does nothing when no Space tree is being built (out_space is null),
+ * when the stack is empty, or when the current item has no Space attached
+ * (the same check as in PutItemAttrToSpace(), it prevents a null pointer dereference).
+ */
+void HTMLParser::AddSpaceToSpaceTree(const Space & space)
+{
+    if( !out_space || stack_len == 0 )
+        return;
+
+    Space * item_space = LastItem().space;
+
+    if( item_space )
+    {
+        Space & childs_tab = item_space->get_add_space(L"childs");
+        childs_tab.add(space);
+    }
+}
+
+
+
void HTMLParser::ReadLoop()
{
while( ReadItem() )
@@ -1759,6 +1874,7 @@ void HTMLParser::ReadLoop()
CheckSingleItemExceptions();
}
+
CheckWhiteCharsExceptions(LastItem());
CheckDifferentContentExceptions(LastItem());
}
@@ -1804,7 +1920,8 @@ void HTMLParser::Read()
if( current_white_char_mode() != WHITE_MODE_ORIGIN )
SkipWhiteLines();
- // it can be some text or white lines before the first html tag (we print it)
+ // there can be some text or white lines before the first html tag (we print them when filtering)
+ // but they are not added to the Space tree
ReadText();
// reading the whole html source
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 8bf6969..9575f93 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -43,6 +43,7 @@
#include
#include
#include "convert/baseparser.h"
+#include "space/space.h"
namespace pt
@@ -106,6 +107,9 @@ public:
virtual ~HTMLParser();
+ void parse_html(const wchar_t * in, Space & space);
+
+
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
@@ -228,6 +232,8 @@ protected:
size_t tree_index;
+ Space * space;
+
void Clear();
Item();
};
@@ -331,6 +337,7 @@ protected:
bool ReadItemAttr();
void CheckItemLangAttr();
void PrintItemAttr();
+ void PutItemAttrToSpace();
void ReadItemClosing();
void ReadItemSpecial();
@@ -342,17 +349,22 @@ protected:
void CheckChar(wchar_t c);
void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
- void PutNormalWhite(bool & was_white_char, bool & was_new_line);
+ void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
void PutTabs(size_t len);
void PutNonBreakingSpace();
void CalcOrphansMaxLen(Orphans & orphans);
+ void AddItemToSpace();
+ void AddSpaceToSpaceTree(const Space & space);
+
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
+ Space * out_space;
+ Space text_space_tmp;
std::vector white_char_mode_tab;