HTMLParser: added the ability to parse HTML into a Space object

added method: HTMLParser::parse_html(const wchar_t * in, Space & space)
2021-08-07 21:21:16 +02:00
parent 7fcfdac52f
commit b8a03bf852
2 changed files with 138 additions and 9 deletions
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -59,6 +59,7 @@ void HTMLParser::Item::Clear()
 	new_line_in_the_middle = false;
 	has_body_tag    = false;
 	tree_index      = 0;
+	space           = nullptr;
 }


@@ -68,6 +69,27 @@ HTMLParser::Item::Item()
 }


+void HTMLParser::parse_html(const wchar_t * in, Space & space)
+{
+	parsing_html              = true;
+	reading_from_file         = false;
+	reading_from_wchar_string = true;
+	pchar_unicode             = in;
+	pchar_ascii               = 0;
+
+	stack_len     = 0;
+	out_string    = nullptr;
+	out_space     = &space;
+	//last_new_line = false;
+	line_len      = 0;
+	out_space->clear();
+
+	Init();
+	Read();
+	Uninit();
+}
+
+

 void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
@@ -79,6 +101,7 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)

 	stack_len     = 0;
 	out_string    = &out;
+	out_space     = nullptr;
 	//last_new_line = false;
 	line_len      = 0;
 	out_string->clear();
@@ -347,6 +370,8 @@ bool HTMLParser::PushStack()
 return true;
 }

+
+
 void HTMLParser::PopStack()
 {
 	if( stack_len == 0 )
@@ -609,7 +634,9 @@ void HTMLParser::CheckChar(wchar_t c)

 void HTMLParser::Put(wchar_t c)
 {
-	(*out_string) += c;
+	if( out_string )
+		(*out_string) += c;
+
 	CheckChar(c);
 }

@@ -620,7 +647,9 @@ void HTMLParser::Put(const wchar_t * str, const wchar_t * end)
 		return;

 	size_t len = end - str;
-	out_string->append(str, len);
+
+	if( out_string )
+		out_string->append(str, len);

 	for( ; str < end ; ++str)
 		CheckChar(*str);
@@ -632,7 +661,8 @@ void HTMLParser::Put(const std::wstring & str)
 {
 	if( !str.empty() )
 	{
-		out_string->append(str);
+		if( out_string )
+			out_string->append(str);

 		for(size_t i=0 ; i < str.size() ; ++i)
 			CheckChar(str[i]);
@@ -805,7 +835,7 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
 }


-void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
+void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text)
 {
 	was_white_char = false;
 	was_new_line = false;
@@ -817,6 +847,9 @@ void HTMLParser::PutNormalWhite(bool & was_white_char, bool & was_new_line)
 		else
 			was_white_char = true;

+		if( result_text )
+			(*result_text) += lastc;
+
 		if( current_white_char_mode() == WHITE_MODE_ORIGIN )
 		{
 			Put(lastc);
@@ -939,7 +972,10 @@ void HTMLParser::PutTabs(size_t len)
 		len = 30;

 	for(size_t i=0 ; i < (len*tab_size) ; ++i)
-		(*out_string) += ' '; // we do not add them to 'line_len'
+	{
+		if( out_string )
+			(*out_string) += ' '; // we do not add them to 'line_len'
+	}
 }


@@ -1140,6 +1176,18 @@ void HTMLParser::ReadText()
 		}
 	}

+	Space * text_space = nullptr;
+	std::wstring * text_space_wstr = nullptr;
+
+	if( out_space )
+	{
+		text_space = &text_space_tmp;
+		text_space->clear();
+		text_space->add(L"name", L"");
+		Space & wstr_space = text_space->add(L"text", L"");
+		text_space_wstr = &wstr_space.value.value_wstring;
+	}
+
 	while( lastc != -1 && !IsOpeningTagMark(lastc) )
 	{
 		tmp_text.clear();
@@ -1150,19 +1198,22 @@ void HTMLParser::ReadText()
 			allow_put_new_line = false;
 			allow_put_space = false;
 			was_non_white_text = true;
+
+			if( text_space_wstr )
+				(*text_space_wstr) += tmp_text;
 		}

 		if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) )
 		{
 			if( lastc == 10 || IsWhite(lastc) )
 			{
-				SkipWhiteLines();
+				SkipWhiteLines(text_space_wstr);
 				PutNonBreakingSpace();
 			}
 		}
 		else
 		{
-			PutNormalWhite(was_white_char, was_new_line);
+			PutNormalWhite(was_white_char, was_new_line, text_space_wstr);

 			if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
 			{
@@ -1190,6 +1241,12 @@ void HTMLParser::ReadText()
 		}
 	}

+	if( text_space_wstr && !text_space_wstr->empty() && was_non_white_text )
+	{
+		AddSpaceToSpaceTree(*text_space);
+	}
+
+	text_space_tmp.clear();
 	new_item_has_new_line_before = was_new_line;
 }

@@ -1292,6 +1349,28 @@ size_t i;
 }


+void HTMLParser::PutItemAttrToSpace()
+{
+	Space * space = LastItem().space;
+
+	if( space )
+	{
+		Space & attr_tab = space->get_add_space(L"attr");
+		Space & attr = attr_tab.add_empty_space(attr_name);
+
+		if( attr_has_value )
+		{
+			attr.set_empty_table();
+
+			for(size_t i=0 ; i < attr_value.size() ; ++i)
+			{
+				attr.add(attr_value[i]);
+			}
+		}
+	}
+}
+
+
 void HTMLParser::ReadItemClosing()
 {
 	read_char(); // skipping '/'
@@ -1358,13 +1437,19 @@ void HTMLParser::ReadItemOpening()
 {
 	LastItem().type = Item::opening;
 	ReadItemName(LastItem().name);
+	AddItemToSpace();
+	Space * space = LastItem().space;
 	
+	if( space )
+		space->add(L"name", LastItem().name);
+
 	if( PrintOpeningItem() )
 	{
 		while( ReadItemAttr() )
 		{
 			CheckItemLangAttr();
 			PrintItemAttr();
+			PutItemAttrToSpace();
 		}

 		SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple'
@@ -1748,6 +1833,36 @@ bool HTMLParser::PrintRest()



+void HTMLParser::AddItemToSpace()
+{
+	if( out_space && stack_len > 0 )
+	{
+		if( stack_len == 1 )
+		{
+			pstack[stack_len-1].space = out_space;
+		}
+		else
+		{
+			// stack_len > 1
+			Space & childs_tab = pstack[stack_len-2].space->get_add_space(L"childs");
+			Space & child = childs_tab.add_empty_space();
+			pstack[stack_len-1].space = &child;
+		}
+	}
+}
+
+
+void HTMLParser::AddSpaceToSpaceTree(const Space & space)
+{
+	if( out_space && stack_len > 0 )
+	{
+		Space & childs_tab = LastItem().space->get_add_space(L"childs");
+		childs_tab.add(space);
+	}
+}
+
+
+
 void HTMLParser::ReadLoop()
 {
 	while( ReadItem() )
@@ -1759,6 +1874,7 @@ void HTMLParser::ReadLoop()
 				CheckSingleItemExceptions();
 			}

+
 			CheckWhiteCharsExceptions(LastItem());
 			CheckDifferentContentExceptions(LastItem());
 		}
@@ -1804,7 +1920,8 @@ void HTMLParser::Read()
 	if( current_white_char_mode() != WHITE_MODE_ORIGIN )
 		SkipWhiteLines();

-	// it can be some text or white lines before the first html tag (we print it)
+	// there can be some text or white lines before the first html tag (we print them when filtering)
+	// but they are not added to the Space tree
 	ReadText();

 	// reading the whole html source
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -43,6 +43,7 @@
 #include <vector>
 #include <algorithm>
 #include "convert/baseparser.h"
+#include "space/space.h"


 namespace pt
@@ -106,6 +107,9 @@ public:
 	virtual ~HTMLParser();


+	void parse_html(const wchar_t * in, Space & space);
+
+
 	// main methods used for filtering
 	void Filter(const wchar_t * in, std::wstring & out);
 	void Filter(const std::wstring & in, std::wstring & out);
@@ -228,6 +232,8 @@ protected:

 		size_t tree_index;

+		Space * space;
+
 		void Clear();
 		Item();
 	};
@@ -331,6 +337,7 @@ protected:
 	bool ReadItemAttr();
 	void CheckItemLangAttr();
 	void PrintItemAttr();
+	void PutItemAttrToSpace();

 	void ReadItemClosing();
 	void ReadItemSpecial();
@@ -342,17 +349,22 @@ protected:
 	void CheckChar(wchar_t c);

 	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
-	void PutNormalWhite(bool & was_white_char, bool & was_new_line);
+	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);

 	void PutTabs(size_t len);
 	void PutNonBreakingSpace();
 	void CalcOrphansMaxLen(Orphans & orphans);

+	void AddItemToSpace();
+	void AddSpaceToSpaceTree(const Space & space);
+
 	Item empty;
 	Item * pstack;			// stack pointer
 	size_t stack_len;		// length of the stack
 	wchar_t * buffer;		// buffer used when printing
 	std::wstring * out_string;
+	Space * out_space;
+	Space text_space_tmp;

 	std::vector<int> white_char_mode_tab;