HTMLParser: start working on xml mode

added methods:
  Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
  Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
  Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
  Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
2021-08-10 21:56:04 +02:00
parent b1cc64a29b
commit 2576eb12d1
2 changed files with 158 additions and 13 deletions
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 	pchar_ascii               = 0;
 	xml_compact_mode          = compact_mode;

+	status = ok;
+	line = 1;
+
 	stack_len     = 0;
 	out_string    = nullptr;
 	out_space     = &space;
@@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
 }


+HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{	// opens file_name, parses it in xml mode into out_space and returns the resulting status
+	parsing_html = false;		// xml mode, e.g. ReadItemAttr() then calls ReadXMLItemAttrValue()
+	reading_from_file = true;	// presumably makes the reader take characters from the file opened below - confirm
+	xml_compact_mode          = compact_mode;
+	// start from a clean state: no error, first line, empty stack
+	status = ok;
+	line = 1;
+	stack_len     = 0;
+	out_string    = nullptr;
+	line_len      = 0;
+	// the out_space parameter hides the member of the same name, hence this->
+	this->out_space = &out_space;
+	// with clear_space == false the space is not cleared here
+	if( clear_space )
+		this->out_space->clear();
+	// file is presumably a std file stream: clear() resets its state flags before open() - confirm
+	file.clear();
+	file.open(file_name, std::ios_base::binary | std::ios_base::in);
+
+	if( file )
+	{
+		Init();
+		Read();
+		Uninit();
+
+		file.close();
+	}
+	else
+	{
+		status = cant_open_file;	// nothing was parsed, only clear_space (if set) has been applied to out_space
+	}
+
+	return status;
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{	// std::string overload: forwards all arguments to the const char * version
+	return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
+{	// wide string overload: the name is converted and passed to the const char * version
+	std::string file_name_utf8;
+	// wide_to_utf8() is defined elsewhere; it is expected to store the utf-8 form of file_name in file_name_utf8
+	wide_to_utf8(file_name, file_name_utf8);
+	return parse_xml_file(file_name_utf8.c_str(), out_space, compact_mode, clear_space);
+}
+
+
+HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
+{	// std::wstring overload: forwards all arguments to the const wchar_t * version
+	return parse_xml_file(file_name.c_str(), out_space, compact_mode, clear_space);
+}
+
+

 void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
 {
@@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
 }


+int HTMLParser::get_last_parsed_line()
+{	// returns the current value of the line counter
+	return line;	// line is set to 1 when parse_html() or parse_xml_file() starts
+}
+
+
 void HTMLParser::SetSomeDefaults()
 {
 	white_mode  = WHITE_MODE_ORIGIN;
@@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
+		c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
 		return true;

 return false;
@@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
 	if( (c>='a' && c<='z') ||
 		(c>='A' && c<='Z') ||
 		(c>='0' && c<='9') ||
-		c=='-' || c==':' )
+		c=='-' || c==':' || c=='_')
 		return true;

 return false;
@@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
 }


+void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
+{	// reads an xml attribute value as one raw string into tmp_text
+	attr_value.clear();	// only emptied here, in xml mode PutItemAttrToSpace() uses tmp_text
+	tmp_text.clear();
+	// lastc == -1 presumably means the end of the input - confirm in read_char()
+	while( lastc != -1 )
+	{
+		if( has_quote )
+		{
+			if( lastc == quote_char )	// the closing quote is left in lastc, ReadItemAttr() skips it
+				break;
+		}
+		else
+		{
+			if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )	// 10 is the line feed character
+				break;
+		}
+		// characters over the limit are still read but they are not stored
+		// IMPROVEME add support for analyze_entities?
+		if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )	// NOTE(review): '<=' lets the size reach MAXLEN + 1 - confirm it is intended
+			tmp_text += lastc;
+
+		read_char();
+	}
+}
+
+
+
 void HTMLParser::CheckChar(wchar_t c)
 {
 	if( c == 10 )
@@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
 	return (c == '!');
 }

+bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
+{	// the '?' character put just after '<' e.g. <?xml ... ?>
+	return (c == '?');	// when true ReadItem() reads the tag with ReadItemSpecial()
+}

 // the '=' operator e.g. class="value"
 bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
@@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
 	if( has_quote )
 		read_char();			// skipping the first quote mark

-	ReadItemAttrValue(has_quote, quote_char);
+	// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
+	if( parsing_html )
+		ReadItemAttrValue(has_quote, quote_char);
+	else
+		ReadXMLItemAttrValue(has_quote, quote_char);

 	if( has_quote && lastc == quote_char )
 		read_char();			// skipping the last quote mark
@@ -1361,11 +1464,18 @@ void HTMLParser::PutItemAttrToSpace()

 		if( attr_has_value )
 		{
-			attr.set_empty_table();
-
-			for(size_t i=0 ; i < attr_value.size() ; ++i)
+			if( parsing_html )
 			{
-				attr.add(attr_value[i]);
+				attr.set_empty_table();
+
+				for(size_t i=0 ; i < attr_value.size() ; ++i)
+				{
+					attr.add(attr_value[i]);
+				}
+			}
+			else
+			{
+				attr.set(tmp_text);
 			}
 		}
 	}
@@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
 		PutOpeningTagMark();
 	}

-	read_char(); // skipping '!'
-	LastItem().name = '!';
+	LastItem().name = lastc;
+	read_char(); // skipping '!' or '?'
 	ReadItemName(LastItem().name, false);

 	if( skip_tags )
@@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
 	read_char();	// skipping the first opening tag mark '<'
 	SkipWhiteLines();

-	if( IsSpecialTagIndicator(lastc) )
+	if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
 		ReadItemSpecial();
 	else
 	if( IsClosingTagIndicator(lastc) )
@@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)

 void HTMLParser::ReadLoop()
 {
-	while( ReadItem() )
+	while( status == ok && ReadItem() )
 	{
 		if( LastItem().type == Item::opening )
 		{
@@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
 				CheckSingleItemExceptions();
 			}

-
 			CheckWhiteCharsExceptions(LastItem());
 			CheckDifferentContentExceptions(LastItem());
 		}
@@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
 			PopStack();
 		}

-		ReadText();
+		if( status == ok )
+		{
+			ReadText();
+		}

 		is_first_item = false;
 	}
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -95,12 +95,25 @@ class HTMLParser : public BaseParser
 {
 public:

+
+	/*
+		status of parsing, returned by the parse_xml_file() methods and kept in the status member
+	*/
+	enum Status { ok, cant_open_file, syntax_error };	// cant_open_file: set by parse_xml_file() when the file cannot be opened
+
+
 	enum OrphanMode
 	{
 		orphan_nbsp,		// putting "&nbsp;" string
 		orphan_160space		// putting 160 ascii code
 	};

+
+	/*
+		the last status of parsing, set by the parse_html() and parse_xml_file() methods
+	*/
+	Status status;
+
 	HTMLParser();
 	HTMLParser(const HTMLParser & f);
 	HTMLParser & operator=(const HTMLParser & f);
@@ -109,12 +122,30 @@ public:

 	void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);

+	Status parse_xml_file(const char * file_name,         Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const std::string & file_name,  Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const wchar_t * file_name,      Space & out_space, bool compact_mode = false, bool clear_space = true);
+	Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
+
+

 	// main methods used for filtering
 	void Filter(const wchar_t * in, std::wstring & out);
 	void Filter(const std::wstring & in, std::wstring & out);


+
+	/*
+	 *
+	 * returns the number of the last parsed line
+	 * can be used to obtain the line in which there was a syntax error
+	 *
+	 */
+	int get_last_parsed_line();
+
+
+
+
 	const static int WHITE_MODE_ORIGIN = 0;
 	const static int WHITE_MODE_SINGLE_LINE = 1;
 	const static int WHITE_MODE_TREE = 2;
@@ -255,6 +286,7 @@ protected:
 	virtual bool IsClosingTagMark(wchar_t c);
 	virtual bool IsClosingTagIndicator(wchar_t c);
 	virtual bool IsSpecialTagIndicator(wchar_t c);
+	virtual bool IsXMLSpecialTagIndicator(wchar_t c);
 	virtual bool IsAttributeAssignmentMark(wchar_t c);
 	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
 	virtual bool IsStartingEntityMark(wchar_t c);
@@ -335,6 +367,7 @@ protected:
 	void ReadItemAttrName();
 	void ReadItemAttrValueAdd(const std::wstring & str);
 	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
+	void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);

 	bool ReadItemAttr();
 	void CheckItemLangAttr();