pikotools/src/html/htmlparser.h

/*
 * This file is a part of PikoTools
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2008-2024, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifndef headerfile_pikotools_src_html_htmlparser
#define headerfile_pikotools_src_html_htmlparser

#include <string>
#include <map>
#include <vector>
#include <algorithm>
#include "convert/baseparser.h"
#include "space/space.h"
#include "textstream/stream.h"


namespace pt
{


// limits used by the html parser/filter
// (these are preprocessor macros so they are visible globally,
// the enclosing namespace doesn't limit their scope)

// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN	30

// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN	10

// NOTE(review): presumably max length of a name of a html attribute - confirm in the implementation
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN	40

// NOTE(review): presumably max length of a value of a html attribute - confirm in the implementation
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN	500


// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN		100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN	2048


/*!
	very lightweight filter for html
	(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
	this filter has O(n) complexity over the whole html string

	such tags as: <script> <pre> <textarea> are treated in a special way
	all characters between the opening and closing tag (<script>....</script>) are untouched

	if the filter finds tags which are not closed it will close them,
	if the filter finds a closing tag which doesn't have an opening tag - it will skip it

	tags which don't need to be closed: meta, input, br, img, link
	look at the CheckSingleItemExceptions() method

	the filter recognizes xml simple tags (with / at the end) such as: <br />
*/
class HTMLParser : public BaseParser
{
public:


	/*
		status of parsing
		(kept in the 'status' member and returned by the parse_xml...() and filter_file() methods)
	*/
	enum Status { ok, cant_open_file, syntax_error };


	/*
		the way in which a non-breaking space is put to the output
		when an orphan is detected (see the comment for OrphansMode())
	*/
	enum OrphanMode
	{
		orphan_nbsp,		// putting "&nbsp;" string
		orphan_160space		// putting a character with the code 160
	};


	// orphans for one language
	struct Orphans
	{
		// table of orphans (the same type is taken by the AssignOrphans() methods)
		std::vector<std::wstring> tab;

		// NOTE(review): presumably the length of the longest string from 'tab'
		// (there is a CalcOrphansMaxLen() method) - confirm in the implementation
		size_t max_len;
	};


	/*
		a single item (tag) found in the input
		items are passed to the Listener callbacks,
		the parser keeps a pointer to a table of items (see pstack)
	*/
	struct Item
	{
		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN

		// kind of the tag
		enum Type
		{
			opening,		/* sample:  <h1>		*/
			closing,		/* sample:  </h1>		*/
			simple,			/* sample:  <br/>		*/
			special,		/* sample:  <!doctype>	*/
			none
		} type;

		// NOTE(review): presumably true when the item is a commentary - confirm in the implementation
		bool is_commentary;

		// NOTE(review): presumably true when the item is a CDATA section - confirm in the implementation
		bool is_cdata;

		// is a new line before this tag (or just a new line and some white characters)
		bool new_line_before;

		// is there a new line after this tag (or just some white characters and a new line)
		bool new_line_after;

		// is there a new line in the middle after this tag and before the next tag
		bool new_line_in_the_middle;

		// is there a white char (but not new line) before this tag
		bool white_char_before;

		// current orphans table
		// (will be propagated)
		Orphans * porphans;

		// this item or one from its parents is a 'body' html tag
		// (will be propagated)
		bool has_body_tag;

		// NOTE(review): presumably the position of this item in the html tree - confirm in the implementation
		size_t tree_index;

		// NOTE(review): presumably a Space node connected with this item when parsing to a Space tree,
		// the ownership of the pointer cannot be determined from this header - confirm
		Space * space;

		void Clear();
		Item();
	};


	/*
		an interface for callbacks
		a listener object is registered with set_item_parsed_listener()
		the default implementations do nothing: item_parsed() is empty
		and should_remove() always returns false
	*/
	class Listener
	{
	public:

		Listener() {}

		virtual void item_parsed(const Item & item) { }
		virtual bool should_remove(const Item & item) { return false; }
		virtual ~Listener() {}

	};


	/*
		the last status of parsing, set by parse() methods
	*/
	Status status;

	// the copy constructor and the assignment operator are user-declared
	// NOTE(review): the class has raw pointer members (pstack, buffer, ...),
	// how they are copied is defined in the implementation file
	HTMLParser();
	HTMLParser(const HTMLParser & f);
	HTMLParser & operator=(const HTMLParser & f);
	virtual ~HTMLParser();

	// registering a listener object (see the Listener class)
	// NOTE(review): presumably the pointer is only stored (the 'listener' member) and not owned - confirm
	void set_item_parsed_listener(Listener * listener);


	// parsing html from the 'in' string
	// NOTE(review): presumably the result is put to 'space' - confirm in the implementation
	void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);

	// parsing xml from a file, the overloads differ only in the type of the file name
	// returns the status of parsing
	// NOTE(review): presumably clear_space tells whether out_space is cleared before parsing - confirm
	Status parse_xml_file(const char * file_name,         Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml_file(const std::string & file_name,  Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml_file(const wchar_t * file_name,      Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);

	// parsing xml from a string (narrow strings)
	Status parse_xml(const char * str,        Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml(const std::string & str, Space & out_space, bool compact_mode = false, bool clear_space = true);

	// parsing xml from a string (wide strings)
	Status parse_xml(const wchar_t * str,      Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml(const std::wstring & str, Space & out_space, bool compact_mode = false, bool clear_space = true);

	// parsing xml from a text stream
	Status parse_xml(const pt::TextStream & str, Space & out_space, bool compact_mode = false, bool clear_space = true);
	Status parse_xml(const pt::WTextStream & str, Space & out_space, bool compact_mode = false, bool clear_space = true);


	// main methods used for filtering
	// NOTE(review): presumably clear_out_string tells whether 'out' is cleared before filtering - confirm
	void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
	void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);

	// filtering from a stream to a stream
	void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);

	// filtering a file, the overloads differ only in the type of the file name
	// returns the status of parsing
	HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
	HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
	HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
	HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);


	/*
	 *
	 * returns a number of a last parsed line/column
	 * can be used to obtain the line in which there was a syntax error
	 *
	 */
	int get_last_parsed_line();
	int get_last_parsed_column();


	// NOTE(review): presumably these are the values accepted by white_chars_mode() - confirm
	const static int WHITE_MODE_ORIGIN = 0;
	const static int WHITE_MODE_SINGLE_LINE = 1;
	const static int WHITE_MODE_TREE = 2;


	// white chars mode
	//
	void white_chars_mode(int mode);

	// if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char)
	// (only between html tags and only in <body> subtree)
	// skipped in such tags: script, pre, textarea
	// 0 - off
	// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
	void WrapLine(size_t wrap_line_);

	// first tabs in a tree
	// default: 2 (spaces)
	// set 0 to turn off
	void InsertTabs(size_t tabsize);

	// set a name of a html tag which will be used as 'nofilter' tag
	// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
	// these tags (opening and closing) will not be placed in the html output
	void SetNoFilterTag(const std::wstring & tag_name);

	// orphans are checked only in 'body' tag
	void AssignOrphans(const wchar_t * lang_code,      const std::vector<std::wstring> & otab);
	void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
	void ClearOrphans();

	// check 'orphans' for the specific language
	// if an orphan is detected then the non-break space ("&nbsp;" or a character with the code 160) will be put
	// default disable (lang_none)
	// NOTE(review): the mode is given as a string, the accepted values are defined in the implementation file
	void OrphansMode(const std::wstring & orphan_mode);

	// skipping some unsafe tags
	// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
	void SafeMode(bool safe_mode_);

	// skip all html tags
	// gives only text without markup
	// but there can be commentaries
	void SkipTags(bool skip_tags);

	// skip commentaries
	void SkipCommentaries(bool skip_commentaries);

	// if true then entities such as &nbsp; are skipped
	// this automatically turns on AnalyzeEntities
	// in such a case EntityFound callbacks are sent
	void SkipEntities(bool skip_entities);

	// analyze html entities such as &nbsp;
	// virtual method: EntityFound is called
	// entities are analyzed in normal text and in attribute values such as <p class="a&nbsp;">
	void AnalyzeEntities(bool analyze_entities);


protected:

	/*
	 * true when parsing html input, false for parsing xml
	 */
	bool parsing_html;


	// NOTE(review): presumably set from the compact_mode argument of the parse methods - confirm
	bool xml_compact_mode;


	// orphans for all languages
	// map<language_code, Orphans>
	typedef std::map<std::wstring, Orphans> OrphansTab;
	OrphansTab orphans_tab;

	// html <nofilter> tag name
	std::wstring no_filter_tag;

	// a listener object (see set_item_parsed_listener())
	Listener * listener;

	/*
		true if the lastc was escaped (with a backslash)
		we have to know if the last sequence was \" or just "
	*/
	bool char_was_escaped;

	// NOTE(review): presumably a buffer with characters of an already read escape sequence/entity
	// and the current reading position in it (see read_char_from_entity_buffer()) - confirm
	std::wstring escaped_chars_buffer;
	size_t escaped_char_index;

	/*
	 * filter mode, a method filter(...) was called
	 * in filter mode we do not unescape xml sequences such as &lt; &gt; ...
	 */
	bool filter_mode;


	void clear_input_flags();


	/*
		virtual methods
	*/
	virtual void Init();
	virtual void Uninit();

	// a non-virtual helper taking the same arguments as the parse_xml...() methods (without the input)
	void prepare_to_parse_xml(Space & out_space, bool compact_mode, bool clear_space);

	// tests for a single character of the markup
	// (virtual so they can be replaced in a derived class)
	virtual bool IsOpeningTagMark(wchar_t c);
	virtual bool IsClosingTagMark(wchar_t c);
	virtual bool IsClosingTagIndicator(wchar_t c);
	virtual bool IsSpecialTagIndicator(wchar_t c);
	virtual bool IsXMLSpecialTagIndicator(wchar_t c);
	virtual bool IsAttributeAssignmentMark(wchar_t c);
	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
	virtual bool IsStartingEntityMark(wchar_t c);
	virtual bool IsEndingEntityMark(wchar_t c);

	// tests whether a character can be a part of a tag name, an attribute name or an entity name
	virtual bool IsValidCharForName(int c);
	virtual bool IsValidCharForAttrName(int c);
	virtual bool IsValidCharForEntityName(int c);

	// NOTE(review): presumably special cases for some tags
	// (e.g. tags which don't need to be closed, tags in which white characters
	// or the content are left untouched) - confirm in the implementation
	virtual void CheckSingleItemExceptions();
	virtual void CheckWhiteCharsExceptions(Item & item);
	virtual void CheckDifferentContentExceptions(Item & item);

	// putting a character or a string to the output
	// NOTE(review): presumably the output is out_string or out_stream - confirm
	virtual void Put(wchar_t c);
	virtual void Put(const wchar_t * str, const wchar_t * end);
	virtual void Put(const std::wstring & str);
	virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);

	// putting parts of a tag to the output
	virtual void PutOpeningTagMark();
	virtual void PutClosingTagMark();
	virtual bool PutOpeningTag();
	virtual void PutClosingTag(const Item & item);

	// callbacks for derived classes
	// EntityFound: see the comments for SkipEntities() and AnalyzeEntities()
	virtual void ItemFound();
	virtual void EntityFound(const wchar_t * str, const wchar_t * end);

	/*
		others
	*/
	void SetSomeDefaults();

	// access to items
	// NOTE(review): presumably items from the stack (pstack) - confirm
	Item & GetItem(size_t i);
	Item & LastItem();

	// converting a character or a whole string to lower case
	wchar_t ToLower(wchar_t c);
	void    ToLower(std::wstring & str);

	// comparing two names
	// NOTE(review): presumably case-insensitive (there are ToLower() helpers) - confirm
	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2);
	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2);
	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);

	// comparing two names, variants with an additional length argument
	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2,      size_t len);
	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2, size_t len);
	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2,      size_t len);
	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);

	bool IsLastTag(const wchar_t * name);
	bool IsLastTag(const std::wstring & name);

	// see the comment for SafeMode()
	bool IsTagSafe(const wchar_t * tag);
	bool IsTagSafe(const std::wstring & tag);

	// checking whether the text [str, end) is an orphan
	// the variants test: a single orphan string, a table of orphans, the current orphans
	// NOTE(review): the meaning of the int result of the first variant is defined in the implementation file
	int  CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
	bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
	bool CheckOrphan(const wchar_t * str, const wchar_t * end);

	// white characters
	// out_string is optional (can be a null pointer which is the default value)
	bool IsWhite(int c);
	void SkipWhite(std::wstring * out_string = nullptr);
	void SkipWhiteLines(std::wstring * out_string = nullptr);
	void SkipWhiteWithFirstNewLine();

	int current_white_char_mode();

	void ReadTextUntilClosingCommentary();
	bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
	void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
	void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);

	// stack of items and the main parsing routines
	void PopStack();
	bool PushStack();
	void CheckStackPrintRest();
	void AddForgottenTags();
	void CheckClosingTags();
	void ReadText(bool is_cdata);
	bool PrintRest();
	bool PrintOpeningItem();
	void ReadItemName(std::wstring & name, bool clear_name = true);
	void ReadItemAttrName();
	void ReadItemAttrValueAdd(const std::wstring & str);
	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
	void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);

	bool ReadItemAttr();
	void CheckItemLangAttr();
	void PrintItemAttr();
	void PutItemAttrToSpace();

	void ReadItemClosing();
	void ReadItemSpecial();
	void ReadItemOpening();
	bool ReadItem();
	void ReadLoop();
	void Read();

	void CheckChar(wchar_t c);

	bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);

	void PutTabs(size_t len);
	void PutNonBreakingSpace();
	void CalcOrphansMaxLen(Orphans & orphans);

	// building the Space tree
	void AddItemToSpace();
	void RemoveLastSpace(size_t index);
	void AddTextSpaceToSpaceTree(const Space & space);

	void CallListener(size_t index);

	bool check_escape_sequentions();
	void read_xml_entity();
	void read_char_from_entity_buffer();

	// overrides a virtual method of the base class
	int read_char() override;

	Item empty;
	Item * pstack;			// stack pointer
	size_t stack_len;		// length of the stack
	wchar_t * buffer;		// buffer used when printing

	// output objects
	// NOTE(review): presumably only some of them are set, depending on which public method was called - confirm
	std::wstring * out_string;
	Stream * out_stream;
	Space * out_space;
	Space text_space_tmp;

	std::vector<int> white_char_mode_tab;

	//bool last_new_line;
	bool new_item_has_new_line_before;
	bool new_item_has_white_char_before;
	int white_mode;
	bool is_first_item;
	size_t wrap_line;		// insert a new line character into long lines
	size_t tab_size;
	OrphanMode orphan_mode;

	// temporary objects used when reading attributes of an item
	std::wstring attr_name;
	std::vector<std::wstring> attr_value;
	std::wstring attr_value_temp;
	std::wstring attr_value_lower;
	bool attr_has_value;
	std::wstring lang_code_lower;
	size_t line_len;		//length of the current line (without first spaces which create the html tree)
	bool safe_mode;			// skipping some unsafe tags
	Orphans orphans_temp;

	// options set by SkipTags(), SkipCommentaries(), SkipEntities() and AnalyzeEntities()
	bool skip_tags;
	bool skip_commentaries;
	bool skip_entities;
	bool analyze_entities;
	std::wstring tmp_text;
	std::wstring tmp_name;
};


}


#endif