/*
 * This file is a part of PikoTools
 * and is distributed under the (new) BSD licence.
 * Author: Tomasz Sowa
 */

/*
 * Copyright (c) 2008-2021, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef headerfile_picotools_html_htmlfilter #define headerfile_picotools_html_htmlfilter #include #include #include #include #include "convert/baseparser.h" #include "space/space.h" #include "textstream/stream.h" namespace pt { // max length of a name of a html tag (with terminating null) #define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30 // max length of a html lang attribute (e.g. "en", "pl") #define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10 #define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40 #define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500 // depth of the html tree #define WINIX_HTMLFILTER_STACK_MAXLEN 100 // length of a buffer used for printing // it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3 #define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048 /*! very lightweight filter for html (without using any dynamic memory - some memory is allocated only at the beginning - in ctors) this filter has O(n) complexity over the whole html string such tags as: ) are untouched if the filter finds that there are not closed tags it will close them, if the filter finds a closing tag which doesn't have an opening tag - it will skip it tags which don't need to be closed: meta, input, br, img, link look at CheckExceptions() method the filter recognizes xml simple tags (with / at the end) such as:
*/ class HTMLParser : public BaseParser { public: /* status of parsing */ enum Status { ok, cant_open_file, syntax_error }; enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; /* the last status of parsing, set by parse() methods */ Status status; HTMLParser(); HTMLParser(const HTMLParser & f); HTMLParser & operator=(const HTMLParser & f); virtual ~HTMLParser(); void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); // main methods used for filtering void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true); void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true); void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true); /* * * returns a number of a last parsed line * can be used to obtain the line in which there was a syntax error * */ int get_last_parsed_line(); const static int WHITE_MODE_ORIGIN = 0; const static int WHITE_MODE_SINGLE_LINE = 1; const static int WHITE_MODE_TREE = 2; // white chars mode 
// void white_chars_mode(int mode); // if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char) // (only between html tags and only in subtree) // skipped in such tags: script, pre, textarea // 0 - off // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) void WrapLine(size_t wrap_line_); // first tabs in a tree // default: 2 (spaces) // set 0 to turn off void InsertTabs(size_t tabsize); // set a name of a html tag which will be used as 'nofilter' tag // elements between such tags are not filtered (similarly as in

 and )
	// these tags (opening and closing) will no be placed in the html output
	void SetNoFilterTag(const std::wstring & tag_name);

	// orphans are checked only in 'body' tag
	void AssignOrphans(const wchar_t * lang_code,      const std::vector<std::wstring> & otab);
	void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
	void ClearOrphans();

	// check 'orphans' for the specicic language
	// if an orphan is detected then the non-break space (" " or ascii 160 code) will be put
	// default disable (lang_none)
	void OrphansMode(const std::wstring & orphan_mode);

	// skipping some unsafe tags
	// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
	void SafeMode(bool safe_mode_);

	// skip all html tags
	// gives only text without markup
	// but there can be commentaries
	void SkipTags(bool skip_tags);

	// skip commentaries
	void SkipCommentaries(bool skip_commentaries);

	// if true then entities such as   are skipped
	// this automatically turns on AnalyzeEntities
	// in such a case FoundEntity callbacks are sent
	void SkipEntities(bool skip_entities);

	// analyze html entities such as  
	// virtual method: FoundEntity is called
	// entities are analyzed in normal text and in attribute values such as <p class="a ">
	void AnalyzeEntities(bool analyze_entities);


protected:

	/*
	 * true when parsing html input, false for parsing xml
	 */
	bool parsing_html;


	bool xml_compact_mode;

	// orphans for one language
	struct Orphans
	{
		std::vector<std::wstring> tab;
		size_t max_len;
	};


	// orphans for all languages
	// map<language_code, Orphans>
	typedef std::map<std::wstring, Orphans> OrphansTab;
	OrphansTab orphans_tab;

	// html <nofilter> tag name
	std::wstring no_filter_tag;


	struct Item
	{
		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN

		enum Type
		{
			opening,		/* sample:  <h1>		*/
			closing,		/* sample:  </h1>		*/
			simple,			/* sample:  <br/>		*/
			special,		/* sample:  <!doctype>	*/
			none
		} type;

		bool is_commentary;

		bool new_line_before;

		// is there a new line after this tag
		bool new_line;

		// is there a new
		bool new_line_in_the_middle;

		// current orphans table
		// (will be propagated)
		Orphans * porphans;

		// this item or one from its parents is a 'body' html tag
		// (will be propagated)
		bool has_body_tag;

		size_t tree_index;

		Space * space;

		void Clear();
		Item();
	};



	void clear_input_flags();


	/*
		virtual methods
	*/
	virtual void Init();
	virtual void Uninit();

	virtual bool IsOpeningTagMark(wchar_t c);
	virtual bool IsClosingTagMark(wchar_t c);
	virtual bool IsClosingTagIndicator(wchar_t c);
	virtual bool IsSpecialTagIndicator(wchar_t c);
	virtual bool IsXMLSpecialTagIndicator(wchar_t c);
	virtual bool IsAttributeAssignmentMark(wchar_t c);
	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
	virtual bool IsStartingEntityMark(wchar_t c);
	virtual bool IsEndingEntityMark(wchar_t c);

	virtual bool IsValidCharForName(int c);
	virtual bool IsValidCharForAttrName(int c);
	virtual bool IsValidCharForEntityName(int c);

	virtual void CheckSingleItemExceptions();
	virtual void CheckWhiteCharsExceptions(Item & item);
	virtual void CheckDifferentContentExceptions(Item & item);

	virtual void Put(wchar_t c);
	virtual void Put(const wchar_t * str, const wchar_t * end);
	virtual void Put(const std::wstring & str);
	virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);

	virtual void PutOpeningTagMark();
	virtual void PutClosingTagMark();
	virtual bool PutOpeningTag();
	virtual void PutClosingTag(const Item & item);

	virtual void ItemFound();
	virtual void EntityFound(const wchar_t * str, const wchar_t * end);

	/*
		others
	*/
	void SetSomeDefaults();

	Item & GetItem(size_t i);
	Item & LastItem();

	wchar_t ToLower(wchar_t c);
	void    ToLower(std::wstring & str);

	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2);
	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2);
	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);

	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2,      size_t len);
	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2, size_t len);
	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2,      size_t len);
	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);

	bool IsLastTag(const wchar_t * name);
	bool IsLastTag(const std::wstring & name);
	bool IsTagSafe(const wchar_t * tag);
	bool IsTagSafe(const std::wstring & tag);

	int  CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
	bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
	bool CheckOrphan(const wchar_t * str, const wchar_t * end);

	bool IsWhite(int c);
	void SkipWhite(std::wstring * out_string = nullptr);
	void SkipWhiteLines(std::wstring * out_string = nullptr);
	void SkipWhiteWithFirstNewLine();

	int current_white_char_mode();

	void ReadTextUntilClosingCommentary();
	bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
	void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
	void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);

	void PopStack();
	bool PushStack();
	void CheckStackPrintRest();
	void AddForgottenTags();
	void CheckClosingTags();
	void ReadText();
	bool PrintRest();
	bool PrintOpeningItem();
	void ReadItemName(std::wstring & name, bool clear_name = true);
	void ReadItemAttrName();
	void ReadItemAttrValueAdd(const std::wstring & str);
	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
	void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);

	bool ReadItemAttr();
	void CheckItemLangAttr();
	void PrintItemAttr();
	void PutItemAttrToSpace();

	void ReadItemClosing();
	void ReadItemSpecial();
	void ReadItemOpening();
	bool ReadItem();
	void ReadLoop();
	void Read();

	void CheckChar(wchar_t c);

	void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space);
	void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);

	void PutTabs(size_t len);
	void PutNonBreakingSpace();
	void CalcOrphansMaxLen(Orphans & orphans);

	void AddItemToSpace();
	void AddSpaceToSpaceTree(const Space & space);

	Item empty;
	Item * pstack;			// stack pointer
	size_t stack_len;		// length of the stack
	wchar_t * buffer;		// buffer used when printing
	std::wstring * out_string;
	Stream * out_stream;
	Space * out_space;
	Space text_space_tmp;

	std::vector<int> white_char_mode_tab;

	//bool last_new_line;
	bool new_item_has_new_line_before;
	int white_mode;
	bool is_first_item;
	size_t wrap_line;		// insert a new line character into long lines
	size_t tab_size;
	OrphanMode orphan_mode;
	std::wstring attr_name;
	std::vector<std::wstring> attr_value;
	std::wstring attr_value_temp;
	std::wstring attr_value_lower;
	bool attr_has_value;
	std::wstring lang_code_lower;
	size_t line_len;		//length of the current line (without first spaces which create the html tree)
	bool safe_mode;			// skipping some unsafe tags
	Orphans orphans_temp;
	bool skip_tags;
	bool skip_commentaries;
	bool skip_entities;
	bool analyze_entities;
	std::wstring tmp_text;
	std::wstring tmp_name;
};



}



#endif