added: HTMLFilter (html/htmlfilter.h|cpp) - copied from winix project

2021-07-17 13:35:10 +02:00
parent 1e5598cde1
commit bdb2616f32
4 changed files with 2093 additions and 5 deletions
@@ -42,3 +42,4 @@
 ./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
 ./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
 ./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
+./html/htmlfilter.o: ./html/htmlfilter.h
@@ -0,0 +1,376 @@
+/*
+ * This file is a part of PikoTools
+ * and is distributed under the (new) BSD licence.
+ * Author: Tomasz Sowa <t.sowa@ttmath.org>
+ */
+
+/* 
+ * Copyright (c) 2008-2021, Tomasz Sowa
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ *  * Neither the name Tomasz Sowa nor the names of contributors to this
+ *    project may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef headerfile_picotools_html_htmlfilter
+#define headerfile_picotools_html_htmlfilter
+
+#include <string>
+#include <map>
+#include <vector>
+#include <algorithm>
+
+
+
+namespace pt
+{
+
+
+
+
+// max length of a name of a html tag (with terminating null)
+#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN	30
+
+// max length of a html lang attribute (e.g. "en", "pl")
+#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN	10
+
+// presumably the max length of a name of a html attribute -- TODO confirm in htmlfilter.cpp
+#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN	40
+
+// presumably the max length of a value of a html attribute -- TODO confirm in htmlfilter.cpp
+#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN	500
+
+
+// depth of the html tree
+#define WINIX_HTMLFILTER_STACK_MAXLEN		100
+
+// length of a buffer used for printing
+// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
+#define WINIX_HTMLFILTER_BUFFER_MAXLEN	2048
+
+
+
+
+/*!
+	very lightweight filter for html
+	(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
+	this filter has O(n) complexity over the whole html string
+	NOTE(review): the class holds std::wstring / std::vector / std::map members which can allocate - confirm the claim above
+	such tags as: <script> <pre> <textarea> are treated in a special way:
+	all characters between the opening and closing tag (<script>....</script>) are untouched
+
+	if the filter finds tags which are not closed it will close them,
+	if the filter finds a closing tag which doesn't have an opening tag - it will skip it
+
+	tags which don't need to be closed: meta, input, br, img, link
+	look at CheckExceptions() method
+
+	the filter recognizes xml simple tags (with / at the end) such as: <br />
+*/
+class HTMLFilter
+{
+public:
+	// the form of the non-breaking space which is put when an orphan is detected
+	enum OrphanMode
+	{
+		orphan_nbsp,		// putting "&nbsp;" string
+		orphan_160space		// putting a character with code 160 (U+00A0, no-break space; it is outside of the ascii range)
+	};
+	// NOTE(review): copy ctor, operator= and dtor are user-declared and pstack/buffer are raw pointers - presumably owned, confirm in htmlfilter.cpp
+	HTMLFilter();
+	HTMLFilter(const HTMLFilter & f);
+	HTMLFilter & operator=(const HTMLFilter & f);
+	virtual ~HTMLFilter();
+
+	// main methods used for filtering
+	// 'in' is the html input, 'out' is the string for the result (presumably overwritten, not appended - TODO confirm)
+	void Filter(const wchar_t * in, std::wstring & out);
+	void Filter(const std::wstring & in, std::wstring & out);
+
+
+	// insert a white space into long words
+	// (only between html tags)
+	// skipped in such tags: script, pre, textarea
+	// break_after_ - after how many characters insert a space (0 - off)
+	void BreakWord(size_t break_after_);
+
+	// insert a new line character into long lines
+	// (only between html tags)
+	// skipped in such tags: script, pre, textarea
+	// wrap_line_ - after how many characters wrap a line (0 - off)
+	// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
+	void WrapLine(size_t wrap_line_);
+
+	// trimming white characters (with new lines)
+	// at the beginning, at the end and in the middle of a string
+	// only between html tags
+	// at the beginning and at the end only one space is left
+	// skipped in such tags: script, pre, textarea
+	// false by default
+	void TrimWhite(bool trim);
+
+	// size of the indentation (first spaces) which creates the html tree
+	// default: 2 (spaces)
+	// set 0 to turn off
+	void InsertTabs(size_t tabsize);
+
+	// set a name of a html tag which will be used as 'nofilter' tag
+	// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
+	// these tags (opening and closing) will not be placed in the html output
+	void SetNoFilterTag(const std::wstring & tag_name);
+	// assign a table of orphans (otab) to a language (lang_code)
+	// orphans are checked only in 'body' tag
+	void AssignOrphans(const wchar_t * lang_code,      const std::vector<std::wstring> & otab);
+	void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
+	void ClearOrphans();
+	// NOTE(review): orphan_mode is a string - presumably it selects orphan_nbsp or orphan_160space, accepted values are in htmlfilter.cpp
+	// check 'orphans' for the specific language
+	// if an orphan is detected then the non-break space ("&nbsp;" or a character with code 160) will be put
+	// default disable (lang_none) - NOTE(review): lang_none is not declared in this header, this line looks stale
+	void OrphansMode(const std::wstring & orphan_mode);
+
+	// skipping some unsafe tags
+	// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
+	void SafeMode(bool safe_mode_);
+
+	// skip all html tags
+	// gives only text without markup
+	// but there can be commentaries
+	void SkipTags(bool skip_tags);
+
+	// skip commentaries
+	void SkipCommentaries(bool skip_commentaries);
+
+	// if true then entities such as &nbsp; are skipped
+	// this automatically turns on AnalyzeEntities
+	// in such a case EntityFound callbacks are sent
+	void SkipEntities(bool skip_entities);
+
+	// analyze html entities such as &nbsp;
+	// virtual method: EntityFound is called
+	// entities are analyzed in normal text and in attribute values such as <p class="a&nbsp;">
+	void AnalyzeEntities(bool analyze_entities);
+
+
+protected:
+
+	// orphans for one language
+	struct Orphans
+	{
+		std::vector<std::wstring> tab;
+		size_t max_len;	// presumably the length of the longest entry in tab (see CalcOrphansMaxLen) - TODO confirm
+	};
+
+
+	// orphans for all languages
+	// map<language_code, Orphans>
+	typedef std::map<std::wstring, Orphans> OrphansTab;
+	OrphansTab orphans_tab;
+
+	// html <nofilter> tag name
+	std::wstring no_filter_tag;
+
+	// one html tag (an element of the stack - see pstack)
+	struct Item
+	{
+		std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
+
+		enum Type
+		{
+			opening,		/* sample:  <h1>		*/
+			closing,		/* sample:  </h1>		*/
+			simple,			/* sample:  <br/>		*/
+			special,		/* sample:  <!doctype>	*/
+			none
+		} type;
+
+		// is there a new line after this tag
+		bool new_line;
+
+		// current orphans table
+		// (will be propagated)
+		Orphans * porphans;
+
+		// this item or one from its parents is a 'body' html tag
+		// (will be propagated)
+		bool has_body_tag;
+
+		void Clear();
+		Item();
+	};
+
+
+
+
+
+
+	/*
+		virtual methods (can be overridden in a derived class)
+	*/
+	virtual void Init();
+	virtual void Uninit();
+
+	virtual bool IsOpeningTagMark(wchar_t c);
+	virtual bool IsClosingTagMark(wchar_t c);
+	virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
+	virtual bool IsStartingEntityMark(wchar_t c);
+	virtual bool IsEndingEntityMark(wchar_t c);
+
+	virtual bool   IsOpeningCommentaryTagMark(const wchar_t * str);
+	virtual size_t OpeningCommentaryTagMarkSize();
+
+	virtual bool IsValidCharForName(int c);
+	virtual bool IsValidCharForAttrName(int c);
+	virtual bool IsValidCharForEntityName(int c);
+	virtual void CheckExceptions();
+	virtual bool SkipCommentaryTagIfExists();
+
+	virtual void Put(wchar_t c);
+	virtual void Put(const wchar_t * str);
+	virtual void Put(const wchar_t * str, const wchar_t * end);
+	virtual void Put(const std::wstring & str);
+	virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
+
+	virtual void PutOpeningTagMark();
+	virtual void PutClosingTagMark();
+	virtual bool PutOpeningTag();
+	virtual void PutClosingTag(const wchar_t * tag);
+
+	virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
+	virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
+	// callbacks (see the comments for SkipEntities and AnalyzeEntities above)
+	virtual void ItemFound();
+	virtual void EntityFound(const wchar_t * str, const wchar_t * end);
+
+	/*
+		others (non-virtual helpers)
+	*/
+	void SetSomeDefaults();
+
+	Item & GetItem(size_t i);
+	Item & LastItem();
+
+	wchar_t ToLower(wchar_t c);
+	void    ToLower(std::wstring & str);
+
+	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2);
+	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2);
+	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
+	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);
+	// the same comparisons with an additional 'len' argument (presumably the max number of compared characters - TODO confirm)
+	bool IsNameEqual(const wchar_t * name1,      const wchar_t * name2,      size_t len);
+	bool IsNameEqual(const wchar_t * name1,      const std::wstring & name2, size_t len);
+	bool IsNameEqual(const std::wstring & name1, const wchar_t * name2,      size_t len);
+	bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);
+
+	bool IsLastTag(const wchar_t * name);
+	bool IsLastTag(const std::wstring & name);
+	bool IsTagSafe(const wchar_t * tag);
+	bool IsTagSafe(const std::wstring & tag);
+	// NOTE(review): the first overload returns int, the other two return bool - the meaning of the int is defined in htmlfilter.cpp
+	int  CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
+	bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
+	bool CheckOrphan(const wchar_t * str, const wchar_t * end);
+
+	bool IsWhite(int c);
+	void SkipWhite();
+	void SkipWhiteLines();
+	void SkipWhiteWithFirstNewLine();
+	void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
+	bool IsClosingTagForLastItem();
+	void SkipAndCheckClosingTag();
+
+	void PopStack();
+	bool PushStack();
+	void CheckNewLine();
+	void CheckStackPrintRest();
+	void AddForgottenTags();
+	void CheckClosingTags();
+	void ReadNormalText();
+	bool PrintRest();
+	bool PrintOpeningItem();
+	void ReadItemName();
+	void ReadItemAttrName();
+	void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end);
+	void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
+
+	bool ReadItemAttr();
+	bool CheckItemAttr();
+	void PrintItemAttr();
+
+	void ReadItemClosing();
+	void ReadItemSpecial();
+	void ReadItemOpening();
+	bool ReadItem();
+	void ReadLoop();
+	void Read();
+
+	void CheckChar(wchar_t c);
+
+	void CheckLineWrap();
+	bool HasEntityEndAround(const wchar_t * str, const wchar_t * end);
+	void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
+	void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
+	void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
+	void PutTabs(size_t len);
+	void PutNonBreakingSpace();
+	void PutNewLine();
+	void CalcOrphansMaxLen(Orphans & orphans);
+
+	const wchar_t * pchar;	// presumably the current position in the input string - TODO confirm
+	Item empty;
+	Item * pstack;			// stack pointer
+	size_t stack_len;		// length of the stack
+	wchar_t * buffer;		// buffer used when printing
+	std::wstring * out_string;	// presumably points to the 'out' argument of Filter() - TODO confirm
+	bool last_new_line;
+	size_t break_after;		// insert a space into long words after 'break_after' characters
+	size_t wrap_line;		// insert a new line character into long lines
+	bool trim_white;		// trimming white characters
+	size_t tab_size;		// see InsertTabs()
+	OrphanMode orphan_mode;	// see OrphansMode()
+	std::wstring attr_name;
+	std::vector<std::wstring> attr_value;
+	std::wstring attr_value_temp;
+	std::wstring attr_value_lower;
+	bool attr_has_value;
+	std::wstring lang_code_lower;
+	size_t line_len;		//length of the current line (without first spaces which create the html tree)
+	bool safe_mode;			// skipping some unsafe tags
+	Orphans orphans_temp;
+	bool skip_tags;			// see SkipTags()
+	bool skip_commentaries;	// see SkipCommentaries()
+	bool skip_entities;		// see SkipEntities()
+	bool analyze_entities;	// see AnalyzeEntities()
+};
+
+
+
+}
+
+
+
+#endif
@@ -1,6 +1,5 @@
 # DO NOT DELETE

-./main.o: convert.h mainoptionsparser.h csvparser.h
 ./convert.o: convert.h test.h ../src/convert/convert.h
 ./convert.o: ../src/convert/inttostr.h ../src/convert/patternreplacer.h
 ./convert.o: ../src/textstream/textstream.h ../src/textstream/stream.h
@@ -11,6 +10,11 @@
 ./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./convert.o: ../src/convert/misc.h ../src/convert/double.h
+./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
+./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
+./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
+./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
+./main.o: convert.h mainoptionsparser.h csvparser.h
 ./test.o: test.h
 ./mainoptionsparser.o: mainoptionsparser.h test.h
 ./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
@@ -26,7 +30,3 @@
 ./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
 ./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
 ./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
-./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
-./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
-./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
-./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h