winix/core/htmlfilter.h

/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2010, Tomasz Sowa
 * All rights reserved.
 *
 */

#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter

#include <string>


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_MAXLEN	30

// depth of the html tree
// (presumably the capacity of the stack of opened tags - confirm in the .cpp file)
#define WINIX_HTMLFILTER_STACK_MAXLEN	100

// length of a buffer used for printing
// it must be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN	2048

// the requirement above is checked at compile time so that retuning
// one of the limits cannot silently produce a too small printing buffer
#if WINIX_HTMLFILTER_BUFFER_MAXLEN < (WINIX_HTMLFILTER_ITEM_MAXLEN + 3)
#error "WINIX_HTMLFILTER_BUFFER_MAXLEN must be at least WINIX_HTMLFILTER_ITEM_MAXLEN+3"
#endif


/*!
	very lightweight filter for html
	(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
	this filter has O(n) complexity over the whole html string

	such tags as: <script> <pre> <textarea> are treated in a special way:
	all characters between the opening and the closing tag (<script>....</script>) are untouched

	if the filter finds tags which are not closed it will close them,
	if the filter finds a closing tag which doesn't have an opening tag - it will skip it

	tags which don't need to be closed: meta, input, br, img, link
	look at CheckExceptions() method

	the filter recognizes xml simple tags (with / at the end) such as: <br />

	the class is meant to be used as a base class too: a lot of the protected
	methods are virtual so a derived class can change how tags are recognized
	and how the output is printed, that is why the destructor is virtual as well
*/
class HTMLFilter
{
public:


	// language the orphans are checked for (see CheckOrphans())
	enum Lang
	{
		lang_pl,
		lang_cz,
		lang_sk,
		lang_none		// checking of orphans is disabled
	};

	// what is put when an orphan is detected (see CheckOrphans())
	enum OrphanMode
	{
		orphan_nbsp,		// putting "&nbsp;" string
		orphan_160space		// putting 160 ascii code
	};


	HTMLFilter();
	HTMLFilter(const HTMLFilter & f);
	HTMLFilter & operator=(const HTMLFilter & f);

	// virtual: the class has virtual methods so objects of derived classes
	// can be deleted through a pointer to HTMLFilter,
	// with a non-virtual destructor such a delete is undefined behaviour
	virtual ~HTMLFilter();


	// main methods used for filtering
	// in  - the html text to be filtered
	// out - the string which receives the filtered html
	void Filter(const wchar_t * in, std::wstring & out);
	void Filter(const std::wstring & in, std::wstring & out);


	// insert a white space into long lines
	// only between html tags
	// skipped in such tags: script, pre, textarea
	// break_after_ - after how many characters insert a space (0 - off)
	void BreakLines(size_t break_after_);


	// trimming white characters (with new lines)
	// at the beginning, at the end and in the middle of a string
	// only between html tags
	// at the beginning and at the end only one space is left
	// skipped in such tags: script, pre, textarea
	// false by default
	void TrimWhite(bool trim);


	// first tabs in a tree
	// default: 2 (spaces)
	// set 0 to turn off
	void InsertTabs(size_t tabsize);


	// check 'orphans' for the specific language
	// if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
	// disabled by default (lang_none)
	void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp);


	// skipping some unsafe tags
	// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
	void SafeMode(bool safe_mode_);


protected:

	// one html tag
	// (presumably an element of the stack pointed to by pstack - confirm in the .cpp file)
	struct Item
	{
		// name of the tag, the terminating null is counted in WINIX_HTMLFILTER_ITEM_MAXLEN
		wchar_t name[WINIX_HTMLFILTER_ITEM_MAXLEN];
		size_t name_len;

		// kind of the tag
		enum Type
		{
			opening,
			closing,
			simple,
			special,
			none
		} type;

		// is there a new line after this tag
		bool new_line;

		void Clear();
		Item();
	};


	// only this method has direct access to the output string
	// you can easily change the output from a std::wstring to something else
	// (by overriding the method in a derived class)
	virtual void Put(const wchar_t * str, const wchar_t * end);


	// access to the items
	Item & GetItem(size_t i);
	Item & LastItem();

	// characters and names of tags
	wchar_t ToLower(wchar_t c);
	bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
	bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
	bool IsLastTag(const wchar_t * name);
	bool IsTagSafe(const wchar_t * tag);

	// orphans (see CheckOrphans())
	int CheckOrphan(const wchar_t * str, const wchar_t * end, const wchar_t * orphan);
	bool CheckOrphanTable(const wchar_t * str, const wchar_t * end, const wchar_t ** table, size_t o1, size_t o2);
	bool CheckOrphanLangPl(const wchar_t * str, const wchar_t * end);
	bool CheckOrphanLangCz(const wchar_t * str, const wchar_t * end);
	bool CheckOrphan(const wchar_t * str, const wchar_t * end);

	// white characters and marks of tags
	// the virtual methods can be overridden in a derived class
	bool IsWhite(int c);
	void SkipWhite();
	void SkipWhiteLines();
	void SkipWhiteWithFirstNewLine();
	bool IsClosingTagForLastItem();
	virtual bool IsOpeningTagMark();
	virtual bool IsOpeningCommentaryTagMark();
	size_t OpeningCommentaryTagMarkSize();
	virtual bool IsClosingTagMark();
	virtual bool IsClosingXmlSimpleTagMark();
	bool SkipCommentaryTagIfExists();
	const wchar_t * SkipItemCheckXmlSimple();

	// the stack of tags and reading of the input
	void PopStack();
	bool PushStack();
	virtual bool IsValidCharForName(int c);
	void CheckNewLine();
	virtual void CheckExceptions();
	void CheckStackPrintRest();
	void AddForgottenTags();
	void CheckClosingTags();
	virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
	void ReadNormalText();
	bool PrintRest();
	void PrintItem(const wchar_t * start, const wchar_t * end);
	void ReadItemName();
	bool ReadItem();
	virtual void Init();
	virtual void Deinit();
	void Read();

	// printing
	size_t PutNormalTextTrimFillBuffer(const wchar_t * & str, const wchar_t * & end);
	size_t PutNormalTextFillBuffer(const wchar_t * & str, const wchar_t * & end);
	virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
	virtual void PutNormalTextTrim(const wchar_t * str, const wchar_t * end);
	void PutLastTagWithClosingTag();
	virtual void PutOpeningTagMark();
	virtual void PutClosingTagMark();
	virtual void PutTagName(const wchar_t * name);
	virtual void PutOpeningTag(const wchar_t * start, const wchar_t * end);
	virtual void PutClosingTag(const wchar_t * tag);
	size_t PutTabsToBuffer(size_t index, size_t len);
	size_t PutNonBreakSpaceToBuffer(size_t index);
	void PutTabs(size_t len);
	void PutNewLine();

	const wchar_t * pchar;	// presumably the current position in the input string - confirm in the .cpp file
	Item empty;
	Item * pstack;		// stack pointer
	size_t stack_len;	// length of the stack
	wchar_t * buffer;		// buffer used when printing
	std::wstring * out_string;	// the output string (only Put() should use it directly)
	bool last_new_line;
	size_t break_after;	// insert a space into long lines after break_after characters
	bool trim_white;	// trimming white characters
	size_t tab_size;	// see InsertTabs()
	Lang lang;          // current language for checking orphans
	OrphanMode orphan_mode;	// see CheckOrphans()
	bool safe_mode;     // skipping some unsafe tags
};


#endif