/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2014, Tomasz Sowa
 * All rights reserved.
 *
 */

#ifndef headerfile_winix_core_htmlfilter
#define headerfile_winix_core_htmlfilter

// NOTE(review): the header names of the four original #include directives were
// lost. <string>, <map> and <vector> are required by the declarations below;
// <algorithm> is presumed to be the fourth one - confirm against the .cpp file.
#include <cstddef>
#include <string>
#include <map>
#include <vector>
#include <algorithm>


namespace Winix
{


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30

// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10

// max length of a name of a html attribute
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40

// max length of a value of a html attribute
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500

// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048



/*!
    very lightweight filter for html
    (without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
    this filter has O(n) complexity over the whole html string

    some tags are left untouched
    (NOTE(review): the list of tag names was lost from this comment; the methods
    below skip script, pre and textarea - presumably the same set, confirm)

    if the filter finds that there are not closed tags it will close them,
    if the filter finds a closing tag which doesn't have an opening tag - it will skip it

    tags which don't need to be closed: meta, input, br, img, link
    look at CheckExceptions() method

    the filter recognizes xml simple tags (with / at the end) such as: <br />

    NOTE(review): the class declares virtual methods but its destructor is not
    virtual, so objects of derived classes must not be deleted through
    a pointer to HTMLFilter.
*/
class HTMLFilter
{
public:

    // how a non-breaking space after an orphan is written to the output
    enum OrphanMode
    {
        orphan_nbsp,        // putting "&nbsp;" string
        orphan_160space     // putting 160 ascii code
    };


    HTMLFilter();
    HTMLFilter(const HTMLFilter & f);
    HTMLFilter & operator=(const HTMLFilter & f);
    ~HTMLFilter();


    // main methods used for filtering
    void Filter(const wchar_t * in, std::wstring & out);
    void Filter(const std::wstring & in, std::wstring & out);


    // insert a white space into long words
    // (only between html tags)
    // skipped in such tags: script, pre, textarea
    // break_after - after how many characters insert a space (0 - off)
    void BreakWord(size_t break_after_);


    // insert a new line character into long lines
    // (only between html tags)
    // skipped in such tags: script, pre, textarea
    // wrap_line - after how many characters wrap a line (0 - off)
    // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
    void WrapLine(size_t wrap_line_);


    // trimming white characters (with new lines)
    // at the beginning, at the end and in the middle of a string
    // only between html tags
    // at the beginning and at the end only one space is left
    // skipped in such tags: script, pre, textarea
    // false by default
    void TrimWhite(bool trim);


    // first tabs in a tree
    // default: 2 (spaces)
    // set 0 to turn off
    void InsertTabs(size_t tabsize);


    // orphans are checked only in 'body' tag
    void AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab);
    void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
    void ClearOrphans();


    // check 'orphans' for the specific language
    // if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
    // default disable (lang_none)
    void OrphansMode(OrphanMode mode = orphan_nbsp);


    // skipping some unsafe tags
    // (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
    void SafeMode(bool safe_mode_);



protected:

    // orphans for one language
    struct Orphans
    {
        std::vector<std::wstring> tab;
        size_t max_len;
    };


    // orphans for all languages
    // map: language code -> Orphans
    typedef std::map<std::wstring, Orphans> OrphansTab;
    OrphansTab orphans_tab;


    // one html tag kept on the stack
    struct Item
    {
        std::wstring name;  // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN

        // NOTE(review): the sample tags were lost from the comments below;
        // the examples given are presumed from the enumerator names - confirm
        enum Type
        {
            opening,    /* sample: <h1>   */
            closing,    /* sample: </h1>  */
            simple,     /* sample: <br /> */
            special,    /* sample: <!doctype ...> */
            none
        } type;

        // is there a new line after this tag
        bool new_line;

        // current orphans table
        // (will be propagated)
        Orphans * porphans;

        // this item or one from its parents is a 'body' html tag
        // (will be propagated)
        bool has_body_tag;

        void Clear();
        Item();
    };


    /*
        virtual methods
    */
    virtual void Init();
    virtual void Uninit();

    virtual bool IsOpeningTagMark();
    virtual bool IsOpeningCommentaryTagMark();
    virtual bool IsClosingTagMark();
    virtual bool IsClosingXmlSimpleTagMark();

    virtual bool IsValidCharForName(int c);
    virtual bool IsValidCharForAttrName(int c);
    virtual void CheckExceptions();
    virtual bool SkipCommentaryTagIfExists();

    virtual void Put(wchar_t c);
    virtual void Put(const wchar_t * str);
    virtual void Put(const wchar_t * str, const wchar_t * end);
    virtual void Put(const std::wstring & str);

    virtual void PutOpeningTagMark();
    virtual void PutClosingTagMark();
    virtual bool PutOpeningTag();
    virtual void PutClosingTag(const wchar_t * tag);

    virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
    virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);


    /*
        others
    */
    Item & GetItem(size_t i);
    Item & LastItem();

    wchar_t ToLower(wchar_t c);
    void ToLower(std::wstring & str);

    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
    bool IsNameEqual(const wchar_t * name1, const std::wstring & name2);
    bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
    bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);

    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
    bool IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len);
    bool IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len);
    bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);

    bool IsLastTag(const wchar_t * name);
    bool IsTagSafe(const wchar_t * tag);
    bool IsTagSafe(const std::wstring & tag);

    int CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
    bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
    bool CheckOrphan(const wchar_t * str, const wchar_t * end);

    bool IsWhite(int c);
    void SkipWhite();
    void SkipWhiteLines();
    void SkipWhiteWithFirstNewLine();
    void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
    bool IsClosingTagForLastItem();
    size_t OpeningCommentaryTagMarkSize();
    void SkipAndCheckClosingTag();

    void PopStack();
    bool PushStack();
    void CheckNewLine();
    void CheckStackPrintRest();
    void AddForgottenTags();
    void CheckClosingTags();
    void ReadNormalText();
    bool PrintRest();
    bool PrintOpeningItem();
    void ReadItemName();
    void ReadItemAttrName();
    void ReadItemAttrValue(bool has_quote);

    bool ReadItemAttr();
    bool CheckItemAttr();
    void PrinItemAttr();

    void ReadItemClosing();
    void ReadItemSpecial();
    void ReadItemOpening();
    bool ReadItem();
    void ReadLoop();
    void Read();

    void CheckChar(wchar_t c);
    void CheckLineWrap();
    bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);
    void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
    void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
    void PutLastTagWithClosingTag();
    void PutTabs(size_t len);
    void PutNonBreakingSpace();
    void PutNewLine();
    void CalcOrphansMaxLen(Orphans & orphans);


    const wchar_t * pchar;
    Item empty;
    Item * pstack;                  // stack pointer
    size_t stack_len;               // length of the stack
    wchar_t * buffer;               // buffer used when printing
    std::wstring * out_string;
    bool last_new_line;
    size_t break_after;             // insert a space into long words after 'break_after' characters
    size_t wrap_line;               // insert a new line character into long lines
    bool trim_white;                // trimming white characters
    size_t tab_size;
    OrphanMode orphan_mode;
    std::wstring attr_name;
    std::wstring attr_value;
    std::wstring attr_value_lower;
    bool attr_has_value;
    std::wstring lang_code_lower;
    size_t line_len;                // length of the current line (without first spaces which create the html tree)
    bool safe_mode;                 // skipping some unsafe tags
    Orphans orphans_temp;
};


} // namespace Winix

#endif