/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2010, Tomasz Sowa
 * All rights reserved.
 *
 */

#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter

#include <cstddef>
#include <string>


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_MAXLEN 30

// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048


/*!
    very lightweight filter for html
    (without using any dynamic memory - some memory is allocated
    only at the beginning - in ctors)
    this filter has O(n) complexity over the whole html string

    some tags are left untouched
    (NOTE(review): the list of those tags was lost from this comment
    when markup was stripped from the file - restore it from the
    implementation)

    if the filter finds that there are not closed tags it will close them,
    if the filter finds a closing tag which doesn't have an opening tag
    - it will skip it

    tags which don't need to be closed: meta, input, br, img, link
    look at CheckExceptions() method

    the filter recognizes xml simple tags (with / at the end),
    e.g. <br /> (presumably the original example - it was stripped too)
*/
class HTMLFilter
{
public:

    // for checking orphans
    enum Lang
    {
        lang_pl,
        lang_cz,
        lang_sk,
        lang_none
    };

    enum OrphanMode
    {
        orphan_nbsp,      // putting the non-break space as a string (presumably "&nbsp;")
        orphan_160space   // putting the character with code 160 (U+00A0, not an ASCII code)
    };

    HTMLFilter();
    HTMLFilter(const HTMLFilter & f);
    HTMLFilter & operator=(const HTMLFilter & f);

    // virtual: the class exposes virtual methods meant to be overridden,
    // so deleting a derived filter through HTMLFilter* must be well defined
    virtual ~HTMLFilter();


    // main methods used for filtering
    void Filter(const wchar_t * in, std::wstring & out);
    void Filter(const std::wstring & in, std::wstring & out);


    // insert a white space into long lines
    // only between html tags
    // skipped in such tags: script, pre, textarea
    // break_after - after how many characters insert a space (0 - off)
    void BreakLines(size_t break_after_);

    // trimming white characters (with new lines)
    // at the beginning, at the end and in the middle of a string
    // only between html tags
    // at the beginning and at the end only one space is left
    // skipped in such tags: script, pre, textarea
    // false by default
    void TrimWhite(bool trim);

    // first tabs in a tree
    // default: 2 (spaces)
    // set 0 to turn off
    void InsertTabs(size_t tabsize);

    // check 'orphans' for the specific language
    // if an orphan is detected then the non-break space
    // (as a string or as the character with code 160, see OrphanMode) will be put
    // default disable (lang_none)
    void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp);

    // skipping some unsafe tags
    // (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
    void SafeMode(bool safe_mode_);


protected:

    // one entry of the html tree stack
    struct Item
    {
        wchar_t name[WINIX_HTMLFILTER_ITEM_MAXLEN];
        size_t name_len;

        enum Type
        {
            opening,
            closing,
            simple,
            special,
            none
        } type;

        // is there a new line after this tag
        bool new_line;

        void Clear();
        Item();
    };


    // only this method has direct access to the output string
    // you can easily change the output from a std::wstring to something else
    virtual void Put(const wchar_t * str, const wchar_t * end);

    Item & GetItem(size_t i);
    Item & LastItem();

    wchar_t ToLower(wchar_t c);
    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
    bool IsLastTag(const wchar_t * name);
    bool IsTagSafe(const wchar_t * tag);

    int CheckOrphan(const wchar_t * str, const wchar_t * end, const wchar_t * orphan);
    bool CheckOrphanTable(const wchar_t * str, const wchar_t * end, const wchar_t ** table, size_t o1, size_t o2);
    bool CheckOrphanLangPl(const wchar_t * str, const wchar_t * end);
    bool CheckOrphanLangCz(const wchar_t * str, const wchar_t * end);
    bool CheckOrphan(const wchar_t * str, const wchar_t * end);

    bool IsWhite(int c);
    void SkipWhite();
    void SkipWhiteLines();
    void SkipWhiteWithFirstNewLine();

    bool IsClosingTagForLastItem();
    virtual bool IsOpeningTagMark();
    virtual bool IsOpeningCommentaryTagMark();
    size_t OpeningCommentaryTagMarkSize();
    virtual bool IsClosingTagMark();
    virtual bool IsClosingXmlSimpleTagMark();
    bool SkipCommentaryTagIfExists();
    const wchar_t * SkipItemCheckXmlSimple();

    void PopStack();
    bool PushStack();

    virtual bool IsValidCharForName(int c);
    void CheckNewLine();
    virtual void CheckExceptions();
    void CheckStackPrintRest();
    void AddForgottenTags();
    void CheckClosingTags();

    virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
    void ReadNormalText();
    bool PrintRest();
    void PrintItem(const wchar_t * start, const wchar_t * end);
    void ReadItemName();
    bool ReadItem();

    virtual void Init();
    virtual void Deinit();
    void Read();

    size_t PutNormalTextTrimFillBuffer(const wchar_t * & str, const wchar_t * & end);
    size_t PutNormalTextFillBuffer(const wchar_t * & str, const wchar_t * & end);
    virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
    virtual void PutNormalTextTrim(const wchar_t * str, const wchar_t * end);

    void PutLastTagWithClosingTag();
    virtual void PutOpeningTagMark();
    virtual void PutClosingTagMark();
    virtual void PutTagName(const wchar_t * name);
    virtual void PutOpeningTag(const wchar_t * start, const wchar_t * end);
    virtual void PutClosingTag(const wchar_t * tag);

    size_t PutTabsToBuffer(size_t index, size_t len);
    size_t PutNonBreakSpaceToBuffer(size_t index);
    void PutTabs(size_t len);
    void PutNewLine();


    const wchar_t * pchar;
    Item empty;
    Item * pstack;               // stack pointer
    size_t stack_len;            // length of the stack
    wchar_t * buffer;            // buffer used when printing
    std::wstring * out_string;
    bool last_new_line;
    size_t break_after;          // insert a space into long lines after break_after characters
    bool trim_white;             // trimming white characters
    size_t tab_size;
    Lang lang;                   // current language for checking orphans
    OrphanMode orphan_mode;
    bool safe_mode;              // skipping some unsafe tags
};


#endif