/*
 * This file is a part of Winix
 * and is not publicly distributed
 *
 * Copyright (c) 2008-2010, Tomasz Sowa
 * All rights reserved.
 *
 */

#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter

#include <string>


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_MAXLEN 30

// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048


/*!
	very lightweight filter for html
	(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
	this filter has O(n) complexity over the whole html string

	such tags as: <script> <pre> <textarea> (and comments: <!-- -->) are untouched

	if the filter finds that there are tags which are not closed, it will close them,
	if the filter finds a closing tag which doesn't have an opening tag - it will skip it

	tags which don't need to be closed: meta, input, br, img, link
	look at CheckExceptions() method

	the filter recognizes xml simple tags (with / at the end) such as: <br />
*/ class HTMLFilter { public: // for checking orphans enum Lang { lang_pl, lang_cz, lang_sk, lang_none }; enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; HTMLFilter(); HTMLFilter(const HTMLFilter & f); HTMLFilter & operator=(const HTMLFilter & f); ~HTMLFilter(); // main methods used for filtering void Filter(const char * in, std::string & out); void Filter(const std::string & in, std::string & out); // insert a white space into long lines // only between html tags // skipped in such tags: script, pre, textarea // break_after - after how many characters insert a space (0 - off) void BreakLines(size_t break_after_); // trimming white characters (with new lines) // at the beginning, at the end and in the middle of a string // only between html tags // at the beginning and at the end only one space is left // skipped in such tags: script, pre, textarea // false by default void TrimWhite(bool trim); // first tabs in a tree // default: 2 (spaces) // set 0 to turn off void InsertTabs(size_t tabsize); // check 'orphans' for the specicic language // if an orphan is detected then the non-break space (" " or ascii 160 code) will be put // default disable (lang_none) void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp); // skipping some unsafe tags // (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...) 
void SafeMode(bool safe_mode_); protected: struct Item { char name[WINIX_HTMLFILTER_ITEM_MAXLEN]; size_t name_len; enum Type { opening, closing, simple, special, none } type; // is there a new line after this tag bool new_line; void Clear(); Item(); }; // only this method have direct access to the output string // you can easily change the output from a std::string to something else virtual void Put(const char * str, const char * end); Item & GetItem(size_t i); Item & LastItem(); int ToLower(int c); bool IsNameEqual(const char * name1, const char * name2); bool IsNameEqual(const char * name1, const char * name2, size_t len); bool IsLastTag(const char * name); bool IsTagSafe(const char * tag); int CheckOrphan(const char * str, const char * end, const char * orphan); bool CheckOrphanTable(const char * str, const char * end, const char ** table, size_t o1, size_t o2); bool CheckOrphanLangPl(const char * str, const char * end); bool CheckOrphanLangCz(const char * str, const char * end); bool CheckOrphan(const char * str, const char * end); bool IsWhite(int c); void SkipWhite(); void SkipWhiteLines(); void SkipWhiteWithFirstNewLine(); bool IsClosingTagForLastItem(); virtual bool IsOpeningTagMark(); virtual bool IsOpeningCommentaryTagMark(); size_t OpeningCommentaryTagMarkSize(); virtual bool IsClosingTagMark(); virtual bool IsClosingXmlSimpleTagMark(); bool SkipCommentaryTagIfExists(); const char * SkipItemCheckXmlSimple(); void PopStack(); bool PushStack(); virtual bool IsValidCharForName(int c); void CheckNewLine(); virtual void CheckExceptions(); void CheckStackPrintRest(); void AddForgottenTags(); void CheckClosingTags(); virtual void ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white); void ReadNormalText(); bool PrintRest(); void PrintItem(const char * start, const char * end); void ReadItemName(); bool ReadItem(); virtual void Init(); virtual void Deinit(); void Read(); size_t PutNormalTextTrimFillBuffer(const char * & str, const char * & 
end); size_t PutNormalTextFillBuffer(const char * & str, const char * & end); virtual void PutNormalText(const char * str, const char * end); virtual void PutNormalTextTrim(const char * str, const char * end); void PutLastTagWithClosingTag(); virtual void PutOpeningTagMark(); virtual void PutClosingTagMark(); virtual void PutTagName(const char * name); virtual void PutOpeningTag(const char * start, const char * end); virtual void PutClosingTag(const char * tag); size_t PutTabsToBuffer(size_t index, size_t len); size_t PutNonBreakSpaceToBuffer(size_t index); void PutTabs(size_t len); void PutNewLine(); const char * pchar; Item empty; Item * pstack; // stack pointer size_t stack_len; // length of the stack char * buffer; // buffer used when printing std::string * out_string; bool last_new_line; size_t break_after; // insert a space into long lines after break_after characters bool trim_white; // trimming white characters size_t tab_size; Lang lang; // current language for checking orphans OrphanMode orphan_mode; bool safe_mode; // skipping some unsafe tags }; #endif