winix/core/htmlfilter.h

222 lines
5.8 KiB
C++
Executable File

/*
* This file is a part of Winix
* and is not publicly distributed
*
* Copyright (c) 2008-2010, Tomasz Sowa
* All rights reserved.
*
*/
#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter
#include <string>
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_MAXLEN 30
// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100
// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048
/*!
very lightweight filter for html
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
this filter has O(n) complexity over the whole html string
such tags as: <script> <pre> <textarea> are treated in a special way
all characters between the opening and closing tag (<script>....</script>) are untouched
if the filter finds that there are not closed tags it will close them,
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
tags which don't need to be closed: meta, input, br, img, link
look at CheckExceptions() method
the filter recognizes xml simple tags (with / at the end) such as: <br />
*/
class HTMLFilter
{
public:
// for checking orphans
enum Lang
{
lang_pl,
lang_cz,
lang_sk,
lang_none
};
enum OrphanMode
{
orphan_nbsp, // putting "&nbsp;" string
orphan_160space // putting 160 ascii code
};
HTMLFilter();
HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f);
~HTMLFilter();
// main methods used for filtering
void Filter(const char * in, std::string & out);
void Filter(const std::string & in, std::string & out);
// insert a white space into long lines
// only between html tags
// skipped in such tags: script, pre, textarea
// break_after - after how many characters insert a space (0 - off)
void BreakLines(size_t break_after_);
// trimming white characters (with new lines)
// at the beginning, at the end and in the middle of a string
// only between html tags
// at the beginning and at the end only one space is left
// skipped in such tags: script, pre, textarea
// false by default
void TrimWhite(bool trim);
// first tabs in a tree
// default: 2 (spaces)
// set 0 to turn off
void InsertTabs(size_t tabsize);
// check 'orphans' for the specicic language
// if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
// default disable (lang_none)
void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp);
// skipping some unsafe tags
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_);
protected:
struct Item
{
char name[WINIX_HTMLFILTER_ITEM_MAXLEN];
size_t name_len;
enum Type
{
opening,
closing,
simple,
special,
none
} type;
// is there a new line after this tag
bool new_line;
void Clear();
Item();
};
// only this method have direct access to the output string
// you can easily change the output from a std::string to something else
virtual void Put(const char * str, const char * end);
Item & GetItem(size_t i);
Item & LastItem();
int ToLower(int c);
bool IsNameEqual(const char * name1, const char * name2);
bool IsNameEqual(const char * name1, const char * name2, size_t len);
bool IsLastTag(const char * name);
bool IsTagSafe(const char * tag);
int CheckOrphan(const char * str, const char * end, const char * orphan);
bool CheckOrphanTable(const char * str, const char * end, const char ** table, size_t o1, size_t o2);
bool CheckOrphanLangPl(const char * str, const char * end);
bool CheckOrphanLangCz(const char * str, const char * end);
bool CheckOrphan(const char * str, const char * end);
bool IsWhite(int c);
void SkipWhite();
void SkipWhiteLines();
void SkipWhiteWithFirstNewLine();
bool IsClosingTagForLastItem();
virtual bool IsOpeningTagMark();
virtual bool IsOpeningCommentaryTagMark();
size_t OpeningCommentaryTagMarkSize();
virtual bool IsClosingTagMark();
virtual bool IsClosingXmlSimpleTagMark();
bool SkipCommentaryTagIfExists();
const char * SkipItemCheckXmlSimple();
void PopStack();
bool PushStack();
virtual bool IsValidCharForName(int c);
void CheckNewLine();
virtual void CheckExceptions();
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
virtual void ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white);
void ReadNormalText();
bool PrintRest();
void PrintItem(const char * start, const char * end);
void ReadItemName();
bool ReadItem();
virtual void Init();
virtual void Deinit();
void Read();
size_t PutNormalTextTrimFillBuffer(const char * & str, const char * & end);
size_t PutNormalTextFillBuffer(const char * & str, const char * & end);
virtual void PutNormalText(const char * str, const char * end);
virtual void PutNormalTextTrim(const char * str, const char * end);
void PutLastTagWithClosingTag();
virtual void PutOpeningTagMark();
virtual void PutClosingTagMark();
virtual void PutTagName(const char * name);
virtual void PutOpeningTag(const char * start, const char * end);
virtual void PutClosingTag(const char * tag);
size_t PutTabsToBuffer(size_t index, size_t len);
size_t PutNonBreakSpaceToBuffer(size_t index);
void PutTabs(size_t len);
void PutNewLine();
const char * pchar;
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack
char * buffer; // buffer used when printing
std::string * out_string;
bool last_new_line;
size_t break_after; // insert a space into long lines after break_after characters
bool trim_white; // trimming white characters
size_t tab_size;
Lang lang; // current language for checking orphans
OrphanMode orphan_mode;
bool safe_mode; // skipping some unsafe tags
};
#endif