winix/core/htmlfilter.h

222 lines
5.9 KiB
C++
Executable File

/*
* This file is a part of Winix
* and is not publicly distributed
*
* Copyright (c) 2008-2010, Tomasz Sowa
* All rights reserved.
*
*/
#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter
#include <string>
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_MAXLEN 30
// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100
// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048
/*!
very lightweight filter for html
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
this filter has O(n) complexity over the whole html string
such tags as: <script> <pre> <textarea> are treated in a special way
all characters between the opening and closing tag (<script>....</script>) are untouched
if the filter finds that there are not closed tags it will close them,
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
tags which don't need to be closed: meta, input, br, img, link
look at CheckExceptions() method
the filter recognizes xml simple tags (with / at the end) such as: <br />
*/
class HTMLFilter
{
public:
// for checking orphans
enum Lang
{
lang_pl,
lang_cz,
lang_sk,
lang_none
};
enum OrphanMode
{
orphan_nbsp, // putting "&nbsp;" string
orphan_160space // putting 160 ascii code
};
HTMLFilter();
HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f);
~HTMLFilter();
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
// insert a white space into long lines
// only between html tags
// skipped in such tags: script, pre, textarea
// break_after - after how many characters insert a space (0 - off)
void BreakLines(size_t break_after_);
// trimming white characters (with new lines)
// at the beginning, at the end and in the middle of a string
// only between html tags
// at the beginning and at the end only one space is left
// skipped in such tags: script, pre, textarea
// false by default
void TrimWhite(bool trim);
// first tabs in a tree
// default: 2 (spaces)
// set 0 to turn off
void InsertTabs(size_t tabsize);
// check 'orphans' for the specicic language
// if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
// default disable (lang_none)
void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp);
// skipping some unsafe tags
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_);
protected:
struct Item
{
wchar_t name[WINIX_HTMLFILTER_ITEM_MAXLEN];
size_t name_len;
enum Type
{
opening,
closing,
simple,
special,
none
} type;
// is there a new line after this tag
bool new_line;
void Clear();
Item();
};
// only this method have direct access to the output string
// you can easily change the output from a std::wstring to something else
virtual void Put(const wchar_t * str, const wchar_t * end);
Item & GetItem(size_t i);
Item & LastItem();
wchar_t ToLower(wchar_t c);
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
bool IsLastTag(const wchar_t * name);
bool IsTagSafe(const wchar_t * tag);
int CheckOrphan(const wchar_t * str, const wchar_t * end, const wchar_t * orphan);
bool CheckOrphanTable(const wchar_t * str, const wchar_t * end, const wchar_t ** table, size_t o1, size_t o2);
bool CheckOrphanLangPl(const wchar_t * str, const wchar_t * end);
bool CheckOrphanLangCz(const wchar_t * str, const wchar_t * end);
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
bool IsWhite(int c);
void SkipWhite();
void SkipWhiteLines();
void SkipWhiteWithFirstNewLine();
bool IsClosingTagForLastItem();
virtual bool IsOpeningTagMark();
virtual bool IsOpeningCommentaryTagMark();
size_t OpeningCommentaryTagMarkSize();
virtual bool IsClosingTagMark();
virtual bool IsClosingXmlSimpleTagMark();
bool SkipCommentaryTagIfExists();
const wchar_t * SkipItemCheckXmlSimple();
void PopStack();
bool PushStack();
virtual bool IsValidCharForName(int c);
void CheckNewLine();
virtual void CheckExceptions();
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
void ReadNormalText();
bool PrintRest();
void PrintItem(const wchar_t * start, const wchar_t * end);
void ReadItemName();
bool ReadItem();
virtual void Init();
virtual void Deinit();
void Read();
size_t PutNormalTextTrimFillBuffer(const wchar_t * & str, const wchar_t * & end);
size_t PutNormalTextFillBuffer(const wchar_t * & str, const wchar_t * & end);
virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
virtual void PutNormalTextTrim(const wchar_t * str, const wchar_t * end);
void PutLastTagWithClosingTag();
virtual void PutOpeningTagMark();
virtual void PutClosingTagMark();
virtual void PutTagName(const wchar_t * name);
virtual void PutOpeningTag(const wchar_t * start, const wchar_t * end);
virtual void PutClosingTag(const wchar_t * tag);
size_t PutTabsToBuffer(size_t index, size_t len);
size_t PutNonBreakSpaceToBuffer(size_t index);
void PutTabs(size_t len);
void PutNewLine();
const wchar_t * pchar;
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
bool last_new_line;
size_t break_after; // insert a space into long lines after break_after characters
bool trim_white; // trimming white characters
size_t tab_size;
Lang lang; // current language for checking orphans
OrphanMode orphan_mode;
bool safe_mode; // skipping some unsafe tags
};
#endif