222 lines
5.8 KiB
C++
Executable File
222 lines
5.8 KiB
C++
Executable File
/*
|
|
* This file is a part of Winix
|
|
* and is not publicly distributed
|
|
*
|
|
* Copyright (c) 2008-2010, Tomasz Sowa
|
|
* All rights reserved.
|
|
*
|
|
*/
|
|
|
|
#ifndef headerfilecmslucorehtmlfilter
|
|
#define headerfilecmslucorehtmlfilter
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
|
|
// Compile-time limits of the html filter.

// Maximum length of the name of a html tag
// (the terminating null is counted in this value).
#define WINIX_HTMLFILTER_ITEM_MAXLEN 30

// Maximum depth of the html tree.
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// Length of the buffer used for printing.
// It should be at least: WINIX_HTMLFILTER_ITEM_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048
|
|
|
|
|
|
|
|
|
|
/*!
|
|
very lightweight filter for html
|
|
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
|
|
this filter has O(n) complexity over the whole html string
|
|
|
|
such tags as: <script> <pre> <textarea> are treated in a special way
|
|
all characters between the opening and closing tag (<script>....</script>) are untouched
|
|
|
|
if the filter finds that there are not closed tags it will close them,
|
|
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
|
|
|
|
tags which don't need to be closed: meta, input, br, img, link
|
|
look at CheckExceptions() method
|
|
|
|
the filter recognizes xml simple tags (with / at the end) such as: <br />
|
|
*/
|
|
class HTMLFilter
|
|
{
|
|
public:
|
|
|
|
|
|
// for checking orphans
|
|
enum Lang
|
|
{
|
|
lang_pl,
|
|
lang_cz,
|
|
lang_sk,
|
|
lang_none
|
|
};
|
|
|
|
enum OrphanMode
|
|
{
|
|
orphan_nbsp, // putting " " string
|
|
orphan_160space // putting 160 ascii code
|
|
};
|
|
|
|
|
|
HTMLFilter();
|
|
HTMLFilter(const HTMLFilter & f);
|
|
HTMLFilter & operator=(const HTMLFilter & f);
|
|
~HTMLFilter();
|
|
|
|
|
|
// main methods used for filtering
|
|
void Filter(const char * in, std::string & out);
|
|
void Filter(const std::string & in, std::string & out);
|
|
|
|
|
|
// insert a white space into long lines
|
|
// only between html tags
|
|
// skipped in such tags: script, pre, textarea
|
|
// break_after - after how many characters insert a space (0 - off)
|
|
void BreakLines(size_t break_after_);
|
|
|
|
|
|
// trimming white characters (with new lines)
|
|
// at the beginning, at the end and in the middle of a string
|
|
// only between html tags
|
|
// at the beginning and at the end only one space is left
|
|
// skipped in such tags: script, pre, textarea
|
|
// false by default
|
|
void TrimWhite(bool trim);
|
|
|
|
|
|
// first tabs in a tree
|
|
// default: 2 (spaces)
|
|
// set 0 to turn off
|
|
void InsertTabs(size_t tabsize);
|
|
|
|
|
|
// check 'orphans' for the specicic language
|
|
// if an orphan is detected then the non-break space (" " or ascii 160 code) will be put
|
|
// default disable (lang_none)
|
|
void CheckOrphans(Lang lang_, OrphanMode mode = orphan_nbsp);
|
|
|
|
|
|
// skipping some unsafe tags
|
|
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
|
|
void SafeMode(bool safe_mode_);
|
|
|
|
|
|
|
|
protected:
|
|
|
|
struct Item
|
|
{
|
|
char name[WINIX_HTMLFILTER_ITEM_MAXLEN];
|
|
size_t name_len;
|
|
|
|
enum Type
|
|
{
|
|
opening,
|
|
closing,
|
|
simple,
|
|
special,
|
|
none
|
|
} type;
|
|
|
|
// is there a new line after this tag
|
|
bool new_line;
|
|
|
|
void Clear();
|
|
Item();
|
|
};
|
|
|
|
|
|
// only this method have direct access to the output string
|
|
// you can easily change the output from a std::string to something else
|
|
virtual void Put(const char * str, const char * end);
|
|
|
|
|
|
Item & GetItem(size_t i);
|
|
Item & LastItem();
|
|
|
|
int ToLower(int c);
|
|
bool IsNameEqual(const char * name1, const char * name2);
|
|
bool IsNameEqual(const char * name1, const char * name2, size_t len);
|
|
bool IsLastTag(const char * name);
|
|
bool IsTagSafe(const char * tag);
|
|
|
|
int CheckOrphan(const char * str, const char * end, const char * orphan);
|
|
bool CheckOrphanTable(const char * str, const char * end, const char ** table, size_t o1, size_t o2);
|
|
bool CheckOrphanLangPl(const char * str, const char * end);
|
|
bool CheckOrphanLangCz(const char * str, const char * end);
|
|
bool CheckOrphan(const char * str, const char * end);
|
|
|
|
bool IsWhite(int c);
|
|
void SkipWhite();
|
|
void SkipWhiteLines();
|
|
void SkipWhiteWithFirstNewLine();
|
|
bool IsClosingTagForLastItem();
|
|
virtual bool IsOpeningTagMark();
|
|
virtual bool IsOpeningCommentaryTagMark();
|
|
size_t OpeningCommentaryTagMarkSize();
|
|
virtual bool IsClosingTagMark();
|
|
virtual bool IsClosingXmlSimpleTagMark();
|
|
bool SkipCommentaryTagIfExists();
|
|
const char * SkipItemCheckXmlSimple();
|
|
|
|
void PopStack();
|
|
bool PushStack();
|
|
virtual bool IsValidCharForName(int c);
|
|
void CheckNewLine();
|
|
virtual void CheckExceptions();
|
|
void CheckStackPrintRest();
|
|
void AddForgottenTags();
|
|
void CheckClosingTags();
|
|
virtual void ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white);
|
|
void ReadNormalText();
|
|
bool PrintRest();
|
|
void PrintItem(const char * start, const char * end);
|
|
void ReadItemName();
|
|
bool ReadItem();
|
|
virtual void Init();
|
|
virtual void Deinit();
|
|
void Read();
|
|
|
|
size_t PutNormalTextTrimFillBuffer(const char * & str, const char * & end);
|
|
size_t PutNormalTextFillBuffer(const char * & str, const char * & end);
|
|
virtual void PutNormalText(const char * str, const char * end);
|
|
virtual void PutNormalTextTrim(const char * str, const char * end);
|
|
void PutLastTagWithClosingTag();
|
|
virtual void PutOpeningTagMark();
|
|
virtual void PutClosingTagMark();
|
|
virtual void PutTagName(const char * name);
|
|
virtual void PutOpeningTag(const char * start, const char * end);
|
|
virtual void PutClosingTag(const char * tag);
|
|
size_t PutTabsToBuffer(size_t index, size_t len);
|
|
size_t PutNonBreakSpaceToBuffer(size_t index);
|
|
void PutTabs(size_t len);
|
|
void PutNewLine();
|
|
|
|
const char * pchar;
|
|
Item empty;
|
|
Item * pstack; // stack pointer
|
|
size_t stack_len; // length of the stack
|
|
char * buffer; // buffer used when printing
|
|
std::string * out_string;
|
|
bool last_new_line;
|
|
size_t break_after; // insert a space into long lines after break_after characters
|
|
bool trim_white; // trimming white characters
|
|
size_t tab_size;
|
|
Lang lang; // current language for checking orphans
|
|
OrphanMode orphan_mode;
|
|
bool safe_mode; // skipping some unsafe tags
|
|
};
|
|
|
|
|
|
|
|
|
|
#endif
|