2009-12-09 01:42:40 +01:00
|
|
|
/*
|
2010-02-28 01:08:10 +01:00
|
|
|
* This file is a part of Winix
|
2009-12-09 01:42:40 +01:00
|
|
|
* and is not publicly distributed
|
|
|
|
*
|
2011-04-16 10:42:22 +02:00
|
|
|
* Copyright (c) 2008-2011, Tomasz Sowa
|
2009-12-09 01:42:40 +01:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2011-01-23 15:15:30 +01:00
|
|
|
#ifndef headerfile_winix_core_htmlfilter
|
|
|
|
#define headerfile_winix_core_htmlfilter
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
#include <string>
|
2011-04-16 10:42:22 +02:00
|
|
|
#include <map>
|
|
|
|
#include <vector>
|
|
|
|
#include <algorithm>
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// max length of an html tag name (with terminating null)
|
2011-04-16 10:42:22 +02:00
|
|
|
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30
|
|
|
|
|
|
|
|
// max length of an html lang attribute value (e.g. "en", "pl")
|
|
|
|
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10
|
|
|
|
|
|
|
|
|
|
|
|
// presumably the max length of the name of an html tag attribute
// (by analogy with the tag-name limit above) - TODO confirm against the implementation
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40
|
|
|
|
|
|
|
|
|
|
|
|
// presumably the max length of the value of an html tag attribute
// - TODO confirm against the implementation
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500
|
|
|
|
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
// depth of the html tree
|
2011-04-16 10:42:22 +02:00
|
|
|
#define WINIX_HTMLFILTER_STACK_MAXLEN 100
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
// length of a buffer used for printing
|
2011-04-16 10:42:22 +02:00
|
|
|
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
|
2010-06-21 00:47:24 +02:00
|
|
|
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048
|
2009-12-09 01:42:40 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*!
|
|
|
|
very lightweight filter for html
|
|
|
|
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
|
|
|
|
this filter has O(n) complexity over the whole html string
|
|
|
|
|
|
|
|
such tags as: <script> <pre> <textarea> are treated in a special way
|
|
|
|
all characters between the opening and closing tag (<script>....</script>) are untouched
|
|
|
|
|
|
|
|
if the filter finds tags which have not been closed it will close them,
|
|
|
|
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
|
|
|
|
|
|
|
|
tags which don't need to be closed: meta, input, br, img, link
|
|
|
|
look at CheckExceptions() method
|
|
|
|
|
|
|
|
the filter recognizes xml simple tags (with / at the end) such as: <br />
|
|
|
|
*/
|
|
|
|
class HTMLFilter
{
public:

    // how a non-breaking space is emitted after an orphan
    enum OrphanMode
    {
        orphan_nbsp,        // putting "&nbsp;" string
        orphan_160space     // putting 160 ascii code
    };


    HTMLFilter();
    HTMLFilter(const HTMLFilter & f);
    HTMLFilter & operator=(const HTMLFilter & f);

    // virtual: this class declares virtual methods and is meant to be derived from,
    // so destroying a derived filter through a HTMLFilter pointer must be well defined
    virtual ~HTMLFilter();


    // main methods used for filtering
    // 'in' is the html to be filtered, the result is stored in 'out'
    void Filter(const wchar_t * in, std::wstring & out);
    void Filter(const std::wstring & in, std::wstring & out);


    // insert a white space into long words
    // (only between html tags)
    // skipped in such tags: script, pre, textarea
    // break_after - after how many characters insert a space (0 - off)
    void BreakWord(size_t break_after_);


    // insert a new line character into long lines
    // (only between html tags)
    // skipped in such tags: script, pre, textarea
    // wrap_line - after how many characters wrap a line (0 - off)
    // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
    void WrapLine(size_t wrap_line_);


    // trimming white characters (with new lines)
    // at the beginning, at the end and in the middle of a string
    // only between html tags
    // at the beginning and at the end only one space is left
    // skipped in such tags: script, pre, textarea
    // false by default
    void TrimWhite(bool trim);


    // first tabs in a tree
    // default: 2 (spaces)
    // set 0 to turn off
    void InsertTabs(size_t tabsize);


    // assigning / clearing the table of orphans for a given language code
    // orphans are checked only in 'body' tag
    void AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab);
    void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
    void ClearOrphans();


    // check 'orphans' for the specific language
    // if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
    // 'mode' selects which of the two forms is used (see OrphanMode),
    // when called without an argument orphan_nbsp is taken
    void OrphansMode(OrphanMode mode = orphan_nbsp);


    // skipping some unsafe tags
    // (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
    void SafeMode(bool safe_mode_);



protected:

    // orphans for one language
    struct Orphans
    {
        std::vector<std::wstring> tab;
        size_t max_len;     // presumably the length of the longest entry in 'tab' (see CalcOrphansMaxLen) - TODO confirm
    };


    // orphans for all languages
    // map<language_code, Orphans>
    typedef std::map<std::wstring, Orphans> OrphansTab;
    OrphansTab orphans_tab;


    // one html tag (an element of the stack)
    struct Item
    {
        std::wstring name;  // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN

        enum Type
        {
            opening,    /* sample: <h1>        */
            closing,    /* sample: </h1>       */
            simple,     /* sample: <br/>       */
            special,    /* sample: <!doctype>  */
            none
        } type;

        // is there a new line after this tag
        bool new_line;

        // current orphans table
        // (will be propagated)
        Orphans * porphans;

        // this item or one from its parents is a 'body' html tag
        // (will be propagated)
        bool has_body_tag;

        void Clear();
        Item();
    };


    /*
        virtual methods
        (hooks which a derived filter can override)
    */
    virtual void Init();
    virtual void Uninit();

    virtual bool IsOpeningTagMark();
    virtual bool IsOpeningCommentaryTagMark();
    virtual bool IsClosingTagMark();
    virtual bool IsClosingXmlSimpleTagMark();

    virtual bool IsValidCharForName(int c);
    virtual bool IsValidCharForAttrName(int c);
    virtual void CheckExceptions();
    virtual bool SkipCommentaryTagIfExists();

    virtual void Put(wchar_t c);
    virtual void Put(const wchar_t * str);
    virtual void Put(const wchar_t * str, const wchar_t * end);
    virtual void Put(const std::wstring & str);

    virtual void PutOpeningTagMark();
    virtual void PutClosingTagMark();
    virtual bool PutOpeningTag();
    virtual void PutClosingTag(const wchar_t * tag);

    virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
    virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);


    /*
        others
    */
    Item & GetItem(size_t i);
    Item & LastItem();

    wchar_t ToLower(wchar_t c);
    void ToLower(std::wstring & str);

    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
    bool IsNameEqual(const wchar_t * name1, const std::wstring & name2);
    bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
    bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);

    bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
    bool IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len);
    bool IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len);
    bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);

    bool IsLastTag(const wchar_t * name);
    bool IsTagSafe(const wchar_t * tag);
    bool IsTagSafe(const std::wstring & tag);

    int  CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
    bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
    bool CheckOrphan(const wchar_t * str, const wchar_t * end);

    bool IsWhite(int c);
    void SkipWhite();
    void SkipWhiteLines();
    void SkipWhiteWithFirstNewLine();
    void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
    bool IsClosingTagForLastItem();
    size_t OpeningCommentaryTagMarkSize();
    void SkipAndCheckClosingTag();

    void PopStack();
    bool PushStack();
    void CheckNewLine();
    void CheckStackPrintRest();
    void AddForgottenTags();
    void CheckClosingTags();
    void ReadNormalText();
    bool PrintRest();
    bool PrintOpeningItem();
    void ReadItemName();
    void ReadItemAttrName();
    void ReadItemAttrValue(bool has_quote);

    bool ReadItemAttr();
    bool CheckItemAttr();
    void PrinItemAttr();    // (sic) the spelling is kept - the name is part of the existing interface

    void ReadItemClosing();
    void ReadItemSpecial();
    void ReadItemOpening();
    bool ReadItem();
    void ReadLoop();
    void Read();

    void CheckChar(wchar_t c);

    void CheckLineWrap();
    bool HasSemiloconAround(const wchar_t * str, const wchar_t * end);  // (sic) spelling kept, see above
    void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
    void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
    void PutLastTagWithClosingTag();
    void PutTabs(size_t len);
    void PutNonBreakingSpace();
    void PutNewLine();
    void CalcOrphansMaxLen(Orphans & orphans);


    const wchar_t * pchar;          // presumably the current position in the input string - TODO confirm
    Item empty;
    Item * pstack;                  // stack pointer
    size_t stack_len;               // length of the stack
    wchar_t * buffer;               // buffer used when printing
    std::wstring * out_string;      // presumably the 'out' argument of Filter() - TODO confirm
    bool last_new_line;
    size_t break_after;             // insert a space into long words after 'break_after' characters
    size_t wrap_line;               // insert a new line character into long lines
    bool trim_white;                // trimming white characters
    size_t tab_size;
    OrphanMode orphan_mode;
    std::wstring attr_name;
    std::wstring attr_value;
    std::wstring attr_value_lower;
    bool attr_has_value;
    std::wstring lang_code_lower;
    size_t line_len;                // length of the current line (without first spaces which create the html tree)
    bool safe_mode;                 // skipping some unsafe tags
    Orphans orphans_temp;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|