winix/core/htmlfilter.h

167 lines
3.9 KiB
C++
Executable File

/*
* This file is a part of CMSLU -- Content Management System like Unix
* and is not publicly distributed
*
* Copyright (c) 2008-2009, Tomasz Sowa
* All rights reserved.
*
*/
#ifndef headerfilecmslucorehtmlfilter
#define headerfilecmslucorehtmlfilter
#include <string>
// maximum length of the name of a html tag (including the terminating null)
// this is the size of the Item::name array
#define CMSLU_HTMLFILTER_ITEM_MAXLEN 30
// maximum depth of the html tree
// NOTE(review): presumably the capacity of the tag stack (pstack) - confirm in the implementation
#define CMSLU_HTMLFILTER_STACK_MAXLEN 100
// length of the buffer used for printing
// it should be at least: CMSLU_HTMLFILTER_ITEM_MAXLEN+3
#define CMSLU_HTMLFILTER_BUFFER_MAXLEN 1024
/*!
a very lightweight filter for html
(no dynamic memory is used apart from what is allocated once at the beginning, in the ctors)
the filter has O(n) complexity over the whole html string
tags such as <script>, <pre> and <textarea> are treated in a special way:
all characters between the opening and the closing tag (<script>....</script>) are left untouched
if the filter finds tags which have not been closed, it will close them;
if the filter finds a closing tag which doesn't have an opening tag, it will skip it
tags which don't need to be closed: meta, input, br, img, link
(look at the CheckExceptions() method)
the filter recognizes simple xml tags (with / at the end) such as: <br />
*/
// Declaration of the html filter.
// Only declarations are present in this header; the definitions of all
// member functions live elsewhere.
class HTMLFilter
{
public:
// construction, copying and destruction are all user-declared
// NOTE(review): presumably because the object owns the memory behind
// 'pstack' and 'buffer' (raw pointers below) - confirm in the implementation
HTMLFilter();
HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f);
~HTMLFilter();
// main methods used for filtering
// 'in' is the html to be filtered, 'out' is the string for the result
// NOTE(review): 'in' (const char *) is presumably null-terminated, and it
// cannot be told from this header whether 'out' is cleared or appended to
// - confirm both in the implementation
void Filter(const char * in, std::string & out);
void Filter(const std::string & in, std::string & out);
// insert a white space into long lines
// only between html tags
// skipped in such tags: script, pre, textarea
// false by default
void BreakLongLines(bool break_lines);
// trimming white characters (including new lines)
// at the beginning, at the end and in the middle of a string
// only between html tags
// at the beginning and at the end only one space is left
// skipped in such tags: script, pre, textarea
// false by default
void TrimWhite(bool trim);
// size of the indentation (tabs) used for the first level in the tree
// default: 2 (spaces)
// set 0 to turn off
void InsertTabs(size_t tabsize);
protected:
// a description of a single html tag
struct Item
{
// name of the tag, at most CMSLU_HTMLFILTER_ITEM_MAXLEN chars
// (the limit includes the terminating null)
char name[CMSLU_HTMLFILTER_ITEM_MAXLEN];
// NOTE(review): presumably the number of characters stored in 'name' - confirm
size_t name_len;
// kind of the tag
// NOTE(review): 'simple' presumably denotes xml simple tags such as <br />,
// the exact meaning of 'special' and 'none' is not visible in this header
enum Type
{
opening,
closing,
simple,
special,
none
} type;
// is there a new line after this tag
bool new_line;
void Clear();
Item();
};
// only this method has direct access to the output string
// you can easily change the output from a std::string to something else
// NOTE(review): [str, end) looks like a character range to print - confirm
void Put(const char * str, const char * end);
// access to the items
// NOTE(review): presumably items on the stack ('pstack'), with the 'empty'
// member as a fallback - confirm in the implementation
Item & GetItem(size_t i);
Item & LastItem();
// helpers for characters and for comparing tag names
int ToLower(int c);
bool IsNameEqual(const char * name1, const char * name2);
bool IsNameEqual(const char * name1, const char * name2, size_t len);
bool IsLastTag(const char * name);
bool IsWhite(int c);
// helpers for skipping parts of the input
// NOTE(review): presumably all of them move 'pchar' forward - confirm
void SkipWhite();
void SkipWhiteLines();
bool SkipTagCheck();
void SkipNormalText();
bool IsOpeningCommentaryTag();
bool SkipCommentaryTagIfExists();
void SkipItem();
void SkipItemCheckXmlSimple();
// stack operations
// NOTE(review): PushStack() returns bool, presumably false when the stack
// is full (see CMSLU_HTMLFILTER_STACK_MAXLEN) - confirm
void PopStack();
bool PushStack();
// reading and checking tags
bool IsValidCharForName(int c);
void CheckNewLine();
// the class description refers to this method for the tags which
// don't need to be closed (meta, input, br, img, link)
void CheckExceptions();
void AddForgottenTags();
void CheckClosingTags();
void ReadNormalText();
void PrintRest();
void PrintItem(const char * start, const char * end);
void ReadItemName();
bool ReadItem();
void Read();
// printing helpers
// NOTE(review): PutTrimFillBuffer() takes both pointers by reference,
// so it can modify the caller's pointers - confirm how they are updated
size_t PutTrimFillBuffer(const char * & str, const char * & end);
void PutTrim(const char * str, const char * end);
void PutLastTagWithClosingTag();
void PutOpeningTag(const char * tag);
void PutClosingTag(const char * tag);
void PutTabs(size_t len);
void PutNewLine();
// NOTE(review): presumably the current position in the input string - confirm
const char * pchar;
// NOTE(review): purpose not visible in this header, possibly an item
// returned when there is no valid item on the stack - confirm
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack
char * buffer; // buffer used when printing
// the output string (see the comment for the Put() method)
std::string * out_string;
// NOTE(review): presumably true when the last printed character was a new line - confirm
bool last_new_line;
bool break_long_lines; // insert a space into long lines
bool trim_white; // trimming white characters
// size of the indentation, see InsertTabs()
size_t tab_size;
};
#endif