added: HTMLFilter (html/htmlfilter.h|cpp) - copied from winix project

htmlparserlistener
Tomasz Sowa 2 years ago
parent 1e5598cde1
commit bdb2616f32

@ -42,3 +42,4 @@
./mainoptions/mainoptionsparser.o: convert/inttostr.h utf8/utf8.h
./mainoptions/mainoptionsparser.o: textstream/stream.h utf8/utf8_templates.h
./mainoptions/mainoptionsparser.o: utf8/utf8_private.h
./html/htmlfilter.o: ./html/htmlfilter.h

File diff suppressed because it is too large Load Diff

@ -0,0 +1,376 @@
/*
* This file is a part of PikoTools
* and is distributed under the (new) BSD licence.
* Author: Tomasz Sowa <t.sowa@ttmath.org>
*/
/*
* Copyright (c) 2008-2021, Tomasz Sowa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef headerfile_picotools_html_htmlfilter
#define headerfile_picotools_html_htmlfilter
#include <string>
#include <map>
#include <vector>
#include <algorithm>
namespace pt
{
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30
// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500
// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100
// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048
/*!
very lightweight filter for html
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
this filter has O(n) complexity over the whole html string
such tags as: <script> <pre> <textarea> are treated in a special way
all characters between the opening and closing tag (<script>....</script>) are untouched
if the filter finds that there are not closed tags it will close them,
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
tags which don't need to be closed: meta, input, br, img, link
look at CheckExceptions() method
the filter recognizes xml simple tags (with / at the end) such as: <br />
*/
class HTMLFilter
{
public:
enum OrphanMode
{
orphan_nbsp, // putting "&nbsp;" string
orphan_160space // putting 160 ascii code
};
HTMLFilter();
HTMLFilter(const HTMLFilter & f);
HTMLFilter & operator=(const HTMLFilter & f);
virtual ~HTMLFilter();
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
// insert a white space into long words
// (only between html tags)
// skipped in such tags: script, pre, textarea
// break_after - after how many characters insert a space (0 - off)
void BreakWord(size_t break_after_);
// insert a new line character into long lines
// (only between html tags)
// skipped in such tags: script, pre, textarea
// wrap_line - after how many characters wrap a line (0 - off)
// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
void WrapLine(size_t wrap_line_);
// trimming white characters (with new lines)
// at the beginning, at the end and in the middle of a string
// only between html tags
// at the beginning and at the end only one space is left
// skipped in such tags: script, pre, textarea
// false by default
void TrimWhite(bool trim);
// first tabs in a tree
// default: 2 (spaces)
// set 0 to turn off
void InsertTabs(size_t tabsize);
// set a name of a html tag which will be used as 'nofilter' tag
// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
// these tags (opening and closing) will no be placed in the html output
void SetNoFilterTag(const std::wstring & tag_name);
// orphans are checked only in 'body' tag
void AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab);
void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
void ClearOrphans();
// check 'orphans' for the specicic language
// if an orphan is detected then the non-break space ("&nbsp;" or ascii 160 code) will be put
// default disable (lang_none)
void OrphansMode(const std::wstring & orphan_mode);
// skipping some unsafe tags
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
void SafeMode(bool safe_mode_);
// skip all html tags
// gives only text without markup
// but there can be commentaries
void SkipTags(bool skip_tags);
// skip commentaries
void SkipCommentaries(bool skip_commentaries);
// if true then entities such as &nbsp; are skipped
// this automatically turns on AnalyzeEntities
// in such a case FoundEntity callbacks are sent
void SkipEntities(bool skip_entities);
// analyze html entities such as &nbsp;
// virtual method: FoundEntity is called
// entities are analyzed in normal text and in attribute values such as <p class="a&nbsp;">
void AnalyzeEntities(bool analyze_entities);
protected:
// orphans for one language
struct Orphans
{
std::vector<std::wstring> tab;
size_t max_len;
};
// orphans for all languages
// map<language_code, Orphans>
typedef std::map<std::wstring, Orphans> OrphansTab;
OrphansTab orphans_tab;
// html <nofilter> tag name
std::wstring no_filter_tag;
struct Item
{
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
enum Type
{
opening, /* sample: <h1> */
closing, /* sample: </h1> */
simple, /* sample: <br/> */
special, /* sample: <!doctype> */
none
} type;
// is there a new line after this tag
bool new_line;
// current orphans table
// (will be propagated)
Orphans * porphans;
// this item or one from its parents is a 'body' html tag
// (will be propagated)
bool has_body_tag;
void Clear();
Item();
};
/*
virtual methods
*/
virtual void Init();
virtual void Uninit();
virtual bool IsOpeningTagMark(wchar_t c);
virtual bool IsClosingTagMark(wchar_t c);
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c);
virtual bool IsOpeningCommentaryTagMark(const wchar_t * str);
virtual size_t OpeningCommentaryTagMarkSize();
virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c);
virtual void CheckExceptions();
virtual bool SkipCommentaryTagIfExists();
virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str);
virtual void Put(const wchar_t * str, const wchar_t * end);
virtual void Put(const std::wstring & str);
virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
virtual void PutOpeningTagMark();
virtual void PutClosingTagMark();
virtual bool PutOpeningTag();
virtual void PutClosingTag(const wchar_t * tag);
virtual void PutNormalText(const wchar_t * str, const wchar_t * end);
virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white);
virtual void ItemFound();
virtual void EntityFound(const wchar_t * str, const wchar_t * end);
/*
others
*/
void SetSomeDefaults();
Item & GetItem(size_t i);
Item & LastItem();
wchar_t ToLower(wchar_t c);
void ToLower(std::wstring & str);
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
bool IsNameEqual(const wchar_t * name1, const std::wstring & name2);
bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
bool IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len);
bool IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len);
bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);
bool IsLastTag(const wchar_t * name);
bool IsLastTag(const std::wstring & name);
bool IsTagSafe(const wchar_t * tag);
bool IsTagSafe(const std::wstring & tag);
int CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
bool IsWhite(int c);
void SkipWhite();
void SkipWhiteLines();
void SkipWhiteWithFirstNewLine();
void SkipWhiteLines(const wchar_t * & str, const wchar_t * end);
bool IsClosingTagForLastItem();
void SkipAndCheckClosingTag();
void PopStack();
bool PushStack();
void CheckNewLine();
void CheckStackPrintRest();
void AddForgottenTags();
void CheckClosingTags();
void ReadNormalText();
bool PrintRest();
bool PrintOpeningItem();
void ReadItemName();
void ReadItemAttrName();
void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end);
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
bool ReadItemAttr();
bool CheckItemAttr();
void PrintItemAttr();
void ReadItemClosing();
void ReadItemSpecial();
void ReadItemOpening();
bool ReadItem();
void ReadLoop();
void Read();
void CheckChar(wchar_t c);
void CheckLineWrap();
bool HasEntityEndAround(const wchar_t * str, const wchar_t * end);
void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end);
void PutNormalWhite(const wchar_t * & str, const wchar_t * end);
void PutEverythingUntilClosingTag(bool put_closing_tag_as_well);
void PutTabs(size_t len);
void PutNonBreakingSpace();
void PutNewLine();
void CalcOrphansMaxLen(Orphans & orphans);
const wchar_t * pchar;
Item empty;
Item * pstack; // stack pointer
size_t stack_len; // length of the stack
wchar_t * buffer; // buffer used when printing
std::wstring * out_string;
bool last_new_line;
size_t break_after; // insert a space into long words after 'break_after' characters
size_t wrap_line; // insert a new line character into long lines
bool trim_white; // trimming white characters
size_t tab_size;
OrphanMode orphan_mode;
std::wstring attr_name;
std::vector<std::wstring> attr_value;
std::wstring attr_value_temp;
std::wstring attr_value_lower;
bool attr_has_value;
std::wstring lang_code_lower;
size_t line_len; //length of the current line (without first spaces which create the html tree)
bool safe_mode; // skipping some unsafe tags
Orphans orphans_temp;
bool skip_tags;
bool skip_commentaries;
bool skip_entities;
bool analyze_entities;
};
}
#endif

@ -1,6 +1,5 @@
# DO NOT DELETE
./main.o: convert.h mainoptionsparser.h csvparser.h
./convert.o: convert.h test.h ../src/convert/convert.h
./convert.o: ../src/convert/inttostr.h ../src/convert/patternreplacer.h
./convert.o: ../src/textstream/textstream.h ../src/textstream/stream.h
@ -11,6 +10,11 @@
./convert.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./convert.o: ../src/convert/strtoint.h ../src/convert/text.h
./convert.o: ../src/convert/misc.h ../src/convert/double.h
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h
./main.o: convert.h mainoptionsparser.h csvparser.h
./test.o: test.h
./mainoptionsparser.o: mainoptionsparser.h test.h
./mainoptionsparser.o: ../src/mainoptions/mainoptionsparser.h
@ -26,7 +30,3 @@
./mainoptionsparser.o: ../src/membuffer/membuffer.h ../src/textstream/types.h
./mainoptionsparser.o: ../src/convert/strtoint.h ../src/convert/text.h
./mainoptionsparser.o: ../src/convert/misc.h ../src/convert/double.h
./csvparser.o: csvparser.h ../src/csv/csvparser.h ../src/space/space.h
./csvparser.o: ../src/textstream/types.h ../src/convert/inttostr.h
./csvparser.o: ../src/utf8/utf8.h ../src/textstream/stream.h
./csvparser.o: ../src/utf8/utf8_templates.h ../src/utf8/utf8_private.h test.h

Loading…
Cancel
Save