499 lines
15 KiB
C++
499 lines
15 KiB
C++
/*
 * This file is a part of PikoTools
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa <t.sowa@ttmath.org>
 */

/*
 * Copyright (c) 2008-2024, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
|
|
|
|
#ifndef headerfile_pikotools_src_html_htmlparser
|
|
#define headerfile_pikotools_src_html_htmlparser
|
|
|
|
#include <string>
|
|
#include <map>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
#include "convert/baseparser.h"
|
|
#include "space/space.h"
|
|
#include "textstream/stream.h"
|
|
|
|
|
|
namespace pt
|
|
{
|
|
|
|
|
|
|
|
// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30

// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10

// max length of a name of a tag attribute
// NOTE(review): presumably counted the same way as the item name limit - confirm
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40

// max length of a value of a tag attribute
// NOTE(review): presumably counted the same way as the item name limit - confirm
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500

// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048

// enforce the relation documented above at compile time, so that changing
// one of the limits cannot silently break the other
static_assert(WINIX_HTMLFILTER_BUFFER_MAXLEN >= WINIX_HTMLFILTER_ITEM_NAME_MAXLEN + 3,
	"WINIX_HTMLFILTER_BUFFER_MAXLEN should be at least WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3");
|
|
|
|
|
|
|
|
|
|
/*!
|
|
very lightweight filter for html
|
|
(without using any dynamic memory - some memory is allocated only at the beginning - in ctors)
|
|
this filter has O(n) complexity over the whole html string
|
|
|
|
such tags as: <script> <pre> <textarea> are treated in a special way
|
|
all characters between the opening and closing tag (<script>....</script>) are untouched
|
|
|
|
if the filter finds that there are not closed tags it will close them,
|
|
if the filter finds a closing tag which doesn't have an opening tag - it will skip it
|
|
|
|
tags which don't need to be closed: meta, input, br, img, link
|
|
look at CheckExceptions() method
|
|
|
|
the filter recognizes xml simple tags (with / at the end) such as: <br />
|
|
*/
|
|
class HTMLParser : public BaseParser
|
|
{
|
|
public:
|
|
|
|
|
|
/*
|
|
status of parsing
|
|
*/
|
|
enum Status { ok, cant_open_file, syntax_error };
|
|
|
|
|
|
enum OrphanMode
|
|
{
|
|
orphan_nbsp, // putting " " string
|
|
orphan_160space // putting 160 ascii code
|
|
};
|
|
|
|
|
|
// orphans for one language
|
|
struct Orphans
|
|
{
|
|
std::vector<std::wstring> tab;
|
|
size_t max_len;
|
|
};
|
|
|
|
|
|
struct Item
|
|
{
|
|
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
|
|
|
|
enum Type
|
|
{
|
|
opening, /* sample: <h1> */
|
|
closing, /* sample: </h1> */
|
|
simple, /* sample: <br/> */
|
|
special, /* sample: <!doctype> */
|
|
none
|
|
} type;
|
|
|
|
bool is_commentary;
|
|
|
|
bool is_cdata;
|
|
|
|
// is a new line before this tag (or just a new line and some white characters)
|
|
bool new_line_before;
|
|
|
|
// is there a new line after this tag (or just some white characters and a new line)
|
|
bool new_line_after;
|
|
|
|
// is there a new line in the middle after this tag and before the next tag
|
|
bool new_line_in_the_middle;
|
|
|
|
// is there a white char (but not new line) before this tag
|
|
bool white_char_before;
|
|
|
|
// current orphans table
|
|
// (will be propagated)
|
|
Orphans * porphans;
|
|
|
|
// this item or one from its parents is a 'body' html tag
|
|
// (will be propagated)
|
|
bool has_body_tag;
|
|
|
|
size_t tree_index;
|
|
|
|
Space * space;
|
|
|
|
void Clear();
|
|
Item();
|
|
};
|
|
|
|
|
|
class Listener
|
|
{
|
|
public:
|
|
|
|
Listener() {}
|
|
|
|
virtual void item_parsed(const Item & item) { }
|
|
virtual bool should_remove(const Item & item) { return false; }
|
|
virtual ~Listener() {}
|
|
|
|
};
|
|
|
|
|
|
/*
|
|
the last status of parsing, set by parse() methods
|
|
*/
|
|
Status status;
|
|
|
|
HTMLParser();
|
|
HTMLParser(const HTMLParser & f);
|
|
HTMLParser & operator=(const HTMLParser & f);
|
|
virtual ~HTMLParser();
|
|
|
|
void set_item_parsed_listener(Listener * listener);
|
|
|
|
|
|
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
|
|
|
|
Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
|
|
Status parse_xml(const char * str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml(const std::string & str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
|
|
Status parse_xml(const wchar_t * str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml(const std::wstring & str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
|
|
Status parse_xml(const pt::TextStream & str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
Status parse_xml(const pt::WTextStream & str, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
|
|
|
|
|
// main methods used for filtering
|
|
void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true);
|
|
void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true);
|
|
|
|
void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true);
|
|
|
|
HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true);
|
|
HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true);
|
|
HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true);
|
|
HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true);
|
|
|
|
|
|
/*
|
|
*
|
|
* returns a number of a last parsed line/column
|
|
* can be used to obtain the line in which there was a syntax error
|
|
*
|
|
*/
|
|
int get_last_parsed_line();
|
|
int get_last_parsed_column();
|
|
|
|
|
|
|
|
const static int WHITE_MODE_ORIGIN = 0;
|
|
const static int WHITE_MODE_SINGLE_LINE = 1;
|
|
const static int WHITE_MODE_TREE = 2;
|
|
|
|
|
|
// white chars mode
|
|
//
|
|
void white_chars_mode(int mode);
|
|
|
|
// if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char)
|
|
// (only between html tags and only in <body> subtree)
|
|
// skipped in such tags: script, pre, textarea
|
|
// 0 - off
|
|
// lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section)
|
|
void WrapLine(size_t wrap_line_);
|
|
|
|
// first tabs in a tree
|
|
// default: 2 (spaces)
|
|
// set 0 to turn off
|
|
void InsertTabs(size_t tabsize);
|
|
|
|
// set a name of a html tag which will be used as 'nofilter' tag
|
|
// elements between such tags are not filtered (similarly as in <pre> and <textarea>)
|
|
// these tags (opening and closing) will no be placed in the html output
|
|
void SetNoFilterTag(const std::wstring & tag_name);
|
|
|
|
// orphans are checked only in 'body' tag
|
|
void AssignOrphans(const wchar_t * lang_code, const std::vector<std::wstring> & otab);
|
|
void AssignOrphans(const std::wstring & lang_code, const std::vector<std::wstring> & otab);
|
|
void ClearOrphans();
|
|
|
|
// check 'orphans' for the specicic language
|
|
// if an orphan is detected then the non-break space (" " or ascii 160 code) will be put
|
|
// default disable (lang_none)
|
|
void OrphansMode(const std::wstring & orphan_mode);
|
|
|
|
// skipping some unsafe tags
|
|
// (script, iframe, frame, frameset, applet, head, meta, html, link, body, ...)
|
|
void SafeMode(bool safe_mode_);
|
|
|
|
// skip all html tags
|
|
// gives only text without markup
|
|
// but there can be commentaries
|
|
void SkipTags(bool skip_tags);
|
|
|
|
// skip commentaries
|
|
void SkipCommentaries(bool skip_commentaries);
|
|
|
|
// if true then entities such as are skipped
|
|
// this automatically turns on AnalyzeEntities
|
|
// in such a case FoundEntity callbacks are sent
|
|
void SkipEntities(bool skip_entities);
|
|
|
|
// analyze html entities such as
|
|
// virtual method: FoundEntity is called
|
|
// entities are analyzed in normal text and in attribute values such as <p class="a ">
|
|
void AnalyzeEntities(bool analyze_entities);
|
|
|
|
|
|
protected:
|
|
|
|
/*
|
|
* true when parsing html input, false for parsing xml
|
|
*/
|
|
bool parsing_html;
|
|
|
|
|
|
bool xml_compact_mode;
|
|
|
|
|
|
|
|
// orphans for all languages
|
|
// map<language_code, Orphans>
|
|
typedef std::map<std::wstring, Orphans> OrphansTab;
|
|
OrphansTab orphans_tab;
|
|
|
|
// html <nofilter> tag name
|
|
std::wstring no_filter_tag;
|
|
|
|
Listener * listener;
|
|
|
|
/*
|
|
true if the lastc was escaped (with a backslash)
|
|
we have to know if the last sequence was \" or just "
|
|
*/
|
|
bool char_was_escaped;
|
|
|
|
std::wstring escaped_chars_buffer;
|
|
size_t escaped_char_index;
|
|
|
|
/*
|
|
* filter mode, a method filter(...) was called
|
|
* in filter mode we do not unescape xml sequences such as < > ...
|
|
*/
|
|
bool filter_mode;
|
|
|
|
|
|
void clear_input_flags();
|
|
|
|
|
|
/*
|
|
virtual methods
|
|
*/
|
|
virtual void Init();
|
|
virtual void Uninit();
|
|
|
|
void prepare_to_parse_xml(Space & out_space, bool compact_mode, bool clear_space);
|
|
|
|
virtual bool IsOpeningTagMark(wchar_t c);
|
|
virtual bool IsClosingTagMark(wchar_t c);
|
|
virtual bool IsClosingTagIndicator(wchar_t c);
|
|
virtual bool IsSpecialTagIndicator(wchar_t c);
|
|
virtual bool IsXMLSpecialTagIndicator(wchar_t c);
|
|
virtual bool IsAttributeAssignmentMark(wchar_t c);
|
|
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
|
|
virtual bool IsStartingEntityMark(wchar_t c);
|
|
virtual bool IsEndingEntityMark(wchar_t c);
|
|
|
|
virtual bool IsValidCharForName(int c);
|
|
virtual bool IsValidCharForAttrName(int c);
|
|
virtual bool IsValidCharForEntityName(int c);
|
|
|
|
virtual void CheckSingleItemExceptions();
|
|
virtual void CheckWhiteCharsExceptions(Item & item);
|
|
virtual void CheckDifferentContentExceptions(Item & item);
|
|
|
|
virtual void Put(wchar_t c);
|
|
virtual void Put(const wchar_t * str, const wchar_t * end);
|
|
virtual void Put(const std::wstring & str);
|
|
virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out);
|
|
|
|
virtual void PutOpeningTagMark();
|
|
virtual void PutClosingTagMark();
|
|
virtual bool PutOpeningTag();
|
|
virtual void PutClosingTag(const Item & item);
|
|
|
|
virtual void ItemFound();
|
|
virtual void EntityFound(const wchar_t * str, const wchar_t * end);
|
|
|
|
/*
|
|
others
|
|
*/
|
|
void SetSomeDefaults();
|
|
|
|
Item & GetItem(size_t i);
|
|
Item & LastItem();
|
|
|
|
wchar_t ToLower(wchar_t c);
|
|
void ToLower(std::wstring & str);
|
|
|
|
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2);
|
|
bool IsNameEqual(const wchar_t * name1, const std::wstring & name2);
|
|
bool IsNameEqual(const std::wstring & name1, const wchar_t * name2);
|
|
bool IsNameEqual(const std::wstring & name1, const std::wstring & name2);
|
|
|
|
bool IsNameEqual(const wchar_t * name1, const wchar_t * name2, size_t len);
|
|
bool IsNameEqual(const wchar_t * name1, const std::wstring & name2, size_t len);
|
|
bool IsNameEqual(const std::wstring & name1, const wchar_t * name2, size_t len);
|
|
bool IsNameEqual(const std::wstring & name1, const std::wstring & name2, size_t len);
|
|
|
|
bool IsLastTag(const wchar_t * name);
|
|
bool IsLastTag(const std::wstring & name);
|
|
bool IsTagSafe(const wchar_t * tag);
|
|
bool IsTagSafe(const std::wstring & tag);
|
|
|
|
int CheckOrphan(const wchar_t * str, const wchar_t * end, const std::wstring & orphan_str);
|
|
bool CheckOrphan(const wchar_t * str, const wchar_t * end, const std::vector<std::wstring> & orphans);
|
|
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
|
|
|
|
bool IsWhite(int c);
|
|
void SkipWhite(std::wstring * out_string = nullptr);
|
|
void SkipWhiteLines(std::wstring * out_string = nullptr);
|
|
void SkipWhiteWithFirstNewLine();
|
|
|
|
int current_white_char_mode();
|
|
|
|
void ReadTextUntilClosingCommentary();
|
|
bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
|
|
void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
|
|
void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
|
|
|
|
void PopStack();
|
|
bool PushStack();
|
|
void CheckStackPrintRest();
|
|
void AddForgottenTags();
|
|
void CheckClosingTags();
|
|
void ReadText(bool is_cdata);
|
|
bool PrintRest();
|
|
bool PrintOpeningItem();
|
|
void ReadItemName(std::wstring & name, bool clear_name = true);
|
|
void ReadItemAttrName();
|
|
void ReadItemAttrValueAdd(const std::wstring & str);
|
|
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
|
|
void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);
|
|
|
|
bool ReadItemAttr();
|
|
void CheckItemLangAttr();
|
|
void PrintItemAttr();
|
|
void PutItemAttrToSpace();
|
|
|
|
void ReadItemClosing();
|
|
void ReadItemSpecial();
|
|
void ReadItemOpening();
|
|
bool ReadItem();
|
|
void ReadLoop();
|
|
void Read();
|
|
|
|
void CheckChar(wchar_t c);
|
|
|
|
bool PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space, bool is_cdata);
|
|
void PutNormalWhite(bool & was_white_char, bool & was_new_line, std::wstring * result_text = nullptr);
|
|
|
|
void PutTabs(size_t len);
|
|
void PutNonBreakingSpace();
|
|
void CalcOrphansMaxLen(Orphans & orphans);
|
|
|
|
void AddItemToSpace();
|
|
void RemoveLastSpace(size_t index);
|
|
void AddTextSpaceToSpaceTree(const Space & space);
|
|
|
|
void CallListener(size_t index);
|
|
|
|
bool check_escape_sequentions();
|
|
void read_xml_entity();
|
|
void read_char_from_entity_buffer();
|
|
int read_char() override;
|
|
|
|
Item empty;
|
|
Item * pstack; // stack pointer
|
|
size_t stack_len; // length of the stack
|
|
wchar_t * buffer; // buffer used when printing
|
|
std::wstring * out_string;
|
|
Stream * out_stream;
|
|
Space * out_space;
|
|
Space text_space_tmp;
|
|
|
|
std::vector<int> white_char_mode_tab;
|
|
|
|
//bool last_new_line;
|
|
bool new_item_has_new_line_before;
|
|
bool new_item_has_white_char_before;
|
|
int white_mode;
|
|
bool is_first_item;
|
|
size_t wrap_line; // insert a new line character into long lines
|
|
size_t tab_size;
|
|
OrphanMode orphan_mode;
|
|
std::wstring attr_name;
|
|
std::vector<std::wstring> attr_value;
|
|
std::wstring attr_value_temp;
|
|
std::wstring attr_value_lower;
|
|
bool attr_has_value;
|
|
std::wstring lang_code_lower;
|
|
size_t line_len; //length of the current line (without first spaces which create the html tree)
|
|
bool safe_mode; // skipping some unsafe tags
|
|
Orphans orphans_temp;
|
|
bool skip_tags;
|
|
bool skip_commentaries;
|
|
bool skip_entities;
|
|
bool analyze_entities;
|
|
std::wstring tmp_text;
|
|
std::wstring tmp_name;
|
|
};
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|