/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2008-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef headerfile_picotools_html_htmlfilter
#define headerfile_picotools_html_htmlfilter

// NOTE(review): the four bare #include directives below carry no header name.
// The names appear to have been lost (probably stripped as angle-bracket markup).
// The class declared in this header uses std::string and std::wstring, so the
// standard string header is presumably one of them -- restore all four from upstream.
#include
#include
#include
#include
#include "convert/baseparser.h"
#include "space/space.h"
#include "textstream/stream.h"


namespace pt {


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30

// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10

// presumably the max length of a html attribute name
// NOTE(review): confirm whether the terminating null is counted, as it is for the tag name above
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40

// presumably the max length of a html attribute value
// NOTE(review): confirm whether the terminating null is counted
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500

// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100

// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048




/*!
	a very lightweight filter for html
	(it does not use dynamic memory while filtering - some memory is allocated
	only at the beginning, in the constructors)

	the filter has O(n) complexity over the whole html string

	the content of some tags is left untouched
	NOTE(review): the list of those tags was lost from this comment; the notes on
	WrapLine() mention script, pre and textarea - confirm against upstream

	if the filter finds tags which are not closed, it will close them;
	if the filter finds a closing tag which doesn't have an opening tag, it will skip it

	tags which don't need to be closed: meta, input, br, img, link
	(look at the CheckExceptions() method)

	the filter recognizes simple xml tags (with / at the end)
	NOTE(review): the example was lost from this comment; presumably something
	like a self-closed br tag
*/ class HTMLParser : public BaseParser { public: /* status of parsing */ enum Status { ok, cant_open_file, syntax_error }; enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; /* the last status of parsing, set by parse() methods */ Status status; HTMLParser(); HTMLParser(const HTMLParser & f); HTMLParser & operator=(const HTMLParser & f); virtual ~HTMLParser(); void parse_html(const wchar_t * in, Space & space, bool compact_mode = false); Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); // main methods used for filtering void filter(const wchar_t * in, std::wstring & out, bool clear_out_string = true); void filter(const std::wstring & in, std::wstring & out, bool clear_out_string = true); void filter(const WTextStream & in, Stream & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const char * file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const std::string & file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const wchar_t * file_name, std::wstring & out, bool clear_out_stream = true); HTMLParser::Status filter_file(const std::wstring & file_name, std::wstring & out, bool clear_out_stream = true); /* * * returns a number of a last parsed line * can be used to obtain the line in which there was a syntax error * */ int get_last_parsed_line(); const static int WHITE_MODE_ORIGIN = 0; const static int WHITE_MODE_SINGLE_LINE = 1; const static int WHITE_MODE_TREE = 2; // white chars mode 
// void white_chars_mode(int mode); // if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char) // (only between html tags and only in subtree) // skipped in such tags: script, pre, textarea // 0 - off // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) void WrapLine(size_t wrap_line_); // first tabs in a tree // default: 2 (spaces) // set 0 to turn off void InsertTabs(size_t tabsize); // set a name of a html tag which will be used as 'nofilter' tag // elements between such tags are not filtered (similarly as in
 and