/* * This file is a part of PikoTools * and is distributed under the (new) BSD licence. * Author: Tomasz Sowa */ /* * Copyright (c) 2008-2021, Tomasz Sowa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name Tomasz Sowa nor the names of contributors to this * project may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef headerfile_picotools_html_htmlfilter #define headerfile_picotools_html_htmlfilter #include #include #include #include #include "convert/baseparser.h" namespace pt { // max length of a name of a html tag (with terminating null) #define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30 // max length of a html lang attribute (e.g. 
"en", "pl") #define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10 /* presumably max lengths of a html attribute name and of its value - TODO confirm */ #define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40 #define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500 // depth of the html tree #define WINIX_HTMLFILTER_STACK_MAXLEN 100 // length of a buffer used for printing // it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3 #define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048 /*! very lightweight filter for html (without using any dynamic memory - some memory is allocated only at the beginning - in ctors) this filter has O(n) complexity over the whole html string such tags as: <script>, <pre>, <textarea> (presumably - the tag list was lost from this comment, TODO confirm) are untouched if the filter finds that there are not closed tags it will close them, if the filter finds a closing tag which doesn't have an opening tag - it will skip it tags which don't need to be closed: meta, input, br, img, link look at CheckExceptions() method the filter recognizes xml simple tags (with / at the end) such as: <br />
*/ class HTMLParser : public BaseParser { public: enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; HTMLParser(); HTMLParser(const HTMLParser & f); HTMLParser & operator=(const HTMLParser & f); virtual ~HTMLParser(); // main methods used for filtering void Filter(const wchar_t * in, std::wstring & out); void Filter(const std::wstring & in, std::wstring & out); const static int WHITE_MODE_ORIGIN = 0; const static int WHITE_MODE_SINGLE_LINE = 1; const static int WHITE_MODE_TREE = 2; // white chars mode // void white_chars_mode(int mode); // if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char) // (only between html tags and only in subtree) // skipped in such tags: script, pre, textarea // 0 - off // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) void WrapLine(size_t wrap_line_); // first tabs in a tree // default: 2 (spaces) // set 0 to turn off void InsertTabs(size_t tabsize); // set a name of a html tag which will be used as 'nofilter' tag // elements between such tags are not filtered (similarly as in
 and