/*
 * This file is a part of Winix
 * and is distributed under the 2-Clause BSD licence.
 * Author: Tomasz Sowa
 */

/*
 * Copyright (c) 2008-2018, Tomasz Sowa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifndef headerfile_winix_core_htmlfilter
#define headerfile_winix_core_htmlfilter

// NOTE(review): the header names of the four #include directives below are
// missing in this copy of the file - restore them before compiling
#include
#include
#include
#include


namespace Winix {


// compile-time size limits used by the html filter


// max length of a name of a html tag (with terminating null)
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30


// max length of a html lang attribute (e.g. "en", "pl")
#define WINIX_HTMLFILTER_ITEM_LANG_MAXLEN 10


// presumably the max length of a name of a html attribute
// NOTE(review): confirm whether the terminating null is counted here
#define WINIX_HTMLFILTER_ATTR_NAME_MAXLEN 40


// presumably the max length of a value of a html attribute
// NOTE(review): confirm whether the terminating null is counted here
#define WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN 500


// depth of the html tree
#define WINIX_HTMLFILTER_STACK_MAXLEN 100


// length of a buffer used for printing
// it should be at least: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN+3
#define WINIX_HTMLFILTER_BUFFER_MAXLEN 2048



/*!
	very lightweight filter for html
	(without using any dynamic memory - some memory is allocated
	only at the beginning - in ctors)

	this filter has O(n) complexity over the whole html string

	some tags are left untouched
	(NOTE(review): the list of these tags is missing in this copy of the
	file - presumably script, pre and textarea; TODO confirm)

	if the filter finds unclosed tags it will close them,
	if the filter finds a closing tag which doesn't have an opening tag
	- it will skip it

	tags which don't need to be closed: meta, input, br, img, link
	look at CheckExceptions() method

	(NOTE(review): the example tag which ended the sentence below is missing
	in this copy of the file - TODO restore)
	the filter recognizes xml simple tags (with / at the end) such as:
*/ class HTMLFilter { public: enum OrphanMode { orphan_nbsp, // putting " " string orphan_160space // putting 160 ascii code }; HTMLFilter(); HTMLFilter(const HTMLFilter & f); HTMLFilter & operator=(const HTMLFilter & f); virtual ~HTMLFilter(); // main methods used for filtering void Filter(const wchar_t * in, std::wstring & out); void Filter(const std::wstring & in, std::wstring & out); // insert a white space into long words // (only between html tags) // skipped in such tags: script, pre, textarea // break_after - after how many characters insert a space (0 - off) void BreakWord(size_t break_after_); // insert a new line character into long lines // (only between html tags) // skipped in such tags: script, pre, textarea // wrap_line - after how many characters wrap a line (0 - off) // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) void WrapLine(size_t wrap_line_); // trimming white characters (with new lines) // at the beginning, at the end and in the middle of a string // only between html tags // at the beginning and at the end only one space is left // skipped in such tags: script, pre, textarea // false by default void TrimWhite(bool trim); // first tabs in a tree // default: 2 (spaces) // set 0 to turn off void InsertTabs(size_t tabsize); // set a name of a html tag which will be used as 'nofilter' tag // elements between such tags are not filtered (similarly as in
 and