Compare commits
1 Commits
master
...
htmlparser
Author | SHA1 | Date |
---|---|---|
Tomasz Sowa | 03b159d9be |
|
@ -36,7 +36,6 @@
|
|||
*/
|
||||
|
||||
#include "htmlparser.h"
|
||||
|
||||
#include "convert/text.h"
|
||||
|
||||
|
||||
|
@ -102,6 +101,13 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
|||
}
|
||||
|
||||
|
||||
/*
	registers the listener which is asked about every parsed item
	(see RemoveIfNeeded()), only the pointer is stored here,
	passing nullptr turns the notifications off
*/
void HTMLParser::set_item_parsed_listener(ItemParsedListener * listener)
{
	this->item_parsed_listener = listener;
}
|
||||
|
||||
|
||||
|
||||
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
|
||||
{
|
||||
clear_input_flags();
|
||||
|
@ -296,6 +302,7 @@ void HTMLParser::SetSomeDefaults()
|
|||
skip_commentaries = false;
|
||||
skip_entities = false;
|
||||
analyze_entities = false;
|
||||
item_parsed_listener = nullptr;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1941,6 +1948,12 @@ void HTMLParser::CheckClosingTags()
|
|||
return;
|
||||
}
|
||||
|
||||
// CHECK ME
|
||||
if( RemoveIfNeeded(stack_len - 2) )
|
||||
{
|
||||
RemoveLastSpace(i);
|
||||
}
|
||||
|
||||
for(int z=(int)stack_len-2 ; z >= i ; --z)
|
||||
{
|
||||
CheckWhiteCharsExceptions(pstack[z]);
|
||||
|
@ -2047,6 +2060,36 @@ void HTMLParser::AddItemToSpace()
|
|||
}
|
||||
|
||||
|
||||
|
||||
void HTMLParser::RemoveLastSpace(size_t index)
|
||||
{
|
||||
if( out_space )
|
||||
{
|
||||
Space * parent = out_space;
|
||||
|
||||
if( index > 0 )
|
||||
{
|
||||
parent = pstack[index - 1].space;
|
||||
}
|
||||
|
||||
if( xml_compact_mode )
|
||||
{
|
||||
// IMPLEMENT ME
|
||||
}
|
||||
else
|
||||
{
|
||||
Space * childs_tab = parent->get_space(L"childs");
|
||||
size_t len = childs_tab->table_size();
|
||||
|
||||
if( childs_tab && childs_tab->is_table() && len > 0 && childs_tab->value.value_table[len-1] == pstack[stack_len-2].space )
|
||||
{
|
||||
childs_tab->remove(len - 1);
|
||||
pstack[stack_len-2].space = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HTMLParser::AddSpaceToSpaceTree(const Space & space)
|
||||
{
|
||||
const std::wstring * text = space.get_wstr(L"text");
|
||||
|
@ -2086,6 +2129,22 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
|
|||
|
||||
|
||||
|
||||
|
||||
bool HTMLParser::RemoveIfNeeded(size_t index)
|
||||
{
|
||||
if( item_parsed_listener )
|
||||
{
|
||||
if( !item_parsed_listener->item_parsed(pstack[index]) )
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void HTMLParser::ReadLoop()
|
||||
{
|
||||
while( status == ok && ReadItem() )
|
||||
|
@ -2111,6 +2170,12 @@ void HTMLParser::ReadLoop()
|
|||
else
|
||||
if( LastItem().type == Item::simple )
|
||||
{
|
||||
if( stack_len > 0 )
|
||||
{
|
||||
if( RemoveIfNeeded(stack_len - 1) )
|
||||
RemoveLastSpace(stack_len - 1);
|
||||
}
|
||||
|
||||
PopStack();
|
||||
}
|
||||
else
|
||||
|
|
|
@ -52,7 +52,6 @@ namespace pt
|
|||
|
||||
|
||||
|
||||
|
||||
// max length of the name of an html tag (with terminating null)
|
||||
#define WINIX_HTMLFILTER_ITEM_NAME_MAXLEN 30
|
||||
|
||||
|
@ -110,6 +109,66 @@ public:
|
|||
};
|
||||
|
||||
|
||||
/*
	orphans for one language
*/
struct Orphans
{
	// the orphans table
	std::vector<std::wstring> tab;

	// presumably the length of the longest entry in tab
	// (see CalcOrphansMaxLen()) -- TODO confirm
	size_t max_len;
};
|
||||
|
||||
|
||||
struct Item
|
||||
{
|
||||
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
|
||||
|
||||
enum Type
|
||||
{
|
||||
opening, /* sample: <h1> */
|
||||
closing, /* sample: </h1> */
|
||||
simple, /* sample: <br/> */
|
||||
special, /* sample: <!doctype> */
|
||||
none
|
||||
} type;
|
||||
|
||||
bool is_commentary;
|
||||
|
||||
bool new_line_before;
|
||||
|
||||
// is there a new line after this tag
|
||||
bool new_line;
|
||||
|
||||
// is there a new
|
||||
bool new_line_in_the_middle;
|
||||
|
||||
// current orphans table
|
||||
// (will be propagated)
|
||||
Orphans * porphans;
|
||||
|
||||
// this item or one from its parents is a 'body' html tag
|
||||
// (will be propagated)
|
||||
bool has_body_tag;
|
||||
|
||||
size_t tree_index;
|
||||
|
||||
Space * space;
|
||||
|
||||
void Clear();
|
||||
Item();
|
||||
};
|
||||
|
||||
|
||||
class ItemParsedListener
|
||||
{
|
||||
public:
|
||||
|
||||
ItemParsedListener() {}
|
||||
|
||||
virtual bool item_parsed(const Item & item) { return true; }
|
||||
virtual ~ItemParsedListener() {}
|
||||
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
the last status of parsing, set by parse() methods
|
||||
*/
|
||||
|
@ -120,6 +179,8 @@ public:
|
|||
HTMLParser & operator=(const HTMLParser & f);
|
||||
virtual ~HTMLParser();
|
||||
|
||||
void set_item_parsed_listener(ItemParsedListener * listener);
|
||||
|
||||
|
||||
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
|
||||
|
||||
|
@ -222,12 +283,6 @@ protected:
|
|||
|
||||
bool xml_compact_mode;
|
||||
|
||||
/*
	orphans for one language
*/
struct Orphans
{
	// the orphans table
	std::vector<std::wstring> tab;

	// presumably the length of the longest entry in tab
	// (see CalcOrphansMaxLen()) -- TODO confirm
	size_t max_len;
};
|
||||
|
||||
|
||||
// orphans for all languages
|
||||
|
@ -238,45 +293,9 @@ protected:
|
|||
// html <nofilter> tag name
|
||||
std::wstring no_filter_tag;
|
||||
|
||||
ItemParsedListener * item_parsed_listener;
|
||||
|
||||
struct Item
|
||||
{
|
||||
std::wstring name; // max size: WINIX_HTMLFILTER_ITEM_NAME_MAXLEN
|
||||
|
||||
enum Type
|
||||
{
|
||||
opening, /* sample: <h1> */
|
||||
closing, /* sample: </h1> */
|
||||
simple, /* sample: <br/> */
|
||||
special, /* sample: <!doctype> */
|
||||
none
|
||||
} type;
|
||||
|
||||
bool is_commentary;
|
||||
|
||||
bool new_line_before;
|
||||
|
||||
// is there a new line after this tag
|
||||
bool new_line;
|
||||
|
||||
// is there a new
|
||||
bool new_line_in_the_middle;
|
||||
|
||||
// current orphans table
|
||||
// (will be propagated)
|
||||
Orphans * porphans;
|
||||
|
||||
// this item or one from its parents is a 'body' html tag
|
||||
// (will be propagated)
|
||||
bool has_body_tag;
|
||||
|
||||
size_t tree_index;
|
||||
|
||||
Space * space;
|
||||
|
||||
void Clear();
|
||||
Item();
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
@ -398,8 +417,11 @@ protected:
|
|||
void CalcOrphansMaxLen(Orphans & orphans);
|
||||
|
||||
void AddItemToSpace();
|
||||
void RemoveLastSpace(size_t index);
|
||||
void AddSpaceToSpaceTree(const Space & space);
|
||||
|
||||
bool RemoveIfNeeded(size_t index);
|
||||
|
||||
Item empty;
|
||||
Item * pstack; // stack pointer
|
||||
size_t stack_len; // length of the stack
|
||||
|
|
Loading…
Reference in New Issue