|
|
|
@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
|
|
|
|
pchar_ascii = 0;
|
|
|
|
|
xml_compact_mode = compact_mode;
|
|
|
|
|
|
|
|
|
|
status = ok;
|
|
|
|
|
line = 1;
|
|
|
|
|
|
|
|
|
|
stack_len = 0;
|
|
|
|
|
out_string = nullptr;
|
|
|
|
|
out_space = &space;
|
|
|
|
@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Parses the XML file named by file_name into out_space.
 *
 * file_name    - path passed unchanged to file.open()
 * out_space    - destination Space; its address is stored in this->out_space
 * compact_mode - copied to xml_compact_mode
 * clear_space  - when true the destination is cleared before the file is opened,
 *                so it ends up cleared even if opening the file fails
 *
 * Returns the value of `status`: cant_open_file when the file could not be opened,
 * otherwise whatever `status` holds after Init()/Read()/Uninit() (it starts as ok).
 */
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	// select the xml-from-file mode
	parsing_html      = false;
	reading_from_file = true;
	xml_compact_mode  = compact_mode;

	// reset the per-run state
	status     = ok;
	line       = 1;
	stack_len  = 0;
	out_string = nullptr;
	line_len   = 0;

	// the parameter shadows the member, hence the explicit this->
	this->out_space = &out_space;

	if( clear_space )
	{
		this->out_space->clear();
	}

	// drop any error flags left from a previous run before opening
	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	Init();
	Read();
	Uninit();

	file.close();

	return status;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: takes the file name as a std::string and forwards
 * to the const char * version with the same remaining arguments.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const char * raw_file_name = file_name.c_str();

	return parse_xml_file(raw_file_name, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: takes the file name as a wide string, converts it
 * with wide_to_utf8() and forwards to the const char * version with the
 * same remaining arguments.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	std::string utf8_name;

	// NOTE(review): the result of wide_to_utf8() (if it returns one) is not checked here - confirm
	wide_to_utf8(file_name, utf8_name);

	const char * raw_file_name = utf8_name.c_str();

	return parse_xml_file(raw_file_name, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Convenience overload: takes the file name as a std::wstring and forwards
 * to the const wchar_t * version with the same remaining arguments.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const wchar_t * raw_file_name = file_name.c_str();

	return parse_xml_file(raw_file_name, out_space, compact_mode, clear_space);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
|
|
|
|
{
|
|
|
|
@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int HTMLParser::get_last_parsed_line()
|
|
|
|
|
{
|
|
|
|
|
return line;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::SetSomeDefaults()
|
|
|
|
|
{
|
|
|
|
|
white_mode = WHITE_MODE_ORIGIN;
|
|
|
|
@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
|
|
|
|
|
if( (c>='a' && c<='z') ||
|
|
|
|
|
(c>='A' && c<='Z') ||
|
|
|
|
|
(c>='0' && c<='9') ||
|
|
|
|
|
c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
|
|
|
|
|
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
|
|
|
|
|
if( (c>='a' && c<='z') ||
|
|
|
|
|
(c>='A' && c<='Z') ||
|
|
|
|
|
(c>='0' && c<='9') ||
|
|
|
|
|
c=='-' || c==':' )
|
|
|
|
|
c=='-' || c==':' || c=='_')
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
|
|
|
|
{
|
|
|
|
|
attr_value.clear();
|
|
|
|
|
tmp_text.clear();
|
|
|
|
|
|
|
|
|
|
while( lastc != -1 )
|
|
|
|
|
{
|
|
|
|
|
if( has_quote )
|
|
|
|
|
{
|
|
|
|
|
if( lastc == quote_char )
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// IMPROVEME add support for analyze_entities?
|
|
|
|
|
if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
|
|
|
|
tmp_text += lastc;
|
|
|
|
|
|
|
|
|
|
read_char();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void HTMLParser::CheckChar(wchar_t c)
|
|
|
|
|
{
|
|
|
|
|
if( c == 10 )
|
|
|
|
@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
|
|
|
|
|
return (c == '!');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// the '?' character, tested in ReadItem() right after the opening '<' has been skipped
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
	const bool is_question_mark = (c == L'?');

	return is_question_mark;
}
|
|
|
|
|
|
|
|
|
|
// the '=' operator e.g. class="value"
|
|
|
|
|
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
|
|
|
@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
|
|
|
|
|
if( has_quote )
|
|
|
|
|
read_char(); // skipping the first quote mark
|
|
|
|
|
|
|
|
|
|
ReadItemAttrValue(has_quote, quote_char);
|
|
|
|
|
// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
|
|
|
|
|
if( parsing_html )
|
|
|
|
|
ReadItemAttrValue(has_quote, quote_char);
|
|
|
|
|
else
|
|
|
|
|
ReadXMLItemAttrValue(has_quote, quote_char);
|
|
|
|
|
|
|
|
|
|
if( has_quote && lastc == quote_char )
|
|
|
|
|
read_char(); // skipping the last quote mark
|
|
|
|
@ -1361,11 +1464,18 @@ void HTMLParser::PutItemAttrToSpace()
|
|
|
|
|
|
|
|
|
|
if( attr_has_value )
|
|
|
|
|
{
|
|
|
|
|
attr.set_empty_table();
|
|
|
|
|
if( parsing_html )
|
|
|
|
|
{
|
|
|
|
|
attr.set_empty_table();
|
|
|
|
|
|
|
|
|
|
for(size_t i=0 ; i < attr_value.size() ; ++i)
|
|
|
|
|
for(size_t i=0 ; i < attr_value.size() ; ++i)
|
|
|
|
|
{
|
|
|
|
|
attr.add(attr_value[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
attr.add(attr_value[i]);
|
|
|
|
|
attr.set(tmp_text);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
|
|
|
|
|
PutOpeningTagMark();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
read_char(); // skipping '!'
|
|
|
|
|
LastItem().name = '!';
|
|
|
|
|
LastItem().name = lastc;
|
|
|
|
|
read_char(); // skipping '!' or '?'
|
|
|
|
|
ReadItemName(LastItem().name, false);
|
|
|
|
|
|
|
|
|
|
if( skip_tags )
|
|
|
|
@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
|
|
|
|
|
read_char(); // skipping the first opening tag mark '<'
|
|
|
|
|
SkipWhiteLines();
|
|
|
|
|
|
|
|
|
|
if( IsSpecialTagIndicator(lastc) )
|
|
|
|
|
if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
|
|
|
|
|
ReadItemSpecial();
|
|
|
|
|
else
|
|
|
|
|
if( IsClosingTagIndicator(lastc) )
|
|
|
|
@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
|
|
|
|
|
|
|
|
|
|
void HTMLParser::ReadLoop()
|
|
|
|
|
{
|
|
|
|
|
while( ReadItem() )
|
|
|
|
|
while( status == ok && ReadItem() )
|
|
|
|
|
{
|
|
|
|
|
if( LastItem().type == Item::opening )
|
|
|
|
|
{
|
|
|
|
@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
|
|
|
|
|
CheckSingleItemExceptions();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CheckWhiteCharsExceptions(LastItem());
|
|
|
|
|
CheckDifferentContentExceptions(LastItem());
|
|
|
|
|
}
|
|
|
|
@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
|
|
|
|
|
PopStack();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ReadText();
|
|
|
|
|
if( status == ok )
|
|
|
|
|
{
|
|
|
|
|
ReadText();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
is_first_item = false;
|
|
|
|
|
}
|
|
|
|
|