HTMLParser: start working on xml mode
Added methods:
- Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
- Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
- Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
- Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
This commit is contained in:
parent
b1cc64a29b
commit
2576eb12d1
|
@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
||||||
pchar_ascii = 0;
|
pchar_ascii = 0;
|
||||||
xml_compact_mode = compact_mode;
|
xml_compact_mode = compact_mode;
|
||||||
|
|
||||||
|
status = ok;
|
||||||
|
line = 1;
|
||||||
|
|
||||||
stack_len = 0;
|
stack_len = 0;
|
||||||
out_string = nullptr;
|
out_string = nullptr;
|
||||||
out_space = &space;
|
out_space = &space;
|
||||||
|
@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Parses an xml file and puts the result into out_space.
 *
 * file_name    - path of the file to read, a null pointer is reported as cant_open_file
 * out_space    - destination Space, its address is stored in the out_space member
 * compact_mode - copied to xml_compact_mode
 * clear_space  - when true then out_space is cleared before the file is opened
 *
 * Returns the value of the status member: cant_open_file when the stream cannot be opened,
 * otherwise the status left after Init(), Read() and Uninit() were called
 * (it is set to ok before they run).
 */
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	parsing_html = false;
	reading_from_file = true;
	xml_compact_mode = compact_mode;

	status = ok;
	line = 1;
	stack_len = 0;
	out_string = nullptr;
	line_len = 0;

	this->out_space = &out_space;

	if( clear_space )
		this->out_space->clear();

	// giving a null pointer to open() is undefined behaviour,
	// so it is treated in the same way as a file which cannot be opened
	if( !file_name )
	{
		status = cant_open_file;
		return status;
	}

	// clear() drops error flags which could be left in the stream
	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	Init();
	Read();
	Uninit();

	file.close();

	return status;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * An overload taking the file name as a std::string,
 * the whole work is done by the const char * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const char * c_file_name = file_name.c_str();

	return parse_xml_file(c_file_name, out_space, compact_mode, clear_space);
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * An overload taking the file name as a wide string.
 * The name is passed through wide_to_utf8() (its result is not checked here)
 * and the converted name is given to the const char * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	std::string utf8_name;
	wide_to_utf8(file_name, utf8_name);

	const char * c_file_name = utf8_name.c_str();

	return parse_xml_file(c_file_name, out_space, compact_mode, clear_space);
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * An overload taking the file name as a std::wstring,
 * the whole work is done by the const wchar_t * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const wchar_t * wide_file_name = file_name.c_str();

	return parse_xml_file(wide_file_name, out_space, compact_mode, clear_space);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
||||||
{
|
{
|
||||||
|
@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int HTMLParser::get_last_parsed_line()
|
||||||
|
{
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void HTMLParser::SetSomeDefaults()
|
void HTMLParser::SetSomeDefaults()
|
||||||
{
|
{
|
||||||
white_mode = WHITE_MODE_ORIGIN;
|
white_mode = WHITE_MODE_ORIGIN;
|
||||||
|
@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
|
||||||
if( (c>='a' && c<='z') ||
|
if( (c>='a' && c<='z') ||
|
||||||
(c>='A' && c<='Z') ||
|
(c>='A' && c<='Z') ||
|
||||||
(c>='0' && c<='9') ||
|
(c>='0' && c<='9') ||
|
||||||
c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
|
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
|
||||||
if( (c>='a' && c<='z') ||
|
if( (c>='a' && c<='z') ||
|
||||||
(c>='A' && c<='Z') ||
|
(c>='A' && c<='Z') ||
|
||||||
(c>='0' && c<='9') ||
|
(c>='0' && c<='9') ||
|
||||||
c=='-' || c==':' )
|
c=='-' || c==':' || c=='_')
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||||
|
{
|
||||||
|
attr_value.clear();
|
||||||
|
tmp_text.clear();
|
||||||
|
|
||||||
|
while( lastc != -1 )
|
||||||
|
{
|
||||||
|
if( has_quote )
|
||||||
|
{
|
||||||
|
if( lastc == quote_char )
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// IMPROVEME add support for analyze_entities?
|
||||||
|
if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
||||||
|
tmp_text += lastc;
|
||||||
|
|
||||||
|
read_char();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void HTMLParser::CheckChar(wchar_t c)
|
void HTMLParser::CheckChar(wchar_t c)
|
||||||
{
|
{
|
||||||
if( c == 10 )
|
if( c == 10 )
|
||||||
|
@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
|
||||||
return (c == '!');
|
return (c == '!');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// returns true only for the '?' character,
// ReadItem() tests it on the character found after the opening '<' has been skipped
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
	const bool is_question_mark = (c == '?');

	return is_question_mark;
}
|
||||||
|
|
||||||
// the '=' operator e.g. class="value"
|
// the '=' operator e.g. class="value"
|
||||||
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
||||||
|
@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
|
||||||
if( has_quote )
|
if( has_quote )
|
||||||
read_char(); // skipping the first quote mark
|
read_char(); // skipping the first quote mark
|
||||||
|
|
||||||
ReadItemAttrValue(has_quote, quote_char);
|
// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
|
||||||
|
if( parsing_html )
|
||||||
|
ReadItemAttrValue(has_quote, quote_char);
|
||||||
|
else
|
||||||
|
ReadXMLItemAttrValue(has_quote, quote_char);
|
||||||
|
|
||||||
if( has_quote && lastc == quote_char )
|
if( has_quote && lastc == quote_char )
|
||||||
read_char(); // skipping the last quote mark
|
read_char(); // skipping the last quote mark
|
||||||
|
@ -1361,11 +1464,18 @@ void HTMLParser::PutItemAttrToSpace()
|
||||||
|
|
||||||
if( attr_has_value )
|
if( attr_has_value )
|
||||||
{
|
{
|
||||||
attr.set_empty_table();
|
if( parsing_html )
|
||||||
|
|
||||||
for(size_t i=0 ; i < attr_value.size() ; ++i)
|
|
||||||
{
|
{
|
||||||
attr.add(attr_value[i]);
|
attr.set_empty_table();
|
||||||
|
|
||||||
|
for(size_t i=0 ; i < attr_value.size() ; ++i)
|
||||||
|
{
|
||||||
|
attr.add(attr_value[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
attr.set(tmp_text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
|
||||||
PutOpeningTagMark();
|
PutOpeningTagMark();
|
||||||
}
|
}
|
||||||
|
|
||||||
read_char(); // skipping '!'
|
LastItem().name = lastc;
|
||||||
LastItem().name = '!';
|
read_char(); // skipping '!' or '?'
|
||||||
ReadItemName(LastItem().name, false);
|
ReadItemName(LastItem().name, false);
|
||||||
|
|
||||||
if( skip_tags )
|
if( skip_tags )
|
||||||
|
@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
|
||||||
read_char(); // skipping the first opening tag mark '<'
|
read_char(); // skipping the first opening tag mark '<'
|
||||||
SkipWhiteLines();
|
SkipWhiteLines();
|
||||||
|
|
||||||
if( IsSpecialTagIndicator(lastc) )
|
if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
|
||||||
ReadItemSpecial();
|
ReadItemSpecial();
|
||||||
else
|
else
|
||||||
if( IsClosingTagIndicator(lastc) )
|
if( IsClosingTagIndicator(lastc) )
|
||||||
|
@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
|
||||||
|
|
||||||
void HTMLParser::ReadLoop()
|
void HTMLParser::ReadLoop()
|
||||||
{
|
{
|
||||||
while( ReadItem() )
|
while( status == ok && ReadItem() )
|
||||||
{
|
{
|
||||||
if( LastItem().type == Item::opening )
|
if( LastItem().type == Item::opening )
|
||||||
{
|
{
|
||||||
|
@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
|
||||||
CheckSingleItemExceptions();
|
CheckSingleItemExceptions();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
CheckWhiteCharsExceptions(LastItem());
|
CheckWhiteCharsExceptions(LastItem());
|
||||||
CheckDifferentContentExceptions(LastItem());
|
CheckDifferentContentExceptions(LastItem());
|
||||||
}
|
}
|
||||||
|
@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
|
||||||
PopStack();
|
PopStack();
|
||||||
}
|
}
|
||||||
|
|
||||||
ReadText();
|
if( status == ok )
|
||||||
|
{
|
||||||
|
ReadText();
|
||||||
|
}
|
||||||
|
|
||||||
is_first_item = false;
|
is_first_item = false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -95,12 +95,25 @@ class HTMLParser : public BaseParser
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
status of parsing
|
||||||
|
*/
|
||||||
|
enum Status
{
	ok,               // the value assigned when parse_xml_file() starts
	cant_open_file,   // assigned by parse_xml_file() when the file stream cannot be opened
	syntax_error      // NOTE(review): presumably reported by the reading code for malformed input - confirm
};
|
||||||
|
|
||||||
|
|
||||||
enum OrphanMode
|
enum OrphanMode
|
||||||
{
|
{
|
||||||
orphan_nbsp, // putting " " string
|
orphan_nbsp, // putting " " string
|
||||||
orphan_160space // putting 160 ascii code
|
orphan_160space // putting 160 ascii code
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
the last status of parsing, set by parse() methods
|
||||||
|
*/
|
||||||
|
Status status;
|
||||||
|
|
||||||
HTMLParser();
|
HTMLParser();
|
||||||
HTMLParser(const HTMLParser & f);
|
HTMLParser(const HTMLParser & f);
|
||||||
HTMLParser & operator=(const HTMLParser & f);
|
HTMLParser & operator=(const HTMLParser & f);
|
||||||
|
@ -109,12 +122,30 @@ public:
|
||||||
|
|
||||||
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
|
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
|
||||||
|
|
||||||
|
Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||||
|
Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||||
|
Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||||
|
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// main methods used for filtering
|
// main methods used for filtering
|
||||||
void Filter(const wchar_t * in, std::wstring & out);
|
void Filter(const wchar_t * in, std::wstring & out);
|
||||||
void Filter(const std::wstring & in, std::wstring & out);
|
void Filter(const std::wstring & in, std::wstring & out);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* returns a number of a last parsed line
|
||||||
|
* can be used to obtain the line in which there was a syntax error
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int get_last_parsed_line();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
const static int WHITE_MODE_ORIGIN = 0;
|
const static int WHITE_MODE_ORIGIN = 0;
|
||||||
const static int WHITE_MODE_SINGLE_LINE = 1;
|
const static int WHITE_MODE_SINGLE_LINE = 1;
|
||||||
const static int WHITE_MODE_TREE = 2;
|
const static int WHITE_MODE_TREE = 2;
|
||||||
|
@ -255,6 +286,7 @@ protected:
|
||||||
virtual bool IsClosingTagMark(wchar_t c);
|
virtual bool IsClosingTagMark(wchar_t c);
|
||||||
virtual bool IsClosingTagIndicator(wchar_t c);
|
virtual bool IsClosingTagIndicator(wchar_t c);
|
||||||
virtual bool IsSpecialTagIndicator(wchar_t c);
|
virtual bool IsSpecialTagIndicator(wchar_t c);
|
||||||
|
virtual bool IsXMLSpecialTagIndicator(wchar_t c);
|
||||||
virtual bool IsAttributeAssignmentMark(wchar_t c);
|
virtual bool IsAttributeAssignmentMark(wchar_t c);
|
||||||
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
|
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
|
||||||
virtual bool IsStartingEntityMark(wchar_t c);
|
virtual bool IsStartingEntityMark(wchar_t c);
|
||||||
|
@ -335,6 +367,7 @@ protected:
|
||||||
void ReadItemAttrName();
|
void ReadItemAttrName();
|
||||||
void ReadItemAttrValueAdd(const std::wstring & str);
|
void ReadItemAttrValueAdd(const std::wstring & str);
|
||||||
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
|
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
|
||||||
|
void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);
|
||||||
|
|
||||||
bool ReadItemAttr();
|
bool ReadItemAttr();
|
||||||
void CheckItemLangAttr();
|
void CheckItemLangAttr();
|
||||||
|
|
Loading…
Reference in New Issue