HTMLParser: start working on xml mode

added methods:
Status parse_xml_file(const char * file_name,         Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::string & file_name,  Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const wchar_t * file_name,      Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
This commit is contained in:
Tomasz Sowa 2021-08-10 21:56:04 +02:00
parent b1cc64a29b
commit 2576eb12d1
2 changed files with 158 additions and 13 deletions

View File

@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
pchar_ascii = 0;
xml_compact_mode = compact_mode;
status = ok;
line = 1;
stack_len = 0;
out_string = nullptr;
out_space = &space;
@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
}
/*
 * Parses the file file_name in xml mode (parsing_html is set to false)
 * and puts the result into out_space.
 *
 * file_name    - path of the file, opened for reading in binary mode
 * out_space    - the Space object which receives the parsed content
 * compact_mode - stored in xml_compact_mode
 * clear_space  - when true, out_space is cleared before the file is opened
 *
 * Returns the value of status: cant_open_file if the file could not be opened,
 * otherwise whatever status holds after Init()/Read()/Uninit() have been called
 * (it is set to ok before parsing starts).
 */
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	// the out_space parameter hides the member with the same name, hence this->
	this->out_space = &out_space;
	out_string = nullptr;

	// parser mode
	parsing_html = false;
	reading_from_file = true;
	xml_compact_mode = compact_mode;

	// resetting the state left by a previous parsing
	status = ok;
	line = 1;
	line_len = 0;
	stack_len = 0;

	if( clear_space )
		out_space.clear();

	// clearing the stream state flags before the stream is opened again
	file.clear();
	file.open(file_name, std::ios_base::binary | std::ios_base::in);

	if( !file )
	{
		status = cant_open_file;
		return status;
	}

	Init();
	Read();
	Uninit();
	file.close();

	return status;
}
/*
 * An overload taking the file name as std::string,
 * it forwards the call to the const char * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const char * c_file_name = file_name.c_str();

	return parse_xml_file(c_file_name, out_space, compact_mode, clear_space);
}
/*
 * An overload taking the file name as a wide string,
 * the name is passed through wide_to_utf8() and the result
 * is given to the const char * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	std::string utf8_name;
	wide_to_utf8(file_name, utf8_name);

	const char * c_file_name = utf8_name.c_str();

	return parse_xml_file(c_file_name, out_space, compact_mode, clear_space);
}
/*
 * An overload taking the file name as std::wstring,
 * it forwards the call to the const wchar_t * version.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
	const wchar_t * wide_file_name = file_name.c_str();

	return parse_xml_file(wide_file_name, out_space, compact_mode, clear_space);
}
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
{
@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
}
/*
 * Returns the current value of the line counter
 * (parse_xml_file() sets the counter to 1 before it starts reading).
 */
int HTMLParser::get_last_parsed_line()
{
	const int last_line = line;

	return last_line;
}
void HTMLParser::SetSomeDefaults()
{
white_mode = WHITE_MODE_ORIGIN;
@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') ||
(c>='0' && c<='9') ||
c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
return true;
return false;
@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
if( (c>='a' && c<='z') ||
(c>='A' && c<='Z') ||
(c>='0' && c<='9') ||
c=='-' || c==':' )
c=='-' || c==':' || c=='_')
return true;
return false;
@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
}
/*
 * Reads an attribute value in xml mode and stores it in tmp_text
 * (attr_value is only cleared here).
 *
 * has_quote  - true if the value is surrounded by quote marks
 * quote_char - the quote mark which ends the value (used only when has_quote is true)
 *
 * Reading stops when lastc is -1 or:
 *  - for a quoted value: when lastc is equal to quote_char,
 *  - for an unquoted value: on a closing tag mark, on the character 10 or on a white character.
 * The character on which the reading stopped is left in lastc (it is not skipped here).
 *
 * Characters are no longer appended once tmp_text.size() is greater than
 * WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN, but they are still read from the input.
 */
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
{
	attr_value.clear();
	tmp_text.clear();

	for( ; lastc != -1 ; read_char() )
	{
		const bool value_finished = has_quote
			? (lastc == quote_char)
			: (IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc));

		if( value_finished )
			break;

		// IMPROVEME add support for analyze_entities?
		if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
			tmp_text += lastc;
	}
}
void HTMLParser::CheckChar(wchar_t c)
{
if( c == 10 )
@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
return (c == '!');
}
// returns true if c is the question mark '?'
// (ReadItem() calls ReadItemSpecial() for a tag in which such a character follows '<')
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
	const bool is_question_mark = (c == '?');

	return is_question_mark;
}
// the '=' operator e.g. class="value"
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
if( has_quote )
read_char(); // skipping the first quote mark
// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
if( parsing_html )
ReadItemAttrValue(has_quote, quote_char);
else
ReadXMLItemAttrValue(has_quote, quote_char);
if( has_quote && lastc == quote_char )
read_char(); // skipping the last quote mark
@ -1360,6 +1463,8 @@ void HTMLParser::PutItemAttrToSpace()
Space & attr = attr_tab.add_empty_space(attr_name);
if( attr_has_value )
{
if( parsing_html )
{
attr.set_empty_table();
@ -1368,6 +1473,11 @@ void HTMLParser::PutItemAttrToSpace()
attr.add(attr_value[i]);
}
}
else
{
attr.set(tmp_text);
}
}
}
}
@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
PutOpeningTagMark();
}
read_char(); // skipping '!'
LastItem().name = '!';
LastItem().name = lastc;
read_char(); // skipping '!' or '?'
ReadItemName(LastItem().name, false);
if( skip_tags )
@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
read_char(); // skipping the first opening tag mark '<'
SkipWhiteLines();
if( IsSpecialTagIndicator(lastc) )
if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
ReadItemSpecial();
else
if( IsClosingTagIndicator(lastc) )
@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
void HTMLParser::ReadLoop()
{
while( ReadItem() )
while( status == ok && ReadItem() )
{
if( LastItem().type == Item::opening )
{
@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
CheckSingleItemExceptions();
}
CheckWhiteCharsExceptions(LastItem());
CheckDifferentContentExceptions(LastItem());
}
@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
PopStack();
}
if( status == ok )
{
ReadText();
}
is_first_item = false;
}

View File

@ -95,12 +95,25 @@ class HTMLParser : public BaseParser
{
public:
/*
status of parsing

ok             - the value set by parse_xml_file() before the parsing starts
cant_open_file - set by parse_xml_file() when the file cannot be opened
syntax_error   - NOTE(review): presumably set when the input is malformed,
                 the code which sets it is not visible here - confirm
*/
enum Status { ok, cant_open_file, syntax_error };
// what is put in the place of a space when orphans are handled
// NOTE(review): the code using this enum is not visible here - confirm the exact place of insertion
enum OrphanMode
{
orphan_nbsp, // putting "&nbsp;" string
orphan_160space // putting a character with code 160 (a non-breaking space, the code is outside of the ASCII range)
};
/*
the last status of parsing, set by parse() methods
*/
Status status;
HTMLParser();
HTMLParser(const HTMLParser & f);
HTMLParser & operator=(const HTMLParser & f);
@ -109,12 +122,30 @@ public:
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
// main methods used for filtering
void Filter(const wchar_t * in, std::wstring & out);
void Filter(const std::wstring & in, std::wstring & out);
/*
*
* returns the number of the last parsed line
* can be used to obtain the line in which a syntax error occurred
*
*/
int get_last_parsed_line();
const static int WHITE_MODE_ORIGIN = 0;
const static int WHITE_MODE_SINGLE_LINE = 1;
const static int WHITE_MODE_TREE = 2;
@ -255,6 +286,7 @@ protected:
virtual bool IsClosingTagMark(wchar_t c);
virtual bool IsClosingTagIndicator(wchar_t c);
virtual bool IsSpecialTagIndicator(wchar_t c);
virtual bool IsXMLSpecialTagIndicator(wchar_t c);
virtual bool IsAttributeAssignmentMark(wchar_t c);
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
virtual bool IsStartingEntityMark(wchar_t c);
@ -335,6 +367,7 @@ protected:
void ReadItemAttrName();
void ReadItemAttrValueAdd(const std::wstring & str);
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);
bool ReadItemAttr();
void CheckItemLangAttr();