HTMLParser: start working on xml mode
added methods: Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true); Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
This commit is contained in:
parent
b1cc64a29b
commit
2576eb12d1
|
@ -78,6 +78,9 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
|||
pchar_ascii = 0;
|
||||
xml_compact_mode = compact_mode;
|
||||
|
||||
status = ok;
|
||||
line = 1;
|
||||
|
||||
stack_len = 0;
|
||||
out_string = nullptr;
|
||||
out_space = &space;
|
||||
|
@ -91,6 +94,64 @@ void HTMLParser::parse_html(const wchar_t * in, Space & space, bool compact_mode
|
|||
}
|
||||
|
||||
|
||||
/*
 * Parses the XML file named by file_name and stores the result in out_space.
 *
 * file_name    - path of the file, passed directly to file.open()
 * out_space    - destination Space; cleared first when clear_space is true
 * compact_mode - copied into xml_compact_mode
 * clear_space  - whether out_space should be cleared before reading
 *
 * Returns the value of status: cant_open_file when the file could not be
 * opened, otherwise whatever status holds after Read() has finished.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const char * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    // mode flags: xml (not html) input, taken from a file
    parsing_html      = false;
    reading_from_file = true;
    xml_compact_mode  = compact_mode;

    // reset the state of the parser
    status     = ok;
    line       = 1;
    stack_len  = 0;
    out_string = nullptr;
    line_len   = 0;

    // the parameter hides the member of the same name, hence this->
    this->out_space = &out_space;

    if( clear_space )
        out_space.clear();

    // NOTE(review): clear() before open() presumably resets error flags left by an earlier parse
    file.clear();
    file.open(file_name, std::ios_base::binary | std::ios_base::in);

    if( !file )
    {
        status = cant_open_file;
        return status;
    }

    Init();
    Read();
    Uninit();

    file.close();

    return status;
}
|
||||
|
||||
|
||||
/*
 * Overload taking the file name as a std::string.
 * Forwards to the const char * version and returns its status.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    const char * path = file_name.c_str();

    return parse_xml_file(path, out_space, compact_mode, clear_space);
}
|
||||
|
||||
|
||||
/*
 * Overload taking the file name as a wide string.
 * The name is converted with wide_to_utf8() and then handed to the
 * const char * version, whose status is returned.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    std::string narrow_name;
    wide_to_utf8(file_name, narrow_name);

    const char * path = narrow_name.c_str();

    return parse_xml_file(path, out_space, compact_mode, clear_space);
}
|
||||
|
||||
|
||||
/*
 * Overload taking the file name as a std::wstring.
 * Forwards to the const wchar_t * version and returns its status.
 */
HTMLParser::Status HTMLParser::parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode, bool clear_space)
{
    const wchar_t * wide_path = file_name.c_str();

    return parse_xml_file(wide_path, out_space, compact_mode, clear_space);
}
|
||||
|
||||
|
||||
|
||||
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
|
||||
{
|
||||
|
@ -142,6 +203,12 @@ void HTMLParser::Filter(const std::wstring & in, std::wstring & out)
|
|||
}
|
||||
|
||||
|
||||
int HTMLParser::get_last_parsed_line()
|
||||
{
|
||||
return line;
|
||||
}
|
||||
|
||||
|
||||
void HTMLParser::SetSomeDefaults()
|
||||
{
|
||||
white_mode = WHITE_MODE_ORIGIN;
|
||||
|
@ -494,7 +561,7 @@ bool HTMLParser::IsValidCharForName(int c)
|
|||
if( (c>='a' && c<='z') ||
|
||||
(c>='A' && c<='Z') ||
|
||||
(c>='0' && c<='9') ||
|
||||
c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary
|
||||
c=='-' || c=='!' || c==':' || c=='-' || c=='_') // : is for a namespace character, - is for a commentary
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -506,7 +573,7 @@ bool HTMLParser::IsValidCharForAttrName(int c)
|
|||
if( (c>='a' && c<='z') ||
|
||||
(c>='A' && c<='Z') ||
|
||||
(c>='0' && c<='9') ||
|
||||
c=='-' || c==':' )
|
||||
c=='-' || c==':' || c=='_')
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -624,6 +691,34 @@ void HTMLParser::ReadItemAttrValue(bool has_quote, wchar_t quote_char)
|
|||
}
|
||||
|
||||
|
||||
void HTMLParser::ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char)
|
||||
{
|
||||
attr_value.clear();
|
||||
tmp_text.clear();
|
||||
|
||||
while( lastc != -1 )
|
||||
{
|
||||
if( has_quote )
|
||||
{
|
||||
if( lastc == quote_char )
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) )
|
||||
break;
|
||||
}
|
||||
|
||||
// IMPROVEME add support for analyze_entities?
|
||||
if( tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN )
|
||||
tmp_text += lastc;
|
||||
|
||||
read_char();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void HTMLParser::CheckChar(wchar_t c)
|
||||
{
|
||||
if( c == 10 )
|
||||
|
@ -1021,6 +1116,10 @@ bool HTMLParser::IsSpecialTagIndicator(wchar_t c)
|
|||
return (c == '!');
|
||||
}
|
||||
|
||||
/*
 * Returns true when c is the '?' character.
 */
bool HTMLParser::IsXMLSpecialTagIndicator(wchar_t c)
{
    const bool is_question_mark = (c == '?');

    return is_question_mark;
}
|
||||
|
||||
// the '=' operator e.g. class="value"
|
||||
bool HTMLParser::IsAttributeAssignmentMark(wchar_t c)
|
||||
|
@ -1292,7 +1391,11 @@ bool HTMLParser::ReadItemAttr()
|
|||
if( has_quote )
|
||||
read_char(); // skipping the first quote mark
|
||||
|
||||
// IMPROVEME we can treat html in the same way as xml? only for filtering we can make a table...
|
||||
if( parsing_html )
|
||||
ReadItemAttrValue(has_quote, quote_char);
|
||||
else
|
||||
ReadXMLItemAttrValue(has_quote, quote_char);
|
||||
|
||||
if( has_quote && lastc == quote_char )
|
||||
read_char(); // skipping the last quote mark
|
||||
|
@ -1360,6 +1463,8 @@ void HTMLParser::PutItemAttrToSpace()
|
|||
Space & attr = attr_tab.add_empty_space(attr_name);
|
||||
|
||||
if( attr_has_value )
|
||||
{
|
||||
if( parsing_html )
|
||||
{
|
||||
attr.set_empty_table();
|
||||
|
||||
|
@ -1368,6 +1473,11 @@ void HTMLParser::PutItemAttrToSpace()
|
|||
attr.add(attr_value[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
attr.set(tmp_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1399,8 +1509,8 @@ void HTMLParser::ReadItemSpecial()
|
|||
PutOpeningTagMark();
|
||||
}
|
||||
|
||||
read_char(); // skipping '!'
|
||||
LastItem().name = '!';
|
||||
LastItem().name = lastc;
|
||||
read_char(); // skipping '!' or '?'
|
||||
ReadItemName(LastItem().name, false);
|
||||
|
||||
if( skip_tags )
|
||||
|
@ -1491,7 +1601,7 @@ bool HTMLParser::ReadItem()
|
|||
read_char(); // skipping the first opening tag mark '<'
|
||||
SkipWhiteLines();
|
||||
|
||||
if( IsSpecialTagIndicator(lastc) )
|
||||
if( IsSpecialTagIndicator(lastc) || IsXMLSpecialTagIndicator(lastc) )
|
||||
ReadItemSpecial();
|
||||
else
|
||||
if( IsClosingTagIndicator(lastc) )
|
||||
|
@ -1924,7 +2034,7 @@ void HTMLParser::AddSpaceToSpaceTree(const Space & space)
|
|||
|
||||
void HTMLParser::ReadLoop()
|
||||
{
|
||||
while( ReadItem() )
|
||||
while( status == ok && ReadItem() )
|
||||
{
|
||||
if( LastItem().type == Item::opening )
|
||||
{
|
||||
|
@ -1933,7 +2043,6 @@ void HTMLParser::ReadLoop()
|
|||
CheckSingleItemExceptions();
|
||||
}
|
||||
|
||||
|
||||
CheckWhiteCharsExceptions(LastItem());
|
||||
CheckDifferentContentExceptions(LastItem());
|
||||
}
|
||||
|
@ -1960,7 +2069,10 @@ void HTMLParser::ReadLoop()
|
|||
PopStack();
|
||||
}
|
||||
|
||||
if( status == ok )
|
||||
{
|
||||
ReadText();
|
||||
}
|
||||
|
||||
is_first_item = false;
|
||||
}
|
||||
|
|
|
@ -95,12 +95,25 @@ class HTMLParser : public BaseParser
|
|||
{
|
||||
public:
|
||||
|
||||
|
||||
/*
|
||||
status of parsing
|
||||
*/
|
||||
enum Status { ok, cant_open_file, syntax_error };
|
||||
|
||||
|
||||
enum OrphanMode
|
||||
{
|
||||
orphan_nbsp, // putting "&nbsp;" string
|
||||
orphan_160space // putting 160 ascii code
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
the last status of parsing, set by parse() methods
|
||||
*/
|
||||
Status status;
|
||||
|
||||
HTMLParser();
|
||||
HTMLParser(const HTMLParser & f);
|
||||
HTMLParser & operator=(const HTMLParser & f);
|
||||
|
@ -109,12 +122,30 @@ public:
|
|||
|
||||
void parse_html(const wchar_t * in, Space & space, bool compact_mode = false);
|
||||
|
||||
Status parse_xml_file(const char * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||
Status parse_xml_file(const std::string & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||
Status parse_xml_file(const wchar_t * file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||
Status parse_xml_file(const std::wstring & file_name, Space & out_space, bool compact_mode = false, bool clear_space = true);
|
||||
|
||||
|
||||
|
||||
// main methods used for filtering
|
||||
void Filter(const wchar_t * in, std::wstring & out);
|
||||
void Filter(const std::wstring & in, std::wstring & out);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* returns a number of a last parsed line
|
||||
* can be used to obtain the line in which there was a syntax error
|
||||
*
|
||||
*/
|
||||
int get_last_parsed_line();
|
||||
|
||||
|
||||
|
||||
|
||||
const static int WHITE_MODE_ORIGIN = 0;
|
||||
const static int WHITE_MODE_SINGLE_LINE = 1;
|
||||
const static int WHITE_MODE_TREE = 2;
|
||||
|
@ -255,6 +286,7 @@ protected:
|
|||
virtual bool IsClosingTagMark(wchar_t c);
|
||||
virtual bool IsClosingTagIndicator(wchar_t c);
|
||||
virtual bool IsSpecialTagIndicator(wchar_t c);
|
||||
virtual bool IsXMLSpecialTagIndicator(wchar_t c);
|
||||
virtual bool IsAttributeAssignmentMark(wchar_t c);
|
||||
virtual bool IsClosingXmlSimpleTagMark(wchar_t c);
|
||||
virtual bool IsStartingEntityMark(wchar_t c);
|
||||
|
@ -335,6 +367,7 @@ protected:
|
|||
void ReadItemAttrName();
|
||||
void ReadItemAttrValueAdd(const std::wstring & str);
|
||||
void ReadItemAttrValue(bool has_quote, wchar_t quote_char);
|
||||
void ReadXMLItemAttrValue(bool has_quote, wchar_t quote_char);
|
||||
|
||||
bool ReadItemAttr();
|
||||
void CheckItemLangAttr();
|
||||
|
|
Loading…
Reference in New Issue