diff --git a/src/html/htmlparser.cpp b/src/html/htmlparser.cpp
index 1187a67..7b422f5 100644
--- a/src/html/htmlparser.cpp
+++ b/src/html/htmlparser.cpp
@@ -71,6 +71,7 @@ HTMLParser::Item::Item()
void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
{
+ parsing_html = true;
reading_from_file = false;
reading_from_wchar_string = true;
pchar_unicode = in;
@@ -79,7 +80,6 @@ void HTMLParser::Filter(const wchar_t * in, std::wstring & out)
stack_len = 0;
out_string = &out;
//last_new_line = false;
- was_ending_commentary = false;
line_len = 0;
out_string->clear();
@@ -369,17 +369,27 @@ return false;
}
-void HTMLParser::SkipWhite()
+void HTMLParser::SkipWhite(std::wstring * skipped_text) // skips white characters; optional sink for them
{
while( IsWhite(lastc) )
+ {
+ if( skipped_text ) // named so that it does not hide the parser's own out_string
+ (*skipped_text) += lastc;
+
read_char();
+ }
}
-void HTMLParser::SkipWhiteLines()
+void HTMLParser::SkipWhiteLines(std::wstring * skipped_text) // as SkipWhite, but new lines (10) are skipped too
{
while( lastc==10 || IsWhite(lastc) )
+ {
+ if( skipped_text ) // named so that it does not hide the parser's own out_string
+ (*skipped_text) += lastc;
+
read_char();
+ }
}
@@ -408,6 +418,8 @@ void HTMLParser::SkipWhiteWithFirstNewLine()
+
+
void HTMLParser::SkipAndCheckClosingTag(std::wstring * remember_text)
{
bool is_quoted = false;
@@ -770,13 +782,6 @@ void HTMLParser::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line,
{
str += lastc;
read_char();
-
- if( IsEndingCommentaryTagMarkAtEndOfString(str) )
- {
- str.erase(str.size() - 3); // IMPROVEME define a function or what
- was_ending_commentary = true;
- break;
- }
}
if( !str.empty() )
@@ -911,14 +916,19 @@ void HTMLParser::PutClosingTag(const Item & item)
if( skip_tags || !IsTagSafe(item.name) )
return;
- if( !item.is_commentary )
+ if( item.is_commentary )
+ {
+ Put('-');
+ Put('-');
+ PutClosingTagMark();
+ }
+ else
{
PutOpeningTagMark();
Put('/');
+ Put(item.name);
+ PutClosingTagMark();
}
-
- Put(item.name);
- PutClosingTagMark();
}
@@ -991,20 +1001,6 @@ bool HTMLParser::IsClosingXmlSimpleTagMark(wchar_t c)
}
-bool HTMLParser::IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str)
-{
- static wchar_t comm_end[] = L"-->";
- size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1;
-
- if( str.size() >= comm_end_len )
- {
- return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end);
- }
-
- return false;
-}
-
-
bool HTMLParser::IsStartingEntityMark(wchar_t c)
{
return (c == '&');
@@ -1018,6 +1014,113 @@ bool HTMLParser::IsEndingEntityMark(wchar_t c)
+// Copies the rest of a commentary to the output, up to and including its
+// closing mark: two dashes followed by the closing tag mark ("-->").
+// ReadLoop calls this right after a special item flagged as a commentary
+// has been read. The loop also stops when lastc becomes -1.
+// Dashes are counted rather than matched as a fixed "--" prefix, so a mark
+// preceded by extra dashes (e.g. "--->") still ends the commentary. Without
+// that the parser stays inside the commentary and copies all of the
+// following markup to the output as raw text.
+void HTMLParser::ReadTextUntilClosingCommentary()
+{
+	size_t dash_count = 0; // number of '-' characters directly before lastc
+
+	while( lastc != -1 )
+	{
+		// decided before lastc is consumed: at least "--" precedes this mark
+		const bool is_closing_mark = (dash_count >= 2 && IsClosingTagMark(lastc));
+
+		if( lastc == '-' )
+		{
+			dash_count += 1;
+		}
+		else
+		{
+			dash_count = 0;
+		}
+
+		Put(lastc);
+		read_char();
+
+		if( is_closing_mark )
+		{
+			break;
+		}
+	}
+}
+
+
+// Called with lastc on an opening tag mark while raw text is being copied.
+// Consumes the mark and tests whether it starts the closing tag of the last
+// item on the stack. On a match the rest of the tag is skipped, "</name>" is
+// written when put_closing_tag_as_well is set, and true is returned.
+// Otherwise the consumed characters are written back and false is returned.
+bool HTMLParser::IsClosingTagForLastItem(bool put_closing_tag_as_well)
+{
+	tmp_text.clear();
+	tmp_text += lastc;
+	read_char();
+	SkipWhiteLines(&tmp_text);
+
+	if( !IsClosingTagIndicator(lastc) )
+	{
+		Put(tmp_text);
+		return false;
+	}
+
+	tmp_text += lastc;
+	read_char();
+	SkipWhiteLines(&tmp_text);
+	ReadItemName(tmp_name);
+
+	if( !IsNameEqual(tmp_name, LastItem().name) )
+	{
+		Put(tmp_text);
+		Put(tmp_name);
+		return false;
+	}
+
+	SkipAndCheckClosingTag();
+
+	if( put_closing_tag_as_well )
+	{
+		Put('<');
+		Put('/');
+		Put(tmp_name);
+		Put('>');
+	}
+
+	return true;
+}
+
+
+
+
+// Copies raw text to the output until the closing tag of the last item on
+// the stack is found or lastc becomes -1. CheckDifferentContentExceptions
+// uses it for script and textarea; put_closing_tag_as_well is passed on to
+// IsClosingTagForLastItem, which decides whether that tag is written too.
+void HTMLParser::ReadTextUntilClosingTag(bool put_closing_tag_as_well)
+{
+	while( lastc != -1 )
+	{
+		if( !IsOpeningTagMark(lastc) )
+		{
+			Put(lastc);
+			read_char();
+			continue;
+		}
+
+		// the callee consumes the tag mark itself, so nothing is read here
+		if( IsClosingTagForLastItem(put_closing_tag_as_well) )
+			break;
+	}
+}
+
+
+
+
// reading text between html tags
void HTMLParser::ReadText()
{
@@ -1026,8 +1129,6 @@ void HTMLParser::ReadText()
bool was_non_white_text = false;
- was_ending_commentary = false;
-
bool allow_put_new_line = false;
bool allow_put_space = false;
@@ -1061,9 +1162,6 @@ void HTMLParser::ReadText()
}
else
{
- if( was_ending_commentary )
- break;
-
PutNormalWhite(was_white_char, was_new_line);
if( (was_white_char || was_new_line) && current_white_char_mode() == WHITE_MODE_TREE )
@@ -1304,26 +1402,16 @@ bool HTMLParser::ReadItem()
if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle )
LastItem().tree_index += 1;
- if( was_ending_commentary )
- {
- LastItem().type = Item::closing;
- LastItem().is_commentary = true;
- LastItem().name = L"--";
- was_ending_commentary = false;
- }
- else
- {
- read_char(); // skipping the first opening tag mark '<'
- SkipWhiteLines();
+ read_char(); // skipping the first opening tag mark '<'
+ SkipWhiteLines();
- if( IsSpecialTagIndicator(lastc) )
- ReadItemSpecial();
- else
- if( IsClosingTagIndicator(lastc) )
- ReadItemClosing();
- else
- ReadItemOpening();
- }
+ if( IsSpecialTagIndicator(lastc) )
+ ReadItemSpecial();
+ else
+ if( IsClosingTagIndicator(lastc) )
+ ReadItemClosing();
+ else
+ ReadItemOpening();
// IMPROVE ME later CheckSingleItemExceptions() can change opening to single type
ItemFound();
@@ -1462,16 +1550,22 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
bool change_white_mode = false;
// in safe_mode the script tag is ignored
- if( !safe_mode && IsNameEqual(item.name, L"script") )
- {
- change_white_mode = true;
- }
-
- if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
+// if( !safe_mode && IsNameEqual(item.name, L"script") )
+// {
+// change_white_mode = true;
+// }
+
+// if( IsNameEqual(item.name, L"pre") || IsNameEqual(item.name, L"textarea") )
+// {
+// change_white_mode = true;
+// }
+
+ if( IsNameEqual(item.name, L"pre") )
{
change_white_mode = true;
}
+ // move to CheckDifferentContentExceptions?
if( IsNameEqual(item.name, no_filter_tag) )
{
change_white_mode = true;
@@ -1493,6 +1587,25 @@ void HTMLParser::CheckWhiteCharsExceptions(Item & item)
+// Tags whose content is raw text: script (not in safe_mode) and textarea.
+// The content is read up to the matching closing tag and the item is popped.
+void HTMLParser::CheckDifferentContentExceptions(Item & item)
+{
+	// both names are tested first, so item is never read after PopStack()
+	const bool is_script   = !safe_mode && IsNameEqual(item.name, L"script");
+	const bool is_textarea = IsNameEqual(item.name, L"textarea");
+
+	if( is_script || is_textarea )
+	{
+		ReadTextUntilClosingTag(true);
+		PopStack();
+	}
+}
+
+
+
+
+
void HTMLParser::AddForgottenTags()
{
@@ -1641,14 +1754,21 @@ void HTMLParser::ReadLoop()
{
if( LastItem().type == Item::opening )
{
- CheckSingleItemExceptions();
+ if( parsing_html )
+ {
+ CheckSingleItemExceptions();
+ }
+
CheckWhiteCharsExceptions(LastItem());
+ CheckDifferentContentExceptions(LastItem());
}
else
if( LastItem().type == Item::special )
{
- if( !LastItem().is_commentary )
- PopStack();
+ if( LastItem().is_commentary )
+ ReadTextUntilClosingCommentary();
+
+ PopStack();
}
else
if( LastItem().type == Item::simple )
@@ -1666,6 +1786,7 @@ void HTMLParser::ReadLoop()
}
ReadText();
+
is_first_item = false;
}
}
diff --git a/src/html/htmlparser.h b/src/html/htmlparser.h
index 7e42eea..8bf6969 100644
--- a/src/html/htmlparser.h
+++ b/src/html/htmlparser.h
@@ -172,6 +172,12 @@ public:
protected:
+ /*
+ * true when parsing HTML input, false when parsing XML
+ */
+ bool parsing_html;
+
+
// orphans for one language
struct Orphans
{
@@ -246,14 +252,13 @@ protected:
virtual bool IsStartingEntityMark(wchar_t c);
virtual bool IsEndingEntityMark(wchar_t c);
- virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str);
-
virtual bool IsValidCharForName(int c);
virtual bool IsValidCharForAttrName(int c);
virtual bool IsValidCharForEntityName(int c);
virtual void CheckSingleItemExceptions();
virtual void CheckWhiteCharsExceptions(Item & item);
+ virtual void CheckDifferentContentExceptions(Item & item);
virtual void Put(wchar_t c);
virtual void Put(const wchar_t * str, const wchar_t * end);
@@ -299,12 +304,15 @@ protected:
bool CheckOrphan(const wchar_t * str, const wchar_t * end);
bool IsWhite(int c);
- void SkipWhite();
- void SkipWhiteLines();
+ void SkipWhite(std::wstring * out_string = nullptr);
+ void SkipWhiteLines(std::wstring * out_string = nullptr);
void SkipWhiteWithFirstNewLine();
int current_white_char_mode();
+ void ReadTextUntilClosingCommentary();
+ bool IsClosingTagForLastItem(bool put_closing_tag_as_well);
+ void ReadTextUntilClosingTag(bool put_closing_tag_as_well);
void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr);
void PopStack();
@@ -354,7 +362,6 @@ protected:
bool is_first_item;
size_t wrap_line; // insert a new line character into long lines
size_t tab_size;
- bool was_ending_commentary;
OrphanMode orphan_mode;
std::wstring attr_name;
std::vector attr_value;