diff --git a/src/html/bbcodeparser.cpp b/src/html/bbcodeparser.cpp index 0a60273..ec39de6 100644 --- a/src/html/bbcodeparser.cpp +++ b/src/html/bbcodeparser.cpp @@ -121,7 +121,7 @@ void BBCODEParser::PutNormalText(const wchar_t * str, const wchar_t * end) { int br_len; - if( *pchar == 0 ) + if( lastc != -1 ) { // trimming last white characters at end of the user text while( strhtml_tag); - const wchar_t * start = pchar; - - while( *pchar && *pchar != ']' ) - ++pchar; - - PutHtmlArgument(tag, start, pchar); - - if( *pchar == ']' ) - ++pchar; +// FIXME +// const wchar_t * start = pchar; +// +// while( *pchar && *pchar != ']' ) +// ++pchar; +// +// PutHtmlArgument(tag, start, pchar); +// +// if( *pchar == ']' ) +// ++pchar; if( !tag->inline_tag ) { diff --git a/src/html/htmlfilter.cpp b/src/html/htmlfilter.cpp index d103b9e..5274950 100644 --- a/src/html/htmlfilter.cpp +++ b/src/html/htmlfilter.cpp @@ -48,10 +48,13 @@ namespace pt void HTMLFilter::Item::Clear() { name.clear(); - type = none; - porphans = 0; - new_line = false; - has_body_tag = false; + type = none; + is_commentary = false; + porphans = nullptr; + new_line = false; + new_line_in_the_middle = false; + has_body_tag = false; + tree_index = 0; } @@ -64,10 +67,15 @@ HTMLFilter::Item::Item() void HTMLFilter::Filter(const wchar_t * in, std::wstring & out) { - pchar = in; + reading_from_file = false; + reading_from_wchar_string = true; + pchar_unicode = in; + pchar_ascii = 0; + stack_len = 0; out_string = &out; last_new_line = false; + was_ending_commentary = false; line_len = 0; out_string->clear(); @@ -108,9 +116,9 @@ void HTMLFilter::Filter(const std::wstring & in, std::wstring & out) void HTMLFilter::SetSomeDefaults() { + white_mode = WHITE_MODE_ORIGIN; + tab_size = 2; - trim_white = false; - break_after = 0; wrap_line = 0; orphan_mode = orphan_nbsp; safe_mode = false; @@ -160,16 +168,15 @@ HTMLFilter::~HTMLFilter() - -void HTMLFilter::BreakWord(size_t break_after_) +void HTMLFilter::white_chars_mode(int mode) { - break_after = break_after_; - - if( break_after > 10000 ) - break_after = 10000; + if( mode >= WHITE_MODE_ORIGIN && mode <= WHITE_MODE_TREE ) + white_mode = mode; } + + void HTMLFilter::WrapLine(size_t wrap_line_) { wrap_line = wrap_line_; @@ -180,12 +187,6 @@ void HTMLFilter::WrapLine(size_t wrap_line_) -void HTMLFilter::TrimWhite(bool trim) -{ - trim_white = trim; -} - - void HTMLFilter::InsertTabs(size_t tabsize) { tab_size = tabsize; @@ -322,9 +323,10 @@ bool HTMLFilter::PushStack() if( stack_len > 0 ) { - // 'porphans' and 'has_body_tag' attributes are propagated + // 'porphans', 'has_body_tag' and 'tree_index' attributes are propagated pstack[stack_len].porphans = pstack[stack_len-1].porphans; pstack[stack_len].has_body_tag = pstack[stack_len-1].has_body_tag; + pstack[stack_len].tree_index = pstack[stack_len-1].tree_index; } stack_len += 1; @@ -356,15 +358,15 @@ return false; void HTMLFilter::SkipWhite() { - while( IsWhite(*pchar) ) - ++pchar; + while( IsWhite(lastc) ) + read_char(); } void HTMLFilter::SkipWhiteLines() { - while( *pchar==10 || IsWhite(*pchar) ) - ++pchar; + while( lastc==10 || IsWhite(lastc) ) + read_char(); } @@ -372,29 +374,22 @@ void HTMLFilter::SkipWhiteWithFirstNewLine() { SkipWhite(); - if( *pchar == 10 ) + if( lastc == 10 ) { - pchar += 1; + read_char(); SkipWhite(); } } -void HTMLFilter::SkipWhiteLines(const wchar_t * & str, const wchar_t * end) -{ - while( str < end && (*str==10 || IsWhite(*str)) ) - ++str; -} - - void HTMLFilter::CheckNewLine() { -const wchar_t * start = pchar; + if( white_mode == WHITE_MODE_TREE ) + { + SkipWhite(); + } - SkipWhite(); - last_new_line = (*pchar==10); - - pchar = start; + last_new_line = (lastc==10); } @@ -402,22 +397,23 @@ const wchar_t * start = pchar; bool HTMLFilter::IsClosingTagForLastItem() { - pchar += 1; + read_char(); SkipWhite(); - if( *pchar == '/' ) + if( lastc == '/' ) { - pchar += 1; + read_char(); SkipWhite(); - if( IsNameEqual(pchar, LastItem().name, LastItem().name.size()) ) + ReadItemName(tmp_name); + + if( IsNameEqual(tmp_name, LastItem().name) ) { - pchar += LastItem().name.size(); SkipWhite(); - if( IsClosingTagMark(*pchar) ) + if( IsClosingTagMark(lastc) ) { - pchar += 1; + read_char(); return true; } } @@ -432,17 +428,16 @@ return false; // used for such tags as: script, pre, textarea void HTMLFilter::PutEverythingUntilClosingTag(bool put_closing_tag_as_well) { -const wchar_t * start = pchar; -const wchar_t * end = pchar; + bool was_closing_tag = false; + tmp_text.clear(); - while( *pchar != 0 ) + while( lastc != -1 ) { - if( IsOpeningTagMark(*pchar) ) + if( IsOpeningTagMark(lastc) ) { if( IsClosingTagForLastItem() ) { - if( put_closing_tag_as_well ) - end = pchar; + was_closing_tag = true; PopStack(); CheckNewLine(); @@ -451,29 +446,37 @@ const wchar_t * end = pchar; } else { - pchar += 1; - end = pchar; + tmp_text += lastc; + read_char(); } } - Put(start, end); + Put(tmp_text); + + if( was_closing_tag && put_closing_tag_as_well ) + { + Put('<'); + Put('/'); + Put(tmp_name); + Put('>'); + } } -void HTMLFilter::SkipAndCheckClosingTag() +void HTMLFilter::SkipAndCheckClosingTag(std::wstring * remember_text) { bool is_quoted = false; wchar_t quote_char = 0; - for( ; *pchar ; ++pchar ) + while( lastc != -1 ) { - if( *pchar == '"' || *pchar == '\'' ) + if( lastc == '"' || lastc == '\'' ) { if( is_quoted ) { - if( *pchar == quote_char ) + if( lastc == quote_char ) { is_quoted = false; } @@ -481,20 +484,25 @@ void HTMLFilter::SkipAndCheckClosingTag() else { is_quoted = true; - quote_char = *pchar; + quote_char = lastc; } } else - if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(*pchar) ) // closing xml tag: default '/' + if( !is_quoted && LastItem().type == Item::opening && IsClosingXmlSimpleTagMark(lastc) ) // closing xml tag: default '/' { LastItem().type = Item::simple; } else - if( !is_quoted && IsClosingTagMark(*pchar) ) + if( !is_quoted && IsClosingTagMark(lastc) ) { - ++pchar; + read_char(); break; } + + if( remember_text ) + (*remember_text) += lastc; + + read_char(); } } @@ -505,7 +513,7 @@ bool HTMLFilter::IsValidCharForName(int c) if( (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || - c=='-' || c=='!' || c==':') // : for namespace character + c=='-' || c=='!' || c==':' || c=='-') // : is for a namespace character, - is for a commentary return true; return false; @@ -536,16 +544,28 @@ return false; } -void HTMLFilter::ReadItemName() +void HTMLFilter::ReadItemName(std::wstring & name, bool clear_name) { size_t i; - for( i=0 ; IsValidCharForName(*pchar) ; ++i ) + if( clear_name ) + name.clear(); + + for(i=0 ; IsValidCharForName(lastc) ; ++i) { if( i < WINIX_HTMLFILTER_ITEM_NAME_MAXLEN ) - LastItem().name += *pchar; + { + name += lastc; - ++pchar; + if( LastItem().type == Item::special && name == L"!--" ) + { + LastItem().is_commentary = true; + read_char(); + break; + } + } + + read_char(); } } @@ -557,71 +577,69 @@ size_t i; attr_name.clear(); - for( i=0 ; *pchar && IsValidCharForAttrName(*pchar) ; ++i ) + for( i=0 ; lastc != -1 && IsValidCharForAttrName(lastc) ; ++i ) { if( i < WINIX_HTMLFILTER_ATTR_NAME_MAXLEN ) - attr_name += *pchar; + attr_name += lastc; - ++pchar; + read_char(); } } -void HTMLFilter::ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end) +void HTMLFilter::ReadItemAttrValueAdd(const std::wstring & str) { - attr_value.push_back(std::wstring()); - if( analyze_entities ) { - AnalyzeEntitiesAndPut(value_start, value_end, &attr_value.back()); + attr_value.push_back(std::wstring()); + AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), &attr_value.back()); } else { - attr_value.back().append(value_start, value_end); + attr_value.push_back(str); } } void HTMLFilter::ReadItemAttrValue(bool has_quote, wchar_t quote_char) { -size_t i; - attr_value.clear(); - const wchar_t * value_start = pchar; - size_t value_len = 0; // how many non white characters + tmp_text.clear(); - for(i=0 ; *pchar ; ++i, ++pchar ) + while( lastc != -1 ) { if( has_quote ) { - if( *pchar == quote_char ) + if( lastc == quote_char ) break; } else { - if( IsClosingTagMark(*pchar) || *pchar == 10 || IsWhite(*pchar) ) + if( IsClosingTagMark(lastc) || lastc == 10 || IsWhite(lastc) ) break; } - if( *pchar==10 || IsWhite(*pchar) ) + if( lastc==10 || IsWhite(lastc) ) { - if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) - ReadItemAttrValueAdd(value_start, pchar); + if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(tmp_text); - value_len = 0; + tmp_text.clear(); } else { - if( value_len == 0 ) - value_start = pchar; + if( tmp_text.size() > WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + tmp_text.clear(); - value_len += 1; + tmp_text += lastc; } + + read_char(); } - if( value_len > 0 && value_len <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) - ReadItemAttrValueAdd(value_start, pchar); + if( tmp_text.size() > 0 && tmp_text.size() <= WINIX_HTMLFILTER_ATTR_VALUE_MAXLEN ) + ReadItemAttrValueAdd(tmp_text); } @@ -641,15 +659,6 @@ void HTMLFilter::Put(wchar_t c) } -void HTMLFilter::Put(const wchar_t * str) -{ - out_string->append(str); - - for( ; *str ; ++str) - CheckChar(*str); -} - - void HTMLFilter::Put(const wchar_t * str, const wchar_t * end) { if( str >= end ) @@ -663,12 +672,16 @@ void HTMLFilter::Put(const wchar_t * str, const wchar_t * end) } + void HTMLFilter::Put(const std::wstring & str) { - out_string->append(str); + if( !str.empty() ) + { + out_string->append(str); - for(size_t i=0 ; itab); } -// if there is a semicolon nearby then we break the line after it -// (useful in html entities) -// !! dodac sprawdzanie czy dlugosc stringu nie jest mala tez (end-str) -// i wtedy tez nie dodajemy zadnego znaku -bool HTMLFilter::HasEntityEndAround(const wchar_t * str, const wchar_t * end) +void HTMLFilter::PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space) { -size_t i, epsilon = 8;// !! IMPROVE ME put as a constant - - for(i=0 ; str < end && i wrap_line ) + while( lastc != -1 && lastc != 10 && !IsWhite(lastc) && !IsOpeningTagMark(lastc) ) { - Put(10); - PutTabs(stack_len); - } -} + str += lastc; + read_char(); - -void HTMLFilter::PutNormalNonWhite(const wchar_t * & str, const wchar_t * end) -{ -const wchar_t * word = str; -size_t non_whites = 0; -bool was_entity_end = false; - - for( ; str < end && *str!=10 && !IsWhite(*str) ; ++str, ++non_whites ) - { - if( break_after != 0 && non_whites >= break_after && (was_entity_end || !HasEntityEndAround(str, end)) ) + if( IsEndingCommentaryTagMarkAtEndOfString(str) ) { - Put(word, str); - word = str; - non_whites = 0; - Put(' '); - CheckLineWrap(); + str.erase(str.size() - 3); // IMPROVEME define a function or what + was_ending_commentary = true; + break; } + } - was_entity_end = (IsEndingEntityMark(*str)); + if( !str.empty() ) + { + if( allow_put_new_line ) + { + Put(10); + PutTabs(LastItem().tree_index + 1); + } + else + if( allow_put_space ) + { + Put(' '); + } } if( analyze_entities ) - AnalyzeEntitiesAndPut(word, str, nullptr); + AnalyzeEntitiesAndPut(str.c_str(), str.c_str() + str.size(), nullptr); else - Put(word, str); + Put(str); } -void HTMLFilter::PutNormalWhite(const wchar_t * & str, const wchar_t * end) +bool HTMLFilter::PutNormalWhite() { - if( str < end ) + bool was_white_char = false; + bool was_new_line = false; + + while( lastc == 10 || IsWhite(lastc) ) { - if( trim_white ) - { - Put(' '); - SkipWhiteLines(str, end); - } - else - { - while( str < end && (*str==10 || IsWhite(*str)) ) - { - Put(*str); + was_white_char = true; // anyone white char even new line - if( *str == 10 ) - PutTabs(stack_len); + if( lastc == 10 ) + was_new_line = true; - ++str; - } + if( white_mode == WHITE_MODE_ORIGIN ) + { + Put(lastc); } + + read_char(); } -} - -void HTMLFilter::PutNormalText(const wchar_t * str, const wchar_t * end) -{ -const wchar_t * word, * white; - - if( str < end ) - CheckLineWrap(); - - while( str < end ) + if( white_mode == WHITE_MODE_SINGLE_LINE && was_white_char ) { - word = str; - PutNormalNonWhite(str, end); - - if( CheckOrphan(word, str) ) - { - white = str; - SkipWhiteLines(str, end); - - if( white < str ) - PutNonBreakingSpace(); - } - else - { - PutNormalWhite(str, end); - - if( str < end ) // !! lub moze podobnie jak jest na gorze tutaj? juz nie mam sily myslec :( - CheckLineWrap(); - } - - // for safety (if str was not incremented then there is an infinite loop) - if( word == str ) - break; + Put(' '); } -} + if( white_mode == WHITE_MODE_TREE && was_new_line ) + { + // in WHITE_MODE_TREE white characters are written at the beginning of a or text + } + + last_new_line = was_new_line; + return was_white_char; +} @@ -985,6 +955,12 @@ bool HTMLFilter::PutOpeningTag() return false; } + if( white_mode == WHITE_MODE_TREE && last_new_line ) + { + Put(10); + PutTabs(LastItem().tree_index); + } + PutOpeningTagMark(); Put(LastItem().name); @@ -993,14 +969,18 @@ return true; -void HTMLFilter::PutClosingTag(const wchar_t * tag) +void HTMLFilter::PutClosingTag(const Item & item) { - if( skip_tags || !IsTagSafe(tag) ) + if( skip_tags || !IsTagSafe(item.name) ) return; - PutOpeningTagMark(); - Put('/'); - Put(tag); + if( !item.is_commentary ) + { + PutOpeningTagMark(); + Put('/'); + } + + Put(item.name); PutClosingTagMark(); } @@ -1011,7 +991,7 @@ void HTMLFilter::PutTabs(size_t len) if( len > 30 ) len = 30; - for(size_t i=0 ; i < (len*tab_size) ; ++i) + for(int i=0 ; i < (len*tab_size) ; ++i) (*out_string) += ' '; // we do not add them to 'line_len' } @@ -1031,12 +1011,12 @@ void HTMLFilter::PutNonBreakingSpace() -void HTMLFilter::PutNewLine() -{ - buffer[0] = 10; - Put(buffer, buffer+1); - line_len = 0; -} +//void HTMLFilter::PutNewLine() +//{ +// buffer[0] = 10; // CHECKME for what purpose is this buffer? +// Put(10); +// line_len = 0; +//} // we assume the size of the opening mark to be one @@ -1053,6 +1033,28 @@ bool HTMLFilter::IsClosingTagMark(wchar_t c) } +// the slash in the closing tag mark e.g.

+bool HTMLFilter::IsClosingTagIndicator(wchar_t c) +{ + return (c == '/'); +} + + +// the slash in the closing tag mark e.g.

+bool HTMLFilter::IsSpecialTagIndicator(wchar_t c) +{ + return (c == '!'); +} + + +// the '=' operator e.g. class="value" +bool HTMLFilter::IsAttributeAssignmentMark(wchar_t c) +{ + return (c == '='); +} + + + // the slash at the end (without '>' character) // we assume the size of the mark to be one bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c) @@ -1061,18 +1063,33 @@ bool HTMLFilter::IsClosingXmlSimpleTagMark(wchar_t c) } -bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str) +//bool HTMLFilter::IsOpeningCommentaryTagMark(const wchar_t * str) +//{ +//static wchar_t comm_open[] = L""; + size_t comm_end_len = sizeof(comm_end) / sizeof(wchar_t) - 1; - return IsNameEqual(pchar, comm_open, comm_open_len); -} + if( str.size() >= comm_end_len ) + { + return IsNameEqual(str.c_str() + str.size() - comm_end_len, comm_end); + } - -size_t HTMLFilter::OpeningCommentaryTagMarkSize() -{ - return 4; // size of ""; +wchar_t comm_close[] = L"-->"; size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1; - +/* if( !IsOpeningCommentaryTagMark(pchar) ) return false; @@ -1108,86 +1125,81 @@ size_t comm_close_len = sizeof(comm_close) / sizeof(wchar_t) - 1; pchar += comm_close_len; CheckNewLine(); +*/ + return true; } -void HTMLFilter::ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white) -{ - if( trim_white ) - { - // skipping all white chars (with new lines) - // but with remembering the last non white character - for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar) - if( *pchar == 10 ) - last_non_white = pchar; - } - else - { - // skipping first white chars with only one line between them - SkipWhite(); - last_non_white = pchar; - - if( *pchar == 10 ) - { - ++pchar; - SkipWhite(); - } - } - - start = pchar; - - // exception for the commentary tag - if( IsOpeningCommentaryTagMark(pchar) || !IsOpeningTagMark(*pchar) ) - { - PutNewLine(); - PutTabs(stack_len); - } -} - - - // reading text between html tags void HTMLFilter::ReadNormalText() { -const wchar_t * start = pchar; -const wchar_t * last_non_white = pchar; + bool was_non_white_text = false; - if( last_new_line ) - ReadNormalTextSkipWhite(start, last_non_white); + was_ending_commentary = false; + bool allow_put_new_line = false; + bool allow_put_space = false; - while( *pchar != 0 ) + if( white_mode == WHITE_MODE_TREE ) { - const wchar_t * commentary_start = pchar; - - if( SkipCommentaryTagIfExists() ) + if( LastItem().new_line || (wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line) ) { - last_non_white = pchar - 1; // pointing at the last '>' from a commentary - PutNormalText(start, commentary_start); - - if( !skip_commentaries ) - { - PutNormalText(commentary_start, pchar); - } - - start = pchar; - } - else - { - if( IsOpeningTagMark(*pchar) ) - break; - - if( !IsWhite(*pchar) ) - last_non_white = pchar; - - pchar += 1; + allow_put_new_line = true; } } - last_new_line = (*last_non_white == 10); - PutNormalText(start, pchar); + while( lastc != -1 && !IsOpeningTagMark(lastc) ) + { + tmp_text.clear(); + PutNormalNonWhite(tmp_text, allow_put_new_line, allow_put_space); + + if( !tmp_text.empty() ) + { + allow_put_new_line = false; + allow_put_space = false; + was_non_white_text = true; + } + + if( CheckOrphan(tmp_text.c_str(), tmp_text.c_str() + tmp_text.size()) ) + { + if( lastc == 10 || IsWhite(lastc) ) + { + SkipWhiteLines(); + PutNonBreakingSpace(); + } + } + else + { + if( was_ending_commentary ) + break; + + if( PutNormalWhite() && white_mode == WHITE_MODE_TREE ) + { + if( last_new_line ) + { + allow_put_new_line = true; + allow_put_space = false; + + LastItem().new_line_in_the_middle = true; + + if( !was_non_white_text ) + LastItem().new_line = true; + } + else + { + allow_put_new_line = false; + allow_put_space = true; + } + + if( wrap_line != 0 && LastItem().has_body_tag && line_len >= wrap_line ) + { + allow_put_new_line = true; + } + } + } + } } @@ -1197,15 +1209,7 @@ bool HTMLFilter::PrintOpeningItem() if( skip_tags || IsNameEqual(no_filter_tag, LastItem().name) ) return true; - if( last_new_line ) - { - PutNewLine(); - - if( stack_len > 1 ) - PutTabs(stack_len-1); - } - -return PutOpeningTag(); + return PutOpeningTag(); } @@ -1226,34 +1230,34 @@ bool HTMLFilter::ReadItemAttr() SkipWhiteLines(); - if( *pchar != '=' ) + if( !IsAttributeAssignmentMark(lastc) ) // '=' return true; attr_has_value = true; - pchar += 1; // skipping '=' + read_char(); // skipping '=' SkipWhiteLines(); - bool has_quote = (*pchar == '\"' || *pchar == '\''); - wchar_t quote_char = *pchar; + bool has_quote = (lastc == '\"' || lastc == '\''); + wchar_t quote_char = lastc; if( has_quote ) - pchar += 1; // skipping the first quote mark + read_char(); // skipping the first quote mark ReadItemAttrValue(has_quote, quote_char); - if( has_quote && *pchar == quote_char ) - pchar += 1; // skipping the last quote mark + if( has_quote && lastc == quote_char ) + read_char(); // skipping the last quote mark return true; } -bool HTMLFilter::CheckItemAttr() +void HTMLFilter::CheckItemLangAttr() { if( attr_has_value && IsNameEqual(L"lang", attr_name) ) { - LastItem().porphans = 0; + LastItem().porphans = nullptr; if( !attr_value.empty() ) { @@ -1267,8 +1271,6 @@ bool HTMLFilter::CheckItemAttr() LastItem().porphans = &i->second; } } - -return true; } @@ -1301,9 +1303,9 @@ size_t i; void HTMLFilter::ReadItemClosing() { - pchar += 1; // skipping '/' + read_char(); // skipping '/' SkipWhiteLines(); - ReadItemName(); + ReadItemName(LastItem().name); LastItem().type = Item::closing; SkipAndCheckClosingTag(); @@ -1316,32 +1318,55 @@ void HTMLFilter::ReadItemSpecial() LastItem().type = Item::special; if( !skip_tags ) + { + if( white_mode == WHITE_MODE_TREE && last_new_line ) + { + Put(10); + PutTabs(LastItem().tree_index); + } + PutOpeningTagMark(); + } - const wchar_t * start = pchar; - pchar += 1; // skipping '!' + read_char(); // skipping '!' + LastItem().name = '!'; + ReadItemName(LastItem().name, false); - ReadItemName(); - SkipAndCheckClosingTag(); - - if( !skip_tags && pchar > start ) - Put(start, pchar); - - // closing tag mark is printed directly from the source + if( skip_tags ) + { + SkipAndCheckClosingTag(); + } + else + { + if( LastItem().is_commentary ) + { + Put(LastItem().name); + } + else + { + tmp_text.clear(); + SkipWhiteLines(); + SkipAndCheckClosingTag(&tmp_text); + Put(LastItem().name); + Put(' '); + Put(tmp_text); + Put('>'); + } + } } void HTMLFilter::ReadItemOpening() { LastItem().type = Item::opening; - ReadItemName(); + ReadItemName(LastItem().name); if( PrintOpeningItem() ) { while( ReadItemAttr() ) { - if( CheckItemAttr() ) - PrintItemAttr(); + CheckItemLangAttr(); + PrintItemAttr(); } SkipAndCheckClosingTag(); // here LastItem().type can be changed to 'simple' @@ -1368,25 +1393,35 @@ void HTMLFilter::EntityFound(const wchar_t * str, const wchar_t * end) bool HTMLFilter::ReadItem() { - if( *pchar == 0 ) + if( lastc == -1 ) return false; if( !PushStack() ) return false; - pchar += 1; // skipping the first '<' - SkipWhiteLines(); + if( stack_len > 1 && pstack[stack_len-2].new_line_in_the_middle ) + LastItem().tree_index += 1; - if( *pchar == '!' ) - ReadItemSpecial(); + if( was_ending_commentary ) + { + LastItem().type = Item::closing; + LastItem().is_commentary = true; + LastItem().name = L"--"; + was_ending_commentary = false; + } else - if( *pchar == '/' ) // we have a closing tag (dodac jako metode wirtualna) !! - ReadItemClosing(); - else - ReadItemOpening(); + { + read_char(); // skipping the first opening tag mark '<' + SkipWhiteLines(); - CheckNewLine(); - LastItem().new_line = last_new_line; + if( IsSpecialTagIndicator(lastc) ) + ReadItemSpecial(); + else + if( IsClosingTagIndicator(lastc) ) + ReadItemClosing(); + else + ReadItemOpening(); + } ItemFound(); @@ -1556,11 +1591,14 @@ int i; { if( !skip_tags && pstack[z].new_line ) { - PutNewLine(); - PutTabs(z); + if( white_mode == WHITE_MODE_TREE ) + { + Put(10); + PutTabs(pstack[z].tree_index); + } } - PutClosingTag(pstack[z].name.c_str()); + PutClosingTag(pstack[z]); pstack[z].Clear(); } @@ -1576,10 +1614,19 @@ void HTMLFilter::CheckStackPrintRest() while( stack_len-- > 0 ) { if( stack_len==0 || pstack[stack_len-1].new_line ) - PutNewLine(); + { + if( white_mode == WHITE_MODE_TREE ) + { + Put(10); + PutTabs(pstack[stack_len-1].tree_index); + } + else + { + Put(' '); + } + } - PutTabs(stack_len); - PutClosingTag(pstack[stack_len].name.c_str()); + PutClosingTag(pstack[stack_len]); } } @@ -1601,16 +1648,19 @@ void HTMLFilter::CheckClosingTags() } // there are more than one tag - if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) + if( (pstack[stack_len-1].is_commentary && pstack[stack_len-2].is_commentary) || IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) { // last closing tag is from the previous one if( !skip_tags && pstack[stack_len-2].new_line ) { - PutNewLine(); - PutTabs(stack_len-2); + if( white_mode == WHITE_MODE_TREE ) + { + Put(10); + PutTabs(pstack[stack_len-2].tree_index); + } } - PutClosingTag(pstack[stack_len-1].name.c_str()); + PutClosingTag(pstack[stack_len-1]); last_new_line = pstack[stack_len-1].new_line; PopStack(); PopStack(); @@ -1624,22 +1674,30 @@ void HTMLFilter::CheckClosingTags() bool HTMLFilter::PrintRest() { -const wchar_t * start = pchar; +//const wchar_t * start = pchar; // in safe mode we do not print the rest html code if( safe_mode || skip_tags ) return false; - while( *pchar ) - ++pchar; + bool was_chars = false; - if( pchar > start ) + while( lastc != -1 ) { - Put(start, pchar); - return true; + Put(lastc); + read_char(); + was_chars = true; } -return false; + return was_chars; + +// if( pchar > start ) +// { +// Put(start, pchar); +// return true; +// } + +//return false; } @@ -1660,15 +1718,18 @@ void HTMLFilter::ReadLoop() //pstack[stack_len-2].new_line = LastItem().new_line; } else - if( trim_white ) + if( white_mode == WHITE_MODE_TREE ) { // one new line after a simple or special tag // (if the tag has level 0 in the tree - it not means that this is a first tag) - // for example can be DOCTYPE - PutNewLine(); + // for example can be DOCTYPE + + if( !LastItem().is_commentary ) + Put(10); } - PopStack(); + if( !LastItem().is_commentary ) + PopStack(); } else if( LastItem().type == Item::closing ) @@ -1688,7 +1749,9 @@ void HTMLFilter::ReadLoop() void HTMLFilter::Read() { - if( trim_white ) + read_char(); // put first character to lastc + + if( white_mode != WHITE_MODE_ORIGIN ) SkipWhiteLines(); // it can be some text or white lines before the first html tag (we print it) diff --git a/src/html/htmlfilter.h b/src/html/htmlfilter.h index 35710d3..6407e0e 100644 --- a/src/html/htmlfilter.h +++ b/src/html/htmlfilter.h @@ -42,7 +42,7 @@ #include #include #include - +#include "convert/baseparser.h" namespace pt @@ -90,7 +90,7 @@ namespace pt the filter recognizes xml simple tags (with / at the end) such as:
*/ -class HTMLFilter +class HTMLFilter : public BaseParser { public: @@ -111,27 +111,22 @@ public: void Filter(const std::wstring & in, std::wstring & out); - // insert a white space into long words - // (only between html tags) - // skipped in such tags: script, pre, textarea - // break_after - after how many characters insert a space (0 - off) - void BreakWord(size_t break_after_); + const static int WHITE_MODE_ORIGIN = 0; + const static int WHITE_MODE_SINGLE_LINE = 1; + const static int WHITE_MODE_TREE = 2; - // insert a new line character into long lines - // (only between html tags) + + // white chars mode + // + void white_chars_mode(int mode); + + // if the line is wrap_line_ length (or longer) then insert a new line character (in a place of a white char) + // (only between html tags and only in subtree) // skipped in such tags: script, pre, textarea - // wrap_line - after how many characters wrap a line (0 - off) + // 0 - off // lines are wrapped only in 'body' tag (useful for text in 'title' tag which is in 'head' section) void WrapLine(size_t wrap_line_); - // trimming white characters (with new lines) - // at the beginning, at the end and in the middle of a string - // only between html tags - // at the beginning and at the end only one space is left - // skipped in such tags: script, pre, textarea - // false by default - void TrimWhite(bool trim); - // first tabs in a tree // default: 2 (spaces) // set 0 to turn off @@ -207,9 +202,14 @@ protected: none } type; + bool is_commentary; + // is there a new line after this tag bool new_line; + // is there a new + bool new_line_in_the_middle; + // current orphans table // (will be propagated) Orphans * porphans; @@ -218,6 +218,8 @@ protected: // (will be propagated) bool has_body_tag; + size_t tree_index; + void Clear(); Item(); }; @@ -235,12 +237,16 @@ protected: virtual bool IsOpeningTagMark(wchar_t c); virtual bool IsClosingTagMark(wchar_t c); + virtual bool IsClosingTagIndicator(wchar_t c); + virtual bool IsSpecialTagIndicator(wchar_t c); + virtual bool IsAttributeAssignmentMark(wchar_t c); virtual bool IsClosingXmlSimpleTagMark(wchar_t c); virtual bool IsStartingEntityMark(wchar_t c); virtual bool IsEndingEntityMark(wchar_t c); - virtual bool IsOpeningCommentaryTagMark(const wchar_t * str); - virtual size_t OpeningCommentaryTagMarkSize(); +// virtual bool IsOpeningCommentaryTagMark(const wchar_t * str); +// virtual size_t OpeningCommentaryTagMarkSize(); + virtual bool IsEndingCommentaryTagMarkAtEndOfString(const std::wstring & str); virtual bool IsValidCharForName(int c); virtual bool IsValidCharForAttrName(int c); @@ -249,7 +255,6 @@ protected: virtual bool SkipCommentaryTagIfExists(); virtual void Put(wchar_t c); - virtual void Put(const wchar_t * str); virtual void Put(const wchar_t * str, const wchar_t * end); virtual void Put(const std::wstring & str); virtual void AnalyzeEntitiesAndPut(const wchar_t * str, const wchar_t * end, std::wstring * out); @@ -257,10 +262,7 @@ protected: virtual void PutOpeningTagMark(); virtual void PutClosingTagMark(); virtual bool PutOpeningTag(); - virtual void PutClosingTag(const wchar_t * tag); - - virtual void PutNormalText(const wchar_t * str, const wchar_t * end); - virtual void ReadNormalTextSkipWhite(const wchar_t * & start, const wchar_t * & last_non_white); + virtual void PutClosingTag(const Item & item); virtual void ItemFound(); virtual void EntityFound(const wchar_t * str, const wchar_t * end); @@ -299,9 +301,8 @@ protected: void SkipWhite(); void SkipWhiteLines(); void SkipWhiteWithFirstNewLine(); - void SkipWhiteLines(const wchar_t * & str, const wchar_t * end); bool IsClosingTagForLastItem(); - void SkipAndCheckClosingTag(); + void SkipAndCheckClosingTag(std::wstring * remember_text = nullptr); void PopStack(); bool PushStack(); @@ -312,13 +313,13 @@ protected: void ReadNormalText(); bool PrintRest(); bool PrintOpeningItem(); - void ReadItemName(); + void ReadItemName(std::wstring & name, bool clear_name = true); void ReadItemAttrName(); - void ReadItemAttrValueAdd(const wchar_t * value_start, const wchar_t * value_end); + void ReadItemAttrValueAdd(const std::wstring & str); void ReadItemAttrValue(bool has_quote, wchar_t quote_char); bool ReadItemAttr(); - bool CheckItemAttr(); + void CheckItemLangAttr(); void PrintItemAttr(); void ReadItemClosing(); @@ -330,27 +331,23 @@ protected: void CheckChar(wchar_t c); - void CheckLineWrap(); - bool HasEntityEndAround(const wchar_t * str, const wchar_t * end); - void PutNormalNonWhite(const wchar_t * & str, const wchar_t * end); - void PutNormalWhite(const wchar_t * & str, const wchar_t * end); + void PutNormalNonWhite(std::wstring & str, bool allow_put_new_line, bool allow_put_space); + bool PutNormalWhite(); void PutEverythingUntilClosingTag(bool put_closing_tag_as_well); void PutTabs(size_t len); void PutNonBreakingSpace(); - void PutNewLine(); void CalcOrphansMaxLen(Orphans & orphans); - const wchar_t * pchar; Item empty; Item * pstack; // stack pointer size_t stack_len; // length of the stack wchar_t * buffer; // buffer used when printing std::wstring * out_string; bool last_new_line; - size_t break_after; // insert a space into long words after 'break_after' characters + int white_mode; size_t wrap_line; // insert a new line character into long lines - bool trim_white; // trimming white characters size_t tab_size; + bool was_ending_commentary; OrphanMode orphan_mode; std::wstring attr_name; std::vector attr_value; @@ -365,6 +362,8 @@ protected: bool skip_commentaries; bool skip_entities; bool analyze_entities; + std::wstring tmp_text; + std::wstring tmp_name; };