/* * This file is a part of Winix * and is not publicly distributed * * Copyright (c) 2008-2010, Tomasz Sowa * All rights reserved. * */ #include "htmlfilter.h" /* Resets the item to its empty state: empty name, zero name length, type none, new_line false. */ void HTMLFilter::Item::Clear() { name[0] = 0; name_len = 0; type = none; new_line = false; } HTMLFilter::Item::Item() { Clear(); } /* Filters the zero-terminated text 'in' into 'out': resets the parser position, the stack length and the new-line flag, clears 'out', then runs Init(), Read() and Deinit(). */ void HTMLFilter::Filter(const char * in, std::string & out) { pchar = in; stack_len = 0; out_string = &out; last_new_line = false; out_string->clear(); Init(); Read(); Deinit(); } /* Init() and Deinit() are empty in this file. */ void HTMLFilter::Init() { } void HTMLFilter::Deinit() { } /* Overload for std::string input: reserves in.size()*2+1 characters in 'out' and forwards to the const char * overload. */ void HTMLFilter::Filter(const std::string & in, std::string & out) { out.reserve(in.size() * 2 + 1); Filter(in.c_str(), out); } /* Allocates the item stack and the work buffer and sets the default options: tab size 2, no white trimming, no line breaking, lang_none, orphan_nbsp, safe mode off. */ HTMLFilter::HTMLFilter() { pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN]; tab_size = 2; trim_white = false; break_after = 0; lang = lang_none; orphan_mode = orphan_nbsp; safe_mode = false; } /* NOTE(review): this body only allocates the two arrays; none of the option members assigned by the default constructor are assigned from 'f' here -- confirm they do not stay uninitialised. */ HTMLFilter::HTMLFilter(const HTMLFilter & f) { // don't need to copy the stack pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN]; } /* NOTE(review): pstack and buffer are overwritten with fresh allocations while the previously held arrays are not deleted in this body, and no option is copied from 'f' -- looks like a leak on every assignment, confirm. */ HTMLFilter & HTMLFilter::operator=(const HTMLFilter & f) { // don't need to copy the stack pstack = new Item[WINIX_HTMLFILTER_STACK_MAXLEN]; buffer = new char[WINIX_HTMLFILTER_BUFFER_MAXLEN]; return *this; } /* Releases the item stack and the work buffer. */ HTMLFilter::~HTMLFilter() { delete [] pstack; delete [] buffer; } /* Stores break_after, capped at 10000. */ void HTMLFilter::BreakLines(size_t break_after_) { break_after = break_after_; if( break_after > 10000 ) break_after = 10000; } void HTMLFilter::TrimWhite(bool trim) { trim_white = trim; } /* Stores tab_size, capped at 1000. */ void HTMLFilter::InsertTabs(size_t tabsize) { tab_size = tabsize; if( tab_size > 1000 ) tab_size = 1000; } /* Stores the language and the mode used for orphan handling. */ void HTMLFilter::CheckOrphans(HTMLFilter::Lang lang_, HTMLFilter::OrphanMode mode) { lang = lang_; orphan_mode = mode; } void HTMLFilter::SafeMode(bool safe_mode_) { safe_mode = safe_mode_; } /* Returns stack item i, or the freshly cleared 'empty' item when i is not below stack_len. */ HTMLFilter::Item & HTMLFilter::GetItem(size_t i) { if( i >= stack_len ) { empty.Clear(); return empty; } return pstack[i]; } /* LastItem(): returns the item on top of the stack, or the freshly cleared 'empty' item when the stack is empty. */ HTMLFilter::Item & 
HTMLFilter::LastItem() { if( stack_len == 0 ) { empty.Clear(); return empty; } return pstack[stack_len-1]; } /* Puts a cleared item on top of the stack; returns false when the stack already holds WINIX_HTMLFILTER_STACK_MAXLEN items. */ bool HTMLFilter::PushStack() { if( stack_len == WINIX_HTMLFILTER_STACK_MAXLEN ) // oops, too many items return false; pstack[stack_len].Clear(); stack_len += 1; return true; } /* Removes and clears the top item; does nothing on an empty stack. */ void HTMLFilter::PopStack() { if( stack_len == 0 ) // oops return; stack_len -= 1; pstack[stack_len].Clear(); } /* True for space, tab, 13 and 160; the line feed (10) is handled separately by the callers. NOTE(review): callers pass *pchar; if pchar points to plain char and char is signed, a byte 0xA0 arrives as a negative value and the c==160 test cannot match -- confirm. */ bool HTMLFilter::IsWhite(int c) { // dont use c==10 here if( c==' ' || c=='\t' || c==13 || c==160 ) return true; return false; } /* Advances pchar over white characters (line feeds excluded). */ void HTMLFilter::SkipWhite() { while( IsWhite(*pchar) ) ++pchar; } /* Advances pchar over white characters and line feeds. */ void HTMLFilter::SkipWhiteLines() { while( *pchar==10 || IsWhite(*pchar) ) ++pchar; } /* Skips white characters, then at most one line feed and the white characters after it. */ void HTMLFilter::SkipWhiteWithFirstNewLine() { SkipWhite(); if( *pchar == 10 ) { pchar += 1; SkipWhite(); } } /* Looks ahead past white characters and records in last_new_line whether a line feed follows; pchar is restored afterwards. */ void HTMLFilter::CheckNewLine() { const char * start = pchar; SkipWhite(); last_new_line = (*pchar==10); pchar = start; } /* Called with pchar at an opening tag mark. Returns true, with pchar just after the closing mark, when the text is a closing tag whose name equals the name of the top stack item. When false is returned pchar has still been advanced by at least one character. */ bool HTMLFilter::IsClosingTagForLastItem() { pchar += 1; SkipWhite(); if( *pchar == '/' ) { pchar += 1; SkipWhite(); if( IsNameEqual(pchar, LastItem().name, LastItem().name_len) ) { pchar += LastItem().name_len; SkipWhite(); if( IsClosingTagMark() ) { pchar += 1; return true; } } } return false; } // used for such tags as: script, pre, textarea void HTMLFilter::PutLastTagWithClosingTag() { const char * start = pchar; while( *pchar != 0 ) { if( IsOpeningTagMark() ) { if( IsClosingTagForLastItem() ) { PopStack(); CheckNewLine(); break; } } else { pchar += 1; } } Put(start, pchar); } /* Moves pchar past the end of the current tag and returns the position of the last '/' or closing tag mark seen; a '/' followed (after optional white characters) by the closing mark sets the type of the top item to Item::simple. */ const char * HTMLFilter::SkipItemCheckXmlSimple() { const char * end = pchar; while( *pchar!=0 ) { while( *pchar!=0 && !IsClosingTagMark() && !IsClosingXmlSimpleTagMark()) ++pchar; if( IsClosingXmlSimpleTagMark() ) // closing xml tag: default '/' { end = pchar; ++pchar; SkipWhite(); if( IsClosingTagMark() ) { ++pchar; LastItem().type = Item::simple; break; } } else if( IsClosingTagMark() ) { end = pchar; ++pchar; break; } } return end; } /* Characters accepted in a tag name: ASCII letters, digits, '-' and '!'. */ bool HTMLFilter::IsValidCharForName(int c) { if( (c>='a' 
&& c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9') || c=='-' || c=='!' ) return true; return false; } /* NOTE(review): source text appears to be missing inside the loop header below (after "&& i"); the rest of ReadItemName() is not visible. The statements after the gap append the range [str, end) to *out_string and look like the tail of Put(const char *, const char *), which is called elsewhere in this file -- restore from the original file. */ void HTMLFilter::ReadItemName() { size_t i; for( i=0 ; IsValidCharForName(*pchar) && i=end ) return; size_t len = end - str; out_string->append(str, len); } /* NOTE(review): source text appears to be missing after "for( ; str"; the body of this three-argument CheckOrphan() is not visible. The loop that follows the gap halves the index range [o1, o2] using the sign of CheckOrphan(str, end, table[o]) and is presumably the tail of CheckOrphanTable(), which is called below with (str, end, table, first index, last index). The declaration of 'res' that applies to that loop is not visible either; with the unsigned 'size_t res' shown here the test 'res < 0' could never be true -- confirm against the original file. */ int HTMLFilter::CheckOrphan(const char * str, const char * end, const char * orphan) { size_t res; for( ; str 0 ) return false; while( o1 + 1 < o2 ) { size_t o = (o1 + o2) / 2; res = CheckOrphan(str, end, table[o]); if( res == 0 ) return true; if( res < 0 ) o2 = o; else o1 = o; } return false; } /* Looks the word [str, end) up in the table below. NOTE(review): the entry "s." is listed before "a" although the comment in the body requires alphabetical order -- confirm. */ bool HTMLFilter::CheckOrphanLangPl(const char * str, const char * end) { // the table must be sorted in alphabetical order // polish letters coded in iso-8859-2 static const char * orphans[] = { "(np.", "s.", "a", "ale", "bo", "by", "co", "czy", "do", "go", "i", "ich", "ja", "je", "jej", "jest", "już", "ją", "ku", "li", "mi", "na", "nie", "np.", "nr", "o", "od", "po", "są", "ta", "to", "tu", "tą", "tę", "u", "w", "we", "wy", "z", "za", "ze", "że", "ów" }; size_t o1 = 0; size_t o2 = sizeof(orphans) / sizeof(const char*) - 1; return CheckOrphanTable(str, end, orphans, o1, o2); } // SK i CZ bool HTMLFilter::CheckOrphanLangCz(const char * str, const char * end) { // the table must be sorted in alphabetical order static const char * orphans[] = { "a", "i", "k", "o", "s", "u", "v", "z" }; size_t o1 = 0; size_t o2 = sizeof(orphans) / sizeof(const char*) - 1; return CheckOrphanTable(str, end, orphans, o1, o2); } /* Returns false for an empty word or when lang is lang_none; lang_cz and lang_sk use the Czech/Slovak table, every other language value uses the Polish table. */ bool HTMLFilter::CheckOrphan(const char * str, const char * end) { if( str == end || lang == lang_none ) return false; if( lang == lang_cz || lang == lang_sk ) return CheckOrphanLangCz(str, end); return CheckOrphanLangPl(str, end); } /* Returns the number of characters written to 'buffer'; PutNormalText() calls it repeatedly until str reaches end. NOTE(review): source text appears to be missing in the loop header (after "&& i"), so most of the loop body is not visible. */ size_t HTMLFilter::PutNormalTextFillBuffer(const char * & str, const char * & end) { const char * word = str; // pointing at the beginning of a word size_t i = 0; // some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line size_t epsilon = 
WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1; bool is_white; bool was_white = true; size_t non_whites = 0; for( ; str < end && i=break_after ) { buffer[i] = ' '; i += 1; non_whites = 0; } was_white = is_white; } return i; } /* Variant used when trim_white is set (see PutNormalTextTrim()); returns the number of characters written to 'buffer'. NOTE(review): source text appears to be missing in the loop header (after "&& i"), so most of the loop body is not visible. */ size_t HTMLFilter::PutNormalTextTrimFillBuffer(const char * & str, const char * & end) { const char * word = str; // pointing at the beginning of a word size_t non_whites = 0; size_t i = 0; bool is_white; // some space in the buffer for non break spaces (orphans) and spaces at the beginning of a line size_t epsilon = WINIX_HTMLFILTER_BUFFER_MAXLEN / 10 + 1; for( ; str < end && ibreak_after) || is_white ) { buffer[i] = ' '; i += 1; non_whites = 1; } if( str < end ) { buffer[i] = *str; i += 1; } } return i; } /* Emits [str, end) through the work buffer, one buffer fill at a time. */ void HTMLFilter::PutNormalText(const char * str, const char * end) { size_t buf_len; while( str < end ) { buf_len = PutNormalTextFillBuffer(str, end); Put(buffer, buffer+buf_len); } } /* Same as PutNormalText() but fills the buffer with PutNormalTextTrimFillBuffer(). */ void HTMLFilter::PutNormalTextTrim(const char * str, const char * end) { size_t buf_len; while( str < end ) { buf_len = PutNormalTextTrimFillBuffer(str, end); Put(buffer, buffer+buf_len); } } void HTMLFilter::PutOpeningTagMark() { (*out_string) += '<'; } void HTMLFilter::PutClosingTagMark() { (*out_string) += '>'; } void HTMLFilter::PutTagName(const char * name) { (*out_string) += name; } /* Every tag is safe when safe_mode is off; otherwise the tag is checked against the unsafe_tags list. NOTE(review): source text appears to be missing after "for(i=0 ; i"; the statements that follow the gap (from "20 ) len = 20;") write into 'buffer' at 'index' and belong to a different function whose head is not visible, and another gap follows after "for( ; i" -- restore from the original file. */ bool HTMLFilter::IsTagSafe(const char * tag) { if( !safe_mode ) return true; static const char * unsafe_tags[] = { "script", "iframe", "frame", "frameset", "applet", "head", "meta", "html", "link", "body" }; size_t len = sizeof(unsafe_tags) / sizeof(const char*); size_t i; for(i=0 ; i 20 ) len = 20; // how many spaces do you want size_t spaces = len * tab_size; size_t i = 0; if( index+spaces < WINIX_HTMLFILTER_BUFFER_MAXLEN-1 ) { for( ; i'); } // the slash at the end (without '>' character) // we assume the size of the mark to be one bool HTMLFilter::IsClosingXmlSimpleTagMark() { return (*pchar == '/'); } /* NOTE(review): the literal below is empty in this view and the text after it is missing up to the start of the next physical line; the body of this function is not visible. */ bool HTMLFilter::IsOpeningCommentaryTagMark() { static char comm_open[] = ""; 
/* NOTE(review): the head of the function continuing here is not visible; judging by the call in ReadNormalText() it is presumably SkipCommentaryTagIfExists(). The visible part returns false when pchar is not at an opening commentary mark; otherwise it moves pchar just past the closing mark (or to the end of input), updates last_new_line through CheckNewLine() and returns true. */ size_t comm_close_len = sizeof(comm_close) / sizeof(char) - 1; if( !IsOpeningCommentaryTagMark() ) return false; pchar += OpeningCommentaryTagMarkSize(); // looking for "-->" while( *pchar!=0 && !IsNameEqual(pchar, comm_close, comm_close_len) ) ++pchar; if( *pchar!= 0 ) pchar += comm_close_len; CheckNewLine(); return true; } /* Skips the white characters at the beginning of a text run, sets 'start' to the first character kept, and prints a new line with tabs unless the text starts with an ordinary (non-commentary) tag. */ void HTMLFilter::ReadNormalTextSkipWhite(const char * & start, const char * & last_non_white) { if( trim_white ) { // skipping all white chars (with new lines) // but with remembering the last non white character for( ; *pchar==10 || IsWhite(*pchar) ; ++pchar) if( *pchar == 10 ) last_non_white = pchar; } else { // skipping first white chars with only one line between them SkipWhite(); last_non_white = pchar; if( *pchar == 10 ) { ++pchar; SkipWhite(); } } start = pchar; // exception for the commentary tag if( IsOpeningCommentaryTagMark() || !IsOpeningTagMark() ) { PutNewLine(); PutTabs(stack_len); } } // reading text between html tags void HTMLFilter::ReadNormalText() { const char * start = pchar; const char * last_non_white = pchar; if( last_new_line ) ReadNormalTextSkipWhite(start, last_non_white); while( *pchar != 0 ) { if( SkipCommentaryTagIfExists() ) { last_non_white = pchar - 1; // pointing at the last '>' from a commentary } else { if( IsOpeningTagMark() ) break; if( !IsWhite(*pchar) ) last_non_white = pchar; pchar += 1; } } last_new_line = (*last_non_white == 10); if( trim_white ) PutNormalTextTrim(start, pchar); else PutNormalText(start, pchar); } // start, end - parameters to a tag void HTMLFilter::PrintItem(const char * start, const char * end) { if( last_new_line ) { PutNewLine(); if( stack_len > 1 ) PutTabs(stack_len-1); } PutOpeningTag(start, end); } /* Reads one tag starting at pchar: pushes a stack item, reads the tag name and type, skips to the end of the tag and prints it unless it is a closing tag. Returns false at the end of input or when the stack is full. */ bool HTMLFilter::ReadItem() { const char * start = pchar; if( *pchar == 0 ) return false; if( !PushStack() ) return false; // we have '<' pchar += 1; SkipWhite(); if( *pchar == '/' ) // we have a closing tag { pchar += 1; SkipWhite(); LastItem().type = Item::closing; } ReadItemName(); 
SkipWhite(); start = pchar; // arguments start here if( LastItem().type != Item::closing ) LastItem().type = (LastItem().name[0] == '!') ? Item::special : Item::opening; const char * end = SkipItemCheckXmlSimple(); if( LastItem().type != Item::closing ) PrintItem(start, end); CheckNewLine(); LastItem().new_line = last_new_line; return true; } /* Lower-cases the letters 'A'..'Z' only; every other value is returned unchanged. */ int HTMLFilter::ToLower(int c) { if( c>='A' && c<='Z' ) return c - 'A' + 'a'; return c; } /* Compares two zero-terminated names, ignoring the case of the letters 'A'..'Z'. */ bool HTMLFilter::IsNameEqual(const char * name1, const char * name2) { for( ; *name1!=0 && *name2!=0 ; ++name1, ++name2 ) if( ToLower(*name1) != ToLower(*name2) ) return false; if( *name1==0 && *name2==0 ) return true; return false; } // len characters from both strings must be equal bool HTMLFilter::IsNameEqual(const char * name1, const char * name2, size_t len) { for( ; *name1!=0 && *name2!=0 && len>0 ; ++name1, ++name2, --len ) if( ToLower(*name1) != ToLower(*name2) ) return false; if( len == 0 ) return true; return false; } /* True when 'name' equals the name of the item on top of the stack. */ bool HTMLFilter::IsLastTag(const char * name) { const char * tag = LastItem().name; return IsNameEqual(name, tag); } // checking exceptions for opening tags /* meta, input, br, hr, img and link are marked as simple tags and popped at once; the content of script (outside safe mode), pre and textarea is copied up to the matching closing tag by PutLastTagWithClosingTag(). */ void HTMLFilter::CheckExceptions() { if( IsLastTag("meta") || IsLastTag("input") || IsLastTag("br") || IsLastTag("hr") || IsLastTag("img") || IsLastTag("link") ) { LastItem().type = Item::simple; PopStack(); return; } // in safe_mode the script tag is ignored if( !safe_mode && IsLastTag("script") ) PutLastTagWithClosingTag(); if( IsLastTag("pre") || IsLastTag("textarea") ) PutLastTagWithClosingTag(); } /* Called from CheckClosingTags() when the closing tag on top of the stack does not match the item directly below it. Searches the deeper items for the same name; when found, closing tags are printed for every item from the one below the top down to the match and the stack is cut back to the match, otherwise only the closing tag is popped. NOTE(review): with fewer than three items the function returns without popping the unmatched closing item -- confirm this is intended. */ void HTMLFilter::AddForgottenTags() { int i; if( stack_len < 3 ) return; // we have forgotten to close some tags // looking whether there is a matching opening tag for(i=int(stack_len)-3 ; i>=0 ; --i) if( IsNameEqual(pstack[i].name, pstack[stack_len-1].name) ) break; if( i < 0 ) { // oops, there is no such a tag // we don't print the closing and the missing opening tag PopStack(); return; } for(int z=(int)stack_len-2 ; z>=i ; --z) { if( pstack[z].new_line ) { 
PutNewLine(); PutTabs(z); } PutClosingTag(pstack[z].name); pstack[z].Clear(); } last_new_line = pstack[stack_len-1].new_line; // invalidate tags stack_len = i; } /* Prints closing tags for every item still on the stack, from the top down. NOTE(review): the post-decrement in the loop condition runs once more when stack_len is already 0, so an unsigned stack_len is left wrapped around after the loop; Filter() resets it to 0 before the next run -- confirm nothing reads it in between. */ void HTMLFilter::CheckStackPrintRest() { while( stack_len-- > 0 ) { if( stack_len==0 || pstack[stack_len-1].new_line ) PutNewLine(); PutTabs(stack_len); PutClosingTag(pstack[stack_len].name); } } /* Handles a closing tag that has just been pushed on the stack: a lone closing tag is dropped, a closing tag matching the item below it is printed and both items are popped, anything else is passed to AddForgottenTags(). */ void HTMLFilter::CheckClosingTags() { if( stack_len == 0 ) return; // on the stack we have only opening tags // but only the last tag is a closing tag if( stack_len == 1 ) { // there is only last closing tag // we dont print it PopStack(); return; } // there are more than one tag if( IsNameEqual(pstack[stack_len-1].name, pstack[stack_len-2].name) ) { // last closing tag is from the previous one if( pstack[stack_len-2].new_line ) { PutNewLine(); PutTabs(stack_len-2); } PutClosingTag(pstack[stack_len-1].name); last_new_line = pstack[stack_len-1].new_line; PopStack(); PopStack(); return; } AddForgottenTags(); } /* Copies everything from pchar to the end of the input to the output unchanged; returns true when at least one character was copied. */ bool HTMLFilter::PrintRest() { const char * start = pchar; while( *pchar ) ++pchar; if( pchar > start ) { Put(start, pchar); return true; } return false; } /* Main loop: alternates between reading text and reading one tag until ReadItem() returns false, then either copies the remaining input or closes the tags left on the stack. NOTE(review): when ReadItem() fails because the stack is full, the remaining input is emitted by PrintRest() without any filtering, and no safe_mode check is visible in that path -- confirm this is acceptable for untrusted input. */ void HTMLFilter::Read() { if( trim_white ) SkipWhiteLines(); // it can be some text or white lines before the first html tag (we print it) ReadNormalText(); while( ReadItem() ) { if( LastItem().type == Item::opening ) { CheckExceptions(); } else if( LastItem().type == Item::special || LastItem().type == Item::simple ) { if( stack_len > 1 ) { pstack[stack_len-2].new_line = LastItem().new_line; } else if( trim_white ) { // one new line after a simple or special tag // (if the tag has level 0 in the tree - it not means that this is a first tag) // for example can be DOCTYPE PutNewLine(); } PopStack(); } else if( LastItem().type == Item::closing ) { CheckClosingTags(); } ReadNormalText(); } // sometimes ReadItem() can return a false (when there is no space on the stack) // we print the rest html without filtering if( !PrintRest() ) CheckStackPrintRest(); }