diff --git a/locale/substitute b/locale/substitute index 90fd84e..97e1418 100755 --- a/locale/substitute +++ b/locale/substitute @@ -14,3 +14,8 @@ smallleters = "ąćęłńóśźż" capitalics = "ĄĆĘŁŃÓŚŹŻ" +# ASCII characters (a-z, A-Z) are tested beforehand +# so this table is not used for such characters +sort = ( "ąa1", "ćc1", "ęe1", "łl1", "ńn1", "óo1", "śs1", "źz1", "żz2", + "ĄA1", "ĆC1", "ĘE1", "ŁL1", "ŃN1", "ÓO1", "ŚS1", "ŹZ1", "ŻZ2" ) + diff --git a/templates/locale.cpp b/templates/locale.cpp index 7ac011c..68d0f5b 100755 --- a/templates/locale.cpp +++ b/templates/locale.cpp @@ -107,6 +107,7 @@ bool read = false; subst_url.clear(); subst_smalllet.clear(); subst_capitallet.clear(); + subst_sort.clear(); if( dir_def && ReadSubstTable(dir_def) ) read = true; @@ -135,7 +136,8 @@ bool read = false; read = true; CreateSubstVector(subst_url, space.table_single[L"url_original"], space.table_single[L"url_changeto"]); CreateSubstVector(subst_smalllet, space.table_single[L"smallleters"], space.table_single[L"capitalics"]); - CreateSubstVector(subst_capitallet, space.table_single[L"capitalics"], space.table_single[L"smallleters"]); + CreateSubstVector(subst_capitallet, space.table_single[L"capitalics"], space.table_single[L"smallleters"]); + CreateSubstSortVector(subst_sort, space.table[L"sort"]); log << log3 << "Locale: read characters substitution tables from: " << file_name << logend; } @@ -167,6 +169,34 @@ void Locale::CreateSubstVector(std::vector & vect, const std::wstring } + +void Locale::CreateSubstSortVector(std::vector & vect, std::vector & tab) +{ + SubstItem s; + + vect.clear(); + + if( tab.empty() ) + return; + + vect.reserve(tab.size()); + + for(size_t i=0 ; i= 3 ) + { + s.from = tab[i][0]; + s.to = tab[i][1]; + s.index = Toi(&tab[i][2]); + + vect.push_back(s); + } + } + + std::sort(vect.begin(), vect.end()); +} + + void Locale::Read(const char * dir, const char * dir_def) { for(size_t i=0 ; i & vect, wchar_t val) +size_t Locale::SubstFindIndex(const 
std::vector & vect, wchar_t val) { if( vect.empty() ) - return val; + return vect.size(); size_t o1 = 0; size_t o2 = vect.size() - 1; if( val < vect[o1].from ) - return val; + return vect.size(); if( val == vect[o1].from ) - return vect[o1].to; + return o1; if( val > vect[o2].from ) - return val; + return vect.size(); if( val == vect[o2].from ) - return vect[o2].to; + return o2; while( o1 + 1 < o2 ) { size_t o = (o1 + o2) / 2; if( val == vect[o].from ) - return vect[o].to; + return o; if( val < vect[o].from ) o2 = o; @@ -495,10 +525,31 @@ wchar_t Locale::SubstFind(const std::vector & vect, wchar_t val) o1 = o; } -return val; +return vect.size(); } +/* + binary search in vect + vect should be sorted by 'from' + + if the 'val' is found in vect[].from then vect[].to is returned + else 'val' is returned +*/ +wchar_t Locale::SubstFind(const std::vector & vect, wchar_t val) +{ + size_t i = SubstFindIndex(vect, val); + + if( i == vect.size() ) + { + return val; + } + else + { + return vect[i].to; + } +} + wchar_t Locale::UrlSubst(wchar_t c) @@ -546,3 +597,96 @@ void Locale::ToCapital(std::wstring & str) str[i] = ToCapital(str[i]); } + + +/* + comparing lexicographically two characters + + return value: + less than 0 if c1 is 'less' than c2 + zero if they are equal + greater than 0 if c1 is 'greater' than c2 + + capital letters are treated equally as small ones + but they will appear first (before the small ones) +*/ +int Locale::Compare(wchar_t c1, wchar_t c2) +{ +SubstItem s1, s2; + + s1.from = c1; + s1.to = c1; + s1.index = 0; + + s2.from = c2; + s2.to = c2; + s2.index = 0; + + if( !((c1>='a' && c1<='z') || (c1>='A' && c1<='Z')) ) + { + size_t i1 = SubstFindIndex(subst_sort, c1); + + if( i1 < subst_sort.size() ) + { + s1.to = subst_sort[i1].to; + s1.index = subst_sort[i1].index; + } + } + + if( !((c2>='a' && c2<='z') || (c2>='A' && c2<='Z')) ) + { + size_t i2 = SubstFindIndex(subst_sort, c2); + + if( i2 < subst_sort.size() ) + { + s2.to = subst_sort[i2].to; + s2.index = 
subst_sort[i2].index; + } + } + + wchar_t small1 = ToSmall(s1.to); + wchar_t small2 = ToSmall(s2.to); + + if( small1 == small2 ) + { + if( s1.index != s2.index ) + return s1.index - s2.index; + + // this will sort capital letters first (before small ones) + return s1.to - s2.to; + } + +return small1 - small2; +} + + + +/* + comparing lexicographically two strings + + return value: + less than 0 if str1 is 'less' than str2 + zero if they are equal + greater than 0 if str1 is 'greater' than str2 +*/ +int Locale::Compare(const std::wstring & str1, const std::wstring & str2) +{ + size_t i1 = 0; + size_t i2 = 0; + + for( ; i1 < str1.size() && i2 < str2.size() ; ++i1, ++i2) + { + int res = Compare(str1[i1], str2[i2]); + + if( res != 0 ) + return res; + } + + if( str1.size() < str2.size() ) + return -1; + + if( str1.size() > str2.size() ) + return 1; + +return 0; +} diff --git a/templates/locale.h b/templates/locale.h index 37af169..c6ddf71 100755 --- a/templates/locale.h +++ b/templates/locale.h @@ -99,12 +99,20 @@ public: wchar_t ToCapital(wchar_t c); void ToCapital(std::wstring & str); + // comparing two characters/strings + // return a value less than zero if c1<c2, zero if c1==c2, greater than zero if c1>c2 + int Compare(wchar_t c1, wchar_t c2); + int Compare(const std::wstring & str1, const std::wstring & str2); + private: // struct to used for substitution struct SubstItem { wchar_t from, to; + int index; + + SubstItem() { from = to = 0; index = 0; } bool operator<(const SubstItem & arg) const { return from < arg.from; } }; @@ -114,6 +122,8 @@ private: void ReadSubstTable(const char * dir, const char * dir_def); bool ReadSubstTable(const char * dir); void CreateSubstVector(std::vector & vect, const std::wstring & tab1, const std::wstring & tab2); + void CreateSubstSortVector(std::vector & vect, std::vector & tab); + size_t SubstFindIndex(const std::vector & vect, wchar_t val); wchar_t SubstFind(const std::vector & vect, wchar_t val); // locale files @@ -132,6 +142,7 @@ private: std::vector subst_url; 
std::vector subst_smalllet; // changing from small to capital std::vector subst_capitallet; // changing from capital to small + std::vector subst_sort; // local characters for comparison PT::Space space;