From eaa19dd46acd144e65c52385b5b31e1c2130e10b Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Fri, 15 May 2009 22:27:04 +0000 Subject: [PATCH] added: uint UInt::Mul3(const UInt & ss2) void UInt::Mul3Big(const UInt & ss2, UInt & result) a new multiplication algorithm: Karatsuba multiplication, on a vector UInt<100> with all items different from zero this algorithm is faster about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from zero this algorithm is faster more than 5 times than Mul2Big() (measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE) added: uint MulFastest(const UInt & ss2) void MulFastestBig(const UInt & ss2, UInt & result) those methods are trying to select the fastest multiplication algorithm changed: uint Mul(const UInt & ss2, uint algorithm = 100) void MulBig(const UInt & ss2, UInt & result, uint algorithm = 100) those methods by default use MulFastest() and MulFastestBig() changed: changed a little Mul2Big() to cooperate with Mul3Big() changed: names of methods in macros TTMATH_LOG() added: uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) three forms: asm x86, asm x86_64, no-asm those methods are used by the Karatsuba multiplication algorithm git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@148 e52654a7-88a9-db11-a3e9-0013d4bc506e --- CHANGELOG | 21 +- ttmath/ttmathbig.h | 8 +- ttmath/ttmathtypes.h | 11 + ttmath/ttmathuint.h | 460 +++++++++++++++++++++++++++++++++---- ttmath/ttmathuint_noasm.h | 96 +++++++- ttmath/ttmathuint_x86.h | 287 ++++++++++++++++++++++- ttmath/ttmathuint_x86_64.h | 190 ++++++++++++++- 7 files changed, 1001 insertions(+), 72 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 40eed53..0b98f9b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -Version 0.8.5 (2009.05.11): +Version 0.8.5 prerelease (2009.05.15): * fixed: Big::Mod(x) didn't correctly return a carry and the result was sometimes very big (even greater than x) * fixed: global function Mod(x) didn't set an ErrorCode object @@ -11,6 +11,25 @@ Version 0.8.5 (2009.05.11): the same is to Cos() function * changed: PrepareSin(x) is using Big::Mod() now when reducing 2PI period should be a little accurate especially on a very big 'x' + * changed: uint Mul(const UInt & ss2, uint algorithm = 100) + void MulBig(const UInt & ss2, UInt & result, uint algorithm = 100) + those methods by default use MulFastest() and MulFastestBig() + * changed: changed a little Mul2Big() to cooperate with Mul3Big() + * added: uint UInt::Mul3(const UInt & ss2) + void UInt::Mul3Big(const UInt & ss2, UInt & result) + a new multiplication algorithm: Karatsuba multiplication, + on a vector UInt<100> with all items different from zero this algorithm is faster + about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from + zero this algorithm is faster more than 5 times than Mul2Big() + (measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE) + * added: uint MulFastest(const UInt & ss2) + void MulFastestBig(const UInt & ss2, UInt & result) + those methods are trying to select the fastest multiplication algorithm + * added: uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + three forms: asm x86, asm x86_64, no-asm + those methods are used by the Karatsuba multiplication algorithm + Version 0.8.4 (2009.05.08): * fixed: UInt::DivInt() didn't check whether the divisor is zero diff --git a/ttmath/ttmathbig.h b/ttmath/ttmathbig.h index 6acf0b3..6f9f5a6 100644 --- a/ttmath/ttmathbig.h +++ b/ttmath/ttmathbig.h @@ -1642,7 +1642,7 @@ public: // MS Visual Express 2005 reports a warning (in the lines with 'uint man_diff = ...'): // warning C4307: '*' : integral constant overflow // but we're using 'if( man > another_man )' and 'if( man < another_man )' and there'll be no such a situation here - #ifndef __GNUC__ + #ifdef _MSC_VER #pragma warning( disable: 4307 ) #endif @@ -1658,7 +1658,7 @@ public: c += exponent.AddInt(man_diff, 0); } - #ifndef __GNUC__ + #ifdef _MSC_VER #pragma warning( default: 4307 ) #endif @@ -3426,9 +3426,9 @@ private: it is called when the base is 10 and some digits were read before */ - int FromString_ReadScientificIfExists(const char * & source) + uint FromString_ReadScientificIfExists(const char * & source) { - int c = 0; + uint c = 0; bool scientific_read = false; const char * before_scientific = source; diff --git a/ttmath/ttmathtypes.h b/ttmath/ttmathtypes.h index e599d8e..331522a 100644 --- a/ttmath/ttmathtypes.h +++ b/ttmath/ttmathtypes.h @@ -237,6 +237,17 @@ namespace ttmath +/*! + this is a limit when calculating Karatsuba multiplication + if the size of a vector is smaller than TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE + the Karatsuba algorithm will use standard schoolbook multiplication +*/ +#ifdef __GNUC__ +#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 3 +#else +#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 5 +#endif + namespace ttmath { diff --git a/ttmath/ttmathuint.h b/ttmath/ttmathuint.h index ae419ea..a089b3e 100644 --- a/ttmath/ttmathuint.h +++ b/ttmath/ttmathuint.h @@ -840,8 +840,10 @@ public: /*! the multiplication 'this' = 'this' * ss2 + + algorithm: 100 - means automatically choose the fastest algorithm */ - uint Mul(const UInt & ss2, uint algorithm = 2) + uint Mul(const UInt & ss2, uint algorithm = 100) { switch( algorithm ) { @@ -849,8 +851,14 @@ public: return Mul1(ss2); case 2: - default: return Mul2(ss2); + + case 3: + return Mul3(ss2); + + case 100: + default: + return MulFastest(ss2); } } @@ -860,10 +868,12 @@ public: since the 'result' is twice bigger than 'this' and 'ss2' this method never returns a carry + + algorithm: 100 - means automatically choose the fastest algorithm */ void MulBig(const UInt & ss2, UInt & result, - uint algorithm = 2) + uint algorithm = 100) { switch( algorithm ) { @@ -871,8 +881,14 @@ public: return Mul1Big(ss2, result); case 2: - default: return Mul2Big(ss2, result); + + case 3: + return Mul3Big(ss2, result); + + case 100: + default: + return MulFastestBig(ss2, result); } } @@ -964,7 +980,7 @@ public: uint Mul2(const UInt & ss2) { UInt result; - uint i; + uint i, c = 0; Mul2Big(ss2, result); @@ -975,11 +991,14 @@ public: // testing carry for( ; i & ss2, UInt & result) { - uint r2,r1; - uint x1size=value_size, x2size=value_size; - uint x1start=0, x2start=0; - - result.SetZero(); - - if( value_size > 2 ) - { - // if the value_size is smaller than or equal to 2 - // there is no sense to set x1size (and others) to another values - - for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size); - for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size); - - if( x1size==0 || x2size==0 ) - { - TTMATH_LOG("UInt::Mul2Big") - return; - } - - for(x1start=0 ; x1start(table, ss2.table, result); TTMATH_LOG("UInt::Mul2Big") } +private: + + /*! + an auxiliary method for calculating the multiplication + + arguments we're taking as pointers (this is to improve the Mul3Big2()- avoiding + unnecessary copying objects), the result should be taken as a pointer too, + but at the moment there is no method AddTwoInts() which can operate on pointers + */ + template + void Mul2Big2(const uint * ss1, const uint * ss2, UInt & result) + { + uint x1size = ss_size, x2size = ss_size; + uint x1start = 0, x2start = 0; + + if( ss_size > 2 ) + { + // if the ss_size is smaller than or equal to 2 + // there is no sense to set x1size (and others) to another values + + for(x1size=ss_size ; x1size>0 && ss1[x1size-1]==0 ; --x1size); + for(x2size=ss_size ; x2size>0 && ss2[x2size-1]==0 ; --x2size); + + for(x1start=0 ; x1start(ss1, ss2, result, x1start, x1size, x2start, x2size); + } + + + + /*! + an auxiliary method for calculating the multiplication + */ + template + void Mul2Big3(const uint * ss1, const uint * ss2, UInt & result, uint x1start, uint x1size, uint x2start, uint x2size) + { + uint r2, r1; + + result.SetZero(); + + if( x1size==0 || x2size==0 ) + return; + + for(uint x1=x1start ; x1 & ss2) + { + UInt result; + uint i, c = 0; + + Mul3Big(ss2, result); + + // copying result + for(i=0 ; i & ss2, UInt & result) + { + Mul3Big2(table, ss2.table, result.table); + + TTMATH_LOG("UInt::Mul3Big") + } + + + +private: + + /*! + an auxiliary method for calculating the Karatsuba multiplication + + result_size is equal ss_size*2 + */ + template + void Mul3Big2(const uint * ss1, const uint * ss2, uint * result) + { + const uint * x1, * x0, * y1, * y0; + + + if( ss_size>1 && ss_size res; + Mul2Big2(ss1, ss2, res); + + for(uint i=0 ; i(x1, x0, y1, y0, result); + } + else + { + // ss_size is even + x0 = ss1; + y0 = ss2; + x1 = ss1 + ss_size / 2; + y1 = ss2 + ss_size / 2; + + // all four vectors (x0 x1 y0 y1) are equal in size + Mul3Big3(x1, x0, y1, y0, result); + } + } + + + +#ifdef _MSC_VER +#pragma warning (disable : 4717) +//warning C4717: recursive on all control paths, function will cause runtime stack overflow +//we have the stop point in Mul3Big2() method +#endif + + + /*! + an auxiliary method for calculating the Karatsuba multiplication + + x = x1*B^m + x0 + y = y1*B^m + y0 + + first_size - is the size of vectors: x0 and y0 + second_size - is the size of vectors: x1 and y1 (can be either equal first_size or smaller about one from first_size) + + x*y = (x1*B^m + x0)(y1*B^m + y0) = z2*B^(2m) + z1*B^m + z0 + where + z0 = x0*y0 + z2 = x1*y1 + z1 = (x1 + x0)*(y1 + y0) - z2 - z0 + */ + template + void Mul3Big3(const uint * x1, const uint * x0, const uint * y1, const uint * y0, uint * result) + { + uint i, c, xc, yc; + + UInt temp, temp2; + UInt z1; + + // z0 and z2 we store directly in the result (we don't use any temporary variables) + Mul3Big2(x0, y0, result); // z0 + Mul3Big2(x1, y1, result+first_size*2); // z2 + + // now we calculate z1 + // temp = (x0 + x1) + // temp2 = (y0 + y1) + // we're using temp and temp2 with UInt, although there can be a carry but + // we simple remember it in xc and yc (xc and yc can be either 0 or 1), + // and (x0 + x1)*(y0 + y1) we calculate in this way (schoolbook algorithm): + // + // xc | temp + // yc | temp2 + // -------------------- + // (temp * temp2) + // xc*temp2 | + // yc*temp | + // xc*yc | + // ---------- z1 -------- + // + // and the result is never larger in size than 3*first_size + + xc = AddVector(x0, x1, first_size, second_size, temp.table); + yc = AddVector(y0, y1, first_size, second_size, temp2.table); + + Mul3Big2(temp.table, temp2.table, z1.table); + + // clearing the rest of z1 + for(i=first_size*2 ; i second_size ) + { + uint z1_size = result_size - first_size; + TTMATH_ASSERT( z1_size <= first_size*3 ) + + for(i=z1_size ; i & ss2) + { + UInt result; + uint i, c = 0; + + MulFastestBig(ss2, result); + + // copying result + for(i=0 ; i & ss2, UInt & result) + { + if( value_size < TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE ) + return Mul2Big(ss2, result); + + uint x1size = value_size, x2size = value_size; + uint x1start = 0, x2start = 0; + + for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size); + for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size); + + if( x1size==0 || x2size==0 ) + { + // either 'this' or 'ss2' is equal zero - the result is zero too + result.SetZero(); + return; + } + + for(x1start=0 ; x1start(table, ss2.table, result, x1start, x1size, x2start, x2size); + + + // Karatsuba multiplication + Mul3Big(ss2, result); + + TTMATH_LOG("UInt::MulFastestBig") + } /*! @@ -2909,12 +3269,13 @@ private: uint Rcr2(uint bits, uint c); public: - uint Add(const UInt & ss2, uint c=0); uint AddInt(uint value, uint index = 0); uint AddTwoInts(uint x2, uint x1, uint index); + static uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result); uint Sub(const UInt & ss2, uint c=0); uint SubInt(uint value, uint index = 0); + static uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result); static sint FindLeadingBitInWord(uint x); static uint SetBitInWord(uint & value, uint bit); static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low); @@ -2922,6 +3283,23 @@ public: }; + +/*! + this specialization is needed in order to not confused the compiler "error: ISO C++ forbids zero-size array" + when compiling Mul3Big2() method +*/ +template<> +class UInt<0> +{ +public: + uint table[1]; + + void Mul2Big(const UInt<0> &, UInt<0> &) { TTMATH_ASSERT(false) }; + void SetZero() { TTMATH_ASSERT(false) }; + uint AddTwoInts(uint, uint, uint) { TTMATH_ASSERT(false) return 0; }; +}; + + } //namespace diff --git a/ttmath/ttmathuint_noasm.h b/ttmath/ttmathuint_noasm.h index 4d1fa67..c301fc5 100644 --- a/ttmath/ttmathuint_noasm.h +++ b/ttmath/ttmathuint_noasm.h @@ -95,7 +95,7 @@ namespace ttmath for(i=0 ; i + uint UInt::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + uint i, c = 0; + + TTMATH_ASSERT( ss1_size >= ss2_size ) + + for(i=0 ; i uint UInt::SubTwoWords(uint a, uint b, uint carry, uint * result) { @@ -232,7 +273,7 @@ namespace ttmath for(i=0 ; i + uint UInt::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + uint i, c = 0; + + TTMATH_ASSERT( ss1_size >= ss2_size ) + + for(i=0 ; i + uint UInt::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + TTMATH_ASSERT( ss1_size >= ss2_size ) + + uint rest = ss1_size - ss2_size; + uint c; + + #ifndef __GNUC__ + + // this part might be compiled with for example visual c + __asm + { + pushad + + mov ecx, [ss2_size] + xor edx, edx // edx = 0, cf = 0 + + mov esi, [ss1] + mov ebx, [ss2] + mov edi, [result] + + p: + mov eax, [esi+edx*4] + adc eax, [ebx+edx*4] + mov [edi+edx*4], eax + + inc edx + dec ecx + jnz p + + adc ecx, ecx // ecx has the cf state + + mov ebx, [rest] + or ebx, ebx + jz end + + xor ebx, ebx + sub ebx, ecx // setting cf from ecx + mov ecx, [rest] // ecx is != 0 + mov ebx, 0 + p2: + mov eax, [esi+edx*4] + adc eax, ebx + mov [edi+edx*4], eax + + inc edx + dec ecx + jnz p2 + + adc ecx, ecx + + end: + mov [c], ecx + + popad + } + + #endif + + + #ifdef __GNUC__ + + // this part should be compiled with gcc + uint dummy1, dummy2, dummy3; + + __asm__ __volatile__( + "push %%edx \n" + "xor %%edx, %%edx \n" // edx = 0, cf = 0 + "1: \n" + "mov (%%esi,%%edx,4), %%eax \n" + "adc (%%ebx,%%edx,4), %%eax \n" + "mov %%eax, (%%edi,%%edx,4) \n" + + "inc %%edx \n" + "dec %%ecx \n" + "jnz 1b \n" + + "adc %%ecx, %%ecx \n" // ecx has the cf state + "pop %%eax \n" // eax = rest + + "or %%eax, %%eax \n" + "jz 3f \n" + + "xor %%ebx, %%ebx \n" + "sub %%ecx, %%ebx \n" // setting cf from ecx + "mov %%eax, %%ecx \n" // ecx=rest and is != 0 + "mov $0, %%ebx \n" + "2: \n" + "mov (%%esi, %%edx, 4), %%eax \n" + "adc %%ebx, %%eax \n" + "mov %%eax, (%%edi, %%edx, 4) \n" + + "inc %%edx \n" + "dec %%ecx \n" + "jnz 2b \n" + + "adc %%ecx, %%ecx \n" + "3: \n" + + : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3) + : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result) + : "cc", "memory" ); + + #endif + + TTMATH_LOG("UInt::AddVector") + + return c; + } /*! @@ -489,7 +619,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::Sub") + TTMATH_LOG("UInt::Sub") return c; } @@ -593,13 +723,152 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::SubInt") + TTMATH_LOG("UInt::SubInt") return c; } + /*! + this static method subtractes one vector from the other + 'ss1' is larger in size or equal to 'ss2' + + ss1 points to the first (larger) vector + ss2 points to the second vector + ss1_size - size of the ss1 (and size of the result too) + ss2_size - size of the ss2 + result - is the result vector (which has size the same as ss1: ss1_size) + + Example: ss1_size is 5, ss2_size is 3 + ss1: ss2: result (output): + 5 1 5-1 + 4 3 4-3 + 2 7 2-7 + 6 6-1 (the borrow from previous item) + 9 9 + return (carry): 0 + of course the carry (borrow) is propagated and will be returned from the last item + (this method is used by the Karatsuba multiplication algorithm) + */ + template + uint UInt::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + TTMATH_ASSERT( ss1_size >= ss2_size ) + + uint rest = ss1_size - ss2_size; + uint c; + + #ifndef __GNUC__ + + // this part might be compiled with for example visual c + + /* + the asm code is nearly the same as in AddVector + only two instructions 'adc' are changed to 'sbb' + */ + __asm + { + pushad + + mov ecx, [ss2_size] + xor edx, edx // edx = 0, cf = 0 + + mov esi, [ss1] + mov ebx, [ss2] + mov edi, [result] + + p: + mov eax, [esi+edx*4] + sbb eax, [ebx+edx*4] + mov [edi+edx*4], eax + + inc edx + dec ecx + jnz p + + adc ecx, ecx // ecx has the cf state + + mov ebx, [rest] + or ebx, ebx + jz end + + xor ebx, ebx + sub ebx, ecx // setting cf from ecx + mov ecx, [rest] // ecx is != 0 + mov ebx, 0 + p2: + mov eax, [esi+edx*4] + sbb eax, ebx + mov [edi+edx*4], eax + + inc edx + dec ecx + jnz p2 + + adc ecx, ecx + + end: + mov [c], ecx + + popad + } + + #endif + + + #ifdef __GNUC__ + + // this part should be compiled with gcc + uint dummy1, dummy2, dummy3; + + __asm__ __volatile__( + "push %%edx \n" + "xor %%edx, %%edx \n" // edx = 0, cf = 0 + "1: \n" + "mov (%%esi,%%edx,4), %%eax \n" + "sbb (%%ebx,%%edx,4), %%eax \n" + "mov %%eax, (%%edi,%%edx,4) \n" + + "inc %%edx \n" + "dec %%ecx \n" + "jnz 1b \n" + + "adc %%ecx, %%ecx \n" // ecx has the cf state + "pop %%eax \n" // eax = rest + + "or %%eax, %%eax \n" + "jz 3f \n" + + "xor %%ebx, %%ebx \n" + "sub %%ecx, %%ebx \n" // setting cf from ecx + "mov %%eax, %%ecx \n" // ecx=rest and is != 0 + "mov $0, %%ebx \n" + "2: \n" + "mov (%%esi, %%edx, 4), %%eax \n" + "sbb %%ebx, %%eax \n" + "mov %%eax, (%%edi, %%edx, 4) \n" + + "inc %%edx \n" + "dec %%ecx \n" + "jnz 2b \n" + + "adc %%ecx, %%ecx \n" + "3: \n" + + : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3) + : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result) + : "cc", "memory" ); + + #endif + + TTMATH_LOG("UInt::SubVector") + + return c; + } + + + /*! this method moves all bits into the left hand side return value <- this <- c @@ -680,7 +949,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::Rcl2_one") + TTMATH_LOG("UInt::Rcl2_one") return c; } @@ -758,7 +1027,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::Rcr2_one") + TTMATH_LOG("UInt::Rcr2_one") return c; } @@ -886,7 +1155,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::Rcl2") + TTMATH_LOG("UInt::Rcl2") return c; } @@ -1021,7 +1290,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt32::Rcr2") + TTMATH_LOG("UInt::Rcr2") return c; } diff --git a/ttmath/ttmathuint_x86_64.h b/ttmath/ttmathuint_x86_64.h index 96e1452..87f177d 100644 --- a/ttmath/ttmathuint_x86_64.h +++ b/ttmath/ttmathuint_x86_64.h @@ -112,7 +112,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Add") + TTMATH_LOG("UInt::Add") return c; } @@ -178,7 +178,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::AddInt") + TTMATH_LOG("UInt::AddInt") return c; } @@ -259,7 +259,90 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::AddTwoInts") + TTMATH_LOG("UInt::AddTwoInts") + + return c; + } + + + + /*! + this static method addes one vector to the other + 'ss1' is larger in size or equal to 'ss2' + + ss1 points to the first (larger) vector + ss2 points to the second vector + ss1_size - size of the ss1 (and size of the result too) + ss2_size - size of the ss2 + result - is the result vector (which has size the same as ss1: ss1_size) + + Example: ss1_size is 5, ss2_size is 3 + ss1: ss2: result (output): + 5 1 5+1 + 4 3 4+3 + 2 7 2+7 + 6 6 + 9 9 + of course the carry is propagated and will be returned from the last item + (this method is used by the Karatsuba multiplication algorithm) + */ + template + uint UInt::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + TTMATH_ASSERT( ss1_size >= ss2_size ) + + uint rest = ss1_size - ss2_size; + uint c; + + #ifndef __GNUC__ + #error "another compiler than GCC is currently not supported in 64bit mode" + #endif + + #ifdef __GNUC__ + + // this part should be compiled with gcc + uint dummy1, dummy2, dummy3; + + __asm__ __volatile__( + "mov %%rdx, %%r8 \n" + "xor %%rdx, %%rdx \n" // rdx = 0, cf = 0 + "1: \n" + "mov (%%rsi,%%rdx,8), %%rax \n" + "adc (%%rbx,%%rdx,8), %%rax \n" + "mov %%rax, (%%rdi,%%rdx,8) \n" + + "inc %%rdx \n" + "dec %%rcx \n" + "jnz 1b \n" + + "adc %%rcx, %%rcx \n" // rcx has the cf state + + "or %%r8, %%r8 \n" + "jz 3f \n" + + "xor %%rbx, %%rbx \n" + "sub %%rcx, %%rbx \n" // setting cf from rcx + "mov %%r8, %%rcx \n" // rcx=rest and is != 0 + "mov $0, %%rbx \n" + "2: \n" + "mov (%%rsi, %%rdx, 8), %%rax \n" + "adc %%rbx, %%rax \n" + "mov %%rax, (%%rdi, %%rdx, 8) \n" + + "inc %%rdx \n" + "dec %%rcx \n" + "jnz 2b \n" + + "adc %%rcx, %%rcx \n" + "3: \n" + + : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3) + : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result) + : "%r8", "cc", "memory" ); + + #endif + + TTMATH_LOG("UInt::AddVector") return c; } @@ -316,7 +399,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Sub") + TTMATH_LOG("UInt::Sub") return c; } @@ -379,12 +462,101 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::SubInt") + TTMATH_LOG("UInt::SubInt") return c; } + + + /*! + this static method subtractes one vector from the other + 'ss1' is larger in size or equal to 'ss2' + + ss1 points to the first (larger) vector + ss2 points to the second vector + ss1_size - size of the ss1 (and size of the result too) + ss2_size - size of the ss2 + result - is the result vector (which has size the same as ss1: ss1_size) + + Example: ss1_size is 5, ss2_size is 3 + ss1: ss2: result (output): + 5 1 5-1 + 4 3 4-3 + 2 7 2-7 + 6 6-1 (the borrow from previous item) + 9 9 + return (carry): 0 + of course the carry (borrow) is propagated and will be returned from the last item + (this method is used by the Karatsuba multiplication algorithm) + */ + template + uint UInt::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) + { + TTMATH_ASSERT( ss1_size >= ss2_size ) + + uint rest = ss1_size - ss2_size; + uint c; + + #ifndef __GNUC__ + #error "another compiler than GCC is currently not supported in 64bit mode" + #endif + + #ifdef __GNUC__ + + /* + the asm code is nearly the same as in AddVector + only two instructions 'adc' are changed to 'sbb' + */ + uint dummy1, dummy2, dummy3; + + __asm__ __volatile__( + "mov %%rdx, %%r8 \n" + "xor %%rdx, %%rdx \n" // rdx = 0, cf = 0 + "1: \n" + "mov (%%rsi,%%rdx,8), %%rax \n" + "sbb (%%rbx,%%rdx,8), %%rax \n" + "mov %%rax, (%%rdi,%%rdx,8) \n" + + "inc %%rdx \n" + "dec %%rcx \n" + "jnz 1b \n" + + "adc %%rcx, %%rcx \n" // rcx has the cf state + + "or %%r8, %%r8 \n" + "jz 3f \n" + + "xor %%rbx, %%rbx \n" + "sub %%rcx, %%rbx \n" // setting cf from rcx + "mov %%r8, %%rcx \n" // rcx=rest and is != 0 + "mov $0, %%rbx \n" + "2: \n" + "mov (%%rsi, %%rdx, 8), %%rax \n" + "sbb %%rbx, %%rax \n" + "mov %%rax, (%%rdi, %%rdx, 8) \n" + + "inc %%rdx \n" + "dec %%rcx \n" + "jnz 2b \n" + + "adc %%rcx, %%rcx \n" + "3: \n" + + : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3) + : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result) + : "%r8", "cc", "memory" ); + + #endif + + TTMATH_LOG("UInt::SubVector") + + return c; + } + + + /*! this method moves all bits into the left hand side return value <- this <- c @@ -431,7 +603,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Rcl2_one") + TTMATH_LOG("UInt::Rcl2_one") return c; } @@ -481,7 +653,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Rcr2_one") + TTMATH_LOG("UInt::Rcr2_one") return c; } @@ -553,7 +725,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Rcl2") + TTMATH_LOG("UInt::Rcl2") return c; } @@ -628,7 +800,7 @@ namespace ttmath #endif - TTMATH_LOG("UInt64::Rcr2") + TTMATH_LOG("UInt::Rcr2") return c; }