added: uint UInt::Mul3(const UInt<value_size> & ss2)

void UInt::Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
         a new multiplication algorithm: Karatsuba multiplication,
         on a vector UInt<100> with all items different from zero this algorithm is faster
         about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from
         zero this algorithm is faster more than 5 times than Mul2Big()
         (measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE)
added:   uint MulFastest(const UInt<value_size> & ss2)
         void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result)
         those methods are trying to select the fastest multiplication algorithm
changed: uint Mul(const UInt<value_size> & ss2, uint algorithm = 100)
         void MulBig(const UInt<value_size> & ss2, UInt<value_size*2> & result, uint algorithm = 100)
         those methods by default use MulFastest() and MulFastestBig()
changed: changed a little Mul2Big() to cooperate with Mul3Big()
changed: names of methods in macros TTMATH_LOG()
added:   uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
         uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
         three forms: asm x86, asm x86_64, no-asm
         those methods are used by the Karatsuba multiplication algorithm



git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@148 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Tomasz Sowa 2009-05-15 22:27:04 +00:00
parent 939d0f7519
commit eaa19dd46a
7 changed files with 1001 additions and 72 deletions

View File

@ -1,4 +1,4 @@
Version 0.8.5 (2009.05.11):
Version 0.8.5 prerelease (2009.05.15):
* fixed: Big::Mod(x) didn't correctly return a carry
and the result was sometimes very big (even greater than x)
* fixed: global function Mod(x) didn't set an ErrorCode object
@ -11,6 +11,25 @@ Version 0.8.5 (2009.05.11):
the same is to Cos() function
* changed: PrepareSin(x) is using Big::Mod() now when reducing 2PI period
should be a little accurate especially on a very big 'x'
* changed: uint Mul(const UInt<value_size> & ss2, uint algorithm = 100)
void MulBig(const UInt<value_size> & ss2, UInt<value_size*2> & result, uint algorithm = 100)
those methods by default use MulFastest() and MulFastestBig()
* changed: changed a little Mul2Big() to cooperate with Mul3Big()
* added: uint UInt::Mul3(const UInt<value_size> & ss2)
void UInt::Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
a new multiplication algorithm: Karatsuba multiplication,
on a vector UInt<100> with all items different from zero this algorithm is faster
about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from
zero this algorithm is faster more than 5 times than Mul2Big()
(measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE)
* added: uint MulFastest(const UInt<value_size> & ss2)
void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result)
those methods are trying to select the fastest multiplication algorithm
* added: uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
three forms: asm x86, asm x86_64, no-asm
those methods are used by the Karatsuba multiplication algorithm
Version 0.8.4 (2009.05.08):
* fixed: UInt::DivInt() didn't check whether the divisor is zero

View File

@ -1642,7 +1642,7 @@ public:
// MS Visual Express 2005 reports a warning (in the lines with 'uint man_diff = ...'):
// warning C4307: '*' : integral constant overflow
// but we're using 'if( man > another_man )' and 'if( man < another_man )' and there'll be no such a situation here
#ifndef __GNUC__
#ifdef _MSC_VER
#pragma warning( disable: 4307 )
#endif
@ -1658,7 +1658,7 @@ public:
c += exponent.AddInt(man_diff, 0);
}
#ifndef __GNUC__
#ifdef _MSC_VER
#pragma warning( default: 4307 )
#endif
@ -3426,9 +3426,9 @@ private:
it is called when the base is 10 and some digits were read before
*/
int FromString_ReadScientificIfExists(const char * & source)
uint FromString_ReadScientificIfExists(const char * & source)
{
int c = 0;
uint c = 0;
bool scientific_read = false;
const char * before_scientific = source;

View File

@ -237,6 +237,17 @@ namespace ttmath
/*!
this is a limit when calculating Karatsuba multiplication
if the size of a vector is smaller than TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE
the Karatsuba algorithm will use standard schoolbook multiplication
*/
#ifdef __GNUC__
#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 3
#else
#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 5
#endif
namespace ttmath
{

View File

@ -840,8 +840,10 @@ public:
/*!
the multiplication 'this' = 'this' * ss2
algorithm: 100 - means automatically choose the fastest algorithm
*/
uint Mul(const UInt<value_size> & ss2, uint algorithm = 2)
uint Mul(const UInt<value_size> & ss2, uint algorithm = 100)
{
switch( algorithm )
{
@ -849,8 +851,14 @@ public:
return Mul1(ss2);
case 2:
default:
return Mul2(ss2);
case 3:
return Mul3(ss2);
case 100:
default:
return MulFastest(ss2);
}
}
@ -860,10 +868,12 @@ public:
since the 'result' is twice bigger than 'this' and 'ss2'
this method never returns a carry
algorithm: 100 - means automatically choose the fastest algorithm
*/
void MulBig(const UInt<value_size> & ss2,
UInt<value_size*2> & result,
uint algorithm = 2)
uint algorithm = 100)
{
switch( algorithm )
{
@ -871,8 +881,14 @@ public:
return Mul1Big(ss2, result);
case 2:
default:
return Mul2Big(ss2, result);
case 3:
return Mul3Big(ss2, result);
case 100:
default:
return MulFastestBig(ss2, result);
}
}
@ -964,7 +980,7 @@ public:
uint Mul2(const UInt<value_size> & ss2)
{
UInt<value_size*2> result;
uint i;
uint i, c = 0;
Mul2Big(ss2, result);
@ -975,11 +991,14 @@ public:
// testing carry
for( ; i<value_size*2 ; ++i)
if( result.table[i] != 0 )
return 1;
{
c = 1;
break;
}
TTMATH_LOG("UInt::Mul2")
return 0;
return c;
}
@ -991,44 +1010,385 @@ public:
*/
void Mul2Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
{
uint r2,r1;
uint x1size=value_size, x2size=value_size;
uint x1start=0, x2start=0;
result.SetZero();
if( value_size > 2 )
{
// if the value_size is smaller than or equal to 2
// there is no sense to set x1size (and others) to another values
for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size);
for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size);
if( x1size==0 || x2size==0 )
{
TTMATH_LOG("UInt::Mul2Big")
return;
}
for(x1start=0 ; x1start<x1size && table[x1start]==0 ; ++x1start);
for(x2start=0 ; x2start<x2size && ss2.table[x2start]==0 ; ++x2start);
}
for(uint x1=x1start ; x1<x1size ; ++x1)
{
for(uint x2=x2start ; x2<x2size ; ++x2)
{
MulTwoWords(table[x1], ss2.table[x2], &r2, &r1);
result.AddTwoInts(r2,r1,x2+x1);
// here will never be a carry
}
}
Mul2Big2<value_size>(table, ss2.table, result);
TTMATH_LOG("UInt::Mul2Big")
}
private:
/*!
an auxiliary method for calculating the multiplication
arguments we're taking as pointers (this is to improve the Mul3Big2()- avoiding
unnecessary copying objects), the result should be taken as a pointer too,
but at the moment there is no method AddTwoInts() which can operate on pointers
*/
template<uint ss_size>
void Mul2Big2(const uint * ss1, const uint * ss2, UInt<ss_size*2> & result)
{
uint x1size = ss_size, x2size = ss_size;
uint x1start = 0, x2start = 0;
if( ss_size > 2 )
{
// if the ss_size is smaller than or equal to 2
// there is no sense to set x1size (and others) to another values
for(x1size=ss_size ; x1size>0 && ss1[x1size-1]==0 ; --x1size);
for(x2size=ss_size ; x2size>0 && ss2[x2size-1]==0 ; --x2size);
for(x1start=0 ; x1start<x1size && ss1[x1start]==0 ; ++x1start);
for(x2start=0 ; x2start<x2size && ss2[x2start]==0 ; ++x2start);
}
Mul2Big3<ss_size>(ss1, ss2, result, x1start, x1size, x2start, x2size);
}
/*!
an auxiliary method for calculating the multiplication
*/
template<uint ss_size>
void Mul2Big3(const uint * ss1, const uint * ss2, UInt<ss_size*2> & result, uint x1start, uint x1size, uint x2start, uint x2size)
{
uint r2, r1;
result.SetZero();
if( x1size==0 || x2size==0 )
return;
for(uint x1=x1start ; x1<x1size ; ++x1)
{
for(uint x2=x2start ; x2<x2size ; ++x2)
{
MulTwoWords(ss1[x1], ss2[x2], &r2, &r1);
result.AddTwoInts(r2, r1, x2+x1);
// here will never be a carry
}
}
}
public:
/*!
multiplication: this = this * ss2
This is Karatsuba Multiplication algorithm, we're using it when value_size is greater than
or equal to TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE macro (defined in ttmathuint.h).
If value_size is smaller then we're using Mul2Big() instead.
Karatsuba multiplication:
Assume we have:
this = x = x1*B^m + x0
ss2 = y = y1*B^m + y0
where x0 and y0 are less than B^m
the product from multiplication we can show as:
x*y = (x1*B^m + x0)(y1*B^m + y0) = z2*B^(2m) + z1*B^m + z0
where
z2 = x1*y1
z1 = x1*y0 + x0*y1
z0 = x0*y0
this is standard schoolbook algorithm with O(n^2), Karatsuba observed that z1 can be given in other form:
z1 = (x1 + x0)*(y1 + y0) - z2 - z0 / z1 = (x1*y1 + x1*y0 + x0*y1 + x0*y0) - x1*y1 - x0*y0 = x1*y0 + x0*y1 /
and to calculate the multiplication we need only three multiplications (with some additions and subtractions)
Our objects 'this' and 'ss2' we divide into two parts and by using recurrence we calculate the multiplication.
Karatsuba multiplication has O( n^(ln(3)/ln(2)) )
*/
uint Mul3(const UInt<value_size> & ss2)
{
UInt<value_size*2> result;
uint i, c = 0;
Mul3Big(ss2, result);
// copying result
for(i=0 ; i<value_size ; ++i)
table[i] = result.table[i];
// testing carry
for( ; i<value_size*2 ; ++i)
if( result.table[i] != 0 )
{
c = 1;
break;
}
TTMATH_LOG("UInt::Mul3")
return c;
}
/*!
multiplication: result = this * ss2
result is twice bigger than this and ss2,
this method never returns carry,
(Karatsuba multiplication)
*/
void Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
{
Mul3Big2<value_size>(table, ss2.table, result.table);
TTMATH_LOG("UInt::Mul3Big")
}
private:
/*!
an auxiliary method for calculating the Karatsuba multiplication
result_size is equal ss_size*2
*/
template<uint ss_size>
void Mul3Big2(const uint * ss1, const uint * ss2, uint * result)
{
const uint * x1, * x0, * y1, * y0;
if( ss_size>1 && ss_size<TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE )
{
UInt<ss_size*2> res;
Mul2Big2<ss_size>(ss1, ss2, res);
for(uint i=0 ; i<ss_size*2 ; ++i)
result[i] = res.table[i];
return;
}
else
if( ss_size == 1 )
{
return MulTwoWords(*ss1, *ss2, &result[1], &result[0]);
}
if( (ss_size & 1) == 1 )
{
// ss_size is odd
x0 = ss1;
y0 = ss2;
x1 = ss1 + ss_size / 2 + 1;
y1 = ss2 + ss_size / 2 + 1;
// the second vectors (x1 and y1) are smaller about one from the first ones (x0 and y0)
Mul3Big3<ss_size/2 + 1, ss_size/2, ss_size*2>(x1, x0, y1, y0, result);
}
else
{
// ss_size is even
x0 = ss1;
y0 = ss2;
x1 = ss1 + ss_size / 2;
y1 = ss2 + ss_size / 2;
// all four vectors (x0 x1 y0 y1) are equal in size
Mul3Big3<ss_size/2, ss_size/2, ss_size*2>(x1, x0, y1, y0, result);
}
}
#ifdef _MSC_VER
#pragma warning (disable : 4717)
//warning C4717: recursive on all control paths, function will cause runtime stack overflow
//we have the stop point in Mul3Big2() method
#endif
/*!
an auxiliary method for calculating the Karatsuba multiplication
x = x1*B^m + x0
y = y1*B^m + y0
first_size - is the size of vectors: x0 and y0
second_size - is the size of vectors: x1 and y1 (can be either equal first_size or smaller about one from first_size)
x*y = (x1*B^m + x0)(y1*B^m + y0) = z2*B^(2m) + z1*B^m + z0
where
z0 = x0*y0
z2 = x1*y1
z1 = (x1 + x0)*(y1 + y0) - z2 - z0
*/
template<uint first_size, uint second_size, uint result_size>
void Mul3Big3(const uint * x1, const uint * x0, const uint * y1, const uint * y0, uint * result)
{
uint i, c, xc, yc;
UInt<first_size> temp, temp2;
UInt<first_size*3> z1;
// z0 and z2 we store directly in the result (we don't use any temporary variables)
Mul3Big2<first_size>(x0, y0, result); // z0
Mul3Big2<second_size>(x1, y1, result+first_size*2); // z2
// now we calculate z1
// temp = (x0 + x1)
// temp2 = (y0 + y1)
// we're using temp and temp2 with UInt<first_size>, although there can be a carry but
// we simple remember it in xc and yc (xc and yc can be either 0 or 1),
// and (x0 + x1)*(y0 + y1) we calculate in this way (schoolbook algorithm):
//
// xc | temp
// yc | temp2
// --------------------
// (temp * temp2)
// xc*temp2 |
// yc*temp |
// xc*yc |
// ---------- z1 --------
//
// and the result is never larger in size than 3*first_size
xc = AddVector(x0, x1, first_size, second_size, temp.table);
yc = AddVector(y0, y1, first_size, second_size, temp2.table);
Mul3Big2<first_size>(temp.table, temp2.table, z1.table);
// clearing the rest of z1
for(i=first_size*2 ; i<first_size*3 ; ++i)
z1.table[i] = 0;
if( xc )
{
c = AddVector(z1.table+first_size, temp2.table, first_size*3-first_size, first_size, z1.table+first_size);
TTMATH_ASSERT( c==0 )
}
if( yc )
{
c = AddVector(z1.table+first_size, temp.table, first_size*3-first_size, first_size, z1.table+first_size);
TTMATH_ASSERT( c==0 )
}
if( xc && yc )
{
for( i=first_size*2 ; i<first_size*3 ; ++i )
if( ++z1.table[i] != 0 )
break; // break if there was no carry
}
// z1 = z1 - z2
c = SubVector(z1.table, result+first_size*2, first_size*3, second_size*2, z1.table);
TTMATH_ASSERT(c==0)
// z1 = z1 - z0
c = SubVector(z1.table, result, first_size*3, first_size*2, z1.table);
TTMATH_ASSERT(c==0)
// here we've calculated the z1
// now we're adding it to the result
if( first_size > second_size )
{
uint z1_size = result_size - first_size;
TTMATH_ASSERT( z1_size <= first_size*3 )
for(i=z1_size ; i<first_size*3 ; ++i)
TTMATH_ASSERT( z1.table[i] == 0 )
;
c = AddVector(result+first_size, z1.table, result_size-first_size, z1_size, result+first_size);
TTMATH_ASSERT(c==0)
}
else
{
c = AddVector(result+first_size, z1.table, result_size-first_size, first_size*3, result+first_size);
TTMATH_ASSERT(c==0)
}
}
#ifdef _MSC_VER
#pragma warning (default : 4717)
#endif
public:
/*!
multiplication this = this * ss2
*/
uint MulFastest(const UInt<value_size> & ss2)
{
UInt<value_size*2> result;
uint i, c = 0;
MulFastestBig(ss2, result);
// copying result
for(i=0 ; i<value_size ; ++i)
table[i] = result.table[i];
// testing carry
for( ; i<value_size*2 ; ++i)
if( result.table[i] != 0 )
{
c = 1;
break;
}
TTMATH_LOG("UInt::MulFastest")
return c;
}
/*!
multiplication result = this * ss2
this method is trying to select the fastest algorithm
(in the future this method can be improved)
*/
void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result)
{
if( value_size < TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE )
return Mul2Big(ss2, result);
uint x1size = value_size, x2size = value_size;
uint x1start = 0, x2start = 0;
for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size);
for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size);
if( x1size==0 || x2size==0 )
{
// either 'this' or 'ss2' is equal zero - the result is zero too
result.SetZero();
return;
}
for(x1start=0 ; x1start<x1size && table[x1start]==0 ; ++x1start);
for(x2start=0 ; x2start<x2size && ss2.table[x2start]==0 ; ++x2start);
uint distancex1 = x1size - x1start;
uint distancex2 = x2size - x2start;
if( distancex1 < 3 || distancex2 < 3 )
// either 'this' or 'ss2' have only 2 (or 1) item different from zero (side by side)
// (this condition in the future can be improved)
return Mul2Big3<value_size>(table, ss2.table, result, x1start, x1size, x2start, x2size);
// Karatsuba multiplication
Mul3Big(ss2, result);
TTMATH_LOG("UInt::MulFastestBig")
}
/*!
@ -2909,12 +3269,13 @@ private:
uint Rcr2(uint bits, uint c);
public:
uint Add(const UInt<value_size> & ss2, uint c=0);
uint AddInt(uint value, uint index = 0);
uint AddTwoInts(uint x2, uint x1, uint index);
static uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
uint Sub(const UInt<value_size> & ss2, uint c=0);
uint SubInt(uint value, uint index = 0);
static uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
static sint FindLeadingBitInWord(uint x);
static uint SetBitInWord(uint & value, uint bit);
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
@ -2922,6 +3283,23 @@ public:
};
/*!
this specialization is needed in order to not confused the compiler "error: ISO C++ forbids zero-size array"
when compiling Mul3Big2() method
*/
template<>
class UInt<0>
{
public:
uint table[1];
void Mul2Big(const UInt<0> &, UInt<0> &) { TTMATH_ASSERT(false) };
void SetZero() { TTMATH_ASSERT(false) };
uint AddTwoInts(uint, uint, uint) { TTMATH_ASSERT(false) return 0; };
};
} //namespace

View File

@ -95,7 +95,7 @@ namespace ttmath
for(i=0 ; i<value_size ; ++i)
c = AddTwoWords(table[i], ss2.table[i], c, &table[i]);
TTMATH_LOG("UInt_noasm::Add")
TTMATH_LOG("UInt::Add")
return c;
}
@ -131,7 +131,7 @@ namespace ttmath
for(i=index+1 ; i<value_size && c ; ++i)
c = AddTwoWords(table[i], 0, c, &table[i]);
TTMATH_LOG("UInt_noasm::AddInt")
TTMATH_LOG("UInt::AddInt")
return c;
}
@ -184,13 +184,54 @@ namespace ttmath
for(i=index+2 ; i<value_size && c ; ++i)
c = AddTwoWords(table[i], 0, c, &table[i]);
TTMATH_LOG("UInt64::AddTwoInts")
TTMATH_LOG("UInt::AddTwoInts")
return c;
}
/*!
this static method addes one vector to the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5+1
4 3 4+3
2 7 2+7
6 6
9 9
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
for( ; i<ss1_size ; ++i)
c = AddTwoWords(ss1[i], 0, c, &result[i]);
TTMATH_LOG("UInt::AddVector")
return c;
}
template<uint value_size>
uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
{
@ -232,7 +273,7 @@ namespace ttmath
for(i=0 ; i<value_size ; ++i)
c = SubTwoWords(table[i], ss2.table[i], c, &table[i]);
TTMATH_LOG("UInt_noasm::Sub")
TTMATH_LOG("UInt::Sub")
return c;
}
@ -270,12 +311,51 @@ namespace ttmath
for(i=index+1 ; i<value_size && c ; ++i)
c = SubTwoWords(table[i], 0, c, &table[i]);
TTMATH_LOG("UInt_noasm::SubInt")
TTMATH_LOG("UInt::SubInt")
return c;
}
/*!
this static method subtractes one vector from the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5-1
4 3 4-3
2 7 2-7
6 6-1 (the borrow from previous item)
9 9
return (carry): 0
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
for( ; i<ss1_size ; ++i)
c = SubTwoWords(ss1[i], 0, c, &result[i]);
TTMATH_LOG("UInt::SubVector")
return c;
}
/*!
@ -305,7 +385,7 @@ namespace ttmath
c = new_c;
}
TTMATH_LOG("UInt64::Rcl2_one")
TTMATH_LOG("UInt::Rcl2_one")
return c;
}
@ -344,7 +424,7 @@ namespace ttmath
c = new_c;
}
TTMATH_LOG("UInt64::Rcr2_one")
TTMATH_LOG("UInt::Rcr2_one")
return c;
}
@ -421,7 +501,7 @@ namespace ttmath
c = new_c;
}
TTMATH_LOG("UInt64::Rcr2")
TTMATH_LOG("UInt::Rcr2")
return (c & TTMATH_UINT_HIGHEST_BIT) ? 1 : 0;
}

View File

@ -162,7 +162,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Add")
TTMATH_LOG("UInt::Add")
return c;
}
@ -267,7 +267,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::AddInt")
TTMATH_LOG("UInt::AddInt")
return c;
}
@ -392,13 +392,143 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::AddTwoInts")
TTMATH_LOG("UInt::AddTwoInts")
return c;
}
/*!
this static method addes one vector to the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5+1
4 3 4+3
2 7 2+7
6 6
9 9
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
// this part might be compiled with for example visual c
__asm
{
pushad
mov ecx, [ss2_size]
xor edx, edx // edx = 0, cf = 0
mov esi, [ss1]
mov ebx, [ss2]
mov edi, [result]
p:
mov eax, [esi+edx*4]
adc eax, [ebx+edx*4]
mov [edi+edx*4], eax
inc edx
dec ecx
jnz p
adc ecx, ecx // ecx has the cf state
mov ebx, [rest]
or ebx, ebx
jz end
xor ebx, ebx
sub ebx, ecx // setting cf from ecx
mov ecx, [rest] // ecx is != 0
mov ebx, 0
p2:
mov eax, [esi+edx*4]
adc eax, ebx
mov [edi+edx*4], eax
inc edx
dec ecx
jnz p2
adc ecx, ecx
end:
mov [c], ecx
popad
}
#endif
#ifdef __GNUC__
// this part should be compiled with gcc
uint dummy1, dummy2, dummy3;
__asm__ __volatile__(
"push %%edx \n"
"xor %%edx, %%edx \n" // edx = 0, cf = 0
"1: \n"
"mov (%%esi,%%edx,4), %%eax \n"
"adc (%%ebx,%%edx,4), %%eax \n"
"mov %%eax, (%%edi,%%edx,4) \n"
"inc %%edx \n"
"dec %%ecx \n"
"jnz 1b \n"
"adc %%ecx, %%ecx \n" // ecx has the cf state
"pop %%eax \n" // eax = rest
"or %%eax, %%eax \n"
"jz 3f \n"
"xor %%ebx, %%ebx \n"
"sub %%ecx, %%ebx \n" // setting cf from ecx
"mov %%eax, %%ecx \n" // ecx=rest and is != 0
"mov $0, %%ebx \n"
"2: \n"
"mov (%%esi, %%edx, 4), %%eax \n"
"adc %%ebx, %%eax \n"
"mov %%eax, (%%edi, %%edx, 4) \n"
"inc %%edx \n"
"dec %%ecx \n"
"jnz 2b \n"
"adc %%ecx, %%ecx \n"
"3: \n"
: "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
: "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
: "cc", "memory" );
#endif
TTMATH_LOG("UInt::AddVector")
return c;
}
/*!
@ -489,7 +619,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Sub")
TTMATH_LOG("UInt::Sub")
return c;
}
@ -593,13 +723,152 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::SubInt")
TTMATH_LOG("UInt::SubInt")
return c;
}
/*!
this static method subtractes one vector from the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5-1
4 3 4-3
2 7 2-7
6 6-1 (the borrow from previous item)
9 9
return (carry): 0
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
// this part might be compiled with for example visual c
/*
the asm code is nearly the same as in AddVector
only two instructions 'adc' are changed to 'sbb'
*/
__asm
{
pushad
mov ecx, [ss2_size]
xor edx, edx // edx = 0, cf = 0
mov esi, [ss1]
mov ebx, [ss2]
mov edi, [result]
p:
mov eax, [esi+edx*4]
sbb eax, [ebx+edx*4]
mov [edi+edx*4], eax
inc edx
dec ecx
jnz p
adc ecx, ecx // ecx has the cf state
mov ebx, [rest]
or ebx, ebx
jz end
xor ebx, ebx
sub ebx, ecx // setting cf from ecx
mov ecx, [rest] // ecx is != 0
mov ebx, 0
p2:
mov eax, [esi+edx*4]
sbb eax, ebx
mov [edi+edx*4], eax
inc edx
dec ecx
jnz p2
adc ecx, ecx
end:
mov [c], ecx
popad
}
#endif
#ifdef __GNUC__
// this part should be compiled with gcc
uint dummy1, dummy2, dummy3;
__asm__ __volatile__(
"push %%edx \n"
"xor %%edx, %%edx \n" // edx = 0, cf = 0
"1: \n"
"mov (%%esi,%%edx,4), %%eax \n"
"sbb (%%ebx,%%edx,4), %%eax \n"
"mov %%eax, (%%edi,%%edx,4) \n"
"inc %%edx \n"
"dec %%ecx \n"
"jnz 1b \n"
"adc %%ecx, %%ecx \n" // ecx has the cf state
"pop %%eax \n" // eax = rest
"or %%eax, %%eax \n"
"jz 3f \n"
"xor %%ebx, %%ebx \n"
"sub %%ecx, %%ebx \n" // setting cf from ecx
"mov %%eax, %%ecx \n" // ecx=rest and is != 0
"mov $0, %%ebx \n"
"2: \n"
"mov (%%esi, %%edx, 4), %%eax \n"
"sbb %%ebx, %%eax \n"
"mov %%eax, (%%edi, %%edx, 4) \n"
"inc %%edx \n"
"dec %%ecx \n"
"jnz 2b \n"
"adc %%ecx, %%ecx \n"
"3: \n"
: "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
: "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
: "cc", "memory" );
#endif
TTMATH_LOG("UInt::SubVector")
return c;
}
/*!
this method moves all bits into the left hand side
return value <- this <- c
@ -680,7 +949,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Rcl2_one")
TTMATH_LOG("UInt::Rcl2_one")
return c;
}
@ -758,7 +1027,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Rcr2_one")
TTMATH_LOG("UInt::Rcr2_one")
return c;
}
@ -886,7 +1155,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Rcl2")
TTMATH_LOG("UInt::Rcl2")
return c;
}
@ -1021,7 +1290,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt32::Rcr2")
TTMATH_LOG("UInt::Rcr2")
return c;
}

View File

@ -112,7 +112,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Add")
TTMATH_LOG("UInt::Add")
return c;
}
@ -178,7 +178,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::AddInt")
TTMATH_LOG("UInt::AddInt")
return c;
}
@ -259,7 +259,90 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::AddTwoInts")
TTMATH_LOG("UInt::AddTwoInts")
return c;
}
/*!
this static method addes one vector to the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5+1
4 3 4+3
2 7 2+7
6 6
9 9
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#endif
#ifdef __GNUC__
// this part should be compiled with gcc
uint dummy1, dummy2, dummy3;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
"xor %%rdx, %%rdx \n" // rdx = 0, cf = 0
"1: \n"
"mov (%%rsi,%%rdx,8), %%rax \n"
"adc (%%rbx,%%rdx,8), %%rax \n"
"mov %%rax, (%%rdi,%%rdx,8) \n"
"inc %%rdx \n"
"dec %%rcx \n"
"jnz 1b \n"
"adc %%rcx, %%rcx \n" // rcx has the cf state
"or %%r8, %%r8 \n"
"jz 3f \n"
"xor %%rbx, %%rbx \n"
"sub %%rcx, %%rbx \n" // setting cf from rcx
"mov %%r8, %%rcx \n" // rcx=rest and is != 0
"mov $0, %%rbx \n"
"2: \n"
"mov (%%rsi, %%rdx, 8), %%rax \n"
"adc %%rbx, %%rax \n"
"mov %%rax, (%%rdi, %%rdx, 8) \n"
"inc %%rdx \n"
"dec %%rcx \n"
"jnz 2b \n"
"adc %%rcx, %%rcx \n"
"3: \n"
: "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
: "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
: "%r8", "cc", "memory" );
#endif
TTMATH_LOG("UInt::AddVector")
return c;
}
@ -316,7 +399,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Sub")
TTMATH_LOG("UInt::Sub")
return c;
}
@ -379,12 +462,101 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::SubInt")
TTMATH_LOG("UInt::SubInt")
return c;
}
/*!
this static method subtractes one vector from the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5-1
4 3 4-3
2 7 2-7
6 6-1 (the borrow from previous item)
9 9
return (carry): 0
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#endif
#ifdef __GNUC__
/*
the asm code is nearly the same as in AddVector
only two instructions 'adc' are changed to 'sbb'
*/
uint dummy1, dummy2, dummy3;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
"xor %%rdx, %%rdx \n" // rdx = 0, cf = 0
"1: \n"
"mov (%%rsi,%%rdx,8), %%rax \n"
"sbb (%%rbx,%%rdx,8), %%rax \n"
"mov %%rax, (%%rdi,%%rdx,8) \n"
"inc %%rdx \n"
"dec %%rcx \n"
"jnz 1b \n"
"adc %%rcx, %%rcx \n" // rcx has the cf state
"or %%r8, %%r8 \n"
"jz 3f \n"
"xor %%rbx, %%rbx \n"
"sub %%rcx, %%rbx \n" // setting cf from rcx
"mov %%r8, %%rcx \n" // rcx=rest and is != 0
"mov $0, %%rbx \n"
"2: \n"
"mov (%%rsi, %%rdx, 8), %%rax \n"
"sbb %%rbx, %%rax \n"
"mov %%rax, (%%rdi, %%rdx, 8) \n"
"inc %%rdx \n"
"dec %%rcx \n"
"jnz 2b \n"
"adc %%rcx, %%rcx \n"
"3: \n"
: "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
: "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
: "%r8", "cc", "memory" );
#endif
TTMATH_LOG("UInt::SubVector")
return c;
}
/*!
this method moves all bits into the left hand side
return value <- this <- c
@ -431,7 +603,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Rcl2_one")
TTMATH_LOG("UInt::Rcl2_one")
return c;
}
@ -481,7 +653,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Rcr2_one")
TTMATH_LOG("UInt::Rcr2_one")
return c;
}
@ -553,7 +725,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Rcl2")
TTMATH_LOG("UInt::Rcl2")
return c;
}
@ -628,7 +800,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Rcr2")
TTMATH_LOG("UInt::Rcr2")
return c;
}