added: uint UInt::Mul3(const UInt<value_size> & ss2)

void UInt::Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result) a new multiplication algorithm: Karatsuba multiplication, on a vector UInt<100> with all items different from zero this algorithm is faster about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from zero this algorithm is faster more than 5 times than Mul2Big() (measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE) added: uint MulFastest(const UInt<value_size> & ss2) void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result) those methods are trying to select the fastest multiplication algorithm changed: uint Mul(const UInt<value_size> & ss2, uint algorithm = 100) void MulBig(const UInt<value_size> & ss2, UInt<value_size*2> & result, uint algorithm = 100) those methods by default use MulFastest() and MulFastestBig() changed: changed a little Mul2Big() to cooperate with Mul3Big() changed: names of methods in macros TTMATH_LOG() added: uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result) three forms: asm x86, asm x86_64, no-asm those methods are used by the Karatsuba multiplication algorithm git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@148 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-05-15 22:27:04 +00:00 · 2009-05-15 22:27:04 +00:00 · eaa19dd46a
parent 939d0f7519
commit eaa19dd46a
7 changed files with 1001 additions and 72 deletions
--- a/21
+++ b/21
@ -1,4 +1,4 @@
-Version 0.8.5 (2009.05.11):
+Version 0.8.5 prerelease (2009.05.15):
    * fixed:   Big::Mod(x) didn't correctly return a carry
               and the result was sometimes very big (even greater than x)
    * fixed:   global function Mod(x) didn't set an ErrorCode object
@ -11,6 +11,25 @@ Version 0.8.5 (2009.05.11):
               the same is to Cos() function
    * changed: PrepareSin(x) is using Big::Mod() now when reducing 2PI period
               should be a little accurate especially on a very big 'x'
+    * changed: uint Mul(const UInt<value_size> & ss2, uint algorithm = 100)
+               void MulBig(const UInt<value_size> & ss2, UInt<value_size*2> & result, uint algorithm = 100)
+               those methods by default use MulFastest() and MulFastestBig()
+    * changed: changed a little Mul2Big() to cooperate with Mul3Big()
+    * added:   uint UInt::Mul3(const UInt<value_size> & ss2)
+               void UInt::Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
+               a new multiplication algorithm: Karatsuba multiplication,
+               on a vector UInt<100> with all items different from zero this algorithm is faster
+               about 3 times than Mul2Big(), and on a vector UInt<1000> with all items different from
+               zero this algorithm is faster more than 5 times than Mul2Big()
+               (measured on 32bit platform with GCC 4.3.3 with -O3 and -DTTMATH_RELEASE)
+    * added:   uint MulFastest(const UInt<value_size> & ss2)
+               void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result)
+               those methods are trying to select the fastest multiplication algorithm
+    * added:   uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+               uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+               three forms: asm x86, asm x86_64, no-asm
+               those methods are used by the Karatsuba multiplication algorithm
+               
               
 Version 0.8.4 (2009.05.08):
    * fixed:   UInt::DivInt() didn't check whether the divisor is zero
--- a/ttmath/ttmathbig.h
+++ b/ttmath/ttmathbig.h
@ -1642,7 +1642,7 @@ public:
 		// MS Visual Express 2005 reports a warning (in the lines with 'uint man_diff = ...'):
 		// warning C4307: '*' : integral constant overflow
 		// but we're using 'if( man > another_man )' and 'if( man < another_man )' and there'll be no such a situation here
-		#ifndef __GNUC__
+		#ifdef _MSC_VER
 		#pragma warning( disable: 4307 )
 		#endif

@ -1658,7 +1658,7 @@ public:
 			c += exponent.AddInt(man_diff, 0);
 		}
 		
-		#ifndef __GNUC__
+		#ifdef _MSC_VER
 		#pragma warning( default: 4307 )
 		#endif

@ -3426,9 +3426,9 @@ private:

 		it is called when the base is 10 and some digits were read before
 	*/
-	int FromString_ReadScientificIfExists(const char * & source)
+	uint FromString_ReadScientificIfExists(const char * & source)
 	{
-	int c = 0;
+	uint c = 0;

 		bool scientific_read = false;
 		const char * before_scientific = source;
--- a/ttmath/ttmathtypes.h
+++ b/ttmath/ttmathtypes.h
@ -237,6 +237,17 @@ namespace ttmath



+/*!
+	this is a limit when calculating Karatsuba multiplication
+	if the size of a vector is smaller than TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE
+	the Karatsuba algorithm will use standard schoolbook multiplication
+*/
+#ifdef __GNUC__
+#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 3
+#else
+#define TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE 5
+#endif
+

 namespace ttmath
 {
--- a/ttmath/ttmathuint.h
+++ b/ttmath/ttmathuint.h
@ -840,8 +840,10 @@ public:

 	/*!
 		the multiplication 'this' = 'this' * ss2
+
+		algorithm: 100 - means automatically choose the fastest algorithm
 	*/
-	uint Mul(const UInt<value_size> & ss2, uint algorithm = 2)
+	uint Mul(const UInt<value_size> & ss2, uint algorithm = 100)
 	{
 		switch( algorithm )
 		{
@ -849,8 +851,14 @@ public:
 			return Mul1(ss2);

 		case 2:
-		default:
 			return Mul2(ss2);
+
+		case 3:
+			return Mul3(ss2);
+
+		case 100:
+		default:
+			return MulFastest(ss2);
 		}
 	}

@ -860,10 +868,12 @@ public:

 		since the 'result' is twice bigger than 'this' and 'ss2' 
 		this method never returns a carry
+
+		algorithm: 100 - means automatically choose the fastest algorithm
 	*/
 	void MulBig(const UInt<value_size> & ss2,
 				UInt<value_size*2> & result, 
-				uint algorithm = 2)
+				uint algorithm = 100)
 	{
 		switch( algorithm )
 		{
@ -871,8 +881,14 @@ public:
 			return Mul1Big(ss2, result);

 		case 2:
-		default:
 			return Mul2Big(ss2, result);
+
+		case 3:
+			return Mul3Big(ss2, result);
+
+		case 100:
+		default:
+			return MulFastestBig(ss2, result);
 		}
 	}

@ -964,7 +980,7 @@ public:
 	uint Mul2(const UInt<value_size> & ss2)
 	{
 	UInt<value_size*2> result;
-	uint i;
+	uint i, c = 0;

 		Mul2Big(ss2, result);
 	
@ -975,11 +991,14 @@ public:
 		// testing carry
 		for( ; i<value_size*2 ; ++i)
 			if( result.table[i] != 0 )
-				return 1;
+			{
+				c = 1;
+				break;
+			}

 		TTMATH_LOG("UInt::Mul2")

-	return 0;
+	return c;
 	}


@ -991,44 +1010,385 @@ public:
 	*/
 	void Mul2Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
 	{
-	uint r2,r1;
-	uint x1size=value_size, x2size=value_size;
-	uint x1start=0, x2start=0;
-
-		result.SetZero();
-
-		if( value_size > 2 )
-		{	
-			// if the value_size is smaller than or equal to 2
-			// there is no sense to set x1size (and others) to another values
-
-			for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size);
-			for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size);
-
-			if( x1size==0 || x2size==0 )
-			{
-				TTMATH_LOG("UInt::Mul2Big")
-				return;
-			}
-
-			for(x1start=0 ; x1start<x1size && table[x1start]==0 ; ++x1start);
-			for(x2start=0 ; x2start<x2size && ss2.table[x2start]==0 ; ++x2start);
-		}
-
-		for(uint x1=x1start ; x1<x1size ; ++x1)
-		{
-			for(uint x2=x2start ; x2<x2size ; ++x2)
-			{
-				MulTwoWords(table[x1], ss2.table[x2], &r2, &r1);
-				result.AddTwoInts(r2,r1,x2+x1);
-				// here will never be a carry
-			}
-		}
+		Mul2Big2<value_size>(table, ss2.table, result);

 		TTMATH_LOG("UInt::Mul2Big")
 	}


+private:
+
+	/*!
+		an auxiliary method for calculating the multiplication 
+
+		arguments we're taking as pointers (this is to improve the Mul3Big2()- avoiding
+		unnecessary copying objects), the result should be taken as a pointer too,
+		but at the moment there is no method AddTwoInts() which can operate on pointers
+	*/
+	template<uint ss_size>
+	void Mul2Big2(const uint * ss1, const uint * ss2, UInt<ss_size*2> & result)
+	{
+	uint x1size  = ss_size, x2size  = ss_size;
+	uint x1start = 0,       x2start = 0;
+
+		if( ss_size > 2 )
+		{	
+			// if the ss_size is smaller than or equal to 2
+			// there is no sense to set x1size (and others) to another values
+
+			for(x1size=ss_size ; x1size>0 && ss1[x1size-1]==0 ; --x1size);
+			for(x2size=ss_size ; x2size>0 && ss2[x2size-1]==0 ; --x2size);
+
+			for(x1start=0 ; x1start<x1size && ss1[x1start]==0 ; ++x1start);
+			for(x2start=0 ; x2start<x2size && ss2[x2start]==0 ; ++x2start);
+		}
+
+		Mul2Big3<ss_size>(ss1, ss2, result, x1start, x1size, x2start, x2size);
+	}
+
+
+
+	/*!
+		an auxiliary method for calculating the multiplication 
+	*/
+	template<uint ss_size>
+	void Mul2Big3(const uint * ss1, const uint * ss2, UInt<ss_size*2> & result, uint x1start, uint x1size, uint x2start, uint x2size)
+	{
+	uint r2, r1;
+
+		result.SetZero();
+
+		if( x1size==0 || x2size==0 )
+			return;
+
+		for(uint x1=x1start ; x1<x1size ; ++x1)
+		{
+			for(uint x2=x2start ; x2<x2size ; ++x2)
+			{
+				MulTwoWords(ss1[x1], ss2[x2], &r2, &r1);
+				result.AddTwoInts(r2, r1, x2+x1);
+				// here will never be a carry
+			}
+		}
+	}
+
+
+public:
+
+
+	/*!
+		multiplication: this = this * ss2
+
+		This is Karatsuba Multiplication algorithm, we're using it when value_size is greater than
+		or equal to TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE macro (defined in ttmathuint.h).
+		If value_size is smaller then we're using Mul2Big() instead.
+
+		Karatsuba multiplication:
+		Assume we have:
+			this = x = x1*B^m + x0
+			ss2  = y = y1*B^m + y0
+		where x0 and y0 are less than B^m
+		the product from multiplication we can show as:
+	    x*y = (x1*B^m + x0)(y1*B^m + y0) = z2*B^(2m) + z1*B^m + z0
+		where
+		    z2 = x1*y1
+			z1 = x1*y0 + x0*y1
+			z0 = x0*y0 
+		this is standard schoolbook algorithm with O(n^2), Karatsuba observed that z1 can be given in other form:
+			z1 = (x1 + x0)*(y1 + y0) - z2 - z0    / z1 = (x1*y1 + x1*y0 + x0*y1 + x0*y0) - x1*y1 - x0*y0 = x1*y0 + x0*y1 /
+		and to calculate the multiplication we need only three multiplications (with some additions and subtractions)			
+
+		Our objects 'this' and 'ss2' we divide into two parts and by using recurrence we calculate the multiplication.
+		Karatsuba multiplication has O( n^(ln(3)/ln(2)) )
+	*/
+	uint Mul3(const UInt<value_size> & ss2)
+	{
+	UInt<value_size*2> result;
+	uint i, c = 0;
+
+		Mul3Big(ss2, result);
+	
+		// copying result
+		for(i=0 ; i<value_size ; ++i)
+			table[i] = result.table[i];
+
+		// testing carry
+		for( ; i<value_size*2 ; ++i)
+			if( result.table[i] != 0 )
+			{
+				c = 1;
+				break;
+			}
+
+		TTMATH_LOG("UInt::Mul3")
+
+	return c;
+	}
+
+
+
+	/*!
+		multiplication: result = this * ss2
+
+		result is twice bigger than this and ss2,
+		this method never returns carry,
+		(Karatsuba multiplication)
+	*/
+	void Mul3Big(const UInt<value_size> & ss2, UInt<value_size*2> & result)
+	{
+		Mul3Big2<value_size>(table, ss2.table, result.table);
+
+		TTMATH_LOG("UInt::Mul3Big")
+	}
+
+
+
+private:
+
+	/*!
+		an auxiliary method for calculating the Karatsuba multiplication
+
+		result_size is equal ss_size*2
+	*/
+	template<uint ss_size>
+	void Mul3Big2(const uint * ss1, const uint * ss2, uint * result)
+	{
+	const uint * x1, * x0, * y1, * y0;
+
+
+		if( ss_size>1 && ss_size<TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE )
+		{
+			UInt<ss_size*2> res;
+			Mul2Big2<ss_size>(ss1, ss2, res);
+
+			for(uint i=0 ; i<ss_size*2 ; ++i)
+				result[i] = res.table[i];
+
+		return;
+		}
+		else
+		if( ss_size == 1 )
+		{
+			return MulTwoWords(*ss1, *ss2, &result[1], &result[0]);
+		}
+
+
+		if( (ss_size & 1) == 1 )
+		{
+			// ss_size is odd
+			x0 = ss1;
+			y0 = ss2;
+			x1 = ss1 + ss_size / 2 + 1;
+			y1 = ss2 + ss_size / 2 + 1;
+
+			// the second vectors (x1 and y1) are smaller about one from the first ones (x0 and y0)
+			Mul3Big3<ss_size/2 + 1, ss_size/2, ss_size*2>(x1, x0, y1, y0, result);
+		}
+		else
+		{
+			// ss_size is even
+			x0 = ss1;
+			y0 = ss2;
+			x1 = ss1 + ss_size / 2;
+			y1 = ss2 + ss_size / 2;
+			
+			// all four vectors (x0 x1 y0 y1) are equal in size
+			Mul3Big3<ss_size/2, ss_size/2, ss_size*2>(x1, x0, y1, y0, result);
+		}
+	}
+
+
+
+#ifdef _MSC_VER
+#pragma warning (disable : 4717)
+//warning C4717: recursive on all control paths, function will cause runtime stack overflow
+//we have the stop point in Mul3Big2() method
+#endif
+
+
+	/*!
+		an auxiliary method for calculating the Karatsuba multiplication
+
+			x = x1*B^m + x0
+			y = y1*B^m + y0
+
+			first_size  - is the size of vectors: x0 and y0
+			second_size - is the size of vectors: x1 and y1 (can be either equal first_size or smaller about one from first_size)
+
+			x*y = (x1*B^m + x0)(y1*B^m + y0) = z2*B^(2m) + z1*B^m + z0
+		      where
+			   z0 = x0*y0 
+			   z2 = x1*y1
+			   z1 = (x1 + x0)*(y1 + y0) - z2 - z0
+	*/
+	template<uint first_size, uint second_size, uint result_size>
+	void Mul3Big3(const uint * x1, const uint * x0, const uint * y1, const uint * y0, uint * result)
+	{
+	uint i, c, xc, yc;
+
+		UInt<first_size>   temp, temp2;
+		UInt<first_size*3> z1;
+
+		// z0 and z2 we store directly in the result (we don't use any temporary variables)
+		Mul3Big2<first_size>(x0, y0, result);                  // z0
+		Mul3Big2<second_size>(x1, y1, result+first_size*2);    // z2
+
+		// now we calculate z1
+		// temp  = (x0 + x1)
+		// temp2 = (y0 + y1)
+		// we're using temp and temp2 with UInt<first_size>, although there can be a carry but 
+		// we simple remember it in xc and yc (xc and yc can be either 0 or 1),
+		// and (x0 + x1)*(y0 + y1) we calculate in this way (schoolbook algorithm):
+		// 
+		//                 xc     |     temp
+		//                 yc     |     temp2
+		//               --------------------
+		//               (temp    *   temp2)
+		//               xc*temp2 |
+		//               yc*temp  |
+		//       xc*yc |                     
+		//       ----------     z1     --------
+		//
+		// and the result is never larger in size than 3*first_size
+
+		xc = AddVector(x0, x1, first_size, second_size, temp.table);
+		yc = AddVector(y0, y1, first_size, second_size, temp2.table);
+
+		Mul3Big2<first_size>(temp.table, temp2.table, z1.table);
+
+		// clearing the rest of z1
+		for(i=first_size*2 ; i<first_size*3 ; ++i)
+			z1.table[i] = 0;
+
+		
+		if( xc )
+		{
+			c = AddVector(z1.table+first_size, temp2.table, first_size*3-first_size, first_size, z1.table+first_size);
+			TTMATH_ASSERT( c==0 )
+		}
+
+		if( yc )
+		{
+			c = AddVector(z1.table+first_size, temp.table, first_size*3-first_size, first_size, z1.table+first_size);
+			TTMATH_ASSERT( c==0 )
+		}
+
+
+		if( xc && yc )
+		{
+			for( i=first_size*2 ; i<first_size*3 ; ++i )
+				if( ++z1.table[i] != 0 )
+					break;  // break if there was no carry 
+		}
+
+		// z1 = z1 - z2
+		c = SubVector(z1.table, result+first_size*2, first_size*3, second_size*2, z1.table);
+		TTMATH_ASSERT(c==0)
+
+		// z1 = z1 - z0
+		c = SubVector(z1.table, result, first_size*3, first_size*2, z1.table);
+		TTMATH_ASSERT(c==0)
+
+		// here we've calculated the z1
+		// now we're adding it to the result
+
+		if( first_size > second_size )
+		{
+			uint z1_size = result_size - first_size;
+			TTMATH_ASSERT( z1_size <= first_size*3 )
+
+			for(i=z1_size ; i<first_size*3 ; ++i)
+				TTMATH_ASSERT( z1.table[i] == 0 )
+				;
+			
+			c = AddVector(result+first_size, z1.table, result_size-first_size, z1_size, result+first_size);
+			TTMATH_ASSERT(c==0)
+		}
+		else
+		{
+			c = AddVector(result+first_size, z1.table, result_size-first_size, first_size*3, result+first_size);
+			TTMATH_ASSERT(c==0)
+		}
+	}
+
+
+#ifdef _MSC_VER
+#pragma warning (default : 4717)
+#endif
+
+
+public:
+
+
+	/*!
+		multiplication this = this * ss2
+	*/
+	uint MulFastest(const UInt<value_size> & ss2)
+	{
+	UInt<value_size*2> result;
+	uint i, c = 0;
+
+		MulFastestBig(ss2, result);
+	
+		// copying result
+		for(i=0 ; i<value_size ; ++i)
+			table[i] = result.table[i];
+
+		// testing carry
+		for( ; i<value_size*2 ; ++i)
+			if( result.table[i] != 0 )
+			{
+				c = 1;
+				break;
+			}
+
+		TTMATH_LOG("UInt::MulFastest")
+
+	return c;
+	}
+
+
+	/*!
+		multiplication result = this * ss2
+
+		this method is trying to select the fastest algorithm
+		(in the future this method can be improved)
+	*/
+	void MulFastestBig(const UInt<value_size> & ss2, UInt<value_size*2> & result)
+	{
+		if( value_size < TTMATH_USE_KARATSUBA_MULTIPLICATION_FROM_SIZE )
+			return Mul2Big(ss2, result);
+
+		uint x1size  = value_size, x2size  = value_size;
+		uint x1start = 0,          x2start = 0;
+
+		for(x1size=value_size ; x1size>0 && table[x1size-1]==0 ; --x1size);
+		for(x2size=value_size ; x2size>0 && ss2.table[x2size-1]==0 ; --x2size);
+
+		if( x1size==0 || x2size==0 )
+		{
+			// either 'this' or 'ss2' is equal zero - the result is zero too
+			result.SetZero();
+			return;
+		}
+
+		for(x1start=0 ; x1start<x1size && table[x1start]==0 ; ++x1start);
+		for(x2start=0 ; x2start<x2size && ss2.table[x2start]==0 ; ++x2start);
+
+		uint distancex1 = x1size - x1start;
+		uint distancex2 = x2size - x2start;
+
+		if( distancex1 < 3 || distancex2 < 3 )
+			// either 'this' or 'ss2' have only 2 (or 1) item different from zero (side by side)
+			// (this condition in the future can be improved)
+			return Mul2Big3<value_size>(table, ss2.table, result, x1start, x1size, x2start, x2size);
+
+
+		// Karatsuba multiplication
+		Mul3Big(ss2, result);
+
+		TTMATH_LOG("UInt::MulFastestBig")
+	}


 	/*!
@ -2909,12 +3269,13 @@ private:
 	uint Rcr2(uint bits, uint c);

 public:
-	
 	uint Add(const UInt<value_size> & ss2, uint c=0);
 	uint AddInt(uint value, uint index = 0);
 	uint AddTwoInts(uint x2, uint x1, uint index);
+	static uint AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
 	uint Sub(const UInt<value_size> & ss2, uint c=0);
 	uint SubInt(uint value, uint index = 0);
+	static uint SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
 	static sint FindLeadingBitInWord(uint x);
 	static uint SetBitInWord(uint & value, uint bit);
 	static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
@ -2922,6 +3283,23 @@ public:
 };


+
+/*!
+	this specialization is needed in order to not confused the compiler "error: ISO C++ forbids zero-size array"
+	when compiling Mul3Big2() method
+*/
+template<>
+class UInt<0>
+{
+public:
+	uint table[1];
+
+	void Mul2Big(const UInt<0> &, UInt<0> &) { TTMATH_ASSERT(false) };
+	void SetZero() { TTMATH_ASSERT(false) };
+	uint AddTwoInts(uint, uint, uint) { TTMATH_ASSERT(false) return 0; };
+};
+
+
 } //namespace


--- a/ttmath/ttmathuint_noasm.h
+++ b/ttmath/ttmathuint_noasm.h
@ -95,7 +95,7 @@ namespace ttmath
 		for(i=0 ; i<value_size ; ++i)
 			c = AddTwoWords(table[i], ss2.table[i], c, &table[i]);

-		TTMATH_LOG("UInt_noasm::Add")
+		TTMATH_LOG("UInt::Add")
 	
 	return c;
 	}
@ -131,7 +131,7 @@ namespace ttmath
 		for(i=index+1 ; i<value_size && c ; ++i)
 			c = AddTwoWords(table[i], 0, c, &table[i]);

-		TTMATH_LOG("UInt_noasm::AddInt")
+		TTMATH_LOG("UInt::AddInt")
 	
 	return c;
 	}
@ -184,13 +184,54 @@ namespace ttmath
 		for(i=index+2 ; i<value_size && c ; ++i)
 			c = AddTwoWords(table[i], 0, c, &table[i]);

-		TTMATH_LOG("UInt64::AddTwoInts")
+		TTMATH_LOG("UInt::AddTwoInts")
 	
 	return c;
 	}



+	/*!
+		this static method addes one vector to the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5+1
+		  4        3         4+3
+		  2        7         2+7
+		  6                  6
+		  9                  9
+	  of course the carry is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+	uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+		
+		for(i=0 ; i<ss2_size ; ++i)
+			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = AddTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::AddVector")
+
+	return c;
+	}
+
+
+
+
 	template<uint value_size>
 	uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
 	{
@ -232,7 +273,7 @@ namespace ttmath
 		for(i=0 ; i<value_size ; ++i)
 			c = SubTwoWords(table[i], ss2.table[i], c, &table[i]);

-		TTMATH_LOG("UInt_noasm::Sub")
+		TTMATH_LOG("UInt::Sub")

 	return c;
 	}
@ -270,12 +311,51 @@ namespace ttmath
 		for(i=index+1 ; i<value_size && c ; ++i)
 			c = SubTwoWords(table[i], 0, c, &table[i]);

-		TTMATH_LOG("UInt_noasm::SubInt")
+		TTMATH_LOG("UInt::SubInt")
 	
 	return c;
 	}


+	/*!
+		this static method subtractes one vector from the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5-1
+		  4        3         4-3
+		  2        7         2-7
+		  6                  6-1  (the borrow from previous item)
+		  9                  9
+		                 return (carry): 0
+	  of course the carry (borrow) is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+	uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+		
+		for(i=0 ; i<ss2_size ; ++i)
+			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = SubTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::SubVector")
+
+	return c;
+	}
+


 	/*!
@ -305,7 +385,7 @@ namespace ttmath
 			c        = new_c;
 		}

-		TTMATH_LOG("UInt64::Rcl2_one")
+		TTMATH_LOG("UInt::Rcl2_one")

 	return c;
 	}
@ -344,7 +424,7 @@ namespace ttmath
 			c        = new_c;
 		}

-		TTMATH_LOG("UInt64::Rcr2_one")
+		TTMATH_LOG("UInt::Rcr2_one")

 	return c;
 	}
@ -421,7 +501,7 @@ namespace ttmath
 			c        = new_c;
 		}

-		TTMATH_LOG("UInt64::Rcr2")
+		TTMATH_LOG("UInt::Rcr2")

 	return (c & TTMATH_UINT_HIGHEST_BIT) ? 1 : 0;
 	}
--- a/ttmath/ttmathuint_x86.h
+++ b/ttmath/ttmathuint_x86.h
@ -162,7 +162,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Add")
+		TTMATH_LOG("UInt::Add")

 	return c;
 	}
@ -267,7 +267,7 @@ namespace ttmath

 		#endif
 	
-		TTMATH_LOG("UInt32::AddInt")
+		TTMATH_LOG("UInt::AddInt")

 	return c;
 	}
@ -392,13 +392,143 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::AddTwoInts")
+		TTMATH_LOG("UInt::AddTwoInts")
 	
 	return c;
 	}



+	/*!
+		this static method addes one vector to the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5+1
+		  4        3         4+3
+		  2        7         2+7
+		  6                  6
+		  9                  9
+	  of course the carry is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		uint rest = ss1_size - ss2_size;
+		uint c;
+
+		#ifndef __GNUC__
+
+			//	this part might be compiled with for example visual c
+			__asm
+			{
+				pushad
+
+				mov ecx, [ss2_size]
+				xor edx, edx               // edx = 0, cf = 0
+
+				mov esi, [ss1]
+				mov ebx, [ss2]
+				mov edi, [result]
+
+			p:
+				mov eax, [esi+edx*4]
+				adc eax, [ebx+edx*4]
+				mov [edi+edx*4], eax
+
+				inc edx
+				dec ecx
+			jnz p
+
+				adc ecx, ecx             // ecx has the cf state
+
+				mov ebx, [rest]
+				or ebx, ebx
+				jz end
+				
+				xor ebx, ebx
+				sub ebx, ecx             // setting cf from ecx
+				mov ecx, [rest]          // ecx is != 0
+				mov ebx, 0
+			p2:
+				mov eax, [esi+edx*4]
+				adc eax, ebx 
+				mov [edi+edx*4], eax
+
+				inc edx
+				dec ecx
+			jnz p2
+
+				adc ecx, ecx
+
+			end:
+				mov [c], ecx
+
+				popad
+			}
+
+		#endif		
+			
+
+		#ifdef __GNUC__
+			
+		//	this part should be compiled with gcc
+		uint dummy1, dummy2, dummy3;
+
+			__asm__ __volatile__(
+				"push %%edx							\n"
+				"xor %%edx, %%edx					\n"   // edx = 0, cf = 0
+			"1:										\n"
+				"mov (%%esi,%%edx,4), %%eax			\n"
+				"adc (%%ebx,%%edx,4), %%eax			\n"
+				"mov %%eax, (%%edi,%%edx,4)			\n"
+
+				"inc %%edx							\n"
+				"dec %%ecx							\n"
+			"jnz 1b									\n"
+
+				"adc %%ecx, %%ecx					\n"   // ecx has the cf state
+				"pop %%eax							\n"   // eax = rest
+
+				"or %%eax, %%eax					\n"
+				"jz 3f								\n"
+				
+				"xor %%ebx, %%ebx					\n"
+				"sub %%ecx, %%ebx					\n"   // setting cf from ecx
+				"mov %%eax, %%ecx					\n"   // ecx=rest and is != 0
+				"mov $0, %%ebx						\n"
+			"2:										\n"
+				"mov (%%esi, %%edx, 4), %%eax		\n"
+				"adc %%ebx, %%eax 					\n"
+				"mov %%eax, (%%edi, %%edx, 4)		\n"
+
+				"inc %%edx							\n"
+				"dec %%ecx							\n"
+			"jnz 2b									\n"
+
+				"adc %%ecx, %%ecx					\n"
+			"3:										\n"
+
+				: "=a" (dummy1), "=b" (dummy2), "=c" (c),       "=d" (dummy3)
+				:                "1" (ss2),     "2" (ss2_size), "3" (rest),   "S" (ss1),  "D" (result)
+				: "cc", "memory" );
+
+		#endif
+
+		TTMATH_LOG("UInt::AddVector")
+
+	return c;
+	}


 	/*!
@ -489,7 +619,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Sub")
+		TTMATH_LOG("UInt::Sub")

 	return c;
 	}
@ -593,13 +723,152 @@ namespace ttmath

 		#endif
 		
-		TTMATH_LOG("UInt32::SubInt")
+		TTMATH_LOG("UInt::SubInt")
 	
 	return c;
 	}



+	/*!
+		this static method subtractes one vector from the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5-1
+		  4        3         4-3
+		  2        7         2-7
+		  6                  6-1  (the borrow from previous item)
+		  9                  9
+		              return (carry): 0
+	  of course the carry (borrow) is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		uint rest = ss1_size - ss2_size;
+		uint c;
+
+		#ifndef __GNUC__
+			
+			//	this part might be compiled with for example visual c
+
+			/*
+				the asm code is nearly the same as in AddVector
+				only two instructions 'adc' are changed to 'sbb'
+			*/
+			__asm
+			{
+				pushad
+
+				mov ecx, [ss2_size]
+				xor edx, edx               // edx = 0, cf = 0
+
+				mov esi, [ss1]
+				mov ebx, [ss2]
+				mov edi, [result]
+
+			p:
+				mov eax, [esi+edx*4]
+				sbb eax, [ebx+edx*4]
+				mov [edi+edx*4], eax
+
+				inc edx
+				dec ecx
+			jnz p
+
+				adc ecx, ecx             // ecx has the cf state
+
+				mov ebx, [rest]
+				or ebx, ebx
+				jz end
+				
+				xor ebx, ebx
+				sub ebx, ecx             // setting cf from ecx
+				mov ecx, [rest]          // ecx is != 0
+				mov ebx, 0
+			p2:
+				mov eax, [esi+edx*4]
+				sbb eax, ebx 
+				mov [edi+edx*4], eax
+
+				inc edx
+				dec ecx
+			jnz p2
+
+				adc ecx, ecx
+
+			end:
+				mov [c], ecx
+
+				popad
+			}
+
+		#endif		
+			
+
+		#ifdef __GNUC__
+			
+		//	this part should be compiled with gcc
+		uint dummy1, dummy2, dummy3;
+
+			__asm__ __volatile__(
+				"push %%edx							\n"
+				"xor %%edx, %%edx					\n"   // edx = 0, cf = 0
+			"1:										\n"
+				"mov (%%esi,%%edx,4), %%eax			\n"
+				"sbb (%%ebx,%%edx,4), %%eax			\n"
+				"mov %%eax, (%%edi,%%edx,4)			\n"
+
+				"inc %%edx							\n"
+				"dec %%ecx							\n"
+			"jnz 1b									\n"
+
+				"adc %%ecx, %%ecx					\n"   // ecx has the cf state
+				"pop %%eax							\n"   // eax = rest
+
+				"or %%eax, %%eax					\n"
+				"jz 3f								\n"
+				
+				"xor %%ebx, %%ebx					\n"
+				"sub %%ecx, %%ebx					\n"   // setting cf from ecx
+				"mov %%eax, %%ecx					\n"   // ecx=rest and is != 0
+				"mov $0, %%ebx						\n"
+			"2:										\n"
+				"mov (%%esi, %%edx, 4), %%eax		\n"
+				"sbb %%ebx, %%eax 					\n"
+				"mov %%eax, (%%edi, %%edx, 4)		\n"
+
+				"inc %%edx							\n"
+				"dec %%ecx							\n"
+			"jnz 2b									\n"
+
+				"adc %%ecx, %%ecx					\n"
+			"3:										\n"
+
+				: "=a" (dummy1), "=b" (dummy2), "=c" (c),       "=d" (dummy3)
+				:                "1" (ss2),     "2" (ss2_size), "3" (rest),   "S" (ss1),  "D" (result)
+				: "cc", "memory" );
+
+		#endif
+
+		TTMATH_LOG("UInt::SubVector")
+
+	return c;
+	}
+
+
+
 	/*!
 		this method moves all bits into the left hand side
 		return value <- this <- c
@ -680,7 +949,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Rcl2_one")
+		TTMATH_LOG("UInt::Rcl2_one")

 	return c;
 	}
@ -758,7 +1027,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Rcr2_one")
+		TTMATH_LOG("UInt::Rcr2_one")

 	return c;
 	}
@ -886,7 +1155,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Rcl2")
+		TTMATH_LOG("UInt::Rcl2")

 	return c;
 	}
@ -1021,7 +1290,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt32::Rcr2")
+		TTMATH_LOG("UInt::Rcr2")

 	return c;
 	}
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@ -112,7 +112,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::Add")
+		TTMATH_LOG("UInt::Add")
 	
 	return c;
 	}
@ -178,7 +178,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::AddInt")
+		TTMATH_LOG("UInt::AddInt")
 	
 	return c;
 	}
@ -259,7 +259,90 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::AddTwoInts")
+		TTMATH_LOG("UInt::AddTwoInts")
+
+	return c;
+	}
+
+
+
+	/*!
+		this static method addes one vector to the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5+1
+		  4        3         4+3
+		  2        7         2+7
+		  6                  6
+		  9                  9
+	  of course the carry is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		uint rest = ss1_size - ss2_size;
+		uint c;
+
+		#ifndef __GNUC__
+			#error "another compiler than GCC is currently not supported in 64bit mode"
+		#endif
+
+		#ifdef __GNUC__
+			
+		//	this part should be compiled with gcc
+		uint dummy1, dummy2, dummy3;
+
+			__asm__ __volatile__(
+				"mov %%rdx, %%r8					\n"
+				"xor %%rdx, %%rdx					\n"   // rdx = 0, cf = 0
+			"1:										\n"
+				"mov (%%rsi,%%rdx,8), %%rax			\n"
+				"adc (%%rbx,%%rdx,8), %%rax			\n"
+				"mov %%rax, (%%rdi,%%rdx,8)			\n"
+
+				"inc %%rdx							\n"
+				"dec %%rcx							\n"
+			"jnz 1b									\n"
+
+				"adc %%rcx, %%rcx					\n"   // rcx has the cf state
+
+				"or %%r8, %%r8						\n"
+				"jz 3f								\n"
+				
+				"xor %%rbx, %%rbx					\n"
+				"sub %%rcx, %%rbx					\n"   // setting cf from rcx
+				"mov %%r8, %%rcx					\n"   // rcx=rest and is != 0
+				"mov $0, %%rbx						\n"
+			"2:										\n"
+				"mov (%%rsi, %%rdx, 8), %%rax		\n"
+				"adc %%rbx, %%rax 					\n"
+				"mov %%rax, (%%rdi, %%rdx, 8)		\n"
+
+				"inc %%rdx							\n"
+				"dec %%rcx							\n"
+			"jnz 2b									\n"
+
+				"adc %%rcx, %%rcx					\n"
+			"3:										\n"
+
+				: "=a" (dummy1), "=b" (dummy2), "=c" (c),       "=d" (dummy3)
+				:                "1" (ss2),     "2" (ss2_size), "3" (rest),   "S" (ss1),  "D" (result)
+				: "%r8", "cc", "memory" );
+
+		#endif
+
+		TTMATH_LOG("UInt::AddVector")

 	return c;
 	}
@ -316,7 +399,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::Sub")
+		TTMATH_LOG("UInt::Sub")

 	return c;
 	}
@ -379,12 +462,101 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::SubInt")
+		TTMATH_LOG("UInt::SubInt")

 	return c;
 	}


+
+
+	/*!
+		this static method subtractes one vector from the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of the ss1 (and size of the result too)
+		ss2_size - size of the ss2
+		result - is the result vector (which has size the same as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5-1
+		  4        3         4-3
+		  2        7         2-7
+		  6                  6-1  (the borrow from previous item)
+		  9                  9
+		               return (carry): 0
+	  of course the carry (borrow) is propagated and will be returned from the last item
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		uint rest = ss1_size - ss2_size;
+		uint c;
+
+		#ifndef __GNUC__
+			#error "another compiler than GCC is currently not supported in 64bit mode"
+		#endif
+
+		#ifdef __GNUC__
+
+		/*
+			the asm code is nearly the same as in AddVector
+			only two instructions 'adc' are changed to 'sbb'
+		*/
+		uint dummy1, dummy2, dummy3;
+
+			__asm__ __volatile__(
+				"mov %%rdx, %%r8					\n"
+				"xor %%rdx, %%rdx					\n"   // rdx = 0, cf = 0
+			"1:										\n"
+				"mov (%%rsi,%%rdx,8), %%rax			\n"
+				"sbb (%%rbx,%%rdx,8), %%rax			\n"
+				"mov %%rax, (%%rdi,%%rdx,8)			\n"
+
+				"inc %%rdx							\n"
+				"dec %%rcx							\n"
+			"jnz 1b									\n"
+
+				"adc %%rcx, %%rcx					\n"   // rcx has the cf state
+
+				"or %%r8, %%r8						\n"
+				"jz 3f								\n"
+				
+				"xor %%rbx, %%rbx					\n"
+				"sub %%rcx, %%rbx					\n"   // setting cf from rcx
+				"mov %%r8, %%rcx					\n"   // rcx=rest and is != 0
+				"mov $0, %%rbx						\n"
+			"2:										\n"
+				"mov (%%rsi, %%rdx, 8), %%rax		\n"
+				"sbb %%rbx, %%rax 					\n"
+				"mov %%rax, (%%rdi, %%rdx, 8)		\n"
+
+				"inc %%rdx							\n"
+				"dec %%rcx							\n"
+			"jnz 2b									\n"
+
+				"adc %%rcx, %%rcx					\n"
+			"3:										\n"
+
+				: "=a" (dummy1), "=b" (dummy2), "=c" (c),       "=d" (dummy3)
+				:                "1" (ss2),     "2" (ss2_size), "3" (rest),   "S" (ss1),  "D" (result)
+				: "%r8", "cc", "memory" );
+
+		#endif
+
+		TTMATH_LOG("UInt::SubVector")
+
+	return c;
+	}
+
+
+
 	/*!
 		this method moves all bits into the left hand side
 		return value <- this <- c
@ -431,7 +603,7 @@ namespace ttmath
 	
 		#endif

-		TTMATH_LOG("UInt64::Rcl2_one")
+		TTMATH_LOG("UInt::Rcl2_one")

 	return c;
 	}
@ -481,7 +653,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::Rcr2_one")
+		TTMATH_LOG("UInt::Rcr2_one")

 	return c;
 	}
@ -553,7 +725,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::Rcl2")
+		TTMATH_LOG("UInt::Rcl2")

 	return c;
 	}
@ -628,7 +800,7 @@ namespace ttmath

 		#endif

-		TTMATH_LOG("UInt64::Rcr2")
+		TTMATH_LOG("UInt::Rcr2")

 	return c;
 	}