- fixed a bug in 64 bit ASM for MSVC

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@181 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-07-28 16:34:04 +00:00 · 2009-07-28 16:34:04 +00:00 · e102086f80
parent 51b2c974a1
commit e102086f80
5 changed files with 271 additions and 144 deletions
--- a/ttmath/ttmathbig.h
+++ b/ttmath/ttmathbig.h
@ -3869,47 +3869,66 @@ public:
 		// we should check the mantissas beforehand because sometimes we can have
 		// a mantissa set to zero but in the exponent something another value
 		// (maybe we've forgotten about calling CorrectZero() ?)
-		if( mantissa.IsZero() && ss2.mantissa.IsZero())
-		{
-			return true;
-		}
+		if( mantissa.IsZero())
+			{
+			if (ss2.mantissa.IsZero())
+				return true;
+			return(ss2.AboutEqual(*this,nBitsToIgnore));
+			}
 		
-		if( IsSign() != ss2.IsSign() )
-		{
-			return false;
-		}
+		if (ss2.mantissa.IsZero())
+			{
+			return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
+			}
 			
-		if( exponent==ss2.exponent )
-		{
-			if (mantissa == ss2.mantissa)
-				{
-				return(true);
-				}
-			if( IsSign() != ss2.IsSign() )
-				{
-				// we need to check the difference (both might be around Zero)
-				Big<exp,man>	temp(*this);
+		// exponents may not differ much!
+		ttmath::Int<exp>	expdiff(this->exponent - ss2.exponent);
 		
-				temp.Sub(ss2);
+		// the exponents may differ by one, for example when mantissa1=0x80000000 and mantissa2=0xffffffff
+		if (ttmath::Abs(expdiff) > 1)
+			return(false);		

-				Int<exp>	exponent_diff(exponent - temp.exponent);			
+		// calculate the 'difference' mantissa		
+		ttmath::UInt<man>	man1(this->mantissa);
+		ttmath::UInt<man>	man2(ss2.mantissa);
+		ttmath::UInt<man>	mandiff;
 		
-				return(exponent_diff > man*TTMATH_BITS_PER_UINT-nBitsToIgnore);
-				}
+		switch (expdiff.ToInt())
+			{
+			case +1:
+				man2.Rcr(1,0);
+				mandiff = man1;
+				mandiff.Sub(man2);
+				break;
+			case -1:
+				man1.Rcr(1,0);
+				mandiff = man2;
+				mandiff.Sub(man1);
+				break;
+			case 0:
+				if (man2 > man1)
+					{
+					mandiff = man2;
+					mandiff.Sub(man1);
+					}
+				  else
+					{
+					mandiff = man1;
+					mandiff.Sub(man2);
+					}
+				break;
+			}
 			
-			// faster to mask the bits!
-			ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
+		// faster to mask the bits!
+		ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);

-			for (int n = man-1; n > 0; --n)
-				{
-				if (mantissa.table[n] != ss2.mantissa.table[n])
-					return(false);
-				}
-			uint	nMask = ~((1 << nBitsToIgnore) - 1);
-			return((mantissa.table[0] & nMask) == (ss2.mantissa.table[0] & nMask));
-		}
-
-	return false;
+		for (int n = man-1; n > 0; --n)
+			{
+			if (mandiff.table[n] != 0)
+				return(false);
+			}
+		uint	nMask = ~((1 << nBitsToIgnore) - 1);
+		return((mandiff.table[0] & nMask) == 0);
 	}

 	bool operator<(const Big<exp,man> & ss2) const
--- a/ttmath/ttmathuint_noasm.h
+++ b/ttmath/ttmathuint_noasm.h
@ -871,6 +871,83 @@ namespace ttmath
 		u3         = sub_res_low_.u_.low;
 	}

+	/*!
+		this static method adds one vector to the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of ss1 (and the size of the result too)
+		ss2_size - size of ss2
+		result - the result vector (which has the same size as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5+1
+		  4        3         4+3
+		  2        7         2+7
+		  6                  6
+		  9                  9
+	  the carry is propagated from item to item and the carry from the last item is returned
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+	uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+		
+		for(i=0 ; i<ss2_size ; ++i)
+			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = AddTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::AddVector")
+
+	return c;
+	}
+
+	/*!
+		this static method subtracts one vector from the other
+		'ss1' is larger in size or equal to 'ss2'
+
+		ss1 points to the first (larger) vector
+		ss2 points to the second vector
+		ss1_size - size of ss1 (and the size of the result too)
+		ss2_size - size of ss2
+		result - the result vector (which has the same size as ss1: ss1_size)
+
+		Example:  ss1_size is 5, ss2_size is 3
+		ss1:      ss2:   result (output):
+		  5        1         5-1
+		  4        3         4-3
+		  2        7         2-7
+		  6                  6-1  (the borrow from the previous item)
+		  9                  9
+		                 return (carry): 0
+	  the carry (borrow) is propagated from item to item and the one from the last item is returned
+	  (this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+	uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+		
+		for(i=0 ; i<ss2_size ; ++i)
+			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = SubTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::SubVector")
+
+	return c;
+	}
+
 #endif // #ifdef TTMATH_PLATFORM64


--- a/ttmath/ttmathuint_x86.h
+++ b/ttmath/ttmathuint_x86.h
@ -42,7 +42,7 @@
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM32

-#pragma message("TTMATH_ASM")
+#pragma message("TTMATH_ASM32")

 /*!
 	\file ttmathuint_x86.h
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@ -39,10 +39,10 @@
 #ifndef headerfilettmathuint_x86_64
 #define headerfilettmathuint_x86_64

-
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM64

+#pragma message("TTMATH_ASM64")
 /*!
 	\file ttmathuint_x86_64.h
    \brief template class UInt<uint> with assembler code for 64bit x86_64 processors
@ -50,6 +50,9 @@
 	this file is included at the end of ttmathuint.h
 */

+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+

 namespace ttmath
 {
@ -244,6 +247,30 @@ namespace ttmath
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
+	#if 0
+	{
+	uint i, c;
+
+		TTMATH_ASSERT( index < value_size )
+
+		printf("add %Id + %Id\n",x1,x2);
+		for(int i=index ; i<value_size ; ++i)
+			printf("%d: %Id\n",i,table[i]);
+
+		c = AddTwoWords(table[index],   x1, 0, &table[index]);
+		c = AddTwoWords(table[index+1], x2, c, &table[index+1]);
+
+		for(i=index+2 ; i<value_size && c ; ++i)
+			c = AddTwoWords(table[i], 0, c, &table[i]);
+		for(i=index ; i<value_size ; ++i)
+			printf("%d: %Id\n",i,table[i]);
+		printf(" -> %d\n",c);
+
+		TTMATH_LOG("UInt::AddTwoInts")
+
+	return c;
+	}
+	#else
 	{
 	uint b = value_size;
 	uint * p1 = table;
@ -253,7 +280,14 @@ namespace ttmath

 		#ifndef __GNUC__
 			#if defined(_M_X64)
-				c = addindexed2_x64(p1,b,index,x2,x1);
+				//printf("add %Id + %Id\n",x1,x2);
+				//for(int i=index ; i<value_size ; ++i)
+				//	printf("%d: %Id\n",i,table[i]);
+				//if (table[0] == 1265784741359897913) DebugBreak();
+				c = addindexed2_x64(p1,b,index,x1,x2);
+				//for(int i=index ; i<value_size ; ++i)
+				//	printf("%d: %Id\n",i,table[i]);
+				//printf(" -> %d\n",c);
 			#else
 				#error "another compiler than GCC is currently not supported in 64bit mode"
 			#endif
@ -289,10 +323,12 @@ namespace ttmath

 		#endif

+
 		TTMATH_LOG("UInt64::AddTwoInts")

 	return c;
 	}
+	#endif



--- a/ttmath/ttmathuint_x86_amd64_msvc.asm
+++ b/ttmath/ttmathuint_x86_amd64_msvc.asm
@ -98,8 +98,8 @@ addindexed_x64	ENDP

 addindexed2_x64	PROC

-        ; rcx = p1
-        ; rdx = b
+        ; rcx = p1 (pointer)
+        ; rdx = b  (value size)
        ; r8 = nPos
        ; r9 = nValue1
        ; [esp+0x28] = nValue2
@ -109,26 +109,23 @@ addindexed2_x64	PROC
 		sub		rdx, r8				; rdx = remaining count of uints
 		mov		r10, [esp+028h]		; r10 = nValue2

-		add		qword ptr [r11 + r8 * 8], r10
+		add		qword ptr [r11 + r8 * 8], r9
 		lea		r8, [r8+1]
+		lea		rdx, [rdx-1]
+		adc		qword ptr [r11 + r8 * 8], r10
+		jc		next
+		ret

 		ALIGN 16
 loop1:
-		adc		qword ptr [r11 + r8 * 8], r9
+		lea		r8, [r8+1]
+		add		qword ptr [r11 + r8 * 8], 1
 		jc		next
 		ret

 next:
-		lea		r8, [r8+1]
-		xor		r9, r9				; set to 0 -> cy still set!
-		dec		rdx
+		dec		rdx					; does not modify CY too...
 		jnz		loop1
-		jc		return_1			; most of the times, there will be NO carry (I hope)
-
-done:
-		ret
-	
-return_1:
 		lea		rax, [rax+1]
 		ret

@ -138,8 +135,6 @@ addindexed2_x64	ENDP

        ALIGN       8

-        ALIGN       8
-
 ;----------------------------------------

 sbb_x64				PROC