added:   uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         asm code for AddVector() and SubVector() for MS VC x86_64
changed: added the "ttmath_" prefix to all public procedures in ttmathuint_x86_64_msvc.asm


git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@192 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-09-11 23:55:44 +00:00
parent b3d27979d0
commit 9ccacd8817
3 changed files with 175 additions and 170 deletions
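A note on the rename: these entry points have extern "C" linkage, so their names are global to the whole program, and short generic symbols such as adc_x64 or div_x64 can collide with an unrelated object file at link time; the ttmath_ prefix namespaces them by convention. Illustrative declarations only (not part of the commit; uint stands for ttmath's 64-bit word):

    // before: a short global symbol, easy to clash with another library
    extern "C" uint adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
    // after: prefixed, so the linker-level name is effectively namespaced
    extern "C" uint ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);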

ttmathuint.h

@@ -3298,16 +3298,6 @@ public:
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
-/* temporarily */
-#ifndef TTMATH_NOASM
-#ifdef TTMATH_PLATFORM64
-#ifdef _MSC_VER
-static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
-static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
-#endif
-#endif
-#endif
};

ttmathuint_x86_64.h

@@ -63,16 +63,18 @@ namespace ttmath
extern "C"
{
-uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
-uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
-uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
-uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+uint __fastcall ttmath_rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
};
#endif
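These entry points are plain word-array primitives, so they can be exercised directly. A hedged usage sketch, assuming an MSVC x64 build with ttmathuint_x86_64_msvc.obj linked in (a local typedef stands in for ttmath's uint; __fastcall is omitted because x64 MSVC has a single calling convention):

    #include <cassert>

    typedef unsigned long long word;   // stands in for ttmath's uint on win64

    extern "C" word ttmath_adc_x64(word* p1, const word* p2, word nSize, word c);

    int main()
    {
        word a[2] = { 0xFFFFFFFFFFFFFFFFull, 1 };   // little-endian word order
        const word b[2] = { 1, 0 };
        // p1 += p2 + carry over nSize words; the final carry is returned
        word carry = ttmath_adc_x64(a, b, 2, 0);
        assert(a[0] == 0 && a[1] == 2 && carry == 0);
        return 0;
    }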
@@ -110,7 +112,7 @@ namespace ttmath
#endif
#ifdef _MSC_VER
-c = adc_x64(p1,p2,b,c);
+c = ttmath_adc_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
@@ -182,7 +184,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed_x64(p1,b,index,value);
+c = ttmath_addindexed_x64(p1,b,index,value);
#endif
@@ -266,7 +268,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed2_x64(p1,b,index,x1,x2);
+c = ttmath_addindexed2_x64(p1,b,index,x1,x2);
#endif
@@ -327,15 +329,11 @@ namespace ttmath
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -344,12 +342,13 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_addvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
// this part should be compiled with gcc
@@ -396,27 +395,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = AddTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::AddVector")
-return c;
-}
-#endif
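For reference, the contract that both the removed fallback and the new asm routine satisfy: result = ss1 + ss2 where ss1 has at least as many words as ss2, the carry ripples through the ss1-only tail, and the final carry is returned. A self-contained portable model of these semantics (a sketch for illustration, not ttmath code):

    #include <cstdint>
    #include <cassert>

    typedef std::uint64_t word;   // stands in for ttmath's 64-bit uint

    // result = ss1 + ss2, with ss1_size >= ss2_size; returns the final carry
    word add_vector_model(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word c = 0;
        for(word i = 0; i < ss1_size; ++i)
        {
            word w2   = (i < ss2_size) ? ss2[i] : 0;  // ss2 is implicitly zero-padded
            word sum  = ss1[i] + w2;
            word c1   = (sum < ss1[i]);               // wrapped on the first addition
            word sum2 = sum + c;
            word c2   = (sum2 < sum);                 // wrapped on adding the carry
            result[i] = sum2;
            c = c1 | c2;                              // at most one of the two can wrap
        }
        return c;
    }

    int main()
    {
        word a[3] = { ~0ull, ~0ull, 5 };   // 3-word operand
        word b[1] = { 1 };                 // 1-word operand
        word r[3];
        word carry = add_vector_model(a, b, 3, 1, r);
        assert(r[0] == 0 && r[1] == 0 && r[2] == 6 && carry == 0);  // carry rippled two words
        return 0;
    }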
/*!
@@ -446,7 +424,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = sbb_x64(p1,p2,b,c);
+c = ttmath_sbb_x64(p1,p2,b,c);
#endif
@@ -480,6 +458,7 @@ namespace ttmath
}
/*!
this method subtracts one word (at a specific position)
and returns a carry (if there was one)
@@ -514,7 +493,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = subindexed_x64(p1,b,index,value);
+c = ttmath_subindexed_x64(p1,b,index,value);
#endif
@@ -550,8 +529,6 @@ namespace ttmath
}
/*!
this static method subtracts one vector from the other
'ss1' is larger in size or equal to 'ss2'
@@ -573,15 +550,11 @@ namespace ttmath
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -590,17 +563,17 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_subvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
// the asm code is nearly the same as in AddVector
// only two instructions 'adc' are changed to 'sbb'
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
@@ -645,28 +618,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = SubTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::SubVector")
-return c;
-}
-#endif
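SubVector has the mirror-image contract: result = ss1 - ss2 with ss1_size >= ss2_size, and the return value is the final borrow (1 exactly when ss2 > ss1 as multi-word numbers). A matching portable sketch, again for illustration only:

    #include <cstdint>
    #include <cassert>

    typedef std::uint64_t word;   // stands in for ttmath's 64-bit uint

    // result = ss1 - ss2, with ss1_size >= ss2_size; returns the final borrow
    word sub_vector_model(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word c = 0;   // borrow
        for(word i = 0; i < ss1_size; ++i)
        {
            word w2    = (i < ss2_size) ? ss2[i] : 0;
            word diff  = ss1[i] - w2;
            word b1    = (ss1[i] < w2);   // borrowed on the first subtraction
            word diff2 = diff - c;
            word b2    = (diff < c);      // borrowed on subtracting the old borrow
            result[i]  = diff2;
            c = b1 | b2;
        }
        return c;
    }

    int main()
    {
        word a[2] = { 0, 1 };   // the value 2^64
        word b[1] = { 1 };
        word r[2];
        word borrow = sub_vector_model(a, b, 2, 1, r);
        assert(r[0] == ~0ull && r[1] == 0 && borrow == 0);   // 2^64 - 1
        return 0;
    }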
/*!
this method moves all bits into the left hand side
@@ -695,7 +646,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl_x64(p1,b,c);
+c = ttmath_rcl_x64(p1,b,c);
#endif
@@ -755,7 +706,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr_x64(p1,b,c);
+c = ttmath_rcr_x64(p1,b,c);
#endif
@@ -816,7 +767,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl2_x64(p1,b,bits,c);
+c = ttmath_rcl2_x64(p1,b,bits,c);
#endif
@@ -893,7 +844,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr2_x64(p1,b,bits,c);
+c = ttmath_rcr2_x64(p1,b,bits,c);
#endif
@@ -1146,7 +1097,7 @@ namespace ttmath
#ifdef _MSC_VER
-div_x64(&a,&b,c);
+ttmath_div_x64(&a,&b,c);
r_ = a;
rest_ = b;
@@ -1170,60 +1121,6 @@ namespace ttmath
*rest = rest_;
}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
-{
-uint temp;
-if( carry == 0 )
-{
-temp = a + b;
-if( temp < a )
-carry = 1;
-}
-else
-{
-carry = 1;
-temp = a + b + carry;
-if( temp > a ) // !(temp<=a)
-carry = 0;
-}
-*result = temp;
-return carry;
-}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
-{
-if( carry == 0 )
-{
-*result = a - b;
-if( a < b )
-carry = 1;
-}
-else
-{
-carry = 1;
-*result = a - b - carry;
-if( a > b ) // !(a <= b )
-carry = 0;
-}
-return carry;
-}
} //namespace
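The removed fallbacks rely on one property of unsigned arithmetic: a + b wraps modulo 2^64, so temp < a holds exactly when the addition overflowed, and with an incoming carry the overflow condition becomes temp <= a (written above as !(temp > a)). A tiny self-contained check of both cases:

    #include <cstdint>
    #include <cassert>

    int main()
    {
        std::uint64_t a = ~0ull;          // 2^64 - 1
        std::uint64_t t = a + 1;          // wraps to 0
        assert(t < a);                    // carry out of the plain addition

        std::uint64_t t2 = a + 0 + 1;     // incoming carry, also wraps to 0
        assert(!(t2 > a));                // carry out in the carry-in case
        return 0;
    }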

ttmathuint_x86_64_msvc.asm

@@ -41,20 +41,22 @@
; this creates the ttmathuint_x86_64_msvc.obj file, which can be linked with your program
;
-PUBLIC adc_x64
-PUBLIC addindexed_x64
-PUBLIC addindexed2_x64
+PUBLIC ttmath_adc_x64
+PUBLIC ttmath_addindexed_x64
+PUBLIC ttmath_addindexed2_x64
+PUBLIC ttmath_addvector_x64
-PUBLIC sbb_x64
-PUBLIC subindexed_x64
+PUBLIC ttmath_sbb_x64
+PUBLIC ttmath_subindexed_x64
+PUBLIC ttmath_subvector_x64
-PUBLIC rcl_x64
-PUBLIC rcr_x64
+PUBLIC ttmath_rcl_x64
+PUBLIC ttmath_rcr_x64
-PUBLIC rcl2_x64
-PUBLIC rcr2_x64
+PUBLIC ttmath_rcl2_x64
+PUBLIC ttmath_rcr2_x64
-PUBLIC div_x64
+PUBLIC ttmath_div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
@@ -64,11 +66,12 @@ PUBLIC div_x64
.CODE
ALIGN 8
;----------------------------------------
-adc_x64 PROC
+ttmath_adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
@@ -91,7 +94,7 @@ adc_x64 PROC
ret
-adc_x64 ENDP
+ttmath_adc_x64 ENDP
;----------------------------------------
@@ -99,7 +102,7 @@ adc_x64 ENDP
;----------------------------------------
-addindexed_x64 PROC
+ttmath_addindexed_x64 PROC
; rcx = p1
; rdx = nSize
@@ -132,7 +135,7 @@ done_with_cy:
ret
-addindexed_x64 ENDP
+ttmath_addindexed_x64 ENDP
;----------------------------------------
@@ -140,7 +143,7 @@ addindexed_x64 ENDP
;----------------------------------------
-addindexed2_x64 PROC
+ttmath_addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
@@ -173,7 +176,9 @@ next:
lea rax, [rax+1]
ret
-addindexed2_x64 ENDP
+ttmath_addindexed2_x64 ENDP
;----------------------------------------
@@ -181,7 +186,61 @@ addindexed2_x64 ENDP
;----------------------------------------
-sbb_x64 PROC
+ttmath_addvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [rsp+0x28] = result
+mov r10, [rsp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_addvector_x64 ENDP
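How the routine works, in brief: the fifth argument arrives on the stack at rsp+0x28 under the MS x64 convention (four register arguments, 32 bytes of shadow space, plus the return address); loop1 adds the common ss2_size words with adc, using inc/dec because they leave CF untouched; adc r9, r9 then snapshots CF into r9 (which dec has run down to zero), neg r9 re-seeds CF for the tail loop, and mov r9, 0 rather than xor clears the register without disturbing CF. A C++ mirror of the two-loop structure, with the flag games made explicit (a sketch for illustration, not ttmath code):

    #include <cstdint>

    typedef std::uint64_t word;

    // assumes ss2_size >= 1, as the asm's loop structure does
    word addvector_mirror(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word rest = ss1_size - ss2_size;       // sub r8, r9
        word i = 0, cf = 0;                    // xor r11, r11 (also clears CF)

        do {                                   // loop1: the common ss2_size words
            word s = ss1[i] + ss2[i] + cf;     // adc rax, [rdx + r11*8]
            cf = (s < ss1[i]) || (cf && s == ss1[i]);
            result[i] = s;
            ++i;
        } while(--ss2_size != 0);

        if(rest == 0)                          // or r8, r8 ; jz done
            return cf;

        do {                                   // loop2: ripple the carry through the tail
            word s = ss1[i] + cf;              // adc rax, r9 (r9 == 0 here)
            cf = (cf && s == 0);               // wraps only if cf was 1 and ss1[i] was all ones
            result[i] = s;
            ++i;
        } while(--rest != 0);

        return cf;                             // adc r8, r8 captures the final CF
    }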
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_sbb_x64 PROC
; rcx = p1
; rdx = p2
@@ -205,7 +264,7 @@ sbb_x64 PROC
ret
-sbb_x64 ENDP
+ttmath_sbb_x64 ENDP
;----------------------------------------
@@ -213,7 +272,7 @@ sbb_x64 ENDP
;----------------------------------------
-subindexed_x64 PROC
+ttmath_subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
@@ -240,7 +299,9 @@ done:
mov rax, 1
ret
-subindexed_x64 ENDP
+ttmath_subindexed_x64 ENDP
;----------------------------------------
@@ -248,7 +309,64 @@ subindexed_x64 ENDP
;----------------------------------------
-rcl_x64 PROC
+; the same asm code as in ttmath_addvector_x64, only two 'adc' instructions are changed to 'sbb'
+ttmath_subvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [rsp+0x28] = result
+mov r10, [rsp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_subvector_x64 ENDP
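A hedged round-trip check for the two new entry points, assuming an MSVC x64 build with the assembled ttmathuint_x86_64_msvc.obj linked in (the declarations match the header above; a local typedef stands in for ttmath's uint):

    #include <cassert>

    typedef unsigned long long word;

    extern "C" word ttmath_addvector_x64(const word* ss1, const word* ss2,
                                         word ss1_size, word ss2_size, word* result);
    extern "C" word ttmath_subvector_x64(const word* ss1, const word* ss2,
                                         word ss1_size, word ss2_size, word* result);

    int main()
    {
        const word a[3] = { 7, 0, 1 };
        const word b[2] = { 9, 0 };                           // b <= a as 3-word numbers
        word d[3], s[3];

        word borrow = ttmath_subvector_x64(a, b, 3, 2, d);    // d = a - b
        word carry  = ttmath_addvector_x64(d, b, 3, 2, s);    // s = d + b
        assert(borrow == 0 && carry == 0);
        assert(s[0] == a[0] && s[1] == a[1] && s[2] == a[2]); // round trip restores a
        return 0;
    }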
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
@@ -269,7 +387,7 @@ loop1:
ret
-rcl_x64 ENDP
+ttmath_rcl_x64 ENDP
;----------------------------------------
@@ -277,7 +395,7 @@ rcl_x64 ENDP
;----------------------------------------
-rcr_x64 PROC
+ttmath_rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
@@ -296,7 +414,7 @@ loop1:
ret
-rcr_x64 ENDP
+ttmath_rcr_x64 ENDP
;----------------------------------------
@@ -304,7 +422,7 @@ rcr_x64 ENDP
;----------------------------------------
-div_x64 PROC
+ttmath_div_x64 PROC
; rcx = &Hi
; rdx = &Lo
@@ -321,7 +439,7 @@ div_x64 PROC
ret
-div_x64 ENDP
+ttmath_div_x64 ENDP
;----------------------------------------
@@ -329,7 +447,7 @@ div_x64 ENDP
;----------------------------------------
-rcl2_x64 PROC
+ttmath_rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -372,7 +490,7 @@ loop1:
pop rbx
ret
-rcl2_x64 ENDP
+ttmath_rcl2_x64 ENDP
;----------------------------------------
@@ -380,7 +498,7 @@ rcl2_x64 ENDP
;----------------------------------------
-rcr2_x64 PROC
+ttmath_rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -425,6 +543,6 @@ loop1:
ret
-rcr2_x64 ENDP
+ttmath_rcr2_x64 ENDP
END