added:   uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         asm code for AddVector() and SubVector() for MS VC x86_64
changed: added prefixes "ttmath_" to all public procedures from ttmathuint_x86_64_msvc.asm


git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@192 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-09-11 23:55:44 +00:00
parent b3d27979d0
commit 9ccacd8817
3 changed files with 175 additions and 170 deletions
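For context, AddVector()/SubVector() add or subtract two word vectors where 'ss1' has at least as many words as 'ss2'; the carry (or borrow) is propagated through the remaining high words of 'ss1' and returned from the last word. A minimal portable C++ sketch of the addition, equivalent to the generic fallback this commit removes in favour of the new ttmath_addvector_x64 routine (the word type and helper below are simplified stand-ins, not ttmath's own definitions):

#include <cstdint>

typedef std::uint64_t uint;   // stand-in for ttmath's 64-bit word type on x86_64

// Adds two words plus an incoming carry (0 or 1); returns the outgoing carry.
static uint AddTwoWords(uint a, uint b, uint carry, uint * result)
{
    uint temp = a + b + carry;
    *result = temp;
    return (temp < a || (carry && temp == a)) ? 1 : 0;   // did the sum wrap around?
}

// result = ss1 + ss2, where ss1_size >= ss2_size; returns the final carry.
static uint AddVector(const uint * ss1, const uint * ss2,
                      uint ss1_size, uint ss2_size, uint * result)
{
    uint i, c = 0;

    for(i = 0 ; i < ss2_size ; ++i)      // add the common part word by word
        c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);

    for( ; i < ss1_size ; ++i)           // propagate the carry through the rest of ss1
        c = AddTwoWords(ss1[i], 0, c, &result[i]);

    return c;
}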


@@ -3298,16 +3298,6 @@ public:
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
-/* temporarily */
-#ifndef TTMATH_NOASM
-#ifdef TTMATH_PLATFORM64
-#ifdef _MSC_VER
-static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
-static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
-#endif
-#endif
-#endif
};


@@ -63,16 +63,18 @@ namespace ttmath
extern "C"
{
-uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
-uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
-uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
-uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+uint __fastcall ttmath_rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
};
#endif
@@ -110,7 +112,7 @@ namespace ttmath
#endif
#ifdef _MSC_VER
-c = adc_x64(p1,p2,b,c);
+c = ttmath_adc_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
@@ -182,7 +184,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed_x64(p1,b,index,value);
+c = ttmath_addindexed_x64(p1,b,index,value);
#endif
@@ -266,7 +268,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed2_x64(p1,b,index,x1,x2);
+c = ttmath_addindexed2_x64(p1,b,index,x1,x2);
#endif
@@ -327,15 +329,11 @@ namespace ttmath
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -344,12 +342,13 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_addvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
// this part should be compiled with gcc
@@ -396,27 +395,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = AddTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::AddVector")
-return c;
-}
-#endif
/*!
@@ -446,7 +424,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = sbb_x64(p1,p2,b,c);
+c = ttmath_sbb_x64(p1,p2,b,c);
#endif
@@ -480,6 +458,7 @@ namespace ttmath
}
/*!
this method subtracts one word (at a specific position)
and returns a carry (if it was)
@@ -514,7 +493,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = subindexed_x64(p1,b,index,value);
+c = ttmath_subindexed_x64(p1,b,index,value);
#endif
@@ -550,8 +529,6 @@ namespace ttmath
}
/*!
this static method subtractes one vector from the other
'ss1' is larger in size or equal to 'ss2'
@@ -573,15 +550,11 @@ namespace ttmath
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -590,17 +563,17 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_subvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
// the asm code is nearly the same as in AddVector
// only two instructions 'adc' are changed to 'sbb'
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
@@ -645,28 +618,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = SubTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::SubVector")
-return c;
-}
-#endif
/*!
this method moves all bits into the left hand side
@@ -695,7 +646,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl_x64(p1,b,c);
+c = ttmath_rcl_x64(p1,b,c);
#endif
@@ -755,7 +706,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr_x64(p1,b,c);
+c = ttmath_rcr_x64(p1,b,c);
#endif
@@ -816,7 +767,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl2_x64(p1,b,bits,c);
+c = ttmath_rcl2_x64(p1,b,bits,c);
#endif
@@ -893,7 +844,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr2_x64(p1,b,bits,c);
+c = ttmath_rcr2_x64(p1,b,bits,c);
#endif
@@ -1146,7 +1097,7 @@ namespace ttmath
#ifdef _MSC_VER
-div_x64(&a,&b,c);
+ttmath_div_x64(&a,&b,c);
r_ = a;
rest_ = b;
@@ -1170,60 +1121,6 @@ namespace ttmath
*rest = rest_;
}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
-{
-uint temp;
-if( carry == 0 )
-{
-temp = a + b;
-if( temp < a )
-carry = 1;
-}
-else
-{
-carry = 1;
-temp = a + b + carry;
-if( temp > a ) // !(temp<=a)
-carry = 0;
-}
-*result = temp;
-return carry;
-}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
-{
-if( carry == 0 )
-{
-*result = a - b;
-if( a < b )
-carry = 1;
-}
-else
-{
-carry = 1;
-*result = a - b - carry;
-if( a > b ) // !(a <= b )
-carry = 0;
-}
-return carry;
-}
} //namespace
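Nothing changes at the API level: the ttmath_* routines are internal, and user code keeps going through the UInt<> methods, which dispatch to them on 64-bit MSVC builds. A minimal usage sketch, assuming the ttmath headers are on the include path and ttmathuint_x86_64_msvc.obj is assembled and linked in as before:

#include <ttmath/ttmath.h>
#include <iostream>

int main()
{
    ttmath::UInt<3> a = "340282366920938463463374607431768211455";   // 2^128 - 1
    ttmath::UInt<3> b = 1;

    ttmath::uint carry = a.Add(b);   // dispatches to ttmath_adc_x64 on MSVC x86_64
    std::cout << a << " (carry: " << carry << ")" << std::endl;

    return 0;
}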


@@ -41,20 +41,22 @@
; this create ttmathuint_x86_64_msvc.obj file which can be linked with your program
;
-PUBLIC adc_x64
-PUBLIC addindexed_x64
-PUBLIC addindexed2_x64
-PUBLIC sbb_x64
-PUBLIC subindexed_x64
-PUBLIC rcl_x64
-PUBLIC rcr_x64
-PUBLIC rcl2_x64
-PUBLIC rcr2_x64
-PUBLIC div_x64
+PUBLIC ttmath_adc_x64
+PUBLIC ttmath_addindexed_x64
+PUBLIC ttmath_addindexed2_x64
+PUBLIC ttmath_addvector_x64
+PUBLIC ttmath_sbb_x64
+PUBLIC ttmath_subindexed_x64
+PUBLIC ttmath_subvector_x64
+PUBLIC ttmath_rcl_x64
+PUBLIC ttmath_rcr_x64
+PUBLIC ttmath_rcl2_x64
+PUBLIC ttmath_rcr2_x64
+PUBLIC ttmath_div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
@@ -64,11 +66,12 @@ PUBLIC div_x64
.CODE
ALIGN 8
;----------------------------------------
-adc_x64 PROC
+ttmath_adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
@@ -91,7 +94,7 @@ adc_x64 PROC
ret
-adc_x64 ENDP
+ttmath_adc_x64 ENDP
;----------------------------------------
@@ -99,7 +102,7 @@ adc_x64 ENDP
;----------------------------------------
-addindexed_x64 PROC
+ttmath_addindexed_x64 PROC
; rcx = p1
; rdx = nSize
@@ -132,7 +135,7 @@ done_with_cy:
ret
-addindexed_x64 ENDP
+ttmath_addindexed_x64 ENDP
;----------------------------------------
@@ -140,7 +143,7 @@ addindexed_x64 ENDP
;----------------------------------------
-addindexed2_x64 PROC
+ttmath_addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
@@ -173,7 +176,9 @@ next:
lea rax, [rax+1]
ret
-addindexed2_x64 ENDP
+ttmath_addindexed2_x64 ENDP
;----------------------------------------
@@ -181,7 +186,61 @@ addindexed2_x64 ENDP
;----------------------------------------
-sbb_x64 PROC
+ttmath_addvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [esp+0x28] = result
+mov r10, [esp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_addvector_x64 ENDP
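An explanatory aside (not part of the patch): under the Microsoft x64 calling convention the first four integer arguments travel in rcx, rdx, r8 and r9, so the fifth argument (result) is passed on the stack; at procedure entry it sits just above the return address and the caller-allocated 32-byte register shadow space, i.e. at offset 0x28. The prototype from the header maps onto the registers as follows:

// Microsoft x64 ABI mapping for the new routine (reference sketch):
uint __fastcall ttmath_addvector_x64(const uint * ss1,    // rcx
                                     const uint * ss2,    // rdx
                                     uint ss1_size,       // r8
                                     uint ss2_size,       // r9
                                     uint * result);      // stack, [rsp+0x28] at entry
// Stack at entry: [rsp] return address, [rsp+0x08..0x20] the 32-byte shadow space
// for rcx/rdx/r8/r9, [rsp+0x28] the fifth argument -- which the code reads with
// "mov r10, [esp+028h]" above.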
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_sbb_x64 PROC
; rcx = p1
; rdx = p2
@@ -205,7 +264,7 @@ sbb_x64 PROC
ret
-sbb_x64 ENDP
+ttmath_sbb_x64 ENDP
;----------------------------------------
@@ -213,7 +272,7 @@ sbb_x64 ENDP
;----------------------------------------
-subindexed_x64 PROC
+ttmath_subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
@@ -240,7 +299,9 @@ done:
mov rax, 1
ret
-subindexed_x64 ENDP
+ttmath_subindexed_x64 ENDP
;----------------------------------------
@@ -248,7 +309,64 @@ subindexed_x64 ENDP
;----------------------------------------
-rcl_x64 PROC
+; the same asm code as in addvector_x64 only two instructions 'adc' changed to 'sbb'
+ttmath_subvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [esp+0x28] = result
+mov r10, [esp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_subvector_x64 ENDP
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
@@ -269,7 +387,7 @@ loop1:
ret
-rcl_x64 ENDP
+ttmath_rcl_x64 ENDP
;----------------------------------------
@@ -277,7 +395,7 @@ rcl_x64 ENDP
;----------------------------------------
-rcr_x64 PROC
+ttmath_rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
@@ -296,7 +414,7 @@ loop1:
ret
-rcr_x64 ENDP
+ttmath_rcr_x64 ENDP
;----------------------------------------
@@ -304,7 +422,7 @@ rcr_x64 ENDP
;----------------------------------------
-div_x64 PROC
+ttmath_div_x64 PROC
; rcx = &Hi
; rdx = &Lo
@@ -321,7 +439,7 @@ div_x64 PROC
ret
-div_x64 ENDP
+ttmath_div_x64 ENDP
;----------------------------------------
@@ -329,7 +447,7 @@ div_x64 ENDP
;----------------------------------------
-rcl2_x64 PROC
+ttmath_rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -372,7 +490,7 @@ loop1:
pop rbx
ret
-rcl2_x64 ENDP
+ttmath_rcl2_x64 ENDP
;----------------------------------------
@@ -380,7 +498,7 @@ rcl2_x64 ENDP
;----------------------------------------
-rcr2_x64 PROC
+ttmath_rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -425,6 +543,6 @@ loop1:
ret
-rcr2_x64 ENDP
+ttmath_rcr2_x64 ENDP
END