more optimizations for MSVC assembler (parallelism, prefetch optimization, loop alignment, ...)

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e
Christian Kaiser 2009-05-20 08:48:51 +00:00
parent fdc292e91a
commit de1e7ac957
4 changed files with 1195 additions and 1181 deletions
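Most hunks below apply the same two micro-optimizations named in the commit message: loop bodies get an ALIGN 16, and inc/dec of the index register is replaced by lea, which does the same arithmetic without writing EFLAGS and therefore never disturbs the carry flag that adc/sbb/rcl have to keep alive from one iteration to the next. As a rough illustration only (this function is not part of ttmath; the name, signature and portable fallback are invented for the sketch, and count is assumed to be >= 1 as in the ttmath loops), the 32-bit add-with-carry loops in the diff compute roughly the following:

    // Sketch only: multi-word addition with carry, in the style of the
    // MSVC inline-asm loops below. Assumes a 32-bit MSVC target for __asm.
    unsigned int add_with_carry(unsigned int * dst, const unsigned int * src,
                                unsigned int count, unsigned int c)
    {
        unsigned int carry_out = 0;
    #if defined(_MSC_VER) && defined(_M_IX86)
        __asm
        {
            mov ebx, [dst]
            mov esi, [src]
            mov ecx, [count]
            xor edx, edx              // edx = word index
            xor eax, eax
            sub eax, [c]              // CF = (c != 0)
        p:
            mov eax, [esi+edx*4]
            adc [ebx+edx*4], eax
            lea edx, [edx+1]          // inc edx, but without touching EFLAGS
            dec ecx                   // dec does not modify CF
            jnz p
            setc al
            movzx eax, al
            mov [carry_out], eax
        }
    #else
        // Portable fallback, same arithmetic.
        unsigned int carry = (c != 0) ? 1u : 0u;
        for (unsigned int i = 0; i < count; ++i)
        {
            unsigned long long t = (unsigned long long)dst[i] + src[i] + carry;
            dst[i] = (unsigned int)t;
            carry  = (unsigned int)(t >> 32);
        }
        carry_out = carry;
    #endif
        return carry_out;
    }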

View File

@@ -3434,7 +3434,7 @@ private:
*/
int FromString_ReadScientificIfExists(const tchar_t * & source)
{
int c = 0;
uint c = 0;
bool scientific_read = false;
const tchar_t * before_scientific = source;

View File

@@ -37,6 +37,7 @@ namespace ttmath
{
#if defined(_MSC_VER)
#include <windows.h>
#if defined(_UNICODE)
typedef wchar_t tchar_t;
typedef std::wstring tstr_t;
@@ -71,20 +72,20 @@ namespace ttmath
public:
clsCrit(void)
{
::InitializeCriticalSection(&_Crit);
InitializeCriticalSection(&_Crit);
}
virtual ~clsCrit(void)
{
::DeleteCriticalSection(&_Crit);
DeleteCriticalSection(&_Crit);
}
void Enter(void) const
{
::EnterCriticalSection(&_Crit);
EnterCriticalSection(&_Crit);
}
void Leave(void) const
{
::LeaveCriticalSection(&_Crit);
LeaveCriticalSection(&_Crit);
}
};
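The clsCrit class in the hunk above is a thin wrapper around a Win32 CRITICAL_SECTION (the only difference between the two variants shown is the explicit :: scope qualifier on the Win32 calls). A minimal usage sketch, assuming the clsCrit declaration above and a Windows build; the clsLock guard and the counter are illustrative, not part of ttmath:

    // Sketch only: a scope guard built on top of the clsCrit wrapper above.
    class clsLock
    {
        const clsCrit & _crit;
    public:
        explicit clsLock(const clsCrit & crit) : _crit(crit) { _crit.Enter(); }
        ~clsLock()                                            { _crit.Leave(); }
    private:
        clsLock(const clsLock &);              // non-copyable (C++03 style)
        clsLock & operator=(const clsLock &);
    };

    static clsCrit shared_crit;                // hypothetical shared lock
    static int     shared_counter = 0;         // hypothetical shared state

    void BumpCounter()
    {
        clsLock lock(shared_crit);             // Enter() now, Leave() on scope exit
        ++shared_counter;
    }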

View File

@@ -98,11 +98,12 @@ namespace ttmath
sub eax,[c] // CF=c
ALIGN 16
p:
mov eax,[esi+edx*4]
adc [ebx+edx*4],eax
mov eax,[esi+edx*4+0]
adc [ebx+edx*4+0],eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -190,12 +191,13 @@ namespace ttmath
mov eax, [value]
ALIGN 16
p:
add [ebx+edx*4], eax
jnc end
mov eax, 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -295,17 +297,18 @@ namespace ttmath
mov eax, [x1]
add [ebx+edx*4], eax
inc edx
dec ecx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
lea ecx, [ecx-1]
mov eax, [x2]
ALIGN 16
p:
adc [ebx+edx*4], eax
jnc end
mov eax, 0
inc edx
xor eax, eax
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -392,11 +395,12 @@ namespace ttmath
sub eax, [c]
ALIGN 16
p:
mov eax, [esi+edx*4]
sbb [ebx+edx*4], eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -480,12 +484,13 @@ namespace ttmath
mov eax, [value]
ALIGN 16
p:
sub [ebx+edx*4], eax
jnc end
mov eax, 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -564,15 +569,16 @@ namespace ttmath
mov ecx, [b]
ALIGN 16
p:
rcl dword ptr [ebx+edx*4], 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
setc dl
movzx eax, dl
setc al
movzx eax, al
}
#endif
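The loop above (and rcl_x64 in the x64 file below) shifts the whole multi-word number left by one bit by rotating each word through the carry flag, low word first, with the initial carry fed into the lowest bit. A portable sketch of the same operation, assuming 32-bit words; the function name is illustrative:

    // Sketch only: big-number shift left by one bit through the carry flag.
    unsigned int rcl_one(unsigned int * table, unsigned int n, unsigned int c)
    {
        unsigned int carry = (c != 0) ? 1u : 0u;   // mirrors 'neg r8': CF = (c != 0)
        for (unsigned int i = 0; i < n; ++i)
        {
            unsigned int top = table[i] >> 31;     // the bit that falls out of this word
            table[i] = (table[i] << 1) | carry;    // rcl by 1: previous carry enters at bit 0
            carry = top;
        }
        return carry;                              // mirrors 'setc al / movzx eax, al'
    }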
@@ -633,21 +639,21 @@ namespace ttmath
#ifndef __GNUC__
__asm
{
mov ebx, [p1]
xor ecx, ecx
sub ecx, [c]
mov ebx, [p1]
mov ecx, [b]
ALIGN 16
p:
rcr dword ptr [ebx+ecx*4-4], 1
dec ecx
jnz p
setc cl
movzx eax, cl
setc al
movzx eax, al
}
#endif
@@ -724,6 +730,7 @@ namespace ttmath
or eax, eax
cmovnz esi, [mask] // if c then old value = mask
ALIGN 16
p:
rol dword ptr [ebx+edx*4], cl
@@ -733,7 +740,7 @@ namespace ttmath
or [ebx+edx*4], esi // saving old value
mov esi, eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec edi
jnz p
@@ -839,6 +846,7 @@ namespace ttmath
or eax, eax
cmovnz esi, [mask] // if c then old value = mask
ALIGN 16
p:
ror dword ptr [ebx+edx*4], cl

View File

@@ -31,23 +31,21 @@ adc_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
adc_x64 ENDP
@@ -66,22 +64,20 @@ addindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
add qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
done:
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
addindexed_x64 ENDP
@@ -100,28 +96,32 @@ addindexed2_x64 PROC
; r9 = nValue1
; [esp+0x28] = nValue2
xor rax, rax ; return value
mov r11, rcx ; table
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [esp+028h] ; r10 = nValue2
add qword ptr [r11 + r8 * 8], r10
inc r8
lea r8, [r8+1]
ALIGN 16
loop1:
adc qword ptr [r11 + r8 * 8], r9
jnc done
jc next
ret
inc r8
mov r9, 0 ; set to 0 -> cy still set!
next:
lea r8, [r8+1]
xor r9, r9 ; set to 0 -> cy still set!
dec rdx
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
done:
xor rax, rax
ret
return_1:
mov rax, 1
return_1:
lea rax, [rax+1]
ret
addindexed2_x64 ENDP
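addindexed2_x64 above (like its 32-bit counterpart earlier in the diff) adds one value at a given word index and a second value at the next index, then lets the carry ripple upward until it dies out; the hot path is the early exit when nothing further carries. A rough portable sketch of the same arithmetic, written with 32-bit words for brevity; the name and signature are invented for illustration:

    // Sketch only: add x1 at table[pos] and x2 at table[pos+1], then
    // propagate the carry upward. Returns the carry left over at the top.
    unsigned int add_two_words(unsigned int * table, unsigned int size,
                               unsigned int pos, unsigned int x1, unsigned int x2)
    {
        unsigned long long t = (unsigned long long)table[pos] + x1;
        table[pos] = (unsigned int)t;
        unsigned int carry = (unsigned int)(t >> 32);

        unsigned int add = x2;                 // x2 is added once, then only the carry travels
        for (unsigned int i = pos + 1; i < size; ++i)
        {
            t = (unsigned long long)table[i] + add + carry;
            table[i] = (unsigned int)t;
            carry = (unsigned int)(t >> 32);
            add = 0;
            if (carry == 0)                    // the usual case: stop as soon as nothing carries
                break;
        }
        return carry;
    }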
@@ -142,23 +142,20 @@ sbb_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
sbb_x64 ENDP
@@ -176,11 +173,13 @@ subindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
@@ -210,9 +209,11 @@ rcl_x64 PROC
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
inc r10
lea r10, [r10+1]
dec rdx
jnz loop1
@@ -236,6 +237,8 @@ rcr_x64 PROC
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
@@ -304,6 +307,7 @@ rcl2_x64 PROC
mov r9, rax ; r9 = index (0..nSize-1)
ALIGN 16
loop1:
rol qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@@ -312,7 +316,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
inc r9
lea r9, [r9+1]
dec rdx
jnz loop1
@@ -352,8 +356,9 @@ rcr2_x64 PROC
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rdx ; r9 = index (0..nSize-1)
dec r9
lea r9, [r9-1]
ALIGN 16
loop1:
ror qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@@ -362,7 +367,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
dec r9
lea r9, [r9-1]
dec rdx
jnz loop1
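rcl2_x64 and rcr2_x64 above (and the matching 32-bit loops earlier in the diff) move the whole number by several bits at a time: every word is rotated by cl, the bits that wrapped around the word are masked off, and the bits saved from the neighbouring word are OR-ed in. A rough portable sketch of the left-shift variant, assuming 32-bit words and 0 < bits < 32; the name and the return convention are assumptions, not taken from ttmath:

    // Sketch only: shift a multi-word number left by 'bits' bits (0 < bits < 32),
    // shifting a run of 1 bits in at the bottom when c != 0.
    unsigned int rcl2_sketch(unsigned int * table, unsigned int n,
                             unsigned int bits, unsigned int c)
    {
        unsigned int mask = (1u << bits) - 1u;          // the low 'bits' bits
        unsigned int prev = c ? mask : 0u;              // 'cmovnz esi, [mask]' in the asm
        for (unsigned int i = 0; i < n; ++i)
        {
            unsigned int rot     = (table[i] << bits) | (table[i] >> (32u - bits));
            unsigned int wrapped = rot & mask;          // former top bits, now at the bottom
            table[i] = (rot & ~mask) | prev;            // splice in the bits from the word below
            prev = wrapped;                             // pass this word's top bits upward
        }
        return prev;                                    // the bits shifted out of the top word
    }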