more optimizations for MSVC assembler (parallelism, prefetch optimization, loop alignment, ...)

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e
Author: Christian Kaiser
Date:   2009-05-20 08:48:51 +00:00
Parent: fdc292e91a
Commit: de1e7ac957
4 changed files with 1195 additions and 1181 deletions


@@ -3434,7 +3434,7 @@ private:
 	*/
 	int FromString_ReadScientificIfExists(const tchar_t * & source)
 	{
-	int c = 0;
+	uint c = 0;
 	bool scientific_read = false;
 	const tchar_t * before_scientific = source;


@@ -37,6 +37,7 @@ namespace ttmath
 {
 #if defined(_MSC_VER)
+	#include <windows.h>
 	#if defined(_UNICODE)
 		typedef wchar_t tchar_t;
 		typedef std::wstring tstr_t;
@@ -71,20 +72,20 @@ namespace ttmath
 	public:
 		clsCrit(void)
 		{
-			::InitializeCriticalSection(&_Crit);
+			InitializeCriticalSection(&_Crit);
 		}
 		virtual ~clsCrit(void)
 		{
-			::DeleteCriticalSection(&_Crit);
+			DeleteCriticalSection(&_Crit);
 		}
 		void Enter(void) const
 		{
-			::EnterCriticalSection(&_Crit);
+			EnterCriticalSection(&_Crit);
 		}
 		void Leave(void) const
 		{
-			::LeaveCriticalSection(&_Crit);
+			LeaveCriticalSection(&_Crit);
 		}
 	};
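Note: the class touched here is a thin RAII shell around a Win32 CRITICAL_SECTION; with <windows.h> now pulled in inside namespace ttmath, the global-scope `::` qualifiers are dropped. A minimal self-contained sketch of the wrapper plus a hypothetical usage follows; the mutable CRITICAL_SECTION member is an assumption (the diff does not show the data member), and g_crit/SafeIncrement are illustrative names only.

// Sketch of the clsCrit wrapper from the hunk above (member assumed).
#include <windows.h>

class clsCrit
{
public:
    clsCrit(void)          { InitializeCriticalSection(&_Crit); }
    virtual ~clsCrit(void) { DeleteCriticalSection(&_Crit); }
    void Enter(void) const { EnterCriticalSection(&_Crit); }
    void Leave(void) const { LeaveCriticalSection(&_Crit); }
private:
    mutable CRITICAL_SECTION _Crit;   // mutable so the const methods compile
};

// Hypothetical usage: serialize access to a shared counter.
static clsCrit g_crit;
static long g_counter = 0;

void SafeIncrement()
{
    g_crit.Enter();
    ++g_counter;      // protected region
    g_crit.Leave();
}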

File diff suppressed because it is too large


@@ -31,23 +31,21 @@ adc_x64 PROC
 	; r9 = nCarry
 
 	xor rax, rax
-	mov r11, 0
+	xor r11, r11
 	sub rax, r9		; sets CARRY if r9 != 0
 
+	ALIGN 16
 loop1:
 	mov rax,qword ptr [rdx + r11 * 8]
 	adc qword ptr [rcx + r11 * 8], rax
-	inc r11
+	lea r11, [r11+1]
 	dec r8
 	jnz loop1
 
-	jc return_1		; most of the times, there will be NO carry (I hope)
-	xor rax, rax
-	ret
-
-return_1:
-	mov rax, 1
+	setc al
+	movzx rax, al
 	ret
 
 adc_x64 ENDP
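Three patterns recur throughout these hunks. ALIGN 16 pads the instruction stream so each loop top starts on a 16-byte boundary, matching the commit message's "loop alignment". Replacing `inc` with `lea` keeps the index update out of the flags: `inc` preserves CF but rewrites the other arithmetic flags, creating a partial-flags dependency inside an `adc`/`sbb` carry chain, while `lea` writes no flags at all, so the address unit can compute it in parallel. And the `setc al` / `movzx rax, al` epilogue materializes the final carry as 0 or 1 without the `jc return_1` branch, which is hard to predict when carries do occur. As a reference for what adc_x64 computes, here is a hedged C++ sketch; parameter names are assumptions taken from the register comments.

// Hedged C++ reference for adc_x64: table1[i] += table2[i] with carry
// propagated across nSize 64-bit words; returns the final carry (0/1),
// the same value the branch-free setc/movzx epilogue produces.
#include <cstdint>
#include <cstddef>

uint64_t adc_reference(uint64_t * table1, const uint64_t * table2,
                       size_t nSize, uint64_t nCarry)
{
    uint64_t c = (nCarry != 0) ? 1 : 0;
    for (size_t i = 0; i < nSize; ++i)
    {
        uint64_t sum = table1[i] + table2[i];
        uint64_t c1  = (sum < table1[i]) ? 1 : 0;       // carry out of a+b
        table1[i] = sum + c;
        c = c1 + ((table1[i] < sum) ? 1 : 0);           // both cannot be 1
    }
    return c;   // 0 or 1
}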
@@ -66,24 +64,22 @@ addindexed_x64 PROC
 	; r9 = nValue
 
 	sub rdx, r8		; rdx = remaining count of uints
 
+	ALIGN 16
 loop1:
 	add qword ptr [rcx + r8 * 8], r9
 	jnc done
-	inc r8
+	lea r8, [r8+1]
 	mov r9, 1
 	dec rdx
 	jnz loop1
 
 done:
-	jc return_1		; most of the times, there will be NO carry (I hope)
-	xor rax, rax
+	setc al
+	movzx rax, al
 	ret
-
-return_1:
-	mov rax, 1
-	ret
 
 addindexed_x64 ENDP
 
 ;----------------------------------------
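addindexed_x64 gets the same alignment and `lea` treatment. For reference, its intended semantics in the same hedged C++ style (names assumed): add nValue at word nPos and ripple the carry upward, stopping as soon as a word does not overflow.

// Hedged C++ reference for addindexed_x64 (names assumed).
#include <cstdint>
#include <cstddef>

uint64_t addindexed_reference(uint64_t * table, size_t nSize,
                              size_t nPos, uint64_t nValue)
{
    for (size_t i = nPos; i < nSize; ++i)
    {
        table[i] += nValue;
        if (table[i] >= nValue)   // no wrap-around: carry chain stops here
            return 0;
        nValue = 1;               // from now on only the carry moves up
    }
    return 1;                     // carried out of the most significant word
}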
@@ -100,28 +96,32 @@ addindexed2_x64 PROC
 	; r9 = nValue1
 	; [esp+0x28] = nValue2
 
+	xor rax, rax		; return value
 	mov r11, rcx		; table
 	sub rdx, r8		; rdx = remaining count of uints
 	mov r10, [esp+028h]	; r10 = nValue2
 
 	add qword ptr [r11 + r8 * 8], r10
-	inc r8
+	lea r8, [r8+1]
 
+	ALIGN 16
 loop1:
 	adc qword ptr [r11 + r8 * 8], r9
-	jnc done
-
-	inc r8
-	mov r9, 0		; set to 0 -> cy still set!
+	jc next
+	ret
+
+next:
+	lea r8, [r8+1]
+	xor r9, r9		; set to 0 -> cy still set!
 	dec rdx
 	jnz loop1
 
 	jc return_1		; most of the times, there will be NO carry (I hope)
 
 done:
-	xor rax, rax
 	ret
 
 return_1:
-	mov rax, 1
+	lea rax, [rax+1]
 	ret
 
 addindexed2_x64 ENDP
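In addindexed2_x64 the restructuring goes further: rax is zeroed once up front, the hot no-carry path falls through `jc next` straight into `ret`, and the rare carry-out return builds 1 with the flag-neutral `lea rax, [rax+1]`. A hedged C++ reference of the intended semantics (names assumed):

// Hedged C++ reference for addindexed2_x64 (names assumed): add nValue2
// at index nPos, add nValue1 plus that carry at nPos+1, then ripple any
// remaining carry; returns the carry out of the most significant word.
#include <cstdint>
#include <cstddef>

uint64_t addindexed2_reference(uint64_t * table, size_t nSize, size_t nPos,
                               uint64_t nValue1, uint64_t nValue2)
{
    table[nPos] += nValue2;
    uint64_t c = (table[nPos] < nValue2) ? 1 : 0;
    for (size_t i = nPos + 1; i < nSize; ++i)
    {
        uint64_t t  = table[i] + nValue1;
        uint64_t c1 = (t < table[i]) ? 1 : 0;
        table[i] = t + c;
        c = c1 + ((table[i] < t) ? 1 : 0);
        if (c == 0)
            return 0;             // hot path: no carry left, done
        nValue1 = 0;              // nValue1 is consumed in the first pass
    }
    return c;
}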
@@ -142,23 +142,20 @@ sbb_x64 PROC
 	; r9 = nCarry
 
 	xor rax, rax
-	mov r11, 0
+	xor r11, r11
 	sub rax, r9		; sets CARRY if r9 != 0
 
+	ALIGN 16
 loop1:
 	mov rax,qword ptr [rdx + r11 * 8]
 	sbb qword ptr [rcx + r11 * 8], rax
-	inc r11
+	lea r11, [r11+1]
 	dec r8
 	jnz loop1
 
-	jc return_1		; most of the times, there will be NO carry (I hope)
-	xor rax, rax
-	ret
-
-return_1:
-	mov rax, 1
+	setc al
+	movzx rax, al
 	ret
 
 sbb_x64 ENDP
@@ -176,11 +173,13 @@ subindexed_x64 PROC
 	; r9 = nValue
 
 	sub rdx, r8		; rdx = remaining count of uints
 
+	ALIGN 16
 loop1:
 	sub qword ptr [rcx + r8 * 8], r9
 	jnc done
-	inc r8
+	lea r8, [r8+1]
 	mov r9, 1
 	dec rdx
 	jnz loop1
@@ -210,9 +209,11 @@ rcl_x64 PROC
 	mov r11, rcx		; table
 	xor r10, r10
 	neg r8			; CY set if r8 <> 0
 
+	ALIGN 16
 loop1:
 	rcl qword ptr [r11 + r10 * 8], 1
-	inc r10
+	lea r10, [r10+1]
 	dec rdx
 	jnz loop1
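The rotate helpers gain only the loop alignment and the flag-neutral `lea` increment, since `rcl`/`rcr` thread the carry from one word to the next and nothing inside the loop may clobber CF (`dec` and `lea` both leave it alone). A hedged C++ sketch of what rcl_x64 computes (names assumed):

// Hedged C++ sketch for rcl_x64 (names assumed): shift the whole
// nSize-word table left by one bit, feeding c into bit 0 of word 0
// and returning the bit shifted out of the top word.
#include <cstdint>
#include <cstddef>

uint64_t rcl_reference(uint64_t * table, size_t nSize, uint64_t c)
{
    c = (c != 0) ? 1 : 0;
    for (size_t i = 0; i < nSize; ++i)
    {
        uint64_t out = table[i] >> 63;       // bit that leaves this word
        table[i] = (table[i] << 1) | c;      // previous carry enters bit 0
        c = out;
    }
    return c;
}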
@@ -236,6 +237,8 @@ rcr_x64 PROC
 	xor r10, r10
 	neg r8			; CY set if r8 <> 0
 
+	ALIGN 16
 loop1:
 	rcr qword ptr -8[rcx + rdx * 8], 1
 	dec rdx
@@ -304,6 +307,7 @@ rcl2_x64 PROC
 	mov r9, rax		; r9 = index (0..nSize-1)
 
+	ALIGN 16
 loop1:
 	rol qword ptr [r10+r9*8], cl
 	mov rax, qword ptr [r10+r9*8]
@@ -312,7 +316,7 @@ loop1:
 	or qword ptr [r10+r9*8], rbx
 	mov rbx, rax
 
-	inc r9
+	lea r9, [r9+1]
 	dec rdx
 	jnz loop1
@@ -352,8 +356,9 @@ rcr2_x64 PROC
 	cmovnz rbx, r11		; if (c) then old value = mask
 
 	mov r9, rdx		; r9 = index (0..nSize-1)
-	dec r9
+	lea r9, [r9-1]
 
+	ALIGN 16
 loop1:
 	ror qword ptr [r10+r9*8], cl
 	mov rax, qword ptr [r10+r9*8]
@@ -362,7 +367,7 @@ loop1:
 	or qword ptr [r10+r9*8], rbx
 	mov rbx, rax
 
-	dec r9
+	lea r9, [r9-1]
 	dec rdx
 	jnz loop1