more optimizations for MSVC assembler (parallelism, prefetch optimization, loop alignment, ...)

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e
Christian Kaiser 2009-05-20 08:48:51 +00:00
parent fdc292e91a
commit de1e7ac957
4 changed files with 1195 additions and 1181 deletions
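Most hunks below apply the same two micro-optimizations named in the commit message: loop bodies get an ALIGN 16, and inc/dec of the index register is replaced by lea, which does the same arithmetic without writing EFLAGS and therefore never disturbs the carry flag that adc/sbb/rcl have to keep alive from one iteration to the next. As a rough illustration only (this function is not part of ttmath; the name, signature and portable fallback are invented for the sketch, and count is assumed to be >= 1 as in the ttmath loops), the 32-bit add-with-carry loops in the diff compute roughly the following:

    // Sketch only: multi-word addition with carry, in the style of the
    // MSVC inline-asm loops below. Assumes a 32-bit MSVC target for __asm.
    unsigned int add_with_carry(unsigned int * dst, const unsigned int * src,
                                unsigned int count, unsigned int c)
    {
        unsigned int carry_out = 0;
    #if defined(_MSC_VER) && defined(_M_IX86)
        __asm
        {
            mov ebx, [dst]
            mov esi, [src]
            mov ecx, [count]
            xor edx, edx              // edx = word index
            xor eax, eax
            sub eax, [c]              // CF = (c != 0)
        p:
            mov eax, [esi+edx*4]
            adc [ebx+edx*4], eax
            lea edx, [edx+1]          // inc edx, but without touching EFLAGS
            dec ecx                   // dec does not modify CF
            jnz p
            setc al
            movzx eax, al
            mov [carry_out], eax
        }
    #else
        // Portable fallback, same arithmetic.
        unsigned int carry = (c != 0) ? 1u : 0u;
        for (unsigned int i = 0; i < count; ++i)
        {
            unsigned long long t = (unsigned long long)dst[i] + src[i] + carry;
            dst[i] = (unsigned int)t;
            carry  = (unsigned int)(t >> 32);
        }
        carry_out = carry;
    #endif
        return carry_out;
    }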

View File

@@ -3434,7 +3434,7 @@ private:
*/
int FromString_ReadScientificIfExists(const tchar_t * & source)
{
int c = 0;
uint c = 0;
bool scientific_read = false;
const tchar_t * before_scientific = source;

View File

@@ -37,6 +37,7 @@ namespace ttmath
{
#if defined(_MSC_VER)
#include <windows.h>
#if defined(_UNICODE)
typedef wchar_t tchar_t;
typedef std::wstring tstr_t;
@@ -71,20 +72,20 @@ namespace ttmath
public:
clsCrit(void)
{
::InitializeCriticalSection(&_Crit);
InitializeCriticalSection(&_Crit);
}
virtual ~clsCrit(void)
{
::DeleteCriticalSection(&_Crit);
DeleteCriticalSection(&_Crit);
}
void Enter(void) const
{
::EnterCriticalSection(&_Crit);
EnterCriticalSection(&_Crit);
}
void Leave(void) const
{
::LeaveCriticalSection(&_Crit);
LeaveCriticalSection(&_Crit);
}
};
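The clsCrit class in the hunk above is a thin wrapper around a Win32 CRITICAL_SECTION (the only difference between the two variants shown is the explicit :: scope qualifier on the Win32 calls). A minimal usage sketch, assuming the clsCrit declaration above and a Windows build; the clsLock guard and the counter are illustrative, not part of ttmath:

    // Sketch only: a scope guard built on top of the clsCrit wrapper above.
    class clsLock
    {
        const clsCrit & _crit;
    public:
        explicit clsLock(const clsCrit & crit) : _crit(crit) { _crit.Enter(); }
        ~clsLock()                                            { _crit.Leave(); }
    private:
        clsLock(const clsLock &);              // non-copyable (C++03 style)
        clsLock & operator=(const clsLock &);
    };

    static clsCrit shared_crit;                // hypothetical shared lock
    static int     shared_counter = 0;         // hypothetical shared state

    void BumpCounter()
    {
        clsLock lock(shared_crit);             // Enter() now, Leave() on scope exit
        ++shared_counter;
    }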

View File

@@ -98,11 +98,12 @@ namespace ttmath
sub eax,[c] // CF=c
ALIGN 16
p:
mov eax,[esi+edx*4]
adc [ebx+edx*4],eax
mov eax,[esi+edx*4+0]
adc [ebx+edx*4+0],eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -190,12 +191,13 @@ namespace ttmath
mov eax, [value]
ALIGN 16
p:
add [ebx+edx*4], eax
jnc end
mov eax, 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -295,17 +297,18 @@ namespace ttmath
mov eax, [x1]
add [ebx+edx*4], eax
inc edx
dec ecx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
lea ecx, [ecx-1]
mov eax, [x2]
ALIGN 16
p:
adc [ebx+edx*4], eax
jnc end
mov eax, 0
inc edx
xor eax, eax
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -392,11 +395,12 @@ namespace ttmath
sub eax, [c]
ALIGN 16
p:
mov eax, [esi+edx*4]
sbb [ebx+edx*4], eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -480,12 +484,13 @@ namespace ttmath
mov eax, [value]
ALIGN 16
p:
sub [ebx+edx*4], eax
jnc end
mov eax, 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
@@ -564,15 +569,16 @@ namespace ttmath
mov ecx, [b]
ALIGN 16
p:
rcl dword ptr [ebx+edx*4], 1
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec ecx
jnz p
setc dl
movzx eax, dl
setc al
movzx eax, al
}
#endif
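The loop above (and rcl_x64 in the x64 file below) shifts the whole multi-word number left by one bit by rotating each word through the carry flag, low word first, with the initial carry fed into the lowest bit. A portable sketch of the same operation, assuming 32-bit words; the function name is illustrative:

    // Sketch only: big-number shift left by one bit through the carry flag.
    unsigned int rcl_one(unsigned int * table, unsigned int n, unsigned int c)
    {
        unsigned int carry = (c != 0) ? 1u : 0u;   // mirrors 'neg r8': CF = (c != 0)
        for (unsigned int i = 0; i < n; ++i)
        {
            unsigned int top = table[i] >> 31;     // the bit that falls out of this word
            table[i] = (table[i] << 1) | carry;    // rcl by 1: previous carry enters at bit 0
            carry = top;
        }
        return carry;                              // mirrors 'setc al / movzx eax, al'
    }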
@@ -633,21 +639,21 @@ namespace ttmath
#ifndef __GNUC__
__asm
{
mov ebx, [p1]
xor ecx, ecx
sub ecx, [c]
mov ebx, [p1]
mov ecx, [b]
ALIGN 16
p:
rcr dword ptr [ebx+ecx*4-4], 1
dec ecx
jnz p
setc cl
movzx eax, cl
setc al
movzx eax, al
}
#endif
@@ -724,6 +730,7 @@ namespace ttmath
or eax, eax
cmovnz esi, [mask] // if c then old value = mask
ALIGN 16
p:
rol dword ptr [ebx+edx*4], cl
@@ -733,7 +740,7 @@ namespace ttmath
or [ebx+edx*4], esi // saving old value
mov esi, eax
inc edx
lea edx, [edx+1] // inc edx, but faster (no flags dependencies)
dec edi
jnz p
@@ -839,6 +846,7 @@ namespace ttmath
or eax, eax
cmovnz esi, [mask] // if c then old value = mask
ALIGN 16
p:
ror dword ptr [ebx+edx*4], cl

View File

@@ -31,23 +31,21 @@ adc_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
adc_x64 ENDP
@@ -66,22 +64,20 @@ addindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
add qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
done:
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
addindexed_x64 ENDP
@@ -100,28 +96,32 @@ addindexed2_x64 PROC
; r9 = nValue1
; [esp+0x28] = nValue2
xor rax, rax ; return value
mov r11, rcx ; table
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [esp+028h] ; r10 = nValue2
add qword ptr [r11 + r8 * 8], r10
inc r8
lea r8, [r8+1]
ALIGN 16
loop1:
adc qword ptr [r11 + r8 * 8], r9
jnc done
jc next
ret
inc r8
mov r9, 0 ; set to 0 -> cy still set!
next:
lea r8, [r8+1]
xor r9, r9 ; set to 0 -> cy still set!
dec rdx
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
done:
xor rax, rax
ret
return_1:
mov rax, 1
return_1:
lea rax, [rax+1]
ret
addindexed2_x64 ENDP
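addindexed2_x64 above (like its 32-bit counterpart earlier in the diff) adds one value at a given word index and a second value at the next index, then lets the carry ripple upward until it dies out; the hot path is the early exit when nothing further carries. A rough portable sketch of the same arithmetic, written with 32-bit words for brevity; the name and signature are invented for illustration:

    // Sketch only: add x1 at table[pos] and x2 at table[pos+1], then
    // propagate the carry upward. Returns the carry left over at the top.
    unsigned int add_two_words(unsigned int * table, unsigned int size,
                               unsigned int pos, unsigned int x1, unsigned int x2)
    {
        unsigned long long t = (unsigned long long)table[pos] + x1;
        table[pos] = (unsigned int)t;
        unsigned int carry = (unsigned int)(t >> 32);

        unsigned int add = x2;                 // x2 is added once, then only the carry travels
        for (unsigned int i = pos + 1; i < size; ++i)
        {
            t = (unsigned long long)table[i] + add + carry;
            table[i] = (unsigned int)t;
            carry = (unsigned int)(t >> 32);
            add = 0;
            if (carry == 0)                    // the usual case: stop as soon as nothing carries
                break;
        }
        return carry;
    }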
@@ -142,23 +142,20 @@ sbb_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
setc al
movzx rax, al
return_1:
mov rax, 1
ret
sbb_x64 ENDP
@@ -176,11 +173,13 @@ subindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
@@ -210,9 +209,11 @@ rcl_x64 PROC
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
inc r10
lea r10, [r10+1]
dec rdx
jnz loop1
@@ -236,6 +237,8 @@ rcr_x64 PROC
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
@@ -304,6 +307,7 @@ rcl2_x64 PROC
mov r9, rax ; r9 = index (0..nSize-1)
ALIGN 16
loop1:
rol qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@@ -312,7 +316,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
inc r9
lea r9, [r9+1]
dec rdx
jnz loop1
@@ -352,8 +356,9 @@ rcr2_x64 PROC
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rdx ; r9 = index (0..nSize-1)
dec r9
lea r9, [r9-1]
ALIGN 16
loop1:
ror qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@@ -362,7 +367,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
dec r9
lea r9, [r9-1]
dec rdx
jnz loop1
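rcl2_x64 and rcr2_x64 above (and the matching 32-bit loops earlier in the diff) move the whole number by several bits at a time: every word is rotated by cl, the bits that wrapped around the word are masked off, and the bits saved from the neighbouring word are OR-ed in. A rough portable sketch of the left-shift variant, assuming 32-bit words and 0 < bits < 32; the name and the return convention are assumptions, not taken from ttmath:

    // Sketch only: shift a multi-word number left by 'bits' bits (0 < bits < 32),
    // shifting a run of 1 bits in at the bottom when c != 0.
    unsigned int rcl2_sketch(unsigned int * table, unsigned int n,
                             unsigned int bits, unsigned int c)
    {
        unsigned int mask = (1u << bits) - 1u;          // the low 'bits' bits
        unsigned int prev = c ? mask : 0u;              // 'cmovnz esi, [mask]' in the asm
        for (unsigned int i = 0; i < n; ++i)
        {
            unsigned int rot     = (table[i] << bits) | (table[i] >> (32u - bits));
            unsigned int wrapped = rot & mask;          // former top bits, now at the bottom
            table[i] = (rot & ~mask) | prev;            // splice in the bits from the word below
            prev = wrapped;                             // pass this word's top bits upward
        }
        return prev;                                    // the bits shifted out of the top word
    }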