More optimizations for the MSVC assembler code (parallelism, prefetch optimization, loop alignment, ...)

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Christian Kaiser 2009-05-20 08:48:51 +00:00
parent fdc292e91a
commit de1e7ac957
4 changed files with 1195 additions and 1181 deletions

View File

@ -3434,7 +3434,7 @@ private:
*/
int FromString_ReadScientificIfExists(const tchar_t * & source)
{
int c = 0;
uint c = 0;
bool scientific_read = false;
const tchar_t * before_scientific = source;

View File

@ -37,6 +37,7 @@ namespace ttmath
{
#if defined(_MSC_VER)
#include <windows.h>
#if defined(_UNICODE)
typedef wchar_t tchar_t;
typedef std::wstring tstr_t;
@ -71,20 +72,20 @@ namespace ttmath
public:
clsCrit(void)
{
::InitializeCriticalSection(&_Crit);
InitializeCriticalSection(&_Crit);
}
virtual ~clsCrit(void)
{
::DeleteCriticalSection(&_Crit);
DeleteCriticalSection(&_Crit);
}
void Enter(void) const
{
::EnterCriticalSection(&_Crit);
EnterCriticalSection(&_Crit);
}
void Leave(void) const
{
::LeaveCriticalSection(&_Crit);
LeaveCriticalSection(&_Crit);
}
};

File diff suppressed because it is too large Load Diff

View File

@ -31,23 +31,21 @@ adc_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
return_1:
mov rax, 1
setc al
movzx rax, al
ret
adc_x64 ENDP
@ -66,24 +64,22 @@ addindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
add qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
done:
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
setc al
movzx rax, al
ret
return_1:
mov rax, 1
ret
addindexed_x64 ENDP
;----------------------------------------
@ -100,28 +96,32 @@ addindexed2_x64 PROC
; r9 = nValue1
; [esp+0x28] = nValue2
xor rax, rax ; return value
mov r11, rcx ; table
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [esp+028h] ; r10 = nValue2
add qword ptr [r11 + r8 * 8], r10
inc r8
lea r8, [r8+1]
ALIGN 16
loop1:
adc qword ptr [r11 + r8 * 8], r9
jnc done
jc next
ret
inc r8
mov r9, 0 ; set to 0 -> cy still set!
next:
lea r8, [r8+1]
xor r9, r9 ; set to 0 -> cy still set!
dec rdx
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
done:
xor rax, rax
ret
return_1:
mov rax, 1
return_1:
lea rax, [rax+1]
ret
addindexed2_x64 ENDP
@ -142,23 +142,20 @@ sbb_x64 PROC
; r9 = nCarry
xor rax, rax
mov r11, 0
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
inc r11
lea r11, [r11+1]
dec r8
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
xor rax, rax
ret
return_1:
mov rax, 1
setc al
movzx rax, al
ret
sbb_x64 ENDP
@ -176,11 +173,13 @@ subindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
inc r8
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
@ -210,9 +209,11 @@ rcl_x64 PROC
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
inc r10
lea r10, [r10+1]
dec rdx
jnz loop1
@ -236,6 +237,8 @@ rcr_x64 PROC
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
@ -304,6 +307,7 @@ rcl2_x64 PROC
mov r9, rax ; r9 = index (0..nSize-1)
ALIGN 16
loop1:
rol qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@ -312,7 +316,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
inc r9
lea r9, [r9+1]
dec rdx
jnz loop1
@ -352,8 +356,9 @@ rcr2_x64 PROC
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rdx ; r9 = index (0..nSize-1)
dec r9
lea r9, [r9-1]
ALIGN 16
loop1:
ror qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
@ -362,7 +367,7 @@ loop1:
or qword ptr [r10+r9*8], rbx
mov rbx, rax
dec r9
lea r9, [r9-1]
dec rdx
jnz loop1