From 91e7badb623d0521347d7f50f74460e8e73d780c Mon Sep 17 00:00:00 2001 From: Tomasz Sowa Date: Wed, 29 Oct 2008 21:54:27 +0000 Subject: [PATCH] changed: asm code in: UInt::Sub, UInt::SubInt (32 and 64bit) (much faster now) changed: asm code in: UInt::Rcl2, UInt::Rcr2 (32 and 64bit) previous versions of Rcl2 and Rcr2 had O(n2) complexity, now they have O(n) and are much faster changed: now we do not use LAHF and SAHF instructions (both in 32 and 64 bit code) git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@83 e52654a7-88a9-db11-a3e9-0013d4bc506e --- tests/Makefile | 2 +- tests/main.cpp | 2 - ttmath/ttmathuint.h | 399 ++++++++++++++++++++---------------------- ttmath/ttmathuint64.h | 243 +++++++++++-------------- 4 files changed, 291 insertions(+), 355 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index 3985d27..be1c0ca 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,6 +1,6 @@ CC = g++ o = main.o uinttest.o -CFLAGS = -Wall +CFLAGS = -Wall -O2 -s ttmath = .. name = tests diff --git a/tests/main.cpp b/tests/main.cpp index 1305c31..92abd8f 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -120,7 +120,5 @@ using namespace ttmath; test_uint(); - - return 0; } diff --git a/ttmath/ttmathuint.h b/ttmath/ttmathuint.h index 06bbd3b..b31e810 100644 --- a/ttmath/ttmathuint.h +++ b/ttmath/ttmathuint.h @@ -574,36 +574,31 @@ public: push ebx push ecx push edx + push esi mov ecx,[b] mov ebx,[p1] - mov edx,[p2] + mov esi,[p2] - mov eax,0 - sub eax,[c] + xor eax, eax + mov edx, eax + + sub eax, [c] p: - mov eax,[ebx] - sbb eax,[edx] - mov [ebx],eax - - inc ebx - inc ebx - inc ebx - inc ebx + mov eax, [esi+edx*4] + sbb [ebx+edx*4], eax inc edx - inc edx - inc edx - inc edx + dec ecx + jnz p - loop p - - mov eax,0 - adc eax,eax - mov [c],eax + setc al + movzx edx, al + mov [c], edx + pop esi pop edx pop ecx pop ebx @@ -616,40 +611,28 @@ public: #ifdef __GNUC__ __asm__ __volatile__( - "push %%ebx \n" - "push %%ecx \n" - "push %%edx \n" + "push %%ecx \n" + + "xorl %%eax, %%eax \n" + "movl %%eax, %%edx \n" + "subl %%edi, %%eax \n" - "movl $0, %%eax \n" - "subl %%esi, %%eax \n" - "1: \n" - "movl (%%ebx),%%eax \n" - "sbbl (%%edx),%%eax \n" - "movl %%eax,(%%ebx) \n" - - "inc %%ebx \n" - "inc %%ebx \n" - "inc %%ebx \n" - "inc %%ebx \n" - - "inc %%edx \n" - "inc %%edx \n" - "inc %%edx \n" - "inc %%edx \n" - - "loop 1b \n" + "1: \n" + "movl (%%esi,%%edx,4),%%eax \n" + "sbbl %%eax, (%%ebx,%%edx,4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" - "movl $0, %%eax \n" - "adcl %%eax,%%eax \n" - "movl %%eax, %%esi \n" + "setc %%al \n" + "movzx %%al,%%edx \n" - "pop %%edx \n" - "pop %%ecx \n" - "pop %%ebx \n" + "pop %%ecx \n" - : "=S" (c) - : "0" (c), "c" (b), "b" (p1), "d" (p2) + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) : "%eax", "cc", "memory" ); #endif @@ -696,32 +679,23 @@ public: sub ecx, [index] mov edx, [index] - mov eax, [p1] - - lea ebx, [eax+4*edx] - mov edx, [value] + mov ebx, [p1] + + mov eax, [value] - clc p: - mov eax, [ebx] - sbb eax, edx - mov [ebx], eax - + sub [ebx+edx*4], eax jnc end - mov edx, 0 - inc ebx - inc ebx - inc ebx - inc ebx - - loop p + mov eax, 1 + inc edx + dec ecx + jnz p end: - - mov eax,0 - adc eax,eax - mov [c],eax + setc al + movzx edx, al + mov [c], edx pop edx pop ecx @@ -734,44 +708,29 @@ public: #ifdef __GNUC__ __asm__ __volatile__( - "push %%ebx \n" + "push %%eax \n" "push %%ecx \n" - "push %%edx \n" "subl %%edx, %%ecx \n" - "leal (%%ebx,%%edx,4), %%ebx \n" - - "movl %%esi, %%edx \n" - "clc \n" "1: \n" - - "movl (%%ebx), %%eax \n" - "sbbl %%edx, %%eax \n" - "movl %%eax, (%%ebx) \n" - + "subl %%eax, (%%ebx,%%edx,4) \n" "jnc 2f \n" - - "movl $0, %%edx \n" - - "inc %%ebx \n" - "inc %%ebx \n" - "inc %%ebx \n" - "inc %%ebx \n" - - "loop 1b \n" + + "movl $1, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" "2: \n" + "setc %%al \n" + "movzx %%al, %%edx \n" - "movl $0, %%eax \n" - "adcl %%eax,%%eax \n" - - "pop %%edx \n" "pop %%ecx \n" - "pop %%ebx \n" + "pop %%eax \n" - : "=a" (c) - : "c" (b), "d" (index), "b" (p1), "S" (value) + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) : "cc", "memory" ); #endif @@ -803,6 +762,7 @@ public: private: + public: ///// !!!!!! #ifdef TTMATH_PLATFORM32 @@ -828,7 +788,7 @@ private: register sint b = value_size; register uint * p1 = table; - + register uint mask; #ifndef __GNUC__ __asm @@ -837,40 +797,45 @@ private: push ebx push ecx push edx + push esi + push edi - mov edx, [bits] - - a: - xor eax, eax - sub eax, [c] + mov edi, [b] - mov ecx, [b] + mov ecx, 32 + sub ecx, [bits] + mov edx, -1 + shr edx, cl + mov [mask], edx + + mov ecx, [bits] mov ebx, [p1] - lahf - p: - sahf - rcl dword ptr[ebx],1 - lahf + xor edx, edx // edx = 0 + mov esi, edx // old value = 0 - add ebx,4 + mov eax, [c] + or eax, eax + cmovnz esi, [mask] // if c then old value = mask - sub ecx,1 + p: + rol dword ptr [ebx+edx*4], cl + + mov eax, [ebx+edx*4] + and eax, [mask] + xor [ebx+edx*4], eax // clearing bits + or [ebx+edx*4], esi // saving old value + mov esi, eax + + inc edx + dec edi jnz p - sub edx,1 - jnz a - - sahf - setc al - - // - // movzx dword ptr [c],al - // - movzx edx, al - mov [c], edx - // + and eax, 1 + mov [c], eax + pop edi + pop esi pop edx pop ecx pop ebx @@ -881,43 +846,47 @@ private: #ifdef __GNUC__ __asm__ __volatile__( - - "push %%esi \n" + + "push %%edx \n" + "push %%esi \n" + "push %%edi \n" - "2: \n" - - "xorl %%eax,%%eax \n" - "subl %%edx,%%eax \n" + "movl %%ecx, %%esi \n" + "movl $32, %%ecx \n" + "subl %%esi, %%ecx \n" + "movl $-1, %%edx \n" + "shrl %%cl, %%edx \n" + "movl %%edx, %[amask] \n" + "movl %%esi, %%ecx \n" - "push %%ebx \n" - "push %%ecx \n" + "xorl %%edx, %%edx \n" + "movl %%edx, %%esi \n" - "lahf \n" - "1: \n" - "sahf \n" - "rcll $1,(%%ebx) \n" - "lahf \n" + "orl %%eax, %%eax \n" + "cmovnz %[amask], %%esi \n" - "addl $4,%%ebx \n" + "1: \n" + "roll %%cl, (%%ebx,%%edx,4) \n" - "subl $1,%%ecx \n" - "jnz 1b \n" + "movl (%%ebx,%%edx,4), %%eax \n" + "andl %[amask], %%eax \n" + "xorl %%eax, (%%ebx,%%edx,4) \n" + "orl %%esi, (%%ebx,%%edx,4) \n" + "movl %%eax, %%esi \n" - "pop %%ecx \n" - "pop %%ebx \n" + "incl %%edx \n" + "decl %%edi \n" + "jnz 1b \n" + + "and $1, %%eax \n" - "subl $1,%%esi \n" - "jnz 2b \n" + "pop %%edi \n" + "pop %%esi \n" + "pop %%edx \n" - "sahf \n" - "setc %%dl \n" - "movzx %%dl, %%edx \n" - - "pop %%esi \n" - - : "=d" (c) - : "0" (c), "c" (b), "b" (p1), "S" (bits) - : "%eax", "cc", "memory" ); + : "=a" (c) + : "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask) + : "cc", "memory" ); #endif @@ -947,7 +916,7 @@ private: register sint b = value_size; register uint * p1 = table; - + register uint mask; #ifndef __GNUC__ __asm @@ -956,42 +925,48 @@ private: push ebx push ecx push edx + push esi + push edi - mov edx,[bits] + mov edi, [b] - a: + mov ecx, 32 + sub ecx, [bits] + mov edx, -1 + shl edx, cl + mov [mask], edx - xor eax,eax - sub eax,[c] + mov ecx, [bits] + mov ebx, [p1] - mov ebx,[p1] - mov ecx,[b] - lea ebx,[ebx+4*ecx] + xor edx, edx // edx = 0 + mov esi, edx // old value = 0 + add edx, edi + dec edx // edx - is pointing at the last word + + mov eax, [c] + or eax, eax + cmovnz esi, [mask] // if c then old value = mask - lahf p: - sub ebx,4 - - sahf - rcr dword ptr [ebx],1 - lahf - - sub ecx,1 - jnz p + ror dword ptr [ebx+edx*4], cl - sub edx,1 - jnz a + mov eax, [ebx+edx*4] + and eax, [mask] + xor [ebx+edx*4], eax // clearing bits + or [ebx+edx*4], esi // saving old value + mov esi, eax - sahf - setc al + dec edx + dec edi + jnz p - // - // movzx dword ptr [c],al - // - movzx edx, al - mov [c], edx - // + rol eax, 1 // 31bit will be first + and eax, 1 + mov [c], eax + pop edi + pop esi pop edx pop ecx pop ebx @@ -1003,46 +978,49 @@ private: #ifdef __GNUC__ __asm__ __volatile__( - "push %%esi \n" - - - "2: \n" - - "push %%ebx \n" - "push %%ecx \n" - - "leal (%%ebx,%%ecx,4),%%ebx \n" + "push %%edx \n" + "push %%esi \n" + "push %%edi \n" - "xorl %%eax, %%eax \n" - "subl %%edx, %%eax \n" + "movl %%ecx, %%esi \n" + "movl $32, %%ecx \n" + "subl %%esi, %%ecx \n" + "movl $-1, %%edx \n" + "shll %%cl, %%edx \n" + "movl %%edx, %[amask] \n" + "movl %%esi, %%ecx \n" - "lahf \n" - "1: \n" - "subl $4,%%ebx \n" + "xorl %%edx, %%edx \n" + "movl %%edx, %%esi \n" + "addl %%edi, %%edx \n" + "decl %%edx \n" - "sahf \n" - "rcrl $1,(%%ebx) \n" - "lahf \n" + "orl %%eax, %%eax \n" + "cmovnz %[amask], %%esi \n" - "subl $1,%%ecx \n" - "jnz 1b \n" + "1: \n" + "rorl %%cl, (%%ebx,%%edx,4) \n" - "pop %%ecx \n" - "pop %%ebx \n" + "movl (%%ebx,%%edx,4), %%eax \n" + "andl %[amask], %%eax \n" + "xorl %%eax, (%%ebx,%%edx,4) \n" + "orl %%esi, (%%ebx,%%edx,4) \n" + "movl %%eax, %%esi \n" + + "decl %%edx \n" + "decl %%edi \n" + "jnz 1b \n" + + "roll $1, %%eax \n" + "andl $1, %%eax \n" - "subl $1,%%esi \n" + "pop %%edi \n" + "pop %%esi \n" + "pop %%edx \n" - "jnz 2b \n" - - "sahf \n" - "setc %%dl \n" - "movzx %%dl, %%edx \n" - - "pop %%esi \n" - - : "=d" (c) - : "0" (c), "c" (b), "b" (p1), "S" (bits) - : "%eax", "cc", "memory" ); + : "=a" (c) + : "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask) + : "cc", "memory" ); #endif @@ -3451,6 +3429,7 @@ public: #ifdef TTMATH_PLATFORM64 private: +public: uint Rcl2(uint bits, uint c); uint Rcr2(uint bits, uint c); diff --git a/ttmath/ttmathuint64.h b/ttmath/ttmathuint64.h index ec33c73..9000fa4 100644 --- a/ttmath/ttmathuint64.h +++ b/ttmath/ttmathuint64.h @@ -431,50 +431,31 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - "push %%rbx \n" - "push %%rcx \n" - "push %%rdx \n" + "push %%rcx \n" + + "xorq %%rax, %%rax \n" + "movq %%rax, %%rdx \n" + "subq %%rdi, %%rax \n" - "movq $0, %%rax \n" - "subq %%rsi, %%rax \n" - "1: \n" - "movq (%%rbx),%%rax \n" - "sbbq (%%rdx),%%rax \n" - "movq %%rax,(%%rbx) \n" - - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - "inc %%rdx \n" - - "loop 1b \n" + "1: \n" + "movq (%%rsi,%%rdx,8),%%rax \n" + "sbbq %%rax, (%%rbx,%%rdx,8) \n" + + "incq %%rdx \n" + "decq %%rcx \n" + "jnz 1b \n" - "movq $0, %%rax \n" - "adcq %%rax,%%rax \n" - "movq %%rax, %%rsi \n" + "setc %%al \n" + "movzx %%al,%%rdx \n" - "pop %%rdx \n" - "pop %%rcx \n" - "pop %%rbx \n" + "pop %%rcx \n" - : "=S" (c) - : "0" (c), "c" (b), "b" (p1), "d" (p2) + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) : "%rax", "cc", "memory" ); + #endif @@ -515,48 +496,29 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - "push %%rbx \n" + "push %%rax \n" "push %%rcx \n" - "push %%rdx \n" "subq %%rdx, %%rcx \n" - "leaq (%%rbx,%%rdx,8), %%rbx \n" - - "movq %%rsi, %%rdx \n" - "clc \n" "1: \n" - - "movq (%%rbx), %%rax \n" - "sbbq %%rdx, %%rax \n" - "movq %%rax, (%%rbx) \n" - + "subq %%rax, (%%rbx,%%rdx,8) \n" "jnc 2f \n" - - "movq $0, %%rdx \n" - - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - "inc %%rbx \n" - - "loop 1b \n" + + "movq $1, %%rax \n" + "incq %%rdx \n" + "decq %%rcx \n" + "jnz 1b \n" "2: \n" + "setc %%al \n" + "movzx %%al, %%rdx \n" - "movq $0, %%rax \n" - "adcq %%rax,%%rax \n" - - "pop %%rdx \n" "pop %%rcx \n" - "pop %%rbx \n" + "pop %%rax \n" - : "=a" (c) - : "c" (b), "d" (index), "b" (p1), "S" (value) + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) : "cc", "memory" ); #endif @@ -590,6 +552,7 @@ namespace ttmath register sint b = value_size; register uint * p1 = table; + register uint mask; #ifndef __GNUC__ #error "another compiler than GCC is currently not supported in 64bit mode" @@ -598,47 +561,46 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - "push %%rsi \n" + "push %%rdx \n" + "push %%rsi \n" + "push %%rdi \n" - - "2: \n" + "movq %%rcx, %%rsi \n" + "movq $64, %%rcx \n" + "subq %%rsi, %%rcx \n" + "movq $-1, %%rdx \n" + "shrq %%cl, %%rdx \n" + "movq %%rdx, %[amask] \n" + "movq %%rsi, %%rcx \n" - "xorq %%rax,%%rax \n" - "subq %%rdx,%%rax \n" + "xorq %%rdx, %%rdx \n" + "movq %%rdx, %%rsi \n" - "push %%rbx \n" - "push %%rcx \n" + "orq %%rax, %%rax \n" + "cmovnz %[amask], %%rsi \n" - //"lahf \n" - ".byte 0x9f \n" - "1: \n" - //"sahf \n" - ".byte 0x9e \n" - "rclq $1,(%%rbx) \n" - //"lahf \n" - ".byte 0x9f \n" + "1: \n" + "rolq %%cl, (%%rbx,%%rdx,8) \n" - "addq $8,%%rbx \n" - - "subq $1,%%rcx \n" - "loop 1b \n" + "movq (%%rbx,%%rdx,8), %%rax \n" + "andq %[amask], %%rax \n" + "xorq %%rax, (%%rbx,%%rdx,8) \n" + "orq %%rsi, (%%rbx,%%rdx,8) \n" + "movq %%rax, %%rsi \n" - "pop %%rcx \n" - "pop %%rbx \n" + "incq %%rdx \n" + "decq %%rdi \n" + "jnz 1b \n" + + "and $1, %%rax \n" - "subq $1,%%rsi \n" - "jnz 2b \n" + "pop %%rdi \n" + "pop %%rsi \n" + "pop %%rdx \n" - "xor %%rdx,%%rdx \n" - //"sahf \n" - ".byte 0x9e \n" - "setc %%dl \n" - - "pop %%rsi \n" - - : "=d" (c) - : "0" (c), "c" (b), "b" (p1), "S" (bits) - : "%rax", "cc", "memory" ); + : "=a" (c) + : "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask) + : "cc", "memory" ); #endif @@ -671,7 +633,7 @@ namespace ttmath register sint b = value_size; register uint * p1 = table; - + register uint mask; #ifndef __GNUC__ #error "another compiler than GCC is currently not supported in 64bit mode" @@ -681,52 +643,49 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - "push %%rsi \n" + "push %%rdx \n" + "push %%rsi \n" + "push %%rdi \n" + + "movq %%rcx, %%rsi \n" + "movq $64, %%rcx \n" + "subq %%rsi, %%rcx \n" + "movq $-1, %%rdx \n" + "shlq %%cl, %%rdx \n" + "movq %%rdx, %[amask] \n" + "movq %%rsi, %%rcx \n" + "xorq %%rdx, %%rdx \n" + "movq %%rdx, %%rsi \n" + "addq %%rdi, %%rdx \n" + "decq %%rdx \n" - "2: \n" + "orq %%rax, %%rax \n" + "cmovnz %[amask], %%rsi \n" + "1: \n" + "rorq %%cl, (%%rbx,%%rdx,8) \n" - "push %%rbx \n" - "push %%rcx \n" + "movq (%%rbx,%%rdx,8), %%rax \n" + "andq %[amask], %%rax \n" + "xorq %%rax, (%%rbx,%%rdx,8) \n" + "orq %%rsi, (%%rbx,%%rdx,8) \n" + "movq %%rax, %%rsi \n" + + "decq %%rdx \n" + "decq %%rdi \n" + "jnz 1b \n" + + "rolq $1, %%rax \n" + "andq $1, %%rax \n" - "leaq (%%rbx,%%rcx,8),%%rbx \n" + "pop %%rdi \n" + "pop %%rsi \n" + "pop %%rdx \n" - "xorq %%rax, %%rax \n" - "subq %%rdx, %%rax \n" - - //"lahf \n" - ".byte 0x9f \n" - "1: \n" - "subq $8, %%rbx \n" - - //"sahf \n" - ".byte 0x9e \n" - - "rcrq $1,(%%rbx) \n" - //"lahf \n" - ".byte 0x9f \n" - - "subq $1,%%rcx \n" - "jnz 1b \n" - - "pop %%rcx \n" - "pop %%rbx \n" - - "subq $1,%%rsi \n" - - "jnz 2b \n" - - "xor %%rdx,%%rdx \n" - //"sahf \n" - ".byte 0x9e \n" - "setc %%dl \n" - - "pop %%rsi \n" - - : "=d" (c) - : "0" (c), "c" (b), "b" (p1), "S" (bits) - : "%rax", "cc", "memory" ); + : "=a" (c) + : "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask) + : "cc", "memory" ); #endif