diff --git a/ttmath/ttmathuint.h b/ttmath/ttmathuint.h
index 37e591c..c9c3678 100644
--- a/ttmath/ttmathuint.h
+++ b/ttmath/ttmathuint.h
@@ -3298,16 +3298,6 @@ public:
     static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
     static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
 
-
-    /* temporarily */
-    #ifndef TTMATH_NOASM
-    #ifdef TTMATH_PLATFORM64
-    #ifdef _MSC_VER
-        static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
-        static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
-    #endif
-    #endif
-    #endif
 
 
 };
diff --git a/ttmath/ttmathuint_x86_64.h b/ttmath/ttmathuint_x86_64.h
index d4ae8c8..a30a1ac 100644
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@@ -63,16 +63,18 @@ namespace ttmath
 
     extern "C"
         {
-        uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
-        uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-        uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
-        uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
-        uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-        uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
-        uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
-        uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
-        uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
-        uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+        uint __fastcall ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+        uint __fastcall ttmath_addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+        uint __fastcall ttmath_addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+        uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+        uint __fastcall ttmath_sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+        uint __fastcall ttmath_subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+        uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+        uint __fastcall ttmath_rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+        uint __fastcall ttmath_rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+        uint __fastcall ttmath_div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+        uint __fastcall ttmath_rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+        uint __fastcall ttmath_rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
         };
 
 #endif
@@ -110,7 +112,7 @@ namespace ttmath
     #endif
 
     #ifdef _MSC_VER
-        c = adc_x64(p1,p2,b,c);
+        c = ttmath_adc_x64(p1,p2,b,c);
     #endif
 
     #ifdef __GNUC__
@@ -182,7 +184,7 @@ namespace ttmath
 
 
     #ifdef _MSC_VER
-        c = addindexed_x64(p1,b,index,value);
+        c = ttmath_addindexed_x64(p1,b,index,value);
     #endif
 
 
@@ -266,7 +268,7 @@ namespace ttmath
 
 
     #ifdef _MSC_VER
-        c = addindexed2_x64(p1,b,index,x1,x2);
+        c = ttmath_addindexed2_x64(p1,b,index,x1,x2);
     #endif
 
 
@@ -327,15 +329,11 @@ namespace ttmath
         of course the carry is propagated and will be returned from the last item
         (this method is used by the Karatsuba multiplication algorithm)
     */
-
-#ifndef _MSC_VER
-
     template<uint value_size>
     uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
     {
     TTMATH_ASSERT( ss1_size >= ss2_size )
 
-    uint rest = ss1_size - ss2_size;
     uint c;
 
 #if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -344,12 +342,13 @@ namespace ttmath
 
 
     #ifdef _MSC_VER
-
+        c = ttmath_addvector_x64(ss1, ss2, ss1_size, ss2_size, result);
     #endif
 
 
     #ifdef __GNUC__
 
     uint dummy1, dummy2, dummy3;
+    uint rest = ss1_size - ss2_size;
 
     // this part should be compiled with gcc
@@ -396,27 +395,6 @@ namespace ttmath
 
     return c;
     }
 
-#else
-    /* temporarily */
-    template<uint value_size>
-    uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-    {
-    uint i, c = 0;
-
-    TTMATH_ASSERT( ss1_size >= ss2_size )
-
-    for(i=0 ; i<ss2_size ; ++i)
-        c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
-
-    for( ; i<ss1_size ; ++i)
-        c = AddTwoWords(ss1[i], 0, c, &result[i]);
-
-    return c;
-    }
-
-#endif
-
-
 
 
@@ ... @@ namespace ttmath
     */
-
-#ifndef _MSC_VER
-
     template<uint value_size>
     uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
     {
     TTMATH_ASSERT( ss1_size >= ss2_size )
 
-    uint rest = ss1_size - ss2_size;
     uint c;
 
 #if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -590,17 +563,17 @@ namespace ttmath
 
 
     #ifdef _MSC_VER
-
+        c = ttmath_subvector_x64(ss1, ss2, ss1_size, ss2_size, result);
     #endif
 
 
     #ifdef __GNUC__
 
-
     // the asm code is nearly the same as in AddVector
     // only two instructions 'adc' are changed to 'sbb'
     uint dummy1, dummy2, dummy3;
+    uint rest = ss1_size - ss2_size;
 
     __asm__ __volatile__(
 
        "mov %%rdx, %%r8                    \n"
@@ -645,28 +618,6 @@ namespace ttmath
 
     return c;
     }
 
-#else
-    /* temporarily */
-    template<uint value_size>
-    uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-    {
-    uint i, c = 0;
-
-    TTMATH_ASSERT( ss1_size >= ss2_size )
-
-    for(i=0 ; i<ss2_size ; ++i)
-        c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
-
-    for( ; i<ss1_size ; ++i)
-        c = SubTwoWords(ss1[i], 0, c, &result[i]);
-
-    return c;
-    }
-
-
-#endif
-
-
 
 
@@ ... @@ namespace ttmath
 
-
-    /* temporarily */
-    template<uint value_size>
-    uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
-    {
-    uint temp;
-
-        if( carry == 0 )
-        {
-            temp = a + b;
-
-            if( temp < a )
-                carry = 1;
-        }
-        else
-        {
-            carry = 1;
-            temp = a + b + carry;
-
-            if( temp > a ) // !(temp<=a)
-                carry = 0;
-        }
-
-    *result = temp;
-
-    return carry;
-    }
-
-
-    /* temporarily */
-    template<uint value_size>
-    uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
-    {
-        if( carry == 0 )
-        {
-            *result = a - b;
-
-            if( a < b )
-                carry = 1;
-        }
-        else
-        {
-            carry = 1;
-            *result = a - b - carry;
-
-            if( a > b ) // !(a <= b )
-                carry = 0;
-        }
-
-    return carry;
-    }
-
-
-
 } //namespace
diff --git a/ttmath/ttmathuint_x86_64_msvc.asm b/ttmath/ttmathuint_x86_64_msvc.asm
index 9adfbc4..40bd245 100644
--- a/ttmath/ttmathuint_x86_64_msvc.asm
+++ b/ttmath/ttmathuint_x86_64_msvc.asm
@@ -41,20 +41,22 @@
 ; this create ttmathuint_x86_64_msvc.obj file which can be linked with your program
 ;
 
-PUBLIC  adc_x64
-PUBLIC  addindexed_x64
-PUBLIC  addindexed2_x64
+PUBLIC  ttmath_adc_x64
+PUBLIC  ttmath_addindexed_x64
+PUBLIC  ttmath_addindexed2_x64
+PUBLIC  ttmath_addvector_x64
 
-PUBLIC  sbb_x64
-PUBLIC  subindexed_x64
+PUBLIC  ttmath_sbb_x64
+PUBLIC  ttmath_subindexed_x64
+PUBLIC  ttmath_subvector_x64
 
-PUBLIC  rcl_x64
-PUBLIC  rcr_x64
+PUBLIC  ttmath_rcl_x64
+PUBLIC  ttmath_rcr_x64
 
-PUBLIC  rcl2_x64
-PUBLIC  rcr2_x64
+PUBLIC  ttmath_rcl2_x64
+PUBLIC  ttmath_rcr2_x64
 
-PUBLIC  div_x64
+PUBLIC  ttmath_div_x64
 
 ;
 ; "rax, rcx, rdx, r8-r11 are volatile."
@@ -64,11 +66,12 @@ PUBLIC div_x64
 
 .CODE
 
+        ALIGN 8
 
 ;----------------------------------------
 
-adc_x64 PROC
+ttmath_adc_x64 PROC
        ; rcx = p1
        ; rdx = p2
        ; r8 = nSize
@@ -91,7 +94,7 @@ adc_x64 PROC
 
        ret
 
-adc_x64 ENDP
+ttmath_adc_x64 ENDP
 
 ;----------------------------------------
@@ -99,7 +102,7 @@ adc_x64 ENDP
 
 ;----------------------------------------
 
-addindexed_x64 PROC
+ttmath_addindexed_x64 PROC
        ; rcx = p1
        ; rdx = nSize
@@ -132,7 +135,7 @@ done_with_cy:
 
        ret
 
-addindexed_x64 ENDP
+ttmath_addindexed_x64 ENDP
 
 ;----------------------------------------
@@ -140,7 +143,7 @@ addindexed_x64 ENDP
 
 ;----------------------------------------
 
-addindexed2_x64 PROC
+ttmath_addindexed2_x64 PROC
        ; rcx = p1 (pointer)
        ; rdx = b (value size)
@@ -173,7 +176,9 @@ next:
        lea     rax, [rax+1]
        ret
 
-addindexed2_x64 ENDP
+ttmath_addindexed2_x64 ENDP
+
+
 
 ;----------------------------------------
@@ -181,7 +186,61 @@ addindexed2_x64 ENDP
 
 ;----------------------------------------
 
-sbb_x64 PROC
+
+ttmath_addvector_x64 PROC
+        ; rcx = ss1
+        ; rdx = ss2
+        ; r8 = ss1_size
+        ; r9 = ss2_size
+        ; [esp+0x28] = result
+
+        mov     r10, [esp+028h]
+        sub     r8, r9
+        xor     r11, r11                ; r11=0, cf=0
+
+        ALIGN 16
+ loop1:
+        mov     rax, qword ptr [rcx + r11 * 8]
+        adc     rax, qword ptr [rdx + r11 * 8]
+        mov     qword ptr [r10 + r11 * 8], rax
+        inc     r11
+        dec     r9
+        jnz     loop1
+
+        adc     r9, r9                  ; r9 has the cf state
+
+        or      r8, r8
+        jz      done
+
+        neg     r9                      ; setting cf from r9
+        mov     r9, 0                   ; don't use xor here (cf is used)
+ loop2:
+        mov     rax, qword ptr [rcx + r11 * 8]
+        adc     rax, r9
+        mov     qword ptr [r10 + r11 * 8], rax
+        inc     r11
+        dec     r8
+        jnz     loop2
+
+        adc     r8, r8
+        mov     rax, r8
+
+        ret
+
+done:
+        mov     rax, r9
+        ret
+
+ttmath_addvector_x64 ENDP
+
+
+;----------------------------------------
+
+        ALIGN 8
+
+;----------------------------------------
+
+ttmath_sbb_x64 PROC
        ; rcx = p1
        ; rdx = p2
@@ -205,7 +264,7 @@ sbb_x64 PROC
 
        ret
 
-sbb_x64 ENDP
+ttmath_sbb_x64 ENDP
 
 ;----------------------------------------
@@ -213,7 +272,7 @@ sbb_x64 ENDP
 
 ;----------------------------------------
 
-subindexed_x64 PROC
+ttmath_subindexed_x64 PROC
        ; rcx = p1
        ; rdx = nSize
        ; r8 = nPos
@@ -240,7 +299,9 @@ done:
        mov     rax, 1
        ret
 
-subindexed_x64 ENDP
+ttmath_subindexed_x64 ENDP
+
+
 
 ;----------------------------------------
@@ -248,7 +309,64 @@ subindexed_x64 ENDP
 
 ;----------------------------------------
 
-rcl_x64 PROC
+; the same asm code as in addvector_x64 only two instructions 'adc' changed to 'sbb'
+
+ttmath_subvector_x64 PROC
+        ; rcx = ss1
+        ; rdx = ss2
+        ; r8 = ss1_size
+        ; r9 = ss2_size
+        ; [esp+0x28] = result
+
+        mov     r10, [esp+028h]
+        sub     r8, r9
+        xor     r11, r11                ; r11=0, cf=0
+
+        ALIGN 16
+ loop1:
+        mov     rax, qword ptr [rcx + r11 * 8]
+        sbb     rax, qword ptr [rdx + r11 * 8]
+        mov     qword ptr [r10 + r11 * 8], rax
+        inc     r11
+        dec     r9
+        jnz     loop1
+
+        adc     r9, r9                  ; r9 has the cf state
+
+        or      r8, r8
+        jz      done
+
+        neg     r9                      ; setting cf from r9
+        mov     r9, 0                   ; don't use xor here (cf is used)
+ loop2:
+        mov     rax, qword ptr [rcx + r11 * 8]
+        sbb     rax, r9
+        mov     qword ptr [r10 + r11 * 8], rax
+        inc     r11
+        dec     r8
+        jnz     loop2
+
+        adc     r8, r8
+        mov     rax, r8
+
+        ret
+
+done:
+        mov     rax, r9
+        ret
+
+ttmath_subvector_x64 ENDP
+
+
+
+
+;----------------------------------------
+
+        ALIGN 8
+
+;----------------------------------------
+
+ttmath_rcl_x64 PROC
        ; rcx = p1
        ; rdx = b
        ; r8 = nLowestBit
@@ -269,7 +387,7 @@ loop1:
 
        ret
 
-rcl_x64 ENDP
+ttmath_rcl_x64 ENDP
 
 ;----------------------------------------
@@ -277,7 +395,7 @@ rcl_x64 ENDP
 
 ;----------------------------------------
 
-rcr_x64 PROC
+ttmath_rcr_x64 PROC
        ; rcx = p1
        ; rdx = nSize
        ; r8 = nLowestBit
@@ -296,7 +414,7 @@ loop1:
 
        ret
 
-rcr_x64 ENDP
+ttmath_rcr_x64 ENDP
 
 ;----------------------------------------
@@ -304,7 +422,7 @@ rcr_x64 ENDP
 
 ;----------------------------------------
 
-div_x64 PROC
+ttmath_div_x64 PROC
        ; rcx = &Hi
        ; rdx = &Lo
@@ -321,7 +439,7 @@ div_x64 PROC
 
        ret
 
-div_x64 ENDP
+ttmath_div_x64 ENDP
 
 ;----------------------------------------
@@ -329,7 +447,7 @@ div_x64 ENDP
 
 ;----------------------------------------
 
-rcl2_x64 PROC
+ttmath_rcl2_x64 PROC
        ; rcx = p1
        ; rdx = nSize
        ; r8 = bits
@@ -372,7 +490,7 @@ loop1:
        pop     rbx
        ret
 
-rcl2_x64 ENDP
+ttmath_rcl2_x64 ENDP
 
 ;----------------------------------------
@@ -380,7 +498,7 @@ rcl2_x64 ENDP
 
 ;----------------------------------------
 
-rcr2_x64 PROC
+ttmath_rcr2_x64 PROC
        ; rcx = p1
        ; rdx = nSize
        ; r8 = bits
@@ -425,6 +543,6 @@ loop1:
 
        ret
 
-rcr2_x64 ENDP
+ttmath_rcr2_x64 ENDP
 
 END
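
For reference, the contract that the new ttmath_addvector_x64 / ttmath_subvector_x64 routines implement (and that the inline GCC asm mirrors) is: combine the first ss2_size words of ss1 with ss2, propagate the carry or borrow through the remaining ss1_size - ss2_size words of ss1, and return the final carry. Below is a minimal portable C++ sketch of the addition case, assuming 64-bit words; the function name is illustrative only and is not part of the patch or of the library.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative reference only -- not part of ttmath.
// Returns the final carry (0 or 1), like ttmath_addvector_x64.
std::uint64_t add_vector_reference(const std::uint64_t * ss1, const std::uint64_t * ss2,
                                   std::size_t ss1_size, std::size_t ss2_size,
                                   std::uint64_t * result)
{
    assert(ss1_size >= ss2_size);

    std::uint64_t carry = 0;
    std::size_t i = 0;

    // loop1 in the asm: add the overlapping words, keeping a running carry
    for( ; i < ss2_size ; ++i)
    {
        std::uint64_t sum = ss1[i] + ss2[i];
        std::uint64_t c1  = (sum < ss1[i]) ? 1u : 0u;   // carry out of ss1[i] + ss2[i]
        sum += carry;
        carry = c1 | ((sum < carry) ? 1u : 0u);         // carry out of adding the old carry
        result[i] = sum;
    }

    // loop2 in the asm: propagate the carry through the remaining ss1 words
    for( ; i < ss1_size ; ++i)
    {
        std::uint64_t sum = ss1[i] + carry;
        carry = (sum < carry) ? 1u : 0u;
        result[i] = sum;
    }

    return carry;
}

The subtraction case is the same sketch with the additions replaced by subtractions and the carry treated as a borrow. In the asm, the running carry lives in the CPU carry flag across iterations: adc/sbb both consume and produce CF, and the loop bookkeeping uses inc/dec/jnz precisely because those do not modify CF; between the two loops the flag is parked in r9 (adc r9, r9) and restored with neg r9, since the intervening or r8, r8 test clobbers it.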