added:   uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         asm code for AddVector() and SubVector() for MS VC x86_64
changed: added prefixes "ttmath_" to all public procedures from ttmathuint_x86_64_msvc.asm


git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@192 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-09-11 23:55:44 +00:00
parent b3d27979d0
commit 9ccacd8817
3 changed files with 175 additions and 170 deletions
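For context, AddVector()/SubVector() add or subtract two word vectors where 'ss1' has at least as many words as 'ss2'; the carry (or borrow) is propagated through the remaining high words of 'ss1' and returned from the last word. A minimal portable C++ sketch of the addition, equivalent to the generic fallback this commit removes in favour of the new ttmath_addvector_x64 routine (the word type and helper below are simplified stand-ins, not ttmath's own definitions):

#include <cstdint>

typedef std::uint64_t uint;   // stand-in for ttmath's 64-bit word type on x86_64

// Adds two words plus an incoming carry (0 or 1); returns the outgoing carry.
static uint AddTwoWords(uint a, uint b, uint carry, uint * result)
{
    uint temp = a + b + carry;
    *result = temp;
    return (temp < a || (carry && temp == a)) ? 1 : 0;   // did the sum wrap around?
}

// result = ss1 + ss2, where ss1_size >= ss2_size; returns the final carry.
static uint AddVector(const uint * ss1, const uint * ss2,
                      uint ss1_size, uint ss2_size, uint * result)
{
    uint i, c = 0;

    for(i = 0 ; i < ss2_size ; ++i)      // add the common part word by word
        c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);

    for( ; i < ss1_size ; ++i)           // propagate the carry through the rest of ss1
        c = AddTwoWords(ss1[i], 0, c, &result[i]);

    return c;
}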


@@ -3298,16 +3298,6 @@ public:
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
-/* temporarily */
-#ifndef TTMATH_NOASM
-#ifdef TTMATH_PLATFORM64
-#ifdef _MSC_VER
-static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
-static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
-#endif
-#endif
-#endif
};


@@ -63,16 +63,18 @@ namespace ttmath
extern "C"
{
-uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
-uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
-uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
-uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+uint __fastcall ttmath_rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
};
#endif
@@ -110,7 +112,7 @@ namespace ttmath
#endif
#ifdef _MSC_VER
-c = adc_x64(p1,p2,b,c);
+c = ttmath_adc_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
@@ -182,7 +184,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed_x64(p1,b,index,value);
+c = ttmath_addindexed_x64(p1,b,index,value);
#endif
@@ -266,7 +268,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed2_x64(p1,b,index,x1,x2);
+c = ttmath_addindexed2_x64(p1,b,index,x1,x2);
#endif
@@ -327,15 +329,11 @@ namespace ttmath
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -344,12 +342,13 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_addvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
// this part should be compiled with gcc
@@ -396,27 +395,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = AddTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::AddVector")
-return c;
-}
-#endif
/*!
@@ -446,7 +424,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = sbb_x64(p1,p2,b,c);
+c = ttmath_sbb_x64(p1,p2,b,c);
#endif
@@ -480,6 +458,7 @@ namespace ttmath
}
/*!
this method subtracts one word (at a specific position)
and returns a carry (if it was)
@@ -514,7 +493,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = subindexed_x64(p1,b,index,value);
+c = ttmath_subindexed_x64(p1,b,index,value);
#endif
@@ -550,8 +529,6 @@ namespace ttmath
}
/*!
this static method subtractes one vector from the other
'ss1' is larger in size or equal to 'ss2'
@@ -573,15 +550,11 @@ namespace ttmath
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -590,17 +563,17 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_subvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
// the asm code is nearly the same as in AddVector
// only two instructions 'adc' are changed to 'sbb'
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
@@ -645,28 +618,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = SubTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::SubVector")
-return c;
-}
-#endif
/*!
this method moves all bits into the left hand side
@@ -695,7 +646,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl_x64(p1,b,c);
+c = ttmath_rcl_x64(p1,b,c);
#endif
@@ -755,7 +706,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr_x64(p1,b,c);
+c = ttmath_rcr_x64(p1,b,c);
#endif
@@ -816,7 +767,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl2_x64(p1,b,bits,c);
+c = ttmath_rcl2_x64(p1,b,bits,c);
#endif
@@ -893,7 +844,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr2_x64(p1,b,bits,c);
+c = ttmath_rcr2_x64(p1,b,bits,c);
#endif
@@ -1146,7 +1097,7 @@ namespace ttmath
#ifdef _MSC_VER
-div_x64(&a,&b,c);
+ttmath_div_x64(&a,&b,c);
r_ = a;
rest_ = b;
@@ -1170,60 +1121,6 @@ namespace ttmath
*rest = rest_;
}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
-{
-uint temp;
-if( carry == 0 )
-{
-temp = a + b;
-if( temp < a )
-carry = 1;
-}
-else
-{
-carry = 1;
-temp = a + b + carry;
-if( temp > a ) // !(temp<=a)
-carry = 0;
-}
-*result = temp;
-return carry;
-}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
-{
-if( carry == 0 )
-{
-*result = a - b;
-if( a < b )
-carry = 1;
-}
-else
-{
-carry = 1;
-*result = a - b - carry;
-if( a > b ) // !(a <= b )
-carry = 0;
-}
-return carry;
-}
} //namespace
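Nothing changes at the API level: the ttmath_* routines are internal, and user code keeps going through the UInt<> methods, which dispatch to them on 64-bit MSVC builds. A minimal usage sketch, assuming the ttmath headers are on the include path and ttmathuint_x86_64_msvc.obj is assembled and linked in as before:

#include <ttmath/ttmath.h>
#include <iostream>

int main()
{
    ttmath::UInt<3> a = "340282366920938463463374607431768211455";   // 2^128 - 1
    ttmath::UInt<3> b = 1;

    ttmath::uint carry = a.Add(b);   // dispatches to ttmath_adc_x64 on MSVC x86_64
    std::cout << a << " (carry: " << carry << ")" << std::endl;

    return 0;
}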


@@ -41,20 +41,22 @@
; this create ttmathuint_x86_64_msvc.obj file which can be linked with your program
;
-PUBLIC adc_x64
-PUBLIC addindexed_x64
-PUBLIC addindexed2_x64
-PUBLIC sbb_x64
-PUBLIC subindexed_x64
-PUBLIC rcl_x64
-PUBLIC rcr_x64
-PUBLIC rcl2_x64
-PUBLIC rcr2_x64
-PUBLIC div_x64
+PUBLIC ttmath_adc_x64
+PUBLIC ttmath_addindexed_x64
+PUBLIC ttmath_addindexed2_x64
+PUBLIC ttmath_addvector_x64
+PUBLIC ttmath_sbb_x64
+PUBLIC ttmath_subindexed_x64
+PUBLIC ttmath_subvector_x64
+PUBLIC ttmath_rcl_x64
+PUBLIC ttmath_rcr_x64
+PUBLIC ttmath_rcl2_x64
+PUBLIC ttmath_rcr2_x64
+PUBLIC ttmath_div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
@@ -64,11 +66,12 @@ PUBLIC div_x64
.CODE
ALIGN 8
;----------------------------------------
-adc_x64 PROC
+ttmath_adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
@@ -91,7 +94,7 @@ adc_x64 PROC
ret
-adc_x64 ENDP
+ttmath_adc_x64 ENDP
;----------------------------------------
@@ -99,7 +102,7 @@ adc_x64 ENDP
;----------------------------------------
-addindexed_x64 PROC
+ttmath_addindexed_x64 PROC
; rcx = p1
; rdx = nSize
@@ -132,7 +135,7 @@ done_with_cy:
ret
-addindexed_x64 ENDP
+ttmath_addindexed_x64 ENDP
;----------------------------------------
@@ -140,7 +143,7 @@ addindexed_x64 ENDP
;----------------------------------------
-addindexed2_x64 PROC
+ttmath_addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
@@ -173,7 +176,9 @@ next:
lea rax, [rax+1]
ret
-addindexed2_x64 ENDP
+ttmath_addindexed2_x64 ENDP
;----------------------------------------
@@ -181,7 +186,61 @@ addindexed2_x64 ENDP
;----------------------------------------
-sbb_x64 PROC
+ttmath_addvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [esp+0x28] = result
+mov r10, [esp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_addvector_x64 ENDP
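An explanatory aside (not part of the patch): under the Microsoft x64 calling convention the first four integer arguments travel in rcx, rdx, r8 and r9, so the fifth argument (result) is passed on the stack; at procedure entry it sits just above the return address and the caller-allocated 32-byte register shadow space, i.e. at offset 0x28. The prototype from the header maps onto the registers as follows:

// Microsoft x64 ABI mapping for the new routine (reference sketch):
uint __fastcall ttmath_addvector_x64(const uint * ss1,    // rcx
                                     const uint * ss2,    // rdx
                                     uint ss1_size,       // r8
                                     uint ss2_size,       // r9
                                     uint * result);      // stack, [rsp+0x28] at entry
// Stack at entry: [rsp] return address, [rsp+0x08..0x20] the 32-byte shadow space
// for rcx/rdx/r8/r9, [rsp+0x28] the fifth argument -- which the code reads with
// "mov r10, [esp+028h]" above.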
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_sbb_x64 PROC
; rcx = p1
; rdx = p2
@@ -205,7 +264,7 @@ sbb_x64 PROC
ret
-sbb_x64 ENDP
+ttmath_sbb_x64 ENDP
;----------------------------------------
@@ -213,7 +272,7 @@ sbb_x64 ENDP
;----------------------------------------
-subindexed_x64 PROC
+ttmath_subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
@@ -240,7 +299,9 @@ done:
mov rax, 1
ret
-subindexed_x64 ENDP
+ttmath_subindexed_x64 ENDP
;----------------------------------------
@@ -248,7 +309,64 @@ subindexed_x64 ENDP
;----------------------------------------
-rcl_x64 PROC
+; the same asm code as in addvector_x64 only two instructions 'adc' changed to 'sbb'
+ttmath_subvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [esp+0x28] = result
+mov r10, [esp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_subvector_x64 ENDP
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
@@ -269,7 +387,7 @@ loop1:
ret
-rcl_x64 ENDP
+ttmath_rcl_x64 ENDP
;----------------------------------------
@@ -277,7 +395,7 @@ rcl_x64 ENDP
;----------------------------------------
-rcr_x64 PROC
+ttmath_rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
@@ -296,7 +414,7 @@ loop1:
ret
-rcr_x64 ENDP
+ttmath_rcr_x64 ENDP
;----------------------------------------
@@ -304,7 +422,7 @@ rcr_x64 ENDP
;----------------------------------------
-div_x64 PROC
+ttmath_div_x64 PROC
; rcx = &Hi
; rdx = &Lo
@@ -321,7 +439,7 @@ div_x64 PROC
ret
-div_x64 ENDP
+ttmath_div_x64 ENDP
;----------------------------------------
@@ -329,7 +447,7 @@ div_x64 ENDP
;----------------------------------------
-rcl2_x64 PROC
+ttmath_rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -372,7 +490,7 @@ loop1:
pop rbx
ret
-rcl2_x64 ENDP
+ttmath_rcl2_x64 ENDP
;----------------------------------------
@@ -380,7 +498,7 @@ rcl2_x64 ENDP
;----------------------------------------
-rcr2_x64 PROC
+ttmath_rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -425,6 +543,6 @@ loop1:
ret
-rcr2_x64 ENDP
+ttmath_rcr2_x64 ENDP
END