added:   uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
         asm code for AddVector() and SubVector() for MS VC x86_64
changed: added the "ttmath_" prefix to all public procedures in ttmathuint_x86_64_msvc.asm


git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@192 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-09-11 23:55:44 +00:00
parent b3d27979d0
commit 9ccacd8817
3 changed files with 175 additions and 170 deletions
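A note on the rename: these entry points have extern "C" linkage, so their names are global to the whole program, and short generic symbols such as adc_x64 or div_x64 can collide with an unrelated object file at link time; the ttmath_ prefix namespaces them by convention. Illustrative declarations only (not part of the commit; uint stands for ttmath's 64-bit word):

    // before: a short global symbol, easy to clash with another library
    extern "C" uint adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
    // after: prefixed, so the linker-level name is effectively namespaced
    extern "C" uint ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);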

ttmathuint.h

@@ -3298,16 +3298,6 @@ public:
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
-/* temporarily */
-#ifndef TTMATH_NOASM
-#ifdef TTMATH_PLATFORM64
-#ifdef _MSC_VER
-static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
-static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
-#endif
-#endif
-#endif
};

ttmathuint_x86_64.h

@@ -63,16 +63,18 @@ namespace ttmath
extern "C"
{
-uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
-uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
-uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
-uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
-uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
-uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
-uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+uint __fastcall ttmath_addvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+uint __fastcall ttmath_subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+uint __fastcall ttmath_subvector_x64(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result);
+uint __fastcall ttmath_rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+uint __fastcall ttmath_div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+uint __fastcall ttmath_rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+uint __fastcall ttmath_rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
};
#endif
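These entry points are plain word-array primitives, so they can be exercised directly. A hedged usage sketch, assuming an MSVC x64 build with ttmathuint_x86_64_msvc.obj linked in (a local typedef stands in for ttmath's uint; __fastcall is omitted because x64 MSVC has a single calling convention):

    #include <cassert>

    typedef unsigned long long word;   // stands in for ttmath's uint on win64

    extern "C" word ttmath_adc_x64(word* p1, const word* p2, word nSize, word c);

    int main()
    {
        word a[2] = { 0xFFFFFFFFFFFFFFFFull, 1 };   // little-endian word order
        const word b[2] = { 1, 0 };
        // p1 += p2 + carry over nSize words; the final carry is returned
        word carry = ttmath_adc_x64(a, b, 2, 0);
        assert(a[0] == 0 && a[1] == 2 && carry == 0);
        return 0;
    }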
@@ -110,7 +112,7 @@ namespace ttmath
#endif
#ifdef _MSC_VER
-c = adc_x64(p1,p2,b,c);
+c = ttmath_adc_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
@@ -182,7 +184,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed_x64(p1,b,index,value);
+c = ttmath_addindexed_x64(p1,b,index,value);
#endif
@@ -266,7 +268,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = addindexed2_x64(p1,b,index,x1,x2);
+c = ttmath_addindexed2_x64(p1,b,index,x1,x2);
#endif
@@ -327,15 +329,11 @@ namespace ttmath
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -344,12 +342,13 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_addvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
// this part should be compiled with gcc
@@ -396,27 +395,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = AddTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::AddVector")
-return c;
-}
-#endif
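For reference, the contract that both the removed fallback and the new asm routine satisfy: result = ss1 + ss2 where ss1 has at least as many words as ss2, the carry ripples through the ss1-only tail, and the final carry is returned. A self-contained portable model of these semantics (a sketch for illustration, not ttmath code):

    #include <cstdint>
    #include <cassert>

    typedef std::uint64_t word;   // stands in for ttmath's 64-bit uint

    // result = ss1 + ss2, with ss1_size >= ss2_size; returns the final carry
    word add_vector_model(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word c = 0;
        for(word i = 0; i < ss1_size; ++i)
        {
            word w2   = (i < ss2_size) ? ss2[i] : 0;  // ss2 is implicitly zero-padded
            word sum  = ss1[i] + w2;
            word c1   = (sum < ss1[i]);               // wrapped on the first addition
            word sum2 = sum + c;
            word c2   = (sum2 < sum);                 // wrapped on adding the carry
            result[i] = sum2;
            c = c1 | c2;                              // at most one of the two can wrap
        }
        return c;
    }

    int main()
    {
        word a[3] = { ~0ull, ~0ull, 5 };   // 3-word operand
        word b[1] = { 1 };                 // 1-word operand
        word r[3];
        word carry = add_vector_model(a, b, 3, 1, r);
        assert(r[0] == 0 && r[1] == 0 && r[2] == 6 && carry == 0);  // carry rippled two words
        return 0;
    }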
/*!
@@ -446,7 +424,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = sbb_x64(p1,p2,b,c);
+c = ttmath_sbb_x64(p1,p2,b,c);
#endif
@@ -480,6 +458,7 @@ namespace ttmath
}
/*!
this method subtracts one word (at a specific position)
and returns a carry (if there was one)
@@ -514,7 +493,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = subindexed_x64(p1,b,index,value);
+c = ttmath_subindexed_x64(p1,b,index,value);
#endif
@@ -550,8 +529,6 @@ namespace ttmath
}
/*!
this static method subtracts one vector from the other
'ss1' is larger in size or equal to 'ss2'
@@ -573,15 +550,11 @@ namespace ttmath
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
-#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
TTMATH_ASSERT( ss1_size >= ss2_size )
-uint rest = ss1_size - ss2_size;
uint c;
#if !defined(__GNUC__) && !defined(_MSC_VER)
@@ -590,17 +563,17 @@ namespace ttmath
#ifdef _MSC_VER
+c = ttmath_subvector_x64(ss1, ss2, ss1_size, ss2_size, result);
#endif
#ifdef __GNUC__
// the asm code is nearly the same as in AddVector
// only two instructions 'adc' are changed to 'sbb'
uint dummy1, dummy2, dummy3;
+uint rest = ss1_size - ss2_size;
__asm__ __volatile__(
"mov %%rdx, %%r8 \n"
@@ -645,28 +618,6 @@ namespace ttmath
return c;
}
-#else
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
-{
-uint i, c = 0;
-TTMATH_ASSERT( ss1_size >= ss2_size )
-for(i=0 ; i<ss2_size ; ++i)
-c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
-for( ; i<ss1_size ; ++i)
-c = SubTwoWords(ss1[i], 0, c, &result[i]);
-TTMATH_LOG("UInt::SubVector")
-return c;
-}
-#endif
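SubVector has the mirror-image contract: result = ss1 - ss2 with ss1_size >= ss2_size, and the return value is the final borrow (1 exactly when ss2 > ss1 as multi-word numbers). A matching portable sketch, again for illustration only:

    #include <cstdint>
    #include <cassert>

    typedef std::uint64_t word;   // stands in for ttmath's 64-bit uint

    // result = ss1 - ss2, with ss1_size >= ss2_size; returns the final borrow
    word sub_vector_model(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word c = 0;   // borrow
        for(word i = 0; i < ss1_size; ++i)
        {
            word w2    = (i < ss2_size) ? ss2[i] : 0;
            word diff  = ss1[i] - w2;
            word b1    = (ss1[i] < w2);   // borrowed on the first subtraction
            word diff2 = diff - c;
            word b2    = (diff < c);      // borrowed on subtracting the old borrow
            result[i]  = diff2;
            c = b1 | b2;
        }
        return c;
    }

    int main()
    {
        word a[2] = { 0, 1 };   // the value 2^64
        word b[1] = { 1 };
        word r[2];
        word borrow = sub_vector_model(a, b, 2, 1, r);
        assert(r[0] == ~0ull && r[1] == 0 && borrow == 0);   // 2^64 - 1
        return 0;
    }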
/*!
this method moves all bits into the left hand side
@@ -695,7 +646,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl_x64(p1,b,c);
+c = ttmath_rcl_x64(p1,b,c);
#endif
@@ -755,7 +706,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr_x64(p1,b,c);
+c = ttmath_rcr_x64(p1,b,c);
#endif
@@ -816,7 +767,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcl2_x64(p1,b,bits,c);
+c = ttmath_rcl2_x64(p1,b,bits,c);
#endif
@@ -893,7 +844,7 @@ namespace ttmath
#ifdef _MSC_VER
-c = rcr2_x64(p1,b,bits,c);
+c = ttmath_rcr2_x64(p1,b,bits,c);
#endif
@@ -1146,7 +1097,7 @@ namespace ttmath
#ifdef _MSC_VER
-div_x64(&a,&b,c);
+ttmath_div_x64(&a,&b,c);
r_ = a;
rest_ = b;
@@ -1170,60 +1121,6 @@ namespace ttmath
*rest = rest_;
}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
-{
-uint temp;
-if( carry == 0 )
-{
-temp = a + b;
-if( temp < a )
-carry = 1;
-}
-else
-{
-carry = 1;
-temp = a + b + carry;
-if( temp > a ) // !(temp<=a)
-carry = 0;
-}
-*result = temp;
-return carry;
-}
-/* temporarily */
-template<uint value_size>
-uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
-{
-if( carry == 0 )
-{
-*result = a - b;
-if( a < b )
-carry = 1;
-}
-else
-{
-carry = 1;
-*result = a - b - carry;
-if( a > b ) // !(a <= b )
-carry = 0;
-}
-return carry;
-}
} //namespace
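The removed fallbacks rely on one property of unsigned arithmetic: a + b wraps modulo 2^64, so temp < a holds exactly when the addition overflowed, and with an incoming carry the overflow condition becomes temp <= a (written above as !(temp > a)). A tiny self-contained check of both cases:

    #include <cstdint>
    #include <cassert>

    int main()
    {
        std::uint64_t a = ~0ull;          // 2^64 - 1
        std::uint64_t t = a + 1;          // wraps to 0
        assert(t < a);                    // carry out of the plain addition

        std::uint64_t t2 = a + 0 + 1;     // incoming carry, also wraps to 0
        assert(!(t2 > a));                // carry out in the carry-in case
        return 0;
    }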

ttmathuint_x86_64_msvc.asm

@@ -41,20 +41,22 @@
; this creates the ttmathuint_x86_64_msvc.obj file, which can be linked with your program
;
-PUBLIC adc_x64
-PUBLIC addindexed_x64
-PUBLIC addindexed2_x64
+PUBLIC ttmath_adc_x64
+PUBLIC ttmath_addindexed_x64
+PUBLIC ttmath_addindexed2_x64
+PUBLIC ttmath_addvector_x64
-PUBLIC sbb_x64
-PUBLIC subindexed_x64
+PUBLIC ttmath_sbb_x64
+PUBLIC ttmath_subindexed_x64
+PUBLIC ttmath_subvector_x64
-PUBLIC rcl_x64
-PUBLIC rcr_x64
+PUBLIC ttmath_rcl_x64
+PUBLIC ttmath_rcr_x64
-PUBLIC rcl2_x64
-PUBLIC rcr2_x64
+PUBLIC ttmath_rcl2_x64
+PUBLIC ttmath_rcr2_x64
-PUBLIC div_x64
+PUBLIC ttmath_div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
@@ -64,11 +66,12 @@ PUBLIC div_x64
.CODE
ALIGN 8
;----------------------------------------
-adc_x64 PROC
+ttmath_adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
@@ -91,7 +94,7 @@ adc_x64 PROC
ret
-adc_x64 ENDP
+ttmath_adc_x64 ENDP
;----------------------------------------
@@ -99,7 +102,7 @@ adc_x64 ENDP
;----------------------------------------
-addindexed_x64 PROC
+ttmath_addindexed_x64 PROC
; rcx = p1
; rdx = nSize
@@ -132,7 +135,7 @@ done_with_cy:
ret
-addindexed_x64 ENDP
+ttmath_addindexed_x64 ENDP
;----------------------------------------
@@ -140,7 +143,7 @@ addindexed_x64 ENDP
;----------------------------------------
-addindexed2_x64 PROC
+ttmath_addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
@@ -173,7 +176,9 @@ next:
lea rax, [rax+1]
ret
-addindexed2_x64 ENDP
+ttmath_addindexed2_x64 ENDP
;----------------------------------------
@@ -181,7 +186,61 @@ addindexed2_x64 ENDP
;----------------------------------------
-sbb_x64 PROC
+ttmath_addvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [rsp+0x28] = result
+mov r10, [rsp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+adc rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_addvector_x64 ENDP
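How the routine works, in brief: the fifth argument arrives on the stack at rsp+0x28 under the MS x64 convention (four register arguments, 32 bytes of shadow space, plus the return address); loop1 adds the common ss2_size words with adc, using inc/dec because they leave CF untouched; adc r9, r9 then snapshots CF into r9 (which dec has run down to zero), neg r9 re-seeds CF for the tail loop, and mov r9, 0 rather than xor clears the register without disturbing CF. A C++ mirror of the two-loop structure, with the flag games made explicit (a sketch for illustration, not ttmath code):

    #include <cstdint>

    typedef std::uint64_t word;

    // assumes ss2_size >= 1, as the asm's loop structure does
    word addvector_mirror(const word* ss1, const word* ss2,
                          word ss1_size, word ss2_size, word* result)
    {
        word rest = ss1_size - ss2_size;       // sub r8, r9
        word i = 0, cf = 0;                    // xor r11, r11 (also clears CF)

        do {                                   // loop1: the common ss2_size words
            word s = ss1[i] + ss2[i] + cf;     // adc rax, [rdx + r11*8]
            cf = (s < ss1[i]) || (cf && s == ss1[i]);
            result[i] = s;
            ++i;
        } while(--ss2_size != 0);

        if(rest == 0)                          // or r8, r8 ; jz done
            return cf;

        do {                                   // loop2: ripple the carry through the tail
            word s = ss1[i] + cf;              // adc rax, r9 (r9 == 0 here)
            cf = (cf && s == 0);               // wraps only if cf was 1 and ss1[i] was all ones
            result[i] = s;
            ++i;
        } while(--rest != 0);

        return cf;                             // adc r8, r8 captures the final CF
    }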
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_sbb_x64 PROC
; rcx = p1
; rdx = p2
@@ -205,7 +264,7 @@ sbb_x64 PROC
ret
-sbb_x64 ENDP
+ttmath_sbb_x64 ENDP
;----------------------------------------
@@ -213,7 +272,7 @@ sbb_x64 ENDP
;----------------------------------------
-subindexed_x64 PROC
+ttmath_subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
@@ -240,7 +299,9 @@ done:
mov rax, 1
ret
-subindexed_x64 ENDP
+ttmath_subindexed_x64 ENDP
;----------------------------------------
@@ -248,7 +309,64 @@ subindexed_x64 ENDP
;----------------------------------------
-rcl_x64 PROC
+; the same asm code as in ttmath_addvector_x64, only two 'adc' instructions are changed to 'sbb'
+ttmath_subvector_x64 PROC
+; rcx = ss1
+; rdx = ss2
+; r8 = ss1_size
+; r9 = ss2_size
+; [rsp+0x28] = result
+mov r10, [rsp+028h]
+sub r8, r9
+xor r11, r11 ; r11=0, cf=0
+ALIGN 16
+loop1:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, qword ptr [rdx + r11 * 8]
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r9
+jnz loop1
+adc r9, r9 ; r9 has the cf state
+or r8, r8
+jz done
+neg r9 ; setting cf from r9
+mov r9, 0 ; don't use xor here (cf is used)
+loop2:
+mov rax, qword ptr [rcx + r11 * 8]
+sbb rax, r9
+mov qword ptr [r10 + r11 * 8], rax
+inc r11
+dec r8
+jnz loop2
+adc r8, r8
+mov rax, r8
+ret
+done:
+mov rax, r9
+ret
+ttmath_subvector_x64 ENDP
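A hedged round-trip check for the two new entry points, assuming an MSVC x64 build with the assembled ttmathuint_x86_64_msvc.obj linked in (the declarations match the header above; a local typedef stands in for ttmath's uint):

    #include <cassert>

    typedef unsigned long long word;

    extern "C" word ttmath_addvector_x64(const word* ss1, const word* ss2,
                                         word ss1_size, word ss2_size, word* result);
    extern "C" word ttmath_subvector_x64(const word* ss1, const word* ss2,
                                         word ss1_size, word ss2_size, word* result);

    int main()
    {
        const word a[3] = { 7, 0, 1 };
        const word b[2] = { 9, 0 };                           // b <= a as 3-word numbers
        word d[3], s[3];

        word borrow = ttmath_subvector_x64(a, b, 3, 2, d);    // d = a - b
        word carry  = ttmath_addvector_x64(d, b, 3, 2, s);    // s = d + b
        assert(borrow == 0 && carry == 0);
        assert(s[0] == a[0] && s[1] == a[1] && s[2] == a[2]); // round trip restores a
        return 0;
    }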
+;----------------------------------------
+ALIGN 8
+;----------------------------------------
+ttmath_rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
@@ -269,7 +387,7 @@ loop1:
ret
-rcl_x64 ENDP
+ttmath_rcl_x64 ENDP
;----------------------------------------
@@ -277,7 +395,7 @@ rcl_x64 ENDP
;----------------------------------------
-rcr_x64 PROC
+ttmath_rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
@@ -296,7 +414,7 @@ loop1:
ret
-rcr_x64 ENDP
+ttmath_rcr_x64 ENDP
;----------------------------------------
@@ -304,7 +422,7 @@ rcr_x64 ENDP
;----------------------------------------
-div_x64 PROC
+ttmath_div_x64 PROC
; rcx = &Hi
; rdx = &Lo
@@ -321,7 +439,7 @@ div_x64 PROC
ret
-div_x64 ENDP
+ttmath_div_x64 ENDP
;----------------------------------------
@@ -329,7 +447,7 @@ div_x64 ENDP
;----------------------------------------
-rcl2_x64 PROC
+ttmath_rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -372,7 +490,7 @@ loop1:
pop rbx
ret
-rcl2_x64 ENDP
+ttmath_rcl2_x64 ENDP
;----------------------------------------
@@ -380,7 +498,7 @@ rcl2_x64 ENDP
;----------------------------------------
-rcr2_x64 PROC
+ttmath_rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
@@ -425,6 +543,6 @@ loop1:
ret
-rcr2_x64 ENDP
+ttmath_rcr2_x64 ENDP
END