- fixed a bug in 64 bit ASM for MSVC

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@181 e52654a7-88a9-db11-a3e9-0013d4bc506e
This commit is contained in:
Christian Kaiser 2009-07-28 16:34:04 +00:00
parent 51b2c974a1
commit e102086f80
5 changed files with 271 additions and 144 deletions

View File

@ -3869,47 +3869,66 @@ public:
// we should check the mantissas beforehand because sometimes we can have
// a mantissa set to zero while the exponent still holds some other value
// (maybe we've forgotten about calling CorrectZero() ?)
if( mantissa.IsZero() && ss2.mantissa.IsZero())
{
return true;
}
if( IsSign() != ss2.IsSign() )
{
return false;
}
if( mantissa.IsZero())
{
if (ss2.mantissa.IsZero())
return true;
return(ss2.AboutEqual(*this,nBitsToIgnore));
}
if( exponent==ss2.exponent )
{
if (mantissa == ss2.mantissa)
{
return(true);
}
if( IsSign() != ss2.IsSign() )
{
// we need to check the difference (both might be around Zero)
Big<exp,man> temp(*this);
temp.Sub(ss2);
if (ss2.mantissa.IsZero())
{
return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
}
// exponents may not differ much!
ttmath::Int<exp> expdiff(this->exponent - ss2.exponent);
// they may differ one if for example mantissa1=0x80000000, mantissa2=0xffffffff
if (ttmath::Abs(expdiff) > 1)
return(false);
Int<exp> exponent_diff(exponent - temp.exponent);
return(exponent_diff > man*TTMATH_BITS_PER_UINT-nBitsToIgnore);
}
// faster to mask the bits!
ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
// calculate the 'difference' mantissa
ttmath::UInt<man> man1(this->mantissa);
ttmath::UInt<man> man2(ss2.mantissa);
ttmath::UInt<man> mandiff;
switch (expdiff.ToInt())
{
case +1:
man2.Rcr(1,0);
mandiff = man1;
mandiff.Sub(man2);
break;
case -1:
man1.Rcr(1,0);
mandiff = man2;
mandiff.Sub(man1);
break;
case 0:
if (man2 > man1)
{
mandiff = man2;
mandiff.Sub(man1);
}
else
{
mandiff = man1;
mandiff.Sub(man2);
}
break;
}
// faster to mask the bits!
ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
for (int n = man-1; n > 0; --n)
{
if (mantissa.table[n] != ss2.mantissa.table[n])
return(false);
}
uint nMask = ~((1 << nBitsToIgnore) - 1);
return((mantissa.table[0] & nMask) == (ss2.mantissa.table[0] & nMask));
}
return false;
for (int n = man-1; n > 0; --n)
{
if (mandiff.table[n] != 0)
return(false);
}
uint nMask = ~((1 << nBitsToIgnore) - 1);
return((mandiff.table[0] & nMask) == 0);
}
bool operator<(const Big<exp,man> & ss2) const

View File

@ -871,6 +871,83 @@ namespace ttmath
u3 = sub_res_low_.u_.low;
}
/*!
this static method adds one vector to the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5+1
4 3 4+3
2 7 2+7
6 6
9 9
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
	// adds the shorter vector ss2 to the longer (or equal) vector ss1,
	// writing ss1_size words into result and returning the final carry
	TTMATH_ASSERT( ss1_size >= ss2_size )

	uint carry = 0;
	uint index = 0;

	// add the words where both vectors overlap
	while( index < ss2_size )
	{
		carry = AddTwoWords(ss1[index], ss2[index], carry, &result[index]);
		++index;
	}

	// propagate the carry through the remaining words of ss1
	while( index < ss1_size )
	{
		carry = AddTwoWords(ss1[index], 0, carry, &result[index]);
		++index;
	}

	TTMATH_LOG("UInt::AddVector")

	return carry;
}
/*!
this static method subtracts one vector from the other
'ss1' is larger in size or equal to 'ss2'
ss1 points to the first (larger) vector
ss2 points to the second vector
ss1_size - size of the ss1 (and size of the result too)
ss2_size - size of the ss2
result - is the result vector (which has size the same as ss1: ss1_size)
Example: ss1_size is 5, ss2_size is 3
ss1: ss2: result (output):
5 1 5-1
4 3 4-3
2 7 2-7
6 6-1 (the borrow from previous item)
9 9
return (carry): 0
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
	// subtracts the shorter vector ss2 from the longer (or equal) vector ss1,
	// writing ss1_size words into result and returning the final borrow
	TTMATH_ASSERT( ss1_size >= ss2_size )

	uint borrow = 0;
	uint index = 0;

	// subtract the words where both vectors overlap
	while( index < ss2_size )
	{
		borrow = SubTwoWords(ss1[index], ss2[index], borrow, &result[index]);
		++index;
	}

	// propagate the borrow through the remaining words of ss1
	while( index < ss1_size )
	{
		borrow = SubTwoWords(ss1[index], 0, borrow, &result[index]);
		++index;
	}

	TTMATH_LOG("UInt::SubVector")

	return borrow;
}
#endif // #ifdef TTMATH_PLATFORM64

View File

@ -42,7 +42,7 @@
#ifndef TTMATH_NOASM
#ifdef TTMATH_PLATFORM32
#pragma message("TTMATH_ASM")
#pragma message("TTMATH_ASM32")
/*!
\file ttmathuint_x86.h

View File

@ -4,20 +4,20 @@
* Author: Tomasz Sowa <t.sowa@slimaczek.pl>
*/
/*
/*
* Copyright (c) 2006-2009, Tomasz Sowa
* All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
*
* * Neither the name Tomasz Sowa nor the names of contributors to this
* project may be used to endorse or promote products derived
* from this software without specific prior written permission.
@ -39,10 +39,10 @@
#ifndef headerfilettmathuint_x86_64
#define headerfilettmathuint_x86_64
#ifndef TTMATH_NOASM
#ifdef TTMATH_PLATFORM64
#pragma message("TTMATH_ASM64")
/*!
\file ttmathuint_x86_64.h
\brief template class UInt<uint> with assembler code for 64bit x86_64 processors
@ -50,6 +50,9 @@
this file is included at the end of ttmathuint.h
*/
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
namespace ttmath
{
@ -113,14 +116,14 @@ namespace ttmath
this part should be compiled with gcc
*/
__asm__ __volatile__(
"xorq %%rdx, %%rdx \n"
"neg %%rax \n" // CF=1 if rax!=0 , CF=0 if rax==0
"1: \n"
"movq (%%rsi,%%rdx,8), %%rax \n"
"adcq %%rax, (%%rbx,%%rdx,8) \n"
"incq %%rdx \n"
"decq %%rcx \n"
"jnz 1b \n"
@ -134,7 +137,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::Add")
return c;
}
@ -150,7 +153,7 @@ namespace ttmath
if we've got (value_size=3):
table[0] = 10;
table[1] = 30;
table[2] = 5;
table[2] = 5;
and we call:
AddInt(2,1)
then it'll be:
@ -187,7 +190,7 @@ namespace ttmath
"1: \n"
"addq %%rax, (%%rbx,%%rdx,8) \n"
"jnc 2f \n"
"movq $1, %%rax \n"
"incq %%rdx \n"
"decq %%rcx \n"
@ -204,7 +207,7 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::AddInt")
return c;
}
@ -236,14 +239,38 @@ namespace ttmath
table[1] = 4 + x1 = 14
table[2] = 5 + x2 = 25
table[3] = 6
and no carry at the end of table[3]
(of course if there was a carry in table[2](5+20) then
(of course if there was a carry in table[2](5+20) then
this carry would be passed to the table[3] etc.)
*/
template<uint value_size>
uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
#if 0
{
uint i, c;
TTMATH_ASSERT( index < value_size )
printf("add %Id + %Id\n",x1,x2);
for(int i=index ; i<value_size ; ++i)
printf("%d: %Id\n",i,table[i]);
c = AddTwoWords(table[index], x1, 0, &table[index]);
c = AddTwoWords(table[index+1], x2, c, &table[index+1]);
for(i=index+2 ; i<value_size && c ; ++i)
c = AddTwoWords(table[i], 0, c, &table[i]);
for(i=index ; i<value_size ; ++i)
printf("%d: %Id\n",i,table[i]);
printf(" -> %d\n",c);
TTMATH_LOG("UInt::AddTwoInts")
return c;
}
#else
{
uint b = value_size;
uint * p1 = table;
@ -253,7 +280,14 @@ namespace ttmath
#ifndef __GNUC__
#if defined(_M_X64)
c = addindexed2_x64(p1,b,index,x2,x1);
//printf("add %Id + %Id\n",x1,x2);
//for(int i=index ; i<value_size ; ++i)
// printf("%d: %Id\n",i,table[i]);
//if (table[0] == 1265784741359897913) DebugBreak();
c = addindexed2_x64(p1,b,index,x1,x2);
//for(int i=index ; i<value_size ; ++i)
// printf("%d: %Id\n",i,table[i]);
//printf(" -> %d\n",c);
#else
#error "another compiler than GCC is currently not supported in 64bit mode"
#endif
@ -261,11 +295,11 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2;
__asm__ __volatile__(
"subq %%rdx, %%rcx \n"
"addq %%rsi, (%%rbx,%%rdx,8) \n"
"incq %%rdx \n"
"decq %%rcx \n"
@ -289,10 +323,12 @@ namespace ttmath
#endif
TTMATH_LOG("UInt64::AddTwoInts")
return c;
}
#endif
@ -328,16 +364,16 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2;
__asm__ __volatile__(
"xorq %%rdx, %%rdx \n"
"neg %%rax \n" // CF=1 if rax!=0 , CF=0 if rax==0
"1: \n"
"movq (%%rsi,%%rdx,8), %%rax \n"
"sbbq %%rax, (%%rbx,%%rdx,8) \n"
"incq %%rdx \n"
"decq %%rcx \n"
"jnz 1b \n"
@ -366,7 +402,7 @@ namespace ttmath
if we've got (value_size=3):
table[0] = 10;
table[1] = 30;
table[2] = 5;
table[2] = 5;
and we call:
SubInt(2,1)
then it'll be:
@ -395,15 +431,15 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2;
__asm__ __volatile__(
"subq %%rdx, %%rcx \n"
"1: \n"
"subq %%rax, (%%rbx,%%rdx,8) \n"
"jnc 2f \n"
"movq $1, %%rax \n"
"incq %%rdx \n"
"decq %%rcx \n"
@ -436,7 +472,7 @@ namespace ttmath
for example:
let this is 001010000
after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0
***this method is created only on a 64bit platform***
*/
template<uint value_size>
@ -455,9 +491,9 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2;
__asm__ __volatile__(
"xorq %%rdx, %%rdx \n" // rdx=0
"neg %%rax \n" // CF=1 if rax!=0 , CF=0 if rax==0
@ -473,7 +509,7 @@ namespace ttmath
: "=c" (c), "=a" (dummy), "=d" (dummy2)
: "1" (c), "0" (b), "b" (p1)
: "cc", "memory" );
#endif
TTMATH_LOG("UInt64::Rcl2_one")
@ -512,7 +548,7 @@ namespace ttmath
#ifdef __GNUC__
uint dummy;
__asm__ __volatile__(
"neg %%rax \n" // CF=1 if rax!=0 , CF=0 if rax==0
@ -549,7 +585,7 @@ namespace ttmath
for example:
let this is 001010000
after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1
***this method is created only on a 64bit platform***
*/
template<uint value_size>
@ -570,9 +606,9 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2, dummy3;
__asm__ __volatile__(
"movq %%rcx, %%rsi \n"
"movq $64, %%rcx \n"
"subq %%rsi, %%rcx \n"
@ -595,11 +631,11 @@ namespace ttmath
"xorq %%rax, (%%rbx,%%rdx,8) \n"
"orq %%rsi, (%%rbx,%%rdx,8) \n"
"movq %%rax, %%rsi \n"
"incq %%rdx \n"
"decq %%rdi \n"
"jnz 1b \n"
"and $1, %%rax \n"
: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
@ -647,7 +683,7 @@ namespace ttmath
#ifdef __GNUC__
uint dummy, dummy2, dummy3;
__asm__ __volatile__(
"movq %%rcx, %%rsi \n"
@ -674,11 +710,11 @@ namespace ttmath
"xorq %%rax, (%%rbx,%%rdx,8) \n"
"orq %%rsi, (%%rbx,%%rdx,8) \n"
"movq %%rax, %%rsi \n"
"decq %%rdx \n"
"decq %%rdi \n"
"jnz 1b \n"
"rolq $1, %%rax \n"
"andq $1, %%rax \n"
@ -754,7 +790,7 @@ namespace ttmath
uint UInt<value_size>::SetBitInWord(uint & value, uint bit)
{
TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT )
uint old_bit;
uint v = value;
@ -778,7 +814,7 @@ namespace ttmath
"setc %%bl \n"
"movzx %%bl, %%rbx \n"
: "=a" (v), "=b" (old_bit)
: "0" (v), "1" (bit)
: "cc" );
@ -803,7 +839,7 @@ namespace ttmath
multiplication: result2:result1 = a * b
result2 - higher word
result1 - lower word of the result
this method never returns a carry
***this method is created only on a 64bit platform***
@ -834,7 +870,7 @@ namespace ttmath
#ifdef __GNUC__
__asm__ __volatile__(
"mulq %%rdx \n"
: "=a" (result1_), "=d" (result2_)
@ -857,13 +893,13 @@ namespace ttmath
*
*
*/
#ifndef __GNUC__
/*!
this method calculates 64bits word a:b / 32bits c (a higher, b lower word)
r = a:b / c and rest - remainder
***this method is created only on a 64bit platform***
*
@ -896,7 +932,7 @@ namespace ttmath
#endif
#ifdef __GNUC__
__asm__ __volatile__(
"divq %%rcx \n"
@ -986,7 +1022,7 @@ namespace ttmath
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
@ -1025,7 +1061,7 @@ namespace ttmath
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);

View File

@ -19,7 +19,7 @@ PUBLIC div_x64
;
.CODE
ALIGN 8
;----------------------------------------
@ -33,20 +33,20 @@ adc_x64 PROC
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
ret
adc_x64 ENDP
;----------------------------------------
@ -80,14 +80,14 @@ loop1:
lea r8, [r8+1]
add qword ptr [rcx + r8 * 8], r9
jc loop1
ret
done_with_cy:
lea rax, [rax+1] ; rax = 1
ret
addindexed_x64 ENDP
;----------------------------------------
@ -98,8 +98,8 @@ addindexed_x64 ENDP
addindexed2_x64 PROC
; rcx = p1
; rdx = b
; rcx = p1 (pointer)
; rdx = b (value size)
; r8 = nPos
; r9 = nValue1
; [esp+0x28] = nValue2
@ -109,26 +109,23 @@ addindexed2_x64 PROC
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [esp+028h] ; r10 = nValue2
add qword ptr [r11 + r8 * 8], r10
add qword ptr [r11 + r8 * 8], r9
lea r8, [r8+1]
lea rdx, [rdx-1]
adc qword ptr [r11 + r8 * 8], r10
jc next
ret
ALIGN 16
loop1:
adc qword ptr [r11 + r8 * 8], r9
lea r8, [r8+1]
add qword ptr [r11 + r8 * 8], 1
jc next
ret
next:
lea r8, [r8+1]
xor r9, r9 ; set to 0 -> cy still set!
dec rdx
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
done:
ret
return_1:
next:
dec rdx ; does not modify CY too...
jnz loop1
lea rax, [rax+1]
ret
@ -138,8 +135,6 @@ addindexed2_x64 ENDP
ALIGN 8
ALIGN 8
;----------------------------------------
sbb_x64 PROC
@ -152,15 +147,15 @@ sbb_x64 PROC
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
@ -181,12 +176,12 @@ subindexed_x64 PROC
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
lea r8, [r8+1]
mov r9, 1
dec rdx
@ -196,7 +191,7 @@ loop1:
done:
xor rax, rax
ret
return_1:
mov rax, 1
ret
@ -217,17 +212,17 @@ rcl_x64 PROC
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
lea r10, [r10+1]
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcl_x64 ENDP
@ -245,16 +240,16 @@ rcr_x64 PROC
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcr_x64 ENDP
@ -270,7 +265,7 @@ div_x64 PROC
; rcx = &Hi
; rdx = &Lo
; r8 = nDiv
mov r11, rcx
mov r10, rdx
@ -295,21 +290,21 @@ rcl2_x64 PROC
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shr r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
@ -323,7 +318,7 @@ loop1:
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9+1]
dec rdx
@ -332,8 +327,8 @@ loop1:
and rax, 1
pop rbx
ret
rcl2_x64 ENDP
rcl2_x64 ENDP
;----------------------------------------
@ -346,20 +341,20 @@ rcr2_x64 PROC
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shl r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
@ -374,18 +369,18 @@ loop1:
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9-1]
dec rdx
jnz loop1
rol rax, 1
and rax, 1
pop rbx
ret
rcr2_x64 ENDP
rcr2_x64 ENDP
END