From e102086f80314e095ab44ec85303060bedc46de9 Mon Sep 17 00:00:00 2001
From: Christian Kaiser
Date: Tue, 28 Jul 2009 16:34:04 +0000
Subject: [PATCH] - fixed a bug in 64 bit ASM for MSVC

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@181 e52654a7-88a9-db11-a3e9-0013d4bc506e
---
 ttmath/ttmathbig.h                   |  95 ++++++++++++--------
 ttmath/ttmathuint_noasm.h            |  77 ++++++++++++++++
 ttmath/ttmathuint_x86.h              |   2 +-
 ttmath/ttmathuint_x86_64.h           | 130 +++++++++++++++++----------
 ttmath/ttmathuint_x86_amd64_msvc.asm | 111 +++++++++++------------
 5 files changed, 271 insertions(+), 144 deletions(-)

diff --git a/ttmath/ttmathbig.h b/ttmath/ttmathbig.h
index 7f3574c..4fbb8a0 100644
--- a/ttmath/ttmathbig.h
+++ b/ttmath/ttmathbig.h
@@ -3869,47 +3869,66 @@ public:
 	// we should check the mantissas beforehand because sometimes we can have
 	// a mantissa set to zero but something other than zero in the exponent
 	// (maybe we've forgotten to call CorrectZero()?)
-	if( mantissa.IsZero() && ss2.mantissa.IsZero())
-	{
-		return true;
-	}
-
-	if( IsSign() != ss2.IsSign() )
-	{
-		return false;
-	}
+	if( mantissa.IsZero())
+	{
+		if (ss2.mantissa.IsZero())
+			return true;
+		return(ss2.AboutEqual(*this,nBitsToIgnore));
+	}
 
-	if( exponent==ss2.exponent )
-	{
-		if (mantissa == ss2.mantissa)
-		{
-			return(true);
-		}
-		if( IsSign() != ss2.IsSign() )
-		{
-			// we need to check the difference (both might be around Zero)
-			Big temp(*this);
-
-			temp.Sub(ss2);
+	if (ss2.mantissa.IsZero())
+	{
+		return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
+	}
+
+	// the exponents may not differ much!
+	ttmath::Int<exp> expdiff(this->exponent - ss2.exponent);
+
+	// they may differ by one if, for example, mantissa1=0x80000000 and mantissa2=0xffffffff
+	if (ttmath::Abs(expdiff) > 1)
+		return(false);
 
-			Int<exp> exponent_diff(exponent - temp.exponent);
-
-			return(exponent_diff > man*TTMATH_BITS_PER_UINT-nBitsToIgnore);
-		}
-
-		// faster to mask the bits!
-		ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
+	// calculate the 'difference' mantissa
+	ttmath::UInt<man> man1(this->mantissa);
+	ttmath::UInt<man> man2(ss2.mantissa);
+	ttmath::UInt<man> mandiff;
+
+	switch (expdiff.ToInt())
+	{
+	case +1:
+		man2.Rcr(1,0);
+		mandiff = man1;
+		mandiff.Sub(man2);
+		break;
+	case -1:
+		man1.Rcr(1,0);
+		mandiff = man2;
+		mandiff.Sub(man1);
+		break;
+	case 0:
+		if (man2 > man1)
+		{
+			mandiff = man2;
+			mandiff.Sub(man1);
+		}
+		else
+		{
+			mandiff = man1;
+			mandiff.Sub(man2);
+		}
+		break;
+	}
+
+	// faster to mask the bits!
+	ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
 
-		for (int n = man-1; n > 0; --n)
-		{
-			if (mantissa.table[n] != ss2.mantissa.table[n])
-				return(false);
-		}
-		uint nMask = ~((1 << nBitsToIgnore) - 1);
-		return((mantissa.table[0] & nMask) == (ss2.mantissa.table[0] & nMask));
-	}
-
-	return false;
+	for (int n = man-1; n > 0; --n)
+	{
+		if (mandiff.table[n] != 0)
+			return(false);
+	}
+	uint nMask = ~((1 << nBitsToIgnore) - 1);
+	return((mandiff.table[0] & nMask) == 0);
 }
 
 bool operator<(const Big & ss2) const
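The masking idea behind the rewritten AboutEqual() above is easiest to see on a single word. Below is a standalone sketch, not part of the patch (the helper name about_equal_word and the 32-bit width are illustrative assumptions): it forms the 'difference' mantissa and then treats the lowest nBitsToIgnore bits as rounding noise, mirroring the mandiff/nMask logic above.

#include <cstdint>
#include <cassert>

// sketch of AboutEqual() reduced to one 32-bit mantissa word;
// requires nBitsToIgnore < 32, as the patch asserts against TTMATH_BITS_PER_UINT
static bool about_equal_word(uint32_t man1, uint32_t man2, unsigned nBitsToIgnore)
{
    // the 'difference' mantissa (the patch builds mandiff the same way,
    // after aligning the exponents with Rcr)
    uint32_t mandiff = (man1 > man2) ? man1 - man2 : man2 - man1;

    // faster to mask the bits!
    uint32_t nMask = ~((uint32_t(1) << nBitsToIgnore) - 1u);

    return (mandiff & nMask) == 0;
}

int main()
{
    assert(  about_equal_word(0x80000007u, 0x80000001u, 4) );  // only the low 4 bits differ
    assert( !about_equal_word(0x80000100u, 0x80000001u, 4) );  // differs above the mask
    return 0;
}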
diff --git a/ttmath/ttmathuint_noasm.h b/ttmath/ttmathuint_noasm.h
index 951f40d..c09f5e8 100644
--- a/ttmath/ttmathuint_noasm.h
+++ b/ttmath/ttmathuint_noasm.h
@@ -871,6 +871,83 @@ namespace ttmath
 		u3 = sub_res_low_.u_.low;
 	}
 
+	/*!
+	this static method adds one vector to the other
+	'ss1' is larger in size or equal to 'ss2'
+
+	ss1 points to the first (larger) vector
+	ss2 points to the second vector
+	ss1_size - size of the ss1 (and of the result too)
+	ss2_size - size of the ss2
+	result - the result vector (which has the same size as ss1: ss1_size)
+
+	Example:  ss1_size is 5, ss2_size is 3
+	ss1:      ss2:   result (output):
+	  5        1        5+1
+	  4        3        4+3
+	  2        7        2+7
+	  6                 6
+	  9                 9
+	of course the carry is propagated and will be returned from the last item
+	(this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = AddTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::AddVector")
+
+		return c;
+	}
+
+
+	/*!
+	this static method subtracts one vector from the other
+	'ss1' is larger in size or equal to 'ss2'
+
+	ss1 points to the first (larger) vector
+	ss2 points to the second vector
+	ss1_size - size of the ss1 (and of the result too)
+	ss2_size - size of the ss2
+	result - the result vector (which has the same size as ss1: ss1_size)
+
+	Example:  ss1_size is 5, ss2_size is 3
+	ss1:      ss2:   result (output):
+	  5        1        5-1
+	  4        3        4-3
+	  2        7        2-7
+	  6                 6-1  (the borrow from the previous item)
+	  9                 9
+	of course the borrow is propagated and will be returned from the last item
+	(this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = SubTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::SubVector")
+
+		return c;
+	}
+
diff --git a/ttmath/ttmathuint_x86.h b/ttmath/ttmathuint_x86.h
diff --git a/ttmath/ttmathuint_x86_64.h b/ttmath/ttmathuint_x86_64.h
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@@ -1,30 +1,30 @@
 /*
  * This file is a part of TTMath Bignum Library
  * and is distributed under the (new) BSD licence.
  * Author: Tomasz Sowa <t.sowa@ttmath.org>
  */
 
-/* 
+/*
  * Copyright (c) 2006-2009, Tomasz Sowa
  * All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
+ *
  *  * Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
- * 
+ *
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 
+ *
  *  * Neither the name Tomasz Sowa nor the names of contributors to this
  *    project may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
@@ -39,10 +39,10 @@
 #ifndef headerfilettmathuint_x86_64
 #define headerfilettmathuint_x86_64
 
-
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM64
 
+#pragma message("TTMATH_ASM64")
 
 /*!
 	\file ttmathuint_x86_64.h
 	\brief template class UInt<uint> with assembler code for 64bit x86_64 processors
 
 	this file is included at the end of ttmathuint.h
 */
 
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
 
 namespace ttmath
 {
@@ -113,14 +116,14 @@ namespace ttmath
 			this part should be compiled with gcc
 		*/
 		__asm__ __volatile__(
-		
+
 			"xorq %%rdx, %%rdx \n"
 			"neg %%rax \n"     // CF=1 if rax!=0 , CF=0 if rax==0
 
 		"1: \n"
 			"movq (%%rsi,%%rdx,8), %%rax \n"
 			"adcq %%rax, (%%rbx,%%rdx,8) \n"
-		
+
 			"incq %%rdx \n"
 			"decq %%rcx \n"
 			"jnz 1b \n"
@@ -134,7 +137,7 @@
 	#endif
 
 		TTMATH_LOG("UInt64::Add")
-		
+
 		return c;
 	}
@@ -150,7 +153,7 @@
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			AddInt(2,1)
 		then it'll be:
@@ -187,7 +190,7 @@
 		"1: \n"
 			"addq %%rax, (%%rbx,%%rdx,8) \n"
 			"jnc 2f \n"
-		
+
 			"movq $1, %%rax \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -204,7 +207,7 @@
 	#endif
 
 		TTMATH_LOG("UInt64::AddInt")
-		
+
 		return c;
 	}
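The inline adc loop in UInt<value_size>::Add above (like the adc_x64 routine in the MSVC .asm file further down) propagates a single carry flag across the whole table, one 64-bit limb per iteration. A portable sketch of the same loop, for illustration only: the name add_limbs is hypothetical, and unsigned __int128 (a GCC/Clang extension) stands in for the CPU's carry flag.

#include <cstdint>
#include <cstddef>

// adds p2 into p1 limb by limb, returning the final carry
static uint64_t add_limbs(uint64_t * p1, const uint64_t * p2, size_t n, uint64_t carry)
{
    for (size_t i = 0; i < n; ++i)
    {
        // double-width sum: low half is the limb, high half is the carry
        unsigned __int128 sum = (unsigned __int128)p1[i] + p2[i] + carry;
        p1[i]  = (uint64_t)sum;           // low word goes back into the table
        carry  = (uint64_t)(sum >> 64);   // high word becomes the next carry (the CF flag)
    }
    return carry;                         // what the asm routines report at the end
}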
@@ -236,14 +239,38 @@
 			table[1] = 4 + x1 = 14
 			table[2] = 5 + x2 = 25
 			table[3] = 6
-	
+
 		and no carry at the end of table[3]
 
-		(of course if there was a carry in table[2](5+20) then 
+		(of course if there was a carry in table[2](5+20) then
 		this carry would be passed to the table[3] etc.)
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
+	#if 0
+	{
+	uint i, c;
+
+	TTMATH_ASSERT( index < value_size )
+
+	printf("add %Id + %Id\n",x1,x2);
+	for(int i=index ; i<value_size ; i++)
+		printf("%Id ",table[i]);
+	printf("\n");
+
+	c = AddTwoWords(table[index], x1, 0, &table[index]);
+	c = AddTwoWords(table[index+1], x2, c, &table[index+1]);
+
+	for(i=index+2 ; i<value_size && c ; ++i)
+		c = AddTwoWords(table[i], 0, c, &table[i]);
+
+	printf("-> %d\n",c);
+
+	TTMATH_LOG("UInt::AddTwoInts")
+
+	return c;
+	}
+	#else
 	{
 		uint b = value_size;
 		uint * p1 = table;
 		uint c;
@@ -253,7 +280,14 @@
 	#ifndef __GNUC__
 		#if defined(_M_X64)
-			c = addindexed2_x64(p1,b,index,x2,x1);
+			//printf("add %Id + %Id\n",x1,x2);
+			//for(int i=index ; i<value_size ; i++)
+			//	printf("%Id ",table[i]);
+			//printf("\n");
+
+			c = addindexed2_x64(p1,b,index,x2,x1);
+
+			//printf("-> %d\n",c);
 		#else
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif
@@ -261,11 +295,11 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"subq %%rdx, %%rcx \n"
-	
+
 			"addq %%rsi, (%%rbx,%%rdx,8) \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -289,10 +323,12 @@
 	#endif
 
+
 		TTMATH_LOG("UInt64::AddTwoInts")
 
 		return c;
 	}
+	#endif
@@ -328,16 +364,16 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"xorq %%rdx, %%rdx \n"
 			"neg %%rax \n"     // CF=1 if rax!=0 , CF=0 if rax==0
 
 		"1: \n"
 			"movq (%%rsi,%%rdx,8), %%rax \n"
 			"sbbq %%rax, (%%rbx,%%rdx,8) \n"
-	
+
 			"incq %%rdx \n"
 			"decq %%rcx \n"
 			"jnz 1b \n"
@@ -366,7 +402,7 @@
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			SubInt(2,1)
 		then it'll be:
@@ -395,15 +431,15 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"subq %%rdx, %%rcx \n"
 
 		"1: \n"
 			"subq %%rax, (%%rbx,%%rdx,8) \n"
 			"jnc 2f \n"
-	
+
 			"movq $1, %%rax \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -436,7 +472,7 @@
 		for example: let this be 001010000
 		after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@@ -455,9 +491,9 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"xorq %%rdx, %%rdx \n"   // rdx=0
 			"neg %%rax \n"           // CF=1 if rax!=0 , CF=0 if rax==0
@@ -473,7 +509,7 @@
 			: "=c" (c), "=a" (dummy), "=d" (dummy2)
 			: "1" (c), "0" (b), "b" (p1)
 			: "cc", "memory" );
-	
+
 	#endif
 
 		TTMATH_LOG("UInt64::Rcl2_one")
@@ -512,7 +548,7 @@
 	#ifdef __GNUC__
 		uint dummy;
-	
+
 		__asm__ __volatile__(
 
 			"neg %%rax \n"   // CF=1 if rax!=0 , CF=0 if rax==0
@@ -549,7 +585,7 @@
 		for example: let this be 001010000
 		after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@@ -570,9 +606,9 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2, dummy3;
-	
+
 		__asm__ __volatile__(
-	
+
 			"movq %%rcx, %%rsi \n"
 			"movq $64, %%rcx \n"
 			"subq %%rsi, %%rcx \n"
@@ -595,11 +631,11 @@
 			"xorq %%rax, (%%rbx,%%rdx,8) \n"
 			"orq %%rsi, (%%rbx,%%rdx,8) \n"
 			"movq %%rax, %%rsi \n"
-	
+
 			"incq %%rdx \n"
 			"decq %%rdi \n"
 			"jnz 1b \n"
-	
+
 			"and $1, %%rax \n"
 
 			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
@@ -647,7 +683,7 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2, dummy3;
-	
+
 		__asm__ __volatile__(
 
 			"movq %%rcx, %%rsi \n"
@@ -674,11 +710,11 @@
 			"xorq %%rax, (%%rbx,%%rdx,8) \n"
 			"orq %%rsi, (%%rbx,%%rdx,8) \n"
 			"movq %%rax, %%rsi \n"
-	
+
 			"decq %%rdx \n"
 			"decq %%rdi \n"
 			"jnz 1b \n"
-	
+
 			"rolq $1, %%rax \n"
 			"andq $1, %%rax \n"
@@ -754,7 +790,7 @@ uint UInt<value_size>::SetBitInWord(uint & value, uint bit)
 	{
 		TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT )
-	
+
 		uint old_bit;
 		uint v = value;
@@ -778,7 +814,7 @@
 			"setc %%bl \n"
 			"movzx %%bl, %%rbx \n"
-	
+
 			: "=a" (v), "=b" (old_bit)
 			: "0" (v), "1" (bit)
 			: "cc" );
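SetBitInWord() sets one bit in a word and returns that bit's previous state; the visible tail of the hunk shows the carry flag being captured with setc/movzx (presumably after a bts instruction in the elided lines). A portable equivalent, for illustration only (set_bit_in_word is a hypothetical name):

#include <cstdint>

typedef uint64_t word;   // stand-in for ttmath::uint on a 64-bit build

// sets bit 'bit' (0..63) in 'value', returns the bit's previous state
static word set_bit_in_word(word & value, word bit)
{
    word old_bit = (value >> bit) & 1;   // what setc captures from the carry flag
    value |= word(1) << bit;             // what bts performs on the word
    return old_bit;
}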
"cc" ); @@ -803,7 +839,7 @@ namespace ttmath multiplication: result2:result1 = a * b result2 - higher word result1 - lower word of the result - + this methos never returns a carry ***this method is created only on a 64bit platform*** @@ -834,7 +870,7 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - + "mulq %%rdx \n" : "=a" (result1_), "=d" (result2_) @@ -857,13 +893,13 @@ namespace ttmath * * */ - + #ifndef __GNUC__ /*! this method calculates 64bits word a:b / 32bits c (a higher, b lower word) r = a:b / c and rest - remainder - + ***this method is created only on a 64bit platform*** * @@ -896,7 +932,7 @@ namespace ttmath #endif #ifdef __GNUC__ - + __asm__ __volatile__( "divq %%rcx \n" @@ -986,7 +1022,7 @@ namespace ttmath uint i, c = 0; TTMATH_ASSERT( ss1_size >= ss2_size ) - + for(i=0 ; i= ss2_size ) - + for(i=0 ; i cy still set! - dec rdx - jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) -done: - ret - -return_1: +next: + dec rdx ; does not modify CY too... + jnz loop1 lea rax, [rax+1] ret @@ -138,8 +135,6 @@ addindexed2_x64 ENDP ALIGN 8 - ALIGN 8 - ;---------------------------------------- sbb_x64 PROC @@ -152,15 +147,15 @@ sbb_x64 PROC xor rax, rax xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 - + ALIGN 16 - loop1: + loop1: mov rax,qword ptr [rdx + r11 * 8] sbb qword ptr [rcx + r11 * 8], rax lea r11, [r11+1] dec r8 jnz loop1 - + setc al movzx rax, al @@ -181,12 +176,12 @@ subindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints - + ALIGN 16 loop1: sub qword ptr [rcx + r8 * 8], r9 jnc done - + lea r8, [r8+1] mov r9, 1 dec rdx @@ -196,7 +191,7 @@ loop1: done: xor rax, rax ret - + return_1: mov rax, 1 ret @@ -217,17 +212,17 @@ rcl_x64 PROC mov r11, rcx ; table xor r10, r10 neg r8 ; CY set if r8 <> 0 - + ALIGN 16 loop1: rcl qword ptr [r11 + r10 * 8], 1 lea r10, [r10+1] dec rdx jnz loop1 - + setc al movzx rax, al - + ret rcl_x64 ENDP @@ -245,16 +240,16 @@ rcr_x64 PROC xor r10, r10 neg r8 ; CY set if r8 <> 0 - + ALIGN 16 loop1: rcr qword ptr -8[rcx + rdx * 8], 1 dec rdx jnz loop1 - + setc al movzx rax, al - + ret rcr_x64 ENDP @@ -270,7 +265,7 @@ div_x64 PROC ; rcx = &Hi ; rdx = &Lo ; r8 = nDiv - + mov r11, rcx mov r10, rdx @@ -295,21 +290,21 @@ rcl2_x64 PROC ; rdx = nSize ; r8 = bits ; r9 = c - + push rbx - + mov r10, rcx ; r10 = p1 - xor rax, rax - + xor rax, rax + mov rcx, 64 sub rcx, r8 - + mov r11, -1 shr r11, cl ; r11 = mask mov rcx, r8 ; rcx = count of bits - mov rbx, rax ; rbx = old value = 0 + mov rbx, rax ; rbx = old value = 0 or r9, r9 cmovnz rbx, r11 ; if (c) then old value = mask @@ -323,7 +318,7 @@ loop1: xor qword ptr [r10+r9*8], rax or qword ptr [r10+r9*8], rbx mov rbx, rax - + lea r9, [r9+1] dec rdx @@ -332,8 +327,8 @@ loop1: and rax, 1 pop rbx ret - -rcl2_x64 ENDP + +rcl2_x64 ENDP ;---------------------------------------- @@ -346,20 +341,20 @@ rcr2_x64 PROC ; rdx = nSize ; r8 = bits ; r9 = c - + push rbx mov r10, rcx ; r10 = p1 - xor rax, rax - + xor rax, rax + mov rcx, 64 sub rcx, r8 - + mov r11, -1 shl r11, cl ; r11 = mask mov rcx, r8 ; rcx = count of bits - mov rbx, rax ; rbx = old value = 0 + mov rbx, rax ; rbx = old value = 0 or r9, r9 cmovnz rbx, r11 ; if (c) then old value = mask @@ -374,18 +369,18 @@ loop1: xor qword ptr [r10+r9*8], rax or qword ptr [r10+r9*8], rbx mov rbx, rax - + lea r9, [r9-1] dec rdx jnz loop1 - + rol rax, 1 and rax, 1 pop rbx - + ret - -rcr2_x64 ENDP + +rcr2_x64 ENDP END