From de1e7ac957d9e16dfca091d7db593bb35648d56a Mon Sep 17 00:00:00 2001 From: Christian Kaiser Date: Wed, 20 May 2009 08:48:51 +0000 Subject: [PATCH] more optimizations for MSVC assembler (parallelism, prefetch optimization, loop alignment, ...) git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e --- ttmath/ttmathbig.h | 2 +- ttmath/ttmathconfig.h | 9 +- ttmath/ttmathuint_x86.h | 2288 +++++++++++++------------- ttmath/ttmathuint_x86_amd64_msvc.asm | 77 +- 4 files changed, 1195 insertions(+), 1181 deletions(-) diff --git a/ttmath/ttmathbig.h b/ttmath/ttmathbig.h index 978d8e8..b97e238 100644 --- a/ttmath/ttmathbig.h +++ b/ttmath/ttmathbig.h @@ -3434,7 +3434,7 @@ private: */ int FromString_ReadScientificIfExists(const tchar_t * & source) { - int c = 0; + uint c = 0; bool scientific_read = false; const tchar_t * before_scientific = source; diff --git a/ttmath/ttmathconfig.h b/ttmath/ttmathconfig.h index 785bc8e..13c0f8c 100644 --- a/ttmath/ttmathconfig.h +++ b/ttmath/ttmathconfig.h @@ -37,6 +37,7 @@ namespace ttmath { #if defined(_MSC_VER) + #include #if defined(_UNICODE) typedef wchar_t tchar_t; typedef std::wstring tstr_t; @@ -71,20 +72,20 @@ namespace ttmath public: clsCrit(void) { - ::InitializeCriticalSection(&_Crit); + InitializeCriticalSection(&_Crit); } virtual ~clsCrit(void) { - ::DeleteCriticalSection(&_Crit); + DeleteCriticalSection(&_Crit); } void Enter(void) const { - ::EnterCriticalSection(&_Crit); + EnterCriticalSection(&_Crit); } void Leave(void) const { - ::LeaveCriticalSection(&_Crit); + LeaveCriticalSection(&_Crit); } }; diff --git a/ttmath/ttmathuint_x86.h b/ttmath/ttmathuint_x86.h index e2b404a..67cde3c 100644 --- a/ttmath/ttmathuint_x86.h +++ b/ttmath/ttmathuint_x86.h @@ -1,1140 +1,1148 @@ -/* - * This file is a part of TTMath Bignum Library - * and is distributed under the (new) BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2006-2009, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name Tomasz Sowa nor the names of contributors to this - * project may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - - -#ifndef headerfilettmathuint_x86 -#define headerfilettmathuint_x86 - - -#ifndef TTMATH_NOASM -#ifdef TTMATH_PLATFORM32 - - -/*! - \file ttmathuint_x86.h - \brief template class UInt with assembler code for 32bit x86 processors - - this file is included at the end of ttmathuint.h -*/ - - - -/*! - \brief a namespace for the TTMath library -*/ -namespace ttmath -{ - - /*! - * - * basic mathematic functions - * - */ - - - /*! - adding ss2 to the this and adding carry if it's defined - (this = this + ss2 + c) - - c must be zero or one (might be a bigger value than 1) - function returns carry (1) (if it has been) - */ - template - uint UInt::Add(const UInt & ss2, uint c) - { - register uint b = value_size; - register uint * p1 = table; - register uint * p2 = const_cast(ss2.table); - - // we don't have to use TTMATH_REFERENCE_ASSERT here - // this algorithm doesn't require it - - #ifndef __GNUC__ - - // this part might be compiled with for example visual c - - __asm - { - xor eax,eax // eax=0 - mov ecx,[b] - mov edx,eax // edx=0 - mov ebx,[p1] - mov esi,[p2] - - sub eax,[c] // CF=c - - p: - mov eax,[esi+edx*4] - adc [ebx+edx*4],eax - - inc edx - dec ecx - jnz p - - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - - // this part should be compiled with gcc - - __asm__ __volatile__( - - "push %%ecx \n" - - "xorl %%eax, %%eax \n" - "movl %%eax, %%edx \n" - "subl %%edi, %%eax \n" - - - "1: \n" - "movl (%%esi,%%edx,4),%%eax \n" - "adcl %%eax, (%%ebx,%%edx,4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al,%%edx \n" - - "pop %%ecx \n" - - : "=d" (c) - : "D" (c), "c" (b), "b" (p1), "S" (p2) - : "%eax", "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Add") - } - - - - /*! - adding one word (at a specific position) - and returning a carry (if it has been) - - e.g. - - if we've got (value_size=3): - table[0] = 10; - table[1] = 30; - table[2] = 5; - and we call: - AddInt(2,1) - then it'll be: - table[0] = 10; - table[1] = 30 + 2; - table[2] = 5; - - of course if there was a carry from table[2] it would be returned - */ - template - uint UInt::AddInt(uint value, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size ) - - #ifndef __GNUC__ - - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov edx, [index] - mov ebx, [p1] - - mov eax, [value] - - p: - add [ebx+edx*4], eax - jnc end - - mov eax, 1 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%eax \n" - "push %%ecx \n" - - "subl %%edx, %%ecx \n" - - "1: \n" - "addl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "movl $1, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%edx \n" - - "pop %%ecx \n" - "pop %%eax \n" - - : "=d" (c) - : "a" (value), "c" (b), "0" (index), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::AddInt") - } - - - - - /*! 
- adding only two unsigned words to the existing value - and these words begin on the 'index' position - (it's used in the multiplication algorithm 2) - - index should be equal or smaller than value_size-2 (index <= value_size-2) - x1 - lower word, x2 - higher word - - for example if we've got value_size equal 4 and: - table[0] = 3 - table[1] = 4 - table[2] = 5 - table[3] = 6 - then let - x1 = 10 - x2 = 20 - and - index = 1 - - the result of this method will be: - table[0] = 3 - table[1] = 4 + x1 = 14 - table[2] = 5 + x2 = 25 - table[3] = 6 - - and no carry at the end of table[3] - - (of course if there was a carry in table[2](5+20) then - this carry would be passed to the table[3] etc.) - */ - template - uint UInt::AddTwoInts(uint x2, uint x1, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size - 1 ) - - #ifndef __GNUC__ - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov ebx, [p1] - mov edx, [index] - - mov eax, [x1] - add [ebx+edx*4], eax - inc edx - dec ecx - - mov eax, [x2] - - p: - adc [ebx+edx*4], eax - jnc end - - mov eax, 0 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%ecx \n" - "push %%edx \n" - - "subl %%edx, %%ecx \n" - - "addl %%esi, (%%ebx,%%edx,4) \n" - "incl %%edx \n" - "decl %%ecx \n" - - "1: \n" - "adcl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "mov $0, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%edx \n" - "pop %%ecx \n" - - : "=a" (c) - : "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::AddTwoInts") - } - - - - - - /*! - subtracting ss2 from the 'this' and subtracting - carry if it has been defined - (this = this - ss2 - c) - - c must be zero or one (might be a bigger value than 1) - function returns carry (1) (if it has been) - */ - template - uint UInt::Sub(const UInt & ss2, uint c) - { - register uint b = value_size; - register uint * p1 = table; - register uint * p2 = const_cast(ss2.table); - - // we don't have to use TTMATH_REFERENCE_ASSERT here - // this algorithm doesn't require it - - #ifndef __GNUC__ - - __asm - { - mov ecx,[b] - - mov ebx,[p1] - mov esi,[p2] - - xor eax, eax - mov edx, eax - - sub eax, [c] - - p: - mov eax, [esi+edx*4] - sbb [ebx+edx*4], eax - - inc edx - dec ecx - jnz p - - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - "push %%ecx \n" - - "xorl %%eax, %%eax \n" - "movl %%eax, %%edx \n" - "subl %%edi, %%eax \n" - - - "1: \n" - "movl (%%esi,%%edx,4),%%eax \n" - "sbbl %%eax, (%%ebx,%%edx,4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al,%%edx \n" - - "pop %%ecx \n" - - : "=d" (c) - : "D" (c), "c" (b), "b" (p1), "S" (p2) - : "%eax", "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Sub") - } - - - - - /*! - this method subtracts one word (at a specific position) - and returns a carry (if it was) - - e.g. 
- - if we've got (value_size=3): - table[0] = 10; - table[1] = 30; - table[2] = 5; - and we call: - SubInt(2,1) - then it'll be: - table[0] = 10; - table[1] = 30 - 2; - table[2] = 5; - - of course if there was a carry from table[2] it would be returned - */ - template - uint UInt::SubInt(uint value, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size ) - - #ifndef __GNUC__ - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov edx, [index] - mov ebx, [p1] - - mov eax, [value] - - p: - sub [ebx+edx*4], eax - jnc end - - mov eax, 1 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%eax \n" - "push %%ecx \n" - - "subl %%edx, %%ecx \n" - - "1: \n" - "subl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "movl $1, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%edx \n" - - "pop %%ecx \n" - "pop %%eax \n" - - : "=d" (c) - : "a" (value), "c" (b), "0" (index), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::SubInt") - - } - - - - /*! - this method moves all bits into the left hand side - return value <- this <- c - - the lowest *bit* will be held the 'c' and - the state of one additional bit (on the left hand side) - will be returned - - for example: - let this is 001010000 - after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0 - */ - template - uint UInt::Rcl2_one(uint c) - { - register sint b = value_size; - register uint * p1 = table; - - #ifndef __GNUC__ - __asm - { - mov ebx, [p1] - - xor edx, edx - mov ecx, edx - sub ecx, [c] - - mov ecx, [b] - - p: - rcl dword ptr [ebx+edx*4], 1 - - inc edx - dec ecx - jnz p - - setc dl - movzx eax, dl - } - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - - "push %%edx \n" - "push %%ecx \n" - - "xorl %%edx, %%edx \n" // edx=0 - "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 - - "1: \n" - "rcll $1, (%%ebx, %%edx, 4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%ecx \n" - "pop %%edx \n" - - : "=a" (c) - : "0" (c), "c" (b), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Rcl2_one") - - } - - - - /*! - this method moves all bits into the right hand side - c -> this -> return value - - the highest *bit* will be held the 'c' and - the state of one additional bit (on the right hand side) - will be returned - - for example: - let this is 000000010 - after Rcr2_one(1) there'll be 100000001 and Rcr2_one returns 0 - */ - template - uint UInt::Rcr2_one(uint c) - { - register sint b = value_size; - register uint * p1 = table; - - #ifndef __GNUC__ - __asm - { - mov ebx, [p1] - - xor ecx, ecx - sub ecx, [c] - - mov ecx, [b] - - p: - rcr dword ptr [ebx+ecx*4-4], 1 - - dec ecx - jnz p - - setc cl - movzx eax, cl - } - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - - "push %%ecx \n" - - "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 - - "1: \n" - "rcrl $1, -4(%%ebx, %%ecx, 4) \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%ecx \n" - - : "=a" (c) - : "0" (c), "c" (b), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Rcr2_one") - } - - - - /*! 
- this method moves all bits into the left hand side - return value <- this <- c - - the lowest *bits* will be held the 'c' and - the state of one additional bit (on the left hand side) - will be returned - - for example: - let this is 001010000 - after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1 - */ - template - uint UInt::Rcl2(uint bits, uint c) - { - TTMATH_ASSERT( bits>0 && bits this -> return value - - the highest *bits* will be held the 'c' and - the state of one additional bit (on the right hand side) - will be returned - - for example: - let this is 000000010 - after Rcr2(2, 1) there'll be 110000000 and Rcr2 returns 1 - */ - template - uint UInt::Rcr2(uint bits, uint c) - { - TTMATH_ASSERT( bits>0 && bits - sint UInt::FindLeadingBitInWord(uint x) - { - - #ifndef __GNUC__ - __asm - { - mov edx,-1 - bsr eax,[x] - cmovz eax,edx - } - #endif - - - #ifdef __GNUC__ - register sint result; - - __asm__ __volatile__( - - "bsrl %1, %0 \n" - "jnz 1f \n" - "movl $-1, %0 \n" - "1: \n" - - : "=R" (result) - : "R" (x) - : "cc" ); - - return result; - - #endif - } - - - - - - /*! - this method sets a special bit in the 'value' - and returns the last state of the bit (zero or one) - - bit is from <0,31> - e.g. - uint x = 100; - uint bit = SetBitInWord(x, 3); - now: x = 108 and bit = 0 - */ - template - uint UInt::SetBitInWord(uint & value, uint bit) - { - TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT ) - - uint v = value; - - #ifndef __GNUC__ - __asm - { - mov eax, [v] - mov ebx, [bit] - bts eax, ebx - mov [v], eax - - setc bl - movzx ebx, bl - mov eax, ebx - } - #endif - - - #ifdef __GNUC__ - uint old_bit; - - __asm__ __volatile__( - - "btsl %%ebx, %%eax \n" - - "setc %%bl \n" - "movzx %%bl, %%ebx \n" - - : "=a" (v), "=b" (old_bit) - : "0" (v), "1" (bit) - : "cc" ); - - return old_bit; - #endif - - value = v; - } - - - - - /*! - multiplication: result_high:result_low = a * b - result_high - higher word of the result - result_low - lower word of the result - - this methos never returns a carry - this method is used in the second version of the multiplication algorithms - */ - template - void UInt::MulTwoWords(uint a, uint b, uint * result_high, uint * result_low) - { - /* - we must use these temporary variables in order to inform the compilator - that value pointed with result1 and result2 has changed - - this has no effect in visual studio but it's useful when - using gcc and options like -Ox - */ - register uint result1_; - register uint result2_; - - #ifndef __GNUC__ - - __asm - { - mov eax, [a] - mul dword ptr [b] - - mov [result2_], edx - mov [result1_], eax - } - - #endif - - - #ifdef __GNUC__ - - __asm__ __volatile__( - - "mull %%edx \n" - - : "=a" (result1_), "=d" (result2_) - : "0" (a), "1" (b) - : "cc" ); - - #endif - - - *result_low = result1_; - *result_high = result2_; - } - - - - - - /*! - * - * Division - * - * - */ - - - - - /*! 
- this method calculates 64bits word a:b / 32bits c (a higher, b lower word) - r = a:b / c and rest - remainder - - * - * WARNING: - * if r (one word) is too small for the result or c is equal zero - * there'll be a hardware interruption (0) - * and probably the end of your program - * - */ - template - void UInt::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest) - { - register uint r_; - register uint rest_; - /* - these variables have similar meaning like those in - the multiplication algorithm MulTwoWords - */ - - TTMATH_ASSERT( c != 0 ) - - #ifndef __GNUC__ - __asm - { - mov edx, [a] - mov eax, [b] - div dword ptr [c] - - mov [r_], eax - mov [rest_], edx - } - #endif - - - #ifdef __GNUC__ - - __asm__ __volatile__( - - "divl %%ecx \n" - - : "=a" (r_), "=d" (rest_) - : "d" (a), "a" (b), "c" (c) - : "cc" ); - - #endif - - - *r = r_; - *rest = rest_; - - } - - - -} //namespace - - - -#endif //ifdef TTMATH_PLATFORM32 -#endif //ifndef TTMATH_NOASM -#endif +/* + * This file is a part of TTMath Bignum Library + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2006-2009, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + + + +#ifndef headerfilettmathuint_x86 +#define headerfilettmathuint_x86 + + +#ifndef TTMATH_NOASM +#ifdef TTMATH_PLATFORM32 + + +/*! + \file ttmathuint_x86.h + \brief template class UInt with assembler code for 32bit x86 processors + + this file is included at the end of ttmathuint.h +*/ + + + +/*! + \brief a namespace for the TTMath library +*/ +namespace ttmath +{ + + /*! + * + * basic mathematic functions + * + */ + + + /*! 
+ adding ss2 to the this and adding carry if it's defined + (this = this + ss2 + c) + + c must be zero or one (might be a bigger value than 1) + function returns carry (1) (if it has been) + */ + template + uint UInt::Add(const UInt & ss2, uint c) + { + register uint b = value_size; + register uint * p1 = table; + register uint * p2 = const_cast(ss2.table); + + // we don't have to use TTMATH_REFERENCE_ASSERT here + // this algorithm doesn't require it + + #ifndef __GNUC__ + + // this part might be compiled with for example visual c + + __asm + { + xor eax,eax // eax=0 + mov ecx,[b] + mov edx,eax // edx=0 + mov ebx,[p1] + mov esi,[p2] + + sub eax,[c] // CF=c + + ALIGN 16 + p: + mov eax,[esi+edx*4+0] + adc [ebx+edx*4+0],eax + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + + // this part should be compiled with gcc + + __asm__ __volatile__( + + "push %%ecx \n" + + "xorl %%eax, %%eax \n" + "movl %%eax, %%edx \n" + "subl %%edi, %%eax \n" + + + "1: \n" + "movl (%%esi,%%edx,4),%%eax \n" + "adcl %%eax, (%%ebx,%%edx,4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al,%%edx \n" + + "pop %%ecx \n" + + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) + : "%eax", "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Add") + } + + + + /*! + adding one word (at a specific position) + and returning a carry (if it has been) + + e.g. + + if we've got (value_size=3): + table[0] = 10; + table[1] = 30; + table[2] = 5; + and we call: + AddInt(2,1) + then it'll be: + table[0] = 10; + table[1] = 30 + 2; + table[2] = 5; + + of course if there was a carry from table[2] it would be returned + */ + template + uint UInt::AddInt(uint value, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size ) + + #ifndef __GNUC__ + + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov edx, [index] + mov ebx, [p1] + + mov eax, [value] + + ALIGN 16 + p: + add [ebx+edx*4], eax + jnc end + + mov eax, 1 + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%eax \n" + "push %%ecx \n" + + "subl %%edx, %%ecx \n" + + "1: \n" + "addl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "movl $1, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%edx \n" + + "pop %%ecx \n" + "pop %%eax \n" + + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::AddInt") + } + + + + + /*! + adding only two unsigned words to the existing value + and these words begin on the 'index' position + (it's used in the multiplication algorithm 2) + + index should be equal or smaller than value_size-2 (index <= value_size-2) + x1 - lower word, x2 - higher word + + for example if we've got value_size equal 4 and: + table[0] = 3 + table[1] = 4 + table[2] = 5 + table[3] = 6 + then let + x1 = 10 + x2 = 20 + and + index = 1 + + the result of this method will be: + table[0] = 3 + table[1] = 4 + x1 = 14 + table[2] = 5 + x2 = 25 + table[3] = 6 + + and no carry at the end of table[3] + + (of course if there was a carry in table[2](5+20) then + this carry would be passed to the table[3] etc.) 
+ */ + template + uint UInt::AddTwoInts(uint x2, uint x1, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size - 1 ) + + #ifndef __GNUC__ + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov ebx, [p1] + mov edx, [index] + + mov eax, [x1] + add [ebx+edx*4], eax + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + lea ecx, [ecx-1] + + mov eax, [x2] + + ALIGN 16 + p: + adc [ebx+edx*4], eax + jnc end + + xor eax, eax + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%ecx \n" + "push %%edx \n" + + "subl %%edx, %%ecx \n" + + "addl %%esi, (%%ebx,%%edx,4) \n" + "incl %%edx \n" + "decl %%ecx \n" + + "1: \n" + "adcl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "mov $0, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%edx \n" + "pop %%ecx \n" + + : "=a" (c) + : "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::AddTwoInts") + } + + + + + + /*! + subtracting ss2 from the 'this' and subtracting + carry if it has been defined + (this = this - ss2 - c) + + c must be zero or one (might be a bigger value than 1) + function returns carry (1) (if it has been) + */ + template + uint UInt::Sub(const UInt & ss2, uint c) + { + register uint b = value_size; + register uint * p1 = table; + register uint * p2 = const_cast(ss2.table); + + // we don't have to use TTMATH_REFERENCE_ASSERT here + // this algorithm doesn't require it + + #ifndef __GNUC__ + + __asm + { + mov ecx,[b] + + mov ebx,[p1] + mov esi,[p2] + + xor eax, eax + mov edx, eax + + sub eax, [c] + + ALIGN 16 + p: + mov eax, [esi+edx*4] + sbb [ebx+edx*4], eax + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + "push %%ecx \n" + + "xorl %%eax, %%eax \n" + "movl %%eax, %%edx \n" + "subl %%edi, %%eax \n" + + + "1: \n" + "movl (%%esi,%%edx,4),%%eax \n" + "sbbl %%eax, (%%ebx,%%edx,4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al,%%edx \n" + + "pop %%ecx \n" + + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) + : "%eax", "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Sub") + } + + + + + /*! + this method subtracts one word (at a specific position) + and returns a carry (if it was) + + e.g. 
+ + if we've got (value_size=3): + table[0] = 10; + table[1] = 30; + table[2] = 5; + and we call: + SubInt(2,1) + then it'll be: + table[0] = 10; + table[1] = 30 - 2; + table[2] = 5; + + of course if there was a carry from table[2] it would be returned + */ + template + uint UInt::SubInt(uint value, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size ) + + #ifndef __GNUC__ + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov edx, [index] + mov ebx, [p1] + + mov eax, [value] + + ALIGN 16 + p: + sub [ebx+edx*4], eax + jnc end + + mov eax, 1 + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%eax \n" + "push %%ecx \n" + + "subl %%edx, %%ecx \n" + + "1: \n" + "subl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "movl $1, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%edx \n" + + "pop %%ecx \n" + "pop %%eax \n" + + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::SubInt") + + } + + + + /*! + this method moves all bits into the left hand side + return value <- this <- c + + the lowest *bit* will be held the 'c' and + the state of one additional bit (on the left hand side) + will be returned + + for example: + let this is 001010000 + after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0 + */ + template + uint UInt::Rcl2_one(uint c) + { + register sint b = value_size; + register uint * p1 = table; + + #ifndef __GNUC__ + __asm + { + mov ebx, [p1] + + xor edx, edx + mov ecx, edx + sub ecx, [c] + + mov ecx, [b] + + ALIGN 16 + p: + rcl dword ptr [ebx+edx*4], 1 + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + + "push %%edx \n" + "push %%ecx \n" + + "xorl %%edx, %%edx \n" // edx=0 + "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 + + "1: \n" + "rcll $1, (%%ebx, %%edx, 4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%ecx \n" + "pop %%edx \n" + + : "=a" (c) + : "0" (c), "c" (b), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Rcl2_one") + + } + + + + /*! + this method moves all bits into the right hand side + c -> this -> return value + + the highest *bit* will be held the 'c' and + the state of one additional bit (on the right hand side) + will be returned + + for example: + let this is 000000010 + after Rcr2_one(1) there'll be 100000001 and Rcr2_one returns 0 + */ + template + uint UInt::Rcr2_one(uint c) + { + register sint b = value_size; + register uint * p1 = table; + + #ifndef __GNUC__ + __asm + { + xor ecx, ecx + sub ecx, [c] + + mov ebx, [p1] + mov ecx, [b] + + ALIGN 16 + p: + rcr dword ptr [ebx+ecx*4-4], 1 + + dec ecx + jnz p + + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + + "push %%ecx \n" + + "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 + + "1: \n" + "rcrl $1, -4(%%ebx, %%ecx, 4) \n" + + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%ecx \n" + + : "=a" (c) + : "0" (c), "c" (b), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Rcr2_one") + } + + + + /*! 
+ this method moves all bits into the left hand side + return value <- this <- c + + the lowest *bits* will be held the 'c' and + the state of one additional bit (on the left hand side) + will be returned + + for example: + let this is 001010000 + after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1 + */ + template + uint UInt::Rcl2(uint bits, uint c) + { + TTMATH_ASSERT( bits>0 && bits this -> return value + + the highest *bits* will be held the 'c' and + the state of one additional bit (on the right hand side) + will be returned + + for example: + let this is 000000010 + after Rcr2(2, 1) there'll be 110000000 and Rcr2 returns 1 + */ + template + uint UInt::Rcr2(uint bits, uint c) + { + TTMATH_ASSERT( bits>0 && bits + sint UInt::FindLeadingBitInWord(uint x) + { + + #ifndef __GNUC__ + __asm + { + mov edx,-1 + bsr eax,[x] + cmovz eax,edx + } + #endif + + + #ifdef __GNUC__ + register sint result; + + __asm__ __volatile__( + + "bsrl %1, %0 \n" + "jnz 1f \n" + "movl $-1, %0 \n" + "1: \n" + + : "=R" (result) + : "R" (x) + : "cc" ); + + return result; + + #endif + } + + + + + + /*! + this method sets a special bit in the 'value' + and returns the last state of the bit (zero or one) + + bit is from <0,31> + e.g. + uint x = 100; + uint bit = SetBitInWord(x, 3); + now: x = 108 and bit = 0 + */ + template + uint UInt::SetBitInWord(uint & value, uint bit) + { + TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT ) + + uint v = value; + + #ifndef __GNUC__ + __asm + { + mov eax, [v] + mov ebx, [bit] + bts eax, ebx + mov [v], eax + + setc bl + movzx ebx, bl + mov eax, ebx + } + #endif + + + #ifdef __GNUC__ + uint old_bit; + + __asm__ __volatile__( + + "btsl %%ebx, %%eax \n" + + "setc %%bl \n" + "movzx %%bl, %%ebx \n" + + : "=a" (v), "=b" (old_bit) + : "0" (v), "1" (bit) + : "cc" ); + + return old_bit; + #endif + + value = v; + } + + + + + /*! + multiplication: result_high:result_low = a * b + result_high - higher word of the result + result_low - lower word of the result + + this methos never returns a carry + this method is used in the second version of the multiplication algorithms + */ + template + void UInt::MulTwoWords(uint a, uint b, uint * result_high, uint * result_low) + { + /* + we must use these temporary variables in order to inform the compilator + that value pointed with result1 and result2 has changed + + this has no effect in visual studio but it's useful when + using gcc and options like -Ox + */ + register uint result1_; + register uint result2_; + + #ifndef __GNUC__ + + __asm + { + mov eax, [a] + mul dword ptr [b] + + mov [result2_], edx + mov [result1_], eax + } + + #endif + + + #ifdef __GNUC__ + + __asm__ __volatile__( + + "mull %%edx \n" + + : "=a" (result1_), "=d" (result2_) + : "0" (a), "1" (b) + : "cc" ); + + #endif + + + *result_low = result1_; + *result_high = result2_; + } + + + + + + /*! + * + * Division + * + * + */ + + + + + /*! 
+ this method calculates 64bits word a:b / 32bits c (a higher, b lower word) + r = a:b / c and rest - remainder + + * + * WARNING: + * if r (one word) is too small for the result or c is equal zero + * there'll be a hardware interruption (0) + * and probably the end of your program + * + */ + template + void UInt::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest) + { + register uint r_; + register uint rest_; + /* + these variables have similar meaning like those in + the multiplication algorithm MulTwoWords + */ + + TTMATH_ASSERT( c != 0 ) + + #ifndef __GNUC__ + __asm + { + mov edx, [a] + mov eax, [b] + div dword ptr [c] + + mov [r_], eax + mov [rest_], edx + } + #endif + + + #ifdef __GNUC__ + + __asm__ __volatile__( + + "divl %%ecx \n" + + : "=a" (r_), "=d" (rest_) + : "d" (a), "a" (b), "c" (c) + : "cc" ); + + #endif + + + *r = r_; + *rest = rest_; + + } + + + +} //namespace + + + +#endif //ifdef TTMATH_PLATFORM32 +#endif //ifndef TTMATH_NOASM +#endif diff --git a/ttmath/ttmathuint_x86_amd64_msvc.asm b/ttmath/ttmathuint_x86_amd64_msvc.asm index 7d430f3..60de740 100644 --- a/ttmath/ttmathuint_x86_amd64_msvc.asm +++ b/ttmath/ttmathuint_x86_amd64_msvc.asm @@ -31,23 +31,21 @@ adc_x64 PROC ; r9 = nCarry xor rax, rax - mov r11, 0 + xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 + ALIGN 16 loop1: mov rax,qword ptr [rdx + r11 * 8] adc qword ptr [rcx + r11 * 8], rax - inc r11 + lea r11, [r11+1] dec r8 jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax - ret - - return_1: - mov rax, 1 + setc al + movzx rax, al + ret adc_x64 ENDP @@ -66,24 +64,22 @@ addindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints + ALIGN 16 loop1: add qword ptr [rcx + r8 * 8], r9 jnc done - inc r8 + lea r8, [r8+1] mov r9, 1 dec rdx jnz loop1 done: - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax + setc al + movzx rax, al + ret - return_1: - mov rax, 1 - ret - addindexed_x64 ENDP ;---------------------------------------- @@ -100,28 +96,32 @@ addindexed2_x64 PROC ; r9 = nValue1 ; [esp+0x28] = nValue2 + xor rax, rax ; return value mov r11, rcx ; table sub rdx, r8 ; rdx = remaining count of uints mov r10, [esp+028h] ; r10 = nValue2 add qword ptr [r11 + r8 * 8], r10 - inc r8 + lea r8, [r8+1] + + ALIGN 16 loop1: adc qword ptr [r11 + r8 * 8], r9 - jnc done + jc next + ret - inc r8 - mov r9, 0 ; set to 0 -> cy still set! +next: + lea r8, [r8+1] + xor r9, r9 ; set to 0 -> cy still set! 
dec rdx jnz loop1 jc return_1 ; most of the times, there will be NO carry (I hope) done: - xor rax, rax ret - return_1: - mov rax, 1 +return_1: + lea rax, [rax+1] ret addindexed2_x64 ENDP @@ -142,23 +142,20 @@ sbb_x64 PROC ; r9 = nCarry xor rax, rax - mov r11, 0 + xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 + ALIGN 16 loop1: mov rax,qword ptr [rdx + r11 * 8] sbb qword ptr [rcx + r11 * 8], rax - inc r11 + lea r11, [r11+1] dec r8 - jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax - ret - - return_1: - mov rax, 1 + setc al + movzx rax, al + ret sbb_x64 ENDP @@ -176,11 +173,13 @@ subindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints + + ALIGN 16 loop1: sub qword ptr [rcx + r8 * 8], r9 jnc done - inc r8 + lea r8, [r8+1] mov r9, 1 dec rdx jnz loop1 @@ -210,9 +209,11 @@ rcl_x64 PROC mov r11, rcx ; table xor r10, r10 neg r8 ; CY set if r8 <> 0 + + ALIGN 16 loop1: rcl qword ptr [r11 + r10 * 8], 1 - inc r10 + lea r10, [r10+1] dec rdx jnz loop1 @@ -236,6 +237,8 @@ rcr_x64 PROC xor r10, r10 neg r8 ; CY set if r8 <> 0 + + ALIGN 16 loop1: rcr qword ptr -8[rcx + rdx * 8], 1 dec rdx @@ -304,6 +307,7 @@ rcl2_x64 PROC mov r9, rax ; r9 = index (0..nSize-1) + ALIGN 16 loop1: rol qword ptr [r10+r9*8], cl mov rax, qword ptr [r10+r9*8] @@ -312,7 +316,7 @@ loop1: or qword ptr [r10+r9*8], rbx mov rbx, rax - inc r9 + lea r9, [r9+1] dec rdx jnz loop1 @@ -352,8 +356,9 @@ rcr2_x64 PROC cmovnz rbx, r11 ; if (c) then old value = mask mov r9, rdx ; r9 = index (0..nSize-1) - dec r9 + lea r9, [r9-1] + ALIGN 16 loop1: ror qword ptr [r10+r9*8], cl mov rax, qword ptr [r10+r9*8] @@ -362,7 +367,7 @@ loop1: or qword ptr [r10+r9*8], rbx mov rbx, rax - dec r9 + lea r9, [r9-1] dec rdx jnz loop1
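
Illustrative sketch (not part of the patch): the hunks above repeatedly apply the same three changes to each word loop -- a 16-byte ALIGN on the loop head, LEA in place of INC/DEC for index updates so the carry flag feeding ADC/SBB/RCL/RCR is never clobbered between iterations, and a branchless SETC/MOVZX epilogue in place of the old jc/return_1 paths. The fragment below distils that pattern from the adc_x64 hunk; the procedure name sample_adc_loop is illustrative only and assumes the usual .code segment and PUBLIC declaration of the patched .asm file. Note that DEC leaves CF untouched (it only updates ZF/SF/OF/PF), which is what lets the ADC chain survive the loop counter update.

    sample_adc_loop PROC
        ; rcx = destination table, rdx = source table
        ; r8  = word count, r9 = incoming carry (0 or 1)
        xor   rax, rax
        xor   r11, r11                    ; index = 0
        sub   rax, r9                     ; CF = 1 iff r9 != 0

        ALIGN 16
    loop1:
        mov   rax, qword ptr [rdx + r11 * 8]
        adc   qword ptr [rcx + r11 * 8], rax
        lea   r11, [r11 + 1]              ; increment without touching flags
        dec   r8                          ; ZF for the branch, CF preserved
        jnz   loop1

        setc  al                          ; branchless: return the final carry
        movzx rax, al
        ret
    sample_adc_loop ENDP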