merged: x86_64 asm code for Microsoft Visual compiler

file: ttmathuint_x86_64_msvc.asm from chk branch (original was: ttmathuint_x86_amd64_msvc.asm)
        (this file should be compiled first because MS VC doesn't support inline assembler in x86_64 mode) 



git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@187 e52654a7-88a9-db11-a3e9-0013d4bc506e
Tomasz Sowa 2009-09-07 02:03:00 +00:00
parent 0d71b0cec2
commit 28964d30f7
6 changed files with 780 additions and 109 deletions

View File

@@ -1,4 +1,4 @@
-Version 0.9.0 prerelease (2009.09.05):
+Version 0.9.0 prerelease (2009.09.07):
 * added: support for wide characters (wchar_t)
          wide characters are used when macro TTMATH_USE_WCHAR is defined
          this macro is defined automatically when there is macro UNICODE or _UNICODE defined
@@ -22,6 +22,9 @@ Version 0.9.0 prerelease (2009.09.05):
          and use TTMATH_MULTITHREADS_HELPER macro somewhere in your *.cpp file
 * added: Big::AboutEqual(const Big<exp,man> & ss2, int nBitsToIgnore = 4)
          the last nBitsToIgnore bits from mantissas will be skipped when comparing
+* added: x86_64 asm code for Microsoft Visual compiler
+         file: ttmathuint_x86_64_msvc.asm
+         (this file should be compiled first because MS VC doesn't support inline assembler in x86_64 mode)
 * changed: Factorial() is using the Gamma() function now
 * removed: Parser<>::SetFactorialMax() method
            the factorial() is so fast now that we don't need the method any longer

View File

@@ -3916,75 +3916,75 @@ public:
 	}

 	bool AboutEqual(const Big<exp,man> & ss2, int nBitsToIgnore = 4) const
 	{
 		// we should check the mantissas beforehand because sometimes we can have
 		// a mantissa set to zero but some other value in the exponent
 		// (maybe we've forgotten about calling CorrectZero() ?)
 		if( mantissa.IsZero() )
 		{
 			if( ss2.mantissa.IsZero() )
 				return true;

 			return(ss2.AboutEqual(*this,nBitsToIgnore));
 		}

 		if( ss2.mantissa.IsZero() )
 		{
 			return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
 		}

 		// exponents may not differ much!
 		ttmath::Int<exp> expdiff(this->exponent - ss2.exponent);

 		// they may differ by one if for example mantissa1=0x80000000, mantissa2=0xffffffff
 		if( ttmath::Abs(expdiff) > 1 )
 			return(false);

 		// calculate the 'difference' mantissa
 		ttmath::UInt<man> man1(this->mantissa);
 		ttmath::UInt<man> man2(ss2.mantissa);
 		ttmath::UInt<man> mandiff;

 		switch( expdiff.ToInt() )
 		{
 			case +1:
 				man2.Rcr(1,0);
 				mandiff = man1;
 				mandiff.Sub(man2);
 				break;

 			case -1:
 				man1.Rcr(1,0);
 				mandiff = man2;
 				mandiff.Sub(man1);
 				break;

 			default:
 				if( man2 > man1 )
 				{
 					mandiff = man2;
 					mandiff.Sub(man1);
 				}
 				else
 				{
 					mandiff = man1;
 					mandiff.Sub(man2);
 				}
 				break;
 		}

 		// faster to mask the bits!
 		TTMATH_ASSERT( nBitsToIgnore < TTMATH_BITS_PER_UINT );

 		for( int n = man-1; n > 0; --n )
 		{
 			if( mandiff.table[n] != 0 )
 				return(false);
 		}

 		uint nMask = ~((1 << nBitsToIgnore) - 1);

 		return((mandiff.table[0] & nMask) == 0);
 	}
bool operator<(const Big<exp,man> & ss2) const bool operator<(const Big<exp,man> & ss2) const
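As a quick orientation for the new method, a minimal usage sketch (hypothetical values; assumes the ttmath headers are on the include path as ttmath/ttmath.h):

#include <ttmath/ttmath.h>
#include <iostream>

int main()
{
	ttmath::Big<1,2> a, b;
	a = "1.0";
	b = "1.0";

	// perturb the lowest 3 bits of b's mantissa (mantissa and table are public)
	b.mantissa.table[0] |= 7;

	// prints "about equal": with the default nBitsToIgnore = 4
	// the last 4 mantissa bits are masked out before comparing
	std::cout << (a.AboutEqual(b) ? "about equal" : "different") << std::endl;

	return 0;
}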

View File

@@ -162,8 +162,14 @@ namespace ttmath
 	/*!
 		on 64bit platforms one word (uint, sint) will be equal 64bits
 	*/
-	typedef unsigned long uint;
-	typedef signed long sint;
+	#ifdef _MSC_VER
+		/* in VC 'long' type has 32 bits, __int64 is VC extension */
+		typedef unsigned __int64 uint;
+		typedef signed __int64 sint;
+	#else
+		typedef unsigned long uint;
+		typedef signed long sint;
+	#endif

 	/*!
 		on 64bit platform we do not define ulint
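The reason for this change, in short: 64-bit Windows is LLP64, so 'long' stays 32 bits there and only the __int64 extension gives a 64-bit word, while 64-bit GCC targets are LP64, where 'long' is already 64 bits. A standalone sketch that makes the difference visible (hypothetical, not part of the patch):

#include <cstdio>

int main()
{
#ifdef _MSC_VER
	/* LLP64: long = 4 bytes, __int64 = 8 bytes */
	printf("long: %u bytes, __int64: %u bytes\n",
	       (unsigned)sizeof(long), (unsigned)sizeof(__int64));
#else
	/* LP64 (64-bit GCC): long = 8 bytes */
	printf("long: %u bytes\n", (unsigned)sizeof(long));
#endif

	return 0;
}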

View File

@@ -3297,6 +3297,17 @@ public:
 	static uint SetBitInWord(uint & value, uint bit);
 	static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
 	static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);

+	/* temporarily */
+#ifndef TTMATH_NOASM
+#ifdef TTMATH_PLATFORM64
+	#ifdef _MSC_VER
+		static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
+		static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
+	#endif
+#endif
+#endif
+
 };

View File

@@ -51,10 +51,33 @@
 	this file is included at the end of ttmathuint.h
 */

+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
 namespace ttmath
 {

+#ifdef _MSC_VER
+
+	extern "C"
+	{
+	uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
+	uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+	uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
+	uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
+	uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
+	uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
+	uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
+	uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
+	uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
+	uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
+	};
+
+#endif
+
 	/*!
 	 *
 	 *  basic mathematic functions
@@ -82,8 +105,12 @@ namespace ttmath
 	// we don't have to use TTMATH_REFERENCE_ASSERT here
 	// this algorithm doesn't require it

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
+	#endif
+
+	#ifdef _MSC_VER
+		c = adc_x64(p1,p2,b,c);
 	#endif

 	#ifdef __GNUC__
@@ -149,10 +176,16 @@ namespace ttmath
 	TTMATH_ASSERT( index < value_size )

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = addindexed_x64(p1,b,index,value);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy, dummy2;
@@ -227,10 +260,16 @@ namespace ttmath
 	TTMATH_ASSERT( index < value_size - 1 )

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = addindexed2_x64(p1,b,index,x1,x2);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy, dummy2;
@@ -288,6 +327,9 @@ namespace ttmath
 	of course the carry is propagated and will be returned from the last item
 	(this method is used by the Karatsuba multiplication algorithm)
 	*/
+#ifndef _MSC_VER
+
 	template<uint value_size>
 	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
 	{
@@ -296,10 +338,16 @@ namespace ttmath
 	uint rest = ss1_size - ss2_size;
 	uint c;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy1, dummy2, dummy3;
@@ -348,8 +396,27 @@ namespace ttmath
 	return c;
 	}

+#else
+
+	/* temporarily */
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = AddTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::AddVector")
+
+		return c;
+	}
+
+#endif

 	/*!
@@ -373,10 +440,16 @@ namespace ttmath
 	// we don't have to use TTMATH_REFERENCE_ASSERT here
 	// this algorithm doesn't require it

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = sbb_x64(p1,p2,b,c);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy, dummy2;
@@ -432,15 +505,22 @@ namespace ttmath
 	uint b = value_size;
 	uint * p1 = table;
 	uint c;
-	uint dummy, dummy2;

 	TTMATH_ASSERT( index < value_size )

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = subindexed_x64(p1,b,index,value);
+	#endif
+
 	#ifdef __GNUC__
+	uint dummy, dummy2;

 	__asm__ __volatile__(

 		"subq %%rdx, %%rcx \n"
@@ -493,6 +573,9 @@ namespace ttmath
 	of course the carry (borrow) is propagated and will be returned from the last item
 	(this method is used by the Karatsuba multiplication algorithm)
 	*/
+#ifndef _MSC_VER
+
 	template<uint value_size>
 	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
 	{
@@ -501,16 +584,22 @@ namespace ttmath
 	uint rest = ss1_size - ss2_size;
 	uint c;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+	#endif
+
 	#ifdef __GNUC__

-	/*
-	the asm code is nearly the same as in AddVector
-	only two instructions 'adc' are changed to 'sbb'
-	*/
+	// the asm code is nearly the same as in AddVector
+	// only two instructions 'adc' are changed to 'sbb'
+
 	uint dummy1, dummy2, dummy3;

 	__asm__ __volatile__(
@@ -556,6 +645,27 @@ namespace ttmath
 	return c;
 	}

+#else
+
+	/* temporarily */
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = SubTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::SubVector")
+
+		return c;
+	}
+
+#endif

 	/*!
@@ -579,10 +689,16 @@ namespace ttmath
 	uint * p1 = table;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = rcl_x64(p1,b,c);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy, dummy2;
@@ -633,10 +749,16 @@ namespace ttmath
 	uint * p1 = table;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = rcr_x64(p1,b,c);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy;
@@ -688,10 +810,16 @@ namespace ttmath
 	uint * p1 = table;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		c = rcl2_x64(p1,b,bits,c);
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy, dummy2, dummy3;
@@ -758,14 +886,20 @@ namespace ttmath
 	sint b = value_size;
 	uint * p1 = table;
-	uint dummy, dummy2, dummy3;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
+	#endif
+
+	#ifdef _MSC_VER
+		c = rcr2_x64(p1,b,bits,c);
 	#endif

 	#ifdef __GNUC__
+	uint dummy, dummy2, dummy3;

 	__asm__ __volatile__(

 		"movq %%rcx, %%rsi \n"
@@ -823,10 +957,23 @@ namespace ttmath
 	sint result;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+
+		unsigned long nIndex = 0;
+
+		if( _BitScanReverse64(&nIndex,x) == 0 )
+			result = -1;
+		else
+			result = nIndex;
+
+	#endif
+
 	#ifdef __GNUC__
 	uint dummy;
@@ -868,11 +1015,16 @@ namespace ttmath
 	uint old_bit;
 	uint v = value;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		old_bit = _bittestandset64((__int64*)&value,bit) != 0;
+	#endif
+
 	#ifdef __GNUC__

 	__asm__ (
@@ -924,10 +1076,16 @@ namespace ttmath
 	uint result1_;
 	uint result2_;

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+		result1_ = _umul128(a,b,&result2_);
+	#endif
+
 	#ifdef __GNUC__

 	__asm__ (
@@ -981,10 +1139,20 @@ namespace ttmath
 	TTMATH_ASSERT( c != 0 )

-	#ifndef __GNUC__
-	#error "another compiler than GCC is currently not supported in 64bit mode"
+	#if !defined(__GNUC__) && !defined(_MSC_VER)
+	#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
 	#endif

+	#ifdef _MSC_VER
+
+		div_x64(&a,&b,c);
+		r_ = a;
+		rest_ = b;
+
+	#endif
+
 	#ifdef __GNUC__

 	__asm__ (
@@ -1003,6 +1171,59 @@ namespace ttmath
 	}

+
+	/* temporarily */
+	template<uint value_size>
+	uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
+	{
+		uint temp;
+
+		if( carry == 0 )
+		{
+			temp = a + b;
+
+			if( temp < a )
+				carry = 1;
+		}
+		else
+		{
+			carry = 1;
+			temp = a + b + carry;
+
+			if( temp > a ) // !(temp<=a)
+				carry = 0;
+		}
+
+		*result = temp;
+
+		return carry;
+	}
+
+
+	/* temporarily */
+	template<uint value_size>
+	uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
+	{
+		if( carry == 0 )
+		{
+			*result = a - b;
+
+			if( a < b )
+				carry = 1;
+		}
+		else
+		{
+			carry = 1;
+			*result = a - b - carry;
+
+			if( a > b ) // !(a <= b )
+				carry = 0;
+		}
+
+		return carry;
+	}
+
} //namespace } //namespace
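A standalone model of the carry rule that AddTwoWords/SubTwoWords implement (a sketch only; it collapses the two branches above into one expression and uses unsigned long long in place of ttmath's uint):

#include <cstdio>

typedef unsigned long long u64;

/* same contract as UInt<>::AddTwoWords above:
   *result = a + b + carry (mod 2^64), returns the carry out (0 or 1) */
u64 add_two_words(u64 a, u64 b, u64 carry, u64 * result)
{
	u64 temp = a + b + carry;

	/* with carry == 0 the sum wrapped iff temp < a;
	   with carry == 1 it wrapped iff temp <= a (b + 1 may itself wrap to 0) */
	u64 carry_out = (carry == 0) ? (temp < a) : (temp <= a);

	*result = temp;
	return carry_out;
}

int main()
{
	u64 r;
	u64 c = add_two_words(0xFFFFFFFFFFFFFFFFULL, 1, 0, &r);

	printf("r = %llu, carry = %llu\n", r, c); /* prints: r = 0, carry = 1 */

	return 0;
}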

View File

@@ -0,0 +1,430 @@
;
; This file is a part of TTMath Bignum Library
; and is distributed under the (new) BSD licence.
; Author: Christian Kaiser <>
;
;
; Copyright (c) 2009, Christian Kaiser
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; * Redistributions of source code must retain the above copyright notice,
; this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; * Neither the name Tomasz Sowa nor the names of contributors to this
; project may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
; THE POSSIBILITY OF SUCH DAMAGE.
;
;
; compile with debug info: ml64.exe /Zd /Zi ttmathuint_x86_64_msvc.asm
; compile without debug info: ml64.exe ttmathuint_x86_64_msvc.asm
; this creates the ttmathuint_x86_64_msvc.obj file, which can be linked with your program
;
PUBLIC adc_x64
PUBLIC addindexed_x64
PUBLIC addindexed2_x64
PUBLIC sbb_x64
PUBLIC subindexed_x64
PUBLIC rcl_x64
PUBLIC rcr_x64
PUBLIC rcl2_x64
PUBLIC rcr2_x64
PUBLIC div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
; "rbx, rbp, rdi, rsi, r12-r15 are nonvolatile."
;
.CODE
ALIGN 8
;----------------------------------------
adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
; r9 = nCarry
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
ret
adc_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
addindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
; r9 = nValue
xor rax, rax ; rax = result
sub rdx, r8 ; rdx = remaining count of uints
add qword ptr [rcx + r8 * 8], r9
jc next1
ret
next1:
mov r9, 1
ALIGN 16
loop1:
dec rdx
jz done_with_cy
lea r8, [r8+1]
add qword ptr [rcx + r8 * 8], r9
jc loop1
ret
done_with_cy:
lea rax, [rax+1] ; rax = 1
ret
addindexed_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
; r8 = nPos
; r9 = nValue1
; [rsp+0x28] = nValue2 (5th argument, passed on the stack)
xor rax, rax ; return value
mov r11, rcx ; table
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [rsp+028h] ; r10 = nValue2 (rsp, not esp: esp would truncate the stack address in 64-bit code)
add qword ptr [r11 + r8 * 8], r9
lea r8, [r8+1]
lea rdx, [rdx-1]
adc qword ptr [r11 + r8 * 8], r10
jc next
ret
ALIGN 16
loop1:
lea r8, [r8+1]
add qword ptr [r11 + r8 * 8], 1
jc next
ret
next:
dec rdx ; dec does not modify CY either
jnz loop1
lea rax, [rax+1]
ret
addindexed2_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
sbb_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nCount
; r9 = nCarry
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
ret
sbb_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
jc return_1 ; most of the times, there will be NO carry (I hope)
done:
xor rax, rax
ret
return_1:
mov rax, 1
ret
subindexed_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
lea r10, [r10+1]
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcl_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcr_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
div_x64 PROC
; rcx = &Hi
; rdx = &Lo
; r8 = nDiv
mov r11, rcx
mov r10, rdx
mov rdx, qword ptr [r11]
mov rax, qword ptr [r10]
div r8
mov qword ptr [r10], rdx ; remainder
mov qword ptr [r11], rax ; value
ret
div_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shr r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rax ; r9 = index (0..nSize-1)
ALIGN 16
loop1:
rol qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
and rax, r11
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9+1]
dec rdx
jnz loop1
and rax, 1
pop rbx
ret
rcl2_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shl r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rdx ; r9 = index (0..nSize-1)
lea r9, [r9-1]
ALIGN 16
loop1:
ror qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
and rax, r11
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9-1]
dec rdx
jnz loop1
rol rax, 1
and rax, 1
pop rbx
ret
rcr2_x64 ENDP
END
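For illustration, a sketch of calling one of these routines directly from C++ after linking ttmathuint_x86_64_msvc.obj; the declaration mirrors the extern "C" block added to ttmathuint_x86_64.h above, and the local uint typedef is an assumption for a standalone build:

#include <cstdio>

typedef unsigned __int64 uint;

/* implemented in ttmathuint_x86_64_msvc.obj */
extern "C" uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);

int main()
{
	/* p1 += p2 over a two-word (128-bit) value, carry-in = 0 */
	uint p1[2] = { 0xFFFFFFFFFFFFFFFFULL, 0 };
	uint p2[2] = { 1, 0 };

	uint carry = adc_x64(p1, p2, 2, 0);

	/* expected: p1[0] = 0, p1[1] = 1, carry = 0 */
	printf("p1[0] = %llu, p1[1] = %llu, carry = %llu\n", p1[0], p1[1], carry);

	return 0;
}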