merged: x86_64 asm code for the Microsoft Visual C++ compiler

file: ttmathuint_x86_64_msvc.asm from the chk branch (original was: ttmathuint_x86_amd64_msvc.asm)
        (this file has to be compiled separately because MS VC doesn't support inline assembly in x86_64 mode)



git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@187 e52654a7-88a9-db11-a3e9-0013d4bc506e
Tomasz Sowa 2009-09-07 02:03:00 +00:00
parent 0d71b0cec2
commit 28964d30f7
6 changed files with 780 additions and 109 deletions

View File

@ -1,4 +1,4 @@
Version 0.9.0 prerelease (2009.09.05):
Version 0.9.0 prerelease (2009.09.07):
* added: support for wide characters (wchar_t)
wide characters are used when the TTMATH_USE_WCHAR macro is defined
this macro is defined automatically when the UNICODE or _UNICODE macro is defined
@ -22,6 +22,9 @@ Version 0.9.0 prerelease (2009.09.05):
and use TTMATH_MULTITHREADS_HELPER macro somewhere in your *.cpp file
* added: Big::AboutEqual(const Big<exp,man> & ss2, int nBitsToIgnore = 4)
the last nBitsToIgnore bits of the mantissas are skipped when comparing (see the short usage sketch after this list)
* added: x86_64 asm code for the Microsoft Visual C++ compiler
file: ttmathuint_x86_64_msvc.asm
(this file has to be compiled separately because MS VC doesn't support inline assembly in x86_64 mode)
* changed: Factorial() now uses the Gamma() function
* removed: Parser<>::SetFactorialMax() method
Factorial() is so fast now that this method is no longer needed
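
A minimal usage sketch for the AboutEqual() entry above (an editorial aside, not part of the changelog; the include path, the template sizes and the 1/3 round-trip are only an illustrative choice):

#include <ttmath/ttmath.h>

bool about_equal_example()
{
    typedef ttmath::Big<1, 2> BigFloat;   // 1 word of exponent, 2 words of mantissa

    BigFloat one, three, x;
    one   = 1;
    three = 3;

    // 1/3 multiplied back by 3 differs from 1 at most in the last few mantissa bits
    // (rounding), so the default AboutEqual() (last 4 bits ignored) reports equality
    x = one;
    x.Div(three);
    x.Mul(three);

    return x.AboutEqual(one);
}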

View File

@ -3916,75 +3916,75 @@ public:
}
bool AboutEqual(const Big<exp,man> & ss2, int nBitsToIgnore = 4) const
{
// we should check the mantissas beforehand because sometimes we can have
// a mantissa set to zero but some other value left in the exponent
// (maybe we forgot to call CorrectZero()?)
if( mantissa.IsZero() )
{
if( ss2.mantissa.IsZero() )
return true;
return(ss2.AboutEqual(*this,nBitsToIgnore));
}
if( ss2.mantissa.IsZero() )
{
return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
}
// exponents may not differ much!
ttmath::Int<exp> expdiff(this->exponent - ss2.exponent);
// they may differ by one if, for example, mantissa1=0x80000000 and mantissa2=0xffffffff
if( ttmath::Abs(expdiff) > 1 )
return(false);
// calculate the 'difference' mantissa
ttmath::UInt<man> man1(this->mantissa);
ttmath::UInt<man> man2(ss2.mantissa);
ttmath::UInt<man> mandiff;
switch( expdiff.ToInt() )
{
case +1:
man2.Rcr(1,0);
mandiff = man1;
mandiff.Sub(man2);
break;
case -1:
man1.Rcr(1,0);
mandiff = man2;
mandiff.Sub(man1);
break;
default:
if( man2 > man1 )
{
mandiff = man2;
mandiff.Sub(man1);
}
else
{
mandiff = man1;
mandiff.Sub(man2);
}
break;
}
// faster to mask the bits!
TTMATH_ASSERT( nBitsToIgnore < TTMATH_BITS_PER_UINT );
for( int n = man-1; n > 0; --n )
{
if( mandiff.table[n] != 0 )
return(false);
}
uint nMask = ~((uint(1) << nBitsToIgnore) - 1);
return((mandiff.table[0] & nMask) == 0);
}
bool operator<(const Big<exp,man> & ss2) const

View File

@ -162,8 +162,14 @@ namespace ttmath
/*!
on 64bit platforms one word (uint, sint) is 64 bits wide
*/
typedef unsigned long uint;
typedef signed long sint;
#ifdef _MSC_VER
/* in VC the 'long' type is 32 bits wide; __int64 is a VC extension */
typedef unsigned __int64 uint;
typedef signed __int64 sint;
#else
typedef unsigned long uint;
typedef signed long sint;
#endif
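A tiny compile-time check (an editorial sketch, not part of the patch) of the assumption stated in the comment above, that one word is 64 bits wide here:

// fails to compile if ttmath::uint is not 64 bits wide on this platform
typedef char ttmath_uint_must_be_64_bits[ sizeof(ttmath::uint) == 8 ? 1 : -1 ];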
/*!
on 64bit platform we do not define ulint

View File

@ -3297,6 +3297,17 @@ public:
static uint SetBitInWord(uint & value, uint bit);
static void MulTwoWords(uint a, uint b, uint * result_high, uint * result_low);
static void DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest);
/* temporarily */
#ifndef TTMATH_NOASM
#ifdef TTMATH_PLATFORM64
#ifdef _MSC_VER
static uint AddTwoWords(uint a, uint b, uint carry, uint * result);
static uint SubTwoWords(uint a, uint b, uint carry, uint * result);
#endif
#endif
#endif
};

View File

@ -51,10 +51,33 @@
this file is included at the end of ttmathuint.h
*/
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace ttmath
{
#ifdef _MSC_VER
extern "C"
{
uint __fastcall adc_x64(uint* p1, const uint* p2, uint nSize, uint c);
uint __fastcall addindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
uint __fastcall addindexed2_x64(uint* p1, uint nSize, uint nPos, uint nValue1, uint nValue2);
uint __fastcall sbb_x64(uint* p1, const uint* p2, uint nSize, uint c);
uint __fastcall subindexed_x64(uint* p1, uint nSize, uint nPos, uint nValue);
uint __fastcall rcl_x64(uint* p1, uint nSize, uint nLowestBit);
uint __fastcall rcr_x64(uint* p1, uint nSize, uint nLowestBit);
uint __fastcall div_x64(uint* pnValHi, uint* pnValLo, uint nDiv);
uint __fastcall rcl2_x64(uint* p1, uint nSize, uint nBits, uint c);
uint __fastcall rcr2_x64(uint* p1, uint nSize, uint nBits, uint c);
};
#endif
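An editorial sketch (not part of the patch) of how these externs behave; adc_x64(p1, p2, nSize, c) computes p1 += p2 + c word by word and returns the outgoing carry (the values below are made up; in normal use only the UInt<> methods call these routines):

#ifdef _MSC_VER
ttmath::uint adc_x64_sketch()
{
    ttmath::uint p1[3] = { ttmath::uint(0) - 1, 0, 5 };   // first word: the maximum 64-bit value
    ttmath::uint p2[3] = { 1, 0, 0 };

    // afterwards p1 == { 0, 1, 5 }: the overflow of the first word is carried into the second;
    // the return value is the carry out of the last word, 0 in this case
    return ttmath::adc_x64(p1, p2, 3, 0);
}
#endif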
/*!
*
* basic mathematical functions
@ -82,8 +105,12 @@ namespace ttmath
// we don't have to use TTMATH_REFERENCE_ASSERT here
// this algorithm doesn't require it
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = adc_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
@ -149,10 +176,16 @@ namespace ttmath
TTMATH_ASSERT( index < value_size )
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = addindexed_x64(p1,b,index,value);
#endif
#ifdef __GNUC__
uint dummy, dummy2;
@ -227,10 +260,16 @@ namespace ttmath
TTMATH_ASSERT( index < value_size - 1 )
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = addindexed2_x64(p1,b,index,x1,x2);
#endif
#ifdef __GNUC__
uint dummy, dummy2;
@ -288,6 +327,9 @@ namespace ttmath
of course the carry is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
@ -296,10 +338,16 @@ namespace ttmath
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
#endif
#ifdef __GNUC__
uint dummy1, dummy2, dummy3;
@ -348,8 +396,27 @@ namespace ttmath
return c;
}
#else
/* temporarily */
template<uint value_size>
uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
for( ; i<ss1_size ; ++i)
c = AddTwoWords(ss1[i], 0, c, &result[i]);
TTMATH_LOG("UInt::AddVector")
return c;
}
#endif
/*!
@ -373,10 +440,16 @@ namespace ttmath
// we don't have to use TTMATH_REFERENCE_ASSERT here
// this algorithm doesn't require it
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = sbb_x64(p1,p2,b,c);
#endif
#ifdef __GNUC__
uint dummy, dummy2;
@ -432,15 +505,22 @@ namespace ttmath
uint b = value_size;
uint * p1 = table;
uint c;
uint dummy, dummy2;
TTMATH_ASSERT( index < value_size )
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = subindexed_x64(p1,b,index,value);
#endif
#ifdef __GNUC__
uint dummy, dummy2;
__asm__ __volatile__(
"subq %%rdx, %%rcx \n"
@ -493,6 +573,9 @@ namespace ttmath
of course the carry (borrow) is propagated and will be returned from the last item
(this method is used by the Karatsuba multiplication algorithm)
*/
#ifndef _MSC_VER
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
@ -501,16 +584,22 @@ namespace ttmath
uint rest = ss1_size - ss2_size;
uint c;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
#endif
#ifdef __GNUC__
/*
the asm code is nearly the same as in AddVector
only two instructions 'adc' are changed to 'sbb'
*/
// the asm code is nearly the same as in AddVector;
// only the two 'adc' instructions are changed to 'sbb'
uint dummy1, dummy2, dummy3;
__asm__ __volatile__(
@ -556,6 +645,27 @@ namespace ttmath
return c;
}
#else
/* temporarily */
template<uint value_size>
uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
{
uint i, c = 0;
TTMATH_ASSERT( ss1_size >= ss2_size )
for(i=0 ; i<ss2_size ; ++i)
c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
for( ; i<ss1_size ; ++i)
c = SubTwoWords(ss1[i], 0, c, &result[i]);
TTMATH_LOG("UInt::SubVector")
return c;
}
#endif
/*!
@ -579,10 +689,16 @@ namespace ttmath
uint * p1 = table;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = rcl_x64(p1,b,c);
#endif
#ifdef __GNUC__
uint dummy, dummy2;
@ -633,10 +749,16 @@ namespace ttmath
uint * p1 = table;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = rcr_x64(p1,b,c);
#endif
#ifdef __GNUC__
uint dummy;
@ -688,10 +810,16 @@ namespace ttmath
uint * p1 = table;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = rcl2_x64(p1,b,bits,c);
#endif
#ifdef __GNUC__
uint dummy, dummy2, dummy3;
@ -758,14 +886,20 @@ namespace ttmath
sint b = value_size;
uint * p1 = table;
uint dummy, dummy2, dummy3;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
c = rcr2_x64(p1,b,bits,c);
#endif
#ifdef __GNUC__
uint dummy, dummy2, dummy3;
__asm__ __volatile__(
"movq %%rcx, %%rsi \n"
@ -823,10 +957,23 @@ namespace ttmath
sint result;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
unsigned long nIndex = 0;
if( _BitScanReverse64(&nIndex,x) == 0 )
result = -1;
else
result = nIndex;
#endif
#ifdef __GNUC__
uint dummy;
@ -868,11 +1015,16 @@ namespace ttmath
uint old_bit;
uint v = value;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
old_bit = _bittestandset64((__int64*)&value,bit) != 0;
#endif
#ifdef __GNUC__
__asm__ (
@ -924,10 +1076,16 @@ namespace ttmath
uint result1_;
uint result2_;
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
result1_ = _umul128(a,b,&result2_);
#endif
#ifdef __GNUC__
__asm__ (
@ -981,10 +1139,20 @@ namespace ttmath
TTMATH_ASSERT( c != 0 )
#ifndef __GNUC__
#error "another compiler than GCC is currently not supported in 64bit mode"
#if !defined(__GNUC__) && !defined(_MSC_VER)
#error "another compiler than GCC or Microsoft VC is currently not supported in 64bit mode, you can compile with TTMATH_NOASM macro"
#endif
#ifdef _MSC_VER
div_x64(&a,&b,c);
r_ = a;
rest_ = b;
#endif
#ifdef __GNUC__
__asm__ (
@ -1003,6 +1171,59 @@ namespace ttmath
}
/* temporarily */
template<uint value_size>
uint UInt<value_size>::AddTwoWords(uint a, uint b, uint carry, uint * result)
{
uint temp;
if( carry == 0 )
{
temp = a + b;
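// unsigned addition wraps around modulo 2^64 here, so after an overflow the sum is smaller than either operand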
if( temp < a )
carry = 1;
}
else
{
carry = 1;
temp = a + b + carry;
if( temp > a ) // !(temp<=a)
carry = 0;
}
*result = temp;
return carry;
}
/* temporarily */
template<uint value_size>
uint UInt<value_size>::SubTwoWords(uint a, uint b, uint carry, uint * result)
{
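// 'carry' acts as a borrow: it is returned set when the subtraction wraps below zero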
if( carry == 0 )
{
*result = a - b;
if( a < b )
carry = 1;
}
else
{
carry = 1;
*result = a - b - carry;
if( a > b ) // !(a <= b )
carry = 0;
}
return carry;
}
} //namespace

View File

@ -0,0 +1,430 @@
;
; This file is a part of TTMath Bignum Library
; and is distributed under the (new) BSD licence.
; Author: Christian Kaiser <>
;
;
; Copyright (c) 2009, Christian Kaiser
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; * Redistributions of source code must retain the above copyright notice,
; this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; * Neither the name Tomasz Sowa nor the names of contributors to this
; project may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
; THE POSSIBILITY OF SUCH DAMAGE.
;
;
; compile with debug info: ml64.exe /c /Zd /Zi ttmathuint_x86_64_msvc.asm
; compile without debug info: ml64.exe /c ttmathuint_x86_64_msvc.asm
; this creates the ttmathuint_x86_64_msvc.obj file which can then be linked with your program
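; for example: cl your_program.cpp ttmathuint_x86_64_msvc.obj   (add any other compiler options you need)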
;
PUBLIC adc_x64
PUBLIC addindexed_x64
PUBLIC addindexed2_x64
PUBLIC sbb_x64
PUBLIC subindexed_x64
PUBLIC rcl_x64
PUBLIC rcr_x64
PUBLIC rcl2_x64
PUBLIC rcr2_x64
PUBLIC div_x64
;
; "rax, rcx, rdx, r8-r11 are volatile."
; "rbx, rbp, rdi, rsi, r12-r15 are nonvolatile."
;
.CODE
ALIGN 8
;----------------------------------------
adc_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nSize
; r9 = nCarry
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
adc qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
ret
adc_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
addindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
; r9 = nValue
xor rax, rax ; rax = result
sub rdx, r8 ; rdx = remaining count of uints
add qword ptr [rcx + r8 * 8], r9
jc next1
ret
next1:
mov r9, 1
ALIGN 16
loop1:
dec rdx
jz done_with_cy
lea r8, [r8+1]
add qword ptr [rcx + r8 * 8], r9
jc loop1
ret
done_with_cy:
lea rax, [rax+1] ; rax = 1
ret
addindexed_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
addindexed2_x64 PROC
; rcx = p1 (pointer)
; rdx = b (value size)
; r8 = nPos
; r9 = nValue1
; [rsp+0x28] = nValue2
xor rax, rax ; return value
mov r11, rcx ; table
sub rdx, r8 ; rdx = remaining count of uints
mov r10, [rsp+028h] ; r10 = nValue2
add qword ptr [r11 + r8 * 8], r9
lea r8, [r8+1]
lea rdx, [rdx-1]
adc qword ptr [r11 + r8 * 8], r10
jc next
ret
ALIGN 16
loop1:
lea r8, [r8+1]
add qword ptr [r11 + r8 * 8], 1
jc next
ret
next:
dec rdx ; dec does not modify the carry flag either
jnz loop1
lea rax, [rax+1]
ret
addindexed2_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
sbb_x64 PROC
; rcx = p1
; rdx = p2
; r8 = nCount
; r9 = nCarry
xor rax, rax
xor r11, r11
sub rax, r9 ; sets CARRY if r9 != 0
ALIGN 16
loop1:
mov rax,qword ptr [rdx + r11 * 8]
sbb qword ptr [rcx + r11 * 8], rax
lea r11, [r11+1]
dec r8
jnz loop1
setc al
movzx rax, al
ret
sbb_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
subindexed_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nPos
; r9 = nValue
sub rdx, r8 ; rdx = remaining count of uints
ALIGN 16
loop1:
sub qword ptr [rcx + r8 * 8], r9
jnc done
lea r8, [r8+1]
mov r9, 1
dec rdx
jnz loop1
jc return_1 ; most of the time there will be NO carry (I hope)
done:
xor rax, rax
ret
return_1:
mov rax, 1
ret
subindexed_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcl_x64 PROC
; rcx = p1
; rdx = b
; r8 = nLowestBit
mov r11, rcx ; table
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcl qword ptr [r11 + r10 * 8], 1
lea r10, [r10+1]
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcl_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcr_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = nLowestBit
xor r10, r10
neg r8 ; CY set if r8 <> 0
ALIGN 16
loop1:
rcr qword ptr -8[rcx + rdx * 8], 1
dec rdx
jnz loop1
setc al
movzx rax, al
ret
rcr_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
div_x64 PROC
; rcx = &Hi
; rdx = &Lo
; r8 = nDiv
mov r11, rcx
mov r10, rdx
mov rdx, qword ptr [r11]
mov rax, qword ptr [r10]
div r8
mov qword ptr [r10], rdx ; remainder
mov qword ptr [r11], rax ; value
ret
div_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcl2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shr r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rax ; r9 = index (0..nSize-1)
ALIGN 16
loop1:
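; each word is rotated left by cl bits; the low cl bits that wrapped around
; are replaced with the bits carried out of the previous (less significant) word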
rol qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
and rax, r11
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9+1]
dec rdx
jnz loop1
and rax, 1
pop rbx
ret
rcl2_x64 ENDP
;----------------------------------------
ALIGN 8
;----------------------------------------
rcr2_x64 PROC
; rcx = p1
; rdx = nSize
; r8 = bits
; r9 = c
push rbx
mov r10, rcx ; r10 = p1
xor rax, rax
mov rcx, 64
sub rcx, r8
mov r11, -1
shl r11, cl ; r11 = mask
mov rcx, r8 ; rcx = count of bits
mov rbx, rax ; rbx = old value = 0
or r9, r9
cmovnz rbx, r11 ; if (c) then old value = mask
mov r9, rdx ; r9 = index (0..nSize-1)
lea r9, [r9-1]
ALIGN 16
loop1:
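; each word is rotated right by cl bits; the high cl bits that wrapped around
; are replaced with the bits carried out of the previous (more significant) word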
ror qword ptr [r10+r9*8], cl
mov rax, qword ptr [r10+r9*8]
and rax, r11
xor qword ptr [r10+r9*8], rax
or qword ptr [r10+r9*8], rbx
mov rbx, rax
lea r9, [r9-1]
dec rdx
jnz loop1
rol rax, 1
and rax, 1
pop rbx
ret
rcr2_x64 ENDP
END