From e102086f80314e095ab44ec85303060bedc46de9 Mon Sep 17 00:00:00 2001
From: Christian Kaiser
Date: Tue, 28 Jul 2009 16:34:04 +0000
Subject: [PATCH] - fixed a bug in 64 bit ASM for MSVC

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@181 e52654a7-88a9-db11-a3e9-0013d4bc506e
---
 ttmath/ttmathbig.h                   |  95 ++++++++++++--------
 ttmath/ttmathuint_noasm.h            |  77 ++++++++++++++++
 ttmath/ttmathuint_x86.h              |   2 +-
 ttmath/ttmathuint_x86_64.h           | 130 +++++++++++++++++----------
 ttmath/ttmathuint_x86_amd64_msvc.asm | 111 +++++++++++------------
 5 files changed, 271 insertions(+), 144 deletions(-)

diff --git a/ttmath/ttmathbig.h b/ttmath/ttmathbig.h
index 7f3574c..4fbb8a0 100644
--- a/ttmath/ttmathbig.h
+++ b/ttmath/ttmathbig.h
@@ -3869,47 +3869,66 @@ public:
 	// we should check the mantissas beforehand because sometimes we can have
 	// a mantissa set to zero but something other than zero in the exponent
 	// (maybe we've forgotten to call CorrectZero()?)
-	if( mantissa.IsZero() && ss2.mantissa.IsZero())
-	{
-		return true;
-	}
-
-	if( IsSign() != ss2.IsSign() )
-	{
-		return false;
-	}
+	if( mantissa.IsZero())
+	{
+		if (ss2.mantissa.IsZero())
+			return true;
+		return(ss2.AboutEqual(*this,nBitsToIgnore));
+	}
 
-	if( exponent==ss2.exponent )
-	{
-		if (mantissa == ss2.mantissa)
-		{
-			return(true);
-		}
-		if( IsSign() != ss2.IsSign() )
-		{
-			// we need to check the difference (both might be around Zero)
-			Big temp(*this);
-
-			temp.Sub(ss2);
+	if (ss2.mantissa.IsZero())
+	{
+		return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
+	}
+
+	// the exponents may not differ much!
+	ttmath::Int<exp> expdiff(this->exponent - ss2.exponent);
+
+	// they may differ by one if, for example, mantissa1=0x80000000 and mantissa2=0xffffffff
+	if (ttmath::Abs(expdiff) > 1)
+		return(false);
 
-			Int<exp> exponent_diff(exponent - temp.exponent);
-
-			return(exponent_diff > man*TTMATH_BITS_PER_UINT-nBitsToIgnore);
-		}
-
-		// faster to mask the bits!
-		ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
+	// calculate the 'difference' mantissa
+	ttmath::UInt<man> man1(this->mantissa);
+	ttmath::UInt<man> man2(ss2.mantissa);
+	ttmath::UInt<man> mandiff;
+
+	switch (expdiff.ToInt())
+	{
+	case +1:
+		man2.Rcr(1,0);
+		mandiff = man1;
+		mandiff.Sub(man2);
+		break;
+	case -1:
+		man1.Rcr(1,0);
+		mandiff = man2;
+		mandiff.Sub(man1);
+		break;
+	case 0:
+		if (man2 > man1)
+		{
+			mandiff = man2;
+			mandiff.Sub(man1);
+		}
+		else
+		{
+			mandiff = man1;
+			mandiff.Sub(man2);
+		}
+		break;
+	}
+
+	// faster to mask the bits!
+	ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
 
-		for (int n = man-1; n > 0; --n)
-		{
-			if (mantissa.table[n] != ss2.mantissa.table[n])
-				return(false);
-		}
-		uint nMask = ~((1 << nBitsToIgnore) - 1);
-		return((mantissa.table[0] & nMask) == (ss2.mantissa.table[0] & nMask));
-	}
-
-	return false;
+	for (int n = man-1; n > 0; --n)
+	{
+		if (mandiff.table[n] != 0)
+			return(false);
+	}
+	uint nMask = ~((1 << nBitsToIgnore) - 1);
+	return((mandiff.table[0] & nMask) == 0);
 }
 
 bool operator<(const Big & ss2) const
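The masking idea behind the rewritten AboutEqual() above is easiest to see on a single word. Below is a standalone sketch, not part of the patch (the helper name about_equal_word and the 32-bit width are illustrative assumptions): it forms the 'difference' mantissa and then treats the lowest nBitsToIgnore bits as rounding noise, mirroring the mandiff/nMask logic above.

#include <cstdint>
#include <cassert>

// sketch of AboutEqual() reduced to one 32-bit mantissa word;
// requires nBitsToIgnore < 32, as the patch asserts against TTMATH_BITS_PER_UINT
static bool about_equal_word(uint32_t man1, uint32_t man2, unsigned nBitsToIgnore)
{
    // the 'difference' mantissa (the patch builds mandiff the same way,
    // after aligning the exponents with Rcr)
    uint32_t mandiff = (man1 > man2) ? man1 - man2 : man2 - man1;

    // faster to mask the bits!
    uint32_t nMask = ~((uint32_t(1) << nBitsToIgnore) - 1u);

    return (mandiff & nMask) == 0;
}

int main()
{
    assert(  about_equal_word(0x80000007u, 0x80000001u, 4) );  // only the low 4 bits differ
    assert( !about_equal_word(0x80000100u, 0x80000001u, 4) );  // differs above the mask
    return 0;
}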
diff --git a/ttmath/ttmathuint_noasm.h b/ttmath/ttmathuint_noasm.h
index 951f40d..c09f5e8 100644
--- a/ttmath/ttmathuint_noasm.h
+++ b/ttmath/ttmathuint_noasm.h
@@ -871,6 +871,83 @@ namespace ttmath
 		u3 = sub_res_low_.u_.low;
 	}
 
+	/*!
+	this static method adds one vector to the other
+	'ss1' is larger in size or equal to 'ss2'
+
+	ss1 points to the first (larger) vector
+	ss2 points to the second vector
+	ss1_size - size of the ss1 (and of the result too)
+	ss2_size - size of the ss2
+	result - the result vector (which has the same size as ss1: ss1_size)
+
+	Example:  ss1_size is 5, ss2_size is 3
+	ss1:      ss2:   result (output):
+	  5        1        5+1
+	  4        3        4+3
+	  2        7        2+7
+	  6                 6
+	  9                 9
+	of course the carry is propagated and will be returned from the last item
+	(this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = AddTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::AddVector")
+
+		return c;
+	}
+
+
+	/*!
+	this static method subtracts one vector from the other
+	'ss1' is larger in size or equal to 'ss2'
+
+	ss1 points to the first (larger) vector
+	ss2 points to the second vector
+	ss1_size - size of the ss1 (and of the result too)
+	ss2_size - size of the ss2
+	result - the result vector (which has the same size as ss1: ss1_size)
+
+	Example:  ss1_size is 5, ss2_size is 3
+	ss1:      ss2:   result (output):
+	  5        1        5-1
+	  4        3        4-3
+	  2        7        2-7
+	  6                 6-1  (the borrow from the previous item)
+	  9                 9
+	of course the borrow is propagated and will be returned from the last item
+	(this method is used by the Karatsuba multiplication algorithm)
+	*/
+	template<uint value_size>
+	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
+	{
+		uint i, c = 0;
+
+		TTMATH_ASSERT( ss1_size >= ss2_size )
+
+		for(i=0 ; i<ss2_size ; ++i)
+			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
+
+		for( ; i<ss1_size ; ++i)
+			c = SubTwoWords(ss1[i], 0, c, &result[i]);
+
+		TTMATH_LOG("UInt::SubVector")
+
+		return c;
+	}
+
diff --git a/ttmath/ttmathuint_x86.h b/ttmath/ttmathuint_x86.h
diff --git a/ttmath/ttmathuint_x86_64.h b/ttmath/ttmathuint_x86_64.h
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@@ -1,30 +1,30 @@
 /*
  * This file is a part of TTMath Bignum Library
  * and is distributed under the (new) BSD licence.
  * Author: Tomasz Sowa <t.sowa@ttmath.org>
  */
 
-/* 
+/*
  * Copyright (c) 2006-2009, Tomasz Sowa
  * All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
+ *
  *  * Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
- * 
+ *
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 
+ *
  *  * Neither the name Tomasz Sowa nor the names of contributors to this
  *    project may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
@@ -39,10 +39,10 @@
 #ifndef headerfilettmathuint_x86_64
 #define headerfilettmathuint_x86_64
 
-
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM64
 
+#pragma message("TTMATH_ASM64")
 
 /*!
 	\file ttmathuint_x86_64.h
 	\brief template class UInt<uint> with assembler code for 64bit x86_64 processors
 
 	this file is included at the end of ttmathuint.h
 */
 
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
 
 namespace ttmath
 {
@@ -113,14 +116,14 @@ namespace ttmath
 			this part should be compiled with gcc
 		*/
 		__asm__ __volatile__(
-		
+
 			"xorq %%rdx, %%rdx \n"
 			"neg %%rax \n"     // CF=1 if rax!=0 , CF=0 if rax==0
 
 		"1: \n"
 			"movq (%%rsi,%%rdx,8), %%rax \n"
 			"adcq %%rax, (%%rbx,%%rdx,8) \n"
-		
+
 			"incq %%rdx \n"
 			"decq %%rcx \n"
 			"jnz 1b \n"
@@ -134,7 +137,7 @@
 	#endif
 
 		TTMATH_LOG("UInt64::Add")
-		
+
 		return c;
 	}
@@ -150,7 +153,7 @@
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			AddInt(2,1)
 		then it'll be:
@@ -187,7 +190,7 @@
 		"1: \n"
 			"addq %%rax, (%%rbx,%%rdx,8) \n"
 			"jnc 2f \n"
-		
+
 			"movq $1, %%rax \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -204,7 +207,7 @@
 	#endif
 
 		TTMATH_LOG("UInt64::AddInt")
-		
+
 		return c;
 	}
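The inline adc loop in UInt<value_size>::Add above (like the adc_x64 routine in the MSVC .asm file further down) propagates a single carry flag across the whole table, one 64-bit limb per iteration. A portable sketch of the same loop, for illustration only: the name add_limbs is hypothetical, and unsigned __int128 (a GCC/Clang extension) stands in for the CPU's carry flag.

#include <cstdint>
#include <cstddef>

// adds p2 into p1 limb by limb, returning the final carry
static uint64_t add_limbs(uint64_t * p1, const uint64_t * p2, size_t n, uint64_t carry)
{
    for (size_t i = 0; i < n; ++i)
    {
        // double-width sum: low half is the limb, high half is the carry
        unsigned __int128 sum = (unsigned __int128)p1[i] + p2[i] + carry;
        p1[i]  = (uint64_t)sum;           // low word goes back into the table
        carry  = (uint64_t)(sum >> 64);   // high word becomes the next carry (the CF flag)
    }
    return carry;                         // what the asm routines report at the end
}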
@@ -236,14 +239,38 @@
 			table[1] = 4 + x1 = 14
 			table[2] = 5 + x2 = 25
 			table[3] = 6
-	
+
 		and no carry at the end of table[3]
 
-		(of course if there was a carry in table[2](5+20) then 
+		(of course if there was a carry in table[2](5+20) then
 		this carry would be passed to the table[3] etc.)
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
+	#if 0
+	{
+	uint i, c;
+
+	TTMATH_ASSERT( index < value_size )
+
+	printf("add %Id + %Id\n",x1,x2);
+	for(int i=index ; i<value_size ; i++)
+		printf("%Id ",table[i]);
+	printf("\n");
+
+	c = AddTwoWords(table[index], x1, 0, &table[index]);
+	c = AddTwoWords(table[index+1], x2, c, &table[index+1]);
+
+	for(i=index+2 ; i<value_size && c ; ++i)
+		c = AddTwoWords(table[i], 0, c, &table[i]);
+
+	printf("-> %d\n",c);
+
+	TTMATH_LOG("UInt::AddTwoInts")
+
+	return c;
+	}
+	#else
 	{
 		uint b = value_size;
 		uint * p1 = table;
 		uint c;
@@ -253,7 +280,14 @@
 	#ifndef __GNUC__
 		#if defined(_M_X64)
-			c = addindexed2_x64(p1,b,index,x2,x1);
+			//printf("add %Id + %Id\n",x1,x2);
+			//for(int i=index ; i<value_size ; i++)
+			//	printf("%Id ",table[i]);
+			//printf("\n");
+
+			c = addindexed2_x64(p1,b,index,x2,x1);
+
+			//printf("-> %d\n",c);
 		#else
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif
@@ -261,11 +295,11 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"subq %%rdx, %%rcx \n"
-	
+
 			"addq %%rsi, (%%rbx,%%rdx,8) \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -289,10 +323,12 @@
 	#endif
 
+
 		TTMATH_LOG("UInt64::AddTwoInts")
 
 		return c;
 	}
+	#endif
@@ -328,16 +364,16 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"xorq %%rdx, %%rdx \n"
 			"neg %%rax \n"     // CF=1 if rax!=0 , CF=0 if rax==0
 
 		"1: \n"
 			"movq (%%rsi,%%rdx,8), %%rax \n"
 			"sbbq %%rax, (%%rbx,%%rdx,8) \n"
-	
+
 			"incq %%rdx \n"
 			"decq %%rcx \n"
 			"jnz 1b \n"
@@ -366,7 +402,7 @@
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			SubInt(2,1)
 		then it'll be:
@@ -395,15 +431,15 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"subq %%rdx, %%rcx \n"
 
 		"1: \n"
 			"subq %%rax, (%%rbx,%%rdx,8) \n"
 			"jnc 2f \n"
-	
+
 			"movq $1, %%rax \n"
 			"incq %%rdx \n"
 			"decq %%rcx \n"
@@ -436,7 +472,7 @@
 		for example: let this be 001010000
 		after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@@ -455,9 +491,9 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2;
-	
+
 		__asm__ __volatile__(
-	
+
 			"xorq %%rdx, %%rdx \n"   // rdx=0
 			"neg %%rax \n"           // CF=1 if rax!=0 , CF=0 if rax==0
@@ -473,7 +509,7 @@
 			: "=c" (c), "=a" (dummy), "=d" (dummy2)
 			: "1" (c), "0" (b), "b" (p1)
 			: "cc", "memory" );
-	
+
 	#endif
 
 		TTMATH_LOG("UInt64::Rcl2_one")
@@ -512,7 +548,7 @@
 	#ifdef __GNUC__
 		uint dummy;
-	
+
 		__asm__ __volatile__(
 
 			"neg %%rax \n"   // CF=1 if rax!=0 , CF=0 if rax==0
@@ -549,7 +585,7 @@
 		for example: let this be 001010000
 		after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@@ -570,9 +606,9 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2, dummy3;
-	
+
 		__asm__ __volatile__(
-	
+
 			"movq %%rcx, %%rsi \n"
 			"movq $64, %%rcx \n"
 			"subq %%rsi, %%rcx \n"
@@ -595,11 +631,11 @@
 			"xorq %%rax, (%%rbx,%%rdx,8) \n"
 			"orq %%rsi, (%%rbx,%%rdx,8) \n"
 			"movq %%rax, %%rsi \n"
-	
+
 			"incq %%rdx \n"
 			"decq %%rdi \n"
 			"jnz 1b \n"
-	
+
 			"and $1, %%rax \n"
 
 			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
@@ -647,7 +683,7 @@
 	#ifdef __GNUC__
 		uint dummy, dummy2, dummy3;
-	
+
 		__asm__ __volatile__(
 
 			"movq %%rcx, %%rsi \n"
@@ -674,11 +710,11 @@
 			"xorq %%rax, (%%rbx,%%rdx,8) \n"
 			"orq %%rsi, (%%rbx,%%rdx,8) \n"
 			"movq %%rax, %%rsi \n"
-	
+
 			"decq %%rdx \n"
 			"decq %%rdi \n"
 			"jnz 1b \n"
-	
+
 			"rolq $1, %%rax \n"
 			"andq $1, %%rax \n"
@@ -754,7 +790,7 @@ uint UInt<value_size>::SetBitInWord(uint & value, uint bit)
 	{
 		TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT )
-	
+
 		uint old_bit;
 		uint v = value;
@@ -778,7 +814,7 @@
 			"setc %%bl \n"
 			"movzx %%bl, %%rbx \n"
-	
+
 			: "=a" (v), "=b" (old_bit)
 			: "0" (v), "1" (bit)
 			: "cc" );
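SetBitInWord() sets one bit in a word and returns that bit's previous state; the visible tail of the hunk shows the carry flag being captured with setc/movzx (presumably after a bts instruction in the elided lines). A portable equivalent, for illustration only (set_bit_in_word is a hypothetical name):

#include <cstdint>

typedef uint64_t word;   // stand-in for ttmath::uint on a 64-bit build

// sets bit 'bit' (0..63) in 'value', returns the bit's previous state
static word set_bit_in_word(word & value, word bit)
{
    word old_bit = (value >> bit) & 1;   // what setc captures from the carry flag
    value |= word(1) << bit;             // what bts performs on the word
    return old_bit;
}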
"cc" ); @@ -803,7 +839,7 @@ namespace ttmath multiplication: result2:result1 = a * b result2 - higher word result1 - lower word of the result - + this methos never returns a carry ***this method is created only on a 64bit platform*** @@ -834,7 +870,7 @@ namespace ttmath #ifdef __GNUC__ __asm__ __volatile__( - + "mulq %%rdx \n" : "=a" (result1_), "=d" (result2_) @@ -857,13 +893,13 @@ namespace ttmath * * */ - + #ifndef __GNUC__ /*! this method calculates 64bits word a:b / 32bits c (a higher, b lower word) r = a:b / c and rest - remainder - + ***this method is created only on a 64bit platform*** * @@ -896,7 +932,7 @@ namespace ttmath #endif #ifdef __GNUC__ - + __asm__ __volatile__( "divq %%rcx \n" @@ -986,7 +1022,7 @@ namespace ttmath uint i, c = 0; TTMATH_ASSERT( ss1_size >= ss2_size ) - + for(i=0 ; i= ss2_size ) - + for(i=0 ; i cy still set! - dec rdx - jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) -done: - ret - -return_1: +next: + dec rdx ; does not modify CY too... + jnz loop1 lea rax, [rax+1] ret @@ -138,8 +135,6 @@ addindexed2_x64 ENDP ALIGN 8 - ALIGN 8 - ;---------------------------------------- sbb_x64 PROC @@ -152,15 +147,15 @@ sbb_x64 PROC xor rax, rax xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 - + ALIGN 16 - loop1: + loop1: mov rax,qword ptr [rdx + r11 * 8] sbb qword ptr [rcx + r11 * 8], rax lea r11, [r11+1] dec r8 jnz loop1 - + setc al movzx rax, al @@ -181,12 +176,12 @@ subindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints - + ALIGN 16 loop1: sub qword ptr [rcx + r8 * 8], r9 jnc done - + lea r8, [r8+1] mov r9, 1 dec rdx @@ -196,7 +191,7 @@ loop1: done: xor rax, rax ret - + return_1: mov rax, 1 ret @@ -217,17 +212,17 @@ rcl_x64 PROC mov r11, rcx ; table xor r10, r10 neg r8 ; CY set if r8 <> 0 - + ALIGN 16 loop1: rcl qword ptr [r11 + r10 * 8], 1 lea r10, [r10+1] dec rdx jnz loop1 - + setc al movzx rax, al - + ret rcl_x64 ENDP @@ -245,16 +240,16 @@ rcr_x64 PROC xor r10, r10 neg r8 ; CY set if r8 <> 0 - + ALIGN 16 loop1: rcr qword ptr -8[rcx + rdx * 8], 1 dec rdx jnz loop1 - + setc al movzx rax, al - + ret rcr_x64 ENDP @@ -270,7 +265,7 @@ div_x64 PROC ; rcx = &Hi ; rdx = &Lo ; r8 = nDiv - + mov r11, rcx mov r10, rdx @@ -295,21 +290,21 @@ rcl2_x64 PROC ; rdx = nSize ; r8 = bits ; r9 = c - + push rbx - + mov r10, rcx ; r10 = p1 - xor rax, rax - + xor rax, rax + mov rcx, 64 sub rcx, r8 - + mov r11, -1 shr r11, cl ; r11 = mask mov rcx, r8 ; rcx = count of bits - mov rbx, rax ; rbx = old value = 0 + mov rbx, rax ; rbx = old value = 0 or r9, r9 cmovnz rbx, r11 ; if (c) then old value = mask @@ -323,7 +318,7 @@ loop1: xor qword ptr [r10+r9*8], rax or qword ptr [r10+r9*8], rbx mov rbx, rax - + lea r9, [r9+1] dec rdx @@ -332,8 +327,8 @@ loop1: and rax, 1 pop rbx ret - -rcl2_x64 ENDP + +rcl2_x64 ENDP ;---------------------------------------- @@ -346,20 +341,20 @@ rcr2_x64 PROC ; rdx = nSize ; r8 = bits ; r9 = c - + push rbx mov r10, rcx ; r10 = p1 - xor rax, rax - + xor rax, rax + mov rcx, 64 sub rcx, r8 - + mov r11, -1 shl r11, cl ; r11 = mask mov rcx, r8 ; rcx = count of bits - mov rbx, rax ; rbx = old value = 0 + mov rbx, rax ; rbx = old value = 0 or r9, r9 cmovnz rbx, r11 ; if (c) then old value = mask @@ -374,18 +369,18 @@ loop1: xor qword ptr [r10+r9*8], rax or qword ptr [r10+r9*8], rbx mov rbx, rax - + lea r9, [r9-1] dec rdx jnz loop1 - + rol rax, 1 and rax, 1 pop rbx - + ret - -rcr2_x64 ENDP + +rcr2_x64 ENDP END