- fixed a bug in 64 bit ASM for MSVC

git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@181 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-07-28 16:34:04 +00:00 · 2009-07-28 16:34:04 +00:00 · e102086f80
parent 51b2c974a1
commit e102086f80
5 changed files with 271 additions and 144 deletions
--- a/ttmath/ttmathbig.h
+++ b/ttmath/ttmathbig.h
@ -3869,47 +3869,66 @@ public:
 		// we should check the mantissas beforehand because sometimes we can have
 		// a mantissa set to zero while the exponent still holds some other value
 		// (maybe we've forgotten about calling CorrectZero() ?)
-		if( mantissa.IsZero() && ss2.mantissa.IsZero())
+		if( mantissa.IsZero())
-		{
+			{
-			return true;
+			if (ss2.mantissa.IsZero())
-		}
+				return true;
-
+			return(ss2.AboutEqual(*this,nBitsToIgnore));
-		if( IsSign() != ss2.IsSign() )
+			}
 		{
 			return false;
 		}
-		if( exponent==ss2.exponent )
+		if (ss2.mantissa.IsZero())
-		{
+			{
-			if (mantissa == ss2.mantissa)
+			return(this->exponent <= uint(2*(-sint(man*TTMATH_BITS_PER_UINT))+nBitsToIgnore));
-				{
+			}
-				return(true);
+			
-				}
+		// exponents may not differ much!
-			if( IsSign() != ss2.IsSign() )
+		ttmath::Int<exp>	expdiff(this->exponent - ss2.exponent);
-				{
+		
-				// we need to check the difference (both might be around Zero)
+		// they may differ by one, for example if mantissa1=0x80000000 and mantissa2=0xffffffff
-				Big<exp,man>	temp(*this);
+		if (ttmath::Abs(expdiff) > 1)
-				
+			return(false);		
 				temp.Sub(ss2);
-				Int<exp>	exponent_diff(exponent - temp.exponent);			
+		// calculate the 'difference' mantissa		
-				
+		ttmath::UInt<man>	man1(this->mantissa);
-				return(exponent_diff > man*TTMATH_BITS_PER_UINT-nBitsToIgnore);
+		ttmath::UInt<man>	man2(ss2.mantissa);
-				}
+		ttmath::UInt<man>	mandiff;
-				
+		
-			// faster to mask the bits!
+		switch (expdiff.ToInt())
-			ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
+			{
 			case +1:
 				man2.Rcr(1,0);
 				mandiff = man1;
 				mandiff.Sub(man2);
 				break;
 			case -1:
 				man1.Rcr(1,0);
 				mandiff = man2;
 				mandiff.Sub(man1);
 				break;
 			case 0:
 				if (man2 > man1)
 					{
 					mandiff = man2;
 					mandiff.Sub(man1);
 					}
 				  else
 					{
 					mandiff = man1;
 					mandiff.Sub(man2);
 					}
 				break;
 			}
 		// faster to mask the bits!
 		ASSERT(nBitsToIgnore < TTMATH_BITS_PER_UINT);
-			for (int n = man-1; n > 0; --n)
+		for (int n = man-1; n > 0; --n)
-				{
+			{
-				if (mantissa.table[n] != ss2.mantissa.table[n])
+			if (mandiff.table[n] != 0)
-					return(false);
+				return(false);
-				}
+			}
-			uint	nMask = ~((1 << nBitsToIgnore) - 1);
+		uint	nMask = ~((1 << nBitsToIgnore) - 1);
-			return((mantissa.table[0] & nMask) == (ss2.mantissa.table[0] & nMask));
+		return((mandiff.table[0] & nMask) == 0);
 		}
 	return false;
 	}
 	bool operator<(const Big<exp,man> & ss2) const
--- a/ttmath/ttmathuint_noasm.h
+++ b/ttmath/ttmathuint_noasm.h
@ -871,6 +871,83 @@ namespace ttmath
 		u3         = sub_res_low_.u_.low;
 	}
 	/*!
 		this static method adds one vector to the other
 		'ss1' is larger in size or equal to 'ss2'
 		ss1 points to the first (larger) vector
 		ss2 points to the second vector
 		ss1_size - size of the ss1 (and size of the result too)
 		ss2_size - size of the ss2
 		result - is the result vector (which has size the same as ss1: ss1_size)
 		Example:  ss1_size is 5, ss2_size is 3
 		ss1:      ss2:   result (output):
 		  5        1         5+1
 		  4        3         4+3
 		  2        7         2+7
 		  6                  6
 		  9                  9
 	  of course the carry is propagated and will be returned from the last item
 	  (this method is used by the Karatsuba multiplication algorithm)
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
 	{
 	uint pos = 0;
 	uint carry = 0;
 		// ss1 must be at least as long as ss2 (the result has ss1_size words)
 		TTMATH_ASSERT( ss1_size >= ss2_size )
 		// words available in both vectors: add them pairwise, chaining the carry
 		while( pos < ss2_size )
 		{
 			carry = AddTwoWords(ss1[pos], ss2[pos], carry, &result[pos]);
 			++pos;
 		}
 		// remaining words of ss1: add zero plus the carry passed on so far
 		while( pos < ss1_size )
 		{
 			carry = AddTwoWords(ss1[pos], 0, carry, &result[pos]);
 			++pos;
 		}
 		TTMATH_LOG("UInt::AddVector")
 	// whatever AddTwoWords returned for the last word is handed back to the caller
 	return carry;
 	}
 	/*!
 		this static method subtracts one vector from the other
 		'ss1' is larger in size or equal to 'ss2'
 		ss1 points to the first (larger) vector
 		ss2 points to the second vector
 		ss1_size - size of the ss1 (and size of the result too)
 		ss2_size - size of the ss2
 		result - is the result vector (which has size the same as ss1: ss1_size)
 		Example:  ss1_size is 5, ss2_size is 3
 		ss1:      ss2:   result (output):
 		  5        1         5-1
 		  4        3         4-3
 		  2        7         2-7
 		  6                  6-1  (the borrow from previous item)
 		  9                  9
 		                 return (carry): 0
 	  of course the carry (borrow) is propagated and will be returned from the last item
 	  (this method is used by the Karatsuba multiplication algorithm)
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
 	{
 	uint pos = 0;
 	uint borrow = 0;
 		// ss1 must be at least as long as ss2 (the result has ss1_size words)
 		TTMATH_ASSERT( ss1_size >= ss2_size )
 		// words available in both vectors: subtract them pairwise, chaining the borrow
 		while( pos < ss2_size )
 		{
 			borrow = SubTwoWords(ss1[pos], ss2[pos], borrow, &result[pos]);
 			++pos;
 		}
 		// remaining words of ss1: subtract zero plus the borrow passed on so far
 		while( pos < ss1_size )
 		{
 			borrow = SubTwoWords(ss1[pos], 0, borrow, &result[pos]);
 			++pos;
 		}
 		TTMATH_LOG("UInt::SubVector")
 	// whatever SubTwoWords returned for the last word is handed back to the caller
 	return borrow;
 	}
 #endif // #ifdef TTMATH_PLATFORM64
--- a/ttmath/ttmathuint_x86.h
+++ b/ttmath/ttmathuint_x86.h
@ -42,7 +42,7 @@
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM32
-#pragma message("TTMATH_ASM")
+#pragma message("TTMATH_ASM32")
 /*!
 	\file ttmathuint_x86.h
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@ -4,20 +4,20 @@
 * Author: Tomasz Sowa <t.sowa@slimaczek.pl>
 */
-/* 
+/*
 * Copyright (c) 2006-2009, Tomasz Sowa
 * All rights reserved.
- * 
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
- * 
+ *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
- *    
+ *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
- *    
+ *
 *  * Neither the name Tomasz Sowa nor the names of contributors to this
 *    project may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
@ -39,10 +39,10 @@
 #ifndef headerfilettmathuint_x86_64
 #define headerfilettmathuint_x86_64
 #ifndef TTMATH_NOASM
 #ifdef TTMATH_PLATFORM64
 #pragma message("TTMATH_ASM64")
 /*!
 	\file ttmathuint_x86_64.h
    \brief template class UInt<uint> with assembler code for 64bit x86_64 processors
@ -50,6 +50,9 @@
 	this file is included at the end of ttmathuint.h
 */
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 namespace ttmath
 {
@ -113,14 +116,14 @@ namespace ttmath
 				this part should be compiled with gcc
 			*/
 			__asm__ __volatile__(
-	
+
 				"xorq %%rdx, %%rdx				\n"
 				"neg %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0
 			"1:									\n"
 				"movq (%%rsi,%%rdx,8), %%rax	\n"
 				"adcq %%rax, (%%rbx,%%rdx,8)	\n"
-			
+
 				"incq %%rdx						\n"
 				"decq %%rcx						\n"
 			"jnz 1b								\n"
@ -134,7 +137,7 @@ namespace ttmath
 		#endif
 		TTMATH_LOG("UInt64::Add")
-	
+
 	return c;
 	}
@ -150,7 +153,7 @@ namespace ttmath
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			AddInt(2,1)
 		then it'll be:
@ -187,7 +190,7 @@ namespace ttmath
 			"1:									\n"
 				"addq %%rax, (%%rbx,%%rdx,8)	\n"
 			"jnc 2f								\n"
-				
+
 				"movq $1, %%rax					\n"
 				"incq %%rdx						\n"
 				"decq %%rcx						\n"
@ -204,7 +207,7 @@ namespace ttmath
 		#endif
 		TTMATH_LOG("UInt64::AddInt")
-	
+
 	return c;
 	}
@ -236,14 +239,38 @@ namespace ttmath
 			table[1] = 4 + x1 = 14
 			table[2] = 5 + x2 = 25
 			table[3] = 6
-		
+
 		and no carry at the end of table[3]
-		(of course if there was a carry in table[2](5+20) then 
+		(of course if there was a carry in table[2](5+20) then
 		this carry would be passed to the table[3] etc.)
 	*/
 	template<uint value_size>
 	uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
 	#if 0
 	{
 	uint i, c;
 		TTMATH_ASSERT( index < value_size )
 		printf("add %Id + %Id\n",x1,x2);
 		for(int i=index ; i<value_size ; ++i)
 			printf("%d: %Id\n",i,table[i]);
 		c = AddTwoWords(table[index],   x1, 0, &table[index]);
 		c = AddTwoWords(table[index+1], x2, c, &table[index+1]);
 		for(i=index+2 ; i<value_size && c ; ++i)
 			c = AddTwoWords(table[i], 0, c, &table[i]);
 		for(i=index ; i<value_size ; ++i)
 			printf("%d: %Id\n",i,table[i]);
 		printf(" -> %d\n",c);
 		TTMATH_LOG("UInt::AddTwoInts")
 	return c;
 	}
 	#else
 	{
 	uint b = value_size;
 	uint * p1 = table;
@ -253,7 +280,14 @@ namespace ttmath
 		#ifndef __GNUC__
 			#if defined(_M_X64)
-				c = addindexed2_x64(p1,b,index,x2,x1);
+				//printf("add %Id + %Id\n",x1,x2);
 				//for(int i=index ; i<value_size ; ++i)
 				//	printf("%d: %Id\n",i,table[i]);
 				//if (table[0] == 1265784741359897913) DebugBreak();
 				c = addindexed2_x64(p1,b,index,x1,x2);
 				//for(int i=index ; i<value_size ; ++i)
 				//	printf("%d: %Id\n",i,table[i]);
 				//printf(" -> %d\n",c);
 			#else
 				#error "another compiler than GCC is currently not supported in 64bit mode"
 			#endif
@ -261,11 +295,11 @@ namespace ttmath
 		#ifdef __GNUC__
 			uint dummy, dummy2;
-			
+
 			__asm__ __volatile__(
-			
+
 				"subq %%rdx, %%rcx 				\n"
-				
+
 				"addq %%rsi, (%%rbx,%%rdx,8) 	\n"
 				"incq %%rdx						\n"
 				"decq %%rcx						\n"
@ -289,10 +323,12 @@ namespace ttmath
 		#endif
 		TTMATH_LOG("UInt64::AddTwoInts")
 	return c;
 	}
 	#endif
@ -328,16 +364,16 @@ namespace ttmath
 		#ifdef __GNUC__
 			uint dummy, dummy2;
-			
+
 			__asm__  __volatile__(
-	
+
 				"xorq %%rdx, %%rdx				\n"
 				"neg %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0
 			"1:									\n"
 				"movq (%%rsi,%%rdx,8), %%rax	\n"
 				"sbbq %%rax, (%%rbx,%%rdx,8)	\n"
-			
+
 				"incq %%rdx						\n"
 				"decq %%rcx						\n"
 			"jnz 1b								\n"
@ -366,7 +402,7 @@ namespace ttmath
 		if we've got (value_size=3):
 			table[0] = 10;
 			table[1] = 30;
-			table[2] = 5;	
+			table[2] = 5;
 		and we call:
 			SubInt(2,1)
 		then it'll be:
@ -395,15 +431,15 @@ namespace ttmath
 		#ifdef __GNUC__
 			uint dummy, dummy2;
-			
+
 			__asm__ __volatile__(
-			
+
 				"subq %%rdx, %%rcx 				\n"
 			"1:									\n"
 				"subq %%rax, (%%rbx,%%rdx,8)	\n"
 			"jnc 2f								\n"
-				
+
 				"movq $1, %%rax					\n"
 				"incq %%rdx						\n"
 				"decq %%rcx						\n"
@ -436,7 +472,7 @@ namespace ttmath
 		for example:
 		let this is 001010000
 		after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@ -455,9 +491,9 @@ namespace ttmath
 		#ifdef __GNUC__
 		uint dummy, dummy2;
-		
+
 		__asm__  __volatile__(
-		
+
 			"xorq %%rdx, %%rdx			\n"   // rdx=0
 			"neg %%rax					\n"   // CF=1 if rax!=0 , CF=0 if rax==0
@ -473,7 +509,7 @@ namespace ttmath
 			: "=c" (c), "=a" (dummy), "=d" (dummy2)
 			: "1" (c), "0" (b), "b" (p1)
 			: "cc", "memory" );
-	
+
 		#endif
 		TTMATH_LOG("UInt64::Rcl2_one")
@ -512,7 +548,7 @@ namespace ttmath
 		#ifdef __GNUC__
 		uint dummy;
-		
+
 		__asm__  __volatile__(
 			"neg %%rax						\n"   // CF=1 if rax!=0 , CF=0 if rax==0
@ -549,7 +585,7 @@ namespace ttmath
 		for example:
 		let this is 001010000
 		after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1
-	
+
 		***this method is created only on a 64bit platform***
 	*/
 	template<uint value_size>
@ -570,9 +606,9 @@ namespace ttmath
 		#ifdef __GNUC__
 		uint dummy, dummy2, dummy3;
-		
+
 		__asm__  __volatile__(
-		
+
 			"movq %%rcx, %%rsi				\n"
 			"movq $64, %%rcx				\n"
 			"subq %%rsi, %%rcx				\n"
@ -595,11 +631,11 @@ namespace ttmath
 			"xorq %%rax, (%%rbx,%%rdx,8)	\n"
 			"orq  %%rsi, (%%rbx,%%rdx,8)	\n"
 			"movq %%rax, %%rsi				\n"
-			
+
 			"incq %%rdx						\n"
 			"decq %%rdi						\n"
 		"jnz 1b								\n"
-			
+
 			"and $1, %%rax					\n"
 			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
@ -647,7 +683,7 @@ namespace ttmath
 		#ifdef __GNUC__
 			uint dummy, dummy2, dummy3;
-			
+
 			__asm__  __volatile__(
 			"movq %%rcx, %%rsi				\n"
@ -674,11 +710,11 @@ namespace ttmath
 			"xorq %%rax, (%%rbx,%%rdx,8)	\n"
 			"orq  %%rsi, (%%rbx,%%rdx,8)	\n"
 			"movq %%rax, %%rsi				\n"
-			
+
 			"decq %%rdx						\n"
 			"decq %%rdi						\n"
 		"jnz 1b								\n"
-			
+
 			"rolq $1, %%rax					\n"
 			"andq $1, %%rax					\n"
@ -754,7 +790,7 @@ namespace ttmath
 	uint UInt<value_size>::SetBitInWord(uint & value, uint bit)
 	{
 		TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT )
-		
+
 		uint old_bit;
 		uint v = value;
@ -778,7 +814,7 @@ namespace ttmath
 			"setc %%bl				\n"
 			"movzx %%bl, %%rbx		\n"
-			
+
 			: "=a" (v), "=b" (old_bit)
 			: "0" (v), "1" (bit)
 			: "cc" );
@ -803,7 +839,7 @@ namespace ttmath
 		multiplication: result2:result1 = a * b
 		result2 - higher word
 		result1 - lower word of the result
-	
+
 		this method never returns a carry
 		***this method is created only on a 64bit platform***
@ -834,7 +870,7 @@ namespace ttmath
 		#ifdef __GNUC__
 		__asm__ __volatile__(
-		
+
 			"mulq %%rdx			\n"
 			: "=a" (result1_), "=d" (result2_)
@ -857,13 +893,13 @@ namespace ttmath
 	 *
 	 *
 	*/
-	
+
 	#ifndef __GNUC__
 	/*!
 		this method calculates 64bits word a:b / 32bits c (a higher, b lower word)
 		r = a:b / c and rest - remainder
-		
+
 		***this method is created only on a 64bit platform***
 		*
@ -896,7 +932,7 @@ namespace ttmath
 		#endif
 		#ifdef __GNUC__
-		
+
 			__asm__ __volatile__(
 			"divq %%rcx				\n"
@ -986,7 +1022,7 @@ namespace ttmath
 	uint i, c = 0;
 		TTMATH_ASSERT( ss1_size >= ss2_size )
-		
+
 		for(i=0 ; i<ss2_size ; ++i)
 			c = AddTwoWords(ss1[i], ss2[i], c, &result[i]);
@ -1025,7 +1061,7 @@ namespace ttmath
 	uint i, c = 0;
 		TTMATH_ASSERT( ss1_size >= ss2_size )
-		
+
 		for(i=0 ; i<ss2_size ; ++i)
 			c = SubTwoWords(ss1[i], ss2[i], c, &result[i]);
--- a/ttmath/ttmathuint_x86_amd64_msvc.asm
+++ b/ttmath/ttmathuint_x86_amd64_msvc.asm
@ -19,7 +19,7 @@ PUBLIC	div_x64
 ;
 .CODE
-       
+
        ALIGN       8
 ;----------------------------------------
@ -33,20 +33,20 @@ adc_x64				PROC
        xor		rax, rax
        xor		r11, r11
        sub		rax, r9		; sets CARRY if r9 != 0
-        
+
 		ALIGN 16
- loop1:	
+ loop1:
 		mov		rax,qword ptr [rdx + r11 * 8]
 		adc		qword ptr [rcx + r11 * 8], rax
 		lea		r11, [r11+1]
 		dec		r8
 		jnz		loop1
-		
+
 		setc	al
 		movzx	rax, al
 		ret
-		
+
 adc_x64				ENDP
 ;----------------------------------------
@ -80,14 +80,14 @@ loop1:
 		lea		r8, [r8+1]
 		add		qword ptr [rcx + r8 * 8], r9
 		jc		loop1
-		
+
 		ret
-		
+
 done_with_cy:
 		lea		rax, [rax+1]		; rax = 1
-		
+
 		ret
-	
+
 addindexed_x64	ENDP
 ;----------------------------------------
@ -98,8 +98,8 @@ addindexed_x64	ENDP
 addindexed2_x64	PROC
-        ; rcx = p1
+        ; rcx = p1 (pointer)
-        ; rdx = b
+        ; rdx = b  (value size)
        ; r8 = nPos
        ; r9 = nValue1
        ; [esp+0x28] = nValue2
@ -109,26 +109,23 @@ addindexed2_x64	PROC
 		sub		rdx, r8				; rdx = remaining count of uints
 		mov		r10, [esp+028h]		; r10 = nValue2
-		add		qword ptr [r11 + r8 * 8], r10
+		add		qword ptr [r11 + r8 * 8], r9
 		lea		r8, [r8+1]
 		lea		rdx, [rdx-1]
 		adc		qword ptr [r11 + r8 * 8], r10
 		jc		next
 		ret
 		ALIGN 16
 loop1:
-		adc		qword ptr [r11 + r8 * 8], r9
+		lea		r8, [r8+1]
 		add		qword ptr [r11 + r8 * 8], 1
 		jc		next
 		ret
 next:
 		lea		r8, [r8+1]
 		xor		r9, r9				; set to 0 -> cy still set!
 		dec		rdx
 		jnz		loop1
 		jc		return_1			; most of the times, there will be NO carry (I hope)
-done:
+next:
-		ret
+		dec		rdx					; does not modify CY too...
-	
+		jnz		loop1
 return_1:
 		lea		rax, [rax+1]
 		ret
@ -138,8 +135,6 @@ addindexed2_x64	ENDP
        ALIGN       8
        ALIGN       8
 ;----------------------------------------
 sbb_x64				PROC
@ -152,15 +147,15 @@ sbb_x64				PROC
        xor		rax, rax
        xor		r11, r11
        sub		rax, r9				; sets CARRY if r9 != 0
-        
+
 		ALIGN 16
- loop1:	
+ loop1:
 		mov		rax,qword ptr [rdx + r11 * 8]
 		sbb		qword ptr [rcx + r11 * 8], rax
 		lea		r11, [r11+1]
 		dec		r8
 		jnz		loop1
-		
+
 		setc	al
 		movzx	rax, al
@ -181,12 +176,12 @@ subindexed_x64	PROC
        ; r9 = nValue
 		sub		rdx, r8				; rdx = remaining count of uints
-		
+
 		ALIGN 16
 loop1:
 		sub		qword ptr [rcx + r8 * 8], r9
 		jnc		done
-		
+
 		lea		r8, [r8+1]
 		mov		r9, 1
 		dec		rdx
@ -196,7 +191,7 @@ loop1:
 done:
 		xor		rax, rax
 		ret
-	
+
  return_1:
 		mov		rax, 1
 		ret
@ -217,17 +212,17 @@ rcl_x64	PROC
 		mov		r11, rcx			; table
 		xor		r10, r10
 		neg		r8					; CY set if r8 <> 0
-		
+
 		ALIGN 16
 loop1:
 		rcl		qword ptr [r11 + r10 * 8], 1
 		lea		r10, [r10+1]
 		dec		rdx
 		jnz		loop1
-		
+
 		setc	al
 		movzx	rax, al
-		
+
        ret
 rcl_x64	ENDP
@ -245,16 +240,16 @@ rcr_x64	PROC
 		xor		r10, r10
 		neg		r8					; CY set if r8 <> 0
-		
+
 		ALIGN 16
 loop1:
 		rcr		qword ptr -8[rcx + rdx * 8], 1
 		dec		rdx
 		jnz		loop1
-		
+
 		setc	al
 		movzx	rax, al
-		
+
        ret
 rcr_x64	ENDP
@ -270,7 +265,7 @@ div_x64	PROC
        ; rcx = &Hi
        ; rdx = &Lo
        ; r8 = nDiv
-        
+
        mov		r11, rcx
        mov		r10, rdx
@ -295,21 +290,21 @@ rcl2_x64	PROC
        ; rdx = nSize
        ; r8 = bits
        ; r9 = c
-        
+
        push	rbx
-        
+
        mov		r10, rcx	; r10 = p1
-        xor		rax, rax	
+        xor		rax, rax
-        
+
        mov		rcx, 64
        sub		rcx, r8
-        
+
        mov		r11, -1
        shr		r11, cl		; r11 = mask
 		mov		rcx, r8		; rcx = count of bits
-		mov		rbx, rax	; rbx = old value = 0 
+		mov		rbx, rax	; rbx = old value = 0
 		or		r9, r9
 		cmovnz	rbx, r11	; if (c) then old value = mask
@ -323,7 +318,7 @@ loop1:
 		xor		qword ptr [r10+r9*8], rax
 		or		qword ptr [r10+r9*8], rbx
 		mov		rbx, rax
-		
+
 		lea		r9, [r9+1]
 		dec		rdx
@ -332,8 +327,8 @@ loop1:
 		and		rax, 1
 		pop		rbx
        ret
-				
+
-rcl2_x64	ENDP				
+rcl2_x64	ENDP
 ;----------------------------------------
@ -346,20 +341,20 @@ rcr2_x64	PROC
        ; rdx = nSize
        ; r8 = bits
        ; r9 = c
-        
+
        push	rbx
        mov		r10, rcx	; r10 = p1
-        xor		rax, rax	
+        xor		rax, rax
-        
+
        mov		rcx, 64
        sub		rcx, r8
-        
+
        mov		r11, -1
        shl		r11, cl		; r11 = mask
 		mov		rcx, r8		; rcx = count of bits
-		mov		rbx, rax	; rbx = old value = 0 
+		mov		rbx, rax	; rbx = old value = 0
 		or		r9, r9
 		cmovnz	rbx, r11	; if (c) then old value = mask
@ -374,18 +369,18 @@ loop1:
 		xor		qword ptr [r10+r9*8], rax
 		or		qword ptr [r10+r9*8], rbx
 		mov		rbx, rax
-		
+
 		lea		r9, [r9-1]
 		dec		rdx
 		jnz		loop1
-		
+
 		rol		rax, 1
 		and		rax, 1
 		pop		rbx
-		
+
        ret
-				
+
-rcr2_x64	ENDP				
+rcr2_x64	ENDP
 END