changed: a few small changes in all asm code

it should be a little faster git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@149 e52654a7-88a9-db11-a3e9-0013d4bc506e
2009-05-17 00:04:42 +00:00
parent eaa19dd46a
commit 5e5a106605
2 changed files with 236 additions and 265 deletions
--- a/ttmath/ttmathuint_x86.h
+++ b/ttmath/ttmathuint_x86.h
@@ -77,9 +77,9 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::Add(const UInt<value_size> & ss2, uint c)
 	{
-	register uint b = value_size;
-	register uint * p1 = table;
-	register uint * p2 = const_cast<uint*>(ss2.table);
+	uint b = value_size;
+	uint * p1 = table;
+	uint * p2 = const_cast<uint*>(ss2.table);

 		// we don't have to use TTMATH_REFERENCE_ASSERT here
 		// this algorithm doesn't require it
@@ -101,10 +101,9 @@ namespace ttmath
 				mov ebx,[p1]
 				mov esi,[p2]

-				xor eax,eax  // eax=0
-				mov edx,eax  // edx=0
-
-				sub eax,[c]  // CF=c
+				xor edx,edx          // edx=0
+				mov eax,[c]
+				neg eax              // CF=1 if eax!=0 , CF=0 if eax==0

 			p:
 				mov eax,[esi+edx*4]
@@ -114,9 +113,8 @@ namespace ttmath
 				dec ecx
 			jnz p

-				setc al
-				movzx edx, al
-				mov [c], edx
+				adc ecx, ecx
+				mov [c], ecx

 				pop esi
 				pop edx
@@ -131,35 +129,27 @@ namespace ttmath
 			

 		#ifdef __GNUC__
-			
+		uint dummy, dummy2;
 			//	this part should be compiled with gcc
 			
 			__asm__ __volatile__(
-			
-				"push %%ecx						\n"
-			
-				"xorl %%eax, %%eax				\n"
-				"movl %%eax, %%edx				\n"
-				"subl %%edi, %%eax				\n"

+				"xorl %%edx, %%edx				\n"
+				"negl %%eax						\n"  // CF=1 if eax!=0 , CF=0 if eax==0

 			"1:									\n"
-				"movl (%%esi,%%edx,4),%%eax		\n"
+				"movl (%%esi,%%edx,4), %%eax	\n"
 				"adcl %%eax, (%%ebx,%%edx,4)	\n"
 			
 				"incl %%edx						\n"
 				"decl %%ecx						\n"
 			"jnz 1b								\n"

-				"setc %%al						\n"
-				"movzx %%al,%%edx				\n"
-
-				"pop %%ecx						\n"
-
-				: "=d" (c)
-				: "D" (c), "c" (b), "b" (p1), "S" (p2)
-				: "%eax", "cc", "memory" );
+				"adc %%ecx, %%ecx				\n"

+				: "=c" (c), "=a" (dummy), "=d" (dummy2)
+				: "0" (b),  "1" (c), "b" (p1), "S" (p2)
+				: "cc", "memory" );
 		#endif

 		TTMATH_LOG("UInt::Add")
@@ -191,9 +181,9 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::AddInt(uint value, uint index)
 	{
-	register uint b = value_size;
-	register uint * p1 = table;
-	register uint c;
+	uint b = value_size;
+	uint * p1 = table;
+	uint c;

 		TTMATH_ASSERT( index < value_size )

@@ -238,11 +228,10 @@ namespace ttmath
 			

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			__asm__ __volatile__(
 			
-				"push %%eax						\n"
-				"push %%ecx						\n"
-
 				"subl %%edx, %%ecx 				\n"

 			"1:									\n"
@@ -258,11 +247,8 @@ namespace ttmath
 				"setc %%al						\n"
 				"movzx %%al, %%edx				\n"

-				"pop %%ecx						\n"
-				"pop %%eax						\n"
-
-				: "=d" (c)
-				: "a" (value), "c" (b), "0" (index), "b" (p1)
+				: "=d" (c),    "=a" (dummy), "=c" (dummy2)
+				: "0" (index), "1" (value),  "2" (b), "b" (p1)
 				: "cc", "memory" );

 		#endif
@@ -308,9 +294,9 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
 	{
-	register uint b = value_size;
-	register uint * p1 = table;
-	register uint c;
+	uint b = value_size;
+	uint * p1 = table;
+	uint c;

 		TTMATH_ASSERT( index < value_size - 1 )

@@ -359,11 +345,10 @@ namespace ttmath
 			

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			__asm__ __volatile__(
 			
-				"push %%ecx						\n"
-				"push %%edx						\n"
-
 				"subl %%edx, %%ecx 				\n"
 				
 				"addl %%esi, (%%ebx,%%edx,4) 	\n"
@@ -383,11 +368,8 @@ namespace ttmath
 				"setc %%al						\n"
 				"movzx %%al, %%eax				\n"

-				"pop %%edx						\n"
-				"pop %%ecx						\n"
-
-				: "=a" (c)
-				: "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2)
+				: "=a" (c), "=c" (dummy), "=d" (dummy2)
+				: "0" (x2), "1" (b),      "2" (index), "b" (p1), "S" (x1)
 				: "cc", "memory" );

 		#endif
@@ -456,10 +438,9 @@ namespace ttmath
 				or ebx, ebx
 				jz end
 				
-				xor ebx, ebx
-				sub ebx, ecx             // setting cf from ecx
+				xor ebx, ebx             // ebx = 0
+				neg ecx                  // setting cf from ecx
 				mov ecx, [rest]          // ecx is != 0
-				mov ebx, 0
 			p2:
 				mov eax, [esi+edx*4]
 				adc eax, ebx 
@@ -503,10 +484,9 @@ namespace ttmath
 				"or %%eax, %%eax					\n"
 				"jz 3f								\n"
 				
-				"xor %%ebx, %%ebx					\n"
-				"sub %%ecx, %%ebx					\n"   // setting cf from ecx
+				"xor %%ebx, %%ebx					\n"   // ebx = 0
+				"neg %%ecx							\n"   // setting cf from ecx
 				"mov %%eax, %%ecx					\n"   // ecx=rest and is != 0
-				"mov $0, %%ebx						\n"
 			"2:										\n"
 				"mov (%%esi, %%edx, 4), %%eax		\n"
 				"adc %%ebx, %%eax 					\n"
@@ -542,9 +522,9 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::Sub(const UInt<value_size> & ss2, uint c)
 	{
-	register uint b = value_size;
-	register uint * p1 = table;
-	register uint * p2 = const_cast<uint*>(ss2.table);
+	uint b = value_size;
+	uint * p1 = table;
+	uint * p2 = const_cast<uint*>(ss2.table);

 		// we don't have to use TTMATH_REFERENCE_ASSERT here
 		// this algorithm doesn't require it
@@ -564,22 +544,20 @@ namespace ttmath
 				mov ebx,[p1]
 				mov esi,[p2]

-				xor eax, eax
-				mov edx, eax
-
-				sub eax, [c]
+				xor edx,edx          // edx=0
+				mov eax,[c]
+				neg eax              // CF=1 if eax!=0 , CF=0 if eax==0

 			p:
-				mov eax, [esi+edx*4]
-				sbb [ebx+edx*4], eax
+				mov eax,[esi+edx*4]
+				sbb [ebx+edx*4],eax

 				inc edx
 				dec ecx
 			jnz p

-				setc al
-				movzx edx, al
-				mov [c], edx
+				adc ecx, ecx
+				mov [c], ecx

 				pop esi
 				pop edx
@@ -592,30 +570,26 @@ namespace ttmath


 		#ifdef __GNUC__
-			__asm__  __volatile__(
-				"push %%ecx						\n"
-			
-				"xorl %%eax, %%eax				\n"
-				"movl %%eax, %%edx				\n"
-				"subl %%edi, %%eax				\n"
+		uint dummy, dummy2;

+			__asm__  __volatile__(
+
+				"xorl %%edx, %%edx				\n"
+				"negl %%eax						\n"  // CF=1 if eax!=0 , CF=0 if eax==0

 			"1:									\n"
-				"movl (%%esi,%%edx,4),%%eax		\n"
+				"movl (%%esi,%%edx,4), %%eax	\n"
 				"sbbl %%eax, (%%ebx,%%edx,4)	\n"
 			
 				"incl %%edx						\n"
 				"decl %%ecx						\n"
 			"jnz 1b								\n"

-				"setc %%al						\n"
-				"movzx %%al,%%edx				\n"
+				"adc %%ecx, %%ecx				\n"

-				"pop %%ecx						\n"
-
-				: "=d" (c)
-				: "D" (c), "c" (b), "b" (p1), "S" (p2)
-				: "%eax", "cc", "memory" );
+				: "=c" (c), "=a" (dummy), "=d" (dummy2)
+				: "0" (b),  "1" (c), "b" (p1), "S" (p2)
+				: "cc", "memory" );

 		#endif

@@ -649,13 +623,14 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::SubInt(uint value, uint index)
 	{
-	register uint b = value_size;
-	register uint * p1 = table;
-	register uint c;
+	uint b = value_size;
+	uint * p1 = table;
+	uint c;

 		TTMATH_ASSERT( index < value_size )

 		#ifndef __GNUC__
+
 			__asm
 			{
 				push eax
@@ -690,15 +665,15 @@ namespace ttmath
 				pop ebx
 				pop eax
 			}
+
 		#endif		
 			

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			__asm__ __volatile__(
 			
-				"push %%eax						\n"
-				"push %%ecx						\n"
-
 				"subl %%edx, %%ecx 				\n"

 			"1:									\n"
@@ -714,11 +689,8 @@ namespace ttmath
 				"setc %%al						\n"
 				"movzx %%al, %%edx				\n"

-				"pop %%ecx						\n"
-				"pop %%eax						\n"
-
-				: "=d" (c)
-				: "a" (value), "c" (b), "0" (index), "b" (p1)
+				: "=d" (c),    "=a" (dummy), "=c" (dummy2)
+				: "0" (index), "1" (value),  "2" (b), "b" (p1)
 				: "cc", "memory" );

 		#endif
@@ -793,10 +765,9 @@ namespace ttmath
 				or ebx, ebx
 				jz end
 				
-				xor ebx, ebx
-				sub ebx, ecx             // setting cf from ecx
+				xor ebx, ebx             // ebx = 0
+				neg ecx                  // setting cf from ecx
 				mov ecx, [rest]          // ecx is != 0
-				mov ebx, 0
 			p2:
 				mov eax, [esi+edx*4]
 				sbb eax, ebx 
@@ -840,10 +811,9 @@ namespace ttmath
 				"or %%eax, %%eax					\n"
 				"jz 3f								\n"
 				
-				"xor %%ebx, %%ebx					\n"
-				"sub %%ecx, %%ebx					\n"   // setting cf from ecx
+				"xor %%ebx, %%ebx					\n"   // ebx = 0
+				"neg %%ecx							\n"   // setting cf from ecx
 				"mov %%eax, %%ecx					\n"   // ecx=rest and is != 0
-				"mov $0, %%ebx						\n"
 			"2:										\n"
 				"mov (%%esi, %%edx, 4), %%eax		\n"
 				"sbb %%ebx, %%eax 					\n"
@@ -884,8 +854,8 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::Rcl2_one(uint c)
 	{
-	register sint b = value_size;
-	register uint * p1 = table;
+	uint b = value_size;
+	uint * p1 = table;

 		#ifndef __GNUC__
 			__asm
@@ -895,11 +865,9 @@ namespace ttmath
 				push edx

 				mov ebx, [p1]
-
 				xor edx, edx
-				mov ecx, edx
-				sub ecx, [c]
-
+				mov ecx, [c]
+				neg ecx
 				mov ecx, [b]

 			p:
@@ -909,10 +877,8 @@ namespace ttmath
 				dec ecx
 			jnz p

-				setc dl
-				movzx edx, dl
-				mov [c], edx
-
+				adc ecx, ecx
+				mov [c], ecx
 				
 				pop edx
 				pop ecx
@@ -922,13 +888,12 @@ namespace ttmath


 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 		__asm__  __volatile__(

-			"push %%edx					\n"
-			"push %%ecx					\n"
-
 			"xorl %%edx, %%edx			\n"   // edx=0
-			"neg %%eax					\n"   // CF=1 if eax!=0 , CF=0 if eax==0
+			"negl %%eax					\n"   // CF=1 if eax!=0 , CF=0 if eax==0

 		"1:								\n"
 			"rcll $1, (%%ebx, %%edx, 4)	\n"
@@ -937,14 +902,10 @@ namespace ttmath
 			"decl %%ecx					\n"
 		"jnz 1b							\n"

-			"setc %%al					\n"
-			"movzx %%al, %%eax			\n"
+			"adcl %%ecx, %%ecx			\n"

-			"pop %%ecx					\n"
-			"pop %%edx					\n"
-
-			: "=a" (c)
-			: "0" (c), "c" (b), "b" (p1)
+			: "=c" (c), "=a" (dummy), "=d" (dummy2)
+			: "0" (b),  "1" (c), "b" (p1)
 			: "cc", "memory" );

 		#endif
@@ -971,8 +932,8 @@ namespace ttmath
 	template<uint value_size>
 	uint UInt<value_size>::Rcr2_one(uint c)
 	{
-	register sint b = value_size;
-	register uint * p1 = table;
+	uint b = value_size;
+	uint * p1 = table;

 		#ifndef __GNUC__
 			__asm
@@ -981,10 +942,8 @@ namespace ttmath
 				push ecx

 				mov ebx, [p1]
-
-				xor ecx, ecx
-				sub ecx, [c]
-
+				mov ecx, [c]
+				neg ecx
 				mov ecx, [b]

 			p:
@@ -993,8 +952,7 @@ namespace ttmath
 				dec ecx
 			jnz p

-				setc cl
-				movzx ecx, cl
+				adc ecx, ecx
 				mov [c], ecx

 				pop ecx
@@ -1004,11 +962,11 @@ namespace ttmath


 		#ifdef __GNUC__
+		uint dummy;
+
 		__asm__  __volatile__(

-			"push %%ecx						\n"
-
-			"neg %%eax						\n"   // CF=1 if eax!=0 , CF=0 if eax==0
+			"negl %%eax						\n"   // CF=1 if eax!=0 , CF=0 if eax==0

 		"1:									\n"
 			"rcrl $1, -4(%%ebx, %%ecx, 4)	\n"
@@ -1016,13 +974,10 @@ namespace ttmath
 			"decl %%ecx						\n"
 		"jnz 1b								\n"

-			"setc %%al						\n"
-			"movzx %%al, %%eax				\n"
+			"adcl %%ecx, %%ecx				\n"

-			"pop %%ecx						\n"
-
-			: "=a" (c)
-			: "0" (c), "c" (b), "b" (p1)
+			: "=c" (c), "=a" (dummy)
+			: "0" (b),  "1" (c), "b" (p1)
 			: "cc", "memory" );

 		#endif
@@ -1034,6 +989,13 @@ namespace ttmath



+#ifdef _MSC_VER
+#pragma warning (disable : 4731)
+//warning C4731: frame pointer register 'ebp' modified by inline assembly code
+#endif
+	
+
+
 	/*!
 		this method moves all bits into the left hand side
 		return value <- this <- c
@@ -1051,9 +1013,8 @@ namespace ttmath
 	{
 	TTMATH_ASSERT( bits>0 && bits<TTMATH_BITS_PER_UINT )
 		
-	register sint b = value_size;
-	register uint * p1 = table;
-	register uint mask;
+	uint b = value_size;
+	uint * p1 = table;

 		#ifndef __GNUC__
 			__asm
@@ -1064,6 +1025,7 @@ namespace ttmath
 				push edx
 				push esi
 				push edi
+				push ebp

 				mov edi, [b]

@@ -1071,23 +1033,23 @@ namespace ttmath
 				sub ecx, [bits]
 				mov edx, -1
 				shr edx, cl
-				mov [mask], edx

 				mov ecx, [bits]
 				mov ebx, [p1]
-
-				xor edx, edx   // edx = 0
-				mov esi, edx   // old value = 0 
-
 				mov eax, [c]
+
+				mov ebp, edx         // ebp = mask (modified ebp - don't read/write to variables)
+
+				xor edx, edx         // edx = 0
+				mov esi, edx
 				or eax, eax
-				cmovnz esi, [mask] // if c then old value = mask
+				cmovnz esi, ebp      // if(c) esi=mask else esi=0

 		p:
 				rol dword ptr [ebx+edx*4], cl
 				
 				mov eax, [ebx+edx*4]
-				and eax, [mask] 
+				and eax, ebp
 				xor [ebx+edx*4], eax // clearing bits
 				or [ebx+edx*4], esi  // saving old value
 				mov esi, eax
@@ -1096,6 +1058,8 @@ namespace ttmath
 				dec edi
 			jnz p

+				pop ebp              // restoring ebp
+
 				and eax, 1
 				mov [c], eax

@@ -1110,31 +1074,30 @@ namespace ttmath


 		#ifdef __GNUC__
+		uint dummy, dummy2, dummy3;
+
 		__asm__  __volatile__(

-			"push %%edx						\n"
-			"push %%esi						\n"
-			"push %%edi						\n"
+			"push %%ebp						\n"
 			
 			"movl %%ecx, %%esi				\n"
 			"movl $32, %%ecx				\n"
-			"subl %%esi, %%ecx				\n"
-			"movl $-1, %%edx				\n"
-			"shrl %%cl, %%edx				\n"
-			"movl %%edx, %[amask]			\n"
+			"subl %%esi, %%ecx				\n"    // ecx = 32 - bits
+			"movl $-1, %%edx				\n"    // edx = -1 (all bits set to one)
+			"shrl %%cl, %%edx				\n"    // shifting (0 -> edx -> cf)  (cl times)
+			"movl %%edx, %%ebp				\n"    // ebp = edx = mask
 			"movl %%esi, %%ecx				\n"

 			"xorl %%edx, %%edx				\n"
 			"movl %%edx, %%esi				\n"
-
 			"orl %%eax, %%eax				\n"
-			"cmovnz %[amask], %%esi			\n"
+			"cmovnz %%ebp, %%esi			\n"    // if(c) esi=mask else esi=0

 		"1:									\n"
 			"roll %%cl, (%%ebx,%%edx,4)		\n"

 			"movl (%%ebx,%%edx,4), %%eax	\n"
-			"andl %[amask], %%eax			\n"
+			"andl %%ebp, %%eax				\n"
 			"xorl %%eax, (%%ebx,%%edx,4)	\n"
 			"orl  %%esi, (%%ebx,%%edx,4)	\n"
 			"movl %%eax, %%esi				\n"
@@ -1145,12 +1108,10 @@ namespace ttmath
 			
 			"and $1, %%eax					\n"

-			"pop %%edi						\n"
-			"pop %%esi						\n"
-			"pop %%edx						\n"
+			"pop %%ebp						\n"

-			: "=a" (c)
-			: "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask)
+			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
+			: "0" (c),  "1" (b), "b" (p1), "c" (bits)
 			: "cc", "memory" );

 		#endif
@@ -1180,9 +1141,8 @@ namespace ttmath
 	{
 	TTMATH_ASSERT( bits>0 && bits<TTMATH_BITS_PER_UINT )

-	register sint b = value_size;
-	register uint * p1 = table;
-	register uint mask;
+	uint b = value_size;
+	uint * p1 = table;

 		#ifndef __GNUC__
 			__asm
@@ -1193,6 +1153,7 @@ namespace ttmath
 				push edx
 				push esi
 				push edi
+				push ebp

 				mov edi, [b]

@@ -1200,25 +1161,25 @@ namespace ttmath
 				sub ecx, [bits]
 				mov edx, -1
 				shl edx, cl
-				mov [mask], edx

 				mov ecx, [bits]
 				mov ebx, [p1]
-
-				xor edx, edx   // edx = 0
-				mov esi, edx   // old value = 0 
-				add edx, edi   
-				dec edx        // edx - is pointing at the last word
-
 				mov eax, [c]
+
+				mov ebp, edx         // ebp = mask (modified ebp - don't read/write to variables)
+
+				xor edx, edx         // edx = 0
+				mov esi, edx
+				add edx, edi
+				dec edx              // edx is pointing at the end of the table (on last word)
 				or eax, eax
-				cmovnz esi, [mask] // if c then old value = mask
+				cmovnz esi, ebp      // if(c) esi=mask else esi=0

 			p:
 				ror dword ptr [ebx+edx*4], cl
 				
 				mov eax, [ebx+edx*4]
-				and eax, [mask] 
+				and eax, ebp 
 				xor [ebx+edx*4], eax // clearing bits
 				or [ebx+edx*4], esi  // saving old value
 				mov esi, eax
@@ -1227,7 +1188,9 @@ namespace ttmath
 				dec edi
 			jnz p

-				rol eax, 1    // 31bit will be first
+				pop ebp              // restoring ebp
+
+				rol eax, 1           // 31bit will be first
 				and eax, 1  
 				mov [c], eax

@@ -1242,33 +1205,32 @@ namespace ttmath


 		#ifdef __GNUC__
+		uint dummy, dummy2, dummy3;
+
 			__asm__  __volatile__(

-			"push %%edx						\n"
-			"push %%esi						\n"
-			"push %%edi						\n"
+			"push %%ebp						\n"
 			
 			"movl %%ecx, %%esi				\n"
 			"movl $32, %%ecx				\n"
-			"subl %%esi, %%ecx				\n"
-			"movl $-1, %%edx				\n"
-			"shll %%cl, %%edx				\n"
-			"movl %%edx, %[amask]			\n"
+			"subl %%esi, %%ecx				\n"    // ecx = 32 - bits
+			"movl $-1, %%edx				\n"    // edx = -1 (all bits set to one)
+			"shll %%cl, %%edx				\n"    // shifting (cf <- edx <- 0)  (cl times)
+			"movl %%edx, %%ebp				\n"    // ebp = edx = mask
 			"movl %%esi, %%ecx				\n"

 			"xorl %%edx, %%edx				\n"
 			"movl %%edx, %%esi				\n"
 			"addl %%edi, %%edx				\n"
-			"decl %%edx						\n"
-
+			"decl %%edx						\n"    // edx is pointing at the end of the table (on last word)
 			"orl %%eax, %%eax				\n"
-			"cmovnz %[amask], %%esi			\n"
+			"cmovnz %%ebp, %%esi			\n"    // if(c) esi=mask else esi=0

 		"1:									\n"
 			"rorl %%cl, (%%ebx,%%edx,4)		\n"

 			"movl (%%ebx,%%edx,4), %%eax	\n"
-			"andl %[amask], %%eax			\n"
+			"andl %%ebp, %%eax				\n"
 			"xorl %%eax, (%%ebx,%%edx,4)	\n"
 			"orl  %%esi, (%%ebx,%%edx,4)	\n"
 			"movl %%eax, %%esi				\n"
@@ -1280,12 +1242,10 @@ namespace ttmath
 			"roll $1, %%eax					\n"
 			"andl $1, %%eax					\n"

-			"pop %%edi						\n"
-			"pop %%esi						\n"
-			"pop %%edx						\n"
+			"pop %%ebp						\n"

-			: "=a" (c)
-			: "0" (c), "D" (b), "b" (p1), "c" (bits), [amask] "m" (mask)
+			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
+			: "0" (c),  "1" (b), "b" (p1), "c" (bits)
 			: "cc", "memory" );

 		#endif
@@ -1296,6 +1256,10 @@ namespace ttmath
 	}


+#ifdef _MSC_VER
+#pragma warning (default : 4731)
+#endif
+

 	/*
 		this method returns the number of the highest set bit in one 32-bit word
@@ -1304,7 +1268,7 @@ namespace ttmath
 	template<uint value_size>
 	sint UInt<value_size>::FindLeadingBitInWord(uint x)
 	{
-	register sint result;
+	sint result;

 		#ifndef __GNUC__
 			__asm
@@ -1324,16 +1288,17 @@ namespace ttmath


 		#ifdef __GNUC__
-			__asm__  __volatile__(
+		uint dummy;

-			"bsrl %1, %0		\n"
-			"jnz 1f				\n"
-			"movl $-1, %0		\n"
-			"1:					\n"
-
-			: "=R" (result)
-			: "R" (x)
-			: "cc" );
+				__asm__ (
+
+				"movl $-1, %1          \n"
+				"bsrl %2, %0           \n"
+				"cmovz %1, %0          \n"
+
+				: "=r" (result), "=&r" (dummy)
+				: "r" (x)
+				: "cc" );

 		#endif

@@ -1384,15 +1349,14 @@ namespace ttmath


 		#ifdef __GNUC__
-			__asm__  __volatile__(
+			__asm__ (

 			"btsl %%ebx, %%eax		\n"
-
 			"setc %%bl				\n"
 			"movzx %%bl, %%ebx		\n"
 			
 			: "=a" (v), "=b" (old_bit)
-			: "0" (v), "1" (bit)
+			: "0" (v),  "1" (bit)
 			: "cc" );

 		#endif
@@ -1423,8 +1387,8 @@ namespace ttmath
 		this has no effect in visual studio but it's useful when
 		using gcc and options like -Ox
 	*/
-	register uint result1_;
-	register uint result2_;
+	uint result1_;
+	uint result2_;

 		#ifndef __GNUC__

@@ -1448,12 +1412,12 @@ namespace ttmath

 		#ifdef __GNUC__

-		__asm__ __volatile__(
+		__asm__ (
 		
 			"mull %%edx			\n"

 			: "=a" (result1_), "=d" (result2_)
-			: "0" (a), "1" (b)
+			: "0" (a),         "1" (b)
 			: "cc" );

 		#endif
@@ -1491,8 +1455,8 @@ namespace ttmath
 	template<uint value_size>
 	void UInt<value_size>::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest)
 	{
-		register uint r_;
-		register uint rest_;
+		uint r_;
+		uint rest_;
 		/*
 			these variables have similar meaning like those in
 			the multiplication algorithm MulTwoWords
@@ -1521,12 +1485,12 @@ namespace ttmath

 		#ifdef __GNUC__
 		
-			__asm__ __volatile__(
+			__asm__ (

 			"divl %%ecx				\n"

 			: "=a" (r_), "=d" (rest_)
-			: "d" (a), "a" (b), "c" (c)
+			: "0" (b),   "1" (a), "c" (c)
 			: "cc" );

 		#endif
--- a/ttmath/ttmathuint_x86_64.h
+++ b/ttmath/ttmathuint_x86_64.h
@@ -78,7 +78,6 @@ namespace ttmath
 	uint b = value_size;
 	uint * p1 = table;
 	const uint * p2 = ss2.table;
-	uint dummy, dummy2;

 		// we don't have to use TTMATH_REFERENCE_ASSERT here
 		// this algorithm doesn't require it
@@ -88,13 +87,15 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			/*
 				this part should be compiled with gcc
 			*/
 			__asm__ __volatile__(
 	
 				"xorq %%rdx, %%rdx				\n"
-				"neg %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0
+				"negq %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0

 			"1:									\n"
 				"movq (%%rsi,%%rdx,8), %%rax	\n"
@@ -107,7 +108,7 @@ namespace ttmath
 				"adcq %%rcx, %%rcx				\n"

 				: "=c" (c), "=a" (dummy), "=d" (dummy2)
-				: "0" (b), "1" (c), "b" (p1), "S" (p2)
+				: "0" (b),  "1" (c), "b" (p1), "S" (p2)
 				: "cc", "memory" );

 		#endif
@@ -145,7 +146,6 @@ namespace ttmath
 	uint b = value_size;
 	uint * p1 = table;
 	uint c;
-	uint dummy, dummy2;

 		TTMATH_ASSERT( index < value_size )

@@ -154,7 +154,8 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
-
+		uint dummy, dummy2;
+	
 			__asm__ __volatile__(

 				"subq %%rdx, %%rcx 				\n"
@@ -172,8 +173,8 @@ namespace ttmath
 				"setc %%al						\n"
 				"movzx %%al, %%rdx				\n"

-				: "=d" (c), "=a" (dummy), "=c" (dummy2)
-				: "a" (value), "c" (b), "0" (index), "b" (p1)
+				: "=d" (c),    "=a" (dummy), "=c" (dummy2)
+				: "0" (index), "1" (value),  "2" (b), "b" (p1)
 				: "cc", "memory" );

 		#endif
@@ -223,7 +224,6 @@ namespace ttmath
 	uint b = value_size;
 	uint * p1 = table;
 	uint c;
-	uint dummy, dummy2;

 		TTMATH_ASSERT( index < value_size - 1 )

@@ -232,6 +232,8 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			__asm__ __volatile__(
 			
 				"subq %%rdx, %%rcx 				\n"
@@ -254,7 +256,7 @@ namespace ttmath
 				"movzx %%al, %%rax				\n"

 				: "=a" (c), "=c" (dummy), "=d" (dummy2)
-				: "1" (b), "2" (index), "b" (p1), "S" (x1), "0" (x2)
+				: "0" (x2), "1" (b),      "2" (index), "b" (p1), "S" (x1)
 				: "cc", "memory" );

 		#endif
@@ -299,10 +301,10 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
+		uint dummy1, dummy2, dummy3;	
 			
-		//	this part should be compiled with gcc
-		uint dummy1, dummy2, dummy3;
-
+			//	this part should be compiled with gcc
+		
 			__asm__ __volatile__(
 				"mov %%rdx, %%r8					\n"
 				"xor %%rdx, %%rdx					\n"   // rdx = 0, cf = 0
@@ -320,10 +322,9 @@ namespace ttmath
 				"or %%r8, %%r8						\n"
 				"jz 3f								\n"
 				
-				"xor %%rbx, %%rbx					\n"
-				"sub %%rcx, %%rbx					\n"   // setting cf from rcx
+				"xor %%rbx, %%rbx					\n"   // rbx = 0
+				"neg %%rcx							\n"   // setting cf from rcx
 				"mov %%r8, %%rcx					\n"   // rcx=rest and is != 0
-				"mov $0, %%rbx						\n"
 			"2:										\n"
 				"mov (%%rsi, %%rdx, 8), %%rax		\n"
 				"adc %%rbx, %%rax 					\n"
@@ -367,7 +368,7 @@ namespace ttmath
 	uint b = value_size;
 	uint * p1 = table;
 	const uint * p2 = ss2.table;
-	uint dummy, dummy2;
+	

 		// we don't have to use TTMATH_REFERENCE_ASSERT here
 		// this algorithm doesn't require it
@@ -377,10 +378,12 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 			__asm__  __volatile__(
 	
 				"xorq %%rdx, %%rdx				\n"
-				"neg %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0
+				"negq %%rax						\n"     // CF=1 if rax!=0 , CF=0 if rax==0

 			"1:									\n"
 				"movq (%%rsi,%%rdx,8), %%rax	\n"
@@ -393,10 +396,9 @@ namespace ttmath
 				"adcq %%rcx, %%rcx				\n"

 				: "=c" (c), "=a" (dummy), "=d" (dummy2)
-				: "0" (b), "1" (c), "b" (p1), "S" (p2)
+				: "0" (b),  "1" (c), "b" (p1), "S" (p2)
 				: "cc", "memory" );

-
 		#endif

 		TTMATH_LOG("UInt::Sub")
@@ -456,8 +458,8 @@ namespace ttmath
 				"setc %%al						\n"
 				"movzx %%al, %%rdx				\n"

-				: "=d" (c), "=a" (dummy), "=c" (dummy2)
-				: "1" (value), "2" (b), "0" (index), "b" (p1)
+				: "=d" (c),    "=a" (dummy), "=c" (dummy2)
+				: "0" (index), "1" (value),  "2" (b), "b" (p1)
 				: "cc", "memory" );

 		#endif
@@ -528,10 +530,9 @@ namespace ttmath
 				"or %%r8, %%r8						\n"
 				"jz 3f								\n"
 				
-				"xor %%rbx, %%rbx					\n"
-				"sub %%rcx, %%rbx					\n"   // setting cf from rcx
+				"xor %%rbx, %%rbx					\n"   // rbx = 0
+				"neg %%rcx							\n"   // setting cf from rcx
 				"mov %%r8, %%rcx					\n"   // rcx=rest and is != 0
-				"mov $0, %%rbx						\n"
 			"2:										\n"
 				"mov (%%rsi, %%rdx, 8), %%rax		\n"
 				"sbb %%rbx, %%rax 					\n"
@@ -576,17 +577,19 @@ namespace ttmath
 	{
 	sint b = value_size;
 	uint * p1 = table;
-	uint dummy, dummy2;
+	

 		#ifndef __GNUC__
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif

 		#ifdef __GNUC__
+		uint dummy, dummy2;
+
 		__asm__  __volatile__(
 		
 			"xorq %%rdx, %%rdx			\n"   // rdx=0
-			"neg %%rax					\n"   // CF=1 if rax!=0 , CF=0 if rax==0
+			"negq %%rax					\n"   // CF=1 if rax!=0 , CF=0 if rax==0

 		"1:								\n"
 			"rclq $1, (%%rbx, %%rdx, 8)	\n"
@@ -598,7 +601,7 @@ namespace ttmath
 			"adcq %%rcx, %%rcx			\n"

 			: "=c" (c), "=a" (dummy), "=d" (dummy2)
-			: "1" (c), "0" (b), "b" (p1)
+			: "0" (b),  "1" (c), "b" (p1)
 			: "cc", "memory" );
 	
 		#endif
@@ -628,16 +631,18 @@ namespace ttmath
 	{
 	sint b = value_size;
 	uint * p1 = table;
-	uint dummy;
+	

 		#ifndef __GNUC__
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif

 		#ifdef __GNUC__
+		uint dummy;
+
 		__asm__  __volatile__(

-			"neg %%rax						\n"   // CF=1 if rax!=0 , CF=0 if rax==0
+			"negq %%rax						\n"   // CF=1 if rax!=0 , CF=0 if rax==0

 		"1:									\n"
 			"rcrq $1, -8(%%rbx, %%rcx, 8)	\n"
@@ -648,7 +653,7 @@ namespace ttmath
 			"adcq %%rcx, %%rcx				\n"

 			: "=c" (c), "=a" (dummy)
-			: "1" (c), "0" (b), "b" (p1)
+			: "0" (b),  "1" (c), "b" (p1)
 			: "cc", "memory" );

 		#endif
@@ -681,13 +686,15 @@ namespace ttmath

 	uint b = value_size;
 	uint * p1 = table;
-	uint dummy, dummy2, dummy3;
+

 		#ifndef __GNUC__
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif

 		#ifdef __GNUC__
+		uint dummy, dummy2, dummy3;
+
 		__asm__  __volatile__(
 		
 			"movq %%rcx, %%rsi				\n"
@@ -700,7 +707,6 @@ namespace ttmath

 			"xorq %%rdx, %%rdx				\n"
 			"movq %%rdx, %%rsi				\n"
-
 			"orq %%rax, %%rax				\n"
 			"cmovnz %%r8, %%rsi				\n"

@@ -720,7 +726,7 @@ namespace ttmath
 			"and $1, %%rax					\n"

 			: "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
-			: "0" (c), "1" (b), "b" (p1), "c" (bits)
+			: "0" (c),  "1" (b), "b" (p1), "c" (bits)
 			: "%r8", "cc", "memory" );

 		#endif
@@ -774,7 +780,6 @@ namespace ttmath
 			"movq %%rdx, %%rsi				\n"
 			"addq %%rdi, %%rdx				\n"
 			"decq %%rdx						\n"
-
 			"orq %%rax, %%rax				\n"
 			"cmovnz %%R8, %%rsi				\n"

@@ -782,7 +787,7 @@ namespace ttmath
 			"rorq %%cl, (%%rbx,%%rdx,8)		\n"

 			"movq (%%rbx,%%rdx,8), %%rax	\n"
-			"andq %%R8, %%rax			\n"
+			"andq %%R8, %%rax				\n"
 			"xorq %%rax, (%%rbx,%%rdx,8)	\n"
 			"orq  %%rsi, (%%rbx,%%rdx,8)	\n"
 			"movq %%rax, %%rsi				\n"
@@ -815,23 +820,25 @@ namespace ttmath
 	template<uint value_size>
 	sint UInt<value_size>::FindLeadingBitInWord(uint x)
 	{
-	register sint result;
+	sint result;
+

 		#ifndef __GNUC__
 			#error "another compiler than GCC is currently not supported in 64bit mode"
 		#endif

 		#ifdef __GNUC__
-			__asm__  __volatile__(
+		uint dummy;

-			"bsrq %1, %0		\n"
-			"jnz 1f				\n"
-			"movq $-1, %0		\n"
-			"1:					\n"
-
-			: "=R" (result)
-			: "R" (x)
-			: "cc" );
+				__asm__ (
+
+				"movq $-1, %1          \n"
+				"bsrq %2, %0           \n"
+				"cmovz %1, %0          \n"
+
+				: "=r" (result), "=&r" (dummy)
+				: "r" (x)
+				: "cc" );

 		#endif

@@ -867,15 +874,15 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
-			__asm__  __volatile__(
+
+			__asm__ (

 			"btsq %%rbx, %%rax		\n"
-
 			"setc %%bl				\n"
 			"movzx %%bl, %%rbx		\n"
 			
 			: "=a" (v), "=b" (old_bit)
-			: "0" (v), "1" (bit)
+			: "0" (v),  "1" (bit)
 			: "cc" );

 		#endif
@@ -914,8 +921,8 @@ namespace ttmath
 		this has no effect in visual studio but it's usefull when
 		using gcc and options like -O
 	*/
-	register uint result1_;
-	register uint result2_;
+	uint result1_;
+	uint result2_;

 		#ifndef __GNUC__
 			#error "another compiler than GCC is currently not supported in 64bit mode"
@@ -923,12 +930,12 @@ namespace ttmath

 		#ifdef __GNUC__

-		__asm__ __volatile__(
+		__asm__ (
 		
 			"mulq %%rdx			\n"

 			: "=a" (result1_), "=d" (result2_)
-			: "0" (a), "1" (b)
+			: "0" (a),         "1" (b)
 			: "cc" );

 		#endif
@@ -965,8 +972,8 @@ namespace ttmath
 	template<uint value_size>
 	void UInt<value_size>::DivTwoWords(uint a,uint b, uint c, uint * r, uint * rest)
 	{
-		register uint r_;
-		register uint rest_;
+		uint r_;
+		uint rest_;
 		/*
 			these variables have similar meaning like those in
 			the multiplication algorithm MulTwoWords
@@ -980,7 +987,7 @@ namespace ttmath

 		#ifdef __GNUC__
 		
-			__asm__ __volatile__(
+			__asm__ (

 			"divq %%rcx				\n"