changed: asm code in UInt::Add, UInt::AddInt, AddTwoInts
         (32 and 64 bits), much faster now
added:   tests for UInt::AddInt
fixed:   tests: test_lahf() returned an incorrect value on 32-bit platforms

git-svn-id: svn://ttmath.org/publicrep/ttmath/trunk@82 e52654a7-88a9-db11-a3e9-0013d4bc506e
2008-10-25 20:05:51 +00:00 · 2008-10-25 20:05:51 +00:00 · cfd719cca2
parent f1115a2ce9
commit cfd719cca2
7 changed files with 280 additions and 320 deletions
--- a/tests/Makefile
+++ b/tests/Makefile
@ -1,6 +1,6 @@
 CC     = g++
 o      = main.o uinttest.o
-CFLAGS = -Wall -pedantic
+CFLAGS = -Wall
 ttmath = ..
 name   = tests

--- a/tests/main.cpp
+++ b/tests/main.cpp
@ -40,6 +40,7 @@
 #include <ttmath/ttmath.h>
 #include "uinttest.h"

+//#include <windows.h>

 const char uint_tests_file[] = "tests.uint32";

@ -90,9 +91,13 @@ bool test_lahf()
 	std::cout << "fail" << std::endl;


+	return false;
+
 #endif

-return false;
+
+	// 32bit platform
+return true;
 }


@ -117,7 +122,5 @@ using namespace ttmath;



-
-
 return 0;
 }
--- a/tests/tests.uint32
+++ b/tests/tests.uint32
@ -22,6 +22,42 @@ add   192        192        6277101735386680763835789423207666416102355444464034
 add   192        192        6277101735386680763835789423207666416102355444464034512895 0 6277101735386680763835789423207666416102355444464034512895 0


+# AddInt
+
+#       min_bits max_bits   bits_per_int a               b(int)       index         result           carry
+addint  32       0          32           0               0            0             0                0
+addint  32       0          32           1000            2000         0             3000             0
+addint  64       0          32           562342345       1423445      1             6113650284997065 0                
+addint  64       0          32           5342342455      3423553423  0              8765895878 0
+addint  96       0          32           478895734       46756734     2             862509505820513898647477878                0
+addint  128      0          32           27370506140054471803784984408165997441 24543 3 27372450636847059393422542757339093889 0
+addint  128      128        32           340282366841711102552375003685868034945 2234543 3 177038656721750864719686733515479937 1
+addint  160      160        32           1461501637330902918124457471805283415910032366465 3 3 158457126631793409034731674497 1
+addint  192      0          32           6277101735386680763835789423128439055191355840718134336385 3354 1 6277101735386680763835789423128439055191355855123454647169 0
+addint  192      192        32           6277101735386680763835789423128439055191355840718134336385 3354 5 4901876491607848387655079701569502248322251848964993 1
+
+addint  64       0          64           0               0            0             0                0
+addint  64       0          64           5342342 345534234 0 350876576 0
+addint  64       0          64           5342342455 34235534234 0 39577876689 0
+addint  64       64         64           18446744073709550615 2000 0 999 1
+addint  128      0          64           42895062544824211012058135 3453234 0 42895062544824211015511369 0
+addint  128      0          64           42895062544824211012058135 456234234 1 8458931214807741031021280279 0
+addint  128      128        64           340282366920938426569886460012664978455 45623 1 804702316727431770143767 1
+addint  192      192        64           6277101735386680763835789423207666379208867297044931279895 45623234 1 841563227924816702308613143 1
+addint  192      192        64           6277101735386679588840776445207152040176347835149297122327 45623234 2 15523607057094857017675614218510090830281178135 1
+addint  192      192        64           6277101735386680763835789423207666416102355444464034512895 1 0 0 1
+
+
+
+
+
+
+
+
+
+
+
+



--- a/tests/uinttest.cpp
+++ b/tests/uinttest.cpp
@ -46,9 +46,9 @@ void UIntTest::set_file_name(const std::string & f)
 }


-int UIntTest::read_int()
+uuint UIntTest::read_uint()
 {
-int result = 0;
+uuint result = 0;
 	
 	skip_white_characters();

@ -60,39 +60,55 @@ return result;



-template<unsigned int type_size>
-void UIntTest::test_add()
+bool UIntTest::check_minmax_bits(int type_size)
 {
-using namespace ttmath;
-
-	UInt<type_size> a,b,result, new_result;
-
-	int min_bits = read_int();
-	int max_bits = read_int();
+	int min_bits = read_uint();
+	int max_bits = read_uint();

 	if( min_bits != 0 && type_size * TTMATH_BITS_PER_UINT < (unsigned int)min_bits )
-		return;
+		return false;

 	if( max_bits != 0 && type_size * TTMATH_BITS_PER_UINT > (unsigned int)max_bits )
-		return;
+		return false;

-	a.FromString(pline, 10, &pline);
-	b.FromString(pline, 10, &pline);
-	result.FromString(pline, 10, &pline);
-	int carry = read_int();
+return true;
+}

-	std::cerr << '[' << row << "] Add<" << type_size << ">: ";

+bool UIntTest::check_minmax_bits_bitperint(int type_size)
+{
+	if( !check_minmax_bits(type_size) )
+		return false;
+
+	int bits = read_uint();
+
+	if( TTMATH_BITS_PER_UINT != bits )
+		return false;
+
+return true;
+}
+
+
+
+bool UIntTest::check_end()
+{
 	skip_white_characters();
+
 	if( *pline!='#' && *pline!= 0 )
 	{
 		std::cerr << "syntax error" << std::endl;
-		return;
+		return false;
 	}

-	new_result = a;
-	int new_carry = new_result.Add(b);
-	bool ok = true;
+return true;
+}
+
+
+template<uuint type_size>
+bool UIntTest::check_result_carry(const ttmath::UInt<type_size> & result, const ttmath::UInt<type_size> & new_result,
+						int carry, int new_carry)
+{
+bool ok = true;

 	if( new_carry != carry )
 	{
@ -106,15 +122,67 @@ using namespace ttmath;
 		ok = false;
 	}

-	if( ok )
-	{
-		std::cerr << "ok" << std::endl;
-	}
+return ok;
 }




+
+
+template<uuint type_size>
+void UIntTest::test_add()
+{
+	UInt<type_size> a,b,result, new_result;
+
+	if( !check_minmax_bits(type_size) )
+		return;
+
+	a.FromString(pline, 10, &pline);
+	b.FromString(pline, 10, &pline);
+	result.FromString(pline, 10, &pline);
+	int carry = read_uint();
+
+	std::cerr << '[' << row << "] Add<" << type_size << ">: ";
+
+	if( !check_end() )
+		return;
+
+	new_result = a;
+	int new_carry = new_result.Add(b);
+
+	if( check_result_carry(result, new_result, carry, new_carry) )
+		std::cerr << "ok" << std::endl;
+}
+
+
+template<uuint type_size>
+void UIntTest::test_addint()
+{
+	UInt<type_size> a, result, new_result;
+
+	if( !check_minmax_bits_bitperint(type_size) )
+		return;
+
+	a.FromString(pline, 10, &pline);
+	uuint b = read_uint();
+	uuint index = read_uint();
+	result.FromString(pline, 10, &pline);
+	int carry = read_uint();
+
+	std::cerr << '[' << row << "] AddInt<" << type_size << ">: ";
+
+	if( !check_end() )
+		return;
+
+	new_result = a;
+	int new_carry = new_result.AddInt(b, index);
+
+	if( check_result_carry(result, new_result, carry, new_carry) )
+		std::cerr << "ok" << std::endl;
+}
+
+
 int UIntTest::upper_char(int c)
 {
 	if( c>='a' && c<='z' )
@ -191,6 +259,19 @@ const char * p = pline;
 		pline = p; test_add<9>();
 	}
 	else
+	if( method == "ADDINT" )
+	{
+		pline = p; test_addint<1>();
+		pline = p; test_addint<2>();
+		pline = p; test_addint<3>();
+		pline = p; test_addint<4>();
+		pline = p; test_addint<5>();
+		pline = p; test_addint<6>();
+		pline = p; test_addint<7>();
+		pline = p; test_addint<8>();
+		pline = p; test_addint<9>();
+	}
+	else
 	{
 		std::cerr << '[' << row << "] ";
 		std::cerr << "method " << method << " is not supported" << std::endl;
--- a/tests/uinttest.h
+++ b/tests/uinttest.h
@ -45,7 +45,8 @@

 #include <ttmath/ttmath.h>

-
+using namespace ttmath;
+typedef ttmath::uint uuint;


 class UIntTest
@ -70,8 +71,11 @@ public:

 	void go();

-	template<unsigned int type_size>
-	void test_add();
+	template<uuint type_size> void test_add();
+	template<uuint type_size> void test_addint();
+
+	template<uuint type_size> bool check_result_carry(const ttmath::UInt<type_size> & result, const ttmath::UInt<type_size> & new_result,
+						int carry, int new_carry);


 int upper_char(int c);
@ -80,8 +84,10 @@ void skip_white_characters();
 bool read_method();
 void test_method();
 bool check_line();
-int read_int();
-
+uuint read_uint();
+bool check_minmax_bits(int type_size);
+bool check_minmax_bits_bitperint(int type_size);
+bool check_end();


 };
--- a/ttmath/ttmathuint.h
+++ b/ttmath/ttmathuint.h
@ -249,48 +249,46 @@ public:
 		#ifndef __GNUC__
 			
 			//	this part might be compiled with for example visual c
-			
+
 			__asm
 			{
 				push eax
 				push ebx
 				push ecx
 				push edx
+				push esi

 				mov ecx,[b]
 				
 				mov ebx,[p1]
-				mov edx,[p2]
+				mov esi,[p2]

-				xor eax,eax
-				sub eax,[c]
+				xor eax,eax  // eax=0
+				mov edx,eax  // edx=0
+
+				sub eax,[c]  // CF=c

-				lahf   //  flags -> AH   (flags: SF ZF AF PF CF)
 			p:
-				sahf   //  AH -> flags   (flags: SF ZF AF PF CF)
-				mov eax,[ebx]
-				adc eax,[edx]
-				mov [ebx],eax
-				lahf
-				
-				add ebx,4
-				add edx,4
+				mov eax,[esi+edx*4]
+				adc [ebx+edx*4],eax

+				inc edx
 				dec ecx
 			jnz p

-				// checking carry from the last word
-				// CF = bit 0
-				test ah,1
-				setnz al
+				setc al
 				movzx edx, al
 				mov [c], edx

+				pop esi
 				pop edx
 				pop ecx
 				pop ebx
 				pop eax
 			}
+
+
+
 		#endif		
 			

@ -300,37 +298,28 @@ public:
 			
 			__asm__ __volatile__(
 			
-				"push %%ebx				\n"
-				"push %%ecx				\n"
-				"push %%edx				\n"
+				"push %%ecx						\n"
 			
-				"xorl %%eax, %%eax		\n"
-				"subl %%esi, %%eax		\n"
+				"xorl %%eax, %%eax				\n"
+				"movl %%eax, %%edx				\n"
+				"subl %%edi, %%eax				\n"

-				"lahf					\n"
-			"1:							\n"
-				"sahf					\n"
-				"movl (%%ebx),%%eax		\n"
-				"adcl (%%edx),%%eax		\n"
-				"movl %%eax,(%%ebx)		\n"
-				"lahf					\n"
-				
-				"add $4,%%ebx			\n"
-				"add $4,%%edx			\n"

-				"decl %%ecx				\n"
-			"jnz 1b						\n"
+			"1:									\n"
+				"movl (%%esi,%%edx,4),%%eax		\n"
+				"adcl %%eax, (%%ebx,%%edx,4)	\n"
+			
+				"incl %%edx						\n"
+				"decl %%ecx						\n"
+			"jnz 1b								\n"

-				"test $1,%%ah			\n"
-				"setnz %%al				\n"
-				"movzx %%al,%%esi		\n"
+				"setc %%al						\n"
+				"movzx %%al,%%edx				\n"

-				"pop %%edx				\n"
-				"pop %%ecx				\n"
-				"pop %%ebx				\n"
+				"pop %%ecx						\n"

-				: "=S" (c)
-				: "0" (c), "c" (b), "b" (p1), "d" (p2)
+				: "=d" (c)
+				: "D" (c), "c" (b), "b" (p1), "S" (p2)
 				: "%eax", "cc", "memory" );

 		#endif
@ -366,106 +355,71 @@ public:
 	register uint c;

 		#ifndef __GNUC__
+
 			__asm
 			{
 				push eax
 				push ebx
 				push ecx
 				push edx
-				push edi

 				mov ecx, [b]
 				sub ecx, [index]				

 				mov edx, [index]
-				mov eax, [p1]
-			
-				lea ebx, [eax+4*edx]
-				mov edx, [value]
+				mov ebx, [p1]

-				mov edi,1
+				mov eax, [value]

-				clc
-				lahf
 			p:
-				sahf	; restore flags
-				mov eax, [ebx]
-				adc eax, edx
-				mov [ebx], eax
-				lahf	; save flags
+				add [ebx+edx*4], eax
+			jnc end

-			cmovnc ecx,edi
-				xor edx,edx
-				add ebx,4
-
-			sub ecx,1
+				mov eax, 1
+				inc edx
+				dec ecx
 			jnz p

-//			end:
-
-				test ah,1
-				setnz al
-
-				//
-				// movzx dword ptr [c],al
-				//
+			end:
+				setc al
 				movzx edx, al
 				mov [c], edx
-				// 

-				pop edi
 				pop edx
 				pop ecx
 				pop ebx
 				pop eax
 			}
+
 		#endif		
 			

 		#ifdef __GNUC__
 			__asm__ __volatile__(
 			
-				"push %%ebx						\n"
+				"push %%eax						\n"
 				"push %%ecx						\n"
-				"push %%edx						\n"
-				"push %%edi						\n"

 				"subl %%edx, %%ecx 				\n"

-				"leal (%%ebx,%%edx,4), %%ebx 	\n"
-
-				"movl %%esi, %%edx				\n"
-				"movl $1, %%edi					\n"
-				"clc							\n"
-				"lahf							\n"
 			"1:									\n"
-				"sahf							\n"
-				"movl (%%ebx), %%eax			\n"
-				"adcl %%edx, %%eax				\n"
-				"movl %%eax, (%%ebx)			\n"
-				"lahf							\n"
-
-			"cmovnc %%edi,%%ecx					\n"
-
-				"xorl %%edx, %%edx				\n"
-
-				"addl $4,%%ebx					\n"
-
-			"subl $1,%%ecx						\n"
+				"addl %%eax, (%%ebx,%%edx,4)	\n"
+			"jnc 2f								\n"
+				
+				"movl $1, %%eax					\n"
+				"incl %%edx						\n"
+				"decl %%ecx						\n"
 			"jnz 1b								\n"

+			"2:									\n"
+				"setc %%al						\n"
+				"movzx %%al, %%edx				\n"

-				"test $1,%%ah					\n"
-				"setnz %%al						\n"
-				"movzx %%al,%%eax				\n"
-
-				"pop %%edi						\n"
-				"pop %%edx						\n"
 				"pop %%ecx						\n"
-				"pop %%ebx						\n"
+				"pop %%eax						\n"

-				: "=a" (c)
-				: "c" (b), "d" (index), "b" (p1), "S" (value)
+				: "=d" (c)
+				: "a" (value), "c" (b), "0" (index), "b" (p1)
 				: "cc", "memory" );

 		#endif
@ -523,59 +477,35 @@ public:
 				mov ecx, [b]
 				sub ecx, [index]				

+				mov ebx, [p1]
 				mov edx, [index]
-				mov eax, [p1]
+
+				mov eax, [x1]
+				add [ebx+edx*4], eax
+				inc edx
+				dec ecx
+
+				mov eax, [x2]
 			
-				lea ebx, [eax+4*edx]
-
-				xor edx,edx
-
-				mov eax, [ebx]
-				add eax, [x1]
-				mov [ebx], eax
-
-				setc al
-				movzx eax,al
-				add ebx,4
-
-				add eax, [ebx]
-				add eax, [x2]
-				mov [ebx], eax
-			jnc end
-
-				dec ecx
-				dec ecx
-			jz end
-
 			p:
-				inc ebx
-				inc ebx
-				inc ebx
-				inc ebx
-
-				mov eax,[ebx]
-				adc eax, edx
-				mov [ebx], eax
-
+				adc [ebx+edx*4], eax
 			jnc end

-			loop p
+				mov eax, 0
+				inc edx
+				dec ecx
+			jnz p

 			end:
-
 				setc al
-
-				//
-				// movzx dword ptr [c],al
-				//
 				movzx edx, al
 				mov [c], edx
-				//
-
+				
 				pop edx
 				pop ecx
 				pop ebx
 				pop eax
+
 			}
 		#endif		
 			
@ -583,59 +513,33 @@ public:
 		#ifdef __GNUC__
 			__asm__ __volatile__(
 			
-				"push %%ebx						\n"
 				"push %%ecx						\n"
 				"push %%edx						\n"

 				"subl %%edx, %%ecx 				\n"
 				
-				"leal (%%ebx,%%edx,4), %%ebx 	\n"
-
-				"xorl %%edx, %%edx				\n"
-
-				"movl (%%ebx), %%eax			\n"
-				"addl %%esi, %%eax					\n"
-				"movl %%eax, (%%ebx)			\n"
-
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-
-				"movl (%%ebx), %%eax			\n"
-				"adcl %%edi, %%eax					\n"
-				"movl %%eax, (%%ebx)			\n"
-			"jnc 2f								\n"
-
-				"dec %%ecx						\n"
-				"dec %%ecx						\n"
-			"jz 2f								\n"
+				"addl %%esi, (%%ebx,%%edx,4) 	\n"
+				"incl %%edx						\n"
+				"decl %%ecx						\n"

 			"1:									\n"
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-				"inc %%ebx						\n"
-
-				"movl (%%ebx), %%eax			\n"
-				"adcl %%edx, %%eax				\n"
-				"movl %%eax, (%%ebx)			\n"
-
+				"adcl %%eax, (%%ebx,%%edx,4)	\n"
 			"jnc 2f								\n"

-			"loop 1b							\n"
+				"mov $0, %%eax					\n"
+				"incl %%edx						\n"
+				"decl %%ecx						\n"
+			"jnz 1b								\n"

 			"2:									\n"
-
 				"setc %%al						\n"
-				"movzx %%al,%%eax				\n"
+				"movzx %%al, %%eax				\n"

 				"pop %%edx						\n"
 				"pop %%ecx						\n"
-				"pop %%ebx						\n"

 				: "=a" (c)
-				: "c" (b), "d" (index), "b" (p1), "S" (x1), "D" (x2)
+				: "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2)
 				: "cc", "memory" );

 		#endif
--- a/ttmath/ttmathuint64.h
+++ b/ttmath/ttmathuint64.h
@ -221,47 +221,28 @@ namespace ttmath
 			*/
 			__asm__ __volatile__(
 			
-				"push %%rbx				\n"
-				"push %%rcx				\n"
-				"push %%rdx				\n"
+				"push %%rcx						\n"
 			
-				"xorq %%rax, %%rax		\n"
-				"subq %%rsi, %%rax		\n"
+				"xorq %%rax, %%rax				\n"
+				"movq %%rax, %%rdx				\n"
+				"subq %%rdi, %%rax				\n"

-				//"lahf					\n"
-				// in order to use this instruction one need to use -msahf option of the GCC
-				// but in my compiler (gcc version 4.2.1) there is no such option
-				// at the moment I'm using the opcode of this instruction
-				// In the future this can be simply change into 'lahf'
-				".byte 0x9f				\n"

-			"1:							\n"
-				//"sahf					\n"
-				".byte 0x9e				\n"
+			"1:									\n"
+				"movq (%%rsi,%%rdx,8),%%rax		\n"
+				"adcq %%rax, (%%rbx,%%rdx,8)	\n"
 			
-				"movq (%%rbx),%%rax		\n"
-				"adcq (%%rdx),%%rax		\n"
-				"movq %%rax,(%%rbx)		\n"
+				"incq %%rdx						\n"
+				"decq %%rcx						\n"
+			"jnz 1b								\n"

-				//"lahf					\n"
-				".byte 0x9f				\n"
+				"setc %%al						\n"
+				"movzx %%al,%%rdx				\n"

-				"addq $8, %%rbx			\n"
-				"addq $8, %%rdx			\n"
-				
-				"decq %%rcx				\n"
-			"jnz 1b					\n"
+				"pop %%rcx						\n"

-				"test $1, %%ah			\n"
-				"setnz %%al				\n"
-				"movzx %%al, %%rsi		\n"
-
-				"pop %%rdx				\n"
-				"pop %%rcx				\n"
-				"pop %%rbx				\n"
-
-				: "=S" (c)
-				: "0" (c), "c" (b), "b" (p1), "d" (p2)
+				: "=d" (c)
+				: "D" (c), "c" (b), "b" (p1), "S" (p2)
 				: "%rax", "cc", "memory" );

 		#endif
@ -305,49 +286,32 @@ namespace ttmath
 		#endif

 		#ifdef __GNUC__
+
 			__asm__ __volatile__(
-				"push %%rbx						\n"
+
+				"push %%rax						\n"
 				"push %%rcx						\n"
-				"push %%rdx						\n"

 				"subq %%rdx, %%rcx 				\n"

-				"leaq (%%rbx,%%rdx,8), %%rbx 	\n"
-
-				"movq %%rsi, %%rdx				\n"
-				"clc							\n"
 			"1:									\n"
-
-				"movq (%%rbx), %%rax			\n"
-				"adcq %%rdx, %%rax				\n"
-				"movq %%rax, (%%rbx)			\n"
-
+				"addq %%rax, (%%rbx,%%rdx,8)	\n"
 			"jnc 2f								\n"
-
-				"movq $0, %%rdx					\n"
-
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-
-			"loop 1b							\n"
+				
+				"movq $1, %%rax					\n"
+				"incq %%rdx						\n"
+				"decq %%rcx						\n"
+			"jnz 1b								\n"

 			"2:									\n"
+				"setc %%al						\n"
+				"movzx %%al, %%rdx				\n"

-				"movq $0, %%rax					\n"
-				"adcq %%rax,%%rax				\n"
-
-				"pop %%rdx						\n"
 				"pop %%rcx						\n"
-				"pop %%rbx						\n"
+				"pop %%rax						\n"

-				: "=a" (c)
-				: "c" (b), "d" (index), "b" (p1), "S" (value)
+				: "=d" (c)
+				: "a" (value), "c" (b), "0" (index), "b" (p1)
 				: "cc", "memory" );

 		#endif
@ -404,67 +368,33 @@ namespace ttmath
 		#ifdef __GNUC__
 			__asm__ __volatile__(
 			
-				"push %%rbx						\n"
 				"push %%rcx						\n"
 				"push %%rdx						\n"

 				"subq %%rdx, %%rcx 				\n"
 				
-				"leaq (%%rbx,%%rdx,8), %%rbx 	\n"
-
-				"movq $0, %%rdx					\n"
-
-				"movq (%%rbx), %%rax			\n"
-				"addq %%rsi, %%rax				\n"
-				"movq %%rax, (%%rbx)			\n"
-
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-
-				"movq (%%rbx), %%rax			\n"
-				"adcq %%rdi, %%rax				\n"
-				"movq %%rax, (%%rbx)			\n"
-			"jnc 2f								\n"
-
-				"dec %%rcx						\n"
-				"dec %%rcx						\n"
-			"jz 2f								\n"
+				"addq %%rsi, (%%rbx,%%rdx,8) 	\n"
+				"incq %%rdx						\n"
+				"decq %%rcx						\n"

 			"1:									\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-				"inc %%rbx						\n"
-
-				"movq (%%rbx), %%rax			\n"
-				"adcq %%rdx, %%rax				\n"
-				"movq %%rax, (%%rbx)			\n"
-
+				"adcq %%rax, (%%rbx,%%rdx,8)	\n"
 			"jnc 2f								\n"

-			"loop 1b							\n"
+				"mov $0, %%rax					\n"
+				"incq %%rdx						\n"
+				"decq %%rcx						\n"
+			"jnz 1b								\n"

 			"2:									\n"
-
-				"movq $0, %%rax					\n"
-				"adcq %%rax,%%rax				\n"
+				"setc %%al						\n"
+				"movzx %%al, %%rax				\n"

 				"pop %%rdx						\n"
 				"pop %%rcx						\n"
-				"pop %%rbx						\n"

 				: "=a" (c)
-				: "c" (b), "d" (index), "b" (p1), "S" (x1), "D" (x2)
+				: "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2)
 				: "cc", "memory" );

 		#endif