From de1e7ac957d9e16dfca091d7db593bb35648d56a Mon Sep 17 00:00:00 2001 From: Christian Kaiser Date: Wed, 20 May 2009 08:48:51 +0000 Subject: [PATCH] more optimizations for MSVC assembler (parallelism, prefetch optimization, loop alignment, ...) git-svn-id: svn://ttmath.org/publicrep/ttmath/branches/chk@151 e52654a7-88a9-db11-a3e9-0013d4bc506e --- ttmath/ttmathbig.h | 2 +- ttmath/ttmathconfig.h | 9 +- ttmath/ttmathuint_x86.h | 2288 +++++++++++++------------- ttmath/ttmathuint_x86_amd64_msvc.asm | 77 +- 4 files changed, 1195 insertions(+), 1181 deletions(-) diff --git a/ttmath/ttmathbig.h b/ttmath/ttmathbig.h index 978d8e8..b97e238 100644 --- a/ttmath/ttmathbig.h +++ b/ttmath/ttmathbig.h @@ -3434,7 +3434,7 @@ private: */ int FromString_ReadScientificIfExists(const tchar_t * & source) { - int c = 0; + uint c = 0; bool scientific_read = false; const tchar_t * before_scientific = source; diff --git a/ttmath/ttmathconfig.h b/ttmath/ttmathconfig.h index 785bc8e..13c0f8c 100644 --- a/ttmath/ttmathconfig.h +++ b/ttmath/ttmathconfig.h @@ -37,6 +37,7 @@ namespace ttmath { #if defined(_MSC_VER) + #include #if defined(_UNICODE) typedef wchar_t tchar_t; typedef std::wstring tstr_t; @@ -71,20 +72,20 @@ namespace ttmath public: clsCrit(void) { - ::InitializeCriticalSection(&_Crit); + InitializeCriticalSection(&_Crit); } virtual ~clsCrit(void) { - ::DeleteCriticalSection(&_Crit); + DeleteCriticalSection(&_Crit); } void Enter(void) const { - ::EnterCriticalSection(&_Crit); + EnterCriticalSection(&_Crit); } void Leave(void) const { - ::LeaveCriticalSection(&_Crit); + LeaveCriticalSection(&_Crit); } }; diff --git a/ttmath/ttmathuint_x86.h b/ttmath/ttmathuint_x86.h index e2b404a..67cde3c 100644 --- a/ttmath/ttmathuint_x86.h +++ b/ttmath/ttmathuint_x86.h @@ -1,1140 +1,1148 @@ -/* - * This file is a part of TTMath Bignum Library - * and is distributed under the (new) BSD licence. - * Author: Tomasz Sowa - */ - -/* - * Copyright (c) 2006-2009, Tomasz Sowa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name Tomasz Sowa nor the names of contributors to this - * project may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - - -#ifndef headerfilettmathuint_x86 -#define headerfilettmathuint_x86 - - -#ifndef TTMATH_NOASM -#ifdef TTMATH_PLATFORM32 - - -/*! - \file ttmathuint_x86.h - \brief template class UInt with assembler code for 32bit x86 processors - - this file is included at the end of ttmathuint.h -*/ - - - -/*! - \brief a namespace for the TTMath library -*/ -namespace ttmath -{ - - /*! - * - * basic mathematic functions - * - */ - - - /*! - adding ss2 to the this and adding carry if it's defined - (this = this + ss2 + c) - - c must be zero or one (might be a bigger value than 1) - function returns carry (1) (if it has been) - */ - template - uint UInt::Add(const UInt & ss2, uint c) - { - register uint b = value_size; - register uint * p1 = table; - register uint * p2 = const_cast(ss2.table); - - // we don't have to use TTMATH_REFERENCE_ASSERT here - // this algorithm doesn't require it - - #ifndef __GNUC__ - - // this part might be compiled with for example visual c - - __asm - { - xor eax,eax // eax=0 - mov ecx,[b] - mov edx,eax // edx=0 - mov ebx,[p1] - mov esi,[p2] - - sub eax,[c] // CF=c - - p: - mov eax,[esi+edx*4] - adc [ebx+edx*4],eax - - inc edx - dec ecx - jnz p - - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - - // this part should be compiled with gcc - - __asm__ __volatile__( - - "push %%ecx \n" - - "xorl %%eax, %%eax \n" - "movl %%eax, %%edx \n" - "subl %%edi, %%eax \n" - - - "1: \n" - "movl (%%esi,%%edx,4),%%eax \n" - "adcl %%eax, (%%ebx,%%edx,4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al,%%edx \n" - - "pop %%ecx \n" - - : "=d" (c) - : "D" (c), "c" (b), "b" (p1), "S" (p2) - : "%eax", "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Add") - } - - - - /*! - adding one word (at a specific position) - and returning a carry (if it has been) - - e.g. - - if we've got (value_size=3): - table[0] = 10; - table[1] = 30; - table[2] = 5; - and we call: - AddInt(2,1) - then it'll be: - table[0] = 10; - table[1] = 30 + 2; - table[2] = 5; - - of course if there was a carry from table[2] it would be returned - */ - template - uint UInt::AddInt(uint value, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size ) - - #ifndef __GNUC__ - - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov edx, [index] - mov ebx, [p1] - - mov eax, [value] - - p: - add [ebx+edx*4], eax - jnc end - - mov eax, 1 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%eax \n" - "push %%ecx \n" - - "subl %%edx, %%ecx \n" - - "1: \n" - "addl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "movl $1, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%edx \n" - - "pop %%ecx \n" - "pop %%eax \n" - - : "=d" (c) - : "a" (value), "c" (b), "0" (index), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::AddInt") - } - - - - - /*! 
- adding only two unsigned words to the existing value - and these words begin on the 'index' position - (it's used in the multiplication algorithm 2) - - index should be equal or smaller than value_size-2 (index <= value_size-2) - x1 - lower word, x2 - higher word - - for example if we've got value_size equal 4 and: - table[0] = 3 - table[1] = 4 - table[2] = 5 - table[3] = 6 - then let - x1 = 10 - x2 = 20 - and - index = 1 - - the result of this method will be: - table[0] = 3 - table[1] = 4 + x1 = 14 - table[2] = 5 + x2 = 25 - table[3] = 6 - - and no carry at the end of table[3] - - (of course if there was a carry in table[2](5+20) then - this carry would be passed to the table[3] etc.) - */ - template - uint UInt::AddTwoInts(uint x2, uint x1, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size - 1 ) - - #ifndef __GNUC__ - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov ebx, [p1] - mov edx, [index] - - mov eax, [x1] - add [ebx+edx*4], eax - inc edx - dec ecx - - mov eax, [x2] - - p: - adc [ebx+edx*4], eax - jnc end - - mov eax, 0 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%ecx \n" - "push %%edx \n" - - "subl %%edx, %%ecx \n" - - "addl %%esi, (%%ebx,%%edx,4) \n" - "incl %%edx \n" - "decl %%ecx \n" - - "1: \n" - "adcl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "mov $0, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%edx \n" - "pop %%ecx \n" - - : "=a" (c) - : "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::AddTwoInts") - } - - - - - - /*! - subtracting ss2 from the 'this' and subtracting - carry if it has been defined - (this = this - ss2 - c) - - c must be zero or one (might be a bigger value than 1) - function returns carry (1) (if it has been) - */ - template - uint UInt::Sub(const UInt & ss2, uint c) - { - register uint b = value_size; - register uint * p1 = table; - register uint * p2 = const_cast(ss2.table); - - // we don't have to use TTMATH_REFERENCE_ASSERT here - // this algorithm doesn't require it - - #ifndef __GNUC__ - - __asm - { - mov ecx,[b] - - mov ebx,[p1] - mov esi,[p2] - - xor eax, eax - mov edx, eax - - sub eax, [c] - - p: - mov eax, [esi+edx*4] - sbb [ebx+edx*4], eax - - inc edx - dec ecx - jnz p - - setc al - movzx eax, al - } - - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - "push %%ecx \n" - - "xorl %%eax, %%eax \n" - "movl %%eax, %%edx \n" - "subl %%edi, %%eax \n" - - - "1: \n" - "movl (%%esi,%%edx,4),%%eax \n" - "sbbl %%eax, (%%ebx,%%edx,4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al,%%edx \n" - - "pop %%ecx \n" - - : "=d" (c) - : "D" (c), "c" (b), "b" (p1), "S" (p2) - : "%eax", "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Sub") - } - - - - - /*! - this method subtracts one word (at a specific position) - and returns a carry (if it was) - - e.g. 
- - if we've got (value_size=3): - table[0] = 10; - table[1] = 30; - table[2] = 5; - and we call: - SubInt(2,1) - then it'll be: - table[0] = 10; - table[1] = 30 - 2; - table[2] = 5; - - of course if there was a carry from table[2] it would be returned - */ - template - uint UInt::SubInt(uint value, uint index) - { - register uint b = value_size; - register uint * p1 = table; - - TTMATH_ASSERT( index < value_size ) - - #ifndef __GNUC__ - __asm - { - mov ecx, [b] - sub ecx, [index] - - mov edx, [index] - mov ebx, [p1] - - mov eax, [value] - - p: - sub [ebx+edx*4], eax - jnc end - - mov eax, 1 - inc edx - dec ecx - jnz p - - end: - setc al - movzx eax, al - } - #endif - - - #ifdef __GNUC__ - register uint c; - - __asm__ __volatile__( - - "push %%eax \n" - "push %%ecx \n" - - "subl %%edx, %%ecx \n" - - "1: \n" - "subl %%eax, (%%ebx,%%edx,4) \n" - "jnc 2f \n" - - "movl $1, %%eax \n" - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "2: \n" - "setc %%al \n" - "movzx %%al, %%edx \n" - - "pop %%ecx \n" - "pop %%eax \n" - - : "=d" (c) - : "a" (value), "c" (b), "0" (index), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::SubInt") - - } - - - - /*! - this method moves all bits into the left hand side - return value <- this <- c - - the lowest *bit* will be held the 'c' and - the state of one additional bit (on the left hand side) - will be returned - - for example: - let this is 001010000 - after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0 - */ - template - uint UInt::Rcl2_one(uint c) - { - register sint b = value_size; - register uint * p1 = table; - - #ifndef __GNUC__ - __asm - { - mov ebx, [p1] - - xor edx, edx - mov ecx, edx - sub ecx, [c] - - mov ecx, [b] - - p: - rcl dword ptr [ebx+edx*4], 1 - - inc edx - dec ecx - jnz p - - setc dl - movzx eax, dl - } - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - - "push %%edx \n" - "push %%ecx \n" - - "xorl %%edx, %%edx \n" // edx=0 - "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 - - "1: \n" - "rcll $1, (%%ebx, %%edx, 4) \n" - - "incl %%edx \n" - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%ecx \n" - "pop %%edx \n" - - : "=a" (c) - : "0" (c), "c" (b), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Rcl2_one") - - } - - - - /*! - this method moves all bits into the right hand side - c -> this -> return value - - the highest *bit* will be held the 'c' and - the state of one additional bit (on the right hand side) - will be returned - - for example: - let this is 000000010 - after Rcr2_one(1) there'll be 100000001 and Rcr2_one returns 0 - */ - template - uint UInt::Rcr2_one(uint c) - { - register sint b = value_size; - register uint * p1 = table; - - #ifndef __GNUC__ - __asm - { - mov ebx, [p1] - - xor ecx, ecx - sub ecx, [c] - - mov ecx, [b] - - p: - rcr dword ptr [ebx+ecx*4-4], 1 - - dec ecx - jnz p - - setc cl - movzx eax, cl - } - #endif - - - #ifdef __GNUC__ - __asm__ __volatile__( - - "push %%ecx \n" - - "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 - - "1: \n" - "rcrl $1, -4(%%ebx, %%ecx, 4) \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "setc %%al \n" - "movzx %%al, %%eax \n" - - "pop %%ecx \n" - - : "=a" (c) - : "0" (c), "c" (b), "b" (p1) - : "cc", "memory" ); - - return c; - #endif - - TTMATH_LOG("UInt32::Rcr2_one") - } - - - - /*! 
- this method moves all bits into the left hand side - return value <- this <- c - - the lowest *bits* will be held the 'c' and - the state of one additional bit (on the left hand side) - will be returned - - for example: - let this is 001010000 - after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1 - */ - template - uint UInt::Rcl2(uint bits, uint c) - { - TTMATH_ASSERT( bits>0 && bits this -> return value - - the highest *bits* will be held the 'c' and - the state of one additional bit (on the right hand side) - will be returned - - for example: - let this is 000000010 - after Rcr2(2, 1) there'll be 110000000 and Rcr2 returns 1 - */ - template - uint UInt::Rcr2(uint bits, uint c) - { - TTMATH_ASSERT( bits>0 && bits - sint UInt::FindLeadingBitInWord(uint x) - { - - #ifndef __GNUC__ - __asm - { - mov edx,-1 - bsr eax,[x] - cmovz eax,edx - } - #endif - - - #ifdef __GNUC__ - register sint result; - - __asm__ __volatile__( - - "bsrl %1, %0 \n" - "jnz 1f \n" - "movl $-1, %0 \n" - "1: \n" - - : "=R" (result) - : "R" (x) - : "cc" ); - - return result; - - #endif - } - - - - - - /*! - this method sets a special bit in the 'value' - and returns the last state of the bit (zero or one) - - bit is from <0,31> - e.g. - uint x = 100; - uint bit = SetBitInWord(x, 3); - now: x = 108 and bit = 0 - */ - template - uint UInt::SetBitInWord(uint & value, uint bit) - { - TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT ) - - uint v = value; - - #ifndef __GNUC__ - __asm - { - mov eax, [v] - mov ebx, [bit] - bts eax, ebx - mov [v], eax - - setc bl - movzx ebx, bl - mov eax, ebx - } - #endif - - - #ifdef __GNUC__ - uint old_bit; - - __asm__ __volatile__( - - "btsl %%ebx, %%eax \n" - - "setc %%bl \n" - "movzx %%bl, %%ebx \n" - - : "=a" (v), "=b" (old_bit) - : "0" (v), "1" (bit) - : "cc" ); - - return old_bit; - #endif - - value = v; - } - - - - - /*! - multiplication: result_high:result_low = a * b - result_high - higher word of the result - result_low - lower word of the result - - this methos never returns a carry - this method is used in the second version of the multiplication algorithms - */ - template - void UInt::MulTwoWords(uint a, uint b, uint * result_high, uint * result_low) - { - /* - we must use these temporary variables in order to inform the compilator - that value pointed with result1 and result2 has changed - - this has no effect in visual studio but it's useful when - using gcc and options like -Ox - */ - register uint result1_; - register uint result2_; - - #ifndef __GNUC__ - - __asm - { - mov eax, [a] - mul dword ptr [b] - - mov [result2_], edx - mov [result1_], eax - } - - #endif - - - #ifdef __GNUC__ - - __asm__ __volatile__( - - "mull %%edx \n" - - : "=a" (result1_), "=d" (result2_) - : "0" (a), "1" (b) - : "cc" ); - - #endif - - - *result_low = result1_; - *result_high = result2_; - } - - - - - - /*! - * - * Division - * - * - */ - - - - - /*! 
- this method calculates 64bits word a:b / 32bits c (a higher, b lower word) - r = a:b / c and rest - remainder - - * - * WARNING: - * if r (one word) is too small for the result or c is equal zero - * there'll be a hardware interruption (0) - * and probably the end of your program - * - */ - template - void UInt::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest) - { - register uint r_; - register uint rest_; - /* - these variables have similar meaning like those in - the multiplication algorithm MulTwoWords - */ - - TTMATH_ASSERT( c != 0 ) - - #ifndef __GNUC__ - __asm - { - mov edx, [a] - mov eax, [b] - div dword ptr [c] - - mov [r_], eax - mov [rest_], edx - } - #endif - - - #ifdef __GNUC__ - - __asm__ __volatile__( - - "divl %%ecx \n" - - : "=a" (r_), "=d" (rest_) - : "d" (a), "a" (b), "c" (c) - : "cc" ); - - #endif - - - *r = r_; - *rest = rest_; - - } - - - -} //namespace - - - -#endif //ifdef TTMATH_PLATFORM32 -#endif //ifndef TTMATH_NOASM -#endif +/* + * This file is a part of TTMath Bignum Library + * and is distributed under the (new) BSD licence. + * Author: Tomasz Sowa + */ + +/* + * Copyright (c) 2006-2009, Tomasz Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name Tomasz Sowa nor the names of contributors to this + * project may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + + + +#ifndef headerfilettmathuint_x86 +#define headerfilettmathuint_x86 + + +#ifndef TTMATH_NOASM +#ifdef TTMATH_PLATFORM32 + + +/*! + \file ttmathuint_x86.h + \brief template class UInt with assembler code for 32bit x86 processors + + this file is included at the end of ttmathuint.h +*/ + + + +/*! + \brief a namespace for the TTMath library +*/ +namespace ttmath +{ + + /*! + * + * basic mathematic functions + * + */ + + + /*! 
+ adding ss2 to the this and adding carry if it's defined + (this = this + ss2 + c) + + c must be zero or one (might be a bigger value than 1) + function returns carry (1) (if it has been) + */ + template + uint UInt::Add(const UInt & ss2, uint c) + { + register uint b = value_size; + register uint * p1 = table; + register uint * p2 = const_cast(ss2.table); + + // we don't have to use TTMATH_REFERENCE_ASSERT here + // this algorithm doesn't require it + + #ifndef __GNUC__ + + // this part might be compiled with for example visual c + + __asm + { + xor eax,eax // eax=0 + mov ecx,[b] + mov edx,eax // edx=0 + mov ebx,[p1] + mov esi,[p2] + + sub eax,[c] // CF=c + + ALIGN 16 + p: + mov eax,[esi+edx*4+0] + adc [ebx+edx*4+0],eax + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + + // this part should be compiled with gcc + + __asm__ __volatile__( + + "push %%ecx \n" + + "xorl %%eax, %%eax \n" + "movl %%eax, %%edx \n" + "subl %%edi, %%eax \n" + + + "1: \n" + "movl (%%esi,%%edx,4),%%eax \n" + "adcl %%eax, (%%ebx,%%edx,4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al,%%edx \n" + + "pop %%ecx \n" + + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) + : "%eax", "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Add") + } + + + + /*! + adding one word (at a specific position) + and returning a carry (if it has been) + + e.g. + + if we've got (value_size=3): + table[0] = 10; + table[1] = 30; + table[2] = 5; + and we call: + AddInt(2,1) + then it'll be: + table[0] = 10; + table[1] = 30 + 2; + table[2] = 5; + + of course if there was a carry from table[2] it would be returned + */ + template + uint UInt::AddInt(uint value, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size ) + + #ifndef __GNUC__ + + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov edx, [index] + mov ebx, [p1] + + mov eax, [value] + + ALIGN 16 + p: + add [ebx+edx*4], eax + jnc end + + mov eax, 1 + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%eax \n" + "push %%ecx \n" + + "subl %%edx, %%ecx \n" + + "1: \n" + "addl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "movl $1, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%edx \n" + + "pop %%ecx \n" + "pop %%eax \n" + + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::AddInt") + } + + + + + /*! + adding only two unsigned words to the existing value + and these words begin on the 'index' position + (it's used in the multiplication algorithm 2) + + index should be equal or smaller than value_size-2 (index <= value_size-2) + x1 - lower word, x2 - higher word + + for example if we've got value_size equal 4 and: + table[0] = 3 + table[1] = 4 + table[2] = 5 + table[3] = 6 + then let + x1 = 10 + x2 = 20 + and + index = 1 + + the result of this method will be: + table[0] = 3 + table[1] = 4 + x1 = 14 + table[2] = 5 + x2 = 25 + table[3] = 6 + + and no carry at the end of table[3] + + (of course if there was a carry in table[2](5+20) then + this carry would be passed to the table[3] etc.) 
+ */ + template + uint UInt::AddTwoInts(uint x2, uint x1, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size - 1 ) + + #ifndef __GNUC__ + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov ebx, [p1] + mov edx, [index] + + mov eax, [x1] + add [ebx+edx*4], eax + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + lea ecx, [ecx-1] + + mov eax, [x2] + + ALIGN 16 + p: + adc [ebx+edx*4], eax + jnc end + + xor eax, eax + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%ecx \n" + "push %%edx \n" + + "subl %%edx, %%ecx \n" + + "addl %%esi, (%%ebx,%%edx,4) \n" + "incl %%edx \n" + "decl %%ecx \n" + + "1: \n" + "adcl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "mov $0, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%edx \n" + "pop %%ecx \n" + + : "=a" (c) + : "c" (b), "d" (index), "b" (p1), "S" (x1), "0" (x2) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::AddTwoInts") + } + + + + + + /*! + subtracting ss2 from the 'this' and subtracting + carry if it has been defined + (this = this - ss2 - c) + + c must be zero or one (might be a bigger value than 1) + function returns carry (1) (if it has been) + */ + template + uint UInt::Sub(const UInt & ss2, uint c) + { + register uint b = value_size; + register uint * p1 = table; + register uint * p2 = const_cast(ss2.table); + + // we don't have to use TTMATH_REFERENCE_ASSERT here + // this algorithm doesn't require it + + #ifndef __GNUC__ + + __asm + { + mov ecx,[b] + + mov ebx,[p1] + mov esi,[p2] + + xor eax, eax + mov edx, eax + + sub eax, [c] + + ALIGN 16 + p: + mov eax, [esi+edx*4] + sbb [ebx+edx*4], eax + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + "push %%ecx \n" + + "xorl %%eax, %%eax \n" + "movl %%eax, %%edx \n" + "subl %%edi, %%eax \n" + + + "1: \n" + "movl (%%esi,%%edx,4),%%eax \n" + "sbbl %%eax, (%%ebx,%%edx,4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al,%%edx \n" + + "pop %%ecx \n" + + : "=d" (c) + : "D" (c), "c" (b), "b" (p1), "S" (p2) + : "%eax", "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Sub") + } + + + + + /*! + this method subtracts one word (at a specific position) + and returns a carry (if it was) + + e.g. 
+ + if we've got (value_size=3): + table[0] = 10; + table[1] = 30; + table[2] = 5; + and we call: + SubInt(2,1) + then it'll be: + table[0] = 10; + table[1] = 30 - 2; + table[2] = 5; + + of course if there was a carry from table[2] it would be returned + */ + template + uint UInt::SubInt(uint value, uint index) + { + register uint b = value_size; + register uint * p1 = table; + + TTMATH_ASSERT( index < value_size ) + + #ifndef __GNUC__ + __asm + { + mov ecx, [b] + sub ecx, [index] + + mov edx, [index] + mov ebx, [p1] + + mov eax, [value] + + ALIGN 16 + p: + sub [ebx+edx*4], eax + jnc end + + mov eax, 1 + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + end: + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + register uint c; + + __asm__ __volatile__( + + "push %%eax \n" + "push %%ecx \n" + + "subl %%edx, %%ecx \n" + + "1: \n" + "subl %%eax, (%%ebx,%%edx,4) \n" + "jnc 2f \n" + + "movl $1, %%eax \n" + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "2: \n" + "setc %%al \n" + "movzx %%al, %%edx \n" + + "pop %%ecx \n" + "pop %%eax \n" + + : "=d" (c) + : "a" (value), "c" (b), "0" (index), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::SubInt") + + } + + + + /*! + this method moves all bits into the left hand side + return value <- this <- c + + the lowest *bit* will be held the 'c' and + the state of one additional bit (on the left hand side) + will be returned + + for example: + let this is 001010000 + after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0 + */ + template + uint UInt::Rcl2_one(uint c) + { + register sint b = value_size; + register uint * p1 = table; + + #ifndef __GNUC__ + __asm + { + mov ebx, [p1] + + xor edx, edx + mov ecx, edx + sub ecx, [c] + + mov ecx, [b] + + ALIGN 16 + p: + rcl dword ptr [ebx+edx*4], 1 + + lea edx, [edx+1] // inc edx, but faster (no flags dependencies) + dec ecx + jnz p + + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + + "push %%edx \n" + "push %%ecx \n" + + "xorl %%edx, %%edx \n" // edx=0 + "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 + + "1: \n" + "rcll $1, (%%ebx, %%edx, 4) \n" + + "incl %%edx \n" + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%ecx \n" + "pop %%edx \n" + + : "=a" (c) + : "0" (c), "c" (b), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Rcl2_one") + + } + + + + /*! + this method moves all bits into the right hand side + c -> this -> return value + + the highest *bit* will be held the 'c' and + the state of one additional bit (on the right hand side) + will be returned + + for example: + let this is 000000010 + after Rcr2_one(1) there'll be 100000001 and Rcr2_one returns 0 + */ + template + uint UInt::Rcr2_one(uint c) + { + register sint b = value_size; + register uint * p1 = table; + + #ifndef __GNUC__ + __asm + { + xor ecx, ecx + sub ecx, [c] + + mov ebx, [p1] + mov ecx, [b] + + ALIGN 16 + p: + rcr dword ptr [ebx+ecx*4-4], 1 + + dec ecx + jnz p + + setc al + movzx eax, al + } + #endif + + + #ifdef __GNUC__ + __asm__ __volatile__( + + "push %%ecx \n" + + "neg %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0 + + "1: \n" + "rcrl $1, -4(%%ebx, %%ecx, 4) \n" + + "decl %%ecx \n" + "jnz 1b \n" + + "setc %%al \n" + "movzx %%al, %%eax \n" + + "pop %%ecx \n" + + : "=a" (c) + : "0" (c), "c" (b), "b" (p1) + : "cc", "memory" ); + + return c; + #endif + + TTMATH_LOG("UInt32::Rcr2_one") + } + + + + /*! 
+ this method moves all bits into the left hand side + return value <- this <- c + + the lowest *bits* will be held the 'c' and + the state of one additional bit (on the left hand side) + will be returned + + for example: + let this is 001010000 + after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1 + */ + template + uint UInt::Rcl2(uint bits, uint c) + { + TTMATH_ASSERT( bits>0 && bits this -> return value + + the highest *bits* will be held the 'c' and + the state of one additional bit (on the right hand side) + will be returned + + for example: + let this is 000000010 + after Rcr2(2, 1) there'll be 110000000 and Rcr2 returns 1 + */ + template + uint UInt::Rcr2(uint bits, uint c) + { + TTMATH_ASSERT( bits>0 && bits + sint UInt::FindLeadingBitInWord(uint x) + { + + #ifndef __GNUC__ + __asm + { + mov edx,-1 + bsr eax,[x] + cmovz eax,edx + } + #endif + + + #ifdef __GNUC__ + register sint result; + + __asm__ __volatile__( + + "bsrl %1, %0 \n" + "jnz 1f \n" + "movl $-1, %0 \n" + "1: \n" + + : "=R" (result) + : "R" (x) + : "cc" ); + + return result; + + #endif + } + + + + + + /*! + this method sets a special bit in the 'value' + and returns the last state of the bit (zero or one) + + bit is from <0,31> + e.g. + uint x = 100; + uint bit = SetBitInWord(x, 3); + now: x = 108 and bit = 0 + */ + template + uint UInt::SetBitInWord(uint & value, uint bit) + { + TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT ) + + uint v = value; + + #ifndef __GNUC__ + __asm + { + mov eax, [v] + mov ebx, [bit] + bts eax, ebx + mov [v], eax + + setc bl + movzx ebx, bl + mov eax, ebx + } + #endif + + + #ifdef __GNUC__ + uint old_bit; + + __asm__ __volatile__( + + "btsl %%ebx, %%eax \n" + + "setc %%bl \n" + "movzx %%bl, %%ebx \n" + + : "=a" (v), "=b" (old_bit) + : "0" (v), "1" (bit) + : "cc" ); + + return old_bit; + #endif + + value = v; + } + + + + + /*! + multiplication: result_high:result_low = a * b + result_high - higher word of the result + result_low - lower word of the result + + this methos never returns a carry + this method is used in the second version of the multiplication algorithms + */ + template + void UInt::MulTwoWords(uint a, uint b, uint * result_high, uint * result_low) + { + /* + we must use these temporary variables in order to inform the compilator + that value pointed with result1 and result2 has changed + + this has no effect in visual studio but it's useful when + using gcc and options like -Ox + */ + register uint result1_; + register uint result2_; + + #ifndef __GNUC__ + + __asm + { + mov eax, [a] + mul dword ptr [b] + + mov [result2_], edx + mov [result1_], eax + } + + #endif + + + #ifdef __GNUC__ + + __asm__ __volatile__( + + "mull %%edx \n" + + : "=a" (result1_), "=d" (result2_) + : "0" (a), "1" (b) + : "cc" ); + + #endif + + + *result_low = result1_; + *result_high = result2_; + } + + + + + + /*! + * + * Division + * + * + */ + + + + + /*! 
+ this method calculates 64bits word a:b / 32bits c (a higher, b lower word) + r = a:b / c and rest - remainder + + * + * WARNING: + * if r (one word) is too small for the result or c is equal zero + * there'll be a hardware interruption (0) + * and probably the end of your program + * + */ + template + void UInt::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest) + { + register uint r_; + register uint rest_; + /* + these variables have similar meaning like those in + the multiplication algorithm MulTwoWords + */ + + TTMATH_ASSERT( c != 0 ) + + #ifndef __GNUC__ + __asm + { + mov edx, [a] + mov eax, [b] + div dword ptr [c] + + mov [r_], eax + mov [rest_], edx + } + #endif + + + #ifdef __GNUC__ + + __asm__ __volatile__( + + "divl %%ecx \n" + + : "=a" (r_), "=d" (rest_) + : "d" (a), "a" (b), "c" (c) + : "cc" ); + + #endif + + + *r = r_; + *rest = rest_; + + } + + + +} //namespace + + + +#endif //ifdef TTMATH_PLATFORM32 +#endif //ifndef TTMATH_NOASM +#endif diff --git a/ttmath/ttmathuint_x86_amd64_msvc.asm b/ttmath/ttmathuint_x86_amd64_msvc.asm index 7d430f3..60de740 100644 --- a/ttmath/ttmathuint_x86_amd64_msvc.asm +++ b/ttmath/ttmathuint_x86_amd64_msvc.asm @@ -31,23 +31,21 @@ adc_x64 PROC ; r9 = nCarry xor rax, rax - mov r11, 0 + xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 + ALIGN 16 loop1: mov rax,qword ptr [rdx + r11 * 8] adc qword ptr [rcx + r11 * 8], rax - inc r11 + lea r11, [r11+1] dec r8 jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax - ret - - return_1: - mov rax, 1 + setc al + movzx rax, al + ret adc_x64 ENDP @@ -66,24 +64,22 @@ addindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints + ALIGN 16 loop1: add qword ptr [rcx + r8 * 8], r9 jnc done - inc r8 + lea r8, [r8+1] mov r9, 1 dec rdx jnz loop1 done: - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax + setc al + movzx rax, al + ret - return_1: - mov rax, 1 - ret - addindexed_x64 ENDP ;---------------------------------------- @@ -100,28 +96,32 @@ addindexed2_x64 PROC ; r9 = nValue1 ; [esp+0x28] = nValue2 + xor rax, rax ; return value mov r11, rcx ; table sub rdx, r8 ; rdx = remaining count of uints mov r10, [esp+028h] ; r10 = nValue2 add qword ptr [r11 + r8 * 8], r10 - inc r8 + lea r8, [r8+1] + + ALIGN 16 loop1: adc qword ptr [r11 + r8 * 8], r9 - jnc done + jc next + ret - inc r8 - mov r9, 0 ; set to 0 -> cy still set! +next: + lea r8, [r8+1] + xor r9, r9 ; set to 0 -> cy still set! 
dec rdx jnz loop1 jc return_1 ; most of the times, there will be NO carry (I hope) done: - xor rax, rax ret - return_1: - mov rax, 1 +return_1: + lea rax, [rax+1] ret addindexed2_x64 ENDP @@ -142,23 +142,20 @@ sbb_x64 PROC ; r9 = nCarry xor rax, rax - mov r11, 0 + xor r11, r11 sub rax, r9 ; sets CARRY if r9 != 0 + ALIGN 16 loop1: mov rax,qword ptr [rdx + r11 * 8] sbb qword ptr [rcx + r11 * 8], rax - inc r11 + lea r11, [r11+1] dec r8 - jnz loop1 - jc return_1 ; most of the times, there will be NO carry (I hope) - xor rax, rax - ret - - return_1: - mov rax, 1 + setc al + movzx rax, al + ret sbb_x64 ENDP @@ -176,11 +173,13 @@ subindexed_x64 PROC ; r9 = nValue sub rdx, r8 ; rdx = remaining count of uints + + ALIGN 16 loop1: sub qword ptr [rcx + r8 * 8], r9 jnc done - inc r8 + lea r8, [r8+1] mov r9, 1 dec rdx jnz loop1 @@ -210,9 +209,11 @@ rcl_x64 PROC mov r11, rcx ; table xor r10, r10 neg r8 ; CY set if r8 <> 0 + + ALIGN 16 loop1: rcl qword ptr [r11 + r10 * 8], 1 - inc r10 + lea r10, [r10+1] dec rdx jnz loop1 @@ -236,6 +237,8 @@ rcr_x64 PROC xor r10, r10 neg r8 ; CY set if r8 <> 0 + + ALIGN 16 loop1: rcr qword ptr -8[rcx + rdx * 8], 1 dec rdx @@ -304,6 +307,7 @@ rcl2_x64 PROC mov r9, rax ; r9 = index (0..nSize-1) + ALIGN 16 loop1: rol qword ptr [r10+r9*8], cl mov rax, qword ptr [r10+r9*8] @@ -312,7 +316,7 @@ loop1: or qword ptr [r10+r9*8], rbx mov rbx, rax - inc r9 + lea r9, [r9+1] dec rdx jnz loop1 @@ -352,8 +356,9 @@ rcr2_x64 PROC cmovnz rbx, r11 ; if (c) then old value = mask mov r9, rdx ; r9 = index (0..nSize-1) - dec r9 + lea r9, [r9-1] + ALIGN 16 loop1: ror qword ptr [r10+r9*8], cl mov rax, qword ptr [r10+r9*8] @@ -362,7 +367,7 @@ loop1: or qword ptr [r10+r9*8], rbx mov rbx, rax - dec r9 + lea r9, [r9-1] dec rdx jnz loop1
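
Illustrative sketch (not part of the patch): the hunks above repeatedly apply the same three changes to each word loop -- a 16-byte ALIGN on the loop head, LEA in place of INC/DEC for index updates so the carry flag feeding ADC/SBB/RCL/RCR is never clobbered between iterations, and a branchless SETC/MOVZX epilogue in place of the old jc/return_1 paths. The fragment below distils that pattern from the adc_x64 hunk; the procedure name sample_adc_loop is illustrative only and assumes the usual .code segment and PUBLIC declaration of the patched .asm file. Note that DEC leaves CF untouched (it only updates ZF/SF/OF/PF), which is what lets the ADC chain survive the loop counter update.

    sample_adc_loop PROC
        ; rcx = destination table, rdx = source table
        ; r8  = word count, r9 = incoming carry (0 or 1)
        xor   rax, rax
        xor   r11, r11                    ; index = 0
        sub   rax, r9                     ; CF = 1 iff r9 != 0

        ALIGN 16
    loop1:
        mov   rax, qword ptr [rdx + r11 * 8]
        adc   qword ptr [rcx + r11 * 8], rax
        lea   r11, [r11 + 1]              ; increment without touching flags
        dec   r8                          ; ZF for the branch, CF preserved
        jnz   loop1

        setc  al                          ; branchless: return the final carry
        movzx rax, al
        ret
    sample_adc_loop ENDP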