scummvm-cursorfix/engines/hpl1/engine/libraries/newton/physics/dgWorldDynamicUpdate.cpp

/* Copyright (c) <2003-2011> <Julio Jerez, Newton Game Dynamics>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software
 * in a product, an acknowledgment in the product documentation would be
 * appreciated but is not required.
 *
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 *
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "hpl1/engine/libraries/newton/core/dg.h"

#include "dgBody.h"
#include "dgConstraint.h"
#include "dgWorld.h"
#include "dgWorldDynamicUpdate.h"


#define DG_FREEZZING_VELOCITY_DRAG dgFloat32(0.9f)
#define DG_SOLVER_MAX_ERROR (DG_FREEZE_MAG * dgFloat32(0.5f))

#ifdef DG_BUILD_SIMD_CODE
#define DG_SIMD_WORD_SIZE dgInt32(sizeof(simd_type) / sizeof(dgFloat32))
#else
#define DG_SIMD_WORD_SIZE dgInt32(sizeof(dgVector) / sizeof(dgFloat32))
#endif

#define DG_PARALLEL_JOINT_COUNT 64

// in my twist to RK4 I am no sure if the final derivative have to be weighted, it seems a mistake, I need to investigate more
// uncomment this out for more stable behavior but no exactly correct solution on the iterative solver
//#define DG_WIGHT_FINAL_RK4_DERIVATIVES

#ifdef TARGET_OS_IPHONE
#define DG_BASE_ITERATION_COUNT 2
#else
#define DG_BASE_ITERATION_COUNT 3
#endif

class dgBodyInfo {
public:
	dgBody *m_body;
};

class dgJointInfo {
public:
	dgConstraint *m_joint;
	dgInt32 m_autoPairstart;
	dgInt32 m_autoPaircount;
	dgInt32 m_autoPairActiveCount;
	dgInt32 m_m0;
	dgInt32 m_m1;
};

class dgIsland {
public:
	dgInt32 m_bodyCount;
	dgInt32 m_bodyStart;
	dgInt32 m_jointCount;
	dgInt32 m_jointStart;
	dgInt32 m_hasUnilateralJoints : 1;
	dgInt32 m_isContinueCollision : 1;
};

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

static inline dgInt32 CompareIslands(const dgIsland *const islandA,
                                     const dgIsland *const islandB, void *notUsed) {
	dgInt32 countA;
	dgInt32 countB;

	countA = islandA->m_jointCount - (islandA->m_hasUnilateralJoints << 23);
	countB = islandB->m_jointCount - (islandB->m_hasUnilateralJoints << 23);

	if (countA < countB) {
		return -1;
	}
	if (countA > countB) {
		return 1;
	}
	return 0;
}

dgBody *dgWorld::GetIslandBody(const void *const islandPtr, dgInt32 index) const {
	const dgIslandCallbackStruct &island = *(const dgIslandCallbackStruct *)islandPtr;

	char *ptr = &((char *)island.m_bodyArray)[island.m_strideInByte * index];
	dgBody **bodyPtr = (dgBody **)ptr;
	return (index < island.m_count) ? ((index >= 0) ? *bodyPtr : NULL) : NULL;
}

dgWorldDynamicUpdate::dgWorldDynamicUpdate() {
	m_bodies = 0;
	;
	m_joints = 0;
	m_islands = 0;
	m_markLru = 0;
	m_maxJointCount = 0;
	m_maxBodiesCount = 0;
	m_maxIslandCount = 0;
}

void dgWorldDynamicUpdate::UpdateDynamics(dgWorld *const world,
        dgInt32 archModel, dgFloat32 timestep) {
	dgUnsigned32 updateTime = world->m_getPerformanceCount();

	m_bodies = 0;
	m_joints = 0;
	m_islands = 0;
	m_markLru = 0;
	m_world = world;
	ReallocBodyMemory(0);
	ReallocIslandMemory(0);
	ReallocJointsMemory(0);

	dgInt32 threadCounts = dgInt32(m_world->m_numberOfTheads);
	for (dgInt32 i = 0; i < threadCounts; i++) {
		m_solverMemory[i].m_world = m_world;
		m_solverMemory[i].m_threadIndex = i;
		ReallocJacobiansMemory(0, i);
		ReallocIntenalForcesMemory(0, i);
	}

	m_world->m_dynamicsLru = m_world->m_dynamicsLru + 2;
	m_markLru = dgInt32(m_world->m_dynamicsLru);
	dgInt32 lru = m_markLru - 1;

	dgBodyMasterList &me = *m_world;

	NEWTON_ASSERT(me.GetFirst()->GetInfo().GetBody() == m_world->m_sentionelBody);

	dgUnsigned32 solverMode = m_world->m_solverMode;

	m_world->m_sentionelBody->m_index = 0;
	m_world->m_sentionelBody->m_dynamicsLru = dgUnsigned32(m_markLru);

	for (dgBodyMasterList::dgListNode *node = me.GetLast(); node;
	        node = node->GetPrev()) {

		const dgBodyMasterListRow &graphNode = node->GetInfo();
		dgBody *const body = graphNode.GetBody();
		if (body->m_invMass.m_w == dgFloat32(0.0f)) {
#ifdef _DEBUG
			for (; node; node = node->GetPrev()) {
				NEWTON_ASSERT(node->GetInfo().GetBody()->m_invMass.m_w == dgFloat32(0.0f));
			}
#endif
			break;
		}

		if (dgInt32(body->m_dynamicsLru) < lru) {
			if (!(body->m_freeze | body->m_spawnnedFromCallback | body->m_sleeping | !body->m_isInWorld)) {
				SpanningTree(body);
			}
		}
		body->m_spawnnedFromCallback = false;
	}

	dgSort(m_islandArray, m_islands, CompareIslands);
	//  dgRadixSort (m_islandArray, &m_islandArray[m_islands], m_islands, 3, GetIslandsKey);

	dgUnsigned32 dynamicsTime = m_world->m_getPerformanceCount();
	m_world->m_perfomanceCounters[m_dynamicsBuildSpanningTreeTicks] = dynamicsTime - updateTime;

	if (threadCounts > 1) {
		dgInt32 chunkSizes[DG_MAXIMUN_THREADS];

		if (m_world->m_singleIslandMultithreading) {
			const dgJacobianMemory &system = m_solverMemory[0];
			while (m_islands && (m_islandArray[m_islands - 1].m_jointCount >= DG_PARALLEL_JOINT_COUNT)) {
				m_islands--;

				NEWTON_ASSERT(!m_islandArray[m_islands].m_isContinueCollision);
				BuildJacobianMatrixParallel(m_islandArray[m_islands], timestep, archModel);
				system.CalculateReactionsForcesParallel(dgInt32(solverMode), DG_SOLVER_MAX_ERROR, archModel);
				IntegrateArray(&system.m_bodyArray[1], system.m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, 0, true);
			}
		}

		if (m_islands) {
			m_world->m_threadsManager.CalculateChunkSizes(m_islands, chunkSizes);
			for (dgInt32 threadIndex = 0; threadIndex < threadCounts; threadIndex++) {
				m_workerThreads[threadIndex].m_useSimd = archModel;
				m_workerThreads[threadIndex].m_world = m_world;
				m_workerThreads[threadIndex].m_dynamics = this;
				m_workerThreads[threadIndex].m_count = chunkSizes[threadIndex] * threadCounts;
				m_workerThreads[threadIndex].m_threads = threadCounts;
				m_workerThreads[threadIndex].m_timestep = timestep;
				m_workerThreads[threadIndex].m_solverMode = dgInt32(solverMode);
				m_workerThreads[threadIndex].m_threadIndex = threadIndex;
				m_workerThreads[threadIndex].m_system = &m_solverMemory[threadIndex];
				m_world->m_threadsManager.SubmitJob(&m_workerThreads[threadIndex]);
			}
			m_world->m_threadsManager.SynchronizationBarrier();
		}

	} else {
		m_workerThreads[0].m_threads = 1;
		m_workerThreads[0].m_useSimd = archModel;
		m_workerThreads[0].m_count = m_islands;
		m_workerThreads[0].m_world = m_world;
		m_workerThreads[0].m_dynamics = this;
		m_workerThreads[0].m_threadIndex = 0;
		m_workerThreads[0].m_solverMode = dgInt32(solverMode);
		m_workerThreads[0].m_timestep = timestep;
		m_workerThreads[0].m_system = &m_solverMemory[0];
		m_workerThreads[0].ThreadExecute();
	}

	dgUnsigned32 ticks = m_world->m_getPerformanceCount();
	m_world->m_perfomanceCounters[m_dynamicsSolveSpanningTreeTicks] = ticks - dynamicsTime;
	m_world->m_perfomanceCounters[m_dynamicsTicks] = ticks - updateTime;
}

void dgSolverWorlkerThreads::ThreadExecute() {
	const dgIsland *const m_islandArray = m_dynamics->m_islandArray;
	dgContactPoint *const contactBuffer =
	    (dgContactPoint *)m_world->m_contactBuffers[m_threadIndex];

	dgInt32 step = m_threads;
	dgInt32 count = m_count;
	if (m_useSimd) {
		for (dgInt32 i = 0; i < count; i += step) {
			const dgIsland &island = m_islandArray[i + m_threadIndex];
			if (!island.m_isContinueCollision) {
				m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, m_timestep);
				m_system->CalculateReactionsForcesSimd(m_solverMode,
				                                       DG_SOLVER_MAX_ERROR);
				m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
				                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, m_timestep,
				                           m_threadIndex, true);
			} else {
				dgBodyInfo *const bodyArray =
				    &m_dynamics->m_bodyArray[island.m_bodyStart];
				dgJointInfo *const constraintArray =
				    &m_dynamics->m_constraintArray[island.m_jointStart];

				dgFloat32 dist = dgFloat32(0.0f);
				dgFloat32 maxSpeed2 = dgFloat32(0.0f);
				for (dgInt32 k = 1; k < island.m_bodyCount; k++) {
					dgVector veloc;
					dgVector omega;

					dgBody *const body = bodyArray[k].m_body;
					NEWTON_ASSERT(body->m_mass.m_w > dgFloat32(0.0f));
					body->CalculateContinueVelocity(m_timestep, veloc, omega);

					dgFloat32 mag2 = veloc % veloc;
					if (mag2 > maxSpeed2) {
						maxSpeed2 = mag2;
						dist = body->m_collision->GetBoxMinRadius();
					}
				}

				NEWTON_ASSERT(dist > dgFloat32(0.0f));
				dgInt32 steps = dgInt32(
				                    dgFloat32(4.0f) * dgSqrt(maxSpeed2) * m_timestep / dist) +
				                1;
				dgFloat32 timestep = m_timestep / dgFloat32(steps);

				if (steps > 8) {
					steps = 8;
				}

				for (dgInt32 j = 0; j < steps - 1; j++) {
					m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, timestep);
					m_system->CalculateReactionsForcesSimd(m_solverMode,
					                                       DG_SOLVER_MAX_ERROR);
					m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
					                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep,
					                           m_threadIndex, false);

					for (dgInt32 k = 1; k < island.m_bodyCount; k++) {
						dgBody *const body = bodyArray[k].m_body;
						body->m_accel = body->m_prevExternalForce;
						body->m_alpha = body->m_prevExternalTorque;
					}

					for (dgInt32 k = 0; k < island.m_jointCount; k++) {
						dgConstraint *const constraint = constraintArray[k].m_joint;
						if (constraint->GetId() == dgContactConstraintId) {
							dgContact *const contact = (dgContact *)constraint;
							const dgContactMaterial *const material =
							    contact->m_myCacheMaterial;
							if (material->m_flags & dgContactMaterial::m_collisionEnable__) {
								if (material->m_aabbOverlap) {
									NEWTON_ASSERT(contact->m_body0);
									NEWTON_ASSERT(contact->m_body1);
									material->m_aabbOverlap(reinterpret_cast<const NewtonMaterial *>(material), reinterpret_cast<const NewtonBody *>(contact->m_body0),
															reinterpret_cast<const NewtonBody *>(contact->m_body1), m_threadIndex);
								}
								dgCollidingPairCollector::dgPair pair;
								pair.m_body0 = contact->m_body0;
								pair.m_body1 = contact->m_body1;
								pair.m_contact = contact;
								pair.m_material = material;
								pair.m_contactBuffer = contactBuffer;

								m_world->CalculateContactsSimd(&pair, timestep, m_threadIndex);
								//NEWTON_ASSERT (contact->m_maxDOF == 0);

								NEWTON_ASSERT(pair.m_contact);
								if (pair.m_contactCount) {
									NEWTON_ASSERT(pair.m_contactCount <= (DG_CONSTRAINT_MAX_ROWS / 3));
									m_world->ProcessContacts(&pair, timestep, m_threadIndex);
								} else if (!pair.m_contactBuffer) {
									m_world->ProcessCachedContacts(pair.m_contact,
									                               pair.m_material, timestep, m_threadIndex);
								}
							}
						}
					}
				}
				m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, timestep);
				m_system->CalculateReactionsForcesSimd(m_solverMode,
				                                       DG_SOLVER_MAX_ERROR);
				m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
				                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep,
				                           m_threadIndex, true);
			}
		}
	} else {
		for (dgInt32 i = 0; i < count; i += step) {
			const dgIsland &island = m_islandArray[i + m_threadIndex];

			if (!island.m_isContinueCollision) {
				m_dynamics->BuildJacobianMatrix(island, m_threadIndex, m_timestep);
				m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR);
				m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
				                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, m_timestep,
				                           m_threadIndex, true);
			} else {
				dgBodyInfo *const bodyArray =
				    &m_dynamics->m_bodyArray[island.m_bodyStart];
				dgJointInfo *const constraintArray =
				    &m_dynamics->m_constraintArray[island.m_jointStart];

				dgFloat32 dist = dgFloat32(0.0f);
				dgFloat32 maxSpeed2 = dgFloat32(0.0f);
				for (dgInt32 k = 1; k < island.m_bodyCount; k++) {
					dgVector veloc;
					dgVector omega;

					dgBody *const body = bodyArray[k].m_body;
					NEWTON_ASSERT(body->m_mass.m_w > dgFloat32(0.0f));
					body->CalculateContinueVelocity(m_timestep, veloc, omega);

					dgFloat32 mag2 = veloc % veloc;
					if (mag2 > maxSpeed2) {
						maxSpeed2 = mag2;
						dist = body->m_collision->GetBoxMinRadius();
					}
				}

				NEWTON_ASSERT(dist > dgFloat32(0.0f));
				dgInt32 steps = dgInt32(
				                    dgFloat32(4.0f) * dgSqrt(maxSpeed2) * m_timestep / dist) +
				                1;
				dgFloat32 timestep = m_timestep / dgFloat32(steps);

				if (steps > 8) {
					steps = 8;
				}

				for (dgInt32 j = 0; j < steps - 1; j++) {
					m_dynamics->BuildJacobianMatrix(island, m_threadIndex, timestep);
					m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR);
					m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
					                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep,
					                           m_threadIndex, false);

					for (dgInt32 k = 1; k < island.m_bodyCount; k++) {
						dgBody *const body = bodyArray[k].m_body;
						body->m_accel = body->m_prevExternalForce;
						body->m_alpha = body->m_prevExternalTorque;
					}

					for (dgInt32 k = 0; k < island.m_jointCount; k++) {
						dgConstraint *const constraint = constraintArray[k].m_joint;
						if (constraint->GetId() == dgContactConstraintId) {
							dgContact *const contact = (dgContact *)constraint;
							const dgContactMaterial *const material =
							    contact->m_myCacheMaterial;
							if (material->m_flags & dgContactMaterial::m_collisionEnable__) {
								if (material->m_aabbOverlap) {
									NEWTON_ASSERT(contact->m_body0);
									NEWTON_ASSERT(contact->m_body1);
									material->m_aabbOverlap(reinterpret_cast<const NewtonMaterial *>(material), reinterpret_cast<const NewtonBody *>(contact->m_body0),
															reinterpret_cast<const NewtonBody *>(contact->m_body1), m_threadIndex);
								}
								dgCollidingPairCollector::dgPair pair;
								pair.m_body0 = contact->m_body0;
								pair.m_body1 = contact->m_body1;
								pair.m_contact = contact;
								pair.m_material = material;
								pair.m_contactBuffer = contactBuffer;
								m_world->CalculateContacts(&pair, timestep, m_threadIndex);

								NEWTON_ASSERT(pair.m_contact);

								if (pair.m_contactCount) {
									NEWTON_ASSERT(pair.m_contactCount <= (DG_CONSTRAINT_MAX_ROWS / 3));
									m_world->ProcessContacts(&pair, timestep, m_threadIndex);
								} else if (!pair.m_contactBuffer) {
									m_world->ProcessCachedContacts(pair.m_contact,
									                               pair.m_material, timestep, m_threadIndex);
								}
							}
						}
					}
				}
				m_dynamics->BuildJacobianMatrix(island, m_threadIndex, timestep);
				m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR);
				m_dynamics->IntegrateArray(&m_system->m_bodyArray[1],
				                           m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep,
				                           m_threadIndex, true);
			}
		}
	}
}

void dgWorldDynamicUpdate::ReallocBodyMemory(dgInt32 bodyCount) {
	void *memory;
	dgInt32 stride;
	dgInt32 newCount;
	dgBodyInfo *bodyArray;

	stride = sizeof(dgBodyInfo);
	if (bodyCount) {
		m_world->m_bodiesMemorySizeInBytes = m_world->m_bodiesMemorySizeInBytes * 2;
		memory = m_world->GetAllocator()->MallocLow(
		             m_world->m_bodiesMemorySizeInBytes);

		newCount = (m_world->m_bodiesMemorySizeInBytes / stride) & (-4);
		bodyArray = (dgBodyInfo *)memory;

		memcpy(bodyArray, m_bodyArray, bodyCount * sizeof(dgBodyInfo));
		m_world->GetAllocator()->FreeLow(m_world->m_bodiesMemory);

	} else {
		memory = m_world->m_bodiesMemory;
		newCount = (m_world->m_bodiesMemorySizeInBytes / stride) & (-4);
		bodyArray = (dgBodyInfo *)memory;
	}

	m_maxBodiesCount = newCount;
	m_world->m_bodiesMemory = memory;
	m_bodyArray = bodyArray;
	NEWTON_ASSERT((dgUnsigned64(m_bodyArray) & 0x0f) == 0);
}

void dgWorldDynamicUpdate::ReallocIslandMemory(dgInt32 islandCount) {
	void *memory;
	dgInt32 stride;
	dgInt32 newCount;
	dgIsland *islandArray;

	stride = sizeof(dgIsland);
	if (islandCount) {
		m_world->m_islandMemorySizeInBytes = m_world->m_islandMemorySizeInBytes * 2;
		memory = m_world->GetAllocator()->MallocLow(
		             m_world->m_islandMemorySizeInBytes);

		newCount = (m_world->m_islandMemorySizeInBytes / stride) & (-4);

		islandArray = (dgIsland *)memory;

		memcpy(islandArray, m_islandArray, islandCount * sizeof(dgIsland));
		m_world->GetAllocator()->FreeLow(m_world->m_islandMemory);

	} else {
		memory = m_world->m_islandMemory;
		newCount = (m_world->m_islandMemorySizeInBytes / stride) & (-4);
		islandArray = (dgIsland *)memory;
	}

	m_maxIslandCount = newCount;
	m_world->m_islandMemory = memory;

	m_islandArray = islandArray;
	NEWTON_ASSERT((dgUnsigned64(m_islandArray) & 0x0f) == 0);
}

void dgWorldDynamicUpdate::ReallocJointsMemory(dgInt32 count) {
	void *memory;
	dgInt32 stride;
	dgInt32 newCount;
	dgJointInfo *constraintArray;

	stride = sizeof(dgJointInfo);

	if (count) {
		m_world->m_jointsMemorySizeInBytes = m_world->m_jointsMemorySizeInBytes * 2;
		memory = m_world->GetAllocator()->MallocLow(
		             m_world->m_jointsMemorySizeInBytes);
		newCount = (m_world->m_jointsMemorySizeInBytes / stride) & (-4);

		constraintArray = (dgJointInfo *)memory;

		memcpy(constraintArray, m_constraintArray, count * sizeof(dgJointInfo));

		m_world->GetAllocator()->FreeLow(m_world->m_jointsMemory);
	} else {
		memory = m_world->m_jointsMemory;
		newCount = (m_world->m_jointsMemorySizeInBytes / stride) & (-4);

		constraintArray = (dgJointInfo *)memory;
	}

	m_maxJointCount = newCount;
	m_world->m_jointsMemory = memory;

	m_constraintArray = constraintArray;
	NEWTON_ASSERT((dgUnsigned64(m_constraintArray) & 0x0f) == 0);
}

void dgWorldDynamicUpdate::ReallocJacobiansMemory(dgInt32 count,
        dgInt32 threadIndex) {
	void *memory;
	dgInt32 stride;
	dgInt32 newCount;
	dgJacobianPair *Jt;
	dgJacobianPair *JMinv;
	dgFloat32 *force;
	dgFloat32 *accel;
	dgFloat32 *deltaAccel;
	dgFloat32 *deltaForce;
	dgFloat32 *diagDamp;
	dgFloat32 *invDJMinvJt;
	dgFloat32 *restitution;
	dgFloat32 *penetration;
	dgFloat32 *coordenateAccel;
	dgFloat32 *penetrationStiffness;
	dgFloat32 *lowerBoundFrictionCoefficent;
	dgFloat32 *upperBoundFrictionCoefficent;
	dgFloat32 **forceFeedback;
	dgInt32 *normalForceIndex;
	dgInt32 *accelIsMotor;

	stride = sizeof(dgJacobianPair) + // Jt
	         sizeof(dgJacobianPair) + // JtMinv
	         sizeof(dgFloat32) +      // lowerBoundFrictionCoefficent
	         sizeof(dgFloat32) +      // upperBoundFrictionCoefficent
	         sizeof(dgFloat32 *) +    // forceFeedback
	         sizeof(dgFloat32) +      // force
	         sizeof(dgFloat32) +      // accel
	         sizeof(dgFloat32) +      // deltaAccel
	         sizeof(dgFloat32) +      // deltaForce
	         sizeof(dgFloat32) +      // diagDamp
	         sizeof(dgFloat32) +      // invDJMinvJt
	         sizeof(dgFloat32) +      // penetration
	         sizeof(dgFloat32) +      // restitution
	         sizeof(dgFloat32) +      // coordinateAccel
	         sizeof(dgFloat32) +      // penetrationStiffness
	         sizeof(dgInt32) +        // normalForceIndex
	         sizeof(dgInt32) +        // accelIsMotor;
	         0;

	if (count) {
		m_world->m_jacobiansMemorySizeInBytes[threadIndex] =
		    m_world->m_jacobiansMemorySizeInBytes[threadIndex] * 2;
		memory = m_world->GetAllocator()->MallocLow(
		             m_world->m_jacobiansMemorySizeInBytes[threadIndex] + 64);
		newCount = ((m_world->m_jacobiansMemorySizeInBytes[threadIndex] - 16) / stride) & (-8);

		Jt = (dgJacobianPair *)memory;
		JMinv = (dgJacobianPair *)&Jt[newCount];
		force = (dgFloat32 *)&JMinv[newCount];
		force += 4;
		accel = (dgFloat32 *)&force[newCount];
		;
		deltaAccel = (dgFloat32 *)&accel[newCount];
		deltaForce = (dgFloat32 *)&deltaAccel[newCount];
		// forceStep = (dgFloat32*) &deltaForce [newCount];
		diagDamp = (dgFloat32 *)&deltaForce[newCount];
		coordenateAccel = (dgFloat32 *)&diagDamp[newCount];
		restitution = (dgFloat32 *)&coordenateAccel[newCount];
		penetration = (dgFloat32 *)&restitution[newCount];
		penetrationStiffness = (dgFloat32 *)&penetration[newCount];
		invDJMinvJt = (dgFloat32 *)&penetrationStiffness[newCount];
		lowerBoundFrictionCoefficent = (dgFloat32 *)&invDJMinvJt[newCount];
		upperBoundFrictionCoefficent =
		    (dgFloat32 *)&lowerBoundFrictionCoefficent[newCount];
		forceFeedback = (dgFloat32 **)&upperBoundFrictionCoefficent[newCount];
		normalForceIndex = (dgInt32 *)&forceFeedback[newCount];
		accelIsMotor = (dgInt32 *)&normalForceIndex[newCount];

		m_world->GetAllocator()->FreeLow(m_world->m_jacobiansMemory[threadIndex]);

	} else {
		memory = m_world->m_jacobiansMemory[threadIndex];
		newCount = ((m_world->m_jacobiansMemorySizeInBytes[threadIndex] - 16) / stride) & (-8);

		Jt = (dgJacobianPair *)memory;
		JMinv = (dgJacobianPair *)&Jt[newCount];
		force = (dgFloat32 *)&JMinv[newCount];
		force += 4;
		accel = (dgFloat32 *)&force[newCount];
		;
		deltaAccel = (dgFloat32 *)&accel[newCount];
		deltaForce = (dgFloat32 *)&deltaAccel[newCount];
		diagDamp = (dgFloat32 *)&deltaForce[newCount];
		coordenateAccel = (dgFloat32 *)&diagDamp[newCount];
		restitution = (dgFloat32 *)&coordenateAccel[newCount];
		penetration = (dgFloat32 *)&restitution[newCount];
		penetrationStiffness = (dgFloat32 *)&penetration[newCount];
		invDJMinvJt = (dgFloat32 *)&penetrationStiffness[newCount];
		lowerBoundFrictionCoefficent = (dgFloat32 *)&invDJMinvJt[newCount];
		upperBoundFrictionCoefficent =
		    (dgFloat32 *)&lowerBoundFrictionCoefficent[newCount];
		forceFeedback = (dgFloat32 **)&upperBoundFrictionCoefficent[newCount];
		normalForceIndex = (dgInt32 *)&forceFeedback[newCount];
		accelIsMotor = (dgInt32 *)&normalForceIndex[newCount];
	}

	m_solverMemory[threadIndex].m_maxJacobiansCount = newCount;
	m_world->m_jacobiansMemory[threadIndex] = memory;

	force[-1] = dgFloat32(1.0f);
	force[-2] = dgFloat32(1.0f);
	force[-3] = dgFloat32(1.0f);
	force[-4] = dgFloat32(1.0f);

	m_solverMemory[threadIndex].m_Jt = Jt;
	m_solverMemory[threadIndex].m_JMinv = JMinv;
	m_solverMemory[threadIndex].m_force = force;
	m_solverMemory[threadIndex].m_accel = accel;
	m_solverMemory[threadIndex].m_deltaForce = deltaForce;
	m_solverMemory[threadIndex].m_deltaAccel = deltaAccel;
	m_solverMemory[threadIndex].m_diagDamp = diagDamp;
	m_solverMemory[threadIndex].m_invDJMinvJt = invDJMinvJt;
	m_solverMemory[threadIndex].m_penetration = penetration;
	m_solverMemory[threadIndex].m_restitution = restitution;
	m_solverMemory[threadIndex].m_coordenateAccel = coordenateAccel;
	m_solverMemory[threadIndex].m_penetrationStiffness = penetrationStiffness;
	m_solverMemory[threadIndex].m_lowerBoundFrictionCoefficent =
	    lowerBoundFrictionCoefficent;
	m_solverMemory[threadIndex].m_upperBoundFrictionCoefficent =
	    upperBoundFrictionCoefficent;
	m_solverMemory[threadIndex].m_jointFeebackForce = forceFeedback;
	m_solverMemory[threadIndex].m_normalForceIndex = normalForceIndex;
	m_solverMemory[threadIndex].m_accelIsMotor = accelIsMotor;

	NEWTON_ASSERT((dgUnsigned64(Jt) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(JMinv) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(force) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(accel) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(deltaAccel) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(deltaForce) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(diagDamp) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(penetration) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(restitution) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(invDJMinvJt) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(coordenateAccel) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(penetrationStiffness) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(lowerBoundFrictionCoefficent) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(upperBoundFrictionCoefficent) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(forceFeedback) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(normalForceIndex) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(accelIsMotor) & 0x0f) == 0);
}

void dgWorldDynamicUpdate::ReallocIntenalForcesMemory(dgInt32 count,
        dgInt32 threadIndex) {
	dgInt32 stride;
	dgInt32 newCount;
	void *memory;
	dgInt32 *treadLocks;
	dgJacobian *internalForces;
	dgJacobian *internalVeloc;

	stride = sizeof(dgJacobian) + sizeof(dgJacobian) + sizeof(dgInt32) + 0;

	if (count) {
		m_world->m_internalForcesMemorySizeInBytes[threadIndex] =
		    m_world->m_internalForcesMemorySizeInBytes[threadIndex] * 2;
		memory = m_world->GetAllocator()->MallocLow(
		             m_world->m_internalForcesMemorySizeInBytes[threadIndex]);
		//      newCount = m_world->m_internalForcesMemorySizeInBytes[threadIndex] / stride;
		newCount = ((m_world->m_internalForcesMemorySizeInBytes[threadIndex] - 16) / stride) & (-8);

		internalForces = (dgJacobian *)memory;
		internalVeloc = &internalForces[newCount];
		treadLocks = (dgInt32 *)&internalVeloc[newCount];

		m_world->GetAllocator()->FreeLow(
		    m_world->m_internalForcesMemory[threadIndex]);
	} else {
		memory = m_world->m_internalForcesMemory[threadIndex];
		newCount = ((m_world->m_internalForcesMemorySizeInBytes[threadIndex] - 16) / stride) & (-8);

		internalForces = (dgJacobian *)memory;
		internalVeloc = &internalForces[newCount];
		treadLocks = (dgInt32 *)&internalVeloc[newCount];
	}

	NEWTON_ASSERT(
	    (((dgInt8 *)&treadLocks[newCount - 1]) - ((dgInt8 *)memory)) < m_world->m_internalForcesMemorySizeInBytes[threadIndex]);

	m_solverMemory[threadIndex].m_maxBodiesCount = newCount;
	m_world->m_internalForcesMemory[threadIndex] = memory;

	m_solverMemory[threadIndex].m_internalForces = internalForces;
	m_solverMemory[threadIndex].m_internalVeloc = internalVeloc;
	m_solverMemory[threadIndex].m_treadLocks = treadLocks;
	memset(treadLocks, 0, newCount * sizeof(dgInt32));
	// m_solverMemory[threadIndex].m_bodyFreeze = bodyFreeze;

	NEWTON_ASSERT((dgUnsigned64(internalForces) & 0x0f) == 0);
	NEWTON_ASSERT((dgUnsigned64(internalVeloc) & 0x0f) == 0);
}

void dgWorldDynamicUpdate::BuildIsland(dgQueue<dgBody *> &queue,
                                       dgInt32 jointCount, dgInt32 hasUnilateralJoints,
                                       dgInt32 isContinueCollisionIsland) {
	dgInt32 bodyCount = 1;
	dgInt32 lruMark = m_markLru;

	if (m_bodies >= m_maxBodiesCount) {
		ReallocBodyMemory(m_bodies);
	}

	m_bodyArray[m_bodies].m_body = m_world->m_sentionelBody;
	NEWTON_ASSERT(m_world->m_sentionelBody->m_index == 0);
	NEWTON_ASSERT(dgInt32(m_world->m_sentionelBody->m_dynamicsLru) == m_markLru);
	while (!queue.IsEmpty()) {
		dgInt32 count = queue.m_firstIndex - queue.m_lastIndex;
		if (count < 0) {
			NEWTON_ASSERT(0);
			count += queue.m_mod;
		}

		dgInt32 index = queue.m_lastIndex;
		queue.Reset();

		for (dgInt32 j = 0; j < count; j++) {
			dgBody *const body = queue.m_pool[index];
			NEWTON_ASSERT(body);
			NEWTON_ASSERT(dgInt32(body->m_dynamicsLru) == lruMark);
			NEWTON_ASSERT(body->m_masterNode);

			if (body->m_invMass.m_w > dgFloat32(0.0f)) {
				dgInt32 bodyIndex = m_bodies + bodyCount;
				if (bodyIndex >= m_maxBodiesCount) {
					ReallocBodyMemory(bodyIndex);
				}
				body->m_index = bodyCount;
				m_bodyArray[bodyIndex].m_body = body;

				bodyCount++;
			}
			for (dgBodyMasterListRow::dgListNode *jointNode =
			            body->m_masterNode->GetInfo().GetFirst();
			        jointNode; jointNode =
			            jointNode->GetNext()) {
				dgBodyMasterListCell &cell = jointNode->GetInfo();

				dgBody *const bodyN = cell.m_bodyNode;
				dgConstraint *const constraint = cell.m_joint;
				if (dgInt32(constraint->m_dynamicsLru) != lruMark) {
					constraint->m_dynamicsLru = dgUnsigned32(lruMark);

					dgInt32 jointIndex = m_joints + jointCount;
					if (jointIndex >= m_maxJointCount) {
						ReallocJointsMemory(jointIndex);
					}

					if (constraint->m_isUnilateral) {
						hasUnilateralJoints = 1;
						NEWTON_ASSERT(
						    (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody));
					}

					constraint->m_index = dgUnsigned32(jointCount);
					m_constraintArray[jointIndex].m_joint = constraint;

					jointCount++;

					NEWTON_ASSERT(constraint->m_body0);
					NEWTON_ASSERT(constraint->m_body1);
				}

				if (dgInt32(bodyN->m_dynamicsLru) != lruMark) {
					if (bodyN->m_invMass.m_w > dgFloat32(0.0f)) {
						queue.Insert(bodyN);
						bodyN->m_dynamicsLru = dgUnsigned32(lruMark);
					}
				}
			}

			index++;
			if (index >= queue.m_mod) {
				NEWTON_ASSERT(0);
				index = 0;
			}
		}
	}

	if (bodyCount > 1) {
		if (m_islands >= m_maxIslandCount) {
			ReallocIslandMemory(m_islands);
		}

		m_islandArray[m_islands].m_bodyStart = m_bodies;
		m_islandArray[m_islands].m_jointStart = m_joints;
		m_islandArray[m_islands].m_bodyCount = bodyCount;
		m_islandArray[m_islands].m_jointCount = jointCount;
		m_islandArray[m_islands].m_hasUnilateralJoints = hasUnilateralJoints;
		m_islandArray[m_islands].m_isContinueCollision = isContinueCollisionIsland;
		m_islands++;
		m_bodies += bodyCount;
		m_joints += jointCount;
	}
}

void dgWorldDynamicUpdate::SpanningTree(dgBody *const body) {
	dgInt32 bodyCount = 0;
	dgInt32 jointCount = 0;
	dgInt32 staticCount = 0;
	dgInt32 lruMark = m_markLru - 1;
	dgInt32 isInWorld = 0;
	dgInt32 isInEquilibrium = 1;
	dgInt32 hasUnilateralJoints = 0;
	dgInt32 isContinueCollisionIsland = 0;
	dgBody *heaviestBody = NULL;
	dgFloat32 haviestMass = dgFloat32(0.0f);

	dgQueue<dgBody *> queue(
	    (dgBody **)m_world->m_pairMemoryBuffer,
	    (dgInt32(m_world->m_pairMemoryBufferSizeInBytes >> 1)) / dgInt32(sizeof(void *)));
	dgBody **const staticPool = &queue.m_pool[queue.m_mod];

	body->m_dynamicsLru = dgUnsigned32(lruMark);

	queue.Insert(body);
	while (!queue.IsEmpty()) {
		dgInt32 count = queue.m_firstIndex - queue.m_lastIndex;
		if (count < 0) {
			NEWTON_ASSERT(0);
			count += queue.m_mod;
		}

		dgInt32 index = queue.m_lastIndex;
		queue.Reset();
		for (dgInt32 j = 0; j < count; j++) {

			dgBody *const srcBody = queue.m_pool[index];
			NEWTON_ASSERT(srcBody);
			NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f));
			NEWTON_ASSERT(dgInt32(srcBody->m_dynamicsLru) == lruMark);
			NEWTON_ASSERT(srcBody->m_masterNode);

			dgInt32 bodyIndex = m_bodies + bodyCount;
			if (bodyIndex >= m_maxBodiesCount) {
				ReallocBodyMemory(bodyIndex);
			}
			m_bodyArray[bodyIndex].m_body = srcBody;

			isInWorld |= srcBody->m_isInWorld;
			isInEquilibrium &= srcBody->m_equilibrium;
			isInEquilibrium &= srcBody->m_autoSleep;

			// isInEquilibrium = 0;

			srcBody->m_sleeping = false;
			isContinueCollisionIsland |= srcBody->m_solverInContinueCollision;

			if (srcBody->m_mass.m_w > haviestMass) {
				haviestMass = srcBody->m_mass.m_w;
				heaviestBody = srcBody;
			}

			bodyCount++;
			for (dgBodyMasterListRow::dgListNode *jointNode =
			            srcBody->m_masterNode->GetInfo().GetFirst();
			        jointNode; jointNode =
			            jointNode->GetNext()) {
				dgBodyMasterListCell &cell = jointNode->GetInfo();

				dgBody *const bodyN = cell.m_bodyNode;
				if (dgInt32(bodyN->m_dynamicsLru) < lruMark) {
					NEWTON_ASSERT(bodyN == cell.m_bodyNode);
					bodyN->m_dynamicsLru = dgUnsigned32(lruMark);

					if (bodyN->m_invMass.m_w > dgFloat32(0.0f)) {
						queue.Insert(bodyN);
					} else {

						dgInt32 duplicateBody = 0;
						for (; duplicateBody < staticCount; duplicateBody++) {
							if (staticPool[duplicateBody] == srcBody) {
								break;
							}
						}
						if (duplicateBody == staticCount) {
							staticPool[staticCount] = srcBody;
							staticCount++;
							NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f));
						}

						dgConstraint *const constraint = cell.m_joint;
						NEWTON_ASSERT(dgInt32(constraint->m_dynamicsLru) != m_markLru);

						dgInt32 jointIndex = m_joints + jointCount;
						if (jointIndex >= m_maxJointCount) {
							ReallocJointsMemory(jointIndex);
						}

						if (constraint->m_isUnilateral) {
							hasUnilateralJoints = 1;
							NEWTON_ASSERT(
							    (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody));
						}

						constraint->m_index = dgUnsigned32(jointCount);
						m_constraintArray[jointIndex].m_joint = constraint;

						jointCount++;

						NEWTON_ASSERT(constraint->m_body0);
						NEWTON_ASSERT(constraint->m_body1);
					}
				} else if (cell.m_bodyNode->m_invMass.m_w == dgFloat32(0.0f)) {
					dgInt32 duplicateBody = 0;
					for (; duplicateBody < staticCount; duplicateBody++) {
						if (staticPool[duplicateBody] == srcBody) {
							break;
						}
					}
					if (duplicateBody == staticCount) {
						staticPool[staticCount] = srcBody;
						staticCount++;
						NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f));
					}

					dgConstraint *const constraint = cell.m_joint;
					NEWTON_ASSERT(dgInt32(constraint->m_dynamicsLru) != m_markLru);

					dgInt32 jointIndex = m_joints + jointCount;
					if (jointIndex >= m_maxJointCount) {
						ReallocJointsMemory(jointIndex);
					}

					if (constraint->m_isUnilateral) {
						hasUnilateralJoints = 1;
						NEWTON_ASSERT(
						    (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody));
					}

					constraint->m_index = dgUnsigned32(jointCount);
					m_constraintArray[jointIndex].m_joint = constraint;

					jointCount++;

					NEWTON_ASSERT(constraint->m_body0);
					NEWTON_ASSERT(constraint->m_body1);
				}
			}

			index++;
			if (index >= queue.m_mod) {
				NEWTON_ASSERT(0);
				index = 0;
			}
		}
	}

	if (isInEquilibrium | !isInWorld) {
		for (dgInt32 i = 0; i < bodyCount; i++) {
			dgBody *const bodyI = m_bodyArray[m_bodies + i].m_body;
			bodyI->m_dynamicsLru = dgUnsigned32(m_markLru);
			bodyI->m_sleeping = true;
		}
	} else {
		if (m_world->m_islandUpdate) {
			dgIslandCallbackStruct record;
			record.m_world = m_world;
			record.m_count = bodyCount;
			record.m_strideInByte = sizeof(dgBodyInfo);
			record.m_bodyArray = &m_bodyArray[m_bodies].m_body;
			if (!m_world->m_islandUpdate(m_world, &record, bodyCount)) {
				for (dgInt32 i = 0; i < bodyCount; i++) {
					dgBody *const bodyI = m_bodyArray[m_bodies + i].m_body;
					bodyI->m_dynamicsLru = dgUnsigned32(m_markLru);
				}
				return;
			}
		}

		if (staticCount) {
			queue.Reset();
			for (dgInt32 i = 0; i < staticCount; i++) {
				dgBody *const bodyI = staticPool[i];
				bodyI->m_dynamicsLru = dgUnsigned32(m_markLru);
				queue.Insert(bodyI);
				NEWTON_ASSERT(dgInt32(bodyI->m_dynamicsLru) == m_markLru);
			}

			for (dgInt32 i = 0; i < jointCount; i++) {
				dgConstraint *const constraint = m_constraintArray[m_joints + i].m_joint;
				constraint->m_dynamicsLru = dgUnsigned32(m_markLru);
			}
		} else {
			NEWTON_ASSERT(heaviestBody);
			queue.Insert(heaviestBody);
			// body->m_dynamicsLru = m_markLru;
			heaviestBody->m_dynamicsLru = dgUnsigned32(m_markLru);
		}
		BuildIsland(queue, jointCount, hasUnilateralJoints,
		            isContinueCollisionIsland);
	}
}

void dgWorldDynamicUpdate::IntegrateArray(const dgBodyInfo *bodyArray,
        dgInt32 count, dgFloat32 accelTolerance, dgFloat32 timestep,
        dgInt32 threadIndex, bool update) const {
	bool isAutoSleep = true;
	bool stackSleeping = true;
	dgInt32 sleepCounter = 10000;

	dgFloat32 speedFreeze = m_world->m_freezeSpeed2;
	dgFloat32 accelFreeze = m_world->m_freezeAccel2;
	dgFloat32 maxAccel = dgFloat32(0.0f);
	dgFloat32 maxAlpha = dgFloat32(0.0f);
	dgFloat32 maxSpeed = dgFloat32(0.0f);
	dgFloat32 maxOmega = dgFloat32(0.0f);

	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));

	dgFloat32 forceDamp = DG_FREEZZING_VELOCITY_DRAG;
	if (count <= 2) {
		bool autosleep = bodyArray[0].m_body->m_autoSleep;
		if (count == 2) {
			autosleep &= bodyArray[1].m_body->m_autoSleep;
		}
		if (!autosleep) {
			forceDamp = dgFloat32(0.9999f);
		}
	}
	for (dgInt32 i = 0; i < count; i++) {
		dgBody *const body = bodyArray[i].m_body;

		if (body->m_invMass.m_w && body->m_isInWorld) {
			body->IntegrateVelocity(timestep);

			dgFloat32 accel2 = body->m_accel % body->m_accel;
			dgFloat32 alpha2 = body->m_alpha % body->m_alpha;
			dgFloat32 speed2 = body->m_veloc % body->m_veloc;
			dgFloat32 omega2 = body->m_omega % body->m_omega;

			maxAccel = GetMax(maxAccel, accel2);
			maxAlpha = GetMax(maxAlpha, alpha2);
			maxSpeed = GetMax(maxSpeed, speed2);
			maxOmega = GetMax(maxOmega, omega2);

			bool equilibrium = (accel2 < accelFreeze) && (alpha2 < accelFreeze) && (speed2 < speedFreeze) && (omega2 < speedFreeze);
			if (equilibrium) {
				body->m_veloc = body->m_veloc.Scale(forceDamp);
				body->m_omega = body->m_omega.Scale(forceDamp);
			}
			body->m_equilibrium = dgUnsigned32(equilibrium);
			stackSleeping &= equilibrium;
			isAutoSleep &= body->m_autoSleep;

			sleepCounter = GetMin(sleepCounter, body->m_sleepingCounter);
		}
	}

	if (update) {
		for (dgInt32 i = 0; i < count; i++) {
			dgBody *const body = bodyArray[i].m_body;
			if (body->m_invMass.m_w && body->m_isInWorld) {
				body->UpdateMatrix(timestep, threadIndex);
			}
		}

		if (isAutoSleep) {
			if (stackSleeping) {
				for (dgInt32 i = 0; i < count; i++) {
					dgBody *const body = bodyArray[i].m_body;
					body->m_netForce = zero;
					body->m_netTorque = zero;
					body->m_veloc = zero;
					body->m_omega = zero;
				}
			} else {
				if ((maxAccel > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxAccel) || (maxAlpha > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxAlpha) || (maxSpeed > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxVeloc) || (maxOmega > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxOmega)) {
					for (dgInt32 i = 0; i < count; i++) {
						dgBody *const body = bodyArray[i].m_body;
						body->m_sleepingCounter = 0;
					}
				} else {
					dgInt32 index = 0;
					for (dgInt32 i = 0; i < DG_SLEEP_ENTRIES; i++) {
						if ((maxAccel <= m_world->m_sleepTable[i].m_maxAccel) && (maxAlpha <= m_world->m_sleepTable[i].m_maxAlpha) && (maxSpeed <= m_world->m_sleepTable[i].m_maxVeloc) && (maxOmega <= m_world->m_sleepTable[i].m_maxOmega)) {
							index = i;
							break;
						}
					}

					dgInt32 timeScaleSleepCount = dgInt32(
					                                  dgFloat32(60.0f) * sleepCounter * timestep);
					if (timeScaleSleepCount > m_world->m_sleepTable[index].m_steps) {
						for (dgInt32 i = 0; i < count; i++) {
							dgBody *const body = bodyArray[i].m_body;
							body->m_netForce = zero;
							body->m_netTorque = zero;
							body->m_veloc = zero;
							body->m_omega = zero;
							body->m_equilibrium = true;
						}
					} else {
						sleepCounter++;
						for (dgInt32 i = 0; i < count; i++) {
							dgBody *const body = bodyArray[i].m_body;
							body->m_sleepingCounter = sleepCounter;
						}
					}
				}
			}
		}
	}
}

dgInt32 dgWorldDynamicUpdate::GetJacobialDerivatives(const dgIsland &island,
        dgInt32 threadIndex, bool bitMode, dgInt32 rowCount, dgFloat32 timestep) {
	dgContraintDescritor constraintParams;

	dgInt32 jointCount = island.m_jointCount;

	constraintParams.m_world = m_world;
	constraintParams.m_threadIndex = threadIndex;
	constraintParams.m_timestep = timestep;
	constraintParams.m_invTimestep = dgFloat32(1.0f / timestep);

	dgJacobianMemory &solverMemory = m_solverMemory[threadIndex];
	dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart];

	for (dgInt32 j = 0; j < jointCount; j++) {
		dgConstraint *const constraint = constraintArray[j].m_joint;
		;
		if (constraint->m_isUnilateral ^ bitMode) {
			dgInt32 dof = dgInt32(constraint->m_maxDOF);
			NEWTON_ASSERT(dof <= DG_CONSTRAINT_MAX_ROWS);
			for (dgInt32 i = 0; i < dof; i++) {
				constraintParams.m_forceBounds[i].m_low = DG_MIN_BOUND;
				constraintParams.m_forceBounds[i].m_upper = DG_MAX_BOUND;
				constraintParams.m_forceBounds[i].m_jointForce = NULL;
				constraintParams.m_forceBounds[i].m_normalIndex =
				    DG_BILATERAL_CONSTRAINT;
			}

			NEWTON_ASSERT(constraint->m_body0);
			NEWTON_ASSERT(constraint->m_body1);

			constraint->m_body0->m_inCallback = true;
			constraint->m_body1->m_inCallback = true;

			dof = dgInt32(constraint->JacobianDerivative(constraintParams));

			constraint->m_body0->m_inCallback = false;
			constraint->m_body1->m_inCallback = false;

			dgInt32 m0 =
			    (constraint->m_body0->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body0->m_index : 0;
			dgInt32 m1 =
			    (constraint->m_body1->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body1->m_index : 0;

			NEWTON_ASSERT(constraint->m_index == dgUnsigned32(j));
			constraintArray[j].m_autoPairstart = rowCount;
			constraintArray[j].m_autoPaircount = dof;
			constraintArray[j].m_autoPairActiveCount = dof;
			constraintArray[j].m_m0 = m0;
			constraintArray[j].m_m1 = m1;

			for (dgInt32 i = 0; i < dof; i++) {
				NEWTON_ASSERT(constraintParams.m_forceBounds[i].m_jointForce);
				solverMemory.m_Jt[rowCount] = constraintParams.m_jacobian[i];

				NEWTON_ASSERT(constraintParams.m_jointStiffness[i] >= dgFloat32(0.1f));
				NEWTON_ASSERT(constraintParams.m_jointStiffness[i] <= dgFloat32(100.0f));
				solverMemory.m_diagDamp[rowCount] =
				    constraintParams.m_jointStiffness[i];
				solverMemory.m_coordenateAccel[rowCount] =
				    constraintParams.m_jointAccel[i];
				solverMemory.m_accelIsMotor[rowCount] = dgInt32(
				        constraintParams.m_isMotor[i]);
				solverMemory.m_restitution[rowCount] =
				    constraintParams.m_restitution[i];
				solverMemory.m_penetration[rowCount] =
				    constraintParams.m_penetration[i];
				solverMemory.m_penetrationStiffness[rowCount] =
				    constraintParams.m_penetrationStiffness[i];

				solverMemory.m_lowerBoundFrictionCoefficent[rowCount] =
				    constraintParams.m_forceBounds[i].m_low;
				solverMemory.m_upperBoundFrictionCoefficent[rowCount] =
				    constraintParams.m_forceBounds[i].m_upper;
				solverMemory.m_jointFeebackForce[rowCount] =
				    constraintParams.m_forceBounds[i].m_jointForce;
				solverMemory.m_normalForceIndex[rowCount] =
				    constraintParams.m_forceBounds[i].m_normalIndex + ((constraintParams.m_forceBounds[i].m_normalIndex >= 0) ? (rowCount - i) : 0);

				rowCount++;
			}

#ifdef _DEBUG
			for (dgInt32 i = 0; i < ((rowCount + 3) & 0xfffc) - rowCount; i++) {
				solverMemory.m_diagDamp[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_coordenateAccel[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_restitution[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_penetration[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_penetrationStiffness[rowCount + i] = dgFloat32(0.0f);

				solverMemory.m_lowerBoundFrictionCoefficent[rowCount + i] = dgFloat32(
				            0.0f);
				solverMemory.m_upperBoundFrictionCoefficent[rowCount + i] = dgFloat32(
				            0.0f);
				solverMemory.m_jointFeebackForce[rowCount + i] = 0;
				solverMemory.m_normalForceIndex[rowCount + i] = 0;
			}
#endif

			rowCount =
			    (rowCount & (DG_SIMD_WORD_SIZE - 1)) ? ((rowCount & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : rowCount;
			NEWTON_ASSERT((rowCount & (DG_SIMD_WORD_SIZE - 1)) == 0);
		}
	}
	return rowCount;
}

void dgWorldDynamicUpdate::BuildJacobianMatrixSimd(const dgIsland &island,
        dgInt32 threadIndex, dgFloat32 timestep) {
#ifdef DG_BUILD_SIMD_CODE
	//  dgInt32 threads;
	//  dgInt32 rowCount;
	//  dgInt32 bodyCount;
	//  dgInt32 jointCount;

	//  dgContraintDescritor constraintParams;
	//  dgVector zeroVector (dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f));

	NEWTON_ASSERT(island.m_bodyCount >= 2);
	//NEWTON_ASSERT (island.m_jointCount >= 1);

	//  dgInt32 threads = m_world->m_numberOfTheads;
	dgJacobianMemory &solverMemory = m_solverMemory[threadIndex];

	dgInt32 bodyCount = island.m_bodyCount;
	dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart];

	bodyArray[0].m_body = m_world->GetSentinelBody();

	// body = bodyArray[0].m_body;
	// body->m_hasRelativeVelocity = false;
	// body->m_accel = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	// body->m_alpha = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f));
	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f));

	for (dgInt32 i = 1; i < bodyCount; i++) {
		dgBody *const body = bodyArray[i].m_body;
		NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f));

		// body->m_hasRelativeVelocity = false;
		// body->AddGyroscopicTorque();
		body->AddDamingAcceleration();
		body->CalcInvInertiaMatrixSimd();
	}

	while (bodyCount >= solverMemory.m_maxBodiesCount) {
		m_world->dgGetUserLock();
		ReallocIntenalForcesMemory(bodyCount, threadIndex);
		m_world->dgReleasedUserLock();
	}

	dgInt32 jointCount = island.m_jointCount;
	// constraintArray = &m_constraintArray[island.m_jointStart];
	dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart];
	solverMemory.m_constraintArray = constraintArray;

	dgInt32 rowCount = 0;
	dgFloat32 maxRowCount = 0;
	for (dgInt32 j = 0; j < jointCount; j++) {
		dgConstraint *const constraint = constraintArray[j].m_joint;
		dgInt32 dof =
		    dgInt32(
		        (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF);
		maxRowCount += dof;
	}

	while (maxRowCount > solverMemory.m_maxJacobiansCount) {
		m_world->dgGetUserLock();
		ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, threadIndex);
		m_world->dgReleasedUserLock();
	}

	if (island.m_hasUnilateralJoints) {
		rowCount = GetJacobialDerivatives(island, threadIndex, false, rowCount,
		                                  timestep);
	}
	rowCount = GetJacobialDerivatives(island, threadIndex, true, rowCount,
	                                  timestep);

	solverMemory.m_rowCount = rowCount;
	solverMemory.m_bodyCount = bodyCount;
	solverMemory.m_bodyArray = bodyArray;
	solverMemory.m_jointCount = jointCount;
	solverMemory.m_timeStep = timestep;
	solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep;

	dgFloat32 *const force = solverMemory.m_force;
	const dgJacobianPair *const Jt = solverMemory.m_Jt;
	dgJacobianPair *const JMinv = solverMemory.m_JMinv;
	dgFloat32 *const diagDamp = solverMemory.m_diagDamp;
	dgFloat32 *const extAccel = solverMemory.m_deltaAccel;
	dgFloat32 *const invDJMinvJt = solverMemory.m_invDJMinvJt;
	dgFloat32 *const coordenateAccel = solverMemory.m_coordenateAccel;
	dgFloat32 **const jointForceFeeback = solverMemory.m_jointFeebackForce;
	//  dgJacobianIndex* const jacobianIndexArray = solverMemory.m_jacobianIndexArray;
	//  dgBilateralBounds* const bilateralForceBounds = solverMemory.m_bilateralForceBounds;

	simd_type zero;
	zero = simd_set1(dgFloat32(0.0f));

	for (dgInt32 k = 0; k < jointCount; k++) {
		//      dgInt32 m0;
		//      dgInt32 m1;
		//      dgInt32 index;
		//      dgInt32 count;
		//      dgFloat32 diag;
		//      dgFloat32 stiffness;
		//      simd_type tmp0;
		//      simd_type tmp1;
		//      simd_type invMass0;
		//      simd_type invMass1;
		//      simd_type tmpDiag;
		//      simd_type tmpAccel;
		//      dgBody* body0;
		//      dgBody* body1;

		dgInt32 index = constraintArray[k].m_autoPairstart;
		dgInt32 count = constraintArray[k].m_autoPaircount;
		dgInt32 m0 = constraintArray[k].m_m0;
		dgInt32 m1 = constraintArray[k].m_m1;
		NEWTON_ASSERT(m0 >= 0);
		NEWTON_ASSERT(m0 < bodyCount);
		NEWTON_ASSERT(m1 >= 0);
		NEWTON_ASSERT(m1 < bodyCount);

		dgBody *const body0 = bodyArray[m0].m_body;
		// invMass0 = body0->m_invMass[3];
		simd_type invMass0 = simd_set1(body0->m_invMass[3]);
		NEWTON_ASSERT((dgUnsigned64(&body0->m_invWorldInertiaMatrix) & 0x0f) == 0);
		const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix;

		dgBody *const body1 = bodyArray[m1].m_body;
		// invMass1 = body1->m_invMass[3];
		simd_type invMass1 = simd_set1(body1->m_invMass[3]);
		NEWTON_ASSERT((dgUnsigned64(&body1->m_invWorldInertiaMatrix) & 0x0f) == 0);
		const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix;
		for (dgInt32 i = 0; i < count; i++) {
			// JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale (invMass0);
			// JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector (Jt[index].m_jacobian_IM0.m_angular);
			// dgVector tmpDiag (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(Jt[index].m_jacobian_IM0.m_linear));
			// tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(Jt[index].m_jacobian_IM0.m_angular);
			// dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel));
			// tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(body0->m_alpha);

			((simd_type &)JMinv[index].m_jacobian_IM0.m_linear) =
			    simd_mul_v((simd_type &)Jt[index].m_jacobian_IM0.m_linear, invMass0);
			simd_type tmp0 = (simd_type &)Jt[index].m_jacobian_IM0.m_angular;
			simd_type tmp1 =
			    simd_mul_v((simd_type &)invInertia0.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0)));
			tmp1 =
			    simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1)));
			((simd_type &)JMinv[index].m_jacobian_IM0.m_angular) =
			    simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2)));
			simd_type tmpDiag =
			    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear);
			tmpDiag =
			    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular);
			simd_type tmpAccel =
			    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)body0->m_accel);
			tmpAccel =
			    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)body0->m_alpha);

			// JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale (invMass1);
			// JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector (Jt[index].m_jacobian_IM1.m_angular);
			// tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(Jt[index].m_jacobian_IM1.m_linear);
			// tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(Jt[index].m_jacobian_IM1.m_angular);
			// tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(body1->m_accel);
			// tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(body1->m_alpha);

			((simd_type &)JMinv[index].m_jacobian_IM1.m_linear) =
			    simd_mul_v((simd_type &)Jt[index].m_jacobian_IM1.m_linear, invMass1);
			tmp0 = (simd_type &)Jt[index].m_jacobian_IM1.m_angular;
			tmp1 =
			    simd_mul_v((simd_type &)invInertia1.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0)));
			tmp1 =
			    simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1)));
			((simd_type &)JMinv[index].m_jacobian_IM1.m_angular) =
			    simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2)));
			tmpDiag =
			    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear);
			tmpDiag =
			    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular);
			tmpAccel =
			    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)body1->m_accel);
			tmpAccel =
			    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)body1->m_alpha);

			// coordenateAccel[index] -= (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z);
			//NEWTON_ASSERT(tmpAccel.m128_f32[3] == dgFloat32 (0.0f));
			tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel));
			tmpAccel =
			    simd_sub_s(zero, simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1))));

			simd_store_s(tmpAccel, &extAccel[index]);
			simd_store_s(simd_add_s(simd_load_s(coordenateAccel[index]), tmpAccel),
			             &coordenateAccel[index]);

			// force[index] = bilateralForceBounds[index].m_jointForce[0];
			force[index] = jointForceFeeback[index][0];

			NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f));
			NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f));
			dgFloat32 stiffness = DG_PSD_DAMP_TOL * diagDamp[index];

			// diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z);
			tmpDiag = simd_add_v(tmpDiag, simd_move_hl_v(tmpDiag, tmpDiag));
			dgFloat32 diag;
			simd_store_s(
			    simd_add_s(tmpDiag, simd_permut_v(tmpDiag, tmpDiag, PURMUT_MASK(3, 3, 3, 1))),
			    &diag);
			NEWTON_ASSERT(diag > dgFloat32(0.0f));
			diagDamp[index] = diag * stiffness;

			diag *= (dgFloat32(1.0f) + stiffness);
			invDJMinvJt[index] = dgFloat32(1.0f) / diag;

			index++;
		}
	}

#endif
}

void dgWorldDynamicUpdate::BuildJacobianMatrix(const dgIsland &island,
        dgInt32 threadIndex, dgFloat32 timestep) {

	NEWTON_ASSERT(island.m_bodyCount >= 2);
	//NEWTON_ASSERT (island.m_jointCount >= 1);

	dgJacobianMemory &solverMemory = m_solverMemory[threadIndex];

	dgInt32 bodyCount = island.m_bodyCount;
	dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart];

	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f));
	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f));

	for (dgInt32 i = 1; i < bodyCount; i++) {
		dgBody *const body = bodyArray[i].m_body;
		NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f));
		body->AddDamingAcceleration();
		body->CalcInvInertiaMatrix();
	}

	while (bodyCount >= solverMemory.m_maxBodiesCount) {
		m_world->dgGetUserLock();
		ReallocIntenalForcesMemory(bodyCount, threadIndex);
		m_world->dgReleasedUserLock();
	}

	dgInt32 jointCount = island.m_jointCount;
	dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart];
	solverMemory.m_constraintArray = constraintArray;

	dgInt32 maxRowCount = 0;
	for (dgInt32 j = 0; j < jointCount; j++) {
		dgConstraint *const constraint = constraintArray[j].m_joint;
		;
		dgInt32 dof =
		    dgInt32(
		        (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF);
		maxRowCount += dof;
	}

	while (maxRowCount > solverMemory.m_maxJacobiansCount) {
		m_world->dgGetUserLock();
		ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, threadIndex);
		m_world->dgReleasedUserLock();
	}

	dgInt32 rowCount = 0;
	if (island.m_hasUnilateralJoints) {
		rowCount = GetJacobialDerivatives(island, threadIndex, false, rowCount,
		                                  timestep);
	}
	rowCount = GetJacobialDerivatives(island, threadIndex, true, rowCount,
	                                  timestep);

	solverMemory.m_rowCount = rowCount;
	solverMemory.m_bodyCount = bodyCount;
	solverMemory.m_bodyArray = bodyArray;
	solverMemory.m_jointCount = jointCount;
	solverMemory.m_timeStep = timestep;
	solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep;

	dgFloat32 *const force = solverMemory.m_force;
	const dgJacobianPair *const Jt = solverMemory.m_Jt;
	dgJacobianPair *const JMinv = solverMemory.m_JMinv;
	dgFloat32 *const diagDamp = solverMemory.m_diagDamp;
	dgFloat32 *const extAccel = solverMemory.m_deltaAccel;
	dgFloat32 *const invDJMinvJt = solverMemory.m_invDJMinvJt;
	dgFloat32 *const coordenateAccel = solverMemory.m_coordenateAccel;
	dgFloat32 **const jointForceFeeback = solverMemory.m_jointFeebackForce;
	//  dgInt32* const accelIsMotor = solverMemory.m_accelIsMotor;

	for (dgInt32 k = 0; k < jointCount; k++) {
		dgInt32 index = constraintArray[k].m_autoPairstart;
		dgInt32 count = constraintArray[k].m_autoPaircount;
		dgInt32 m0 = constraintArray[k].m_m0;
		dgInt32 m1 = constraintArray[k].m_m1;

		NEWTON_ASSERT(m0 >= 0);
		NEWTON_ASSERT(m0 < bodyCount);
		dgBody *const body0 = bodyArray[m0].m_body;
		dgFloat32 invMass0 = body0->m_invMass[3];
		const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix;

		NEWTON_ASSERT(m1 >= 0);
		NEWTON_ASSERT(m1 < bodyCount);
		dgBody *const body1 = bodyArray[m1].m_body;
		dgFloat32 invMass1 = body1->m_invMass[3];
		const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix;
		for (dgInt32 i = 0; i < count; i++) {
			// dgFloat32 extenalAcceleration;

			JMinv[index].m_jacobian_IM0.m_linear =
			    Jt[index].m_jacobian_IM0.m_linear.Scale(invMass0);
			JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector(
			        Jt[index].m_jacobian_IM0.m_angular);
			dgVector tmpDiag(
			    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(
			        Jt[index].m_jacobian_IM0.m_linear));
			tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(
			               Jt[index].m_jacobian_IM0.m_angular);
			dgVector tmpAccel(
			    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel));
			tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(
			                body0->m_alpha);

			JMinv[index].m_jacobian_IM1.m_linear =
			    Jt[index].m_jacobian_IM1.m_linear.Scale(invMass1);
			JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector(
			        Jt[index].m_jacobian_IM1.m_angular);
			tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(
			               Jt[index].m_jacobian_IM1.m_linear);
			tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(
			               Jt[index].m_jacobian_IM1.m_angular);
			tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(
			                body1->m_accel);
			tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(
			                body1->m_alpha);

			dgFloat32 extenalAcceleration = -(tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z);
			extAccel[index] = extenalAcceleration;
			coordenateAccel[index] += extenalAcceleration;
			force[index] = jointForceFeeback[index][0];

			// force[index] = 0.0f;

			NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f));
			NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f));
			dgFloat32 stiffness = DG_PSD_DAMP_TOL * diagDamp[index];

			dgFloat32 diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z);
			NEWTON_ASSERT(diag > dgFloat32(0.0f));
			diagDamp[index] = diag * stiffness;

			diag *= (dgFloat32(1.0f) + stiffness);
			// solverMemory.m_diagJMinvJt[index] = diag;
			invDJMinvJt[index] = dgFloat32(1.0f) / diag;

			index++;
		}
	}
}

void dgJacobianMemory::CalculateReactionsForcesSimd(dgInt32 solverMode,
        dgFloat32 tolerance) const {
#ifdef DG_BUILD_SIMD_CODE

	if (m_jointCount == 0) {
		// ApplyExternalForcesAndAccelerationSimd (tolerance);
		ApplyExternalForcesAndAccelerationSimd(0.0f);
		return;
	}

	if (m_jointCount == 1) {
		CalculateSimpleBodyReactionsForcesSimd(tolerance);
		ApplyExternalForcesAndAccelerationSimd(tolerance * 0.001f);
		return;
	}

	if (solverMode) {
		CalculateForcesGameModeSimd(solverMode, tolerance);
	} else {
		CalculateForcesSimulationModeSimd(tolerance);
	}
#endif
}

void dgJacobianMemory::CalculateReactionsForces(dgInt32 solverMode,
        dgFloat32 tolerance) const {
	if (m_jointCount == 0) {
		// ApplyExternalForcesAndAcceleration (tolerance);
		ApplyExternalForcesAndAcceleration(0.0f);
		return;
	}

	if (m_jointCount == 1) {
		CalculateSimpleBodyReactionsForces(tolerance);
		ApplyExternalForcesAndAcceleration(tolerance * 0.001f);
		return;
	}

	if (solverMode) {
		CalculateForcesGameMode(solverMode, tolerance);
	} else {
		CalculateForcesSimulationMode(tolerance);
	}
}

void dgJacobianMemory::ApplyExternalForcesAndAccelerationSimd(
    dgFloat32 tolerance) const {
#ifdef DG_BUILD_SIMD_CODE
	//  dgInt32 hasJointFeeback;
	//  simd_type zero;
	//  simd_type absMask;
	//  simd_type timeStep;
	//  simd_type toleranceSimd;

	dgFloat32 *const force = m_force;
	dgJacobian *const internalForces = m_internalForces;
	const dgJacobianPair *const Jt = m_Jt;
	const dgBodyInfo *const bodyArray = m_bodyArray;
	const dgJointInfo *const constraintArray = m_constraintArray;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;

	simd_type zero = simd_set1(dgFloat32(0.0f));

	for (dgInt32 i = 0; i < m_bodyCount; i++) {
		(simd_type &)internalForces[i].m_linear = zero;
		(simd_type &)internalForces[i].m_angular = zero;
	}

	//  dgInt32 tmp = 0x7fffffff;
	dgInt32 hasJointFeeback = 0;
	//  simd_type absMask = simd_set1(*((dgFloat32*)&tmp));
	simd_type timeStep = simd_set1(m_timeStep);
	simd_type toleranceSimd = simd_set1(tolerance);
	toleranceSimd = simd_mul_s(toleranceSimd, toleranceSimd);
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		//      dgInt32 m0;
		//      dgInt32 m1;
		//      dgInt32 first;
		//      dgInt32 count;
		//      dgInt32 index;
		//      simd_type y0_linear;
		//      simd_type y0_angular;
		//      simd_type y1_linear;
		//      simd_type y1_angular;

		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPaircount;
		dgInt32 m0 = constraintArray[i].m_m0;
		dgInt32 m1 = constraintArray[i].m_m1;

		simd_type y0_linear = zero;
		simd_type y0_angular = zero;
		simd_type y1_linear = zero;
		simd_type y1_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			// val = force[index];
			//NEWTON_ASSERT (dgCheckFloat(val));
			simd_type val = simd_set1(force[index]);
			simd_store_s(val, &jointForceFeeback[index][0]);
			//          maxForce = simd_max_s (maxForce, simd_and_v (val, absMask));

			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, val);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, val);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, val);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, val);
		}

		//      if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) {
		//          dgFloat32 force;
		//          simd_store_s (maxForce, &force);
		//          m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, force);
		//      }

		hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0);
		//      if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
		//          constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex);
		//      }

		(simd_type &)internalForces[m0].m_linear =
		    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
		(simd_type &)internalForces[m0].m_angular =
		    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
		(simd_type &)internalForces[m1].m_linear =
		    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
		(simd_type &)internalForces[m1].m_angular =
		    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);
	}

	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		//      simd_type tmp;
		//      simd_type accel;
		//      simd_type alpha;
		//      dgBody* body;
		dgBody *const body = bodyArray[i].m_body;
		(simd_type &)body->m_accel =
		    simd_add_v((simd_type &)internalForces[i].m_linear, (simd_type &)body->m_accel);
		(simd_type &)body->m_alpha =
		    simd_add_v((simd_type &)internalForces[i].m_angular, (simd_type &)body->m_alpha);

		// dgVector accel (body->m_accel.Scale (body->m_invMass.m_w));
		simd_type accel =
		    simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_invMass.m_w));
		// dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (body->m_alpha));
		simd_type alpha =
		    simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_set1(body->m_alpha.m_x)),
		                                  (simd_type &)body->m_invWorldInertiaMatrix[1], simd_set1(body->m_alpha.m_y)),
		                   (simd_type &)body->m_invWorldInertiaMatrix[2], simd_set1(body->m_alpha.m_z));

		// dgVector accel (body->m_accel.Scale (body->m_invMass.m_w));
		// dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (body->m_alpha));
		// error = accel % accel;
		// if (error < accelTol2) {
		//  accel = zero;
		//  body->m_accel = zero;
		// }
		simd_type tmp = simd_mul_v(accel, accel);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		tmp = simd_cmplt_s(tmp, toleranceSimd);
		tmp = simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0));
		accel = simd_andnot_v(accel, tmp);
		(simd_type &)body->m_accel = simd_andnot_v((simd_type &)body->m_accel, tmp);

		//      error = alpha % alpha;
		//      if (error < accelTol2) {
		//          alpha = zero;
		//          body->m_alpha = zero;
		//      }
		tmp = simd_mul_v(alpha, alpha);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		tmp = simd_cmplt_s(tmp, toleranceSimd);
		tmp = simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0));
		alpha = simd_andnot_v(alpha, tmp);
		(simd_type &)body->m_alpha = simd_andnot_v((simd_type &)body->m_alpha, tmp);

		(simd_type &)body->m_netForce = (simd_type &)body->m_accel;
		(simd_type &)body->m_netTorque = (simd_type &)body->m_alpha;

		(simd_type &)body->m_veloc =
		    simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStep);
		(simd_type &)body->m_omega =
		    simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStep);
	}

	if (hasJointFeeback) {
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
				constraintArray[i].m_joint->m_updaFeedbackCallback(
				    *constraintArray[i].m_joint, m_timeStep, m_threadIndex);
			}
		}
	}

#endif
}

void dgJacobianMemory::ApplyExternalForcesAndAcceleration(
    dgFloat32 tolerance) const {
	//  dgInt32 hasJointFeeback;
	//  dgFloat32 accelTol2;
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgBodyInfo *const bodyArray = m_bodyArray;
	const dgJointInfo *const constraintArray = m_constraintArray;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;
	dgJacobian *const internalForces = m_internalForces;

	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));
	for (dgInt32 i = 0; i < m_bodyCount; i++) {
		internalForces[i].m_linear = zero;
		internalForces[i].m_angular = zero;
	}

	dgInt32 hasJointFeeback = 0;
	dgFloat32 accelTol2 = tolerance * tolerance;
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		//      dgInt32 m0;
		//      dgInt32 m1;
		//      dgInt32 first;
		//      dgInt32 count;
		//      dgInt32 index;
		//      dgFloat32 val;
		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPaircount;

		dgInt32 m0 = constraintArray[i].m_m0;
		dgInt32 m1 = constraintArray[i].m_m1;

		dgJacobian y0;
		dgJacobian y1;
		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;

		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			dgFloat32 val = force[index];

			NEWTON_ASSERT(dgCheckFloat(val));
			jointForceFeeback[index][0] = val;

			y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val);
			y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val);
			y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val);
			y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val);
		}

		//      if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) {
		//          m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce);
		//      }

		//      hasJointFeeback |= dgUnsigned32 (constraintArray[i].m_joint->m_updaFeedbackCallback);
		hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0);

		internalForces[m0].m_linear += y0.m_linear;
		internalForces[m0].m_angular += y0.m_angular;
		internalForces[m1].m_linear += y1.m_linear;
		internalForces[m1].m_angular += y1.m_angular;
	}

	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		dgBody *const body = bodyArray[i].m_body;
		body->m_accel += internalForces[i].m_linear;
		body->m_alpha += internalForces[i].m_angular;

		dgVector accel(body->m_accel.Scale(body->m_invMass.m_w));
		dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(body->m_alpha));
		dgFloat32 error = accel % accel;
		if (error < accelTol2) {
			accel = zero;
			body->m_accel = zero;
		}

		error = alpha % alpha;
		if (error < accelTol2) {
			alpha = zero;
			body->m_alpha = zero;
		}

		body->m_netForce = body->m_accel;
		body->m_netTorque = body->m_alpha;

		body->m_veloc += accel.Scale(m_timeStep);
		body->m_omega += alpha.Scale(m_timeStep);
	}

	if (hasJointFeeback) {
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
				constraintArray[i].m_joint->m_updaFeedbackCallback(
					reinterpret_cast<const NewtonJoint *>(constraintArray[i].m_joint), m_timeStep, m_threadIndex);
			}
		}
	}
}

void dgJacobianMemory::SwapRowsSimd(dgInt32 i, dgInt32 j) const {
#ifdef DG_BUILD_SIMD_CODE
	//  simd_type tmp0;
	//  simd_type tmp1;
	//  simd_type tmp2;
	//  simd_type tmp3;
	//  simd_type tmp4;
	//  simd_type tmp5;
	//  simd_type tmp6;
	//  simd_type tmp7;

	NEWTON_ASSERT(i != j);
	// Swap (m_Jt[i], m_Jt[j]);
	dgJacobianPair *const ptr0 = m_Jt;
	simd_type tmp0 = (simd_type &)ptr0[i].m_jacobian_IM0.m_linear;
	simd_type tmp1 = (simd_type &)ptr0[i].m_jacobian_IM0.m_angular;
	simd_type tmp2 = (simd_type &)ptr0[i].m_jacobian_IM1.m_linear;
	simd_type tmp3 = (simd_type &)ptr0[i].m_jacobian_IM1.m_angular;
	simd_type tmp4 = (simd_type &)ptr0[j].m_jacobian_IM0.m_linear;
	simd_type tmp5 = (simd_type &)ptr0[j].m_jacobian_IM0.m_angular;
	simd_type tmp6 = (simd_type &)ptr0[j].m_jacobian_IM1.m_linear;
	simd_type tmp7 = (simd_type &)ptr0[j].m_jacobian_IM1.m_angular;

	(simd_type &)ptr0[j].m_jacobian_IM0.m_linear = tmp0;
	(simd_type &)ptr0[j].m_jacobian_IM0.m_angular = tmp1;
	(simd_type &)ptr0[j].m_jacobian_IM1.m_linear = tmp2;
	(simd_type &)ptr0[j].m_jacobian_IM1.m_angular = tmp3;
	(simd_type &)ptr0[i].m_jacobian_IM0.m_linear = tmp4;
	(simd_type &)ptr0[i].m_jacobian_IM0.m_angular = tmp5;
	(simd_type &)ptr0[i].m_jacobian_IM1.m_linear = tmp6;
	(simd_type &)ptr0[i].m_jacobian_IM1.m_angular = tmp7;

	// Swap (m_JMinv[i], m_JMinv[j]);
	dgJacobianPair *const ptr1 = m_JMinv;
	tmp0 = (simd_type &)ptr1[i].m_jacobian_IM0.m_linear;
	tmp1 = (simd_type &)ptr1[i].m_jacobian_IM0.m_angular;
	tmp2 = (simd_type &)ptr1[i].m_jacobian_IM1.m_linear;
	tmp3 = (simd_type &)ptr1[i].m_jacobian_IM1.m_angular;
	tmp4 = (simd_type &)ptr1[j].m_jacobian_IM0.m_linear;
	tmp5 = (simd_type &)ptr1[j].m_jacobian_IM0.m_angular;
	tmp6 = (simd_type &)ptr1[j].m_jacobian_IM1.m_linear;
	tmp7 = (simd_type &)ptr1[j].m_jacobian_IM1.m_angular;

	(simd_type &)ptr1[j].m_jacobian_IM0.m_linear = tmp0;
	(simd_type &)ptr1[j].m_jacobian_IM0.m_angular = tmp1;
	(simd_type &)ptr1[j].m_jacobian_IM1.m_linear = tmp2;
	(simd_type &)ptr1[j].m_jacobian_IM1.m_angular = tmp3;
	(simd_type &)ptr1[i].m_jacobian_IM0.m_linear = tmp4;
	(simd_type &)ptr1[i].m_jacobian_IM0.m_angular = tmp5;
	(simd_type &)ptr1[i].m_jacobian_IM1.m_linear = tmp6;
	(simd_type &)ptr1[i].m_jacobian_IM1.m_angular = tmp7;

	Swap(m_diagDamp[i], m_diagDamp[j]);
	Swap(m_invDJMinvJt[i], m_invDJMinvJt[j]);
	//  Swap (m_jacobianIndexArray[i], m_jacobianIndexArray[j]);
	Swap(m_normalForceIndex[i], m_normalForceIndex[j]);
	//  Swap (m_lowerBoundFrictionForce[i], m_lowerBoundFrictionForce[j]);
	//  Swap (m_upperBoundFrictionForce[i], m_upperBoundFrictionForce[j]);
	Swap(m_lowerBoundFrictionCoefficent[i], m_lowerBoundFrictionCoefficent[j]);
	Swap(m_upperBoundFrictionCoefficent[i], m_upperBoundFrictionCoefficent[j]);
	Swap(m_jointFeebackForce[i], m_jointFeebackForce[j]);
	Swap(m_coordenateAccel[i], m_coordenateAccel[j]);
	Swap(m_force[i], m_force[j]);
	Swap(m_accel[i], m_accel[j]);
	//  Swap (m_forceStep[i], m_forceStep[j]);
	Swap(m_deltaAccel[i], m_deltaAccel[j]);
	Swap(m_deltaForce[i], m_deltaForce[j]);

#else

#endif
}

void dgJacobianMemory::SwapRows(dgInt32 i, dgInt32 j) const {
	NEWTON_ASSERT(i != j);
#define SwapMacro(a, b) Swap(a, b)

	SwapMacro(m_Jt[i], m_Jt[j]);
	SwapMacro(m_JMinv[i], m_JMinv[j]);

	SwapMacro(m_diagDamp[i], m_diagDamp[j]);
	SwapMacro(m_invDJMinvJt[i], m_invDJMinvJt[j]);
	// SwapMacro (m_diagJMinvJt[i], m_diagJMinvJt[j]);
	// SwapMacro (m_frictionThreshold[i], m_frictionThreshold[j]);
	// SwapMacro (m_jacobianIndexArray[i], m_jacobianIndexArray[j]);
	// SwapMacro (m_bilateralForceBounds[i], m_bilateralForceBounds[j]);

	SwapMacro(m_normalForceIndex[i], m_normalForceIndex[j]);
	//  SwapMacro (m_lowerBoundFrictionForce[i], m_lowerBoundFrictionForce[j]);
	//  SwapMacro (m_upperBoundFrictionForce[i], m_upperBoundFrictionForce[j]);
	SwapMacro(m_lowerBoundFrictionCoefficent[i],
	          m_lowerBoundFrictionCoefficent[j]);
	SwapMacro(m_upperBoundFrictionCoefficent[i],
	          m_upperBoundFrictionCoefficent[j]);
	SwapMacro(m_jointFeebackForce[i], m_jointFeebackForce[j]);

	SwapMacro(m_coordenateAccel[i], m_coordenateAccel[j]);

	SwapMacro(m_force[i], m_force[j]);
	SwapMacro(m_accel[i], m_accel[j]);
	//  SwapMacro (m_forceStep[i], m_forceStep[j]);
	SwapMacro(m_deltaAccel[i], m_deltaAccel[j]);
	SwapMacro(m_deltaForce[i], m_deltaForce[j]);
}

void dgJacobianMemory::CalculateSimpleBodyReactionsForcesSimd(
    dgFloat32 maxAccNorm) const {
#ifdef DG_BUILD_SIMD_CODE

	simd_type accelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type activeRowPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type lowBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type highBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type deltaAccelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type deltaForcePtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];

	dgFloat32 *const force = m_force;
	dgFloat32 *const activeRow = (dgFloat32 *)activeRowPtr;
	dgFloat32 *const lowBound = (dgFloat32 *)lowBoundPtr;
	dgFloat32 *const highBound = (dgFloat32 *)highBoundPtr;
	dgFloat32 *const accel = (dgFloat32 *)accelPtr;
	dgFloat32 *const deltaAccel = (dgFloat32 *)deltaAccelPtr;
	dgFloat32 *const deltaForce = (dgFloat32 *)deltaForcePtr;

	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgFloat32 *const coordenateAccel = m_coordenateAccel;
	const dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent;

	dgFloatSign tmpIndex;
	tmpIndex.m_integer.m_iVal = 0x7fffffff;

	simd_type signMask = simd_set1(tmpIndex.m_fVal);
	simd_type one = simd_set1(dgFloat32(1.0f));
	simd_type zero = simd_set1(dgFloat32(0.0f));
	simd_type tol_pos_1eNeg5 = simd_set1(dgFloat32(1.0e-5f));
	simd_type tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f));
	simd_type tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f));
	simd_type tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f));
	// dgVector zero (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));

	dgInt32 count = constraintArray[0].m_autoPaircount;
	NEWTON_ASSERT(constraintArray[0].m_autoPairstart == 0);
	dgInt32 roundCount = count & (-DG_SIMD_WORD_SIZE);

	if (roundCount != count) {
		roundCount += 4;
		for (dgInt32 i = count; i < roundCount; i++) {
			// force[i] = dgFloat32 (0.0f);
			// accel[i] = dgFloat32 (0.0f);
			// activeRow[i] = dgFloat32 (0.0f);
			// deltaAccel[i] = dgFloat32 (0.0f);
			// deltaForce[i] = dgFloat32 (0.0f);
			m_normalForceIndex[i] = -1;
			simd_store_s(zero, &force[i]);
			simd_store_s(zero, &accel[i]);
			simd_store_s(zero, &activeRow[i]);
			simd_store_s(zero, &deltaAccel[i]);
			simd_store_s(zero, &deltaForce[i]);
			simd_store_s(one, &m_lowerBoundFrictionCoefficent[i]);
			simd_store_s(zero, &m_upperBoundFrictionCoefficent[i]);
		}
	}

	//  j0 = jacobianIndexArray[0].m_m0;
	//  j1 = jacobianIndexArray[0].m_m1;
	// dgInt32 m0 = constraintArray[0].m_m0;
	// dgInt32 m1 = constraintArray[0].m_m1;

	//  dgBody* const body0 = bodyArray[m0].m_body;
	//  dgBody* const body1 = bodyArray[m1].m_body;

	// ak = dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel));
	//  tmp3 = simd_mul_add_v (simd_mul_v ((simd_type&)body1->m_accel, (simd_type&)body1->m_accel),
	//                                      (simd_type&)body0->m_accel, (simd_type&)body0->m_accel);
	//  tmp3 = simd_add_v (tmp3, simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 3, 2)));
	//  tmp3 = simd_add_s (tmp3, simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 0, 1)));
	//  tmp3 = simd_mul_s (tmp3, simd_rsqrt_s(tmp3));

	//  maxPasses = 0;
	simd_type tmp2 = zero;
	//  tmp3 = simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 0, 0));
	//  for (i = 0; i <roundCount; i ++) {
	for (dgInt32 i = 0; i < roundCount; i += DG_SIMD_WORD_SIZE) {
		//      simd_type tmp5;
		//      simd_type tmp6;
		//      simd_type tmp7;
		//      bool test;
		//      bool lowBoundTest;
		//      bool highBoundTest;
		//      dgFloat32 normalForce;

		// simd_type lowBoundTest;
		// simd_type highBoundTest;

		//      k = normalForceIndex[i];
		//      if (k >= 0) {
		//          val = force[k];
		//          if (val < dgFloat32 (1.0e-2f)) {
		//              val = ak;
		//          }
		//      } else {
		//          val = dgFloat32 (1.0f);
		//      }

		//      normalForce = force[normalForceIndex[i]] ;
		//      normalForce = (normalForce > dgFloat32 (1.0e-2f)) ? normalForce : ak;

		simd_type normalForce =
		    simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 0]]),
		                                  simd_load_s(force[normalForceIndex[i + 1]])),
		                   simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 2]]),
		                                  simd_load_s(force[normalForceIndex[i + 3]])));
		//      test = simd_cmplt_v (normalForce, tol_pos_1eNeg2);
		//      normalForce = simd_or_v (simd_and_v (tmp3, test), simd_andnot_v (normalForce, test));

		//      lowBound[i] = normalForce * lowerFriction[i];
		//      highBound[i] = normalForce * upperFriction[i];
		(simd_type &)lowBound[i] =
		    simd_mul_v(normalForce, (simd_type &)lowerFriction[i]);
		(simd_type &)highBound[i] =
		    simd_mul_v(normalForce, (simd_type &)upperFriction[i]);

		//      activeRow[i] = dgFloat32 (1.0f);
		//      if (force[i] < lowBound[i]) {
		//          maxPasses --;
		//          force[i] = lowBound[i];
		//          activeRow[i] = dgFloat32 (0.0f);
		//      } else if (force[i] > highBound[i]) {
		//          maxPasses --;
		//          force[i] = highBound[i];
		//          activeRow[i] = dgFloat32 (0.0f);
		//      }

		//      lowBoundTest = force[i] < lowBound[i];
		//      highBoundTest = force[i] > highBound[i];
		//      lowBoundTest = simd_cmplt_v ((simd_type&)force[i], (simd_type&)lowBound[i]);
		//      highBoundTest = simd_cmpgt_v ((simd_type&)force[i], (simd_type&)highBound[i]);
		//      test = lowBoundTest | highBoundTest;
		//      test = simd_or_v (lowBoundTest, highBoundTest);
		simd_type test =
		    simd_or_v(simd_cmplt_v((simd_type &)force[i], (simd_type &)lowBound[i]), simd_cmpgt_v((simd_type &)force[i], (simd_type &)highBound[i]));

		//      activeRow[i] = test ? dgFloat32 (0.0f) : dgFloat32 (1.0f);
		(simd_type &)activeRow[i] = simd_andnot_v(one, test);

		//      maxPasses += test ? 0 : 1;
		tmp2 = simd_add_v(tmp2, (simd_type &)activeRow[i]);

		//      force[i] = lowBoundTest ? lowBound[i] : (highBoundTest ? highBound[i] : force[i]);
		//      normalForce = simd_or_v (simd_and_v ((simd_type&)lowBound[i], lowBoundTest), simd_and_v ((simd_type&)highBound[i], highBoundTest));
		//      (simd_type&)force[i] = simd_or_v (simd_and_v (normalForce, test), simd_andnot_v ((simd_type&)force[i], test));
		(simd_type &)force[i] =
		    simd_min_v((simd_type &)highBound[i], simd_max_v((simd_type &)force[i], (simd_type &)lowBound[i]));
	}

	tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2));
	dgInt32 maxPasses =
	    simd_store_is(simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1))));

	// y0.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	// y0.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	// y1.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	// y1.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
	simd_type y0_linear = zero;
	simd_type y0_angular = zero;
	simd_type y1_linear = zero;
	simd_type y1_angular = zero;
	for (dgInt32 i = 0; i < count; i++) {
		// force = m_force[i];
		simd_type tmp1 = simd_set1(force[i]);

		// y0.m_linear += m_Jt[i].m_jacobian_IM0.m_linear.Scale (force);
		// y0.m_angular += m_Jt[i].m_jacobian_IM0.m_angular.Scale (force);
		// y1.m_linear += m_Jt[i].m_jacobian_IM1.m_linear.Scale (force);
		// y1.m_angular += m_Jt[i].m_jacobian_IM1.m_angular.Scale (force);
		y0_linear =
		    simd_mul_add_v(y0_linear, (simd_type &)Jt[i].m_jacobian_IM0.m_linear, tmp1);
		y0_angular =
		    simd_mul_add_v(y0_angular, (simd_type &)Jt[i].m_jacobian_IM0.m_angular, tmp1);
		y1_linear =
		    simd_mul_add_v(y1_linear, (simd_type &)Jt[i].m_jacobian_IM1.m_linear, tmp1);
		y1_angular =
		    simd_mul_add_v(y1_angular, (simd_type &)Jt[i].m_jacobian_IM1.m_angular, tmp1);
	}

	// akNum = dgFloat32 (0.0f);
	// accNorm = dgFloat32(0.0f);
	simd_type tmp0 = zero;
	simd_type tmp1 = zero;
	for (dgInt32 i = 0; i < count; i++) {
		// dgVector acc (m_JMinv[i].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
		// acc += m_JMinv[i].m_jacobian_IM0.m_angular.CompProduct (y0.m_angular);
		// acc += m_JMinv[i].m_jacobian_IM1.m_linear.CompProduct (y1.m_linear);
		// acc += m_JMinv[i].m_jacobian_IM1.m_angular.CompProduct (y1.m_angular);
		simd_type tmp2 =
		    simd_mul_v((simd_type &)JMinv[i].m_jacobian_IM0.m_linear, y0_linear);
		tmp2 =
		    simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM0.m_angular, y0_angular);
		tmp2 =
		    simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM1.m_linear, y1_linear);
		tmp2 =
		    simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM1.m_angular, y1_angular);

		// m_accel[i] = m_coordenateAccel[i] - acc.m_x - acc.m_y - acc.m_z - m_force[i] * m_diagDamp[i];
		tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2));
		tmp2 = simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1)));
		tmp2 =
		    simd_sub_s(simd_load_s(coordenateAccel[i]), simd_mul_add_s(tmp2, simd_load_s(force[i]), simd_load_s(diagDamp[i])));
		simd_store_s(tmp2, &accel[i]);

		// m_deltaForce[i] = m_accel[i] * m_invDJMinvJt[i] * activeRow[i];
		simd_type tmp3 =
		    simd_mul_s(tmp2, simd_mul_s(simd_load_s(invDJMinvJt[i]), simd_load_s(activeRow[i])));
		simd_store_s(tmp3, &deltaForce[i]);

		// akNum += m_accel[i] * m_deltaForce[i];
		tmp0 = simd_mul_add_v(tmp0, tmp3, tmp2);

		// accNorm = GetMax (dgAbsf (m_accel[i] * activeRow[i]), accNorm);
		tmp1 =
		    simd_max_s(tmp1, simd_and_v(simd_mul_v(tmp2, simd_load_s(activeRow[i])), signMask));
	}

	dgFloat32 akNum;
	dgFloat32 accNorm;
	simd_store_s(tmp0, &akNum);
	simd_store_s(tmp1, &accNorm);
	for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) {
		//      dgInt32 clampedForceIndex;
		//      dgFloat32 ak;
		//      dgFloat32 clampedForceIndexValue;

		// MatrixTimeVector (forceRows, m_deltaForce, m_deltaAccel);
		// y0.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
		// y0.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
		// y1.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
		// y1.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f));
		simd_type y0_linear = zero;
		simd_type y0_angular = zero;
		simd_type y1_linear = zero;
		simd_type y1_angular = zero;
		for (dgInt32 k = 0; k < count; k++) {
			// ak = m_deltaForce[k];
			simd_type tmp1 = simd_set1(deltaForce[k]);

			// y0.m_linear += m_Jt[k].m_jacobian_IM0.m_linear.Scale (ak);
			// y0.m_angular += m_Jt[k].m_jacobian_IM0.m_angular.Scale (ak);
			// y1.m_linear += m_Jt[k].m_jacobian_IM1.m_linear.Scale (ak);
			// y1.m_angular += m_Jt[k].m_jacobian_IM1.m_angular.Scale (ak);
			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[k].m_jacobian_IM0.m_linear, tmp1);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[k].m_jacobian_IM0.m_angular, tmp1);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[k].m_jacobian_IM1.m_linear, tmp1);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[k].m_jacobian_IM1.m_angular, tmp1);
		}

		// akDen = dgFloat32 (0.0f);
		simd_type tmp0 = zero;
		for (dgInt32 k = 0; k < count; k++) {
			// dgVector acc (m_JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			// acc += m_JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			// acc += m_JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			// acc += m_JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
			simd_type tmp2 =
			    simd_mul_v((simd_type &)JMinv[k].m_jacobian_IM0.m_linear, y0_linear);
			tmp2 =
			    simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM0.m_angular, y0_angular);
			tmp2 =
			    simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM1.m_linear, y1_linear);
			tmp2 =
			    simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM1.m_angular, y1_angular);

			// m_deltaAccel[k] = acc.m_x + acc.m_y + acc.m_z + m_deltaForce[k] * m_diagDamp[k];
			simd_type tmp1 = simd_load_s(deltaForce[k]);
			tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2));
			tmp2 =
			    simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1)));
			tmp2 = simd_mul_add_s(tmp2, tmp1, simd_load_s(diagDamp[k]));
			simd_store_s(tmp2, &deltaAccel[k]);

			// akDen += m_deltaAccel[k] * m_deltaForce[k];
			tmp0 = simd_mul_add_v(tmp0, tmp2, tmp1);
		}

		//NEWTON_ASSERT (akDen > dgFloat32 (0.0f));
		// akDen = GetMax (akDen, dgFloat32(1.0e-16f));
		//NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f));
		// ak = akNum / akDen;
		tmp0 = simd_div_s(simd_load_s(akNum), simd_max_s(tmp0, tol_pos_1eNeg16));

		//      simd_type min_index;
		//      simd_type minClampIndex;
		//      simd_type min_index_step;
		//      simd_type campedIndexValue;
		simd_type campedIndexValue = zero;
		simd_type minClampIndex = simd_set1(dgFloat32(-1.0f));
		simd_type min_index_step = simd_set1(dgFloat32(4.0f));
		tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0));
		simd_type min_index =
		    simd_set(dgFloat32(0.0f), dgFloat32(1.0f), dgFloat32(2.0f), dgFloat32(3.0f));

		//      for (k = 0; k < roundCount; k ++) {
		for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
			//          simd_type val;
			//          simd_type num;
			//          simd_type den;
			//          simd_type test;
			//          simd_type negTest;
			//          simd_type posTest;
			//          simd_type negValTest;
			//          simd_type posValTest;
			//          simd_type negDeltaForceTest;
			//          simd_type posDeltaForceTest;

			// Make sure AK is not negative
			//          val = force[k] + ak * deltaForce[k];
			simd_type val =
			    simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]);

			//          negValTest = val < lowBound[k];
			simd_type negValTest = simd_cmplt_v(val, (simd_type &)lowBound[k]);

			//          posValTest = val > highBound[k];
			simd_type posValTest = simd_cmpgt_v(val, (simd_type &)highBound[k]);

			//          negDeltaForceTest = deltaForce[k] < dgFloat32 (-1.0e-16f);
			simd_type negDeltaForceTest =
			    simd_cmplt_v((simd_type &)deltaForce[k], tol_neg_1eNeg16);

			//          posDeltaForceTest = deltaForce[k] > dgFloat32 ( 1.0e-16f);
			simd_type posDeltaForceTest =
			    simd_cmpgt_v((simd_type &)deltaForce[k], tol_pos_1eNeg16);

			//          negTest = negValTest & negDeltaForceTest;
			simd_type negTest = simd_and_v(negValTest, negDeltaForceTest);

			//          posTest = posValTest & posDeltaForceTest;
			simd_type posTest = simd_and_v(posValTest, posDeltaForceTest);

			//          test = negTest | posTest;
			simd_type test = simd_or_v(negTest, posTest);

			//          num = negTest ? lowBound[k] : (posTest ? highBound[k] : force[k]);
			simd_type num =
			    simd_or_v(simd_and_v((simd_type &)lowBound[k], negTest), simd_and_v((simd_type &)highBound[k], posTest));
			num =
			    simd_or_v(simd_and_v(num, test), simd_andnot_v((simd_type &)force[k], test));

			//          den = test ? deltaForce[k] : dgFloat32 (1.0f);
			simd_type den =
			    simd_or_v(simd_and_v((simd_type &)deltaForce[k], test), simd_andnot_v(one, test));

			//          test = test & (activeRow[k] > dgFloat32 (0.0f));
			test = simd_and_v(test, simd_cmpgt_v((simd_type &)activeRow[k], zero));

			//          NEWTON_ASSERT (dgAbsf (den) > 1.0e-16f);
			//          ak = test ? (num - force[k]) / den : ak;
			tmp0 =
			    simd_or_v(simd_div_v(simd_sub_v(num, (simd_type &)force[k]), den), simd_andnot_v(tmp0, test));

			//          ak = (ak < dgFloat32 (1.0e-8f)) ? dgFloat32 (0.0f) : ak;
			tmp0 = simd_and_v(tmp0, simd_cmpgt_v(tmp0, tol_pos_1eNeg8));

			//          clampedForceIndex = test ? k : clampedForceIndex;
			minClampIndex =
			    simd_or_v(simd_and_v(min_index, test), simd_andnot_v(minClampIndex, test));
			min_index = simd_add_v(min_index, min_index_step);

			//          clampedForceIndexValue = test ? num : clampedForceIndexValue;
			campedIndexValue =
			    simd_or_v(simd_and_v(num, test), simd_andnot_v(campedIndexValue, test));
		}

		simd_type tmp2 = simd_move_hl_v(tmp0, tmp0);
		simd_type tmp3 = simd_cmplt_v(tmp0, tmp2);
		tmp0 = simd_min_v(tmp0, tmp2);
		minClampIndex =
		    simd_or_v(simd_and_v(minClampIndex, tmp3), simd_andnot_v(simd_move_hl_v(minClampIndex, minClampIndex), tmp3));
		campedIndexValue =
		    simd_or_v(simd_and_v(campedIndexValue, tmp3), simd_andnot_v(simd_move_hl_v(campedIndexValue, campedIndexValue), tmp3));

		tmp2 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1));
		tmp3 = simd_cmplt_s(tmp0, tmp2);
		tmp0 = simd_min_s(tmp0, tmp2);
		minClampIndex =
		    simd_or_v(simd_and_v(minClampIndex, tmp3), simd_andnot_v(simd_permut_v(minClampIndex, minClampIndex, PURMUT_MASK(0, 0, 0, 1)), tmp3));
		campedIndexValue =
		    simd_or_v(simd_and_v(campedIndexValue, tmp3), simd_andnot_v(simd_permut_v(campedIndexValue, campedIndexValue, PURMUT_MASK(0, 0, 0, 1)), tmp3));

		dgFloat32 ak;
		simd_store_s(tmp0, &ak);
		dgInt32 clampedForceIndex = simd_store_is(minClampIndex);

		dgFloat32 clampedForceIndexValue;
		simd_store_s(campedIndexValue, &clampedForceIndexValue);

		tmp2 = zero;
		tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0));
		//      if (ak == dgFloat32 (0.0f)) {
		if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) {
			NEWTON_ASSERT(clampedForceIndex != -1);

			//          akNum = dgFloat32 (0.0f);
			//          accNorm = dgFloat32(0.0f);
			simd_type tmp1 = zero;
			//          tmp2 = zero;
			simd_type tmp3 = zero;
			//          for (k = 0; k < count; k ++) {
			//          for (k = 0; k < roundCount; k ++) {

			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			deltaForce[clampedForceIndex] = dgFloat32(0.0f);
			force[clampedForceIndex] = clampedForceIndexValue;

			for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
				// simd_type val_k;
				// simd_type test_0;
				// simd_type test_1;
				// simd_type accel_k;
				// simd_type force_k;

				// bool test0;
				// bool test1;
				// dgFloat32 val;

				simd_type accel_k = (simd_type &)accel[k];
				simd_type force_k = (simd_type &)force[k];
				// val = dgAbsf (lowBound[k] - force[k]);
				simd_type val_k =
				    simd_and_v(simd_sub_v((simd_type &)lowBound[k], force_k), signMask);

				// test0 = (val < dgFloat32 (1.0e-5f)) & (accel[k] < dgFloat32 (0.0f));
				simd_type test_0 =
				    simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmplt_v(accel_k, zero));

				// val = dgAbsf (highBound[k] - force[k]);
				val_k =
				    simd_and_v(simd_sub_v((simd_type &)highBound[k], force_k), signMask);

				// test1 = (val < dgFloat32 (1.0e-5f)) & (accel[k] > dgFloat32 (0.0f));
				simd_type test_1 =
				    simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmpgt_v(accel_k, zero));

				// force[k] = test0 ? lowBound[k] : (test1 ? highBound[k] : force[k]);
				// val_k = simd_or_v (simd_and_v ((simd_type&)lowBound[k], test_0), simd_and_v ((simd_type&)highBound[k], test_1));
				//(simd_type&) force[k] = simd_or_v (simd_and_v (val_k, test_2) , simd_andnot_v (force_k, test_2));
				(simd_type &)force[k] =
				    simd_min_v((simd_type &)highBound[k], simd_max_v((simd_type &)force[k], (simd_type &)lowBound[k]));

				// activeRow[k] *= (test0 | test1) ? dgFloat32 (0.0f) : dgFloat32 (1.0f);
				// test_2 = simd_or_v (test_0, test_1);
				(simd_type &)activeRow[k] =
				    simd_mul_v((simd_type &)activeRow[k], simd_andnot_v(one, simd_or_v(test_0, test_1)));

				// deltaForce[k] = accel[k] * invDJMinvJt[k] * activeRow[k];
				(simd_type &)deltaForce[k] =
				    simd_mul_v(accel_k, simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k]));

				// akNum += accel[k] * deltaForce[k];
				tmp1 = simd_mul_add_v(tmp1, (simd_type &)deltaForce[k], accel_k);

				// accNorm = GetMax (dgAbsf (accel[k] * activeRow[k]), accNorm);
				tmp2 =
				    simd_max_v(tmp2, simd_and_v(simd_mul_v(accel_k, (simd_type &)activeRow[k]), signMask));

				// maxPases += 1;
				tmp3 = simd_add_v(tmp3, (simd_type &)activeRow[k]);
			}
			tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1));
			tmp1 =
			    simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp1, &akNum);

			tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2));
			tmp2 =
			    simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp2, &accNorm);

			i = -1;
			// maxPasses = GetMax (maxPasses - 1, 1);
			tmp3 = simd_add_v(tmp3, simd_move_hl_v(tmp3, tmp3));
			maxPasses =
			    simd_store_is(simd_add_s(tmp3, simd_permut_v(tmp3, tmp3, PURMUT_MASK(0, 0, 0, 1))));

		} else if (clampedForceIndex >= 0) {
			// akNum = dgFloat32(0.0f);
			// accNorm = dgFloat32(0.0f);
			tmp1 = zero;
			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			// for (k = 0; k < count; k ++) {
			for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
				// m_force[k] += ak * m_deltaForce[k];
				// m_accel[k] -= ak * m_deltaAccel[k];
				(simd_type &)force[k] =
				    simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]);
				(simd_type &)accel[k] =
				    simd_mul_sub_v((simd_type &)accel[k], tmp0, (simd_type &)deltaAccel[k]);

				// accNorm = GetMax (dgAbsf (m_accel[k] * activeRow[k]), accNorm);
				tmp2 =
				    simd_max_v(tmp2, simd_and_v(simd_mul_v((simd_type &)accel[k], (simd_type &)activeRow[k]), signMask));
				//NEWTON_ASSERT (dgCheckFloat(m_force[k]));
				//NEWTON_ASSERT (dgCheckFloat(m_accel[k]));

				// m_deltaForce[k] = m_accel[k] * m_invDJMinvJt[k] * activeRow[k];
				(simd_type &)deltaForce[k] =
				    simd_mul_v((simd_type &)accel[k], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k]));

				// akNum += m_deltaForce[k] * m_accel[k];
				tmp1 =
				    simd_mul_add_v(tmp1, (simd_type &)deltaForce[k], (simd_type &)accel[k]);
			}
			tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1));
			tmp1 =
			    simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp1, &akNum);

			tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2));
			tmp2 =
			    simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp2, &accNorm);

			force[clampedForceIndex] = clampedForceIndexValue;
			i = -1;
			maxPasses = GetMax(maxPasses - 1, 1);

		} else {
			// accNorm = dgFloat32(0.0f);
			// for (k = 0; k < count; k ++) {
			for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
				// m_force[k] += ak * m_deltaForce[k];
				// m_accel[k] -= ak * m_deltaAccel[k];
				(simd_type &)force[k] =
				    simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]);
				(simd_type &)accel[k] =
				    simd_mul_sub_v((simd_type &)accel[k], tmp0, (simd_type &)deltaAccel[k]);

				// accNorm = GetMax (dgAbsf (m_accel[k] * activeRow[k]), accNorm);
				tmp2 =
				    simd_max_v(tmp2, simd_and_v(simd_mul_v((simd_type &)accel[k], (simd_type &)activeRow[k]), signMask));
			}
			tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2));
			tmp2 =
			    simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp2, &accNorm);

			if (accNorm > maxAccNorm) {

				// akDen = akNum;
				// akNum = dgFloat32(0.0f);
				tmp1 = simd_set1(akNum);
				tmp0 = zero;
				// for (k = 0; k < count; k ++) {
				for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
					// m_deltaAccel[k] = m_accel[k] * m_invDJMinvJt[k] * activeRow[k];
					(simd_type &)deltaAccel[k] =
					    simd_mul_v((simd_type &)accel[k], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k]));

					// akNum += m_accel[k] * m_deltaAccel[k];
					tmp0 =
					    simd_mul_add_v(tmp0, (simd_type &)accel[k], (simd_type &)deltaAccel[k]);
				}

				tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0));
				tmp0 =
				    simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1)));
				simd_store_s(tmp0, &akNum);

				//NEWTON_ASSERT (bk > dgFloat32(0.0f));
				// bk = GetMax (bk, dgFloat32 (1.0e-17f));
				tmp1 = simd_max_s(tmp1, simd_set1(dgFloat32(1.0e-17f)));
				// ak = dgFloat32 (akNum / akDen);
				tmp0 = simd_div_s(tmp0, tmp1);
				tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0));
				// for (k = 0; k < count; k ++) {
				for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) {
					// m_deltaForce[k] = m_deltaAccel[k] + ak * m_deltaForce[k];
					(simd_type &)deltaForce[k] =
					    simd_mul_add_v((simd_type &)deltaAccel[k], tmp0, (simd_type &)deltaForce[k]);
				}
			}
		}
	}

#else
#endif
}

void dgJacobianMemory::CalculateSimpleBodyReactionsForces(
    dgFloat32 maxAccNorm) const {
	dgFloat32 accel[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 activeRow[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 lowBound[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 highBound[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 deltaForce[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 deltaAccel[DG_CONSTRAINT_MAX_ROWS];

	//  dgFloat32* const accel = m_accel;
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgFloat32 *const coordenateAccel = m_coordenateAccel;
	const dgJointInfo *const constraintArray = m_constraintArray;
	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));

	dgInt32 count = constraintArray[0].m_autoPaircount;
	NEWTON_ASSERT(constraintArray[0].m_autoPairstart == 0);

	// ak = dgFloat32 (3.0f) * dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel)) / dgFloat32 (count);
	//  ak = dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel));
	// dTrace (("%f\n", ak));

	dgInt32 maxPasses = count;
	for (dgInt32 i = 0; i < count; i++) {

		// k = bilateralForceBounds[i].m_normalIndex;
		dgInt32 k = normalForceIndex[i];
		// force = (k >= 0) ? m_force[k] : dgFloat32 (1.0f);
		NEWTON_ASSERT(
		    ((k < 0) && (force[k] == dgFloat32(1.0f))) || ((k >= 0) && (force[k] >= dgFloat32(0.0f))));
		dgFloat32 val = force[k];
		// if (dgAbsf (val) < dgFloat32 (1.0e-2f)) {
		//  val = ak;
		// }
		// lowBound[i] = val * bilateralForceBounds[i].m_low;
		// highBound[i] = val * bilateralForceBounds[i].m_upper;
		lowBound[i] = val * lowerFriction[i];
		highBound[i] = val * upperFriction[i];

		activeRow[i] = dgFloat32(1.0f);
		if (force[i] < lowBound[i]) {
			maxPasses--;
			force[i] = lowBound[i];
			activeRow[i] = dgFloat32(0.0f);
		} else if (force[i] > highBound[i]) {
			maxPasses--;
			force[i] = highBound[i];
			activeRow[i] = dgFloat32(0.0f);
		}
	}

	dgJacobian y0;
	dgJacobian y1;
	y0.m_linear = zero;
	y0.m_angular = zero;
	y1.m_linear = zero;
	y1.m_angular = zero;
	for (dgInt32 i = 0; i < count; i++) {
		dgFloat32 val = force[i];
		y0.m_linear += Jt[i].m_jacobian_IM0.m_linear.Scale(val);
		y0.m_angular += Jt[i].m_jacobian_IM0.m_angular.Scale(val);
		y1.m_linear += Jt[i].m_jacobian_IM1.m_linear.Scale(val);
		y1.m_angular += Jt[i].m_jacobian_IM1.m_angular.Scale(val);
	}

	dgFloat32 akNum = dgFloat32(0.0f);
	dgFloat32 accNorm = dgFloat32(0.0f);
	for (dgInt32 i = 0; i < count; i++) {
		dgVector acc(JMinv[i].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
		acc += JMinv[i].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
		acc += JMinv[i].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
		acc += JMinv[i].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
		accel[i] = coordenateAccel[i] - acc.m_x - acc.m_y - acc.m_z - force[i] * diagDamp[i];

		deltaForce[i] = accel[i] * invDJMinvJt[i] * activeRow[i];
		akNum += accel[i] * deltaForce[i];
		accNorm = GetMax(dgAbsf(accel[i] * activeRow[i]), accNorm);
	}

	dgFloat32 clampedForceIndexValue = dgFloat32(0.0f);
	for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) {

		// MatrixTimeVector (forceRows, m_deltaForce, m_deltaAccel);
		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;
		for (dgInt32 k = 0; k < count; k++) {
			dgFloat32 val = deltaForce[k];
			y0.m_linear += Jt[k].m_jacobian_IM0.m_linear.Scale(val);
			y0.m_angular += Jt[k].m_jacobian_IM0.m_angular.Scale(val);
			y1.m_linear += Jt[k].m_jacobian_IM1.m_linear.Scale(val);
			y1.m_angular += Jt[k].m_jacobian_IM1.m_angular.Scale(val);
		}

		dgFloat32 akDen = dgFloat32(0.0f);
		for (dgInt32 k = 0; k < count; k++) {
			dgVector acc(JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			acc += JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			acc += JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			acc += JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);

			// m_accel[i0] = m_coordenateAccel[i0] - acc.m_x - acc.m_y - acc.m_z - m_force[i0] * m_diagDamp[i0];
			deltaAccel[k] = acc.m_x + acc.m_y + acc.m_z + deltaForce[k] * diagDamp[k];
			akDen += deltaAccel[k] * deltaForce[k];
		}

		NEWTON_ASSERT(akDen > dgFloat32(0.0f));
		akDen = GetMax(akDen, dgFloat32(1.0e-16f));
		NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f));
		dgFloat32 ak = akNum / akDen;

		dgInt32 clampedForceIndex = -1;
		for (dgInt32 k = 0; k < count; k++) {
			if (activeRow[k]) {
				dgFloat32 val = force[k] + ak * deltaForce[k];
				if (deltaForce[k] < dgFloat32(-1.0e-16f)) {
					if (val < lowBound[k]) {
						ak = GetMax((lowBound[k] - force[k]) / deltaForce[k],
						            dgFloat32(0.0f));
						clampedForceIndex = k;
						clampedForceIndexValue = lowBound[k];
						if (ak < dgFloat32(1.0e-8f)) {
							ak = dgFloat32(0.0f);
							break;
						}
					}
				} else if (deltaForce[k] > dgFloat32(1.0e-16f)) {
					if (val >= highBound[k]) {
						ak = GetMax((highBound[k] - force[k]) / deltaForce[k],
						            dgFloat32(0.0f));
						;
						clampedForceIndex = k;
						clampedForceIndexValue = highBound[k];
						if (ak < dgFloat32(1.0e-8f)) {
							ak = dgFloat32(0.0f);
							break;
						}
					}
				}
			}
		}

		//      if (ak == dgFloat32 (0.0f)) {
		if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) {
			NEWTON_ASSERT(clampedForceIndex != -1);
			akNum = dgFloat32(0.0f);
			accNorm = dgFloat32(0.0f);

			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			deltaForce[clampedForceIndex] = dgFloat32(0.0f);
			force[clampedForceIndex] = clampedForceIndexValue;
			for (dgInt32 k = 0; k < count; k++) {
				if (activeRow[k]) {
					dgFloat32 val;
					val = lowBound[k] - force[k];
					if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[k] < dgFloat32(0.0f))) {
						force[k] = lowBound[k];
						activeRow[k] = dgFloat32(0.0f);
						deltaForce[k] = dgFloat32(0.0f);

					} else {
						val = highBound[k] - force[k];
						if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[k] > dgFloat32(0.0f))) {
							force[k] = highBound[k];
							activeRow[k] = dgFloat32(0.0f);
							deltaForce[k] = dgFloat32(0.0f);
						} else {
							NEWTON_ASSERT(activeRow[k] > dgFloat32(0.0f));
							deltaForce[k] = accel[k] * invDJMinvJt[k];
							akNum += accel[k] * deltaForce[k];
							accNorm = GetMax(dgAbsf(accel[k]), accNorm);
						}
					}
				}
			}

			i = -1;
			maxPasses = GetMax(maxPasses - 1, dgInt32(1));

		} else if (clampedForceIndex >= 0) {
			akNum = dgFloat32(0.0f);
			accNorm = dgFloat32(0.0f);
			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			for (dgInt32 k = 0; k < count; k++) {
				force[k] += ak * deltaForce[k];
				accel[k] -= ak * deltaAccel[k];
				accNorm = GetMax(dgAbsf(accel[k] * activeRow[k]), accNorm);
				NEWTON_ASSERT(dgCheckFloat(force[k]));
				NEWTON_ASSERT(dgCheckFloat(accel[k]));

				deltaForce[k] = accel[k] * invDJMinvJt[k] * activeRow[k];
				akNum += deltaForce[k] * accel[k];
			}
			force[clampedForceIndex] = clampedForceIndexValue;

			i = -1;
			maxPasses = GetMax(maxPasses - 1, dgInt32(1));

		} else {
			accNorm = dgFloat32(0.0f);
			for (dgInt32 k = 0; k < count; k++) {
				force[k] += ak * deltaForce[k];
				accel[k] -= ak * deltaAccel[k];
				accNorm = GetMax(dgAbsf(accel[k] * activeRow[k]), accNorm);
				NEWTON_ASSERT(dgCheckFloat(force[k]));
				NEWTON_ASSERT(dgCheckFloat(accel[k]));
			}

			if (accNorm > maxAccNorm) {

				akDen = akNum;
				akNum = dgFloat32(0.0f);
				for (dgInt32 k = 0; k < count; k++) {
					deltaAccel[k] = accel[k] * invDJMinvJt[k] * activeRow[k];
					akNum += accel[k] * deltaAccel[k];
				}

				NEWTON_ASSERT(akDen > dgFloat32(0.0f));
				akDen = GetMax(akDen, dgFloat32(1.0e-17f));
				ak = dgFloat32(akNum / akDen);
				for (dgInt32 k = 0; k < count; k++) {
					deltaForce[k] = deltaAccel[k] + ak * deltaForce[k];
				}
			}
		}
	}
}

void dgJacobianMemory::CalculateForcesSimulationModeSimd(
    dgFloat32 maxAccNorm) const {
#ifdef DG_BUILD_SIMD_CODE
	dgInt32 passes;
	dgInt32 prevJoint;
	dgInt32 maxPasses;
	dgInt32 forceRows;
	dgInt32 roundBodyCount;
	dgInt32 totalPassesCount;
	dgFloatSign tmpIndex;

	dgFloat32 akNum;
	dgFloat32 accNorm;
	dgFloat32 *const force = m_force;
	dgFloat32 *const accel = m_accel;
	const dgJacobianPair *const Jt = m_Jt;
	dgFloat32 *const deltaAccel = m_deltaAccel;
	dgFloat32 *const deltaForce = m_deltaForce;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	dgJacobian *const internalForces = m_internalForces;
	dgInt32 *const normalForceIndex = m_normalForceIndex;
	dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgFloat32 *const lowerForceBound = m_lowerBoundFrictionCoefficent;
	dgFloat32 *const upperForceBound = m_upperBoundFrictionCoefficent;
	simd_type forceStepPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];

	dgFloat32 *const forceStep = (dgFloat32 *)forceStepPtr;

	simd_type one;
	simd_type zero;
	simd_type signMask;
	simd_type tol_pos_1eNeg8;
	simd_type tol_pos_1eNeg16;
	simd_type tol_neg_1eNeg16;

	zero = simd_set1(dgFloat32(0.0f));

	roundBodyCount = m_bodyCount & -4;
	for (dgInt32 i = 0; i < roundBodyCount; i += 4) {
		((simd_type &)internalForces[i + 0].m_linear) = zero;
		((simd_type &)internalForces[i + 0].m_angular) = zero;
		((simd_type &)internalForces[i + 1].m_linear) = zero;
		((simd_type &)internalForces[i + 1].m_angular) = zero;
		((simd_type &)internalForces[i + 2].m_linear) = zero;
		((simd_type &)internalForces[i + 2].m_angular) = zero;
		((simd_type &)internalForces[i + 3].m_linear) = zero;
		((simd_type &)internalForces[i + 3].m_angular) = zero;
	}

	for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) {
		((simd_type &)internalForces[i].m_linear) = zero;
		((simd_type &)internalForces[i].m_angular) = zero;
	}

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 j;
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 index;
		dgInt32 first;
		dgInt32 count;

		simd_type y0_linear;
		simd_type y0_angular;
		simd_type y1_linear;
		simd_type y1_angular;

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;

		y0_linear = zero;
		y0_angular = zero;
		y1_linear = zero;
		y1_angular = zero;
		for (j = 0; j < count; j++) {
			simd_type tmp0;
			index = first + j;
			// force = m_force[index];
			tmp0 = simd_set1(force[index]);

			// y0.m_linear += m_Jt[index].m_jacobian_IM0.m_linear.Scale (m_force[index]);
			// y0.m_angular += m_Jt[index].m_jacobian_IM0.m_angular.Scale (m_force[index]);
			// y1.m_linear += m_Jt[index].m_jacobian_IM1.m_linear.Scale (m_force[index]);
			// y1.m_angular += m_Jt[index].m_jacobian_IM1.m_angular.Scale (m_force[index]);
			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0);
		}
		// m_internalForces[j0] = y0;
		// m_internalForces[j1] = y1;
		(simd_type &)internalForces[m0].m_linear =
		    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
		(simd_type &)internalForces[m0].m_angular =
		    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
		(simd_type &)internalForces[m1].m_linear =
		    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
		(simd_type &)internalForces[m1].m_angular =
		    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);
	}

	for (dgInt32 i = 0; i < DG_CONSTRAINT_MAX_ROWS; i += (DG_SIMD_WORD_SIZE * 4)) {
		(simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 0] = zero;
		(simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 1] = zero;
		(simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 2] = zero;
		(simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 3] = zero;
	}

	maxPasses = 4;
	prevJoint = 0;
	accNorm = maxAccNorm * dgFloat32(2.0f);
	for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) {
		accNorm = dgFloat32(0.0f);
		for (dgInt32 currJoint = 0; currJoint < m_jointCount; currJoint++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			//          dgInt32 currJoint;
			dgInt32 rowsCount;
			dgFloat32 jointAccel;
			simd_type y0_linear;
			simd_type y0_angular;
			simd_type y1_linear;
			simd_type y1_angular;

			index = constraintArray[prevJoint].m_autoPairstart;
			rowsCount = constraintArray[prevJoint].m_autoPaircount;
			m0 = constraintArray[prevJoint].m_m0;
			m1 = constraintArray[prevJoint].m_m1;
			y0_linear = zero;
			y0_angular = zero;
			y1_linear = zero;
			y1_angular = zero;

			for (dgInt32 i = 0; i < rowsCount; i++) {
				// dgFloat32 deltaForce;
				// deltaForce = forceStep[i];
				// y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (deltaForce);
				// y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (deltaForce);
				// y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (deltaForce);
				// y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (deltaForce);

				simd_type deltaForce;
				deltaForce = simd_set1(forceStep[i]);
				y0_linear =
				    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, deltaForce);
				y0_angular =
				    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, deltaForce);
				y1_linear =
				    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, deltaForce);
				y1_angular =
				    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, deltaForce);

				index++;
			}
			(simd_type &)internalForces[m0].m_linear =
			    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
			(simd_type &)internalForces[m0].m_angular =
			    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
			(simd_type &)internalForces[m1].m_linear =
			    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
			(simd_type &)internalForces[m1].m_angular =
			    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);

			//          currJoint = jointRemapArray[j];
			index = constraintArray[currJoint].m_autoPairstart;
			rowsCount = constraintArray[currJoint].m_autoPaircount;
			m0 = constraintArray[currJoint].m_m0;
			m1 = constraintArray[currJoint].m_m1;
			// y0 = internalForces[m0];
			// y1 = internalForces[m1];
			y0_linear = (simd_type &)internalForces[m0].m_linear;
			y0_angular = (simd_type &)internalForces[m0].m_angular;
			y1_linear = (simd_type &)internalForces[m1].m_linear;
			y1_angular = (simd_type &)internalForces[m1].m_angular;
			for (dgInt32 i = 0; i < rowsCount; i++) {
				simd_type tmpAccel;

				//              dgVector acc (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
				//              acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct (y0.m_angular);
				//              acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct (y1.m_linear);
				//              acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct (y1.m_angular);

				tmpAccel =
				    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular);

				//              accel[i] = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index];
				tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel));
				tmpAccel =
				    simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1)));
				tmpAccel =
				    simd_mul_add_s(tmpAccel, simd_load_s(force[index]), simd_load_s(diagDamp[index]));
				simd_store_s(simd_sub_s(simd_load_s(coordenateAccel[index]), tmpAccel),
				             &accel[i]);
				index++;
			}

			jointAccel = CalculateJointForcesSimd(currJoint, forceStep, maxAccNorm);
			accNorm = GetMax(accNorm, jointAccel);
			prevJoint = currJoint;
		}
	}

	one = simd_set1(dgFloat32(1.0f));
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgInt32 roundCount;
		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;

		roundCount = count & (-DG_SIMD_WORD_SIZE);
		if (roundCount != count) {
			roundCount += DG_SIMD_WORD_SIZE;
			for (dgInt32 k = count; k < roundCount; k++) {
				dgInt32 j;
				j = first + k;
				// force[j] = dgFloat32 (0.0f);
				// accel[j] -= dgFloat32 (0.0f);
				// deltaAccel[j] = dgFloat32 (0.0f);
				// deltaForce[j] = dgFloat32 (0.0f);
				normalForceIndex[j] = -1;
				simd_store_s(zero, &force[j]);
				simd_store_s(zero, &accel[j]);
				simd_store_s(zero, &deltaAccel[j]);
				simd_store_s(zero, &deltaForce[j]);
				simd_store_s(one, &lowerForceBound[j]);
				simd_store_s(zero, &upperForceBound[j]);
			}
		}

		for (dgInt32 k = 0; k < count; k += DG_SIMD_WORD_SIZE) {
			simd_type normalForce;

			index = first + k;
			// dgBilateralBounds& forceBounds = bilateralForceBounds[index];
			// j = normalForceIndex[index];
			//NEWTON_ASSERT (((j < 0) && (force[j] == dgFloat32 (1.0f))) || ((j >= 0) && (force[j] >= dgFloat32 (0.0f))));
			// val = GetMax (force[j], dgFloat32(0.0f));

			// val = force[j];
			normalForce =
			    simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[index + 0]]), simd_load_s(force[normalForceIndex[index + 1]])), simd_pack_lo_v(simd_load_s(force[normalForceIndex[index + 2]]), simd_load_s(force[normalForceIndex[index + 3]])));

			// lowerForceBound[index] *= val;
			// upperForceBound[index] *= val;
			(simd_type &)lowerForceBound[index] =
			    simd_mul_v(normalForce, (simd_type &)lowerForceBound[index]);
			(simd_type &)upperForceBound[index] =
			    simd_mul_v(normalForce, (simd_type &)upperForceBound[index]);

			// force[index] = ClampValue(force[index], lowerForceBound[index], upperForceBound[index]);
			(simd_type &)force[index] =
			    simd_min_v((simd_type &)upperForceBound[index], simd_max_v((simd_type &)force[index], (simd_type &)lowerForceBound[index]));
		}
	}

	for (dgInt32 i = 0; i < roundBodyCount; i += 4) {
		(simd_type &)internalForces[i + 0].m_linear = zero;
		(simd_type &)internalForces[i + 0].m_angular = zero;
		(simd_type &)internalForces[i + 1].m_linear = zero;
		(simd_type &)internalForces[i + 1].m_angular = zero;
		(simd_type &)internalForces[i + 2].m_linear = zero;
		(simd_type &)internalForces[i + 2].m_angular = zero;
		(simd_type &)internalForces[i + 3].m_linear = zero;
		(simd_type &)internalForces[i + 3].m_angular = zero;
	}
	for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) {
		(simd_type &)internalForces[i].m_linear = zero;
		(simd_type &)internalForces[i].m_angular = zero;
	}

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		// dgFloat32 val;
		simd_type y0_linear;
		simd_type y0_angular;
		simd_type y1_linear;
		simd_type y1_angular;

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;
		// dgJacobian y0 (internalForces[k0]);
		// dgJacobian y1 (internalForces[k1]);
		y0_linear = zero;
		y0_angular = zero;
		y1_linear = zero;
		y1_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			simd_type tmp0;

			index = j + first;

			// val = force[index];
			tmp0 = simd_set1(force[index]);

			// y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val);
			// y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val);
			// y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val);
			// y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val);
			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0);
		}
		// internalForces[k0] = y0;
		// internalForces[k1] = y1;
		(simd_type &)internalForces[m0].m_linear =
		    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
		(simd_type &)internalForces[m0].m_angular =
		    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
		(simd_type &)internalForces[m1].m_linear =
		    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
		(simd_type &)internalForces[m1].m_angular =
		    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);
	}

	forceRows = 0;
	akNum = dgFloat32(0.0f);
	accNorm = dgFloat32(0.0f);
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgInt32 activeCount;

		simd_type y0_linear;
		simd_type y0_angular;
		simd_type y1_linear;
		simd_type y1_angular;
		bool isClamped[DG_CONSTRAINT_MAX_ROWS];

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;

		// const dgJacobian& y0 = internalForces[m0];
		// const dgJacobian& y1 = internalForces[m1];
		y0_linear = (simd_type &)internalForces[m0].m_linear;
		y0_angular = (simd_type &)internalForces[m0].m_angular;
		y1_linear = (simd_type &)internalForces[m1].m_linear;
		y1_angular = (simd_type &)internalForces[m1].m_angular;
		for (dgInt32 j = 0; j < count; j++) {
			simd_type tmp0;
			index = j + first;
			// dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			// tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			// tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			// tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);

			tmp0 =
			    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear);
			tmp0 =
			    simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular);
			tmp0 =
			    simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear);
			tmp0 =
			    simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular);

			// accel[index] = coordenateAccel[index] - (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + force[index] * diagDamp[index]);
			tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0));
			tmp0 =
			    simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 3, 3, 1)));
			tmp0 =
			    simd_sub_s(simd_load_s(coordenateAccel[index]), simd_mul_add_s(tmp0, simd_load_s(force[index]), simd_load_s(diagDamp[index])));
			simd_store_s(tmp0, &accel[index]);
		}

		activeCount = 0;
		for (dgInt32 j = 0; j < count; j++) {
			dgFloat32 val;
			index = j + first;
			val = lowerForceBound[index] - force[index];
			if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) {
				force[index] = lowerForceBound[index];
				accel[index] = dgFloat32(0.0f);
				deltaForce[index] = dgFloat32(0.0f);
				deltaAccel[index] = dgFloat32(0.0f);

				isClamped[j] = true;
			} else {
				val = upperForceBound[index] - force[index];
				if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) {
					force[index] = upperForceBound[index];
					accel[index] = dgFloat32(0.0f);
					deltaForce[index] = dgFloat32(0.0f);
					deltaAccel[index] = dgFloat32(0.0f);
					isClamped[j] = true;
				} else {
					forceRows++;
					activeCount++;
					deltaForce[index] = accel[index] * invDJMinvJt[index];
					akNum += accel[index] * deltaForce[index];
					accNorm = GetMax(dgAbsf(accel[index]), accNorm);
					isClamped[j] = false;
				}
			}
		}

		if (activeCount < count) {
			dgInt32 i0;
			dgInt32 i1;

			i0 = 0;
			i1 = count - 1;
			constraintArray[i].m_autoPairActiveCount = activeCount;
			do {
				while ((i0 <= i1) && !isClamped[i0])
					i0++;
				while ((i0 <= i1) && isClamped[i1])
					i1--;
				if (i0 < i1) {
					SwapRowsSimd(first + i0, first + i1);
					i0++;
					i1--;
				}
			} while (i0 < i1);
		}
	}

	//  maxPasses = 0x7fffffff;
	//  signMask = simd_set1((dgFloat32&) maxPasses);
	tmpIndex.m_integer.m_iVal = 0x7fffffff;
	signMask = simd_set1(tmpIndex.m_fVal);
	zero = simd_set1(dgFloat32(0.0f));
	tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f));
	tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f));
	tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f));

	maxPasses = forceRows;
	totalPassesCount = 0;
	for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) {

		dgInt32 clampedForceIndex;
		dgInt32 clampedForceJoint;
		//      dgFloat32 ak;
		//      dgFloat32 akDen;
		//      dgFloat32 clampedForceIndexValue;
		simd_type akSimd;
		simd_type akDenSimd;
		simd_type clampedForceIndexValue;

		for (dgInt32 i = 0; i < roundBodyCount; i += 4) {
			(simd_type &)internalForces[i + 0].m_linear = zero;
			(simd_type &)internalForces[i + 0].m_angular = zero;
			(simd_type &)internalForces[i + 1].m_linear = zero;
			(simd_type &)internalForces[i + 1].m_angular = zero;
			(simd_type &)internalForces[i + 2].m_linear = zero;
			(simd_type &)internalForces[i + 2].m_angular = zero;
			(simd_type &)internalForces[i + 3].m_linear = zero;
			(simd_type &)internalForces[i + 3].m_angular = zero;
		}
		for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) {
			(simd_type &)internalForces[i].m_linear = zero;
			(simd_type &)internalForces[i].m_angular = zero;
		}

		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgInt32 j;
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 first;
			dgInt32 count;
			dgInt32 index;
			// dgFloat32 ak;
			simd_type y0_linear;
			simd_type y0_angular;
			simd_type y1_linear;
			simd_type y1_angular;

			first = constraintArray[i].m_autoPairstart;
			count = constraintArray[i].m_autoPairActiveCount;
			m0 = constraintArray[i].m_m0;
			m1 = constraintArray[i].m_m1;

			// dgJacobian y0 (internalForces[m0]);
			// dgJacobian y1 (internalForces[m1]);
			y0_linear = zero;
			y0_angular = zero;
			y1_linear = zero;
			y1_angular = zero;

			for (j = 0; j < count; j++) {
				simd_type tmp0;
				index = j + first;
				// ak = deltaForce[index];
				tmp0 = simd_set1(deltaForce[index]);

				// y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (ak);
				// y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (ak);
				// y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (ak);
				// y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (ak);
				y0_linear =
				    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0);
				y0_angular =
				    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0);
				y1_linear =
				    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0);
				y1_angular =
				    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0);
			}
			// internalForces[m0] = y0;
			// internalForces[m1] = y1;
			(simd_type &)internalForces[m0].m_linear =
			    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
			(simd_type &)internalForces[m0].m_angular =
			    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
			(simd_type &)internalForces[m1].m_linear =
			    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
			(simd_type &)internalForces[m1].m_angular =
			    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);
		}

		akDenSimd = zero;
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgInt32 j;
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 first;
			dgInt32 count;
			dgInt32 index;

			simd_type y0_linear;
			simd_type y0_angular;
			simd_type y1_linear;
			simd_type y1_angular;

			first = constraintArray[i].m_autoPairstart;
			count = constraintArray[i].m_autoPairActiveCount;
			m0 = constraintArray[i].m_m0;
			m1 = constraintArray[i].m_m1;
			// const dgJacobian& y0 = internalForces[k0];
			// const dgJacobian& y1 = internalForces[k1];

			y0_linear = (simd_type &)internalForces[m0].m_linear;
			y0_angular = (simd_type &)internalForces[m0].m_angular;
			y1_linear = (simd_type &)internalForces[m1].m_linear;
			y1_angular = (simd_type &)internalForces[m1].m_angular;
			for (j = 0; j < count; j++) {
				simd_type tmp1;
				simd_type tmp2;
				index = j + first;

				// dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
				// tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
				// tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
				// tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);

				tmp2 =
				    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear);
				tmp2 =
				    simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular);
				tmp2 =
				    simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear);
				tmp2 =
				    simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular);

				// deltaAccel[index] = tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + deltaForce[index] * diagDamp[index];
				tmp1 = simd_load_s(deltaForce[index]);
				tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2));
				tmp2 =
				    simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1)));
				tmp2 = simd_mul_add_s(tmp2, tmp1, simd_load_s(diagDamp[index]));
				simd_store_s(tmp2, &deltaAccel[index]);

				// akDen += deltaAccel[index] * deltaForce[index];
				akDenSimd = simd_mul_add_s(akDenSimd, tmp2, tmp1);
			}
		}

		//      NEWTON_ASSERT (akDen > dgFloat32 (0.0f));
		//      akDen = GetMax (akDen, dgFloat32(1.0e-16f));
		//      NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f));
		//      ak = akNum / akDen;

		akSimd =
		    simd_div_s(simd_load_s(akNum), simd_max_s(akDenSimd, tol_pos_1eNeg16));

		clampedForceIndex = -1;
		clampedForceJoint = -1;
		clampedForceIndexValue = zero;
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgInt32 j;
			dgInt32 first;
			dgInt32 count;
			dgInt32 index;

			if (simd_store_is(simd_cmpgt_s(akSimd, tol_pos_1eNeg8))) {
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				for (j = 0; j < count; j++) {
					simd_type tmp1;
					simd_type test0;
					simd_type test1;

					index = j + first;
					// tmp = force[index] + ak * deltaForce[index];

					simd_type force_index;
					simd_type delta_force;
					simd_type low_force;
					simd_type high_force;

					// Make sure AK is not negative
					force_index = simd_set1(force[index]);
					delta_force = simd_set1(deltaForce[index]);
					low_force = simd_set1(lowerForceBound[index]);
					high_force = simd_set1(upperForceBound[index]);

					tmp1 = simd_mul_add_s(force_index, akSimd, delta_force);

					// test0 = (deltaForce[index] < dgFloat32 (-1.0e-16f)) & (tmp < lowerForceBound[index]);
					test0 =
					    simd_and_v(simd_cmplt_s(delta_force, tol_neg_1eNeg16), simd_cmplt_s(tmp1, low_force));

					// test1 = (deltaForce[index] > dgFloat32 ( 1.0e-16f)) & (tmp > upperForceBound[index]);
					test1 =
					    simd_and_v(simd_cmpgt_s(delta_force, tol_pos_1eNeg16), simd_cmpgt_s(tmp1, high_force));

					// test =  test0 | test1;
					if (simd_store_is(simd_or_v(test0, test1))) {
						// clampedForceIndexValue = lowerForceBound[index];
						clampedForceIndexValue =
						    simd_or_v(simd_and_v(low_force, test0), simd_and_v(high_force, test1));

						akSimd =
						    simd_div_s(simd_sub_s(clampedForceIndexValue, force_index), delta_force);

						// clampedForceIndex = test ? j : clampedForceIndex;
						// clampedForceJoint = test ? i : clampedForceJoint;
						clampedForceIndex = j;
						clampedForceJoint = i;
					}
				}
			}
		}

		akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0));
		if (clampedForceIndex >= 0) {
			dgInt32 first;
			dgInt32 count;
			dgInt32 activeCount;
			simd_type tmp0;
			simd_type tmp1;
			bool isClamped[DG_CONSTRAINT_MAX_ROWS];

			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 j;
				dgInt32 first;
				dgInt32 count;
				dgInt32 index;
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				for (j = 0; j < count; j += DG_SIMD_WORD_SIZE) {
					index = j + first;
					// force[index] += ak * deltaForce[index];
					// accel[index] -= ak * deltaAccel[index];
					(simd_type &)force[index] =
					    simd_mul_add_v((simd_type &)force[index], akSimd, (simd_type &)deltaForce[index]);
					(simd_type &)accel[index] =
					    simd_mul_sub_v((simd_type &)accel[index], akSimd, (simd_type &)deltaAccel[index]);
				}
			}

			first = constraintArray[clampedForceJoint].m_autoPairstart;
			count = constraintArray[clampedForceJoint].m_autoPairActiveCount;
			count--;
			// force[first + clampedForceIndex] = clampedForceIndexValue;
			simd_store_s(clampedForceIndexValue, &force[first + clampedForceIndex]);
			accel[first + clampedForceIndex] = dgFloat32(0.0f);
			deltaForce[first + clampedForceIndex] = dgFloat32(0.0f);
			deltaAccel[first + clampedForceIndex] = dgFloat32(0.0f);
			if (clampedForceIndex != count) {
				SwapRowsSimd(first + clampedForceIndex, first + count);
			}

			activeCount = count;
			for (dgInt32 i = 0; i < count; i++) {
				dgInt32 index;
				dgFloat32 val;
				index = first + i;
				isClamped[i] = false;
				val = lowerForceBound[index] - force[index];
				if ((val > dgFloat32(-1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) {
					activeCount--;
					isClamped[i] = true;
					accel[index] = dgFloat32(0.0f);
					deltaForce[index] = dgFloat32(0.0f);
					deltaAccel[index] = dgFloat32(0.0f);

				} else {
					val = upperForceBound[index] - force[index];
					if ((val < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) {
						activeCount--;
						isClamped[i] = true;
						accel[index] = dgFloat32(0.0f);
						deltaForce[index] = dgFloat32(0.0f);
						deltaAccel[index] = dgFloat32(0.0f);
					}
				}
			}

			if (activeCount < count) {
				dgInt32 i0;
				dgInt32 i1;
				i0 = 0;
				i1 = count - 1;
				do {
					while ((i0 <= i1) && !isClamped[i0])
						i0++;
					while ((i0 <= i1) && isClamped[i1])
						i1--;
					if (i0 < i1) {
						SwapRowsSimd(first + i0, first + i1);
						i0++;
						i1--;
					}
				} while (i0 < i1);
			}
			constraintArray[clampedForceJoint].m_autoPairActiveCount = activeCount;

			forceRows = 0;
			// akNum = dgFloat32 (0.0f);
			// accNorm = dgFloat32(0.0f);

			tmp0 = zero;
			tmp1 = zero;
			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 j;
				dgInt32 first;
				dgInt32 count;
				dgInt32 index;
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				forceRows += count;
				//              for (j = 0; j < count; j ++) {
				for (j = 0; j < count; j += DG_SIMD_WORD_SIZE) {
					index = first + j;
					//NEWTON_ASSERT ((i != clampedForceJoint) || !((dgAbsf (lowerForceBound[index] - force[index]) < dgFloat32 (1.0e-5f)) && (accel[index] < dgFloat32 (0.0f))));
					//NEWTON_ASSERT ((i != clampedForceJoint) || !((dgAbsf (upperForceBound[index] - force[index]) < dgFloat32 (1.0e-5f)) && (accel[index] > dgFloat32 (0.0f))));
					// deltaForce[index] = accel[index] * invDJMinvJt[index];
					(simd_type &)deltaForce[index] =
					    simd_mul_v((simd_type &)accel[index], (simd_type &)invDJMinvJt[index]);

					// akNum += deltaForce[index] * accel[index];
					tmp0 =
					    simd_mul_add_v(tmp0, (simd_type &)accel[index], (simd_type &)deltaForce[index]);

					// accNorm = GetMax (dgAbsf (accel[index]), accNorm);
					tmp1 =
					    simd_max_v(tmp1, simd_and_v((simd_type &)accel[index], signMask));
					//NEWTON_ASSERT (dgCheckFloat(deltaForce[index]));
				}
			}

			tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0));
			tmp0 =
			    simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp0, &akNum);

			tmp1 = simd_max_v(tmp1, simd_move_hl_v(tmp1, tmp1));
			tmp1 =
			    simd_max_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp1, &accNorm);

			NEWTON_ASSERT(akNum >= dgFloat32(0.0f));
			passes = -1;
			maxPasses = forceRows;

		} else {
			simd_type tmp1;

			// accNorm = dgFloat32(0.0f);
			tmp1 = zero;
			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 first;
				dgInt32 count;
				dgInt32 index;
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) {
					index = j + first;
					// force[index] += ak * deltaForce[index];
					// accel[index] -= ak * deltaAccel[index];
					(simd_type &)force[index] =
					    simd_mul_add_v((simd_type &)force[index], akSimd, (simd_type &)deltaForce[index]);
					(simd_type &)accel[index] =
					    simd_mul_sub_v((simd_type &)accel[index], akSimd, (simd_type &)deltaAccel[index]);

					// accNorm = GetMax (dgAbsf (accel[index]), accNorm);
					tmp1 =
					    simd_max_v(tmp1, simd_and_v((simd_type &)accel[index], signMask));
				}
			}

			tmp1 = simd_max_v(tmp1, simd_move_hl_v(tmp1, tmp1));
			tmp1 =
			    simd_max_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(tmp1, &accNorm);

			if (accNorm > maxAccNorm) {
				simd_type tmp0;
				// simd_type akNum;
				// akDen = akNum;
				// akNum = dgFloat32(0.0f);

				tmp0 = zero;
				akDenSimd = simd_set1(akNum);
				for (dgInt32 i = 0; i < m_jointCount; i++) {
					dgInt32 first;
					dgInt32 count;
					dgInt32 index;
					first = constraintArray[i].m_autoPairstart;
					count = constraintArray[i].m_autoPairActiveCount;
					for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) {
						index = j + first;
						// deltaAccel[index] = accel[index] * invDJMinvJt[index];
						(simd_type &)deltaAccel[index] =
						    simd_mul_v((simd_type &)accel[index], (simd_type &)invDJMinvJt[index]);

						// akNum += accel[index] * deltaAccel[index];
						tmp0 =
						    simd_mul_add_v(tmp0, (simd_type &)accel[index], (simd_type &)deltaAccel[index]);
					}
				}

				tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0));
				tmp0 =
				    simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1)));
				simd_store_s(tmp0, &akNum);

				//              NEWTON_ASSERT (akNum >= dgFloat32 (0.0f));
				//              NEWTON_ASSERT (akDen > dgFloat32(0.0f));
				//              akDen = GetMax (akDen, dgFloat32 (1.0e-17f));
				//              ak = dgFloat32 (akNum / akDen);

				tmp0 =
				    simd_div_s(tmp0, simd_max_s(akDenSimd, simd_set1(dgFloat32(1.0e-17f))));
				tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0));
				for (dgInt32 i = 0; i < m_jointCount; i++) {
					dgInt32 first;
					dgInt32 count;
					dgInt32 index;
					first = constraintArray[i].m_autoPairstart;
					count = constraintArray[i].m_autoPairActiveCount;
					for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) {
						index = j + first;
						// deltaForce[index] = deltaAccel[index] + ak * deltaForce[index];
						(simd_type &)deltaForce[index] =
						    simd_mul_add_v((simd_type &)deltaAccel[index], tmp0, (simd_type &)deltaForce[index]);
					}
				}
			}
		}
		totalPassesCount++;
	}
	ApplyExternalForcesAndAccelerationSimd(maxAccNorm);
#endif
}

void dgJacobianMemory::CalculateForcesSimulationMode(dgFloat32 maxAccNorm) const {
	dgInt32 passes;
	dgInt32 prevJoint;
	dgInt32 maxPasses;
	dgInt32 forceRows;
	//dgInt32 totalPassesCount;
	dgFloat32 akNum;
	dgFloat32 accNorm;
	dgFloat32 *const force = m_force;
	dgFloat32 *const accel = m_accel;
	const dgJacobianPair *const Jt = m_Jt;
	dgFloat32 *const deltaAccel = m_deltaAccel;
	dgFloat32 *const deltaForce = m_deltaForce;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;

	dgJacobian *const internalForces = m_internalForces;
	dgInt32 *const normalForceIndex = m_normalForceIndex;
	dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgFloat32 *const lowerForceBound = m_lowerBoundFrictionCoefficent;
	dgFloat32 *const upperForceBound = m_upperBoundFrictionCoefficent;
	dgFloat32 forceStep[DG_CONSTRAINT_MAX_ROWS];

	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));

	// initialize the intermediate force accumulation to zero
	for (dgInt32 i = 0; i < m_bodyCount; i++) {
		internalForces[i].m_linear = zero;
		internalForces[i].m_angular = zero;
	}

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 index;
		dgInt32 first;
		dgInt32 count;
		dgFloat32 val;
		dgJacobian y0;
		dgJacobian y1;

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;

		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			index = first + j;
			val = force[index];
			y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val);
			y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val);
			y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val);
			y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val);
		}
		internalForces[m0].m_linear += y0.m_linear;
		internalForces[m0].m_angular += y0.m_angular;
		internalForces[m1].m_linear += y1.m_linear;
		internalForces[m1].m_angular += y1.m_angular;
	}

	for (dgInt32 i = 0; i < DG_CONSTRAINT_MAX_ROWS; i++) {
		forceStep[i] = dgFloat32(0.0f);
	}

	maxPasses = 4;
	prevJoint = 0;
	accNorm = maxAccNorm * dgFloat32(2.0f);
	for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) {
		accNorm = dgFloat32(0.0f);
		for (dgInt32 currJoint = 0; currJoint < m_jointCount; currJoint++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			dgInt32 rowsCount;
			dgFloat32 jointAccel;
			dgJacobian y0;
			dgJacobian y1;

			index = constraintArray[prevJoint].m_autoPairstart;
			rowsCount = constraintArray[prevJoint].m_autoPaircount;
			m0 = constraintArray[prevJoint].m_m0;
			m1 = constraintArray[prevJoint].m_m1;

			y0.m_linear = zero;
			y0.m_angular = zero;
			y1.m_linear = zero;
			y1.m_angular = zero;
			for (dgInt32 i = 0; i < rowsCount; i++) {
				dgFloat32 deltaForceI;
				deltaForceI = forceStep[i];
				y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(deltaForceI);
				y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(deltaForceI);
				y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(deltaForceI);
				y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(deltaForceI);
				index++;
			}
			internalForces[m0].m_linear += y0.m_linear;
			internalForces[m0].m_angular += y0.m_angular;
			internalForces[m1].m_linear += y1.m_linear;
			internalForces[m1].m_angular += y1.m_angular;

			index = constraintArray[currJoint].m_autoPairstart;
			rowsCount = constraintArray[currJoint].m_autoPaircount;
			m0 = constraintArray[currJoint].m_m0;
			m1 = constraintArray[currJoint].m_m1;
			y0 = internalForces[m0];
			y1 = internalForces[m1];
			for (dgInt32 i = 0; i < rowsCount; i++) {
				dgVector acc(
				    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
				acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
				acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
				acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);

				accel[i] = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index];
				index++;
			}

			jointAccel = CalculateJointForces(currJoint, forceStep, maxAccNorm);
			accNorm = GetMax(accNorm, jointAccel);
			prevJoint = currJoint;
		}
	}

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 j;
		dgInt32 k;
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgFloat32 val;
		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPaircount;
		constraintArray[i].m_autoPaircount = count;
		for (k = 0; k < count; k++) {
			index = first + k;
			j = normalForceIndex[index];
			NEWTON_ASSERT(((j < 0) && (force[j] == dgFloat32(1.0f))) || ((j >= 0) && (force[j] >= dgFloat32(0.0f))));
			val = force[j];
			lowerForceBound[index] *= val;
			upperForceBound[index] *= val;

			val = force[index];
			force[index] = ClampValue(val, lowerForceBound[index], upperForceBound[index]);
		}
	}

	for (dgInt32 i = 0; i < m_bodyCount; i++) {
		internalForces[i].m_linear[0] = dgFloat32(0.0f);
		internalForces[i].m_linear[1] = dgFloat32(0.0f);
		internalForces[i].m_linear[2] = dgFloat32(0.0f);
		internalForces[i].m_linear[3] = dgFloat32(0.0f);
		internalForces[i].m_angular[0] = dgFloat32(0.0f);
		internalForces[i].m_angular[1] = dgFloat32(0.0f);
		internalForces[i].m_angular[2] = dgFloat32(0.0f);
		internalForces[i].m_angular[3] = dgFloat32(0.0f);
	}

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 j;
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgJacobian y0;
		dgJacobian y1;

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;
		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;
		for (j = 0; j < count; j++) {
			dgFloat32 val;
			index = j + first;
			val = force[index];
			y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val);
			y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val);
			y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val);
			y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val);
		}
		internalForces[m0].m_linear += y0.m_linear;
		internalForces[m0].m_angular += y0.m_angular;
		internalForces[m1].m_linear += y1.m_linear;
		internalForces[m1].m_angular += y1.m_angular;
	}

	forceRows = 0;
	akNum = dgFloat32(0.0f);
	accNorm = dgFloat32(0.0f);
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 m0;
		dgInt32 m1;
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgInt32 activeCount;
		bool isClamped[DG_CONSTRAINT_MAX_ROWS];

		first = constraintArray[i].m_autoPairstart;
		count = constraintArray[i].m_autoPairActiveCount;
		m0 = constraintArray[i].m_m0;
		m1 = constraintArray[i].m_m1;

		const dgJacobian &y0 = internalForces[m0];
		const dgJacobian &y1 = internalForces[m1];
		for (dgInt32 j = 0; j < count; j++) {
			index = j + first;
			dgVector tmpAccel(JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
			accel[index] = coordenateAccel[index] - (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + force[index] * diagDamp[index]);
		}

		activeCount = 0;
		for (dgInt32 j = 0; j < count; j++) {
			dgFloat32 val;
			index = j + first;
			val = lowerForceBound[index] - force[index];
			if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) {
				force[index] = lowerForceBound[index];
				isClamped[j] = true;
			} else {
				val = upperForceBound[index] - force[index];
				if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) {
					force[index] = upperForceBound[index];
					isClamped[j] = true;
				} else {
					forceRows++;
					activeCount++;
					deltaForce[index] = accel[index] * invDJMinvJt[index];
					akNum += accel[index] * deltaForce[index];
					accNorm = GetMax(dgAbsf(accel[index]), accNorm);
					isClamped[j] = false;
				}
			}
		}

		if (activeCount < count) {
			dgInt32 i0;
			dgInt32 i1;

			i0 = 0;
			i1 = count - 1;
			constraintArray[i].m_autoPairActiveCount = activeCount;
			do {
				while ((i0 <= i1) && !isClamped[i0])
					i0++;
				while ((i0 <= i1) && isClamped[i1])
					i1--;
				if (i0 < i1) {
					SwapRows(first + i0, first + i1);
					i0++;
					i1--;
				}
			} while (i0 < i1);
		}
	}

	maxPasses = forceRows;
	//totalPassesCount = 0;
	for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) {
		dgInt32 clampedForceIndex;
		dgInt32 clampedForceJoint;
		dgFloat32 ak;
		dgFloat32 akDen;
		dgFloat32 clampedForceIndexValue;

		for (dgInt32 i = 0; i < m_bodyCount; i++) {
			internalForces[i].m_linear[0] = dgFloat32(0.0f);
			internalForces[i].m_linear[1] = dgFloat32(0.0f);
			internalForces[i].m_linear[2] = dgFloat32(0.0f);
			internalForces[i].m_linear[3] = dgFloat32(0.0f);
			internalForces[i].m_angular[0] = dgFloat32(0.0f);
			internalForces[i].m_angular[1] = dgFloat32(0.0f);
			internalForces[i].m_angular[2] = dgFloat32(0.0f);
			internalForces[i].m_angular[3] = dgFloat32(0.0f);
		}

		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgJacobian y0;
			dgJacobian y1;

			dgInt32 jfirst = constraintArray[i].m_autoPairstart;
			dgInt32 jcount = constraintArray[i].m_autoPairActiveCount;
			dgInt32 m0 = constraintArray[i].m_m0;
			dgInt32 m1 = constraintArray[i].m_m1;
			y0.m_linear = zero;
			y0.m_angular = zero;
			y1.m_linear = zero;
			y1.m_angular = zero;
			for (dgInt32 j = 0; j < jcount; j++) {
				dgInt32 index = j + jfirst;
				dgFloat32 jak = deltaForce[index];
				y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(jak);
				y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(jak);
				y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(jak);
				y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(jak);
			}
			internalForces[m0].m_linear += y0.m_linear;
			internalForces[m0].m_angular += y0.m_angular;
			internalForces[m1].m_linear += y1.m_linear;
			internalForces[m1].m_angular += y1.m_angular;
		}

		akDen = dgFloat32(0.0f);
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgInt32 j;
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 first;
			dgInt32 count;
			dgInt32 index;
			first = constraintArray[i].m_autoPairstart;
			count = constraintArray[i].m_autoPairActiveCount;
			m0 = constraintArray[i].m_m0;
			m1 = constraintArray[i].m_m1;
			const dgJacobian &y0 = internalForces[m0];
			const dgJacobian &y1 = internalForces[m1];
			for (j = 0; j < count; j++) {
				index = j + first;
				dgVector tmpAccel(JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
				tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
				tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
				tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
				deltaAccel[index] = tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + deltaForce[index] * diagDamp[index];
				akDen += deltaAccel[index] * deltaForce[index];
			}
		}

		NEWTON_ASSERT(akDen > dgFloat32(0.0f));
		akDen = GetMax(akDen, dgFloat32(1.0e-16f));
		NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f));
		ak = akNum / akDen;
		clampedForceIndex = -1;
		clampedForceJoint = -1;
		clampedForceIndexValue = dgFloat32(0.0f);
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			dgInt32 j;
			dgInt32 first;
			dgInt32 count;
			dgInt32 index;
			dgFloat32 val;
			if (ak > dgFloat32(1.0e-8f)) {
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				for (j = 0; j < count; j++) {
					index = j + first;
					val = force[index] + ak * deltaForce[index];
					if (deltaForce[index] < dgFloat32(-1.0e-16f)) {
						// if (val < bilateralForceBounds[index].m_low) {
						if (val < lowerForceBound[index]) {
							ak = GetMax((lowerForceBound[index] - force[index]) / deltaForce[index], dgFloat32(0.0f));
							NEWTON_ASSERT(ak >= dgFloat32(0.0f));
							clampedForceIndex = j;
							clampedForceJoint = i;
							// clampedForceIndexValue = bilateralForceBounds[index].m_low;
							clampedForceIndexValue = lowerForceBound[index];
						}
					} else if (deltaForce[index] > dgFloat32(1.0e-16f)) {
						// if (val > bilateralForceBounds[index].m_upper) {
						if (val > upperForceBound[index]) {
							ak = GetMax((upperForceBound[index] - force[index]) / deltaForce[index], dgFloat32(0.0f));
							NEWTON_ASSERT(ak >= dgFloat32(0.0f));
							clampedForceIndex = j;
							clampedForceJoint = i;
							clampedForceIndexValue = upperForceBound[index];
						}
					}
				}
			}
		}

		if (clampedForceIndex >= 0) {
			dgInt32 first;
			dgInt32 count;
			dgInt32 activeCount;
			bool isClamped[DG_CONSTRAINT_MAX_ROWS];

			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 jfirst = constraintArray[i].m_autoPairstart;
				dgInt32 jcount = constraintArray[i].m_autoPairActiveCount;
				for (dgInt32 j = 0; j < jcount; j++) {
					dgInt32 jindex = j + jfirst;
					force[jindex] += ak * deltaForce[jindex];
					accel[jindex] -= ak * deltaAccel[jindex];
				}
			}

			first = constraintArray[clampedForceJoint].m_autoPairstart;
			count = constraintArray[clampedForceJoint].m_autoPairActiveCount;
			count--;
			force[first + clampedForceIndex] = clampedForceIndexValue;
			if (clampedForceIndex != count) {
				SwapRows(first + clampedForceIndex, first + count);
			}

			activeCount = count;
			for (dgInt32 i = 0; i < count; i++) {
				dgInt32 index;
				dgFloat32 val;
				index = first + i;
				isClamped[i] = false;
				val = lowerForceBound[index] - force[index];
				if ((val > dgFloat32(-1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) {
					activeCount--;
					isClamped[i] = true;
				} else {
					val = upperForceBound[index] - force[index];
					if ((val < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) {
						activeCount--;
						isClamped[i] = true;
					}
				}
			}

			if (activeCount < count) {
				dgInt32 i0;
				dgInt32 i1;
				i0 = 0;
				i1 = count - 1;
				do {
					while ((i0 <= i1) && !isClamped[i0])
						i0++;
					while ((i0 <= i1) && isClamped[i1])
						i1--;
					if (i0 < i1) {
						SwapRows(first + i0, first + i1);
						// Swap (isClamped[i0], isClamped[i1]);
						// Swap (permutationIndex[i0], permutationIndex[i1]);
						i0++;
						i1--;
					}
				} while (i0 < i1);
			}
			constraintArray[clampedForceJoint].m_autoPairActiveCount = activeCount;

			forceRows = 0;
			akNum = dgFloat32(0.0f);
			accNorm = dgFloat32(0.0f);
			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 jfirst = constraintArray[i].m_autoPairstart;
				dgInt32 jcount = constraintArray[i].m_autoPairActiveCount;
				forceRows += jcount;
				for (dgInt32 j = 0; j < jcount; j++) {
					dgInt32 jindex = jfirst + j;
					NEWTON_ASSERT((i != clampedForceJoint) || !((dgAbsf(lowerForceBound[jindex] - force[jindex]) < dgFloat32(1.0e-5f)) && (accel[jindex] < dgFloat32(0.0f))));
					NEWTON_ASSERT((i != clampedForceJoint) || !((dgAbsf(upperForceBound[jindex] - force[jindex]) < dgFloat32(1.0e-5f)) && (accel[jindex] > dgFloat32(0.0f))));
					deltaForce[jindex] = accel[jindex] * invDJMinvJt[jindex];
					akNum += deltaForce[jindex] * accel[jindex];
					accNorm = GetMax(dgAbsf(accel[jindex]), accNorm);
					NEWTON_ASSERT(dgCheckFloat(deltaForce[jindex]));
				}
			}

			NEWTON_ASSERT(akNum >= dgFloat32(0.0f));
			passes = -1;
			maxPasses = forceRows;

		} else {
			accNorm = dgFloat32(0.0f);
			for (dgInt32 i = 0; i < m_jointCount; i++) {
				dgInt32 j;
				dgInt32 first;
				dgInt32 count;
				dgInt32 index;
				first = constraintArray[i].m_autoPairstart;
				count = constraintArray[i].m_autoPairActiveCount;
				for (j = 0; j < count; j++) {
					index = j + first;
					force[index] += ak * deltaForce[index];
					accel[index] -= ak * deltaAccel[index];
					accNorm = GetMax(dgAbsf(accel[index]), accNorm);
				}
			}

			if (accNorm > maxAccNorm) {
				akDen = akNum;
				akNum = dgFloat32(0.0f);
				for (dgInt32 i = 0; i < m_jointCount; i++) {
					dgInt32 j;
					dgInt32 first;
					dgInt32 count;
					dgInt32 index;
					first = constraintArray[i].m_autoPairstart;
					count = constraintArray[i].m_autoPairActiveCount;
					for (j = 0; j < count; j++) {
						index = j + first;
						deltaAccel[index] = accel[index] * invDJMinvJt[index];
						akNum += accel[index] * deltaAccel[index];
					}
				}

				NEWTON_ASSERT(akNum >= dgFloat32(0.0f));
				NEWTON_ASSERT(akDen > dgFloat32(0.0f));
				akDen = GetMax(akDen, dgFloat32(1.0e-17f));
				ak = dgFloat32(akNum / akDen);
				for (dgInt32 i = 0; i < m_jointCount; i++) {
					dgInt32 j;
					dgInt32 first;
					dgInt32 count;
					dgInt32 index;
					first = constraintArray[i].m_autoPairstart;
					count = constraintArray[i].m_autoPairActiveCount;
					for (j = 0; j < count; j++) {
						index = j + first;
						deltaForce[index] = deltaAccel[index] + ak * deltaForce[index];
					}
				}
			}
		}
		//totalPassesCount++;
	}
	ApplyExternalForcesAndAcceleration(maxAccNorm);
}

dgFloat32 dgJacobianMemory::CalculateJointForcesSimd(dgInt32 joint,
        dgFloat32 *forceStep, dgFloat32 maxAccNorm) const {

#ifdef DG_BUILD_SIMD_CODE

	dgInt32 m0;
	dgInt32 m1;
	dgInt32 first;
	dgInt32 count;
	dgInt32 roundCount;
	dgInt32 maxPasses;
	dgInt32 clampedForceIndex;
	dgFloat32 ak;
	dgFloat32 akNum;
	dgFloat32 retAccel;
	dgFloatSign tmpIndex;
	// dgFloat32 akDen;
	// dgFloat32 force;
	dgFloat32 accNorm;
	// dgFloat32 retAccNorm;
	dgFloat32 clampedForceIndexValue;
	// dgJacobian y0;
	// dgJacobian y1;
	//  simd_type tmp0;
	//  simd_type tmp1;
	//  simd_type tmp2;
	//  simd_type tmp3;

	simd_type akNumSimd;
	simd_type accNormSimd;
	simd_type maxPassesSimd;
	simd_type y0_linear;
	simd_type y0_angular;
	simd_type y1_linear;
	simd_type y1_angular;
	simd_type one;
	simd_type zero;
	simd_type signMask;
	simd_type tol_pos_1eNeg5;
	simd_type tol_pos_1eNeg8;
	simd_type tol_neg_1eNeg16;
	simd_type tol_pos_1eNeg16;
	simd_type deltaAccelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type deltaForcePtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type activeRowPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type lowBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];
	simd_type highBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE];

	dgFloat32 *const deltaAccel = (dgFloat32 *)deltaAccelPtr;
	dgFloat32 *const deltaForce = (dgFloat32 *)deltaForcePtr;
	dgFloat32 *const activeRow = (dgFloat32 *)activeRowPtr;
	dgFloat32 *const lowBound = (dgFloat32 *)lowBoundPtr;
	dgFloat32 *const highBound = (dgFloat32 *)highBoundPtr;

	dgFloat32 *const accel = m_accel;
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	//  const dgFloat32* const lowerFrictionForce = m_lowerBoundFrictionCoefficent;
	//  const dgFloat32* const upperFrictionForce = m_upperBoundFrictionCoefficent;
	const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent;

	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgJointInfo *const constraintArray = m_constraintArray;
	//  const dgJacobianIndex* const jacobianIndexArray = m_jacobianIndexArray;

	//  count = 0x7fffffff;
	//  signMask = simd_set1((dgFloat32&) count);
	tmpIndex.m_integer.m_iVal = 0x7fffffff;
	signMask = simd_set1(tmpIndex.m_fVal);

	one = simd_set1(dgFloat32(1.0f));
	zero = simd_set1(dgFloat32(0.0f));
	tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f));
	tol_pos_1eNeg5 = simd_set1(dgFloat32(1.0e-5f));
	tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f));
	tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f));

	first = constraintArray[joint].m_autoPairstart;
	count = constraintArray[joint].m_autoPaircount;
	m0 = constraintArray[joint].m_m0;
	m1 = constraintArray[joint].m_m1;

	roundCount = count & (-DG_SIMD_WORD_SIZE);
	if (roundCount != count) {
		roundCount += 4;

		for (dgInt32 j = count; j < roundCount; j++) {
			dgInt32 i;
			i = first + j;
			// force[i] = dgFloat32 (0.0f);
			// accel[j] -= dgFloat32 (0.0f);
			// activeRow[j] = dgFloat32 (0.0f);
			// deltaAccel[j] = dgFloat32 (0.0f);
			// deltaForce[j] = dgFloat32 (0.0f);
			m_normalForceIndex[i] = -1;
			simd_store_s(zero, &force[i]);
			simd_store_s(zero, &accel[j]);
			simd_store_s(zero, &activeRow[j]);
			simd_store_s(zero, &deltaAccel[j]);
			simd_store_s(zero, &deltaForce[j]);
			simd_store_s(one, &m_lowerBoundFrictionCoefficent[i]);
			simd_store_s(zero, &m_upperBoundFrictionCoefficent[i]);
		}
	}

	// akNum = dgFloat32 (0.0f);
	// accNorm = dgFloat32(0.0f);
	// maxPasses = count;
	akNumSimd = zero;
	accNormSimd = zero;
	maxPassesSimd = zero;
	//  tmp3 = simd_set1(dgFloat32 (1.0f));

	// for (j = 0; j < count; j ++) {
	for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
		dgInt32 i;
		// dgInt32 k0;
		// dgInt32 k1;
		// dgInt32 k2;
		// dgInt32 k3;
		// simd_type index_k;
		// simd_type accel_j;
		simd_type force_k;
		// simd_type force_i;
		// simd_type lowBound_j;
		// simd_type higntBound_j;
		// simd_type deltaforce_j;
		simd_type lowHighBound_test;

		i = first + j;
		// k = bilateralForceBounds[i].m_normalIndex;
		// k0 = normalForceIndex[i + 0];
		// k1 = normalForceIndex[i + 1];
		// k2 = normalForceIndex[i + 2];
		// k3 = normalForceIndex[i + 3];

		// val = (k >= 0) ? force[k] : dgFloat32 (1.0f);
		// val = force[k];
		// index_k = simd_move_lh_v (simd_pack_lo_v (simd_load_is (zero, k0), simd_load_is (zero, k1)), simd_pack_lo_v (simd_load_is (zero, k2), simd_load_is (zero, k3)));
		// index_k = simd_cmpge_v (index_k, zero);
		force_k =
		    simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 0]]), simd_load_s(force[normalForceIndex[i + 1]])), simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 2]]), simd_load_s(force[normalForceIndex[i + 3]])));
		// force_k = simd_or_v (simd_and_v(force_k, index_k), simd_andnot_v (one, index_k));

		// lowBound[j] = val * bilateralForceBounds[i].m_low;
		// highBound[j] = val * bilateralForceBounds[i].m_upper;
		(simd_type &)lowBound[j] =
		    simd_mul_v(force_k, (simd_type &)lowerFrictionCoef[i]);
		(simd_type &)highBound[j] =
		    simd_mul_v(force_k, (simd_type &)upperFrictionCoef[i]);

		// activeRow[j] = dgFloat32 (1.0f);
		// forceStep[j] = m_force[i];
		// if (force[i] < lowBound[j]) {
		//  maxPasses --;
		//  force[i] = lowBound[j];
		//  activeRow[j] = dgFloat32 (0.0f);
		// } else if (force[i] > highBound[j]) {
		//  maxPasses --;
		//  force[i] = highBound[j];
		//  activeRow[j] = dgFloat32 (0.0f);
		// }

		(simd_type &)forceStep[j] = (simd_type &)force[i];
		//      lowBound_test = simd_cmplt_v (force_i, lowBound_j);
		//      higntBound_test = simd_cmpgt_v (force_i, higntBound_j);
		lowHighBound_test =
		    simd_or_v(simd_cmplt_v((simd_type &)force[i], (simd_type &)lowBound[j]), simd_cmpgt_v((simd_type &)force[i], (simd_type &)highBound[j]));
		(simd_type &)activeRow[j] = simd_andnot_v(one, lowHighBound_test);
		maxPassesSimd = simd_add_v(maxPassesSimd, (simd_type &)activeRow[j]);

		// force_k = simd_or_v (simd_and_v (lowBound_j, lowBound_test), simd_and_v (higntBound_j, higntBound_test));
		//(simd_type&)force[i] = simd_mul_v (activeRow_j, simd_or_v (simd_and_v (force_k, lowHighBound_test), simd_andnot_v (force_i, lowHighBound_test)));
		(simd_type &)force[i] =
		    simd_min_v((simd_type &)highBound[j], simd_max_v((simd_type &)force[i], (simd_type &)lowBound[j]));

		// deltaForce[j] = accel[j] * invDJMinvJt[i] * activeRow[j];
		(simd_type &)deltaForce[j] =
		    simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[i], (simd_type &)activeRow[j]));

		// akNum += accel[j] * deltaForce[j];
		akNumSimd =
		    simd_mul_add_v(akNumSimd, (simd_type &)accel[j], (simd_type &)deltaForce[j]);

		// accNorm = GetMax (dgAbsf (accel[j] * activeRow[j]), accNorm);
		accNormSimd =
		    simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask));
	}
	akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd));
	simd_store_s(
	    simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1))),
	    &akNum);

	accNormSimd =
	    simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd));
	simd_store_s(
	    simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1))),
	    &accNorm);

	maxPassesSimd =
	    simd_add_v(maxPassesSimd, simd_move_hl_v(maxPassesSimd, maxPassesSimd));
	maxPasses =
	    simd_store_is(simd_add_s(maxPassesSimd, simd_permut_v(maxPassesSimd, maxPassesSimd, PURMUT_MASK(0, 0, 0, 1))));

	retAccel = accNorm;
	clampedForceIndexValue = dgFloat32(0.0f);
	for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) {
		simd_type akSimd;
		simd_type akDenSimd;

		// y0.m_linear = zero;
		// y0.m_angular = zero;
		// y1.m_linear = zero;
		// y1.m_angular = zero;
		y0_linear = zero;
		y0_angular = zero;
		y1_linear = zero;
		y1_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 k;
			simd_type tmp1;
			k = j + first;
			// ak = deltaForce[j];
			tmp1 = simd_set1(deltaForce[j]);
			// y0.m_linear += m_Jt[k].m_jacobian_IM0.m_linear.Scale (ak);
			// y0.m_angular += m_Jt[k].m_jacobian_IM0.m_angular.Scale (ak);
			// y1.m_linear += m_Jt[k].m_jacobian_IM1.m_linear.Scale (ak);
			// y1.m_angular += m_Jt[k].m_jacobian_IM1.m_angular.Scale (ak);

			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[k].m_jacobian_IM0.m_linear, tmp1);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[k].m_jacobian_IM0.m_angular, tmp1);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[k].m_jacobian_IM1.m_linear, tmp1);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[k].m_jacobian_IM1.m_angular, tmp1);
		}

		// akDen = dgFloat32 (0.0f);
		akDenSimd = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 k;
			simd_type tmp1;
			k = j + first;

			// dgVector acc (m_JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			// acc += m_JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			// acc += m_JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			// acc += m_JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
			tmp1 =
			    simd_mul_v((simd_type &)JMinv[k].m_jacobian_IM0.m_linear, y0_linear);
			tmp1 =
			    simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM0.m_angular, y0_angular);
			tmp1 =
			    simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM1.m_linear, y1_linear);
			tmp1 =
			    simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM1.m_angular, y1_angular);

			// deltaAccel[j] = acc.m_x + acc.m_y + acc.m_z + deltaForce[j] * m_diagDamp[k];
			tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1));
			tmp1 =
			    simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(3, 3, 3, 1)));
			tmp1 =
			    simd_mul_add_s(tmp1, simd_load_s(deltaForce[j]), simd_load_s(diagDamp[k]));
			simd_store_s(tmp1, &deltaAccel[j]);

			// akDen += deltaAccel[j] * deltaForce[j];
			akDenSimd = simd_mul_add_s(akDenSimd, tmp1, simd_load_s(deltaForce[j]));
		}

		//NEWTON_ASSERT (akDen > dgFloat32 (0.0f));
		// akDen = GetMax (akDen, dgFloat32(1.0e-16f));
		//NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f));
		// ak = akNum / akDen;
		akSimd =
		    simd_div_s(simd_load_s(akNum), simd_max_s(akDenSimd, tol_pos_1eNeg16));

		//      simd_store_s (tmp0, &ak);
		//      clampedForceIndex = -1;
		simd_type min_index;
		simd_type minClampIndex;
		simd_type min_index_step;
		simd_type campedIndexValue;

		campedIndexValue = zero;
		minClampIndex = simd_set1(dgFloat32(-1.0f));
		min_index_step = simd_set1(dgFloat32(4.0f));
		akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0));
		min_index =
		    simd_set(dgFloat32(0.0f), dgFloat32(1.0f), dgFloat32(2.0f), dgFloat32(3.0f));

		//      for (j = 0; j < roundCount; j ++) {
		for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
			//          if (activeRow[j]) {
			//          dgFloat32 val;
			//          k = j + first;
			//          if (deltaForce[j] < dgFloat32 (-1.0e-16f)) {
			//          val = force[k] + ak * deltaForce[j];
			//          if (val < lowBound[j]) {
			//          ak = (lowBound[j] - force[k]) / deltaForce[j];
			//          clampedForceIndex = j;
			//          clampedForceIndexValue = lowBound[j];
			//          if (ak < dgFloat32 (1.0e-8f)) {
			//          ak = dgFloat32 (0.0f);
			//          break;
			//          }
			//          }
			//          } else if (deltaForce[j] > dgFloat32 (1.0e-16f)) {
			//          val = force[k] + ak * deltaForce[j];
			//          if (val > highBound[j]) {
			//          ak = (highBound[j] - force[k]) / deltaForce[j];
			//          clampedForceIndex = j;
			//          clampedForceIndexValue = highBound[j];
			//          if (ak < dgFloat32 (1.0e-8f)) {
			//          ak = dgFloat32 (0.0f);
			//          break;
			//          }
			//          }
			//          }
			//          }
			//
			// bool test;
			// bool negTest;
			// bool posTest;
			// bool negValTest;
			// bool posValTest;
			// bool negDeltaForceTest;
			// bool posDeltaForceTest;
			// dgFloat32 val;
			// dgFloat32 num;
			// dgFloat32 den;

			dgInt32 k;
			simd_type val;
			simd_type num;
			simd_type den;
			simd_type test;
			simd_type negTest;
			simd_type posTest;
			simd_type negValTest;
			simd_type posValTest;
			simd_type negDeltaForceTest;
			simd_type posDeltaForceTest;

			// Make sure AK is not negative
			k = j + first;
			// val = force[k] + ak * deltaForce[j];
			val =
			    simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]);

			// negValTest = val < lowBound[j];
			negValTest = simd_cmplt_v(val, (simd_type &)lowBound[j]);

			// posValTest = val > highBound[j];
			posValTest = simd_cmpgt_v(val, (simd_type &)highBound[j]);

			// negDeltaForceTest = deltaForce[j] < dgFloat32 (-1.0e-16f);
			negDeltaForceTest =
			    simd_cmplt_v((simd_type &)deltaForce[j], tol_neg_1eNeg16);

			// posDeltaForceTest = deltaForce[j] > dgFloat32 ( 1.0e-16f);
			posDeltaForceTest =
			    simd_cmpgt_v((simd_type &)deltaForce[j], tol_pos_1eNeg16);

			// negTest = negValTest & negDeltaForceTest;
			negTest = simd_and_v(negValTest, negDeltaForceTest);

			// posTest = posValTest & posDeltaForceTest;
			posTest = simd_and_v(posValTest, posDeltaForceTest);

			// test = negTest | posTest;
			test = simd_or_v(negTest, posTest);

			// num = negTest ? lowBound[j] : (posTest ? highBound[j] : force[k]);
			num =
			    simd_or_v(simd_and_v((simd_type &)lowBound[j], negTest), simd_and_v((simd_type &)highBound[j], posTest));
			num =
			    simd_or_v(simd_and_v(num, test), simd_andnot_v((simd_type &)force[k], test));

			// den = test ? deltaForce[j] : dgFloat32 (1.0f);
			den =
			    simd_or_v(simd_and_v((simd_type &)deltaForce[j], test), simd_andnot_v(one, test));

			// test = test & (activeRow[j] > dgFloat32 (0.0f));
			test = simd_and_v(test, simd_cmpgt_v((simd_type &)activeRow[j], zero));

			//NEWTON_ASSERT (dgAbsf (den) > 1.0e-16f);
			// ak = test ? (num - force[k]) / den : ak;
			akSimd =
			    simd_or_v(simd_div_v(simd_sub_v(num, (simd_type &)force[k]), den), simd_andnot_v(akSimd, test));

			// ak = (ak < dgFloat32 (1.0e-8f)) ? dgFloat32 (0.0f) : ak;
			akSimd = simd_and_v(akSimd, simd_cmpgt_v(akSimd, tol_pos_1eNeg8));

			// clampedForceIndex = test ? j : clampedForceIndex;
			minClampIndex =
			    simd_or_v(simd_and_v(min_index, test), simd_andnot_v(minClampIndex, test));
			min_index = simd_add_v(min_index, min_index_step);

			// clampedForceIndexValue = test ? num : clampedForceIndexValue;
			campedIndexValue =
			    simd_or_v(simd_and_v(num, test), simd_andnot_v(campedIndexValue, test));
		}

		akDenSimd = simd_move_hl_v(akSimd, akSimd);
		maxPassesSimd = simd_cmplt_v(akSimd, akDenSimd);
		akSimd = simd_min_v(akSimd, akDenSimd);
		minClampIndex =
		    simd_or_v(simd_and_v(minClampIndex, maxPassesSimd), simd_andnot_v(simd_move_hl_v(minClampIndex, minClampIndex), maxPassesSimd));
		campedIndexValue =
		    simd_or_v(simd_and_v(campedIndexValue, maxPassesSimd), simd_andnot_v(simd_move_hl_v(campedIndexValue, campedIndexValue), maxPassesSimd));

		akDenSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 1));
		maxPassesSimd = simd_cmplt_s(akSimd, akDenSimd);
		akSimd = simd_min_s(akSimd, akDenSimd);
		minClampIndex =
		    simd_or_v(simd_and_v(minClampIndex, maxPassesSimd), simd_andnot_v(simd_permut_v(minClampIndex, minClampIndex, PURMUT_MASK(0, 0, 0, 1)), maxPassesSimd));
		campedIndexValue =
		    simd_or_v(simd_and_v(campedIndexValue, maxPassesSimd), simd_andnot_v(simd_permut_v(campedIndexValue, campedIndexValue, PURMUT_MASK(0, 0, 0, 1)), maxPassesSimd));

		//      tmp2 = zero;
		simd_store_s(akSimd, &ak);
		clampedForceIndex = simd_store_is(minClampIndex);
		simd_store_s(campedIndexValue, &clampedForceIndexValue);
		if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) {
			NEWTON_ASSERT(clampedForceIndex != -1);

			// akNum = dgFloat32 (0.0f);
			// accNorm = dgFloat32(0.0f);
			akNumSimd = zero;
			accNormSimd = zero;
			maxPassesSimd = zero;

			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			deltaForce[clampedForceIndex] = dgFloat32(0.0f);
			force[clampedForceIndex + first] = clampedForceIndexValue;

			// for (j = 0; j < count; j ++) {
			for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
				// for (j = 0; j < roundCount; j ++) {
				//              if (((dgFloat32*)activeRow)[j]) {
				//              bool test0;
				//              bool test1;
				//              k = j + first;
				//              val = ((dgFloat32*)lowBound)[j] - force[k];
				//              if ((dgAbsf (val) < dgFloat32 (1.0e-5f)) && (accel[j] < dgFloat32 (0.0f))) {
				//              force[k] = lowBound[j];
				//              activeRow[j] = dgFloat32 (0.0f);
				//              deltaForce[j] = dgFloat32 (0.0f);
				//
				//              } else {
				//              val = highBound[j] - force[k];
				//              if ((dgAbsf (val) < dgFloat32 (1.0e-5f)) && (accel[j] > dgFloat32 (0.0f))) {
				//              force[k] = highBound[j];
				//              activeRow[j] = dgFloat32 (0.0f);
				//              deltaForce[j] = dgFloat32 (0.0f);
				//              } else {
				//              NEWTON_ASSERT (activeRow[j] > dgFloat32 (0.0f));
				//              deltaForce[j] = accel[j] * invDJMinvJt[k];
				//              akNum += accel[j] * deltaForce[j];
				//              accNorm = GetMax (dgAbsf (accel[j]), accNorm);
				//              }
				//              }
				//              }

				dgInt32 k;
				simd_type val_k;
				simd_type test_0;
				simd_type test_1;
				// simd_type test_2;
				simd_type accel_k;
				simd_type force_k;

				k = j + first;
				accel_k = (simd_type &)accel[j];
				force_k = (simd_type &)force[k];
				// val = dgAbsf (lowBound[j] - force[k]);
				val_k =
				    simd_and_v(simd_sub_v((simd_type &)lowBound[j], force_k), signMask);

				// test0 = (val < dgFloat32 (1.0e-5f)) & (accel[j] < dgFloat32 (0.0f));
				test_0 =
				    simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmplt_v(accel_k, zero));

				// val = dgAbsf (highBound[j] - force[k]);
				val_k =
				    simd_and_v(simd_sub_v((simd_type &)highBound[j], force_k), signMask);

				// test1 = (val < dgFloat32 (1.0e-5f)) & (accel[j] > dgFloat32 (0.0f));
				test_1 =
				    simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmpgt_v(accel_k, zero));

				// force[k] = test0 ? lowBound[j] : (test1 ? highBound[j] : force[k]);
				// val_k = simd_or_v (simd_and_v ((simd_type&)lowBound[j], test_0), simd_and_v ((simd_type&)highBound[j], test_1));
				//(simd_type&) force[k] = simd_or_v (simd_and_v (val_k, test_2) , simd_andnot_v (force_k, test_2));
				(simd_type &)force[k] =
				    simd_min_v((simd_type &)highBound[j], simd_max_v((simd_type &)force[k], (simd_type &)lowBound[j]));

				// activeRow[j] *= (test0 | test1) ? dgFloat32 (0.0f) : dgFloat32 (1.0f);
				// test_2 = simd_or_v (test_0, test_1);
				(simd_type &)activeRow[j] =
				    simd_mul_v((simd_type &)activeRow[j], simd_andnot_v(one, simd_or_v(test_0, test_1)));

				// deltaForce[j] = accel[j] * invDJMinvJt[k] * activeRow[j];
				(simd_type &)deltaForce[j] =
				    simd_mul_v(accel_k, simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j]));

				// akNum += accel[j] * deltaForce[j];
				akNumSimd =
				    simd_mul_add_v(akNumSimd, (simd_type &)deltaForce[j], accel_k);

				// accNorm = GetMax (dgAbsf (accel[j] * activeRow[j]), accNorm);
				accNormSimd =
				    simd_max_v(accNormSimd, simd_and_v(simd_mul_v(accel_k, (simd_type &)activeRow[j]), signMask));

				// masPases += 1;
				maxPassesSimd = simd_add_v(maxPassesSimd, (simd_type &)activeRow[j]);
			}

			NEWTON_ASSERT(activeRow[clampedForceIndex] == dgFloat32(0.0f));

			akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd));
			akNumSimd =
			    simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(akNumSimd, &akNum);

			accNormSimd =
			    simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd));
			accNormSimd =
			    simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(accNormSimd, &accNorm);

			i = -1;
			// maxPasses = GetMax (maxPasses - 1, 1);
			maxPassesSimd =
			    simd_add_v(maxPassesSimd, simd_move_hl_v(maxPassesSimd, maxPassesSimd));
			maxPasses =
			    simd_store_is(simd_add_s(maxPassesSimd, simd_permut_v(maxPassesSimd, maxPassesSimd, PURMUT_MASK(0, 0, 0, 1))));

		} else if (clampedForceIndex >= 0) {
			// akNum = dgFloat32(0.0f);
			// accNorm = dgFloat32(0.0f);

			akNumSimd = zero;
			accNormSimd = zero;
			//          tmp2 = zero;
			akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0));
			activeRow[clampedForceIndex] = dgFloat32(0.0f);

			// for (j = 0; j < count; j ++) {
			for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
				dgInt32 k;
				k = j + first;
				// m_force[k] += ak * deltaForce[j];
				// m_accel[j] -= ak * deltaAccel[j];
				(simd_type &)force[k] =
				    simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]);
				(simd_type &)accel[j] =
				    simd_mul_sub_v((simd_type &)accel[j], akSimd, (simd_type &)deltaAccel[j]);

				// accNorm = GetMax (dgAbsf (m_accel[j] * activeRow[j]), accNorm);
				accNormSimd =
				    simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask));
				//NEWTON_ASSERT (dgCheckFloat(m_force[k]));
				//NEWTON_ASSERT (dgCheckFloat(m_accel[j]));

				// deltaForce[j] = m_accel[j] * m_invDJMinvJt[k] * activeRow[j];
				(simd_type &)deltaForce[j] =
				    simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j]));

				// akNum += deltaForce[j] * m_accel[j];
				akNumSimd =
				    simd_mul_add_v(akNumSimd, (simd_type &)deltaForce[j], (simd_type &)accel[j]);
			}

			akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd));
			akNumSimd =
			    simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(akNumSimd, &akNum);

			accNormSimd =
			    simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd));
			accNormSimd =
			    simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(accNormSimd, &accNorm);

			force[clampedForceIndex + first] = clampedForceIndexValue;

			i = -1;
			maxPasses = GetMax(maxPasses - 1, 1);

		} else {
			// accNorm = dgFloat32(0.0f);
			//          tmp2 = zero;
			accNormSimd = zero;
			akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0));
			// for (j = 0; j < count; j ++) {
			for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
				dgInt32 k;
				k = j + first;
				// m_force[k] += ak * deltaForce[j];
				// m_accel[j] -= ak * deltaAccel[j];
				(simd_type &)force[k] =
				    simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]);
				(simd_type &)accel[j] =
				    simd_mul_sub_v((simd_type &)accel[j], akSimd, (simd_type &)deltaAccel[j]);

				// accNorm = GetMax (dgAbsf (m_accel[j] * activeRow[j]), accNorm);
				accNormSimd =
				    simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask));
				//NEWTON_ASSERT (dgCheckFloat(m_force[k]));
				//NEWTON_ASSERT (dgCheckFloat(m_accel[j]));
			}
			accNormSimd =
			    simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd));
			accNormSimd =
			    simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1)));
			simd_store_s(accNormSimd, &accNorm);
			if (accNorm > maxAccNorm) {

				// akDen = akNum;
				// akNum = dgFloat32(0.0f);
				akDenSimd = simd_set1(akNum);
				akNumSimd = zero;
				// for (j = 0; j < count; j ++) {
				for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
					dgInt32 k;
					k = j + first;
					// deltaAccel[j] = m_accel[j] * m_invDJMinvJt[k] * activeRow[j];
					(simd_type &)deltaAccel[j] =
					    simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j]));
					// akNum += m_accel[j] * deltaAccel[j];
					akNumSimd =
					    simd_mul_add_v(akNumSimd, (simd_type &)accel[j], (simd_type &)deltaAccel[j]);
				}

				akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd));
				akNumSimd =
				    simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1)));
				simd_store_s(akNumSimd, &akNum);

				//NEWTON_ASSERT (akDen > dgFloat32(0.0f));
				// akDen = GetMax (akDen, dgFloat32 (1.0e-17f));
				akDenSimd = simd_max_s(akDenSimd, simd_set1(dgFloat32(1.0e-17f)));
				// ak = dgFloat32 (akNum / akDen);
				akSimd = simd_div_s(akSimd, akDenSimd);
				akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0));

				// for (j = 0; j < count; j ++) {
				for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
					// deltaForce[j] = deltaAccel[j] + ak * deltaForce[j];
					(simd_type &)deltaForce[j] =
					    simd_mul_add_v((simd_type &)deltaAccel[j], akSimd, (simd_type &)deltaForce[j]);
				}
			}
		}
	}

	//  for (j = 0; j < count; j ++) {
	for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
		// forceStep[j] = force[j + first] - forceStep[j];
		(simd_type &)forceStep[j] =
		    simd_sub_v((simd_type &)force[j + first], (simd_type &)forceStep[j]);
	}

	//  tmp0 = simd_set1(DG_SSOR_FACTOR);
	//  for (j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) {
	//      tmp1 = simd_mul_add_v((simd_type&)forceStep[j], simd_sub_v ((simd_type&)force[j + first], (simd_type&)forceStep[j]), tmp0);
	//      (simd_type&)force[j + first] = simd_min_v ((simd_type&) highBound[j], simd_max_v(tmp1, (simd_type&)lowBound[j]));
	//      (simd_type&)forceStep[j] = simd_sub_v ((simd_type&)force[j + first], (simd_type&)forceStep[j]);
	//  }

	return retAccel;

#else
	return dgFloat32(0.0f);
#endif
}

dgFloat32 dgJacobianMemory::CalculateJointForces(dgInt32 joint,
        dgFloat32 *forceStep, dgFloat32 maxAccNorm) const {
	dgInt32 first;
	dgInt32 count;
	dgInt32 maxPasses;
	dgInt32 clampedForceIndex;
	dgFloat32 ak;
	dgFloat32 val;
	dgFloat32 akNum;
	dgFloat32 akDen;
	dgFloat32 accNorm;
	dgFloat32 retAccel;
	dgFloat32 clampedForceIndexValue;
	dgJacobian y0;
	dgJacobian y1;
	dgFloat32 deltaAccel[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 deltaForce[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 activeRow[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 lowBound[DG_CONSTRAINT_MAX_ROWS];
	dgFloat32 highBound[DG_CONSTRAINT_MAX_ROWS];

	dgFloat32 *const accel = m_accel;
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent;
	// dgBilateralBounds* const bilateralForceBounds = m_bilateralForceBounds;
	// const dgJacobianIndex* const jacobianIndexArray = m_jacobianIndexArray;

	first = constraintArray[joint].m_autoPairstart;
	count = constraintArray[joint].m_autoPaircount;

	akNum = dgFloat32(0.0f);
	accNorm = dgFloat32(0.0f);
	maxPasses = count;

	for (dgInt32 j = 0; j < count; j++) {
		dgInt32 i;
		dgInt32 k;
		i = first + j;
		// k = bilateralForceBounds[i].m_normalIndex;
		k = normalForceIndex[i];

		//      val = (k >= 0) ? force[k] : dgFloat32 (1.0f);
		NEWTON_ASSERT(
		    ((k < 0) && (force[k] == dgFloat32(1.0f))) || ((k >= 0) && (force[k] >= dgFloat32(0.0f))));
		val = force[k];
		lowBound[j] = val * lowerFriction[i];
		highBound[j] = val * upperFriction[i];

		activeRow[j] = dgFloat32(1.0f);
		forceStep[j] = force[i];
		if (force[i] < lowBound[j]) {
			maxPasses--;
			force[i] = lowBound[j];
			activeRow[j] = dgFloat32(0.0f);
		} else if (force[i] > highBound[j]) {
			maxPasses--;
			force[i] = highBound[j];
			activeRow[j] = dgFloat32(0.0f);
		}

		deltaForce[j] = accel[j] * invDJMinvJt[i] * activeRow[j];
		akNum += accel[j] * deltaForce[j];
		accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm);
	}

	retAccel = accNorm;
	clampedForceIndexValue = dgFloat32(0.0f);

	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));
	for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) {
		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 k;
			k = j + first;
			ak = deltaForce[j];

			y0.m_linear += Jt[k].m_jacobian_IM0.m_linear.Scale(ak);
			y0.m_angular += Jt[k].m_jacobian_IM0.m_angular.Scale(ak);
			y1.m_linear += Jt[k].m_jacobian_IM1.m_linear.Scale(ak);
			y1.m_angular += Jt[k].m_jacobian_IM1.m_angular.Scale(ak);
		}

		akDen = dgFloat32(0.0f);
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 k;
			k = j + first;

			dgVector acc(JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear));
			acc += JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular);
			acc += JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear);
			acc += JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular);
			deltaAccel[j] = acc.m_x + acc.m_y + acc.m_z + deltaForce[j] * diagDamp[k];
			akDen += deltaAccel[j] * deltaForce[j];
		}

		NEWTON_ASSERT(akDen > dgFloat32(0.0f));
		akDen = GetMax(akDen, dgFloat32(1.0e-16f));
		NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f));
		ak = akNum / akDen;

		clampedForceIndex = -1;
		for (dgInt32 j = 0; j < count; j++) {
			if (activeRow[j]) {
				dgInt32 k;
				k = j + first;
				if (deltaForce[j] < dgFloat32(-1.0e-16f)) {
					val = force[k] + ak * deltaForce[j];
					if (val < lowBound[j]) {
						ak = GetMax((lowBound[j] - force[k]) / deltaForce[j],
						            dgFloat32(0.0f));
						clampedForceIndex = j;
						clampedForceIndexValue = lowBound[j];
						if (ak < dgFloat32(1.0e-8f)) {
							ak = dgFloat32(0.0f);
							break;
						}
					}
				} else if (deltaForce[j] > dgFloat32(1.0e-16f)) {
					val = force[k] + ak * deltaForce[j];
					if (val > highBound[j]) {
						ak = GetMax((highBound[j] - force[k]) / deltaForce[j],
						            dgFloat32(0.0f));
						clampedForceIndex = j;
						clampedForceIndexValue = highBound[j];
						if (ak < dgFloat32(1.0e-8f)) {
							ak = dgFloat32(0.0f);
							break;
						}
					}
				}
			}
		}

		//      if (ak == dgFloat32 (0.0f)) {
		if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) {

			NEWTON_ASSERT(clampedForceIndex != -1);
			akNum = dgFloat32(0.0f);
			accNorm = dgFloat32(0.0f);

			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			deltaForce[clampedForceIndex] = dgFloat32(0.0f);
			force[clampedForceIndex + first] = clampedForceIndexValue;
			for (dgInt32 j = 0; j < count; j++) {
				if (activeRow[j]) {
					dgInt32 k;
					k = j + first;
					val = lowBound[j] - force[k];
					if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[j] < dgFloat32(0.0f))) {
						force[k] = lowBound[j];
						activeRow[j] = dgFloat32(0.0f);
						deltaForce[j] = dgFloat32(0.0f);

					} else {
						val = highBound[j] - force[k];
						if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[j] > dgFloat32(0.0f))) {
							force[k] = highBound[j];
							activeRow[j] = dgFloat32(0.0f);
							deltaForce[j] = dgFloat32(0.0f);
						} else {
							NEWTON_ASSERT(activeRow[j] > dgFloat32(0.0f));
							deltaForce[j] = accel[j] * invDJMinvJt[k];
							akNum += accel[j] * deltaForce[j];
							accNorm = GetMax(dgAbsf(accel[j]), accNorm);
						}
					}
				}
			}
			NEWTON_ASSERT(activeRow[clampedForceIndex] == dgFloat32(0.0f));

			i = -1;
			maxPasses = GetMax(maxPasses - 1, dgInt32(1));

		} else if (clampedForceIndex >= 0) {
			akNum = dgFloat32(0.0f);
			accNorm = dgFloat32(0.0f);
			activeRow[clampedForceIndex] = dgFloat32(0.0f);
			for (dgInt32 j = 0; j < count; j++) {
				dgInt32 k;
				k = j + first;
				force[k] += ak * deltaForce[j];
				accel[j] -= ak * deltaAccel[j];
				accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm);
				NEWTON_ASSERT(dgCheckFloat(force[k]));
				NEWTON_ASSERT(dgCheckFloat(accel[j]));

				deltaForce[j] = accel[j] * invDJMinvJt[k] * activeRow[j];
				akNum += deltaForce[j] * accel[j];
			}
			force[clampedForceIndex + first] = clampedForceIndexValue;

			i = -1;
			maxPasses = GetMax(maxPasses - 1, dgInt32(1));

		} else {
			accNorm = dgFloat32(0.0f);
			for (dgInt32 j = 0; j < count; j++) {
				dgInt32 k;
				k = j + first;
				force[k] += ak * deltaForce[j];
				accel[j] -= ak * deltaAccel[j];
				accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm);
				NEWTON_ASSERT(dgCheckFloat(force[k]));
				NEWTON_ASSERT(dgCheckFloat(accel[j]));
			}

			if (accNorm > maxAccNorm) {

				akDen = akNum;
				akNum = dgFloat32(0.0f);
				for (dgInt32 j = 0; j < count; j++) {
					dgInt32 k;
					k = j + first;
					deltaAccel[j] = accel[j] * invDJMinvJt[k] * activeRow[j];
					akNum += accel[j] * deltaAccel[j];
				}

				NEWTON_ASSERT(akDen > dgFloat32(0.0f));
				akDen = GetMax(akDen, dgFloat32(1.0e-17f));
				ak = dgFloat32(akNum / akDen);
				for (dgInt32 j = 0; j < count; j++) {
					deltaForce[j] = deltaAccel[j] + ak * deltaForce[j];
				}
			}
		}
	}

	for (dgInt32 j = 0; j < count; j++) {
		forceStep[j] = force[j + first] - forceStep[j];
	}
	return retAccel;
}

void dgJacobianMemory::CalculateForcesGameModeSimd(dgInt32 iterations,
        dgFloat32 maxAccNorm) const {
#ifdef DG_BUILD_SIMD_CODE
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgBodyInfo *bodyArray = m_bodyArray;
	dgFloat32 *const penetration = m_penetration;
	const dgFloat32 *const externAccel = m_deltaAccel;
	const dgFloat32 *const restitution = m_restitution;
	dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgJacobian *const internalVeloc = m_internalVeloc;
	dgJacobian *const internalForces = m_internalForces;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgInt32 *const accelIsMortor = m_accelIsMotor;
	const dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const penetrationStiffness = m_penetrationStiffness;
	const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent;

	dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS));
	dgFloat32 timeStep = m_timeStep * invStep;
	dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS);

	dgFloatSign tmpIndex;
	tmpIndex.m_integer.m_iVal = 0x7fffffff;
	simd_type signMask = simd_set1(tmpIndex.m_fVal);

	simd_type zero = simd_set1(dgFloat32(0.0f));
	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		dgBody *const body = m_bodyArray[i].m_body;
		(simd_type &)internalVeloc[i].m_linear = zero;
		(simd_type &)internalVeloc[i].m_angular = zero;
		(simd_type &)internalForces[i].m_linear = zero;
		(simd_type &)internalForces[i].m_angular = zero;
		(simd_type &)body->m_netForce = (simd_type &)body->m_veloc;
		(simd_type &)body->m_netTorque = (simd_type &)body->m_omega;
	}
	(simd_type &)internalVeloc[0].m_linear = zero;
	(simd_type &)internalVeloc[0].m_angular = zero;
	(simd_type &)internalForces[0].m_linear = zero;
	(simd_type &)internalForces[0].m_angular = zero;

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPairActiveCount;
		dgInt32 m0 = constraintArray[i].m_m0;
		dgInt32 m1 = constraintArray[i].m_m1;
		// dgJacobian y0 (internalForces[k0]);
		// dgJacobian y1 (internalForces[k1]);
		simd_type y0_linear = zero;
		simd_type y0_angular = zero;
		simd_type y1_linear = zero;
		simd_type y1_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			// val = force[index];
			simd_type tmp0 = simd_set1(force[index]);
			// y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val);
			// y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val);
			// y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val);
			// y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val);
			y0_linear =
			    simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0);
			y0_angular =
			    simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0);
			y1_linear =
			    simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0);
			y1_angular =
			    simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0);
		}
		// internalForces[k0] = y0;
		// internalForces[k1] = y1;
		(simd_type &)internalForces[m0].m_linear =
		    simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear);
		(simd_type &)internalForces[m0].m_angular =
		    simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular);
		(simd_type &)internalForces[m1].m_linear =
		    simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear);
		(simd_type &)internalForces[m1].m_angular =
		    simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular);
	}

	simd_type timeStepSimd = simd_set1(timeStep);
	dgFloat32 firstPassCoef = dgFloat32(0.0f);
	dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT;
	for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) {
		for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) {
			dgJointAccelerationDecriptor joindDesc;

			dgInt32 index = constraintArray[curJoint].m_autoPairstart;
			joindDesc.m_rowsCount = constraintArray[curJoint].m_autoPaircount;

			joindDesc.m_timeStep = timeStep;
			joindDesc.m_invTimeStep = invTimeStep;
			joindDesc.m_firstPassCoefFlag = firstPassCoef;

			joindDesc.m_Jt = &Jt[index];

			joindDesc.m_penetration = &penetration[index];
			joindDesc.m_restitution = &restitution[index];
			joindDesc.m_accelIsMotor = &accelIsMortor[index];
			joindDesc.m_externAccelaration = &externAccel[index];
			joindDesc.m_coordenateAccel = &coordenateAccel[index];
			joindDesc.m_normalForceIndex = &normalForceIndex[index];
			joindDesc.m_penetrationStiffness = &penetrationStiffness[index];
			constraintArray[curJoint].m_joint->JointAccelerationsSimd(joindDesc);
		}
		firstPassCoef = dgFloat32(1.0f);

		dgFloat32 accNorm;
		accNorm = maxAccNorm * dgFloat32(2.0f);
		for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm);
		        passes++) {
			simd_type accNormSimd = zero;
			for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) {
				dgInt32 index = constraintArray[curJoint].m_autoPairstart;
				dgInt32 rowsCount = constraintArray[curJoint].m_autoPaircount;
				dgInt32 m0 = constraintArray[curJoint].m_m0;
				dgInt32 m1 = constraintArray[curJoint].m_m1;

				simd_type linearM0 = (simd_type &)internalForces[m0].m_linear;
				simd_type angularM0 = (simd_type &)internalForces[m0].m_angular;
				simd_type linearM1 = (simd_type &)internalForces[m1].m_linear;
				simd_type angularM1 = (simd_type &)internalForces[m1].m_angular;
				for (dgInt32 k = 0; k < rowsCount; k++) {
					//              dgVector acc (m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0));
					//              acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct (angularM0);
					//              acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct (linearM1);
					//              acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct (angularM1);
					simd_type a =
					    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, linearM0);
					a =
					    simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, angularM0);
					a =
					    simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, linearM1);
					a =
					    simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, angularM1);

					// a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index];
					a = simd_add_v(a, simd_move_hl_v(a, a));
					a = simd_add_s(a, simd_permut_v(a, a, PURMUT_MASK(3, 3, 3, 1)));
					a =
					    simd_sub_s(simd_load_s(coordenateAccel[index]), simd_mul_add_s(a, simd_load_s(force[index]), simd_load_s(diagDamp[index])));

					// f = force[index] + invDJMinvJt[index] * a;
					simd_type f =
					    simd_mul_add_s(simd_load_s(force[index]), simd_load_s(invDJMinvJt[index]), a);

					dgInt32 frictionIndex = m_normalForceIndex[index];
					NEWTON_ASSERT(
					    ((frictionIndex < 0) && (force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (force[frictionIndex] >= dgFloat32(0.0f))));

					// frictionNormal = force[frictionIndex];
					// lowerFrictionForce = frictionNormal * lowerFrictionCoef[index];
					// upperFrictionForce = frictionNormal * upperFrictionCoef[index];
					simd_type frictionNormal = simd_load_s(force[frictionIndex]);
					simd_type lowerFrictionForce =
					    simd_mul_s(frictionNormal, simd_load_s(lowerFrictionCoef[index]));
					simd_type upperFrictionForce =
					    simd_mul_s(frictionNormal, simd_load_s(upperFrictionCoef[index]));

					// if (f > upperFrictionForce) {
					//  a = dgFloat32 (0.0f);
					//  f = upperFrictionForce;
					// } else if (f < lowerFrictionForce) {
					//  a = dgFloat32 (0.0f);
					//  f = lowerFrictionForce;
					// }
					f =
					    simd_min_s(simd_max_s(f, lowerFrictionForce), upperFrictionForce);
					a =
					    simd_andnot_v(a, simd_or_v(simd_cmplt_s(f, lowerFrictionForce), simd_cmpgt_s(f, upperFrictionForce)));
					accNormSimd = simd_max_s(accNormSimd, simd_and_v(a, signMask));

					// prevValue = f - force[index]);
					a = simd_sub_s(f, simd_load_s(force[index]));
					a = simd_permut_v(a, a, PURMUT_MASK(0, 0, 0, 0));

					// force[index] = f;
					simd_store_s(f, &force[index]);

					linearM0 =
					    simd_mul_add_v(linearM0, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, a);
					angularM0 =
					    simd_mul_add_v(angularM0, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, a);
					linearM1 =
					    simd_mul_add_v(linearM1, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, a);
					angularM1 =
					    simd_mul_add_v(angularM1, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, a);
					index++;
				}

				// internalForces[prevM0].m_linear += Jt[prevIndex].m_jacobian_IM0.m_linear.Scale (prevValue);
				// internalForces[prevM0].m_angular += Jt[prevIndex].m_jacobian_IM0.m_angular.Scale (prevValue);
				// internalForces[prevM1].m_linear += Jt[prevIndex].m_jacobian_IM1.m_linear.Scale (prevValue);
				// internalForces[prevM1].m_angular += Jt[prevIndex].m_jacobian_IM1.m_angular.Scale (prevValue);
				(simd_type &)internalForces[m0].m_linear = linearM0;
				(simd_type &)internalForces[m0].m_angular = angularM0;
				(simd_type &)internalForces[m1].m_linear = linearM1;
				(simd_type &)internalForces[m1].m_angular = angularM1;
			}
			simd_store_s(accNormSimd, &accNorm);
		}

		for (dgInt32 i = 1; i < m_bodyCount; i++) {
			dgBody *const body = bodyArray[i].m_body;
			// dgVector force (body->m_accel + internalForces[i].m_linear);
			// dgVector torque (body->m_alpha + internalForces[i].m_angular);

			simd_type force =
			    simd_add_v((simd_type &)body->m_accel, (simd_type &)internalForces[i].m_linear);
			simd_type torque =
			    simd_add_v((simd_type &)body->m_alpha, (simd_type &)internalForces[i].m_angular);

			// dgVector accel (force.Scale (body->m_invMass.m_w));
			simd_type accel = simd_mul_v(force, simd_set1(body->m_invMass.m_w));

			// dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (torque));
			simd_type alpha =
			    simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_permut_v(torque, torque, PURMUT_MASK(0, 0, 0, 0))),
			                                  (simd_type &)body->m_invWorldInertiaMatrix[1], simd_permut_v(torque, torque, PURMUT_MASK(1, 1, 1, 1))),
			                   (simd_type &)body->m_invWorldInertiaMatrix[2], simd_permut_v(torque, torque, PURMUT_MASK(2, 2, 2, 2)));

			// body->m_veloc += accel.Scale(timeStep);
			(simd_type &)body->m_veloc =
			    simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStepSimd);
			// body->m_omega += alpha.Scale(timeStep);
			(simd_type &)body->m_omega =
			    simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStepSimd);

			// body->m_netForce += body->m_veloc;
			(simd_type &)internalVeloc[i].m_linear =
			    simd_add_v((simd_type &)internalVeloc[i].m_linear, (simd_type &)body->m_veloc);
			// body->m_netTorque += body->m_omega;
			(simd_type &)internalVeloc[i].m_angular =
			    simd_add_v((simd_type &)internalVeloc[i].m_angular, (simd_type &)body->m_omega);
		}
	}

	dgInt32 hasJointFeeback = 0;
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		//      maxForce = dgFloat32 (0.0f);
		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPaircount;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			dgFloat32 val = force[index];
			NEWTON_ASSERT(dgCheckFloat(val));
			//          maxForce = GetMax (dgAbsf (val), maxForce);
			jointForceFeeback[index][0] = val;
		}
		//      if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) {
		//          m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce);
		//      }

		//      hasJointFeeback |= dgUnsigned32 (constraintArray[i].m_joint->m_updaFeedbackCallback);
		hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0);
		//      if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
		//          constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex);
		//      }
	}

//	simd_type invStepSimd;
//	signMask = simd_set1 (invStep);
#ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES
	simd_type invStepSimd = simd_set1(invStep);
#endif
	simd_type invTimeStepSimd = simd_set1(m_invTimeStep);
	simd_type accelerationTolerance = simd_set1(maxAccNorm);
	accelerationTolerance =
	    simd_mul_s(accelerationTolerance, accelerationTolerance);
	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		dgBody *const body = bodyArray[i].m_body;

#ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES
		// body->m_veloc = internalVeloc[i].m_linear.Scale(invStep);
		// body->m_omega = internalVeloc[i].m_angular.Scale(invStep);
		(simd_type &)body->m_veloc = simd_mul_v((simd_type &)internalVeloc[i].m_linear, invStepSimd);
		(simd_type &)body->m_omega = simd_mul_v((simd_type &)internalVeloc[i].m_angular, invStepSimd);
#endif

		// dgVector accel = (body->m_veloc - body->m_netForce).Scale (m_invTimeStep);
		// dgVector alpha = (body->m_omega - body->m_netTorque).Scale (m_invTimeStep);
		simd_type accel = simd_mul_v(simd_sub_v((simd_type &)body->m_veloc, (simd_type &)body->m_netForce), invTimeStepSimd);
		simd_type alpha = simd_mul_v(simd_sub_v((simd_type &)body->m_omega, (simd_type &)body->m_netTorque), invTimeStepSimd);

		// if ((accel % accel) < maxAccNorm2) {
		//  accel = zero;
		// }
		// body->m_accel = accel;
		// body->m_netForce = accel.Scale (body->m_mass[3]);
		simd_type tmp = simd_mul_v(accel, accel);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		tmp = simd_cmplt_s(tmp, accelerationTolerance);
		(simd_type &)body->m_accel =
		    simd_andnot_v(accel, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)));
		(simd_type &)body->m_netForce =
		    simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_mass[3]));

		// if ((alpha % alpha) < maxAccNorm2) {
		//  alpha = zero;
		// }
		// body->m_alpha = alpha;
		tmp = simd_mul_v(alpha, alpha);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		tmp = simd_cmplt_s(tmp, accelerationTolerance);
		(simd_type &)body->m_alpha =
		    simd_andnot_v(alpha, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)));

		// alpha = body->m_matrix.UnrotateVector(alpha);
		alpha =
		    simd_mul_v((simd_type &)body->m_matrix[0], (simd_type &)body->m_alpha);
		alpha = simd_add_v(alpha, simd_move_hl_v(alpha, alpha));
		alpha =
		    simd_add_s(alpha, simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 1)));

		tmp = simd_mul_v((simd_type &)body->m_matrix[1], (simd_type &)body->m_alpha);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		alpha = simd_pack_lo_v(alpha, tmp);

		tmp = simd_mul_v((simd_type &)body->m_matrix[2], (simd_type &)body->m_alpha);
		tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
		tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
		alpha = simd_permut_v(alpha, tmp, PURMUT_MASK(3, 0, 1, 0));

		// body->m_netTorque = body->m_matrix.RotateVector (alpha.CompProduct(body->m_mass));
		alpha = simd_mul_v(alpha, (simd_type &)body->m_mass);
		(simd_type &)body->m_netTorque =
		    simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_matrix[0], simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 0))),
		                                  (simd_type &)body->m_matrix[1], simd_permut_v(alpha, alpha, PURMUT_MASK(1, 1, 1, 1))),
		                   (simd_type &)body->m_matrix[2], simd_permut_v(alpha, alpha, PURMUT_MASK(2, 2, 2, 2)));
	}

	if (hasJointFeeback) {
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
				constraintArray[i].m_joint->m_updaFeedbackCallback(
				    *constraintArray[i].m_joint, m_timeStep, m_threadIndex);
			}
		}
	}
#endif
}

void dgJacobianMemory::CalculateForcesGameMode(dgInt32 iterations,
        dgFloat32 maxAccNorm) const {
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgBodyInfo *bodyArray = m_bodyArray;
	dgFloat32 *const penetration = m_penetration;
	const dgFloat32 *const externAccel = m_deltaAccel;
	const dgFloat32 *const restitution = m_restitution;
	dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgJacobian *const internalVeloc = m_internalVeloc;
	dgJacobian *const internalForces = m_internalForces;
	;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;
	const dgInt32 *const accelIsMortor = m_accelIsMotor;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const penetrationStiffness = m_penetrationStiffness;
	const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent;
	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));

	//  steps = 4;
	dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS));
	dgFloat32 timeStep = m_timeStep * invStep;
	dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS);

	NEWTON_ASSERT(m_bodyArray[0].m_body == m_world->m_sentionelBody);
	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		dgBody *const body = m_bodyArray[i].m_body;

		body->m_netForce = body->m_veloc;
		body->m_netTorque = body->m_omega;
		internalVeloc[i].m_linear = zero;
		internalVeloc[i].m_angular = zero;
		internalForces[i].m_linear = zero;
		internalForces[i].m_angular = zero;
	}

	internalVeloc[0].m_linear = zero;
	internalVeloc[0].m_angular = zero;
	internalForces[0].m_linear = zero;
	internalForces[0].m_angular = zero;

	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgJacobian y0;
		dgJacobian y1;

		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPaircount;

		dgInt32 m0 = constraintArray[i].m_m0;
		dgInt32 m1 = constraintArray[i].m_m1;

		y0.m_linear = zero;
		y0.m_angular = zero;
		y1.m_linear = zero;
		y1.m_angular = zero;
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			dgFloat32 val = force[index];
			NEWTON_ASSERT(dgCheckFloat(val));
			y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val);
			y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val);
			y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val);
			y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val);
		}
		internalForces[m0].m_linear += y0.m_linear;
		internalForces[m0].m_angular += y0.m_angular;
		internalForces[m1].m_linear += y1.m_linear;
		internalForces[m1].m_angular += y1.m_angular;
	}

	//  if (sleepCount) {
	//      for (dgInt32 i = 1; i < m_bodyCount; i ++) {
	//          dgBody* body;
	//          body = m_bodyArray[i].m_body;
	//          if (body->m_equilibrium)
	//              body = m_bodyArray[i].m_body;
	//      }
	//  }

	dgFloat32 firstPassCoef = dgFloat32(0.0f);
	dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT;
	for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) {
		for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) {
			dgJointAccelerationDecriptor joindDesc;

			dgInt32 index = constraintArray[curJoint].m_autoPairstart;
			joindDesc.m_rowsCount = constraintArray[curJoint].m_autoPaircount;

			joindDesc.m_timeStep = timeStep;
			joindDesc.m_invTimeStep = invTimeStep;
			joindDesc.m_firstPassCoefFlag = firstPassCoef;

			joindDesc.m_Jt = &Jt[index];
			joindDesc.m_penetration = &penetration[index];
			joindDesc.m_restitution = &restitution[index];
			joindDesc.m_externAccelaration = &externAccel[index];
			joindDesc.m_coordenateAccel = &coordenateAccel[index];
			joindDesc.m_accelIsMotor = &accelIsMortor[index];
			joindDesc.m_normalForceIndex = &normalForceIndex[index];
			joindDesc.m_penetrationStiffness = &penetrationStiffness[index];
			constraintArray[curJoint].m_joint->JointAccelerations(joindDesc);
		}
		firstPassCoef = dgFloat32(1.0f);

		dgFloat32 accNorm = maxAccNorm * dgFloat32(2.0f);
		for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm);
		        passes++) {
			accNorm = dgFloat32(0.0f);
			for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) {
				dgInt32 index = constraintArray[curJoint].m_autoPairstart;
				dgInt32 rowsCount = constraintArray[curJoint].m_autoPaircount;
				dgInt32 m0 = constraintArray[curJoint].m_m0;
				dgInt32 m1 = constraintArray[curJoint].m_m1;

				dgVector linearM0(internalForces[m0].m_linear);
				dgVector angularM0(internalForces[m0].m_angular);
				dgVector linearM1(internalForces[m1].m_linear);
				dgVector angularM1(internalForces[m1].m_angular);
				for (dgInt32 k = 0; k < rowsCount; k++) {
					dgVector acc(
					    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0));
					acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(angularM0);
					acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(linearM1);
					acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(angularM1);

					dgFloat32 a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index];
					dgFloat32 f = force[index] + invDJMinvJt[index] * a;

					dgInt32 frictionIndex = normalForceIndex[index];
					NEWTON_ASSERT(
					    ((frictionIndex < 0) && (force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (force[frictionIndex] >= dgFloat32(0.0f))));

					dgFloat32 frictionNormal = force[frictionIndex];
					dgFloat32 lowerFrictionForce = frictionNormal * lowerFrictionCoef[index];
					dgFloat32 upperFrictionForce = frictionNormal * upperFrictionCoef[index];

					if (f > upperFrictionForce) {
						a = dgFloat32(0.0f);
						f = upperFrictionForce;
					} else if (f < lowerFrictionForce) {
						a = dgFloat32(0.0f);
						f = lowerFrictionForce;
					}

					accNorm = GetMax(accNorm, dgAbsf(a));
					dgFloat32 prevValue = f - force[index];
					force[index] = f;

					linearM0 += Jt[index].m_jacobian_IM0.m_linear.Scale(prevValue);
					angularM0 += Jt[index].m_jacobian_IM0.m_angular.Scale(prevValue);
					linearM1 += Jt[index].m_jacobian_IM1.m_linear.Scale(prevValue);
					angularM1 += Jt[index].m_jacobian_IM1.m_angular.Scale(prevValue);
					index++;
				}

				internalForces[m0].m_linear = linearM0;
				internalForces[m0].m_angular = angularM0;
				internalForces[m1].m_linear = linearM1;
				internalForces[m1].m_angular = angularM1;
			}
			// accNorm  = 1.0f;
		}

		for (dgInt32 i = 1; i < m_bodyCount; i++) {
			dgBody *const body = bodyArray[i].m_body;
			dgVector forceTmp(body->m_accel + internalForces[i].m_linear);
			dgVector torque(body->m_alpha + internalForces[i].m_angular);

			dgVector accel(forceTmp.Scale(body->m_invMass.m_w));
			dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(torque));
			body->m_veloc += accel.Scale(timeStep);
			body->m_omega += alpha.Scale(timeStep);
			internalVeloc[i].m_linear += body->m_veloc;
			internalVeloc[i].m_angular += body->m_omega;
		}
	}

	dgInt32 hasJointFeeback = 0;
	for (dgInt32 i = 0; i < m_jointCount; i++) {
		dgInt32 first = constraintArray[i].m_autoPairstart;
		dgInt32 count = constraintArray[i].m_autoPaircount;

		//      maxForce = dgFloat32 (0.0f);
		for (dgInt32 j = 0; j < count; j++) {
			dgInt32 index = j + first;
			dgFloat32 val = force[index];
			//          maxForce = GetMax (dgAbsf (val), maxForce);
			NEWTON_ASSERT(dgCheckFloat(val));
			jointForceFeeback[index][0] = val;
		}
		//      if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) {
		//          m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce);
		//      }

		hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0);
		//      if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
		//          constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex);
		//      }
	}

	dgFloat32 maxAccNorm2 = maxAccNorm * maxAccNorm;
	for (dgInt32 i = 1; i < m_bodyCount; i++) {
		dgBody *const body = bodyArray[i].m_body;

#ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES
		body->m_veloc = internalVeloc[i].m_linear.Scale(invStep);
		body->m_omega = internalVeloc[i].m_angular.Scale(invStep);
#endif

		dgVector accel = (body->m_veloc - body->m_netForce).Scale(m_invTimeStep);
		dgVector alpha = (body->m_omega - body->m_netTorque).Scale(m_invTimeStep);
		if ((accel % accel) < maxAccNorm2) {
			accel = zero;
		}

		if ((alpha % alpha) < maxAccNorm2) {
			alpha = zero;
		}

		body->m_accel = accel;
		body->m_alpha = alpha;
		body->m_netForce = accel.Scale(body->m_mass[3]);

		alpha = body->m_matrix.UnrotateVector(alpha);
		body->m_netTorque = body->m_matrix.RotateVector(
		                        alpha.CompProduct(body->m_mass));
	}

	if (hasJointFeeback) {
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
				constraintArray[i].m_joint->m_updaFeedbackCallback(
					reinterpret_cast<const NewtonJoint *>(constraintArray[i].m_joint), m_timeStep, m_threadIndex);
			}
		}
	}
}

////////////////////////////////////////////////////////////////////////////////////////
//
//  Parallel solver
//
////////////////////////////////////////////////////////////////////////////////////////

void dgParallelSolverBodyInertia::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	if (m_useSimd) {
		for (dgInt32 j = 0; j < m_count; j++) {
			dgBody *body;
			body = m_bodyArray[j].m_body;
			NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f));
			body->AddDamingAcceleration();
			body->CalcInvInertiaMatrixSimd();
		}
	} else {
		for (dgInt32 j = 0; j < m_count; j++) {
			dgBody *body;
			body = m_bodyArray[j].m_body;
			NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f));
			body->AddDamingAcceleration();
			body->CalcInvInertiaMatrix();
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverBuildJacobianRows::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	dgJacobianPair *const JMinv = m_JMinv;
	dgFloat32 *const diagDamp = m_diagDamp;
	dgFloat32 *const extAccel = m_deltaAccel;
	dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;
	//  dgInt32* const accelIsMotor = solverMemory.m_accelIsMotor;

	dgBodyInfo *const bodyArray = m_bodyArray;
	dgJointInfo *const constraintArray = m_constraintArray;

	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		simd_type zero;
		zero = simd_set1(dgFloat32(0.0f));
		for (dgInt32 k = 0; k < m_count; k++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			dgInt32 count;
			dgFloat32 diag;
			dgFloat32 stiffness;
			simd_type tmp0;
			simd_type tmp1;
			simd_type invMass0;
			simd_type invMass1;
			simd_type tmpDiag;
			simd_type tmpAccel;
			dgBody *body0;
			dgBody *body1;

			index = constraintArray[k].m_autoPairstart;
			count = constraintArray[k].m_autoPaircount;
			m0 = constraintArray[k].m_m0;
			m1 = constraintArray[k].m_m1;
			NEWTON_ASSERT(m0 >= 0);
			NEWTON_ASSERT(m0 < m_bodyCount);
			NEWTON_ASSERT(m1 >= 0);
			NEWTON_ASSERT(m1 < m_bodyCount);

			body0 = bodyArray[m0].m_body;
			// invMass0 = body0->m_invMass[3];
			invMass0 = simd_set1(body0->m_invMass[3]);
			NEWTON_ASSERT((dgUnsigned64(&body0->m_invWorldInertiaMatrix) & 0x0f) == 0);
			const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix;

			body1 = bodyArray[m1].m_body;
			// invMass1 = body1->m_invMass[3];
			invMass1 = simd_set1(body1->m_invMass[3]);
			NEWTON_ASSERT((dgUnsigned64(&body1->m_invWorldInertiaMatrix) & 0x0f) == 0);
			const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix;
			for (dgInt32 i = 0; i < count; i++) {
				// JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale (invMass0);
				// JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector (Jt[index].m_jacobian_IM0.m_angular);
				// dgVector tmpDiag (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(Jt[index].m_jacobian_IM0.m_linear));
				// tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(Jt[index].m_jacobian_IM0.m_angular);
				// dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel));
				// tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(body0->m_alpha);

				((simd_type &)JMinv[index].m_jacobian_IM0.m_linear) =
				    simd_mul_v((simd_type &)Jt[index].m_jacobian_IM0.m_linear, invMass0);
				tmp0 = (simd_type &)Jt[index].m_jacobian_IM0.m_angular;
				tmp1 =
				    simd_mul_v((simd_type &)invInertia0.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0)));
				tmp1 =
				    simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1)));
				((simd_type &)JMinv[index].m_jacobian_IM0.m_angular) =
				    simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2)));
				tmpDiag =
				    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear);
				tmpDiag =
				    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular);
				tmpAccel =
				    simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)body0->m_accel);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)body0->m_alpha);

				// JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale (invMass1);
				// JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector (Jt[index].m_jacobian_IM1.m_angular);
				// tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(Jt[index].m_jacobian_IM1.m_linear);
				// tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(Jt[index].m_jacobian_IM1.m_angular);
				// tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(body1->m_accel);
				// tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(body1->m_alpha);

				((simd_type &)JMinv[index].m_jacobian_IM1.m_linear) =
				    simd_mul_v((simd_type &)Jt[index].m_jacobian_IM1.m_linear, invMass1);
				tmp0 = (simd_type &)Jt[index].m_jacobian_IM1.m_angular;
				tmp1 =
				    simd_mul_v((simd_type &)invInertia1.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0)));
				tmp1 =
				    simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1)));
				((simd_type &)JMinv[index].m_jacobian_IM1.m_angular) =
				    simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2)));
				tmpDiag =
				    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear);
				tmpDiag =
				    simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)body1->m_accel);
				tmpAccel =
				    simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)body1->m_alpha);

				// coordenateAccel[index] -= (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z);
				tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel));
				tmpAccel =
				    simd_sub_s(zero, simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1))));

				simd_store_s(tmpAccel, &extAccel[index]);
				simd_store_s(simd_add_s(simd_load_s(coordenateAccel[index]), tmpAccel),
				             &coordenateAccel[index]);

				// force[index] = bilateralForceBounds[index].m_jointForce[0];
				force[index] = jointForceFeeback[index][0];

				NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f));
				NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f));
				stiffness = DG_PSD_DAMP_TOL * diagDamp[index];

				// diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z);
				tmpDiag = simd_add_v(tmpDiag, simd_move_hl_v(tmpDiag, tmpDiag));
				simd_store_s(
				    simd_add_s(tmpDiag, simd_permut_v(tmpDiag, tmpDiag, PURMUT_MASK(3, 3, 3, 1))),
				    &diag);
				NEWTON_ASSERT(diag > dgFloat32(0.0f));
				diagDamp[index] = diag * stiffness;

				diag *= (dgFloat32(1.0f) + stiffness);
				invDJMinvJt[index] = dgFloat32(1.0f) / diag;

				index++;
			}
		}
#endif
	} else {
		for (dgInt32 k = 0; k < m_count; k++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			dgInt32 count;
			dgFloat32 invMass0;
			dgFloat32 invMass1;
			dgBody *body0;
			dgBody *body1;

			index = constraintArray[k].m_autoPairstart;
			count = constraintArray[k].m_autoPaircount;
			m0 = constraintArray[k].m_m0;
			m1 = constraintArray[k].m_m1;

			NEWTON_ASSERT(m0 >= 0);
			NEWTON_ASSERT(m0 < m_bodyCount);
			body0 = bodyArray[m0].m_body;
			invMass0 = body0->m_invMass[3];
			const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix;

			NEWTON_ASSERT(m1 >= 0);
			NEWTON_ASSERT(m1 < m_bodyCount);
			body1 = bodyArray[m1].m_body;
			invMass1 = body1->m_invMass[3];
			const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix;

			for (dgInt32 i = 0; i < count; i++) {
				dgFloat32 diag;
				dgFloat32 stiffness;
				dgFloat32 extenalAcceleration;

				JMinv[index].m_jacobian_IM0.m_linear =
				    Jt[index].m_jacobian_IM0.m_linear.Scale(invMass0);
				JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector(
				        Jt[index].m_jacobian_IM0.m_angular);
				dgVector tmpDiag(
				    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(
				        Jt[index].m_jacobian_IM0.m_linear));
				tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(
				               Jt[index].m_jacobian_IM0.m_angular);
				dgVector tmpAccel(
				    JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel));
				tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(
				                body0->m_alpha);

				JMinv[index].m_jacobian_IM1.m_linear =
				    Jt[index].m_jacobian_IM1.m_linear.Scale(invMass1);
				JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector(
				        Jt[index].m_jacobian_IM1.m_angular);
				tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(
				               Jt[index].m_jacobian_IM1.m_linear);
				tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(
				               Jt[index].m_jacobian_IM1.m_angular);
				tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(
				                body1->m_accel);
				tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(
				                body1->m_alpha);

				extenalAcceleration = -(tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z);
				extAccel[index] = extenalAcceleration;
				coordenateAccel[index] += extenalAcceleration;
				force[index] = jointForceFeeback[index][0];

				NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f));
				NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f));
				stiffness = DG_PSD_DAMP_TOL * diagDamp[index];

				diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z);
				NEWTON_ASSERT(diag > dgFloat32(0.0f));
				diagDamp[index] = diag * stiffness;

				diag *= (dgFloat32(1.0f) + stiffness);
				// solverMemory.m_diagJMinvJt[index] = diag;
				invDJMinvJt[index] = dgFloat32(1.0f) / diag;
				index++;
			}
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverBuildJacobianMatrix::ThreadExecute() {
	//  dgInt32 count;
	//  dgInt32 rowCount;
	//  dgInt32 jointSolved;
	dgContraintDescritor constraintParams;
	dgJointInfo *const constraintArray = m_constraintArray;

	//  ticks = m_world->m_getPerformanceCount();

	constraintParams.m_world = m_world;
	constraintParams.m_threadIndex = m_threadIndex;
	constraintParams.m_timestep = m_timestep;
	constraintParams.m_invTimestep = m_invTimestep;

	dgJacobianMemory &solverMemory = *m_solverMemory;

	dgInt32 count = m_count;
	dgInt32 jointSolved = m_jointSolved;
	for (dgInt32 j = 0; j < count; j++) {
		dgConstraint *const constraint = constraintArray[j].m_joint;

		if (constraint->m_isUnilateral ^ m_bitMode) {

			dgInt32 dof = dgInt32(constraint->m_maxDOF);
			jointSolved++;

			NEWTON_ASSERT(dof <= DG_CONSTRAINT_MAX_ROWS);
			for (dgInt32 i = 0; i < dof; i++) {
				constraintParams.m_forceBounds[i].m_low = DG_MIN_BOUND;
				constraintParams.m_forceBounds[i].m_upper = DG_MAX_BOUND;
				constraintParams.m_forceBounds[i].m_jointForce = NULL;
				constraintParams.m_forceBounds[i].m_normalIndex =
				    DG_BILATERAL_CONSTRAINT;
			}

			NEWTON_ASSERT(constraint->m_body0);
			NEWTON_ASSERT(constraint->m_body1);

			constraint->m_body0->m_inCallback = true;
			constraint->m_body1->m_inCallback = true;

			dof = dgInt32(constraint->JacobianDerivative(constraintParams));

			constraint->m_body0->m_inCallback = false;
			constraint->m_body1->m_inCallback = false;

			dgInt32 m0 =
			    (constraint->m_body0->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body0->m_index : 0;
			dgInt32 m1 =
			    (constraint->m_body1->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body1->m_index : 0;

			m_world->dgGetUserLock();
			dgInt32 rowCount = m_rowsCount[0];
			m_rowsCount[0] = rowCount + ((dof & (DG_SIMD_WORD_SIZE - 1)) ? ((dof & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : dof);
			m_world->dgReleasedUserLock();

			constraintArray[j].m_autoPairstart = rowCount;
			constraintArray[j].m_autoPaircount = dgInt32(dof);
			constraintArray[j].m_autoPairActiveCount = dgInt32(dof);
			constraintArray[j].m_m0 = m0;
			constraintArray[j].m_m1 = m1;

			for (dgInt32 i = 0; i < dof; i++) {
				NEWTON_ASSERT(constraintParams.m_forceBounds[i].m_jointForce);
				solverMemory.m_Jt[rowCount] = constraintParams.m_jacobian[i];

				NEWTON_ASSERT(constraintParams.m_jointStiffness[i] >= dgFloat32(0.1f));
				NEWTON_ASSERT(constraintParams.m_jointStiffness[i] <= dgFloat32(100.0f));
				solverMemory.m_diagDamp[rowCount] =
				    constraintParams.m_jointStiffness[i];
				solverMemory.m_coordenateAccel[rowCount] =
				    constraintParams.m_jointAccel[i];
				solverMemory.m_accelIsMotor[rowCount] = dgInt32(
				        constraintParams.m_isMotor[i]);
				solverMemory.m_restitution[rowCount] =
				    constraintParams.m_restitution[i];
				solverMemory.m_penetration[rowCount] =
				    constraintParams.m_penetration[i];
				solverMemory.m_penetrationStiffness[rowCount] =
				    constraintParams.m_penetrationStiffness[i];

				solverMemory.m_lowerBoundFrictionCoefficent[rowCount] =
				    constraintParams.m_forceBounds[i].m_low;
				solverMemory.m_upperBoundFrictionCoefficent[rowCount] =
				    constraintParams.m_forceBounds[i].m_upper;
				solverMemory.m_jointFeebackForce[rowCount] =
				    constraintParams.m_forceBounds[i].m_jointForce;
				solverMemory.m_normalForceIndex[rowCount] =
				    constraintParams.m_forceBounds[i].m_normalIndex + ((constraintParams.m_forceBounds[i].m_normalIndex >= 0) ? (rowCount - i) : 0);

				rowCount++;
			}

#ifdef _DEBUG
			for (dgInt32 i = 0; i < ((rowCount + 3) & 0xfffc) - rowCount; i++) {
				solverMemory.m_diagDamp[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_coordenateAccel[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_restitution[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_penetration[rowCount + i] = dgFloat32(0.0f);
				solverMemory.m_penetrationStiffness[rowCount + i] = dgFloat32(0.0f);

				solverMemory.m_lowerBoundFrictionCoefficent[rowCount + i] = dgFloat32(
				            0.0f);
				solverMemory.m_upperBoundFrictionCoefficent[rowCount + i] = dgFloat32(
				            0.0f);
				solverMemory.m_jointFeebackForce[rowCount + i] = 0;
				solverMemory.m_normalForceIndex[rowCount + i] = 0;
			}
#endif

			//          rowCount = (rowCount & (DG_SIMD_WORD_SIZE - 1)) ? ((rowCount & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : rowCount;
			//          NEWTON_ASSERT ((rowCount & (DG_SIMD_WORD_SIZE - 1)) == 0);
		}
	}

	m_jointSolved = jointSolved;

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

dgInt32 dgWorldDynamicUpdate::GetJacobialDerivativesParallel(
    const dgIsland &island, bool bitMode, dgInt32 rowCount, dgFloat32 timestep) {
	dgInt32 chunkSizes[DG_MAXIMUN_THREADS];
	dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart];

	dgInt32 acc = 0;
	dgInt32 threadCounts = dgInt32(m_world->m_numberOfTheads);
	dgInt32 jointCount = island.m_jointCount;
	m_world->m_threadsManager.CalculateChunkSizes(jointCount, chunkSizes);
	for (dgInt32 threadIndex = 0; threadIndex < threadCounts; threadIndex++) {
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_bitMode = bitMode;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_world = m_world;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_jointSolved = 0;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_count =
		    chunkSizes[threadIndex];
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_timestep = timestep;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_invTimestep = dgFloat32(
		            1.0f / timestep);
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_threadIndex =
		    threadIndex;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_rowsCount = &rowCount;
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_constraintArray =
		    &constraintArray[acc];
		m_parallelSolverBuildJacobianMatrix[threadIndex].m_solverMemory =
		    &m_solverMemory[0];
		m_world->m_threadsManager.SubmitJob(
		    &m_parallelSolverBuildJacobianMatrix[threadIndex]);
		acc += chunkSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	return rowCount;
}

void dgWorldDynamicUpdate::BuildJacobianMatrixParallel(const dgIsland &island,
        dgFloat32 timestep, dgInt32 archModel) {

	NEWTON_ASSERT(island.m_bodyCount >= 2);
	//NEWTON_ASSERT (island.m_jointCount >= 1);

	dgInt32 threads = dgInt32(m_world->m_numberOfTheads);
	dgJacobianMemory &solverMemory = m_solverMemory[0];

	dgInt32 bodyCount = island.m_bodyCount;
	dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart];

	bodyArray[0].m_body = m_world->GetSentinelBody();
	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f));
	NEWTON_ASSERT(
	    (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f));

	dgInt32 acc = 0;
	dgInt32 chunkSizes[DG_MAXIMUN_THREADS];
	m_world->m_threadsManager.CalculateChunkSizes(bodyCount - 1, chunkSizes);
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		m_parallelBodyInertiaMatrix[threadIndex].m_useSimd = archModel;
		m_parallelBodyInertiaMatrix[threadIndex].m_bodyArray = &bodyArray[acc + 1];
		m_parallelBodyInertiaMatrix[threadIndex].m_threadIndex = threadIndex;
		m_parallelBodyInertiaMatrix[threadIndex].m_world = m_world;
		m_parallelBodyInertiaMatrix[threadIndex].m_count = chunkSizes[threadIndex];
		m_world->m_threadsManager.SubmitJob(
		    &m_parallelBodyInertiaMatrix[threadIndex]);
		acc += chunkSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	while (bodyCount > solverMemory.m_maxBodiesCount) {
		m_world->dgGetUserLock();
		ReallocIntenalForcesMemory(bodyCount, 0);
		m_world->dgReleasedUserLock();
	}

	dgInt32 jointCount = island.m_jointCount;
	dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart];
	solverMemory.m_constraintArray = constraintArray;

	dgInt32 maxRowCount = 0;
	for (dgInt32 j = 0; j < jointCount; j++) {
		dgConstraint *const constraint = constraintArray[j].m_joint;
		;
		dgInt32 dof =
		    dgInt32(
		        (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF);
		maxRowCount += dof;
	}

	while (maxRowCount > solverMemory.m_maxJacobiansCount) {
		m_world->dgGetUserLock();
		ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, 0);
		m_world->dgReleasedUserLock();
	}

	dgInt32 rowCount = 0;
	if (island.m_hasUnilateralJoints) {
		rowCount = GetJacobialDerivativesParallel(island, false, rowCount,
		           timestep);
	}
	rowCount = GetJacobialDerivativesParallel(island, true, rowCount, timestep);

	solverMemory.m_rowCount = rowCount;
	solverMemory.m_bodyCount = bodyCount;
	solverMemory.m_bodyArray = bodyArray;
	solverMemory.m_jointCount = jointCount;
	solverMemory.m_timeStep = timestep;
	solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep;

	acc = 0;
	m_world->m_threadsManager.CalculateChunkSizes(jointCount, chunkSizes);
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		m_parallelSolverBuildJacobianRows[threadIndex].m_useSimd = archModel;
		m_parallelSolverBuildJacobianRows[threadIndex].m_bodyCount = bodyCount;
		m_parallelSolverBuildJacobianRows[threadIndex].m_bodyArray = bodyArray;
		m_parallelSolverBuildJacobianRows[threadIndex].m_threadIndex = threadIndex;
		m_parallelSolverBuildJacobianRows[threadIndex].m_world = m_world;
		m_parallelSolverBuildJacobianRows[threadIndex].m_count =
		    chunkSizes[threadIndex];
		m_parallelSolverBuildJacobianRows[threadIndex].m_force =
		    solverMemory.m_force;
		m_parallelSolverBuildJacobianRows[threadIndex].m_Jt = solverMemory.m_Jt;
		m_parallelSolverBuildJacobianRows[threadIndex].m_JMinv =
		    solverMemory.m_JMinv;
		m_parallelSolverBuildJacobianRows[threadIndex].m_diagDamp =
		    solverMemory.m_diagDamp;
		m_parallelSolverBuildJacobianRows[threadIndex].m_deltaAccel =
		    solverMemory.m_deltaAccel;
		m_parallelSolverBuildJacobianRows[threadIndex].m_invDJMinvJt =
		    solverMemory.m_invDJMinvJt;
		m_parallelSolverBuildJacobianRows[threadIndex].m_coordenateAccel =
		    solverMemory.m_coordenateAccel;
		m_parallelSolverBuildJacobianRows[threadIndex].m_jointFeebackForce =
		    solverMemory.m_jointFeebackForce;
		m_parallelSolverBuildJacobianRows[threadIndex].m_constraintArray =
		    &constraintArray[acc];
		m_world->m_threadsManager.SubmitJob(
		    &m_parallelSolverBuildJacobianRows[threadIndex]);
		acc += chunkSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();
}

void dgParallelSolverClear::ThreadExecute() {
	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		simd_type zero;
		zero = simd_set1(dgFloat32(0.0f));
		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			body = m_bodyArray[i].m_body;

			((simd_type &)body->m_netForce) = zero;
			((simd_type &)body->m_netTorque) = zero;
			((simd_type &)m_internalVeloc[i].m_linear) = zero;
			((simd_type &)m_internalVeloc[i].m_angular) = zero;
			((simd_type &)m_internalForces[i].m_linear) = zero;
			((simd_type &)m_internalForces[i].m_angular) = zero;
		}
#endif
	} else {
		dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
		              dgFloat32(0.0f));
		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			body = m_bodyArray[i].m_body;
			body->m_netForce = body->m_veloc;
			body->m_netTorque = body->m_omega;
			m_internalVeloc[i].m_linear = zero;
			m_internalVeloc[i].m_angular = zero;
			m_internalForces[i].m_linear = zero;
			m_internalForces[i].m_angular = zero;
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverInitInternalForces::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		simd_type zero;
		zero = simd_set1(dgFloat32(0.0f));

		for (dgInt32 i = 0; i < m_count; i++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 first;
			dgInt32 count;

			simd_type y0_linear;
			simd_type y0_angular;
			simd_type y1_linear;
			simd_type y1_angular;

			first = m_constraintArray[i].m_autoPairstart;
			count = m_constraintArray[i].m_autoPairActiveCount;
			m0 = m_constraintArray[i].m_m0;
			m1 = m_constraintArray[i].m_m1;
			// dgJacobian y0 (internalForces[k0]);
			// dgJacobian y1 (internalForces[k1]);
			y0_linear = zero;
			y0_angular = zero;
			y1_linear = zero;
			y1_angular = zero;
			for (dgInt32 j = 0; j < count; j++) {
				dgInt32 index;
				simd_type tmp0;
				index = j + first;
				// val = force[index];
				tmp0 = simd_set1(m_force[index]);
				// y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val);
				// y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val);
				// y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val);
				// y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val);
				y0_linear =
				    simd_mul_add_v(y0_linear, (simd_type &)m_Jt[index].m_jacobian_IM0.m_linear, tmp0);
				y0_angular =
				    simd_mul_add_v(y0_angular, (simd_type &)m_Jt[index].m_jacobian_IM0.m_angular, tmp0);
				y1_linear =
				    simd_mul_add_v(y1_linear, (simd_type &)m_Jt[index].m_jacobian_IM1.m_linear, tmp0);
				y1_angular =
				    simd_mul_add_v(y1_angular, (simd_type &)m_Jt[index].m_jacobian_IM1.m_angular, tmp0);
			}
			// internalForces[k0] = y0;
			// internalForces[k1] = y1;
			//          m_world->dgGetUserLock();
			m_world->dgGetIndirectLock(&m_locks[m0]);
			(simd_type &)m_internalForces[m0].m_linear =
			    simd_add_v((simd_type &)m_internalForces[m0].m_linear, y0_linear);
			(simd_type &)m_internalForces[m0].m_angular =
			    simd_add_v((simd_type &)m_internalForces[m0].m_angular, y0_angular);
			m_world->dgReleaseIndirectLock(&m_locks[m0]);

			m_world->dgGetIndirectLock(&m_locks[m1]);
			(simd_type &)m_internalForces[m1].m_linear =
			    simd_add_v((simd_type &)m_internalForces[m1].m_linear, y1_linear);
			(simd_type &)m_internalForces[m1].m_angular =
			    simd_add_v((simd_type &)m_internalForces[m1].m_angular, y1_angular);
			m_world->dgReleaseIndirectLock(&m_locks[m1]);
			//          m_world->dgReleasedUserLock();
		}
#endif
	} else {
		dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
		              dgFloat32(0.0f));
		for (dgInt32 i = 0; i < m_count; i++) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 first;
			dgInt32 count;

			dgFloat32 val;
			dgJacobian y0;
			dgJacobian y1;
			first = m_constraintArray[i].m_autoPairstart;
			count = m_constraintArray[i].m_autoPaircount;

			m0 = m_constraintArray[i].m_m0;
			m1 = m_constraintArray[i].m_m1;

			y0.m_linear = zero;
			y0.m_angular = zero;
			y1.m_linear = zero;
			y1.m_angular = zero;
			for (dgInt32 j = 0; j < count; j++) {
				dgInt32 index;

				index = j + first;
				val = m_force[index];
				NEWTON_ASSERT(dgCheckFloat(val));
				y0.m_linear += m_Jt[index].m_jacobian_IM0.m_linear.Scale(val);
				y0.m_angular += m_Jt[index].m_jacobian_IM0.m_angular.Scale(val);
				y1.m_linear += m_Jt[index].m_jacobian_IM1.m_linear.Scale(val);
				y1.m_angular += m_Jt[index].m_jacobian_IM1.m_angular.Scale(val);
			}

			m_world->dgGetIndirectLock(&m_locks[m0]);
			m_internalForces[m0].m_linear += y0.m_linear;
			m_internalForces[m0].m_angular += y0.m_angular;
			m_world->dgReleaseIndirectLock(&m_locks[m0]);

			m_world->dgGetIndirectLock(&m_locks[m1]);
			m_internalForces[m1].m_linear += y1.m_linear;
			m_internalForces[m1].m_angular += y1.m_angular;
			m_world->dgReleaseIndirectLock(&m_locks[m1]);
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverJointAcceleration::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	if (m_useSimd) {
		dgInt32 currJoint;

		currJoint = m_jointStart;
		for (dgInt32 i = 0; i < m_count; i++) {
			dgInt32 index;

			dgJointAccelerationDecriptor joindDesc;
			index = m_constraintArray[currJoint].m_autoPairstart;
			joindDesc.m_rowsCount = m_constraintArray[currJoint].m_autoPaircount;

			joindDesc.m_timeStep = m_timeStep;
			joindDesc.m_invTimeStep = m_invTimeStep;
			joindDesc.m_firstPassCoefFlag = m_firstPassCoef;

			joindDesc.m_Jt = &m_Jt[index];
			joindDesc.m_penetration = &m_penetration[index];
			joindDesc.m_restitution = &m_restitution[index];
			joindDesc.m_externAccelaration = &m_externAccel[index];
			joindDesc.m_coordenateAccel = &m_coordenateAccel[index];
			joindDesc.m_accelIsMotor = &m_accelIsMortor[index];
			joindDesc.m_normalForceIndex = &m_normalForceIndex[index];
			joindDesc.m_penetrationStiffness = &m_penetrationStiffness[index];
			m_constraintArray[currJoint].m_joint->JointAccelerationsSimd(joindDesc);

			currJoint++;
		}
	} else {
		dgInt32 currJoint;

		currJoint = m_jointStart;
		for (dgInt32 i = 0; i < m_count; i++) {
			dgInt32 index;
			dgJointAccelerationDecriptor joindDesc;

			index = m_constraintArray[currJoint].m_autoPairstart;
			joindDesc.m_rowsCount = m_constraintArray[currJoint].m_autoPaircount;

			joindDesc.m_timeStep = m_timeStep;
			joindDesc.m_invTimeStep = m_invTimeStep;
			joindDesc.m_firstPassCoefFlag = m_firstPassCoef;

			joindDesc.m_Jt = &m_Jt[index];
			joindDesc.m_penetration = &m_penetration[index];
			joindDesc.m_restitution = &m_restitution[index];
			joindDesc.m_externAccelaration = &m_externAccel[index];
			joindDesc.m_coordenateAccel = &m_coordenateAccel[index];
			joindDesc.m_accelIsMotor = &m_accelIsMortor[index];
			joindDesc.m_normalForceIndex = &m_normalForceIndex[index];
			joindDesc.m_penetrationStiffness = &m_penetrationStiffness[index];
			m_constraintArray[currJoint].m_joint->JointAccelerations(joindDesc);

			currJoint++;
		}
	}

	m_firstPassCoef = dgFloat32(1.0f);
	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverUpdateVeloc::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		simd_type timeStepSimd;
		timeStepSimd = simd_set1(m_timeStep);
		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			simd_type force;
			simd_type torque;
			simd_type accel;
			simd_type alpha;

			body = m_bodyArray[i].m_body;
			force =
			    simd_add_v((simd_type &)body->m_accel, (simd_type &)m_internalForces[i].m_linear);
			torque =
			    simd_add_v((simd_type &)body->m_alpha, (simd_type &)m_internalForces[i].m_angular);

			// dgVector accel (force.Scale (body->m_invMass.m_w));
			accel = simd_mul_v(force, simd_set1(body->m_invMass.m_w));
			// dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (torque));
			alpha =
			    simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_permut_v(torque, torque, PURMUT_MASK(0, 0, 0, 0))),
			                                  (simd_type &)body->m_invWorldInertiaMatrix[1], simd_permut_v(torque, torque, PURMUT_MASK(1, 1, 1, 1))),
			                   (simd_type &)body->m_invWorldInertiaMatrix[2], simd_permut_v(torque, torque, PURMUT_MASK(2, 2, 2, 2)));

			// body->m_veloc += accel.Scale(timeStep);
			(simd_type &)body->m_veloc =
			    simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStepSimd);
			// body->m_omega += alpha.Scale(timeStep);
			(simd_type &)body->m_omega =
			    simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStepSimd);

			// body->m_netForce += body->m_veloc;
			(simd_type &)m_internalVeloc[i].m_linear =
			    simd_add_v((simd_type &)m_internalVeloc[i].m_linear, (simd_type &)body->m_veloc);
			// body->m_netTorque += body->m_omega;
			(simd_type &)m_internalVeloc[i].m_angular =
			    simd_add_v((simd_type &)m_internalVeloc[i].m_angular, (simd_type &)body->m_omega);
		}
#endif
	} else {

		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			body = m_bodyArray[i].m_body;
			dgVector force(body->m_accel + m_internalForces[i].m_linear);
			dgVector torque(body->m_alpha + m_internalForces[i].m_angular);

			dgVector accel(force.Scale(body->m_invMass.m_w));
			dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(torque));
			body->m_veloc += accel.Scale(m_timeStep);
			body->m_omega += alpha.Scale(m_timeStep);
			m_internalVeloc[i].m_linear += body->m_veloc;
			m_internalVeloc[i].m_angular += body->m_omega;
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverUpdateForce::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		simd_type invStepSimd;
		simd_type invTimeStepSimd;
		simd_type accelerationTolerance;
		//  signMask = simd_set1 (invStep);
		invStepSimd = simd_set1(m_invStep);

		invTimeStepSimd = simd_set1(m_invTimeStep);
		accelerationTolerance = simd_set1(m_maxAccNorm2);
		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			simd_type tmp;
			simd_type accel;
			simd_type alpha;
			body = m_bodyArray[i].m_body;

#ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES
			// body->m_veloc = internalVeloc[i].m_linear.Scale(invStep);
			// body->m_omega = internalVeloc[i].m_angular.Scale(invStep);
			(simd_type &)body->m_veloc = simd_mul_v((simd_type &)m_internalVeloc[i].m_linear, invStepSimd);
			(simd_type &)body->m_omega = simd_mul_v((simd_type &)m_internalVeloc[i].m_angular, invStepSimd);
#endif

			// dgVector accel = (body->m_veloc - body->m_netForce).Scale (m_invTimeStep);
			// dgVector alpha = (body->m_omega - body->m_netTorque).Scale (m_invTimeStep);
			accel =
			    simd_mul_v(simd_sub_v((simd_type &)body->m_veloc, (simd_type &)body->m_netForce), invTimeStepSimd);
			alpha =
			    simd_mul_v(simd_sub_v((simd_type &)body->m_omega, (simd_type &)body->m_netTorque), invTimeStepSimd);

			// if ((accel % accel) < maxAccNorm2) {
			//  accel = zero;
			// }
			// body->m_accel = accel;
			// body->m_netForce = accel.Scale (body->m_mass[3]);
			tmp = simd_mul_v(accel, accel);
			tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
			tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
			tmp = simd_cmplt_s(tmp, accelerationTolerance);
			(simd_type &)body->m_accel =
			    simd_andnot_v(accel, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)));
			(simd_type &)body->m_netForce =
			    simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_mass[3]));

			// if ((alpha % alpha) < maxAccNorm2) {
			//  alpha = zero;
			// }
			// body->m_alpha = alpha;
			tmp = simd_mul_v(alpha, alpha);
			tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
			tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
			tmp = simd_cmplt_s(tmp, accelerationTolerance);
			(simd_type &)body->m_alpha =
			    simd_andnot_v(alpha, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)));

			// alpha = body->m_matrix.UnrotateVector(alpha);
			alpha =
			    simd_mul_v((simd_type &)body->m_matrix[0], (simd_type &)body->m_alpha);
			alpha = simd_add_v(alpha, simd_move_hl_v(alpha, alpha));
			alpha =
			    simd_add_s(alpha, simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 1)));

			tmp =
			    simd_mul_v((simd_type &)body->m_matrix[1], (simd_type &)body->m_alpha);
			tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
			tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
			alpha = simd_pack_lo_v(alpha, tmp);

			tmp =
			    simd_mul_v((simd_type &)body->m_matrix[2], (simd_type &)body->m_alpha);
			tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp));
			tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1)));
			alpha = simd_permut_v(alpha, tmp, PURMUT_MASK(3, 0, 1, 0));

			// body->m_netTorque = body->m_matrix.RotateVector (alpha.CompProduct(body->m_mass));
			alpha = simd_mul_v(alpha, (simd_type &)body->m_mass);
			(simd_type &)body->m_netTorque =
			    simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_matrix[0], simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 0))),
			                                  (simd_type &)body->m_matrix[1], simd_permut_v(alpha, alpha, PURMUT_MASK(1, 1, 1, 1))),
			                   (simd_type &)body->m_matrix[2], simd_permut_v(alpha, alpha, PURMUT_MASK(2, 2, 2, 2)));
		}
#endif

	} else {
		dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
		              dgFloat32(0.0f));
		for (dgInt32 i = 0; i < m_count; i++) {
			dgBody *body;
			body = m_bodyArray[i].m_body;

			body->m_veloc = m_internalVeloc[i].m_linear.Scale(m_invStep);
			body->m_omega = m_internalVeloc[i].m_angular.Scale(m_invStep);

			dgVector accel = (body->m_veloc - body->m_netForce).Scale(m_invTimeStep);
			dgVector alpha = (body->m_omega - body->m_netTorque).Scale(m_invTimeStep);
			if ((accel % accel) < m_maxAccNorm2) {
				accel = zero;
			}

			if ((alpha % alpha) < m_maxAccNorm2) {
				alpha = zero;
			}

			body->m_accel = accel;
			body->m_alpha = alpha;
			body->m_netForce = accel.Scale(body->m_mass[3]);

			alpha = body->m_matrix.UnrotateVector(alpha);
			body->m_netTorque = body->m_matrix.RotateVector(
			                        alpha.CompProduct(body->m_mass));
		}
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverInitFeedbackUpdate::ThreadExecute() {
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	for (dgInt32 i = 0; i < m_count; i++) {
		dgInt32 first;
		dgInt32 count;
		dgInt32 index;
		dgFloat32 val;
		//      dgFloat32 maxForce;

		//      maxForce = dgFloat32 (0.0f);
		first = m_constraintArray[i].m_autoPairstart;
		count = m_constraintArray[i].m_autoPaircount;
		for (dgInt32 j = 0; j < count; j++) {
			index = j + first;
			val = m_force[index];
			//          maxForce = GetMax(dgAbsf (val), maxForce);
			NEWTON_ASSERT(dgCheckFloat(val));
			m_jointForceFeeback[index][0] = val;
		}
		m_hasJointFeeback |= (m_constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0);
		//      if (m_constraintArray[i].m_joint->m_updaFeedbackCallback) {
		//          m_constraintArray[i].m_joint->m_updaFeedbackCallback (*m_constraintArray[i].m_joint, m_timestep, m_threadIndex);
		//      }

		//      if (m_constraintArray[i].m_joint->GetId() == dgContactConstraintId) {
		//          dgInt32 m0;
		//          dgInt32 m1;
		//          m0 = m_constraintArray[i].m_m0;
		//          m1 = m_constraintArray[i].m_m1;
		//          m_world->dgGetIndirectLock(&m_locks[m0]);
		//              m_world->AddToBreakQueue ((dgContact*)m_constraintArray[i].m_joint, m_constraintArray[i].m_joint->m_body0, maxForce);
		//          m_world->dgReleaseIndirectLock(&m_locks[m0]);
		//          m_world->dgGetIndirectLock(&m_locks[m1]);
		//              m_world->AddToBreakQueue ((dgContact*)m_constraintArray[i].m_joint, m_constraintArray[i].m_joint->m_body1, maxForce);
		//          m_world->dgReleaseIndirectLock(&m_locks[m1]);
		//      }
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgParallelSolverCalculateForces::ThreadExecute() {
	//  dgInt32 threadIndex;
	//  dgUnsigned32 ticks;
	//  ticks = m_world->m_getPerformanceCount();

	//  threadIndex = m_threadIndex;
	if (m_useSimd) {
#ifdef DG_BUILD_SIMD_CODE
		dgFloatSign tmpIndex;
		simd_type signMask;
		simd_type accNorm;

		tmpIndex.m_integer.m_iVal = 0x7fffffff;
		signMask = simd_set1(tmpIndex.m_fVal);

		accNorm = simd_set1(dgFloat32(0.0f));
		for (dgInt32 i = 0; i < m_count; i += m_threads) {
			//      for (dgInt32 currJoint = 0; currJoint < m_count; currJoint += m_threads) {

			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			//          dgInt32 rowIndex;
			dgInt32 currJoint;
			dgInt32 rowsCount;
			simd_type linearM0;
			simd_type angularM0;
			simd_type linearM1;
			simd_type angularM1;

			currJoint = m_threadIndex + i;
			//          rowIndex = m_jointRemapArray[i];
			index = m_constraintArray[currJoint].m_autoPairstart;
			rowsCount = m_constraintArray[currJoint].m_autoPaircount;
			m0 = m_constraintArray[currJoint].m_m0;
			m1 = m_constraintArray[currJoint].m_m1;

			if (m0) {
				m_world->dgGetIndirectLock(&m_locks[m0]);
			}
			if (m1) {
				m_world->dgGetIndirectLock(&m_locks[m1]);
			}

			linearM0 = (simd_type &)m_internalForces[m0].m_linear;
			angularM0 = (simd_type &)m_internalForces[m0].m_angular;
			linearM1 = (simd_type &)m_internalForces[m1].m_linear;
			angularM1 = (simd_type &)m_internalForces[m1].m_angular;

			for (dgInt32 k = 0; k < rowsCount; k++) {
				dgInt32 frictionIndex;
				simd_type a;
				simd_type f;
				simd_type frictionNormal;
				simd_type lowerFrictionForce;
				simd_type upperFrictionForce;

				//              dgVector acc (m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0));
				//              acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct (angularM0);
				//              acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct (linearM1);
				//              acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct (angularM1);

				a =
				    simd_mul_v((simd_type &)m_JMinv[index].m_jacobian_IM0.m_linear, linearM0);
				a =
				    simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM0.m_angular, angularM0);
				a =
				    simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM1.m_linear, linearM1);
				a =
				    simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM1.m_angular, angularM1);

				// a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index];
				a = simd_add_v(a, simd_move_hl_v(a, a));
				a = simd_add_s(a, simd_permut_v(a, a, PURMUT_MASK(3, 3, 3, 1)));
				a =
				    simd_sub_s(simd_load_s(m_coordenateAccel[index]), simd_mul_add_s(a, simd_load_s(m_force[index]), simd_load_s(m_diagDamp[index])));

				// f = force[index] + invDJMinvJt[index] * a;
				f =
				    simd_mul_add_s(simd_load_s(m_force[index]), simd_load_s(m_invDJMinvJt[index]), a);

				frictionIndex = m_normalForceIndex[index];
				NEWTON_ASSERT(
				    ((frictionIndex < 0) && (m_force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (m_force[frictionIndex] >= dgFloat32(0.0f))));

				// frictionNormal = force[frictionIndex];
				// lowerFrictionForce = frictionNormal * lowerFrictionCoef[index];
				// upperFrictionForce = frictionNormal * upperFrictionCoef[index];
				frictionNormal = simd_load_s(m_force[frictionIndex]);
				lowerFrictionForce =
				    simd_mul_s(frictionNormal, simd_load_s(m_lowerFrictionCoef[index]));
				upperFrictionForce =
				    simd_mul_s(frictionNormal, simd_load_s(m_upperFrictionCoef[index]));

				// if (f > upperFrictionForce) {
				//  a = dgFloat32 (0.0f);
				//  f = upperFrictionForce;
				// } else if (f < lowerFrictionForce) {
				//  a = dgFloat32 (0.0f);
				//  f = lowerFrictionForce;
				// }
				// frictionNormal = simd_or_v (simd_cmplt_s (f, lowerFrictionForce), simd_cmpgt_s (f, upperFrictionForce));
				f = simd_min_s(simd_max_s(f, lowerFrictionForce), upperFrictionForce);
				a =
				    simd_andnot_v(a, simd_or_v(simd_cmplt_s(f, lowerFrictionForce), simd_cmpgt_s(f, upperFrictionForce)));
				accNorm = simd_max_s(accNorm, simd_and_v(a, signMask));

				// prevValue = f - force[index]);
				a = simd_sub_s(f, simd_load_s(m_force[index]));
				a = simd_permut_v(a, a, PURMUT_MASK(0, 0, 0, 0));

				// force[index] = f;
				simd_store_s(f, &m_force[index]);

				linearM0 =
				    simd_mul_add_v(linearM0, (simd_type &)m_Jt[index].m_jacobian_IM0.m_linear, a);
				angularM0 =
				    simd_mul_add_v(angularM0, (simd_type &)m_Jt[index].m_jacobian_IM0.m_angular, a);
				linearM1 =
				    simd_mul_add_v(linearM1, (simd_type &)m_Jt[index].m_jacobian_IM1.m_linear, a);
				angularM1 =
				    simd_mul_add_v(angularM1, (simd_type &)m_Jt[index].m_jacobian_IM1.m_angular, a);
				index++;
			}

			(simd_type &)m_internalForces[m0].m_linear = linearM0;
			(simd_type &)m_internalForces[m0].m_angular = angularM0;
			(simd_type &)m_internalForces[m1].m_linear = linearM1;
			(simd_type &)m_internalForces[m1].m_angular = angularM1;

			if (m1) {
				m_world->dgReleaseIndirectLock(&m_locks[m1]);
			}
			if (m0) {
				m_world->dgReleaseIndirectLock(&m_locks[m0]);
			}
		}
		simd_store_s(accNorm, &m_accNorm);
#endif

	} else {
		dgFloat32 accNorm;
		accNorm = dgFloat32(0.0f);
		for (dgInt32 i = 0; i < m_count; i += m_threads) {
			dgInt32 m0;
			dgInt32 m1;
			dgInt32 index;
			dgInt32 currJoint;
			dgInt32 rowsCount;

			currJoint = m_threadIndex + i;
			index = m_constraintArray[currJoint].m_autoPairstart;
			rowsCount = m_constraintArray[currJoint].m_autoPaircount;
			m0 = m_constraintArray[currJoint].m_m0;
			m1 = m_constraintArray[currJoint].m_m1;

			if (m0) {
				m_world->dgGetIndirectLock(&m_locks[m0]);
			}
			if (m1) {
				m_world->dgGetIndirectLock(&m_locks[m1]);
			}

			dgVector linearM0(m_internalForces[m0].m_linear);
			dgVector angularM0(m_internalForces[m0].m_angular);
			dgVector linearM1(m_internalForces[m1].m_linear);
			dgVector angularM1(m_internalForces[m1].m_angular);
			for (dgInt32 k = 0; k < rowsCount; k++) {
				dgInt32 frictionIndex;
				dgFloat32 a;
				dgFloat32 f;
				dgFloat32 prevValue;
				dgFloat32 frictionNormal;
				dgFloat32 lowerFrictionForce;
				dgFloat32 upperFrictionForce;

				dgVector acc(
				    m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0));
				acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct(angularM0);
				acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct(linearM1);
				acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct(angularM1);

				a = m_coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - m_force[index] * m_diagDamp[index];
				f = m_force[index] + m_invDJMinvJt[index] * a;

				frictionIndex = m_normalForceIndex[index];
				NEWTON_ASSERT(
				    ((frictionIndex < 0) && (m_force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (m_force[frictionIndex] >= dgFloat32(0.0f))));

				frictionNormal = m_force[frictionIndex];
				lowerFrictionForce = frictionNormal * m_lowerFrictionCoef[index];
				upperFrictionForce = frictionNormal * m_upperFrictionCoef[index];

				if (f > upperFrictionForce) {
					a = dgFloat32(0.0f);
					f = upperFrictionForce;
				} else if (f < lowerFrictionForce) {
					a = dgFloat32(0.0f);
					f = lowerFrictionForce;
				}

				accNorm = GetMax(accNorm, dgAbsf(a));
				prevValue = f - m_force[index];
				m_force[index] = f;

				linearM0 += m_Jt[index].m_jacobian_IM0.m_linear.Scale(prevValue);
				angularM0 += m_Jt[index].m_jacobian_IM0.m_angular.Scale(prevValue);
				linearM1 += m_Jt[index].m_jacobian_IM1.m_linear.Scale(prevValue);
				angularM1 += m_Jt[index].m_jacobian_IM1.m_angular.Scale(prevValue);
				index++;
			}
			m_internalForces[m0].m_linear = linearM0;
			m_internalForces[m0].m_angular = angularM0;
			m_internalForces[m1].m_linear = linearM1;
			m_internalForces[m1].m_angular = angularM1;

			if (m1) {
				m_world->dgReleaseIndirectLock(&m_locks[m1]);
			}
			if (m0) {
				m_world->dgReleaseIndirectLock(&m_locks[m0]);
			}
		}
		m_accNorm = accNorm;
	}

	//  m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks);
}

void dgJacobianMemory::CalculateForcesGameModeParallel(dgInt32 iterations,
        dgFloat32 maxAccNorm, dgInt32 archModel) const {
	dgFloat32 *const force = m_force;
	const dgJacobianPair *const Jt = m_Jt;
	const dgJacobianPair *const JMinv = m_JMinv;
	const dgFloat32 *const diagDamp = m_diagDamp;
	const dgFloat32 *const invDJMinvJt = m_invDJMinvJt;
	const dgBodyInfo *bodyArray = m_bodyArray;
	dgFloat32 *const penetration = m_penetration;
	const dgFloat32 *const externAccel = m_deltaAccel;
	const dgFloat32 *const restitution = m_restitution;
	dgFloat32 *const coordenateAccel = m_coordenateAccel;
	dgJacobian *const internalVeloc = m_internalVeloc;
	dgJacobian *const internalForces = m_internalForces;
	;
	dgFloat32 **const jointForceFeeback = m_jointFeebackForce;
	const dgInt32 *const accelIsMortor = m_accelIsMotor;
	const dgInt32 *const normalForceIndex = m_normalForceIndex;
	const dgJointInfo *const constraintArray = m_constraintArray;
	const dgFloat32 *const penetrationStiffness = m_penetrationStiffness;
	const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent;
	const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent;
	dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f),
	              dgFloat32(0.0f));

	dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS));
	dgFloat32 timeStep = m_timeStep * invStep;
	dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS);

	dgInt32 threads = dgInt32(m_world->m_numberOfTheads);

	dgInt32 chunkJointSizes[DG_MAXIMUN_THREADS];
	dgInt32 chunkBodiesSizes[DG_MAXIMUN_THREADS];

	dgInt32 acc = 0;
	m_world->m_threadsManager.CalculateChunkSizes(m_bodyCount - 1,
	        chunkBodiesSizes);
	dgParallelSolverClear *const clearAccumulators =
	    m_world->m_dynamicSolver.m_clearAccumulators;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		clearAccumulators[threadIndex].m_useSimd = archModel;
		clearAccumulators[threadIndex].m_world = m_world;
		clearAccumulators[threadIndex].m_threadIndex = threadIndex;
		clearAccumulators[threadIndex].m_count = chunkBodiesSizes[threadIndex];

		clearAccumulators[threadIndex].m_bodyArray = &bodyArray[acc + 1];
		clearAccumulators[threadIndex].m_internalVeloc = &internalVeloc[acc + 1];
		clearAccumulators[threadIndex].m_internalForces = &internalForces[acc + 1];
		m_world->m_threadsManager.SubmitJob(&clearAccumulators[threadIndex]);
		acc += chunkBodiesSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	internalVeloc[0].m_linear = zero;
	internalVeloc[0].m_angular = zero;
	internalForces[0].m_linear = zero;
	internalForces[0].m_angular = zero;

	acc = 0;
	m_world->m_threadsManager.CalculateChunkSizes(m_jointCount, chunkJointSizes);
	dgParallelSolverInitInternalForces *const parallelInitIntenalForces =
	    m_world->m_dynamicSolver.m_parallelInitIntenalForces;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelInitIntenalForces[threadIndex].m_useSimd = archModel;
		parallelInitIntenalForces[threadIndex].m_world = m_world;
		parallelInitIntenalForces[threadIndex].m_locks = m_treadLocks;
		parallelInitIntenalForces[threadIndex].m_threadIndex = threadIndex;
		parallelInitIntenalForces[threadIndex].m_Jt = Jt;
		parallelInitIntenalForces[threadIndex].m_force = force;
		parallelInitIntenalForces[threadIndex].m_internalForces = internalForces;
		parallelInitIntenalForces[threadIndex].m_count =
		    chunkJointSizes[threadIndex];
		parallelInitIntenalForces[threadIndex].m_constraintArray =
		    &constraintArray[acc];
		m_world->m_threadsManager.SubmitJob(
		    &parallelInitIntenalForces[threadIndex]);
		acc += chunkJointSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	acc = 0;
	dgParallelSolverJointAcceleration *const parallelSolverJointAcceleration =
	    m_world->m_dynamicSolver.m_parallelSolverJointAcceleration;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelSolverJointAcceleration[threadIndex].m_useSimd = archModel;
		parallelSolverJointAcceleration[threadIndex].m_world = m_world;
		parallelSolverJointAcceleration[threadIndex].m_threadIndex = threadIndex;
		parallelSolverJointAcceleration[threadIndex].m_Jt = Jt;
		parallelSolverJointAcceleration[threadIndex].m_count =
		    chunkJointSizes[threadIndex];
		parallelSolverJointAcceleration[threadIndex].m_constraintArray =
		    constraintArray;
		parallelSolverJointAcceleration[threadIndex].m_jointStart = acc;
		//      parallelSolverJointAcceleration[threadIndex].m_jointRemapArray = &jointRemapArray[acc];
		parallelSolverJointAcceleration[threadIndex].m_timeStep = timeStep;
		parallelSolverJointAcceleration[threadIndex].m_invTimeStep = invTimeStep;
		parallelSolverJointAcceleration[threadIndex].m_firstPassCoef = dgFloat32(
		            1.0f);
		parallelSolverJointAcceleration[threadIndex].m_penetration = penetration;
		parallelSolverJointAcceleration[threadIndex].m_restitution = restitution;
		parallelSolverJointAcceleration[threadIndex].m_externAccel = externAccel;
		parallelSolverJointAcceleration[threadIndex].m_coordenateAccel =
		    coordenateAccel;
		parallelSolverJointAcceleration[threadIndex].m_accelIsMortor =
		    accelIsMortor;
		parallelSolverJointAcceleration[threadIndex].m_normalForceIndex =
		    normalForceIndex;
		parallelSolverJointAcceleration[threadIndex].m_penetrationStiffness =
		    penetrationStiffness;
		acc += chunkJointSizes[threadIndex];
	}

	acc = 0;
	dgParallelSolverUpdateVeloc *const parallelSolverUpdateVeloc =
	    m_world->m_dynamicSolver.m_parallelSolverUpdateVeloc;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelSolverUpdateVeloc[threadIndex].m_useSimd = archModel;
		parallelSolverUpdateVeloc[threadIndex].m_world = m_world;
		parallelSolverUpdateVeloc[threadIndex].m_threadIndex = threadIndex;
		parallelSolverUpdateVeloc[threadIndex].m_timeStep = timeStep;
		parallelSolverUpdateVeloc[threadIndex].m_count =
		    chunkBodiesSizes[threadIndex];
		parallelSolverUpdateVeloc[threadIndex].m_bodyArray = &bodyArray[acc + 1];
		parallelSolverUpdateVeloc[threadIndex].m_internalVeloc = &internalVeloc[acc + 1];
		parallelSolverUpdateVeloc[threadIndex].m_internalForces =
		    &internalForces[acc + 1];
		acc += chunkBodiesSizes[threadIndex];
	}

	dgParallelSolverCalculateForces *const parallelSolverCalculateForces =
	    m_world->m_dynamicSolver.m_parallelSolverCalculateForces;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelSolverCalculateForces[threadIndex].m_useSimd = archModel;
		parallelSolverCalculateForces[threadIndex].m_world = m_world;
		parallelSolverCalculateForces[threadIndex].m_threads = threads;
		parallelSolverCalculateForces[threadIndex].m_threadIndex = threadIndex;

		parallelSolverCalculateForces[threadIndex].m_Jt = Jt;
		parallelSolverCalculateForces[threadIndex].m_JMinv = JMinv;
		parallelSolverCalculateForces[threadIndex].m_count =
		    chunkJointSizes[threadIndex] * threads;
		//      parallelSolverCalculateForces[threadIndex].m_jointRemapArray = &jointRemapArray[threadIndex];
		parallelSolverCalculateForces[threadIndex].m_constraintArray =
		    constraintArray;

		parallelSolverCalculateForces[threadIndex].m_locks = m_treadLocks;
		parallelSolverCalculateForces[threadIndex].m_force = force;
		parallelSolverCalculateForces[threadIndex].m_diagDamp = diagDamp;
		parallelSolverCalculateForces[threadIndex].m_invDJMinvJt = invDJMinvJt;
		parallelSolverCalculateForces[threadIndex].m_normalForceIndex =
		    normalForceIndex;
		parallelSolverCalculateForces[threadIndex].m_internalForces =
		    internalForces;
		parallelSolverCalculateForces[threadIndex].m_coordenateAccel =
		    coordenateAccel;
		parallelSolverCalculateForces[threadIndex].m_lowerFrictionCoef =
		    lowerFrictionCoef;
		parallelSolverCalculateForces[threadIndex].m_upperFrictionCoef =
		    upperFrictionCoef;
	}

	dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT;
	for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) {
		for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
			m_world->m_threadsManager.SubmitJob(
			    &parallelSolverJointAcceleration[threadIndex]);
		}
		m_world->m_threadsManager.SynchronizationBarrier();

		dgFloat32 accNorm = maxAccNorm * dgFloat32(2.0f);
		for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm);
		        passes++) {

			for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
				m_world->m_threadsManager.SubmitJob(
				    &parallelSolverCalculateForces[threadIndex]);
			}
			m_world->m_threadsManager.SynchronizationBarrier();

			accNorm = dgFloat32(0.0f);
			for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
				if (parallelSolverCalculateForces[threadIndex].m_accNorm > accNorm) {
					accNorm = parallelSolverCalculateForces[threadIndex].m_accNorm;
				}
			}
			// accNorm = 1.0f;
		}

		for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
			m_world->m_threadsManager.SubmitJob(
			    &parallelSolverUpdateVeloc[threadIndex]);
		}
		m_world->m_threadsManager.SynchronizationBarrier();
	}

	acc = 0;
	m_world->m_threadsManager.CalculateChunkSizes(m_jointCount, chunkJointSizes);
	dgParallelSolverInitFeedbackUpdate *const parallelSolverInitFeedbackUpdate =
	    m_world->m_dynamicSolver.m_parallelSolverInitFeedbackUpdate;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelSolverInitFeedbackUpdate[threadIndex].m_locks = m_treadLocks;
		parallelSolverInitFeedbackUpdate[threadIndex].m_world = m_world;
		parallelSolverInitFeedbackUpdate[threadIndex].m_hasJointFeeback = 0;
		parallelSolverInitFeedbackUpdate[threadIndex].m_threadIndex = threadIndex;
		parallelSolverInitFeedbackUpdate[threadIndex].m_force = force;
		parallelSolverInitFeedbackUpdate[threadIndex].m_timestep = timeStep;
		parallelSolverInitFeedbackUpdate[threadIndex].m_jointForceFeeback =
		    jointForceFeeback;
		parallelSolverInitFeedbackUpdate[threadIndex].m_count =
		    chunkJointSizes[threadIndex];
		parallelSolverInitFeedbackUpdate[threadIndex].m_constraintArray =
		    &constraintArray[acc];
		m_world->m_threadsManager.SubmitJob(
		    &parallelSolverInitFeedbackUpdate[threadIndex]);
		acc += chunkJointSizes[threadIndex];
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	acc = 0;
	dgFloat32 maxAccNorm2 = maxAccNorm * maxAccNorm;
	dgParallelSolverUpdateForce *const parallelSolverUpdateForce =
	    m_world->m_dynamicSolver.m_parallelSolverUpdateForce;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		parallelSolverUpdateForce[threadIndex].m_useSimd = archModel;
		parallelSolverUpdateForce[threadIndex].m_world = m_world;
		parallelSolverUpdateForce[threadIndex].m_threadIndex = threadIndex;
		parallelSolverUpdateForce[threadIndex].m_invStep = invStep;
		parallelSolverUpdateForce[threadIndex].m_invTimeStep = m_invTimeStep;
		parallelSolverUpdateForce[threadIndex].m_maxAccNorm2 = maxAccNorm2;
		parallelSolverUpdateForce[threadIndex].m_count =
		    chunkBodiesSizes[threadIndex];
		parallelSolverUpdateForce[threadIndex].m_bodyArray = &bodyArray[acc + 1];
		parallelSolverUpdateForce[threadIndex].m_internalVeloc = &internalVeloc[acc + 1];
		acc += chunkBodiesSizes[threadIndex];
		m_world->m_threadsManager.SubmitJob(
		    &parallelSolverUpdateForce[threadIndex]);
	}
	m_world->m_threadsManager.SynchronizationBarrier();

	dgInt32 hasJointFeeback = 0;
	for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) {
		hasJointFeeback |=
		    parallelSolverInitFeedbackUpdate[threadIndex].m_hasJointFeeback;
	}

	if (hasJointFeeback) {
		for (dgInt32 i = 0; i < m_jointCount; i++) {
			if (constraintArray[i].m_joint->m_updaFeedbackCallback) {
				constraintArray[i].m_joint->m_updaFeedbackCallback(
					reinterpret_cast<const NewtonJoint *>(constraintArray[i].m_joint), m_timeStep, m_threadIndex);
			}
		}
	}
}

void dgJacobianMemory::CalculateReactionsForcesParallel(dgInt32 solverMode,
        dgFloat32 tolerance, dgInt32 archModel) const {
	//  NEWTON_ASSERT (m_jointCount >= DG_PARALLEL_JOINT_COUNT);
	if (solverMode) {
		CalculateForcesGameModeParallel(solverMode, tolerance, archModel);
	} else if (archModel) {
		CalculateForcesSimulationModeSimd(tolerance);
	} else {
		CalculateForcesSimulationMode(tolerance);
	}
}