/* Copyright (c) <2003-2011> * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * * 3. This notice may not be removed or altered from any source distribution. */ #include "hpl1/engine/libraries/newton/core/dg.h" #include "dgBody.h" #include "dgConstraint.h" #include "dgWorld.h" #include "dgWorldDynamicUpdate.h" #define DG_FREEZZING_VELOCITY_DRAG dgFloat32(0.9f) #define DG_SOLVER_MAX_ERROR (DG_FREEZE_MAG * dgFloat32(0.5f)) #ifdef DG_BUILD_SIMD_CODE #define DG_SIMD_WORD_SIZE dgInt32(sizeof(simd_type) / sizeof(dgFloat32)) #else #define DG_SIMD_WORD_SIZE dgInt32(sizeof(dgVector) / sizeof(dgFloat32)) #endif #define DG_PARALLEL_JOINT_COUNT 64 // in my twist to RK4 I am no sure if the final derivative have to be weighted, it seems a mistake, I need to investigate more // uncomment this out for more stable behavior but no exactly correct solution on the iterative solver //#define DG_WIGHT_FINAL_RK4_DERIVATIVES #ifdef TARGET_OS_IPHONE #define DG_BASE_ITERATION_COUNT 2 #else #define DG_BASE_ITERATION_COUNT 3 #endif class dgBodyInfo { public: dgBody *m_body; }; class dgJointInfo { public: dgConstraint *m_joint; dgInt32 m_autoPairstart; dgInt32 m_autoPaircount; dgInt32 m_autoPairActiveCount; dgInt32 m_m0; dgInt32 m_m1; }; class dgIsland { public: dgInt32 m_bodyCount; dgInt32 m_bodyStart; dgInt32 m_jointCount; dgInt32 m_jointStart; dgInt32 m_hasUnilateralJoints : 1; dgInt32 m_isContinueCollision : 1; }; ////////////////////////////////////////////////////////////////////// // Construction/Destruction ////////////////////////////////////////////////////////////////////// static inline dgInt32 CompareIslands(const dgIsland *const islandA, const dgIsland *const islandB, void *notUsed) { dgInt32 countA; dgInt32 countB; countA = islandA->m_jointCount - (islandA->m_hasUnilateralJoints << 23); countB = islandB->m_jointCount - (islandB->m_hasUnilateralJoints << 23); if (countA < countB) { return -1; } if (countA > countB) { return 1; } return 0; } dgBody *dgWorld::GetIslandBody(const void *const islandPtr, dgInt32 index) const { const dgIslandCallbackStruct &island = *(const dgIslandCallbackStruct *)islandPtr; char *ptr = &((char *)island.m_bodyArray)[island.m_strideInByte * index]; dgBody **bodyPtr = (dgBody **)ptr; return (index < island.m_count) ? ((index >= 0) ? *bodyPtr : NULL) : NULL; } dgWorldDynamicUpdate::dgWorldDynamicUpdate() { m_bodies = 0; ; m_joints = 0; m_islands = 0; m_markLru = 0; m_maxJointCount = 0; m_maxBodiesCount = 0; m_maxIslandCount = 0; } void dgWorldDynamicUpdate::UpdateDynamics(dgWorld *const world, dgInt32 archModel, dgFloat32 timestep) { dgUnsigned32 updateTime = world->m_getPerformanceCount(); m_bodies = 0; m_joints = 0; m_islands = 0; m_markLru = 0; m_world = world; ReallocBodyMemory(0); ReallocIslandMemory(0); ReallocJointsMemory(0); dgInt32 threadCounts = dgInt32(m_world->m_numberOfTheads); for (dgInt32 i = 0; i < threadCounts; i++) { m_solverMemory[i].m_world = m_world; m_solverMemory[i].m_threadIndex = i; ReallocJacobiansMemory(0, i); ReallocIntenalForcesMemory(0, i); } m_world->m_dynamicsLru = m_world->m_dynamicsLru + 2; m_markLru = dgInt32(m_world->m_dynamicsLru); dgInt32 lru = m_markLru - 1; dgBodyMasterList &me = *m_world; NEWTON_ASSERT(me.GetFirst()->GetInfo().GetBody() == m_world->m_sentionelBody); dgUnsigned32 solverMode = m_world->m_solverMode; m_world->m_sentionelBody->m_index = 0; m_world->m_sentionelBody->m_dynamicsLru = dgUnsigned32(m_markLru); for (dgBodyMasterList::dgListNode *node = me.GetLast(); node; node = node->GetPrev()) { const dgBodyMasterListRow &graphNode = node->GetInfo(); dgBody *const body = graphNode.GetBody(); if (body->m_invMass.m_w == dgFloat32(0.0f)) { #ifdef _DEBUG for (; node; node = node->GetPrev()) { NEWTON_ASSERT(node->GetInfo().GetBody()->m_invMass.m_w == dgFloat32(0.0f)); } #endif break; } if (dgInt32(body->m_dynamicsLru) < lru) { if (!(body->m_freeze | body->m_spawnnedFromCallback | body->m_sleeping | !body->m_isInWorld)) { SpanningTree(body); } } body->m_spawnnedFromCallback = false; } dgSort(m_islandArray, m_islands, CompareIslands); // dgRadixSort (m_islandArray, &m_islandArray[m_islands], m_islands, 3, GetIslandsKey); dgUnsigned32 dynamicsTime = m_world->m_getPerformanceCount(); m_world->m_perfomanceCounters[m_dynamicsBuildSpanningTreeTicks] = dynamicsTime - updateTime; if (threadCounts > 1) { dgInt32 chunkSizes[DG_MAXIMUN_THREADS]; if (m_world->m_singleIslandMultithreading) { const dgJacobianMemory &system = m_solverMemory[0]; while (m_islands && (m_islandArray[m_islands - 1].m_jointCount >= DG_PARALLEL_JOINT_COUNT)) { m_islands--; NEWTON_ASSERT(!m_islandArray[m_islands].m_isContinueCollision); BuildJacobianMatrixParallel(m_islandArray[m_islands], timestep, archModel); system.CalculateReactionsForcesParallel(dgInt32(solverMode), DG_SOLVER_MAX_ERROR, archModel); IntegrateArray(&system.m_bodyArray[1], system.m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, 0, true); } } if (m_islands) { m_world->m_threadsManager.CalculateChunkSizes(m_islands, chunkSizes); for (dgInt32 threadIndex = 0; threadIndex < threadCounts; threadIndex++) { m_workerThreads[threadIndex].m_useSimd = archModel; m_workerThreads[threadIndex].m_world = m_world; m_workerThreads[threadIndex].m_dynamics = this; m_workerThreads[threadIndex].m_count = chunkSizes[threadIndex] * threadCounts; m_workerThreads[threadIndex].m_threads = threadCounts; m_workerThreads[threadIndex].m_timestep = timestep; m_workerThreads[threadIndex].m_solverMode = dgInt32(solverMode); m_workerThreads[threadIndex].m_threadIndex = threadIndex; m_workerThreads[threadIndex].m_system = &m_solverMemory[threadIndex]; m_world->m_threadsManager.SubmitJob(&m_workerThreads[threadIndex]); } m_world->m_threadsManager.SynchronizationBarrier(); } } else { m_workerThreads[0].m_threads = 1; m_workerThreads[0].m_useSimd = archModel; m_workerThreads[0].m_count = m_islands; m_workerThreads[0].m_world = m_world; m_workerThreads[0].m_dynamics = this; m_workerThreads[0].m_threadIndex = 0; m_workerThreads[0].m_solverMode = dgInt32(solverMode); m_workerThreads[0].m_timestep = timestep; m_workerThreads[0].m_system = &m_solverMemory[0]; m_workerThreads[0].ThreadExecute(); } dgUnsigned32 ticks = m_world->m_getPerformanceCount(); m_world->m_perfomanceCounters[m_dynamicsSolveSpanningTreeTicks] = ticks - dynamicsTime; m_world->m_perfomanceCounters[m_dynamicsTicks] = ticks - updateTime; } void dgSolverWorlkerThreads::ThreadExecute() { const dgIsland *const m_islandArray = m_dynamics->m_islandArray; dgContactPoint *const contactBuffer = (dgContactPoint *)m_world->m_contactBuffers[m_threadIndex]; dgInt32 step = m_threads; dgInt32 count = m_count; if (m_useSimd) { for (dgInt32 i = 0; i < count; i += step) { const dgIsland &island = m_islandArray[i + m_threadIndex]; if (!island.m_isContinueCollision) { m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, m_timestep); m_system->CalculateReactionsForcesSimd(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, m_timestep, m_threadIndex, true); } else { dgBodyInfo *const bodyArray = &m_dynamics->m_bodyArray[island.m_bodyStart]; dgJointInfo *const constraintArray = &m_dynamics->m_constraintArray[island.m_jointStart]; dgFloat32 dist = dgFloat32(0.0f); dgFloat32 maxSpeed2 = dgFloat32(0.0f); for (dgInt32 k = 1; k < island.m_bodyCount; k++) { dgVector veloc; dgVector omega; dgBody *const body = bodyArray[k].m_body; NEWTON_ASSERT(body->m_mass.m_w > dgFloat32(0.0f)); body->CalculateContinueVelocity(m_timestep, veloc, omega); dgFloat32 mag2 = veloc % veloc; if (mag2 > maxSpeed2) { maxSpeed2 = mag2; dist = body->m_collision->GetBoxMinRadius(); } } NEWTON_ASSERT(dist > dgFloat32(0.0f)); dgInt32 steps = dgInt32( dgFloat32(4.0f) * dgSqrt(maxSpeed2) * m_timestep / dist) + 1; dgFloat32 timestep = m_timestep / dgFloat32(steps); if (steps > 8) { steps = 8; } for (dgInt32 j = 0; j < steps - 1; j++) { m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, timestep); m_system->CalculateReactionsForcesSimd(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, m_threadIndex, false); for (dgInt32 k = 1; k < island.m_bodyCount; k++) { dgBody *const body = bodyArray[k].m_body; body->m_accel = body->m_prevExternalForce; body->m_alpha = body->m_prevExternalTorque; } for (dgInt32 k = 0; k < island.m_jointCount; k++) { dgConstraint *const constraint = constraintArray[k].m_joint; if (constraint->GetId() == dgContactConstraintId) { dgContact *const contact = (dgContact *)constraint; const dgContactMaterial *const material = contact->m_myCacheMaterial; if (material->m_flags & dgContactMaterial::m_collisionEnable__) { if (material->m_aabbOverlap) { NEWTON_ASSERT(contact->m_body0); NEWTON_ASSERT(contact->m_body1); material->m_aabbOverlap(reinterpret_cast(material), reinterpret_cast(contact->m_body0), reinterpret_cast(contact->m_body1), m_threadIndex); } dgCollidingPairCollector::dgPair pair; pair.m_body0 = contact->m_body0; pair.m_body1 = contact->m_body1; pair.m_contact = contact; pair.m_material = material; pair.m_contactBuffer = contactBuffer; m_world->CalculateContactsSimd(&pair, timestep, m_threadIndex); //NEWTON_ASSERT (contact->m_maxDOF == 0); NEWTON_ASSERT(pair.m_contact); if (pair.m_contactCount) { NEWTON_ASSERT(pair.m_contactCount <= (DG_CONSTRAINT_MAX_ROWS / 3)); m_world->ProcessContacts(&pair, timestep, m_threadIndex); } else if (!pair.m_contactBuffer) { m_world->ProcessCachedContacts(pair.m_contact, pair.m_material, timestep, m_threadIndex); } } } } } m_dynamics->BuildJacobianMatrixSimd(island, m_threadIndex, timestep); m_system->CalculateReactionsForcesSimd(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, m_threadIndex, true); } } } else { for (dgInt32 i = 0; i < count; i += step) { const dgIsland &island = m_islandArray[i + m_threadIndex]; if (!island.m_isContinueCollision) { m_dynamics->BuildJacobianMatrix(island, m_threadIndex, m_timestep); m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, m_timestep, m_threadIndex, true); } else { dgBodyInfo *const bodyArray = &m_dynamics->m_bodyArray[island.m_bodyStart]; dgJointInfo *const constraintArray = &m_dynamics->m_constraintArray[island.m_jointStart]; dgFloat32 dist = dgFloat32(0.0f); dgFloat32 maxSpeed2 = dgFloat32(0.0f); for (dgInt32 k = 1; k < island.m_bodyCount; k++) { dgVector veloc; dgVector omega; dgBody *const body = bodyArray[k].m_body; NEWTON_ASSERT(body->m_mass.m_w > dgFloat32(0.0f)); body->CalculateContinueVelocity(m_timestep, veloc, omega); dgFloat32 mag2 = veloc % veloc; if (mag2 > maxSpeed2) { maxSpeed2 = mag2; dist = body->m_collision->GetBoxMinRadius(); } } NEWTON_ASSERT(dist > dgFloat32(0.0f)); dgInt32 steps = dgInt32( dgFloat32(4.0f) * dgSqrt(maxSpeed2) * m_timestep / dist) + 1; dgFloat32 timestep = m_timestep / dgFloat32(steps); if (steps > 8) { steps = 8; } for (dgInt32 j = 0; j < steps - 1; j++) { m_dynamics->BuildJacobianMatrix(island, m_threadIndex, timestep); m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, m_threadIndex, false); for (dgInt32 k = 1; k < island.m_bodyCount; k++) { dgBody *const body = bodyArray[k].m_body; body->m_accel = body->m_prevExternalForce; body->m_alpha = body->m_prevExternalTorque; } for (dgInt32 k = 0; k < island.m_jointCount; k++) { dgConstraint *const constraint = constraintArray[k].m_joint; if (constraint->GetId() == dgContactConstraintId) { dgContact *const contact = (dgContact *)constraint; const dgContactMaterial *const material = contact->m_myCacheMaterial; if (material->m_flags & dgContactMaterial::m_collisionEnable__) { if (material->m_aabbOverlap) { NEWTON_ASSERT(contact->m_body0); NEWTON_ASSERT(contact->m_body1); material->m_aabbOverlap(reinterpret_cast(material), reinterpret_cast(contact->m_body0), reinterpret_cast(contact->m_body1), m_threadIndex); } dgCollidingPairCollector::dgPair pair; pair.m_body0 = contact->m_body0; pair.m_body1 = contact->m_body1; pair.m_contact = contact; pair.m_material = material; pair.m_contactBuffer = contactBuffer; m_world->CalculateContacts(&pair, timestep, m_threadIndex); NEWTON_ASSERT(pair.m_contact); if (pair.m_contactCount) { NEWTON_ASSERT(pair.m_contactCount <= (DG_CONSTRAINT_MAX_ROWS / 3)); m_world->ProcessContacts(&pair, timestep, m_threadIndex); } else if (!pair.m_contactBuffer) { m_world->ProcessCachedContacts(pair.m_contact, pair.m_material, timestep, m_threadIndex); } } } } } m_dynamics->BuildJacobianMatrix(island, m_threadIndex, timestep); m_system->CalculateReactionsForces(m_solverMode, DG_SOLVER_MAX_ERROR); m_dynamics->IntegrateArray(&m_system->m_bodyArray[1], m_system->m_bodyCount - 1, DG_SOLVER_MAX_ERROR, timestep, m_threadIndex, true); } } } } void dgWorldDynamicUpdate::ReallocBodyMemory(dgInt32 bodyCount) { void *memory; dgInt32 stride; dgInt32 newCount; dgBodyInfo *bodyArray; stride = sizeof(dgBodyInfo); if (bodyCount) { m_world->m_bodiesMemorySizeInBytes = m_world->m_bodiesMemorySizeInBytes * 2; memory = m_world->GetAllocator()->MallocLow( m_world->m_bodiesMemorySizeInBytes); newCount = (m_world->m_bodiesMemorySizeInBytes / stride) & (-4); bodyArray = (dgBodyInfo *)memory; memcpy(bodyArray, m_bodyArray, bodyCount * sizeof(dgBodyInfo)); m_world->GetAllocator()->FreeLow(m_world->m_bodiesMemory); } else { memory = m_world->m_bodiesMemory; newCount = (m_world->m_bodiesMemorySizeInBytes / stride) & (-4); bodyArray = (dgBodyInfo *)memory; } m_maxBodiesCount = newCount; m_world->m_bodiesMemory = memory; m_bodyArray = bodyArray; NEWTON_ASSERT((dgUnsigned64(m_bodyArray) & 0x0f) == 0); } void dgWorldDynamicUpdate::ReallocIslandMemory(dgInt32 islandCount) { void *memory; dgInt32 stride; dgInt32 newCount; dgIsland *islandArray; stride = sizeof(dgIsland); if (islandCount) { m_world->m_islandMemorySizeInBytes = m_world->m_islandMemorySizeInBytes * 2; memory = m_world->GetAllocator()->MallocLow( m_world->m_islandMemorySizeInBytes); newCount = (m_world->m_islandMemorySizeInBytes / stride) & (-4); islandArray = (dgIsland *)memory; memcpy(islandArray, m_islandArray, islandCount * sizeof(dgIsland)); m_world->GetAllocator()->FreeLow(m_world->m_islandMemory); } else { memory = m_world->m_islandMemory; newCount = (m_world->m_islandMemorySizeInBytes / stride) & (-4); islandArray = (dgIsland *)memory; } m_maxIslandCount = newCount; m_world->m_islandMemory = memory; m_islandArray = islandArray; NEWTON_ASSERT((dgUnsigned64(m_islandArray) & 0x0f) == 0); } void dgWorldDynamicUpdate::ReallocJointsMemory(dgInt32 count) { void *memory; dgInt32 stride; dgInt32 newCount; dgJointInfo *constraintArray; stride = sizeof(dgJointInfo); if (count) { m_world->m_jointsMemorySizeInBytes = m_world->m_jointsMemorySizeInBytes * 2; memory = m_world->GetAllocator()->MallocLow( m_world->m_jointsMemorySizeInBytes); newCount = (m_world->m_jointsMemorySizeInBytes / stride) & (-4); constraintArray = (dgJointInfo *)memory; memcpy(constraintArray, m_constraintArray, count * sizeof(dgJointInfo)); m_world->GetAllocator()->FreeLow(m_world->m_jointsMemory); } else { memory = m_world->m_jointsMemory; newCount = (m_world->m_jointsMemorySizeInBytes / stride) & (-4); constraintArray = (dgJointInfo *)memory; } m_maxJointCount = newCount; m_world->m_jointsMemory = memory; m_constraintArray = constraintArray; NEWTON_ASSERT((dgUnsigned64(m_constraintArray) & 0x0f) == 0); } void dgWorldDynamicUpdate::ReallocJacobiansMemory(dgInt32 count, dgInt32 threadIndex) { void *memory; dgInt32 stride; dgInt32 newCount; dgJacobianPair *Jt; dgJacobianPair *JMinv; dgFloat32 *force; dgFloat32 *accel; dgFloat32 *deltaAccel; dgFloat32 *deltaForce; dgFloat32 *diagDamp; dgFloat32 *invDJMinvJt; dgFloat32 *restitution; dgFloat32 *penetration; dgFloat32 *coordenateAccel; dgFloat32 *penetrationStiffness; dgFloat32 *lowerBoundFrictionCoefficent; dgFloat32 *upperBoundFrictionCoefficent; dgFloat32 **forceFeedback; dgInt32 *normalForceIndex; dgInt32 *accelIsMotor; stride = sizeof(dgJacobianPair) + // Jt sizeof(dgJacobianPair) + // JtMinv sizeof(dgFloat32) + // lowerBoundFrictionCoefficent sizeof(dgFloat32) + // upperBoundFrictionCoefficent sizeof(dgFloat32 *) + // forceFeedback sizeof(dgFloat32) + // force sizeof(dgFloat32) + // accel sizeof(dgFloat32) + // deltaAccel sizeof(dgFloat32) + // deltaForce sizeof(dgFloat32) + // diagDamp sizeof(dgFloat32) + // invDJMinvJt sizeof(dgFloat32) + // penetration sizeof(dgFloat32) + // restitution sizeof(dgFloat32) + // coordinateAccel sizeof(dgFloat32) + // penetrationStiffness sizeof(dgInt32) + // normalForceIndex sizeof(dgInt32) + // accelIsMotor; 0; if (count) { m_world->m_jacobiansMemorySizeInBytes[threadIndex] = m_world->m_jacobiansMemorySizeInBytes[threadIndex] * 2; memory = m_world->GetAllocator()->MallocLow( m_world->m_jacobiansMemorySizeInBytes[threadIndex] + 64); newCount = ((m_world->m_jacobiansMemorySizeInBytes[threadIndex] - 16) / stride) & (-8); Jt = (dgJacobianPair *)memory; JMinv = (dgJacobianPair *)&Jt[newCount]; force = (dgFloat32 *)&JMinv[newCount]; force += 4; accel = (dgFloat32 *)&force[newCount]; ; deltaAccel = (dgFloat32 *)&accel[newCount]; deltaForce = (dgFloat32 *)&deltaAccel[newCount]; // forceStep = (dgFloat32*) &deltaForce [newCount]; diagDamp = (dgFloat32 *)&deltaForce[newCount]; coordenateAccel = (dgFloat32 *)&diagDamp[newCount]; restitution = (dgFloat32 *)&coordenateAccel[newCount]; penetration = (dgFloat32 *)&restitution[newCount]; penetrationStiffness = (dgFloat32 *)&penetration[newCount]; invDJMinvJt = (dgFloat32 *)&penetrationStiffness[newCount]; lowerBoundFrictionCoefficent = (dgFloat32 *)&invDJMinvJt[newCount]; upperBoundFrictionCoefficent = (dgFloat32 *)&lowerBoundFrictionCoefficent[newCount]; forceFeedback = (dgFloat32 **)&upperBoundFrictionCoefficent[newCount]; normalForceIndex = (dgInt32 *)&forceFeedback[newCount]; accelIsMotor = (dgInt32 *)&normalForceIndex[newCount]; m_world->GetAllocator()->FreeLow(m_world->m_jacobiansMemory[threadIndex]); } else { memory = m_world->m_jacobiansMemory[threadIndex]; newCount = ((m_world->m_jacobiansMemorySizeInBytes[threadIndex] - 16) / stride) & (-8); Jt = (dgJacobianPair *)memory; JMinv = (dgJacobianPair *)&Jt[newCount]; force = (dgFloat32 *)&JMinv[newCount]; force += 4; accel = (dgFloat32 *)&force[newCount]; ; deltaAccel = (dgFloat32 *)&accel[newCount]; deltaForce = (dgFloat32 *)&deltaAccel[newCount]; diagDamp = (dgFloat32 *)&deltaForce[newCount]; coordenateAccel = (dgFloat32 *)&diagDamp[newCount]; restitution = (dgFloat32 *)&coordenateAccel[newCount]; penetration = (dgFloat32 *)&restitution[newCount]; penetrationStiffness = (dgFloat32 *)&penetration[newCount]; invDJMinvJt = (dgFloat32 *)&penetrationStiffness[newCount]; lowerBoundFrictionCoefficent = (dgFloat32 *)&invDJMinvJt[newCount]; upperBoundFrictionCoefficent = (dgFloat32 *)&lowerBoundFrictionCoefficent[newCount]; forceFeedback = (dgFloat32 **)&upperBoundFrictionCoefficent[newCount]; normalForceIndex = (dgInt32 *)&forceFeedback[newCount]; accelIsMotor = (dgInt32 *)&normalForceIndex[newCount]; } m_solverMemory[threadIndex].m_maxJacobiansCount = newCount; m_world->m_jacobiansMemory[threadIndex] = memory; force[-1] = dgFloat32(1.0f); force[-2] = dgFloat32(1.0f); force[-3] = dgFloat32(1.0f); force[-4] = dgFloat32(1.0f); m_solverMemory[threadIndex].m_Jt = Jt; m_solverMemory[threadIndex].m_JMinv = JMinv; m_solverMemory[threadIndex].m_force = force; m_solverMemory[threadIndex].m_accel = accel; m_solverMemory[threadIndex].m_deltaForce = deltaForce; m_solverMemory[threadIndex].m_deltaAccel = deltaAccel; m_solverMemory[threadIndex].m_diagDamp = diagDamp; m_solverMemory[threadIndex].m_invDJMinvJt = invDJMinvJt; m_solverMemory[threadIndex].m_penetration = penetration; m_solverMemory[threadIndex].m_restitution = restitution; m_solverMemory[threadIndex].m_coordenateAccel = coordenateAccel; m_solverMemory[threadIndex].m_penetrationStiffness = penetrationStiffness; m_solverMemory[threadIndex].m_lowerBoundFrictionCoefficent = lowerBoundFrictionCoefficent; m_solverMemory[threadIndex].m_upperBoundFrictionCoefficent = upperBoundFrictionCoefficent; m_solverMemory[threadIndex].m_jointFeebackForce = forceFeedback; m_solverMemory[threadIndex].m_normalForceIndex = normalForceIndex; m_solverMemory[threadIndex].m_accelIsMotor = accelIsMotor; NEWTON_ASSERT((dgUnsigned64(Jt) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(JMinv) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(force) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(accel) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(deltaAccel) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(deltaForce) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(diagDamp) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(penetration) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(restitution) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(invDJMinvJt) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(coordenateAccel) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(penetrationStiffness) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(lowerBoundFrictionCoefficent) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(upperBoundFrictionCoefficent) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(forceFeedback) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(normalForceIndex) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(accelIsMotor) & 0x0f) == 0); } void dgWorldDynamicUpdate::ReallocIntenalForcesMemory(dgInt32 count, dgInt32 threadIndex) { dgInt32 stride; dgInt32 newCount; void *memory; dgInt32 *treadLocks; dgJacobian *internalForces; dgJacobian *internalVeloc; stride = sizeof(dgJacobian) + sizeof(dgJacobian) + sizeof(dgInt32) + 0; if (count) { m_world->m_internalForcesMemorySizeInBytes[threadIndex] = m_world->m_internalForcesMemorySizeInBytes[threadIndex] * 2; memory = m_world->GetAllocator()->MallocLow( m_world->m_internalForcesMemorySizeInBytes[threadIndex]); // newCount = m_world->m_internalForcesMemorySizeInBytes[threadIndex] / stride; newCount = ((m_world->m_internalForcesMemorySizeInBytes[threadIndex] - 16) / stride) & (-8); internalForces = (dgJacobian *)memory; internalVeloc = &internalForces[newCount]; treadLocks = (dgInt32 *)&internalVeloc[newCount]; m_world->GetAllocator()->FreeLow( m_world->m_internalForcesMemory[threadIndex]); } else { memory = m_world->m_internalForcesMemory[threadIndex]; newCount = ((m_world->m_internalForcesMemorySizeInBytes[threadIndex] - 16) / stride) & (-8); internalForces = (dgJacobian *)memory; internalVeloc = &internalForces[newCount]; treadLocks = (dgInt32 *)&internalVeloc[newCount]; } NEWTON_ASSERT( (((dgInt8 *)&treadLocks[newCount - 1]) - ((dgInt8 *)memory)) < m_world->m_internalForcesMemorySizeInBytes[threadIndex]); m_solverMemory[threadIndex].m_maxBodiesCount = newCount; m_world->m_internalForcesMemory[threadIndex] = memory; m_solverMemory[threadIndex].m_internalForces = internalForces; m_solverMemory[threadIndex].m_internalVeloc = internalVeloc; m_solverMemory[threadIndex].m_treadLocks = treadLocks; memset(treadLocks, 0, newCount * sizeof(dgInt32)); // m_solverMemory[threadIndex].m_bodyFreeze = bodyFreeze; NEWTON_ASSERT((dgUnsigned64(internalForces) & 0x0f) == 0); NEWTON_ASSERT((dgUnsigned64(internalVeloc) & 0x0f) == 0); } void dgWorldDynamicUpdate::BuildIsland(dgQueue &queue, dgInt32 jointCount, dgInt32 hasUnilateralJoints, dgInt32 isContinueCollisionIsland) { dgInt32 bodyCount = 1; dgInt32 lruMark = m_markLru; if (m_bodies >= m_maxBodiesCount) { ReallocBodyMemory(m_bodies); } m_bodyArray[m_bodies].m_body = m_world->m_sentionelBody; NEWTON_ASSERT(m_world->m_sentionelBody->m_index == 0); NEWTON_ASSERT(dgInt32(m_world->m_sentionelBody->m_dynamicsLru) == m_markLru); while (!queue.IsEmpty()) { dgInt32 count = queue.m_firstIndex - queue.m_lastIndex; if (count < 0) { NEWTON_ASSERT(0); count += queue.m_mod; } dgInt32 index = queue.m_lastIndex; queue.Reset(); for (dgInt32 j = 0; j < count; j++) { dgBody *const body = queue.m_pool[index]; NEWTON_ASSERT(body); NEWTON_ASSERT(dgInt32(body->m_dynamicsLru) == lruMark); NEWTON_ASSERT(body->m_masterNode); if (body->m_invMass.m_w > dgFloat32(0.0f)) { dgInt32 bodyIndex = m_bodies + bodyCount; if (bodyIndex >= m_maxBodiesCount) { ReallocBodyMemory(bodyIndex); } body->m_index = bodyCount; m_bodyArray[bodyIndex].m_body = body; bodyCount++; } for (dgBodyMasterListRow::dgListNode *jointNode = body->m_masterNode->GetInfo().GetFirst(); jointNode; jointNode = jointNode->GetNext()) { dgBodyMasterListCell &cell = jointNode->GetInfo(); dgBody *const bodyN = cell.m_bodyNode; dgConstraint *const constraint = cell.m_joint; if (dgInt32(constraint->m_dynamicsLru) != lruMark) { constraint->m_dynamicsLru = dgUnsigned32(lruMark); dgInt32 jointIndex = m_joints + jointCount; if (jointIndex >= m_maxJointCount) { ReallocJointsMemory(jointIndex); } if (constraint->m_isUnilateral) { hasUnilateralJoints = 1; NEWTON_ASSERT( (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody)); } constraint->m_index = dgUnsigned32(jointCount); m_constraintArray[jointIndex].m_joint = constraint; jointCount++; NEWTON_ASSERT(constraint->m_body0); NEWTON_ASSERT(constraint->m_body1); } if (dgInt32(bodyN->m_dynamicsLru) != lruMark) { if (bodyN->m_invMass.m_w > dgFloat32(0.0f)) { queue.Insert(bodyN); bodyN->m_dynamicsLru = dgUnsigned32(lruMark); } } } index++; if (index >= queue.m_mod) { NEWTON_ASSERT(0); index = 0; } } } if (bodyCount > 1) { if (m_islands >= m_maxIslandCount) { ReallocIslandMemory(m_islands); } m_islandArray[m_islands].m_bodyStart = m_bodies; m_islandArray[m_islands].m_jointStart = m_joints; m_islandArray[m_islands].m_bodyCount = bodyCount; m_islandArray[m_islands].m_jointCount = jointCount; m_islandArray[m_islands].m_hasUnilateralJoints = hasUnilateralJoints; m_islandArray[m_islands].m_isContinueCollision = isContinueCollisionIsland; m_islands++; m_bodies += bodyCount; m_joints += jointCount; } } void dgWorldDynamicUpdate::SpanningTree(dgBody *const body) { dgInt32 bodyCount = 0; dgInt32 jointCount = 0; dgInt32 staticCount = 0; dgInt32 lruMark = m_markLru - 1; dgInt32 isInWorld = 0; dgInt32 isInEquilibrium = 1; dgInt32 hasUnilateralJoints = 0; dgInt32 isContinueCollisionIsland = 0; dgBody *heaviestBody = NULL; dgFloat32 haviestMass = dgFloat32(0.0f); dgQueue queue( (dgBody **)m_world->m_pairMemoryBuffer, (dgInt32(m_world->m_pairMemoryBufferSizeInBytes >> 1)) / dgInt32(sizeof(void *))); dgBody **const staticPool = &queue.m_pool[queue.m_mod]; body->m_dynamicsLru = dgUnsigned32(lruMark); queue.Insert(body); while (!queue.IsEmpty()) { dgInt32 count = queue.m_firstIndex - queue.m_lastIndex; if (count < 0) { NEWTON_ASSERT(0); count += queue.m_mod; } dgInt32 index = queue.m_lastIndex; queue.Reset(); for (dgInt32 j = 0; j < count; j++) { dgBody *const srcBody = queue.m_pool[index]; NEWTON_ASSERT(srcBody); NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f)); NEWTON_ASSERT(dgInt32(srcBody->m_dynamicsLru) == lruMark); NEWTON_ASSERT(srcBody->m_masterNode); dgInt32 bodyIndex = m_bodies + bodyCount; if (bodyIndex >= m_maxBodiesCount) { ReallocBodyMemory(bodyIndex); } m_bodyArray[bodyIndex].m_body = srcBody; isInWorld |= srcBody->m_isInWorld; isInEquilibrium &= srcBody->m_equilibrium; isInEquilibrium &= srcBody->m_autoSleep; // isInEquilibrium = 0; srcBody->m_sleeping = false; isContinueCollisionIsland |= srcBody->m_solverInContinueCollision; if (srcBody->m_mass.m_w > haviestMass) { haviestMass = srcBody->m_mass.m_w; heaviestBody = srcBody; } bodyCount++; for (dgBodyMasterListRow::dgListNode *jointNode = srcBody->m_masterNode->GetInfo().GetFirst(); jointNode; jointNode = jointNode->GetNext()) { dgBodyMasterListCell &cell = jointNode->GetInfo(); dgBody *const bodyN = cell.m_bodyNode; if (dgInt32(bodyN->m_dynamicsLru) < lruMark) { NEWTON_ASSERT(bodyN == cell.m_bodyNode); bodyN->m_dynamicsLru = dgUnsigned32(lruMark); if (bodyN->m_invMass.m_w > dgFloat32(0.0f)) { queue.Insert(bodyN); } else { dgInt32 duplicateBody = 0; for (; duplicateBody < staticCount; duplicateBody++) { if (staticPool[duplicateBody] == srcBody) { break; } } if (duplicateBody == staticCount) { staticPool[staticCount] = srcBody; staticCount++; NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f)); } dgConstraint *const constraint = cell.m_joint; NEWTON_ASSERT(dgInt32(constraint->m_dynamicsLru) != m_markLru); dgInt32 jointIndex = m_joints + jointCount; if (jointIndex >= m_maxJointCount) { ReallocJointsMemory(jointIndex); } if (constraint->m_isUnilateral) { hasUnilateralJoints = 1; NEWTON_ASSERT( (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody)); } constraint->m_index = dgUnsigned32(jointCount); m_constraintArray[jointIndex].m_joint = constraint; jointCount++; NEWTON_ASSERT(constraint->m_body0); NEWTON_ASSERT(constraint->m_body1); } } else if (cell.m_bodyNode->m_invMass.m_w == dgFloat32(0.0f)) { dgInt32 duplicateBody = 0; for (; duplicateBody < staticCount; duplicateBody++) { if (staticPool[duplicateBody] == srcBody) { break; } } if (duplicateBody == staticCount) { staticPool[staticCount] = srcBody; staticCount++; NEWTON_ASSERT(srcBody->m_invMass.m_w > dgFloat32(0.0f)); } dgConstraint *const constraint = cell.m_joint; NEWTON_ASSERT(dgInt32(constraint->m_dynamicsLru) != m_markLru); dgInt32 jointIndex = m_joints + jointCount; if (jointIndex >= m_maxJointCount) { ReallocJointsMemory(jointIndex); } if (constraint->m_isUnilateral) { hasUnilateralJoints = 1; NEWTON_ASSERT( (constraint->m_body0 == m_world->m_sentionelBody) || (constraint->m_body1 == m_world->m_sentionelBody)); } constraint->m_index = dgUnsigned32(jointCount); m_constraintArray[jointIndex].m_joint = constraint; jointCount++; NEWTON_ASSERT(constraint->m_body0); NEWTON_ASSERT(constraint->m_body1); } } index++; if (index >= queue.m_mod) { NEWTON_ASSERT(0); index = 0; } } } if (isInEquilibrium | !isInWorld) { for (dgInt32 i = 0; i < bodyCount; i++) { dgBody *const bodyI = m_bodyArray[m_bodies + i].m_body; bodyI->m_dynamicsLru = dgUnsigned32(m_markLru); bodyI->m_sleeping = true; } } else { if (m_world->m_islandUpdate) { dgIslandCallbackStruct record; record.m_world = m_world; record.m_count = bodyCount; record.m_strideInByte = sizeof(dgBodyInfo); record.m_bodyArray = &m_bodyArray[m_bodies].m_body; if (!m_world->m_islandUpdate(m_world, &record, bodyCount)) { for (dgInt32 i = 0; i < bodyCount; i++) { dgBody *const bodyI = m_bodyArray[m_bodies + i].m_body; bodyI->m_dynamicsLru = dgUnsigned32(m_markLru); } return; } } if (staticCount) { queue.Reset(); for (dgInt32 i = 0; i < staticCount; i++) { dgBody *const bodyI = staticPool[i]; bodyI->m_dynamicsLru = dgUnsigned32(m_markLru); queue.Insert(bodyI); NEWTON_ASSERT(dgInt32(bodyI->m_dynamicsLru) == m_markLru); } for (dgInt32 i = 0; i < jointCount; i++) { dgConstraint *const constraint = m_constraintArray[m_joints + i].m_joint; constraint->m_dynamicsLru = dgUnsigned32(m_markLru); } } else { NEWTON_ASSERT(heaviestBody); queue.Insert(heaviestBody); // body->m_dynamicsLru = m_markLru; heaviestBody->m_dynamicsLru = dgUnsigned32(m_markLru); } BuildIsland(queue, jointCount, hasUnilateralJoints, isContinueCollisionIsland); } } void dgWorldDynamicUpdate::IntegrateArray(const dgBodyInfo *bodyArray, dgInt32 count, dgFloat32 accelTolerance, dgFloat32 timestep, dgInt32 threadIndex, bool update) const { bool isAutoSleep = true; bool stackSleeping = true; dgInt32 sleepCounter = 10000; dgFloat32 speedFreeze = m_world->m_freezeSpeed2; dgFloat32 accelFreeze = m_world->m_freezeAccel2; dgFloat32 maxAccel = dgFloat32(0.0f); dgFloat32 maxAlpha = dgFloat32(0.0f); dgFloat32 maxSpeed = dgFloat32(0.0f); dgFloat32 maxOmega = dgFloat32(0.0f); dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); dgFloat32 forceDamp = DG_FREEZZING_VELOCITY_DRAG; if (count <= 2) { bool autosleep = bodyArray[0].m_body->m_autoSleep; if (count == 2) { autosleep &= bodyArray[1].m_body->m_autoSleep; } if (!autosleep) { forceDamp = dgFloat32(0.9999f); } } for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; if (body->m_invMass.m_w && body->m_isInWorld) { body->IntegrateVelocity(timestep); dgFloat32 accel2 = body->m_accel % body->m_accel; dgFloat32 alpha2 = body->m_alpha % body->m_alpha; dgFloat32 speed2 = body->m_veloc % body->m_veloc; dgFloat32 omega2 = body->m_omega % body->m_omega; maxAccel = GetMax(maxAccel, accel2); maxAlpha = GetMax(maxAlpha, alpha2); maxSpeed = GetMax(maxSpeed, speed2); maxOmega = GetMax(maxOmega, omega2); bool equilibrium = (accel2 < accelFreeze) && (alpha2 < accelFreeze) && (speed2 < speedFreeze) && (omega2 < speedFreeze); if (equilibrium) { body->m_veloc = body->m_veloc.Scale(forceDamp); body->m_omega = body->m_omega.Scale(forceDamp); } body->m_equilibrium = dgUnsigned32(equilibrium); stackSleeping &= equilibrium; isAutoSleep &= body->m_autoSleep; sleepCounter = GetMin(sleepCounter, body->m_sleepingCounter); } } if (update) { for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; if (body->m_invMass.m_w && body->m_isInWorld) { body->UpdateMatrix(timestep, threadIndex); } } if (isAutoSleep) { if (stackSleeping) { for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; body->m_netForce = zero; body->m_netTorque = zero; body->m_veloc = zero; body->m_omega = zero; } } else { if ((maxAccel > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxAccel) || (maxAlpha > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxAlpha) || (maxSpeed > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxVeloc) || (maxOmega > m_world->m_sleepTable[DG_SLEEP_ENTRIES - 1].m_maxOmega)) { for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; body->m_sleepingCounter = 0; } } else { dgInt32 index = 0; for (dgInt32 i = 0; i < DG_SLEEP_ENTRIES; i++) { if ((maxAccel <= m_world->m_sleepTable[i].m_maxAccel) && (maxAlpha <= m_world->m_sleepTable[i].m_maxAlpha) && (maxSpeed <= m_world->m_sleepTable[i].m_maxVeloc) && (maxOmega <= m_world->m_sleepTable[i].m_maxOmega)) { index = i; break; } } dgInt32 timeScaleSleepCount = dgInt32( dgFloat32(60.0f) * sleepCounter * timestep); if (timeScaleSleepCount > m_world->m_sleepTable[index].m_steps) { for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; body->m_netForce = zero; body->m_netTorque = zero; body->m_veloc = zero; body->m_omega = zero; body->m_equilibrium = true; } } else { sleepCounter++; for (dgInt32 i = 0; i < count; i++) { dgBody *const body = bodyArray[i].m_body; body->m_sleepingCounter = sleepCounter; } } } } } } } dgInt32 dgWorldDynamicUpdate::GetJacobialDerivatives(const dgIsland &island, dgInt32 threadIndex, bool bitMode, dgInt32 rowCount, dgFloat32 timestep) { dgContraintDescritor constraintParams; dgInt32 jointCount = island.m_jointCount; constraintParams.m_world = m_world; constraintParams.m_threadIndex = threadIndex; constraintParams.m_timestep = timestep; constraintParams.m_invTimestep = dgFloat32(1.0f / timestep); dgJacobianMemory &solverMemory = m_solverMemory[threadIndex]; dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart]; for (dgInt32 j = 0; j < jointCount; j++) { dgConstraint *const constraint = constraintArray[j].m_joint; ; if (constraint->m_isUnilateral ^ bitMode) { dgInt32 dof = dgInt32(constraint->m_maxDOF); NEWTON_ASSERT(dof <= DG_CONSTRAINT_MAX_ROWS); for (dgInt32 i = 0; i < dof; i++) { constraintParams.m_forceBounds[i].m_low = DG_MIN_BOUND; constraintParams.m_forceBounds[i].m_upper = DG_MAX_BOUND; constraintParams.m_forceBounds[i].m_jointForce = NULL; constraintParams.m_forceBounds[i].m_normalIndex = DG_BILATERAL_CONSTRAINT; } NEWTON_ASSERT(constraint->m_body0); NEWTON_ASSERT(constraint->m_body1); constraint->m_body0->m_inCallback = true; constraint->m_body1->m_inCallback = true; dof = dgInt32(constraint->JacobianDerivative(constraintParams)); constraint->m_body0->m_inCallback = false; constraint->m_body1->m_inCallback = false; dgInt32 m0 = (constraint->m_body0->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body0->m_index : 0; dgInt32 m1 = (constraint->m_body1->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body1->m_index : 0; NEWTON_ASSERT(constraint->m_index == dgUnsigned32(j)); constraintArray[j].m_autoPairstart = rowCount; constraintArray[j].m_autoPaircount = dof; constraintArray[j].m_autoPairActiveCount = dof; constraintArray[j].m_m0 = m0; constraintArray[j].m_m1 = m1; for (dgInt32 i = 0; i < dof; i++) { NEWTON_ASSERT(constraintParams.m_forceBounds[i].m_jointForce); solverMemory.m_Jt[rowCount] = constraintParams.m_jacobian[i]; NEWTON_ASSERT(constraintParams.m_jointStiffness[i] >= dgFloat32(0.1f)); NEWTON_ASSERT(constraintParams.m_jointStiffness[i] <= dgFloat32(100.0f)); solverMemory.m_diagDamp[rowCount] = constraintParams.m_jointStiffness[i]; solverMemory.m_coordenateAccel[rowCount] = constraintParams.m_jointAccel[i]; solverMemory.m_accelIsMotor[rowCount] = dgInt32( constraintParams.m_isMotor[i]); solverMemory.m_restitution[rowCount] = constraintParams.m_restitution[i]; solverMemory.m_penetration[rowCount] = constraintParams.m_penetration[i]; solverMemory.m_penetrationStiffness[rowCount] = constraintParams.m_penetrationStiffness[i]; solverMemory.m_lowerBoundFrictionCoefficent[rowCount] = constraintParams.m_forceBounds[i].m_low; solverMemory.m_upperBoundFrictionCoefficent[rowCount] = constraintParams.m_forceBounds[i].m_upper; solverMemory.m_jointFeebackForce[rowCount] = constraintParams.m_forceBounds[i].m_jointForce; solverMemory.m_normalForceIndex[rowCount] = constraintParams.m_forceBounds[i].m_normalIndex + ((constraintParams.m_forceBounds[i].m_normalIndex >= 0) ? (rowCount - i) : 0); rowCount++; } #ifdef _DEBUG for (dgInt32 i = 0; i < ((rowCount + 3) & 0xfffc) - rowCount; i++) { solverMemory.m_diagDamp[rowCount + i] = dgFloat32(0.0f); solverMemory.m_coordenateAccel[rowCount + i] = dgFloat32(0.0f); solverMemory.m_restitution[rowCount + i] = dgFloat32(0.0f); solverMemory.m_penetration[rowCount + i] = dgFloat32(0.0f); solverMemory.m_penetrationStiffness[rowCount + i] = dgFloat32(0.0f); solverMemory.m_lowerBoundFrictionCoefficent[rowCount + i] = dgFloat32( 0.0f); solverMemory.m_upperBoundFrictionCoefficent[rowCount + i] = dgFloat32( 0.0f); solverMemory.m_jointFeebackForce[rowCount + i] = 0; solverMemory.m_normalForceIndex[rowCount + i] = 0; } #endif rowCount = (rowCount & (DG_SIMD_WORD_SIZE - 1)) ? ((rowCount & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : rowCount; NEWTON_ASSERT((rowCount & (DG_SIMD_WORD_SIZE - 1)) == 0); } } return rowCount; } void dgWorldDynamicUpdate::BuildJacobianMatrixSimd(const dgIsland &island, dgInt32 threadIndex, dgFloat32 timestep) { #ifdef DG_BUILD_SIMD_CODE // dgInt32 threads; // dgInt32 rowCount; // dgInt32 bodyCount; // dgInt32 jointCount; // dgContraintDescritor constraintParams; // dgVector zeroVector (dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); NEWTON_ASSERT(island.m_bodyCount >= 2); //NEWTON_ASSERT (island.m_jointCount >= 1); // dgInt32 threads = m_world->m_numberOfTheads; dgJacobianMemory &solverMemory = m_solverMemory[threadIndex]; dgInt32 bodyCount = island.m_bodyCount; dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart]; bodyArray[0].m_body = m_world->GetSentinelBody(); // body = bodyArray[0].m_body; // body->m_hasRelativeVelocity = false; // body->m_accel = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // body->m_alpha = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); NEWTON_ASSERT( (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f)); NEWTON_ASSERT( (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f)); for (dgInt32 i = 1; i < bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f)); // body->m_hasRelativeVelocity = false; // body->AddGyroscopicTorque(); body->AddDamingAcceleration(); body->CalcInvInertiaMatrixSimd(); } while (bodyCount >= solverMemory.m_maxBodiesCount) { m_world->dgGetUserLock(); ReallocIntenalForcesMemory(bodyCount, threadIndex); m_world->dgReleasedUserLock(); } dgInt32 jointCount = island.m_jointCount; // constraintArray = &m_constraintArray[island.m_jointStart]; dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart]; solverMemory.m_constraintArray = constraintArray; dgInt32 rowCount = 0; dgFloat32 maxRowCount = 0; for (dgInt32 j = 0; j < jointCount; j++) { dgConstraint *const constraint = constraintArray[j].m_joint; dgInt32 dof = dgInt32( (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF); maxRowCount += dof; } while (maxRowCount > solverMemory.m_maxJacobiansCount) { m_world->dgGetUserLock(); ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, threadIndex); m_world->dgReleasedUserLock(); } if (island.m_hasUnilateralJoints) { rowCount = GetJacobialDerivatives(island, threadIndex, false, rowCount, timestep); } rowCount = GetJacobialDerivatives(island, threadIndex, true, rowCount, timestep); solverMemory.m_rowCount = rowCount; solverMemory.m_bodyCount = bodyCount; solverMemory.m_bodyArray = bodyArray; solverMemory.m_jointCount = jointCount; solverMemory.m_timeStep = timestep; solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep; dgFloat32 *const force = solverMemory.m_force; const dgJacobianPair *const Jt = solverMemory.m_Jt; dgJacobianPair *const JMinv = solverMemory.m_JMinv; dgFloat32 *const diagDamp = solverMemory.m_diagDamp; dgFloat32 *const extAccel = solverMemory.m_deltaAccel; dgFloat32 *const invDJMinvJt = solverMemory.m_invDJMinvJt; dgFloat32 *const coordenateAccel = solverMemory.m_coordenateAccel; dgFloat32 **const jointForceFeeback = solverMemory.m_jointFeebackForce; // dgJacobianIndex* const jacobianIndexArray = solverMemory.m_jacobianIndexArray; // dgBilateralBounds* const bilateralForceBounds = solverMemory.m_bilateralForceBounds; simd_type zero; zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 k = 0; k < jointCount; k++) { // dgInt32 m0; // dgInt32 m1; // dgInt32 index; // dgInt32 count; // dgFloat32 diag; // dgFloat32 stiffness; // simd_type tmp0; // simd_type tmp1; // simd_type invMass0; // simd_type invMass1; // simd_type tmpDiag; // simd_type tmpAccel; // dgBody* body0; // dgBody* body1; dgInt32 index = constraintArray[k].m_autoPairstart; dgInt32 count = constraintArray[k].m_autoPaircount; dgInt32 m0 = constraintArray[k].m_m0; dgInt32 m1 = constraintArray[k].m_m1; NEWTON_ASSERT(m0 >= 0); NEWTON_ASSERT(m0 < bodyCount); NEWTON_ASSERT(m1 >= 0); NEWTON_ASSERT(m1 < bodyCount); dgBody *const body0 = bodyArray[m0].m_body; // invMass0 = body0->m_invMass[3]; simd_type invMass0 = simd_set1(body0->m_invMass[3]); NEWTON_ASSERT((dgUnsigned64(&body0->m_invWorldInertiaMatrix) & 0x0f) == 0); const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix; dgBody *const body1 = bodyArray[m1].m_body; // invMass1 = body1->m_invMass[3]; simd_type invMass1 = simd_set1(body1->m_invMass[3]); NEWTON_ASSERT((dgUnsigned64(&body1->m_invWorldInertiaMatrix) & 0x0f) == 0); const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix; for (dgInt32 i = 0; i < count; i++) { // JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale (invMass0); // JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector (Jt[index].m_jacobian_IM0.m_angular); // dgVector tmpDiag (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(Jt[index].m_jacobian_IM0.m_linear)); // tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(Jt[index].m_jacobian_IM0.m_angular); // dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel)); // tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(body0->m_alpha); ((simd_type &)JMinv[index].m_jacobian_IM0.m_linear) = simd_mul_v((simd_type &)Jt[index].m_jacobian_IM0.m_linear, invMass0); simd_type tmp0 = (simd_type &)Jt[index].m_jacobian_IM0.m_angular; simd_type tmp1 = simd_mul_v((simd_type &)invInertia0.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0))); tmp1 = simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1))); ((simd_type &)JMinv[index].m_jacobian_IM0.m_angular) = simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2))); simd_type tmpDiag = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular); simd_type tmpAccel = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)body0->m_accel); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)body0->m_alpha); // JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale (invMass1); // JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector (Jt[index].m_jacobian_IM1.m_angular); // tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(Jt[index].m_jacobian_IM1.m_linear); // tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(Jt[index].m_jacobian_IM1.m_angular); // tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(body1->m_accel); // tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(body1->m_alpha); ((simd_type &)JMinv[index].m_jacobian_IM1.m_linear) = simd_mul_v((simd_type &)Jt[index].m_jacobian_IM1.m_linear, invMass1); tmp0 = (simd_type &)Jt[index].m_jacobian_IM1.m_angular; tmp1 = simd_mul_v((simd_type &)invInertia1.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0))); tmp1 = simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1))); ((simd_type &)JMinv[index].m_jacobian_IM1.m_angular) = simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2))); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)body1->m_accel); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)body1->m_alpha); // coordenateAccel[index] -= (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z); //NEWTON_ASSERT(tmpAccel.m128_f32[3] == dgFloat32 (0.0f)); tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel)); tmpAccel = simd_sub_s(zero, simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1)))); simd_store_s(tmpAccel, &extAccel[index]); simd_store_s(simd_add_s(simd_load_s(coordenateAccel[index]), tmpAccel), &coordenateAccel[index]); // force[index] = bilateralForceBounds[index].m_jointForce[0]; force[index] = jointForceFeeback[index][0]; NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f)); NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f)); dgFloat32 stiffness = DG_PSD_DAMP_TOL * diagDamp[index]; // diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z); tmpDiag = simd_add_v(tmpDiag, simd_move_hl_v(tmpDiag, tmpDiag)); dgFloat32 diag; simd_store_s( simd_add_s(tmpDiag, simd_permut_v(tmpDiag, tmpDiag, PURMUT_MASK(3, 3, 3, 1))), &diag); NEWTON_ASSERT(diag > dgFloat32(0.0f)); diagDamp[index] = diag * stiffness; diag *= (dgFloat32(1.0f) + stiffness); invDJMinvJt[index] = dgFloat32(1.0f) / diag; index++; } } #endif } void dgWorldDynamicUpdate::BuildJacobianMatrix(const dgIsland &island, dgInt32 threadIndex, dgFloat32 timestep) { NEWTON_ASSERT(island.m_bodyCount >= 2); //NEWTON_ASSERT (island.m_jointCount >= 1); dgJacobianMemory &solverMemory = m_solverMemory[threadIndex]; dgInt32 bodyCount = island.m_bodyCount; dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart]; NEWTON_ASSERT( (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f)); NEWTON_ASSERT( (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f)); for (dgInt32 i = 1; i < bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f)); body->AddDamingAcceleration(); body->CalcInvInertiaMatrix(); } while (bodyCount >= solverMemory.m_maxBodiesCount) { m_world->dgGetUserLock(); ReallocIntenalForcesMemory(bodyCount, threadIndex); m_world->dgReleasedUserLock(); } dgInt32 jointCount = island.m_jointCount; dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart]; solverMemory.m_constraintArray = constraintArray; dgInt32 maxRowCount = 0; for (dgInt32 j = 0; j < jointCount; j++) { dgConstraint *const constraint = constraintArray[j].m_joint; ; dgInt32 dof = dgInt32( (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF); maxRowCount += dof; } while (maxRowCount > solverMemory.m_maxJacobiansCount) { m_world->dgGetUserLock(); ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, threadIndex); m_world->dgReleasedUserLock(); } dgInt32 rowCount = 0; if (island.m_hasUnilateralJoints) { rowCount = GetJacobialDerivatives(island, threadIndex, false, rowCount, timestep); } rowCount = GetJacobialDerivatives(island, threadIndex, true, rowCount, timestep); solverMemory.m_rowCount = rowCount; solverMemory.m_bodyCount = bodyCount; solverMemory.m_bodyArray = bodyArray; solverMemory.m_jointCount = jointCount; solverMemory.m_timeStep = timestep; solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep; dgFloat32 *const force = solverMemory.m_force; const dgJacobianPair *const Jt = solverMemory.m_Jt; dgJacobianPair *const JMinv = solverMemory.m_JMinv; dgFloat32 *const diagDamp = solverMemory.m_diagDamp; dgFloat32 *const extAccel = solverMemory.m_deltaAccel; dgFloat32 *const invDJMinvJt = solverMemory.m_invDJMinvJt; dgFloat32 *const coordenateAccel = solverMemory.m_coordenateAccel; dgFloat32 **const jointForceFeeback = solverMemory.m_jointFeebackForce; // dgInt32* const accelIsMotor = solverMemory.m_accelIsMotor; for (dgInt32 k = 0; k < jointCount; k++) { dgInt32 index = constraintArray[k].m_autoPairstart; dgInt32 count = constraintArray[k].m_autoPaircount; dgInt32 m0 = constraintArray[k].m_m0; dgInt32 m1 = constraintArray[k].m_m1; NEWTON_ASSERT(m0 >= 0); NEWTON_ASSERT(m0 < bodyCount); dgBody *const body0 = bodyArray[m0].m_body; dgFloat32 invMass0 = body0->m_invMass[3]; const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix; NEWTON_ASSERT(m1 >= 0); NEWTON_ASSERT(m1 < bodyCount); dgBody *const body1 = bodyArray[m1].m_body; dgFloat32 invMass1 = body1->m_invMass[3]; const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix; for (dgInt32 i = 0; i < count; i++) { // dgFloat32 extenalAcceleration; JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale(invMass0); JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector( Jt[index].m_jacobian_IM0.m_angular); dgVector tmpDiag( JMinv[index].m_jacobian_IM0.m_linear.CompProduct( Jt[index].m_jacobian_IM0.m_linear)); tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct( Jt[index].m_jacobian_IM0.m_angular); dgVector tmpAccel( JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel)); tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct( body0->m_alpha); JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale(invMass1); JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector( Jt[index].m_jacobian_IM1.m_angular); tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct( Jt[index].m_jacobian_IM1.m_linear); tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct( Jt[index].m_jacobian_IM1.m_angular); tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct( body1->m_accel); tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct( body1->m_alpha); dgFloat32 extenalAcceleration = -(tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z); extAccel[index] = extenalAcceleration; coordenateAccel[index] += extenalAcceleration; force[index] = jointForceFeeback[index][0]; // force[index] = 0.0f; NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f)); NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f)); dgFloat32 stiffness = DG_PSD_DAMP_TOL * diagDamp[index]; dgFloat32 diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z); NEWTON_ASSERT(diag > dgFloat32(0.0f)); diagDamp[index] = diag * stiffness; diag *= (dgFloat32(1.0f) + stiffness); // solverMemory.m_diagJMinvJt[index] = diag; invDJMinvJt[index] = dgFloat32(1.0f) / diag; index++; } } } void dgJacobianMemory::CalculateReactionsForcesSimd(dgInt32 solverMode, dgFloat32 tolerance) const { #ifdef DG_BUILD_SIMD_CODE if (m_jointCount == 0) { // ApplyExternalForcesAndAccelerationSimd (tolerance); ApplyExternalForcesAndAccelerationSimd(0.0f); return; } if (m_jointCount == 1) { CalculateSimpleBodyReactionsForcesSimd(tolerance); ApplyExternalForcesAndAccelerationSimd(tolerance * 0.001f); return; } if (solverMode) { CalculateForcesGameModeSimd(solverMode, tolerance); } else { CalculateForcesSimulationModeSimd(tolerance); } #endif } void dgJacobianMemory::CalculateReactionsForces(dgInt32 solverMode, dgFloat32 tolerance) const { if (m_jointCount == 0) { // ApplyExternalForcesAndAcceleration (tolerance); ApplyExternalForcesAndAcceleration(0.0f); return; } if (m_jointCount == 1) { CalculateSimpleBodyReactionsForces(tolerance); ApplyExternalForcesAndAcceleration(tolerance * 0.001f); return; } if (solverMode) { CalculateForcesGameMode(solverMode, tolerance); } else { CalculateForcesSimulationMode(tolerance); } } void dgJacobianMemory::ApplyExternalForcesAndAccelerationSimd( dgFloat32 tolerance) const { #ifdef DG_BUILD_SIMD_CODE // dgInt32 hasJointFeeback; // simd_type zero; // simd_type absMask; // simd_type timeStep; // simd_type toleranceSimd; dgFloat32 *const force = m_force; dgJacobian *const internalForces = m_internalForces; const dgJacobianPair *const Jt = m_Jt; const dgBodyInfo *const bodyArray = m_bodyArray; const dgJointInfo *const constraintArray = m_constraintArray; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; simd_type zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_bodyCount; i++) { (simd_type &)internalForces[i].m_linear = zero; (simd_type &)internalForces[i].m_angular = zero; } // dgInt32 tmp = 0x7fffffff; dgInt32 hasJointFeeback = 0; // simd_type absMask = simd_set1(*((dgFloat32*)&tmp)); simd_type timeStep = simd_set1(m_timeStep); simd_type toleranceSimd = simd_set1(tolerance); toleranceSimd = simd_mul_s(toleranceSimd, toleranceSimd); for (dgInt32 i = 0; i < m_jointCount; i++) { // dgInt32 m0; // dgInt32 m1; // dgInt32 first; // dgInt32 count; // dgInt32 index; // simd_type y0_linear; // simd_type y0_angular; // simd_type y1_linear; // simd_type y1_angular; dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPaircount; dgInt32 m0 = constraintArray[i].m_m0; dgInt32 m1 = constraintArray[i].m_m1; simd_type y0_linear = zero; simd_type y0_angular = zero; simd_type y1_linear = zero; simd_type y1_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; // val = force[index]; //NEWTON_ASSERT (dgCheckFloat(val)); simd_type val = simd_set1(force[index]); simd_store_s(val, &jointForceFeeback[index][0]); // maxForce = simd_max_s (maxForce, simd_and_v (val, absMask)); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, val); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, val); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, val); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, val); } // if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) { // dgFloat32 force; // simd_store_s (maxForce, &force); // m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, force); // } hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0); // if (constraintArray[i].m_joint->m_updaFeedbackCallback) { // constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex); // } (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); } for (dgInt32 i = 1; i < m_bodyCount; i++) { // simd_type tmp; // simd_type accel; // simd_type alpha; // dgBody* body; dgBody *const body = bodyArray[i].m_body; (simd_type &)body->m_accel = simd_add_v((simd_type &)internalForces[i].m_linear, (simd_type &)body->m_accel); (simd_type &)body->m_alpha = simd_add_v((simd_type &)internalForces[i].m_angular, (simd_type &)body->m_alpha); // dgVector accel (body->m_accel.Scale (body->m_invMass.m_w)); simd_type accel = simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_invMass.m_w)); // dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (body->m_alpha)); simd_type alpha = simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_set1(body->m_alpha.m_x)), (simd_type &)body->m_invWorldInertiaMatrix[1], simd_set1(body->m_alpha.m_y)), (simd_type &)body->m_invWorldInertiaMatrix[2], simd_set1(body->m_alpha.m_z)); // dgVector accel (body->m_accel.Scale (body->m_invMass.m_w)); // dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (body->m_alpha)); // error = accel % accel; // if (error < accelTol2) { // accel = zero; // body->m_accel = zero; // } simd_type tmp = simd_mul_v(accel, accel); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, toleranceSimd); tmp = simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)); accel = simd_andnot_v(accel, tmp); (simd_type &)body->m_accel = simd_andnot_v((simd_type &)body->m_accel, tmp); // error = alpha % alpha; // if (error < accelTol2) { // alpha = zero; // body->m_alpha = zero; // } tmp = simd_mul_v(alpha, alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, toleranceSimd); tmp = simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0)); alpha = simd_andnot_v(alpha, tmp); (simd_type &)body->m_alpha = simd_andnot_v((simd_type &)body->m_alpha, tmp); (simd_type &)body->m_netForce = (simd_type &)body->m_accel; (simd_type &)body->m_netTorque = (simd_type &)body->m_alpha; (simd_type &)body->m_veloc = simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStep); (simd_type &)body->m_omega = simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStep); } if (hasJointFeeback) { for (dgInt32 i = 0; i < m_jointCount; i++) { if (constraintArray[i].m_joint->m_updaFeedbackCallback) { constraintArray[i].m_joint->m_updaFeedbackCallback( *constraintArray[i].m_joint, m_timeStep, m_threadIndex); } } } #endif } void dgJacobianMemory::ApplyExternalForcesAndAcceleration( dgFloat32 tolerance) const { // dgInt32 hasJointFeeback; // dgFloat32 accelTol2; dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgBodyInfo *const bodyArray = m_bodyArray; const dgJointInfo *const constraintArray = m_constraintArray; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; dgJacobian *const internalForces = m_internalForces; dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_bodyCount; i++) { internalForces[i].m_linear = zero; internalForces[i].m_angular = zero; } dgInt32 hasJointFeeback = 0; dgFloat32 accelTol2 = tolerance * tolerance; for (dgInt32 i = 0; i < m_jointCount; i++) { // dgInt32 m0; // dgInt32 m1; // dgInt32 first; // dgInt32 count; // dgInt32 index; // dgFloat32 val; dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPaircount; dgInt32 m0 = constraintArray[i].m_m0; dgInt32 m1 = constraintArray[i].m_m1; dgJacobian y0; dgJacobian y1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; dgFloat32 val = force[index]; NEWTON_ASSERT(dgCheckFloat(val)); jointForceFeeback[index][0] = val; y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val); } // if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) { // m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce); // } // hasJointFeeback |= dgUnsigned32 (constraintArray[i].m_joint->m_updaFeedbackCallback); hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0); internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; } for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; body->m_accel += internalForces[i].m_linear; body->m_alpha += internalForces[i].m_angular; dgVector accel(body->m_accel.Scale(body->m_invMass.m_w)); dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(body->m_alpha)); dgFloat32 error = accel % accel; if (error < accelTol2) { accel = zero; body->m_accel = zero; } error = alpha % alpha; if (error < accelTol2) { alpha = zero; body->m_alpha = zero; } body->m_netForce = body->m_accel; body->m_netTorque = body->m_alpha; body->m_veloc += accel.Scale(m_timeStep); body->m_omega += alpha.Scale(m_timeStep); } if (hasJointFeeback) { for (dgInt32 i = 0; i < m_jointCount; i++) { if (constraintArray[i].m_joint->m_updaFeedbackCallback) { constraintArray[i].m_joint->m_updaFeedbackCallback( reinterpret_cast(constraintArray[i].m_joint), m_timeStep, m_threadIndex); } } } } void dgJacobianMemory::SwapRowsSimd(dgInt32 i, dgInt32 j) const { #ifdef DG_BUILD_SIMD_CODE // simd_type tmp0; // simd_type tmp1; // simd_type tmp2; // simd_type tmp3; // simd_type tmp4; // simd_type tmp5; // simd_type tmp6; // simd_type tmp7; NEWTON_ASSERT(i != j); // Swap (m_Jt[i], m_Jt[j]); dgJacobianPair *const ptr0 = m_Jt; simd_type tmp0 = (simd_type &)ptr0[i].m_jacobian_IM0.m_linear; simd_type tmp1 = (simd_type &)ptr0[i].m_jacobian_IM0.m_angular; simd_type tmp2 = (simd_type &)ptr0[i].m_jacobian_IM1.m_linear; simd_type tmp3 = (simd_type &)ptr0[i].m_jacobian_IM1.m_angular; simd_type tmp4 = (simd_type &)ptr0[j].m_jacobian_IM0.m_linear; simd_type tmp5 = (simd_type &)ptr0[j].m_jacobian_IM0.m_angular; simd_type tmp6 = (simd_type &)ptr0[j].m_jacobian_IM1.m_linear; simd_type tmp7 = (simd_type &)ptr0[j].m_jacobian_IM1.m_angular; (simd_type &)ptr0[j].m_jacobian_IM0.m_linear = tmp0; (simd_type &)ptr0[j].m_jacobian_IM0.m_angular = tmp1; (simd_type &)ptr0[j].m_jacobian_IM1.m_linear = tmp2; (simd_type &)ptr0[j].m_jacobian_IM1.m_angular = tmp3; (simd_type &)ptr0[i].m_jacobian_IM0.m_linear = tmp4; (simd_type &)ptr0[i].m_jacobian_IM0.m_angular = tmp5; (simd_type &)ptr0[i].m_jacobian_IM1.m_linear = tmp6; (simd_type &)ptr0[i].m_jacobian_IM1.m_angular = tmp7; // Swap (m_JMinv[i], m_JMinv[j]); dgJacobianPair *const ptr1 = m_JMinv; tmp0 = (simd_type &)ptr1[i].m_jacobian_IM0.m_linear; tmp1 = (simd_type &)ptr1[i].m_jacobian_IM0.m_angular; tmp2 = (simd_type &)ptr1[i].m_jacobian_IM1.m_linear; tmp3 = (simd_type &)ptr1[i].m_jacobian_IM1.m_angular; tmp4 = (simd_type &)ptr1[j].m_jacobian_IM0.m_linear; tmp5 = (simd_type &)ptr1[j].m_jacobian_IM0.m_angular; tmp6 = (simd_type &)ptr1[j].m_jacobian_IM1.m_linear; tmp7 = (simd_type &)ptr1[j].m_jacobian_IM1.m_angular; (simd_type &)ptr1[j].m_jacobian_IM0.m_linear = tmp0; (simd_type &)ptr1[j].m_jacobian_IM0.m_angular = tmp1; (simd_type &)ptr1[j].m_jacobian_IM1.m_linear = tmp2; (simd_type &)ptr1[j].m_jacobian_IM1.m_angular = tmp3; (simd_type &)ptr1[i].m_jacobian_IM0.m_linear = tmp4; (simd_type &)ptr1[i].m_jacobian_IM0.m_angular = tmp5; (simd_type &)ptr1[i].m_jacobian_IM1.m_linear = tmp6; (simd_type &)ptr1[i].m_jacobian_IM1.m_angular = tmp7; Swap(m_diagDamp[i], m_diagDamp[j]); Swap(m_invDJMinvJt[i], m_invDJMinvJt[j]); // Swap (m_jacobianIndexArray[i], m_jacobianIndexArray[j]); Swap(m_normalForceIndex[i], m_normalForceIndex[j]); // Swap (m_lowerBoundFrictionForce[i], m_lowerBoundFrictionForce[j]); // Swap (m_upperBoundFrictionForce[i], m_upperBoundFrictionForce[j]); Swap(m_lowerBoundFrictionCoefficent[i], m_lowerBoundFrictionCoefficent[j]); Swap(m_upperBoundFrictionCoefficent[i], m_upperBoundFrictionCoefficent[j]); Swap(m_jointFeebackForce[i], m_jointFeebackForce[j]); Swap(m_coordenateAccel[i], m_coordenateAccel[j]); Swap(m_force[i], m_force[j]); Swap(m_accel[i], m_accel[j]); // Swap (m_forceStep[i], m_forceStep[j]); Swap(m_deltaAccel[i], m_deltaAccel[j]); Swap(m_deltaForce[i], m_deltaForce[j]); #else #endif } void dgJacobianMemory::SwapRows(dgInt32 i, dgInt32 j) const { NEWTON_ASSERT(i != j); #define SwapMacro(a, b) Swap(a, b) SwapMacro(m_Jt[i], m_Jt[j]); SwapMacro(m_JMinv[i], m_JMinv[j]); SwapMacro(m_diagDamp[i], m_diagDamp[j]); SwapMacro(m_invDJMinvJt[i], m_invDJMinvJt[j]); // SwapMacro (m_diagJMinvJt[i], m_diagJMinvJt[j]); // SwapMacro (m_frictionThreshold[i], m_frictionThreshold[j]); // SwapMacro (m_jacobianIndexArray[i], m_jacobianIndexArray[j]); // SwapMacro (m_bilateralForceBounds[i], m_bilateralForceBounds[j]); SwapMacro(m_normalForceIndex[i], m_normalForceIndex[j]); // SwapMacro (m_lowerBoundFrictionForce[i], m_lowerBoundFrictionForce[j]); // SwapMacro (m_upperBoundFrictionForce[i], m_upperBoundFrictionForce[j]); SwapMacro(m_lowerBoundFrictionCoefficent[i], m_lowerBoundFrictionCoefficent[j]); SwapMacro(m_upperBoundFrictionCoefficent[i], m_upperBoundFrictionCoefficent[j]); SwapMacro(m_jointFeebackForce[i], m_jointFeebackForce[j]); SwapMacro(m_coordenateAccel[i], m_coordenateAccel[j]); SwapMacro(m_force[i], m_force[j]); SwapMacro(m_accel[i], m_accel[j]); // SwapMacro (m_forceStep[i], m_forceStep[j]); SwapMacro(m_deltaAccel[i], m_deltaAccel[j]); SwapMacro(m_deltaForce[i], m_deltaForce[j]); } void dgJacobianMemory::CalculateSimpleBodyReactionsForcesSimd( dgFloat32 maxAccNorm) const { #ifdef DG_BUILD_SIMD_CODE simd_type accelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type activeRowPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type lowBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type highBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type deltaAccelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type deltaForcePtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; dgFloat32 *const force = m_force; dgFloat32 *const activeRow = (dgFloat32 *)activeRowPtr; dgFloat32 *const lowBound = (dgFloat32 *)lowBoundPtr; dgFloat32 *const highBound = (dgFloat32 *)highBoundPtr; dgFloat32 *const accel = (dgFloat32 *)accelPtr; dgFloat32 *const deltaAccel = (dgFloat32 *)deltaAccelPtr; dgFloat32 *const deltaForce = (dgFloat32 *)deltaForcePtr; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgFloat32 *const coordenateAccel = m_coordenateAccel; const dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent; dgFloatSign tmpIndex; tmpIndex.m_integer.m_iVal = 0x7fffffff; simd_type signMask = simd_set1(tmpIndex.m_fVal); simd_type one = simd_set1(dgFloat32(1.0f)); simd_type zero = simd_set1(dgFloat32(0.0f)); simd_type tol_pos_1eNeg5 = simd_set1(dgFloat32(1.0e-5f)); simd_type tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f)); simd_type tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f)); simd_type tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f)); // dgVector zero (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); dgInt32 count = constraintArray[0].m_autoPaircount; NEWTON_ASSERT(constraintArray[0].m_autoPairstart == 0); dgInt32 roundCount = count & (-DG_SIMD_WORD_SIZE); if (roundCount != count) { roundCount += 4; for (dgInt32 i = count; i < roundCount; i++) { // force[i] = dgFloat32 (0.0f); // accel[i] = dgFloat32 (0.0f); // activeRow[i] = dgFloat32 (0.0f); // deltaAccel[i] = dgFloat32 (0.0f); // deltaForce[i] = dgFloat32 (0.0f); m_normalForceIndex[i] = -1; simd_store_s(zero, &force[i]); simd_store_s(zero, &accel[i]); simd_store_s(zero, &activeRow[i]); simd_store_s(zero, &deltaAccel[i]); simd_store_s(zero, &deltaForce[i]); simd_store_s(one, &m_lowerBoundFrictionCoefficent[i]); simd_store_s(zero, &m_upperBoundFrictionCoefficent[i]); } } // j0 = jacobianIndexArray[0].m_m0; // j1 = jacobianIndexArray[0].m_m1; // dgInt32 m0 = constraintArray[0].m_m0; // dgInt32 m1 = constraintArray[0].m_m1; // dgBody* const body0 = bodyArray[m0].m_body; // dgBody* const body1 = bodyArray[m1].m_body; // ak = dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel)); // tmp3 = simd_mul_add_v (simd_mul_v ((simd_type&)body1->m_accel, (simd_type&)body1->m_accel), // (simd_type&)body0->m_accel, (simd_type&)body0->m_accel); // tmp3 = simd_add_v (tmp3, simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 3, 2))); // tmp3 = simd_add_s (tmp3, simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 0, 1))); // tmp3 = simd_mul_s (tmp3, simd_rsqrt_s(tmp3)); // maxPasses = 0; simd_type tmp2 = zero; // tmp3 = simd_permut_v (tmp3, tmp3, PURMUT_MASK (0, 0, 0, 0)); // for (i = 0; i = 0) { // val = force[k]; // if (val < dgFloat32 (1.0e-2f)) { // val = ak; // } // } else { // val = dgFloat32 (1.0f); // } // normalForce = force[normalForceIndex[i]] ; // normalForce = (normalForce > dgFloat32 (1.0e-2f)) ? normalForce : ak; simd_type normalForce = simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 0]]), simd_load_s(force[normalForceIndex[i + 1]])), simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 2]]), simd_load_s(force[normalForceIndex[i + 3]]))); // test = simd_cmplt_v (normalForce, tol_pos_1eNeg2); // normalForce = simd_or_v (simd_and_v (tmp3, test), simd_andnot_v (normalForce, test)); // lowBound[i] = normalForce * lowerFriction[i]; // highBound[i] = normalForce * upperFriction[i]; (simd_type &)lowBound[i] = simd_mul_v(normalForce, (simd_type &)lowerFriction[i]); (simd_type &)highBound[i] = simd_mul_v(normalForce, (simd_type &)upperFriction[i]); // activeRow[i] = dgFloat32 (1.0f); // if (force[i] < lowBound[i]) { // maxPasses --; // force[i] = lowBound[i]; // activeRow[i] = dgFloat32 (0.0f); // } else if (force[i] > highBound[i]) { // maxPasses --; // force[i] = highBound[i]; // activeRow[i] = dgFloat32 (0.0f); // } // lowBoundTest = force[i] < lowBound[i]; // highBoundTest = force[i] > highBound[i]; // lowBoundTest = simd_cmplt_v ((simd_type&)force[i], (simd_type&)lowBound[i]); // highBoundTest = simd_cmpgt_v ((simd_type&)force[i], (simd_type&)highBound[i]); // test = lowBoundTest | highBoundTest; // test = simd_or_v (lowBoundTest, highBoundTest); simd_type test = simd_or_v(simd_cmplt_v((simd_type &)force[i], (simd_type &)lowBound[i]), simd_cmpgt_v((simd_type &)force[i], (simd_type &)highBound[i])); // activeRow[i] = test ? dgFloat32 (0.0f) : dgFloat32 (1.0f); (simd_type &)activeRow[i] = simd_andnot_v(one, test); // maxPasses += test ? 0 : 1; tmp2 = simd_add_v(tmp2, (simd_type &)activeRow[i]); // force[i] = lowBoundTest ? lowBound[i] : (highBoundTest ? highBound[i] : force[i]); // normalForce = simd_or_v (simd_and_v ((simd_type&)lowBound[i], lowBoundTest), simd_and_v ((simd_type&)highBound[i], highBoundTest)); // (simd_type&)force[i] = simd_or_v (simd_and_v (normalForce, test), simd_andnot_v ((simd_type&)force[i], test)); (simd_type &)force[i] = simd_min_v((simd_type &)highBound[i], simd_max_v((simd_type &)force[i], (simd_type &)lowBound[i])); } tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2)); dgInt32 maxPasses = simd_store_is(simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1)))); // y0.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y0.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y1.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y1.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); simd_type y0_linear = zero; simd_type y0_angular = zero; simd_type y1_linear = zero; simd_type y1_angular = zero; for (dgInt32 i = 0; i < count; i++) { // force = m_force[i]; simd_type tmp1 = simd_set1(force[i]); // y0.m_linear += m_Jt[i].m_jacobian_IM0.m_linear.Scale (force); // y0.m_angular += m_Jt[i].m_jacobian_IM0.m_angular.Scale (force); // y1.m_linear += m_Jt[i].m_jacobian_IM1.m_linear.Scale (force); // y1.m_angular += m_Jt[i].m_jacobian_IM1.m_angular.Scale (force); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[i].m_jacobian_IM0.m_linear, tmp1); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[i].m_jacobian_IM0.m_angular, tmp1); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[i].m_jacobian_IM1.m_linear, tmp1); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[i].m_jacobian_IM1.m_angular, tmp1); } // akNum = dgFloat32 (0.0f); // accNorm = dgFloat32(0.0f); simd_type tmp0 = zero; simd_type tmp1 = zero; for (dgInt32 i = 0; i < count; i++) { // dgVector acc (m_JMinv[i].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // acc += m_JMinv[i].m_jacobian_IM0.m_angular.CompProduct (y0.m_angular); // acc += m_JMinv[i].m_jacobian_IM1.m_linear.CompProduct (y1.m_linear); // acc += m_JMinv[i].m_jacobian_IM1.m_angular.CompProduct (y1.m_angular); simd_type tmp2 = simd_mul_v((simd_type &)JMinv[i].m_jacobian_IM0.m_linear, y0_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM0.m_angular, y0_angular); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM1.m_linear, y1_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[i].m_jacobian_IM1.m_angular, y1_angular); // m_accel[i] = m_coordenateAccel[i] - acc.m_x - acc.m_y - acc.m_z - m_force[i] * m_diagDamp[i]; tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1))); tmp2 = simd_sub_s(simd_load_s(coordenateAccel[i]), simd_mul_add_s(tmp2, simd_load_s(force[i]), simd_load_s(diagDamp[i]))); simd_store_s(tmp2, &accel[i]); // m_deltaForce[i] = m_accel[i] * m_invDJMinvJt[i] * activeRow[i]; simd_type tmp3 = simd_mul_s(tmp2, simd_mul_s(simd_load_s(invDJMinvJt[i]), simd_load_s(activeRow[i]))); simd_store_s(tmp3, &deltaForce[i]); // akNum += m_accel[i] * m_deltaForce[i]; tmp0 = simd_mul_add_v(tmp0, tmp3, tmp2); // accNorm = GetMax (dgAbsf (m_accel[i] * activeRow[i]), accNorm); tmp1 = simd_max_s(tmp1, simd_and_v(simd_mul_v(tmp2, simd_load_s(activeRow[i])), signMask)); } dgFloat32 akNum; dgFloat32 accNorm; simd_store_s(tmp0, &akNum); simd_store_s(tmp1, &accNorm); for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) { // dgInt32 clampedForceIndex; // dgFloat32 ak; // dgFloat32 clampedForceIndexValue; // MatrixTimeVector (forceRows, m_deltaForce, m_deltaAccel); // y0.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y0.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y1.m_linear = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); // y1.m_angular = dgVector (dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f), dgFloat32 (0.0f)); simd_type y0_linear = zero; simd_type y0_angular = zero; simd_type y1_linear = zero; simd_type y1_angular = zero; for (dgInt32 k = 0; k < count; k++) { // ak = m_deltaForce[k]; simd_type tmp1 = simd_set1(deltaForce[k]); // y0.m_linear += m_Jt[k].m_jacobian_IM0.m_linear.Scale (ak); // y0.m_angular += m_Jt[k].m_jacobian_IM0.m_angular.Scale (ak); // y1.m_linear += m_Jt[k].m_jacobian_IM1.m_linear.Scale (ak); // y1.m_angular += m_Jt[k].m_jacobian_IM1.m_angular.Scale (ak); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[k].m_jacobian_IM0.m_linear, tmp1); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[k].m_jacobian_IM0.m_angular, tmp1); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[k].m_jacobian_IM1.m_linear, tmp1); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[k].m_jacobian_IM1.m_angular, tmp1); } // akDen = dgFloat32 (0.0f); simd_type tmp0 = zero; for (dgInt32 k = 0; k < count; k++) { // dgVector acc (m_JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // acc += m_JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); // acc += m_JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); // acc += m_JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); simd_type tmp2 = simd_mul_v((simd_type &)JMinv[k].m_jacobian_IM0.m_linear, y0_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM0.m_angular, y0_angular); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM1.m_linear, y1_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[k].m_jacobian_IM1.m_angular, y1_angular); // m_deltaAccel[k] = acc.m_x + acc.m_y + acc.m_z + m_deltaForce[k] * m_diagDamp[k]; simd_type tmp1 = simd_load_s(deltaForce[k]); tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1))); tmp2 = simd_mul_add_s(tmp2, tmp1, simd_load_s(diagDamp[k])); simd_store_s(tmp2, &deltaAccel[k]); // akDen += m_deltaAccel[k] * m_deltaForce[k]; tmp0 = simd_mul_add_v(tmp0, tmp2, tmp1); } //NEWTON_ASSERT (akDen > dgFloat32 (0.0f)); // akDen = GetMax (akDen, dgFloat32(1.0e-16f)); //NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f)); // ak = akNum / akDen; tmp0 = simd_div_s(simd_load_s(akNum), simd_max_s(tmp0, tol_pos_1eNeg16)); // simd_type min_index; // simd_type minClampIndex; // simd_type min_index_step; // simd_type campedIndexValue; simd_type campedIndexValue = zero; simd_type minClampIndex = simd_set1(dgFloat32(-1.0f)); simd_type min_index_step = simd_set1(dgFloat32(4.0f)); tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0)); simd_type min_index = simd_set(dgFloat32(0.0f), dgFloat32(1.0f), dgFloat32(2.0f), dgFloat32(3.0f)); // for (k = 0; k < roundCount; k ++) { for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // simd_type val; // simd_type num; // simd_type den; // simd_type test; // simd_type negTest; // simd_type posTest; // simd_type negValTest; // simd_type posValTest; // simd_type negDeltaForceTest; // simd_type posDeltaForceTest; // Make sure AK is not negative // val = force[k] + ak * deltaForce[k]; simd_type val = simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]); // negValTest = val < lowBound[k]; simd_type negValTest = simd_cmplt_v(val, (simd_type &)lowBound[k]); // posValTest = val > highBound[k]; simd_type posValTest = simd_cmpgt_v(val, (simd_type &)highBound[k]); // negDeltaForceTest = deltaForce[k] < dgFloat32 (-1.0e-16f); simd_type negDeltaForceTest = simd_cmplt_v((simd_type &)deltaForce[k], tol_neg_1eNeg16); // posDeltaForceTest = deltaForce[k] > dgFloat32 ( 1.0e-16f); simd_type posDeltaForceTest = simd_cmpgt_v((simd_type &)deltaForce[k], tol_pos_1eNeg16); // negTest = negValTest & negDeltaForceTest; simd_type negTest = simd_and_v(negValTest, negDeltaForceTest); // posTest = posValTest & posDeltaForceTest; simd_type posTest = simd_and_v(posValTest, posDeltaForceTest); // test = negTest | posTest; simd_type test = simd_or_v(negTest, posTest); // num = negTest ? lowBound[k] : (posTest ? highBound[k] : force[k]); simd_type num = simd_or_v(simd_and_v((simd_type &)lowBound[k], negTest), simd_and_v((simd_type &)highBound[k], posTest)); num = simd_or_v(simd_and_v(num, test), simd_andnot_v((simd_type &)force[k], test)); // den = test ? deltaForce[k] : dgFloat32 (1.0f); simd_type den = simd_or_v(simd_and_v((simd_type &)deltaForce[k], test), simd_andnot_v(one, test)); // test = test & (activeRow[k] > dgFloat32 (0.0f)); test = simd_and_v(test, simd_cmpgt_v((simd_type &)activeRow[k], zero)); // NEWTON_ASSERT (dgAbsf (den) > 1.0e-16f); // ak = test ? (num - force[k]) / den : ak; tmp0 = simd_or_v(simd_div_v(simd_sub_v(num, (simd_type &)force[k]), den), simd_andnot_v(tmp0, test)); // ak = (ak < dgFloat32 (1.0e-8f)) ? dgFloat32 (0.0f) : ak; tmp0 = simd_and_v(tmp0, simd_cmpgt_v(tmp0, tol_pos_1eNeg8)); // clampedForceIndex = test ? k : clampedForceIndex; minClampIndex = simd_or_v(simd_and_v(min_index, test), simd_andnot_v(minClampIndex, test)); min_index = simd_add_v(min_index, min_index_step); // clampedForceIndexValue = test ? num : clampedForceIndexValue; campedIndexValue = simd_or_v(simd_and_v(num, test), simd_andnot_v(campedIndexValue, test)); } simd_type tmp2 = simd_move_hl_v(tmp0, tmp0); simd_type tmp3 = simd_cmplt_v(tmp0, tmp2); tmp0 = simd_min_v(tmp0, tmp2); minClampIndex = simd_or_v(simd_and_v(minClampIndex, tmp3), simd_andnot_v(simd_move_hl_v(minClampIndex, minClampIndex), tmp3)); campedIndexValue = simd_or_v(simd_and_v(campedIndexValue, tmp3), simd_andnot_v(simd_move_hl_v(campedIndexValue, campedIndexValue), tmp3)); tmp2 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1)); tmp3 = simd_cmplt_s(tmp0, tmp2); tmp0 = simd_min_s(tmp0, tmp2); minClampIndex = simd_or_v(simd_and_v(minClampIndex, tmp3), simd_andnot_v(simd_permut_v(minClampIndex, minClampIndex, PURMUT_MASK(0, 0, 0, 1)), tmp3)); campedIndexValue = simd_or_v(simd_and_v(campedIndexValue, tmp3), simd_andnot_v(simd_permut_v(campedIndexValue, campedIndexValue, PURMUT_MASK(0, 0, 0, 1)), tmp3)); dgFloat32 ak; simd_store_s(tmp0, &ak); dgInt32 clampedForceIndex = simd_store_is(minClampIndex); dgFloat32 clampedForceIndexValue; simd_store_s(campedIndexValue, &clampedForceIndexValue); tmp2 = zero; tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0)); // if (ak == dgFloat32 (0.0f)) { if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) { NEWTON_ASSERT(clampedForceIndex != -1); // akNum = dgFloat32 (0.0f); // accNorm = dgFloat32(0.0f); simd_type tmp1 = zero; // tmp2 = zero; simd_type tmp3 = zero; // for (k = 0; k < count; k ++) { // for (k = 0; k < roundCount; k ++) { activeRow[clampedForceIndex] = dgFloat32(0.0f); deltaForce[clampedForceIndex] = dgFloat32(0.0f); force[clampedForceIndex] = clampedForceIndexValue; for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // simd_type val_k; // simd_type test_0; // simd_type test_1; // simd_type accel_k; // simd_type force_k; // bool test0; // bool test1; // dgFloat32 val; simd_type accel_k = (simd_type &)accel[k]; simd_type force_k = (simd_type &)force[k]; // val = dgAbsf (lowBound[k] - force[k]); simd_type val_k = simd_and_v(simd_sub_v((simd_type &)lowBound[k], force_k), signMask); // test0 = (val < dgFloat32 (1.0e-5f)) & (accel[k] < dgFloat32 (0.0f)); simd_type test_0 = simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmplt_v(accel_k, zero)); // val = dgAbsf (highBound[k] - force[k]); val_k = simd_and_v(simd_sub_v((simd_type &)highBound[k], force_k), signMask); // test1 = (val < dgFloat32 (1.0e-5f)) & (accel[k] > dgFloat32 (0.0f)); simd_type test_1 = simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmpgt_v(accel_k, zero)); // force[k] = test0 ? lowBound[k] : (test1 ? highBound[k] : force[k]); // val_k = simd_or_v (simd_and_v ((simd_type&)lowBound[k], test_0), simd_and_v ((simd_type&)highBound[k], test_1)); //(simd_type&) force[k] = simd_or_v (simd_and_v (val_k, test_2) , simd_andnot_v (force_k, test_2)); (simd_type &)force[k] = simd_min_v((simd_type &)highBound[k], simd_max_v((simd_type &)force[k], (simd_type &)lowBound[k])); // activeRow[k] *= (test0 | test1) ? dgFloat32 (0.0f) : dgFloat32 (1.0f); // test_2 = simd_or_v (test_0, test_1); (simd_type &)activeRow[k] = simd_mul_v((simd_type &)activeRow[k], simd_andnot_v(one, simd_or_v(test_0, test_1))); // deltaForce[k] = accel[k] * invDJMinvJt[k] * activeRow[k]; (simd_type &)deltaForce[k] = simd_mul_v(accel_k, simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k])); // akNum += accel[k] * deltaForce[k]; tmp1 = simd_mul_add_v(tmp1, (simd_type &)deltaForce[k], accel_k); // accNorm = GetMax (dgAbsf (accel[k] * activeRow[k]), accNorm); tmp2 = simd_max_v(tmp2, simd_and_v(simd_mul_v(accel_k, (simd_type &)activeRow[k]), signMask)); // maxPases += 1; tmp3 = simd_add_v(tmp3, (simd_type &)activeRow[k]); } tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1)); tmp1 = simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp1, &akNum); tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp2, &accNorm); i = -1; // maxPasses = GetMax (maxPasses - 1, 1); tmp3 = simd_add_v(tmp3, simd_move_hl_v(tmp3, tmp3)); maxPasses = simd_store_is(simd_add_s(tmp3, simd_permut_v(tmp3, tmp3, PURMUT_MASK(0, 0, 0, 1)))); } else if (clampedForceIndex >= 0) { // akNum = dgFloat32(0.0f); // accNorm = dgFloat32(0.0f); tmp1 = zero; activeRow[clampedForceIndex] = dgFloat32(0.0f); // for (k = 0; k < count; k ++) { for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // m_force[k] += ak * m_deltaForce[k]; // m_accel[k] -= ak * m_deltaAccel[k]; (simd_type &)force[k] = simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]); (simd_type &)accel[k] = simd_mul_sub_v((simd_type &)accel[k], tmp0, (simd_type &)deltaAccel[k]); // accNorm = GetMax (dgAbsf (m_accel[k] * activeRow[k]), accNorm); tmp2 = simd_max_v(tmp2, simd_and_v(simd_mul_v((simd_type &)accel[k], (simd_type &)activeRow[k]), signMask)); //NEWTON_ASSERT (dgCheckFloat(m_force[k])); //NEWTON_ASSERT (dgCheckFloat(m_accel[k])); // m_deltaForce[k] = m_accel[k] * m_invDJMinvJt[k] * activeRow[k]; (simd_type &)deltaForce[k] = simd_mul_v((simd_type &)accel[k], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k])); // akNum += m_deltaForce[k] * m_accel[k]; tmp1 = simd_mul_add_v(tmp1, (simd_type &)deltaForce[k], (simd_type &)accel[k]); } tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1)); tmp1 = simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp1, &akNum); tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp2, &accNorm); force[clampedForceIndex] = clampedForceIndexValue; i = -1; maxPasses = GetMax(maxPasses - 1, 1); } else { // accNorm = dgFloat32(0.0f); // for (k = 0; k < count; k ++) { for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // m_force[k] += ak * m_deltaForce[k]; // m_accel[k] -= ak * m_deltaAccel[k]; (simd_type &)force[k] = simd_mul_add_v((simd_type &)force[k], tmp0, (simd_type &)deltaForce[k]); (simd_type &)accel[k] = simd_mul_sub_v((simd_type &)accel[k], tmp0, (simd_type &)deltaAccel[k]); // accNorm = GetMax (dgAbsf (m_accel[k] * activeRow[k]), accNorm); tmp2 = simd_max_v(tmp2, simd_and_v(simd_mul_v((simd_type &)accel[k], (simd_type &)activeRow[k]), signMask)); } tmp2 = simd_max_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_max_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp2, &accNorm); if (accNorm > maxAccNorm) { // akDen = akNum; // akNum = dgFloat32(0.0f); tmp1 = simd_set1(akNum); tmp0 = zero; // for (k = 0; k < count; k ++) { for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // m_deltaAccel[k] = m_accel[k] * m_invDJMinvJt[k] * activeRow[k]; (simd_type &)deltaAccel[k] = simd_mul_v((simd_type &)accel[k], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[k])); // akNum += m_accel[k] * m_deltaAccel[k]; tmp0 = simd_mul_add_v(tmp0, (simd_type &)accel[k], (simd_type &)deltaAccel[k]); } tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0)); tmp0 = simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp0, &akNum); //NEWTON_ASSERT (bk > dgFloat32(0.0f)); // bk = GetMax (bk, dgFloat32 (1.0e-17f)); tmp1 = simd_max_s(tmp1, simd_set1(dgFloat32(1.0e-17f))); // ak = dgFloat32 (akNum / akDen); tmp0 = simd_div_s(tmp0, tmp1); tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0)); // for (k = 0; k < count; k ++) { for (dgInt32 k = 0; k < roundCount; k += DG_SIMD_WORD_SIZE) { // m_deltaForce[k] = m_deltaAccel[k] + ak * m_deltaForce[k]; (simd_type &)deltaForce[k] = simd_mul_add_v((simd_type &)deltaAccel[k], tmp0, (simd_type &)deltaForce[k]); } } } } #else #endif } void dgJacobianMemory::CalculateSimpleBodyReactionsForces( dgFloat32 maxAccNorm) const { dgFloat32 accel[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 activeRow[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 lowBound[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 highBound[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 deltaForce[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 deltaAccel[DG_CONSTRAINT_MAX_ROWS]; // dgFloat32* const accel = m_accel; dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgFloat32 *const coordenateAccel = m_coordenateAccel; const dgJointInfo *const constraintArray = m_constraintArray; dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); dgInt32 count = constraintArray[0].m_autoPaircount; NEWTON_ASSERT(constraintArray[0].m_autoPairstart == 0); // ak = dgFloat32 (3.0f) * dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel)) / dgFloat32 (count); // ak = dgSqrt((body1->m_accel % body1->m_accel) + (body0->m_accel % body0->m_accel)); // dTrace (("%f\n", ak)); dgInt32 maxPasses = count; for (dgInt32 i = 0; i < count; i++) { // k = bilateralForceBounds[i].m_normalIndex; dgInt32 k = normalForceIndex[i]; // force = (k >= 0) ? m_force[k] : dgFloat32 (1.0f); NEWTON_ASSERT( ((k < 0) && (force[k] == dgFloat32(1.0f))) || ((k >= 0) && (force[k] >= dgFloat32(0.0f)))); dgFloat32 val = force[k]; // if (dgAbsf (val) < dgFloat32 (1.0e-2f)) { // val = ak; // } // lowBound[i] = val * bilateralForceBounds[i].m_low; // highBound[i] = val * bilateralForceBounds[i].m_upper; lowBound[i] = val * lowerFriction[i]; highBound[i] = val * upperFriction[i]; activeRow[i] = dgFloat32(1.0f); if (force[i] < lowBound[i]) { maxPasses--; force[i] = lowBound[i]; activeRow[i] = dgFloat32(0.0f); } else if (force[i] > highBound[i]) { maxPasses--; force[i] = highBound[i]; activeRow[i] = dgFloat32(0.0f); } } dgJacobian y0; dgJacobian y1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 i = 0; i < count; i++) { dgFloat32 val = force[i]; y0.m_linear += Jt[i].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[i].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[i].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[i].m_jacobian_IM1.m_angular.Scale(val); } dgFloat32 akNum = dgFloat32(0.0f); dgFloat32 accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < count; i++) { dgVector acc(JMinv[i].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); acc += JMinv[i].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); acc += JMinv[i].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); acc += JMinv[i].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); accel[i] = coordenateAccel[i] - acc.m_x - acc.m_y - acc.m_z - force[i] * diagDamp[i]; deltaForce[i] = accel[i] * invDJMinvJt[i] * activeRow[i]; akNum += accel[i] * deltaForce[i]; accNorm = GetMax(dgAbsf(accel[i] * activeRow[i]), accNorm); } dgFloat32 clampedForceIndexValue = dgFloat32(0.0f); for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) { // MatrixTimeVector (forceRows, m_deltaForce, m_deltaAccel); y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 k = 0; k < count; k++) { dgFloat32 val = deltaForce[k]; y0.m_linear += Jt[k].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[k].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[k].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[k].m_jacobian_IM1.m_angular.Scale(val); } dgFloat32 akDen = dgFloat32(0.0f); for (dgInt32 k = 0; k < count; k++) { dgVector acc(JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); acc += JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); acc += JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); acc += JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); // m_accel[i0] = m_coordenateAccel[i0] - acc.m_x - acc.m_y - acc.m_z - m_force[i0] * m_diagDamp[i0]; deltaAccel[k] = acc.m_x + acc.m_y + acc.m_z + deltaForce[k] * diagDamp[k]; akDen += deltaAccel[k] * deltaForce[k]; } NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-16f)); NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f)); dgFloat32 ak = akNum / akDen; dgInt32 clampedForceIndex = -1; for (dgInt32 k = 0; k < count; k++) { if (activeRow[k]) { dgFloat32 val = force[k] + ak * deltaForce[k]; if (deltaForce[k] < dgFloat32(-1.0e-16f)) { if (val < lowBound[k]) { ak = GetMax((lowBound[k] - force[k]) / deltaForce[k], dgFloat32(0.0f)); clampedForceIndex = k; clampedForceIndexValue = lowBound[k]; if (ak < dgFloat32(1.0e-8f)) { ak = dgFloat32(0.0f); break; } } } else if (deltaForce[k] > dgFloat32(1.0e-16f)) { if (val >= highBound[k]) { ak = GetMax((highBound[k] - force[k]) / deltaForce[k], dgFloat32(0.0f)); ; clampedForceIndex = k; clampedForceIndexValue = highBound[k]; if (ak < dgFloat32(1.0e-8f)) { ak = dgFloat32(0.0f); break; } } } } } // if (ak == dgFloat32 (0.0f)) { if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) { NEWTON_ASSERT(clampedForceIndex != -1); akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); activeRow[clampedForceIndex] = dgFloat32(0.0f); deltaForce[clampedForceIndex] = dgFloat32(0.0f); force[clampedForceIndex] = clampedForceIndexValue; for (dgInt32 k = 0; k < count; k++) { if (activeRow[k]) { dgFloat32 val; val = lowBound[k] - force[k]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[k] < dgFloat32(0.0f))) { force[k] = lowBound[k]; activeRow[k] = dgFloat32(0.0f); deltaForce[k] = dgFloat32(0.0f); } else { val = highBound[k] - force[k]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[k] > dgFloat32(0.0f))) { force[k] = highBound[k]; activeRow[k] = dgFloat32(0.0f); deltaForce[k] = dgFloat32(0.0f); } else { NEWTON_ASSERT(activeRow[k] > dgFloat32(0.0f)); deltaForce[k] = accel[k] * invDJMinvJt[k]; akNum += accel[k] * deltaForce[k]; accNorm = GetMax(dgAbsf(accel[k]), accNorm); } } } } i = -1; maxPasses = GetMax(maxPasses - 1, dgInt32(1)); } else if (clampedForceIndex >= 0) { akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); activeRow[clampedForceIndex] = dgFloat32(0.0f); for (dgInt32 k = 0; k < count; k++) { force[k] += ak * deltaForce[k]; accel[k] -= ak * deltaAccel[k]; accNorm = GetMax(dgAbsf(accel[k] * activeRow[k]), accNorm); NEWTON_ASSERT(dgCheckFloat(force[k])); NEWTON_ASSERT(dgCheckFloat(accel[k])); deltaForce[k] = accel[k] * invDJMinvJt[k] * activeRow[k]; akNum += deltaForce[k] * accel[k]; } force[clampedForceIndex] = clampedForceIndexValue; i = -1; maxPasses = GetMax(maxPasses - 1, dgInt32(1)); } else { accNorm = dgFloat32(0.0f); for (dgInt32 k = 0; k < count; k++) { force[k] += ak * deltaForce[k]; accel[k] -= ak * deltaAccel[k]; accNorm = GetMax(dgAbsf(accel[k] * activeRow[k]), accNorm); NEWTON_ASSERT(dgCheckFloat(force[k])); NEWTON_ASSERT(dgCheckFloat(accel[k])); } if (accNorm > maxAccNorm) { akDen = akNum; akNum = dgFloat32(0.0f); for (dgInt32 k = 0; k < count; k++) { deltaAccel[k] = accel[k] * invDJMinvJt[k] * activeRow[k]; akNum += accel[k] * deltaAccel[k]; } NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-17f)); ak = dgFloat32(akNum / akDen); for (dgInt32 k = 0; k < count; k++) { deltaForce[k] = deltaAccel[k] + ak * deltaForce[k]; } } } } } void dgJacobianMemory::CalculateForcesSimulationModeSimd( dgFloat32 maxAccNorm) const { #ifdef DG_BUILD_SIMD_CODE dgInt32 passes; dgInt32 prevJoint; dgInt32 maxPasses; dgInt32 forceRows; dgInt32 roundBodyCount; dgInt32 totalPassesCount; dgFloatSign tmpIndex; dgFloat32 akNum; dgFloat32 accNorm; dgFloat32 *const force = m_force; dgFloat32 *const accel = m_accel; const dgJacobianPair *const Jt = m_Jt; dgFloat32 *const deltaAccel = m_deltaAccel; dgFloat32 *const deltaForce = m_deltaForce; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; dgJacobian *const internalForces = m_internalForces; dgInt32 *const normalForceIndex = m_normalForceIndex; dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const coordenateAccel = m_coordenateAccel; dgFloat32 *const lowerForceBound = m_lowerBoundFrictionCoefficent; dgFloat32 *const upperForceBound = m_upperBoundFrictionCoefficent; simd_type forceStepPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; dgFloat32 *const forceStep = (dgFloat32 *)forceStepPtr; simd_type one; simd_type zero; simd_type signMask; simd_type tol_pos_1eNeg8; simd_type tol_pos_1eNeg16; simd_type tol_neg_1eNeg16; zero = simd_set1(dgFloat32(0.0f)); roundBodyCount = m_bodyCount & -4; for (dgInt32 i = 0; i < roundBodyCount; i += 4) { ((simd_type &)internalForces[i + 0].m_linear) = zero; ((simd_type &)internalForces[i + 0].m_angular) = zero; ((simd_type &)internalForces[i + 1].m_linear) = zero; ((simd_type &)internalForces[i + 1].m_angular) = zero; ((simd_type &)internalForces[i + 2].m_linear) = zero; ((simd_type &)internalForces[i + 2].m_angular) = zero; ((simd_type &)internalForces[i + 3].m_linear) = zero; ((simd_type &)internalForces[i + 3].m_angular) = zero; } for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) { ((simd_type &)internalForces[i].m_linear) = zero; ((simd_type &)internalForces[i].m_angular) = zero; } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 first; dgInt32 count; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (j = 0; j < count; j++) { simd_type tmp0; index = first + j; // force = m_force[index]; tmp0 = simd_set1(force[index]); // y0.m_linear += m_Jt[index].m_jacobian_IM0.m_linear.Scale (m_force[index]); // y0.m_angular += m_Jt[index].m_jacobian_IM0.m_angular.Scale (m_force[index]); // y1.m_linear += m_Jt[index].m_jacobian_IM1.m_linear.Scale (m_force[index]); // y1.m_angular += m_Jt[index].m_jacobian_IM1.m_angular.Scale (m_force[index]); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0); } // m_internalForces[j0] = y0; // m_internalForces[j1] = y1; (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); } for (dgInt32 i = 0; i < DG_CONSTRAINT_MAX_ROWS; i += (DG_SIMD_WORD_SIZE * 4)) { (simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 0] = zero; (simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 1] = zero; (simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 2] = zero; (simd_type &)forceStep[i + DG_SIMD_WORD_SIZE * 3] = zero; } maxPasses = 4; prevJoint = 0; accNorm = maxAccNorm * dgFloat32(2.0f); for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { accNorm = dgFloat32(0.0f); for (dgInt32 currJoint = 0; currJoint < m_jointCount; currJoint++) { dgInt32 m0; dgInt32 m1; dgInt32 index; // dgInt32 currJoint; dgInt32 rowsCount; dgFloat32 jointAccel; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; index = constraintArray[prevJoint].m_autoPairstart; rowsCount = constraintArray[prevJoint].m_autoPaircount; m0 = constraintArray[prevJoint].m_m0; m1 = constraintArray[prevJoint].m_m1; y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (dgInt32 i = 0; i < rowsCount; i++) { // dgFloat32 deltaForce; // deltaForce = forceStep[i]; // y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (deltaForce); // y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (deltaForce); // y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (deltaForce); // y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (deltaForce); simd_type deltaForce; deltaForce = simd_set1(forceStep[i]); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, deltaForce); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, deltaForce); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, deltaForce); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, deltaForce); index++; } (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); // currJoint = jointRemapArray[j]; index = constraintArray[currJoint].m_autoPairstart; rowsCount = constraintArray[currJoint].m_autoPaircount; m0 = constraintArray[currJoint].m_m0; m1 = constraintArray[currJoint].m_m1; // y0 = internalForces[m0]; // y1 = internalForces[m1]; y0_linear = (simd_type &)internalForces[m0].m_linear; y0_angular = (simd_type &)internalForces[m0].m_angular; y1_linear = (simd_type &)internalForces[m1].m_linear; y1_angular = (simd_type &)internalForces[m1].m_angular; for (dgInt32 i = 0; i < rowsCount; i++) { simd_type tmpAccel; // dgVector acc (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct (y0.m_angular); // acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct (y1.m_linear); // acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct (y1.m_angular); tmpAccel = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular); // accel[i] = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index]; tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel)); tmpAccel = simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1))); tmpAccel = simd_mul_add_s(tmpAccel, simd_load_s(force[index]), simd_load_s(diagDamp[index])); simd_store_s(simd_sub_s(simd_load_s(coordenateAccel[index]), tmpAccel), &accel[i]); index++; } jointAccel = CalculateJointForcesSimd(currJoint, forceStep, maxAccNorm); accNorm = GetMax(accNorm, jointAccel); prevJoint = currJoint; } } one = simd_set1(dgFloat32(1.0f)); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first; dgInt32 count; dgInt32 index; dgInt32 roundCount; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; roundCount = count & (-DG_SIMD_WORD_SIZE); if (roundCount != count) { roundCount += DG_SIMD_WORD_SIZE; for (dgInt32 k = count; k < roundCount; k++) { dgInt32 j; j = first + k; // force[j] = dgFloat32 (0.0f); // accel[j] -= dgFloat32 (0.0f); // deltaAccel[j] = dgFloat32 (0.0f); // deltaForce[j] = dgFloat32 (0.0f); normalForceIndex[j] = -1; simd_store_s(zero, &force[j]); simd_store_s(zero, &accel[j]); simd_store_s(zero, &deltaAccel[j]); simd_store_s(zero, &deltaForce[j]); simd_store_s(one, &lowerForceBound[j]); simd_store_s(zero, &upperForceBound[j]); } } for (dgInt32 k = 0; k < count; k += DG_SIMD_WORD_SIZE) { simd_type normalForce; index = first + k; // dgBilateralBounds& forceBounds = bilateralForceBounds[index]; // j = normalForceIndex[index]; //NEWTON_ASSERT (((j < 0) && (force[j] == dgFloat32 (1.0f))) || ((j >= 0) && (force[j] >= dgFloat32 (0.0f)))); // val = GetMax (force[j], dgFloat32(0.0f)); // val = force[j]; normalForce = simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[index + 0]]), simd_load_s(force[normalForceIndex[index + 1]])), simd_pack_lo_v(simd_load_s(force[normalForceIndex[index + 2]]), simd_load_s(force[normalForceIndex[index + 3]]))); // lowerForceBound[index] *= val; // upperForceBound[index] *= val; (simd_type &)lowerForceBound[index] = simd_mul_v(normalForce, (simd_type &)lowerForceBound[index]); (simd_type &)upperForceBound[index] = simd_mul_v(normalForce, (simd_type &)upperForceBound[index]); // force[index] = ClampValue(force[index], lowerForceBound[index], upperForceBound[index]); (simd_type &)force[index] = simd_min_v((simd_type &)upperForceBound[index], simd_max_v((simd_type &)force[index], (simd_type &)lowerForceBound[index])); } } for (dgInt32 i = 0; i < roundBodyCount; i += 4) { (simd_type &)internalForces[i + 0].m_linear = zero; (simd_type &)internalForces[i + 0].m_angular = zero; (simd_type &)internalForces[i + 1].m_linear = zero; (simd_type &)internalForces[i + 1].m_angular = zero; (simd_type &)internalForces[i + 2].m_linear = zero; (simd_type &)internalForces[i + 2].m_angular = zero; (simd_type &)internalForces[i + 3].m_linear = zero; (simd_type &)internalForces[i + 3].m_angular = zero; } for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) { (simd_type &)internalForces[i].m_linear = zero; (simd_type &)internalForces[i].m_angular = zero; } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; // dgFloat32 val; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; // dgJacobian y0 (internalForces[k0]); // dgJacobian y1 (internalForces[k1]); y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (dgInt32 j = 0; j < count; j++) { simd_type tmp0; index = j + first; // val = force[index]; tmp0 = simd_set1(force[index]); // y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val); // y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val); // y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val); // y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0); } // internalForces[k0] = y0; // internalForces[k1] = y1; (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); } forceRows = 0; akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; dgInt32 activeCount; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; bool isClamped[DG_CONSTRAINT_MAX_ROWS]; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; // const dgJacobian& y0 = internalForces[m0]; // const dgJacobian& y1 = internalForces[m1]; y0_linear = (simd_type &)internalForces[m0].m_linear; y0_angular = (simd_type &)internalForces[m0].m_angular; y1_linear = (simd_type &)internalForces[m1].m_linear; y1_angular = (simd_type &)internalForces[m1].m_angular; for (dgInt32 j = 0; j < count; j++) { simd_type tmp0; index = j + first; // dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); // tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); // tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); tmp0 = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear); tmp0 = simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular); tmp0 = simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear); tmp0 = simd_mul_add_v(tmp0, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular); // accel[index] = coordenateAccel[index] - (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + force[index] * diagDamp[index]); tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0)); tmp0 = simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 3, 3, 1))); tmp0 = simd_sub_s(simd_load_s(coordenateAccel[index]), simd_mul_add_s(tmp0, simd_load_s(force[index]), simd_load_s(diagDamp[index]))); simd_store_s(tmp0, &accel[index]); } activeCount = 0; for (dgInt32 j = 0; j < count; j++) { dgFloat32 val; index = j + first; val = lowerForceBound[index] - force[index]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) { force[index] = lowerForceBound[index]; accel[index] = dgFloat32(0.0f); deltaForce[index] = dgFloat32(0.0f); deltaAccel[index] = dgFloat32(0.0f); isClamped[j] = true; } else { val = upperForceBound[index] - force[index]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) { force[index] = upperForceBound[index]; accel[index] = dgFloat32(0.0f); deltaForce[index] = dgFloat32(0.0f); deltaAccel[index] = dgFloat32(0.0f); isClamped[j] = true; } else { forceRows++; activeCount++; deltaForce[index] = accel[index] * invDJMinvJt[index]; akNum += accel[index] * deltaForce[index]; accNorm = GetMax(dgAbsf(accel[index]), accNorm); isClamped[j] = false; } } } if (activeCount < count) { dgInt32 i0; dgInt32 i1; i0 = 0; i1 = count - 1; constraintArray[i].m_autoPairActiveCount = activeCount; do { while ((i0 <= i1) && !isClamped[i0]) i0++; while ((i0 <= i1) && isClamped[i1]) i1--; if (i0 < i1) { SwapRowsSimd(first + i0, first + i1); i0++; i1--; } } while (i0 < i1); } } // maxPasses = 0x7fffffff; // signMask = simd_set1((dgFloat32&) maxPasses); tmpIndex.m_integer.m_iVal = 0x7fffffff; signMask = simd_set1(tmpIndex.m_fVal); zero = simd_set1(dgFloat32(0.0f)); tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f)); tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f)); tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f)); maxPasses = forceRows; totalPassesCount = 0; for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { dgInt32 clampedForceIndex; dgInt32 clampedForceJoint; // dgFloat32 ak; // dgFloat32 akDen; // dgFloat32 clampedForceIndexValue; simd_type akSimd; simd_type akDenSimd; simd_type clampedForceIndexValue; for (dgInt32 i = 0; i < roundBodyCount; i += 4) { (simd_type &)internalForces[i + 0].m_linear = zero; (simd_type &)internalForces[i + 0].m_angular = zero; (simd_type &)internalForces[i + 1].m_linear = zero; (simd_type &)internalForces[i + 1].m_angular = zero; (simd_type &)internalForces[i + 2].m_linear = zero; (simd_type &)internalForces[i + 2].m_angular = zero; (simd_type &)internalForces[i + 3].m_linear = zero; (simd_type &)internalForces[i + 3].m_angular = zero; } for (dgInt32 i = roundBodyCount; i < m_bodyCount; i++) { (simd_type &)internalForces[i].m_linear = zero; (simd_type &)internalForces[i].m_angular = zero; } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; // dgFloat32 ak; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; // dgJacobian y0 (internalForces[m0]); // dgJacobian y1 (internalForces[m1]); y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (j = 0; j < count; j++) { simd_type tmp0; index = j + first; // ak = deltaForce[index]; tmp0 = simd_set1(deltaForce[index]); // y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (ak); // y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (ak); // y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (ak); // y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (ak); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0); } // internalForces[m0] = y0; // internalForces[m1] = y1; (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); } akDenSimd = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; // const dgJacobian& y0 = internalForces[k0]; // const dgJacobian& y1 = internalForces[k1]; y0_linear = (simd_type &)internalForces[m0].m_linear; y0_angular = (simd_type &)internalForces[m0].m_angular; y1_linear = (simd_type &)internalForces[m1].m_linear; y1_angular = (simd_type &)internalForces[m1].m_angular; for (j = 0; j < count; j++) { simd_type tmp1; simd_type tmp2; index = j + first; // dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); // tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); // tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); tmp2 = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, y0_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, y0_angular); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, y1_linear); tmp2 = simd_mul_add_v(tmp2, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, y1_angular); // deltaAccel[index] = tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + deltaForce[index] * diagDamp[index]; tmp1 = simd_load_s(deltaForce[index]); tmp2 = simd_add_v(tmp2, simd_move_hl_v(tmp2, tmp2)); tmp2 = simd_add_s(tmp2, simd_permut_v(tmp2, tmp2, PURMUT_MASK(3, 3, 3, 1))); tmp2 = simd_mul_add_s(tmp2, tmp1, simd_load_s(diagDamp[index])); simd_store_s(tmp2, &deltaAccel[index]); // akDen += deltaAccel[index] * deltaForce[index]; akDenSimd = simd_mul_add_s(akDenSimd, tmp2, tmp1); } } // NEWTON_ASSERT (akDen > dgFloat32 (0.0f)); // akDen = GetMax (akDen, dgFloat32(1.0e-16f)); // NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f)); // ak = akNum / akDen; akSimd = simd_div_s(simd_load_s(akNum), simd_max_s(akDenSimd, tol_pos_1eNeg16)); clampedForceIndex = -1; clampedForceJoint = -1; clampedForceIndexValue = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; if (simd_store_is(simd_cmpgt_s(akSimd, tol_pos_1eNeg8))) { first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j++) { simd_type tmp1; simd_type test0; simd_type test1; index = j + first; // tmp = force[index] + ak * deltaForce[index]; simd_type force_index; simd_type delta_force; simd_type low_force; simd_type high_force; // Make sure AK is not negative force_index = simd_set1(force[index]); delta_force = simd_set1(deltaForce[index]); low_force = simd_set1(lowerForceBound[index]); high_force = simd_set1(upperForceBound[index]); tmp1 = simd_mul_add_s(force_index, akSimd, delta_force); // test0 = (deltaForce[index] < dgFloat32 (-1.0e-16f)) & (tmp < lowerForceBound[index]); test0 = simd_and_v(simd_cmplt_s(delta_force, tol_neg_1eNeg16), simd_cmplt_s(tmp1, low_force)); // test1 = (deltaForce[index] > dgFloat32 ( 1.0e-16f)) & (tmp > upperForceBound[index]); test1 = simd_and_v(simd_cmpgt_s(delta_force, tol_pos_1eNeg16), simd_cmpgt_s(tmp1, high_force)); // test = test0 | test1; if (simd_store_is(simd_or_v(test0, test1))) { // clampedForceIndexValue = lowerForceBound[index]; clampedForceIndexValue = simd_or_v(simd_and_v(low_force, test0), simd_and_v(high_force, test1)); akSimd = simd_div_s(simd_sub_s(clampedForceIndexValue, force_index), delta_force); // clampedForceIndex = test ? j : clampedForceIndex; // clampedForceJoint = test ? i : clampedForceJoint; clampedForceIndex = j; clampedForceJoint = i; } } } } akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0)); if (clampedForceIndex >= 0) { dgInt32 first; dgInt32 count; dgInt32 activeCount; simd_type tmp0; simd_type tmp1; bool isClamped[DG_CONSTRAINT_MAX_ROWS]; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j += DG_SIMD_WORD_SIZE) { index = j + first; // force[index] += ak * deltaForce[index]; // accel[index] -= ak * deltaAccel[index]; (simd_type &)force[index] = simd_mul_add_v((simd_type &)force[index], akSimd, (simd_type &)deltaForce[index]); (simd_type &)accel[index] = simd_mul_sub_v((simd_type &)accel[index], akSimd, (simd_type &)deltaAccel[index]); } } first = constraintArray[clampedForceJoint].m_autoPairstart; count = constraintArray[clampedForceJoint].m_autoPairActiveCount; count--; // force[first + clampedForceIndex] = clampedForceIndexValue; simd_store_s(clampedForceIndexValue, &force[first + clampedForceIndex]); accel[first + clampedForceIndex] = dgFloat32(0.0f); deltaForce[first + clampedForceIndex] = dgFloat32(0.0f); deltaAccel[first + clampedForceIndex] = dgFloat32(0.0f); if (clampedForceIndex != count) { SwapRowsSimd(first + clampedForceIndex, first + count); } activeCount = count; for (dgInt32 i = 0; i < count; i++) { dgInt32 index; dgFloat32 val; index = first + i; isClamped[i] = false; val = lowerForceBound[index] - force[index]; if ((val > dgFloat32(-1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) { activeCount--; isClamped[i] = true; accel[index] = dgFloat32(0.0f); deltaForce[index] = dgFloat32(0.0f); deltaAccel[index] = dgFloat32(0.0f); } else { val = upperForceBound[index] - force[index]; if ((val < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) { activeCount--; isClamped[i] = true; accel[index] = dgFloat32(0.0f); deltaForce[index] = dgFloat32(0.0f); deltaAccel[index] = dgFloat32(0.0f); } } } if (activeCount < count) { dgInt32 i0; dgInt32 i1; i0 = 0; i1 = count - 1; do { while ((i0 <= i1) && !isClamped[i0]) i0++; while ((i0 <= i1) && isClamped[i1]) i1--; if (i0 < i1) { SwapRowsSimd(first + i0, first + i1); i0++; i1--; } } while (i0 < i1); } constraintArray[clampedForceJoint].m_autoPairActiveCount = activeCount; forceRows = 0; // akNum = dgFloat32 (0.0f); // accNorm = dgFloat32(0.0f); tmp0 = zero; tmp1 = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; forceRows += count; // for (j = 0; j < count; j ++) { for (j = 0; j < count; j += DG_SIMD_WORD_SIZE) { index = first + j; //NEWTON_ASSERT ((i != clampedForceJoint) || !((dgAbsf (lowerForceBound[index] - force[index]) < dgFloat32 (1.0e-5f)) && (accel[index] < dgFloat32 (0.0f)))); //NEWTON_ASSERT ((i != clampedForceJoint) || !((dgAbsf (upperForceBound[index] - force[index]) < dgFloat32 (1.0e-5f)) && (accel[index] > dgFloat32 (0.0f)))); // deltaForce[index] = accel[index] * invDJMinvJt[index]; (simd_type &)deltaForce[index] = simd_mul_v((simd_type &)accel[index], (simd_type &)invDJMinvJt[index]); // akNum += deltaForce[index] * accel[index]; tmp0 = simd_mul_add_v(tmp0, (simd_type &)accel[index], (simd_type &)deltaForce[index]); // accNorm = GetMax (dgAbsf (accel[index]), accNorm); tmp1 = simd_max_v(tmp1, simd_and_v((simd_type &)accel[index], signMask)); //NEWTON_ASSERT (dgCheckFloat(deltaForce[index])); } } tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0)); tmp0 = simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp0, &akNum); tmp1 = simd_max_v(tmp1, simd_move_hl_v(tmp1, tmp1)); tmp1 = simd_max_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp1, &accNorm); NEWTON_ASSERT(akNum >= dgFloat32(0.0f)); passes = -1; maxPasses = forceRows; } else { simd_type tmp1; // accNorm = dgFloat32(0.0f); tmp1 = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) { index = j + first; // force[index] += ak * deltaForce[index]; // accel[index] -= ak * deltaAccel[index]; (simd_type &)force[index] = simd_mul_add_v((simd_type &)force[index], akSimd, (simd_type &)deltaForce[index]); (simd_type &)accel[index] = simd_mul_sub_v((simd_type &)accel[index], akSimd, (simd_type &)deltaAccel[index]); // accNorm = GetMax (dgAbsf (accel[index]), accNorm); tmp1 = simd_max_v(tmp1, simd_and_v((simd_type &)accel[index], signMask)); } } tmp1 = simd_max_v(tmp1, simd_move_hl_v(tmp1, tmp1)); tmp1 = simd_max_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp1, &accNorm); if (accNorm > maxAccNorm) { simd_type tmp0; // simd_type akNum; // akDen = akNum; // akNum = dgFloat32(0.0f); tmp0 = zero; akDenSimd = simd_set1(akNum); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) { index = j + first; // deltaAccel[index] = accel[index] * invDJMinvJt[index]; (simd_type &)deltaAccel[index] = simd_mul_v((simd_type &)accel[index], (simd_type &)invDJMinvJt[index]); // akNum += accel[index] * deltaAccel[index]; tmp0 = simd_mul_add_v(tmp0, (simd_type &)accel[index], (simd_type &)deltaAccel[index]); } } tmp0 = simd_add_v(tmp0, simd_move_hl_v(tmp0, tmp0)); tmp0 = simd_add_s(tmp0, simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(tmp0, &akNum); // NEWTON_ASSERT (akNum >= dgFloat32 (0.0f)); // NEWTON_ASSERT (akDen > dgFloat32(0.0f)); // akDen = GetMax (akDen, dgFloat32 (1.0e-17f)); // ak = dgFloat32 (akNum / akDen); tmp0 = simd_div_s(tmp0, simd_max_s(akDenSimd, simd_set1(dgFloat32(1.0e-17f)))); tmp0 = simd_permut_v(tmp0, tmp0, PURMUT_MASK(0, 0, 0, 0)); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (dgInt32 j = 0; j < count; j += DG_SIMD_WORD_SIZE) { index = j + first; // deltaForce[index] = deltaAccel[index] + ak * deltaForce[index]; (simd_type &)deltaForce[index] = simd_mul_add_v((simd_type &)deltaAccel[index], tmp0, (simd_type &)deltaForce[index]); } } } } totalPassesCount++; } ApplyExternalForcesAndAccelerationSimd(maxAccNorm); #endif } void dgJacobianMemory::CalculateForcesSimulationMode(dgFloat32 maxAccNorm) const { dgInt32 passes; dgInt32 prevJoint; dgInt32 maxPasses; dgInt32 forceRows; //dgInt32 totalPassesCount; dgFloat32 akNum; dgFloat32 accNorm; dgFloat32 *const force = m_force; dgFloat32 *const accel = m_accel; const dgJacobianPair *const Jt = m_Jt; dgFloat32 *const deltaAccel = m_deltaAccel; dgFloat32 *const deltaForce = m_deltaForce; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; dgJacobian *const internalForces = m_internalForces; dgInt32 *const normalForceIndex = m_normalForceIndex; dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const coordenateAccel = m_coordenateAccel; dgFloat32 *const lowerForceBound = m_lowerBoundFrictionCoefficent; dgFloat32 *const upperForceBound = m_upperBoundFrictionCoefficent; dgFloat32 forceStep[DG_CONSTRAINT_MAX_ROWS]; dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); // initialize the intermediate force accumulation to zero for (dgInt32 i = 0; i < m_bodyCount; i++) { internalForces[i].m_linear = zero; internalForces[i].m_angular = zero; } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 first; dgInt32 count; dgFloat32 val; dgJacobian y0; dgJacobian y1; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < count; j++) { index = first + j; val = force[index]; y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val); } internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; } for (dgInt32 i = 0; i < DG_CONSTRAINT_MAX_ROWS; i++) { forceStep[i] = dgFloat32(0.0f); } maxPasses = 4; prevJoint = 0; accNorm = maxAccNorm * dgFloat32(2.0f); for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { accNorm = dgFloat32(0.0f); for (dgInt32 currJoint = 0; currJoint < m_jointCount; currJoint++) { dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 rowsCount; dgFloat32 jointAccel; dgJacobian y0; dgJacobian y1; index = constraintArray[prevJoint].m_autoPairstart; rowsCount = constraintArray[prevJoint].m_autoPaircount; m0 = constraintArray[prevJoint].m_m0; m1 = constraintArray[prevJoint].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 i = 0; i < rowsCount; i++) { dgFloat32 deltaForceI; deltaForceI = forceStep[i]; y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(deltaForceI); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(deltaForceI); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(deltaForceI); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(deltaForceI); index++; } internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; index = constraintArray[currJoint].m_autoPairstart; rowsCount = constraintArray[currJoint].m_autoPaircount; m0 = constraintArray[currJoint].m_m0; m1 = constraintArray[currJoint].m_m1; y0 = internalForces[m0]; y1 = internalForces[m1]; for (dgInt32 i = 0; i < rowsCount; i++) { dgVector acc( JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); accel[i] = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index]; index++; } jointAccel = CalculateJointForces(currJoint, forceStep, maxAccNorm); accNorm = GetMax(accNorm, jointAccel); prevJoint = currJoint; } } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 k; dgInt32 first; dgInt32 count; dgInt32 index; dgFloat32 val; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPaircount; constraintArray[i].m_autoPaircount = count; for (k = 0; k < count; k++) { index = first + k; j = normalForceIndex[index]; NEWTON_ASSERT(((j < 0) && (force[j] == dgFloat32(1.0f))) || ((j >= 0) && (force[j] >= dgFloat32(0.0f)))); val = force[j]; lowerForceBound[index] *= val; upperForceBound[index] *= val; val = force[index]; force[index] = ClampValue(val, lowerForceBound[index], upperForceBound[index]); } } for (dgInt32 i = 0; i < m_bodyCount; i++) { internalForces[i].m_linear[0] = dgFloat32(0.0f); internalForces[i].m_linear[1] = dgFloat32(0.0f); internalForces[i].m_linear[2] = dgFloat32(0.0f); internalForces[i].m_linear[3] = dgFloat32(0.0f); internalForces[i].m_angular[0] = dgFloat32(0.0f); internalForces[i].m_angular[1] = dgFloat32(0.0f); internalForces[i].m_angular[2] = dgFloat32(0.0f); internalForces[i].m_angular[3] = dgFloat32(0.0f); } for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; dgJacobian y0; dgJacobian y1; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (j = 0; j < count; j++) { dgFloat32 val; index = j + first; val = force[index]; y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val); } internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; } forceRows = 0; akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; dgInt32 activeCount; bool isClamped[DG_CONSTRAINT_MAX_ROWS]; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; const dgJacobian &y0 = internalForces[m0]; const dgJacobian &y1 = internalForces[m1]; for (dgInt32 j = 0; j < count; j++) { index = j + first; dgVector tmpAccel(JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); accel[index] = coordenateAccel[index] - (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + force[index] * diagDamp[index]); } activeCount = 0; for (dgInt32 j = 0; j < count; j++) { dgFloat32 val; index = j + first; val = lowerForceBound[index] - force[index]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) { force[index] = lowerForceBound[index]; isClamped[j] = true; } else { val = upperForceBound[index] - force[index]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) { force[index] = upperForceBound[index]; isClamped[j] = true; } else { forceRows++; activeCount++; deltaForce[index] = accel[index] * invDJMinvJt[index]; akNum += accel[index] * deltaForce[index]; accNorm = GetMax(dgAbsf(accel[index]), accNorm); isClamped[j] = false; } } } if (activeCount < count) { dgInt32 i0; dgInt32 i1; i0 = 0; i1 = count - 1; constraintArray[i].m_autoPairActiveCount = activeCount; do { while ((i0 <= i1) && !isClamped[i0]) i0++; while ((i0 <= i1) && isClamped[i1]) i1--; if (i0 < i1) { SwapRows(first + i0, first + i1); i0++; i1--; } } while (i0 < i1); } } maxPasses = forceRows; //totalPassesCount = 0; for (passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { dgInt32 clampedForceIndex; dgInt32 clampedForceJoint; dgFloat32 ak; dgFloat32 akDen; dgFloat32 clampedForceIndexValue; for (dgInt32 i = 0; i < m_bodyCount; i++) { internalForces[i].m_linear[0] = dgFloat32(0.0f); internalForces[i].m_linear[1] = dgFloat32(0.0f); internalForces[i].m_linear[2] = dgFloat32(0.0f); internalForces[i].m_linear[3] = dgFloat32(0.0f); internalForces[i].m_angular[0] = dgFloat32(0.0f); internalForces[i].m_angular[1] = dgFloat32(0.0f); internalForces[i].m_angular[2] = dgFloat32(0.0f); internalForces[i].m_angular[3] = dgFloat32(0.0f); } for (dgInt32 i = 0; i < m_jointCount; i++) { dgJacobian y0; dgJacobian y1; dgInt32 jfirst = constraintArray[i].m_autoPairstart; dgInt32 jcount = constraintArray[i].m_autoPairActiveCount; dgInt32 m0 = constraintArray[i].m_m0; dgInt32 m1 = constraintArray[i].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < jcount; j++) { dgInt32 index = j + jfirst; dgFloat32 jak = deltaForce[index]; y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(jak); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(jak); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(jak); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(jak); } internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; } akDen = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; m0 = constraintArray[i].m_m0; m1 = constraintArray[i].m_m1; const dgJacobian &y0 = internalForces[m0]; const dgJacobian &y1 = internalForces[m1]; for (j = 0; j < count; j++) { index = j + first; dgVector tmpAccel(JMinv[index].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); deltaAccel[index] = tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z + deltaForce[index] * diagDamp[index]; akDen += deltaAccel[index] * deltaForce[index]; } } NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-16f)); NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f)); ak = akNum / akDen; clampedForceIndex = -1; clampedForceJoint = -1; clampedForceIndexValue = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; dgFloat32 val; if (ak > dgFloat32(1.0e-8f)) { first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j++) { index = j + first; val = force[index] + ak * deltaForce[index]; if (deltaForce[index] < dgFloat32(-1.0e-16f)) { // if (val < bilateralForceBounds[index].m_low) { if (val < lowerForceBound[index]) { ak = GetMax((lowerForceBound[index] - force[index]) / deltaForce[index], dgFloat32(0.0f)); NEWTON_ASSERT(ak >= dgFloat32(0.0f)); clampedForceIndex = j; clampedForceJoint = i; // clampedForceIndexValue = bilateralForceBounds[index].m_low; clampedForceIndexValue = lowerForceBound[index]; } } else if (deltaForce[index] > dgFloat32(1.0e-16f)) { // if (val > bilateralForceBounds[index].m_upper) { if (val > upperForceBound[index]) { ak = GetMax((upperForceBound[index] - force[index]) / deltaForce[index], dgFloat32(0.0f)); NEWTON_ASSERT(ak >= dgFloat32(0.0f)); clampedForceIndex = j; clampedForceJoint = i; clampedForceIndexValue = upperForceBound[index]; } } } } } if (clampedForceIndex >= 0) { dgInt32 first; dgInt32 count; dgInt32 activeCount; bool isClamped[DG_CONSTRAINT_MAX_ROWS]; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 jfirst = constraintArray[i].m_autoPairstart; dgInt32 jcount = constraintArray[i].m_autoPairActiveCount; for (dgInt32 j = 0; j < jcount; j++) { dgInt32 jindex = j + jfirst; force[jindex] += ak * deltaForce[jindex]; accel[jindex] -= ak * deltaAccel[jindex]; } } first = constraintArray[clampedForceJoint].m_autoPairstart; count = constraintArray[clampedForceJoint].m_autoPairActiveCount; count--; force[first + clampedForceIndex] = clampedForceIndexValue; if (clampedForceIndex != count) { SwapRows(first + clampedForceIndex, first + count); } activeCount = count; for (dgInt32 i = 0; i < count; i++) { dgInt32 index; dgFloat32 val; index = first + i; isClamped[i] = false; val = lowerForceBound[index] - force[index]; if ((val > dgFloat32(-1.0e-5f)) && (accel[index] < dgFloat32(0.0f))) { activeCount--; isClamped[i] = true; } else { val = upperForceBound[index] - force[index]; if ((val < dgFloat32(1.0e-5f)) && (accel[index] > dgFloat32(0.0f))) { activeCount--; isClamped[i] = true; } } } if (activeCount < count) { dgInt32 i0; dgInt32 i1; i0 = 0; i1 = count - 1; do { while ((i0 <= i1) && !isClamped[i0]) i0++; while ((i0 <= i1) && isClamped[i1]) i1--; if (i0 < i1) { SwapRows(first + i0, first + i1); // Swap (isClamped[i0], isClamped[i1]); // Swap (permutationIndex[i0], permutationIndex[i1]); i0++; i1--; } } while (i0 < i1); } constraintArray[clampedForceJoint].m_autoPairActiveCount = activeCount; forceRows = 0; akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 jfirst = constraintArray[i].m_autoPairstart; dgInt32 jcount = constraintArray[i].m_autoPairActiveCount; forceRows += jcount; for (dgInt32 j = 0; j < jcount; j++) { dgInt32 jindex = jfirst + j; NEWTON_ASSERT((i != clampedForceJoint) || !((dgAbsf(lowerForceBound[jindex] - force[jindex]) < dgFloat32(1.0e-5f)) && (accel[jindex] < dgFloat32(0.0f)))); NEWTON_ASSERT((i != clampedForceJoint) || !((dgAbsf(upperForceBound[jindex] - force[jindex]) < dgFloat32(1.0e-5f)) && (accel[jindex] > dgFloat32(0.0f)))); deltaForce[jindex] = accel[jindex] * invDJMinvJt[jindex]; akNum += deltaForce[jindex] * accel[jindex]; accNorm = GetMax(dgAbsf(accel[jindex]), accNorm); NEWTON_ASSERT(dgCheckFloat(deltaForce[jindex])); } } NEWTON_ASSERT(akNum >= dgFloat32(0.0f)); passes = -1; maxPasses = forceRows; } else { accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j++) { index = j + first; force[index] += ak * deltaForce[index]; accel[index] -= ak * deltaAccel[index]; accNorm = GetMax(dgAbsf(accel[index]), accNorm); } } if (accNorm > maxAccNorm) { akDen = akNum; akNum = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j++) { index = j + first; deltaAccel[index] = accel[index] * invDJMinvJt[index]; akNum += accel[index] * deltaAccel[index]; } } NEWTON_ASSERT(akNum >= dgFloat32(0.0f)); NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-17f)); ak = dgFloat32(akNum / akDen); for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 j; dgInt32 first; dgInt32 count; dgInt32 index; first = constraintArray[i].m_autoPairstart; count = constraintArray[i].m_autoPairActiveCount; for (j = 0; j < count; j++) { index = j + first; deltaForce[index] = deltaAccel[index] + ak * deltaForce[index]; } } } } //totalPassesCount++; } ApplyExternalForcesAndAcceleration(maxAccNorm); } dgFloat32 dgJacobianMemory::CalculateJointForcesSimd(dgInt32 joint, dgFloat32 *forceStep, dgFloat32 maxAccNorm) const { #ifdef DG_BUILD_SIMD_CODE dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgInt32 roundCount; dgInt32 maxPasses; dgInt32 clampedForceIndex; dgFloat32 ak; dgFloat32 akNum; dgFloat32 retAccel; dgFloatSign tmpIndex; // dgFloat32 akDen; // dgFloat32 force; dgFloat32 accNorm; // dgFloat32 retAccNorm; dgFloat32 clampedForceIndexValue; // dgJacobian y0; // dgJacobian y1; // simd_type tmp0; // simd_type tmp1; // simd_type tmp2; // simd_type tmp3; simd_type akNumSimd; simd_type accNormSimd; simd_type maxPassesSimd; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; simd_type one; simd_type zero; simd_type signMask; simd_type tol_pos_1eNeg5; simd_type tol_pos_1eNeg8; simd_type tol_neg_1eNeg16; simd_type tol_pos_1eNeg16; simd_type deltaAccelPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type deltaForcePtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type activeRowPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type lowBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; simd_type highBoundPtr[DG_CONSTRAINT_MAX_ROWS / DG_SIMD_WORD_SIZE]; dgFloat32 *const deltaAccel = (dgFloat32 *)deltaAccelPtr; dgFloat32 *const deltaForce = (dgFloat32 *)deltaForcePtr; dgFloat32 *const activeRow = (dgFloat32 *)activeRowPtr; dgFloat32 *const lowBound = (dgFloat32 *)lowBoundPtr; dgFloat32 *const highBound = (dgFloat32 *)highBoundPtr; dgFloat32 *const accel = m_accel; dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; // const dgFloat32* const lowerFrictionForce = m_lowerBoundFrictionCoefficent; // const dgFloat32* const upperFrictionForce = m_upperBoundFrictionCoefficent; const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgJointInfo *const constraintArray = m_constraintArray; // const dgJacobianIndex* const jacobianIndexArray = m_jacobianIndexArray; // count = 0x7fffffff; // signMask = simd_set1((dgFloat32&) count); tmpIndex.m_integer.m_iVal = 0x7fffffff; signMask = simd_set1(tmpIndex.m_fVal); one = simd_set1(dgFloat32(1.0f)); zero = simd_set1(dgFloat32(0.0f)); tol_pos_1eNeg8 = simd_set1(dgFloat32(1.0e-8f)); tol_pos_1eNeg5 = simd_set1(dgFloat32(1.0e-5f)); tol_pos_1eNeg16 = simd_set1(dgFloat32(1.0e-16f)); tol_neg_1eNeg16 = simd_set1(dgFloat32(-1.0e-16f)); first = constraintArray[joint].m_autoPairstart; count = constraintArray[joint].m_autoPaircount; m0 = constraintArray[joint].m_m0; m1 = constraintArray[joint].m_m1; roundCount = count & (-DG_SIMD_WORD_SIZE); if (roundCount != count) { roundCount += 4; for (dgInt32 j = count; j < roundCount; j++) { dgInt32 i; i = first + j; // force[i] = dgFloat32 (0.0f); // accel[j] -= dgFloat32 (0.0f); // activeRow[j] = dgFloat32 (0.0f); // deltaAccel[j] = dgFloat32 (0.0f); // deltaForce[j] = dgFloat32 (0.0f); m_normalForceIndex[i] = -1; simd_store_s(zero, &force[i]); simd_store_s(zero, &accel[j]); simd_store_s(zero, &activeRow[j]); simd_store_s(zero, &deltaAccel[j]); simd_store_s(zero, &deltaForce[j]); simd_store_s(one, &m_lowerBoundFrictionCoefficent[i]); simd_store_s(zero, &m_upperBoundFrictionCoefficent[i]); } } // akNum = dgFloat32 (0.0f); // accNorm = dgFloat32(0.0f); // maxPasses = count; akNumSimd = zero; accNormSimd = zero; maxPassesSimd = zero; // tmp3 = simd_set1(dgFloat32 (1.0f)); // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { dgInt32 i; // dgInt32 k0; // dgInt32 k1; // dgInt32 k2; // dgInt32 k3; // simd_type index_k; // simd_type accel_j; simd_type force_k; // simd_type force_i; // simd_type lowBound_j; // simd_type higntBound_j; // simd_type deltaforce_j; simd_type lowHighBound_test; i = first + j; // k = bilateralForceBounds[i].m_normalIndex; // k0 = normalForceIndex[i + 0]; // k1 = normalForceIndex[i + 1]; // k2 = normalForceIndex[i + 2]; // k3 = normalForceIndex[i + 3]; // val = (k >= 0) ? force[k] : dgFloat32 (1.0f); // val = force[k]; // index_k = simd_move_lh_v (simd_pack_lo_v (simd_load_is (zero, k0), simd_load_is (zero, k1)), simd_pack_lo_v (simd_load_is (zero, k2), simd_load_is (zero, k3))); // index_k = simd_cmpge_v (index_k, zero); force_k = simd_move_lh_v(simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 0]]), simd_load_s(force[normalForceIndex[i + 1]])), simd_pack_lo_v(simd_load_s(force[normalForceIndex[i + 2]]), simd_load_s(force[normalForceIndex[i + 3]]))); // force_k = simd_or_v (simd_and_v(force_k, index_k), simd_andnot_v (one, index_k)); // lowBound[j] = val * bilateralForceBounds[i].m_low; // highBound[j] = val * bilateralForceBounds[i].m_upper; (simd_type &)lowBound[j] = simd_mul_v(force_k, (simd_type &)lowerFrictionCoef[i]); (simd_type &)highBound[j] = simd_mul_v(force_k, (simd_type &)upperFrictionCoef[i]); // activeRow[j] = dgFloat32 (1.0f); // forceStep[j] = m_force[i]; // if (force[i] < lowBound[j]) { // maxPasses --; // force[i] = lowBound[j]; // activeRow[j] = dgFloat32 (0.0f); // } else if (force[i] > highBound[j]) { // maxPasses --; // force[i] = highBound[j]; // activeRow[j] = dgFloat32 (0.0f); // } (simd_type &)forceStep[j] = (simd_type &)force[i]; // lowBound_test = simd_cmplt_v (force_i, lowBound_j); // higntBound_test = simd_cmpgt_v (force_i, higntBound_j); lowHighBound_test = simd_or_v(simd_cmplt_v((simd_type &)force[i], (simd_type &)lowBound[j]), simd_cmpgt_v((simd_type &)force[i], (simd_type &)highBound[j])); (simd_type &)activeRow[j] = simd_andnot_v(one, lowHighBound_test); maxPassesSimd = simd_add_v(maxPassesSimd, (simd_type &)activeRow[j]); // force_k = simd_or_v (simd_and_v (lowBound_j, lowBound_test), simd_and_v (higntBound_j, higntBound_test)); //(simd_type&)force[i] = simd_mul_v (activeRow_j, simd_or_v (simd_and_v (force_k, lowHighBound_test), simd_andnot_v (force_i, lowHighBound_test))); (simd_type &)force[i] = simd_min_v((simd_type &)highBound[j], simd_max_v((simd_type &)force[i], (simd_type &)lowBound[j])); // deltaForce[j] = accel[j] * invDJMinvJt[i] * activeRow[j]; (simd_type &)deltaForce[j] = simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[i], (simd_type &)activeRow[j])); // akNum += accel[j] * deltaForce[j]; akNumSimd = simd_mul_add_v(akNumSimd, (simd_type &)accel[j], (simd_type &)deltaForce[j]); // accNorm = GetMax (dgAbsf (accel[j] * activeRow[j]), accNorm); accNormSimd = simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask)); } akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd)); simd_store_s( simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1))), &akNum); accNormSimd = simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd)); simd_store_s( simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1))), &accNorm); maxPassesSimd = simd_add_v(maxPassesSimd, simd_move_hl_v(maxPassesSimd, maxPassesSimd)); maxPasses = simd_store_is(simd_add_s(maxPassesSimd, simd_permut_v(maxPassesSimd, maxPassesSimd, PURMUT_MASK(0, 0, 0, 1)))); retAccel = accNorm; clampedForceIndexValue = dgFloat32(0.0f); for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) { simd_type akSimd; simd_type akDenSimd; // y0.m_linear = zero; // y0.m_angular = zero; // y1.m_linear = zero; // y1.m_angular = zero; y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 k; simd_type tmp1; k = j + first; // ak = deltaForce[j]; tmp1 = simd_set1(deltaForce[j]); // y0.m_linear += m_Jt[k].m_jacobian_IM0.m_linear.Scale (ak); // y0.m_angular += m_Jt[k].m_jacobian_IM0.m_angular.Scale (ak); // y1.m_linear += m_Jt[k].m_jacobian_IM1.m_linear.Scale (ak); // y1.m_angular += m_Jt[k].m_jacobian_IM1.m_angular.Scale (ak); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[k].m_jacobian_IM0.m_linear, tmp1); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[k].m_jacobian_IM0.m_angular, tmp1); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[k].m_jacobian_IM1.m_linear, tmp1); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[k].m_jacobian_IM1.m_angular, tmp1); } // akDen = dgFloat32 (0.0f); akDenSimd = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 k; simd_type tmp1; k = j + first; // dgVector acc (m_JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); // acc += m_JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); // acc += m_JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); // acc += m_JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); tmp1 = simd_mul_v((simd_type &)JMinv[k].m_jacobian_IM0.m_linear, y0_linear); tmp1 = simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM0.m_angular, y0_angular); tmp1 = simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM1.m_linear, y1_linear); tmp1 = simd_mul_add_v(tmp1, (simd_type &)JMinv[k].m_jacobian_IM1.m_angular, y1_angular); // deltaAccel[j] = acc.m_x + acc.m_y + acc.m_z + deltaForce[j] * m_diagDamp[k]; tmp1 = simd_add_v(tmp1, simd_move_hl_v(tmp1, tmp1)); tmp1 = simd_add_s(tmp1, simd_permut_v(tmp1, tmp1, PURMUT_MASK(3, 3, 3, 1))); tmp1 = simd_mul_add_s(tmp1, simd_load_s(deltaForce[j]), simd_load_s(diagDamp[k])); simd_store_s(tmp1, &deltaAccel[j]); // akDen += deltaAccel[j] * deltaForce[j]; akDenSimd = simd_mul_add_s(akDenSimd, tmp1, simd_load_s(deltaForce[j])); } //NEWTON_ASSERT (akDen > dgFloat32 (0.0f)); // akDen = GetMax (akDen, dgFloat32(1.0e-16f)); //NEWTON_ASSERT (dgAbsf (akDen) >= dgFloat32(1.0e-16f)); // ak = akNum / akDen; akSimd = simd_div_s(simd_load_s(akNum), simd_max_s(akDenSimd, tol_pos_1eNeg16)); // simd_store_s (tmp0, &ak); // clampedForceIndex = -1; simd_type min_index; simd_type minClampIndex; simd_type min_index_step; simd_type campedIndexValue; campedIndexValue = zero; minClampIndex = simd_set1(dgFloat32(-1.0f)); min_index_step = simd_set1(dgFloat32(4.0f)); akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0)); min_index = simd_set(dgFloat32(0.0f), dgFloat32(1.0f), dgFloat32(2.0f), dgFloat32(3.0f)); // for (j = 0; j < roundCount; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { // if (activeRow[j]) { // dgFloat32 val; // k = j + first; // if (deltaForce[j] < dgFloat32 (-1.0e-16f)) { // val = force[k] + ak * deltaForce[j]; // if (val < lowBound[j]) { // ak = (lowBound[j] - force[k]) / deltaForce[j]; // clampedForceIndex = j; // clampedForceIndexValue = lowBound[j]; // if (ak < dgFloat32 (1.0e-8f)) { // ak = dgFloat32 (0.0f); // break; // } // } // } else if (deltaForce[j] > dgFloat32 (1.0e-16f)) { // val = force[k] + ak * deltaForce[j]; // if (val > highBound[j]) { // ak = (highBound[j] - force[k]) / deltaForce[j]; // clampedForceIndex = j; // clampedForceIndexValue = highBound[j]; // if (ak < dgFloat32 (1.0e-8f)) { // ak = dgFloat32 (0.0f); // break; // } // } // } // } // // bool test; // bool negTest; // bool posTest; // bool negValTest; // bool posValTest; // bool negDeltaForceTest; // bool posDeltaForceTest; // dgFloat32 val; // dgFloat32 num; // dgFloat32 den; dgInt32 k; simd_type val; simd_type num; simd_type den; simd_type test; simd_type negTest; simd_type posTest; simd_type negValTest; simd_type posValTest; simd_type negDeltaForceTest; simd_type posDeltaForceTest; // Make sure AK is not negative k = j + first; // val = force[k] + ak * deltaForce[j]; val = simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]); // negValTest = val < lowBound[j]; negValTest = simd_cmplt_v(val, (simd_type &)lowBound[j]); // posValTest = val > highBound[j]; posValTest = simd_cmpgt_v(val, (simd_type &)highBound[j]); // negDeltaForceTest = deltaForce[j] < dgFloat32 (-1.0e-16f); negDeltaForceTest = simd_cmplt_v((simd_type &)deltaForce[j], tol_neg_1eNeg16); // posDeltaForceTest = deltaForce[j] > dgFloat32 ( 1.0e-16f); posDeltaForceTest = simd_cmpgt_v((simd_type &)deltaForce[j], tol_pos_1eNeg16); // negTest = negValTest & negDeltaForceTest; negTest = simd_and_v(negValTest, negDeltaForceTest); // posTest = posValTest & posDeltaForceTest; posTest = simd_and_v(posValTest, posDeltaForceTest); // test = negTest | posTest; test = simd_or_v(negTest, posTest); // num = negTest ? lowBound[j] : (posTest ? highBound[j] : force[k]); num = simd_or_v(simd_and_v((simd_type &)lowBound[j], negTest), simd_and_v((simd_type &)highBound[j], posTest)); num = simd_or_v(simd_and_v(num, test), simd_andnot_v((simd_type &)force[k], test)); // den = test ? deltaForce[j] : dgFloat32 (1.0f); den = simd_or_v(simd_and_v((simd_type &)deltaForce[j], test), simd_andnot_v(one, test)); // test = test & (activeRow[j] > dgFloat32 (0.0f)); test = simd_and_v(test, simd_cmpgt_v((simd_type &)activeRow[j], zero)); //NEWTON_ASSERT (dgAbsf (den) > 1.0e-16f); // ak = test ? (num - force[k]) / den : ak; akSimd = simd_or_v(simd_div_v(simd_sub_v(num, (simd_type &)force[k]), den), simd_andnot_v(akSimd, test)); // ak = (ak < dgFloat32 (1.0e-8f)) ? dgFloat32 (0.0f) : ak; akSimd = simd_and_v(akSimd, simd_cmpgt_v(akSimd, tol_pos_1eNeg8)); // clampedForceIndex = test ? j : clampedForceIndex; minClampIndex = simd_or_v(simd_and_v(min_index, test), simd_andnot_v(minClampIndex, test)); min_index = simd_add_v(min_index, min_index_step); // clampedForceIndexValue = test ? num : clampedForceIndexValue; campedIndexValue = simd_or_v(simd_and_v(num, test), simd_andnot_v(campedIndexValue, test)); } akDenSimd = simd_move_hl_v(akSimd, akSimd); maxPassesSimd = simd_cmplt_v(akSimd, akDenSimd); akSimd = simd_min_v(akSimd, akDenSimd); minClampIndex = simd_or_v(simd_and_v(minClampIndex, maxPassesSimd), simd_andnot_v(simd_move_hl_v(minClampIndex, minClampIndex), maxPassesSimd)); campedIndexValue = simd_or_v(simd_and_v(campedIndexValue, maxPassesSimd), simd_andnot_v(simd_move_hl_v(campedIndexValue, campedIndexValue), maxPassesSimd)); akDenSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 1)); maxPassesSimd = simd_cmplt_s(akSimd, akDenSimd); akSimd = simd_min_s(akSimd, akDenSimd); minClampIndex = simd_or_v(simd_and_v(minClampIndex, maxPassesSimd), simd_andnot_v(simd_permut_v(minClampIndex, minClampIndex, PURMUT_MASK(0, 0, 0, 1)), maxPassesSimd)); campedIndexValue = simd_or_v(simd_and_v(campedIndexValue, maxPassesSimd), simd_andnot_v(simd_permut_v(campedIndexValue, campedIndexValue, PURMUT_MASK(0, 0, 0, 1)), maxPassesSimd)); // tmp2 = zero; simd_store_s(akSimd, &ak); clampedForceIndex = simd_store_is(minClampIndex); simd_store_s(campedIndexValue, &clampedForceIndexValue); if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) { NEWTON_ASSERT(clampedForceIndex != -1); // akNum = dgFloat32 (0.0f); // accNorm = dgFloat32(0.0f); akNumSimd = zero; accNormSimd = zero; maxPassesSimd = zero; activeRow[clampedForceIndex] = dgFloat32(0.0f); deltaForce[clampedForceIndex] = dgFloat32(0.0f); force[clampedForceIndex + first] = clampedForceIndexValue; // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { // for (j = 0; j < roundCount; j ++) { // if (((dgFloat32*)activeRow)[j]) { // bool test0; // bool test1; // k = j + first; // val = ((dgFloat32*)lowBound)[j] - force[k]; // if ((dgAbsf (val) < dgFloat32 (1.0e-5f)) && (accel[j] < dgFloat32 (0.0f))) { // force[k] = lowBound[j]; // activeRow[j] = dgFloat32 (0.0f); // deltaForce[j] = dgFloat32 (0.0f); // // } else { // val = highBound[j] - force[k]; // if ((dgAbsf (val) < dgFloat32 (1.0e-5f)) && (accel[j] > dgFloat32 (0.0f))) { // force[k] = highBound[j]; // activeRow[j] = dgFloat32 (0.0f); // deltaForce[j] = dgFloat32 (0.0f); // } else { // NEWTON_ASSERT (activeRow[j] > dgFloat32 (0.0f)); // deltaForce[j] = accel[j] * invDJMinvJt[k]; // akNum += accel[j] * deltaForce[j]; // accNorm = GetMax (dgAbsf (accel[j]), accNorm); // } // } // } dgInt32 k; simd_type val_k; simd_type test_0; simd_type test_1; // simd_type test_2; simd_type accel_k; simd_type force_k; k = j + first; accel_k = (simd_type &)accel[j]; force_k = (simd_type &)force[k]; // val = dgAbsf (lowBound[j] - force[k]); val_k = simd_and_v(simd_sub_v((simd_type &)lowBound[j], force_k), signMask); // test0 = (val < dgFloat32 (1.0e-5f)) & (accel[j] < dgFloat32 (0.0f)); test_0 = simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmplt_v(accel_k, zero)); // val = dgAbsf (highBound[j] - force[k]); val_k = simd_and_v(simd_sub_v((simd_type &)highBound[j], force_k), signMask); // test1 = (val < dgFloat32 (1.0e-5f)) & (accel[j] > dgFloat32 (0.0f)); test_1 = simd_and_v(simd_cmplt_v(val_k, tol_pos_1eNeg5), simd_cmpgt_v(accel_k, zero)); // force[k] = test0 ? lowBound[j] : (test1 ? highBound[j] : force[k]); // val_k = simd_or_v (simd_and_v ((simd_type&)lowBound[j], test_0), simd_and_v ((simd_type&)highBound[j], test_1)); //(simd_type&) force[k] = simd_or_v (simd_and_v (val_k, test_2) , simd_andnot_v (force_k, test_2)); (simd_type &)force[k] = simd_min_v((simd_type &)highBound[j], simd_max_v((simd_type &)force[k], (simd_type &)lowBound[j])); // activeRow[j] *= (test0 | test1) ? dgFloat32 (0.0f) : dgFloat32 (1.0f); // test_2 = simd_or_v (test_0, test_1); (simd_type &)activeRow[j] = simd_mul_v((simd_type &)activeRow[j], simd_andnot_v(one, simd_or_v(test_0, test_1))); // deltaForce[j] = accel[j] * invDJMinvJt[k] * activeRow[j]; (simd_type &)deltaForce[j] = simd_mul_v(accel_k, simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j])); // akNum += accel[j] * deltaForce[j]; akNumSimd = simd_mul_add_v(akNumSimd, (simd_type &)deltaForce[j], accel_k); // accNorm = GetMax (dgAbsf (accel[j] * activeRow[j]), accNorm); accNormSimd = simd_max_v(accNormSimd, simd_and_v(simd_mul_v(accel_k, (simd_type &)activeRow[j]), signMask)); // masPases += 1; maxPassesSimd = simd_add_v(maxPassesSimd, (simd_type &)activeRow[j]); } NEWTON_ASSERT(activeRow[clampedForceIndex] == dgFloat32(0.0f)); akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd)); akNumSimd = simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(akNumSimd, &akNum); accNormSimd = simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd)); accNormSimd = simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(accNormSimd, &accNorm); i = -1; // maxPasses = GetMax (maxPasses - 1, 1); maxPassesSimd = simd_add_v(maxPassesSimd, simd_move_hl_v(maxPassesSimd, maxPassesSimd)); maxPasses = simd_store_is(simd_add_s(maxPassesSimd, simd_permut_v(maxPassesSimd, maxPassesSimd, PURMUT_MASK(0, 0, 0, 1)))); } else if (clampedForceIndex >= 0) { // akNum = dgFloat32(0.0f); // accNorm = dgFloat32(0.0f); akNumSimd = zero; accNormSimd = zero; // tmp2 = zero; akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0)); activeRow[clampedForceIndex] = dgFloat32(0.0f); // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { dgInt32 k; k = j + first; // m_force[k] += ak * deltaForce[j]; // m_accel[j] -= ak * deltaAccel[j]; (simd_type &)force[k] = simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]); (simd_type &)accel[j] = simd_mul_sub_v((simd_type &)accel[j], akSimd, (simd_type &)deltaAccel[j]); // accNorm = GetMax (dgAbsf (m_accel[j] * activeRow[j]), accNorm); accNormSimd = simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask)); //NEWTON_ASSERT (dgCheckFloat(m_force[k])); //NEWTON_ASSERT (dgCheckFloat(m_accel[j])); // deltaForce[j] = m_accel[j] * m_invDJMinvJt[k] * activeRow[j]; (simd_type &)deltaForce[j] = simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j])); // akNum += deltaForce[j] * m_accel[j]; akNumSimd = simd_mul_add_v(akNumSimd, (simd_type &)deltaForce[j], (simd_type &)accel[j]); } akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd)); akNumSimd = simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(akNumSimd, &akNum); accNormSimd = simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd)); accNormSimd = simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(accNormSimd, &accNorm); force[clampedForceIndex + first] = clampedForceIndexValue; i = -1; maxPasses = GetMax(maxPasses - 1, 1); } else { // accNorm = dgFloat32(0.0f); // tmp2 = zero; accNormSimd = zero; akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0)); // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { dgInt32 k; k = j + first; // m_force[k] += ak * deltaForce[j]; // m_accel[j] -= ak * deltaAccel[j]; (simd_type &)force[k] = simd_mul_add_v((simd_type &)force[k], akSimd, (simd_type &)deltaForce[j]); (simd_type &)accel[j] = simd_mul_sub_v((simd_type &)accel[j], akSimd, (simd_type &)deltaAccel[j]); // accNorm = GetMax (dgAbsf (m_accel[j] * activeRow[j]), accNorm); accNormSimd = simd_max_v(accNormSimd, simd_and_v(simd_mul_v((simd_type &)accel[j], (simd_type &)activeRow[j]), signMask)); //NEWTON_ASSERT (dgCheckFloat(m_force[k])); //NEWTON_ASSERT (dgCheckFloat(m_accel[j])); } accNormSimd = simd_max_v(accNormSimd, simd_move_hl_v(accNormSimd, accNormSimd)); accNormSimd = simd_max_s(accNormSimd, simd_permut_v(accNormSimd, accNormSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(accNormSimd, &accNorm); if (accNorm > maxAccNorm) { // akDen = akNum; // akNum = dgFloat32(0.0f); akDenSimd = simd_set1(akNum); akNumSimd = zero; // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { dgInt32 k; k = j + first; // deltaAccel[j] = m_accel[j] * m_invDJMinvJt[k] * activeRow[j]; (simd_type &)deltaAccel[j] = simd_mul_v((simd_type &)accel[j], simd_mul_v((simd_type &)invDJMinvJt[k], (simd_type &)activeRow[j])); // akNum += m_accel[j] * deltaAccel[j]; akNumSimd = simd_mul_add_v(akNumSimd, (simd_type &)accel[j], (simd_type &)deltaAccel[j]); } akNumSimd = simd_add_v(akNumSimd, simd_move_hl_v(akNumSimd, akNumSimd)); akNumSimd = simd_add_s(akNumSimd, simd_permut_v(akNumSimd, akNumSimd, PURMUT_MASK(0, 0, 0, 1))); simd_store_s(akNumSimd, &akNum); //NEWTON_ASSERT (akDen > dgFloat32(0.0f)); // akDen = GetMax (akDen, dgFloat32 (1.0e-17f)); akDenSimd = simd_max_s(akDenSimd, simd_set1(dgFloat32(1.0e-17f))); // ak = dgFloat32 (akNum / akDen); akSimd = simd_div_s(akSimd, akDenSimd); akSimd = simd_permut_v(akSimd, akSimd, PURMUT_MASK(0, 0, 0, 0)); // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { // deltaForce[j] = deltaAccel[j] + ak * deltaForce[j]; (simd_type &)deltaForce[j] = simd_mul_add_v((simd_type &)deltaAccel[j], akSimd, (simd_type &)deltaForce[j]); } } } } // for (j = 0; j < count; j ++) { for (dgInt32 j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { // forceStep[j] = force[j + first] - forceStep[j]; (simd_type &)forceStep[j] = simd_sub_v((simd_type &)force[j + first], (simd_type &)forceStep[j]); } // tmp0 = simd_set1(DG_SSOR_FACTOR); // for (j = 0; j < roundCount; j += DG_SIMD_WORD_SIZE) { // tmp1 = simd_mul_add_v((simd_type&)forceStep[j], simd_sub_v ((simd_type&)force[j + first], (simd_type&)forceStep[j]), tmp0); // (simd_type&)force[j + first] = simd_min_v ((simd_type&) highBound[j], simd_max_v(tmp1, (simd_type&)lowBound[j])); // (simd_type&)forceStep[j] = simd_sub_v ((simd_type&)force[j + first], (simd_type&)forceStep[j]); // } return retAccel; #else return dgFloat32(0.0f); #endif } dgFloat32 dgJacobianMemory::CalculateJointForces(dgInt32 joint, dgFloat32 *forceStep, dgFloat32 maxAccNorm) const { dgInt32 first; dgInt32 count; dgInt32 maxPasses; dgInt32 clampedForceIndex; dgFloat32 ak; dgFloat32 val; dgFloat32 akNum; dgFloat32 akDen; dgFloat32 accNorm; dgFloat32 retAccel; dgFloat32 clampedForceIndexValue; dgJacobian y0; dgJacobian y1; dgFloat32 deltaAccel[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 deltaForce[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 activeRow[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 lowBound[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 highBound[DG_CONSTRAINT_MAX_ROWS]; dgFloat32 *const accel = m_accel; dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const lowerFriction = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFriction = m_upperBoundFrictionCoefficent; // dgBilateralBounds* const bilateralForceBounds = m_bilateralForceBounds; // const dgJacobianIndex* const jacobianIndexArray = m_jacobianIndexArray; first = constraintArray[joint].m_autoPairstart; count = constraintArray[joint].m_autoPaircount; akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); maxPasses = count; for (dgInt32 j = 0; j < count; j++) { dgInt32 i; dgInt32 k; i = first + j; // k = bilateralForceBounds[i].m_normalIndex; k = normalForceIndex[i]; // val = (k >= 0) ? force[k] : dgFloat32 (1.0f); NEWTON_ASSERT( ((k < 0) && (force[k] == dgFloat32(1.0f))) || ((k >= 0) && (force[k] >= dgFloat32(0.0f)))); val = force[k]; lowBound[j] = val * lowerFriction[i]; highBound[j] = val * upperFriction[i]; activeRow[j] = dgFloat32(1.0f); forceStep[j] = force[i]; if (force[i] < lowBound[j]) { maxPasses--; force[i] = lowBound[j]; activeRow[j] = dgFloat32(0.0f); } else if (force[i] > highBound[j]) { maxPasses--; force[i] = highBound[j]; activeRow[j] = dgFloat32(0.0f); } deltaForce[j] = accel[j] * invDJMinvJt[i] * activeRow[j]; akNum += accel[j] * deltaForce[j]; accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm); } retAccel = accNorm; clampedForceIndexValue = dgFloat32(0.0f); dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); for (dgInt32 i = 0; (i < maxPasses) && (accNorm > maxAccNorm); i++) { y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 k; k = j + first; ak = deltaForce[j]; y0.m_linear += Jt[k].m_jacobian_IM0.m_linear.Scale(ak); y0.m_angular += Jt[k].m_jacobian_IM0.m_angular.Scale(ak); y1.m_linear += Jt[k].m_jacobian_IM1.m_linear.Scale(ak); y1.m_angular += Jt[k].m_jacobian_IM1.m_angular.Scale(ak); } akDen = dgFloat32(0.0f); for (dgInt32 j = 0; j < count; j++) { dgInt32 k; k = j + first; dgVector acc(JMinv[k].m_jacobian_IM0.m_linear.CompProduct(y0.m_linear)); acc += JMinv[k].m_jacobian_IM0.m_angular.CompProduct(y0.m_angular); acc += JMinv[k].m_jacobian_IM1.m_linear.CompProduct(y1.m_linear); acc += JMinv[k].m_jacobian_IM1.m_angular.CompProduct(y1.m_angular); deltaAccel[j] = acc.m_x + acc.m_y + acc.m_z + deltaForce[j] * diagDamp[k]; akDen += deltaAccel[j] * deltaForce[j]; } NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-16f)); NEWTON_ASSERT(dgAbsf(akDen) >= dgFloat32(1.0e-16f)); ak = akNum / akDen; clampedForceIndex = -1; for (dgInt32 j = 0; j < count; j++) { if (activeRow[j]) { dgInt32 k; k = j + first; if (deltaForce[j] < dgFloat32(-1.0e-16f)) { val = force[k] + ak * deltaForce[j]; if (val < lowBound[j]) { ak = GetMax((lowBound[j] - force[k]) / deltaForce[j], dgFloat32(0.0f)); clampedForceIndex = j; clampedForceIndexValue = lowBound[j]; if (ak < dgFloat32(1.0e-8f)) { ak = dgFloat32(0.0f); break; } } } else if (deltaForce[j] > dgFloat32(1.0e-16f)) { val = force[k] + ak * deltaForce[j]; if (val > highBound[j]) { ak = GetMax((highBound[j] - force[k]) / deltaForce[j], dgFloat32(0.0f)); clampedForceIndex = j; clampedForceIndexValue = highBound[j]; if (ak < dgFloat32(1.0e-8f)) { ak = dgFloat32(0.0f); break; } } } } } // if (ak == dgFloat32 (0.0f)) { if (ak == dgFloat32(0.0f) && (clampedForceIndex != -1)) { NEWTON_ASSERT(clampedForceIndex != -1); akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); activeRow[clampedForceIndex] = dgFloat32(0.0f); deltaForce[clampedForceIndex] = dgFloat32(0.0f); force[clampedForceIndex + first] = clampedForceIndexValue; for (dgInt32 j = 0; j < count; j++) { if (activeRow[j]) { dgInt32 k; k = j + first; val = lowBound[j] - force[k]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[j] < dgFloat32(0.0f))) { force[k] = lowBound[j]; activeRow[j] = dgFloat32(0.0f); deltaForce[j] = dgFloat32(0.0f); } else { val = highBound[j] - force[k]; if ((dgAbsf(val) < dgFloat32(1.0e-5f)) && (accel[j] > dgFloat32(0.0f))) { force[k] = highBound[j]; activeRow[j] = dgFloat32(0.0f); deltaForce[j] = dgFloat32(0.0f); } else { NEWTON_ASSERT(activeRow[j] > dgFloat32(0.0f)); deltaForce[j] = accel[j] * invDJMinvJt[k]; akNum += accel[j] * deltaForce[j]; accNorm = GetMax(dgAbsf(accel[j]), accNorm); } } } } NEWTON_ASSERT(activeRow[clampedForceIndex] == dgFloat32(0.0f)); i = -1; maxPasses = GetMax(maxPasses - 1, dgInt32(1)); } else if (clampedForceIndex >= 0) { akNum = dgFloat32(0.0f); accNorm = dgFloat32(0.0f); activeRow[clampedForceIndex] = dgFloat32(0.0f); for (dgInt32 j = 0; j < count; j++) { dgInt32 k; k = j + first; force[k] += ak * deltaForce[j]; accel[j] -= ak * deltaAccel[j]; accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm); NEWTON_ASSERT(dgCheckFloat(force[k])); NEWTON_ASSERT(dgCheckFloat(accel[j])); deltaForce[j] = accel[j] * invDJMinvJt[k] * activeRow[j]; akNum += deltaForce[j] * accel[j]; } force[clampedForceIndex + first] = clampedForceIndexValue; i = -1; maxPasses = GetMax(maxPasses - 1, dgInt32(1)); } else { accNorm = dgFloat32(0.0f); for (dgInt32 j = 0; j < count; j++) { dgInt32 k; k = j + first; force[k] += ak * deltaForce[j]; accel[j] -= ak * deltaAccel[j]; accNorm = GetMax(dgAbsf(accel[j] * activeRow[j]), accNorm); NEWTON_ASSERT(dgCheckFloat(force[k])); NEWTON_ASSERT(dgCheckFloat(accel[j])); } if (accNorm > maxAccNorm) { akDen = akNum; akNum = dgFloat32(0.0f); for (dgInt32 j = 0; j < count; j++) { dgInt32 k; k = j + first; deltaAccel[j] = accel[j] * invDJMinvJt[k] * activeRow[j]; akNum += accel[j] * deltaAccel[j]; } NEWTON_ASSERT(akDen > dgFloat32(0.0f)); akDen = GetMax(akDen, dgFloat32(1.0e-17f)); ak = dgFloat32(akNum / akDen); for (dgInt32 j = 0; j < count; j++) { deltaForce[j] = deltaAccel[j] + ak * deltaForce[j]; } } } } for (dgInt32 j = 0; j < count; j++) { forceStep[j] = force[j + first] - forceStep[j]; } return retAccel; } void dgJacobianMemory::CalculateForcesGameModeSimd(dgInt32 iterations, dgFloat32 maxAccNorm) const { #ifdef DG_BUILD_SIMD_CODE dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgBodyInfo *bodyArray = m_bodyArray; dgFloat32 *const penetration = m_penetration; const dgFloat32 *const externAccel = m_deltaAccel; const dgFloat32 *const restitution = m_restitution; dgFloat32 *const coordenateAccel = m_coordenateAccel; dgJacobian *const internalVeloc = m_internalVeloc; dgJacobian *const internalForces = m_internalForces; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgInt32 *const accelIsMortor = m_accelIsMotor; const dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const penetrationStiffness = m_penetrationStiffness; const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent; dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS)); dgFloat32 timeStep = m_timeStep * invStep; dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS); dgFloatSign tmpIndex; tmpIndex.m_integer.m_iVal = 0x7fffffff; simd_type signMask = simd_set1(tmpIndex.m_fVal); simd_type zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = m_bodyArray[i].m_body; (simd_type &)internalVeloc[i].m_linear = zero; (simd_type &)internalVeloc[i].m_angular = zero; (simd_type &)internalForces[i].m_linear = zero; (simd_type &)internalForces[i].m_angular = zero; (simd_type &)body->m_netForce = (simd_type &)body->m_veloc; (simd_type &)body->m_netTorque = (simd_type &)body->m_omega; } (simd_type &)internalVeloc[0].m_linear = zero; (simd_type &)internalVeloc[0].m_angular = zero; (simd_type &)internalForces[0].m_linear = zero; (simd_type &)internalForces[0].m_angular = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPairActiveCount; dgInt32 m0 = constraintArray[i].m_m0; dgInt32 m1 = constraintArray[i].m_m1; // dgJacobian y0 (internalForces[k0]); // dgJacobian y1 (internalForces[k1]); simd_type y0_linear = zero; simd_type y0_angular = zero; simd_type y1_linear = zero; simd_type y1_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; // val = force[index]; simd_type tmp0 = simd_set1(force[index]); // y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val); // y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val); // y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val); // y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, tmp0); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, tmp0); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, tmp0); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, tmp0); } // internalForces[k0] = y0; // internalForces[k1] = y1; (simd_type &)internalForces[m0].m_linear = simd_add_v((simd_type &)internalForces[m0].m_linear, y0_linear); (simd_type &)internalForces[m0].m_angular = simd_add_v((simd_type &)internalForces[m0].m_angular, y0_angular); (simd_type &)internalForces[m1].m_linear = simd_add_v((simd_type &)internalForces[m1].m_linear, y1_linear); (simd_type &)internalForces[m1].m_angular = simd_add_v((simd_type &)internalForces[m1].m_angular, y1_angular); } simd_type timeStepSimd = simd_set1(timeStep); dgFloat32 firstPassCoef = dgFloat32(0.0f); dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT; for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) { for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) { dgJointAccelerationDecriptor joindDesc; dgInt32 index = constraintArray[curJoint].m_autoPairstart; joindDesc.m_rowsCount = constraintArray[curJoint].m_autoPaircount; joindDesc.m_timeStep = timeStep; joindDesc.m_invTimeStep = invTimeStep; joindDesc.m_firstPassCoefFlag = firstPassCoef; joindDesc.m_Jt = &Jt[index]; joindDesc.m_penetration = &penetration[index]; joindDesc.m_restitution = &restitution[index]; joindDesc.m_accelIsMotor = &accelIsMortor[index]; joindDesc.m_externAccelaration = &externAccel[index]; joindDesc.m_coordenateAccel = &coordenateAccel[index]; joindDesc.m_normalForceIndex = &normalForceIndex[index]; joindDesc.m_penetrationStiffness = &penetrationStiffness[index]; constraintArray[curJoint].m_joint->JointAccelerationsSimd(joindDesc); } firstPassCoef = dgFloat32(1.0f); dgFloat32 accNorm; accNorm = maxAccNorm * dgFloat32(2.0f); for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { simd_type accNormSimd = zero; for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) { dgInt32 index = constraintArray[curJoint].m_autoPairstart; dgInt32 rowsCount = constraintArray[curJoint].m_autoPaircount; dgInt32 m0 = constraintArray[curJoint].m_m0; dgInt32 m1 = constraintArray[curJoint].m_m1; simd_type linearM0 = (simd_type &)internalForces[m0].m_linear; simd_type angularM0 = (simd_type &)internalForces[m0].m_angular; simd_type linearM1 = (simd_type &)internalForces[m1].m_linear; simd_type angularM1 = (simd_type &)internalForces[m1].m_angular; for (dgInt32 k = 0; k < rowsCount; k++) { // dgVector acc (m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0)); // acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct (angularM0); // acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct (linearM1); // acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct (angularM1); simd_type a = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, linearM0); a = simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, angularM0); a = simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, linearM1); a = simd_mul_add_v(a, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, angularM1); // a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index]; a = simd_add_v(a, simd_move_hl_v(a, a)); a = simd_add_s(a, simd_permut_v(a, a, PURMUT_MASK(3, 3, 3, 1))); a = simd_sub_s(simd_load_s(coordenateAccel[index]), simd_mul_add_s(a, simd_load_s(force[index]), simd_load_s(diagDamp[index]))); // f = force[index] + invDJMinvJt[index] * a; simd_type f = simd_mul_add_s(simd_load_s(force[index]), simd_load_s(invDJMinvJt[index]), a); dgInt32 frictionIndex = m_normalForceIndex[index]; NEWTON_ASSERT( ((frictionIndex < 0) && (force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (force[frictionIndex] >= dgFloat32(0.0f)))); // frictionNormal = force[frictionIndex]; // lowerFrictionForce = frictionNormal * lowerFrictionCoef[index]; // upperFrictionForce = frictionNormal * upperFrictionCoef[index]; simd_type frictionNormal = simd_load_s(force[frictionIndex]); simd_type lowerFrictionForce = simd_mul_s(frictionNormal, simd_load_s(lowerFrictionCoef[index])); simd_type upperFrictionForce = simd_mul_s(frictionNormal, simd_load_s(upperFrictionCoef[index])); // if (f > upperFrictionForce) { // a = dgFloat32 (0.0f); // f = upperFrictionForce; // } else if (f < lowerFrictionForce) { // a = dgFloat32 (0.0f); // f = lowerFrictionForce; // } f = simd_min_s(simd_max_s(f, lowerFrictionForce), upperFrictionForce); a = simd_andnot_v(a, simd_or_v(simd_cmplt_s(f, lowerFrictionForce), simd_cmpgt_s(f, upperFrictionForce))); accNormSimd = simd_max_s(accNormSimd, simd_and_v(a, signMask)); // prevValue = f - force[index]); a = simd_sub_s(f, simd_load_s(force[index])); a = simd_permut_v(a, a, PURMUT_MASK(0, 0, 0, 0)); // force[index] = f; simd_store_s(f, &force[index]); linearM0 = simd_mul_add_v(linearM0, (simd_type &)Jt[index].m_jacobian_IM0.m_linear, a); angularM0 = simd_mul_add_v(angularM0, (simd_type &)Jt[index].m_jacobian_IM0.m_angular, a); linearM1 = simd_mul_add_v(linearM1, (simd_type &)Jt[index].m_jacobian_IM1.m_linear, a); angularM1 = simd_mul_add_v(angularM1, (simd_type &)Jt[index].m_jacobian_IM1.m_angular, a); index++; } // internalForces[prevM0].m_linear += Jt[prevIndex].m_jacobian_IM0.m_linear.Scale (prevValue); // internalForces[prevM0].m_angular += Jt[prevIndex].m_jacobian_IM0.m_angular.Scale (prevValue); // internalForces[prevM1].m_linear += Jt[prevIndex].m_jacobian_IM1.m_linear.Scale (prevValue); // internalForces[prevM1].m_angular += Jt[prevIndex].m_jacobian_IM1.m_angular.Scale (prevValue); (simd_type &)internalForces[m0].m_linear = linearM0; (simd_type &)internalForces[m0].m_angular = angularM0; (simd_type &)internalForces[m1].m_linear = linearM1; (simd_type &)internalForces[m1].m_angular = angularM1; } simd_store_s(accNormSimd, &accNorm); } for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; // dgVector force (body->m_accel + internalForces[i].m_linear); // dgVector torque (body->m_alpha + internalForces[i].m_angular); simd_type force = simd_add_v((simd_type &)body->m_accel, (simd_type &)internalForces[i].m_linear); simd_type torque = simd_add_v((simd_type &)body->m_alpha, (simd_type &)internalForces[i].m_angular); // dgVector accel (force.Scale (body->m_invMass.m_w)); simd_type accel = simd_mul_v(force, simd_set1(body->m_invMass.m_w)); // dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (torque)); simd_type alpha = simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_permut_v(torque, torque, PURMUT_MASK(0, 0, 0, 0))), (simd_type &)body->m_invWorldInertiaMatrix[1], simd_permut_v(torque, torque, PURMUT_MASK(1, 1, 1, 1))), (simd_type &)body->m_invWorldInertiaMatrix[2], simd_permut_v(torque, torque, PURMUT_MASK(2, 2, 2, 2))); // body->m_veloc += accel.Scale(timeStep); (simd_type &)body->m_veloc = simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStepSimd); // body->m_omega += alpha.Scale(timeStep); (simd_type &)body->m_omega = simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStepSimd); // body->m_netForce += body->m_veloc; (simd_type &)internalVeloc[i].m_linear = simd_add_v((simd_type &)internalVeloc[i].m_linear, (simd_type &)body->m_veloc); // body->m_netTorque += body->m_omega; (simd_type &)internalVeloc[i].m_angular = simd_add_v((simd_type &)internalVeloc[i].m_angular, (simd_type &)body->m_omega); } } dgInt32 hasJointFeeback = 0; for (dgInt32 i = 0; i < m_jointCount; i++) { // maxForce = dgFloat32 (0.0f); dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPaircount; for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; dgFloat32 val = force[index]; NEWTON_ASSERT(dgCheckFloat(val)); // maxForce = GetMax (dgAbsf (val), maxForce); jointForceFeeback[index][0] = val; } // if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) { // m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce); // } // hasJointFeeback |= dgUnsigned32 (constraintArray[i].m_joint->m_updaFeedbackCallback); hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0); // if (constraintArray[i].m_joint->m_updaFeedbackCallback) { // constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex); // } } // simd_type invStepSimd; // signMask = simd_set1 (invStep); #ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES simd_type invStepSimd = simd_set1(invStep); #endif simd_type invTimeStepSimd = simd_set1(m_invTimeStep); simd_type accelerationTolerance = simd_set1(maxAccNorm); accelerationTolerance = simd_mul_s(accelerationTolerance, accelerationTolerance); for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; #ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES // body->m_veloc = internalVeloc[i].m_linear.Scale(invStep); // body->m_omega = internalVeloc[i].m_angular.Scale(invStep); (simd_type &)body->m_veloc = simd_mul_v((simd_type &)internalVeloc[i].m_linear, invStepSimd); (simd_type &)body->m_omega = simd_mul_v((simd_type &)internalVeloc[i].m_angular, invStepSimd); #endif // dgVector accel = (body->m_veloc - body->m_netForce).Scale (m_invTimeStep); // dgVector alpha = (body->m_omega - body->m_netTorque).Scale (m_invTimeStep); simd_type accel = simd_mul_v(simd_sub_v((simd_type &)body->m_veloc, (simd_type &)body->m_netForce), invTimeStepSimd); simd_type alpha = simd_mul_v(simd_sub_v((simd_type &)body->m_omega, (simd_type &)body->m_netTorque), invTimeStepSimd); // if ((accel % accel) < maxAccNorm2) { // accel = zero; // } // body->m_accel = accel; // body->m_netForce = accel.Scale (body->m_mass[3]); simd_type tmp = simd_mul_v(accel, accel); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, accelerationTolerance); (simd_type &)body->m_accel = simd_andnot_v(accel, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0))); (simd_type &)body->m_netForce = simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_mass[3])); // if ((alpha % alpha) < maxAccNorm2) { // alpha = zero; // } // body->m_alpha = alpha; tmp = simd_mul_v(alpha, alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, accelerationTolerance); (simd_type &)body->m_alpha = simd_andnot_v(alpha, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0))); // alpha = body->m_matrix.UnrotateVector(alpha); alpha = simd_mul_v((simd_type &)body->m_matrix[0], (simd_type &)body->m_alpha); alpha = simd_add_v(alpha, simd_move_hl_v(alpha, alpha)); alpha = simd_add_s(alpha, simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_mul_v((simd_type &)body->m_matrix[1], (simd_type &)body->m_alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); alpha = simd_pack_lo_v(alpha, tmp); tmp = simd_mul_v((simd_type &)body->m_matrix[2], (simd_type &)body->m_alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); alpha = simd_permut_v(alpha, tmp, PURMUT_MASK(3, 0, 1, 0)); // body->m_netTorque = body->m_matrix.RotateVector (alpha.CompProduct(body->m_mass)); alpha = simd_mul_v(alpha, (simd_type &)body->m_mass); (simd_type &)body->m_netTorque = simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_matrix[0], simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 0))), (simd_type &)body->m_matrix[1], simd_permut_v(alpha, alpha, PURMUT_MASK(1, 1, 1, 1))), (simd_type &)body->m_matrix[2], simd_permut_v(alpha, alpha, PURMUT_MASK(2, 2, 2, 2))); } if (hasJointFeeback) { for (dgInt32 i = 0; i < m_jointCount; i++) { if (constraintArray[i].m_joint->m_updaFeedbackCallback) { constraintArray[i].m_joint->m_updaFeedbackCallback( *constraintArray[i].m_joint, m_timeStep, m_threadIndex); } } } #endif } void dgJacobianMemory::CalculateForcesGameMode(dgInt32 iterations, dgFloat32 maxAccNorm) const { dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgBodyInfo *bodyArray = m_bodyArray; dgFloat32 *const penetration = m_penetration; const dgFloat32 *const externAccel = m_deltaAccel; const dgFloat32 *const restitution = m_restitution; dgFloat32 *const coordenateAccel = m_coordenateAccel; dgJacobian *const internalVeloc = m_internalVeloc; dgJacobian *const internalForces = m_internalForces; ; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; const dgInt32 *const accelIsMortor = m_accelIsMotor; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const penetrationStiffness = m_penetrationStiffness; const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent; dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); // steps = 4; dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS)); dgFloat32 timeStep = m_timeStep * invStep; dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS); NEWTON_ASSERT(m_bodyArray[0].m_body == m_world->m_sentionelBody); for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = m_bodyArray[i].m_body; body->m_netForce = body->m_veloc; body->m_netTorque = body->m_omega; internalVeloc[i].m_linear = zero; internalVeloc[i].m_angular = zero; internalForces[i].m_linear = zero; internalForces[i].m_angular = zero; } internalVeloc[0].m_linear = zero; internalVeloc[0].m_angular = zero; internalForces[0].m_linear = zero; internalForces[0].m_angular = zero; for (dgInt32 i = 0; i < m_jointCount; i++) { dgJacobian y0; dgJacobian y1; dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPaircount; dgInt32 m0 = constraintArray[i].m_m0; dgInt32 m1 = constraintArray[i].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; dgFloat32 val = force[index]; NEWTON_ASSERT(dgCheckFloat(val)); y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale(val); } internalForces[m0].m_linear += y0.m_linear; internalForces[m0].m_angular += y0.m_angular; internalForces[m1].m_linear += y1.m_linear; internalForces[m1].m_angular += y1.m_angular; } // if (sleepCount) { // for (dgInt32 i = 1; i < m_bodyCount; i ++) { // dgBody* body; // body = m_bodyArray[i].m_body; // if (body->m_equilibrium) // body = m_bodyArray[i].m_body; // } // } dgFloat32 firstPassCoef = dgFloat32(0.0f); dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT; for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) { for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) { dgJointAccelerationDecriptor joindDesc; dgInt32 index = constraintArray[curJoint].m_autoPairstart; joindDesc.m_rowsCount = constraintArray[curJoint].m_autoPaircount; joindDesc.m_timeStep = timeStep; joindDesc.m_invTimeStep = invTimeStep; joindDesc.m_firstPassCoefFlag = firstPassCoef; joindDesc.m_Jt = &Jt[index]; joindDesc.m_penetration = &penetration[index]; joindDesc.m_restitution = &restitution[index]; joindDesc.m_externAccelaration = &externAccel[index]; joindDesc.m_coordenateAccel = &coordenateAccel[index]; joindDesc.m_accelIsMotor = &accelIsMortor[index]; joindDesc.m_normalForceIndex = &normalForceIndex[index]; joindDesc.m_penetrationStiffness = &penetrationStiffness[index]; constraintArray[curJoint].m_joint->JointAccelerations(joindDesc); } firstPassCoef = dgFloat32(1.0f); dgFloat32 accNorm = maxAccNorm * dgFloat32(2.0f); for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { accNorm = dgFloat32(0.0f); for (dgInt32 curJoint = 0; curJoint < m_jointCount; curJoint++) { dgInt32 index = constraintArray[curJoint].m_autoPairstart; dgInt32 rowsCount = constraintArray[curJoint].m_autoPaircount; dgInt32 m0 = constraintArray[curJoint].m_m0; dgInt32 m1 = constraintArray[curJoint].m_m1; dgVector linearM0(internalForces[m0].m_linear); dgVector angularM0(internalForces[m0].m_angular); dgVector linearM1(internalForces[m1].m_linear); dgVector angularM1(internalForces[m1].m_angular); for (dgInt32 k = 0; k < rowsCount; k++) { dgVector acc( JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0)); acc += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(angularM0); acc += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(linearM1); acc += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(angularM1); dgFloat32 a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index]; dgFloat32 f = force[index] + invDJMinvJt[index] * a; dgInt32 frictionIndex = normalForceIndex[index]; NEWTON_ASSERT( ((frictionIndex < 0) && (force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (force[frictionIndex] >= dgFloat32(0.0f)))); dgFloat32 frictionNormal = force[frictionIndex]; dgFloat32 lowerFrictionForce = frictionNormal * lowerFrictionCoef[index]; dgFloat32 upperFrictionForce = frictionNormal * upperFrictionCoef[index]; if (f > upperFrictionForce) { a = dgFloat32(0.0f); f = upperFrictionForce; } else if (f < lowerFrictionForce) { a = dgFloat32(0.0f); f = lowerFrictionForce; } accNorm = GetMax(accNorm, dgAbsf(a)); dgFloat32 prevValue = f - force[index]; force[index] = f; linearM0 += Jt[index].m_jacobian_IM0.m_linear.Scale(prevValue); angularM0 += Jt[index].m_jacobian_IM0.m_angular.Scale(prevValue); linearM1 += Jt[index].m_jacobian_IM1.m_linear.Scale(prevValue); angularM1 += Jt[index].m_jacobian_IM1.m_angular.Scale(prevValue); index++; } internalForces[m0].m_linear = linearM0; internalForces[m0].m_angular = angularM0; internalForces[m1].m_linear = linearM1; internalForces[m1].m_angular = angularM1; } // accNorm = 1.0f; } for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; dgVector forceTmp(body->m_accel + internalForces[i].m_linear); dgVector torque(body->m_alpha + internalForces[i].m_angular); dgVector accel(forceTmp.Scale(body->m_invMass.m_w)); dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(torque)); body->m_veloc += accel.Scale(timeStep); body->m_omega += alpha.Scale(timeStep); internalVeloc[i].m_linear += body->m_veloc; internalVeloc[i].m_angular += body->m_omega; } } dgInt32 hasJointFeeback = 0; for (dgInt32 i = 0; i < m_jointCount; i++) { dgInt32 first = constraintArray[i].m_autoPairstart; dgInt32 count = constraintArray[i].m_autoPaircount; // maxForce = dgFloat32 (0.0f); for (dgInt32 j = 0; j < count; j++) { dgInt32 index = j + first; dgFloat32 val = force[index]; // maxForce = GetMax (dgAbsf (val), maxForce); NEWTON_ASSERT(dgCheckFloat(val)); jointForceFeeback[index][0] = val; } // if (constraintArray[i].m_joint->GetId() == dgContactConstraintId) { // m_world->AddToBreakQueue ((dgContact*)constraintArray[i].m_joint, maxForce); // } hasJointFeeback |= (constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0); // if (constraintArray[i].m_joint->m_updaFeedbackCallback) { // constraintArray[i].m_joint->m_updaFeedbackCallback (*constraintArray[i].m_joint, m_timeStep, m_threadIndex); // } } dgFloat32 maxAccNorm2 = maxAccNorm * maxAccNorm; for (dgInt32 i = 1; i < m_bodyCount; i++) { dgBody *const body = bodyArray[i].m_body; #ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES body->m_veloc = internalVeloc[i].m_linear.Scale(invStep); body->m_omega = internalVeloc[i].m_angular.Scale(invStep); #endif dgVector accel = (body->m_veloc - body->m_netForce).Scale(m_invTimeStep); dgVector alpha = (body->m_omega - body->m_netTorque).Scale(m_invTimeStep); if ((accel % accel) < maxAccNorm2) { accel = zero; } if ((alpha % alpha) < maxAccNorm2) { alpha = zero; } body->m_accel = accel; body->m_alpha = alpha; body->m_netForce = accel.Scale(body->m_mass[3]); alpha = body->m_matrix.UnrotateVector(alpha); body->m_netTorque = body->m_matrix.RotateVector( alpha.CompProduct(body->m_mass)); } if (hasJointFeeback) { for (dgInt32 i = 0; i < m_jointCount; i++) { if (constraintArray[i].m_joint->m_updaFeedbackCallback) { constraintArray[i].m_joint->m_updaFeedbackCallback( reinterpret_cast(constraintArray[i].m_joint), m_timeStep, m_threadIndex); } } } } //////////////////////////////////////////////////////////////////////////////////////// // // Parallel solver // //////////////////////////////////////////////////////////////////////////////////////// void dgParallelSolverBodyInertia::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); if (m_useSimd) { for (dgInt32 j = 0; j < m_count; j++) { dgBody *body; body = m_bodyArray[j].m_body; NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f)); body->AddDamingAcceleration(); body->CalcInvInertiaMatrixSimd(); } } else { for (dgInt32 j = 0; j < m_count; j++) { dgBody *body; body = m_bodyArray[j].m_body; NEWTON_ASSERT(body->m_invMass.m_w > dgFloat32(0.0f)); body->AddDamingAcceleration(); body->CalcInvInertiaMatrix(); } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverBuildJacobianRows::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; dgJacobianPair *const JMinv = m_JMinv; dgFloat32 *const diagDamp = m_diagDamp; dgFloat32 *const extAccel = m_deltaAccel; dgFloat32 *const invDJMinvJt = m_invDJMinvJt; dgFloat32 *const coordenateAccel = m_coordenateAccel; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; // dgInt32* const accelIsMotor = solverMemory.m_accelIsMotor; dgBodyInfo *const bodyArray = m_bodyArray; dgJointInfo *const constraintArray = m_constraintArray; if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE simd_type zero; zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 k = 0; k < m_count; k++) { dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 count; dgFloat32 diag; dgFloat32 stiffness; simd_type tmp0; simd_type tmp1; simd_type invMass0; simd_type invMass1; simd_type tmpDiag; simd_type tmpAccel; dgBody *body0; dgBody *body1; index = constraintArray[k].m_autoPairstart; count = constraintArray[k].m_autoPaircount; m0 = constraintArray[k].m_m0; m1 = constraintArray[k].m_m1; NEWTON_ASSERT(m0 >= 0); NEWTON_ASSERT(m0 < m_bodyCount); NEWTON_ASSERT(m1 >= 0); NEWTON_ASSERT(m1 < m_bodyCount); body0 = bodyArray[m0].m_body; // invMass0 = body0->m_invMass[3]; invMass0 = simd_set1(body0->m_invMass[3]); NEWTON_ASSERT((dgUnsigned64(&body0->m_invWorldInertiaMatrix) & 0x0f) == 0); const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix; body1 = bodyArray[m1].m_body; // invMass1 = body1->m_invMass[3]; invMass1 = simd_set1(body1->m_invMass[3]); NEWTON_ASSERT((dgUnsigned64(&body1->m_invWorldInertiaMatrix) & 0x0f) == 0); const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix; for (dgInt32 i = 0; i < count; i++) { // JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale (invMass0); // JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector (Jt[index].m_jacobian_IM0.m_angular); // dgVector tmpDiag (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(Jt[index].m_jacobian_IM0.m_linear)); // tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(Jt[index].m_jacobian_IM0.m_angular); // dgVector tmpAccel (JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel)); // tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct(body0->m_alpha); ((simd_type &)JMinv[index].m_jacobian_IM0.m_linear) = simd_mul_v((simd_type &)Jt[index].m_jacobian_IM0.m_linear, invMass0); tmp0 = (simd_type &)Jt[index].m_jacobian_IM0.m_angular; tmp1 = simd_mul_v((simd_type &)invInertia0.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0))); tmp1 = simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1))); ((simd_type &)JMinv[index].m_jacobian_IM0.m_angular) = simd_mul_add_v(tmp1, (simd_type &)invInertia0.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2))); tmpDiag = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)Jt[index].m_jacobian_IM0.m_linear); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)Jt[index].m_jacobian_IM0.m_angular); tmpAccel = simd_mul_v((simd_type &)JMinv[index].m_jacobian_IM0.m_linear, (simd_type &)body0->m_accel); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM0.m_angular, (simd_type &)body0->m_alpha); // JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale (invMass1); // JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector (Jt[index].m_jacobian_IM1.m_angular); // tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(Jt[index].m_jacobian_IM1.m_linear); // tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(Jt[index].m_jacobian_IM1.m_angular); // tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct(body1->m_accel); // tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct(body1->m_alpha); ((simd_type &)JMinv[index].m_jacobian_IM1.m_linear) = simd_mul_v((simd_type &)Jt[index].m_jacobian_IM1.m_linear, invMass1); tmp0 = (simd_type &)Jt[index].m_jacobian_IM1.m_angular; tmp1 = simd_mul_v((simd_type &)invInertia1.m_front, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 0, 0, 0))); tmp1 = simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_up, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 1, 1, 1))); ((simd_type &)JMinv[index].m_jacobian_IM1.m_angular) = simd_mul_add_v(tmp1, (simd_type &)invInertia1.m_right, simd_permut_v(tmp0, tmp0, PURMUT_MASK(3, 2, 2, 2))); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)Jt[index].m_jacobian_IM1.m_linear); tmpDiag = simd_mul_add_v(tmpDiag, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)Jt[index].m_jacobian_IM1.m_angular); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_linear, (simd_type &)body1->m_accel); tmpAccel = simd_mul_add_v(tmpAccel, (simd_type &)JMinv[index].m_jacobian_IM1.m_angular, (simd_type &)body1->m_alpha); // coordenateAccel[index] -= (tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z); tmpAccel = simd_add_v(tmpAccel, simd_move_hl_v(tmpAccel, tmpAccel)); tmpAccel = simd_sub_s(zero, simd_add_s(tmpAccel, simd_permut_v(tmpAccel, tmpAccel, PURMUT_MASK(3, 3, 3, 1)))); simd_store_s(tmpAccel, &extAccel[index]); simd_store_s(simd_add_s(simd_load_s(coordenateAccel[index]), tmpAccel), &coordenateAccel[index]); // force[index] = bilateralForceBounds[index].m_jointForce[0]; force[index] = jointForceFeeback[index][0]; NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f)); NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f)); stiffness = DG_PSD_DAMP_TOL * diagDamp[index]; // diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z); tmpDiag = simd_add_v(tmpDiag, simd_move_hl_v(tmpDiag, tmpDiag)); simd_store_s( simd_add_s(tmpDiag, simd_permut_v(tmpDiag, tmpDiag, PURMUT_MASK(3, 3, 3, 1))), &diag); NEWTON_ASSERT(diag > dgFloat32(0.0f)); diagDamp[index] = diag * stiffness; diag *= (dgFloat32(1.0f) + stiffness); invDJMinvJt[index] = dgFloat32(1.0f) / diag; index++; } } #endif } else { for (dgInt32 k = 0; k < m_count; k++) { dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 count; dgFloat32 invMass0; dgFloat32 invMass1; dgBody *body0; dgBody *body1; index = constraintArray[k].m_autoPairstart; count = constraintArray[k].m_autoPaircount; m0 = constraintArray[k].m_m0; m1 = constraintArray[k].m_m1; NEWTON_ASSERT(m0 >= 0); NEWTON_ASSERT(m0 < m_bodyCount); body0 = bodyArray[m0].m_body; invMass0 = body0->m_invMass[3]; const dgMatrix &invInertia0 = body0->m_invWorldInertiaMatrix; NEWTON_ASSERT(m1 >= 0); NEWTON_ASSERT(m1 < m_bodyCount); body1 = bodyArray[m1].m_body; invMass1 = body1->m_invMass[3]; const dgMatrix &invInertia1 = body1->m_invWorldInertiaMatrix; for (dgInt32 i = 0; i < count; i++) { dgFloat32 diag; dgFloat32 stiffness; dgFloat32 extenalAcceleration; JMinv[index].m_jacobian_IM0.m_linear = Jt[index].m_jacobian_IM0.m_linear.Scale(invMass0); JMinv[index].m_jacobian_IM0.m_angular = invInertia0.UnrotateVector( Jt[index].m_jacobian_IM0.m_angular); dgVector tmpDiag( JMinv[index].m_jacobian_IM0.m_linear.CompProduct( Jt[index].m_jacobian_IM0.m_linear)); tmpDiag += JMinv[index].m_jacobian_IM0.m_angular.CompProduct( Jt[index].m_jacobian_IM0.m_angular); dgVector tmpAccel( JMinv[index].m_jacobian_IM0.m_linear.CompProduct(body0->m_accel)); tmpAccel += JMinv[index].m_jacobian_IM0.m_angular.CompProduct( body0->m_alpha); JMinv[index].m_jacobian_IM1.m_linear = Jt[index].m_jacobian_IM1.m_linear.Scale(invMass1); JMinv[index].m_jacobian_IM1.m_angular = invInertia1.UnrotateVector( Jt[index].m_jacobian_IM1.m_angular); tmpDiag += JMinv[index].m_jacobian_IM1.m_linear.CompProduct( Jt[index].m_jacobian_IM1.m_linear); tmpDiag += JMinv[index].m_jacobian_IM1.m_angular.CompProduct( Jt[index].m_jacobian_IM1.m_angular); tmpAccel += JMinv[index].m_jacobian_IM1.m_linear.CompProduct( body1->m_accel); tmpAccel += JMinv[index].m_jacobian_IM1.m_angular.CompProduct( body1->m_alpha); extenalAcceleration = -(tmpAccel.m_x + tmpAccel.m_y + tmpAccel.m_z); extAccel[index] = extenalAcceleration; coordenateAccel[index] += extenalAcceleration; force[index] = jointForceFeeback[index][0]; NEWTON_ASSERT(diagDamp[index] >= dgFloat32(0.1f)); NEWTON_ASSERT(diagDamp[index] <= dgFloat32(100.0f)); stiffness = DG_PSD_DAMP_TOL * diagDamp[index]; diag = (tmpDiag.m_x + tmpDiag.m_y + tmpDiag.m_z); NEWTON_ASSERT(diag > dgFloat32(0.0f)); diagDamp[index] = diag * stiffness; diag *= (dgFloat32(1.0f) + stiffness); // solverMemory.m_diagJMinvJt[index] = diag; invDJMinvJt[index] = dgFloat32(1.0f) / diag; index++; } } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverBuildJacobianMatrix::ThreadExecute() { // dgInt32 count; // dgInt32 rowCount; // dgInt32 jointSolved; dgContraintDescritor constraintParams; dgJointInfo *const constraintArray = m_constraintArray; // ticks = m_world->m_getPerformanceCount(); constraintParams.m_world = m_world; constraintParams.m_threadIndex = m_threadIndex; constraintParams.m_timestep = m_timestep; constraintParams.m_invTimestep = m_invTimestep; dgJacobianMemory &solverMemory = *m_solverMemory; dgInt32 count = m_count; dgInt32 jointSolved = m_jointSolved; for (dgInt32 j = 0; j < count; j++) { dgConstraint *const constraint = constraintArray[j].m_joint; if (constraint->m_isUnilateral ^ m_bitMode) { dgInt32 dof = dgInt32(constraint->m_maxDOF); jointSolved++; NEWTON_ASSERT(dof <= DG_CONSTRAINT_MAX_ROWS); for (dgInt32 i = 0; i < dof; i++) { constraintParams.m_forceBounds[i].m_low = DG_MIN_BOUND; constraintParams.m_forceBounds[i].m_upper = DG_MAX_BOUND; constraintParams.m_forceBounds[i].m_jointForce = NULL; constraintParams.m_forceBounds[i].m_normalIndex = DG_BILATERAL_CONSTRAINT; } NEWTON_ASSERT(constraint->m_body0); NEWTON_ASSERT(constraint->m_body1); constraint->m_body0->m_inCallback = true; constraint->m_body1->m_inCallback = true; dof = dgInt32(constraint->JacobianDerivative(constraintParams)); constraint->m_body0->m_inCallback = false; constraint->m_body1->m_inCallback = false; dgInt32 m0 = (constraint->m_body0->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body0->m_index : 0; dgInt32 m1 = (constraint->m_body1->m_invMass.m_w != dgFloat32(0.0f)) ? constraint->m_body1->m_index : 0; m_world->dgGetUserLock(); dgInt32 rowCount = m_rowsCount[0]; m_rowsCount[0] = rowCount + ((dof & (DG_SIMD_WORD_SIZE - 1)) ? ((dof & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : dof); m_world->dgReleasedUserLock(); constraintArray[j].m_autoPairstart = rowCount; constraintArray[j].m_autoPaircount = dgInt32(dof); constraintArray[j].m_autoPairActiveCount = dgInt32(dof); constraintArray[j].m_m0 = m0; constraintArray[j].m_m1 = m1; for (dgInt32 i = 0; i < dof; i++) { NEWTON_ASSERT(constraintParams.m_forceBounds[i].m_jointForce); solverMemory.m_Jt[rowCount] = constraintParams.m_jacobian[i]; NEWTON_ASSERT(constraintParams.m_jointStiffness[i] >= dgFloat32(0.1f)); NEWTON_ASSERT(constraintParams.m_jointStiffness[i] <= dgFloat32(100.0f)); solverMemory.m_diagDamp[rowCount] = constraintParams.m_jointStiffness[i]; solverMemory.m_coordenateAccel[rowCount] = constraintParams.m_jointAccel[i]; solverMemory.m_accelIsMotor[rowCount] = dgInt32( constraintParams.m_isMotor[i]); solverMemory.m_restitution[rowCount] = constraintParams.m_restitution[i]; solverMemory.m_penetration[rowCount] = constraintParams.m_penetration[i]; solverMemory.m_penetrationStiffness[rowCount] = constraintParams.m_penetrationStiffness[i]; solverMemory.m_lowerBoundFrictionCoefficent[rowCount] = constraintParams.m_forceBounds[i].m_low; solverMemory.m_upperBoundFrictionCoefficent[rowCount] = constraintParams.m_forceBounds[i].m_upper; solverMemory.m_jointFeebackForce[rowCount] = constraintParams.m_forceBounds[i].m_jointForce; solverMemory.m_normalForceIndex[rowCount] = constraintParams.m_forceBounds[i].m_normalIndex + ((constraintParams.m_forceBounds[i].m_normalIndex >= 0) ? (rowCount - i) : 0); rowCount++; } #ifdef _DEBUG for (dgInt32 i = 0; i < ((rowCount + 3) & 0xfffc) - rowCount; i++) { solverMemory.m_diagDamp[rowCount + i] = dgFloat32(0.0f); solverMemory.m_coordenateAccel[rowCount + i] = dgFloat32(0.0f); solverMemory.m_restitution[rowCount + i] = dgFloat32(0.0f); solverMemory.m_penetration[rowCount + i] = dgFloat32(0.0f); solverMemory.m_penetrationStiffness[rowCount + i] = dgFloat32(0.0f); solverMemory.m_lowerBoundFrictionCoefficent[rowCount + i] = dgFloat32( 0.0f); solverMemory.m_upperBoundFrictionCoefficent[rowCount + i] = dgFloat32( 0.0f); solverMemory.m_jointFeebackForce[rowCount + i] = 0; solverMemory.m_normalForceIndex[rowCount + i] = 0; } #endif // rowCount = (rowCount & (DG_SIMD_WORD_SIZE - 1)) ? ((rowCount & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : rowCount; // NEWTON_ASSERT ((rowCount & (DG_SIMD_WORD_SIZE - 1)) == 0); } } m_jointSolved = jointSolved; // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } dgInt32 dgWorldDynamicUpdate::GetJacobialDerivativesParallel( const dgIsland &island, bool bitMode, dgInt32 rowCount, dgFloat32 timestep) { dgInt32 chunkSizes[DG_MAXIMUN_THREADS]; dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart]; dgInt32 acc = 0; dgInt32 threadCounts = dgInt32(m_world->m_numberOfTheads); dgInt32 jointCount = island.m_jointCount; m_world->m_threadsManager.CalculateChunkSizes(jointCount, chunkSizes); for (dgInt32 threadIndex = 0; threadIndex < threadCounts; threadIndex++) { m_parallelSolverBuildJacobianMatrix[threadIndex].m_bitMode = bitMode; m_parallelSolverBuildJacobianMatrix[threadIndex].m_world = m_world; m_parallelSolverBuildJacobianMatrix[threadIndex].m_jointSolved = 0; m_parallelSolverBuildJacobianMatrix[threadIndex].m_count = chunkSizes[threadIndex]; m_parallelSolverBuildJacobianMatrix[threadIndex].m_timestep = timestep; m_parallelSolverBuildJacobianMatrix[threadIndex].m_invTimestep = dgFloat32( 1.0f / timestep); m_parallelSolverBuildJacobianMatrix[threadIndex].m_threadIndex = threadIndex; m_parallelSolverBuildJacobianMatrix[threadIndex].m_rowsCount = &rowCount; m_parallelSolverBuildJacobianMatrix[threadIndex].m_constraintArray = &constraintArray[acc]; m_parallelSolverBuildJacobianMatrix[threadIndex].m_solverMemory = &m_solverMemory[0]; m_world->m_threadsManager.SubmitJob( &m_parallelSolverBuildJacobianMatrix[threadIndex]); acc += chunkSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); return rowCount; } void dgWorldDynamicUpdate::BuildJacobianMatrixParallel(const dgIsland &island, dgFloat32 timestep, dgInt32 archModel) { NEWTON_ASSERT(island.m_bodyCount >= 2); //NEWTON_ASSERT (island.m_jointCount >= 1); dgInt32 threads = dgInt32(m_world->m_numberOfTheads); dgJacobianMemory &solverMemory = m_solverMemory[0]; dgInt32 bodyCount = island.m_bodyCount; dgBodyInfo *const bodyArray = &m_bodyArray[island.m_bodyStart]; bodyArray[0].m_body = m_world->GetSentinelBody(); NEWTON_ASSERT( (bodyArray[0].m_body->m_accel % bodyArray[0].m_body->m_accel) == dgFloat32(0.0f)); NEWTON_ASSERT( (bodyArray[0].m_body->m_alpha % bodyArray[0].m_body->m_alpha) == dgFloat32(0.0f)); dgInt32 acc = 0; dgInt32 chunkSizes[DG_MAXIMUN_THREADS]; m_world->m_threadsManager.CalculateChunkSizes(bodyCount - 1, chunkSizes); for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { m_parallelBodyInertiaMatrix[threadIndex].m_useSimd = archModel; m_parallelBodyInertiaMatrix[threadIndex].m_bodyArray = &bodyArray[acc + 1]; m_parallelBodyInertiaMatrix[threadIndex].m_threadIndex = threadIndex; m_parallelBodyInertiaMatrix[threadIndex].m_world = m_world; m_parallelBodyInertiaMatrix[threadIndex].m_count = chunkSizes[threadIndex]; m_world->m_threadsManager.SubmitJob( &m_parallelBodyInertiaMatrix[threadIndex]); acc += chunkSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); while (bodyCount > solverMemory.m_maxBodiesCount) { m_world->dgGetUserLock(); ReallocIntenalForcesMemory(bodyCount, 0); m_world->dgReleasedUserLock(); } dgInt32 jointCount = island.m_jointCount; dgJointInfo *const constraintArray = &m_constraintArray[island.m_jointStart]; solverMemory.m_constraintArray = constraintArray; dgInt32 maxRowCount = 0; for (dgInt32 j = 0; j < jointCount; j++) { dgConstraint *const constraint = constraintArray[j].m_joint; ; dgInt32 dof = dgInt32( (constraint->m_maxDOF & (DG_SIMD_WORD_SIZE - 1)) ? ((constraint->m_maxDOF & (-DG_SIMD_WORD_SIZE)) + DG_SIMD_WORD_SIZE) : constraint->m_maxDOF); maxRowCount += dof; } while (maxRowCount > solverMemory.m_maxJacobiansCount) { m_world->dgGetUserLock(); ReallocJacobiansMemory(solverMemory.m_maxJacobiansCount * 2, 0); m_world->dgReleasedUserLock(); } dgInt32 rowCount = 0; if (island.m_hasUnilateralJoints) { rowCount = GetJacobialDerivativesParallel(island, false, rowCount, timestep); } rowCount = GetJacobialDerivativesParallel(island, true, rowCount, timestep); solverMemory.m_rowCount = rowCount; solverMemory.m_bodyCount = bodyCount; solverMemory.m_bodyArray = bodyArray; solverMemory.m_jointCount = jointCount; solverMemory.m_timeStep = timestep; solverMemory.m_invTimeStep = dgFloat32(1.0f) / solverMemory.m_timeStep; acc = 0; m_world->m_threadsManager.CalculateChunkSizes(jointCount, chunkSizes); for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { m_parallelSolverBuildJacobianRows[threadIndex].m_useSimd = archModel; m_parallelSolverBuildJacobianRows[threadIndex].m_bodyCount = bodyCount; m_parallelSolverBuildJacobianRows[threadIndex].m_bodyArray = bodyArray; m_parallelSolverBuildJacobianRows[threadIndex].m_threadIndex = threadIndex; m_parallelSolverBuildJacobianRows[threadIndex].m_world = m_world; m_parallelSolverBuildJacobianRows[threadIndex].m_count = chunkSizes[threadIndex]; m_parallelSolverBuildJacobianRows[threadIndex].m_force = solverMemory.m_force; m_parallelSolverBuildJacobianRows[threadIndex].m_Jt = solverMemory.m_Jt; m_parallelSolverBuildJacobianRows[threadIndex].m_JMinv = solverMemory.m_JMinv; m_parallelSolverBuildJacobianRows[threadIndex].m_diagDamp = solverMemory.m_diagDamp; m_parallelSolverBuildJacobianRows[threadIndex].m_deltaAccel = solverMemory.m_deltaAccel; m_parallelSolverBuildJacobianRows[threadIndex].m_invDJMinvJt = solverMemory.m_invDJMinvJt; m_parallelSolverBuildJacobianRows[threadIndex].m_coordenateAccel = solverMemory.m_coordenateAccel; m_parallelSolverBuildJacobianRows[threadIndex].m_jointFeebackForce = solverMemory.m_jointFeebackForce; m_parallelSolverBuildJacobianRows[threadIndex].m_constraintArray = &constraintArray[acc]; m_world->m_threadsManager.SubmitJob( &m_parallelSolverBuildJacobianRows[threadIndex]); acc += chunkSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); } void dgParallelSolverClear::ThreadExecute() { if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE simd_type zero; zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; body = m_bodyArray[i].m_body; ((simd_type &)body->m_netForce) = zero; ((simd_type &)body->m_netTorque) = zero; ((simd_type &)m_internalVeloc[i].m_linear) = zero; ((simd_type &)m_internalVeloc[i].m_angular) = zero; ((simd_type &)m_internalForces[i].m_linear) = zero; ((simd_type &)m_internalForces[i].m_angular) = zero; } #endif } else { dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; body = m_bodyArray[i].m_body; body->m_netForce = body->m_veloc; body->m_netTorque = body->m_omega; m_internalVeloc[i].m_linear = zero; m_internalVeloc[i].m_angular = zero; m_internalForces[i].m_linear = zero; m_internalForces[i].m_angular = zero; } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverInitInternalForces::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE simd_type zero; zero = simd_set1(dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i++) { dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; simd_type y0_linear; simd_type y0_angular; simd_type y1_linear; simd_type y1_angular; first = m_constraintArray[i].m_autoPairstart; count = m_constraintArray[i].m_autoPairActiveCount; m0 = m_constraintArray[i].m_m0; m1 = m_constraintArray[i].m_m1; // dgJacobian y0 (internalForces[k0]); // dgJacobian y1 (internalForces[k1]); y0_linear = zero; y0_angular = zero; y1_linear = zero; y1_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index; simd_type tmp0; index = j + first; // val = force[index]; tmp0 = simd_set1(m_force[index]); // y0.m_linear += Jt[index].m_jacobian_IM0.m_linear.Scale (val); // y0.m_angular += Jt[index].m_jacobian_IM0.m_angular.Scale (val); // y1.m_linear += Jt[index].m_jacobian_IM1.m_linear.Scale (val); // y1.m_angular += Jt[index].m_jacobian_IM1.m_angular.Scale (val); y0_linear = simd_mul_add_v(y0_linear, (simd_type &)m_Jt[index].m_jacobian_IM0.m_linear, tmp0); y0_angular = simd_mul_add_v(y0_angular, (simd_type &)m_Jt[index].m_jacobian_IM0.m_angular, tmp0); y1_linear = simd_mul_add_v(y1_linear, (simd_type &)m_Jt[index].m_jacobian_IM1.m_linear, tmp0); y1_angular = simd_mul_add_v(y1_angular, (simd_type &)m_Jt[index].m_jacobian_IM1.m_angular, tmp0); } // internalForces[k0] = y0; // internalForces[k1] = y1; // m_world->dgGetUserLock(); m_world->dgGetIndirectLock(&m_locks[m0]); (simd_type &)m_internalForces[m0].m_linear = simd_add_v((simd_type &)m_internalForces[m0].m_linear, y0_linear); (simd_type &)m_internalForces[m0].m_angular = simd_add_v((simd_type &)m_internalForces[m0].m_angular, y0_angular); m_world->dgReleaseIndirectLock(&m_locks[m0]); m_world->dgGetIndirectLock(&m_locks[m1]); (simd_type &)m_internalForces[m1].m_linear = simd_add_v((simd_type &)m_internalForces[m1].m_linear, y1_linear); (simd_type &)m_internalForces[m1].m_angular = simd_add_v((simd_type &)m_internalForces[m1].m_angular, y1_angular); m_world->dgReleaseIndirectLock(&m_locks[m1]); // m_world->dgReleasedUserLock(); } #endif } else { dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i++) { dgInt32 m0; dgInt32 m1; dgInt32 first; dgInt32 count; dgFloat32 val; dgJacobian y0; dgJacobian y1; first = m_constraintArray[i].m_autoPairstart; count = m_constraintArray[i].m_autoPaircount; m0 = m_constraintArray[i].m_m0; m1 = m_constraintArray[i].m_m1; y0.m_linear = zero; y0.m_angular = zero; y1.m_linear = zero; y1.m_angular = zero; for (dgInt32 j = 0; j < count; j++) { dgInt32 index; index = j + first; val = m_force[index]; NEWTON_ASSERT(dgCheckFloat(val)); y0.m_linear += m_Jt[index].m_jacobian_IM0.m_linear.Scale(val); y0.m_angular += m_Jt[index].m_jacobian_IM0.m_angular.Scale(val); y1.m_linear += m_Jt[index].m_jacobian_IM1.m_linear.Scale(val); y1.m_angular += m_Jt[index].m_jacobian_IM1.m_angular.Scale(val); } m_world->dgGetIndirectLock(&m_locks[m0]); m_internalForces[m0].m_linear += y0.m_linear; m_internalForces[m0].m_angular += y0.m_angular; m_world->dgReleaseIndirectLock(&m_locks[m0]); m_world->dgGetIndirectLock(&m_locks[m1]); m_internalForces[m1].m_linear += y1.m_linear; m_internalForces[m1].m_angular += y1.m_angular; m_world->dgReleaseIndirectLock(&m_locks[m1]); } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverJointAcceleration::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); if (m_useSimd) { dgInt32 currJoint; currJoint = m_jointStart; for (dgInt32 i = 0; i < m_count; i++) { dgInt32 index; dgJointAccelerationDecriptor joindDesc; index = m_constraintArray[currJoint].m_autoPairstart; joindDesc.m_rowsCount = m_constraintArray[currJoint].m_autoPaircount; joindDesc.m_timeStep = m_timeStep; joindDesc.m_invTimeStep = m_invTimeStep; joindDesc.m_firstPassCoefFlag = m_firstPassCoef; joindDesc.m_Jt = &m_Jt[index]; joindDesc.m_penetration = &m_penetration[index]; joindDesc.m_restitution = &m_restitution[index]; joindDesc.m_externAccelaration = &m_externAccel[index]; joindDesc.m_coordenateAccel = &m_coordenateAccel[index]; joindDesc.m_accelIsMotor = &m_accelIsMortor[index]; joindDesc.m_normalForceIndex = &m_normalForceIndex[index]; joindDesc.m_penetrationStiffness = &m_penetrationStiffness[index]; m_constraintArray[currJoint].m_joint->JointAccelerationsSimd(joindDesc); currJoint++; } } else { dgInt32 currJoint; currJoint = m_jointStart; for (dgInt32 i = 0; i < m_count; i++) { dgInt32 index; dgJointAccelerationDecriptor joindDesc; index = m_constraintArray[currJoint].m_autoPairstart; joindDesc.m_rowsCount = m_constraintArray[currJoint].m_autoPaircount; joindDesc.m_timeStep = m_timeStep; joindDesc.m_invTimeStep = m_invTimeStep; joindDesc.m_firstPassCoefFlag = m_firstPassCoef; joindDesc.m_Jt = &m_Jt[index]; joindDesc.m_penetration = &m_penetration[index]; joindDesc.m_restitution = &m_restitution[index]; joindDesc.m_externAccelaration = &m_externAccel[index]; joindDesc.m_coordenateAccel = &m_coordenateAccel[index]; joindDesc.m_accelIsMotor = &m_accelIsMortor[index]; joindDesc.m_normalForceIndex = &m_normalForceIndex[index]; joindDesc.m_penetrationStiffness = &m_penetrationStiffness[index]; m_constraintArray[currJoint].m_joint->JointAccelerations(joindDesc); currJoint++; } } m_firstPassCoef = dgFloat32(1.0f); // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverUpdateVeloc::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE simd_type timeStepSimd; timeStepSimd = simd_set1(m_timeStep); for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; simd_type force; simd_type torque; simd_type accel; simd_type alpha; body = m_bodyArray[i].m_body; force = simd_add_v((simd_type &)body->m_accel, (simd_type &)m_internalForces[i].m_linear); torque = simd_add_v((simd_type &)body->m_alpha, (simd_type &)m_internalForces[i].m_angular); // dgVector accel (force.Scale (body->m_invMass.m_w)); accel = simd_mul_v(force, simd_set1(body->m_invMass.m_w)); // dgVector alpha (body->m_invWorldInertiaMatrix.RotateVector (torque)); alpha = simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_invWorldInertiaMatrix[0], simd_permut_v(torque, torque, PURMUT_MASK(0, 0, 0, 0))), (simd_type &)body->m_invWorldInertiaMatrix[1], simd_permut_v(torque, torque, PURMUT_MASK(1, 1, 1, 1))), (simd_type &)body->m_invWorldInertiaMatrix[2], simd_permut_v(torque, torque, PURMUT_MASK(2, 2, 2, 2))); // body->m_veloc += accel.Scale(timeStep); (simd_type &)body->m_veloc = simd_mul_add_v((simd_type &)body->m_veloc, accel, timeStepSimd); // body->m_omega += alpha.Scale(timeStep); (simd_type &)body->m_omega = simd_mul_add_v((simd_type &)body->m_omega, alpha, timeStepSimd); // body->m_netForce += body->m_veloc; (simd_type &)m_internalVeloc[i].m_linear = simd_add_v((simd_type &)m_internalVeloc[i].m_linear, (simd_type &)body->m_veloc); // body->m_netTorque += body->m_omega; (simd_type &)m_internalVeloc[i].m_angular = simd_add_v((simd_type &)m_internalVeloc[i].m_angular, (simd_type &)body->m_omega); } #endif } else { for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; body = m_bodyArray[i].m_body; dgVector force(body->m_accel + m_internalForces[i].m_linear); dgVector torque(body->m_alpha + m_internalForces[i].m_angular); dgVector accel(force.Scale(body->m_invMass.m_w)); dgVector alpha(body->m_invWorldInertiaMatrix.RotateVector(torque)); body->m_veloc += accel.Scale(m_timeStep); body->m_omega += alpha.Scale(m_timeStep); m_internalVeloc[i].m_linear += body->m_veloc; m_internalVeloc[i].m_angular += body->m_omega; } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverUpdateForce::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE simd_type invStepSimd; simd_type invTimeStepSimd; simd_type accelerationTolerance; // signMask = simd_set1 (invStep); invStepSimd = simd_set1(m_invStep); invTimeStepSimd = simd_set1(m_invTimeStep); accelerationTolerance = simd_set1(m_maxAccNorm2); for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; simd_type tmp; simd_type accel; simd_type alpha; body = m_bodyArray[i].m_body; #ifdef DG_WIGHT_FINAL_RK4_DERIVATIVES // body->m_veloc = internalVeloc[i].m_linear.Scale(invStep); // body->m_omega = internalVeloc[i].m_angular.Scale(invStep); (simd_type &)body->m_veloc = simd_mul_v((simd_type &)m_internalVeloc[i].m_linear, invStepSimd); (simd_type &)body->m_omega = simd_mul_v((simd_type &)m_internalVeloc[i].m_angular, invStepSimd); #endif // dgVector accel = (body->m_veloc - body->m_netForce).Scale (m_invTimeStep); // dgVector alpha = (body->m_omega - body->m_netTorque).Scale (m_invTimeStep); accel = simd_mul_v(simd_sub_v((simd_type &)body->m_veloc, (simd_type &)body->m_netForce), invTimeStepSimd); alpha = simd_mul_v(simd_sub_v((simd_type &)body->m_omega, (simd_type &)body->m_netTorque), invTimeStepSimd); // if ((accel % accel) < maxAccNorm2) { // accel = zero; // } // body->m_accel = accel; // body->m_netForce = accel.Scale (body->m_mass[3]); tmp = simd_mul_v(accel, accel); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, accelerationTolerance); (simd_type &)body->m_accel = simd_andnot_v(accel, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0))); (simd_type &)body->m_netForce = simd_mul_v((simd_type &)body->m_accel, simd_set1(body->m_mass[3])); // if ((alpha % alpha) < maxAccNorm2) { // alpha = zero; // } // body->m_alpha = alpha; tmp = simd_mul_v(alpha, alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_cmplt_s(tmp, accelerationTolerance); (simd_type &)body->m_alpha = simd_andnot_v(alpha, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 0))); // alpha = body->m_matrix.UnrotateVector(alpha); alpha = simd_mul_v((simd_type &)body->m_matrix[0], (simd_type &)body->m_alpha); alpha = simd_add_v(alpha, simd_move_hl_v(alpha, alpha)); alpha = simd_add_s(alpha, simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 1))); tmp = simd_mul_v((simd_type &)body->m_matrix[1], (simd_type &)body->m_alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); alpha = simd_pack_lo_v(alpha, tmp); tmp = simd_mul_v((simd_type &)body->m_matrix[2], (simd_type &)body->m_alpha); tmp = simd_add_v(tmp, simd_move_hl_v(tmp, tmp)); tmp = simd_add_s(tmp, simd_permut_v(tmp, tmp, PURMUT_MASK(0, 0, 0, 1))); alpha = simd_permut_v(alpha, tmp, PURMUT_MASK(3, 0, 1, 0)); // body->m_netTorque = body->m_matrix.RotateVector (alpha.CompProduct(body->m_mass)); alpha = simd_mul_v(alpha, (simd_type &)body->m_mass); (simd_type &)body->m_netTorque = simd_mul_add_v(simd_mul_add_v(simd_mul_v((simd_type &)body->m_matrix[0], simd_permut_v(alpha, alpha, PURMUT_MASK(0, 0, 0, 0))), (simd_type &)body->m_matrix[1], simd_permut_v(alpha, alpha, PURMUT_MASK(1, 1, 1, 1))), (simd_type &)body->m_matrix[2], simd_permut_v(alpha, alpha, PURMUT_MASK(2, 2, 2, 2))); } #endif } else { dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i++) { dgBody *body; body = m_bodyArray[i].m_body; body->m_veloc = m_internalVeloc[i].m_linear.Scale(m_invStep); body->m_omega = m_internalVeloc[i].m_angular.Scale(m_invStep); dgVector accel = (body->m_veloc - body->m_netForce).Scale(m_invTimeStep); dgVector alpha = (body->m_omega - body->m_netTorque).Scale(m_invTimeStep); if ((accel % accel) < m_maxAccNorm2) { accel = zero; } if ((alpha % alpha) < m_maxAccNorm2) { alpha = zero; } body->m_accel = accel; body->m_alpha = alpha; body->m_netForce = accel.Scale(body->m_mass[3]); alpha = body->m_matrix.UnrotateVector(alpha); body->m_netTorque = body->m_matrix.RotateVector( alpha.CompProduct(body->m_mass)); } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverInitFeedbackUpdate::ThreadExecute() { // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); for (dgInt32 i = 0; i < m_count; i++) { dgInt32 first; dgInt32 count; dgInt32 index; dgFloat32 val; // dgFloat32 maxForce; // maxForce = dgFloat32 (0.0f); first = m_constraintArray[i].m_autoPairstart; count = m_constraintArray[i].m_autoPaircount; for (dgInt32 j = 0; j < count; j++) { index = j + first; val = m_force[index]; // maxForce = GetMax(dgAbsf (val), maxForce); NEWTON_ASSERT(dgCheckFloat(val)); m_jointForceFeeback[index][0] = val; } m_hasJointFeeback |= (m_constraintArray[i].m_joint->m_updaFeedbackCallback ? 1 : 0); // if (m_constraintArray[i].m_joint->m_updaFeedbackCallback) { // m_constraintArray[i].m_joint->m_updaFeedbackCallback (*m_constraintArray[i].m_joint, m_timestep, m_threadIndex); // } // if (m_constraintArray[i].m_joint->GetId() == dgContactConstraintId) { // dgInt32 m0; // dgInt32 m1; // m0 = m_constraintArray[i].m_m0; // m1 = m_constraintArray[i].m_m1; // m_world->dgGetIndirectLock(&m_locks[m0]); // m_world->AddToBreakQueue ((dgContact*)m_constraintArray[i].m_joint, m_constraintArray[i].m_joint->m_body0, maxForce); // m_world->dgReleaseIndirectLock(&m_locks[m0]); // m_world->dgGetIndirectLock(&m_locks[m1]); // m_world->AddToBreakQueue ((dgContact*)m_constraintArray[i].m_joint, m_constraintArray[i].m_joint->m_body1, maxForce); // m_world->dgReleaseIndirectLock(&m_locks[m1]); // } } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgParallelSolverCalculateForces::ThreadExecute() { // dgInt32 threadIndex; // dgUnsigned32 ticks; // ticks = m_world->m_getPerformanceCount(); // threadIndex = m_threadIndex; if (m_useSimd) { #ifdef DG_BUILD_SIMD_CODE dgFloatSign tmpIndex; simd_type signMask; simd_type accNorm; tmpIndex.m_integer.m_iVal = 0x7fffffff; signMask = simd_set1(tmpIndex.m_fVal); accNorm = simd_set1(dgFloat32(0.0f)); for (dgInt32 i = 0; i < m_count; i += m_threads) { // for (dgInt32 currJoint = 0; currJoint < m_count; currJoint += m_threads) { dgInt32 m0; dgInt32 m1; dgInt32 index; // dgInt32 rowIndex; dgInt32 currJoint; dgInt32 rowsCount; simd_type linearM0; simd_type angularM0; simd_type linearM1; simd_type angularM1; currJoint = m_threadIndex + i; // rowIndex = m_jointRemapArray[i]; index = m_constraintArray[currJoint].m_autoPairstart; rowsCount = m_constraintArray[currJoint].m_autoPaircount; m0 = m_constraintArray[currJoint].m_m0; m1 = m_constraintArray[currJoint].m_m1; if (m0) { m_world->dgGetIndirectLock(&m_locks[m0]); } if (m1) { m_world->dgGetIndirectLock(&m_locks[m1]); } linearM0 = (simd_type &)m_internalForces[m0].m_linear; angularM0 = (simd_type &)m_internalForces[m0].m_angular; linearM1 = (simd_type &)m_internalForces[m1].m_linear; angularM1 = (simd_type &)m_internalForces[m1].m_angular; for (dgInt32 k = 0; k < rowsCount; k++) { dgInt32 frictionIndex; simd_type a; simd_type f; simd_type frictionNormal; simd_type lowerFrictionForce; simd_type upperFrictionForce; // dgVector acc (m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0)); // acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct (angularM0); // acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct (linearM1); // acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct (angularM1); a = simd_mul_v((simd_type &)m_JMinv[index].m_jacobian_IM0.m_linear, linearM0); a = simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM0.m_angular, angularM0); a = simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM1.m_linear, linearM1); a = simd_mul_add_v(a, (simd_type &)m_JMinv[index].m_jacobian_IM1.m_angular, angularM1); // a = coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - force[index] * diagDamp[index]; a = simd_add_v(a, simd_move_hl_v(a, a)); a = simd_add_s(a, simd_permut_v(a, a, PURMUT_MASK(3, 3, 3, 1))); a = simd_sub_s(simd_load_s(m_coordenateAccel[index]), simd_mul_add_s(a, simd_load_s(m_force[index]), simd_load_s(m_diagDamp[index]))); // f = force[index] + invDJMinvJt[index] * a; f = simd_mul_add_s(simd_load_s(m_force[index]), simd_load_s(m_invDJMinvJt[index]), a); frictionIndex = m_normalForceIndex[index]; NEWTON_ASSERT( ((frictionIndex < 0) && (m_force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (m_force[frictionIndex] >= dgFloat32(0.0f)))); // frictionNormal = force[frictionIndex]; // lowerFrictionForce = frictionNormal * lowerFrictionCoef[index]; // upperFrictionForce = frictionNormal * upperFrictionCoef[index]; frictionNormal = simd_load_s(m_force[frictionIndex]); lowerFrictionForce = simd_mul_s(frictionNormal, simd_load_s(m_lowerFrictionCoef[index])); upperFrictionForce = simd_mul_s(frictionNormal, simd_load_s(m_upperFrictionCoef[index])); // if (f > upperFrictionForce) { // a = dgFloat32 (0.0f); // f = upperFrictionForce; // } else if (f < lowerFrictionForce) { // a = dgFloat32 (0.0f); // f = lowerFrictionForce; // } // frictionNormal = simd_or_v (simd_cmplt_s (f, lowerFrictionForce), simd_cmpgt_s (f, upperFrictionForce)); f = simd_min_s(simd_max_s(f, lowerFrictionForce), upperFrictionForce); a = simd_andnot_v(a, simd_or_v(simd_cmplt_s(f, lowerFrictionForce), simd_cmpgt_s(f, upperFrictionForce))); accNorm = simd_max_s(accNorm, simd_and_v(a, signMask)); // prevValue = f - force[index]); a = simd_sub_s(f, simd_load_s(m_force[index])); a = simd_permut_v(a, a, PURMUT_MASK(0, 0, 0, 0)); // force[index] = f; simd_store_s(f, &m_force[index]); linearM0 = simd_mul_add_v(linearM0, (simd_type &)m_Jt[index].m_jacobian_IM0.m_linear, a); angularM0 = simd_mul_add_v(angularM0, (simd_type &)m_Jt[index].m_jacobian_IM0.m_angular, a); linearM1 = simd_mul_add_v(linearM1, (simd_type &)m_Jt[index].m_jacobian_IM1.m_linear, a); angularM1 = simd_mul_add_v(angularM1, (simd_type &)m_Jt[index].m_jacobian_IM1.m_angular, a); index++; } (simd_type &)m_internalForces[m0].m_linear = linearM0; (simd_type &)m_internalForces[m0].m_angular = angularM0; (simd_type &)m_internalForces[m1].m_linear = linearM1; (simd_type &)m_internalForces[m1].m_angular = angularM1; if (m1) { m_world->dgReleaseIndirectLock(&m_locks[m1]); } if (m0) { m_world->dgReleaseIndirectLock(&m_locks[m0]); } } simd_store_s(accNorm, &m_accNorm); #endif } else { dgFloat32 accNorm; accNorm = dgFloat32(0.0f); for (dgInt32 i = 0; i < m_count; i += m_threads) { dgInt32 m0; dgInt32 m1; dgInt32 index; dgInt32 currJoint; dgInt32 rowsCount; currJoint = m_threadIndex + i; index = m_constraintArray[currJoint].m_autoPairstart; rowsCount = m_constraintArray[currJoint].m_autoPaircount; m0 = m_constraintArray[currJoint].m_m0; m1 = m_constraintArray[currJoint].m_m1; if (m0) { m_world->dgGetIndirectLock(&m_locks[m0]); } if (m1) { m_world->dgGetIndirectLock(&m_locks[m1]); } dgVector linearM0(m_internalForces[m0].m_linear); dgVector angularM0(m_internalForces[m0].m_angular); dgVector linearM1(m_internalForces[m1].m_linear); dgVector angularM1(m_internalForces[m1].m_angular); for (dgInt32 k = 0; k < rowsCount; k++) { dgInt32 frictionIndex; dgFloat32 a; dgFloat32 f; dgFloat32 prevValue; dgFloat32 frictionNormal; dgFloat32 lowerFrictionForce; dgFloat32 upperFrictionForce; dgVector acc( m_JMinv[index].m_jacobian_IM0.m_linear.CompProduct(linearM0)); acc += m_JMinv[index].m_jacobian_IM0.m_angular.CompProduct(angularM0); acc += m_JMinv[index].m_jacobian_IM1.m_linear.CompProduct(linearM1); acc += m_JMinv[index].m_jacobian_IM1.m_angular.CompProduct(angularM1); a = m_coordenateAccel[index] - acc.m_x - acc.m_y - acc.m_z - m_force[index] * m_diagDamp[index]; f = m_force[index] + m_invDJMinvJt[index] * a; frictionIndex = m_normalForceIndex[index]; NEWTON_ASSERT( ((frictionIndex < 0) && (m_force[frictionIndex] == dgFloat32(1.0f))) || ((frictionIndex >= 0) && (m_force[frictionIndex] >= dgFloat32(0.0f)))); frictionNormal = m_force[frictionIndex]; lowerFrictionForce = frictionNormal * m_lowerFrictionCoef[index]; upperFrictionForce = frictionNormal * m_upperFrictionCoef[index]; if (f > upperFrictionForce) { a = dgFloat32(0.0f); f = upperFrictionForce; } else if (f < lowerFrictionForce) { a = dgFloat32(0.0f); f = lowerFrictionForce; } accNorm = GetMax(accNorm, dgAbsf(a)); prevValue = f - m_force[index]; m_force[index] = f; linearM0 += m_Jt[index].m_jacobian_IM0.m_linear.Scale(prevValue); angularM0 += m_Jt[index].m_jacobian_IM0.m_angular.Scale(prevValue); linearM1 += m_Jt[index].m_jacobian_IM1.m_linear.Scale(prevValue); angularM1 += m_Jt[index].m_jacobian_IM1.m_angular.Scale(prevValue); index++; } m_internalForces[m0].m_linear = linearM0; m_internalForces[m0].m_angular = angularM0; m_internalForces[m1].m_linear = linearM1; m_internalForces[m1].m_angular = angularM1; if (m1) { m_world->dgReleaseIndirectLock(&m_locks[m1]); } if (m0) { m_world->dgReleaseIndirectLock(&m_locks[m0]); } } m_accNorm = accNorm; } // m_world->m_perfomanceCounters[m_threadIndex][m_dynamicsSolveSpanningTreeTicks] += (m_world->m_getPerformanceCount() - ticks); } void dgJacobianMemory::CalculateForcesGameModeParallel(dgInt32 iterations, dgFloat32 maxAccNorm, dgInt32 archModel) const { dgFloat32 *const force = m_force; const dgJacobianPair *const Jt = m_Jt; const dgJacobianPair *const JMinv = m_JMinv; const dgFloat32 *const diagDamp = m_diagDamp; const dgFloat32 *const invDJMinvJt = m_invDJMinvJt; const dgBodyInfo *bodyArray = m_bodyArray; dgFloat32 *const penetration = m_penetration; const dgFloat32 *const externAccel = m_deltaAccel; const dgFloat32 *const restitution = m_restitution; dgFloat32 *const coordenateAccel = m_coordenateAccel; dgJacobian *const internalVeloc = m_internalVeloc; dgJacobian *const internalForces = m_internalForces; ; dgFloat32 **const jointForceFeeback = m_jointFeebackForce; const dgInt32 *const accelIsMortor = m_accelIsMotor; const dgInt32 *const normalForceIndex = m_normalForceIndex; const dgJointInfo *const constraintArray = m_constraintArray; const dgFloat32 *const penetrationStiffness = m_penetrationStiffness; const dgFloat32 *const lowerFrictionCoef = m_lowerBoundFrictionCoefficent; const dgFloat32 *const upperFrictionCoef = m_upperBoundFrictionCoefficent; dgVector zero(dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f), dgFloat32(0.0f)); dgFloat32 invStep = (dgFloat32(1.0f) / dgFloat32(LINEAR_SOLVER_SUB_STEPS)); dgFloat32 timeStep = m_timeStep * invStep; dgFloat32 invTimeStep = m_invTimeStep * dgFloat32(LINEAR_SOLVER_SUB_STEPS); dgInt32 threads = dgInt32(m_world->m_numberOfTheads); dgInt32 chunkJointSizes[DG_MAXIMUN_THREADS]; dgInt32 chunkBodiesSizes[DG_MAXIMUN_THREADS]; dgInt32 acc = 0; m_world->m_threadsManager.CalculateChunkSizes(m_bodyCount - 1, chunkBodiesSizes); dgParallelSolverClear *const clearAccumulators = m_world->m_dynamicSolver.m_clearAccumulators; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { clearAccumulators[threadIndex].m_useSimd = archModel; clearAccumulators[threadIndex].m_world = m_world; clearAccumulators[threadIndex].m_threadIndex = threadIndex; clearAccumulators[threadIndex].m_count = chunkBodiesSizes[threadIndex]; clearAccumulators[threadIndex].m_bodyArray = &bodyArray[acc + 1]; clearAccumulators[threadIndex].m_internalVeloc = &internalVeloc[acc + 1]; clearAccumulators[threadIndex].m_internalForces = &internalForces[acc + 1]; m_world->m_threadsManager.SubmitJob(&clearAccumulators[threadIndex]); acc += chunkBodiesSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); internalVeloc[0].m_linear = zero; internalVeloc[0].m_angular = zero; internalForces[0].m_linear = zero; internalForces[0].m_angular = zero; acc = 0; m_world->m_threadsManager.CalculateChunkSizes(m_jointCount, chunkJointSizes); dgParallelSolverInitInternalForces *const parallelInitIntenalForces = m_world->m_dynamicSolver.m_parallelInitIntenalForces; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelInitIntenalForces[threadIndex].m_useSimd = archModel; parallelInitIntenalForces[threadIndex].m_world = m_world; parallelInitIntenalForces[threadIndex].m_locks = m_treadLocks; parallelInitIntenalForces[threadIndex].m_threadIndex = threadIndex; parallelInitIntenalForces[threadIndex].m_Jt = Jt; parallelInitIntenalForces[threadIndex].m_force = force; parallelInitIntenalForces[threadIndex].m_internalForces = internalForces; parallelInitIntenalForces[threadIndex].m_count = chunkJointSizes[threadIndex]; parallelInitIntenalForces[threadIndex].m_constraintArray = &constraintArray[acc]; m_world->m_threadsManager.SubmitJob( ¶llelInitIntenalForces[threadIndex]); acc += chunkJointSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); acc = 0; dgParallelSolverJointAcceleration *const parallelSolverJointAcceleration = m_world->m_dynamicSolver.m_parallelSolverJointAcceleration; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelSolverJointAcceleration[threadIndex].m_useSimd = archModel; parallelSolverJointAcceleration[threadIndex].m_world = m_world; parallelSolverJointAcceleration[threadIndex].m_threadIndex = threadIndex; parallelSolverJointAcceleration[threadIndex].m_Jt = Jt; parallelSolverJointAcceleration[threadIndex].m_count = chunkJointSizes[threadIndex]; parallelSolverJointAcceleration[threadIndex].m_constraintArray = constraintArray; parallelSolverJointAcceleration[threadIndex].m_jointStart = acc; // parallelSolverJointAcceleration[threadIndex].m_jointRemapArray = &jointRemapArray[acc]; parallelSolverJointAcceleration[threadIndex].m_timeStep = timeStep; parallelSolverJointAcceleration[threadIndex].m_invTimeStep = invTimeStep; parallelSolverJointAcceleration[threadIndex].m_firstPassCoef = dgFloat32( 1.0f); parallelSolverJointAcceleration[threadIndex].m_penetration = penetration; parallelSolverJointAcceleration[threadIndex].m_restitution = restitution; parallelSolverJointAcceleration[threadIndex].m_externAccel = externAccel; parallelSolverJointAcceleration[threadIndex].m_coordenateAccel = coordenateAccel; parallelSolverJointAcceleration[threadIndex].m_accelIsMortor = accelIsMortor; parallelSolverJointAcceleration[threadIndex].m_normalForceIndex = normalForceIndex; parallelSolverJointAcceleration[threadIndex].m_penetrationStiffness = penetrationStiffness; acc += chunkJointSizes[threadIndex]; } acc = 0; dgParallelSolverUpdateVeloc *const parallelSolverUpdateVeloc = m_world->m_dynamicSolver.m_parallelSolverUpdateVeloc; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelSolverUpdateVeloc[threadIndex].m_useSimd = archModel; parallelSolverUpdateVeloc[threadIndex].m_world = m_world; parallelSolverUpdateVeloc[threadIndex].m_threadIndex = threadIndex; parallelSolverUpdateVeloc[threadIndex].m_timeStep = timeStep; parallelSolverUpdateVeloc[threadIndex].m_count = chunkBodiesSizes[threadIndex]; parallelSolverUpdateVeloc[threadIndex].m_bodyArray = &bodyArray[acc + 1]; parallelSolverUpdateVeloc[threadIndex].m_internalVeloc = &internalVeloc[acc + 1]; parallelSolverUpdateVeloc[threadIndex].m_internalForces = &internalForces[acc + 1]; acc += chunkBodiesSizes[threadIndex]; } dgParallelSolverCalculateForces *const parallelSolverCalculateForces = m_world->m_dynamicSolver.m_parallelSolverCalculateForces; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelSolverCalculateForces[threadIndex].m_useSimd = archModel; parallelSolverCalculateForces[threadIndex].m_world = m_world; parallelSolverCalculateForces[threadIndex].m_threads = threads; parallelSolverCalculateForces[threadIndex].m_threadIndex = threadIndex; parallelSolverCalculateForces[threadIndex].m_Jt = Jt; parallelSolverCalculateForces[threadIndex].m_JMinv = JMinv; parallelSolverCalculateForces[threadIndex].m_count = chunkJointSizes[threadIndex] * threads; // parallelSolverCalculateForces[threadIndex].m_jointRemapArray = &jointRemapArray[threadIndex]; parallelSolverCalculateForces[threadIndex].m_constraintArray = constraintArray; parallelSolverCalculateForces[threadIndex].m_locks = m_treadLocks; parallelSolverCalculateForces[threadIndex].m_force = force; parallelSolverCalculateForces[threadIndex].m_diagDamp = diagDamp; parallelSolverCalculateForces[threadIndex].m_invDJMinvJt = invDJMinvJt; parallelSolverCalculateForces[threadIndex].m_normalForceIndex = normalForceIndex; parallelSolverCalculateForces[threadIndex].m_internalForces = internalForces; parallelSolverCalculateForces[threadIndex].m_coordenateAccel = coordenateAccel; parallelSolverCalculateForces[threadIndex].m_lowerFrictionCoef = lowerFrictionCoef; parallelSolverCalculateForces[threadIndex].m_upperFrictionCoef = upperFrictionCoef; } dgInt32 maxPasses = iterations + DG_BASE_ITERATION_COUNT; for (dgInt32 step = 0; step < LINEAR_SOLVER_SUB_STEPS; step++) { for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { m_world->m_threadsManager.SubmitJob( ¶llelSolverJointAcceleration[threadIndex]); } m_world->m_threadsManager.SynchronizationBarrier(); dgFloat32 accNorm = maxAccNorm * dgFloat32(2.0f); for (dgInt32 passes = 0; (passes < maxPasses) && (accNorm > maxAccNorm); passes++) { for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { m_world->m_threadsManager.SubmitJob( ¶llelSolverCalculateForces[threadIndex]); } m_world->m_threadsManager.SynchronizationBarrier(); accNorm = dgFloat32(0.0f); for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { if (parallelSolverCalculateForces[threadIndex].m_accNorm > accNorm) { accNorm = parallelSolverCalculateForces[threadIndex].m_accNorm; } } // accNorm = 1.0f; } for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { m_world->m_threadsManager.SubmitJob( ¶llelSolverUpdateVeloc[threadIndex]); } m_world->m_threadsManager.SynchronizationBarrier(); } acc = 0; m_world->m_threadsManager.CalculateChunkSizes(m_jointCount, chunkJointSizes); dgParallelSolverInitFeedbackUpdate *const parallelSolverInitFeedbackUpdate = m_world->m_dynamicSolver.m_parallelSolverInitFeedbackUpdate; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelSolverInitFeedbackUpdate[threadIndex].m_locks = m_treadLocks; parallelSolverInitFeedbackUpdate[threadIndex].m_world = m_world; parallelSolverInitFeedbackUpdate[threadIndex].m_hasJointFeeback = 0; parallelSolverInitFeedbackUpdate[threadIndex].m_threadIndex = threadIndex; parallelSolverInitFeedbackUpdate[threadIndex].m_force = force; parallelSolverInitFeedbackUpdate[threadIndex].m_timestep = timeStep; parallelSolverInitFeedbackUpdate[threadIndex].m_jointForceFeeback = jointForceFeeback; parallelSolverInitFeedbackUpdate[threadIndex].m_count = chunkJointSizes[threadIndex]; parallelSolverInitFeedbackUpdate[threadIndex].m_constraintArray = &constraintArray[acc]; m_world->m_threadsManager.SubmitJob( ¶llelSolverInitFeedbackUpdate[threadIndex]); acc += chunkJointSizes[threadIndex]; } m_world->m_threadsManager.SynchronizationBarrier(); acc = 0; dgFloat32 maxAccNorm2 = maxAccNorm * maxAccNorm; dgParallelSolverUpdateForce *const parallelSolverUpdateForce = m_world->m_dynamicSolver.m_parallelSolverUpdateForce; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { parallelSolverUpdateForce[threadIndex].m_useSimd = archModel; parallelSolverUpdateForce[threadIndex].m_world = m_world; parallelSolverUpdateForce[threadIndex].m_threadIndex = threadIndex; parallelSolverUpdateForce[threadIndex].m_invStep = invStep; parallelSolverUpdateForce[threadIndex].m_invTimeStep = m_invTimeStep; parallelSolverUpdateForce[threadIndex].m_maxAccNorm2 = maxAccNorm2; parallelSolverUpdateForce[threadIndex].m_count = chunkBodiesSizes[threadIndex]; parallelSolverUpdateForce[threadIndex].m_bodyArray = &bodyArray[acc + 1]; parallelSolverUpdateForce[threadIndex].m_internalVeloc = &internalVeloc[acc + 1]; acc += chunkBodiesSizes[threadIndex]; m_world->m_threadsManager.SubmitJob( ¶llelSolverUpdateForce[threadIndex]); } m_world->m_threadsManager.SynchronizationBarrier(); dgInt32 hasJointFeeback = 0; for (dgInt32 threadIndex = 0; threadIndex < threads; threadIndex++) { hasJointFeeback |= parallelSolverInitFeedbackUpdate[threadIndex].m_hasJointFeeback; } if (hasJointFeeback) { for (dgInt32 i = 0; i < m_jointCount; i++) { if (constraintArray[i].m_joint->m_updaFeedbackCallback) { constraintArray[i].m_joint->m_updaFeedbackCallback( reinterpret_cast(constraintArray[i].m_joint), m_timeStep, m_threadIndex); } } } } void dgJacobianMemory::CalculateReactionsForcesParallel(dgInt32 solverMode, dgFloat32 tolerance, dgInt32 archModel) const { // NEWTON_ASSERT (m_jointCount >= DG_PARALLEL_JOINT_COUNT); if (solverMode) { CalculateForcesGameModeParallel(solverMode, tolerance, archModel); } else if (archModel) { CalculateForcesSimulationModeSimd(tolerance); } else { CalculateForcesSimulationMode(tolerance); } }