Horde3D http://horde3d.org/forums/ |
|
Optimization of the software skinning + math http://horde3d.org/forums/viewtopic.php?f=8&t=653 |
Page 1 of 1 |
Author: | ii001 [ 22.02.2009, 00:48 ] |
Post subject: | Optimization of the software skinning + math |
I know that most of you are using HW skinning, but sometimes we need software skinning. So I decided to optimize this part of code in Horde3D. According to my measurements is my skinning loop about 800% faster (Pentium M 1,73GHz, VS2003). There is my version of ModelNode::updateGeometry: Code: bool ModelNode::updateGeometry( bool skinningDirty ) { if( !skinningDirty && !_morpherDirty ) return false; if( _baseGeoRes == 0x0 || _baseGeoRes->getVertData() == 0x0 ) return false; if( _geometryRes == 0x0 || _geometryRes->getVertData() == 0x0 ) return false; // Reset vertices to base data memcpy( _geometryRes->_vertData->memory, _baseGeoRes->_vertData->memory, _geometryRes->_vertCount * sizeof( Vec3f ) * 4 ); if( _morpherUsed ) { // Recalculate vertex positions for morph targets for( uint32 i = 0; i < _morphers.size(); ++i ) { if( _morphers[i].weight > Math::Epsilon ) { MorphTarget &mt = _geometryRes->_morphTargets[_morphers[i].index]; float weight = _morphers[i].weight; for( uint32 j = 0; j < mt.diffs.size(); ++j ) { MorphDiff &md = mt.diffs[j]; VertexData &vd = *_geometryRes->getVertData(); vd.positions[md.vertIndex] += md.posDiff * weight; vd.normals[md.vertIndex] += md.normDiff * weight; vd.tangents[md.vertIndex] += md.tanDiff * weight; vd.bitangents[md.vertIndex] += md.bitanDiff * weight; } } } } if( skinningDirty ) { Matrix4f skinningMat; Vec4f* Rows = &_skinMatRows[ 0 ]; VertexData &vd = *_geometryRes->_vertData; for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i ) { Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ]; Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ]; Vec4f* Rows3 = &Rows[ FloatToInt(vd.staticData[i].jointVec[3]) * 3 ]; Vec4f weight = *((Vec4f*)&vd.staticData[i].weightVec[0]); skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z + (Rows3)->x * weight.w; skinningMat.x[1] = (Rows0 + 1)->x * 
weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z + (Rows3 + 1)->x * weight.w; skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z + (Rows3 + 2)->x * weight.w; skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z + (Rows3)->y * weight.w; skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z + (Rows3 + 1)->y * weight.w; skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z + (Rows3 + 2)->y * weight.w; skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z + (Rows3)->z * weight.w; skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * weight.z + (Rows3 + 1)->z * weight.w; skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z + (Rows3 + 2)->z * weight.w; skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z + (Rows3)->w * weight.w; skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z + (Rows3 + 1)->w * weight.w; skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z + (Rows3 + 2)->w * weight.w; // Skin position vd.positions[i] = skinningMat * vd.positions[i]; // Skin tangent space basis vd.normals[i] = skinningMat.vecMul( vd.normals[i] ).normalized(); vd.tangents[i] = skinningMat.vecMul( vd.tangents[i] ).normalized(); vd.bitangents[i] = skinningMat.vecMul( vd.bitangents[i] ).normalized(); } } if( skinningDirty == false ) { // Renormalize tangent space basis for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i ) { VertexData &vd = *_geometryRes->getVertData(); vd.normals[i] = vd.normals[i].normalized(); vd.tangents[i] = vd.tangents[i].normalized(); vd.bitangents[i] = vd.bitangents[i].normalized(); } } _morpherDirty = 
false; // Upload geometry _geometryRes->updateDynamicVertData(); markMeshBBoxesDirty(); return true; } As you can see I also normalizing normals, tangents, bitangents during skinning loop. Following version use sorted weights. With sorted weights I can easily detect how many matrices is needed to blend. This gives as about 20% more performance (it depends on used model). Sort can be done during load of geometry or during conversion of the DAE (which is much better). Code: bool ModelNode::updateGeometry( bool skinningDirty ) { if( !skinningDirty && !_morpherDirty ) return false; if( _baseGeoRes == 0x0 || _baseGeoRes->getVertData() == 0x0 ) return false; if( _geometryRes == 0x0 || _geometryRes->getVertData() == 0x0 ) return false; // Reset vertices to base data memcpy( _geometryRes->_vertData->memory, _baseGeoRes->_vertData->memory, _geometryRes->_vertCount * sizeof( Vec3f ) * 4 ); if( _morpherUsed ) { // Recalculate vertex positions for morph targets for( uint32 i = 0; i < _morphers.size(); ++i ) { if( _morphers[i].weight > Math::Epsilon ) { MorphTarget &mt = _geometryRes->_morphTargets[_morphers[i].index]; float weight = _morphers[i].weight; for( uint32 j = 0; j < mt.diffs.size(); ++j ) { MorphDiff &md = mt.diffs[j]; VertexData &vd = *_geometryRes->getVertData(); vd.positions[md.vertIndex] += md.posDiff * weight; vd.normals[md.vertIndex] += md.normDiff * weight; vd.tangents[md.vertIndex] += md.tanDiff * weight; vd.bitangents[md.vertIndex] += md.bitanDiff * weight; } } } } if( skinningDirty ) { Matrix4f skinningMat; Vec4f* Rows = &_skinMatRows[ 0 ]; VertexData &vd = *_geometryRes->_vertData; for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i ) { Vec4f weight = *((Vec4f*)&vd.staticData[i].weightVec[0]); if ( weight.y < Math::Epsilon ) { Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; skinningMat.x[0] = (Rows0)->x; skinningMat.x[1] = (Rows0 + 1)->x; skinningMat.x[2] = (Rows0 + 2)->x; skinningMat.x[4] = (Rows0)->y; skinningMat.x[5] = 
(Rows0 + 1)->y; skinningMat.x[6] = (Rows0 + 2)->y; skinningMat.x[8] = (Rows0)->z * weight.x; skinningMat.x[9] = (Rows0 + 1)->z; skinningMat.x[10] = (Rows0 + 2)->z; skinningMat.x[12] = (Rows0)->w * weight.x; skinningMat.x[13] = (Rows0 + 1)->w; skinningMat.x[14] = (Rows0 + 2)->w; } else if ( weight.z < Math::Epsilon ) { Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ]; skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y; skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y; skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y; skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y; skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y; skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y; skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y; skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y; skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y; skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y; skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y; skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y; } else if ( weight.w < Math::Epsilon ) { Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ]; Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ]; skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z; skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z; skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z; skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z; skinningMat.x[5] = (Rows0 + 1)->y * weight.x 
+ (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z; skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z; skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z; skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * weight.z; skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z; skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z; skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z; skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z; } else { Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ]; Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ]; Vec4f* Rows3 = &Rows[ FloatToInt(vd.staticData[i].jointVec[3]) * 3 ]; skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z + (Rows3)->x * weight.w; skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z + (Rows3 + 1)->x * weight.w; skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z + (Rows3 + 2)->x * weight.w; skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z + (Rows3)->y * weight.w; skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z + (Rows3 + 1)->y * weight.w; skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z + (Rows3 + 2)->y * weight.w; skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z + (Rows3)->z * weight.w; skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * 
weight.z + (Rows3 + 1)->z * weight.w; skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z + (Rows3 + 2)->z * weight.w; skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z + (Rows3)->w * weight.w; skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z + (Rows3 + 1)->w * weight.w; skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z + (Rows3 + 2)->w * weight.w; } // Skin position vd.positions[i] = skinningMat * vd.positions[i]; // Skin tangent space basis vd.normals[i] = skinningMat.vecMul( vd.normals[i] ).normalized(); vd.tangents[i] = skinningMat.vecMul( vd.tangents[i] ).normalized(); vd.bitangents[i] = skinningMat.vecMul( vd.bitangents[i] ).normalized(); } } if( skinningDirty == false ) { // Renormalize tangent space basis for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i ) { VertexData &vd = *_geometryRes->getVertData(); vd.normals[i] = vd.normals[i].normalized(); vd.tangents[i] = vd.tangents[i].normalized(); vd.bitangents[i] = vd.bitangents[i].normalized(); } } _morpherDirty = false; // Upload geometry _geometryRes->updateDynamicVertData(); markMeshBBoxesDirty(); return true; } With sorted weights is loop about 980% faster than original Horde3D version. Yes, this can probably be even more optimized by using SSE etc., but look at this as platform independent version. As you noticed I am using some optimized math functions. There are: Code: #define CPU_X86 inline int FloatToInt( float f ) { #ifdef CPU_X86 int i; __asm fld f __asm fistp i return i; #else // Fallback return (int)f; #endif } This function only rotates/scales vector. vecMul is only working name (any better name is welcome). 
Code: Vec3f Matrif4f::vecMul( const Vec3f &v ) const { return Vec3f( v.x * c[0][0] + v.y * c[1][0] + v.z * c[2][0], v.x * c[0][1] + v.y * c[1][1] + v.z * c[2][1], v.x * c[0][2] + v.y * c[1][2] + v.z * c[2][2] ); } I prefer this optimization of Matrix4f * float Code: Matrix4f operator*( const float f ) const
{ Matrix4f m( *this ); m.c[0][0] *= f; m.c[1][0] *= f; m.c[2][0] *= f; m.c[3][0] *= f; m.c[0][1] *= f; m.c[1][1] *= f; m.c[2][1] *= f; m.c[3][1] *= f; m.c[0][2] *= f; m.c[1][2] *= f; m.c[2][2] *= f; m.c[3][2] *= f; m.c[0][3] *= f; m.c[1][3] *= f; m.c[2][3] *= f; m.c[3][3] *= f; return m; } |
Author: | Siavash [ 23.02.2009, 17:59 ] |
Post subject: | Re: Optimization of the software skinning + math |
It's very interesting. So which pieces of code should be optimized using SIMD extensions? EDIT: WAIT A MINUTE, I'LL SEND YOU THE OPTIMIZED ONES |
Author: | Siavash [ 23.02.2009, 18:39 ] |
Post subject: | Re: Optimization of the software skinning + math |
enjoy this : Code: //don't forget to include xmmintrin.h ;)
/// Rotates/scales a vector by the 3x3 part of the matrix (SSE version).
///
/// Only c[0], c[1] and c[2] are read, so whatever is stored in c[3] is not
/// applied to the result. Requires <xmmintrin.h>.
///
/// @param v  Vector to transform.
/// @return   v.x * c[0] + v.y * c[1] + v.z * c[2], first three components only.
Vec3f Matrix4f::vecMul( const Vec3f &v ) const
{
	float out[4];

	// One SSE register per c[k], scaled by the matching component of v.
	// The locals must not be named `c`: a local `c` is already in scope inside
	// its own initializer and would hide the matrix member `c` that is read there.
	const __m128 term0 = _mm_mul_ps( _mm_setr_ps( c[0][0], c[0][1], c[0][2], 0 ), _mm_set_ps1( v.x ) );
	const __m128 term1 = _mm_mul_ps( _mm_setr_ps( c[1][0], c[1][1], c[1][2], 0 ), _mm_set_ps1( v.y ) );
	const __m128 term2 = _mm_mul_ps( _mm_setr_ps( c[2][0], c[2][1], c[2][2], 0 ), _mm_set_ps1( v.z ) );

	// Same summation order as before: (term0 + term1) + term2.
	const __m128 sum = _mm_add_ps( _mm_add_ps( term0, term1 ), term2 );

	// Unaligned store: `out` is a plain stack array with no alignment guarantee.
	_mm_storeu_ps( out, sum );

	// The fourth lane is padding (always 0) and is dropped.
	return Vec3f( out[0], out[1], out[2] );
}

/// Returns a copy of this matrix with every element multiplied by f (SSE version).
///
/// Processes c[0]..c[3] four floats at a time. Unaligned loads/stores are used,
/// so the matrix storage does not have to be 16-byte aligned.
/// Requires <xmmintrin.h>.
///
/// @param f  Scale factor.
/// @return   The scaled copy; *this is left unchanged.
Matrix4f operator*( const float f ) const
{
	Matrix4f m( *this );
	const __m128 factor = _mm_set_ps1( f );

	for( int i = 0; i < 4; ++i )
	{
		_mm_storeu_ps( m.c[i], _mm_mul_ps( factor, _mm_loadu_ps( m.c[i] ) ) );
	}

	return m;
}
Author: | ii001 [ 23.02.2009, 22:49 ] |
Post subject: | Re: Optimization of the software skinning + math |
Thanks Siavash, your version of Matrix4f::vecMul will probably speed up the loop a little bit. Unfortunately I can't test it now; I am porting my project to H3D Beta3. I do not use Matrix4f * float in my version of the skinning loop. That function was used in the original H3D Beta2 version of the loop; I placed my version of Matrix4f * float at the end of the post only as a general idea, so a SIMD version of it has no effect on the new version of the loop. The biggest performance "eaters" in the loop are the normalizations of normals, tangents, etc. You are probably already using your SIMD version of utMath, where normalization is already optimized. Other parts of the loop are not well suited for SIMD optimization. Aha, I just got an idea how to speed up the loop even more: I can use a row-oriented matrix in the loop, which allows me to make the loop more efficient and also more suitable for SIMD optimization. I will let you know when I have the new, faster version done. |
Author: | swiftcoder [ 24.02.2009, 03:36 ] |
Post subject: | Re: Optimization of the software skinning + math |
ii001 wrote: Biggest performance "eaters" in loop are normalizations of normals,tangents,. I may be missing something elementary, but can't we pass this normalisation off to the vertex shader, where such things are relatively cheap?
|
Author: | Siavash [ 24.02.2009, 03:40 ] |
Post subject: | Re: Optimization of the software skinning + math |
ii001 wrote: Biggest performance "eaters" in loop are normalizations of normals,tangents,.. You are probably already using your SIMD version of utMath where is normalization already optimized. The SIMD version of utMath isn't finished yet and there are some problems with aligned memory [m128 union], but I can help you with the vectorization of critical pieces of code. ii001 wrote: Other parts of loop are not well suited for SIMD optimization. Aha, I just get idea how to speed up loop even more. I can use column oriented matrix in the loop, which allows me to make loop more efficient and also more suitable for SIMD optimization. Good idea! Another way to gain more performance is to use Structure Of Arrays [SOA] instead of Array Of Structures [AOS]; this is at least ~30% faster [according to Intel's optimization manuals and ...] but requires some changes in the engine. Just call me when the fastest version is done. |
Author: | ii001 [ 24.02.2009, 09:05 ] |
Post subject: | Re: Optimization of the software skinning + math |
swiftcoder wrote: ii001 wrote: Biggest performance "eaters" in loop are normalizations of normals,tangents,. I may be missing something elementary, but can't we pass this normalisation off to the vertex shader, where such things are relatively cheap? You are right. The only disadvantage of this approach is that the user has to be aware of it and enable/support it in the shaders when needed. |
Author: | marciano [ 27.02.2009, 09:07 ] |
Post subject: | Re: Optimization of the software skinning + math |
Thanks a lot for removing the "TODO: Optimize this" that was in the software skinning routine. Looks very promising. I'm currently quite busy but I will take a closer look when I have a bit more time again... |
Author: | ii001 [ 27.02.2009, 12:54 ] |
Post subject: | Re: Optimization of the software skinning + math |
I did some experiments with a row-oriented matrix. Speed seems to be similar to my previous approach, but it may be more suitable for SIMD enthusiasts. Loop concept (no sorted-weights optimization or compiler optimization): Code: Matrix4f skinningMat;
Vec4f* Rows = &_skinMatRows[ 0 ]; VertexData &vd = *_geometryRes->_vertData; for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i ) { Vec4f weight = *((Vec4f*)&vd.staticData[i].weightVec[0]); Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ]; Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ]; Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ]; Vec4f* Rows3 = &Rows[ FloatToInt(vd.staticData[i].jointVec[3]) * 3 ]; *((Vec4f*)&skinningMat.x[0]) = (*Rows0) * weight.x + (*Rows1) * weight.y + (*Rows2) * weight.z + (*Rows3) * weight.w; *((Vec4f*)&skinningMat.x[4]) = (*(Rows0+1)) * weight.x + (*(Rows1+1)) * weight.y + (*(Rows2+1)) * weight.z + (*(Rows3+1)) * weight.w; *((Vec4f*)&skinningMat.x[8]) = (*(Rows0+2)) * weight.x + (*(Rows1+2)) * weight.y + (*(Rows2+2)) * weight.z + (*(Rows3+2)) * weight.w; // Skin position vd.positions[i] = skinningMat.multiply43R( vd.positions[i] ); // Skin tangent space basis vd.normals[i] = skinningMat.multiply33R( vd.normals[i] );//.normalized(); vd.tangents[i] = skinningMat.multiply33R( vd.tangents[i] );//.normalized(); vd.bitangents[i] = skinningMat.multiply33R( vd.bitangents[i] );//.normalized(); } |
Author: | Siavash [ 04.04.2009, 04:30 ] |
Post subject: | Re: Optimization of the software skinning + math |
any updates? |
Author: | ii001 [ 04.04.2009, 09:43 ] |
Post subject: | Re: Optimization of the software skinning + math |
Siavash wrote: any updates? no, I am happy with my first version. |
Author: | Siavash [ 04.04.2009, 09:44 ] |
Post subject: | Re: Optimization of the software skinning + math |
OK |
Author: | Volker [ 04.04.2009, 11:46 ] |
Post subject: | Re: Optimization of the software skinning + math |
Some of the changes proposed by ii001 were integrated into Beta3. Thanks again for sharing it. |
Page 1 of 1 | All times are UTC + 1 hour |
Powered by phpBB® Forum Software © phpBB Group https://www.phpbb.com/ |