I know that most of you are using HW skinning, but sometimes we need software skinning, so I decided to optimize this part of the code in Horde3D. According to my measurements, my skinning loop is about 800% faster (Pentium M 1.73 GHz, VS2003). Here is my version of ModelNode::updateGeometry:
Code:
bool ModelNode::updateGeometry( bool skinningDirty )
{
if( !skinningDirty && !_morpherDirty ) return false;
if( _baseGeoRes == 0x0 || _baseGeoRes->getVertData() == 0x0 ) return false;
if( _geometryRes == 0x0 || _geometryRes->getVertData() == 0x0 ) return false;
// Reset vertices to base data
memcpy( _geometryRes->_vertData->memory, _baseGeoRes->_vertData->memory,
_geometryRes->_vertCount * sizeof( Vec3f ) * 4 );
if( _morpherUsed )
{
// Recalculate vertex positions for morph targets
for( uint32 i = 0; i < _morphers.size(); ++i )
{
if( _morphers[i].weight > Math::Epsilon )
{
MorphTarget &mt = _geometryRes->_morphTargets[_morphers[i].index];
float weight = _morphers[i].weight;
for( uint32 j = 0; j < mt.diffs.size(); ++j )
{
MorphDiff &md = mt.diffs[j];
VertexData &vd = *_geometryRes->getVertData();
vd.positions[md.vertIndex] += md.posDiff * weight;
vd.normals[md.vertIndex] += md.normDiff * weight;
vd.tangents[md.vertIndex] += md.tanDiff * weight;
vd.bitangents[md.vertIndex] += md.bitanDiff * weight;
}
}
}
}
if( skinningDirty )
{
Matrix4f skinningMat;
Vec4f* Rows = &_skinMatRows[ 0 ];
VertexData &vd = *_geometryRes->_vertData;
for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i )
{
Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ];
Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ];
Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ];
Vec4f* Rows3 = &Rows[ FloatToInt(vd.staticData[i].jointVec[3]) * 3 ];
Vec4f weight = *((Vec4f*)&vd.staticData[i].weightVec[0]);
skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z + (Rows3)->x * weight.w;
skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z + (Rows3 + 1)->x * weight.w;
skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z + (Rows3 + 2)->x * weight.w;
skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z + (Rows3)->y * weight.w;
skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z + (Rows3 + 1)->y * weight.w;
skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z + (Rows3 + 2)->y * weight.w;
skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z + (Rows3)->z * weight.w;
skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * weight.z + (Rows3 + 1)->z * weight.w;
skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z + (Rows3 + 2)->z * weight.w;
skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z + (Rows3)->w * weight.w;
skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z + (Rows3 + 1)->w * weight.w;
skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z + (Rows3 + 2)->w * weight.w;
// Skin position
vd.positions[i] = skinningMat * vd.positions[i];
// Skin tangent space basis
vd.normals[i] = skinningMat.vecMul( vd.normals[i] ).normalized();
vd.tangents[i] = skinningMat.vecMul( vd.tangents[i] ).normalized();
vd.bitangents[i] = skinningMat.vecMul( vd.bitangents[i] ).normalized();
}
}
if( skinningDirty == false )
{
// Renormalize tangent space basis
for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i )
{
VertexData &vd = *_geometryRes->getVertData();
vd.normals[i] = vd.normals[i].normalized();
vd.tangents[i] = vd.tangents[i].normalized();
vd.bitangents[i] = vd.bitangents[i].normalized();
}
}
_morpherDirty = false;
// Upload geometry
_geometryRes->updateDynamicVertData();
markMeshBBoxesDirty();
return true;
}
As you can see, I also normalize the normals, tangents and bitangents inside the skinning loop.
The following version uses sorted weights. With sorted weights I can easily detect how many matrices need to be blended. This gives us about 20% more performance (it depends on the model used). The sort can be done while loading the geometry or during conversion of the DAE (which is much better).
Code:
// Recomputes the CPU-side (software-skinned / morphed) vertex data of the model.
//
// This variant expects the four joint weights of every vertex to be sorted so
// that unused influences come last (presumably in descending order - TODO
// confirm against the geometry loader / converter). The first weight that is
// below Math::Epsilon then tells how many joint matrices have to be blended.
//
// skinningDirty: true when the skinning matrices changed and every vertex has
//                to be re-skinned.
// Returns false when there is nothing to update or a geometry resource is
// missing; returns true after the vertex data was rebuilt and uploaded.
bool ModelNode::updateGeometry( bool skinningDirty )
{
    if( !skinningDirty && !_morpherDirty ) return false;

    if( _baseGeoRes == 0x0 || _baseGeoRes->getVertData() == 0x0 ) return false;
    if( _geometryRes == 0x0 || _geometryRes->getVertData() == 0x0 ) return false;

    // Reset vertices to base data (four Vec3f entries per vertex are copied)
    memcpy( _geometryRes->_vertData->memory, _baseGeoRes->_vertData->memory,
            _geometryRes->_vertCount * sizeof( Vec3f ) * 4 );

    if( _morpherUsed )
    {
        // Recalculate vertex positions for morph targets
        for( uint32 i = 0; i < _morphers.size(); ++i )
        {
            if( _morphers[i].weight > Math::Epsilon )
            {
                MorphTarget &mt = _geometryRes->_morphTargets[_morphers[i].index];
                float weight = _morphers[i].weight;

                for( uint32 j = 0; j < mt.diffs.size(); ++j )
                {
                    MorphDiff &md = mt.diffs[j];
                    VertexData &vd = *_geometryRes->getVertData();

                    vd.positions[md.vertIndex] += md.posDiff * weight;
                    vd.normals[md.vertIndex] += md.normDiff * weight;
                    vd.tangents[md.vertIndex] += md.tanDiff * weight;
                    vd.bitangents[md.vertIndex] += md.bitanDiff * weight;
                }
            }
        }
    }

    if( skinningDirty )
    {
        Matrix4f skinningMat;
        Vec4f* Rows = &_skinMatRows[ 0 ];
        VertexData &vd = *_geometryRes->_vertData;

        for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i )
        {
            Vec4f weight = *((Vec4f*)&vd.staticData[i].weightVec[0]);

            // Every joint owns three consecutive Vec4f rows in _skinMatRows.
            // Only twelve matrix elements are written; the remaining ones keep
            // the value given by the Matrix4f constructor.
            if ( weight.y < Math::Epsilon )
            {
                // Single influence: copy the joint matrix as it is. No entry
                // is scaled by weight.x, so all twelve elements stay
                // consistent with each other.
                // NOTE(review): assumes normalized weights, i.e. weight.x is 1
                // when it is the only influence - TODO confirm.
                Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ];

                skinningMat.x[0] = (Rows0)->x;
                skinningMat.x[1] = (Rows0 + 1)->x;
                skinningMat.x[2] = (Rows0 + 2)->x;
                skinningMat.x[4] = (Rows0)->y;
                skinningMat.x[5] = (Rows0 + 1)->y;
                skinningMat.x[6] = (Rows0 + 2)->y;
                skinningMat.x[8] = (Rows0)->z;
                skinningMat.x[9] = (Rows0 + 1)->z;
                skinningMat.x[10] = (Rows0 + 2)->z;
                skinningMat.x[12] = (Rows0)->w;
                skinningMat.x[13] = (Rows0 + 1)->w;
                skinningMat.x[14] = (Rows0 + 2)->w;
            }
            else if ( weight.z < Math::Epsilon )
            {
                // Two influences
                Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ];
                Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ];

                skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y;
                skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y;
                skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y;
                skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y;
                skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y;
                skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y;
                skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y;
                skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y;
                skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y;
                skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y;
                skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y;
                skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y;
            }
            else if ( weight.w < Math::Epsilon )
            {
                // Three influences
                Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ];
                Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ];
                Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ];

                skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z;
                skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z;
                skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z;
                skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z;
                skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z;
                skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z;
                skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z;
                skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * weight.z;
                skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z;
                skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z;
                skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z;
                skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z;
            }
            else
            {
                // Four influences (general case)
                Vec4f* Rows0 = &Rows[ FloatToInt(vd.staticData[i].jointVec[0]) * 3 ];
                Vec4f* Rows1 = &Rows[ FloatToInt(vd.staticData[i].jointVec[1]) * 3 ];
                Vec4f* Rows2 = &Rows[ FloatToInt(vd.staticData[i].jointVec[2]) * 3 ];
                Vec4f* Rows3 = &Rows[ FloatToInt(vd.staticData[i].jointVec[3]) * 3 ];

                skinningMat.x[0] = (Rows0)->x * weight.x + (Rows1)->x * weight.y + (Rows2)->x * weight.z + (Rows3)->x * weight.w;
                skinningMat.x[1] = (Rows0 + 1)->x * weight.x + (Rows1 + 1)->x * weight.y + (Rows2 + 1)->x * weight.z + (Rows3 + 1)->x * weight.w;
                skinningMat.x[2] = (Rows0 + 2)->x * weight.x + (Rows1 + 2)->x * weight.y + (Rows2 + 2)->x * weight.z + (Rows3 + 2)->x * weight.w;
                skinningMat.x[4] = (Rows0)->y * weight.x + (Rows1)->y * weight.y + (Rows2)->y * weight.z + (Rows3)->y * weight.w;
                skinningMat.x[5] = (Rows0 + 1)->y * weight.x + (Rows1 + 1)->y * weight.y + (Rows2 + 1)->y * weight.z + (Rows3 + 1)->y * weight.w;
                skinningMat.x[6] = (Rows0 + 2)->y * weight.x + (Rows1 + 2)->y * weight.y + (Rows2 + 2)->y * weight.z + (Rows3 + 2)->y * weight.w;
                skinningMat.x[8] = (Rows0)->z * weight.x + (Rows1)->z * weight.y + (Rows2)->z * weight.z + (Rows3)->z * weight.w;
                skinningMat.x[9] = (Rows0 + 1)->z * weight.x + (Rows1 + 1)->z * weight.y + (Rows2 + 1)->z * weight.z + (Rows3 + 1)->z * weight.w;
                skinningMat.x[10] = (Rows0 + 2)->z * weight.x + (Rows1 + 2)->z * weight.y + (Rows2 + 2)->z * weight.z + (Rows3 + 2)->z * weight.w;
                skinningMat.x[12] = (Rows0)->w * weight.x + (Rows1)->w * weight.y + (Rows2)->w * weight.z + (Rows3)->w * weight.w;
                skinningMat.x[13] = (Rows0 + 1)->w * weight.x + (Rows1 + 1)->w * weight.y + (Rows2 + 1)->w * weight.z + (Rows3 + 1)->w * weight.w;
                skinningMat.x[14] = (Rows0 + 2)->w * weight.x + (Rows1 + 2)->w * weight.y + (Rows2 + 2)->w * weight.z + (Rows3 + 2)->w * weight.w;
            }

            // Skin position
            vd.positions[i] = skinningMat * vd.positions[i];

            // Skin tangent space basis; normalizing here makes a separate
            // renormalization pass unnecessary
            vd.normals[i] = skinningMat.vecMul( vd.normals[i] ).normalized();
            vd.tangents[i] = skinningMat.vecMul( vd.tangents[i] ).normalized();
            vd.bitangents[i] = skinningMat.vecMul( vd.bitangents[i] ).normalized();
        }
    }
    else
    {
        // No skinning pass ran: renormalize tangent space basis
        for( uint32 i = 0, s = _geometryRes->getVertCount(); i < s; ++i )
        {
            VertexData &vd = *_geometryRes->getVertData();

            vd.normals[i] = vd.normals[i].normalized();
            vd.tangents[i] = vd.tangents[i].normalized();
            vd.bitangents[i] = vd.bitangents[i].normalized();
        }
    }

    _morpherDirty = false;

    // Upload geometry
    _geometryRes->updateDynamicVertData();
    markMeshBBoxesDirty();

    return true;
}
With sorted weights, the loop is about 980% faster than the original Horde3D version. Yes, this can probably be optimized even further by using SSE etc., but look at this as the platform-independent version.
As you may have noticed, I am using some optimized math functions. Here they are:
Code:
#ifndef CPU_X86
#define CPU_X86
#endif

// Converts a float to an int.
//
// The x87 path is used only when CPU_X86 is defined AND the compiler is
// 32-bit MSVC, because the `__asm` block syntax below is not accepted by
// other compilers or by 64-bit MSVC. Everywhere else the plain cast is used.
//
// Note: `fistp` rounds according to the current FPU rounding mode, while the
// cast truncates towards zero. Both paths agree for values that are already
// integral; for other inputs the results may differ.
inline int FloatToInt( float f )
{
#if defined( CPU_X86 ) && defined( _MSC_VER ) && defined( _M_IX86 )
    int i;
    __asm fld f
    __asm fistp i
    return i;
#else // Fallback
    return static_cast< int >( f );
#endif
}
The following function only rotates/scales a vector (the translation part of the matrix is not applied). vecMul is only a working name (any better name is welcome).
Code:
// Multiplies v with the upper-left 3x3 part of the matrix only; the fourth
// row and column (c[3][*] and c[*][3]) are not used, so no translation is
// applied to the result.
Vec3f Matrix4f::vecMul( const Vec3f &v ) const
{
    return Vec3f( v.x * c[0][0] + v.y * c[1][0] + v.z * c[2][0],
                  v.x * c[0][1] + v.y * c[1][1] + v.z * c[2][1],
                  v.x * c[0][2] + v.y * c[1][2] + v.z * c[2][2] );
}
I prefer this optimized version of Matrix4f * float:
Code:
// Returns a copy of this matrix with all sixteen elements multiplied by f;
// the matrix itself is left unchanged.
Matrix4f operator*( const float f ) const
{
    Matrix4f scaled( *this );

    for( int col = 0; col < 4; ++col )
    {
        for( int row = 0; row < 4; ++row )
        {
            scaled.c[col][row] *= f;
        }
    }

    return scaled;
}