Why didn't you try something like this for the multiply?
Multiplication and transpose are the "slow" operations on a matrix.
Code:
Mat4x4 operator*(const Mat4x4 &o)
{
Mat4x4 res;
#if ASM_MATH
// SSE single-precision 4x4 matrix product, written as an MSVC-style __asm block.
// Addresses are held in the 32-bit registers eax/ecx, so this is 32-bit x86 code.
//
// Register roles:
//   eax       - pointer to o's floats while they are loaded, then the address of res
//   ecx       - this
//   xmm4-xmm7 - the four 16-byte rows of o (byte offsets 0, 16, 32, 48)
//   xmm0-xmm3 - partial products for the row in flight; xmm3 collects the finished row
//
// For each row i the code computes
//   res row i = sum over j of broadcast(float at [this + 16*i + 4*j]) * (row j of o)
// i.e. with flat float indices: res[4*i + k] = sum_j this[4*i + j] * o[4*j + k].
//
// The loads and multiplies of the next row are interleaved with the additions of
// the current row; judging by the "stall" remarks at the end, this is meant to
// hide instruction latency. The instruction order is therefore deliberate.
//
// Every 128-bit access uses movups, so no 16-byte alignment is required.
// Assumes the 16 floats of a Mat4x4 are contiguous and start at offset 0 of the
// object - TODO confirm against the Mat4x4 declaration.
// NOTE(review): the #else fallback computes r.m[4*i + k] = sum_j m[4*j + k] * o.m[4*i + j],
// which matches this block only if `this` and `o` are exchanged - confirm which
// operand order / storage convention is intended.
// NOTE(review): no `return res;` is visible on this path in the excerpt - confirm
// that the result is actually returned.
__asm
{
mov eax,[o] // eax = address of the right-hand matrix (pointer value behind the reference o)
mov ecx,[this] // ecx = address of the left-hand matrix
movups xmm4,[eax] // Other.M[0][0-3]
movups xmm5,[eax+16] // Other.M[1][0-3]
movups xmm6,[eax+32] // Other.M[2][0-3]
movups xmm7,[eax+48] // Other.M[3][0-3]
lea eax,[res] // eax is free again: reuse it as the destination pointer
// Begin first row of result.
movss xmm0,[ecx] // M[0][0]
shufps xmm0,xmm0,0 // broadcast lane 0 to all four lanes (same pattern for every scalar below)
mulps xmm0,xmm4
movss xmm1,[ecx+4] // M[0][1]
shufps xmm1,xmm1,0
mulps xmm1,xmm5
movss xmm2,[ecx+8] // M[0][2]
shufps xmm2,xmm2,0
mulps xmm2,xmm6
addps xmm1,xmm0 // First row done with xmm0
movss xmm3,[ecx+12] // M[0][3]
shufps xmm3,xmm3,0
mulps xmm3,xmm7
// Begin second row of result.
movss xmm0,[ecx+16] // M[1][0]
shufps xmm0,xmm0,0
mulps xmm0,xmm4
addps xmm3,xmm2 // First row done with xmm2
movss xmm2,[ecx+20] // M[1][1]
shufps xmm2,xmm2,0
mulps xmm2,xmm5
addps xmm3,xmm1 // First row done with xmm1
movss xmm1,[ecx+24] // M[1][2]
shufps xmm1,xmm1,0
mulps xmm1,xmm6
movups [eax],xmm3 // Store Result.M[0][0-3]
// Done computing first row.
addps xmm2,xmm0 // Second row done with xmm0
movss xmm3,[ecx+28] // M[1][3]
shufps xmm3,xmm3,0
mulps xmm3,xmm7
// Begin third row of result.
movss xmm0,[ecx+32] // M[2][0]
shufps xmm0,xmm0,0
mulps xmm0,xmm4
addps xmm3,xmm1 // Second row done with xmm1
movss xmm1,[ecx+36] // M[2][1]
shufps xmm1,xmm1,0
mulps xmm1,xmm5
addps xmm3,xmm2 // Second row done with xmm2
movss xmm2,[ecx+40] // M[2][2]
shufps xmm2,xmm2,0
mulps xmm2,xmm6
movups [eax+16],xmm3 // Store Result.M[1][0-3]
// Done computing second row.
addps xmm1,xmm0 // Third row done with xmm0
movss xmm3,[ecx+44] // M[2][3]
shufps xmm3,xmm3,0
mulps xmm3,xmm7
// Begin fourth row of result.
movss xmm0,[ecx+48] // M[3][0]
shufps xmm0,xmm0,0
mulps xmm0,xmm4
addps xmm3,xmm2 // Third row done with xmm2
movss xmm2,[ecx+52] // M[3][1]
shufps xmm2,xmm2,0
mulps xmm2,xmm5
addps xmm3,xmm1 // Third row done with xmm1
movss xmm1,[ecx+56] // M[3][2]
shufps xmm1,xmm1,0
mulps xmm1,xmm6
movups [eax+32],xmm3 // Store Result.M[2][0-3]
// Done computing third row.
addps xmm2,xmm0 // Fourth row done with xmm0
movss xmm3,[ecx+60] // M[3][3]
shufps xmm3,xmm3,0
mulps xmm3,xmm7
// stall (no further row left to interleave with the remaining additions)
addps xmm3,xmm1 // Fourth row done with xmm1
// stall
addps xmm3,xmm2 // Fourth row done with xmm2
movups [eax+48],xmm3 // Store Result.M[3][0-3]
// Done computing fourth row.
}
#else
// Portable scalar fallback, used when ASM_MATH is not enabled.
// Fully unrolled 4x4 product over the flat 16-element arrays m (this) and o.m:
//   r.m[4*i + k] = m[k]*o.m[4*i] + m[4 + k]*o.m[4*i + 1] + m[8 + k]*o.m[4*i + 2] + m[12 + k]*o.m[4*i + 3]
// for i, k in 0..3. Each group of four statements fills r.m[4*i .. 4*i + 3] and
// reads only o.m[4*i .. 4*i + 3].
// NOTE(review): the `res` local declared at the top of the function is not used
// on this path; the result is built in and returned from `r`.
// NOTE(review): the ASM_MATH path computes res[4*i + k] = sum_j this[4*i + j] * o[4*j + k],
// which equals this formula only with the two operands exchanged - confirm
// which operand order / storage convention is intended.
Mat4x4 r;
// i = 0: r.m[0..3] from o.m[0..3]
r.m[0] = m[0] * o.m[0] + m[4] * o.m[1] + m[8] * o.m[2] + m[12] * o.m[3];
r.m[1] = m[1] * o.m[0] + m[5] * o.m[1] + m[9] * o.m[2] + m[13] * o.m[3];
r.m[2] = m[2] * o.m[0] + m[6] * o.m[1] + m[10] * o.m[2] + m[14] * o.m[3];
r.m[3] = m[3] * o.m[0] + m[7] * o.m[1] + m[11] * o.m[2] + m[15] * o.m[3];
// i = 1: r.m[4..7] from o.m[4..7]
r.m[4] = m[0] * o.m[4] + m[4] * o.m[5] + m[8] * o.m[6] + m[12] * o.m[7];
r.m[5] = m[1] * o.m[4] + m[5] * o.m[5] + m[9] * o.m[6] + m[13] * o.m[7];
r.m[6] = m[2] * o.m[4] + m[6] * o.m[5] + m[10] * o.m[6] + m[14] * o.m[7];
r.m[7] = m[3] * o.m[4] + m[7] * o.m[5] + m[11] * o.m[6] + m[15] * o.m[7];
// i = 2: r.m[8..11] from o.m[8..11]
r.m[8] = m[0] * o.m[8] + m[4] * o.m[9] + m[8] * o.m[10] + m[12] * o.m[11];
r.m[9] = m[1] * o.m[8] + m[5] * o.m[9] + m[9] * o.m[10] + m[13] * o.m[11];
r.m[10] = m[2] * o.m[8] + m[6] * o.m[9] + m[10] * o.m[10] + m[14] * o.m[11];
r.m[11] = m[3] * o.m[8] + m[7] * o.m[9] + m[11] * o.m[10] + m[15] * o.m[11];
// i = 3: r.m[12..15] from o.m[12..15]
r.m[12] = m[0] * o.m[12] + m[4] * o.m[13] + m[8] * o.m[14] + m[12] * o.m[15];
r.m[13] = m[1] * o.m[12] + m[5] * o.m[13] + m[9] * o.m[14] + m[13] * o.m[15];
r.m[14] = m[2] * o.m[12] + m[6] * o.m[13] + m[10] * o.m[14] + m[14] * o.m[15];
r.m[15] = m[3] * o.m[12] + m[7] * o.m[13] + m[11] * o.m[14] + m[15] * o.m[15];
return r;
#endif