Are there any cuda libararies for 3x3 matrix & vector3 & quaternion operations?

Hi guys,

Currently I’m implementing a cuda based physics simulation framework, and will be using matrix3x3 vector3 quaternion operations a lot. I’d like to use some useful cuda device functions similar to this one https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h

If there exists a good cuda libarary for these operations, I won’t implement my own version. Otherwise, I will slightly change the code from the above link, and use it in my project.

Any advice?

Thanks.

So I have implemeted a simple version, If some of you have a better solution, please give some advice.

Thanks.

// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h

#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H

/*****************************************
                Vector
/*****************************************/

__device__
    inline float fastDiv(float numerator, float denominator)
{
    return __fdividef(numerator, denominator);        
    //        return numerator/denominator;        
}

__device__
    inline float getSqrtf(float f2)
{
    return sqrtf(f2);
    //        return sqrt(f2);
}

__device__
    inline float getReverseSqrt(float f2)
{
    return rsqrtf(f2);
}

__device__
    inline float3 getCrossProduct(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
}

__device__
    inline float4 getCrossProduct(float4 a, float4 b)
{
    float3 v1 = make_float3(a.x, a.y, a.z);
    float3 v2 = make_float3(b.x, b.y, b.z);
    float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 

    return make_float4(v3.x, v3.y, v3.z, 0.0f);
}

__device__
    inline float getDotProduct(float3 a, float3 b)
{
     return a.x * b.x + a.y * b.y + a.z * b.z;
}

__device__
    inline float getDotProduct(float4 a, float4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

__device__ float3 getNormalizedVec(const float3 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}

__device__ float4 getNormalizedVec(const float4 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}

__device__
    inline float dot3F4(float4 a, float4 b)
{
    float4 a1 = make_float4(a.x, a.y, a.z,0.f);
    float4 b1 = make_float4(b.x, b.y, b.z,0.f);
    return getDotProduct(a1, b1);
}

__device__
    inline float getLength(float3 a)
{
    return sqrtf(getDotProduct(a, a));
}

__device__
    inline float getLength(float4 a)
{
    return sqrtf(getDotProduct(a, a));
}

/*****************************************
                Matrix3x3
/*****************************************/
typedef struct
{
    float4 m_row[3];
}Matrix3x3_d;

__device__
    inline void setZero(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}

__device__
    inline Matrix3x3_d getZeroMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    return m;
}

__device__ 
    inline void setIdentity(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
}

__device__
    inline Matrix3x3_d getIdentityMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
    return m;
}

__device__
    inline Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
    Matrix3x3_d out;
    out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
    out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
    out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
    return out;
}

__device__
    inline Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
    Matrix3x3_d transB = getTranspose( b );
    Matrix3x3_d ans;
    //        why this doesn't run when 0ing in the for{}
    a.m_row[0].w = 0.f;
    a.m_row[1].w = 0.f;
    a.m_row[2].w = 0.f;
    for(int i=0; i<3; i++)
    {
        //        a.m_row[i].w = 0.f;
        ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
        ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
        ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
        ans.m_row[i].w = 0.f;
    }
    return ans;
}

/*****************************************
                Quaternion
/*****************************************/

typedef float4 Quaternion;

__device__
    inline Quaternion quaternionMul(Quaternion a, Quaternion b);

__device__
    inline Quaternion qtNormalize(Quaternion in);

__device__
    inline float4 qtRotate(Quaternion q, float4 vec);

__device__
    inline Quaternion qtInvert(Quaternion q);

__device__
    inline Matrix3x3_d qtGetRotationMatrix(Quaternion q);

__device__
    inline Quaternion quaternionMul(Quaternion a, Quaternion b)
{
    Quaternion ans;
    ans = getCrossProduct(a, b);
    ans = make_float4(ans.x + a.w*b.x + b.w*a.x + b.w*a.y, ans.y + a.w*b.y + b.w*a.z, ans.z + a.w*b.z, ans.w + a.w*b.w + b.w*a.w);
    //        ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
    ans.w = a.w*b.w - dot3F4(a, b);
    return ans;
}

__device__
    inline Quaternion qtNormalize(Quaternion in)
{
    return getNormalizedVec(in);
    //        in /= length( in );
    //        return in;
}

__device__
    inline Quaternion qtInvert(const Quaternion q)
{
    return make_float4(-q.x, -q.y, -q.z, q.w);
}

__device__
    inline float4 qtRotate(const Quaternion q, const float4 vec)
{
    Quaternion qInv = qtInvert( q );
    float4 vcpy = vec;
    vcpy.w = 0.f;
    float4 out = quaternionMul(quaternionMul(q,vcpy),qInv);
    return out;
}

__device__
    inline float4 qtInvRotate(const Quaternion q, const float4 vec)
{
    return qtRotate( qtInvert( q ), vec );
}

__device__
    inline Matrix3x3_d qtGetRotationMatrix(Quaternion quat)
{
    float4 quat2 = make_float4(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
    Matrix3x3_d out;

    out.m_row[0].x=1-2*quat2.y-2*quat2.z;
    out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
    out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
    out.m_row[0].w = 0.f;

    out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
    out.m_row[1].y=1-2*quat2.x-2*quat2.z;
    out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
    out.m_row[1].w = 0.f;

    out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
    out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
    out.m_row[2].z=1-2*quat2.x-2*quat2.y;
    out.m_row[2].w = 0.f;

    return out;
}

#endif  // UNIFIED_MATH_CUDA_H

We had to code a similar library for 2x2 and 4x4 matrices, including matrix inversion. We had to take care that everything stayed in registers (any spillage to local memory would have killed our performance). We later also added some support for column and row vectors.

We then applied this code to do radio simulations for MIMO channels in LTE and radio systems beyond 4G.

I have posted my 2x2 matrix code to these forums (use the search feature). The 4x4 version was left as an excercise to the reader ;) It appears that your code is quite similar to ours.

After searching in this forum using your username, I got nothing related to your 2x2 cuda matrix operation code. If it’s possible, please give me the specific link. Thank you!

https://devtalk.nvidia.com/default/topic/418753/fun-with-complex-numbers-today-2x2-matrix-inversions-millions-example-use-of-my-cudacomplex-cl/

Thanks^-^