Hi,
I have implemented a small example that computes the rotation of points.
For that, I have used two different methods.
The first one calls the trigonometric functions directly, and the second one uses tables of values loaded into the texture unit.
As a result, all the GPU versions are faster than the classical C version (built with the tables of values), but it is the trigonometric GPU version which obtains the best result (8818 ms for the C version, 1636 ms for the GPU version using tables, and 1188 ms for the trigonometric GPU version).
I am a little disappointed by this result… What do you think about it? Is it normal, or is my code not optimal?
Thanks for your advice.
++ Beleys
PS: I have added the two kernels below.
Texture version:
// Texture-table variant of the alpha/beta computation.
//
// Each thread handles one point: it reads two floats from `dom`, truncates
// them to integers, uses them as indices into lookup tables bound to texture
// references (declared elsewhere in the file), and writes two floats to `res`.
//
// Parameters:
//   dom        - input; dom[2*i] and dom[2*i+1] are read for point i
//   res        - output; res[2*i] receives alpha, res[2*i+1] receives beta
//   cosTl,
//   sinTl      - factors combined with the table values (see body)
//   nbPoint    - number of points; threads whose index is >= nbPoint do nothing
//   AzimutX100,
//   MMPI, pi   - not used by this kernel
//
// Launch: the point index is threadIdx.x + blockIdx.x * blockDim.x, so the
// x dimension of the launch must cover at least nbPoint threads.
//
// No shared memory is used and no thread reads data produced by another
// thread, so the kernel needs no block-wide barrier.
__global__ void calculAlphaBetaBrutFor(float *dom, float * res,float AzimutX100, float cosTl, float sinTl,float MMPI, float pi, int nbPoint)
{
    // Fetch the position of this thread in the whole launch.
    const int indice = (int)(threadIdx.x + blockIdx.x * blockDim.x);

    // Threads past the end of the data have nothing to do.
    if (indice >= nbPoint)
        return;

    // The two inputs are truncated to integers because they serve as table indices.
    const int Pdir = (int) dom[indice*2];
    const int Pai  = (int) dom[indice*2+1];
    // NOTE(review): AzimutX100 is not added to Pdir here, whereas
    // calculAlphaBetaBrutTer uses Pdir + AzimutX100 - confirm this is intended.

    // The +9000 offset presumably shifts a signed value into the non-negative
    // index range of the "P" tables - TODO confirm against the table construction.
    const float sinai = tex1Dfetch(texSinus100P, Pai+9000);
    const float cosai = tex1Dfetch(texCosinus100P, Pai+9000);

    const float sinda_cosai = tex1Dfetch(texSinus100, Pdir) * cosai;

    // The +120000 offset is applied after truncating the combined value to int;
    // presumably it re-centres the index of the arc-cosine table - TODO confirm.
    const int beta = (int) tex1Dfetch(texACosinusP, (int) ((sinTl*sinda_cosai+sinai*cosTl)) + 120000);

    const float acos100  = tex1Dfetch(texCosinus100, Pdir);
    const float unSsinus = tex1Dfetch(texUnSurSinus1024_100k, beta);

    int alpha = (int) tex1Dfetch(texACosinusP, (int) ((acos100*cosai* unSsinus)) + 120000);

    // The sign of alpha follows the sign of this combination of the inputs.
    if (cosTl*sinda_cosai-sinai*sinTl<0)
        alpha = -alpha;

    res[indice*2]   = alpha;
    res[indice*2+1] = beta;
}
Trigonometric version:
// Helper for calculAlphaBetaBrutTer: converts an offset value in the range
// [20000, 220000] to an arc-cosine expressed in degrees * 1024.
//
// The value is first mapped to (v - 20000) / 100000 - 1, i.e. 20000 -> -1 and
// 220000 -> +1. Values below 20000 yield 1024 * 180 and values above 220000
// yield 0, which are exactly the results of the formula at its two limits.
// A NaN input fails both comparisons and propagates through acosf.
//
// The original code subtracted 200 instead of 20000, which does not undo the
// +120000 offset applied by the caller and pushed the acosf argument outside
// [-1, 1] for inputs just inside the clamp limits.
static __device__ float calculAlphaBetaBrutTer_acosDeg1024(float v, float pi)
{
    if (v < 20000.0f)
        return 1024 * 180;
    if (v > 220000.0f)
        return 0.0f;
    return 1024 * acosf((v - 20000.0f) / 100000.0f - 1.0f) * 180 / pi;
}

// Trigonometric variant of the alpha/beta computation.
//
// Each thread handles one point: it reads two floats from `dom`, evaluates
// the trigonometric expressions directly with sinf/cosf/acosf, and writes
// two floats to `res`.
//
// Parameters:
//   dom        - input; dom[2*i] (direction) and dom[2*i+1] (angle) for point i,
//                both divided by 100 and multiplied by MMPI before use
//   res        - output; res[2*i] receives alpha, res[2*i+1] receives beta
//   AzimutX100 - added to the direction read from dom before it is scaled
//   cosTl,
//   sinTl      - factors combined with the sine/cosine values; presumably
//                pre-scaled by 100000 to match the 120000 offset - TODO confirm
//   MMPI       - multiplier applied to angles before calling sinf/cosf;
//                presumably pi / 180 - TODO confirm against the caller
//   pi         - divisor used when converting acosf results to degrees
//   nbPoint    - number of points; threads whose index is >= nbPoint do nothing
//
// Launch: the point index is threadIdx.x + blockIdx.x * blockDim.x, so the
// x dimension of the launch must cover at least nbPoint threads. (The
// original multiplied by an external numTh constant, which only produced
// correct indices when it equalled the launch block size.)
__global__ void calculAlphaBetaBrutTer(float *dom, float * res,float AzimutX100, float cosTl, float sinTl, float MMPI, float pi, int nbPoint){
    // Fetch the position of this thread in the whole launch.
    const int indice = (int)(threadIdx.x + blockIdx.x * blockDim.x);

    // Threads past the end of the data have nothing to do.
    if (indice >= nbPoint)
        return;

    // Fetch the input data.
    const float Pdir = dom[indice*2];
    const float Pai  = dom[indice*2+1];

    // Scale both inputs into the unit expected by sinf/cosf.
    const float aai  = Pai / 100.0f * MMPI;
    const float ddir = (Pdir + AzimutX100) / 100.0f * MMPI;

    const float cosai = cosf(aai);
    const float sinai = sinf(aai);
    const float sinda_cosai = sinf(ddir) * cosai;

    // beta: combine, shift by 120000, then clamp and take the arc-cosine.
    float beta1 = sinTl * sinda_cosai;
    beta1 = beta1 + sinai * cosTl;
    beta1 += 120000.0f;
    const float beta = calculAlphaBetaBrutTer_acosDeg1024(beta1, pi);

    // alpha: same treatment applied to cos(dir) * cos(ai) * 100000 / sin(beta).
    const float cosDir = cosf(ddir);
    // NOTE(review): when beta is 0 or 1024 * 180 the sine below can be zero (or
    // nearly so) and the quotient becomes infinite/huge; alpha then lands on a
    // clamp value, or on NaN if cosDir * cosai is exactly zero.
    const float unSurSinus1024 = 100000.0f / sinf(beta / 1024 * MMPI);
    float alpha1 = cosDir * cosai * unSurSinus1024;
    alpha1 += 120000.0f;
    float alpha = calculAlphaBetaBrutTer_acosDeg1024(alpha1, pi);

    // The sign of alpha follows the sign of this combination of the inputs.
    if (cosTl*sinda_cosai - sinai*sinTl < 0)
        alpha = -alpha;

    res[indice*2]   = alpha;
    res[indice*2+1] = beta;
}