----- Synopsis description of the problem -----
Kernel fails to run strangely.
----- Detailed description of the problem -----
Kernel fails to run:
return data are incorrect (tipically zeros),
kernel execution time is almost zero,
but no errors are reported.
Kernel works incorrectly when
the following operation is performed:
[codebox]// UNCOMMENT THE FOLLOWING LINE TO "SOLVE" THE PROBLEM//#define WORKAROUND_ON
#include <cuda_runtime.h>
#include <stdio.h>
#include <cutil_inline.h>
#define em2(xmask, k) ((float) ( (((xmask) >> (k)) & 3) - 1))
#define BIT(PROG, IND) (((PROG) >> (IND) ) & 1)
#define drand (((double) rand()) / RAND_MAX)
long int NTHREADS;
unsigned char outputCouples2[64];
device void d_vprod(float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2, int ien2)
{
float4 in1 = *vinp1, in2 = *vinp2;
#ifdef WORKAROUND_ON
float bug1, bug2;
bug1 = em2(ien2,28);
bug2 = in1.w*in2.z;
vout1->x = em2(ien2,6)(in1.x)(in2.w) + em2(ien2,24)(in1.w)(in2.x);
vout1->y = em2(ien2,14)*(in1.y)*(in2.w) + em2(ien2,26)*(in1.w)*(in2.y);
#ifdef WORKAROUND_ON
vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + bug1*bug2;
vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + em2(ien2,28)*in1.w*in2.z;
vout1->w = em2(ien2,20)*(in1.z)*(in2.z) + em2(ien2,30)*(in1.w)*(in2.w); //scalar
vout2->x = em2(ien2,12)(in1.y)(in2.z) + em2(ien2,18)(in1.z)(in2.y);
vout2->y = em2(ien2,4)*(in1.x)*(in2.z) + em2(ien2,16)*(in1.z)*(in2.x);
vout2->z = em2(ien2,2)*(in1.x)*(in2.y) + em2(ien2,8)*(in1.y)*(in2.x);
vout2->w = em2(ien2,0)*(in1.x)*(in2.x) + em2(ien2,10)*(in1.y)*(in2.y); //scalar
vout1->w += vout2->w;
vout2->w = 0;
}
global void d_gprod(unsigned short code, float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2,
unsigned char* outputCouples2, unsigned char* outtype, int ien2)
{
const unsigned long long outswaps = 0x55aa55aa55aa55aaLL;
unsigned short code0;
float4 *voutS, *voutP;
unsigned short swapon;
int tid = blockIdx.x*512+threadIdx.x;
code0 = code & 0x3f;
swapon = (unsigned short) BIT(outswaps, code0);
if (swapon) {
voutS = vout2+tid;
voutP = vout1+tid;
} else {
voutS = vout1+tid;
voutP = vout2+tid;
}
d_vprod(vinp1, vinp2+tid, voutS, voutP, ien2);
*(outtype+tid) = outputCouples2[code0];
}
int main(int argc, char** argv)
{
// INIT
NTHREADS = 512*512;
for(int i=0; i<=63; i++) outputCouples2[i]=0;
// HOST MEMORY ALLOCATION
unsigned short codein;
float4 qin1;
float4* qin2 = (float4*) malloc(NTHREADS*sizeof(float4));
float4* qout1 = (float4*) malloc(NTHREADS*sizeof(float4));
float4* qout2 = (float4*) malloc(NTHREADS*sizeof(float4));
unsigned char* codeout = (unsigned char*) malloc(NTHREADS*sizeof(unsigned char));
// DEVICE MEMORY ALLOCATION
float4* d_qin1; cutilSafeCall(cudaMalloc((void**)&d_qin1,sizeof(float4)));
float4* d_qin2; cutilSafeCall(cudaMalloc((void**)&d_qin2,NTHREADS*sizeof(float4)));
float4* d_qout1; cutilSafeCall(cudaMalloc((void**)&d_qout1,NTHREADS*sizeof(float4)));
float4* d_qout2; cutilSafeCall(cudaMalloc((void**)&d_qout2,NTHREADS*sizeof(float4)));
unsigned char* d_outputCouples2; cutilSafeCall(cudaMalloc((void**)&d_outputCouples2,64*sizeof(unsigned char)));
unsigned char* d_codeout; cutilSafeCall(cudaMalloc((void**)&d_codeout,NTHREADS*sizeof(unsigned char)));
// DATA TRANSFER TO DEVICE
cutilSafeCall(cudaMemcpy(d_outputCouples2,outputCouples2,64*
sizeof(unsigned char),cudaMemcpyHostToDevice));
// TIMER INIT
double gpu_kernel_time = 0, gpu_total_time = 0;
unsigned int hTimer, hTimer2;
// OPERANDS INIT
qin1.x = drand; qin1.y = drand; qin1.z = drand; qin1.w = drand;
for (int i=0; i<NTHREADS; i++)
{
qin2[i].x = drand; qin2[i].y=drand; qin2[i].z = drand; qin2[i].w = drand;
}
codein = 0;
// TIMER START (KERNEL + DATA TRANSFER)
cutilCheckError( cutCreateTimer(&hTimer2) );
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer2) );
cutilCheckError( cutStartTimer(hTimer2) );
// DATA TRANSFER TO DEVICE
cutilSafeCall(cudaMemcpy(d_qin2,qin2,NTHREADS*sizeof(float4)
,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_qin1,&qin1,sizeof(float4),cudaMemcpyHostToDevice));
// TIMER START (KERNEL ONLY)
cutilCheckError( cutCreateTimer(&hTimer) );
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
// COMPUTATION ON DEVICE (KERNEL EXECUTION)
dim3 dimBlock(((NTHREADS<=512)?NTHREADS:512),1);
dim3 dimGrid(((NTHREADS<=512)?1:NTHREADS/512),1);
d_gprod<<<dimGrid,dimBlock>>>(codein, d_qin1, d_qin2, d_qout1, d_qout2, d_outputCouples2, d_codeout, 0);
// TIMER END (KERNEL ONLY)
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutStopTimer(hTimer) );
gpu_kernel_time = cutGetTimerValue(hTimer);
// DATA TRANSFER TO HOST
cutilSafeCall(cudaMemcpy(qout1,d_qout1,NTHREADS*sizeof(float
4),cudaMemcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(qout2,d_qout2,NTHREADS*sizeof(float
4),cudaMemcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(codeout,d_codeout,NTHREADS*sizeof(u
nsigned char),cudaMemcpyDeviceToHost));
// TIMER END (KERNEL + DATA TRANSFER)
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutStopTimer(hTimer2) );
gpu_total_time = cutGetTimerValue(hTimer2);
// SHOW FIRST 3 RESULTS
for(int i=0; i<=2; i++)
{
printf("--- Operation n. %d ---\n", i+1);
printf("(%.2f, %.2f, %.2f, %.2f) GP ", qin1.x, qin1.y, qin1.z, qin1.w);
printf("(%.2f, %.2f, %.2f, %.2f) = ", qin2[i].x, qin2[i].y, qin2[i].z, qin2[i].w);
printf("(%.2f, %.2f, %.2f, %.2f) + ", qout1[i].x, qout1[i].y, qout1[i].z, qout1[i].w);
printf("(%.2f, %.2f, %.2f, %.2f)\n", qout2[i].x, qout2[i].y, qout2[i].z, qout2[i].w);
//printf("Output code: %d \n", codeout[i]);
}
// SHOW TOTAL TIME
printf(“\nNumber of threads: %li\n”,NTHREADS);
printf(“GPU kernel time: %lf msec\n”,gpu_kernel_time);
printf(“GPU total time: %lf msec\n”,gpu_total_time);
return 0;
}[/codebox]