Then, is there any limit on the number of texture fetches using tex1Dfetch() per kernel?
The reason for the above question is because of the following program; it runs correctly on Quadro FX 5600 (Driver API v2.0) and ION (Driver API v3.2). On Tesla C2050 (Driver API v4.0), however,
depending on the number of texture-fetched variables, it may generate incorrect output.
(In main_kernel0() function in the below code, four variables (rowptr, colind, values, and x) are fetched thru tex1Dfetch() functions. On Tesla, if at least one of the variables is not fetched thru texture function, the program runs correctly, but if all of these variables are fetched together, the program runs incorrectly. Of course, this problem occurs only when run on Tesla C2050.
Can you find any bug in the program? or explain why this weird behavior can occur?
Thanks.
==================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <cutil.h>
#include <math.h>
static unsigned int gpuBytes = 0;
texture<int, 1, cudaReadModeElementType> texture__colind;
texture<int, 1, cudaReadModeElementType> texture__rowptr;
texture<float, 1, cudaReadModeElementType> texture__values;
texture<float, 1, cudaReadModeElementType> texture__x;
texture<float, 1, cudaReadModeElementType> texture__y;
int * gpu__colind__main;
int * gpu__rowptr__main;
float * gpu__values__main;
float * gpu__x__main;
float * gpu__y__main;
extern double timer_();
int colind[1853104];
int rowptr[(14000+1)];
float values[1853104];
float x[14000];
float y[14000];
global void main_kernel0(int * colind, int * rowptr, float * values, float * x, float * y)
{
int i;
int j;
float temp;
float tx;
int ii;
int _bid = (blockIdx.x+(blockIdx.ygridDim.x));
int _gtid = (threadIdx.x+(_bidblockDim.x));
i=_gtid;
if (i<14000)
{
temp=0.0F;
//for (j=rowptr[i]; j<rowptr[(1+i)]; j ++ )
for (j=tex1Dfetch(texture__rowptr, i); j<tex1Dfetch(texture__rowptr, (1+i)); j ++ )
{
ii = tex1Dfetch(texture__colind, (j-1))-1;
//ii = colind[(j-1)]-1;
tx = tex1Dfetch(texture__x, ii);
//tx = x[ii];
temp=(temp+(tex1Dfetch(texture__values, (j-1))*tx));
//temp=(temp+(values[(j-1)]*tx));
}
y[i]=temp;
}
}
global void main_kernel1(float * x, float * y)
{
int exp0;
int i;
int j;
int _bid = (blockIdx.x+(blockIdx.ygridDim.x));
int _gtid = (threadIdx.x+(_bidblockDim.x));
i=_gtid;
if (i<14000)
{
exp0=((int)log10f(fabsf(tex1Dfetch(texture__y, i))));
x[i]=tex1Dfetch(texture__y, i);
if ((( - exp0)<=0))
{
for (j=1; j<=(1+exp0); j ++ )
{
x[i]=(x[i]/10.0F);
}
}
else
{
if (((1+exp0)<=0))
{
j=( - 1);
for (j=1; j<=( - exp0); j ++ )
{
x[i]=(10.0F*x[i]);
}
}
}
}
}
int main()
{
FILE * fp10;
char filename1[96] = “/home/f6l/CUDAInput/”;
char filename2[32] = “appu.rbC”;
float temp;
float x_sum;
double s_time1;
double e_time1;
double s_time2;
double e_time2;
double s_time3;
double e_time3;
int exp0;
int i;
int j;
int k;
int r_ncol;
int r_nnzero;
int r_nrow;
int _ret_val_0;
////////////////////////////////
// CUDA Device Initialization //
////////////////////////////////
int deviceCount;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
if (deviceCount == 0) {
fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
int dev = 0;
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
CUDA_SAFE_CALL(cudaSetDevice(dev));
gpuBytes=(1853104*sizeof (int));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__colind__main)), gpuBytes));
cudaBindTexture(0, texture__colind, gpu__colind__main, gpuBytes);
gpuBytes=((14000+1)*sizeof (int));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__rowptr__main)), gpuBytes));
cudaBindTexture(0, texture__rowptr, gpu__rowptr__main, gpuBytes);
gpuBytes=(1853104*sizeof (float));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__values__main)), gpuBytes));
cudaBindTexture(0, texture__values, gpu__values__main, gpuBytes);
gpuBytes=(14000*sizeof (float));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__x__main)), gpuBytes));
cudaBindTexture(0, texture__x, gpu__x__main, gpuBytes);
gpuBytes=(14000*sizeof (float));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__y__main)), gpuBytes));
gpuBytes=(14000*sizeof (float));
cudaBindTexture(0, texture__y, gpu__y__main, gpuBytes);
printf("**** SerialSpmul starts! ****\n");
strcat(filename1, filename2);
printf("Input file: %s\n", filename2);
s_time1=timer_();
s_time2=timer_();
if (((fp10=fopen(filename1, "r"))==((void * )0)))
{
printf("FILE %s DOES NOT EXIST; STOP\n", filename1);
exit(1);
}
printf("FILE open done\n");
fscanf(fp10, "%d %d %d", ( & r_nrow), ( & r_ncol), ( & r_nnzero));
if ((r_nrow!=14000))
{
printf("alarm: incorrect row\n");
exit(1);
}
if ((r_ncol!=14000))
{
printf("alarm: incorrect col\n");
exit(1);
}
if ((r_nnzero!=1853104))
{
printf("alarm: incorrect nzero\n");
exit(1);
}
for (i=0; i<=14000; i ++ )
{
fscanf(fp10, "%d", (rowptr+i));
}
for (i=0; i<1853104; i ++ )
{
fscanf(fp10, "%d", (colind+i));
}
for (i=0; i<1853104; i ++ )
{
fscanf(fp10, "%E", (values+i));
/* for float variables */
}
fclose(fp10);
j=0;
for (i=0; i<14000; i ++ )
{
{
LB99:
temp=values[j];
}
if (((( - 0.1F)<temp)&&(temp<0.1F)))
{
j+=1;
/* goto LB99; /
/ Added by SYLee /
if ((temp==0.0F))
{
goto LB99;
}
x[i]=temp;
continue;
}
exp0=((int)log10f(fabsf(temp)));
x[i]=temp;
if ((( - exp0)<=0))
{
for (k=1; k<=(1+exp0); k ++ )
{
x[i]=(x[i]/10.0F);
}
}
else
{
if (((1+exp0)<=0))
{
k=( - 1);
for (k=1; k<=( - exp0); k ++ )
{
x[i]=(10.0Fx[i]);
}
}
}
if (((1.0F<x[i])||(x[i]<( - 1.0F))))
{
printf(“alarm initial i = %d\n”, i);
printf(“x = %E\n”, x[i]);
printf(“value = %E\n”, values[(1000+i)]);
printf(“exp = %d\n”, exp0);
exit(1);
}
j+=1;
}
printf(“initialization done\n”);
e_time2=timer_();
s_time3=timer_();
dim3 dimBlock0(384, 1, 1);
dim3 dimGrid0(37, 1, 1);
gpuBytes=(1853104*sizeof (int));
CUDA_SAFE_CALL(cudaMemcpy(gpu__colind__main, colind, gpuBytes, cudaMemcpyHostToDevice));
gpuBytes=((14000+1)*sizeof (int));
CUDA_SAFE_CALL(cudaMemcpy(gpu__rowptr__main, rowptr, gpuBytes, cudaMemcpyHostToDevice));
gpuBytes=(1853104*sizeof (float));
CUDA_SAFE_CALL(cudaMemcpy(gpu__values__main, values, gpuBytes, cudaMemcpyHostToDevice));
gpuBytes=(14000*sizeof (float));
CUDA_SAFE_CALL(cudaMemcpy(gpu__x__main, x, gpuBytes, cudaMemcpyHostToDevice));
dim3 dimBlock1(384, 1, 1);
dim3 dimGrid1(37, 1, 1);
//cudaThreadSynchronize();
for (k=0; k<500; k ++ )
{
main_kernel0<<<dimGrid0, dimBlock0, 0, 0>>>(gpu__colind__main, gpu__rowptr__main, gpu__values__main, gpu__x__main, gpu__y__main);
//cudaThreadSynchronize();
main_kernel1<<<dimGrid1, dimBlock1, 0, 0>>>(gpu__x__main, gpu__y__main);
//cudaThreadSynchronize();
}
//cudaThreadSynchronize();
gpuBytes=(14000*sizeof (float));
CUDA_SAFE_CALL(cudaMemcpy(x, gpu__x__main, gpuBytes, cudaMemcpyDeviceToHost));
e_time3=timer_();
e_time1=timer_();
printf("Total elapsed time = %f seconds\n", (e_time1-s_time1));
printf("Initialize time = %f seconds\n", (e_time2-s_time2));
printf("Main Comp time = %f seconds\n", (e_time3-s_time3));
x_sum=0.0F;
for (i=0; i<14000; i ++ )
{
x_sum+=x[i];
}
printf("%d: x_sum = %.12E\n", (k+1), x_sum);
_ret_val_0=0;
CUDA_SAFE_CALL(cudaFree(gpu__colind__main));
CUDA_SAFE_CALL(cudaFree(gpu__rowptr__main));
CUDA_SAFE_CALL(cudaFree(gpu__values__main));
CUDA_SAFE_CALL(cudaFree(gpu__x__main));
CUDA_SAFE_CALL(cudaFree(gpu__y__main));
fflush(stdout);
fflush(stderr);
return _ret_val_0;
}