Texture memory coherency with respect to concurrent kernel execution

Suppose that one kernel (kernel1) writes an array in global memory and another kernel (kernel2) reads the same array using the tex1Dfetch() function.

(Suppose the two kernels are in a loop body and are executed multiple times, as shown below:

for(i=0; i<500; i++) {
    kernel1<<<...>>>(...);
    kernel2<<<...>>>(...);
}

)

If the underlying GPU supports “concurrent kernel execution” (e.g., Tesla C2050/C2070), can these two kernels (kernel1 and kernel2) be executed concurrently?
If so, is the texture cache coherent with respect to the global memory writes?

The CUDA manual (NVIDIA CUDA C Programming Guide Version 4.0, Section 3.2.10.4) says that the texture cache is coherent with respect to writes by “previous kernel calls”, but it says nothing about writes by “concurrent kernels”:

“A thread can safely read some texture or surface memory location only if this memory location has been updated by a previous kernel call or memory copy, but not if it has been previously updated by the same thread or another thread from the same kernel call.”
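To make the pattern concrete, here is a minimal, self-contained sketch (with hypothetical writer/reader kernels; this is not the program further below): writer stores to an array through an ordinary global-memory pointer, and reader fetches the same array through a texture reference. Since both launches are issued back-to-back in the null stream, the read should fall under the “previous kernel call” guarantee quoted above:

texture<float, 1, cudaReadModeElementType> tex_a;

__global__ void writer(float * a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) a[i] = 2.0F * i;              // plain global-memory write
}

__global__ void reader(float * b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) b[i] = tex1Dfetch(tex_a, i);  // texture read of writer's output
}

int main()
{
    int n = 14000;
    float * d_a, * d_b;
    cudaMalloc((void **)&d_a, n * sizeof(float));
    cudaMalloc((void **)&d_b, n * sizeof(float));
    cudaBindTexture(0, tex_a, d_a, n * sizeof(float));
    // Both launches go to the null stream, so reader starts only after
    // writer has completely finished: the "previous kernel call" case.
    writer<<<(n + 255) / 256, 256>>>(d_a, n);
    reader<<<(n + 255) / 256, 256>>>(d_b, n);
    cudaThreadSynchronize();
    cudaUnbindTexture(tex_a);
    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}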

Thanks.

You could write a small sample program to simulate this and share the results here. I wouldn’t expect correct results with concurrent kernels reading and writing memory bound to textures…

I posted this question because the behavior is not easy to verify. The program shown below is the troublemaker: it runs correctly on GPUs such as the ION and Quadro 5600, but gives wrong results on Tesla C2050/C2070 GPUs. Interestingly, if I disable at least one of the texture accesses in the main_kernel0() function, or if I compile with the -G option, the program produces correct output on the Tesla GPU.

I can’t find any apparent error in the source code, so I suspect it may be a bug related to the Tesla C2050 executing compute capability 2.0 code. One feature unique to the Tesla C2050 is its capability for concurrent kernel execution. Of course, I tried to rule this out by calling cudaThreadSynchronize() between the launches, but I’m not sure whether that really prevents concurrent kernel execution.
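One more thing worth noting: the program below does not check any CUDA return codes, so a silent failure could also show up as wrong output. A minimal error-checking wrapper (a sketch; not part of the program as I ran it) would look like this:

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Kernel launches return no status themselves, so check afterwards:
//   main_kernel0<<<dimGrid0, dimBlock0>>>(gpu__colind__main, ...);
//   CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
//   CUDA_CHECK(cudaThreadSynchronize());   // errors during execution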

Any suggestion or correction would be appreciated!

===================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>

static unsigned int gpuBytes = 0;

texture<int, 1, cudaReadModeElementType> texture__colind;
texture<int, 1, cudaReadModeElementType> texture__rowptr;
texture<float, 1, cudaReadModeElementType> texture__values;
texture<float, 1, cudaReadModeElementType> texture__x;
texture<float, 1, cudaReadModeElementType> texture__y;

int * gpu__colind__main;
int * gpu__rowptr__main;
float * gpu__values__main;
float * gpu__x__main;
float * gpu__y__main;

double timer_()
{
    struct timeval time;
    gettimeofday(&time, 0);
    return (time.tv_sec + (time.tv_usec / 1000000.0));
}

int colind[1853104];
int rowptr[14000+1];
float values[1853104];
float x[14000];
float y[14000];

// Sparse matrix-vector product (CSR format): y = A*x, with the CSR arrays
// and the x vector read through texture fetches (tex1Dfetch).
__global__ void main_kernel0(int * colind, int * rowptr, float * values, float * x, float * y)
{
    int i, j, ii;
    float temp, tx;
    int _bid = (blockIdx.x + (blockIdx.y * gridDim.x));
    int _gtid = (threadIdx.x + (_bid * blockDim.x));
    i = _gtid;
    if (i < 14000)
    {
        temp = 0.0F;
        //for (j=rowptr[i]; j<rowptr[1+i]; j++)
        for (j = tex1Dfetch(texture__rowptr, i); j < tex1Dfetch(texture__rowptr, 1+i); j++)
        {
            ii = tex1Dfetch(texture__colind, j-1) - 1;
            //ii = colind[j-1] - 1;
            tx = tex1Dfetch(texture__x, ii);
            //tx = x[ii];
            temp = temp + (tex1Dfetch(texture__values, j-1) * tx);
            //temp = temp + (values[j-1] * tx);
        }
        y[i] = temp;
    }
}

// Scales each element of y to lie in (-1, 1) by repeated division or
// multiplication by 10, storing the result in x; y is read through texture__y.
__global__ void main_kernel1(float * x, float * y)
{
    int exp0, i, j;
    int _bid = (blockIdx.x + (blockIdx.y * gridDim.x));
    int _gtid = (threadIdx.x + (_bid * blockDim.x));
    i = _gtid;
    if (i < 14000)
    {
        exp0 = (int)log10f(fabsf(tex1Dfetch(texture__y, i)));
        x[i] = tex1Dfetch(texture__y, i);
        if ((-exp0) <= 0)
        {
            for (j = 1; j <= (1+exp0); j++)
            {
                x[i] = x[i] / 10.0F;
            }
        }
        else if ((1+exp0) <= 0)
        {
            j = -1;
            for (j = 1; j <= (-exp0); j++)
            {
                x[i] = 10.0F * x[i];
            }
        }
    }
}

int main()
{
    FILE * fp10;
    //char filename1[96] = "/home/f6l/CUDAInput/";
    char filename1[96] = "/tmp/CUDAInput/";
    char filename2[32] = "appu.rbC";
    float temp;
    float x_sum;
    double s_time1, e_time1;
    double s_time2, e_time2;
    double s_time3, e_time3;
    int exp0;
    int i, j, k;
    int r_ncol, r_nnzero, r_nrow;
    int _ret_val_0;

    ////////////////////////////////
    // CUDA Device Initialization //
    ////////////////////////////////
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
    cudaSetDevice(dev);

    // Allocate the device arrays and bind each one to its texture reference.
    gpuBytes = 1853104 * sizeof(int);
    cudaMalloc((void **)&gpu__colind__main, gpuBytes);
    cudaBindTexture(0, texture__colind, gpu__colind__main, gpuBytes);
    gpuBytes = (14000+1) * sizeof(int);
    cudaMalloc((void **)&gpu__rowptr__main, gpuBytes);
    cudaBindTexture(0, texture__rowptr, gpu__rowptr__main, gpuBytes);
    gpuBytes = 1853104 * sizeof(float);
    cudaMalloc((void **)&gpu__values__main, gpuBytes);
    cudaBindTexture(0, texture__values, gpu__values__main, gpuBytes);
    gpuBytes = 14000 * sizeof(float);
    cudaMalloc((void **)&gpu__x__main, gpuBytes);
    cudaBindTexture(0, texture__x, gpu__x__main, gpuBytes);
    cudaMalloc((void **)&gpu__y__main, gpuBytes);
    cudaBindTexture(0, texture__y, gpu__y__main, gpuBytes);

	printf("**** SerialSpmul starts! ****\n");

	strcat(filename1, filename2);

	printf("Input file: %s\n", filename2);

	s_time1=timer_();

	s_time2=timer_();

	if (((fp10=fopen(filename1, "r"))==((void * )0)))

	{

			printf("FILE %s DOES NOT EXIST; STOP\n", filename1);

			exit(1);

	}

	printf("FILE open done\n");

	fscanf(fp10, "%d %d %d", ( & r_nrow), ( & r_ncol), ( & r_nnzero));

	if ((r_nrow!=14000))

	{

			printf("alarm: incorrect row\n");

			exit(1);

	}

	if ((r_ncol!=14000))

	{

			printf("alarm: incorrect col\n");

			exit(1);

	}

	if ((r_nnzero!=1853104))

	{

			printf("alarm: incorrect nzero\n");

			exit(1);

	}

	for (i=0; i<=14000; i ++ )

	{

			fscanf(fp10, "%d", (rowptr+i));

	}

	for (i=0; i<1853104; i ++ )

	{

			fscanf(fp10, "%d", (colind+i));

	}

	for (i=0; i<1853104; i ++ )

	{

			fscanf(fp10, "%E", (values+i));

			/* for float variables */

	}

	fclose(fp10);

    // Initialize x[] on the host: each entry is a matrix value scaled into (-1, 1).
    j = 0;
    for (i = 0; i < 14000; i++)
    {
LB99:
        temp = values[j];
        if (((-0.1F) < temp) && (temp < 0.1F))
        {
            j += 1;
            /* goto LB99; */
            /* Added by SYLee */
            if (temp == 0.0F)
            {
                goto LB99;
            }
            x[i] = temp;
            continue;
        }
        exp0 = (int)log10f(fabsf(temp));
        x[i] = temp;
        if ((-exp0) <= 0)
        {
            for (k = 1; k <= (1+exp0); k++)
            {
                x[i] = x[i] / 10.0F;
            }
        }
        else if ((1+exp0) <= 0)
        {
            k = -1;
            for (k = 1; k <= (-exp0); k++)
            {
                x[i] = 10.0F * x[i];
            }
        }
        if ((1.0F < x[i]) || (x[i] < (-1.0F)))
        {
            printf("alarm initial i = %d\n", i);
            printf("x = %E\n", x[i]);
            printf("value = %E\n", values[1000+i]);
            printf("exp = %d\n", exp0);
            exit(1);
        }
        j += 1;
    }
    printf("initialization done\n");
    e_time2 = timer_();

    s_time3 = timer_();
    dim3 dimBlock0(384, 1, 1);
    dim3 dimGrid0(37, 1, 1);
    gpuBytes = 1853104 * sizeof(int);
    cudaMemcpy(gpu__colind__main, colind, gpuBytes, cudaMemcpyHostToDevice);
    gpuBytes = (14000+1) * sizeof(int);
    cudaMemcpy(gpu__rowptr__main, rowptr, gpuBytes, cudaMemcpyHostToDevice);
    gpuBytes = 1853104 * sizeof(float);
    cudaMemcpy(gpu__values__main, values, gpuBytes, cudaMemcpyHostToDevice);
    gpuBytes = 14000 * sizeof(float);
    cudaMemcpy(gpu__x__main, x, gpuBytes, cudaMemcpyHostToDevice);
    dim3 dimBlock1(384, 1, 1);
    dim3 dimGrid1(37, 1, 1);
    cudaThreadSynchronize();
    // Main loop: kernel0 writes y, which kernel1 reads through texture__y;
    // kernel1 writes x, which kernel0 reads through texture__x. Both kernels
    // are launched in the null stream, with host synchronization in between.
    for (k = 0; k < 500; k++)
    {
        main_kernel0<<<dimGrid0, dimBlock0, 0, 0>>>(gpu__colind__main, gpu__rowptr__main, gpu__values__main, gpu__x__main, gpu__y__main);
        cudaThreadSynchronize();
        main_kernel1<<<dimGrid1, dimBlock1, 0, 0>>>(gpu__x__main, gpu__y__main);
        cudaThreadSynchronize();
    }
    cudaThreadSynchronize();
    gpuBytes = 14000 * sizeof(float);
    cudaMemcpy(x, gpu__x__main, gpuBytes, cudaMemcpyDeviceToHost);
    e_time3 = timer_();
    e_time1 = timer_();
    printf("Total elapsed time = %f seconds\n", e_time1 - s_time1);
    printf("Initialize time = %f seconds\n", e_time2 - s_time2);
    printf("Main Comp time = %f seconds\n", e_time3 - s_time3);
    x_sum = 0.0F;
    for (i = 0; i < 14000; i++)
    {
        x_sum += x[i];
    }
    printf("%d: x_sum = %.12E\n", k+1, x_sum);
    _ret_val_0 = 0;
    cudaFree(gpu__colind__main);
    cudaFree(gpu__rowptr__main);
    cudaFree(gpu__values__main);
    cudaFree(gpu__x__main);
    cudaFree(gpu__y__main);
    fflush(stdout);
    fflush(stderr);
    return _ret_val_0;
}

(1) The texture cache is guaranteed to be coherent with respect to writes by a previous kernel in the same stream.
(2) CUDA operations not assigned to a specific stream by the programmer are placed in the null stream by default; thus every CUDA operation belongs to some stream.
(3) For two kernels to execute concurrently, they must be in different streams.
(4) If there is a data dependency between kernels in different streams (regardless of whether textures are involved), explicit inter-stream synchronization (e.g. cudaStreamWaitEvent) must be used; otherwise a race condition exists. A minimal sketch follows this list.
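To illustrate point (4), here is a minimal sketch with hypothetical producer and consumer kernels: an event recorded in stream s1 after the producer, and waited on in stream s2 before the consumer, enforces the dependency without blocking the host.

__global__ void producer(float * a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) a[i] = (float)i;
}

__global__ void consumer(const float * a, float * b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) b[i] = 2.0F * a[i];
}

void launch_dependent(float * d_a, float * d_b, int n)
{
    cudaStream_t s1, s2;
    cudaEvent_t done;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    cudaEventCreate(&done);
    int block = 256;
    int grid = (n + block - 1) / block;
    producer<<<grid, block, 0, s1>>>(d_a, n);       // writes d_a in s1
    cudaEventRecord(done, s1);                      // completion marker in s1
    cudaStreamWaitEvent(s2, done, 0);               // s2 waits for the event
    consumer<<<grid, block, 0, s2>>>(d_a, d_b, n);  // safely reads d_a in s2
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    cudaEventDestroy(done);
}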

Thank you for the clarification. In the program above, then, concurrent execution at least is not the cause of the incorrect output, since both kernels are launched in the same default (null) stream.