Writing to texture memory

From Part 13 of the Dr. Dobb’s article series on CUDA: “The CUDA Toolkit 2.2 introduced the ability to write to 2D textures bound to pitch linear memory on the GPU that has a texture bound to it. In other words, the data within the texture can be updated within a kernel running on the GPU.”

Can anyone point me to an example of how to do this or provide one? I couldn’t find any mention of this in the programming guide or any examples in the SDK.
Thanks

That is an interesting find which, if true, is a welcome step.

It is true, and it isn’t. Yes, you can bind linear memory to a texture, and yes, you can modify that linear memory in a running kernel. But there is no guarantee of texture cache coherence, so any texture reads made within a kernel which is modifying the source device memory won’t reflect those modifications. Subsequent kernels will see those modifications, however.
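To make that concrete, here is a minimal sketch (names like updateBuffer and readBuffer are illustrative, not from any SDK sample): one kernel writes through the raw device pointer while a texture is bound to that memory, and a second launch then reads the updated values through the texture.

texture<float, 1, cudaReadModeElementType> linTex;

__global__ void updateBuffer(float *d_buf, int n)
{
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < n)
		d_buf[i] *= 2.0f;	// write to the linear memory the texture is bound to
	// a tex1Dfetch(linTex, i) here is NOT guaranteed to see the write above:
	// the texture cache is not kept coherent within a kernel launch
}

__global__ void readBuffer(float *d_out, int n)
{
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < n)
		d_out[i] = tex1Dfetch(linTex, i);	// a subsequent launch sees the writes
}

// host side (error checking omitted):
//   cudaMalloc((void**)&d_buf, n*sizeof(float));
//   cudaBindTexture(0, linTex, d_buf, n*sizeof(float));
//   updateBuffer<<<nBlocks, NUM_THREADS>>>(d_buf, n);
//   readBuffer  <<<nBlocks, NUM_THREADS>>>(d_out, n);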

I too would be interested in seeing an example/tutorial… avidday, any examples?

Hi there…

I’m also very interested in using linear memory for texture access and wrote a small example, but it obviously doesn’t work.

Actually I started from the Dr. Dobb’s example implementation, tried to bind the texture to pitch linear memory, and then used the texture to linearly interpolate between the values.

Here’s the code:

//readTexels.cu
#include <stdio.h>
#include <stdlib.h>

#define LIN_MEM

void checkCUDAError(const char *msg)
{
	cudaError_t err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

texture<float, 1, cudaReadModeElementType> texRef;

__global__ void readTexels(int n, float *d_out)
{
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if (idx < n)
	{
		float f_idx = float(idx)/2.f;
		f_idx += 0.5f;
		// Note: Appendix D.2 gives the formula for interpolation
#ifndef LIN_MEM
		float x = tex1D(texRef, f_idx);
#else
		//float x = tex1Dfetch(texRef, f_idx);
		float x = tex1D(texRef, f_idx);
#endif
		d_out[idx] = x;
	}
}

#define NUM_THREADS 256

int main()
{
	// 10 is illustrative and should be larger in practice
	int N = 10;
	int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);

	// allocate space on the device for the results
	float *d_out;
	cudaMalloc((void**)&d_out, sizeof(float)*N);
	// allocate space on the host for the results
	float *h_out = (float*)malloc(sizeof(float)*N);
	// fill the input array with increasing values
	float *data = (float*)malloc(N*sizeof(float));
	for (int i = 0; i < N; i++) data[i] = float(i);

#ifndef LIN_MEM
	// create a CUDA array on the device
	cudaArray* cuArray;
	cudaMallocArray(&cuArray, &texRef.channelDesc, N, 1);
	cudaMemcpyToArray(cuArray, 0, 0, data, sizeof(float)*N, cudaMemcpyHostToDevice);
	// bind the texture to the CUDA array
	cudaBindTextureToArray(texRef, cuArray);
#else
	// use pitch linear memory...
	float *d_data;
	size_t pitch;
	// note: the height argument of cudaMallocPitch is in rows, not bytes
	cudaMallocPitch((void**)&d_data, &pitch, N * sizeof(float), 1);
	cudaMemcpy(d_data, data, N * sizeof(float), cudaMemcpyHostToDevice);

	// bind the texture to the linear memory
	cudaBindTexture(0, texRef, d_data, N*sizeof(float));
	checkCUDAError("bind");
#endif
	// host side settable texture attributes
	texRef.normalized = false;
	texRef.filterMode = cudaFilterModeLinear;

	// read texels from the texture
	readTexels<<<nBlocks, NUM_THREADS>>>(N, d_out);

	// copy texels to the host
	cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost);

	// look at them
	for (int i = 0; i < N; i++)
	{
		printf("%f\n", h_out[i]);
	}

	free(h_out);
	free(data);
	cudaFree(d_out);
#ifndef LIN_MEM
	cudaFreeArray(cuArray);
#else
	cudaFree(d_data);
#endif
	cudaUnbindTexture(texRef);
	checkCUDAError("cuda free operations");
}

This little test allocates an array of ten floats { 0, 1, …, 9 } on the device, binds this memory to a texture reference, and samples it in the kernel at the logical positions { 0, 0.5, 1, 1.5, …, 4.5 } (i.e. at texture coordinates { 0.5, 1.0, …, 5.0 }, after the 0.5 texel-centre offset).

I’d expect an output of exactly these values, but every returned value is zero…
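(For reference, the interpolation formula the kernel comment points to, from Appendix D.2 of the programming guide, is tex(x) = (1 − α)·T[i] + α·T[i+1], with x_B = x − 0.5, i = floor(x_B) and α = frac(x_B). For idx = 1 the kernel samples at x = 1.0, so x_B = 0.5 and the expected result is 0.5·T[0] + 0.5·T[1] = 0.5, matching the values above.)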

What’s wrong here?

If I use

float x = tex1Dfetch(texRef, f_idx);

instead of

float x = tex1D(texRef, f_idx);

the texture accesses work, but of course no interpolation is done…
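As a workaround (my own sketch, not from the article), the interpolation can be done by hand on top of tex1Dfetch; bounds handling is omitted:

__device__ float lerpFetch(float x)
{
	// emulate cudaFilterModeLinear for a texture bound to linear memory;
	// x is an unnormalized coordinate where 0.5 is the centre of texel 0
	float xb = x - 0.5f;			// undo the texel-centre offset
	int   i  = (int)floorf(xb);
	float a  = xb - (float)i;		// fractional part
	float t0 = tex1Dfetch(texRef, i);	// tex1Dfetch takes integer indices
	float t1 = tex1Dfetch(texRef, i + 1);	// caller must keep i+1 in range
	return (1.0f - a)*t0 + a*t1;		// linear blend of the two texels
}

With float x = lerpFetch(f_idx); in place of the tex1D() call, the LIN_MEM path returns the interpolated values.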

Further, I don’t even get the expected results when I use …

You cannot use cudaFilterModeLinear on textures bound to linear memory.

That’s right, but what about pitch linear memory? The Dr. Dobb’s article mentions that this ought to be possible…

Does anybody know if this is true?

Well… I finally managed to get the texture unit to do filtering on pitch linear memory. All I had to do was use cudaBindTexture2D() instead of cudaBindTexture(), and fetch the data with tex2D().

Now it should also be possible to write to the pitch linear memory in a kernel and later use the texture unit to read from it with linear interpolation; see the sketch after the code below.

The code now looks like this:

//readTexels.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

void checkCUDAError(const char *msg)
{
	cudaError_t err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

texture<float, 2, cudaReadModeElementType> texRef;

__global__ void readTexels(int n, float *d_out)
{
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if (idx < n)
	{
		float f_idx = float(idx)/2.f;
		f_idx += 0.5f;

		float x = tex2D(texRef, f_idx, 0);
		d_out[idx] = x;
	}
}

#define NUM_THREADS 256

int main()
{
	// 10 is illustrative and should be larger in practice
	int N = 10;
	int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);

	// allocate space on the device for the results
	float *d_out;
	cudaMalloc((void**)&d_out, sizeof(float)*N);
	// allocate space on the host for the results
	float *h_out = (float*)malloc(sizeof(float)*N);
	// fill the input array with increasing values
	float *data = (float*)malloc(N*sizeof(float));
	for (int i = 0; i < N; i++) data[i] = float(i);

	// use pitch linear memory...
	float *d_data;
	size_t pitch;
	size_t height = 1;
	size_t width = N;
	// allocate pitch linear memory (the height argument is in rows, not bytes)
	cudaMallocPitch((void**)&d_data, &pitch, width * sizeof(float), height);

	size_t h_pitch = width * sizeof(float);
	cudaMemcpy2D(d_data, pitch, data, h_pitch, width*sizeof(float), height, cudaMemcpyHostToDevice);

	size_t offset = 0;
	// bind the texture to the pitch linear memory
	cudaBindTexture2D(&offset, texRef, d_data, texRef.channelDesc, width, height, pitch);
	checkCUDAError("bind");

	// host side settable texture attributes
	texRef.normalized = false;
	texRef.filterMode = cudaFilterModeLinear;

	// read texels from the texture
	readTexels<<<nBlocks, NUM_THREADS>>>(N, d_out);

	// copy texels to the host
	cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost);

	// look at them
	for (int i = 0; i < N; i++)
	{
		printf("%f\n", h_out[i]);
	}

	free(h_out);
	free(data);
	cudaFree(d_out);
	cudaFree(d_data);
	cudaUnbindTexture(texRef);
	checkCUDAError("cuda free operations");
}
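And to come back to the original question of writing to texture memory: a kernel can update the same pitch linear buffer through the raw pointer, and a subsequent launch of readTexels will then see the new values through the filtering texture. A sketch (writeTexels is illustrative), reusing d_data and pitch from the code above:

__global__ void writeTexels(int n, float *d_data, size_t pitch)
{
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if (idx < n)
	{
		// pitch is in bytes; step to row 0 of the pitch linear buffer
		float *row = (float*)((char*)d_data + 0*pitch);
		row[idx] = 2.0f*float(idx);	// modify the memory the texture is bound to
	}
}

// host side, after the bind:
//   writeTexels<<<nBlocks, NUM_THREADS>>>(N, d_data, pitch);
//   readTexels <<<nBlocks, NUM_THREADS>>>(N, d_out);	// sees the updated, filtered values

As avidday noted above, the read has to happen in a later kernel: within writeTexels itself a tex2D() fetch is not guaranteed to reflect the writes, since the texture cache is not kept coherent during a launch.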