Hi,
I am experimenting with texture fetches and trying to write a program that uses texture reads. I have a few small doubts:
1- With reference to the following kernel, why is a temporary variable x used here? Can't we simply do d_out[idx] = tex1D(texRef, float(idx)); ?
__global__ void readTexels(int n, float *d_out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < n) {
        // Note: Appendix D.2 gives the formula for interpolation
        float x = tex1D(texRef, float(idx));
        d_out[idx] = x;
    }
}
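For comparison, this is what I have in mind with the temporary removed (a minimal sketch; the name readTexelsDirect is mine, and I assume the same texRef bound to a cudaArray as in the book's example):

__global__ void readTexelsDirect(int n, float *d_out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < n) {
        // Write the fetched texel straight to global memory,
        // skipping the intermediate variable x.
        d_out[idx] = tex1D(texRef, float(idx));
    }
}

As far as I can tell this should behave identically, so I suspect the temporary is only there for readability or debugging, but I would like to confirm that.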
2- With reference to the program shown below, the author says that it binds a 1D texture to linear memory. The texture is used to fetch floating-point values from the linear memory, and the texture is then updated in-place. The results are then copied back to the host and checked for correctness.
After going through this code I am unable to see how the texture is updated in-place. If we look at
kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);
it is clear that it passes d_a, which already holds data, as the following line shows:
cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice );
So when the kernel is launched, it receives the location d_a, which already contains data. Inside the kernel the statement
d_out[idx] = -tex1Dfetch(texRef, idx);
will overwrite d_out and NOT the texture!
Would anybody please help me understand this point.
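To make my confusion concrete, here is the data flow as I trace it through the code (a minimal annotated excerpt; the comments are my reading, not the author's):

// 1. Host data is copied into the linear device buffer d_a.
cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice );

// 2. The texture reference is bound to that same buffer d_a.
cudaBindTexture(0, texRef, d_a, memSize);

// 3. d_a is also what gets passed as the kernel's d_out parameter,
//    so the kernel writes into the very buffer the texture is bound to.
kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

Is that aliasing between d_out and the texture's backing memory what the author means by "updated in-place"?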
#include <stdio.h>
#include <assert.h>
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}
texture<float, 1, cudaReadModeElementType> texRef;
__global__ void kernel(int n, float *d_out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < n)
    {
        d_out[idx] = -tex1Dfetch(texRef, idx);
    }
}
#define NUM_THREADS 256
int main()
{
    int N = 2560;
    int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);
    int memSize = N*sizeof(float);

    // fill the host array with increasing values
    float *data = (float*) malloc(memSize);
    for (int i = 0; i < N; i++) data[i] = float(i);

    float *d_a;
    cudaMalloc( (void **) &d_a, memSize );
    cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice );

    cudaBindTexture(0, texRef, d_a, memSize);
    checkCUDAError("bind");

    kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

    float *h_out = (float*) malloc(memSize);
    cudaMemcpy(h_out, d_a, memSize, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy");

    for (int i = 0; i < N; i++)
    {
        assert(data[i] == -h_out[i]);
    }
    printf("Correct\n");

    cudaUnbindTexture(texRef);
    checkCUDAError("cudaUnbindTexture");

    cudaFree(d_a);
    free(h_out);
    free(data);
}