Hi there…
I’m also very interested in using linear memory for texture access and wrote a small example code, but it obviously doesn’t work.
Actually I used Dr. Dobb’s example implementation and tried to bind pitched linear memory to the texture and then use the texture to linearly interpolate between the values.
Here’s the code:
//readTexels.cu
#include <stdio.h>
#define LIN_MEM
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
texture<float, 1, cudaReadModeElementType> texRef;
__global__ void readTexels(int n, float *d_out)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < n)
{
float f_idx = float(idx)/2.f;
f_idx += 0.5f;
//Note: Appendix D.2 gives formula for interpolation
#ifndef LIN_MEM
float x = tex1D(texRef, f_idx);
#else
//float x = tex1Dfetch(texRef, f_idx);
float x = tex1D(texRef, f_idx);
#endif
d_out[idx] = x;
}
}
#define NUM_THREADS 256
int main()
{
// 10 is illustrative and should be larger in practice
int N = 10;
int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0);
// allocate space on the device for the results
float *d_out;
cudaMalloc((void**)&d_out, sizeof(float)*N);
// allocate space on the host for the results
float *h_out = (float*)malloc(sizeof(float)*N);
// data fill array with increasing values
float *data = (float*)malloc(N*sizeof(float));
for (int i = 0; i < N; i++) data[i] = float(i);
#ifndef LIN_MEM
// create a CUDA array on the device
cudaArray* cuArray;
cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);
cudaMemcpyToArray(cuArray, 0, 0, data, sizeof(float)*N , cudaMemcpyHostToDevice );
// bind a texture to the CUDA array
cudaBindTextureToArray (texRef, cuArray);
#else
// use linear memory...
float *d_data;
size_t pitch;
cudaMallocPitch( (void**) &d_data, &pitch, N * sizeof(float), 1 * sizeof(float) );
cudaMemcpy( d_data, data, N * sizeof(float), cudaMemcpyHostToDevice );
// bind a texture to the CUDA array
cudaBindTexture( 0, texRef, d_data, N*sizeof(float) );
checkCUDAError("bind");
#endif
// host side settable texture attributes
texRef.normalized = false;
texRef.filterMode = cudaFilterModeLinear;
// read texels from texture
readTexels <<< nBlocks, NUM_THREADS >>> (N, d_out);
// copy texels to host
cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost);
// look at them
for (int i = 0; i < N; i++)
{
printf("%f\n",h_out[i]);
}
free(h_out);
cudaFree(d_out);
#ifndef LIN_MEM
cudaFreeArray(cuArray);
#else
cudaFree(d_data);
#endif
cudaUnbindTexture(texRef);
checkCUDAError("cuda free operations");
}
This little test allocates an array with ten floats { 0, 1, …, 9 } on the device, binds this memory to a texture reference and accesses it in the kernel on the indexes { 0, 0.5, 1, 1.5, … 4.5 }.
I’d expect an output of exactly these values, but all values returned equal zero…
What’s wrong here?
If I use
float x = tex1Dfetch(texRef, f_idx);
instead of
float x = tex1D(texRef, f_idx);
the texture accesses work, but of course no interpolation is done…
Further, I even don’t get the expected results when I use