I tried to use texture memory to accelerate my application and wrote the following code:
----------------------file_name: main.cu------------------------------------------
#include <stdio.h>
// File-scope 1D texture reference over int data; main() binds device memory to it.
texture<int, 1> ref;

// Fetches element 0 through the texture reference `ref` and prints it from the device.
__global__ void fetch_kernel()
{
    printf("val = %d.", tex1Dfetch(ref, 0));
}
// Reports a failed CUDA runtime call on stderr and makes main() return 1.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/*
 * Copies a two-element host array to the device, binds that memory to the
 * texture reference `ref`, launches fetch_kernel on a single thread, then
 * unbinds the texture and releases the device buffer.
 *
 * Returns 0 on success, 1 if any CUDA call reports an error.
 */
int main(int argc, char *argv[])
{
    int in[2] = {1, 2};
    int *d_in = NULL;
    const size_t bytes = sizeof(in);

    CUDA_CHECK(cudaMalloc((void **)&d_in, bytes));
    CUDA_CHECK(cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice));
    // First argument 0: no byte-offset output is requested from the bind.
    CUDA_CHECK(cudaBindTexture(0, ref, d_in, bytes));

    fetch_kernel<<<1, 1>>>();
    CUDA_CHECK(cudaGetLastError());       // a launch does not return its error directly
    CUDA_CHECK(cudaDeviceSynchronize());  // wait for the kernel so its printf output appears

    CUDA_CHECK(cudaUnbindTexture(ref));
    CUDA_CHECK(cudaFree(d_in));           // the original never released this buffer
    return 0;
}
The output of this code is “val = 1.” The first element of array d_in was fetched.
I then split this code into three files, as shown below:
------------------------kernel.h----------------------------------------
#ifndef KERNEL_H_
#define KERNEL_H_

// 1D texture reference over int data. Because it is defined in this header,
// every .cu file that includes the header gets this definition.
// NOTE(review): texture references appear to be private to each translation
// unit, so kernel.cu and main.cu would each own a separate `ref`, and binding
// one would not bind the other. Confirm against the CUDA C Programming Guide;
// if so, the bind/unbind calls must live in the same .cu file as the kernel
// that fetches from `ref`.
texture<int, 1> ref;

// Prints element 0 fetched through `ref`. Takes no arguments, matching the
// definition in kernel.cu and the launch in main.cu.
__global__ void fetch_kernel();

#endif /* KERNEL_H_ */
-----------------------kernel.cu-----------------------------
#include "kernel.h"
#include <stdio.h>

// Fetches element 0 through the texture reference `ref` that kernel.h defines
// and prints it from the device.
__global__ void fetch_kernel()
{
    int val = tex1Dfetch(ref, 0);
    printf("val = %d.", val);
}
-----------------------main.cu---------------------------------
#include "kernel.h"
#include <stdio.h>

// Reports a failed CUDA runtime call on stderr and makes main() return 1.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/*
 * Copies a two-element host array to the device, binds that memory to the
 * texture reference `ref` visible in this file, launches fetch_kernel on a
 * single thread, then unbinds the texture and releases the device buffer.
 *
 * Returns 0 on success, 1 if any CUDA call reports an error.
 */
int main(int argc, char *argv[])
{
    int in[2] = {1, 2};
    int *d_in = NULL;
    const size_t bytes = sizeof(in);

    CUDA_CHECK(cudaMalloc((void **)&d_in, bytes));
    CUDA_CHECK(cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice));
    // NOTE(review): this binds the `ref` that main.cu sees through kernel.h.
    // Presumably the kernel compiled in kernel.cu reads its own copy - verify.
    CUDA_CHECK(cudaBindTexture(0, ref, d_in, bytes));

    fetch_kernel<<<1, 1>>>();
    CUDA_CHECK(cudaGetLastError());       // a launch does not return its error directly
    CUDA_CHECK(cudaDeviceSynchronize());  // wait for the kernel so its printf output appears

    CUDA_CHECK(cudaUnbindTexture(ref));
    CUDA_CHECK(cudaFree(d_in));           // the original never released this buffer
    return 0;
}
The output of this code is “val = 0.” The first element of array d_in was not fetched correctly!!!
Why?? Thanks for your help.
I am using SLES 11 SP2 with CUDA 5.0 and a Tesla S2050 GPU.
The code is compiled as follows:
# Compile each .cu file separately into its own object file.
nvcc -c kernel.cu -arch=sm_20
nvcc -c main.cu -arch=sm_20
# Link the two object files into the executable `tex`.
nvcc kernel.o main.o -o tex