fetch texture memory error!

I tried to use texture memory to accelerate my application and wrote the following test code:
----------------------file_name: main.cu------------------------------------------

#include <stdio.h>
// File-scope 1D texture reference over int elements. main() binds it to a
// linear device buffer with cudaBindTexture() before fetch_kernel reads it.
texture<int, 1>ref;

// Debug kernel: fetches element 0 through the texture reference `ref` and
// prints it from the device. Every launched thread performs the same fetch.
__global__ void fetch_kernel()
{
	printf("val = %d.", tex1Dfetch(ref, 0));
}

// Host driver: uploads a two-element array, binds it to the texture reference
// `ref`, launches fetch_kernel on a single thread, then releases all resources.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel fails.
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	const int in[2] = {1, 2};
	const size_t bytes = sizeof(in);
	int *d_in = 0;
	bool bound = false;

	// Each step runs only if everything before it succeeded, so the first
	// failure is the one that gets reported.
	cudaError_t err = cudaMalloc((void **)&d_in, bytes);
	if (err == cudaSuccess) {
		err = cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice);
	}
	if (err == cudaSuccess) {
		err = cudaBindTexture(0, ref, d_in, bytes);
		bound = (err == cudaSuccess);
	}
	if (err == cudaSuccess) {
		fetch_kernel<<<1, 1>>>();
		// A launch does not return a status; configuration errors are only
		// visible through cudaGetLastError().
		err = cudaGetLastError();
	}
	if (err == cudaSuccess) {
		// Waits for the kernel (flushing device-side printf output) and
		// surfaces any fault raised while it was executing.
		err = cudaDeviceSynchronize();
	}

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
	}

	// Cleanup runs on both the success and the failure path.
	if (bound) {
		cudaUnbindTexture(ref);
	}
	cudaFree(d_in);	// no-op when the allocation never happened
	return err == cudaSuccess ? 0 : 1;
}

The output of this code is “val = 1.” The first element of array d_in was fetched.

And then, I restructured this code in three files as below:
------------------------kernel.h----------------------------------------

#ifndef KERNEL_H_
#define KERNEL_H_

// NOTE(review): texture references have file scope. With the build shown
// (each .cu compiled separately, no relocatable device code), every .cu file
// that includes this header gets its own, separate `ref`. Binding `ref` from
// one translation unit does not bind the `ref` read by a kernel compiled in
// another one; the cudaBindTexture() call and the kernel that fetches must be
// compiled in the same .cu file.
texture<int, 1>ref;

// Kernel that fetches element 0 through `ref` and prints it.
// Declared without parameters so it matches both its definition and the
// argument-less launch `fetch_kernel<<<1, 1>>>()`.
__global__ void fetch_kernel();

#endif /* KERNEL_H_ */

-----------------------kernel.cu-----------------------------
#include "kernel.h"
#include <stdio.h>

// Fetches element 0 through the texture reference `ref` declared in kernel.h
// and prints it from the device.
// NOTE(review): this reads the `ref` instance that belongs to this translation
// unit, so it only sees data if host code compiled in this same .cu file has
// bound that instance.
__global__ void fetch_kernel()
{
	int val = tex1Dfetch(ref, 0);
	printf("val = %d.", val);
}

-----------------------main.cu---------------------------------
#include "kernel.h"
#include <stdio.h>

// Host driver: uploads a two-element array, binds it to the texture reference
// `ref`, launches fetch_kernel on a single thread, then releases all resources.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel fails.
// NOTE(review): the `ref` bound here is the instance belonging to this
// translation unit; a kernel compiled in a different .cu file reads its own
// instance and is not affected by this binding.
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	const int in[2] = {1, 2};
	const size_t bytes = sizeof(in);
	int *d_in = 0;
	bool bound = false;

	// Each step runs only if everything before it succeeded, so the first
	// failure is the one that gets reported.
	cudaError_t err = cudaMalloc((void **)&d_in, bytes);
	if (err == cudaSuccess) {
		err = cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice);
	}
	if (err == cudaSuccess) {
		err = cudaBindTexture(0, ref, d_in, bytes);
		bound = (err == cudaSuccess);
	}
	if (err == cudaSuccess) {
		fetch_kernel<<<1, 1>>>();
		// A launch does not return a status; configuration errors are only
		// visible through cudaGetLastError().
		err = cudaGetLastError();
	}
	if (err == cudaSuccess) {
		// Waits for the kernel (flushing device-side printf output) and
		// surfaces any fault raised while it was executing.
		err = cudaDeviceSynchronize();
	}

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
	}

	// Cleanup runs on both the success and the failure path.
	if (bound) {
		cudaUnbindTexture(ref);
	}
	cudaFree(d_in);	// no-op when the allocation never happened
	return err == cudaSuccess ? 0 : 1;
}

The output of this code is “val = 0.” The first element of array d_in was not fetched correctly!!!
Why?? Thanks for your help.


I use SLES 11 SP2 with cuda 5.0 and Tesla S2050 GPU.
The code is compiled as follows:
nvcc -c kernel.cu -arch=sm_20
nvcc -c main.cu -arch=sm_20
nvcc kernel.o main.o -o tex