Getting confused when using tex2D

I want to read a ppm and simply show the picture. but when I run the program, the window is just black, and I find that tex2D return everything zero. I’m new to cuda and I’m not sure what’s wrong with my program.Here is some parts of my code.

cudaArray *cuArray;
texture<uchar4, 2, cudaReadModeElementType> tex;
uchar4 *h_output = new uchar4[width*height];

//read texture
void readTexture(char *filename,const char *exe_path, uint width, uint height) {

		char *imgPath = sdkFindFilePath(filename, exe_path);
		//Load ppm picture
		unsigned char *h_data = new unsigned char[width*height*4];
		if (sdkLoadPPM4(imgPath, &h_data, &width, &height) == 0)
			cout << "Error load pic." << endl;
		else 
			cout << "Succeeded." << endl;

		

		for(int i = 0; i < width*height; i++) {
			h_output[i].x = h_data[4*i];
			h_output[i].y = h_data[4*i+1];
			h_output[i].z = h_data[4*i+2];
			h_output[i].w = h_data[4*i+3];
		}
		cout << (int)h_output[0].x <<  " " << (int)h_output[0].y << " " << (int)h_output[0].z << " " << (int)h_output[0].w << endl;

		// Copy to device memory some data located at address h_data in host memory
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar4>();
		checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
		cudaMemcpyToArray(cuArray, 0, 0, h_output, sizeof(h_output),cudaMemcpyHostToDevice);

		// Set texture parameters
		tex.normalized     = 1;                    // access with normalized texture coordinates
		tex.filterMode     = cudaFilterModePoint;  // and without any filtering (we want raw 2x32bit values)
		tex.addressMode[0] = cudaAddressModeClamp; //clamped, eg 1.25 -> 1.0 in [0,1.0);warped, eg 1.25 -> 0.25
		tex.addressMode[1] = cudaAddressModeClamp;
		tex.addressMode[2] = cudaAddressModeClamp;

		// Bind the array to the texture
		checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));
}

This part read rgb from texture and write it into pixels_device. Then I use OpenGL to show it.

__global__ void kernel_function(uchar *pixels_device, int num_particle, int width, int height)
{
    unsigned int x=threadIdx.x+blockIdx.x*blockDim.x;
    unsigned int y=threadIdx.y+blockIdx.y*blockDim.y;

    if(x >= ((unsigned int)width) || y >= ((unsigned int)height))
    {
        return;
    }

    float u = (x / (float) width);
    float v = (y / (float) height);
	float w = (u+v*width)/width/height;
	//if (x <= 10 &&  y <= 10)
		//printf("%d %d %.2f %.2f\n",x, y, u, v);
    unsigned int offset=x+y*width;
    pixels_device[offset*4]   =	tex2D(tex, u, v).x;//it's zero.I don't know why...
    pixels_device[offset*4+1] = tex2D(tex, u, v).y;//it's zero 
    pixels_device[offset*4+2] = tex2D(tex, u, v).z;//and this zero too
    pixels_device[offset*4+3] = 0;
    __syncthreads();
}

Thanks for every reply…

How big is sizeof(h_output)

it’s 4…and i change it to widthheightsizeof(uchar4) and get the right answers.
can’t believe this cost me half day to debug…
Thanks a lot!!!