// Textures: anything wrong with this?
//
// Anything wrong with this?

#include<stdio.h>
#include<stdlib.h>

#include<cuda.h>

#include<iostream>

// Image dimensions in elements: `height` rows by `width` columns. Both the
// kernels and the host code size their loops and buffers from these.
// NOTE(review): these are lowercase object-like macros, so every later
// identifier spelled `height` or `width` is textually replaced by 64; avoid
// using those names for variables, parameters or struct members in this file.
#define height 64

#define width 64

// Edge length of the square thread block (BLOCKSIZE x BLOCKSIZE threads)
// used for the 2D kernel launches.
#define BLOCKSIZE 16

// Brings std names (cout, endl) into the global namespace for the host code.
using namespace std;

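// This program copies host data into cudaMallocPitch'd device memory,
// allocates a second device buffer with cudaMalloc, has kernels copy from
// the pitched buffer into the cudaMalloc'd buffer, then reads that buffer
// back to the host and prints it.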

// 2D texture reference with float texels. The host code binds it to the
// pitched device buffer, and kernel_w_textures samples it with tex2D().

texture<float,2> texRefEx;

// Serial reference kernel.
//
// Walks the whole height x width image with two nested loops and stores
// source + 1.0f into the linear destination buffer. The thread/block indices
// are never consulted, so every thread that runs this kernel repeats the
// complete copy.
//
//   devMPPtr  source image; consecutive rows are `pitch` bytes apart
//   devMPtr   destination image, densely packed (width floats per row)
//   pitch     byte distance between consecutive rows of devMPPtr
__global__ void kernel(float* devMPPtr, float * devMPtr, int pitch)
{
    // Row stepping is done in bytes because the pitch is a byte count.
    const char* pitchedBase = (const char*)devMPPtr;

    for (int y = 0; y < height; ++y) {
        const float* srcRow = (const float*)(pitchedBase + y * pitch);
        float* dstRow = devMPtr + y * width;

        for (int x = 0; x < width; ++x) {
            dstRow[x] = srcRow[x] + 1.0f;
        }
    }
}

// Parallel kernel: one thread per element.
//
// Each thread reads one float from the pitched source buffer and stores
// source + 2.0f into the densely packed destination buffer.
//
// Launch layout: 2D grid of 2D blocks; x covers columns, y covers rows.
// The grid may be larger than the image: threads whose coordinates fall
// outside width x height return without touching memory.
//
//   devMPPtr  source image; consecutive rows are `pitch` bytes apart
//   devMPtr   destination image, densely packed (width floats per row)
//   pitch     byte distance between consecutive rows of devMPPtr
__global__ void kernel_wo_loop(float* devMPPtr, float * devMPtr, int pitch)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the grid tail: without this, surplus threads read and write
    // past the end of both buffers.
    if (idx >= width || idy >= height) {
        return;
    }

    // Address the row in bytes so the pitch is honoured exactly, whatever
    // its relationship to width * sizeof(float).
    const float* row = (const float*)((const char*)devMPPtr + (size_t)idy * pitch);

    devMPtr[idy*width+idx] = row[idx] + 2.0f;
}

// Parallel kernel that samples the image through the texture reference.
//
// Each thread fetches the texel one column to the left and one row up from
// its own position (a (-1,-1) shift), adds 1.0f, and stores the result both
// into the pitched buffer and into the densely packed buffer.
//
// Launch layout: 2D grid of 2D blocks; x covers columns, y covers rows.
// Threads outside width x height return without touching memory.
//
// Assumes the host bound texRefEx to devMPPtr with normalized coordinates;
// the shifted fetch at column/row 0 produces a negative coordinate, so the
// result there depends on the addressing mode the host selected.
//
// NOTE(review): the CUDA programming guide states that the texture cache is
// not kept coherent with global-memory writes made during the same kernel
// call. Because this kernel writes into the buffer the texture is bound to
// while neighbouring threads fetch from it, a fetch may observe either the
// old or the updated value. Write to a separate buffer if deterministic
// results are required.
//
//   devMPPtr  pitched image the texture is bound to; also written here
//   devMPtr   destination image, densely packed (width floats per row)
//   pitch     byte distance between consecutive rows of devMPPtr
__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the grid tail: surplus threads must not write past the buffers.
    if (idx >= width || idy >= height) {
        return;
    }

    // Normalized coordinates of the centre of texel (idx-1, idy-1). Adding
    // 0.5 keeps the sample inside the intended texel instead of exactly on
    // its edge, where rounding could select the neighbouring texel.
    float u = (idx - 1.0f + 0.5f) / float(width);
    float v = (idy - 1.0f + 0.5f) / float(height);

    float value = tex2D(texRefEx, u, v) + 1.0f;

    // Write back to the pitched buffer using its byte pitch; indexing it
    // with idy*width would land in the wrong place whenever the pitch is
    // larger than width * sizeof(float).
    float* row = (float*)((char*)devMPPtr + (size_t)idy * pitch);
    row[idx] = value;

    // Mirror the same value into the densely packed buffer.
    devMPtr[idy*width+idx] = value;
}

// CUDA error-checking helper.
//
// Fetches the runtime's last recorded error with cudaGetLastError(). When
// that is anything other than cudaSuccess, prints `msg` together with the
// runtime's error string to stderr and terminates the process with
// EXIT_FAILURE. Returns normally when no error is pending.
void checkCUDAError(const char *msg) {
  const cudaError_t status = cudaGetLastError();

  if (status == cudaSuccess) {
    return;
  }

  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}

// Host code.
//
// Fills a height x width host image with 1.0f, uploads it into pitched
// device memory, binds the 2D texture reference to that memory, runs the
// three demo kernels in order, then copies the densely packed result buffer
// back to the host and prints it one image row per output line.
//
// Returns 0 on success. A failed host allocation returns EXIT_FAILURE; any
// CUDA error terminates the process inside checkCUDAError().
int main()
{
    const size_t memsize  = (size_t)height * width;     // element count
    const size_t rowBytes = width * sizeof(float);      // bytes per packed row

    // Host input data and storage for the output read back from the device.
    float* data  = (float*)malloc(sizeof(float) * memsize);
    float* h_out = (float*)malloc(sizeof(float) * memsize);
    if (data == NULL || h_out == NULL) {
        fprintf(stderr, "Host allocation failed.\n");
        free(data);
        free(h_out);
        return EXIT_FAILURE;
    }

    float* devMPPtr = NULL;   // cudaMallocPitch'd buffer
    float* devMPtr  = NULL;   // cudaMalloc'd (densely packed) buffer
    size_t pitch    = 0;      // row pitch of devMPPtr in bytes

    for (size_t i = 0; i < memsize; i++) {
        data[i] = 1.0f;
    }

    // One thread per element; ceil-division so the grid covers the image
    // without launching a whole extra row/column of blocks when the image
    // size is already a multiple of BLOCKSIZE.
    dim3 threads(BLOCKSIZE, BLOCKSIZE);
    dim3 grid((width  + BLOCKSIZE - 1) / BLOCKSIZE,
              (height + BLOCKSIZE - 1) / BLOCKSIZE);

    cudaMallocPitch((void**)&devMPPtr, &pitch, rowBytes, height);
    checkCUDAError("Error after cudaMallocPitch");
    printf("The pitch is %zu \n", pitch);

    // Configure the texture reference BEFORE binding so the bind picks up
    // these settings: normalized [0,1] coordinates, wrap addressing in both
    // dimensions, and point (nearest-texel) filtering.
    texRefEx.normalized     = true;
    texRefEx.addressMode[0] = cudaAddressModeWrap;
    texRefEx.addressMode[1] = cudaAddressModeWrap;
    texRefEx.filterMode     = cudaFilterModePoint;

    // Bind through the C++ overload that takes the texture reference itself
    // rather than passing the address of the host-side object to the
    // low-level C entry point.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaBindTexture2D(NULL, texRefEx, devMPPtr, channelDesc, width, height, pitch);
    checkCUDAError("Error after cudaBindTexture2D");

    cudaMalloc((void**)&devMPtr, memsize * sizeof(float));
    checkCUDAError("Error after cudaMalloc");

    // Upload: the copied width is given in BYTES per row, the height in rows.
    cudaMemcpy2D(devMPPtr, pitch, data, rowBytes,
                 rowBytes, height, cudaMemcpyHostToDevice);
    checkCUDAError("Error after memcp2d");

    // The serial kernel loops over the entire image by itself, so a single
    // thread is sufficient; more threads would only repeat the same work.
    kernel<<<1, 1>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel");

    kernel_wo_loop<<<grid, threads>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel_wo_loop");

    kernel_w_textures<<<grid, threads>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel_w_textures");

    // Launches are asynchronous: wait for completion so that faults raised
    // while the kernels execute are reported here.
    cudaDeviceSynchronize();
    checkCUDAError("Error while executing kernels");

    // Copy the densely packed result back to the host.
    cudaMemcpy(h_out, devMPtr, memsize * sizeof(float), cudaMemcpyDeviceToHost);
    checkCUDAError("Error after memcpy here");

    // Print one image row per output line.
    cout << endl;
    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++) {
            printf("%2.2f ", h_out[r * width + c]);
        }
        cout << "back n" << endl;
    }

    // Release the texture binding and all device and host memory.
    cudaUnbindTexture(texRefEx);
    cudaFree(devMPtr);
    cudaFree(devMPPtr);
    checkCUDAError("Error during cleanup");
    free(h_out);
    free(data);

    return 0;
}