Textures: anything wrong with this?

Anything wrong with this?




// Image dimensions in elements: `height` rows by `width` columns.
// NOTE(review): these are lower-case object-like macros, so every later
// identifier spelled `height` or `width` in this translation unit is replaced
// textually by 64 -- avoid declaring variables or parameters with these names.
#define height 64

#define width 64

// Edge length of the square thread block used for the 2D launches in main().
#define BLOCKSIZE 16

// Makes cout/endl usable unqualified in main().
using namespace std;

// This code copies host data into device memory allocated with cudaMallocPitch,

// allocates a second device buffer with cudaMalloc,

// then copies the pitched memory into the cudaMalloc'd memory, reads it back, and prints it.

// File-scope 2D texture reference with float elements. main() configures its
// filterMode, and the comments there describe binding it to the pitched
// allocation so that kernel_w_textures can sample it.
// NOTE(review): texture<> references are the legacy texture API; presumably
// this targets a toolkit that still ships it -- confirm the CUDA version.

texture<float,2> texRefEx;

// Serial kernel: copies the height x width image from pitched memory into
// linear memory, adding 1.0f to every element.
//
//   devMPPtr  source image in pitched device memory
//   devMPtr   destination, tightly packed: element (r, c) is at r * width + c
//   pitch     row stride of devMPPtr in BYTES
//
// The whole image is walked by a single thread; this is the deliberately slow
// baseline the parallel kernels are compared against. Any launch configuration
// is accepted: every thread other than thread (0,0,0) of block (0,0,0) returns
// immediately, so a large launch no longer has each thread redo the full copy
// and store to the same addresses.
__global__ void kernel(float* devMPPtr, float * devMPtr, int pitch)
{
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0 ||
        threadIdx.x != 0 || threadIdx.y != 0 || threadIdx.z != 0) {
        return;
    }

    // r is the row
    for (int r = 0; r < height; ++r) {
        // Rows of a pitched allocation are `pitch` bytes apart, so the row
        // start has to be computed with byte arithmetic.
        const float* row = (const float*)((const char*)devMPPtr + (size_t)r * pitch);

        for (int c = 0; c < width; ++c) {
            // Add 1.0f to every element.
            devMPtr[r * width + c] = row[c] + 1.0f;
        }
    }
}




// Parallel kernel: one thread per element. Each thread reads its element from
// the pitched allocation and writes it to the same (row, column) position of
// the linear allocation.
//
//   devMPPtr  source image in pitched device memory
//   devMPtr   destination, tightly packed: element (y, x) is at y * width + x
//   pitch     row stride of devMPPtr in BYTES
//
// Expects a 2D launch whose x/y extent covers width x height. Threads that fall
// outside the image (the grid is rounded up to whole blocks) do nothing.
__global__ void kernel_wo_loop(float* devMPPtr, float * devMPtr, int pitch)
{
    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the tail of the grid: without this, surplus threads would read and
    // write past the end of both buffers.
    if (idx >= width || idy >= height) {
        return;
    }

    // Read from the cudaMallocPitch memory and write to the cudaMalloc memory.
    // The row start is always derived from pitch: the runtime may pad rows, so
    // pitch can be larger than width * sizeof(float) even when width is a
    // multiple of 16.
    const float* row = (const float*)((const char*)devMPPtr + (size_t)idy * pitch);
    devMPtr[idy * width + idx] = row[idx];
}



// Parallel kernel that samples texRefEx instead of reading global memory
// directly. Each thread fetches the texel one column to the left and one row
// up from its own position, adds 1.0f, and stores the result both in the
// pitched allocation and in the linear allocation.
//
//   devMPPtr  pitched device memory (the comments in main() describe this as
//             the memory texRefEx is bound to)
//   devMPtr   linear output: element (y, x) is at y * width + x
//   pitch     row stride of devMPPtr in BYTES
//
// Expects a 2D launch whose x/y extent covers width x height, and texRefEx set
// up for normalized coordinates so that the -1 shift at column/row 0 is handled
// by the texture addressing mode.
//
// NOTE(review): storing into memory that is being sampled through a texture in
// the same launch is not coherent -- a neighbouring thread may fetch either the
// old or the new value. Confirm this write-back is really wanted.
__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the tail of the grid so surplus threads do not write out of bounds.
    if (idx >= width || idy >= height) {
        return;
    }

    // Normalized floating point texture coordinates with a shift of (-1,-1).
    // The extra +0.5f addresses the centre of the source texel, so the point
    // lookup cannot land on the neighbouring texel through rounding.
    const float u = (idx - 1.0f + 0.5f) / float(width);
    const float v = (idy - 1.0f + 0.5f) / float(height);

    const float shifted = tex2D(texRefEx, u, v) + 1.0f;

    // Write back to the pitched memory and add 1.
    float* row = (float*)((char*)devMPPtr + (size_t)idy * pitch);
    row[idx] = shifted;

    // Write to the cudaMalloc memory as well.
    // NOTE(review): assumed to be the same shifted+1 value -- confirm intent.
    devMPtr[idy * width + idx] = shifted;
}



// CUDA error-checking wrapper.
//
// Reads (and clears) the most recent CUDA runtime error with
// cudaGetLastError(). When an error is pending it prints
//     Cuda error: <msg>: <runtime description>.
// to stderr and terminates the process with EXIT_FAILURE, since later CUDA
// calls would only operate on invalid state.
//
//   msg  caller-supplied context naming the step that was just attempted
void checkCUDAError(const char *msg) {
  const cudaError_t err = cudaGetLastError();

  if (cudaSuccess != err) {
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}




// Host code.
//
// Builds a height x width float image on the host, uploads it into a pitched
// device allocation, binds that allocation to texRefEx, runs the three demo
// kernels in sequence, then copies the linear output buffer back to the host
// and prints it one image row per output line.
// Returns 0 on success and 1 if a host allocation fails; CUDA failures are
// reported through checkCUDAError().
int main()
{
    const size_t memsize = (size_t)height * width;

    // Host data, and storage for output from device.
    float *data = (float *)malloc(sizeof(float) * memsize);
    float *h_out = (float *)malloc(sizeof(float) * memsize);
    if (data == NULL || h_out == NULL) {
        fprintf(stderr, "Host allocation failed.\n");
        free(data);
        free(h_out);
        return 1;
    }

    float *devMPPtr = NULL;  // cudaMallocPitch allocation
    float *devMPtr = NULL;   // cudaMalloc allocation
    size_t pitch = 0;        // row stride of devMPPtr in bytes

    // Assign the memory on host in data.
    // NOTE(review): the fill pattern (element index as value) is assumed.
    for (size_t i = 0; i < memsize; i++) {
        data[i] = (float)i;
    }

    // Round the grid up to whole blocks (ceiling division) so the launch covers
    // the image without adding an extra block when the size divides evenly.
    dim3 grid((width + BLOCKSIZE - 1) / BLOCKSIZE, (height + BLOCKSIZE - 1) / BLOCKSIZE), threads(BLOCKSIZE, BLOCKSIZE);

    // Allocate the pitched memory.
    cudaMallocPitch((void**)&devMPPtr, &pitch, width * sizeof(float), height);
    checkCUDAError("Error after cudaMallocPitch" );

    // pitch is a size_t, so it is printed with %zu.
    printf("The pitch is %zu \n", pitch);

    // Texture channel description: one 32-bit float per texel.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    // Set the mutable texture properties before binding so that the bind picks
    // them up.
    // normalized = floating point texture coordinates in [0,1]
    texRefEx.normalized = true;
    // Wrap addressing in both dimensions (could be set to clamp instead).
    texRefEx.addressMode[0] = cudaAddressModeWrap;
    texRefEx.addressMode[1] = cudaAddressModeWrap;
    // Point filtering: fetch the nearest texel, no interpolation.
    texRefEx.filterMode = cudaFilterModePoint;

    // Bind the texture to the pitched memory.
    cudaBindTexture2D(NULL, texRefEx, devMPPtr, channelDesc, width, height, pitch);
    checkCUDAError("Error after cudaBindTexture2D" );

    // Allocate the regular (linear) device memory.
    cudaMalloc((void**)&devMPtr, memsize * sizeof(float));
    checkCUDAError("Error after cudaMalloc" );

    // Copy the host image into the pitched allocation; the host rows are
    // tightly packed, the device rows are `pitch` bytes apart.
    cudaMemcpy2D(devMPPtr, pitch, data, width * sizeof(float),
                 width * sizeof(float), height, cudaMemcpyHostToDevice);
    checkCUDAError("Error after memcp2d" );

    // The serial kernel walks the whole image in one thread, so a single
    // thread is all it needs.
    kernel<<<1, 1>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel" );

    kernel_wo_loop<<<grid,threads>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel_wo_loop" );

    kernel_wo_loop<<<grid,threads>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel_wo_loop" );

    kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, (int)pitch);
    checkCUDAError("Error after kernel_w_textures" );

    // Copy back data to host. cudaMemcpy waits for the preceding kernels, so
    // execution errors from them surface in the check below.
    cudaMemcpy(h_out, devMPtr, memsize * sizeof(float), cudaMemcpyDeviceToHost);
    checkCUDAError("Error after memcpy here" );

    // Print: i walks the rows, j the columns of the row-major output.
    cout << endl;
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            printf("%2.2f ", h_out[j + i * width]);
        }
        cout << "back n" << endl;
    }

    // Release the texture binding, device buffers and host buffers.
    cudaUnbindTexture(texRefEx);
    cudaFree(devMPPtr);
    cudaFree(devMPtr);
    checkCUDAError("Error after cleanup" );
    free(data);
    free(h_out);

    return 0;