// Anything wrong with this? (review note: this line was bare prose and would not compile; kept as a comment)
#include<stdio.h>
#include<cuda.h>
#include<iostream>
#define height 64
#define width 64
#define BLOCKSIZE 16
using namespace std;
// This program copies host data into cudaMallocPitch'd device memory,
// allocates cudaMalloc'd linear device memory, has kernels transform the
// pitched data into the linear allocation, reads it back, and prints it.
// Texture reference bound to the pitched allocation (read by kernel_w_textures).
// NOTE(review): the texture reference API is deprecated and removed in CUDA 12;
// new code should use cudaTextureObject_t instead.
texture<float,2> texRefEx;
// Serial Kernel
__global__ void kernel(float* devMPPtr, float * devMPtr, int pitch)
{
// This is looping through a malloc pitch memory. Please, don't loop.
// r is the row
for (int r = 0; r < height; ++r) {
float* row = (float*)((char*)devMPPtr + r * pitch);
for (int c = 0; c < width; ++c) {
// Lets add 1.0f to everything! helz yeahz
devMPtr[r*width+c] = row[c]+1.0f;
}
}
}
// Parallel Kernel, reads from the malloc pitch kernel and then writes back to it
__global__ void kernel_wo_loop(float* devMPPtr, float * devMPtr, int pitch)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
//float* row = (float*)((char*)devMPPtr + idy * pitch);
// Read from the mallocPitch memory and write to malloc
// memory. Can replace pitch/sizeof(float) with width if width%16=0
devMPtr[idy*width+idx]=devMPPtr[idy*pitch/sizeof(float)+idx]+2.0f;
}
// Another Parallel Kernel that reads from the texture, and writes back to the memory it is bound to
__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
//float* row = (float*)((char*)devMPPtr + idy * pitch);
// Normalized Floating point texture coordinates (with a shift of (-1,-1)
float u=(idx-1.0f)/float(width);
float v=(idy-1.0f)/float(height);
// write back to the devMPPtr that the tex2D is originall bound to and add 1
devMPPtr[idy*width+idx]=tex2D(texRefEx,u,v)+1.0f;
// write to malloc memory for fun
devMPtr[idy*width+idx]=devMPPtr[idy*width+idx];
}
// cuda error checking wrapper:
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
//Host Code
int main()
{
size_t memsize=height*width;
//host data, and storage for output from device
float * data=(float *)malloc(sizeof(float)*memsize);
float * h_out=(float *)malloc(sizeof(float)*memsize);
// MallocPitch ptr
float* devMPPtr;
// Malloc ptr
float* devMPtr;
size_t pitch;
// Assign the memory on host in data.
for (int i = 0; i < memsize; i++){
data[i]=1.0;
}
// I forget why I did this... something from the example I copied
dim3 grid((int)(width/BLOCKSIZE+1),(int)(height/BLOCKSIZE+1)), threads(BLOCKSIZE,BLOCKSIZE);
// allocate malloc pitch
cudaMallocPitch((void**)&devMPPtr,&pitch, width * sizeof(float), height);
checkCUDAError("Error after cudaMallocPitch" );
printf("The pitch is %d \n",pitch);
// Texture Channel Description (don't understand just do)
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
// bind texture to pitch mem:
cudaBindTexture2D(0,&texRefEx,devMPPtr,&channelDesc,width,height,pitch);
// Set mutable properties:
// normalized = floating point texture coordinates that are [0,1]
texRefEx.normalized=true;
// wrap, warp ( can be set to Clamp - look up!)
texRefEx.addressMode[0]=cudaAddressModeWrap;
texRefEx.addressMode[1]=cudaAddressModeWrap;
texRefEx.filterMode= cudaFilterModePoint;
// allocate regular malloc memory
cudaMalloc((void**)&devMPtr,memsize*sizeof(float));
checkCUDAError("Error after cudaMallocPitch" );
// Read data from host to device
cudaMemcpy2D(devMPPtr,pitch,data,sizeof(float)*width,
sizeof(float)*height,height,cudaMemcpyHostToDevice);
checkCUDAError("Error after memcp2d" );
kernel<<<100, 512>>>(devMPPtr, devMPtr, pitch);
//kernel_wo_loop<<<100,512>>>(devMPPtr, devMPtr, pitch);
kernel_wo_loop<<<grid,threads>>>(devMPPtr, devMPtr, pitch);
kernel_wo_loop<<<grid,threads>>>(devMPPtr, devMPtr, pitch);
kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);
checkCUDAError("Error after kernel_w_textures" );
// Copy back data to host
cudaMemcpy(h_out, devMPtr,memsize*sizeof(float),cudaMemcpyDeviceToHost);
checkCUDAError("Error after memcpy here" );
// Print
cout << endl;
for (int i=0; i<width; i++){
for (int j=0; j<height; j++){
printf("%2.2f ",h_out[j+i*width]);
}
cout << "back n" << endl;
}
return 0;
}