Hi all,
I’m a reasonably experienced programmer, but I’m new to both CUDA and C, so unsurprisingly I’m having a few issues.
Scenario:
- Create an 11 by 11 2D array of type int in C
- Initialise the array to mostly 0s, with a few 1s
- Using CUDA, add 1 to each element of the 2D array
- Print the array to see whether it has changed
I have written the code, and it compiles and runs fine; however, it doesn’t behave as expected. The array is the same before and after calling the kernel: nothing has 1 added to it, and I’m not sure the array is even being copied back and forth between host and device.
From reading the programming guide, it seems I should use cudaMallocPitch and cudaMemcpy2D to allocate the memory on the device and to copy it to and from device memory, which is what I’ve done. Hopefully the code below is commented well enough to make it easy to follow what I’m trying to do. As you can see, I create a pointer ptrHostResult that points to the start of the 2D array automaton; my thinking is that when I call cudaMemcpy2D to copy the result back to the host, the data copied from device memory will overwrite the array in host memory. However, it isn’t working.
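In case it helps anyone spot where my understanding goes wrong, here is the minimal standalone sketch I had in my head of how the pitched allocation, the two copies and the kernel indexing are supposed to fit together. The addOneSketch name and the W/H sizes are just for illustration; my reading of the reference manual is that the pitch and width arguments are all in bytes, and that you step down the rows of pitched device memory through a char pointer:

#include <stdio.h>

#define W 11
#define H 11

// Sketch only: add 1 to every element of a pitched 2D int array.
__global__ void addOneSketch(int* data, size_t pitch, int width, int height){
    int x = threadIdx.x;
    int y = threadIdx.y;
    if(x < width && y < height){
        int* row = (int*)((char*)data + y * pitch);  // pitch is in bytes
        row[x] = row[x] + 1;
    }
}

int main(){
    int host[H][W] = {{0}};                          // host rows are W * sizeof(int) bytes apart
    int* dev;
    size_t pitch;
    cudaMallocPitch((void**)&dev, &pitch, W * sizeof(int), H);
    // Host -> device: dst pitch is the device pitch, src pitch is the host row size in bytes
    cudaMemcpy2D(dev, pitch, host, W * sizeof(int),
                 W * sizeof(int), H, cudaMemcpyHostToDevice);
    addOneSketch<<<1, dim3(W, H)>>>(dev, pitch, W, H);
    // Device -> host: the pitches swap roles; the copied width is still W * sizeof(int) bytes
    cudaMemcpy2D(host, W * sizeof(int), dev, pitch,
                 W * sizeof(int), H, cudaMemcpyDeviceToHost);
    printf("host[0][0] = %d\n", host[0][0]);         // expect 1 if the round trip worked
    cudaFree(dev);
    return 0;
}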
I have even tried creating another 2D array of all 1s and copying it into device memory and back to host memory, but it doesn’t overwrite the original array.
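The shape of that extra test was roughly the snippet below (just a sketch of the intent, reusing ptrDevA, pitchA and automaton from the full listing further down; my actual attempt may well have repeated whatever mistake is in the main code):

int ones[GRIDHEIGHT][GRIDWIDTH];                    // every element set to 1 beforehand
// Copy the 1s out to the pitched device buffer...
cudaMemcpy2D(ptrDevA, pitchA, ones, GRIDWIDTH * sizeof(int),
             GRIDWIDTH * sizeof(int), GRIDHEIGHT, cudaMemcpyHostToDevice);
// ...and straight back over the original automaton, which I expected to end up as all 1s
cudaMemcpy2D(automaton, GRIDWIDTH * sizeof(int), ptrDevA, pitchA,
             GRIDWIDTH * sizeof(int), GRIDHEIGHT, cudaMemcpyDeviceToHost);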
Any help is greatly appreciated :)
#define GRIDWIDTH 11
#define GRIDHEIGHT 11
#include <stdio.h>
#include <stdlib.h>
/*
*
* CUDA Kernel
*
*/
__global__ void incrementCell(int* ptrDevA, int* ptrDevB, int pitchA, int pitchB){
    int x = threadIdx.x;
    int y = threadIdx.y;
    int temp = ptrDevA[y * pitchA + x];
    ptrDevB[y * pitchB + x] = temp + 1;
}
/*
*
* Host Code
*
*/
int automaton[GRIDHEIGHT][GRIDWIDTH];
void printAutomaton(void);
void initAutomaton(void);
int main(){
    /*
     * CUDA Stuff
     */
    // 1 block with an 11 * 11 grid of threads
    int numBlocks = 1;
    dim3 threadsPerBlock(GRIDWIDTH, GRIDHEIGHT);
    // Allocate and initialise memory on host
    initAutomaton();
    const int* ptrHost;
    ptrHost = &automaton[0][0];
    // Print initial automaton
    printAutomaton();
    // Create pointer to store result
    int* ptrHostResult;
    //ptrHostResult = (int *)malloc((GRIDWIDTH * sizeof(int))*GRIDHEIGHT);
    ptrHostResult = &automaton[0][0];
    // Allocate memory on device, A holds the original, B will hold the result
    int* ptrDevA;
    size_t pitchA;
    cudaMallocPitch((void**)&ptrDevA, &pitchA, GRIDWIDTH * sizeof(int), GRIDHEIGHT);
    int* ptrDevB;
    size_t pitchB;
    cudaMallocPitch((void**)&ptrDevB, &pitchB, GRIDWIDTH * sizeof(int), GRIDHEIGHT);
    // Copy host memory to device memory
    cudaMemcpy2D(ptrDevA, pitchA, ptrHost, sizeof(int), GRIDWIDTH * sizeof(int),
                 GRIDHEIGHT, cudaMemcpyHostToDevice);
    // Execute the kernel
    incrementCell<<<numBlocks, threadsPerBlock>>>(ptrDevA, ptrDevB, pitchA, pitchB);
    // Copy result from device memory to host
    cudaMemcpy2D(ptrHostResult, sizeof(int), ptrDevB, pitchB, pitchB * GRIDWIDTH, GRIDHEIGHT,
                 cudaMemcpyDeviceToHost);
    // Print returned automaton
    printAutomaton();
    exit(0);
}
void initAutomaton(void){
    // Set all cells to 0
    for(int y = 0; y < GRIDHEIGHT; y++){
        /* Loop through each cell */
        for(int x = 0; x < GRIDWIDTH; x++){
            automaton[y][x] = 0;
        }
    }
    // Set line in middle to 1
    automaton[5][3] = 1;
    automaton[5][4] = 1;
    automaton[5][5] = 1;
    automaton[5][6] = 1;
    automaton[5][7] = 1;
}
void printAutomaton(void){
    /* Loop through each row */
    for(int y = 0; y < GRIDHEIGHT; y++){
        /* Loop through each cell */
        for(int x = 0; x < GRIDWIDTH; x++){
            printf("%d ", automaton[y][x]);
        }
        printf("\n");
    }
    printf("\n\n");
}
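One thing I haven’t done yet is check any of the return codes, so I can’t actually tell which call (if any) is failing. My plan is to wrap the calls with a small helper along these lines (just a sketch; checkCuda is my own made-up name, and I believe cudaGetErrorString and cudaGetLastError are the standard ways to get at the errors):

#include <stdio.h>
#include <stdlib.h>

// Sketch of the error checking I intend to add; checkCuda is a hypothetical helper of my own.
static void checkCuda(cudaError_t err, const char* what){
    if(err != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Intended usage, e.g.:
//   checkCuda(cudaMallocPitch((void**)&ptrDevA, &pitchA, GRIDWIDTH * sizeof(int), GRIDHEIGHT),
//             "cudaMallocPitch A");
//   incrementCell<<<numBlocks, threadsPerBlock>>>(ptrDevA, ptrDevB, pitchA, pitchB);
//   checkCuda(cudaGetLastError(), "incrementCell launch");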