Hello!
I am new to programming on cuda and i’ll deeply appreciated it if someone can help me look at my code and why it is printing all zeroes when I copied it back to host. Below is my code:
#include
#include
#include
#include
#include
using namespace std;
// Device code
global void CopyData(float* d_array,
float* d_destinationArray,
size_t pitch,
int columnCount,
int rowCount)
{
for (int row = 0; row < rowCount; row++)
{
float* rowData = (float*)(((char*)d_array) + (row * pitch));
for (int column = 0; column < columnCount; column++)
{
rowData[column] = 11.0;
d_destinationArray[(row*columnCount) + column] = rowData[column];
}
}
}
int main(int argc, char** argv)
{
int columnCount = 4;
int rowCount = 19998;
// Host code dealing with memory and the device
float *d_array; // the device array which memory will be allocated to
float *d_destinationArray; // the device array
// allocate memory on the host
float* h_array = new float[columnCount*rowCount];
// the pitch value assigned by cudaMallocPitch which ensures correct data structure alignment
size_t pitch;
// allocated the device memory for source array
cudaMallocPitch(&d_array, &pitch, columnCount * sizeof(float), rowCount);
// allocate the device memory for destination array
cudaMalloc(&d_destinationArray, columnCountrowCountsizeof(float));
// call the kernel which copies values from d_array to d_destinationArray
CopyData<<<100,512>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);
// copy the data back to the host memory
cudaMemcpy(h_array,
d_destinationArray,
columnCountrowCountsizeof(float),
cudaMemcpyDeviceToHost);
// print out the test values, all the 111111.0
for ( i = 0 ; i < rowCount ; i++)
{
for ( j = 0 ; j < columnCount ; j++)
{
cout << “h_array[” << (icolumnCount) + j << “]=” << h_array[(icolumnCount) + j] << endl;
}
}
return 0;
}
i am getting this as results:
h_array[79977]=0
h_array[79978]=0
h_array[79979]=0
h_array[79980]=0
h_array[79981]=0
h_array[79982]=0
h_array[79983]=0
h_array[79984]=0
h_array[79985]=0
h_array[79986]=0
h_array[79987]=0
h_array[79988]=0
h_array[79989]=0
h_array[79990]=0
h_array[79991]=0
I am not sure why as I thought I am giving each element in my array the value of 11.0 is there a problem when I copied from device back to host or is my program just not running on the GPU?
Thank you for any suggestion!!