Hi Folks,
I am a newbie to CUDA and am stuck while playing around with arrays.
All I am trying to do is copy a one-dimensional array from host to device, multiply each element of the array by a scalar value, and return the results.
Eg:
Src : 5, 10, 20, 30, 40
Scalar Val : 10
Exp Result: 50, 100, 200, 300, 400
It works fine when I compile in device emulation mode, but when that flag is turned off it just doesn't: it always returns zeros.
Before I go ahead, I would like to post the code below.
Kernel Code
/*
 * Element-wise scale kernel: resElems[i] = srcElems[i] * multiplier.
 *
 * Each thread handles exactly one element, selected by its global thread
 * index (blockIdx.x * blockDim.x + threadIdx.x). For a single-block launch
 * this reduces to threadIdx.x.
 *
 * The kernel takes no element count and therefore has no bounds guard: the
 * launch must supply exactly one thread per element, i.e.
 * gridDim.x * blockDim.x must equal the number of elements in both buffers.
 *
 * srcElems   - input values, dereferenced in device code (read only here)
 * resElems   - output buffer, dereferenced in device code (written here)
 * multiplier - integer scale factor, converted to double by the multiply
 *
 * NOTE(review): double-precision device code presumably requires a GPU and
 * an nvcc -arch target with native double support (compute capability 1.3 or
 * later); on lesser targets doubles may be demoted to float, which would not
 * match the 8-byte elements the host copies in -- confirm against the
 * toolkit and hardware in use.
 */
__global__ void multiply(double* srcElems, double* resElems, int multiplier){
    int elemIndex = blockIdx.x * blockDim.x + threadIdx.x;
    resElems[elemIndex] = srcElems[elemIndex] * multiplier;
}
C host
/*
 * Terminates the program if the CUDA runtime reports a pending error.
 *
 * Fetches the last runtime error with cudaGetLastError(). When it is anything
 * other than cudaSuccess, prints `msg` followed by the runtime's description
 * of the error to stderr and exits with a failure status so that scripts and
 * shells can detect the failure.
 *
 * msg - caller-supplied label naming the step that was just attempted.
 */
void checkCudaError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "\nCUDA error> %s %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
/*
 * Multiplies every element of a host array by an integer on the GPU.
 *
 * Prints the input, copies it to the device, runs the `multiply` kernel,
 * copies the result back, prints it, and releases the device buffers.
 *
 * srcArr     - host array of `size` doubles (not modified)
 * multiplier - integer scale factor applied to each element
 * size       - number of elements in srcArr
 *
 * Returns a newly calloc'ed host array of `size` doubles holding the scaled
 * values; the caller owns it and must free() it. Returns NULL when srcArr is
 * NULL, when size is not positive, or when the host allocation fails.
 * Any CUDA runtime error is handled by checkCudaError(), which terminates
 * the program.
 */
double* multiplyArray(const double* srcArr, int multiplier, int size){
    /* Elements handled per kernel launch; each launch uses one block with
       this many threads at most. */
    const size_t threadsPerLaunch = 256;
    double* hostResElements;
    double* deviceSrcElements = NULL;
    double* deviceResElements = NULL;
    size_t elemCount;
    size_t memSize;
    size_t offset;
    int index;

    /* Nothing to compute: avoid a zero-thread launch, which the runtime
       rejects as an invalid configuration. */
    if (srcArr == NULL || size <= 0) {
        return NULL;
    }

    /* Do the byte-count arithmetic in size_t so it cannot wrap an int. */
    elemCount = (size_t)size;
    memSize = elemCount * sizeof(double);

    hostResElements = (double*)calloc(elemCount, sizeof(double));
    if (hostResElements == NULL) {
        return NULL;
    }

    for (index = 0; index < size; index++) {
        printf("\nSrc [%d] : %f", index, srcArr[index]);
    }

    cudaMalloc((void**) &deviceSrcElements, memSize);
    checkCudaError("Memory Allocated to device src elements");
    cudaMalloc((void**) &deviceResElements, memSize);
    checkCudaError("Memory Allocated to device target elements");

    cudaMemcpy(deviceSrcElements, srcArr, memSize, cudaMemcpyHostToDevice);
    checkCudaError("Memory copied from host src to device src");

    /* The kernel has no bounds guard, so every launch gets exactly one
       thread per element. Offsetting the base pointers lets a single-block
       launch cover each chunk, so arrays longer than one block's worth of
       threads are still processed correctly. */
    for (offset = 0; offset < elemCount; offset += threadsPerLaunch) {
        size_t remaining = elemCount - offset;
        unsigned int chunk = (unsigned int)(remaining < threadsPerLaunch
                                                ? remaining
                                                : threadsPerLaunch);
        multiply<<<1, chunk>>>(deviceSrcElements + offset,
                               deviceResElements + offset,
                               multiplier);
        checkCudaError("Kernel code invoked");
    }

    /* This blocking copy also waits for the kernels above; an error raised
       while they executed is reported by the check that follows it. */
    cudaMemcpy(hostResElements, deviceResElements, memSize, cudaMemcpyDeviceToHost);
    checkCudaError("Memory copied from device result to host result");

    for (index = 0; index < size; index++) {
        printf("\nRes [%d] : %f", index, hostResElements[index]);
    }

    cudaFree(deviceSrcElements);
    checkCudaError("Cleaning up device src elements");
    cudaFree(deviceResElements);
    checkCudaError("Cleaning up device res elements");

    return hostResElements;
}
/*
 * Demo driver: builds the array {0, 5, 10, 15, 20}, scales it by 5 through
 * multiplyArray(), prints the scaled values, and releases both host buffers.
 *
 * Returns 0 on success, 1 if a host allocation fails or multiplyArray()
 * returns NULL.
 */
int main(){
    const int size = 5;
    const int multiplier = 5;
    int index;
    double* result;
    double* srcArr = (double*) calloc((size_t)size, sizeof(double));

    if (srcArr == NULL) {
        fprintf(stderr, "Failed to allocate the source array\n");
        return 1;
    }

    for (index = 0; index < size; index++) {
        srcArr[index] = size * index;
    }

    result = multiplyArray(srcArr, multiplier, size);
    if (result == NULL) {
        fprintf(stderr, "multiplyArray returned no result\n");
        free(srcArr);
        return 1;
    }

    /* A leading newline per entry keeps the entries on separate lines. */
    for (index = 0; index < size; index++) {
        printf("\nResult in C [%d] : %f", index, result[index]);
    }
    printf("\n");

    /* multiplyArray hands ownership of the result buffer to the caller. */
    free(result);
    free(srcArr);
    return 0;
}
Please let me know if I am doing anything wrong.
Thanks,
Katta