Hello , I am trying to call a Scal cublas call from kernel.
I am using an input array ( 0,1,2,3,4,…31) and a scalar = 2.0.
So , I want the result ( 0,2,4,6,8…) but instead I am getting ( 0,4,16,48…2.09715e+06
,4.45645e+06,9.43718e+06…)
#include <assert.h>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
using namespace std;
//----function to check for errors-------------------------------------------------
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert( cudaError_t code, const char * file, int line, bool abort = true )
{
if ( cudaSuccess != code )
{
fprintf( stderr, "\nGPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
if ( abort )
exit( code );
}
}
__global__ void myfunc( cublasStatus_t * returnStatus, int N , float * const devB )
{
const float alpha = 2.0f;
cublasHandle_t theCublasHandle;
cublasStatus_t theCublasStatus = cublasCreate( &theCublasHandle );
if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
{
*returnStatus = theCublasStatus;
return;
}
int i = threadIdx.x + blockIdx.x * blockDim.x;
if ( i < N)
{
// copy input data to inV ( decorelated Coil Sens ) in order to use it for our calculations
theCublasStatus = cublasSscal( theCublasHandle , N , &alpha , ( devB + i ) , 1 );
cublasDestroy( theCublasHandle );
*returnStatus = theCublasStatus;
//cudaDeviceSynchronize();
}
}
int main(){
int N = 32; // numner of elements
// allocate host memory
float * inA;
inA = (float*) malloc ( N * sizeof(float));
assert( NULL != inA );
//allocate device memory
float *devB;
gpuErrchk( cudaMalloc( (void **) &devB, N * sizeof(float) ) );
for ( int i = 0; i < N; i++ )
inA[ i ] = i;
//------------------------------------------------------------------------------------------------
/* Get handle to the CUBLAS context */
cublasStatus_t * devCublasStatus;
cublasStatus_t theCublasStatus;
gpuErrchk( cudaMalloc( (void**) &devCublasStatus, sizeof(cublasStatus_t) ) );
// define threads and blocks
int theBlocksPerGrid = 1;
int theThreadsPerBlock = 32;
gpuErrchk( cudaMemcpy( devB, inA , N * sizeof(float), cudaMemcpyHostToDevice ) );
myfunc<<< theBlocksPerGrid , theThreadsPerBlock>>>( devCublasStatus , N , devB );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
//copy back to host
gpuErrchk( cudaMemcpy( inA , devB , N * sizeof(float), cudaMemcpyDeviceToHost ) );
gpuErrchk( cudaMemcpy( &theCublasStatus , devCublasStatus , sizeof(cublasStatus_t), cudaMemcpyDeviceToHost ) );
if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr,"!!!! CUBLAS Device API call failed with code %d\n",theCublasStatus);
exit(EXIT_FAILURE);
}
cout << "\nAfter : "<<endl;
for (int i = 0; i < N; i++)
cout << inA[ i ]<<endl;
cout <<endl;
//clean host memory
free ( inA );
// clean device memory
gpuErrchk( cudaFree( devB ) );
return 0;
}
Also , I wanted to ask .
- If I use another cublas call after the first and I must take the results first , I have to call
cudaDeviceSynchronize();
after the first call,right?
- The
*returnStatus = theCublasStatus;
.Should I put it after every cublas call?
And the
cublasDestroy( theCublasHandle );
?Only at the end?
Thank you!