cublas call from kernel ( not getting right results )

Hello , I am trying to call a Scal cublas call from kernel.

I am using an input array ( 0,1,2,3,4,…31) and a scalar = 2.0.
So , I want the result ( 0,2,4,6,8…) but instead I am getting ( 0,4,16,48…2.09715e+06
,4.45645e+06,9.43718e+06…)

#include <assert.h>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>

using namespace std;

//----function to check for errors-------------------------------------------------
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert( cudaError_t code, const char * file, int line, bool abort = true )
{
	if ( cudaSuccess != code )
	{
		fprintf( stderr, "\nGPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
		if ( abort )
			exit( code );
	}
}



__global__ void myfunc( cublasStatus_t * returnStatus, int N , float * const devB )
{

    const float alpha = 2.0f;

    cublasHandle_t theCublasHandle;
    cublasStatus_t theCublasStatus = cublasCreate( &theCublasHandle );

    if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
    {
	*returnStatus = theCublasStatus;
	return;
    }

    int i = threadIdx.x + blockIdx.x * blockDim.x;
	
    if ( i < N)
    {

	// copy input data to inV ( decorelated Coil Sens ) in order to use it for our calculations

	theCublasStatus = cublasSscal( theCublasHandle , N , &alpha , ( devB + i ) , 1 );
	cublasDestroy( theCublasHandle );
	*returnStatus = theCublasStatus;
	//cudaDeviceSynchronize();


	}
	

}


int main(){

    int N = 32; // numner of elements

    // allocate host memory
    float * inA;
    inA = (float*) malloc ( N * sizeof(float));
    assert( NULL != inA );

    //allocate device memory
    float *devB;
    gpuErrchk( cudaMalloc( (void **) &devB, N * sizeof(float) ) );

    for ( int i = 0; i < N; i++ )
	inA[ i ] = i;


    //------------------------------------------------------------------------------------------------
    /* Get handle to the CUBLAS context */
    cublasStatus_t * devCublasStatus;
    cublasStatus_t theCublasStatus;
    gpuErrchk( cudaMalloc( (void**) &devCublasStatus, sizeof(cublasStatus_t) ) );


    // define threads and blocks
    int theBlocksPerGrid = 1;
    int theThreadsPerBlock = 32;

    gpuErrchk( cudaMemcpy( devB, inA , N * sizeof(float), cudaMemcpyHostToDevice ) );

    myfunc<<< theBlocksPerGrid , theThreadsPerBlock>>>( devCublasStatus , N ,  devB );

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    //copy back to host
    gpuErrchk( cudaMemcpy( inA , devB , N * sizeof(float), cudaMemcpyDeviceToHost ) );
    gpuErrchk( cudaMemcpy( &theCublasStatus , devCublasStatus , sizeof(cublasStatus_t), cudaMemcpyDeviceToHost ) );

    if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
    {
	    fprintf(stderr,"!!!! CUBLAS Device API call failed with code %d\n",theCublasStatus);
	    exit(EXIT_FAILURE);
    }

    cout << "\nAfter : "<<endl;
    for (int i = 0; i < N; i++)
	 cout << inA[ i ]<<endl;
    cout <<endl;

    //clean host memory
    free ( inA );

    // clean device memory
    gpuErrchk( cudaFree( devB ) );

    return 0;
}

Also , I wanted to ask .

  1. If I use another cublas call after the first and I must take the results first , I have to call
cudaDeviceSynchronize();

after the first call,right?

  1. The
*returnStatus = theCublasStatus;

.Should I put it after every cublas call?
And the

cublasDestroy( theCublasHandle );

?Only at the end?

Thank you!