Hello ,
I am having some data with number of pixels for example 200000.
I am running the code (cublas calls from device) with only 1 thread and 1 block and I am getting the right results!!!
How can this happen??
If I try to increase the number of threads/block in order to correspond to the number of pixels I want to process , the code runs very slowly and it may hang.
Or If I use for example 16 threads and 16 blocks ( remember 200000 pixels ) ,the code runs mch slower!
I am providing a simple code ( not the above I am working on ) to make my point.
Here, I am just calling cublas to make a copy from the input array devA to the output devB.
And it works with just 1 thread! ,where number of elements are 32!
#include <assert.h>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
using namespace std;
__global__ void myfunc( cublasStatus_t * returnStatus, int N ,float * const devA ,float * const devV, float * const devB )
{
cublasHandle_t theCublasHandle;
cublasStatus_t theCublasStatus = cublasCreate( &theCublasHandle );
if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
{
*returnStatus = theCublasStatus;
return;
}
int i = threadIdx.x + blockIdx.x * blockDim.x;
if ( i < N)
{
theCublasStatus = cublasScopy( theCublasHandle , N , devA + i , 1 , devV + i , 1 );
theCublasStatus = cublasScopy( theCublasHandle , N , devV + i , 1 , devB + i , 1 );
}
cublasDestroy( theCublasHandle );
*returnStatus = theCublasStatus;
}
int main(){
int N = 32;
// allocate host memory
float * inA;
inA = (float*) malloc ( N * sizeof(float));
assert( NULL != inA );
//allocate device memory
float *devB , * devA , *devV;
CudaErrChk( cudaMalloc( (void **) &devB, N * sizeof(float) ) );
CudaErrChk( cudaMalloc( (void **) &devA, N * sizeof(float) ) );
CudaErrChk( cudaMalloc( (void **) &devV, N * sizeof(float) ) );
for ( int i = 0; i < N; i++ )
inA[ i ] = i;
/* Get handle to the CUBLAS context */
cublasStatus_t * devCublasStatus;
cublasStatus_t theCublasStatus;
CudaErrChk( cudaMalloc( (void**) &devCublasStatus, sizeof(cublasStatus_t) ) );
// define threads and blocks
int Blocks = 1;
int Threads = 1;
CudaErrChk( cudaMemcpy( devA, inA , N * sizeof(float), cudaMemcpyHostToDevice ) );
myfunc<<< Blocks , Threads>>>( devCublasStatus , N , devA , devV, devB );
//copy back to host
CudaErrChk( cudaMemcpy( inA , devB , N * sizeof(float), cudaMemcpyDeviceToHost ) );
CudaErrChk( cudaMemcpy( &theCublasStatus , devCublasStatus , sizeof(cublasStatus_t), cudaMemcpyDeviceToHost ) );
if (theCublasStatus != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr,"!!!! CUBLAS Device API call failed with code %d\n",theCublasStatus);
exit(EXIT_FAILURE);
}
cout << "\nAfter : "<<endl;
for (int i = 0; i < N; i++)
cout << inA[ i ]<<endl;
cout <<endl;
//clean host memory
free ( inA );
// clean device memory
CudaErrChk( cudaFree( devB ) );
CudaErrChk( cudaFree( devA ) );
CudaErrChk( cudaFree( devV ) );
return 0;
}
I also found this https://devtalk.nvidia.com/default/topic/378247/?comment=2699621 but I am not sure I understand it.