Cublas, sum and dot. Newbie question.

gatts · April 19, 2012, 4:24pm

I am trying to compute, using cuBLAS, the sum of the elements of a vector and a vector dot product. My problem is that I can’t get the results back due to a problem with the memory allocation of the double/pointer which will contain the result at the end of the computation.

int  vector_size = 3;

double* h_M;

  double* h_H;

  double* d_M = 0;

  double* d_H = 0;

cublasStatus_t status;

  cublasHandle_t handle;

status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) {

        fprintf (stderr, "!!!! CUBLAS initialization error\n");

        return EXIT_FAILURE;

  }

/* Allocate host memory for the matrices */

  h_M = (double*)malloc( vector_size * sizeof(h_M[0]));

  if (h_M == 0) {

    fprintf (stderr, "!!!! host memory allocation error (M)\n");

    return EXIT_FAILURE;

  }

h_H = (double*)malloc( vector_size * sizeof(h_H[0]));

  if (h_H == 0) {

    fprintf (stderr, "!!!! host memory allocation error (H)\n");

    return EXIT_FAILURE;

  }

h_M[0]=1.0;

  h_M[1]=0.0;

  h_M[2]=0.0;

h_H[0]=0.0;

  h_H[1]=1.0;

  h_H[2]=0.0;

/* Allocate device memory for the matrices */

    if (cudaMalloc((void**)&d_M, vector_size * sizeof(d_M[0])) != cudaSuccess) {

        fprintf (stderr, "!!!! device memory allocation error (allocate A)\n");

        return EXIT_FAILURE;

    }

    if (cudaMalloc((void**)&d_H, vector_size * sizeof(d_H[0])) != cudaSuccess) {

        fprintf (stderr, "!!!! device memory allocation error (allocate B)\n");

        return EXIT_FAILURE;

    }

/* Initialize the device matrices with the host matrices */

  status = cublasSetVector(vector_size, sizeof(h_M[0]), h_M, 1, d_M, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write M)\n");

    return EXIT_FAILURE;

  }

  status = cublasSetVector(vector_size, sizeof(h_H[0]), h_H, 1, d_H, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write H)\n");

    return EXIT_FAILURE;

  }

/* Performs operation using cublas */

status = cublasDasum(handle, vector_size, d_M,1,result);

  if (status != CUBLAS_STATUS_SUCCESS) {

      fprintf (stderr, "!!!! kernel execution error.\n");

      return EXIT_FAILURE;

  }

//   /* Read the result back */

   double* sum = 0;

   status = cublasGetVector(1, sizeof(void**), result, 1, sum, 1);// I only require the first element of result, that's why I chose 1 as vector length, is that ok?

   if (status != CUBLAS_STATUS_SUCCESS) {

     fprintf (stderr, "!!!! device access error (read C)\n");

     return EXIT_FAILURE;

   } 

cout << *sum << endl;

/* Performs operation using cublas */

   double* result = 0;

   status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);// is it ok to not have allocated result into the memory, the manual says that it could be located in the host.

   if (status != CUBLAS_STATUS_SUCCESS) {

     fprintf (stderr, "!!!! kernel execution error.\n");

     return EXIT_FAILURE;

   }

cout << *result << endl;

/* Memory clean up */

   if (cudaFree(d_H) != cudaSuccess) {

     fprintf (stderr, "!!!! memory free error (H)\n");

     return EXIT_FAILURE;

   }

   if (cudaFree(d_M) != cudaSuccess) {

     fprintf (stderr, "!!!! memory free error (M)\n");

     return EXIT_FAILURE;

   }

I can’t find the mistake, and I get a “Segmentation fault” when I run the executable. I’m really puzzled about what’s wrong; your help would be very much appreciated.

many thanks!

pasoleatis · April 19, 2012, 8:10pm

I have a question. After the line double* sum = 0; shouldn’t you allocate sum of size 1 with malloc? Do you know at which line it crashes?

gatts · April 20, 2012, 12:47am

I don’t know if I have to allocate it. The manual for this function says that the “role of dot” could be in the host or the device. So my question is: does cuBLAS fill it automatically on the host, or do I have to send it to the device and then copy it back to the host?

(I’ll try meanwhile)

It fails at blas operations.

gatts · April 20, 2012, 1:08am

Ok, I tried this

/*
 * Performs the dot product with cuBLAS while keeping the scalar result in
 * device memory, then copies it back to the host and prints it.
 *
 * Relies on `handle`, `status`, `vector_size`, `d_M` and `d_H` being set up
 * by the surrounding code, and on being inside a function returning int.
 */
double* result = 0;    /* device-side scalar that receives the dot product */
double  dot    = 0.0;  /* host-side copy of the result                     */

/*
 * The comparison with cudaSuccess belongs outside the cudaMalloc call.
 * Written as `sizeof(double) != cudaSuccess` inside the argument list it
 * evaluates to 1, so a single byte is requested instead of eight.
 */
if (cudaMalloc((void**)&result, sizeof(double)) != cudaSuccess) {
    fprintf(stderr, "!!!! device memory allocation error (allocate result)\n");
    return EXIT_FAILURE;
}

/*
 * By default cuBLAS treats scalar result pointers as host pointers. Because
 * `result` is a device address, the handle must be switched to device
 * pointer mode first; otherwise the library writes to it from the host.
 */
status = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! unable to select device pointer mode\n");
    return EXIT_FAILURE;
}

status = cublasDdot(handle, vector_size, d_M, 1, d_H, 1, result);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
}

/*
 * cudaMemcpy takes the destination first and the source second. The
 * blocking copy also waits for the dot product to finish.
 */
if (cudaMemcpy(&dot, result, sizeof(double), cudaMemcpyDeviceToHost) != cudaSuccess) {
    fprintf(stderr, "!!!! device access error (read result)\n");
    return EXIT_FAILURE;
}

/* Print the value itself, not the address it is stored at. */
cout << dot << endl;

/* Put the handle back into its default mode for any later cuBLAS calls. */
status = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! unable to restore host pointer mode\n");
    return EXIT_FAILURE;
}

if (cudaFree(result) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (result)\n");
    return EXIT_FAILURE;
}

And again, if I comment out the cublasDdot operation it works and I obtain 0 printed on screen. If I do the cublasDdot operation it says the operation failed, and I’m puzzled.

thanks again for your advice.

gatts · April 20, 2012, 10:12pm

I got the solution; it was really an easy problem once I figured it out. The declaration of double* result was incorrect. I didn’t allocate enough memory. It should have been double* sum = new double[1]; and not only double* sum.

pasoleatis · April 21, 2012, 9:52am

Congrats. You discovered the same thing I said previously.

KenP · November 29, 2012, 10:52am

Hi guys, I’m new to CUBLAS… trying to use cublasDdot; not sure why I’m getting the answer t = 0?
It would be great if you could help take a look. Cheers, Ken

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "cuda.h"

int main(void)
{
int i,n;
double *x, *y, *t;
double *d_x, *d_y, *d_t;
n = 100;

x = (double*)malloc(n *sizeof(double));
y = (double*)malloc(n *sizeof(double));
t = (double*)malloc(sizeof(double));

for( i = 0; i < n; i++)
{
	x[i] = i;
	y[i] = i + 3;
}
cudaMalloc((void **)&d_x, n*sizeof(double));
cudaMalloc((void **)&d_y, n*sizeof(double));
cudaMalloc((void **)&d_t, sizeof(double));
cudaMemcpy(d_x, x, n*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, n*sizeof(double), cudaMemcpyHostToDevice);

cublasHandle_t handle;
cublasCreate(&handle);
cublasDdot(handle, n, d_x, 1, d_y, 1, d_t);
cudaMemcpy(t, d_t, sizeof(double), cudaMemcpyDeviceToHost);
printf("GPU = %lf\n",  t);
system("PAUSE");

free(x);
free(y);
free(t);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_t);
cublasDestroy(handle);
return 0;

}

Topic		Replies	Views
Doubts about CUBLAS CUDA Programming and Performance	3	3460	February 18, 2009
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4860	April 7, 2021
cublasAlloc fails even though there is enough memory CUDA Programming and Performance	4	10892	December 15, 2009
cublas<t>dot_v2 Legacy PGI Compilers	5	4767	April 4, 2012
Issue when calling cublasDdot from within kernel GPU-Accelerated Libraries	7	932	March 21, 2018
submatrix computation CUDA Programming and Performance	3	5165	November 23, 2009
cublasDasum with device result CUDA Programming and Performance	1	2062	April 13, 2012
Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaMemsetAsync CUDA Programming and Performance	7	7783	January 11, 2020
Cublas matrix dot product ? CUDA Programming and Performance	5	15511	January 7, 2011
Help Matrix Multiplication using cuBLAS CUDA Programming and Performance	10	23879	July 24, 2010

Cublas, sum and dot. Newbie question.

Related topics