I am trying to use cuBLAS to compute the sum of the elements of a vector and the dot product of two vectors. My problem is that I can't get the results back, due to a problem with the memory allocation of the double/pointer that should contain the result at the end of the computation.
int vector_size = 3;
double* h_M;
double* h_H;
double* d_M = 0;
double* d_H = 0;
cublasStatus_t status;
cublasHandle_t handle;
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
/* Allocate host memory for the matrices */
h_M = (double*)malloc( vector_size * sizeof(h_M[0]));
if (h_M == 0) {
fprintf (stderr, "!!!! host memory allocation error (M)\n");
return EXIT_FAILURE;
}
h_H = (double*)malloc( vector_size * sizeof(h_H[0]));
if (h_H == 0) {
fprintf (stderr, "!!!! host memory allocation error (H)\n");
return EXIT_FAILURE;
}
h_M[0]=1.0;
h_M[1]=0.0;
h_M[2]=0.0;
h_H[0]=0.0;
h_H[1]=1.0;
h_H[2]=0.0;
/* Allocate device memory for the matrices */
if (cudaMalloc((void**)&d_M, vector_size * sizeof(d_M[0])) != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&d_H, vector_size * sizeof(d_H[0])) != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (allocate B)\n");
return EXIT_FAILURE;
}
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(vector_size, sizeof(h_M[0]), h_M, 1, d_M, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write M)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(vector_size, sizeof(h_H[0]), h_H, 1, d_H, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write H)\n");
return EXIT_FAILURE;
}
/* Performs operation using cublas */
status = cublasDasum(handle, vector_size, d_M,1,result);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
// /* Read the result back */
double* sum = 0;
status = cublasGetVector(1, sizeof(void**), result, 1, sum, 1);// I only require the first element of result, that's why I chose 1 as vector length, is that ok?
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
cout << *sum << endl;
/* Performs operation using cublas */
double* result = 0;
status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);// is it ok to not have allocated result into the memory, the manual says that it could be located in the host.
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
cout << *result << endl;
/* Memory clean up */
if (cudaFree(d_H) != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (H)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_M) != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (M)\n");
return EXIT_FAILURE;
}
I can't find the mistake, and I get a "Segmentation fault" when I run the executable. I'm really puzzled about what's wrong; your help would be very much appreciated.
Many thanks!