Hello everyone,
I have made an initial port of a C code to the GPU (K40, CUDA 9.0), which works as expected. The kernel I created receives as input a vector of doubles of size n (u_d) and a 2D matrix of doubles of size nxn (sigma_d). Part of the code executed in the kernel is a dot product between a row of sigma_d and the vector u_d. I create a 1D grid containing >= n threads in total and each thread performs one such dot product.
Now I want to use cublasDdot within each thread to perform the dot product, with the hope that it will perform faster. Although I have managed to modify the code, compile it and execute it, I am stuck as I have issues: either I cannot copy back the results from the device to the host, or the results are incorrect.
I have reduced the code to the smallest working example that exhibits the same behavior as the original application. For testing purposes, I store and copy back to the host the results of the dot products. In the real application only the updated vector u_d needs to be copied back. This is why I left the second cudaMemcpy. The code consists of the following two files:
kernel.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <math.h>
#include <getopt.h>
#include <stdlib.h>
#include <sys/time.h>
/******************************************************************************/
__constant__ long n_d;
/******************************************************************************/
__global__ void do_timestep(double *u_d, double *sigma_d, double *res_d)
{
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n_d) {
#if 0
long j;
res_d[i] = 0.0;
for (j = 0; j < n_d; j++) {
res_d[i] += sigma_d[i * n_d + j] * u_d[j];
}
#else
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
status = cublasDdot(cnpHandle, n_d, &sigma_d[i * n_d], 1, u_d, 1, &res_d[i]);
cublasDestroy(cnpHandle);
#endif
}
}
/******************************************************************************/
extern "C" void invoke_do_timestep(double *u_d, double *sigma_d, double *res_d, long n)
{
cudaError_t err;
do_timestep<<<(n - 1) / 32 + 1, 32>>>(u_d, sigma_d, res_d);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(err));
exit(1);
}
}
/******************************************************************************/
extern "C" void copy_to_constant_memory(long n)
{
cudaError_t err;
err = cudaMemcpyToSymbol(n_d, &n, sizeof(long));
if (err != cudaSuccess) {
printf("Could not copy \"n\" to constant memory.\n");
exit(1);
}
}
main.c
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define DEF_N (100L)
void copy_to_constant_memory(long n);
void invoke_do_timestep(double *u_d, double *sigma_d, double *res_d, long n);
/******************************************************************************/
int main(int argc, char *argv[])
{
long n, i, j;
double *u_h, *u_d, *sigma_h, *sigma_d, *res_h, *res_d, *res_from_d;
cudaError_t err;
n = DEF_N;
u_h = (double *)calloc(n, sizeof(double));
if (u_h == NULL) {
printf("Could not allocate memory for \"u_h\".\n");
exit(1);
}
sigma_h = (double *)calloc(n * n, sizeof(double));
if (sigma_h == NULL) {
printf("Could not allocate memory for \"sigma_h\".\n");
exit(1);
}
res_h = (double *)calloc(n, sizeof(double));
if (res_h == NULL) {
printf("Could not allocate memory for \"res_h\".\n");
exit(1);
}
res_from_d = (double *)calloc(n, sizeof(double));
if (res_from_d == NULL) {
printf("Could not allocate memory for \"res_from_d\".\n");
exit(1);
}
err = cudaMalloc((void **)&u_d, n * sizeof(double));
if (err != cudaSuccess) {
printf("Could not allocate memory for \"u_d\".\n");
exit(1);
}
err = cudaMalloc((void **)&sigma_d, n * n * sizeof(double));
if (err != cudaSuccess) {
printf("Could not allocate memory for \"sigma_d\".\n");
exit(1);
}
err = cudaMalloc((void **)&res_d, n * sizeof(double));
if (err != cudaSuccess) {
printf("Could not allocate memory for \"res_d\".\n");
exit(1);
}
/*
* Initialize arrays with random numbers.
*/
for (i = 0; i < n; i++ ) {
u_h[i] = drand48();
#if defined(PRINT)
printf("%ld\t%f\n", i, u_h[i]);
#endif
}
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
sigma_h[i * n + j] = drand48();
}
}
#if defined(PRINT)
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
printf("%4.1f", sigma[i * n + j]);
}
printf("\n");
}
#endif
err = cudaMemcpy(u_d, u_h, n * sizeof(double), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("Could not copy \"u_h\" to \"u_d\".\n");
exit(1);
}
err = cudaMemcpy(sigma_d, sigma_h, n * n * sizeof(double), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("Could not copy \"sigma_h\" to \"sigma_d\".\n");
exit(1);
}
copy_to_constant_memory(n);
/*
* Perform operation on CPU.
*/
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
res_h[i] += sigma_h[i * n + j] * u_h[j];
}
}
/*
* Perform operation on GPU using cuBLAS.
*/
invoke_do_timestep(u_d, sigma_d, res_d, n);
err = cudaMemcpy(res_from_d, res_d, n * sizeof(double), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("Could not copy \"res_d\" to \"res_from_d\".\n");
exit(1);
}
/*
* This is required in the original application, but fails when
* cublasDestroy() is called within the kernel.
*/
err = cudaMemcpy(u_h, u_d, n * sizeof(double), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("Could not copy \"u_d\" to \"u_h\".\n");
exit(1);
}
for (i = 0; i < n; i++) {
if (fabs(res_h[i] - res_from_d[i]) > 10e-10) {
printf("Diff at position %ld (%f, %f)\n", i, res_h[i], res_from_d[i]);
}
}
return 0;
}
The compilation and linking is performed with the following commands:
nvcc -dc -O3 --ptxas-options=-v --compiler-options -O3,-Wall --gpu-architecture compute_35 --gpu-code sm_35 --maxrregcount 32 -o kernel.o kernel.cu
nvcc -dc -O3 --ptxas-options=-v --compiler-options -O3,-Wall --gpu-architecture compute_35 --gpu-code sm_35 --maxrregcount 32 -o main.o -c main.c
nvcc -O3 --ptxas-options=-v --compiler-options -O3,-Wall --gpu-architecture compute_35 --gpu-code sm_35 --maxrregcount 32 -o main kernel.o main.o -lcublas_device
Notice that I perform the same dot products on the CPU and save the results in the vector res_h for comparison.
-
In the file kernel.cu, if the dot product is performed manually (what is now in the #if 0 part), the results copied back top the host are the same as in res_h.
-
If the code is run as is (using the cuBLAS function call), I get an error that res_d cannot be copied back to the host to the vector res_from_d.
-
If I comment out the cublasDestroy(cnpHandle); instruction the code runs, the copy from the device to the host succeeds, but the results are wrong.
Could someone shed some light and help me figure out where the error(s) is?
Thank you!