Previously I had interpreted your comments to mean that you could not reproduce the issue in a purely CUDA C++ setting.
But no matter.
The logical error you have is this:
- you are attempting to use more than one GPU
- you are correctly calling cudaSetDevice and then cublasCreate, during initCuda, to properly define a handle for each GPU
- during processing (i.e. actual use of these handles), nowhere do you switch the device back to the selected one (whichGPU). Therefore all of your subsequent allocations, copies, kernel launches, and cuBLAS calls go to a single GPU, but for some of them you are using handles created on other GPUs. This is broken. (A minimal sketch of the required pattern follows this list.)
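The underlying rule is that a cuBLAS handle is bound to the device that was current when cublasCreate () was called, so that same device must be made current again before the handle (or any memory allocated on that GPU) is used. Here is a minimal sketch of that pattern, distilled from the full fix below; the function and variable names are placeholders and error checking is omitted:

#include <cublas_v2.h>
#include <cuda_runtime.h>

void multiGpuPattern (int numGPUs)   // placeholder name, not taken from the code below
{
    cublasHandle_t *handles = new cublasHandle_t[numGPUs];
    // Creation: make each device current, then create its handle (as initCuda already does)
    for (int dev = 0; dev < numGPUs; ++dev)
    {
        cudaSetDevice (dev);
        cublasCreate (handles + dev);
    }
    // Use: switch back to the handle's device before any allocation, copy,
    // kernel launch, or cuBLAS call intended for that GPU
    for (int dev = 0; dev < numGPUs; ++dev)
    {
        cudaSetDevice (dev);         // the step that was missing in the processing code
        // ... cudaMalloc / cudaMemcpy / cublasDgemm (handles[dev], ...) calls here target this GPU ...
    }
    // Cleanup: again with the matching device current
    for (int dev = 0; dev < numGPUs; ++dev)
    {
        cudaSetDevice (dev);
        cublasDestroy (handles[dev]);
    }
    delete [] handles;
}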
The fix is to properly switch devices during your loop/processing:
$ cat t1632.cu
#include <assert.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>   // for gettimeofday
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
using namespace std;
#define swap(a, b) {float *hold = a; a = b; b = hold;}
#define swapD(a, b) {double *hold = a; a = b; b = hold;}
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
#define kBlockSize 32
/**
* Compare the contents of two T[]. For each element of {@code first} that differs from the
* corresponding element of {@code second} by {@code epsilon} or more, increment {@code diffCount}
*
* @param first T[] holding values to test, same length as {@code second}
* @param second T[] holding values to test, same length as {@code first}
* @param size Length of both {@code first} and {@code second}
* @param epsilon Test value. Matching elements of {@code first} and {@code second} must
* differ by less than this to be considered unchanged
* @param diffCount Incremented for each value in {@code first} that differs from {@code second} by {@code epsilon} or more
*/
template <int BLOCK_SIZE, class T> __global__ void
compare (T *first, T *second, size_t size, T epsilon, size_t *diffCount)
{
size_t blockX = blockIdx.x * BLOCK_SIZE;
size_t threadX = threadIdx.x;
size_t pos = blockX + threadX;
if (pos < size)
{
T value = first[pos] - second[pos];
if (value < 0)
value = -value;
if (value > epsilon)
++(*diffCount);
}
}
/**
* Copy the contents of {@code source} into {@code target}
*
* @param source float[] / double[] to read from. Must not be null, and length {@code theSize}
* @param target float[] / double[] to write to. Must not be null, and length {@code theSize}
* @param theSize Size of both arrays
*/
template <int BLOCK_SIZE, class T> __global__ void
copy (const T *source, T *target, int theSize)
{
// Block index
int blockX = blockIdx.x * BLOCK_SIZE;
// Thread index
int threadX = threadIdx.x;
// Source location
int pos = blockX + threadX;
if (pos < theSize)
target[pos] = source[pos];
}
/**
* Write the transpose of {@code source} into {@code target}, on the Host
*
* @param source float / double[] to read from. Must not be null, and length {@code numRows * numCols}
* @param target float / double[] to write to. Must not be null, and length {@code numRows * numCols}
* @param numRows Number of rows in {@code source}, will be number of cols in {@code target}
* @param numCols Number of cols in {@code source}, will be number of rows in {@code target}
*/
template <class T, class U> void transposeRCHost (const T source[], U target[], int numRows, int numCols)
{
int i, readPos = 0;
for (i = 0; i < numRows; ++i)
{
int j, writePos = i;
for (j = 0; j < numCols; ++j)
{
target[writePos] = (U) (source[readPos]);
++readPos;
writePos += numRows;
}
}
}
/**
* Choose which GPU to use, exiting on any error
*/
void cSetDevice (int whichGPU, int line)
{
cudaError_t error = cudaSetDevice (whichGPU);
if (error != cudaSuccess)
{
printf ("cudaSetDevice (%d) returned error %s (code %d), line (%d)\n", whichGPU, cudaGetErrorString (error), error, line);
exit (EXIT_FAILURE);
}
// printf ("cudaSetDevice (%d) succeeded\n", whichGPU);
}
/**
* Create a cuBLAS handle on the chosen GPU, exiting on any error
*/
void cuBlasSetup (cublasHandle_t *handle, int whichGPU, int line)
{
cSetDevice (whichGPU, line);
cublasStatus_t stat = cublasCreate (handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("CUBLAS initialization failed, cublasCreate () returned error %s (code %d), line (%d)\n", _cudaGetErrorEnum (stat), stat, line);
exit (EXIT_FAILURE);
}
}
/**
* Matrix multiplication and addition on the device: {@code result = (mulMatrix * fxnMatrix) + addMatrix}
* {@code mulMatrix} is {@code rowsMul} x {@code colsMul} and {@code fxnMatrix} is {@code rowsFxn} x {@code colsFxn}
* {@code result} and {@code addMatrix} have {@code rowsMul} rows and {@code colsFxn} columns
*/
void cuBlasMul (cublasHandle_t handle, double *result, double *mulMatrix, double *fxnMatrix,
double *addMatrix, int colsMul, int rowsMul, int colsFxn, int rowsFxn, int line)
{
cublasStatus_t stat;
const double alpha = 1.0f;
const double beta = 1.0f;
int arraySize = colsFxn * rowsFxn;
// Replace contents of result with addMatrix, so can use the beta add, rather than a separate operation
copy<1024, double><<< (arraySize + 1023) / 1024, 1024 >>>(addMatrix, result, arraySize);
stat = cublasDgemm (handle, CUBLAS_OP_N, CUBLAS_OP_N, rowsMul, colsFxn, rowsFxn, &alpha,
mulMatrix, rowsMul, fxnMatrix, rowsFxn, &beta, result, rowsMul);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("cublasDgemm () returned error %s (code %d), line (%d)\n", _cudaGetErrorEnum (stat), stat, line);
exit (EXIT_FAILURE);
}
}
/**
* Allocate cuda memory, exiting on any error
*/
void cMalloc (void **target, size_t theSize, const char *name, int line)
{
cudaError_t error = cudaMalloc (target, theSize);
if (error != cudaSuccess)
{
printf ("cudaMalloc %s returned error %s (code %d), line (%d)\n", name, cudaGetErrorString (error), error, line);
exit (EXIT_FAILURE);
}
}
/**
* Copy to or from cuda memory, exiting on any error
*/
void cCopy (void *target, const void *source, size_t theSize, enum cudaMemcpyKind kind, const char *name, int line)
{
cudaError_t error = cudaMemcpy (target, source, theSize, kind);
if (error != cudaSuccess)
{
printf ("cudaMemcpy %s returned error %s (code %d), line (%d)\n", name, cudaGetErrorString (error), error, line);
exit (EXIT_FAILURE);
}
}
cublasHandle_t *handles;
/**
* Create a cuBLAS handle for each GPU, with that GPU set as the current device
*/
void initCuda (int numGPUs)
{
handles = (cublasHandle_t *) malloc (numGPUs * sizeof(cublasHandle_t));
for (int whichGPU = 0; whichGPU < numGPUs; ++whichGPU)
{
cuBlasSetup (handles + whichGPU, whichGPU, __LINE__); // Create a cublasHandle_t for each GPU
}
}
bool convergeMatrixCuBLASD (double *fxnMatrixH, double *mulMatrixH, double *addMatrixH, double *results,
int numRows, int numCols, int maxIter, int whichGPU, double epsilon)
{
cublasHandle_t handle = handles[whichGPU];
cSetDevice (whichGPU, __LINE__); // ADDED LINE
size_t arraySize = numRows * numCols;
size_t memSizeFxn = arraySize * sizeof(double);
size_t memSizeMul = numRows * numRows * sizeof(double);
double *mulMatrix, *fxnMatrix, *addMatrix, *resultHold;
size_t changeCountH;
size_t *changeCount;
bool converged = false;
struct timeval startTV;
gettimeofday (&startTV, NULL);
// Allocate device memory
cMalloc ((void **) &mulMatrix, memSizeMul, "mulMatrix", __LINE__);
cMalloc ((void **) &fxnMatrix, memSizeFxn, "fxnMatrix", __LINE__);
cMalloc ((void **) &addMatrix, memSizeFxn, "addMatrix", __LINE__);
cMalloc ((void **) &resultHold, memSizeFxn, "resultHold", __LINE__);
cMalloc ((void **) &changeCount, sizeof(size_t), "changeCount", __LINE__);
// copy host memory to device
cCopy (mulMatrix, mulMatrixH, memSizeMul, cudaMemcpyHostToDevice, "(mulMatrix, mulMatrixH)", __LINE__);
cCopy (fxnMatrix, fxnMatrixH, memSizeFxn, cudaMemcpyHostToDevice, "(fxnMatrix, fxnMatrixH)", __LINE__);
cCopy (addMatrix, addMatrixH, memSizeFxn, cudaMemcpyHostToDevice, "(addMatrix, addMatrixH)", __LINE__);
int i, numBlocks = (arraySize + 1023) / 1024;
for (i = 0; i < maxIter; ++i)
{
cuBlasMul (handle, resultHold, mulMatrix, fxnMatrix, addMatrix, numRows, numRows, numCols, numRows, __LINE__);
cudaDeviceSynchronize ();
swapD(resultHold, fxnMatrix); // Want fxnMatrix to always hold the final results when we exit
changeCountH = 0;
cCopy (changeCount, &changeCountH, sizeof(size_t), cudaMemcpyHostToDevice, "(changeCount, changeCountH)", __LINE__);
compare<1024, double><<< numBlocks, 1024 >>>(fxnMatrix, resultHold, arraySize, epsilon, changeCount);
cudaDeviceSynchronize ();
cCopy (&changeCountH, changeCount, sizeof(size_t), cudaMemcpyDeviceToHost, "(changeCountH, changeCount)", __LINE__);
if (changeCountH == 0)
{
converged = true;
break;
}
}
struct timeval endTV;
gettimeofday (&endTV, NULL);
unsigned long theStart = (1000000 * startTV.tv_sec) + startTV.tv_usec;
unsigned long done = (1000000 * endTV.tv_sec) + endTV.tv_usec;
printf ("cuBlasMul (double) used %lu microseconds for %d iterations\n", done - theStart, i);
cCopy (results, fxnMatrix, memSizeFxn, cudaMemcpyDeviceToHost, "(results, fxnMatrix)", __LINE__);
cudaFree (mulMatrix);
cudaFree (fxnMatrix);
cudaFree (addMatrix);
cudaFree (resultHold);
cudaFree (changeCount);
return converged;
}
void testMulD (int numRows, int numCols, int maxIter, int whichGPU, int whichRun)
{
double epsilon = (double) 1e-12;
size_t arraySize = numRows * numCols;
size_t memSizeFxn = arraySize * sizeof(double);
size_t memSizeMul = numRows * numRows * sizeof(double);
double *mulMatrix, *fxnMatrix, *addMatrix;
mulMatrix = (double *) malloc (memSizeMul);
addMatrix = (double *) malloc (memSizeFxn);
fxnMatrix = (double *) malloc (memSizeFxn);
for (int row = 0; row < numRows; ++row)
{
double rowF = row;
int pos = row * numCols; // Row offset
for (int col = 0; col < numCols; ++col)
{
double value = rowF + col;
addMatrix[pos + col] = fxnMatrix[pos + col] = value;
}
pos = row * numRows; // Row offset
for (int col = 0; col < numRows; ++col)
{
double value = rowF + col;
mulMatrix[pos + col] = value;
}
}
double *mulMatrixT, *fxnMatrixT, *addMatrixT, *resultsBLAS;
mulMatrixT = (double *) malloc (memSizeMul);
addMatrixT = (double *) malloc (memSizeFxn);
fxnMatrixT = (double *) malloc (memSizeFxn);
resultsBLAS = (double *) malloc (memSizeFxn);
transposeRCHost<double, double> (fxnMatrix, fxnMatrixT, numRows, numCols);
transposeRCHost<double, double> (addMatrix, addMatrixT, numRows, numCols);
transposeRCHost<double, double> (mulMatrix, mulMatrixT, numRows, numRows);
convergeMatrixCuBLASD (fxnMatrixT, mulMatrixT, addMatrixT, resultsBLAS, numRows, numCols, maxIter, whichGPU, epsilon);
free ((void *) addMatrix);
free ((void *) addMatrixT);
free ((void *) fxnMatrix);
free ((void *) fxnMatrixT);
free ((void *) mulMatrix);
free ((void *) mulMatrixT);
free ((void *) resultsBLAS);
}
int main (int argc, char **argv)
{
int numRows = 1758;
int numCols = 296;
int maxIter = 1;
int numGPUs = 4;
int numRuns = 1;
initCuda (numGPUs);
for (int run = 0; run < numRuns; ++run)
{
for (int whichGPU = 0; whichGPU < numGPUs; ++whichGPU)
{
testMulD (numRows, numCols, maxIter, whichGPU, run);
}
}
}
$ nvcc -g -G -I/usr/local/cuda/samples/common/inc t1632.cu -o t1632 -lcublas
$ cuda-memcheck ./t1632
========= CUDA-MEMCHECK
cuBlasMul (double) used 115091 microseconds for 1 iterations
cuBlasMul (double) used 121288 microseconds for 1 iterations
cuBlasMul (double) used 113102 microseconds for 1 iterations
cuBlasMul (double) used 113514 microseconds for 1 iterations
========= ERROR SUMMARY: 0 errors
$
In the example above, I marked the line I added to address this issue with the comment:
// ADDED LINE
I’m not guaranteeing that your code is otherwise correct or defect-free. I’m just pointing out what is, in my opinion, a design defect, and indicating how the code may be modified to address it.