I’m using cuBLAS for the first time (I’m trying to multiply two matrices, A and B).
I’m not sure what I’m missing or doing wrong, but it is not working. Here is a simplified version of the code (not working):
bool DeviceSupportsCUDA() {
int numberDevices;
if (cudaGetDeviceCount(&numberDevices) == cudaSuccess) {
for(int device = 0; device < numberDevices; device++) {
cudaDeviceProp deviceProperties;
if(cudaGetDeviceProperties(&deviceProperties, device) == cudaSuccess && deviceProperties.major >= 1) {
if (cudaSetDevice(device) == cudaSuccess) return true;
}
}
}
return false;
}
int main(int argc, char* argv[]) {
if(!DeviceSupportsCUDA()) {
cout << "Device does not support cuda" << endl;
return 0;
}
// Matrix dimensions
int A_rows = 1600;
int A_cols = 3200;
int B_rows = A_cols;
int B_cols = 4000;
int C_rows = A_rows;
int C_cols = B_cols;
// Create the host matrices
float * A = new float[A_rows * A_cols];
float * B = new float[B_rows * B_cols];
float * C = new float[C_rows * C_cols];
// Fill the matrix A
for(int y = 0; y < A_rows; y++) {
for(int x = 0; x < A_cols; x++) {
A[y * A_cols + x] = (x - y);
}
}
// Fill the matrix B
for(int y = 0; y < B_rows; y++) {
for(int x = 0; x < B_cols; x++) {
B[y * B_cols + x] = (y - x);
}
}
// Create the device matrices
float * d_A;
float * d_B;
float * d_C;
int size = (A_rows * A_cols) * sizeof(float);
cudaMalloc((void **) &d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (B_rows * B_cols) * sizeof(float);
cudaMalloc((void **) &d_B, size);
cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, (C_rows * C_cols) * sizeof(float));
// Multiply the matrices
cublasInit();
cublasSgemm('N', 'N', A_rows, B_cols, A_cols, 1.0f, d_A, A_rows, B, B_rows, 0.0f, d_C, C_rows);
//get the result
cudaMemcpy(C, d_C, (C_rows * C_cols) * sizeof(float), cudaMemcpyDeviceToHost);
// Check the result
for(int y = 0; y < C_rows; y++) {
for(int x = 0; x < C_cols; x++) {
float deviceValue = C[y * C_cols + x];
float sum = 0.0f;
for(int m = 0; m < A_cols; m++) sum += A[y * A_cols + m] * B[m * B_cols + x];
if (sum != deviceValue) {
cout << y << ", " << x << " -> " << deviceValue << " != " << sum << " -> difference = " << (sum - deviceValue) << endl;
cout << ":(" << endl;
return 0;
}
}
}
cout << ":)" << endl;
// free the device matrices
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// delete host matrices
delete [] A;
delete [] B;
delete [] C;
cublasShutdown();
return 0;
}