cublas - cublasSgemm - problem

I’m using cublas for the first time (I’m trying to multiply two matrices A and B).

Not sure what I’m missing or doing wrong, but things are not working. Here is a simplification of the code (not working):

bool DeviceSupportsCUDA() {	

	int numberDevices;

	if (cudaGetDeviceCount(&numberDevices) == cudaSuccess) {				

		for(int device = 0; device < numberDevices; device++) {

			cudaDeviceProp deviceProperties;

			if(cudaGetDeviceProperties(&deviceProperties, device) == cudaSuccess && deviceProperties.major >= 1) {

				if (cudaSetDevice(device) == cudaSuccess) return true;

			}

		}

	}

	return false;

}

int main(int argc, char* argv[]) {

	if(!DeviceSupportsCUDA()) {

		cout << "Device does not support cuda" << endl;

		return 0;

	}

	// Matrix dimensions

	int A_rows = 1600;

	int A_cols = 3200;

	int B_rows = A_cols;

	int B_cols = 4000;

	int C_rows = A_rows;

	int C_cols = B_cols;

	// Create the host matrices

	float * A = new float[A_rows * A_cols];

	float * B = new float[B_rows * B_cols];

	float * C = new float[C_rows * C_cols];

	// Fill the matrix A

	for(int y = 0; y < A_rows; y++) {

		for(int x = 0; x < A_cols; x++) {

			A[y * A_cols + x] = (x - y);

		}

	}

	// Fill the matrix B

	for(int y = 0; y < B_rows; y++) {

		for(int x = 0; x < B_cols; x++) {

			B[y * B_cols + x] = (y - x);

		}

	}

	// Create the device matrices

	float * d_A;

	float * d_B;

	float * d_C;

	int size = (A_rows * A_cols) * sizeof(float);

	cudaMalloc((void **) &d_A, size);

	cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);

	size = (B_rows * B_cols) * sizeof(float);

	cudaMalloc((void **) &d_B, size);

	cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

	cudaMalloc((void **) &d_C, (C_rows * C_cols) * sizeof(float));	

	// Multiply the matrices

	cublasInit();

	cublasSgemm('N', 'N', A_rows, B_cols, A_cols, 1.0f, d_A, A_rows, B, B_rows, 0.0f, d_C, C_rows);

	//get the result

	cudaMemcpy(C, d_C, (C_rows * C_cols) * sizeof(float), cudaMemcpyDeviceToHost);

	// Check the result

	for(int y = 0; y < C_rows; y++) {

		for(int x = 0; x < C_cols; x++) {

			float deviceValue = C[y * C_cols + x];

			float sum = 0.0f;

			for(int m = 0; m < A_cols; m++) sum += A[y * A_cols + m] * B[m * B_cols + x];

			

			if (sum != deviceValue) {

				cout << y << ", " << x << " -> " << deviceValue << " != " << sum << " -> difference = " << (sum - deviceValue) << endl;

				cout << ":(" << endl;

				return 0;

			}

		}

	}

	cout << ":)" << endl;

	// free the device matrices

	cudaFree(d_A);

	cudaFree(d_B);

	cudaFree(d_C);

	// delete host matrices

	delete [] A;

	delete [] B;

	delete [] C;

	cublasShutdown();

	return 0;

}

CUBLAS uses column-major storage, but your host code uses row-major storage.

Please use column-major storage in your host code as well when you want to compare the results.

Thank you