Hello.
I’m practicing with cuBLAS by doing a simple matrix multiplication.
It works correctly on the default device.
Next, I wanted to use two GPUs (the GTX 295 has two GPUs).
Before trying that, I tried to use device 1, which is not the default device.
I just added “cudaSetDevice(1);” at the beginning of the code.
After adding that line, the program started behaving strangely.
Usually, an “unknown error” occurs in the cublasSgemm function.
Occasionally, a cuBLAS initialization error occurs instead.
The problem doesn’t occur when I use my own matrix multiplication code or when I use device 0.
Below is the code I used.
Any suggestions for fixing this problem would be very helpful.
Thank you in advance.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Matrix dimensions used for the device allocations, the launch grid and the
// SGEMM call. NOTE(review): M is also passed as the inner (k) dimension and as
// every leading dimension in the SGEMM call, so the code appears to assume
// M == N -- confirm before changing either value.
#define M 4096
#define N 4096
// Edge length of the square thread block used to launch initMatrix; the grid
// is computed as M / BLOCK_SIZE by N / BLOCK_SIZE.
#define BLOCK_SIZE 16
// Fills the two SGEMM input matrices with a deterministic pattern.
//
// Both buffers are addressed with a stride of M between consecutive values of
// the slow index, i.e. element (row, col) lives at row + M * col. The values
// written are:
//     A(row, col) = row + 1
//     B(row, col) = row - col
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
// M contiguous "row" positions and the y dimension covers the N "col"
// positions. Any block shape works because out-of-range threads exit early;
// the launch must cover at least M threads in x and N threads in y.
// No shared memory is used.
//
// Parameters:
//   A, B - device buffers holding at least M * N floats each.
__global__ void initMatrix(float *A, float *B)
{
    // threadIdx.x drives the contiguous index so that neighbouring threads of
    // a warp write neighbouring addresses (coalesced stores).
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against grids that overshoot the matrix.
    if (row >= M || col >= N)
        return;

    const int t = M * col + row;
    A[t] = (float)(row + 1);
    B[t] = (float)(row - col);
}
// Runs 100 cuBLAS SGEMM calls (C = A * B) on GPU 1 and copies the final
// result back to the host.
//
// Every CUDA runtime and cuBLAS call is checked so that the first failing
// call is the one that gets reported; an unchecked early failure otherwise
// surfaces later as an unrelated-looking error. All failure paths go through
// a single cleanup section so device memory, host memory and the cuBLAS
// handle are always released.
//
// Returns 0 on success and 1 on any failure.
int main()
{
    // All locals are declared before the first `goto cleanup` so that no jump
    // crosses an initialization.
    const int deviceId = 1;  // the non-default GPU under test
    const int iterations = 100;
    const size_t bytes = (size_t)M * N * sizeof(float);
    const float a = 1.0f;  // SGEMM alpha
    const float b = 0.0f;  // SGEMM beta

    cudaError_t err;
    cublasStatus_t cublasStatus;
    cublasHandle_t cublasHandle;
    bool haveHandle = false;
    int deviceCount = 0;
    int exitCode = 1;

    float* A = NULL;
    float* B = NULL;
    float* C = NULL;
    float* h_C = NULL;

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(M / BLOCK_SIZE, N / BLOCK_SIZE);

    // Make sure the requested device actually exists before selecting it.
    err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess)
    {
        printf("CUDA device query error: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    if (deviceId >= deviceCount)
    {
        printf("CUDA device %d not available (%d device(s) found)\n", deviceId, deviceCount);
        goto cleanup;
    }

    // Select the device first: the cuBLAS handle created below and all
    // allocations are made on whichever device is current at that point.
    err = cudaSetDevice(deviceId);
    if (err != cudaSuccess)
    {
        printf("CUDA set device error: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    cublasStatus = cublasCreate(&cublasHandle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        printf("cuBLAS create error, stat: %d\n", (int)cublasStatus);
        goto cleanup;
    }
    haveHandle = true;

    // NOTE: A is used by SGEMM as an M x M operand (k = M) but is sized
    // M * N like the others; this is only consistent while M == N.
    err = cudaMalloc(&A, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc(&B, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc(&C, bytes);
    if (err != cudaSuccess)
    {
        printf("CUDA malloc error: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    h_C = (float*)malloc(bytes);
    if (h_C == NULL)
    {
        printf("Host malloc error: could not allocate result buffer\n");
        goto cleanup;
    }

    initMatrix<<<dimGrid, dimBlock>>>(A, B);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults during execution are reported by the synchronize that follows.
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("CUDA INITMATRIX error: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    for (int i = 0; i < iterations; i++)
    {
        cublasStatus = cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, M, &a, A, M, B, M, &b, C, M);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            printf("stat: %d\n", (int)cublasStatus);
            goto cleanup;
        }
        printf("%d: success\n", i);

        // SGEMM runs asynchronously; synchronizing after every call pins any
        // execution error to the iteration that caused it.
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
        {
            printf("CUDA synchronize error: %s\n", cudaGetErrorString(err));
            goto cleanup;
        }
    }

    // Copy exactly as many bytes as were allocated for C and h_C.
    err = cudaMemcpy(h_C, C, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        printf("CUDA Memcpy error: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    exitCode = 0;

cleanup:
    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe no matter how
    // far the setup got.
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    free(h_C);
    if (haveHandle)
        cublasDestroy(cublasHandle);
    return exitCode;
}