Hi all,
I want to multiply two large matrices. I have a GeForce GTX 950 and I am on Windows. This is my attempt with cublasSgemm from cublas_v2.h:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>          // host-side cuRAND API (curandCreateGenerator, ...)
#include <curand_kernel.h>   // device-side cuRAND API (curand_init, curand)

#define MAX 10
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on the GPU
// (uses the host-side cuRAND generator; currently unused in main)
void GPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
    // Create a pseudo-random number generator
    curandGenerator_t prng;
    curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
    // Set a fixed seed for the random number generator
    curandSetPseudoRandomGeneratorSeed(prng, 1234ULL);
    // Fill the array with random numbers on the device
    curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A);
}
// Fill result_M (row x col) with random integers in [0, MAX) on the GPU.
// Launched with <<<1,1>>>, so a single thread fills the whole matrix.
__global__ void randomMatrix(unsigned int seed, float *result_M, int row, int col) {
    curandState_t st;
    curand_init(seed, /* the seed controls the sequence of random values that are produced */
                0,    /* the sequence number is only important with multiple cores */
                0,    /* the offset is how much extra we advance in the sequence for each call, can be 0 */
                &st);
    for (int i = 0; i < row; ++i) {
        for (int j = 0; j < col; ++j) {
            result_M[i * col + j] = curand(&st) % MAX;
        }
    }
}
// Multiply the arrays A and B on the GPU and store the result in C.
// C(m,n) = A(m,k) * B(k,n); cuBLAS assumes column-major storage.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
    int lda = m, ldb = k, ldc = m;
    const float alf = 1;
    const float bet = 0;
    const float *alpha = &alf;
    const float *beta = &bet;
    // Create a handle for cuBLAS
    cublasHandle_t handle;
    cublasCreate(&handle);
    // Do the actual multiplication
    cublasSgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k,
                alpha,
                A, lda,
                B, ldb,
                beta,
                C, ldc);
    // Destroy the handle
    cublasDestroy(handle);
}
int main(int argc, char *argv[])
{
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <matrix dimension>" << std::endl;
        return 1;
    }
    int dim = std::stoi(argv[1]);
    int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
    nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = dim;

    // Allocate host memory
    float *h_A = (float *)malloc(nr_rows_A * nr_cols_A * sizeof(float));
    float *h_B = (float *)malloc(nr_rows_B * nr_cols_B * sizeof(float));
    float *h_C = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, nr_rows_A * nr_cols_A * sizeof(float));
    cudaMalloc(&d_B, nr_rows_B * nr_cols_B * sizeof(float));
    cudaMalloc(&d_C, nr_rows_C * nr_cols_C * sizeof(float));

    // Fill A and B with random values on the device
    randomMatrix<<<1, 1>>>(time(NULL), d_A, nr_rows_A, nr_cols_A);
    randomMatrix<<<1, 1>>>(time(NULL) + 100000, d_B, nr_rows_B, nr_cols_B);

    // Multiply A and B on GPU
    gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);

    // Copy (and print) the result on host memory
    cudaMemcpy(h_C, d_C, nr_rows_C * nr_cols_C * sizeof(float), cudaMemcpyDeviceToHost);

    // Free GPU memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free CPU memory
    free(h_A);
    free(h_B);
    free(h_C);

    //std::getchar();
    return 0;
}
But this code only works for matrices up to 64 x 64. When I run it under cuda-memcheck, I get the following messages:
========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaLaunch.
========= Saved host backtrace up to driver entry point at error
...
=========
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaGetLastError.
========= Saved host backtrace up to driver entry point at error
...
=========
========= ERROR SUMMARY: 2 errors
I can't understand why this happens or how to multiply larger matrices. Please help me!
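In case it helps, this is the kind of per-call error checking I was planning to add around the kernel launches to see exactly which call fails first. It is only a minimal sketch; the checkCuda helper name is my own, not from any library:

// Minimal sketch of the error checking I intend to add (checkCuda is my own helper).
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Intended usage around the launches in main():
//   randomMatrix<<<1, 1>>>(time(NULL), d_A, nr_rows_A, nr_cols_A);
//   checkCuda(cudaGetLastError(), "randomMatrix launch");
//   checkCuda(cudaDeviceSynchronize(), "randomMatrix execution");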