Hey, I’m relatively new to CUDA development. However, I managed to get a simple matrix-vector multiplication program that uses cuBLAS to work. Here’s the code snippet:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <vector>
/* Using updated (v2) interfaces to cublas and cusparse */
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
int main(void){
const int M = 40000 ;
const int N = 6 ;
// variables declaration
float x[N], c[M] ;
float *dev_x, *dev_c ;
float matrix[M][N] ;
float *dev_matrix ;
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_x, N*sizeof(float) ) ;
cudaMalloc( (void**)&dev_c, M*sizeof(float) ) ;
cudaMalloc( (void**)&dev_matrix, M*N*sizeof(float) ) ;
// fill the array x and initiate c by zero on the CPU
for (int i=0; i<N; i++) {
x[i] = (float)(0.1*i) ;
c[i] = 0 ;
}
// fill in the matrix
for(int i=0; i<M; i++){
for(int j=0; j<N; j++){
//matrix[i][j] = (float)((0.6*i+0.4*j)/(0.8*i+0.2*j)) ;
matrix[i][j] = (float)(0.1*i*j) ;
}
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy( dev_x, x, N*sizeof(float), cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_c, c, M*sizeof(float), cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_matrix, matrix, M*N*sizeof(float), cudaMemcpyHostToDevice ) ;
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0 ;
cublasStatus_t cublasStatus ;
cublasStatus = cublasCreate(&cublasHandle) ;
cublasOperation_t trans = CUBLAS_OP_N ;
float alpha = 1.0 ;
float beta = 0.0 ;
cublasStatus = cublasSgemv(cublasHandle, trans, M, N, &alpha, dev_matrix, M, dev_x, 1, &beta, dev_c, 1) ;
if(cublasStatus != CUBLAS_STATUS_SUCCESS) printf("%s \n","cuBLAS SGEMV Error") ;
// copy result back to host memory
cudaMemcpy( c, dev_c, M*sizeof(float), cudaMemcpyDeviceToHost ) ; // c = alpha*matrix*a + beta*c
// free device memory
cudaFree(dev_x);
cudaFree(dev_matrix);
cudaFree(dev_c);
return 0 ;
}
The program WORKS FINE and gives correct results for small matrix dimensions (around 250,000 elements). However, when I try to increase the matrix dimensions, the program crashes on execution: I get the Windows message "PROGRAM_NAME.exe has stopped working", and the console shows a blank screen. Is it a limitation of my GPU memory (GeForce GT 240), or something else?