Hi,
I guess that there are many developers using CUDA/CUBLAS for computational works. I have noticed that computing on small set of data with CUDA/CUBLAS providing the same result as we simple CPU program does (of course with higher speed). However, when the size of data set is growing (more computation, larger value of result) the difference between results from CUDA/CUPLAS program and CPU one is increasing. For example, just make a small test with matrix matrix multiplication.
// -*- C++ -*-
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cublas.h>
void mprintf(float* data,int rows,int cols,char* name){
printf("Matrix : %s:\n",name);
for(int i=0;i<rows;i++){
for(int j=0;j<cols;j++){
printf("%f ",data[j*rows+i]);
}
printf("\n");
}
}
int main(void){
float * h_A, *h_B, *h_C;
float * d_A, *d_B, *d_C;
int wA = 3000;
int hA = 3000;
int wB = 1000;
int hB = wA;
int wC = wB;
int hC = hA;
h_A = (float*)malloc(wA*hA*sizeof(float));
h_B = (float*)malloc(wB*hB*sizeof(float));
h_C = (float*)malloc(wC*hC*sizeof(float));
// Allocation des structures
for (int i = 0; i < wA*hA; i++){
h_A[i] = (float)rand()/(float)RAND_MAX;
}
for (int i = 0; i < wB*hB; i++){
h_B[i] = (float)rand()/(float)RAND_MAX;
}
for (int i = 0; i < wC*hC; i++){ //not necessary
h_C[i] = 0.0;
}
cublasStatus status;
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
status = cublasAlloc(wA*hA, sizeof(d_A[0]), (void**)&d_A);
status = cublasAlloc(wB*hB, sizeof(d_B[0]), (void**)&d_B);
status = cublasAlloc(wC*hC, sizeof(d_C[0]), (void**)&d_C);
cublasSetMatrix( hA, wA, sizeof(float), h_A, hA, d_A, hA);
cublasSetMatrix( hB, wB, sizeof(float), h_B, hB, d_B, hB);
float alpha = 1.0;
float beta = 0.0;
// cublasSgemm('t', 'n', L, K, N, alpha, d_A, N, d_B, N, beta, d_C, N);
cublasSgemm('N', 'N', hA, wB, hB, alpha, d_A, hA, d_B, hB, beta, d_C, hC);
status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
status = cublasGetVector(wC*hC, sizeof(h_C[0]), d_C, 1, h_C, 1);
// CPU computation
float* h_CPU = (float*)malloc(wC*hC*sizeof(float));
for (int l = 0; l < hC; l++){
for(int k=0; k < wC; k++){
float sum = 0.0;
for(int n=0;n < wA;n++) sum+= h_A[n*hA+l]*h_B[k*hB+n];
h_CPU[k*hC+l] = sum;
}
}
CUTBoolean res = cutCompareL2fe(h_C, h_CPU,wC*hC, 1e-6f);
printf("Test %s \n", (1 == res) ? "PASSED":"FAILED");
status = cublasFree(d_A);
status = cublasFree(d_B);
status = cublasFree(d_C);
free(h_A);
free(h_B);
free(h_C);
status = cublasShutdown();
return 1;
}
The code above pass its testing with matrices size A(3000x3000) B(1000x3000) (colxrow), however, if we make just wA to be more than 4500 (no matter how big of hA,wB are) the test will be failed.
At first I thought it is problem of CUBLAS, then I try to make my CUDA program and it provides the same result as CUBLAS one.
Can anyone help me out of this issue?
Thank you very much,
IZ