Hey guys…
I am using the CUBLAS function sgemm and wanted to do a little speed test. However, while I am able to run tens of thousands of iterations with 512x512 matrices, I am not able to run even 10 with 1024x1024 matrices: I get a display driver error and my program exits. I really do not understand why I am able to do 1 iteration with 1024x1024 but not 20 or 30. I suspect there is a problem when copying the data to GPU memory, but I don't understand how the number of iterations affects this.
#include<stdio.h>
#include<cuda.h>
#include<cublas.h>
#include<stdlib.h>
#include<time.h>
#include<cutil_inline.h>
// Problem-size configuration for the SGEMM speed test.
// QM is the edge length used for all three (square) matrices.
#define QM 1024
// Width / height of the left operand A.
#define W_A QM
#define H_A QM
// Width / height of the right operand B; H_B is tied to W_A so that the
// inner dimensions of the product A * B agree.
#define W_B QM
#define H_B W_A
// The result C takes its width from B and its height from A.
#define W_C W_B
#define H_C H_A
// Number of SGEMM calls in the warm-up loop and again in the timed loop.
#define ITERS 20
int main(int argc, char **argv) {
clock_t start, end;
double diff, diffs, speed;
cublasStatus stat;
printf("|=================================|\n");
printf("|CUBLAS FUNCTION SGEMM COMPUTATION|\n");
printf("|=================================|\n\n");
stat = cublasInit();
if(stat == CUBLAS_STATUS_SUCCESS) {
printf("Cublas successfully initialized ..\n");
}
else if(stat != CUBLAS_STATUS_SUCCESS) {
printf("Cublas not initialized ..\n");
}
// allocate device memory
float* d_A;
stat = cublasAlloc(W_A * H_A, sizeof(float), (void**) &d_A);
if(stat != CUBLAS_STATUS_SUCCESS) {
printf("Mem for m A not get ..\n");
}
float* d_B;
stat = cublasAlloc(W_B * H_B, sizeof(float), (void**) &d_B);
if(stat != CUBLAS_STATUS_SUCCESS) {
printf("Mem for m B not get ..\n");
}
// allocate device memory for result
size_t size_C = W_C * H_C;
size_t mem_size_C = sizeof(float) * size_C;
float* d_C;
stat = cublasAlloc(H_C*W_C, sizeof(float), (void**) &d_C);
if(stat != CUBLAS_STATUS_SUCCESS) {
printf("Mem for m C not get ..\n");
}
printf("Device Memory successfully allocated ..\n");
// allocate host memory for matrices A and B
size_t size_A = W_A * H_A;
size_t mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
size_t size_B = W_B * H_B;
size_t mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
printf("Host Memory successfully allocated ..\n");
for(int i = 0; i < W_A * H_A; i++) {
h_A[i] = 1.0f;
}
for(int i = 0; i < W_B * H_B; i++) {
h_B[i] = 2.0f;
}
// allocate host memory for the result
float* h_C = (float*) malloc(mem_size_C);
cublasSetMatrix(H_A, W_A, sizeof(float), h_A, H_A, d_A, H_A);
cublasSetMatrix(H_B, W_B, sizeof(float), h_B, H_B, d_B, H_B);
printf("Data successfully copied to GPU-Memory ..\n");
//Warmup
printf("\nPerforming Warmup ..\n");
for(int it = 0; it < ITERS; ++it) {
cublasSgemm('n', 'n', H_C, W_C, W_A, 1.0f, d_A, H_A, d_B, H_B, 1.0f, d_C, H_C);
}
cutilSafeCall( cudaThreadSynchronize() );
printf("Done.\n\n");
printf("Performing Computation ..\n");
start = clock();
for(int it = 0; it < ITERS; ++it) {
cublasSgemm('n', 'n', H_C, W_C, W_A, 1.0f, d_A, H_A, d_B, H_B, 1.0f, d_C, H_C);
}
cutilSafeCall( cudaThreadSynchronize() );
end = clock();
printf("Done.\n\n");
//cublasGetMatrix(H_C, W_C, sizeof(float), d_B, H_B, h_B, H_B);
diff = end - start;
diffs = diff / CLOCKS_PER_SEC;
speed = (ITERS * 1E-9 * ((2*QM*QM*QM) + (3*QM*QM))) / diffs;
printf("\nRESULTS OF COMPUTING:\n\n");
printf("Matrix size: %dx%d\n", H_A, W_A);
printf("Num Iterations: %d\n", ITERS);
printf("Elapsed Time [s]: %3f\n", diffs);
printf("Speed [GFLOP/s]: %2f", speed);
cublasFree(d_A);
cublasFree(d_B);
cublasFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cublasShutdown();
}
Regards.