Hello
I am developing a project with CUDA for my studies, and I have run into two problems.
-
With the same matrices and the same calculations, the results with CUDA differ from the results without CUDA by a margin of about one thousandth, but only for some of the results.
-
I can compute the product of two square matrices of size 1000*1000, but above that size the NVIDIA driver crashes, and I cannot find out what the memory or computation limit of my card (a Quadro FX 880M) is.
My program:
// Test.cpp : entry point
//
#include "stdafx.h"
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "timer.h"
// Computes Pd = Md * Nd for row-major matrices, one output element per thread.
//
// Parameters:
//   Md    - left matrix, "col1d" columns per row.
//   Nd    - right matrix, "col1d" rows and "col2d" columns.
//   Pd    - result matrix, "col2d" columns per row.
//   col1d - number of columns of Md (length of every dot product).
//   col2d - number of columns of Nd and of Pd.
//   max_thread_for_last - number of threads of the LAST block that map to a
//           real element of Pd; threads of the last block with
//           threadIdx.x >= max_thread_for_last do nothing.
//
// Launch layout: 1D grid of 1D blocks. The flat thread index
// threadIdx.x + blockIdx.x * blockDim.x is used directly as the index into Pd.
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int col1d, int col2d, int max_thread_for_last)
{
    // blockIdx.x ranges over 0 .. gridDim.x - 1, so the last block is
    // gridDim.x - 1. Comparing against gridDim.x is always "not equal", which
    // disables the guard and lets the surplus threads of the last block read
    // and write past the end of the matrices.
    const bool isLastBlock = (blockIdx.x == gridDim.x - 1);
    if (isLastBlock && (int)threadIdx.x >= max_thread_for_last)
        return;

    const int tId = threadIdx.x + blockIdx.x * blockDim.x;

    // Row/column of the output element; loop-invariant, so computed once
    // instead of dividing on every iteration.
    const int row = tId / col2d;
    const int col = tId % col2d;

    // The dot product is accumulated in single precision, so the last digits
    // can differ from a host reference that rounds intermediate values
    // differently.
    float sum = 0.0f;
    for (int i = 0; i < col1d; i++)
        sum += Md[row * col1d + i] * Nd[i * col2d + col];

    Pd[tId] = sum;
}
// Reports a failed CUDA runtime call on the console.
// Returns true when "status" is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    printf("\nCUDA error during %s : %s\n", what, cudaGetErrorString(status));
    return false;
}

// Entry point: multiplies two matrices on the GPU, then repeats the same
// multiplication on the CPU, timing both and printing a sample of each result.
// Returns 0 on success and 1 when any CUDA call failed.
int _tmain(int argc, _TCHAR* argv[])
{
    timer temps; // Timer for comparison
    const int affichage = 99999, amplitude = 67; // Display stride and value range used to fill the matrices
    int nb_float_stockes = 0;                    // Total number of floats held in memory

    /* MULTIPLICATION OF MATRIX WITH CUDA */
    temps.Start(); // Timer start
    const int lig1 = 1000, col1 = 1000, taille1 = lig1 * col1; // First matrix : lig1*col1
    const int lig2 = col1, col2 = 1000, taille2 = lig2 * col2; // Second matrix : lig2*col2
    const int taille3 = lig1 * col2;                           // Result : lig1 * col2
    printf("\n - Nombre de threads : %i", taille3); // Number of used threads

    // Byte sizes are kept in size_t so that larger matrices do not overflow int.
    const size_t size1 = (size_t)taille1 * sizeof(float);
    const size_t size2 = (size_t)taille2 * sizeof(float);
    const size_t size3 = (size_t)taille3 * sizeof(float);

    float *m = new float[taille1]; // Host matrices
    float *n = new float[taille2];
    float *p = new float[taille3];
    for (int i = 0; i < taille1; i++) // Fill first matrix
        m[i] = (float)(i % amplitude) / (float)amplitude + 0.5f;
    for (int i = 0; i < taille2; i++) // Fill second matrix
        n[i] = (float)(i % amplitude) / (float)amplitude + 0.5f;

    nb_float_stockes += taille1 + taille2 + taille3; // Display number of floats to stock in memory
    printf("\n - Nombre d'float stockes : %i", nb_float_stockes);

    // Device pointers start as NULL so the cudaFree calls below are harmless
    // even when an allocation failed.
    float *p_m = NULL, *p_n = NULL, *p_p = NULL;
    temps.Loop(); // Intermediate time
    printf("Copie..."); // Copy the matrix into GPU memory
    bool ok = checkCuda(cudaMalloc((void**) &p_m, size1), "cudaMalloc(p_m)")
           && checkCuda(cudaMalloc((void**) &p_n, size2), "cudaMalloc(p_n)")
           && checkCuda(cudaMalloc((void**) &p_p, size3), "cudaMalloc(p_p)")
           && checkCuda(cudaMemcpy(p_m, m, size1, cudaMemcpyHostToDevice), "cudaMemcpy(m -> p_m)")
           && checkCuda(cudaMemcpy(p_n, n, size2, cudaMemcpyHostToDevice), "cudaMemcpy(n -> p_n)");
    temps.Loop();

    // Distribution of blocks into the grid and threads into a block.
    // nbBlocks is the number of blocks BEFORE the last one, so the grid has
    // nbBlocks + 1 blocks and the last block owns the remaining elements
    // (between 1 and nbThreads of them).
    const int nbThreads = 256, nbBlocks = (taille3 - 1) / nbThreads;
    const int threadsInLastBlock = taille3 - nbBlocks * nbThreads;
    dim3 dimGrid(nbBlocks + 1, 1, 1);
    dim3 dimBlock(nbThreads, 1, 1);
    printf("\nNombre de blocks : %i, Nombre de threads par block : %i", nbBlocks + 1, nbThreads);
    temps.Loop();

    printf("Calcul...");
    if (ok)
    {
        MatrixMulKernel<<<dimGrid,dimBlock>>>(p_m, p_n, p_p, col1, col2, threadsInLastBlock); // Execution
        // A kernel launch returns nothing: an invalid configuration is reported
        // by cudaGetLastError(), a failure during execution by the next
        // synchronizing call.
        // NOTE(review): on a GPU that also drives the display, a kernel that
        // runs for too long is presumably killed by the OS watchdog, which
        // would show up here as an error - confirm for the target machine.
        ok = checkCuda(cudaGetLastError(), "kernel launch")
          && checkCuda(cudaStreamSynchronize(0), "kernel execution"); // Wait for results
    }
    temps.Loop();

    printf("Copie...");
    ok = ok && checkCuda(cudaMemcpy(p, p_p, size3, cudaMemcpyDeviceToHost), "cudaMemcpy(p_p -> p)"); // Copy into main memory
    temps.Loop();
    cudaFree(p_m);
    cudaFree(p_n);
    cudaFree(p_p);

    if (ok) // p only holds meaningful data when every CUDA step succeeded
    {
        for (int i = 0; i < taille3; i += affichage) // Display of some of the results
            printf("\nResultat %i : %f", i, p[i]);
    }
    temps.End();
    system("PAUSE");

    /* SAME MULTIPLICATION OF MATRIX WITHOUT CUDA */
    temps.Start();
    float *r = new float[taille3]; // Host result matrix
    temps.Loop();
    printf("Calcul...");
    // NOTE(review): the sums below are also accumulated into float storage, but
    // the host may round intermediate values differently from the GPU, so
    // differences in the last digits between p and r are expected.
    for (int i = 0; i < lig1; i++)
    {
        for (int j = 0; j < col2; j++)
        {
            r[i*col2 + j] = 0;
            for (int k = 0; k < col1; k++)
                r[i*col2 + j] += m[i*col1 + k] * n[k*col2 + j];
        }
    }
    temps.Loop();
    for (int i = 0; i < taille3; i += affichage) // Display of some of the results
        printf("\nResultat %i : %f", i, r[i]);

    // Arrays allocated with new[] must be released with delete[].
    delete[] r;
    delete[] p;
    delete[] n;
    delete[] m;
    temps.End();
    system("PAUSE");
    return ok ? 0 : 1;
}
On the left: results with CUDA. On the right: results without CUDA.
If the image does not display: http://www.hostingpics.net/viewer.php?id=212555CUDA.png
Thank you