Hello all,
I’ve a problem with my matrix multiplication program in CUDA. I’m running on Windows 7 with Visual Studio 2008 Express, with Nvidia GTX 285. When my matrix size is more than 1000 (#define WMATRIX 100 and #define HMATRIX 100), my program doesn’t want to run. I’ve no error message, the console is launched but it disappear before I can do anything).
My display is connected in the same graphic card because i’ve not Chipset neither other PCI-e or PCI card in my computer…
There’s my code :
[codebox]#include “cutil_inline.h”
#include <shrUtils.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <conio.h>
#define BLOCK_SIZE 16
#define WMATRIX 100
#define HMATRIX 100
#define WA (5 * WMATRIX)
#define HA (10 * HMATRIX)
#define WB (5 * WMATRIX)
#define HB WA
#define WC WB
#define HC HA
#define TAB(i,j,dim) ((i) * (dim) + (j))
global void multiplication_matrice (float *matrice1, float *matrice2, float *matrice3, int ligne, int colonne)
{
int k;
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
matrice3[TAB(i,j,colonne)] = 0;
for (k = 0; k < colonne; k++)
matrice3[TAB(i,j,colonne)] += matrice1[TAB(i,k,ligne)] * matrice2[TAB(k,j,colonne)];
}
int main(void)
{
int i,j;
int devID;
cudaDeviceProp props;
cudaGetDevice(&devID);
cudaGetDeviceProperties(&props, devID);
printf(“GPU %d: "%s" avec Compute %d.%d capability.\n”, devID, props.name, props.major, props.minor);
int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
int iSizeMultiple = 1;
iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
if (props.multiProcessorCount <= 4) {
uiWA = 2 * WMATRIX * iSizeMultiple;
uiHA = 4 * HMATRIX * iSizeMultiple;
uiWB = 2 * WMATRIX * iSizeMultiple;
uiHB = 4 * HMATRIX * iSizeMultiple;
uiWC = 2 * WMATRIX * iSizeMultiple;
uiHC = 4 * HMATRIX * iSizeMultiple;
} else {
uiWA = WA * iSizeMultiple;
uiHA = HA * iSizeMultiple;
uiWB = WB * iSizeMultiple;
uiHB = HB * iSizeMultiple;
uiWC = WC * iSizeMultiple;
uiHC = HC * iSizeMultiple;
}
unsigned int size_A = uiWA * uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *a_h = (float*)malloc(mem_size_A);
unsigned int size_B = uiWB * uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *b_h = (float*)malloc(mem_size_B);
unsigned int size_C = uiWC * uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
float *c_h = (float*)malloc(mem_size_C);
float *a_d;
cutilSafeCall( cudaMalloc((void **)&a_d,mem_size_A) );
float *b_d;
cutilSafeCall( cudaMalloc((void **)&b_d,mem_size_B) );
float *c_d;
cutilSafeCall( cudaMalloc((void **)&c_d,mem_size_C) );
for(i = 0; i < uiWA; i++) {
for(j = 0; j < uiHA; j++) {
a_h[TAB(i,j,uiHA)]=2;
}
}
for(i = 0; i < uiWB; i++) {
for(j = 0; j < uiHB; j++) {
b_h[TAB(i,j,uiHB)]=3;
}
}
cutilSafeCall( cudaMemcpy( a_d, a_h, mem_size_A, cudaMemcpyHostToDevice ) );
cutilSafeCall( cudaMemcpy( b_d, b_h, mem_size_B, cudaMemcpyHostToDevice ) );
cutilSafeCall( cudaMemcpy( c_d, c_h, mem_size_C, cudaMemcpyHostToDevice ) );
dim3 threads( BLOCK_SIZE, BLOCK_SIZE );
dim3 dimGrid( ceil(float(uiWC)/float(threads.x)), ceil(float(uiHA)/float(threads.y)) );
unsigned int timer = 0;
cutilCheckError(cutCreateTimer(&timer));
cutilCheckError(cutStartTimer(timer));
int nIter = 1;
for (int j = 0; j < nIter; j++)
multiplication_matrice<<<dimGrid, threads>>>(a_d,b_d,c_d,uiWA,uiWB);
cutilCheckMsg(“Erreur dans l execution du kernel”);
cutilCheckError(cutStopTimer(timer));
cutilSafeCall( cudaThreadSynchronize() );
double dSeconds = cutGetTimerValue(timer)/((double)nIter * 1000.0);
double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
double gflops = 1.0e-9 * dNumOps/dSeconds;
printf(“\Matrix multiplication : \n\n”);
shrLogEx(LOGBOTH | MASTER, 0, "Throughput = %.4f GFlop/s, Time = %.5f ms, Size = %.0f Ops, NumDevsUsed = %d,Workgroup = %u\n"
,gflops, (dSeconds*1000), dNumOps, 1, threads.x * threads.y);
cutilCheckError(cutDeleteTimer(timer));
cutilSafeCall( cudaMemcpy( c_h, c_d, mem_size_C, cudaMemcpyDeviceToHost ) );
// Affichage d’un élément de la matrice résultante
printf ("\n\nJust a print of one element\n");
printf("%f\n",c_h[TAB(120,47,uiHC)]);
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
_getch();
return 0;
}[/codebox]
Thanks a lot ! :)