matrix multiplication program

Hello all,

I’ve a problem with my matrix multiplication program in CUDA. I’m running on Windows 7 with Visual Studio 2008 Express, with Nvidia GTX 285. When my matrix size is more than 1000 (#define WMATRIX 100 and #define HMATRIX 100), my program doesn’t want to run. I’ve no error message, the console is launched but it disappear before I can do anything).

My display is connected in the same graphic card because i’ve not Chipset neither other PCI-e or PCI card in my computer…

There’s my code :

[codebox]#include “cutil_inline.h”

#include <shrUtils.h>

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <cuda.h>

#include <conio.h>

#define BLOCK_SIZE 16

#define WMATRIX 100

#define HMATRIX 100

#define WA (5 * WMATRIX)

#define HA (10 * HMATRIX)

#define WB (5 * WMATRIX)

#define HB WA

#define WC WB

#define HC HA

#define TAB(i,j,dim) ((i) * (dim) + (j))

global void multiplication_matrice (float *matrice1, float *matrice2, float *matrice3, int ligne, int colonne)

{

int k;

int i = blockIdx.x*blockDim.x + threadIdx.x;

int j = blockIdx.y*blockDim.y + threadIdx.y;

matrice3[TAB(i,j,colonne)] = 0;

for (k = 0; k < colonne; k++)

    matrice3[TAB(i,j,colonne)] += matrice1[TAB(i,k,ligne)] * matrice2[TAB(k,j,colonne)];

}

int main(void)

{

int i,j;

int devID;

cudaDeviceProp props;  

cudaGetDevice(&devID);

cudaGetDeviceProperties(&props, devID);

printf(“GPU %d: "%s" avec Compute %d.%d capability.\n”, devID, props.name, props.major, props.minor);

int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;

int iSizeMultiple = 1;

iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);

if (props.multiProcessorCount <= 4) {

    uiWA = 2 * WMATRIX * iSizeMultiple;

    uiHA = 4 * HMATRIX * iSizeMultiple;

    uiWB = 2 * WMATRIX * iSizeMultiple;

    uiHB = 4 * HMATRIX * iSizeMultiple;

    uiWC = 2 * WMATRIX * iSizeMultiple;

    uiHC = 4 * HMATRIX * iSizeMultiple;

} else {

    uiWA = WA * iSizeMultiple;

    uiHA = HA * iSizeMultiple;

    uiWB = WB * iSizeMultiple;

    uiHB = HB * iSizeMultiple;

    uiWC = WC * iSizeMultiple;

    uiHC = HC * iSizeMultiple;

}

unsigned int size_A = uiWA * uiHA;

unsigned int mem_size_A = sizeof(float) * size_A;

float *a_h = (float*)malloc(mem_size_A);

unsigned int size_B = uiWB * uiHB;

unsigned int mem_size_B = sizeof(float) * size_B;

float *b_h = (float*)malloc(mem_size_B);

unsigned int size_C = uiWC * uiHC;

unsigned int mem_size_C = sizeof(float) * size_C;

float *c_h = (float*)malloc(mem_size_C);

float *a_d;

cutilSafeCall( cudaMalloc((void **)&a_d,mem_size_A) );

float *b_d;

cutilSafeCall( cudaMalloc((void **)&b_d,mem_size_B) );

float *c_d;

cutilSafeCall( cudaMalloc((void **)&c_d,mem_size_C) );

for(i = 0; i < uiWA; i++) {

    for(j = 0; j < uiHA; j++) {

        a_h[TAB(i,j,uiHA)]=2;

    }

}

for(i = 0; i < uiWB; i++) {

    for(j = 0; j < uiHB; j++) {

        b_h[TAB(i,j,uiHB)]=3;

    }

}

cutilSafeCall( cudaMemcpy( a_d, a_h, mem_size_A, cudaMemcpyHostToDevice ) );

cutilSafeCall( cudaMemcpy( b_d, b_h, mem_size_B, cudaMemcpyHostToDevice ) );

cutilSafeCall( cudaMemcpy( c_d, c_h, mem_size_C, cudaMemcpyHostToDevice ) );

dim3 threads( BLOCK_SIZE, BLOCK_SIZE );

dim3 dimGrid( ceil(float(uiWC)/float(threads.x)), ceil(float(uiHA)/float(threads.y)) );

unsigned int timer = 0;

    cutilCheckError(cutCreateTimer(&timer));

    cutilCheckError(cutStartTimer(timer));

int nIter = 1;

   for (int j = 0; j < nIter; j++)

multiplication_matrice<<<dimGrid, threads>>>(a_d,b_d,c_d,uiWA,uiWB);

cutilCheckMsg(“Erreur dans l execution du kernel”);

cutilCheckError(cutStopTimer(timer));

cutilSafeCall( cudaThreadSynchronize() );

double dSeconds = cutGetTimerValue(timer)/((double)nIter * 1000.0);

    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;

    double gflops = 1.0e-9 * dNumOps/dSeconds;

printf(“\Matrix multiplication : \n\n”);

shrLogEx(LOGBOTH | MASTER, 0, "Throughput = %.4f GFlop/s, Time = %.5f ms, Size = %.0f Ops, NumDevsUsed = %d,Workgroup = %u\n"

    ,gflops, (dSeconds*1000), dNumOps, 1, threads.x * threads.y);

    cutilCheckError(cutDeleteTimer(timer));

cutilSafeCall( cudaMemcpy( c_h, c_d, mem_size_C, cudaMemcpyDeviceToHost ) );

// Affichage d’un élément de la matrice résultante

printf ("\n\nJust a print of one element\n");

printf("%f\n",c_h[TAB(120,47,uiHC)]);

free(a_h);

free(b_h);

free(c_h);

cudaFree(a_d);

cudaFree(b_d);

cudaFree(c_d);

_getch();

return 0;

}[/codebox]

Thanks a lot ! :)