Problem with multiplication vector by array with big size

Hi People, You ok !!

I’m brazillian begginer student in cuda… i need help and hint to a single problem - “multiplication vector by array with big size”.

I am not able to multiply an array by a vector for values greater than 256, my first result (0,0) is incorrect and to other higher values (1024 …) i get a NaN for some elements of the resultant vector.

I post my program and my results. That your see, results to 512 x 512 to line and col of vector and array are:

[b]Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Why worst results ocurrs ? What best path to otmization this sinple code

Helpe me, please !

The CUDA Program

[codebox]

//inclue os arquivos principais

#include <stdio.h>

#include <cutil_inline.h>

#include “/usr/local/cuda/include/cuda.h”

#define DEB

//#define DEB_MAT

//#define DEB_SYNC

#define BLOCK_SIZE 16 //numero de threads em um bloco (16,16,1)= 32 Threads/Bloco

#define N_LINES 512 //numero de linhas do vetor/matriz

#define M_COLS 512 //numero de colunas da matriz

#define DATA 2*N_LINES * M_COLS

//#define RAND_MAT() (((int)(rand()/(float)RAND_MAX *10))%5)+1

//#define RAND_MAT() rand()/(float)RAND_MAX

#define RAND_MAT() 1.0

//Variaveis usadas no host

float* h_vectorA; //Vetor A

float* h_matrixB; //Matriz B

float* h_vectorC; //Vetor resultante C

float* h_vectorT; //Vetor de Teste para o calculo de computaSerial()

//Variaveis usadas no device

float* d_vectorA; //Vetor A

float* d_matrixB; //Matriz B

float* d_vectorC; //Vetor resultante C

//prompt para mostrar mensagem

bool noprompt = false;

//funcoes auxiliares declaradas adiante

void cleanup(void);

void randomInitVector(float*);

void randomInitMatrix(float*);

void computaSerial(float*,const float*, const float*);

void printMatrix(float*);

void printVector(float*);

//codigo do device

global void vectorMultArray(const float* A,

            const float* B, float* C)

{

int row = blockDim.x * blockIdx.x + threadIdx.x;

int col = blockDim.y * blockIdx.y + threadIdx.y;

if( row<N_LINES && col<M_COLS )

{

  float Cvalue=0.0F;

  C[__umul24(row , M_COLS) + col] = 0.0;

  for (int e = 0; e < M_COLS; ++e)

Cvalue += A[ __umul24(row , M_COLS) + e] * B[ __umul24(col , M_COLS) + e ];

C[__umul24(row , M_COLS) + col] = Cvalue;

}

}

// Host code

int main(int argc, char** argv)

{

printf("Multiplica vetor por Matriz\n");

printf("A(n) x B(m,n) = C(m)\n\n");

size_t sizeVector = N_LINES * sizeof(float);

size_t sizeMatrix = N_LINES * M_COLS * sizeof(float);

size_t sizeVectorResult = M_COLS * sizeof(float);

unsigned int hTimer;

// use command-line specified CUDA device, otherwise use device with highest Gflops/s

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

  cutilDeviceInit(argc, argv);

else

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilCheckError( cutCreateTimer(&hTimer) );

// Aloca o vetor de entrada A(n)

h_vectorA = (float*)malloc(sizeVector);

if (h_vectorA == 0) cleanup();

// Aloca a matriz de entrada B(m,n)

h_matrixB = (float*)malloc(sizeMatrix);

if (h_matrixB == 0) cleanup();

// Aloca o vetor resultante C(m)

h_vectorC = (float*)malloc(sizeVector);

if (h_vectorC == 0) cleanup();

// Aloca o vetor de teste T(m)

h_vectorT = (float*)malloc(sizeVectorResult);

if (h_vectorT == 0) cleanup();

// Initialize input vectors

randomInitVector(h_vectorA);

randomInitMatrix(h_matrixB);

// Exibe o Vetor e a Matriz inicializada

#ifdef DEBMAT

printVector(h_vectorA);

printMatrix(h_matrixB);

#endif

// Alocate o vetor A, matriz B e vetor resultante C na memória do device

cutilSafeCall( cudaMalloc((void**)&d_vectorA, sizeVector) );

cutilSafeCall( cudaMalloc((void**)&d_matrixB, sizeMatrix) );

cutilSafeCall( cudaMalloc((void**)&d_vectorC, sizeVector) );

// Copia o vetores/matriz da memória do host para o device

cutilSafeCall( cudaMemcpy(d_vectorA,

              h_vectorA, sizeVector, cudaMemcpyHostToDevice) );

cutilSafeCall( cudaMemcpy(d_matrixB,

              h_matrixB, sizeMatrix, cudaMemcpyHostToDevice) );

int threadsPorBloco = BLOCK_SIZE;

dim3 block(threadsPorBloco,

       threadsPorBloco,

       1);

//dim3 grid ( (N_LINES + block.x -1 )/block.x,

//    (N_LINES + block.y -1 )/block.y,

//    1);

dim3 grid (N_LINES/block.x, N_LINES/block.y,1);

printf("Executtando GPU kernel...\n");

  cutilSafeCall( cudaThreadSynchronize() );

  cutilCheckError( cutResetTimer(hTimer) );

  cutilCheckError( cutStartTimer(hTimer) );

vectorMultArray<<<grid,block>>>(d_vectorA, d_matrixB, d_vectorC);

cutilCheckMsg(“A execucao da rotina vectorMultArray() falhou\n”);

  cutilSafeCall( cudaThreadSynchronize() );

  cutilCheckError( cutStopTimer(hTimer) );

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

#ifdef DEBSYNC

cutilSafeCall( cudaThreadSynchronize() );

#endif

// Copia o resultado da memoria do device para o host

// h_vectorC contém o resultado da memória do host

cutilSafeCall( cudaMemcpy(h_vectorC,

              d_vectorC, sizeVector, cudaMemcpyDeviceToHost) );

//calcula serial

printf("Executando funcao na CPU...\n");

    cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

computaSerial(h_vectorT,h_vectorA,h_matrixB);

cutilCheckError( cutStopTimer(hTimer) );

printf("CPU time: %f msecs.\n", cutGetTimerValue(hTimer));

#ifdef DEB

printf("\n*****Vetor C Resultante do CUDA*****\n");

printVector(h_vectorC);

printf(“\nVetor T Resultante Serial*\n”);

printVector(h_vectorT);

#endif

// Verifica o resultado

int i;

for (i = 0; i < M_COLS; ++i) {

     if (fabs(h_vectorC[i] - h_vectorT[i]) > 1e-5){

    printf("\n** Problemas na linha do vetor: %d **\n",i);

        break;

    }

}

printf("%s \n", (i == M_COLS) ? "PASSOU" : "FALHOU");

cleanup();

}

/************************************************************

*Função que imprime uma matriz *


*/

void printMatrix(float* m)

{

printf(" Matriz inicializada para:\n");

for(int i = 0 ; i < M_COLS ; i++)

  {

printf("\n");

for(int j = 0 ; j < N_LINES ; j++)

  {

    printf("%5.2f", m[ i * M_COLS +j ] );

  }

  }

printf("\n");

}

/************************************************************

*Função que imprime um vetor *


*/

void printVector(float* v)

{

printf(" Vetor inicializado para: \n[");

for(int i = 0; i<N_LINES; i++)

printf("%5.2f ",v[i]);

printf(“]\n”);

}

/************************************************************

*Função que libera a memória do device e do host *


*/

void cleanup(void)

{

// Libera a memória do device

if (d_vectorA)

    cudaFree(d_vectorA);

if (d_matrixB)

    cudaFree(d_matrixB);

if (d_vectorC)

    cudaFree(d_vectorC);

// Libera a memória do host

if (h_vectorA)

    free(h_vectorA);

if (h_matrixB)

    free(h_matrixB);

if (h_vectorC)

    free(h_vectorC);

cutilSafeCall( cudaThreadExit() );

if (!noprompt) {

    printf("\nPressione ENTER para sair...\n");

    fflush( stdout);

    fflush( stderr);

    getchar();

}

exit(0);

}

/************************************************************

*Função que gera o conteúdo aleatório do vetor *


*/

void randomInitVector(float* v)

{

for (int i = 0; i < N_LINES; ++i)

  v[i] = RAND_MAT();

}

/************************************************************

  • Função que gera o conteúdo aleatório da matriz *

  • *(M.elementos + row * M.largura + col) *


*/

void randomInitMatrix(float* mat)

{

for (int i = 0; i < M_COLS; ++i)

  for(int j = 0; j < N_LINES; ++j)

mat[ i * M_COLS + j ] = RAND_MAT();

}

/************************************************************

  • Função que calcula a multiplica de um vetor pela matriz *

  • faz um processamento serial e retorna em C *

  • *(M.elementos + row * M.largura + col) *


*/

void computaSerial(float* C,const float* A, const float* B)

{

for (unsigned int i = 0; i < M_COLS; ++i){

double sum = 0.0F;

for (unsigned int j = 0; j < N_LINES; ++j)

  {

double b = B[i * M_COLS + j];

sum += A[j] * b;

  }

  C[i] = (float)sum;

}

}

[/codebox]

Results ============================================================

=====

[b]Multiplica vetor por Matriz

A(n) x B(m,n) = C(m)

Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Vetor C Resultante do CUDA

Vetor inicializado para:



Vetor T Resultante Serial*

Vetor inicializado para:



** Problemas na linha do vetor: 0 **

FALHOU

Pressione ENTER para sair…