# Problem with multiplication vector by array with big size

Hi People, You ok !!

I’m brazillian begginer student in cuda… i need help and hint to a single problem - “multiplication vector by array with big size”.

I am not able to multiply an array by a vector for values greater than 256, my first result (0,0) is incorrect and to other higher values (1024 …) i get a NaN for some elements of the resultant vector.

I post my program and my results. That your see, results to 512 x 512 to line and col of vector and array are:

[b]Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Why worst results ocurrs ? What best path to otmization this sinple code

Helpe me, please !

The CUDA Program

[codebox]

//inclue os arquivos principais

#include <stdio.h>

#include <cutil_inline.h>

#include “/usr/local/cuda/include/cuda.h”

#define DEB

//#define DEB_MAT

//#define DEB_SYNC

#define BLOCK_SIZE 16 //numero de threads em um bloco (16,16,1)= 32 Threads/Bloco

#define N_LINES 512 //numero de linhas do vetor/matriz

#define M_COLS 512 //numero de colunas da matriz

#define DATA 2*N_LINES * M_COLS

//#define RAND_MAT() (((int)(rand()/(float)RAND_MAX *10))%5)+1

//#define RAND_MAT() rand()/(float)RAND_MAX

#define RAND_MAT() 1.0

//Variaveis usadas no host

float* h_vectorA; //Vetor A

float* h_matrixB; //Matriz B

float* h_vectorC; //Vetor resultante C

float* h_vectorT; //Vetor de Teste para o calculo de computaSerial()

//Variaveis usadas no device

float* d_vectorA; //Vetor A

float* d_matrixB; //Matriz B

float* d_vectorC; //Vetor resultante C

//prompt para mostrar mensagem

bool noprompt = false;

void cleanup(void);

void randomInitVector(float*);

void randomInitMatrix(float*);

void computaSerial(float*,const float*, const float*);

void printMatrix(float*);

void printVector(float*);

//codigo do device

global void vectorMultArray(const float* A,

``````            const float* B, float* C)
``````

{

int row = blockDim.x * blockIdx.x + threadIdx.x;

int col = blockDim.y * blockIdx.y + threadIdx.y;

if( row<N_LINES && col<M_COLS )

``````{

float Cvalue=0.0F;

C[__umul24(row , M_COLS) + col] = 0.0;

for (int e = 0; e < M_COLS; ++e)

Cvalue += A[ __umul24(row , M_COLS) + e] * B[ __umul24(col , M_COLS) + e ];
``````

C[__umul24(row , M_COLS) + col] = Cvalue;

``````}
``````

}

// Host code

int main(int argc, char** argv)

{

``````printf("Multiplica vetor por Matriz\n");

printf("A(n) x B(m,n) = C(m)\n\n");
``````

size_t sizeVector = N_LINES * sizeof(float);

``````size_t sizeMatrix = N_LINES * M_COLS * sizeof(float);

size_t sizeVectorResult = M_COLS * sizeof(float);

unsigned int hTimer;
``````

// use command-line specified CUDA device, otherwise use device with highest Gflops/s

``````if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

cutilDeviceInit(argc, argv);

else

cudaSetDevice( cutGetMaxGflopsDeviceId() );
``````

cutilCheckError( cutCreateTimer(&hTimer) );

// Aloca o vetor de entrada A(n)

``````h_vectorA = (float*)malloc(sizeVector);

if (h_vectorA == 0) cleanup();
``````

// Aloca a matriz de entrada B(m,n)

``````h_matrixB = (float*)malloc(sizeMatrix);

if (h_matrixB == 0) cleanup();
``````

// Aloca o vetor resultante C(m)

``````h_vectorC = (float*)malloc(sizeVector);

if (h_vectorC == 0) cleanup();
``````

// Aloca o vetor de teste T(m)

``````h_vectorT = (float*)malloc(sizeVectorResult);

if (h_vectorT == 0) cleanup();
``````

// Initialize input vectors

``````randomInitVector(h_vectorA);

randomInitMatrix(h_matrixB);
``````

// Exibe o Vetor e a Matriz inicializada

#ifdef DEBMAT

``````printVector(h_vectorA);

printMatrix(h_matrixB);
``````

#endif

// Alocate o vetor A, matriz B e vetor resultante C na memÃ³ria do device

``````cutilSafeCall( cudaMalloc((void**)&d_vectorA, sizeVector) );

cutilSafeCall( cudaMalloc((void**)&d_matrixB, sizeMatrix) );

cutilSafeCall( cudaMalloc((void**)&d_vectorC, sizeVector) );
``````

// Copia o vetores/matriz da memÃ³ria do host para o device

``````cutilSafeCall( cudaMemcpy(d_vectorA,

h_vectorA, sizeVector, cudaMemcpyHostToDevice) );
``````

cutilSafeCall( cudaMemcpy(d_matrixB,

``````              h_matrixB, sizeMatrix, cudaMemcpyHostToDevice) );
``````

int threadsPorBloco = BLOCK_SIZE;

``````       threadsPorBloco,

1);

//dim3 grid ( (N_LINES + block.x -1 )/block.x,

//    (N_LINES + block.y -1 )/block.y,

//    1);
``````

dim3 grid (N_LINES/block.x, N_LINES/block.y,1);

``````printf("Executtando GPU kernel...\n");

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );
``````

vectorMultArray<<<grid,block>>>(d_vectorA, d_matrixB, d_vectorC);

cutilCheckMsg(“A execucao da rotina vectorMultArray() falhou\n”);

``````  cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer) );

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));
``````

#ifdef DEBSYNC

``````cutilSafeCall( cudaThreadSynchronize() );
``````

#endif

// Copia o resultado da memoria do device para o host

``````// h_vectorC contÃ©m o resultado da memÃ³ria do host

cutilSafeCall( cudaMemcpy(h_vectorC,

d_vectorC, sizeVector, cudaMemcpyDeviceToHost) );
``````

//calcula serial

``````printf("Executando funcao na CPU...\n");

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );
``````

computaSerial(h_vectorT,h_vectorA,h_matrixB);

cutilCheckError( cutStopTimer(hTimer) );

``````printf("CPU time: %f msecs.\n", cutGetTimerValue(hTimer));
``````

#ifdef DEB

``````printf("\n*****Vetor C Resultante do CUDA*****\n");

printVector(h_vectorC);
``````

printf("\nVetor T Resultante Serial*\n");

``````printVector(h_vectorT);
``````

#endif

// Verifica o resultado

``````int i;

for (i = 0; i < M_COLS; ++i) {

if (fabs(h_vectorC[i] - h_vectorT[i]) > 1e-5){

printf("\n** Problemas na linha do vetor: %d **\n",i);

break;

}

}

printf("%s \n", (i == M_COLS) ? "PASSOU" : "FALHOU");
``````

cleanup();

}

/************************************************************

*FunÃ§Ã£o que imprime uma matriz *

*/

void printMatrix(float* m)

{

printf(" Matriz inicializada para:\n");

``````for(int i = 0 ; i < M_COLS ; i++)

{

printf("\n");

for(int j = 0 ; j < N_LINES ; j++)

{

printf("%5.2f", m[ i * M_COLS +j ] );

}

}

printf("\n");
``````

}

/************************************************************

*FunÃ§Ã£o que imprime um vetor *

*/

void printVector(float* v)

{

printf(" Vetor inicializado para: \n[");

for(int i = 0; i<N_LINES; i++)

``````printf("%5.2f ",v[i]);
``````

printf("]\n");

}

/************************************************************

*FunÃ§Ã£o que libera a memÃ³ria do device e do host *

*/

void cleanup(void)

{

``````// Libera a memÃ³ria do device

if (d_vectorA)

cudaFree(d_vectorA);

if (d_matrixB)

cudaFree(d_matrixB);

if (d_vectorC)

cudaFree(d_vectorC);
``````

// Libera a memÃ³ria do host

``````if (h_vectorA)

free(h_vectorA);

if (h_matrixB)

free(h_matrixB);

if (h_vectorC)

free(h_vectorC);
``````

``````if (!noprompt) {

printf("\nPressione ENTER para sair...\n");

fflush( stdout);

fflush( stderr);

getchar();

}
``````

exit(0);

}

/************************************************************

*FunÃ§Ã£o que gera o conteÃºdo aleatÃ³rio do vetor *

*/

void randomInitVector(float* v)

{

``````for (int i = 0; i < N_LINES; ++i)

v[i] = RAND_MAT();
``````

}

/************************************************************

• FunÃ§Ã£o que gera o conteÃºdo aleatÃ³rio da matriz *

• *(M.elementos + row * M.largura + col) *

*/

void randomInitMatrix(float* mat)

{

for (int i = 0; i < M_COLS; ++i)

``````  for(int j = 0; j < N_LINES; ++j)

mat[ i * M_COLS + j ] = RAND_MAT();
``````

}

/************************************************************

• FunÃ§Ã£o que calcula a multiplica de um vetor pela matriz *

• faz um processamento serial e retorna em C *

• *(M.elementos + row * M.largura + col) *

*/

void computaSerial(float* C,const float* A, const float* B)

{

for (unsigned int i = 0; i < M_COLS; ++i){

``````double sum = 0.0F;

for (unsigned int j = 0; j < N_LINES; ++j)

{

double b = B[i * M_COLS + j];

sum += A[j] * b;

}

C[i] = (float)sum;
``````

}

}

[/codebox]

Results ============================================================

=====

[b]Multiplica vetor por Matriz

A(n) x B(m,n) = C(m)

Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Vetor C Resultante do CUDA



Vetor T Resultante Serial*



** Problemas na linha do vetor: 0 **

FALHOU

Pressione ENTER para sair…