Problem with multiplication vector by array with big size

eaceiro · June 17, 2010, 12:55am

Hi People, You ok !!

I’m brazillian begginer student in cuda… i need help and hint to a single problem - “multiplication vector by array with big size”.

I am not able to multiply an array by a vector for values greater than 256, my first result (0,0) is incorrect and to other higher values (1024 …) i get a NaN for some elements of the resultant vector.

I post my program and my results. That your see, results to 512 x 512 to line and col of vector and array are:

[b]Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Why worst results ocurrs ? What best path to otmization this sinple code

Helpe me, please !

The CUDA Program

[codebox]

//inclue os arquivos principais

#include <stdio.h>

#include <cutil_inline.h>

#include “/usr/local/cuda/include/cuda.h”

#define DEB

//#define DEB_MAT

//#define DEB_SYNC

#define BLOCK_SIZE 16 //numero de threads em um bloco (16,16,1)= 32 Threads/Bloco

#define N_LINES 512 //numero de linhas do vetor/matriz

#define M_COLS 512 //numero de colunas da matriz

#define DATA 2*N_LINES * M_COLS

//#define RAND_MAT() (((int)(rand()/(float)RAND_MAX *10))%5)+1

//#define RAND_MAT() rand()/(float)RAND_MAX

#define RAND_MAT() 1.0

//Variaveis usadas no host

float* h_vectorA; //Vetor A

float* h_matrixB; //Matriz B

float* h_vectorC; //Vetor resultante C

float* h_vectorT; //Vetor de Teste para o calculo de computaSerial()

//Variaveis usadas no device

float* d_vectorA; //Vetor A

float* d_matrixB; //Matriz B

float* d_vectorC; //Vetor resultante C

//prompt para mostrar mensagem

bool noprompt = false;

//funcoes auxiliares declaradas adiante

void cleanup(void);

void randomInitVector(float*);

void randomInitMatrix(float*);

void computaSerial(float*,const float*, const float*);

void printMatrix(float*);

void printVector(float*);

//codigo do device

global void vectorMultArray(const float* A,

            const float* B, float* C)

{

int row = blockDim.x * blockIdx.x + threadIdx.x;

int col = blockDim.y * blockIdx.y + threadIdx.y;

if( row<N_LINES && col<M_COLS )

{

  float Cvalue=0.0F;

  C[__umul24(row , M_COLS) + col] = 0.0;

  for (int e = 0; e < M_COLS; ++e)

Cvalue += A[ __umul24(row , M_COLS) + e] * B[ __umul24(col , M_COLS) + e ];

C[__umul24(row , M_COLS) + col] = Cvalue;

}

// Host code

int main(int argc, char** argv)

{

printf("Multiplica vetor por Matriz\n");

printf("A(n) x B(m,n) = C(m)\n\n");

size_t sizeVector = N_LINES * sizeof(float);

size_t sizeMatrix = N_LINES * M_COLS * sizeof(float);

size_t sizeVectorResult = M_COLS * sizeof(float);

unsigned int hTimer;

// use command-line specified CUDA device, otherwise use device with highest Gflops/s

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

  cutilDeviceInit(argc, argv);

else

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilCheckError( cutCreateTimer(&hTimer) );

// Aloca o vetor de entrada A(n)

h_vectorA = (float*)malloc(sizeVector);

if (h_vectorA == 0) cleanup();

// Aloca a matriz de entrada B(m,n)

h_matrixB = (float*)malloc(sizeMatrix);

if (h_matrixB == 0) cleanup();

// Aloca o vetor resultante C(m)

h_vectorC = (float*)malloc(sizeVector);

if (h_vectorC == 0) cleanup();

// Aloca o vetor de teste T(m)

h_vectorT = (float*)malloc(sizeVectorResult);

if (h_vectorT == 0) cleanup();

// Initialize input vectors

randomInitVector(h_vectorA);

randomInitMatrix(h_matrixB);

// Exibe o Vetor e a Matriz inicializada

#ifdef DEBMAT

printVector(h_vectorA);

printMatrix(h_matrixB);

#endif

// Alocate o vetor A, matriz B e vetor resultante C na memÃ³ria do device

cutilSafeCall( cudaMalloc((void**)&d_vectorA, sizeVector) );

cutilSafeCall( cudaMalloc((void**)&d_matrixB, sizeMatrix) );

cutilSafeCall( cudaMalloc((void**)&d_vectorC, sizeVector) );

// Copia o vetores/matriz da memÃ³ria do host para o device

cutilSafeCall( cudaMemcpy(d_vectorA,

              h_vectorA, sizeVector, cudaMemcpyHostToDevice) );

cutilSafeCall( cudaMemcpy(d_matrixB,

              h_matrixB, sizeMatrix, cudaMemcpyHostToDevice) );

int threadsPorBloco = BLOCK_SIZE;

dim3 block(threadsPorBloco,

       threadsPorBloco,

       1);

//dim3 grid ( (N_LINES + block.x -1 )/block.x,

//    (N_LINES + block.y -1 )/block.y,

//    1);

dim3 grid (N_LINES/block.x, N_LINES/block.y,1);

printf("Executtando GPU kernel...\n");

  cutilSafeCall( cudaThreadSynchronize() );

  cutilCheckError( cutResetTimer(hTimer) );

  cutilCheckError( cutStartTimer(hTimer) );

vectorMultArray<<<grid,block>>>(d_vectorA, d_matrixB, d_vectorC);

cutilCheckMsg(“A execucao da rotina vectorMultArray() falhou\n”);

  cutilSafeCall( cudaThreadSynchronize() );

  cutilCheckError( cutStopTimer(hTimer) );

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

#ifdef DEBSYNC

cutilSafeCall( cudaThreadSynchronize() );

#endif

// Copia o resultado da memoria do device para o host

// h_vectorC contÃ©m o resultado da memÃ³ria do host

cutilSafeCall( cudaMemcpy(h_vectorC,

              d_vectorC, sizeVector, cudaMemcpyDeviceToHost) );

//calcula serial

printf("Executando funcao na CPU...\n");

    cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

computaSerial(h_vectorT,h_vectorA,h_matrixB);

cutilCheckError( cutStopTimer(hTimer) );

printf("CPU time: %f msecs.\n", cutGetTimerValue(hTimer));

#ifdef DEB

printf("\n*****Vetor C Resultante do CUDA*****\n");

printVector(h_vectorC);

printf(“\nVetor T Resultante Serial*\n”);

printVector(h_vectorT);

#endif

// Verifica o resultado

int i;

for (i = 0; i < M_COLS; ++i) {

     if (fabs(h_vectorC[i] - h_vectorT[i]) > 1e-5){

    printf("\n** Problemas na linha do vetor: %d **\n",i);

        break;

    }

}

printf("%s \n", (i == M_COLS) ? "PASSOU" : "FALHOU");

cleanup();

}

/************************************************************

*FunÃ§Ã£o que imprime uma matriz *

*/

void printMatrix(float* m)

{

printf(" Matriz inicializada para:\n");

for(int i = 0 ; i < M_COLS ; i++)

  {

printf("\n");

for(int j = 0 ; j < N_LINES ; j++)

  {

    printf("%5.2f", m[ i * M_COLS +j ] );

  }

  }

printf("\n");

}

/************************************************************

*FunÃ§Ã£o que imprime um vetor *

*/

void printVector(float* v)

{

printf(" Vetor inicializado para: \n[");

for(int i = 0; i<N_LINES; i++)

printf("%5.2f ",v[i]);

printf(“]\n”);

}

/************************************************************

*FunÃ§Ã£o que libera a memÃ³ria do device e do host *

*/

void cleanup(void)

{

// Libera a memÃ³ria do device

if (d_vectorA)

    cudaFree(d_vectorA);

if (d_matrixB)

    cudaFree(d_matrixB);

if (d_vectorC)

    cudaFree(d_vectorC);

// Libera a memÃ³ria do host

if (h_vectorA)

    free(h_vectorA);

if (h_matrixB)

    free(h_matrixB);

if (h_vectorC)

    free(h_vectorC);

cutilSafeCall( cudaThreadExit() );

if (!noprompt) {

    printf("\nPressione ENTER para sair...\n");

    fflush( stdout);

    fflush( stderr);

    getchar();

}

exit(0);

}

/************************************************************

*FunÃ§Ã£o que gera o conteÃºdo aleatÃ³rio do vetor *

*/

void randomInitVector(float* v)

{

for (int i = 0; i < N_LINES; ++i)

  v[i] = RAND_MAT();

}

/************************************************************

FunÃ§Ã£o que gera o conteÃºdo aleatÃ³rio da matriz *
*(M.elementos + row * M.largura + col) *

*/

void randomInitMatrix(float* mat)

{

for (int i = 0; i < M_COLS; ++i)

  for(int j = 0; j < N_LINES; ++j)

mat[ i * M_COLS + j ] = RAND_MAT();

}

/************************************************************

FunÃ§Ã£o que calcula a multiplica de um vetor pela matriz *
faz um processamento serial e retorna em C *
*(M.elementos + row * M.largura + col) *

*/

void computaSerial(float* C,const float* A, const float* B)

{

for (unsigned int i = 0; i < M_COLS; ++i){

double sum = 0.0F;

for (unsigned int j = 0; j < N_LINES; ++j)

  {

double b = B[i * M_COLS + j];

sum += A[j] * b;

  }

  C[i] = (float)sum;

}

[/codebox]

Results ============================================================

=====

[b]Multiplica vetor por Matriz

A(n) x B(m,n) = C(m)

Executtando GPU kernel…

GPU time: 1103.490967 msecs.

Executando funcao na CPU…

CPU time: 0.462000 msecs.[/b]

Vetor C Resultante do CUDA

Vetor inicializado para:

[496.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 ]

Vetor T Resultante Serial*

Vetor inicializado para:

[512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 512.00 ]

** Problemas na linha do vetor: 0 **

FALHOU

Pressione ENTER para sair…

Topic		Replies	Views
Matrix by Vector multiplication CUDA Programming and Performance	8	1769	September 30, 2010
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	10104	January 18, 2012
best possible matrix-vector multiplication performance? poor guy with only an emulator wonders about CUDA Programming and Performance	6	5676	August 12, 2009
Matrix Multiplication Inconsistency Different values output in every run of the matixMul program CUDA Programming and Performance	29	8924	December 16, 2009
multi dimension array CUDA Programming and Performance	26	32916	February 12, 2010
problem of matrix multiplication vector x matrix CUDA Programming and Performance	4	1273	August 22, 2010
matrix multiplication CUDA Programming and Performance	10	3908	March 7, 2010
Matrix Multiplication Garbage value :( CUDA Programming and Performance	10	3478	July 25, 2009
Weird Matrix-Vector Results - Help? CUDA Programming and Performance	2	4963	April 6, 2010
Matrix multiplication ERRORS & few thoughts on CUDA Basic programming errors need correction CUDA Programming and Performance	14	13414	January 24, 2009

Problem with multiplication vector by array with big size

Related topics