Some issues with matrix calculation

I am new to the CUDA Zone.

Unluckily, my GPU is the most basic kind, a GeForce 210…

It doesn’t matter; that still leaves me 2 streaming multiprocessors.

I launched my first program using <<<…>>> (exciting~~) and compared the time cost on the CPU and the GPU:

76.89 s (CPU) vs. 23.28 s (GPU)

Disappointed!

But to my surprise, the results are incorrect!

[codebox]h_C[0] = 1068906752.000000 = h_A[0] + h_B[0] = 1068906784.000000

h_C[1] = 429811072.000000 = h_A[1] + h_B[1] = 429811056.000000

h_C[2] = 2324269568.000000 = h_A[2] + h_B[2] = 2324269600.000000

h_C[3] = 1003220096.000000 = h_A[3] + h_B[3] = 1003220096.000000

h_C[4] = 2390923520.000000 = h_A[4] + h_B[4] = 2390923456.000000

h_C[5] = 2594363648.000000 = h_A[5] + h_B[5] = 2594363712.000000

h_C[6] = 3930544128.000000 = h_A[6] + h_B[6] = 3930544256.000000

h_C[7] = 1411991936.000000 = h_A[7] + h_B[7] = 1411991924.000000

h_C[8] = 1954509568.000000 = h_A[8] + h_B[8] = 1954509568.000000

h_C[9] = 2879152640.000000 = h_A[9] + h_B[9] = 2879152512.000000[/codebox]

And I checked the data of h_A and h_B that had been passed to the device; the values before the copy and the values retrieved back afterwards are the same:

[codebox]Using device 0: GeForce 210

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

h_A[1] = 188649760.000000, h_B[1] = 241161296.000000

h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000

h_A[3] = 383138048.000000, h_B[3] = 620082048.000000

h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000

h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000

h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000

h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000

h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000

h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000

GPU Time = 0.03(s).

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

h_A[1] = 188649760.000000, h_B[1] = 241161296.000000

h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000

h_A[3] = 383138048.000000, h_B[3] = 620082048.000000

h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000

h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000

h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000

h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000

h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000

h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000[/codebox]

Apparently, the problem occurred on the GPU. Has anyone ever faced this kind of problem?

Here is my code:

[codebox]/*
 * =====================================================================
 *
 *    Filename:  mat_add.c
 *
 * Description:  demonstrate matrix add on the GPU
 *
 *     Version:  1.0
 *     Created:
 *    Revision:  none
 *    Compiler:  gcc
 *
 *      Author:  Shelling Hu (sh), home343@163.com
 *     Company:  Creative
 *
 * =====================================================================
 */

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include "cutil.h"

#define BLOCK_SIZE 16

#define WIDTH (200 * BLOCK_SIZE)

#define HEIGHT (200 * BLOCK_SIZE)

__global__ void MatrixAdd(float *C, float *A, float *B)

{

/*

int Index;

Index = blockIdx.y * WIDTH * blockDim.y + blockIdx.x * blockDim.x * blockDim.y

        + threadIdx.y * blockDim.x + threadIdx.x;

*/

int bx = blockIdx.x;

int by = blockIdx.y;

int tx = threadIdx.x;

int ty = threadIdx.y;

/* row-major linear index: WIDTH * row + col, with row = BLOCK_SIZE*by + ty and col = BLOCK_SIZE*bx + tx */

int Index = WIDTH * BLOCK_SIZE * by + BLOCK_SIZE * bx + WIDTH * ty + tx;

C[Index] = A[Index] + B[Index];

}

int main(int argc, char **argv)

{

CUT_DEVICE_INIT(argc, argv);

srand(2006);

int sizeMat = WIDTH * HEIGHT;

int memSize = sizeof(float) * sizeMat;

float *h_A;

float *h_B;

int i;

int j;

clock_t costTime;

float *h_C;

dim3 block(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WIDTH/block.x, HEIGHT/block.y);



/*variables for device*/

float *d_A, *d_B, *d_C;

CUDA_SAFE_CALL(cudaSetDevice(0));

h_A = (float *)malloc(memSize);

h_B = (float *)malloc(memSize);

h_C = (float *)malloc(memSize);

for (i = 0; i < sizeMat; i++)

{

	h_A[i] = rand();	/* note: large int values from rand() are rounded when stored as float */

	h_B[i] = rand();

}



for (i = 0; i < 10; i ++)

{

	printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);

}



CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, memSize));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, memSize));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, memSize));

CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice));

costTime = 0;

for (j = 0; j < 1; j++)

{

	costTime -= clock();

	MatrixAdd<<<grid, block>>>(d_C, d_A, d_B);

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	costTime += clock();

}

printf("GPU Time = %g(s).\n", costTime/1000000.0f);

CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(h_A, d_A, memSize, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, memSize, cudaMemcpyDeviceToHost));

for (i = 0; i < 10; i ++)

{

	printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);

}

printf("---------------------------------------------------------\n");

for (i = 0; i < 10; i ++)

{

	printf("h_C[%d] = %f = h_A[%d] + h_B[%d] = %f\n", i, h_C[i], i, i, h_A[i] + h_B[i]);

}



free(h_A);

free(h_B);

free(h_C);

CUDA_SAFE_CALL(cudaFree(d_A));

CUDA_SAFE_CALL(cudaFree(d_B));

CUDA_SAFE_CALL(cudaFree(d_C));

CUT_EXIT(argc, argv);

return 0;

}

[/codebox]

The results agree to about 7 digits, which is all you can expect using single precision (the GeForce 210 does not support double precision anyway).
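To see this without the GPU at all, here is a minimal host-only sketch using the first pair of values from your output; on ordinary IEEE-754 hardware, storing the sum into a float rounds it the same way the GPU did:

[codebox]#include <stdio.h>

int main(void)
{
	float a = 835687808.0f;            /* h_A[0] from the output above */
	float b = 233218976.0f;            /* h_B[0] from the output above */
	float s = a + b;                   /* single-precision add, as on the GPU */
	double d = (double)a + (double)b;  /* exact for these particular inputs */

	printf("float  sum = %f\n", s);    /* 1068906752.000000 */
	printf("double sum = %f\n", d);    /* 1068906784.000000 */
	return 0;
}[/codebox]

Your CPU comparison only looks exact because the x86 FPU may carry extra precision in intermediate results; once the value has to live in a 32-bit float, the rounding is unavoidable.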

What?
You mean float addition only gives results accurate to about 7 digits?
That precision seems really bad.

You should measure the relative error, not the absolute error.

For example:

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

d_C[0] = 1068906752

h_A[0] + h_B[0] - d_C[0] = 32

but 32 / (h_A[0] + h_B[0]) = 2.9937e-008, which is right at the limit of single precision.
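A quick sketch of that check (relErr is just an illustrative helper, not a library function):

[codebox]#include <math.h>
#include <stdio.h>

/* relative error of a single-precision result c against the double sum of a and b */
double relErr(float a, float b, float c)
{
	double exact = (double)a + (double)b;
	return fabs(exact - (double)c) / fabs(exact);
}

int main(void)
{
	/* first entries from the output above */
	printf("%e\n", relErr(835687808.0f, 233218976.0f, 1068906752.0f));
	/* prints about 2.99e-08, below the round-off bound 2^-24 (about 6e-08) */
	return 0;
}[/codebox]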

Thanks, all.
It's because the machine stores floating-point numbers in a binary format.
The representation of decimal fractions as binary fractions is only approximate.
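For reference, the limits are spelled out in the standard <float.h> header; a tiny sketch:

[codebox]#include <float.h>
#include <stdio.h>

int main(void)
{
	printf("FLT_DIG     = %d\n", FLT_DIG);      /* 6 decimal digits guaranteed for float */
	printf("FLT_EPSILON = %e\n", FLT_EPSILON);  /* about 1.19e-07, i.e. 2^-23 */
	printf("DBL_DIG     = %d\n", DBL_DIG);      /* 15 decimal digits for double */
	return 0;
}[/codebox]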

But if only 7 digits are promised by single precision,
what if I need full precision? I'm doing some TIFF processing.

CUDA only supports single precision now.
How can I calculate a number beyond 9999999?
Seems like a stupid question? I have never used "float" before… forgive me.

You can use "double", which gives you about 15 digits of accuracy.

However, the 210 series does not support "double"; you need a card with compute capability 1.3.
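If you do move to a compute capability 1.3 card, the kernel change itself is small. A sketch, reusing the WIDTH and BLOCK_SIZE macros from the code above (note that you must compile with nvcc -arch=sm_13; otherwise nvcc demotes double to float with a warning):

[codebox]/* double-precision version of the kernel; requires compute capability 1.3 */
__global__ void MatrixAddDouble(double *C, const double *A, const double *B)
{
	int Index = WIDTH * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x
	          + WIDTH * threadIdx.y + threadIdx.x;
	C[Index] = A[Index] + B[Index];
}[/codebox]

The host side changes accordingly: the arrays become double, and memSize becomes sizeof(double) * sizeMat.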