Some issues with matrix calculation

I am new to the CUDA Zone.

Unluckily, my GPU is the most basic kind, a GeForce 210…

It doesn’t matter; that still leaves me 2 streaming multiprocessors.

I launched my first program using <<<…>>> (exciting~~) and compared the time cost on the CPU and the GPU:

76.89 s (CPU) vs. 23.28 s (GPU)

Disappointed!

But to my surprise, the results are incorrect!

[codebox]h_C[0] = 1068906752.000000 = h_A[0] + h_B[0] = 1068906784.000000

h_C[1] = 429811072.000000 = h_A[1] + h_B[1] = 429811056.000000

h_C[2] = 2324269568.000000 = h_A[2] + h_B[2] = 2324269600.000000

h_C[3] = 1003220096.000000 = h_A[3] + h_B[3] = 1003220096.000000

h_C[4] = 2390923520.000000 = h_A[4] + h_B[4] = 2390923456.000000

h_C[5] = 2594363648.000000 = h_A[5] + h_B[5] = 2594363712.000000

h_C[6] = 3930544128.000000 = h_A[6] + h_B[6] = 3930544256.000000

h_C[7] = 1411991936.000000 = h_A[7] + h_B[7] = 1411991924.000000

h_C[8] = 1954509568.000000 = h_A[8] + h_B[8] = 1954509568.000000

h_C[9] = 2879152640.000000 = h_A[9] + h_B[9] = 2879152512.000000[/codebox]

And I checked the data of h_A and h_B that had been passed to the device; the values before the copy and the values retrieved back afterwards are the same:

[codebox]Using device 0: GeForce 210

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

h_A[1] = 188649760.000000, h_B[1] = 241161296.000000

h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000

h_A[3] = 383138048.000000, h_B[3] = 620082048.000000

h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000

h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000

h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000

h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000

h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000

h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000

GPU Time = 0.03(s).

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

h_A[1] = 188649760.000000, h_B[1] = 241161296.000000

h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000

h_A[3] = 383138048.000000, h_B[3] = 620082048.000000

h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000

h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000

h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000

h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000

h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000

h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000[/codebox]

Apparently, the problem occurred on the GPU. Has anyone ever faced this kind of problem?

Here is my code:

[codebox]/*
 * =====================================================================
 *
 *    Filename:  mat_add.c
 *
 * Description:  demonstrate matrix add on the GPU
 *
 *     Version:  1.0
 *     Created:
 *    Revision:  none
 *    Compiler:  gcc
 *
 *      Author:  Shelling Hu (sh), home343@163.com
 *     Company:  Creative
 *
 * =====================================================================
 */

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include "cutil.h"

#define BLOCK_SIZE 16

#define WIDTH (200 * BLOCK_SIZE)

#define HEIGHT (200 * BLOCK_SIZE)

__global__ void MatrixAdd(float *C, float *A, float *B)

{

/*

int Index;

Index = blockIdx.y * WIDTH * blockDim.y + blockIdx.x * blockDim.x * blockDim.y

        + threadIdx.y * blockDim.x + threadIdx.x;

*/

int bx = blockIdx.x;

int by = blockIdx.y;

int tx = threadIdx.x;

int ty = threadIdx.y;

/* row-major linear index: WIDTH * row + col, with row = BLOCK_SIZE*by + ty and col = BLOCK_SIZE*bx + tx */

int Index = WIDTH * BLOCK_SIZE * by + BLOCK_SIZE * bx + WIDTH * ty + tx;

C[Index] = A[Index] + B[Index];

}

int main(int argc, char **argv)

{

CUT_DEVICE_INIT(argc, argv);

srand(2006);

int sizeMat = WIDTH * HEIGHT;

int memSize = sizeof(float) * sizeMat;

float *h_A;

float *h_B;

int i;

int j;

clock_t costTime;

float *h_C;

dim3 block(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WIDTH/block.x, HEIGHT/block.y);



/*variables for device*/

float *d_A, *d_B, *d_C;

CUDA_SAFE_CALL(cudaSetDevice(0));

h_A = (float *)malloc(memSize);

h_B = (float *)malloc(memSize);

h_C = (float *)malloc(memSize);

for (i = 0; i < sizeMat; i++)

{

	h_A[i] = rand();	/* note: large int values from rand() are rounded when stored as float */

	h_B[i] = rand();

}



for (i = 0; i < 10; i ++)

{

	printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);

}



CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, memSize));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, memSize));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, memSize));

CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice));

costTime = 0;

for (j = 0; j < 1; j++)

{

	costTime -= clock();

	MatrixAdd<<<grid, block>>>(d_C, d_A, d_B);

	CUDA_SAFE_CALL(cudaThreadSynchronize());

	costTime += clock();

}

printf("GPU Time = %g(s).\n", costTime/1000000.0f);

CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(h_A, d_A, memSize, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, memSize, cudaMemcpyDeviceToHost));

for (i = 0; i < 10; i ++)

{

	printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);

}

printf("---------------------------------------------------------\n");

for (i = 0; i < 10; i ++)

{

	printf("h_C[%d] = %f = h_A[%d] + h_B[%d] = %f\n", i, h_C[i], i, i, h_A[i] + h_B[i]);

}



free(h_A);

free(h_B);

free(h_C);

CUDA_SAFE_CALL(cudaFree(d_A));

CUDA_SAFE_CALL(cudaFree(d_B));

CUDA_SAFE_CALL(cudaFree(d_C));

CUT_EXIT(argc, argv);

return 0;

}

[/codebox]

The results agree to about 7 digits, which is all you can expect using single precision (the GeForce 210 does not support double precision anyway).
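To see this without the GPU at all, here is a minimal host-only sketch using the first pair of values from your output; on ordinary IEEE-754 hardware, storing the sum into a float rounds it the same way the GPU did:

[codebox]#include <stdio.h>

int main(void)
{
	float a = 835687808.0f;            /* h_A[0] from the output above */
	float b = 233218976.0f;            /* h_B[0] from the output above */
	float s = a + b;                   /* single-precision add, as on the GPU */
	double d = (double)a + (double)b;  /* exact for these particular inputs */

	printf("float  sum = %f\n", s);    /* 1068906752.000000 */
	printf("double sum = %f\n", d);    /* 1068906784.000000 */
	return 0;
}[/codebox]

Your CPU comparison only looks exact because the x86 FPU may carry extra precision in intermediate results; once the value has to live in a 32-bit float, the rounding is unavoidable.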

What?
You mean float addition only gives results accurate to about 7 digits?
That precision seems really bad.

You should measure the relative error, not the absolute error.

For example:

h_A[0] = 835687808.000000, h_B[0] = 233218976.000000

d_C[0] = 1068906752

h_A[0] + h_B[0] - d_C[0] = 32

but 32 / (h_A[0] + h_B[0]) = 2.9937e-008, which is right at the limit of single precision.
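A quick sketch of that check (relErr is just an illustrative helper, not a library function):

[codebox]#include <math.h>
#include <stdio.h>

/* relative error of a single-precision result c against the double sum of a and b */
double relErr(float a, float b, float c)
{
	double exact = (double)a + (double)b;
	return fabs(exact - (double)c) / fabs(exact);
}

int main(void)
{
	/* first entries from the output above */
	printf("%e\n", relErr(835687808.0f, 233218976.0f, 1068906752.0f));
	/* prints about 2.99e-08, below the round-off bound 2^-24 (about 6e-08) */
	return 0;
}[/codebox]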

Thanks, all.
It's because the machine stores floating-point numbers in a binary format.
The representation of decimal fractions as binary fractions is only approximate.
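For reference, the limits are spelled out in the standard <float.h> header; a tiny sketch:

[codebox]#include <float.h>
#include <stdio.h>

int main(void)
{
	printf("FLT_DIG     = %d\n", FLT_DIG);      /* 6 decimal digits guaranteed for float */
	printf("FLT_EPSILON = %e\n", FLT_EPSILON);  /* about 1.19e-07, i.e. 2^-23 */
	printf("DBL_DIG     = %d\n", DBL_DIG);      /* 15 decimal digits for double */
	return 0;
}[/codebox]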

But if only 7 digits are promised by single precision,
what if I need full precision? I'm doing some TIFF processing.

CUDA only supports single precision now.
How can I calculate a number beyond 9999999?
Seems like a stupid question? I have never used "float" before… forgive me.

You can use "double", which gives you about 15 digits of accuracy.

However, the 210 series does not support "double"; you need a card with compute capability 1.3.
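If you do move to a compute capability 1.3 card, the kernel change itself is small. A sketch, reusing the WIDTH and BLOCK_SIZE macros from the code above (note that you must compile with nvcc -arch=sm_13; otherwise nvcc demotes double to float with a warning):

[codebox]/* double-precision version of the kernel; requires compute capability 1.3 */
__global__ void MatrixAddDouble(double *C, const double *A, const double *B)
{
	int Index = WIDTH * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x
	          + WIDTH * threadIdx.y + threadIdx.x;
	C[Index] = A[Index] + B[Index];
}[/codebox]

The host side changes accordingly: the arrays become double, and memSize becomes sizeof(double) * sizeMat.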