I am new in cuda zone. External Image
unlucky… My GPU is the most unsophisticated type Geforce 210 series…
It doesn’t matter, 2 stream multiprocessor left for me
I launched my first program using <<<…>>> (exciting~~)and compared the time cost in CPU and GPU
76.89(s) VS 23.28(s)
Disappointed!
But to my surprise the results are incorrect!
[codebox]h_C[0] = 1068906752.000000 = h_A[0] + h_B[0] = 1068906784.000000
h_C[1] = 429811072.000000 = h_A[1] + h_B[1] = 429811056.000000
h_C[2] = 2324269568.000000 = h_A[2] + h_B[2] = 2324269600.000000
h_C[3] = 1003220096.000000 = h_A[3] + h_B[3] = 1003220096.000000
h_C[4] = 2390923520.000000 = h_A[4] + h_B[4] = 2390923456.000000
h_C[5] = 2594363648.000000 = h_A[5] + h_B[5] = 2594363712.000000
h_C[6] = 3930544128.000000 = h_A[6] + h_B[6] = 3930544256.000000
h_C[7] = 1411991936.000000 = h_A[7] + h_B[7] = 1411991924.000000
h_C[8] = 1954509568.000000 = h_A[8] + h_B[8] = 1954509568.000000
h_C[9] = 2879152640.000000 = h_A[9] + h_B[9] = 2879152512.000000[/codebox]
and I checked the data of h_A and h_B that had been passed to the device; the values before the copy and the ones retrieved back afterwards are the same.
[codebox]Using device 0: GeForce 210
h_A[0] = 835687808.000000, h_B[0] = 233218976.000000
h_A[1] = 188649760.000000, h_B[1] = 241161296.000000
h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000
h_A[3] = 383138048.000000, h_B[3] = 620082048.000000
h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000
h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000
h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000
h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000
h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000
h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000
GPU Time = 0.03(s).
h_A[0] = 835687808.000000, h_B[0] = 233218976.000000
h_A[1] = 188649760.000000, h_B[1] = 241161296.000000
h_A[2] = 293071904.000000, h_B[2] = 2031197696.000000
h_A[3] = 383138048.000000, h_B[3] = 620082048.000000
h_A[4] = 626796736.000000, h_B[4] = 1764126720.000000
h_A[5] = 987687872.000000, h_B[5] = 1606675840.000000
h_A[6] = 2146680448.000000, h_B[6] = 1783863808.000000
h_A[7] = 1392509696.000000, h_B[7] = 19482228.000000
h_A[8] = 1705589760.000000, h_B[8] = 248919808.000000
h_A[9] = 1430608256.000000, h_B[9] = 1448544256.000000[/codebox]
Apparently, the problem occurred on the GPU. Has anyone else faced this kind of problem?
Show you my code:
[codebox]/*
 * =====================================================================
 *
 *    Filename:  mat_add.cu
 *
 *    Description:  demonstrate matrix addition on the GPU with CUDA
 *
 *    Version:  1.0
 *
 *    Created:
 *
 *    Revision:  none
 *
 *    Compiler:  nvcc
 *
 *    Author:  Shelling Hu (sh), home343@163.com
 *
 *    Company:  Creative
 *
 * =====================================================================
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cutil.h"
#define BLOCK_SIZE 16
#define WIDTH (200 * BLOCK_SIZE)
#define HEIGHT (200 * BLOCK_SIZE)
/*
 * Element-wise addition of two row-major WIDTH x HEIGHT matrices:
 * C = A + B.
 *
 * Expected launch: grid of (WIDTH/BLOCK_SIZE, HEIGHT/BLOCK_SIZE) blocks,
 * each block BLOCK_SIZE x BLOCK_SIZE threads, so every thread handles
 * exactly one element.
 *
 * Note: "global" / "*\B" in the original post are forum-rendering
 * damage; the qualifier must be __global__ and the parameter plain *B.
 */
__global__ void MatrixAdd(float *C, float *A, float *B)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    /* Flat index: row = BLOCK_SIZE*by + ty, col = BLOCK_SIZE*bx + tx,
       linearized as row * WIDTH + col. */
    int Index = WIDTH * BLOCK_SIZE * by + BLOCK_SIZE * bx + WIDTH * ty + tx;
    /* Guard against out-of-range threads in case the grid ever
       over-covers the data (harmless here since WIDTH and HEIGHT are
       exact multiples of BLOCK_SIZE). */
    if (Index < WIDTH * HEIGHT)
        C[Index] = A[Index] + B[Index];
}
/*
 * Host driver: fills two WIDTH x HEIGHT matrices with random values,
 * adds them on the GPU, and prints the first ten entries for a
 * visual check.
 *
 * NOTE on the "incorrect" results in the original post: rand() returns
 * values up to RAND_MAX (often 2^31 - 1), but a float carries only a
 * 24-bit mantissa, so both the stored inputs and the sums are rounded.
 * In addition, the host may evaluate the reference sum in extra
 * precision before printf promotes it to double.  The small mismatches
 * are rounding, not a GPU defect; the reference sum below is therefore
 * cast to float so both sides are compared at single precision.
 */
int main(int argc, char **argv)
{
    CUT_DEVICE_INIT(argc, argv);
    srand(2006);                                  /* fixed seed: reproducible data */

    int sizeMat = WIDTH * HEIGHT;                 /* element count            */
    int memSize = sizeof(float) * sizeMat;        /* bytes per matrix         */
    float *h_A, *h_B, *h_C;                       /* host buffers             */
    float *d_A, *d_B, *d_C;                       /* device buffers           */
    int i;
    clock_t costTime;
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);           /* 16x16 = 256 threads/block */
    dim3 grid(WIDTH / block.x, HEIGHT / block.y); /* exact cover: WIDTH, HEIGHT
                                                     are multiples of BLOCK_SIZE */

    CUDA_SAFE_CALL(cudaSetDevice(0));

    h_A = (float *)malloc(memSize);
    h_B = (float *)malloc(memSize);
    h_C = (float *)malloc(memSize);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    /* rand() values near 2^31 are rounded when stored in a float. */
    for (i = 0; i < sizeMat; i++) {
        h_A[i] = (float)rand();
        h_B[i] = (float)rand();
    }
    for (i = 0; i < 10; i++)
        printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);

    CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, memSize));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, memSize));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, memSize));
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice));

    costTime = -clock();
    MatrixAdd<<<grid, block>>>(d_C, d_A, d_B);
    CUDA_SAFE_CALL(cudaGetLastError());       /* catch launch-config errors  */
    CUDA_SAFE_CALL(cudaThreadSynchronize());  /* kernel launch is async: wait
                                                 before stopping the clock   */
    costTime += clock();
    /* clock() ticks at CLOCKS_PER_SEC, which need not be 1000000. */
    printf("GPU Time = %g(s).\n", (double)costTime / CLOCKS_PER_SEC);

    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_A, d_A, memSize, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, memSize, cudaMemcpyDeviceToHost));

    for (i = 0; i < 10; i++)
        printf("h_A[%d] = %f, h_B[%d] = %f\n", i, h_A[i], i, h_B[i]);
    printf("---------------------------------------------------------\n");
    /* Cast the reference sum to float so it is rounded exactly like the
       GPU result; without the cast the host comparison looks "wrong". */
    for (i = 0; i < 10; i++)
        printf("h_C[%d] = %f = h_A[%d] + h_B[%d] = %f\n",
               i, h_C[i], i, i, (float)(h_A[i] + h_B[i]));

    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
    CUT_EXIT(argc, argv);
    return 0;
}
[/codebox]