2matrix addition

hi

I am majid… new in CUDA… i want to wrote program for 2dimentional array that add 2 matrix A & B and store the result in matrix C.

i write it based on ‘CUDA 2.3 programing guide’ and use 1dimentional array. but when i run my program the result is wrong. help me anyone please kindly.

entire program is:

#include <stdlib.h>

#include <stdio.h>

#include <conio.h>

#include <math.h>

#include <cuda.h>

#define N 32

#define M 16

void VecAdd();

// Device code

__global__ void VecAdd(float* A, float* B, float* C)

{

int i = blockIdx.x * blockDim.x + threadIdx.x;

int j = blockDim.y;

if (i < N && j < M)

C[j*N + i] = A[j*N + i] + B[j*N + i];

}

// Host code

int main()

{

int i;

FILE *out1;

size_t size_x = M*N * sizeof(float);

out1=fopen("output.txt","w+");

// Allocate input vectors h_A and h_B in host memory

float *h_A, *h_B, *h_C; 

h_A = (float *)malloc(size_x);

h_B = (float *)malloc(size_x);

h_C = (float *)malloc(size_x);

for(i = 0; i < M*N; i++) {

h_A[i] = 2;

h_B[i] = 1;

h_C[i] = 0;

}

// Allocate vectors in device memory

float* d_A;

cudaMalloc((void**)&d_A, size_x);

float* d_B;

cudaMalloc((void**)&d_B, size_x);

float* d_C;

cudaMalloc((void**)&d_C, size_x);

// Copy vectors from host memory to device memory

cudaMemcpy(d_A, h_A, size_x, cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, size_x, cudaMemcpyHostToDevice);

// Invoke kernel

dim3 dimBlock  (16,1,1);

dim3 dimGrid  (2,16);

VecAdd<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

// Copy result from device memory to host memory

// h_C contains the result in host memory

cudaMemcpy(h_C, d_C, size_x, cudaMemcpyDeviceToHost);

// Free device memory

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

for(i = 0; i < N*M; i++)

fprintf(out1,"%d, %f, %f, %f\n", i, h_A[i], h_B[i], h_C[i] );

printf( "Press inter");

getch();

}

Hi Majid,

Welcome.

I would suggest that you run your code in EmuDebug and make sure that the values of i and j (and j*N+i) are what you intended.

I think a problem is

int j = blockDim.y;

blockDim.y is effectively constant as 1, maybe you meant blockIdx.y

Cheers

this is my fault
tank you very much