hi
I am majid… new in CUDA… i want to wrote program for 2dimentional array that add 2 matrix A & B and store the result in matrix C.
i write it based on ‘CUDA 2.3 programing guide’ and use 1dimentional array. but when i run my program the result is wrong. help me anyone please kindly.
entire program is:
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <math.h>
#include <cuda.h>
#define N 32
#define M 16
void VecAdd();
// Device code
__global__ void VecAdd(float* A, float* B, float* C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockDim.y;
if (i < N && j < M)
C[j*N + i] = A[j*N + i] + B[j*N + i];
}
// Host code
int main()
{
int i;
FILE *out1;
size_t size_x = M*N * sizeof(float);
out1=fopen("output.txt","w+");
// Allocate input vectors h_A and h_B in host memory
float *h_A, *h_B, *h_C;
h_A = (float *)malloc(size_x);
h_B = (float *)malloc(size_x);
h_C = (float *)malloc(size_x);
for(i = 0; i < M*N; i++) {
h_A[i] = 2;
h_B[i] = 1;
h_C[i] = 0;
}
// Allocate vectors in device memory
float* d_A;
cudaMalloc((void**)&d_A, size_x);
float* d_B;
cudaMalloc((void**)&d_B, size_x);
float* d_C;
cudaMalloc((void**)&d_C, size_x);
// Copy vectors from host memory to device memory
cudaMemcpy(d_A, h_A, size_x, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size_x, cudaMemcpyHostToDevice);
// Invoke kernel
dim3 dimBlock (16,1,1);
dim3 dimGrid (2,16);
VecAdd<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Copy result from device memory to host memory
// h_C contains the result in host memory
cudaMemcpy(h_C, d_C, size_x, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
for(i = 0; i < N*M; i++)
fprintf(out1,"%d, %f, %f, %f\n", i, h_A[i], h_B[i], h_C[i] );
printf( "Press inter");
getch();
}