The code below is a simple example that should address your questions. If anything in it is unclear, please reply. (The areas specific to your questions are highlighted.)
The given example uses global memory.
Be cautious when using “constant” (`__constant__`) memory. It has the following characteristics (see the NVIDIA CUDA Programming Guide):
- It is limited to 64KB
- `__constant__` variables cannot be assigned to from the device, only from the host through host runtime functions (for example, cudaMemcpyToSymbol).
=============================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cutil.h>
// Number of columns in the sample matrices; also the row stride used by the
// kernel when it flattens (row, column) into a linear index.
#define MAX_COL 2
// Number of rows in the sample matrices.
#define MAX_ROW 8
// Threads per block along both x and y. The launch grid is computed with
// integer division, so MAX_COL and MAX_ROW should be multiples of this value.
#define BLOCK_DIM 2
/**
 * Element-wise, in-place addition of two MAX_ROW x MAX_COL row-major matrices:
 * d_a[row][col] += d_b[row][col].
 *
 * Expected launch layout: a 2D grid of 2D blocks in which thread (ix, iy)
 * handles column ix of row iy. Threads that fall outside the matrix do
 * nothing, so the grid may overshoot the matrix dimensions.
 *
 * @param d_a  pointer to MAX_ROW * MAX_COL elements; read, then overwritten
 *             with the element-wise sum
 * @param d_b  pointer to MAX_ROW * MAX_COL elements; only read
 */
__global__ void Sample_Kernel(unsigned int *d_a, unsigned int *d_b)
{
    int iy = blockDim.y * blockIdx.y + threadIdx.y;  // row of this thread
    int ix = blockDim.x * blockIdx.x + threadIdx.x;  // column of this thread

    // Guard against launches whose grid does not tile the matrix exactly.
    if (ix < MAX_COL && iy < MAX_ROW)
    {
        int idx = iy * MAX_COL + ix;  // row-major linear index
        d_a[idx] += d_b[idx];
    }
}
/* Print a readable message and terminate when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver for the sample:
 *   1. copies two MAX_ROW x MAX_COL host matrices to the device,
 *   2. launches Sample_Kernel (data1 += data2) and times it with a cutil timer,
 *   3. copies the results back and prints data1.
 *
 * Every CUDA runtime call is checked; on failure the program prints the error
 * and exits with EXIT_FAILURE. Returns 0 on success.
 */
int main(int argc, char** argv)
{
    unsigned int *devPtr1 = NULL;
    unsigned int *devPtr2 = NULL;
    unsigned int data1[MAX_ROW][MAX_COL] = {
        10, 20,   30, 40,   50, 60,   70, 80,
        90, 100,  200, 300, 400, 500, 600, 700
    };
    unsigned int data2[MAX_ROW][MAX_COL] = {
        1, 2,   3, 4,   5, 6,   7, 8,
        9, 10,  11, 12, 13, 14, 15, 16
    };
    const size_t size = (size_t)MAX_ROW * MAX_COL * sizeof(unsigned int);

    CUT_DEVICE_INIT();

    /* Allocate the device buffers and upload both input matrices. */
    checkCuda(cudaMalloc((void**)&devPtr1, size), "cudaMalloc(devPtr1)");
    checkCuda(cudaMalloc((void**)&devPtr2, size), "cudaMalloc(devPtr2)");
    checkCuda(cudaMemcpy(devPtr1, data1, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(devPtr1 <- data1)");
    checkCuda(cudaMemcpy(devPtr2, data2, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(devPtr2 <- data2)");

    unsigned int timer;
    cutCreateTimer(&timer);

    /* Integer division: assumes MAX_COL and MAX_ROW are multiples of BLOCK_DIM. */
    dim3 grid(MAX_COL / BLOCK_DIM, MAX_ROW / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    cutStartTimer(timer);
    Sample_Kernel<<<grid, threads>>>(devPtr1, devPtr2);
    /* A launch returns no status itself; configuration errors show up here. */
    checkCuda(cudaGetLastError(), "Sample_Kernel launch");
    /* Wait for the kernel so the timer covers its execution; this also
       surfaces errors raised while the kernel was running. */
    checkCuda(cudaThreadSynchronize(), "Sample_Kernel execution");
    cutStopTimer(timer);

    float naiveTime = cutGetTimerValue(timer);
    printf("\n\ntime taken: %0.3f ms\n", naiveTime);

    /* Download the results. The kernel only writes its first argument, so
       data2 comes back unchanged; the copy is kept from the original example. */
    checkCuda(cudaMemcpy(data1, devPtr1, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(data1 <- devPtr1)");
    checkCuda(cudaMemcpy(data2, devPtr2, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(data2 <- devPtr2)");
    checkCuda(cudaFree(devPtr1), "cudaFree(devPtr1)");
    checkCuda(cudaFree(devPtr2), "cudaFree(devPtr2)");

    printf("\n");
    for (int i = 0; i < MAX_ROW; i++)
    {
        for (int j = 0; j < MAX_COL; j++)
        {
            /* %u: the elements are unsigned int. */
            printf("data1[%d][%d] => %u\n", i, j, data1[i][j]);
        }
        printf("\n");
    }
    return 0;
}