// example0.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
/**
 * Copies every element of `a` into the same position of `b`.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread handles one element of
 * a row-major matrix whose width is gridDim.x * blockDim.x and whose height
 * is gridDim.y * blockDim.y.
 *
 * Preconditions: both buffers are device pointers holding at least
 * width * height floats. The kernel performs no bounds check, so the launch
 * must cover the data exactly.
 *
 * @param a  device input buffer (read only by this kernel)
 * @param b  device output buffer
 */
__global__ void incrementArrayOnDevice(float *a, float *b)
{
    // Row width follows the launch configuration instead of a hard-coded 4.
    const int width = gridDim.x * blockDim.x;

    // Global column and row of this thread. The row must be derived from
    // blockIdx.y: using blockIdx.x for both coordinates makes blocks that
    // share an x index write the same cells and leaves the rest untouched.
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // Row-major flattening: neighbouring threadIdx.x values hit neighbouring
    // floats.
    const int idx = iy * width + ix;
    b[idx] = a[idx];
}
/* Prints a readable message and exits when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Fills an N x N host matrix with 0..N*N-1, runs incrementArrayOnDevice on a
 * device copy of it, then prints the input and output side by side together
 * with the measured kernel time.
 *
 * @return 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main(void)
{
    const int N = 4;      /* matrix is N x N */
    const int TILE = 2;   /* threads per block along each axis */
    const size_t size = sizeof(float) * N * N;
    float *a, *b;         /* host buffers */
    float *a_d, *b_d;     /* device buffers */
    int i;

    /* The kernel has no bounds check, so the launch must tile the matrix
       exactly. */
    assert(N % TILE == 0);

    a = (float *)malloc(size);
    b = (float *)malloc(size);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(a);
        free(b);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void **)&a_d, size), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void **)&b_d, size), "cudaMalloc(b_d)");

    for (i = 0; i < N * N; i++) {
        a[i] = (float)i;
        b[i] = 0.0f;
    }

    checkCuda(cudaMemcpy(a_d, a, size, cudaMemcpyHostToDevice), "copy a to device");
    /* b is uploaded as zeros so any element the kernel fails to write is
       visible in the printout. */
    checkCuda(cudaMemcpy(b_d, b, size, cudaMemcpyHostToDevice), "copy b to device");

    dim3 dimBlock(TILE, TILE);
    dim3 dimGrid(N / TILE, N / TILE);

    /* Start execution and timing */
    unsigned int timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    incrementArrayOnDevice<<<dimGrid, dimBlock>>>(a_d, b_d);
    /* A launch reports configuration errors only through cudaGetLastError. */
    checkCuda(cudaGetLastError(), "kernel launch");
    /* Launches are asynchronous: wait for completion before stopping the
       timer, otherwise only the launch overhead is measured. */
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    /* Timer end */
    CUT_SAFE_CALL(cutStopTimer(timer));

    checkCuda(cudaMemcpy(a, a_d, size, cudaMemcpyDeviceToHost), "copy a to host");
    checkCuda(cudaMemcpy(b, b_d, size, cudaMemcpyDeviceToHost), "copy b to host");

    for (i = 0; i < N * N; i++) {
        printf("%f\t%f\n", a[i], b[i]);
    }
    printf("\nTime:%f ms\n", cutGetTimerValue(timer));
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    free(a);
    free(b);
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    return 0;
}
I tried swapping iy and ix, yet I ended up with the same result. I also incremented ix by 1 and iy by 1, but I get the same answer every time. How do the values I initialise in the program, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
map onto a grid of 4 thread blocks, each block containing 4 threads?