Why is it that the value displayed by nNum is 19? What I am trying to do is add the indices of the threads to nNum so the value is supposed to be 0+1+2+3…+19. It turns out that the value displayed is the addition of the last index of the thread to nNum.
This is my code.
main.cpp:
#include <stdio.h>
#include <stdlib.h>
#define NUM 20
extern “C” void test (int* nNum, int nSize);
int main(int argc, char *argv)
{
int temp = 0;
test(&temp, NUM);
printf (“%d\n”, temp);
}
threadstest.cu:
global void compute_testd(int* nNum, int nSize)
{
unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < nSize)
{
*nNum += index;
}
}
extern “C” void test (int* nNum, int nSize)
{
int* nNumd;
cudaMalloc((void**)&nNumd, sizeof(int));
cudaMemcpy(nNumd, nNum, sizeof(int), cudaMemcpyHostToDevice);
compute_testd<<< ceil((float)nSize/256.0f), 256>>> (nNumd, nSize);
cudaMemcpy(nNum, nNumd, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(nNumd);
}
And is there a way for us to put this loop into CUDA:
for (int i = 0; i < 20; i++)
{
for (int j = 0; j < 20; j++)
{
array[i][j] = 3;
}
}