I have no idea why, this seems really straight forward, all I want to do is add 2 numbers, but the kernel isnt launching. thanks so much.
// Example 1 for testing
#include "stdio.h"
#include "cuda.h"
__global__ void AddTwoNos(float* dA, float* dB, float* dC, int ksize)
{
int i;
for (i=1;i<2;i++)
{
dC[i] = dA[i] + dB[i];
}
}
int main()
{
int N = 1;
size_t size= N * sizeof(float);
float A,B,C;
float* pA;
float* pB;
float* pC;
float* hA;
float* hB;
float* hC;
A=1.0;
B=2.0;
C=0.0;
hA=&A;
hB=&B;
hC=&C;
cudaMalloc((void**)&pA, size);
cudaMemcpy(pA,hA,size,cudaMemcpyHostToDevice);
cudaMalloc((void**)&pB, size);
cudaMemcpy(pB,hB,size,cudaMemcpyHostToDevice);
cudaMalloc((void**)&pC, size);
// int threadsPerBlock = 256;
// int blocksPerGrid = (N + threadsPerBlock -1)/threadsPerBlock;
AddTwoNos<<<32,256>>>(pA,pB,pC,N);
cudaMemcpy(hC,pC,size,cudaMemcpyDeviceToHost);
cudaFree(pA);
cudaFree(pB);
cudaFree(pC);
printf("%f\n",C);
}
and its printing the answer as 0