help! kernel not launching

I have no idea why, this seems really straight forward, all I want to do is add 2 numbers, but the kernel isnt launching. thanks so much.

// Example 1 for testing

#include "stdio.h"

#include "cuda.h"

__global__ void AddTwoNos(float* dA, float* dB, float* dC, int ksize)

{

   int i;

   for (i=1;i<2;i++)

   {

	 dC[i] = dA[i] + dB[i];

   } 

}

int main()

{

   int N = 1;

   size_t size=  N * sizeof(float);

   float A,B,C;

   float* pA;

   float* pB;

   float* pC;

float* hA;

   float* hB;

   float* hC;

A=1.0;

   B=2.0;

   C=0.0;

hA=&A;

   hB=&B;

   hC=&C;

cudaMalloc((void**)&pA, size);  

   cudaMemcpy(pA,hA,size,cudaMemcpyHostToDevice);

cudaMalloc((void**)&pB, size);

   cudaMemcpy(pB,hB,size,cudaMemcpyHostToDevice);

cudaMalloc((void**)&pC, size);

//   int threadsPerBlock = 256;

//   int blocksPerGrid = (N + threadsPerBlock -1)/threadsPerBlock;

AddTwoNos<<<32,256>>>(pA,pB,pC,N);

cudaMemcpy(hC,pC,size,cudaMemcpyDeviceToHost);

cudaFree(pA);

   cudaFree(pB);

   cudaFree(pC);

printf("%f\n",C);

}

and its printing the answer as 0

Looks like you’re using 32 blocks of 256 threads to do a single addition.
Also, array indexing starts at 0 instead of one

float a = 1.0f;
float* ap = &a;

means

ap[0] == 1.0f

N.

haha yea, I was just messing around with the thread blocks and stuff, so I changed that back to one.

I dont know how I didnt catch the array thing. Ive just been jumping around on too many platforms I think., That fixed it though, thanks a ton.

Chris