VectorAdd can compile, but output is odd.

Hello. I made this code to run vectorAdd.

Compile is good and no problem.

But output of this code is odd that starting part is not added.(Last part is good.)

what’s the problem?

I’m using VS2010 & CUDA4.0 & WIN7.

#include <stdio.h>

#include <cuda.h>

#include <cuda_runtime.h> // c++ - style function

#include <cuda_runtime_api.h> // c - style function

#include <device_launch_parameters.h> //This header has launch parameters such as blockIdx.

#include <device_functions.h>

void cudaErr(const char *msg);

global void VectorAdd(int* a, int* b, int* c)

{

int tid = blockIdx.x+blockDim.x+threadIdx.x;

c[tid] = a[tid] + b[tid];

}

int main()

{

const int size = 128*64;

cudaSetDevice(0); 

int* InputA = new int;

int* InputB = new int;

int* Result = new int;

for(int i = 0; i < size; i++)

{

	InputA[i] = i;

	InputB[i] = i;

	Result[i] = 0;

}// Initialization of Host Memory

for(int i = 0; i< 3 ; i++)

{

	printf("InputA[%d] : %d    InputB[%d]  : %d\n",i,InputA[i],i,InputB[i]);

}

printf(".....\n");



int* dev_A;

int* dev_B;

int* dev_R; 

cudaMalloc((void**)&dev_A,size*sizeof(int));

cudaMalloc((void**)&dev_B,size*sizeof(int));

cudaMalloc((void**)&dev_R,size*sizeof(int));

cudaErr("Malloc to device");

cudaMemcpy(dev_A,InputA,size*sizeof(int),cudaMemcpyHostToDevice);

cudaMemcpy(dev_B,InputB,size*sizeof(int),cudaMemcpyHostToDevice);

cudaErr("memcpy HtD");	

VectorAdd<<<65535,512>>>(dev_A,dev_B, dev_R); 

cudaErr("VectorSum");

cudaMemcpy(Result,dev_R,size*sizeof(int),cudaMemcpyDeviceToHost);

cudaErr("memcpy DtH");

for(int i = 0; i< 5 ; i++)

{

	printf("Result[%d] : %d\n",i,Result[i]);

}

printf(".....\n");

for(int i = size-5; i< size; i++)

{

	printf("Result[%d] : %d\n",i,Result[i]);

}

cudaFree(dev_A);

cudaFree(dev_B);

cudaFree(dev_R);

delete[] InputA;

delete[] InputB;

delete[] Result;



return 0;

}

void cudaErr(const char *msg){

cudaError_t err = cudaGetLastError();

if( err !=cudaSuccess) {

printf(“%d %s %s \n”, err, msg, cudaGetErrorString(err) );

}

}

int tid = blockIdx.x*blockDim.x+threadIdx.x;

Oh Thanks!

Why did I do?// Anyway, Thanks!