Why does my streaming vector add fail?

I have written the following code to do streaming vector addition on the GPU, but I believe that my arithmetic is somehow flawed, because it works for small input sizes but fails for larger ones.

Here is the code


 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.


 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.



/* Vector addition: C = A + B.


 * This sample is a very basic sample that implements element by element

 * vector addition. It is the same as the sample illustrating Chapter 3

 * of the programming guide with some additions like error checking.



// Includes

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "timer.h"

#define GPU_MEM_SIZE 512*1024*1024

// Variables

float* h_A;

float* h_B;

float* h_C;

float* d_A;

float* d_B;

float* d_C;

bool noprompt = false;

// Functions

void Cleanup(void);

void RandomInit(float*, int);

// Device code

__global__ void VecAdd(const float* A, const float* B, float* C, int N)


    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)

        C[i] = A[i] + B[i];


// Host code

int main(int argc, char** argv)


int blockSize=(GPU_MEM_SIZE/3);

     int numEl=atoi(argv[1]);

   // int num64=(16*1024*1024);

    printf("Vector addition\n");

int N = numEl;    

printf("adding two vectors of %d Elements \n",N);

timer elapsed_timer;

int size=N*sizeof(float);

// Allocate input vectors h_A and h_B in host memory

    h_A = (float*)malloc(size);

    if (h_A == 0) Cleanup();

    h_B = (float*)malloc(size);

    if (h_B == 0) Cleanup();

    h_C = (float*)malloc(size);

    if (h_C == 0) Cleanup();

    printf("allocated host memory \n");

    // Initialize input vectors

    RandomInit(h_A, N);

    RandomInit(h_B, N);

printf("initialized host memory \n");

    // Allocate vectors in device memory

 cudaMalloc((void**)&d_A, blockSize) ;

 cudaMalloc((void**)&d_B, blockSize) ;

 cudaMalloc((void**)&d_C, blockSize) ;

printf("allocated device memory \n");

int iter=0;

int numElRem=N-(blockSize/sizeof(float))*iter;

for (iter=0;numElRem>0;iter++){


    // Copy vectors from host memory to device memory

 cudaMemcpy(d_A, h_A+blockSize*iter, numElRem*sizeof(float), cudaMemcpyHostToDevice) ;

 cudaMemcpy(d_B, h_B+blockSize*iter, numElRem*sizeof(float), cudaMemcpyHostToDevice) ;

// Invoke kernel

int numElKernel=(numElRem>blockSize/sizeof(float))?blockSize/sizeof(float):numElRem;

    int threadsPerBlock = 256;

    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B,  d_C, numElKernel);

// Copy result from device memory to host memory

    // h_C contains the result in host memory

 cudaMemcpy(h_C+blockSize*iter, d_C, numElRem*sizeof(float), cudaMemcpyDeviceToHost) ;


float elapsed_time=elapsed_timer.milliseconds_elapsed();

    // Verify result

    int i;

    for (i = 0; i < N; ++i) {

        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-5)



    printf("%s\n", (i == N) ? "PASSED" : "FAILED");

printf("time elapsed in milliseconds is %f \n",elapsed_time);



void Cleanup(void)


    // Free device memory

    if (d_A)


    if (d_B)


    if (d_C)


// Free host memory

    if (h_A)


    if (h_B)


    if (h_C)





// Allocates an array with random float entries.

void RandomInit(float* data, int n)


    for (int i = 0; i < n; ++i)

        data[i] = rand() / (float)RAND_MAX;


Any help will be appreciated.


What sizes does it work correctly on, and at what size(s) does it fail on? Also, what manner of failure is it?

I think that it should work now. I haven't fully tested it yet, but it definitely looks (and runs) better.

vectorAdd2.cu (4.17 KB)