Why does the vector addition SDK example fail when using zero-copy memory?

I modified the SDK vector addition example to use zero-copy memory, but for some reason the result isn't what it should be.

I’m attaching the code:

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 */

/* Vector addition: C = A + B.

 *

 * This sample is a very basic sample that implements element by element

 * vector addition. It is the same as the sample illustrating Chapter 3

 * of the programming guide with some additions like error checking.

 *

 */

// Includes

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Variables

// Host-side pointers to the two input vectors (h_A, h_B) and the result
// vector (h_C). main() allocates each with cudaHostAlloc(...,
// cudaHostAllocMapped) and Cleanup() releases them with cudaFreeHost().
float* h_A;

float* h_B;

float* h_C;

// Device-side pointers for the same three buffers. main() fills them in with
// cudaHostGetDevicePointer() from the matching h_* pointer and passes them to
// the VecAdd kernel; they are never allocated or freed on their own.
float* d_A;

float* d_B;

float* d_C;

// Not referenced anywhere else in the code shown here.
bool noprompt = false;

// Functions

// Frees the host buffers, resets the device and exits the process.
void Cleanup(void);

// Fills the first n entries of the given array with rand() / RAND_MAX values.
void RandomInit(float*, int);

// Device code

// Element-wise vector addition: C[k] = A[k] + B[k] for every k in [0, N).
//
// Each thread handles at most one element, selected by its flat 1-D global
// index, so the launch must supply at least N threads in total. Threads whose
// index falls at or beyond N do nothing.
//
//   A, B  input vectors, read only
//   C     output vector
//   N     number of elements to process
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads in the last block have no element to work on.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}

// Host code

// Prints a diagnostic and terminates the program with a failure status when a
// CUDA runtime call did not return cudaSuccess. `what` names the failed step.
static void CheckCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two random vectors on the GPU through zero-copy (mapped, pinned host)
// memory and checks the result against a CPU reference.
//
// Prints "PASSED" or "FAILED" and then hands over to Cleanup(), which frees
// the buffers and exits the process. Any CUDA runtime error aborts the run
// with a message on stderr and a non-zero exit status.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf("Vector addition\n");

    const int N = 50000;
    const size_t size = N * sizeof(float);

    // Zero-copy only works on devices that can map host memory. No device is
    // selected explicitly, so the runtime's default device 0 is the one
    // queried here.
    cudaDeviceProp prop;
    CheckCuda(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties");
    if (!prop.canMapHostMemory) {
        fprintf(stderr, "Device 0 cannot map host memory; zero-copy is unavailable\n");
        exit(EXIT_FAILURE);
    }

    // Must be set before any call that creates the device context.
    CheckCuda(cudaSetDeviceFlags(cudaDeviceMapHost), "cudaSetDeviceFlags");

    // Allocate the vectors as pinned host memory mapped into the device
    // address space (no separate device allocation, no explicit copies).
    CheckCuda(cudaHostAlloc((void**)&h_A, size, cudaHostAllocMapped), "cudaHostAlloc(h_A)");
    CheckCuda(cudaHostAlloc((void**)&h_B, size, cudaHostAllocMapped), "cudaHostAlloc(h_B)");
    CheckCuda(cudaHostAlloc((void**)&h_C, size, cudaHostAllocMapped), "cudaHostAlloc(h_C)");

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Obtain the device-side aliases of the mapped host buffers.
    CheckCuda(cudaHostGetDevicePointer(&d_A, h_A, 0), "cudaHostGetDevicePointer(h_A)");
    CheckCuda(cudaHostGetDevicePointer(&d_B, h_B, 0), "cudaHostGetDevicePointer(h_B)");
    CheckCuda(cudaHostGetDevicePointer(&d_C, h_C, 0), "cudaHostGetDevicePointer(h_C)");

    // Invoke kernel
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // A launch does not return a status: configuration errors show up via
    // cudaGetLastError(), execution errors at the next synchronization. The
    // synchronization is also what makes the kernel's writes to h_C safe to
    // read on the host.
    CheckCuda(cudaGetLastError(), "VecAdd launch");
    CheckCuda(cudaDeviceSynchronize(), "VecAdd execution");

    // Verify result
    int i;
    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];
        if (fabsf(h_C[i] - sum) > 1e-5f)
            break;
    }
    printf("%s\n", (i == N) ? "PASSED" : "FAILED");

    // Cleanup() frees the buffers and exits the process; it does not return.
    Cleanup();
    return 0;
}

// Releases the three host buffers, resets the device and exits the process
// with status 0. This function does not return.
//
// Cleanup is best effort: a failing cudaFreeHost() or cudaDeviceReset() is
// reported on stderr instead of being silently dropped, but never prevents
// the remaining steps from running.
void Cleanup(void)
{
    // Free host memory
    float** buffers[] = { &h_A, &h_B, &h_C };
    const int bufferCount = (int)(sizeof(buffers) / sizeof(buffers[0]));

    for (int k = 0; k < bufferCount; ++k) {
        // Buffers that were never allocated are still null and are skipped.
        if (*buffers[k]) {
            cudaError_t status = cudaFreeHost(*buffers[k]);
            if (status != cudaSuccess)
                fprintf(stderr, "Cleanup: cudaFreeHost failed: %s\n",
                        cudaGetErrorString(status));
            *buffers[k] = NULL;
        }
    }

    cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess)
        fprintf(stderr, "Cleanup: cudaDeviceReset failed: %s\n",
                cudaGetErrorString(resetStatus));

    exit(0);
}

// Fills the first n entries of an existing array with pseudo-random floats.
//
// Each entry is rand() divided by RAND_MAX, so values lie in [0, 1]. The
// caller owns the storage and is responsible for seeding rand() if a
// particular sequence is wanted. Nothing is written when n <= 0.
void RandomInit(float* data, int n)
{
    int filled = 0;

    while (filled < n) {
        data[filled] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        ++filled;
    }
}

Edit: I fixed the mistakes; the above program now works as it should.

> Edit: I fixed the mistakes; the above program now works as it should.

Well, what actually was the error, and how did you fix it? Could you please elaborate?