The program works in emulation mode but not on the GPU — help please...

Greetings!

I cannot understand why it does not work on the GPU while in emulation mode it is OK. The program adds the elements of two arrays element by element:

[1] [2] [3] [4]

+

[1] [2] [3] [4]

=

[2] [4] [6] [8]

The idea is shown above. This is not a copy of the actual source code, only a sketch written from memory, so minor discrepancies are possible; the essence is as follows:

MyProgramma.cpp

   extern "C" void

    Start (const int argc, const char ** argv);

     

    extern "C" void

    RaschdZnach (const int argc, const char ** argv, float *h_A, float *h_B, int AND Time, int &Quantity);

     

    int

    main (int argc, char ** argv)

    {

    printf ("Preparation initial data...\n");

       int Quantity = 0;

       int Q = 0;

       int Time = 10000;

       float *h_A;

       h_A = (float *) malloc (Time * sizeof (float));

       float *h_B;

       h_B = (float *) malloc (Time * sizeof (float));

       for (int i = 0; i <Time; i ++) h_B [i] =1.0;

     

     

    printf ("we Start calculation...\n");

    extern "C" void

    Start (argc, (const char **) argv);

     

    do {

     

    RaschdZnach (argc, (const char **) argv, h_A, h_B, Time, Quantity);

     

    }

     

    Quantity=Q+1;

     

    while (Quantity <Q);

     

    }

cppIntegrationMyProgramma.cu

   extern "C" void

    Start (const int argc, const char ** argv);

    {

    CUT_DEVICE_INIT (argc, argv);

    }

    extern "C" void

    RaschdZnach (const int argc, const char ** argv, float *h_A, float *h_B, int AND Time, int &Quantity);

    {

    float *d_A;

    float *d_B;

    CUDA_SAFE_CALL (cudaMalloc ((void **) &d_A, Time * sizeof (float)));

      if (Quantity! = 0)

        CUDA_SAFE_CALL (cudaMemcpy (d_A, h_A, sizeof (float) *TimeOfModeling, cudaMemcpyHostToDevice));

     

        CUDA_SAFE_CALL (cudaMalloc ((void **) &d_B, Time * sizeof (float)));

        CUDA_SAFE_CALL (cudaMemcpy (d_B, h_B, sizeof (float) *TimeOfModeling, cudaMemcpyHostToDevice));

     

     

     

            int blockSize = 4;

            int nBlocks = TimeOfModeling/blockSize + (TimeOfModeling%blockSize == 0? 0:1);

            CUDA_SAFE_CALL (cudaThreadSynchronize ());

            GPURas <<<nBlocks, blockSize>>> (d_A, Time, d_B);

            CUT_CHECK_ERROR ("GPURas () execution failed\n");

            CUDA_SAFE_CALL (cudaThreadSynchronize ());

     

     

    CUDA_SAFE_CALL (cudaMemcpy (h_A, d_B, sizeof (float) *TimeOfModeling, cudaMemcpyDeviceToHost));

     

            /* we Release memory */

    	CUDA_SAFE_CALL (cudaFree (d_A));

        	CUDA_SAFE_CALL (cudaFree (d_B));

    }

MersenneTwister_kernel.cu

    

    __ global __ void GPURas (float *d_A, int nTime, float *d_B)

    {

      int idx = blockIdx.x * blockDim.x + threadIdx.x;

      if (idx <nTime)

      {

     

      d_B [idx] = d_A [idx] +d_B [idx];

      

       }

And one more question: how can I reduce the amount of data transferred between the GPU and CPU (in both directions)?

For example, it is clear that there is no need to allocate memory for d_B and copy the data from h_B on every iteration — but the problem is that I do not know how to avoid it (say, if d_B is declared and filled from h_B in a separate function like this):

extern “C” void

Rasd_B (const int argc, const char ** argv, float *h_B, int AND Time);

{

float *d_B;

CUDA_SAFE_CALL (cudaMalloc ((void **) &d_B, Time * sizeof (float)));

CUDA_SAFE_CALL (cudaMemcpy (d_B, h_B, sizeof (float) *TimeOfModeling, cudaMemcpyHostToDevice));

}

How can I make d_B visible to RaschdZnach? :(