Greetings!
I cannot understand why the program does not work on the GPU, while in emulation mode it is OK. The program adds the elements of two arrays element-wise:
[1] [2] [3] [4]
+
[1] [2] [3] [4]
=
[2] [4] [6] [8]
The idea is shown above. The original code is not copied here verbatim — only a rewritten version, so minor discrepancies are possible, but the essence is as follows:
MyProgramma.cpp
/* Forward declarations of the GPU entry points implemented in
 * cppIntegrationMyProgramma.cu.
 * NOTE: the original post had "int AND Time" — a machine-translation
 * artifact; the parameter is plain `int Time`. */
extern "C" void
Start (const int argc, const char ** argv);
extern "C" void
RaschdZnach (const int argc, const char ** argv, float *h_A, float *h_B, int Time, int &Quantity);
/* Host driver: prepares two Time-element buffers, initializes the device,
 * then runs RaschdZnach in a loop.  Returns 0 on success, 1 on allocation
 * failure. */
int
main (int argc, char ** argv)
{
    printf ("Preparation initial data...\n");

    int Quantity = 0;   /* iteration counter, passed by reference to RaschdZnach */
    int Q = 0;          /* loop bound; set this to the desired number of extra iterations */
    int Time = 10000;   /* number of elements in each array */

    /* Check malloc results: a NULL here would otherwise surface later as a
       hard-to-diagnose cudaMemcpy failure. */
    float *h_A = (float *) malloc (Time * sizeof (float));
    float *h_B = (float *) malloc (Time * sizeof (float));
    if (h_A == NULL || h_B == NULL) {
        fprintf (stderr, "host allocation failed\n");
        return 1;
    }

    /* 1.0f, not 1.0 — the bare 1.0 is a double constant. */
    for (int i = 0; i < Time; i++) h_B[i] = 1.0f;

    printf ("we Start calculation...\n");

    /* The original repeated `extern "C" void` in front of the call, which
       turns the line into an (invalid) declaration — just call the function. */
    Start (argc, (const char **) argv);

    /* The original do/while was syntactically broken: the counter update sat
       between `}` and `while`, outside the loop body.  As written (Q == 0)
       the body executes exactly once, same as the original intended run. */
    do {
        RaschdZnach (argc, (const char **) argv, h_A, h_B, Time, Quantity);
        Quantity++;
    } while (Quantity < Q);

    free (h_A);
    free (h_B);
    return 0;
}
cppIntegrationMyProgramma.cu
/* One-time device initialization (selects the CUDA device from argv via the
 * CUT utility macro).  The original had a stray ';' between the signature and
 * the body, which turns the definition into a declaration followed by an
 * orphan block — removed. */
extern "C" void
Start (const int argc, const char ** argv)
{
    CUT_DEVICE_INIT (argc, argv);
}
/* One GPU step: computes h_A = d_A + h_B element-wise for Time floats.
 * On calls after the first (Quantity != 0), d_A is seeded from h_A so the
 * result accumulates across calls.
 * Fixes vs. the original snippet: stray ';' after the signature removed,
 * "Quantity! = 0" → "Quantity != 0", garbled "int AND Time" → "int Time",
 * and every use of the undeclared name TimeOfModeling replaced with the
 * Time parameter. */
extern "C" void
RaschdZnach (const int argc, const char ** argv, float *h_A, float *h_B, int Time, int &Quantity)
{
    (void) argc;   /* unused here; kept for interface compatibility */
    (void) argv;

    const size_t bytes = Time * sizeof (float);

    float *d_A;
    float *d_B;
    CUDA_SAFE_CALL (cudaMalloc ((void **) &d_A, bytes));
    CUDA_SAFE_CALL (cudaMalloc ((void **) &d_B, bytes));

    if (Quantity != 0) {
        CUDA_SAFE_CALL (cudaMemcpy (d_A, h_A, bytes, cudaMemcpyHostToDevice));
    } else {
        /* BUG FIX (the poster's "works in emulation, fails on the GPU"):
           on the first call the original left d_A uninitialized, so GPURas
           read garbage on real hardware, while emulated memory happened to
           be zero.  Zero it explicitly so both modes agree. */
        CUDA_SAFE_CALL (cudaMemset (d_A, 0, bytes));
    }
    CUDA_SAFE_CALL (cudaMemcpy (d_B, h_B, bytes, cudaMemcpyHostToDevice));

    /* Launch configuration: ceil(Time / blockSize) one-dimensional blocks. */
    int blockSize = 4;
    int nBlocks = Time / blockSize + (Time % blockSize == 0 ? 0 : 1);

    GPURas <<<nBlocks, blockSize>>> (d_A, Time, d_B);
    CUT_CHECK_ERROR ("GPURas () execution failed\n");
    CUDA_SAFE_CALL (cudaThreadSynchronize ());

    /* The kernel writes its result into d_B; copy it back into h_A. */
    CUDA_SAFE_CALL (cudaMemcpy (h_A, d_B, bytes, cudaMemcpyDeviceToHost));

    /* we Release memory */
    CUDA_SAFE_CALL (cudaFree (d_A));
    CUDA_SAFE_CALL (cudaFree (d_B));
}
MersenneTwister_kernel.cu
/* Element-wise addition kernel: d_B[idx] += d_A[idx] for idx in [0, nTime).
 * Expects a 1-D launch with ceil(nTime/blockDim.x) blocks; the idx < nTime
 * guard handles the partial last block.
 * Fixes vs. the original snippet: "__ global __" must be written "__global__"
 * (no spaces), and the function's closing brace was missing. */
__global__ void GPURas (float *d_A, int nTime, float *d_B)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   /* flat global index */
    if (idx < nTime)
    {
        d_B[idx] = d_A[idx] + d_B[idx];
    }
}
And one more question: how can I transfer less data between the GPU and CPU? For example, it is clear that allocating memory for d_B and copying the data from h_B on every call is unnecessary, but the problem is that I do not know how to avoid it — e.g. if d_B were allocated and filled with the values of h_B in a separate function like this:
/* Allocates a device buffer and uploads h_B into it (Time floats).
 * Fixes vs. the original snippet: curly quotes around "C" replaced with plain
 * quotes, stray ';' after the signature removed, undeclared TimeOfModeling
 * replaced with the Time parameter.
 * NOTE(review): as written, d_B is a local variable, so the device pointer is
 * lost when the function returns — the allocation leaks and RaschdZnach can
 * never see it.  To share the buffer, make d_B a file-scope pointer in the .cu
 * file (visible to both functions) or return it through a float** out
 * parameter, allocate once, and free it at shutdown. */
extern "C" void
Rasd_B (const int argc, const char ** argv, float *h_B, int Time)
{
    (void) argc;   /* unused; kept for interface compatibility */
    (void) argv;

    float *d_B;
    CUDA_SAFE_CALL (cudaMalloc ((void **) &d_B, Time * sizeof (float)));
    CUDA_SAFE_CALL (cudaMemcpy (d_B, h_B, Time * sizeof (float), cudaMemcpyHostToDevice));
}
How can I make d_B visible to RaschdZnach? :(