How to get the data back in C++

Hi Everyone,

I have the following files, which are working perfectly, I have a question. I would like to get the data back in my C++ program, how do I get it? For example if you see in the program, I would like float * c_d back in my c++ program. Please pardon my naming convention, I started with helloWorld example then ended up with matrix addition.

Any help is appreciated.

Regards,

Arup

helloWorld.cuh


#pragma once

void device_greetings(float * a_d);

helloWorld.cu


__global__ void device_greetings_kernel(float *a, int N)

{

   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (idx<N)

   {

	  a[idx]=a[idx]+1.f;

   }

}

void device_greetings(float * a_d)

{

   float * b_d; //array to be assigned to devicie

   float * c_d;

   int N = 14;

int blocksize = 4;

   int nBlocks = N/blocksize + (N%blocksize == 0?0:1);

printf("%d\n",nBlocks);

//allocate memory to the array in CUDA

   cudaMalloc((void **) &b_d, sizeof(float)*N);

   cudaMalloc((void **) &c_d, sizeof(float)*N);

cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);

device_greetings_kernel <<< nBlocks, blocksize >>> (b_d, N);

cudaMemcpy(c_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost);

cudaMalloc((void **) &a_d, sizeof(float)*N);

   cudaMemcpy(a_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost);

//traverse the array

   for(int i=0;i<N;i++)

   {

	  //increment c_d by 1

	  //c_d[i]=c_d[i]+1;

	  printf("%f\n",a_d[i]);

	  printf("%f\n", c_d[i]);

}

   cudaFree(b_d);

   cudaFree(c_d);

   printf("Hello World from device.!!!\n");

}

helloWorld.cpp


#include <stdio.h>

#include <iostream>

#include "helloWorld.cuh"

using namespace std;

int main()

{

   std::cout << "Hello World. !!!" << std::endl;

   //create an array in host

   float * a_h, * b_h; //array to be assigned in host

int N = 14;

   int i;

   a_h = new float[N]; //allocate array values in host

   b_h = new float[N]; //allocate array values in host

for(i=0;i<N;i++)

   {

	  a_h[i]=10.f+i;

	  b_h[i]=0.f;

	  std::cout << "a_h["<<i<<"]:" << a_h[i]<<std::endl;

	  //std::cout << "b_h["<<i<<"]:" << b_h[i]<<std::endl;

   }

	 // launch a kernel with a single thread to greet from the device

   device_greetings(a_h);

for(int j=0;j<N;j++)

   {

	  std::cout<< "a_h["<<j<<"]: "<< a_h[j]<<std::endl;

   }

free(a_h);

   free(b_h);

return 0;

}

Hi Everyone,

I have the following files, which are working perfectly, I have a question. I would like to get the data back in my C++ program, how do I get it? For example if you see in the program, I would like float * c_d back in my c++ program. Please pardon my naming convention, I started with helloWorld example then ended up with matrix addition.

Any help is appreciated.

Regards,

Arup

helloWorld.cuh


#pragma once

void device_greetings(float * a_d);

helloWorld.cu


__global__ void device_greetings_kernel(float *a, int N)

{

   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (idx<N)

   {

	  a[idx]=a[idx]+1.f;

   }

}

void device_greetings(float * a_d)

{

   float * b_d; //array to be assigned to devicie

   float * c_d;

   int N = 14;

int blocksize = 4;

   int nBlocks = N/blocksize + (N%blocksize == 0?0:1);

printf("%d\n",nBlocks);

//allocate memory to the array in CUDA

   cudaMalloc((void **) &b_d, sizeof(float)*N);

   cudaMalloc((void **) &c_d, sizeof(float)*N);

cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);

device_greetings_kernel <<< nBlocks, blocksize >>> (b_d, N);

cudaMemcpy(c_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost);

cudaMalloc((void **) &a_d, sizeof(float)*N);

   cudaMemcpy(a_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost);

//traverse the array

   for(int i=0;i<N;i++)

   {

	  //increment c_d by 1

	  //c_d[i]=c_d[i]+1;

	  printf("%f\n",a_d[i]);

	  printf("%f\n", c_d[i]);

}

   cudaFree(b_d);

   cudaFree(c_d);

   printf("Hello World from device.!!!\n");

}

helloWorld.cpp


#include <stdio.h>

#include <iostream>

#include "helloWorld.cuh"

using namespace std;

int main()

{

   std::cout << "Hello World. !!!" << std::endl;

   //create an array in host

   float * a_h, * b_h; //array to be assigned in host

int N = 14;

   int i;

   a_h = new float[N]; //allocate array values in host

   b_h = new float[N]; //allocate array values in host

for(i=0;i<N;i++)

   {

	  a_h[i]=10.f+i;

	  b_h[i]=0.f;

	  std::cout << "a_h["<<i<<"]:" << a_h[i]<<std::endl;

	  //std::cout << "b_h["<<i<<"]:" << b_h[i]<<std::endl;

   }

	 // launch a kernel with a single thread to greet from the device

   device_greetings(a_h);

for(int j=0;j<N;j++)

   {

	  std::cout<< "a_h["<<j<<"]: "<< a_h[j]<<std::endl;

   }

free(a_h);

   free(b_h);

return 0;

}

cudaMemcpy

cudaMemcpy

float * c_d; – Seems to be device pointer:

cudaMalloc((void **) &c_d, sizeof(float)*N)

cudaMemcpy(c_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost); – c_d would need to be a pointer to host side memory, so just pass it like you passed a_d…

float * c_d; – Seems to be device pointer:

cudaMalloc((void **) &c_d, sizeof(float)*N)

cudaMemcpy(c_d,b_d, sizeof(float)*N,cudaMemcpyDeviceToHost); – c_d would need to be a pointer to host side memory, so just pass it like you passed a_d…