Hi guys,
I have a trouble in how to get in the host a memory allocated from device. In host I don’t know the size of memory necessary so I use a struct to get this values to get back in host but not work
//~ nvcc -g -G -arch=sm_35 -o allocinsidekernel allocinsidekernel.cu -lcudadevrt -rdc=true
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <curand.h>
#include <curand_kernel.h>
#define _THREADSFX 32
#define _THREADSTOTAL 64
#define cudaCheckErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline __host__ __device__ void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess) {
#ifdef __CUDACC__
printf("GPUassert: Error(%d) %s %s in line %d\n", (int)code, cudaGetErrorString(code), file, line);
if (abort)
assert(code);
#else
fprintf(stderr,"GPUassert: Error(%d) %s %s in line %d\n", (int)code, cudaGetErrorString(code), file, line);
if (abort)
exit(code);
#endif
}
}
struct myrows{
int size;
int *matrix;
};
__host__ __device__ unsigned int nextPow2(unsigned int x){
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__host__ __device__ void getNumBlocksAndThreads(int n, int maxThreads, int &blocks, int &threads){
if(n<=0){
n *= (-1);
if(n == 0){
n = 1;
}
}
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
}
__global__ void kernel(struct myrows *data, int maxsize){
int tid = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
int i;
if(tid < maxsize){
data[tid].size = 100;
data[tid].matrix = (int*)malloc(100*sizeof(int));
for(i=0; i<100; i++){
curandState state;
curand_init(1, tid, 0, &state);
data[tid].matrix[i] = curand_uniform(&state);
printf("%d, ", data[tid].matrix[i]);
}
}
}
int main(){
struct myrows *host_data, *device_data;
int i, j;
cudaCheckErrors(cudaMalloc((void**)&device_data, _THREADSTOTAL*sizeof(struct myrows)));
for(i=0; i<_THREADSTOTAL; i++){
cudaCheckErrors(cudaMalloc((void**)&device_data[i].matrix, 2*sizeof(int)));
}
int blocos = 0, threads = 0;
getNumBlocksAndThreads(_THREADSTOTAL, _THREADSFX, blocos, threads);
kernel<<<blocos, threads>>>(device_data, _THREADSTOTAL);
cudaCheckErrors(cudaDeviceSynchronize());
host_data = (struct myrows*)malloc(_THREADSTOTAL*sizeof(struct myrows));
for(i=0; i<_THREADSTOTAL; i++){
cudaCheckErrors(cudaMemcpy(&host_data[i].size, &device_data[i].size, sizeof(int), cudaMemcpyDeviceToHost));
host_data[i].matrix = (int*)malloc(host_data[i].size*sizeof(int));
cudaCheckErrors(cudaMemcpy(host_data[i].matrix, device_data[i].matrix, host_data[i].size*sizeof(int), cudaMemcpyDeviceToHost));
}
//~ cudaCheckErrors(cudaMemcpy(h_output, d_output, _QNT_MATRIX*sizeof(int), cudaMemcpyDeviceToHost));
for(i=0; i<_THREADSTOTAL; i++){
for(j=0; j<host_data[i].size; j++){
printf("%d ", host_data[i].matrix[j]);
}
printf("\n");
}
for(i=0; i<_THREADSTOTAL; i++){
cudaFree(device_data[i].matrix);
free(host_data[i].matrix);
}
cudaFree(device_data);
free(host_data);
return 0;
}
I’m allocating device memory on lines 75~78, call kernel on line 82, reading back to host on lines 85~90