Hello!
I’m trying to make a 2d array, copy to cuda device increase every element by 1.0 and copy back to host memory, but the code dies in cudaMemcpy2d. What did i do wrong?
[codebox]// example1.cpp : Defines the entry point for the console application.
//
//#include “stdafx.h”
#include <stdio.h>
#include <cuda.h>
#include <cutil.h>
// Kernel that executes on the CUDA device
global void mod_array(float **d, int x, int y, int dimA)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx/x;
int column = idx%x;
if (idx<dimA)
d[row][column] = d[row][column] +1.f;
}
// main routine that executes on the host
int main(void)
{
float **host_array2d, **device_array2d;
int size_X = 160;
int size_Y = 120;
int dim = size_X * size_Y;
int numThreadsPerBlock = 4;
int n_blocks = dim/numThreadsPerBlock + (dim%numThreadsPerBlock == 0?0:1);
size_t memSize = size_X * sizeof(float);
host_array2d = (float **) malloc(memSize);
// Initialize host array
for (int i=1; i<size_X-1; i++){
host_array2d[i] =(float*) malloc(size_Y * sizeof(float));
for (int j=1; j<size_Y-1; j++){
host_array2d[i][j]=((float)i * (float)size_X)+(float)j;
}
}
size_t d_pitch ;
cudaMallocPitch( (void **) &device_array2d, &d_pitch, size_Y * sizeof(float), size_X);
printf(" d_pitch = %d \n", d_pitch);
//copy host_array to device_memory
cudaMemcpy2D( device_array2d, d_pitch, host_array2d, size_Y*sizeof(float), size_Y*sizeof(float), size_X, cudaMemcpyHostToDevice );
// Do calculation on device:
mod_array <<< n_blocks, numThreadsPerBlock >>> (device_array2d, size_Y, size_X, dim);
// Retrieve result from device and store it in host array
cudaMemcpy2D( host_array2d, size_Y*sizeof(float), device_array2d, d_pitch, size_Y*sizeof(float), size_X, cudaMemcpyDeviceToHost );
// Cleanup
free(host_array2d);
cudaFree(device_array2d);
}
[/codebox]