Hi,
I just updated to CUDA 3.2 and I got problems getting my older code using cudaPitchedPtr to work properly. My code compiles, links and runs without any errors. However, my kernel does not seem to write anything into the device memory pointed to by cudaPitchedPtr. Below is a small self contained example.
Could anybody please tell me what is wrong?
thanks
Kenny
#include <cuda_runtime.h> // needed for CUDA C++ runtime api
# define SAFE_CALL( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
std::cerr << "Cuda error in file " \
<< __FILE__ \
<< " line " \
<< __LINE__ \
<< " : " \
<< cudaGetErrorString( err) \
<< std::endl; \
exit(EXIT_FAILURE); \
} }
#include <string>
#include <iostream>
#include <vector>
__global__ void d_compute_device_values(float * values, int const width, int const height, int const depth, int const pitch
)
{
//--- Compute voxel indices
int const x = blockIdx.x * blockDim.x + threadIdx.x;
int const y = blockIdx.y * blockDim.y + threadIdx.y;
int const z = blockIdx.z * blockDim.z + threadIdx.z;
//--- Guard if we are out of range of volume
if ( (x < width) && (y < height) && (z < depth) )
{
//--- Get corresponding linear index of voxel
int const i = ( z * height + y) * pitch + x;
//--- Write output value to data
values[i] = x+y+z;
}
}
void compute_host_values(float * raw_ptr, int const width, int const height, int const depth)
{
assert( raw_ptr );
size_t i = 0u;
float max_value = 0.0f;
for(int z = 0; z < depth; ++z)
for(int y = 0; y < height; ++y)
for(int x = 0; x < width; ++x)
{
float const value = x+y+z;
max_value = (value>max_value) ? value : max_value;
raw_ptr[i] = value;
++i;
}
size_t const N = width*height*depth;
for(i=0u; i < N; ++i)
{
raw_ptr[i] /= max_value;
}
}
void clear_host_values(float * raw_ptr, int const width, int const height, int const depth)
{
assert( raw_ptr );
size_t const N = width*height*depth;
for(size_t i=0u; i < N; ++i)
{
raw_ptr[i] = 0.0f;
}
}
void copy_host_to_device( float * h_raw_ptr, cudaPitchedPtr d_pitched_ptr, int const width, int const height, int const depth )
{
cudaPitchedPtr h_pitched_ptr = make_cudaPitchedPtr( h_raw_ptr, width*sizeof(float), width, height );
cudaExtent const extent = make_cudaExtent(sizeof(float)*width, height, depth);
cudaMemcpy3DParms params = {0};
params.srcPtr = h_pitched_ptr;
params.dstPtr = d_pitched_ptr;
params.extent = extent;
params.kind = cudaMemcpyHostToDevice;
SAFE_CALL( cudaMemcpy3D( ¶ms ) );
}
void copy_device_to_host(cudaPitchedPtr d_pitched_ptr, float * h_raw_ptr, int const width, int const height, int const depth)
{
cudaPitchedPtr h_pitched_ptr = make_cudaPitchedPtr( h_raw_ptr, width*sizeof(float), width, height );
cudaExtent const extent = make_cudaExtent(sizeof(float)*width, height, depth);
cudaMemcpy3DParms params = {0};
params.srcPtr = d_pitched_ptr;
params.dstPtr = h_pitched_ptr;
params.extent = extent;
params.kind = cudaMemcpyDeviceToHost;
SAFE_CALL( cudaMemcpy3D( ¶ms ) );
}
void print_host(float * h_raw_ptr, int const width, int const height, int const depth)
{
size_t const N = width*height*depth;
std::cout << "H = [";
for(size_t i = 0u; i < N; ++i)
{
std::cout << " " << h_raw_ptr[i];
}
std::cout << "]; " << std::endl;
}
int main(int argc, const char** argv)
{
int const width = 16;
int const height = 16;
int const depth = 16;
SAFE_CALL( cudaSetDevice( 0 ) );
// Allocate host memory
std::vector<float> h_vector;
h_vector.resize( width*height*depth );
float * h_raw_ptr = static_cast<float *>( & h_vector[0] );
// Allocate device memory
cudaPitchedPtr d_pitched_ptr;
cudaExtent const extent = make_cudaExtent(sizeof(float)*width, height, depth);
SAFE_CALL( cudaMalloc3D(&d_pitched_ptr, extent) );
SAFE_CALL( cudaMemset3D(d_pitched_ptr, 0, extent ) );
// Compute values on device
dim3 const threads_per_block(16, 16, 16);
dim3 const number_of_blocks( (width + 15) / threads_per_block.x, (height + 15) / threads_per_block.y, (depth + 15) / threads_per_block.z );
int const pitch = d_pitched_ptr.pitch / sizeof(float);
assert( pitch >= width );
float * d_raw_ptr = static_cast< float *>( d_pitched_ptr.ptr );
assert( d_raw_ptr );
d_compute_device_values<<< number_of_blocks, threads_per_block >>>( d_raw_ptr, width, height, depth, pitch );
// Get computed values and see if they are different from zero
copy_device_to_host( d_pitched_ptr, h_raw_ptr, width, height, depth);
print_host( h_raw_ptr, width, height, depth );
// Just print host sided computed values for comparison purpose only
compute_host_values(h_raw_ptr, width, height, depth);
print_host( h_raw_ptr, width, height, depth );
// Copy to cudaPitchedPtr Test
compute_host_values(h_raw_ptr, width, height, depth);
copy_host_to_device( h_raw_ptr, d_pitched_ptr, width, height, depth);
clear_host_values(h_raw_ptr, width, height, depth);
copy_device_to_host( d_pitched_ptr, h_raw_ptr, width, height, depth);
print_host( h_raw_ptr, width, height, depth );
SAFE_CALL( cudaFree(d_pitched_ptr.ptr) );
SAFE_CALL( cudaThreadSynchronize () );
SAFE_CALL( cudaThreadExit() );
}