Problem with cudaPitchedPtr Can not copy or write to memory pointed to by cudaPitchedPtr

Hi,

I just updated to CUDA 3.2 and I got problems getting my older code using cudaPitchedPtr to work properly. My code compiles, links and runs without any errors. However, my kernel does not seem to write anything into the device memory pointed to by cudaPitchedPtr. Below is a small self contained example.

Could anybody please tell me what is wrong?

thanks

Kenny

#include <cuda_runtime.h>    // needed for CUDA C++ runtime api

#  define SAFE_CALL( call) {                                                 \

    cudaError err = call;                                                    \

    if( cudaSuccess != err) {                                                \

        std::cerr << "Cuda error in file "                                   \

                  << __FILE__                                                \

                  <<  " line "                                               \

                  << __LINE__                                                \

                  << " : "                                                   \

                  <<  cudaGetErrorString( err)                               \

                  << std::endl;                                              \

        exit(EXIT_FAILURE);                                                  \

} }

#include <string>

#include <iostream>

#include <vector>

__global__ void d_compute_device_values(float * values, int const width, int const height, int const depth, int const pitch

)

{ 

  //--- Compute voxel indices

  int const x = blockIdx.x * blockDim.x + threadIdx.x;

  int const y = blockIdx.y * blockDim.y + threadIdx.y;

  int const z = blockIdx.z * blockDim.z + threadIdx.z;

//--- Guard if we are out of range of volume

  if ( (x < width) && (y < height) && (z < depth) ) 

  {

    //--- Get corresponding linear index of voxel

    int const i = ( z * height  +  y) * pitch + x;

//--- Write output value to data

    values[i] = x+y+z;

  }

}

void compute_host_values(float * raw_ptr, int const width, int const height, int const depth)

{

  assert( raw_ptr );

size_t i         = 0u;

  float  max_value = 0.0f;

for(int z = 0; z < depth;  ++z)

  for(int y = 0; y < height; ++y)

  for(int x = 0; x < width;  ++x)

  {

    float const value =   x+y+z;

    max_value         = (value>max_value) ? value : max_value;

    raw_ptr[i] = value;

    ++i;

  }

size_t const N = width*height*depth;

for(i=0u; i < N; ++i)

  {

    raw_ptr[i] /= max_value;

  }  

}

void clear_host_values(float * raw_ptr, int const width, int const height, int const depth)

{

  assert( raw_ptr );

  size_t const N = width*height*depth;

  for(size_t i=0u; i < N; ++i)

  {

    raw_ptr[i] = 0.0f;

  }  

}

void copy_host_to_device( float * h_raw_ptr, cudaPitchedPtr d_pitched_ptr, int const width, int const height, int const depth )

{    

  cudaPitchedPtr h_pitched_ptr = make_cudaPitchedPtr( h_raw_ptr, width*sizeof(float), width, height );

cudaExtent const extent  = make_cudaExtent(sizeof(float)*width, height, depth);

cudaMemcpy3DParms params = {0};

	params.srcPtr     = h_pitched_ptr;

	params.dstPtr     = d_pitched_ptr;

	params.extent     = extent;

	params.kind       = cudaMemcpyHostToDevice;

	SAFE_CALL( cudaMemcpy3D( &params ) );

}

void copy_device_to_host(cudaPitchedPtr d_pitched_ptr, float * h_raw_ptr, int const width, int const height, int const depth)

{    

  cudaPitchedPtr h_pitched_ptr = make_cudaPitchedPtr( h_raw_ptr, width*sizeof(float), width, height );

cudaExtent const extent  = make_cudaExtent(sizeof(float)*width, height, depth);

	cudaMemcpy3DParms params = {0};

	params.srcPtr     = d_pitched_ptr;

	params.dstPtr     = h_pitched_ptr;

	params.extent     = extent;

	params.kind       = cudaMemcpyDeviceToHost;

	SAFE_CALL( cudaMemcpy3D( &params ) );

}

void print_host(float * h_raw_ptr, int const width, int const height, int const depth)

{

  size_t const N = width*height*depth;

std::cout << "H = [";

  for(size_t i = 0u; i < N; ++i)

  {

    std::cout << " " <<  h_raw_ptr[i];

  }

  std::cout << "]; " << std::endl;

}

int main(int argc, const char** argv) 

{	

  int const width  = 16;

  int const height = 16;

  int const depth  = 16;

SAFE_CALL( cudaSetDevice( 0 ) );

// Allocate host memory    

  std::vector<float> h_vector;

  h_vector.resize( width*height*depth );

  float * h_raw_ptr = static_cast<float *>( & h_vector[0] );

// Allocate device memory

  cudaPitchedPtr  d_pitched_ptr;

cudaExtent const extent = make_cudaExtent(sizeof(float)*width, height, depth);

SAFE_CALL( cudaMalloc3D(&d_pitched_ptr, extent) );

  SAFE_CALL( cudaMemset3D(d_pitched_ptr, 0, extent ) ); 

// Compute values on device

  dim3 const threads_per_block(16, 16, 16); 

  dim3 const number_of_blocks( (width  + 15) / threads_per_block.x, (height + 15) / threads_per_block.y, (depth  + 15) / threads_per_block.z );

int const pitch = d_pitched_ptr.pitch / sizeof(float);

  assert( pitch >= width );

float * d_raw_ptr = static_cast< float *>( d_pitched_ptr.ptr );

  assert( d_raw_ptr );

d_compute_device_values<<< number_of_blocks, threads_per_block >>>( d_raw_ptr, width, height, depth, pitch );

// Get computed values and see if they are different from zero  

  copy_device_to_host( d_pitched_ptr, h_raw_ptr, width, height, depth);

  print_host( h_raw_ptr, width, height, depth );

// Just print host sided computed values for comparison purpose only

  compute_host_values(h_raw_ptr, width, height, depth);

  print_host( h_raw_ptr, width, height, depth );

// Copy to cudaPitchedPtr Test  

  compute_host_values(h_raw_ptr, width, height, depth);

  copy_host_to_device( h_raw_ptr, d_pitched_ptr, width, height, depth);

  clear_host_values(h_raw_ptr, width, height, depth);

  copy_device_to_host( d_pitched_ptr, h_raw_ptr, width, height, depth);

  print_host( h_raw_ptr, width, height, depth );

SAFE_CALL( cudaFree(d_pitched_ptr.ptr) );

  SAFE_CALL( cudaThreadSynchronize () );

  SAFE_CALL( cudaThreadExit() );

}

Never mind. I have figured out what was wrong. The kernel was never launched due to an invalid configuration. I re-sized the computational grid and reorganized the memory layout. Now it works.

/Kenny