Can't get cuMemcpy2D to Work

gmacindoe · October 7, 2010, 12:21pm

Hi,

I am trying to use cuMemcpy2D to copy a matrix from host memory into device memory but cannot get it to work for matrices with more than 128 rows of type float or 64 rows of type double. The corresponding pitches are 512 in both cases which is well below the maximum pitch reported by deviceQuery from the SDK. I am using CUDA Toolkit and SDK versions 3.2 and driver version 260.24.

Below is a sample program to illustrate the issue. It compiles with “gcc -std=c99 -I/opt/cuda/include -o memcpy2d memcpy2d.c -L/opt/cuda/lib64 -lcuda”. I keep getting “Error code 700 at line 48” when I run it. Error code 700 is “Launch failed”.

#include <stdio.h>

#include <stdbool.h>

#include <cuda.h>

#define N 1024

#define T float

#define SAFE_CALL(call) \

 do { \

   CUresult error = call; \

   if (error != CUDA_SUCCESS) { \

	 fprintf(stderr, "Error code %d at line %d\n", error, __LINE__); \

	 return -1; \

   } \

 } while (false)

int main(int argc, char * argv[]) {

  CUdevice device;

  CUcontext context;

SAFE_CALL(cuInit(0));

  SAFE_CALL(cuDeviceGet(&device, 0));

  SAFE_CALL(cuCtxCreate(&context, 0, device));

T * h_ptr;

  CUdeviceptr d_ptr;

  unsigned int pitch;

  SAFE_CALL(cuMemAllocPitch(&d_ptr, &pitch, N * sizeof(T), N, sizeof(T)));

  SAFE_CALL(cuMemAllocHost((void **)&h_ptr, N * N * sizeof(T)));

CUDA_MEMCPY2D p;

p.srcXInBytes = 0;

  p.srcY = 0;

  p.srcMemoryType = CU_MEMORYTYPE_HOST;

  p.srcHost = h_ptr;

  p.srcPitch = N * sizeof(T);

p.dstXInBytes = 0;

  p.dstY = 0;

  p.dstMemoryType = CU_MEMORYTYPE_DEVICE;

  p.dstDevice = d_ptr;

  p.dstPitch = pitch * sizeof(T);

p.WidthInBytes = N * sizeof(T);

  p.Height = N;

SAFE_CALL(cuMemcpy2D(&p));

SAFE_CALL(cuMemFree(d_ptr));

  SAFE_CALL(cuMemFreeHost(h_ptr));

SAFE_CALL(cuCtxDestroy(context));

return 0;

}

gmacindoe · October 7, 2010, 12:21pm

Hi,

I am trying to use cuMemcpy2D to copy a matrix from host memory into device memory but cannot get it to work for matrices with more than 128 rows of type float or 64 rows of type double. The corresponding pitches are 512 in both cases which is well below the maximum pitch reported by deviceQuery from the SDK. I am using CUDA Toolkit and SDK versions 3.2 and driver version 260.24.

Below is a sample program to illustrate the issue. It compiles with “gcc -std=c99 -I/opt/cuda/include -o memcpy2d memcpy2d.c -L/opt/cuda/lib64 -lcuda”. I keep getting “Error code 700 at line 48” when I run it. Error code 700 is “Launch failed”.

#include <stdio.h>

#include <stdbool.h>

#include <cuda.h>

#define N 1024

#define T float

#define SAFE_CALL(call) \

 do { \

   CUresult error = call; \

   if (error != CUDA_SUCCESS) { \

	 fprintf(stderr, "Error code %d at line %d\n", error, __LINE__); \

	 return -1; \

   } \

 } while (false)

int main(int argc, char * argv[]) {

  CUdevice device;

  CUcontext context;

SAFE_CALL(cuInit(0));

  SAFE_CALL(cuDeviceGet(&device, 0));

  SAFE_CALL(cuCtxCreate(&context, 0, device));

T * h_ptr;

  CUdeviceptr d_ptr;

  unsigned int pitch;

  SAFE_CALL(cuMemAllocPitch(&d_ptr, &pitch, N * sizeof(T), N, sizeof(T)));

  SAFE_CALL(cuMemAllocHost((void **)&h_ptr, N * N * sizeof(T)));

CUDA_MEMCPY2D p;

p.srcXInBytes = 0;

  p.srcY = 0;

  p.srcMemoryType = CU_MEMORYTYPE_HOST;

  p.srcHost = h_ptr;

  p.srcPitch = N * sizeof(T);

p.dstXInBytes = 0;

  p.dstY = 0;

  p.dstMemoryType = CU_MEMORYTYPE_DEVICE;

  p.dstDevice = d_ptr;

  p.dstPitch = pitch * sizeof(T);

p.WidthInBytes = N * sizeof(T);

  p.Height = N;

SAFE_CALL(cuMemcpy2D(&p));

SAFE_CALL(cuMemFree(d_ptr));

  SAFE_CALL(cuMemFreeHost(h_ptr));

SAFE_CALL(cuCtxDestroy(context));

return 0;

}

Topic		Replies	Views
trouble with cudaMemcpy2D I cant get a matrix to copy into 2D pitched memory CUDA Programming and Performance	1	958	July 13, 2009
Why cudaMemcpy2D cause "invalid pitch argument"? CUDA Programming and Performance	2	6696	June 10, 2008
help with cudaMemcpy2D I can't get a matrix/ array to copy correctly from host to device CUDA Programming and Performance	3	5103	July 14, 2009
Avoiding cudaMemcpy2D() because of 65536 pitch limit CUDA Programming and Performance	1	2423	April 14, 2009
problem with cudaMallocPitch and cudaMemcpy2D CUDA Programming and Performance	5	6418	April 22, 2009
Memcpy2D error? CUDA Programming and Performance	2	2287	July 23, 2007
cudaMallocPitch + cudaMemcpy2D results in 0 ! I use mallocpitch and memcpy2D to copy a matrix to CUDA Programming and Performance	0	2584	June 23, 2011
cudaMemcpy2D / Grid size / MxN double matrix Problem copying a MxN double matrix from Host to Device CUDA Programming and Performance	3	1376	March 18, 2010
need help for cudaMemcpy2D() CUDA Programming and Performance	5	4644	December 8, 2009
Using cudaMemcpy2D very strange CUDA Programming and Performance	2	1406	March 10, 2009

Can't get cuMemcpy2D to Work

Related topics