Simple Malloc on host

Hello !

I don’t know if my problem is a simple one or if I’m really a noob !

Here is my code :

__global__ void test(float *p, size_t pitch, int H, int W){

  int idx=threadIdx.x + blockIdx.x* blockDim.x;

  int idy=threadIdx.y + blockIdx.y* blockDim.y;

  int index =idx + idy*W;

  if ( idx < W && idy < H ) p[index] = 3;

}

main()

{

  float *p;

  float p_h[256][24];

int i, j;

  int H =24;

  int W =256;

size_t pitch;

  dim3 threadPerBlock(32,8);

  dim3 dimGrid(W/threadPerBlock.x , H/threadPerBlock.y);

cudaMallocPitch((void**) &p, &pitch, W*sizeof(float), H);

test<<<dimGrid,threadPerBlock>>>(p, pitch,H,W);

  cudaMemcpy2D(p_h,W*sizeof(float),p,pitch,W*sizeof(float),H,cudaMemcpyDeviceToHost);

for(i=0;i<W;i++) for(j=0;j<H;j++) printf("%d ", (int)p_h[i][j]);

cudaFree(p);

}

And I get the good results ! But know i juste replace

by

And when I do so, it compiles but I’ve got a Segmentation Fault when the program tries to access to p_h[0][0] !

Can anybody tells me why ? And how I can deal with it ?

Thank you for your time !

I didn’t see anything wrong in your main function.
But when I read your kernel, I wonder that if you clearly understanding the meaning of “pitch” or it may not necessary in your code.
“int index =idx + idy*W;”
The meaning of “pitch” has been discussing in many threads in this forum, so you can search to get more information.
I don’t know the reason why "Segmentation Fault " occurs, are you using Linux?
you has better use the “printf(“CUDA Error: %s\n”, cudaGetErrorString(cudaGetLastError()));” to receive the error report from CUDA driver.

Hello,

Thanks for your answer, I’m now trying to understand better this “pitch” but I’m encountering some issues…

As I read on other topics, the pitch is the nearest multiple of 64 of the memory of each row !

In this new code, I just try to copy an array A to a device array B then from the array B to a host array C…

#include <stdio.h>

#include <cuda.h>

main()

{

  float *d_ParamA;

  float **ParamA;

  float h_ParamA[32][32];

	

  int i, j;

  int H =32;

  int W =32;

//Host memory allocation for the ParamA array

  ParamA=(float**)malloc(W*sizeof(float*));

  for(i=0;i<W;i++) ParamA[i]=(float*)malloc((H)*sizeof(float));

//Filling of the array ParamA

  for(i=0;i<W;i++) for(j=0;j<H;j++) ParamA[i][j]=4;

size_t pitch_ad;

  // Pitch = W*sizeof(float)= 32*32 = 1024  rounded to the multiple of 64 : 1024

  size_t pitch_ah = 1024;

dim3 threadPerBlock(16,16);

  dim3 dimGrid(W/threadPerBlock.x , H/threadPerBlock.y);

//Device memory allocation for the d_paramA array

  cudaMallocPitch((void**) &d_ParamA, &pitch_ad, W*sizeof(float), H);

// Copy from home to device

  cudaMemcpy2D(d_ParamA,pitch_ad,ParamA,pitch_ah,W*sizeof(float),H,cudaMemcpyHostToDevice);

// Copy from device to Home

  cudaMemcpy2D(h_ParamA,pitch_ah,d_ParamA,pitch_ad,W*sizeof(float),H,cudaMemcpyDeviceToHost);

printf("\nCUDA\n");

  for(i=0;i<W;i++)

  {

	  for(j=0;j<H;j++) printf("%d ", (int)h_ParamA[i][j]);

	  printf("\n");

  }

printf("\nNON CUDA\n");

  for(i=0;i<W;i++)

  {

	  for(j=0;j<H;j++)printf("%d ", (int)ParamA[i][j]);

	  printf("\n");

  }

cudaFree(d_ParamA);

}

but I’ve again a segmentation fault before the

"cudaMemcpy2D(h_ParamA,pitch_ah,d_ParamA,pitch_ad,W*size

of(float),H,cudaMemcpyDeviceToHost);"

Can somebody understand why ?

I’m using linux by the way ! And “printf(“CUDA Error: %s\n”, cudaGetErrorString(cudaGetLastError()));” returns me “no error”

I had changed your code and hoped it work properly.

#include <stdio.h>

#include <cuda.h>

main()

{

  const int H = 32;

  const int W = 32;

  /*you can copy a host dynamic matrix (a matrix allocated with dynamic method "pointer of pointer") to device memory, but you may not receive properly data because all elements of dynamic matrix will not be allocated with continuous memory space. however you can copy a 1D dynamic array to 2D matrix allocated with cudaMallocPitch() in device memory.*/

  float ParamA[W][H];

  float h_ParamA[W][H];

  int i, j;

  //Filling of the array ParamA

  for (i = 0; i < W; i++) {

	 for (j = 0; j < H; j++) {

		ParamA[i][j] = 4;

	  }

   }  

size_t pitch_ad;

  // size_t pitch_ah = 1024; don't need to initialize data for pitch, it will be assigned by cudaMallocPitch automatically

//Device memory allocation for the d_paramA array

  float *d_ParamA;

  cudaMallocPitch((void**) &d_ParamA, &pitch_ad, W*sizeof(float), H);

// Copy from home to device

  cudaMemcpy2D(d_ParamA, pitch_ad, ParamA, W * sizeof(float), W * sizeof(float), H, cudaMemcpyHostToDevice);

dim3 threadPerBlock(16, 16);

  dim3 dimGrid(W / threadPerBlock.x , H / threadPerBlock.y);

  //call kernel function

// Copy from device to Home

  cudaMemcpy2D(h_ParamA, W * sizeof(float), d_ParamA, pitch_ad, W * sizeof(float), H, cudaMemcpyDeviceToHost);

printf("\nCUDA\n");

  for ( i = 0; i < W; i++) {

	  for (j = 0; j < H; j++) printf("%d ", (int)h_ParamA[i][j]);

	  printf("\n");

  }

printf("\nNON CUDA\n");

  for (i = 0; i < W; i++) {

	  for (j = 0; j < H; j++) printf("%d ", (int)ParamA[i][j]);

	  printf("\n");

  }

cudaFree(d_ParamA);

}

You should pay attention at

  1. create a dynamic matric on host memory

  2. pitch

  3. how to use cudaMemcpy2D() correctly.

Hope this code woks properly on your machine. :rolleyes:

Hello again !
Thanks a lot for your time ! Your code is working on my device so I will spend some time to think about what I was doing wrong to be sure I get the whole functioning !
Thanks again !