I am using temp to allocate rows of d_Ptr. I followed your example in earlier post of this thread. As an alternative I used
for(i = 0; i < 5; i++)
cudaMalloc((void**)&d_Ptr[i] , 3 * sizeof(float));
Instead of
for(i = 0; i < 5; i++)
cudaMalloc((void**)&temp[i] , 3 * sizeof(float));
This I doubt and I saw while debugging.
The example I posted is to just allocate and de-allocate memory. I am neither passing in any content or retrieving from device pointer. In this connection I need your help.
I tried different experiments with cudaMemcpy2D in both directions (host-to-device and device-to-host).
Every attempt to copy contents to or from the device results in a segmentation fault when I later try to free the device pointer.
Can you illustrate how to copy to and from the device pointer and a host array so I can check? That will be a great help. Thanks a tonne.
// Device-side array of row pointers: the "2D array" handle a kernel receives.
float **device2DArray;
// Host-side staging copy of the 5 device row pointers.
float *h_temp[5];
// Create 2D Array: first allocate device space for the 5 row pointers...
cudaMalloc((void **)&device2DArray, 5*sizeof(float *));
for(int i=0; i<5; i++)
{
// ...then allocate each 3-float row on the device, recording the device
// row pointers in host memory (h_temp) so the host can reach each row.
cudaMalloc( (void **)&h_temp[i], 3*sizeof(float));
}
// Publish the row pointers to the device so a kernel can index device2DArray[i][j].
cudaMemcpy(device2DArray, h_temp, 5*sizeof(float *), cudaMemcpyHostToDevice);
// Do not destroy the contents of h_temp:
// keeping the host copy of the row pointers means we never have to copy them
// back from the device for later per-row cudaMemcpy calls or cudaFree.
Million Thanks Sarnath, I was looking for this. but after analysing your solution, I think this will involve lot of transfer time between CPU->GPU & vice-versa. To offset this I used cudaMemcpy2D, as it copies everything in one shot.
Can you think of a solution where cudaMemcpy2D can be used???
Hi,
You have not allocated a 2D block…
<<<4,4>>> means that you are spawning 4 blocks with 4 threads each.
Thus a total of 16 threads in X dimension.
Y dimension is always only 1.
Going by your kernel source, “y” will always be “0” and hence only column 0 will be populated, and only for some elements.
Whats the answer that you are getting?
btw,
Your allocation of 2D arrays is correct. Nice.
I had tried to allocate 2Dhost and 2Ddevice array. And then try to use GPU to do some calculation in there.
Bad luck — I cannot figure out how to deal with line 41.
I know that 1D array is better than 2D, but n00b (me) prefer this one.
Here is my error.
C:\cuda_by_example\sarit>nvcc matadd.cu
matadd.cu
matadd.cu(40): warning: variable "offset" was declared but never referenced
tmpxft_00000f5c_00000000-3_matadd.cudafe1.gpu
tmpxft_00000f5c_00000000-8_matadd.cudafe2.gpu
matadd.cu
matadd.cu(40): warning: variable "offset" was declared but never referenced
./matadd.cu(41): Warning: Cannot tell what pointer points to, assuming global me
mory space
ptxas C:/Users/7-64/AppData/Local/Temp/tmpxft_00000f5c_00000000-4_matadd.ptx, li
ne 594; warning : Double is not supported. Demoting to float
tmpxft_00000f5c_00000000-3_matadd.cudafe1.cpp
tmpxft_00000f5c_00000000-14_matadd.ii
C:\cuda_by_example\sarit>
/*
* CONSIDER 4 BLOCKS, WITH 4 THREADS/BLOCS
*
*/
#include"../common/book.h"
#define N 8
__host__ double** Make2DDoubleArray(int arraySizeX, int arraySizeY)
{
    /* Allocate a host-side jagged 2D array: arraySizeX row pointers, each
     * pointing at an independently malloc'd row of arraySizeY doubles.
     * Contents are uninitialized.  The caller owns the memory and must
     * free every row and then the pointer table itself. */
    double** rows = (double**) malloc(sizeof(double*) * arraySizeX);
    int r = 0;
    while (r < arraySizeX) {
        rows[r] = (double*) malloc(sizeof(double) * arraySizeY);
        ++r;
    }
    return rows;
}
/* Print the N x N host matrix A to stdout, one row per line
 * (a newline is emitted BEFORE each row, matching the original layout). */
__host__ void showxx(double **A)
{ int i,j;
for(i=0;i<N;i++){
printf("\n");
for(j=0;j<N;j++){
printf("%.3f ",A[i][j]); /* fixed: stray double semicolon removed */
}
}
}
/* Fill the N x N host matrix so that A[i][j] = i + j. */
__host__ void init2D(double **A)
{
int r, c;
for (r = 0; r < N; ++r) {
for (c = 0; c < N; ++c)
A[r][c] = r + c;
}
}
/* Set every in-range element of the N x N device matrix A to 4.
 * x/y are the canonical 2D global thread indices.  With the 1D launch
 * <<<4,4>>> used in this thread, x spans 0..15 while y is always 0, so
 * the bounds guard is essential: dereferencing A[x] for x >= N reads past
 * the allocated row-pointer table and aborts the kernel (as observed in
 * the discussion below).  The unused "offset" variable that triggered the
 * nvcc warning has been removed. */
__global__ void adda( double **A)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < N && y < N)
A[x][y] = 4; //HERE IS 41ST LINE (now guarded)
}
int main(void)
{
/* Host N x N matrix (jagged rows). */
double** A = Make2DDoubleArray(N, N);
double **d_2Darray;  /* device array of N device row pointers */
double *h_temp[N];   /* host copy of the device row pointers */
int i;
cudaError_t err;
init2D(A);
showxx(A);
// Create 2D array on device: pointer table first, then one row at a time.
cudaMalloc((void **)&d_2Darray, N*sizeof(double *));
for(i=0; i<N; i++)
{
cudaMalloc( (void **)&h_temp[i], N*sizeof(double));
}
cudaMemcpy(d_2Darray, h_temp, N*sizeof(double *), cudaMemcpyHostToDevice);
// Copy host matrix contents to the device, row by row.
for(i=0; i<N; i++)
{
cudaMemcpy(h_temp[i], A[i], N*sizeof(double), cudaMemcpyHostToDevice);
}
// Fixed launch: a 2D grid of 2D blocks covering all N x N elements
// (the original <<<4,4>>> spawned 16 threads with y always 0, so only
// part of column 0 was written and x ran past the N row pointers).
dim3 grid((N + 3) / 4, (N + 3) / 4);
dim3 block(4, 4);
adda<<<grid, block>>>(d_2Darray);
// Error check: launches fail silently without this.
err = cudaGetLastError();
if (err != cudaSuccess)
printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
printf("Kernel execution failed: %s\n", cudaGetErrorString(err));
printf("===========================");
//copy back to host
for(i=0; i<N; i++)
{
cudaMemcpy(A[i], h_temp[i], N*sizeof(double), cudaMemcpyDeviceToHost);
}
showxx(A);
// Free device memory (leaked in the original).
for(i=0; i<N; i++)
cudaFree(h_temp[i]);
cudaFree(d_2Darray);
// Free host memory.
for (i = 0; i < N; i++){
free(A[i]);
}
free(A);
return 0;
}
Try setting values to some arbitrary number greater than 8 in your kernel and then print the results.
Or there could be some problem with your cudaMemcpy calls failing. Do an error check.
The problem is with your kernel dimensions.
I just commented out “A[x][y] = xxx;” and added “A[0][0] = 5.;” in the kernel and it just works fine.
The launch fails with A[x][y]. Because blockIdx.x*blockDim.x + threadIdx.x can result in values up to 3*4 + 3 = 15 – which is well past the allocated row pointers… Hence the invalid pointer causes the kernel to abort… Simple…
Thats why I have been asking from morning to pass 2 dimensional block.
Always check bounds before accessing…
Like say:
if ((x < 8) && (y < 8))
A[x][y] = …
You can use:
"cudaError_t err;
…
…
adda<<<…>>>(…);
err = cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
if (err != cudaSuccess)
printf(“Kernel launch failed…\n”);
"
Technically the error returned could be from some other GPU operation before the kernel launch too.
Read the CUDA manual
Glad you got it working…
But please be aware that having 2D arrays on the GPU costs one extra pointer fetch, which can be REALLY costly if done repeatedly… So if you are going to use this inside loops, you may want to think twice.
No performance-sensitive code ever uses 2D arrays this way…
You will be fine with a single-dimensional array to represent the 2D array.
GPUs have very heavy FLOPS. So it is economical to spend some FLOPs computing the address within the 1D array.
Memory accesses are costly… each can cost you several hundred cycles…
There are other aspects, like coalesced memory access and shared memory bank conflicts, which play a role in performance. You can learn more from the manual…
Programming is very simple. You can experiment easily with the computer… and offers a much more controlled environment for experimentation… unlike physics which can be much tougher.
Hi! I was following your post to allocate a 2D memory on GPU and did something like you have mentioned. But I am not sure what I did was correct or not and moreover I am still not very much clear about the freeing the memory occupied by the 2D array from GPU. I will post the code that I have written below but it would be very much helpful if you could post a small sample code for the whole procedure. Like allocating 2D array,copying to device,kernel function operation,copying back the 2D array to host and freeing the memory in a single program then it would be very much helpful. Anyway I will post my code below that I am trying to write :
#include "all_headers.h"
/* Add 10 to element [tidx][tidy] of the device 2D array.
 * Fixed index math: the original computed
 *     blockDim.x + blockIdx.x*threadIdx.x
 * which is not a thread index at all.  The canonical per-dimension form is
 *     threadIdx + blockIdx * blockDim.
 * Assumes the launch supplies exactly one thread per element (as in
 * test_func_1: a single env_end x env_end block), so indices stay in range. */
__global__ void test_kernel(int **dev_env_t)
{
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int tidy = threadIdx.y + blockIdx.y * blockDim.y;
dev_env_t[tidx][tidy] = dev_env_t[tidx][tidy] +10;
}
/* Allocate an env_end x env_end int matrix on host and device, zero it,
 * run test_kernel (+10 to every element), copy the result back, and free
 * everything on both host and device. */
void test_func_1(void)
{
int i, k;
int **env_t;        /* host matrix: env_end rows of env_end ints */
int **dev_env_t;    /* device array of env_end device row pointers */
int env_end = 30;
int *temp[30];      /* host copy of the device row pointers */
cudaError_t err;
/* Host allocation: one pointer table plus ONE contiguous data block.
 * (The original malloc'd env_end*env_end ints PER row and then overwrote
 * rows 1..env_end-1 with pointers into row 0's block — leaking those
 * blocks and making the later free(env_t[i]) calls undefined behavior.) */
env_t = (int **) malloc(env_end * sizeof *env_t);
env_t[0] = (int *) malloc(env_end * env_end * sizeof *env_t[0]);
for (k = 1; k < env_end; ++k)
env_t[k] = env_t[k - 1] + env_end;
memset(env_t[0], 0, env_end * env_end * sizeof **env_t);
/* Device allocation: the pointer table holds POINTERS, so it must be
 * sized with sizeof(int *) — the original used sizeof(int), which
 * under-allocates on 64-bit platforms. */
cudaMalloc((void **)&dev_env_t, env_end * sizeof(int *));
for (i = 0; i < env_end; i++)
cudaMalloc((void **)&temp[i], env_end * sizeof(int));
cudaMemcpy(dev_env_t, temp, env_end * sizeof(int *), cudaMemcpyHostToDevice);
/* Copy host rows into the device rows. */
for (i = 0; i < env_end; i++)
cudaMemcpy(temp[i], env_t[i], env_end * sizeof(int), cudaMemcpyHostToDevice);
dim3 gridDim(1, 1);
dim3 blockDim(env_end, env_end, 1); /* one thread per element */
test_kernel<<<gridDim, blockDim>>>(dev_env_t);
err = cudaGetLastError();
if (err != cudaSuccess)
printf("test_kernel launch failed: %s\n", cudaGetErrorString(err));
/* Copy results back to the host. */
for (i = 0; i < env_end; i++)
cudaMemcpy(env_t[i], temp[i], env_end * sizeof(int), cudaMemcpyDeviceToHost);
/* Free device memory (never freed in the original). */
for (i = 0; i < env_end; i++)
cudaFree(temp[i]);
cudaFree(dev_env_t);
/* Free host memory: row 0 owns the single data block; then the table. */
free(env_t[0]);
free(env_t);
}
Notice : That I haven’t freed the 2D memory of the GPU as it is still not very much clear to me.