CudaFree 2D-Array

Hello everyone!

The error “argument of type “float” is incompatible with parameter of type “void *”” appears on the line where I call cutilSafeCall( cudaFree(someHost[i]) ). I cannot find the solution to my problem. I did it the way described in Romant's topic.

#include<stdio.h>

#include<cuda.h>

#include<stdlib.h>

#include<cutil_inline.h>

#define N 256

// Placeholder kernel: each block sums the global indices i < N of its threads
// into a per-block shared accumulator. Nothing is written back to global memory.
//
// Launch layout: 1-D grid of 1-D blocks, any block size.
// Shared memory: one statically allocated int per block.
// Mx, My: device pointer tables; accepted but not dereferenced here.
// Requires compute capability 1.2+ (atomicAdd on shared memory).
__global__ void dumbKernel(float **Mx, float **My) {

	__shared__ int pp;

	int i = threadIdx.x + blockIdx.x * blockDim.x;

	// Shared memory starts out uninitialized, so one thread zeroes the
	// accumulator and the barrier publishes it before anyone adds to it.
	if(threadIdx.x == 0) {
		pp = 0;
	}
	__syncthreads();

	// All threads update the same word; a plain "pp += i" would be a data race.
	if(i < N) {
		atomicAdd(&pp, i);
	}

	// Kept outside the branch so every thread of the block reaches the barrier.
	__syncthreads();

}

// Entry point of the program as originally posted: builds two device tables of
// N row pointers (d_Mx, d_My), launches dumbKernel on them, then releases the
// device rows, the device tables and the host rows.
// NOTE(review): the code is kept exactly as posted; the notes below mark the
// spots that the reported compiler error and the double free come from.
int main(int argc, char** argv) {

	

	float **h_Mx;

	float **h_My;

	float **d_Mx;

	float **d_My;

	// NOTE(review): declared as float*, so someHost[i] is a float and not a
	// pointer; it is also never allocated before being indexed below.
	float *someHost;

	// Size in bytes of one table of N row pointers.
	size_t size_M = N * sizeof(float *);

	

	

	//DEVICE-MEMORY

	// Device tables that will hold the N row pointers of each matrix.
	cutilSafeCall( cudaMalloc((void **) &d_Mx, size_M) );

	cutilSafeCall( cudaMalloc((void **) &d_My, size_M) );

	

	// One device row of N floats per iteration. &someHost[i] is a float* that
	// is cast to void**, so the device pointer is stored through the address
	// of a float element.
	for(int i = 0; i < N; i++) {

		cutilSafeCall( cudaMalloc((void **)&someHost[i], N * sizeof(float)) );

	}

	// Both tables are filled from the same someHost contents, so d_Mx and d_My
	// end up holding the same row pointers.
	cutilSafeCall( cudaMemcpy(d_Mx, someHost, size_M, cudaMemcpyHostToDevice) );

	cutilSafeCall( cudaMemcpy(d_My, someHost, size_M, cudaMemcpyHostToDevice) );

	

	//HOST-MEMORY

	// Host tables of N row pointers, followed by one row of N floats each.
	h_Mx = (float **) malloc(size_M);

	h_My = (float **) malloc(size_M);

	

	for(int i = 0; i < N; i++) {

		h_Mx[i] = (float *) malloc(N * sizeof(float));

		h_My[i] = (float *) malloc(N * sizeof(float));

	}

	

	// Single block of 256 threads.
	dumbKernel<<<1, 256>>>(d_Mx, d_My);

	

	// Read the row pointers of d_Mx back so the rows can be freed from the host.
	cutilSafeCall( cudaMemcpy(someHost, d_Mx, size_M, cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < N; i++) {

		// This is the line the post reports the compiler error on:
		// someHost[i] has type float, cudaFree expects void*.
		cutilSafeCall( cudaFree(someHost[i]) );

	}

	

	cutilSafeCall( cudaFree(d_Mx) );

	

	// d_My was filled from the same someHost data as d_Mx, so the loop below
	// frees the same row pointers a second time.
	cutilSafeCall( cudaMemcpy(someHost, d_My, size_M, cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < N; i++) {

		cutilSafeCall( cudaFree(someHost[i]) );

	}

	

	cutilSafeCall( cudaFree(d_My) );

	// Releases the host rows; the h_Mx and h_My tables themselves are not freed.
	for(int i = 0; i < N; i++) {

		free(h_Mx[i]);

		free(h_My[i]);

	}

	return 0;

}

Thank you for looking through my code!

Regards,

A.

You are double-freeing…

The 2D arrays are sharing the same row-pointers - and you are freeing them twice

Thanks for your fast response.

Do you mean that cudaFree(d_Mx) is useless? I have now created a someHostX array for the matrix Mx and a someHostY array for the matrix My. However, I am still getting the same error described above.

cutilSafeCall( cudaMemcpy(d_Mx, someHost, size_M, cudaMemcpyHostToDevice) );

	cutilSafeCall( cudaMemcpy(d_My, someHost, size_M, cudaMemcpyHostToDevice) );

When you are doing the lines above, d_Mx and d_My have the same data. i.e. the row pointers are the same.

btw, I see that you do NOT “malloc” for “someHost” at all… That is yet another problem.

cutilSafeCall( cudaMemcpy(someHost, d_Mx, size_M, cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < N; i++) {

		cutilSafeCall( cudaFree(someHost[i]) );

	}

And so, when you are doing the code above, you have already freed the row pointers belonging to “d_My” as well.

And Hence the following code is a double-free…

cutilSafeCall( cudaMemcpy(someHost, d_My, size_M, cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < N; i++) {

		cutilSafeCall( cudaFree(someHost[i]) );

	}

Did that help?

btw,

Your original post was about a compiler error, I guess…

Just like how you cast it for “cudaMalloc”, you need to cast it for “cudaFree” as well…

Like this: cudaFree((void *)someHost[i]);

Secondly,
I don't know why “someHost” is declared as a “float *”. It has nothing to do with floats; it just holds some pointers. Anyway, as long as you cast, it should be fine.

I am understanding your point. I have integrated your thoughts in my code.

/*

#############################################

#DYNAMICALLY ALLOCATING MEMORY FOR 2D-ARRAYS#

#############################################

*/

//INCLUDES

#include<stdio.h>

#include<cuda.h>

#include<stdlib.h>

#include<cutil_inline.h>

//MATRIX DIMENSIONS

#define M 200

#define N 200

//KERNEL

// Placeholder kernel: each block sums the global indices i < N of its threads
// into pp[0] in shared memory. Nothing is written back to global memory.
//
// Launch layout: 1-D grid of 1-D blocks, any block size.
// Shared memory: 256 statically allocated ints per block (only pp[0] is used).
// Mx, My: device pointer tables; accepted but not dereferenced here.
// Requires compute capability 1.2+ (atomicAdd on shared memory).
__global__ void dumbKernel(float **Mx, float **My) {

	__shared__ int pp[256];

	int i = threadIdx.x + blockIdx.x * blockDim.x;

	// Shared memory starts out uninitialized, so one thread zeroes the
	// accumulator and the barrier publishes it before anyone adds to it.
	if(threadIdx.x == 0) {
		pp[0] = 0;
	}
	__syncthreads();

	// All threads update the same word; a plain "pp[0] += i" would be a data race.
	if(i < N) {
		atomicAdd(&pp[0], i);
	}

	// Kept outside the branch so every thread of the block reaches the barrier.
	__syncthreads();

}

//MAIN-PROGRAM

// Entry point of the second posted revision: separate staging arrays
// (someHostX, someHostY) are used for the row pointers of d_Mx and d_My, the
// kernel is launched, and device and host memory are released afterwards.
// NOTE(review): the code is kept exactly as posted; the staging arrays are
// still arrays of float, which is where the reported compiler error comes from.
int main(int argc, char** argv) {

	

	//Host-Variables

	float **h_Mx;

	float **h_My;

	

	//Device-Variables

	float **d_Mx;

	float **d_My;

	//Dummy-Arrays

	// NOTE(review): declared as float*, so each element is a float, although
	// the elements are used below to hold device row pointers.
	float *someHostX;

	float *someHostY;

	

	// Room for M floats each (not M pointers).
	someHostX = (float *) malloc(M * sizeof(float));

	someHostY = (float *) malloc(M * sizeof(float));

	

	//DEVICE-MEMORY

	//Arrays of pointers to arrays

	cutilSafeCall( cudaMalloc((void **) &d_Mx, M * sizeof(float *)) );

	cutilSafeCall( cudaMalloc((void **) &d_My, M * sizeof(float *)) );

	

	//For each pointer, memory is allocated for an array

	//A host-side staging array is used because d_Mx is device memory, so d_Mx[i] cannot be written from host code

	// NOTE(review): this loop is bounded by N while the staging arrays and the
	// device tables have M entries (M and N are both defined as 200).
	for(int i = 0; i < N; i++) {

		cutilSafeCall( cudaMalloc((void **)&someHostX[i], N * sizeof(float)) );

		cutilSafeCall( cudaMalloc((void **)&someHostY[i], N * sizeof(float)) );

	}

	

	//Copying memory to GPU-Memoryspace

	cutilSafeCall( cudaMemcpy(d_Mx, someHostX, M * sizeof(float *), cudaMemcpyHostToDevice) );

	cutilSafeCall( cudaMemcpy(d_My, someHostY, M * sizeof(float *), cudaMemcpyHostToDevice) );

	

	//HOST-MEMORY

	//Arrays of pointers to arrays

	h_Mx = (float **) malloc(M * sizeof(float *));

	h_My = (float **) malloc(M * sizeof(float *));

	

	//For each pointer, memory is allocated for an array

	for(int i = 0; i < M; i++) {

		h_Mx[i] = (float *) malloc(N * sizeof(float));

		h_My[i] = (float *) malloc(N * sizeof(float));

	}

	

	//Launching the kernel: one block of 256 threads

	dumbKernel<<<1, 256>>>(d_Mx, d_My);

	

	// Read the row pointers of d_Mx back so the rows can be freed from the host.
	cutilSafeCall( cudaMemcpy(someHostX, d_Mx, M * sizeof(float *), cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < M; i++) {

		// someHostX[i] has type float while cudaFree expects void*; this is
		// the call the post reports the compiler error on.
		cutilSafeCall( cudaFree(someHostX[i]) );

	}

	

	cutilSafeCall( cudaFree(d_Mx) );

	

	// Same sequence for the rows of d_My.
	cutilSafeCall( cudaMemcpy(someHostY, d_My, M * sizeof(float *), cudaMemcpyDeviceToHost) );

	

	for(int i = 0; i < M; i++) {

		cutilSafeCall( cudaFree(someHostY[i]) );

	}

	

	cutilSafeCall( cudaFree(d_My) );

	//Free each pointers array

	// NOTE(review): bounded by N although h_Mx and h_My have M rows; someHostX
	// and someHostY are not freed anywhere in this function.
	for(int i = 0; i < N; i++) {

		free(h_Mx[i]);

		free(h_My[i]);

	}

	

	//Free array of pointers

	free(h_Mx);

	free(h_My);

	

	cudaThreadExit();

	return 0;

}

However, I am still getting the error argument of type “float” is incompatible with parameter of type “void *” when calling cudaFree(someHost[i]) :(

Regards.

The (void *) cast throws the error: invalid type conversion

The FOR loop above should run from 0 to M.

I will make your code work and post it,

Check this out.

I have changed “float *someHostX” to “float **someHostX”, and similarly for the Y part as well.

You cant cast a “float” to “void *” and viceversa…

Remember, I told you it was NOT natural…

Your representation was like this: you have an array of floats and treat it like an array of pointers to floats… Unnatural coding will always have side effects…

Making it “float **” should fix the problem

I cast the cudaFree thing…

I did not compile it because I don't have cutil.

Let me know how this one goes…

/*

#############################################

#DYNAMICALLY ALLOCATING MEMORY FOR 2D-ARRAYS#

#############################################

*/

//INCLUDES

#include<stdio.h>

#include<cuda.h>

#include<stdlib.h>

#include<cutil_inline.h>

//MATRIX DIMENSIONS

#define M 200

#define N 200

//KERNEL

// Placeholder kernel: each block sums the global indices i < N of its threads
// into pp[0] in shared memory. Nothing is written back to global memory.
//
// Launch layout: 1-D grid of 1-D blocks, any block size.
// Shared memory: 256 statically allocated ints per block (only pp[0] is used).
// Mx, My: device pointer tables; accepted but not dereferenced here.
// Requires compute capability 1.2+ (atomicAdd on shared memory).
__global__ void dumbKernel(float **Mx, float **My) {

	__shared__ int pp[256];

	int i = threadIdx.x + blockIdx.x * blockDim.x;

	// Shared memory starts out uninitialized, so one thread zeroes the
	// accumulator and the barrier publishes it before anyone adds to it.
	if(threadIdx.x == 0) {
		pp[0] = 0;
	}
	__syncthreads();

	// All threads update the same word; a plain "pp[0] += i" would be a data race.
	if(i < N) {
		atomicAdd(&pp[0], i);
	}

	// Kept outside the branch so every thread of the block reaches the barrier.
	__syncthreads();

}

//MAIN-PROGRAM

/*
 * Allocates two M x N float matrices in "array of row pointers" form, on the
 * host (h_Mx, h_My) and on the device (d_Mx, d_My), launches dumbKernel on the
 * device tables and releases everything again.
 *
 * The device tables cannot be indexed from host code, so the device row
 * pointers are first collected in host-side staging tables (someHostX,
 * someHostY) and then copied to the device in one cudaMemcpy per matrix.
 *
 * Returns 0 on success and EXIT_FAILURE if a host pointer table cannot be
 * allocated. CUDA API calls and the kernel launch are checked through
 * cutilSafeCall.
 */
int main(int argc, char** argv) {

	//Host-Variables
	float **h_Mx;
	float **h_My;

	//Device-Variables
	float **d_Mx;
	float **d_My;

	//Host-side staging tables holding the device row pointers
	float **someHostX;
	float **someHostY;

	//The tables hold pointers, so both the element size and the cast are
	//pointer-typed (a float* cannot be assigned to a float** in C++).
	someHostX = (float **) malloc(M * sizeof(float *));
	someHostY = (float **) malloc(M * sizeof(float *));
	h_Mx = (float **) malloc(M * sizeof(float *));
	h_My = (float **) malloc(M * sizeof(float *));

	if(someHostX == NULL || someHostY == NULL || h_Mx == NULL || h_My == NULL) {
		fprintf(stderr, "Host allocation of the pointer tables failed\n");
		//free(NULL) is a no-op, so whichever tables were obtained are released
		free(someHostX);
		free(someHostY);
		free(h_Mx);
		free(h_My);
		return EXIT_FAILURE;
	}

	//DEVICE-MEMORY
	//Arrays of pointers to arrays
	cutilSafeCall( cudaMalloc((void **) &d_Mx, M * sizeof(float *)) );
	cutilSafeCall( cudaMalloc((void **) &d_My, M * sizeof(float *)) );

	//For each of the M rows, a device array of N floats is allocated and its
	//address is recorded in the host staging table
	for(int i = 0; i < M; i++) {
		cutilSafeCall( cudaMalloc((void **)&someHostX[i], N * sizeof(float)) );
		cutilSafeCall( cudaMalloc((void **)&someHostY[i], N * sizeof(float)) );
	}

	//Copying the row-pointer tables to GPU memory
	cutilSafeCall( cudaMemcpy(d_Mx, someHostX, M * sizeof(float *), cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_My, someHostY, M * sizeof(float *), cudaMemcpyHostToDevice) );

	//HOST-MEMORY
	//For each of the M rows, a host array of N floats is allocated. The rows
	//are only allocated and freed in this program, and free(NULL) is a no-op,
	//so a failed row allocation is harmless here.
	for(int i = 0; i < M; i++) {
		h_Mx[i] = (float *) malloc(N * sizeof(float));
		h_My[i] = (float *) malloc(N * sizeof(float));
	}

	//Launching the kernel: one block of 256 threads
	dumbKernel<<<1, 256>>>(d_Mx, d_My);

	//A launch returns no status itself: pick up launch-configuration errors,
	//then wait for the kernel so execution errors surface before cleanup
	cutilSafeCall( cudaGetLastError() );
	cutilSafeCall( cudaThreadSynchronize() );

	//Free the device rows. The staging tables still hold the row pointers that
	//were uploaded, so they do not have to be copied back from the device.
	//The (void*) cast is optional: a float* converts to void* implicitly.
	for(int i = 0; i < M; i++) {
		cutilSafeCall( cudaFree((void*)someHostX[i]) );
		cutilSafeCall( cudaFree((void*)someHostY[i]) );
	}

	//Free the device arrays of pointers
	cutilSafeCall( cudaFree(d_Mx) );
	cutilSafeCall( cudaFree(d_My) );

	//Free each host row; the tables have M rows
	for(int i = 0; i < M; i++) {
		free(h_Mx[i]);
		free(h_My[i]);
	}

	//Free the host arrays of pointers, including the staging tables
	free(h_Mx);
	free(h_My);
	free(someHostX);
	free(someHostY);

	cudaThreadExit();

	return 0;

}

Working! Thank you very much for your efforts. I am not completely understanding why, but at least it works :)

Regards.

GOod…

You cannot cast a “float” to “void *” and vice-versa. That was the main problem.

btw, The code is straight forward,I believe.

Which part do you not understand?

btw,
It is better to have 2D arrays as one single huge 1D array and use “i*cols + j” to index it… You can save on global memory access…

The 2D arrays that you create will involve 2 global memory (GMEM) accesses for every element: one to fetch the row pointer and one to fetch the element itself. If you have them as one huge 1D array, then it will require only 1 GMEM access…

THat can make a big difference…

If you are still constrained, you can cache the “row” pointers in shared memory and reduce the latency…

It's up to you and your application now. Good luck! Bye,