cudaMalloc initialization error

Hi,

excuse me for my poor English… :">

I’ve a problem with a simple CUDA program, which computes the median of 15 images.

The initialization phase returns cudaSuccess, but when I try to allocate some memory on the device, the system returns an initialization error.

Can someone help me?

Thank you!

In my code, I call InitCUDA and CUDA_Background.

This is my code:

/**
 * Selects the first usable CUDA device and makes it the current device.
 *
 * Walks every device reported by cudaGetDeviceCount(), picks the first one
 * whose properties can be read and whose major compute capability is >= 1,
 * calls cudaSetDevice() on it and then forces the runtime to create its
 * context, so that an initialization failure is reported here instead of at
 * the caller's first cudaMalloc().
 *
 * @return true when a device was selected and the runtime initialized on it,
 *         false otherwise (a diagnostic has been printed).
 */
bool InitCUDA(void)
{
	int count = 0;
	int i = 0;
	cudaError_t err = cudaGetDeviceCount(&count);

	if (err != cudaSuccess) {
		fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
		return false;
	}

	if (count == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	printf("Device Count: %d\n", count);

	for (i = 0; i < count; i++) {
		cudaDeviceProp prop;

		if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
			continue;

		// Older toolkits report a pseudo "emulation" device with compute
		// capability 9999.9999 when no real CUDA hardware/driver is present;
		// it passes a plain "major >= 1" test but cannot allocate memory.
		if (prop.major == 9999 && prop.minor == 9999)
			continue;

		if (prop.major >= 1) {
			// The original passed &prop to "%s"; print the actual fields.
			printf("Prop Major: %d (%s)\n", prop.major, prop.name);
			break;
		}
	}

	if (i == count) {
		printf("There is no device supporting CUDA.\n");
		return false;
	}

	err = cudaSetDevice(i);
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
		return false;
	}

	// The runtime creates its context lazily; cudaFree(0) is a no-op free
	// that forces that creation now so a broken driver/device is detected
	// here rather than at the first real allocation.
	err = cudaFree(0);
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA initialization failed on device %d: %s\n", i, cudaGetErrorString(err));
		return false;
	}

	printf("CUDA initialized.\n");
	return true;
}

The kernel function:

// Compare-exchange step: after it runs, `a` holds the larger of the two
// values and `b` the smaller. It relies on a variable named `tmp` being
// declared in the enclosing scope, and it expands to three statements, so it
// must not be used as the unbraced body of an if/else/loop.
#define BE_ORDER(a,b) tmp = a > b ? a : b; b = a < b ? a : b; a = tmp;

// Per-pixel selection kernel over 15 source buffers.
//
// Each thread handles one byte index i = threadIdx.x + blockIdx.x * LINE_SIZE
// (LINE_SIZE is defined outside this block), loads src[0][i] .. src[14][i],
// runs the fixed sequence of BE_ORDER steps below and stores v8 in dst[i].
// According to the accompanying description this is meant to be the median of
// the 15 samples; the network itself was not verified in this review.
//
// Preconditions visible from the code:
//  - `src` itself is dereferenced in device code, so it and every src[k]
//    must be addresses the device can read;
//  - there is no bounds check on i, so every index produced by the launch
//    configuration must be valid in all 15 sources and in dst;
//  - the index math uses LINE_SIZE rather than blockDim.x — presumably the
//    kernel is expected to be launched with LINE_SIZE threads per block.
__global__ void CUDA_Median(unsigned char* src[15], unsigned char* dst)

{

	//Block index

	int bx = blockIdx.x;

	// Thread index

	int tx = threadIdx.x;

	

	//__device__ unsigned char list[15];

	// Flat byte index handled by this thread.
	int i = tx + bx * LINE_SIZE;

	// Scratch variable required by BE_ORDER.
	unsigned char tmp;

	

	// Load the sample at index i from each of the 15 sources.
	unsigned char v1 =  (src[0][i]);

	unsigned char v2 =  (src[1][i]);

	unsigned char v3 =  (src[2][i]);

	unsigned char v4 =  (src[3][i]);

	unsigned char v5 =  (src[4][i]);

	unsigned char v6 =  (src[5][i]);

	unsigned char v7 =  (src[6][i]);

	unsigned char v8 =  (src[7][i]);

	unsigned char v9 =  (src[8][i]);

	unsigned char v10 = (src[9][i]);

	unsigned char v11 = (src[10][i]); 

	unsigned char v12 = (src[11][i]);

	unsigned char v13 = (src[12][i]);

	unsigned char v14 = (src[13][i]);

	unsigned char v15 = (src[14][i]);

	// Fixed compare-exchange sequence; the order of the steps matters.
	// The "drop vN" comments are the author's markers for values that are
	// no longer considered as candidates.
	BE_ORDER(v1,  v2)

	BE_ORDER(v3,  v4)

	BE_ORDER(v5,  v6)

	BE_ORDER(v7,  v8)

	BE_ORDER(v9, v10)

	BE_ORDER(v11,v12)

	BE_ORDER(v13,v14)

	BE_ORDER(v1,  v3)

	BE_ORDER(v5,  v7)

	BE_ORDER(v9, v11)

	BE_ORDER(v13,v15)

	BE_ORDER(v1, v5)

	BE_ORDER(v9,v13)

	BE_ORDER(v1, v9)

	// drop v1

	BE_ORDER(v5, v9)

	// drop v5

	BE_ORDER(v2,  v4)

	BE_ORDER(v6,  v8)

	BE_ORDER(v10,v12)

	BE_ORDER(v14,v15)

	BE_ORDER(v4,  v8)

	BE_ORDER(v12,v15)

	BE_ORDER(v8, v15)

	// drop v15

	BE_ORDER(v4, v12)

	// drop v12

	BE_ORDER(v2,  v3)

	BE_ORDER(v4,  v6)

	BE_ORDER(v7,  v8)

	BE_ORDER(v9, v10)

	BE_ORDER(v2,  v4)

	BE_ORDER(v7,  v9)

	BE_ORDER(v2,  v7)

	// drop v2

	BE_ORDER(v4,  v7)

	BE_ORDER(v3,  v4)

	// drop v3  // from here on, for the low values, v4 takes the place of v3...

	BE_ORDER(v4,  v6)

	BE_ORDER(v8, v10)

				

	BE_ORDER(v6, v10)

	// drop v10
	// NOTE(review): v10 is marked as dropped here, yet later BE_ORDER steps
	// below still read and write v10 — confirm this is intended.

	BE_ORDER(v6,  v8)

	BE_ORDER(v8,  v9)

	// drop v9

	BE_ORDER(v4,  v6)

	BE_ORDER(v7,  v8)

	BE_ORDER(v10,v11)

	BE_ORDER(v4,  v7)

	BE_ORDER(v10,v13)

	BE_ORDER(v4, v10)

	// drop v4

	BE_ORDER(v7, v10)

	// drop v7

	BE_ORDER(v6,  v8)

	BE_ORDER(v11,v13)

	BE_ORDER(v8, v13)

	// drop v13

	BE_ORDER(v8, v11)

	// drop v11

	BE_ORDER(v6,  v8)

	BE_ORDER(v6, v10)

	// drop v6

	BE_ORDER(v8, v10)

	// v8 is the value selected by the sequence above.
	dst[i] = v8;

}

#undef BE_ORDER

The function is called in the program

// Prints "<what> <CUDA error string>" when err is a failure.
// Returns true when err == cudaSuccess, false otherwise.
static bool medianCheck(cudaError_t err, const char* what)
{
	if (err == cudaSuccess)
		return true;

	printf("%s %s\n", what, cudaGetErrorString(err));
	return false;
}

/**
 * Runs CUDA_Median over 15 host images and copies the result back to the host.
 *
 * @param src_list  15 host buffers of `size` bytes each (the input images).
 * @param dst       host buffer of `size` bytes that receives the result.
 * @param size      number of bytes per image.
 *
 * Steps: upload every image to its own device buffer, upload the table of
 * those 15 device pointers to device memory (the kernel dereferences the
 * table itself, so a host-side array cannot be passed to it), launch the
 * kernel with LINE_SIZE threads per block, copy the result back, and free
 * everything. Any CUDA failure is printed and aborts the remaining steps;
 * resources acquired so far are still released.
 *
 * Return codes are checked directly rather than through CUDA_SAFE_CALL, whose
 * expansion is not visible here and which was being used as an expression.
 *
 * NOTE: the grid is size / LINE_SIZE blocks (rounded down) and the kernel has
 * no bounds check, so `size` is expected to be a multiple of LINE_SIZE; any
 * remainder bytes of dst are not computed by the kernel.
 */
void CUDA_Background(unsigned char* src_list[15], unsigned char* dst, int size)
{
	enum { IMAGE_COUNT = 15 };

	unsigned char*  src[IMAGE_COUNT] = { 0 };  // host-side table of device buffers
	unsigned char** srcD = 0;                  // device-side copy of that table
	unsigned char*  dstD = 0;                  // device result buffer
	cudaEvent_t start, stop;
	bool haveStart = false;
	bool haveStop = false;
	bool ok = true;
	int i;

	if (src_list == 0 || dst == 0 || size <= 0)
	{
		printf("Median invalid arguments\n");
		return;
	}

	// Upload each source image; only copy into a buffer that was allocated.
	for (i = 0; ok && i < IMAGE_COUNT; i++)
	{
		ok = medianCheck(cudaMalloc((void**)&src[i], size), "Median Malloc 1")
		  && medianCheck(cudaMemcpy(src[i], src_list[i], size, cudaMemcpyHostToDevice),
		                 "Median Memcpy 1");
	}

	// The kernel indexes src[k] on the device, so the pointer table itself
	// has to live in device memory.
	ok = ok
	  && medianCheck(cudaMalloc((void**)&srcD, sizeof(src)), "Median Malloc table")
	  && medianCheck(cudaMemcpy(srcD, src, sizeof(src), cudaMemcpyHostToDevice),
	                 "Median Memcpy table");

	// Allocate destination in device memory.
	ok = ok && medianCheck(cudaMalloc((void**)&dstD, size), "Median Malloc 2");

	// Events must be created before they can be recorded.
	if (ok)
	{
		haveStart = medianCheck(cudaEventCreate(&start), "Median EventCreate");
		ok = haveStart;
	}
	if (ok)
	{
		haveStop = medianCheck(cudaEventCreate(&stop), "Median EventCreate");
		ok = haveStop;
	}

	if (ok)
	{
		dim3 dimBlock(LINE_SIZE);
		dim3 dimGrid(size / dimBlock.x);

		cudaEventRecord(start, 0);

		CUDA_Median<<<dimGrid, dimBlock>>>(srcD, dstD);

		// A launch does not return a status; configuration errors (e.g. an
		// empty grid) surface through cudaGetLastError(), execution errors
		// through the blocking copy that follows.
		ok = medianCheck(cudaGetLastError(), "Median Launch")
		  && medianCheck(cudaMemcpy(dst, dstD, size, cudaMemcpyDeviceToHost),
		                 "Median Memcpy 2");

		if (ok)
		{
			// As in the original, the timed interval covers the kernel and
			// the device-to-host copy.
			float elapsedTime = 0;

			cudaEventRecord(stop, 0);
			cudaEventSynchronize(stop);
			medianCheck(cudaEventElapsedTime(&elapsedTime, start, stop), "Median ElapsedTime");
			(void)elapsedTime;  // measured but, as before, not reported anywhere
		}
	}

	// Free device memory and events (only what was actually acquired).
	for (i = 0; i < IMAGE_COUNT; i++)
	{
		if (src[i] != 0)
			cudaFree(src[i]);
	}
	if (srcD != 0)
		cudaFree(srcD);
	if (dstD != 0)
		cudaFree(dstD);
	if (haveStart)
		cudaEventDestroy(start);
	if (haveStop)
		cudaEventDestroy(stop);
}

Your English is good.

Which allocation fails?

The first allocation fails with an initialization error…

How large are the 15 images? You may not have enough device memory. How did you get the initialization error, debug mode?

An image is 176 x 144 pixels, so the size is 25344 bytes.

I receive the error with this code line:

cudaError_t err = cudaMalloc( … );

printf("%s", cudaGetErrorString(err));

How can I know the size of the memory device?

Use the deviceQuery example from the SDK.

Obviously you'll have more than 25 KB of device memory.

Do the examples in the SDK work? Have you been able to allocate memory in any other program?

I don't know why it would fail to work with array[]s, but if I were you I'd try to allocate one big array of, say, 15*size bytes and then handle the stride with an offset into the big array.

I tried my program on other devices, and it runs without errors.
I think that the xfx devices don’t support cuda… I tried with a xfx 8600gt and with a xfx 9500 gt.

Now I have a problem with the results… The program runs in debug mode, but in release mode I don't get any result…
please, if you can, read my other post in this forum…