Hi,
excuse me for my poor English… :">
I’ve a problem with a simple CUDA program, which computes the median of 15 images.
The initialization phttp://forums.nvidia.com/index.php?act=post&do=new_post&f=71#hase returns cudaSuccess, but, when I try to alloc some memory on the device, the system returns an initialization error.
Can someone hel me?
Thank you!
In my code, I call InitCUDA and CUDA_Background.
This is my code:
bool InitCUDA(void)
{
int count = 0;
int i = 0;
cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
printf("Device Count: %d\n", count);
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess)
{
if(prop.major >= 1)
{
printf("Prop Major: %s\n", &prop);
break;
}
}
}
if(i == count) {
printf("There is no device supporting CUDA.\n");
return false;
}
cudaSetDevice(i);
printf("CUDA initialized.\n");
return true;
}
The kernel function:
#define BE_ORDER(a,b) tmp = a > b ? a : b; b = a < b ? a : b; a = tmp;
__global__ void CUDA_Median(unsigned char* src[15], unsigned char* dst)
{
//Block index
int bx = blockIdx.x;
// Thread index
int tx = threadIdx.x;
//__device__ unsigned char list[15];
int i = tx + bx * LINE_SIZE;
unsigned char tmp;
unsigned char v1 = (src[0][i]);
unsigned char v2 = (src[1][i]);
unsigned char v3 = (src[2][i]);
unsigned char v4 = (src[3][i]);
unsigned char v5 = (src[4][i]);
unsigned char v6 = (src[5][i]);
unsigned char v7 = (src[6][i]);
unsigned char v8 = (src[7][i]);
unsigned char v9 = (src[8][i]);
unsigned char v10 = (src[9][i]);
unsigned char v11 = (src[10][i]);
unsigned char v12 = (src[11][i]);
unsigned char v13 = (src[12][i]);
unsigned char v14 = (src[13][i]);
unsigned char v15 = (src[14][i]);
BE_ORDER(v1, v2)
BE_ORDER(v3, v4)
BE_ORDER(v5, v6)
BE_ORDER(v7, v8)
BE_ORDER(v9, v10)
BE_ORDER(v11,v12)
BE_ORDER(v13,v14)
BE_ORDER(v1, v3)
BE_ORDER(v5, v7)
BE_ORDER(v9, v11)
BE_ORDER(v13,v15)
BE_ORDER(v1, v5)
BE_ORDER(v9,v13)
BE_ORDER(v1, v9)
// via v1
BE_ORDER(v5, v9)
//via v5
BE_ORDER(v2, v4)
BE_ORDER(v6, v8)
BE_ORDER(v10,v12)
BE_ORDER(v14,v15)
BE_ORDER(v4, v8)
BE_ORDER(v12,v15)
BE_ORDER(v8, v15)
//via v15
BE_ORDER(v4, v12)
//via v12
BE_ORDER(v2, v3)
BE_ORDER(v4, v6)
BE_ORDER(v7, v8)
BE_ORDER(v9, v10)
BE_ORDER(v2, v4)
BE_ORDER(v7, v9)
BE_ORDER(v2, v7)
//via v2
BE_ORDER(v4, v7)
BE_ORDER(v3, v4)
//via v3 //adesso per i bassi v3 è sostituito da v4...
BE_ORDER(v4, v6)
BE_ORDER(v8, v10)
BE_ORDER(v6, v10)
//via v10
BE_ORDER(v6, v8)
BE_ORDER(v8, v9)
//via v9
BE_ORDER(v4, v6)
BE_ORDER(v7, v8)
BE_ORDER(v10,v11)
BE_ORDER(v4, v7)
BE_ORDER(v10,v13)
BE_ORDER(v4, v10)
//via v4
BE_ORDER(v7, v10)
//via v7
BE_ORDER(v6, v8)
BE_ORDER(v11,v13)
BE_ORDER(v8, v13)
//via v13
BE_ORDER(v8, v11)
//via v11
BE_ORDER(v6, v8)
BE_ORDER(v6, v10)
//via v6
BE_ORDER(v8, v10)
dst[i] = v8;
}
#undef BE_ORDER
The function is called in the program
void CUDA_Background(unsigned char* src_list[15], unsigned char* dst, int size)
{
cudaEvent_t start, stop;
//load src1 and src2 to the device
unsigned char* src[15];
int i;
for (i = 0; i < 15; i++)
{
cudaError_t s = CUDA_SAFE_CALL(cudaMalloc((void**)&src[i], size));
cudaMemcpy(src[i], src_list[i], size, cudaMemcpyHostToDevice);
if (s != cudaSuccess)
{
printf("Median Malloc 1 %s\n", cudaGetErrorString(s));
}
}
// Allocate destination to the device memory
unsigned char* dstD;
cudaError_t s = cudaMalloc((void**)&dstD, size);
if (s != cudaSuccess)
{
printf("Median Malloc 2 %s\n", cudaGetErrorString(s));
}
dim3 dimBlock(LINE_SIZE);
dim3 dimGrid(size / dimBlock.x);
cudaEventRecord(start, 0);
CUDA_Median<<<dimGrid, dimBlock>>>(src, dstD);
cudaMemcpy(dst, dstD, size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
// Free device memory
for (i = 0; i < 15; i++)
{
cudaFree(src[i]);
}
cudaFree(dstD);
cudaEventSynchronize(stop);
float elapsedTime = 0;
cudaError_t el = cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}