I am trying to implement an AI model in C++ and OpenCL.
This is the problematic part of the C++ (host) code:
cl_mem inply = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 260, &inputlayer, &err); //輸入層
cl_mem in2fw = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 260 * 128, &input_and_previousoutput_to_f_weights, &err); //輸入到f層權重
cl_mem fb = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 128, &f_bias, &err); //f層偏值
cl_mem fl = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 128, &flayer, &err); //f層
unsigned int input_num = 260;
cl_mem i_num = clCreateBuffer(context_main, CL_MEM_USE_HOST_PTR, sizeof(unsigned int), &input_num, &err); //(全連結層)輸入量
int actfn = 0;
cl_mem act = clCreateBuffer(context_main, CL_MEM_USE_HOST_PTR, sizeof(int), &actfn, &err); //激活函數代碼
//建立主設備內核(處理全連結)
cl_kernel mainkernel_alk = clCreateKernel(prog_main, "alllinklayer", &err);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: Creating Kernel from program.(clCreateKernel),rank0_main\n");
printf("建立全連結層正向傳播內核\n");
system("pause");
}
//分配執行函數記憶體
err = clSetKernelArg(mainkernel_alk, 0, sizeof(cl_mem), (void*)&inply);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("0:inply\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 1, sizeof(cl_mem), (void*)&fl);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("1:fl\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 2, sizeof(cl_mem), (void*)&in2fw);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("2:in2fw\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 3, sizeof(cl_mem), (void*)&fb);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("3:fb\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 4, sizeof(cl_mem), (void*)&i_num);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("4:i_num\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 5, sizeof(cl_mem), (void*)&act);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("5:act\n");
system("pause");
}
err = clSetKernelArg(mainkernel_alk, 6, sizeof(cl_mem), (void*)&nullbuf);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("6:nullbuf\n");
system("pause");
}
size_t globalThreads = 128; //設定每個維度上work_items總數量
size_t localThreads = 1; //設定每個工作組中work_items的數量,對應於local_size
//設備執行 (輸入到f層)
err = clEnqueueNDRangeKernel(commandQueue_main, mainkernel_alk, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: Enqueueing kernel\n");
cout << "輸入到f層全連結" << "\n";
system("pause");
}
err = clFinish(commandQueue_main);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: Finish command queue\n");
printf("測試節點\n");
system("pause");
}
//建立內核(處理各個相乘)
cl_kernel mainkernel_im = clCreateKernel(prog_main, "individuallymultiply", &err);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: Creating Kernel from program.(clCreateKernel),rank0_main\n");
printf("建立各自相乘正向傳播內核\n");
system("pause");
}
err = clSetKernelArg(mainkernel_im, 0, sizeof(cl_mem), (void*)&fl);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("0:fl\n");
system("pause");
}
err = clSetKernelArg(mainkernel_im, 1, sizeof(cl_mem), (void*)&stl);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("1:stl\n");
system("pause");
}
err = clSetKernelArg(mainkernel_im, 2, sizeof(cl_mem), (void*)&im0_gpu);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: clSetKernelArg\n");
printf("2:im0_gpu\n");
system("pause");
}
//執行(f層到狀態層)
err = clEnqueueNDRangeKernel(commandQueue_main, mainkernel_im, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
cout << "\n" << err << "\n";
printf("Error: Enqueueing kernel\n");
cout << "f層輸出與狀態層相乘" << "\n";
system("pause");
}
This is the problematic part of the GPU (kernel) code:
// Fully connected layer, forward pass. Launch one work-item per output
// (next-layer) neuron: work-item `id` computes
//   outlayer[id] = act( outlybias[id] + sum_i inlayer[i] * weight[i * gs + id] )
// where gs is the global size, i.e. the weights are indexed [input][output].
//
// inlayer      : input activations, input_num[0] elements are read
// outlayer     : output activations, one element per work-item
// weight       : weights, input_num[0] * gs elements are read
// outlybias    : output-layer bias, one element per work-item
// input_num    : one-element buffer holding the number of inputs
// actfunction  : one-element buffer holding the activation code
//                (0 = relu, 1 = tanh, 2 = softmax; other values = none)
// elsevariable : scratch buffer, written only in the softmax branch
__kernel void alllinklayer
(
    __global double* inlayer,
    __global double* outlayer,
    __global double* weight,
    __global double* outlybias,
    __global unsigned int* input_num,
    __global int* actfunction,
    __global double* elsevariable
)
{
    int id = get_global_id(0);
    int gs = get_global_size(0);

    // input_num and actfunction are POINTERS to one-element buffers, so their
    // values must be loaded. Comparing the pointers themselves (i < input_num,
    // actfunction == 0) uses the buffer address: the loop then runs far past
    // the real input count and reads out of bounds, which crashes the kernel.
    const int n_in = (int)input_num[0];
    const int act_code = actfunction[0];

    // Accumulate in a private variable; global memory is written once.
    double acc = outlybias[id];
    for (int i = 0; i < n_in; i++)
    {
        acc = acc + (inlayer[i] * weight[(i * gs) + id]);
        if (id == 2)
        {
            // Debug trace of the loop counter for work-item 2; safe to remove.
            printf("%d\n", i);
        }
    }
    outlayer[id] = acc;

    if (act_code == 0) // relu
    {
        if (acc < 0)
        {
            outlayer[id] = 0;
        }
    }
    else if (act_code == 1) // tanh
    {
        outlayer[id] = tanh(acc);
    }
    else if (act_code == 2) // softmax
    {
        // NOTE(review): this branch reads outlayer[i] of every other work-item
        // with no synchronization, so it can see values that are not written
        // yet or are already normalized. OpenCL has no barrier across
        // work-groups; a correct softmax needs a separate kernel launch (or a
        // single work-group plus a barrier). Logic kept as in the original.
        int allnum = get_global_size(0);
        elsevariable[id] = outlayer[id];
        double tol = 0;
        for (int i = 0; i < allnum; i++)
        {
            tol = tol + exp(outlayer[i]);
        }
        outlayer[id] = exp(outlayer[id]) / tol;
    }
}
When I run the program, the output is:
-36
Error: Finish command queue
測試節點
So I think the problem is in the mainkernel_alk kernel.
Therefore, I added some printf calls to the GPU code to find out what happens inside the kernel.
I found that the kernel dies after this for loop has run 16 times.
I don't know how to solve this…
The GPU I am using is an NVIDIA GeForce MX350.