Kernel died (solve by myself can delete)

I try to accomplish AI model by C++ and opencl.
This is C++ code problematic part:

cl_mem inply = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 260, &inputlayer, &err); //輸入層
	cl_mem in2fw = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 260 * 128, &input_and_previousoutput_to_f_weights, &err); //輸入到f層權重
	cl_mem fb = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 128, &f_bias, &err); //f層偏值
	cl_mem fl = clCreateBuffer(context_main, CL_MEM_COPY_HOST_PTR, sizeof(double) * 128, &flayer, &err); //f層 
	unsigned int input_num = 260;
	cl_mem i_num = clCreateBuffer(context_main, CL_MEM_USE_HOST_PTR, sizeof(unsigned int), &input_num, &err); //(全連結層)輸入量
	int actfn = 0;
	cl_mem act = clCreateBuffer(context_main, CL_MEM_USE_HOST_PTR, sizeof(int), &actfn, &err); //激活函數代碼
        //建立主設備內核(處理全連結)
	cl_kernel mainkernel_alk = clCreateKernel(prog_main, "alllinklayer", &err);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: Creating Kernel from program.(clCreateKernel),rank0_main\n");
		printf("建立全連結層正向傳播內核\n");
		system("pause");
	}
	//分配執行函數記憶體
	err = clSetKernelArg(mainkernel_alk, 0, sizeof(cl_mem), (void*)&inply);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("0:inply\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 1, sizeof(cl_mem), (void*)&fl);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("1:fl\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 2, sizeof(cl_mem), (void*)&in2fw);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("2:in2fw\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 3, sizeof(cl_mem), (void*)&fb);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("3:fb\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 4, sizeof(cl_mem), (void*)&i_num);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("4:i_num\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 5, sizeof(cl_mem), (void*)&act);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("5:act\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_alk, 6, sizeof(cl_mem), (void*)&nullbuf);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("6:nullbuf\n");
		system("pause");
	}
	size_t globalThreads = 128; //設定每個維度上work_items總數量
	size_t localThreads = 1; //設定每個工作組中work_items的數量,對應於local_size
	//設備執行 (輸入到f層)
	err = clEnqueueNDRangeKernel(commandQueue_main, mainkernel_alk, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: Enqueueing kernel\n");
		cout << "輸入到f層全連結" << "\n";
		system("pause");
	}

	err = clFinish(commandQueue_main);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: Finish command queue\n");
		printf("測試節點\n");
		system("pause");
	}
	//建立內核(處理各個相乘)
	cl_kernel mainkernel_im = clCreateKernel(prog_main, "individuallymultiply", &err);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: Creating Kernel from program.(clCreateKernel),rank0_main\n");
		printf("建立各自相乘正向傳播內核\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_im, 0, sizeof(cl_mem), (void*)&fl);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("0:fl\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_im, 1, sizeof(cl_mem), (void*)&stl);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("1:stl\n");
		system("pause");
	}
	err = clSetKernelArg(mainkernel_im, 2, sizeof(cl_mem), (void*)&im0_gpu);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: clSetKernelArg\n");
		printf("2:im0_gpu\n");
		system("pause");
	}
	//執行(f層到狀態層)
	err = clEnqueueNDRangeKernel(commandQueue_main, mainkernel_im, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		cout << "\n" << err << "\n";
		printf("Error: Enqueueing kernel\n");
		cout << "f層輸出與狀態層相乘" << "\n";
		system("pause");
	}
	

This is gpu code problematic part:

__kernel void alllinklayer //全連結層_正向傳播(線程數為輸出(下一層)變數量)
	(
		__global double* inlayer,
		__global double* outlayer,
		__global double* weight,
		__global double* outlybias,
		__global unsigned int* input_num,
		__global int* actfunction,
		__global double* elsevariable
	)
	{
		int id = get_global_id(0);
		int gs = get_global_size(0);
		outlayer[id] = outlybias[id];
		//printf("%f\n", outlayer[id]);
		for (int i = 0; i < input_num; i++)
		{
			outlayer[id] = outlayer[id] + (inlayer[i] * weight[(i * gs) + id]);
			if (id == 2)
			{
				printf("%d\n", i);
			}
		}
		//printf("%f\n", outlayer[id]);
		if (actfunction == 0)//relu
		{
			if (outlayer[id] < 0)
			{
				outlayer[id] = 0;
			}
		}
		else if (actfunction == 1) //tanh
		{
			outlayer[id] = tanh(outlayer[id]);
		}
		else if (actfunction == 2) //softmax
		{
			int allnum = get_global_size(0);
			elsevariable[id] = outlayer[id];
			double tol = 0;
			for (int i = 0; i < allnum; i++)
			{
				tol = tol + exp(outlayer[i]);
			}
			outlayer[id] = exp(outlayer[id]) / tol;
		}
	}

When run the program , I aware output is :
-36
Error: Finish command queue
測試節點
So I think the problem is in mainkernel_alk kernel.
Therefore I add some printf in gpu code to know what happened in kernel.
I aware the kernel is died , when this for loop run 16 times.

I don’t know how to solve this…
I use GPU is NVIDIA GeForce MX350

OK,this problem solve.