Unknown program in built kernel

Hello!

I ran into the following problem and have been unable to identify its cause. Can anyone help me?

Picture of error:

http://i1015.photobucket.com/albums/af280/southgary/failure.jpg<<<

Kernel:

[codebox]/*
 * Jacobi iteration for the sparse linear system A*x = B.
 *
 * A is stored row by row:
 *   jdiag   - the stored matrix values
 *   col_ind - column index of each stored value
 *   jd_ptr  - entries jd_ptr[i] .. jd_ptr[i+1]-1 belong to row i
 * B holds one right-hand-side value per row and xnew receives the solution.
 *
 * The solve is serial and fixed to N = 4 rows; only work-item 0 does the
 * work so that extra work-items do not race on xnew.
 */
__kernel void
iter(__global float *jdiag,
     __global int *col_ind,
     __global int *jd_ptr,
     __global float *B,
     __global float *xnew)
{
    int N = 4;
    float xold[4] = {0};
    float limit = 0.001f;
    /* Safety cap so a non-converging system cannot hang the device. */
    int max_iter = 10000;
    int iter_count = 0;
    int detect_limit;
    int i;
    int j;

    if (get_global_id(0) != 0) {
        return;
    }

    /* Start iterative */
    do {
        detect_limit = 0;

        for (i = 0; i < N; i++) {
            int row_begin = jd_ptr[i];
            int row_end = jd_ptr[i + 1];
            /* The off-diagonal sum must start from zero for every row. */
            float sum = 0.0f;
            /* Index into jdiag of the diagonal entry, -1 while not found. */
            int diag = -1;
            float diag_val;

            for (j = row_begin; j < row_end; j++) {
                int col = col_ind[j];

                if (col == i) {
                    diag = j;
                } else {
                    sum += jdiag[j] * xold[col];
                }
            }

            /*
             * A row without a stored diagonal divides by zero, so the
             * result becomes inf/NaN instead of silently using an
             * unrelated matrix entry.
             */
            diag_val = (diag >= 0) ? jdiag[diag] : 0.0f;
            xnew[i] = (B[i] - sum) / diag_val;
        }

        for (i = 0; i < N; i++) {
            /* Compare the values themselves, not their magnitudes. */
            float delta = fabs(xnew[i] - xold[i]);

            if (delta > limit) {
                xold[i] = xnew[i];
                detect_limit++;
            }
        }

        iter_count++;
    } while (detect_limit > 0 && iter_count < max_iter);
}[/codebox]

Host code:

[codebox]

/* Number of rows (and columns) of the square system being solved. */
#define N 4

/*
 * Read the whole of `filename` into a newly allocated, NUL-terminated buffer.
 *
 * Returns NULL when the file cannot be opened, sized, allocated or read.
 * On success the caller owns the returned buffer and must free() it.
 */
char * load_program_source(const char *filename)
{
    struct stat statbuf;
    FILE *fh;
    char *source;
    size_t size;
    size_t nread;

    fh = fopen(filename, "r");
    if (fh == NULL)
        return NULL;

    if (stat(filename, &statbuf) != 0 || statbuf.st_size < 0) {
        fclose(fh);
        return NULL;
    }
    size = (size_t) statbuf.st_size;

    source = (char *) malloc(size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    /*
     * Read byte-wise and terminate at the count fread reports: in text mode
     * fewer bytes than st_size can be delivered (e.g. CR LF turned into LF),
     * and terminating at st_size would leave uninitialised bytes at the end
     * of the program source.
     */
    nread = fread(source, 1, size, fh);
    if (nread < size && ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    source[nread] = '\0';

    fclose(fh);
    return source;
}

//-------------------------------Run OpenCL-----------------------------------------

int runCL (float *jdiag, int *col_ind, int *jd_ptr, float *b, float *results, int row)

{

cl_context mycontext;

cl_command_queue cmd_queue;

cl_int err;

size_t returned_size = 0;

size_t buffer_size = sizeof(float) * row;

cl_device_id devices;

cl_device_id cpu;

cl_char vendor_name[1024] = {0};

cl_char device_name[1024] = {0};

cl_uint max_compute_units[1] = {0};

cl_program program;

cl_kernel kernel;

cl_mem jdiag_mem, col_ind_mem, jd_ptr_mem, B_mem, Xnew_mem;

DWORD start2,end2;

	

//Find the CPU OpenCL devices that could be used

err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);

if (err != CL_SUCCESS)

{

	printf("\nFail to create CPU devices group!\n");

	system("pause");

	return EXIT_FAILURE;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to create CPU devices group!\n");

}

// Find the GPU CL device, this is what we really want

// If there is no GPU device is CL capable, fall back to CPU

err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);

if (err != CL_SUCCESS)

{

	printf("\nFail to create GPU devices group!\nReturn to CPU devices group\n");

	devices = cpu;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to create GPU devices group!\n");

}

// Get some information about the returned device

err = clGetDeviceInfo(devices, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);

err |= clGetDeviceInfo(devices, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);

err |= clGetDeviceInfo(devices, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), max_compute_units, &returned_size);

if (err == CL_SUCCESS)

{

printf("Connecting to %s\n%s\n", vendor_name, device_name);	

printf("Maximum compute units are used: %d\n", max_compute_units);

}

//Create the context

printf("Creating the context: ");

mycontext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU, NULL, NULL, &err);

if (!mycontext)

{

	printf("Fail!\n");

}

else if (mycontext)

{

	printf("Success!\n");

}



// Create the command queue for the context

printf("Creating the command queue: ");

cmd_queue = clCreateCommandQueue(mycontext, devices, 0, NULL);

if (!cmd_queue)

{

	printf("Fail!\n");

}

else if (cmd_queue)

{

	printf("Success!\n");

}



// Allocate memory on the device to hold our data and store the results into

// Input array jdiag

jdiag_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)jdiag, NULL);



//Input array col_ind

col_ind_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)col_ind, NULL);

//Input array jd_ptr

jd_ptr_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)jd_ptr, NULL);

// Input array B

B_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)b, NULL);

// Input array X

Xnew_mem = clCreateBuffer(mycontext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);



// Get all of the stuff written and allocated 

clFinish(cmd_queue);

//Create and build the program

const char * filename = "iter.cl";

char *program_source = load_program_source(filename);

program = clCreateProgramWithSource(mycontext, 1, (const char**)&program_source, NULL, &err);

if (err != CL_SUCCESS)

{

	printf("\nFail to create the program\n");	

	return EXIT_FAILURE;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to create the program\n");	

}



err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

if (err != CL_SUCCESS)

{	

	printf("\nFail to build the program\n");

	size_t len;

	char buffer[2048];

	clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

	printf("%s\n", buffer);

	//clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_STATUS, sizeof(buffer), buffer, &len);

	//printf("%s\n", buffer);

	//clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_OPTIONS, sizeof(buffer), buffer, &len);

	//printf("%s\n", buffer);

	return EXIT_FAILURE;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to build the program\n");	

}

//Create the Kernal

kernel = clCreateKernel(program, "iter", &err);

// Now setup the arguments to our kernel

err  = clSetKernelArg(kernel,  0, sizeof(cl_mem), &jdiag_mem);

err |= clSetKernelArg(kernel,  1, sizeof(cl_mem), &col_ind_mem);

err |= clSetKernelArg(kernel,  2, sizeof(cl_mem), &jd_ptr_mem);

err |= clSetKernelArg(kernel,  3, sizeof(cl_mem), &B_mem);

err |= clSetKernelArg(kernel,  4, sizeof(cl_mem), &Xnew_mem);

//if (err != CL_SUCCESS)

//{	

//	printf("\nFail to Set Kernal Arg value! Reason:");

	//return EXIT_FAILURE;

//}

if (err == CL_SUCCESS)

{

	printf("\nSuccess to Set Kernal Arg value!\n");	

}	

else if (err == CL_INVALID_KERNEL)

{

	printf("\nError: CL_INVALID_KERNEL\n");

	return EXIT_FAILURE;	

}	

else if (err == CL_INVALID_ARG_VALUE)

{

	printf("\nError: CL_INVALID_ARG_VALUE\n");	

	return EXIT_FAILURE;

}	

else if (err == CL_INVALID_MEM_OBJECT)

{

	printf("\nError: CL_INVALID_MEM_OBJECT\n");	

	return EXIT_FAILURE;

}	

else if (err == CL_INVALID_SAMPLER)

{

	printf("\nError: CL_INVALID_SAMPLER\n");	

	return EXIT_FAILURE;

}	

else if (err == CL_INVALID_ARG_SIZE)

{

	printf("\nError: CL_INVALID_ARG_SIZE\n");	

	return EXIT_FAILURE;

}

else if (err == CL_INVALID_ARG_INDEX)

{

	printf("\nError: CL_INVALID_ARG_INDEX\n");	

	return EXIT_FAILURE;

}	

start2=GetTickCount();

// Run the calculation by enqueuing it and forcing the 

// command queue to complete the task

size_t global_work_size = row;

err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);

if (err != CL_SUCCESS)

{	

	printf("\nFail to Enqueue the NDRange!\n");

	return EXIT_FAILURE;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to Enqueue the NDRange!\n");	

}

clFinish(cmd_queue);



end2=GetTickCount();

printf("\nThe times that kernel has taken to execute: %ldms\n\n", end2-start2);

// Once finished read back the results from the answer 

// array into the results array

err = clEnqueueReadBuffer(cmd_queue, Xnew_mem, CL_TRUE, 0, buffer_size, results, 0, NULL, NULL);

if (err != CL_SUCCESS)

{	

	printf("\nFail to read the result from kernel!\n");

	return EXIT_FAILURE;

}

else if (err == CL_SUCCESS)

{

	printf("\nSuccess to read the result from kernel!\n");	

}

clFinish(cmd_queue);

//Rekease everything that is used

clReleaseMemObject(jdiag_mem);

clReleaseMemObject(col_ind_mem);

clReleaseMemObject(jd_ptr_mem);

clReleaseMemObject(B_mem);

clReleaseMemObject(Xnew_mem);

clReleaseCommandQueue(cmd_queue);

clReleaseContext(mycontext);

return CL_SUCCESS;

}

/*
 * Read the N x N matrix A and the vector b from text files, pack the
 * non-zero entries of A row by row (values, column indices, row offsets),
 * solve A*x = b through runCL and print x.
 *
 * Returns 0 on success and EXIT_FAILURE when allocation, file input or the
 * OpenCL run fails.
 */
int main (){
    /*
     * Everything is declared up front so that the jumps to `cleanup` never
     * bypass an initialisation (required if this file is compiled as C++).
     */
    int row = N;
    float **A = NULL;
    float *b = NULL;
    float *results = NULL;
    float *jdiag = NULL;
    int *col_ind = NULL;
    int *jd_ptr = NULL;
    FILE *in_A = NULL;
    FILE *in_B = NULL;
    int i = 0;
    int j = 0;
    int k = 0;
    int status = EXIT_FAILURE;
    DWORD start1, end1;

    // Allocate zero-filled storage; the row table is zeroed so cleanup can free it safely
    A = (float **) calloc(N, sizeof(float *));
    jdiag = (float *) calloc(N * N, sizeof(float));
    col_ind = (int *) calloc(N * N, sizeof(int));
    jd_ptr = (int *) calloc(N + 1, sizeof(int));
    b = (float *) calloc(N, sizeof(float));
    results = (float *) calloc(N, sizeof(float));
    if (!A || !jdiag || !col_ind || !jd_ptr || !b || !results)
    {
        printf("\nFail to allocate memory!\n");
        goto cleanup;
    }
    for (i = 0; i < N; i++)
    {
        A[i] = (float *) calloc(N, sizeof(float));
        if (!A[i])
        {
            printf("\nFail to allocate memory!\n");
            goto cleanup;
        }
    }

    // Read the matrix from file
    in_A = fopen("C:\\matrix_A.txt", "r");
    if (in_A == NULL)
    {
        printf("\nFail to open C:\\matrix_A.txt\n");
        goto cleanup;
    }
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            if (fscanf(in_A, "%f", &A[i][j]) != 1)
            {
                printf("\nFail to read matrix A at row %d, column %d\n", i, j);
                goto cleanup;
            }
        }
    }

    in_B = fopen("C:\\matrix_B.txt", "r");
    if (in_B == NULL)
    {
        printf("\nFail to open C:\\matrix_B.txt\n");
        goto cleanup;
    }
    for (i = 0; i < N; i++)
    {
        if (fscanf(in_B, "%f", &b[i]) != 1)
        {
            printf("\nFail to read vector B at row %d\n", i);
            goto cleanup;
        }
    }

    // Pack the non-zero entries row by row; jd_ptr[i + 1] marks the end of row i
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            if (A[i][j] != 0)
            {
                jdiag[k] = A[i][j];
                col_ind[k] = j;
                k++;
            }
        }
        jd_ptr[i + 1] = k;
    }

    // Start to count the times
    start1 = GetTickCount();

    // runCL returns 0 on success; do not print results it never wrote
    if (runCL(jdiag, col_ind, jd_ptr, b, results, row) != 0)
    {
        printf("\nFail to solve the system with OpenCL!\n");
        goto cleanup;
    }

    // Print out the result
    printf("\nThe matrix X is:\n");
    for (i = 0; i < N; i++)
    {
        printf("%6.2f", results[i]);
        printf("\n");
    }

    // End to count the times
    end1 = GetTickCount();
    printf("\nThe times that system has taken to execute: %lums\n\n", (unsigned long) (end1 - start1));

    status = 0;

cleanup:
    if (in_A)
        fclose(in_A);
    if (in_B)
        fclose(in_B);
    if (A)
    {
        for (i = 0; i < N; i++)
            free(A[i]);
    }
    free(A);
    free(jdiag);
    free(col_ind);
    free(jd_ptr);
    free(b);
    free(results);

    system("pause");
    return status;
}

[/codebox]
failure.jpg