Hello!
I have run into the following error and am unable to identify its cause. Can anyone help me?
Picture of error:
http://i1015.photobucket.com/albums/af280/southgary/failure.jpg
Kernel:
[codebox]/*
 * iter - Jacobi iteration for the linear system A * x = B.
 *
 * A is passed in a row-compressed form:
 *   jdiag   - stored values of A, one row after another
 *   col_ind - column index of each value in jdiag
 *   jd_ptr  - row i occupies positions jd_ptr[i] .. jd_ptr[i+1]-1 of
 *             jdiag / col_ind (so jd_ptr has ITER_N + 1 entries)
 *   B       - right-hand side, ITER_N entries
 *   xnew    - output: solution estimate, ITER_N entries
 *
 * The system size is fixed at ITER_N because the previous iterate is kept
 * in a private array. The iteration starts from x = 0 and stops when no
 * component moved by more than `limit` in a sweep, or after
 * ITER_MAX_SWEEPS sweeps.
 */
#define ITER_N          4
#define ITER_MAX_SWEEPS 10000

__kernel void
iter(__global float *jdiag,
     __global int *col_ind,
     __global int *jd_ptr,
     __global float *B,
     __global float *xnew)
{
    const float limit = 0.001f;
    float xold[ITER_N] = {0};
    int changed;
    int sweep = 0;
    int i;
    int j;

    /*
     * The solve below is serial. If the kernel is launched with more than
     * one work-item, let only the first one run it; otherwise every
     * work-item would repeat the same work and race on xnew.
     */
    if (get_global_id(0) != 0)
        return;

    /* Start iterative */
    do {
        changed = 0;

        for (i = 0; i < ITER_N; i++) {
            int row_begin = jd_ptr[i];
            int row_end = jd_ptr[i + 1];
            int diag = -1;      /* position of A[i][i] in jdiag, -1 if absent */
            float sum = 0.0f;   /* must restart at zero for every row */

            for (j = row_begin; j < row_end; j++) {
                int col = col_ind[j];

                if (col == i)
                    diag = j;
                else
                    sum += jdiag[j] * xold[col];
            }

            if (diag >= 0)
                xnew[i] = (B[i] - sum) / jdiag[diag];
            else
                xnew[i] = xold[i];  /* no diagonal stored: row cannot be solved for x[i] */
        }

        for (i = 0; i < ITER_N; i++) {
            /* Convergence is measured on the change itself, |xnew - xold|. */
            if (fabs(xnew[i] - xold[i]) > limit)
                changed++;
            xold[i] = xnew[i];
        }

        sweep++;
        /* The sweep cap keeps a non-converging system from hanging the device. */
    } while (changed > 0 && sweep < ITER_MAX_SWEEPS);
}[/codebox]
Host code:
[codebox]
/* Order of the square system A * x = b solved by this program. */
#define N 4
/*
 * Read the whole of `filename` into a freshly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL if the file cannot be opened, sized, or read, or if the
 * allocation fails.
 */
char * load_program_source(const char *filename)
{
    FILE *fh;
    char *source;
    long file_size;
    size_t bytes_read;

    /* Binary mode: the byte count from ftell() then matches what fread() delivers. */
    fh = fopen(filename, "rb");
    if (fh == NULL)
        return NULL;

    if (fseek(fh, 0L, SEEK_END) != 0) {
        fclose(fh);
        return NULL;
    }
    file_size = ftell(fh);
    if (file_size < 0L) {
        fclose(fh);
        return NULL;
    }
    rewind(fh);

    source = (char *) malloc((size_t) file_size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    bytes_read = fread(source, 1, (size_t) file_size, fh);
    if (ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    fclose(fh);

    /* Terminate after the bytes actually read, not after the expected size. */
    source[bytes_read] = '\0';
    return source;
}
//-------------------------------Run OpenCL-----------------------------------------
int runCL (float *jdiag, int *col_ind, int *jd_ptr, float *b, float *results, int row)
{
cl_context mycontext;
cl_command_queue cmd_queue;
cl_int err;
size_t returned_size = 0;
size_t buffer_size = sizeof(float) * row;
cl_device_id devices;
cl_device_id cpu;
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
cl_uint max_compute_units[1] = {0};
cl_program program;
cl_kernel kernel;
cl_mem jdiag_mem, col_ind_mem, jd_ptr_mem, B_mem, Xnew_mem;
DWORD start2,end2;
//Find the CPU OpenCL devices that could be used
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
if (err != CL_SUCCESS)
{
printf("\nFail to create CPU devices group!\n");
system("pause");
return EXIT_FAILURE;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to create CPU devices group!\n");
}
// Find the GPU CL device, this is what we really want
// If there is no GPU device is CL capable, fall back to CPU
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);
if (err != CL_SUCCESS)
{
printf("\nFail to create GPU devices group!\nReturn to CPU devices group\n");
devices = cpu;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to create GPU devices group!\n");
}
// Get some information about the returned device
err = clGetDeviceInfo(devices, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
err |= clGetDeviceInfo(devices, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
err |= clGetDeviceInfo(devices, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), max_compute_units, &returned_size);
if (err == CL_SUCCESS)
{
printf("Connecting to %s\n%s\n", vendor_name, device_name);
printf("Maximum compute units are used: %d\n", max_compute_units);
}
//Create the context
printf("Creating the context: ");
mycontext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU, NULL, NULL, &err);
if (!mycontext)
{
printf("Fail!\n");
}
else if (mycontext)
{
printf("Success!\n");
}
// Create the command queue for the context
printf("Creating the command queue: ");
cmd_queue = clCreateCommandQueue(mycontext, devices, 0, NULL);
if (!cmd_queue)
{
printf("Fail!\n");
}
else if (cmd_queue)
{
printf("Success!\n");
}
// Allocate memory on the device to hold our data and store the results into
// Input array jdiag
jdiag_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)jdiag, NULL);
//Input array col_ind
col_ind_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)col_ind, NULL);
//Input array jd_ptr
jd_ptr_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)jd_ptr, NULL);
// Input array B
B_mem = clCreateBuffer(mycontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_size, (void*)b, NULL);
// Input array X
Xnew_mem = clCreateBuffer(mycontext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
// Get all of the stuff written and allocated
clFinish(cmd_queue);
//Create and build the program
const char * filename = "iter.cl";
char *program_source = load_program_source(filename);
program = clCreateProgramWithSource(mycontext, 1, (const char**)&program_source, NULL, &err);
if (err != CL_SUCCESS)
{
printf("\nFail to create the program\n");
return EXIT_FAILURE;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to create the program\n");
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("\nFail to build the program\n");
size_t len;
char buffer[2048];
clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
//clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_STATUS, sizeof(buffer), buffer, &len);
//printf("%s\n", buffer);
//clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_OPTIONS, sizeof(buffer), buffer, &len);
//printf("%s\n", buffer);
return EXIT_FAILURE;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to build the program\n");
}
//Create the Kernal
kernel = clCreateKernel(program, "iter", &err);
// Now setup the arguments to our kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &jdiag_mem);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &col_ind_mem);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &jd_ptr_mem);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &B_mem);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &Xnew_mem);
//if (err != CL_SUCCESS)
//{
// printf("\nFail to Set Kernal Arg value! Reason:");
//return EXIT_FAILURE;
//}
if (err == CL_SUCCESS)
{
printf("\nSuccess to Set Kernal Arg value!\n");
}
else if (err == CL_INVALID_KERNEL)
{
printf("\nError: CL_INVALID_KERNEL\n");
return EXIT_FAILURE;
}
else if (err == CL_INVALID_ARG_VALUE)
{
printf("\nError: CL_INVALID_ARG_VALUE\n");
return EXIT_FAILURE;
}
else if (err == CL_INVALID_MEM_OBJECT)
{
printf("\nError: CL_INVALID_MEM_OBJECT\n");
return EXIT_FAILURE;
}
else if (err == CL_INVALID_SAMPLER)
{
printf("\nError: CL_INVALID_SAMPLER\n");
return EXIT_FAILURE;
}
else if (err == CL_INVALID_ARG_SIZE)
{
printf("\nError: CL_INVALID_ARG_SIZE\n");
return EXIT_FAILURE;
}
else if (err == CL_INVALID_ARG_INDEX)
{
printf("\nError: CL_INVALID_ARG_INDEX\n");
return EXIT_FAILURE;
}
start2=GetTickCount();
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = row;
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("\nFail to Enqueue the NDRange!\n");
return EXIT_FAILURE;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to Enqueue the NDRange!\n");
}
clFinish(cmd_queue);
end2=GetTickCount();
printf("\nThe times that kernel has taken to execute: %ldms\n\n", end2-start2);
// Once finished read back the results from the answer
// array into the results array
err = clEnqueueReadBuffer(cmd_queue, Xnew_mem, CL_TRUE, 0, buffer_size, results, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("\nFail to read the result from kernel!\n");
return EXIT_FAILURE;
}
else if (err == CL_SUCCESS)
{
printf("\nSuccess to read the result from kernel!\n");
}
clFinish(cmd_queue);
//Rekease everything that is used
clReleaseMemObject(jdiag_mem);
clReleaseMemObject(col_ind_mem);
clReleaseMemObject(jd_ptr_mem);
clReleaseMemObject(B_mem);
clReleaseMemObject(Xnew_mem);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(mycontext);
return CL_SUCCESS;
}
int main (){
//Define the varible
int row = N;
float **A;
float *b;
float *results;
int i = 0;
int j = 0;
int k = 0;
int ptr = 0;
float *jdiag;
int *col_ind;
int *jd_ptr;
DWORD start1,end1;
//Assign the memory of matrix a
A=(float **)malloc(sizeof(float*)*N);
for(i=0;i<N;i++)
{
A[i]=(float*)malloc(sizeof(float)*N);
}
jdiag=(float *)malloc(sizeof(float)*(N*N));
col_ind=(int *)malloc(sizeof(int)*(N*N));
jd_ptr=(int *)malloc(sizeof(int)*(N+1));
b=(float *)malloc(sizeof(float)*N);
results=(float *)malloc(sizeof(float)*N);
//Read the matrix from file
FILE *in_A=fopen("C:\\matrix_A.txt","r");
for(i=0;i<N;i++){
for(j=0;j<N;j++){
fscanf(in_A,"%f",&A[i][j]);
}
}
FILE *in_B=fopen("C:\\matrix_B.txt","r");
for(i=0;i<N;i++){
fscanf(in_B,"%f",&b[i]);
}
//initial the array
for(i=0;i<(N*N);i++){
jdiag[i] = 0;
col_ind[i] = 0;
}
for(i=0;i<(N+1);i++){
jd_ptr[i] = 0;
}
//Start to rearrange the matrix
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
if (A[i][j] != 0)
{
jdiag[k] = A[i][j];
col_ind[k] = j;
k++;
}
}
ptr++;
jd_ptr[ptr] = k;
}
//Start to count the times
start1=GetTickCount();
runCL(jdiag, col_ind, jd_ptr, b, results, row);
//Print out the result
printf("\nThe matrix X is:\n");
for(i=0;i<N;i++){
// for(j=0;j<row;j++){
printf("%6.2f", results[i]);
//}
printf("\n");
}
fclose(in_A);
fclose(in_B);
//End to count the times
end1=GetTickCount();
printf("\nThe times that system has taken to execute: %ldms\n\n", end1-start1);
system("pause");
return 0;
}
[/codebox]