Hello.
OS: linux
Driver: 260.19.26
CUDA toolkit: 3.2
This machine has two cards, a C2050 and a C1060. The program works on the C1060 but does not work on the C2050.
I also tested it on two other machines, with a C1070 and a GTX 460, and it works perfectly on both.
I cannot find any problem in the code, so could the problem be in the drivers?
CUDA version - freezes (only on c2050)
#include<stdio.h>
#define SIZE 0x100000
#define MASK 0xfffff
// Evaluates the CUDA runtime call A, stores its status in the caller's local
// `err`, and prints the status code with the source line when it is not
// cudaSuccess. Wrapped in do { } while(0) so the macro is a single statement:
// the previous bare `if` form would silently capture a following `else`.
// The argument is parenthesized so any expression can be passed safely.
#define CHECK(A) do { if((err = (A)) != cudaSuccess) printf("error %d at %d\n", err, __LINE__); } while(0)
// Pointer-chasing kernel. Each thread starts from its own global thread index
// and performs 1024 dependent loads through `in`, using the value just read
// as the next index. Every index is masked with MASK before use, so accesses
// stay within the first MASK+1 elements of `in` and `out`.
// Uses a 1D launch: the global index is built from the x components only.
__global__ void work(int* in, int* out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    int cursor = tid;
    for (int hops_left = 1024; hops_left > 0; --hops_left)
    {
        // The next position depends on the value just loaded.
        cursor = in[cursor & MASK];
    }

    out[tid & MASK] = cursor;
}
// Host driver: fills a SIZE-element table with random indices below SIZE,
// runs the `work` kernel over it with one thread per element, copies the
// result back and prints the first ten output values.
// Errors are reported by CHECK (status code + line) and execution continues.
int main()
{
    // Fix: both host buffers must hold SIZE elements. The previous `new int`
    // allocated a single int, so the fill loop and both cudaMemcpy calls
    // overran the heap allocation (and delete[] did not match the new).
    int* in = new int[SIZE];
    int* out = new int[SIZE];
    int err = 0;
    for (int i = 0; i < SIZE; i++)
    {
        in[i] = rand()%SIZE;
        out[i] = 0;
    }

    int* dev_in = NULL;
    int* dev_out = NULL;
    CHECK(cudaMalloc((void**)&dev_in, sizeof(int) * SIZE));
    CHECK(cudaMalloc((void**)&dev_out, sizeof(int) * SIZE));
    CHECK(cudaMemcpy(dev_in, in, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

    // One thread per element: SIZE/256 blocks of 256 threads.
    dim3 grid_size = dim3(SIZE/256, 1, 1);
    dim3 block_size = dim3(256, 1, 1);
    printf("Running kernel\n");
    work<<<grid_size, block_size>>>(dev_in, dev_out);
    // A kernel launch returns no status of its own; launch/configuration
    // errors have to be fetched explicitly.
    CHECK(cudaGetLastError());
    // Wait for the kernel so that execution errors surface here rather than
    // at the following copy. (cudaThreadSynchronize is the synchronization
    // call already used by this program.)
    CHECK(cudaThreadSynchronize());
    CHECK(cudaMemcpy(out, dev_out, sizeof(int) * SIZE, cudaMemcpyDeviceToHost));

    for (int i = 0; i < 10; i++)
        printf("Element #%d: %d\n", i, out[i]);

    CHECK(cudaFree(dev_in));
    CHECK(cudaFree(dev_out));
    delete[] in;
    delete[] out;
    return 0;
}
OpenCL version: clFinish returns CL_INVALID_COMMAND_QUEUE after the kernel is enqueued, and the following clEnqueueReadBuffer returns CL_OUT_OF_RESOURCES (only on the C2050).
#include "opencl.h"
#include <stdio.h>
#define SIZE 0x100000
#define MASK 0xfffff
// Evaluates the OpenCL call or status value A, stores it in the caller's
// local `err`, and prints the status code with the source line when it is not
// CL_SUCCESS. Wrapped in do { } while(0) so the macro is a single statement:
// the previous bare `if` form would silently capture a following `else`.
// The argument is parenthesized so any expression can be passed safely.
#define CHECK(A) do { if((err = (A)) != CL_SUCCESS) printf("error %d at %d\n", err, __LINE__); } while(0)
// OpenCL C source text of the `work` kernel, handed to
// clCreateProgramWithSource by main(). The kernel starts from the work-item's
// global id and follows 1024 dependent loads through `in`, then stores the
// final value in `out`. The mask 0xfffff is written out literally in the two
// indexing expressions instead of using the host-side MASK macro.
const char *source =
    "__kernel void work(__global int* in, __global int* out)\n"
    "{\n"
    " int l = get_global_id(0);\n"
    " int idx = l;\n"
    "\n"
    " for(int i = 0; i < 1024; i++)\n"
    " idx = in[idx & 0xfffff];\n"
    "\n"
    " out[l & 0xfffff] = idx;\n"
    "}\n";
int main()
{
cl_device_id* device;
cl_uint device_count;
cl_platform_id platform;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem dev_in;
cl_mem dev_out;
char dev_name[128];
int err = 0;
cl_int *in = new cl_int;
cl_int *out = new cl_int;
for(int i = 0; i < SIZE; i++)
{
in[i] = rand() & MASK;
out[i] = 0;
}
CHECK(clGetPlatformIDs(1, &platform, NULL));
CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &device_count));
device = new cl_device_id[device_count];
CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, device_count, device, NULL));
CHECK(clGetDeviceInfo(device[0], CL_DEVICE_NAME, 128, (void*)dev_name, NULL));
printf("%s\n", dev_name);
context = clCreateContext(NULL, 1, &device[0], NULL, NULL, &err); CHECK(err);
queue = clCreateCommandQueue(context, device[0], (cl_command_queue_properties)0, &err); CHECK(err);
dev_in = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_int)*SIZE, in , &err); CHECK(err);
dev_out = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(cl_int)*SIZE, NULL, &err); CHECK(err);
program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, &err); CHECK(err);
CHECK(clBuildProgram(program, 0, NULL, NULL, NULL, NULL));
kernel = clCreateKernel(program, "work", &err); CHECK(err);
CHECK(clSetKernelArg(kernel, 0, sizeof(dev_in ), &dev_in ));
CHECK(clSetKernelArg(kernel, 1, sizeof(dev_out), &dev_out));
size_t global_dimensions[] = {SIZE,0,0};
printf("Running kernel\n");
CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_dimensions, NULL, 0, NULL, NULL));
CHECK(clFinish(queue));
CHECK(clEnqueueReadBuffer(queue, dev_out, CL_TRUE, 0, sizeof(cl_int)*SIZE, out, 0, NULL, NULL));
CHECK(clFinish(queue));
CHECK(clReleaseMemObject (dev_in ));
CHECK(clReleaseMemObject (dev_out));
CHECK(clReleaseKernel (kernel ));
CHECK(clReleaseProgram (program));
CHECK(clReleaseCommandQueue(queue ));
CHECK(clReleaseContext (context));
for (int i = 0; i < 10; i++)
printf("Element #%d: %d\n", i, out[i]);
delete[] in;
delete[] out;
}
Thanks for any help.