Program does not work on a C2050 but works on other cards

Hello.

OS: linux

Driver: 260.19.26

CUDA toolkit: 3.2

There are two cards in this machine: a C2050 and a C1060. The program works on the C1060 but does not work on the C2050.

I also tested it on two other machines, one with a C1070 and one with a GTX 460; it works perfectly on both.

I cannot find any problem in the code. Could the problem be in the drivers?

CUDA version: the program freezes (only on the C2050).

#include<stdio.h>

// Number of int elements in each host and device array (2^20).
#define SIZE 0x100000

// Bit mask equal to SIZE - 1; `x & MASK` always yields an index in [0, SIZE).
#define MASK 0xfffff

// Evaluates the CUDA runtime call A, stores its status in the caller's
// in-scope variable `err`, and prints the numeric code, the source line and
// the runtime's description of the error when the call fails.
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// cannot capture an `else` that follows it at the call site.
#define CHECK(A) \
	do { \
		if ((err = (A)) != cudaSuccess) \
			printf("error %d at %d: %s\n", (int)err, __LINE__, \
			       cudaGetErrorString((cudaError_t)err)); \
	} while (0)

// Pointer-chasing kernel: each thread starts from its own global index and
// follows 1024 dependent loads through `in`, then stores the final value.
//
// in  - device array that is read at indices masked with MASK
// out - device array that receives one result per thread, at the thread's
//       global index masked with MASK
//
// Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
// components of threadIdx/blockIdx/blockDim are used).
__global__ void work(int* in, int* out)
{
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;

	// Every load depends on the previous one, so the chain is strictly serial
	// within a thread.
	int cursor = tid;
	int hop = 0;
	while (hop < 1024)
	{
		cursor = in[cursor & MASK];
		++hop;
	}

	out[tid & MASK] = cursor;
}

// Host driver: fills an input table with random indices below SIZE, runs the
// `work` kernel over SIZE threads, and prints the first ten results.
// Returns 0; failures of CUDA calls are reported through CHECK.
int main()
{
	// Allocate SIZE elements per array. A plain `new int` would allocate a
	// single int, and every access past index 0 would overflow the heap.
	int* in  = new int[SIZE];
	int* out = new int[SIZE];

	cudaError_t err = cudaSuccess;

	for (int i = 0; i < SIZE; i++)
	{
		in[i]  = rand() % SIZE;
		out[i] = 0;
	}

	// Start from null so the cudaFree calls below stay valid even if an
	// allocation fails.
	int* dev_in  = NULL;
	int* dev_out = NULL;

	CHECK(cudaMalloc((void**)&dev_in, sizeof(int) * SIZE));
	CHECK(cudaMalloc((void**)&dev_out, sizeof(int) * SIZE));

	CHECK(cudaMemcpy(dev_in, in, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

	// SIZE is a multiple of the block size, so the grid covers exactly SIZE
	// threads.
	const int threads_per_block = 256;
	dim3 grid_size  = dim3(SIZE / threads_per_block, 1, 1);
	dim3 block_size = dim3(threads_per_block, 1, 1);

	printf("Running kernel\n");

	work<<<grid_size, block_size>>>(dev_in, dev_out);

	// A launch does not return a status itself: configuration errors are
	// picked up here, execution errors at the synchronization below.
	CHECK(cudaGetLastError());
	CHECK(cudaThreadSynchronize());

	CHECK(cudaMemcpy(out, dev_out, sizeof(int) * SIZE, cudaMemcpyDeviceToHost));

	for (int i = 0; i < 10; i++)
		printf("Element #%d: %d\n", i, out[i]);

	CHECK(cudaFree(dev_in));
	CHECK(cudaFree(dev_out));

	delete[] in;
	delete[] out;

	return 0;
}

OpenCL version: clFinish returns CL_INVALID_COMMAND_QUEUE after the kernel is enqueued, and the following clEnqueueReadBuffer returns CL_OUT_OF_RESOURCES (only on the C2050).

#include "opencl.h"

#include <stdio.h>

// Number of cl_int elements in each host and device buffer (2^20).
#define SIZE 0x100000

// Bit mask equal to SIZE - 1; `x & MASK` always yields an index in [0, SIZE).
#define MASK 0xfffff

// Stores the OpenCL status A in the caller's in-scope variable `err` and
// prints the code together with the source line when it is not CL_SUCCESS.
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// cannot capture an `else` that follows it at the call site.
#define CHECK(A) \
	do { \
		if ((err = (A)) != CL_SUCCESS) \
			printf("error %d at %d\n", err, __LINE__); \
	} while (0)

// OpenCL C source of the `work` kernel, kept as one string built from
// adjacent literals (one literal per kernel line). Each work-item starts from
// its global id, follows 1024 dependent loads through `in`, and writes the
// final value to `out`. All indices are masked with 0xfffff.
const char *source = 

{

	"__kernel void work(__global int* in, __global int* out)\n"

	"{\n"

	"	int l = get_global_id(0);\n"

	"	int idx = l;\n"

	"\n"

	"	for(int i = 0; i < 1024; i++)\n"

	"		idx = in[idx & 0xfffff];\n" // hardcoded MASK: the host macro is not expanded inside a string literal

	"\n"

	"	out[l & 0xfffff] = idx;\n" // hardcoded MASK: the host macro is not expanded inside a string literal

	"}\n"

};

int main() 

{

	cl_device_id* device;

	cl_uint device_count;

	cl_platform_id platform;

	cl_context context;

	cl_command_queue queue;

	cl_program program;

	cl_kernel kernel;

	cl_mem dev_in;	

	cl_mem dev_out;	

	char dev_name[128];

	

	int err = 0;

	

	cl_int *in  = new cl_int;

	cl_int *out = new cl_int;

	

	for(int i = 0; i < SIZE; i++) 

	{

		in[i] = rand() & MASK;

		out[i] = 0;

	}

	

	CHECK(clGetPlatformIDs(1, &platform, NULL));

	CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &device_count));

	

	device = new cl_device_id[device_count];

	

	CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, device_count, device, NULL));

	CHECK(clGetDeviceInfo(device[0], CL_DEVICE_NAME, 128, (void*)dev_name, NULL));

	printf("%s\n", dev_name);

	context = clCreateContext(NULL, 1, &device[0], NULL, NULL, &err); CHECK(err);

	queue = clCreateCommandQueue(context, device[0], (cl_command_queue_properties)0, &err); CHECK(err);

	

	dev_in  = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_int)*SIZE, in  , &err); CHECK(err);

	dev_out = clCreateBuffer(context, CL_MEM_READ_WRITE   , sizeof(cl_int)*SIZE, NULL, &err); CHECK(err);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, &err); CHECK(err);

	

	CHECK(clBuildProgram(program, 0, NULL, NULL, NULL, NULL));

	

	kernel = clCreateKernel(program, "work", &err); CHECK(err);

	

	CHECK(clSetKernelArg(kernel, 0, sizeof(dev_in ), &dev_in ));

	CHECK(clSetKernelArg(kernel, 1, sizeof(dev_out), &dev_out));

	

	size_t global_dimensions[] = {SIZE,0,0};

	

	printf("Running kernel\n");

	

	CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_dimensions, NULL, 0, NULL, NULL));

	

	CHECK(clFinish(queue));

	

	CHECK(clEnqueueReadBuffer(queue, dev_out, CL_TRUE, 0, sizeof(cl_int)*SIZE, out, 0, NULL, NULL));

	CHECK(clFinish(queue));

	CHECK(clReleaseMemObject   (dev_in ));

	CHECK(clReleaseMemObject   (dev_out));

	CHECK(clReleaseKernel      (kernel ));

	CHECK(clReleaseProgram     (program));

	CHECK(clReleaseCommandQueue(queue  ));

	CHECK(clReleaseContext     (context));

	

	for (int i = 0; i < 10; i++) 

		printf("Element #%d: %d\n", i, out[i]);

	

	delete[] in;

	delete[] out;

}

Thanks for any help.

This happened to me as well when I switched to the Fermi architecture.

It seems that Fermi is less tolerant of memory overflows than the previous architecture.

I would run the program with a memory bounds checker to verify no such overflows are occurring.

eldad.

I thought the same, but there is no place where I could read or write the wrong memory.

The code is very simple: the two arrays have size 0x100000,

and when addressing them I always apply the mask: [… & 0xfffff]