cudaMemcpyDeviceToHost problem not work on primary video card

Hello. Recently began working with nsight debugger. I have installed two video cards: GF GTX 460 and GF 210. Run the project in 2010 VS. For calculations using the GTX 460. VGA cable is connected to 210. The following code.

# Include <stdio.h>

# Include <cutil_inline.h>

# Define BLOCK_SIZE 16 // submatrix size

# Define N 1024 // matrix size is N * N

__global__ void reduce4 (int * inData, int * outData)

{

__shared__ int data [BLOCK_SIZE];

int tid = threadIdx.x;

int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;

data [tid] = inData [i] + inData [i + blockDim.x]; // load into shared memeory

__syncthreads ();

for (int s = blockDim.x / 2; s> 0; s *= 2)

{

if (tid <s)

data [tid] + = data [tid + s];

__syncthreads ();

}

if (tid == 0) // write result of block reduction

outData [blockIdx.x] = data [0];

}

int main (int argc, char * argv [])

{

int numBytes = N * sizeof (int);

int n = N;

int i = 0;

int sum = 0;

// Allocate host memory

int * a = new int [N];

int * b = new int [N];

// Init with random values

for (i = 0; i <N; i + +)

{

a [i] = 1; // (rand () & 0xFF) - 127;

sum + = a [i];

}

// Allocate device memory

int * adev [2] = {NULL, NULL};

cudaEvent_t start, stop;

float gpuTime = 0.0f;

int num_devices, device;

cudaGetDeviceCount (& num_devices);

// CudaSetDevice (1);

cudaDeviceProp properties;

cudaGetDeviceProperties (& properties, 0);

cudaMalloc ((void **) & adev [0], numBytes);

cudaMalloc ((void **) & adev [1], numBytes);

// Create cuda event handles

cudaEventCreate (& start);

cudaEventCreate (& stop);

// Asynchronously issue work to the GPU (all to stream 0)

cudaEventRecord (start, 0);

cudaMemcpy (adev [0], a, numBytes, cudaMemcpyHostToDevice);

for (i = 0; n> = BLOCK_SIZE; n / = (2 * BLOCK_SIZE), i ^ = 1)

{

// Set kernel launch configuration

dim3 dimBlock (BLOCK_SIZE, 1, 1);

dim3 dimGrid (n / (2 * dimBlock.x), 1, 1);

reduce4 <<<dimGrid, dimBlock>>> (adev [i], adev [i ^ 1]);

}

cutilSafeCall (cudaMemcpy (b, adev [i], 4 * N, cudaMemcpyDeviceToHost));

cudaEventRecord (stop, 0);

cudaEventSynchronize (stop);

cudaEventElapsedTime (& gpuTime, start, stop);

for (i = 1; i <n; i + +)

b [0] + = b [i];

// Print the cpu and gpu times

printf ("time spent executing by the GPU:% .2 f milliseconds \ n", gpuTime);

printf ("CPU sum% d, CUDA sum% d, N =% d \ n", sum, b [0], N);

// Release resources

cudaEventDestroy (start);

cudaEventDestroy (stop);

cudaFree (adev [0]);

cudaFree (adev [1]);

delete a;

delete b;

return 0;

}

On line:

cutilSafeCall (cudaMemcpy (b, adev [i], 4 * N, cudaMemcpyDeviceToHost));

cudaEventRecord (stop, 0); crash error: Runtime API error: “unknown error”.

Exclusively when copying from device to host. If you specify: cudaSetDevice (1) (use GF 210), then everything works fine. Configuration software: cuda toolkit 3.2 SDK 3.2 (x64), VS 2010, driver: 260.93_desktop_win7_winvista_64bit_international.

ie: cudaSetDevice (1) works, cudaSetDevice (0) - no.

What’s the problem?

The problem probably is in the kernel calls before, not the memcpy itself. You might have an out-of-bounds array access somewhere, or maybe CUDA is just unhappy about kernel invocations with a gridsize of zero (n will be zero after a few iterations since it gets divided by (2 * BLOCK_SIZE) instead of 4 on every iteration).

Also have a look at the reduction example in the SDK.

I found a bug in the code. Invalid reduction algorithm in cuda procedure: s *= 2 should be replaced with s / = 2. I made a mistake when the deal with a similar example. There was specified bit operation <<=, it seems that this division. It seemed to me that this multiplication.

It is interesting that in the GF 210 are copying the memory (of course, the calculations are wrong).

Thank’s for help!