Hello. Recently began working with nsight debugger. I have installed two video cards: GF GTX 460 and GF 210. Run the project in 2010 VS. For calculations using the GTX 460. VGA cable is connected to 210. The following code.
# Include <stdio.h>
# Include <cutil_inline.h>
# Define BLOCK_SIZE 16 // submatrix size
# Define N 1024 // matrix size is N * N
__global__ void reduce4 (int * inData, int * outData)
{
__shared__ int data [BLOCK_SIZE];
int tid = threadIdx.x;
int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
data [tid] = inData [i] + inData [i + blockDim.x]; // load into shared memeory
__syncthreads ();
for (int s = blockDim.x / 2; s> 0; s *= 2)
{
if (tid <s)
data [tid] + = data [tid + s];
__syncthreads ();
}
if (tid == 0) // write result of block reduction
outData [blockIdx.x] = data [0];
}
int main (int argc, char * argv [])
{
int numBytes = N * sizeof (int);
int n = N;
int i = 0;
int sum = 0;
// Allocate host memory
int * a = new int [N];
int * b = new int [N];
// Init with random values
for (i = 0; i <N; i + +)
{
a [i] = 1; // (rand () & 0xFF) - 127;
sum + = a [i];
}
// Allocate device memory
int * adev [2] = {NULL, NULL};
cudaEvent_t start, stop;
float gpuTime = 0.0f;
int num_devices, device;
cudaGetDeviceCount (& num_devices);
// CudaSetDevice (1);
cudaDeviceProp properties;
cudaGetDeviceProperties (& properties, 0);
cudaMalloc ((void **) & adev [0], numBytes);
cudaMalloc ((void **) & adev [1], numBytes);
// Create cuda event handles
cudaEventCreate (& start);
cudaEventCreate (& stop);
// Asynchronously issue work to the GPU (all to stream 0)
cudaEventRecord (start, 0);
cudaMemcpy (adev [0], a, numBytes, cudaMemcpyHostToDevice);
for (i = 0; n> = BLOCK_SIZE; n / = (2 * BLOCK_SIZE), i ^ = 1)
{
// Set kernel launch configuration
dim3 dimBlock (BLOCK_SIZE, 1, 1);
dim3 dimGrid (n / (2 * dimBlock.x), 1, 1);
reduce4 <<<dimGrid, dimBlock>>> (adev [i], adev [i ^ 1]);
}
cutilSafeCall (cudaMemcpy (b, adev [i], 4 * N, cudaMemcpyDeviceToHost));
cudaEventRecord (stop, 0);
cudaEventSynchronize (stop);
cudaEventElapsedTime (& gpuTime, start, stop);
for (i = 1; i <n; i + +)
b [0] + = b [i];
// Print the cpu and gpu times
printf ("time spent executing by the GPU:% .2 f milliseconds \ n", gpuTime);
printf ("CPU sum% d, CUDA sum% d, N =% d \ n", sum, b [0], N);
// Release resources
cudaEventDestroy (start);
cudaEventDestroy (stop);
cudaFree (adev [0]);
cudaFree (adev [1]);
delete a;
delete b;
return 0;
}
On line:
cutilSafeCall (cudaMemcpy (b, adev [i], 4 * N, cudaMemcpyDeviceToHost));
cudaEventRecord (stop, 0); crash error: Runtime API error: “unknown error”.
Exclusively when copying from device to host. If you specify: cudaSetDevice (1) (use GF 210), then everything works fine. Configuration software: cuda toolkit 3.2 SDK 3.2 (x64), VS 2010, driver: 260.93_desktop_win7_winvista_64bit_international.
ie: cudaSetDevice (1) works, cudaSetDevice (0) - no.
What’s the problem?