Hi,
I’m working on a Tesla K20. According to the deviceQuery utility:
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
However, when I run the program below, it works only as long as the number of blocks (computed as NUM_INTS/1024) does not exceed 65535. If I make NUM_INTS 16*1024*1024, bringing the number of blocks to 65536, the kernel invocation fails with “Invalid argument”. I assume the argument it’s complaining about is the number of blocks being >65535, but according to the parameters above I should be OK up to
2147483647. Can somebody comment?
#include <string.h>
#include <stdio.h>
// Number of int elements in each of the vectors below.
#define NUM_INTS (15*1024*1024)
// Threads per block used when launching vector_add.
#define BLOCK_SIZE 1024
// Host-side arrays: a and b are the inputs filled in main, r receives the
// result copied back from the device.
int a[NUM_INTS], b[NUM_INTS], r[NUM_INTS];
/*
 * Element-wise integer vector addition:
 *     rd[offset + i] = ad[offset + i] + bd[offset + i]   for i in [0, n)
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread handles at most one
 * element, so gridDim.x * blockDim.x must be >= n for the whole range to be
 * processed; surplus threads do nothing.
 *
 * ad, bd  - input vectors (dereferenced by the kernel, so they must be
 *           device-accessible)
 * rd      - output vector
 * offset  - index of the first element to process
 * n       - number of elements to process; n <= 0 processes nothing
 */
__global__ void vector_add(
    int *ad,
    int *bd,
    int *rd,
    int offset,
    int n) {
    // Do the index math in signed 64-bit: blockIdx.x * blockDim.x can exceed
    // 32 bits on very large grids, and a signed type keeps the comparison
    // with n (and the addition of offset) free of unsigned wrap-around.
    long long global_idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (global_idx < n) {
        // The bounds check is on the relative index; offset is applied after.
        global_idx += offset;
        rd[global_idx] = ad[global_idx] + bd[global_idx];
    }
}
/* Reports a failed CUDA runtime call on stderr.
 * Returns 0 when err is cudaSuccess, non-zero otherwise. */
static int report_cuda_error(cudaError_t err, const char *what) {
    if (cudaSuccess == err)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Adds the host arrays a and b on the GPU and stores the result in r.
 *
 * Returns 0 on success and -1 if any CUDA call or the kernel launch fails.
 * Device buffers are released on every path.
 */
int main(int argc, char **argv) {
    int i;
    int status = -1;                       // pessimistic until everything succeeded
    int *ad = NULL, *bd = NULL, *rd = NULL;
    cudaError_t err;

    // Buffer size in BYTES; size_t so large NUM_INTS values cannot overflow int.
    const size_t size = (size_t)NUM_INTS * sizeof(int);

    // compute number of blocks: one thread per ELEMENT (not per byte),
    // rounded up so a partial final block is still launched.
    const int nblocks = (NUM_INTS + BLOCK_SIZE - 1) / BLOCK_SIZE;
    printf("will use %d blocks\n", nblocks);

    // initialize input arrays
    for (i = 0; i < NUM_INTS; i++) {
        a[i] = i;
        b[i] = i;
    }

    // allocate device memory
    if (report_cuda_error(cudaMalloc((void**)&ad, size), "cudaMalloc(ad)"))
        goto cleanup;
    if (report_cuda_error(cudaMalloc((void**)&bd, size), "cudaMalloc(bd)"))
        goto cleanup;
    if (report_cuda_error(cudaMalloc((void**)&rd, size), "cudaMalloc(rd)"))
        goto cleanup;

    // copy host data to device
    if (report_cuda_error(cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(a -> ad)"))
        goto cleanup;
    if (report_cuda_error(cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(b -> bd)"))
        goto cleanup;

    // run kernel
    // NOTE(review): a grid x-dimension above 65535 is only accepted when the
    // binary targets compute capability 3.0 or newer (e.g. nvcc -arch=sm_35
    // for a K20) - confirm the build flags if larger grids are needed.
    vector_add<<<nblocks, BLOCK_SIZE>>>(ad, bd, rd, 0, NUM_INTS);
    err = cudaGetLastError();              // launch-configuration errors show up here
    if (cudaSuccess != err) {
        fprintf(stderr, "kernel invocationi failed: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // copy data back; this blocking copy also surfaces errors raised while
    // the kernel was executing.
    if (report_cuda_error(cudaMemcpy(r, rd, size, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(rd -> r)"))
        goto cleanup;

    status = 0;

cleanup:
    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(rd);
    cudaFree(bd);
    cudaFree(ad);
    return status;
}