I am running a C2050
I have the simple cuda program
cat slag.cu
1 #include <stdlib.h>
2 #include <string.h>
3 #include <assert.h>
4 #include “book.h”
5 global void gpuKernel( int nBLks, int nThreadsPerBlk)
6 {
7 printf(“Bid=%06ld, Tid=%06ld\n”, blockIdx.x, threadIdx.x);
8 };
9
10 int main(int argc,char *argv)
11 {
12 if (argc == 1)
13 {
14 struct cudaDeviceProp P;
15 cudaError_t e = cudaGetDeviceProperties(&P,0);
16 assert( e == cudaSuccess);
17 fprintf(stdout,
18 “name=%s\ntotalGlobalMem=%lu\nsharedMemPerBlock=%lu\nregsPerBlock=%ld\n”
19 “warpSize=%ld\nmemPitch=%ld\nmaxThreadsPerBlock=%ld\n”
20 “maxThreadsDim=(%ld,%ld%ld)\n” “maxGridSize=(%ld,%ld%ld)\n”
21 “totalConstMem=%ld\nmajor=%ld minor=%ld\nclockRate=%ld\n”
22 “textureAlignment=%ld\ndeviceOverlap=%ld\nmultiProcessorCount=%ld\n”
23 “kernelExecTimeoutEnabled=%ld\nintegrated=%ld\ncanMapHostMemory=%ld\n”
24 “computeMode=%ld\n”,
25
26 P.name,P.totalGlobalMem, P.sharedMemPerBlock,P.regsPerBlock,P.warpSize,
27 P.memPitch,P.maxThreadsPerBlock,
28 P.maxThreadsDim[0],P.maxThreadsDim[1],P.maxThreadsDim[2],
29 P.maxGridSize[0], P.maxGridSize[1], P.maxGridSize[2],
30 P.totalConstMem,P.major,P.minor,P.clockRate,
31 P.textureAlignment,P.deviceOverlap, P.multiProcessorCount,
32 P.kernelExecTimeoutEnabled,P.integrated,P.canMapHostMemory, P.computeMode);
33 }
34 else
35 {
36 int nBlks = atoi(argv[1]);
37 int nThreadsPerBlk = atoi(argv[2]);
38 cudaThreadSynchronize(); / sync up mapped mem with host /
39 gpuKernel<<<nBlks,nThreadsPerBlk>>>(nBlks,nThreadsPerBlk);
40 cudaThreadSynchronize(); / sync up mapped mem with host */
41
42 cudaError_t e= cudaGetLastError();
43 if ( e != cudaSuccess)
44 {
45 const char *es = cudaGetErrorString(e);
46 printf(“gpu launch error %s %s %ld\n”, es,FILE,LINE);
47 printf(“so sad\n”);
48 }
49 }
50 return (0);
51 }
withoug arg, characterizes the device
1 slag
2
3 name=Tesla C2050
4 totalGlobalMem=3220897792
5 sharedMemPerBlock=49152
6 regsPerBlock=32768
7 warpSize=32
8 memPitch=2147483647
9 maxThreadsPerBlock=1024
10 maxThreadsDim=(1024,102464)
11 maxGridSize=(65535,655351)
12 totalConstMem=65536
13 major=2 minor=0
14 clockRate=1147000
15 textureAlignment=512
16 deviceOverlap=1
17 multiProcessorCount=14
18 kernelExecTimeoutEnabled=0
19 integrated=0
20 canMapHostMemory=1
21 computeMode=0
with args of number of blocks and thread count per block,
Just launches threads a and counts them.
first arg is the number of blocks and the second is the number of threads per block.
1 slag 10 10|wc
2 100 200 2400
the number of threads per block, the second arg, cant exceed 1024, thats understood
1 slag 2 1024|wc
2 2048 4096 49152
3
4 slag 2 1025
5 gpu launch error invalid configuration argument slag.cu 23
6 so sad
but there is a limit of 4096 on the total number of threads per kernel launch.
1 slag 4 1024|wc
2 4096 8192 98304
3
4 slag 5 1024|wc
5 4096 8192 98304
6
7 slag 5 1024|sort|greatawk 1 100000
8 some of the threads in the 5 blocks just dont appear
9
10 slag 1024 5|wc
11 4096 8192 98304
Why is that.