No, it looks to me like it’s using the expected number of threads. Only the final sum reduction uses a single thread, but this is by design.
Setting the environment variables “COMPUTE_PROFILE=1” and “COMPUTE_PROFILE_CONFIG=prof.txt”, where prof.txt is a text file containing:
gridsize
threadblocksize
I get the following profile:
% cat cuda_profile_0.log
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 690
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 130cc16f22d6a307
method,gputime,cputime,gridsizeX,gridsizeY,threadblocksizeX,threadblocksizeY,threadblocksizeZ,occupancy
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14680.096 ] cputime=[ 15.109 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1904.640 ] cputime=[ 5.631 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 27527.520 ] cputime=[ 5.463 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1903.648 ] cputime=[ 5.144 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6461.824 ] cputime=[ 3.418 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1848.736 ] cputime=[ 3.240 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1834.912 ] cputime=[ 2.829 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.400 ] cputime=[ 14.075 ]
method=[ memcpyDtoH ] gputime=[ 2.208 ] cputime=[ 12.673 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14728.480 ] cputime=[ 15.014 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1686.240 ] cputime=[ 5.516 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 27531.584 ] cputime=[ 5.150 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1685.888 ] cputime=[ 4.679 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6467.680 ] cputime=[ 3.823 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1846.848 ] cputime=[ 3.427 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1831.648 ] cputime=[ 3.358 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.656 ] cputime=[ 16.057 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.602 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14752.320 ] cputime=[ 15.474 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1904.768 ] cputime=[ 5.054 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 27504.129 ] cputime=[ 5.050 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1903.872 ] cputime=[ 4.612 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6421.024 ] cputime=[ 3.741 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1846.688 ] cputime=[ 3.793 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1832.864 ] cputime=[ 3.081 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.656 ] cputime=[ 16.274 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.589 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14717.024 ] cputime=[ 15.317 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1686.656 ] cputime=[ 5.547 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 27548.928 ] cputime=[ 5.010 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1685.888 ] cputime=[ 4.801 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6424.512 ] cputime=[ 3.439 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1853.152 ] cputime=[ 3.514 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1831.808 ] cputime=[ 3.148 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.624 ] cputime=[ 16.451 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.563 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14720.064 ] cputime=[ 15.670 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1904.224 ] cputime=[ 5.102 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 27664.352 ] cputime=[ 5.280 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1903.520 ] cputime=[ 4.848 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6451.392 ] cputime=[ 3.670 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1846.560 ] cputime=[ 3.821 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1830.304 ] cputime=[ 3.027 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.656 ] cputime=[ 16.227 ]
method=[ memcpyDtoH ] gputime=[ 2.208 ] cputime=[ 12.685 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14846.336 ] cputime=[ 15.388 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1686.368 ] cputime=[ 5.425 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 28299.232 ] cputime=[ 5.678 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1685.664 ] cputime=[ 5.306 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6453.056 ] cputime=[ 3.542 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1852.864 ] cputime=[ 3.468 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1824.928 ] cputime=[ 2.984 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.688 ] cputime=[ 16.396 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.472 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14804.864 ] cputime=[ 16.177 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1904.416 ] cputime=[ 5.234 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 28253.217 ] cputime=[ 5.290 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1903.584 ] cputime=[ 4.783 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6478.880 ] cputime=[ 3.631 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1848.736 ] cputime=[ 4.638 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1832.928 ] cputime=[ 3.051 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.656 ] cputime=[ 16.245 ]
method=[ memcpyDtoH ] gputime=[ 2.208 ] cputime=[ 12.449 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14793.216 ] cputime=[ 15.610 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1686.144 ] cputime=[ 5.442 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 28278.016 ] cputime=[ 5.161 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1685.856 ] cputime=[ 5.207 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6454.080 ] cputime=[ 3.523 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1843.136 ] cputime=[ 3.704 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1833.632 ] cputime=[ 3.028 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.624 ] cputime=[ 16.240 ]
method=[ memcpyDtoH ] gputime=[ 2.208 ] cputime=[ 12.442 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14823.872 ] cputime=[ 16.002 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1904.256 ] cputime=[ 5.463 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 28342.527 ] cputime=[ 5.398 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1903.520 ] cputime=[ 4.832 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6420.416 ] cputime=[ 3.468 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1854.208 ] cputime=[ 4.085 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1831.744 ] cputime=[ 3.325 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.656 ] cputime=[ 16.082 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.627 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 14828.704 ] cputime=[ 15.520 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1686.944 ] cputime=[ 5.813 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z20generate_seed_pseudoyyy14curandOrderingP17curandStateXORWOWPj ] gputime=[ 28175.520 ] cputime=[ 5.102 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ _Z13gen_sequencedI17curandStateXORWOWfiXadL_Z21curand_uniform_noargsIS0_EfPT_iEEEvS3_PT0_mmT1_ ] gputime=[ 1685.568 ] cputime=[ 5.098 ] gridsize=[ 64, 1 ] threadblocksize=[ 64, 1, 1 ] occupancy=[ 0.500 ]
method=[ mccuf_5_montecarlo_cuf5_kernel_ ] gputime=[ 6440.544 ] cputime=[ 3.426 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum0_ ] gputime=[ 1846.432 ] cputime=[ 3.888 ] gridsize=[ 128, 1 ] threadblocksize=[ 32, 1, 1 ] occupancy=[ 0.250 ]
method=[ mccuf_5_montecarlo_cuf5_sum_ ] gputime=[ 1826.560 ] cputime=[ 2.944 ] gridsize=[ 1, 1 ] threadblocksize=[ 1, 1, 1 ] occupancy=[ 0.016 ]
method=[ memcpyDtoH ] gputime=[ 2.688 ] cputime=[ 16.256 ]
method=[ memcpyDtoH ] gputime=[ 2.240 ] cputime=[ 12.637 ]