When I compile your code for sm_89, I observe in the SASS that in the shared case the two global loads are issued back to back, whereas in the non-shared case the order is load-store-load-store. I’m not sure why the compiler has done this, but I suspect it is the source of the perf difference: the shared kernel can get twice as many loads in flight.
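If you want to confirm the ordering on your own build, the SASS can be dumped from the compiled executable with the CUDA binary utilities (a minimal check, assuming cuobjdump is on your PATH; the instructions of interest are the LDG global loads and STG global stores):

# cuobjdump -sass ./t162 | grep -E "LDG|STG"

In the non-shared build you should see the load-store-load-store pattern described above; in the shared (or USE_FIX) build the two LDGs appear before the stores.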
When I “force” the compiler to behave similarly, the non-shared case becomes faster in my test:
# cat t162.cu
// copy data directly
__global__ void copyRow(float *MatA, float *MatB, int nx, int ny)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x * 2;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    int idx = ix + iy * nx;
    if (ix < nx && iy < ny)
    {
#ifndef USE_FIX
        MatB[idx] = MatA[idx];
        MatB[idx + blockDim.x * 1] = MatA[idx + blockDim.x * 1];
#else
        // load both elements into registers before either store
        auto t1 = MatA[idx];
        auto t2 = MatA[idx + blockDim.x];
        MatB[idx] = t1;
        MatB[idx + blockDim.x] = t2;
#endif
    }
}

// copy staged through shared memory
__global__ void copyRow_Sheme(float *MatA, float *MatB, int nx, int ny)
{
    __shared__ float tile[16][16 * 2];
    int ix = threadIdx.x + blockDim.x * blockIdx.x * 2;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    int idx = ix + iy * nx;
    // loop unroll 2
    if (ix < nx && iy < ny)
    {
        tile[threadIdx.y][threadIdx.x] = MatA[idx];
        tile[threadIdx.y][threadIdx.x + blockDim.x] = MatA[idx + blockDim.x];
        __syncthreads();
        MatB[idx] = tile[threadIdx.y][threadIdx.x];
        MatB[idx + blockDim.x] = tile[threadIdx.y][threadIdx.x + blockDim.x];
    }
}

using mt = float;

int main(){
    size_t sz = 4096;
    size_t msz = sz * sz;
    dim3 grid = dim3(sz / 16 / 2, sz / 16);
    dim3 block = dim3(16, 16);
    mt *d_MatA, *d_MatB;
    cudaMalloc(&d_MatA, sizeof(float) * msz);
    cudaMalloc(&d_MatB, sizeof(float) * msz);
#ifdef USE_SHARED
    copyRow_Sheme<<<grid,block>>>(d_MatA, d_MatB, sz, sz);
    cudaDeviceSynchronize();
    copyRow_Sheme<<<grid,block>>>(d_MatA, d_MatB, sz, sz);
    cudaDeviceSynchronize();
#else
    copyRow<<<grid,block>>>(d_MatA, d_MatB, sz, sz);
    cudaDeviceSynchronize();
    copyRow<<<grid,block>>>(d_MatA, d_MatB, sz, sz);
    cudaDeviceSynchronize();
#endif
}
# nvcc -o t162 t162.cu -arch=sm_89 -DUSE_FIX
# nsys profile --stats=true ./t162
Generating '/tmp/nsys-report-be42.qdstrm'
[1/8] [========================100%] report27.nsys-rep
[2/8] [========================100%] report27.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: /root/bobc/report27.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------ ----------- -------- ----------- ------------ --------------
52.3 217,365,746 480 452,845.3 14,868.5 1,033 79,909,360 4,271,395.4 ioctl
41.5 172,245,111 11 15,658,646.5 3,077,555.0 7,483 100,128,120 29,519,094.6 poll
5.2 21,537,869 29 742,685.1 5,355.0 2,024 21,364,588 3,966,156.7 fopen
0.5 2,014,520 27 74,611.9 12,468.0 10,445 1,241,603 234,543.8 mmap64
0.2 793,336 44 18,030.4 16,883.0 6,824 34,644 4,928.5 open64
0.1 409,543 9 45,504.8 40,925.0 34,647 77,714 13,645.9 sem_timedwait
0.1 277,140 2 138,570.0 138,570.0 117,402 159,738 29,936.1 pthread_create
0.0 167,968 16 10,498.0 4,914.0 2,585 70,610 16,499.6 mmap
0.0 84,472 48 1,759.8 65.0 58 81,167 11,705.3 fgets
0.0 67,531 23 2,936.1 3,120.0 1,518 4,320 725.3 fclose
0.0 55,234 51 1,083.0 1,065.0 728 2,053 199.6 fcntl
0.0 52,046 7 7,435.1 6,060.0 3,730 15,864 4,131.8 munmap
0.0 39,856 6 6,642.7 6,639.5 2,940 10,945 2,806.5 open
0.0 32,234 13 2,479.5 2,020.0 1,310 4,792 1,129.2 read
0.0 30,052 10 3,005.2 2,594.5 1,538 5,595 1,227.2 write
0.0 17,645 2 8,822.5 8,822.5 5,113 12,532 5,246.0 socket
0.0 16,858 1 16,858.0 16,858.0 16,858 16,858 0.0 fread
0.0 14,620 1 14,620.0 14,620.0 14,620 14,620 0.0 connect
0.0 9,747 1 9,747.0 9,747.0 9,747 9,747 0.0 pipe2
0.0 6,197 7 885.3 875.0 816 1,010 63.2 dup
0.0 2,483 1 2,483.0 2,483.0 2,483 2,483 0.0 bind
0.0 1,654 1 1,654.0 1,654.0 1,654 1,654 0.0 listen
[5/8] Executing 'cuda_api_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------ ------------ -------- ----------- ------------- ----------------------
99.4 186,035,644 2 93,017,822.0 93,017,822.0 233,576 185,802,068 131,216,739.1 cudaMalloc
0.5 1,016,510 2 508,255.0 508,255.0 443,663 572,847 91,346.9 cudaDeviceSynchronize
0.1 179,584 2 89,792.0 89,792.0 11,242 168,342 111,086.5 cudaLaunchKernel
0.0 1,499 1 1,499.0 1,499.0 1,499 1,499 0.0 cuModuleGetLoadingMode
[6/8] Executing 'cuda_gpu_kern_sum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -----------------------------------
100.0 1,013,984 2 506,992.0 506,992.0 443,744 570,240 89,446.2 copyRow(float *, float *, int, int)
[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
SKIPPED: /root/bobc/report27.sqlite does not contain GPU memory data.
[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
SKIPPED: /root/bobc/report27.sqlite does not contain GPU memory data.
Generated:
/root/bobc/report27.nsys-rep
/root/bobc/report27.sqlite
# nvcc -o t162 t162.cu -arch=sm_89 -DUSE_FIX -DUSE_SHARED
# nsys profile --stats=true ./t162
Generating '/tmp/nsys-report-4b6f.qdstrm'
[1/8] [========================100%] report28.nsys-rep
[2/8] [========================100%] report28.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: /root/bobc/report28.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------ ----------- -------- ----------- ------------ --------------
51.9 210,899,831 480 439,374.6 15,051.0 1,085 79,522,396 4,121,169.9 ioctl
42.3 171,815,180 11 15,619,561.8 2,985,520.0 5,269 100,075,732 29,506,774.7 poll
4.8 19,419,139 29 669,625.5 5,852.0 2,063 19,231,020 3,569,865.5 fopen
0.5 1,989,173 27 73,673.1 12,448.0 10,363 1,220,594 230,570.8 mmap64
0.2 801,706 44 18,220.6 16,734.0 7,009 33,571 5,210.7 open64
0.1 446,043 9 49,560.3 42,284.0 36,660 79,520 14,803.7 sem_timedwait
0.1 288,506 2 144,253.0 144,253.0 123,757 164,749 28,985.7 pthread_create
0.0 171,854 16 10,740.9 4,950.5 2,650 70,439 16,488.8 mmap
0.0 84,842 48 1,767.5 66.5 58 81,468 11,748.5 fgets
0.0 71,426 23 3,105.5 3,196.0 1,507 5,050 909.3 fclose
0.0 55,901 51 1,096.1 1,068.0 738 1,877 222.7 fcntl
0.0 54,169 7 7,738.4 6,390.0 4,255 17,515 4,436.7 munmap
0.0 42,000 6 7,000.0 6,445.5 2,757 10,867 3,259.5 open
0.0 33,855 13 2,604.2 2,046.0 1,548 5,887 1,381.6 read
0.0 28,509 10 2,850.9 2,604.5 1,546 5,662 1,104.8 write
0.0 18,407 2 9,203.5 9,203.5 5,402 13,005 5,376.1 socket
0.0 17,405 1 17,405.0 17,405.0 17,405 17,405 0.0 fread
0.0 14,655 1 14,655.0 14,655.0 14,655 14,655 0.0 connect
0.0 8,949 1 8,949.0 8,949.0 8,949 8,949 0.0 pipe2
0.0 6,485 7 926.4 900.0 870 1,023 63.7 dup
0.0 2,552 1 2,552.0 2,552.0 2,552 2,552 0.0 bind
0.0 1,807 1 1,807.0 1,807.0 1,807 1,807 0.0 listen
[5/8] Executing 'cuda_api_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------ ------------ -------- ----------- ------------- ----------------------
99.4 185,583,566 2 92,791,783.0 92,791,783.0 233,933 185,349,633 130,896,566.8 cudaMalloc
0.6 1,030,060 2 515,030.0 515,030.0 460,124 569,936 77,648.8 cudaDeviceSynchronize
0.1 180,565 2 90,282.5 90,282.5 10,882 169,683 112,289.3 cudaLaunchKernel
0.0 1,658 1 1,658.0 1,658.0 1,658 1,658 0.0 cuModuleGetLoadingMode
[6/8] Executing 'cuda_gpu_kern_sum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -----------------------------------------
100.0 1,027,041 2 513,520.5 513,520.5 459,393 567,648 76,547.8 copyRow_Sheme(float *, float *, int, int)
[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
SKIPPED: /root/bobc/report28.sqlite does not contain GPU memory data.
[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
SKIPPED: /root/bobc/report28.sqlite does not contain GPU memory data.
Generated:
/root/bobc/report28.nsys-rep
/root/bobc/report28.sqlite
#
I also observed that kernel invocation order seemed to affect the results significantly, so I modified my test to invoke only one type of kernel per run. This methodology suggests to me that the non-shared method is slightly faster.
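If you want a cross-check that does not depend on nsys or on launch order, a minimal sketch using CUDA events is below. This is not what I ran above; it assumes the snippet is dropped into main() of t162.cu after the cudaMalloc calls, with #include <cstdio> added at the top of the file:

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    copyRow<<<grid,block>>>(d_MatA, d_MatB, sz, sz);   // warm-up launch, not timed
    cudaDeviceSynchronize();
    cudaEventRecord(start);
    copyRow<<<grid,block>>>(d_MatA, d_MatB, sz, sz);   // timed launch
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("copyRow kernel: %f ms\n", ms);

The warm-up launch is there so the timed launch is not paying one-time startup costs; you would time copyRow_Sheme the same way in a separate run.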