why is shared memory example not faster

LHickey · April 23, 2012, 4:27am

I made up an example to see if shared memory is really faster.

The two programs are x.cu and y.cu
They both return the same answer like they should
x.cu uses shared memory, so must declare an array to support the working set
per thread,

y.cu juse uses the device memory you get with the stack frame so each thread
gets its copy of the working set this way

both do a reduction of the data set computed by thread id, into item 0

Time results are essentially the same. same answer.
1 time x
2 2089967186485776.000000
3
4 real. 0m10.53s
5 user. 0m10.42s
6 sys. 0m 0.09s
7
8 time y
9 2089967186485776.000000
10
11 real. 0m 9.75s
12 user. 0m 9.68s
13 sys. 0m 0.06s

Here is the source for both

cat x.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include “book.h”
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 global void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;

23 . shared struct workdef Z[N];
24 . struct workdef *W = &Z[tid];

25 . {
26 . . W->S = tid;
27 . . W->A = 0.0;
28 . . W->B = 0.0;
29 . }
30 . syncthreads();
31 . for ( W->i=0;W->i < M;(W->i)++)
32 . {
33 . . W->k1 = (tid + W->i) % N;
34 . . W->A= (double) W->i;
35 . . for ( W->j=0;W->j < M;W->j++)
36 . . {
37 . . . W->k2 = (tid + W->j) % N;
38 . . . W->B = (double) W->k2;
39 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
40 . . }
41 . }
42 . (*D)[tid] = W->S;
43 . syncthreads();

44 . {
45 . . int s;
46 . . for (
47 . . . s=N/2;
48 . . . s>0;
49 . . . s>>=1)
50 . . {
51 . . . if ( tid < s)
52 . . . {
53 . . . . (D)[tid] += (D)[tid+s];
54 . . . }
55 . . . __syncthreads();
56 . . }
57 . }

58 }
59
60 int main( void )
61 {
62 . double (D)[N] = 0;
63 . double H;
64 . size_t sz = sizeof( double [N]);
65
66 . HANDLE_ERROR( cudaMalloc( (void*)&D , sz ));
67
68 . K<<<1,N>>>(D);
69 . cudaThreadSynchronize(); /* sync up mapped mem with host */
70 . {
71 . . cudaError_t e= cudaGetLastError();
72 . . assert ( e == cudaSuccess);
73 . }
74
75 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
76 . printf(“%lf\n”, H);
77 . HANDLE_ERROR( cudaFree(D));
78 }

cat y.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include “book.h”
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 global void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;

23 . struct workdef Z[1];
24 . struct workdef *W = &Z[0];

25 . //if( tid == 0)
26 . {
27 . . int i;
28 . . W->S = tid;
29 . . for ( i=0;i < N;i++)
30 . . {
31 . . . W->A = 0.0;
32 . . . W->B = 0.0;
33 . . }
34 . }
35 . syncthreads();
36 . for ( W->i=0;W->i < M;(W->i)++)
37 . {
38 . . W->k1 = (tid + W->i) % N;
39 . . W->A= (double) W->i;
40 . . for ( W->j=0;W->j < M;W->j++)
41 . . {
42 . . . W->k2 = (tid + W->j) % N;
43 . . . W->B = (double) W->k2;
44 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
45 . . }
46 . }
47 . (*D)[tid] = W->S;
48 . syncthreads();

49 . {
50 . . int s;
51 . . for (
52 . . . s=N/2;
53 . . . s>0;
54 . . . s>>=1)
55 . . {
56 . . . if ( tid < s)
57 . . . {
58 . . . . (D)[tid] += (D)[tid+s];
59 . . . }
60 . . . __syncthreads();
61 . . }
62 . }

63 }
64
65 int main( void )
66 {
67 . double (D)[N] = 0;
68 . double H;
69 . size_t sz = sizeof( double [N]);
70
71 . HANDLE_ERROR( cudaMalloc( (void*)&D , sz ));
72
73 . K<<<1,N>>>(D);
74 . cudaThreadSynchronize(); /* sync up mapped mem with host */
75 . {
76 . . cudaError_t e= cudaGetLastError();
77 . . assert ( e == cudaSuccess);
78 . }
79
80 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
81 . printf(“%lf\n”, H);
82 . HANDLE_ERROR( cudaFree(D));
83 }

Can you explain what I have to do to get the big time difference that shared memory is supposed to
yield?

seibert · April 23, 2012, 8:46pm

What GPU is this? It is possible that the L1 and L2 cache are mitigating the performance impact of using local instead of shared memory. The speed reduction from using local arrays is much higher for compute capability 1.x devices.

Edit: Oops, I see that this question was asked twice, and the responses in the other thread are way better:

Topic		Replies	Views
why is shared memory example not faster CUDA Programming and Performance	7	1303	May 16, 2012
__shared__ memory confused me. __shared__ memory CUDA Programming and Performance	7	3995	August 1, 2009
Shared memory as slow as global memory CUDA Programming and Performance	8	4371	September 5, 2016
Local vs Shared Memory execution slows down when using shared memory CUDA Programming and Performance	6	3183	October 14, 2009
access speed of shared memory and global memory CUDA Programming and Performance	1	1070	August 6, 2009
Problems with using shared memory CUDA Programming and Performance	5	5846	September 14, 2009
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16295	January 30, 2011
shared memory vs local memory CUDA Programming and Performance	1	8066	December 12, 2011
Problem about time of copy data through shared memory CUDA Programming and Performance	4	356	March 30, 2024
Problems when using shared memory CUDA Programming and Performance cuda	1	301	May 15, 2024

why is shared memory example not faster

23 . shared struct workdef Z[N]; 24 . struct workdef *W = &Z[tid];

44 . { 45 . . int s; 46 . . for ( 47 . . . s=N/2; 48 . . . s>0; 49 . . . s>>=1) 50 . . { 51 . . . if ( tid < s) 52 . . . { 53 . . . . (*D)[tid] += (*D)[tid+s]; 54 . . . } 55 . . . __syncthreads(); 56 . . } 57 . }

23 . struct workdef Z[1]; 24 . struct workdef *W = &Z[0];

49 . { 50 . . int s; 51 . . for ( 52 . . . s=N/2; 53 . . . s>0; 54 . . . s>>=1) 55 . . { 56 . . . if ( tid < s) 57 . . . { 58 . . . . (*D)[tid] += (*D)[tid+s]; 59 . . . } 60 . . . __syncthreads(); 61 . . } 62 . }

Related topics

23 . shared struct workdef Z[N];
24 . struct workdef *W = &Z[tid];

44 . {
45 . . int s;
46 . . for (
47 . . . s=N/2;
48 . . . s>0;
49 . . . s>>=1)
50 . . {
51 . . . if ( tid < s)
52 . . . {
53 . . . . (D)[tid] += (D)[tid+s];
54 . . . }
55 . . . __syncthreads();
56 . . }
57 . }

23 . struct workdef Z[1];
24 . struct workdef *W = &Z[0];

49 . {
50 . . int s;
51 . . for (
52 . . . s=N/2;
53 . . . s>0;
54 . . . s>>=1)
55 . . {
56 . . . if ( tid < s)
57 . . . {
58 . . . . (D)[tid] += (D)[tid+s];
59 . . . }
60 . . . __syncthreads();
61 . . }
62 . }