why is shared memory example not faster

I made up an example to see if shared memory is really faster.

The two programs are x.cu and y.cu
They both return the same answer like they should
x.cu uses shared memory, so must declare an array to support the working set
per thread,

y.cu juse uses the device memory you get with the stack frame so each thread
gets its copy of the working set this way

both do a reduction of the data set computed by thread id, into item 0

Time results are essentially the same. same answer.
1 time x
2 2089967186485776.000000
3
4 real. 0m10.53s
5 user. 0m10.42s
6 sys. 0m 0.09s
7
8 time y
9 2089967186485776.000000
10
11 real. 0m 9.75s
12 user. 0m 9.68s
13 sys. 0m 0.06s

Here is the source for both

cat x.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include “book.h”
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 global void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;


23 . shared struct workdef Z[N];
24 . struct workdef *W = &Z[tid];

25 . {
26 . . W->S = tid;
27 . . W->A = 0.0;
28 . . W->B = 0.0;
29 . }
30 . __syncthreads();
31 . for ( W->i=0;W->i < M;(W->i)++)
32 . {
33 . . W->k1 = (tid + W->i) % N;
34 . . W->A= (double) W->i;
35 . . for ( W->j=0;W->j < M;W->j++)
36 . . {
37 . . . W->k2 = (tid + W->j) % N;
38 . . . W->B = (double) W->k2;
39 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
40 . . }
41 . }
42 . (*D)[tid] = W->S;
43 . __syncthreads();

44 . {
45 . . int s;
46 . . for (
47 . . . s=N/2;
48 . . . s>0;
49 . . . s>>=1)
50 . . {
51 . . . if ( tid < s)
52 . . . {
53 . . . . (*D)[tid] += (*D)[tid+s];
54 . . . }
55 . . . __syncthreads();
56 . . }
57 . }

58 }
59
60 int main( void )
61 {
62 . double (D)[N] = 0;
63 . double H;
64 . size_t sz = sizeof( double [N]);
65
66 . HANDLE_ERROR( cudaMalloc( (void
*)&D , sz ));
67
68 . K<<<1,N>>>(D);
69 . cudaThreadSynchronize(); /* sync up mapped mem with host */
70 . {
71 . . cudaError_t e= cudaGetLastError();
72 . . assert ( e == cudaSuccess);
73 . }
74
75 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
76 . printf("%lf\n", H);
77 . HANDLE_ERROR( cudaFree(D));
78 }

cat y.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include “book.h”
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 global void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;


23 . struct workdef Z[1];
24 . struct workdef *W = &Z[0];

25 . //if( tid == 0)
26 . {
27 . . int i;
28 . . W->S = tid;
29 . . for ( i=0;i < N;i++)
30 . . {
31 . . . W->A = 0.0;
32 . . . W->B = 0.0;
33 . . }
34 . }
35 . __syncthreads();
36 . for ( W->i=0;W->i < M;(W->i)++)
37 . {
38 . . W->k1 = (tid + W->i) % N;
39 . . W->A= (double) W->i;
40 . . for ( W->j=0;W->j < M;W->j++)
41 . . {
42 . . . W->k2 = (tid + W->j) % N;
43 . . . W->B = (double) W->k2;
44 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
45 . . }
46 . }
47 . (*D)[tid] = W->S;
48 . __syncthreads();

49 . {
50 . . int s;
51 . . for (
52 . . . s=N/2;
53 . . . s>0;
54 . . . s>>=1)
55 . . {
56 . . . if ( tid < s)
57 . . . {
58 . . . . (*D)[tid] += (*D)[tid+s];
59 . . . }
60 . . . __syncthreads();
61 . . }
62 . }

63 }
64
65 int main( void )
66 {
67 . double (D)[N] = 0;
68 . double H;
69 . size_t sz = sizeof( double [N]);
70
71 . HANDLE_ERROR( cudaMalloc( (void
*)&D , sz ));
72
73 . K<<<1,N>>>(D);
74 . cudaThreadSynchronize(); /* sync up mapped mem with host */
75 . {
76 . . cudaError_t e= cudaGetLastError();
77 . . assert ( e == cudaSuccess);
78 . }
79
80 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
81 . printf("%lf\n", H);
82 . HANDLE_ERROR( cudaFree(D));
83 }

Can you explain what I have to do to get the big time difference that shared memory is supposed to
yield?

What GPU is this? It is possible that the L1 and L2 cache are mitigating the performance impact of using local instead of shared memory. The speed reduction from using local arrays is much higher for compute capability 1.x devices.

Edit: Oops, I see that this question was asked twice, and the responses in the other thread are way better:

http://forums.nvidia.com/index.php?showtopic=227968