This is a quick (and perhaps dumb) question. Say I have a kernel function:

```
// Element-wise update lhs[i] = lhs[i] * alpha * rhs[i] for i in [0, n).
// Both operands are staged through dynamic shared memory and the product is
// written straight back to global memory.
//
// Launch requirements:
//   - 1D grid and 1D block with gridDim.x * blockDim.x >= n.
//   - Dynamic shared memory (third launch argument) of at least
//     2 * blockDim.x * sizeof(float) bytes.
__global__ void foo_inline(unsigned int n, float* lhs, const float* rhs, const float alpha) {
    // Dynamically sized shared memory must be declared `extern`; an unsized
    // non-extern __shared__ array does not compile.
    extern __shared__ float s_data[];
    float* s_lhs = s_data;
    // Shared memory is per block and indexed by threadIdx.x, so the second
    // half begins blockDim.x floats in (not n, the global element count).
    float* s_rhs = &s_data[blockDim.x];
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // Each thread reads back only the slots it wrote itself, so no
        // __syncthreads() is required between the stores and the loads.
        s_lhs[threadIdx.x] = lhs[i];
        s_rhs[threadIdx.x] = rhs[i];
        // Evaluated left to right: (lhs * alpha) * rhs.
        lhs[i] = s_lhs[threadIdx.x] * alpha * s_rhs[threadIdx.x];
    }
}
```

I am copying the arrays into shared memory to reduce access time, but I have inlined the computation into the assignment back to global memory. My question is: is this the same as storing the result in shared memory first and then assigning it to global memory? That is:

```
// Element-wise update lhs[i] = lhs[i] * (alpha * rhs[i]) for i in [0, n).
// Both operands are staged through dynamic shared memory; the product is
// first stored back into shared memory and then copied to global memory.
//
// Launch requirements:
//   - 1D grid and 1D block with gridDim.x * blockDim.x >= n.
//   - Dynamic shared memory (third launch argument) of at least
//     2 * blockDim.x * sizeof(float) bytes.
__global__ void foo(unsigned int n, float* lhs, const float* rhs, const float alpha) {
    // Dynamically sized shared memory must be declared `extern`; an unsized
    // non-extern __shared__ array does not compile.
    extern __shared__ float s_data[];
    float* s_lhs = s_data;
    // Shared memory is per block and indexed by threadIdx.x, so the second
    // half begins blockDim.x floats in (not n, the global element count).
    float* s_rhs = &s_data[blockDim.x];
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // Each thread reads back only the slots it wrote itself, so no
        // __syncthreads() is required between the stores and the loads.
        s_lhs[threadIdx.x] = lhs[i];
        s_rhs[threadIdx.x] = rhs[i];
        // Compound assignment evaluates the right-hand side first:
        // lhs * (alpha * rhs).
        s_lhs[threadIdx.x] *= alpha * s_rhs[threadIdx.x];
        lhs[i] = s_lhs[threadIdx.x];
    }
}
```

It would seem that assigning the result back to shared memory is unnecessary, and that the second version will be at best as efficient as the first (perhaps even less efficient, because of the extra write to shared memory), but I just wanted to make sure.

Thanks!