Dear all;

I had written this program to multiply each element of the matrix with a fixed number (2) in shared memory, the program compile and run correct, but the result is not as wanted.

the first row of matrix is just multiplied by , the rest row is the same

how can I solve this problem??

```
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#define N 4
__global__ void calculate_ratios(float *a,float *b)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
__shared__ float temp[N][N];
temp[ty][tx] = a[ty*(N ) + tx];
__syncthreads();
temp[ty][tx] = temp[ty][tx]*2;
__syncthreads();
b[ty*(N ) + tx] = temp[ty][tx];
__syncthreads();
}
int main()
{
float a_h[N*N] = { 3, 5, 2, 19, 2, 3, 11, 11, 1, 2, 2, 11, 0, 0, 0, 0 };
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("%.1f ", a_h[i *N + j]);
}
printf("\n");
}
printf("\n"); printf("\n");
float *a_d;
float *b_d;
cudaMalloc((void **)&a_d, N*sizeof(float));
cudaMalloc((void **)&b_d, N*sizeof(float));
cudaMemcpy(a_d, a_h, N*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimBlock(N, N, 1);
calculate_ratios << <1, dimBlock >> >(a_d,b_d);
cudaMemcpy(a_h, b_d, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("%.1f ", a_h[i *N + j]);
}
printf("\n");
}
cudaFree(a_d);
cudaFree(b_d);
}
```