Matrix threads seem to find the same element multiple times?

Hi I am just at the moment outputting what the threads are finding when looping through a matrix and when I do it I seem to get the elements of the matrix multiple times… I can’t see the problem any ideas?

#include <string.h>

#include <stdlib.h>

#include <stdio.h>

#define BLOCK_SIZE 4

#define H  4

#define W  4

#define S  16

#define AS(i, j) As[i][j]

__global__ void

matrix(float* N, int w)

{

	int blockX = blockIdx.x;

	int blockY = blockIdx.y;

	int threadX = threadIdx.x;

	int threadY = threadIdx.y;

	int begin = w * BLOCK_SIZE * blockY;

	

	int end = begin + w - 1;

	int step = BLOCK_SIZE;

	float n = 0;

	int k = 0;

	for(int a = begin; a<=end; a+=step)

	{

		

		__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

		

		//AS(ty, tx) = A[a + wA * ty + tx];

		AS(threadY,threadX) = N[a + w * threadY + threadX];

		

		__syncthreads();

		for(int k = 0; k < BLOCK_SIZE; ++k)

		{

			 //Csub += AS(ty, k) * BS(k, tx);

			n += AS(threadY, k);

			

			printf("You found the value%f\n" ,n);

			

		}

		__syncthreads();

	}

	

}