Cuda, coping data from global to shared memory

Hi,
I am a beginner in CUDA programming.I have to use shared memory. In my task I have 145 vectors with many thousands of elements. There vectors are written in a one-dimensional array. The result must be a scalar product of all vectors. I can’t load data into buffer. I should load warp with 32 elements for each vector and then make the product. Can you show me what I’m doing wrong?

Here is my code

global void scalar_13(real *Mat, int chunk_size, int Len, int Size, real *resMat ){

int s_x = threadIdx.x + blockIdx.x * blockDim.x;
int s_y = threadIdx.y + blockIdx.y * blockDim.y;
int i;
int ad_x;
int ad_y;
double res;
int th_num = threadIdx.y*blockDim.x+threadIdx.x;
int iter = Len/chunk_size;
int remain, start_x, start_y;
extern __shared__ float buffer[];
res = 0.0;
iter = Len / chunk_size; 
remain = Len % chunk_size; 
int n;
ad_x = s_x*Len; //beginning of the vector X
ad_y = s_y*Len; //beginning of the vector Y

int offset_x = threadIdx.x * chunk_size;
int offset_y = blockDim.x * chunk_size + threadIdx.y * chunk_size;

if ( (s_x<Size) & (s_y<Size) ) { 
    for (n=0;n<iter;n++) {  
        if (th_num < 32) { 

            for (int ii=0;ii<blockDim.x;ii++) {

                buffer[ii * chunk_size + th_num] = Mat[ad_x + n * chunk_size + th_num]; 

            }

            for (int ii=0;ii<blockDim.y;ii++) { 

                buffer[(blockDim.x + ii) * chunk_size + th_num] = Mat[ad_y + n * chunk_size + th_num];

            } 

        }

        __syncthreads();

        for (int i=0; i<chunk_size; i++) {

            res+= buffer[offset_x+i]*buffer[offset_y+i];

            ad_x++;
            ad_y++;

        }

    }

}

__syncthreads();

if ( (s_x<Size) & (s_y<Size) ) { 

    for (int ii=0;ii<blockDim.x;ii++) {

        buffer[offset_x + th_num + ii * chunk_size] = Mat[ad_x + iter * chunk_size + th_num]; 

    }

    for (int ii=0;ii<blockDim.y;ii++) { 

        buffer[offset_y + th_num + ii * chunk_size] = Mat[ad_y + iter * chunk_size + th_num];

    }   

    __syncthreads();

    for (int i=0; i<remain; i++) {

        res+= buffer[offset_x+i]*buffer[offset_y+i];

        ad_x++;

        ad_y++;

    }

}

__syncthreads();

resMat[s_x*Size+s_y] = res;

__syncthreads();

}

Thanks