Why does my code gives wrong output when number of blocks exceed 1

I am new to CUDA and I am trying to write power iteration method for calculating dominant eigen vector in CUDA and here is my approach

#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

#define N 256
#define THREADS_PER_BLOCK 256


__global__ void matrix_mul(float *d_A , float *d_x)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;

if (idx < N)
{
float sum = 0.0f;


#pragma unroll
for (int j = 0; j < N; j++)
{
sum += d_A[idx * N + j] * d_x[j];
__syncthreads();
}
d_x[idx] = sum;
__syncthreads();

}

}

__global__ void normalize(float *d_x)
{

int idx = threadIdx.x + blockDim.x * blockIdx.x;

if(idx<N)
{

d_x[idx] = d_x[idx] / d_x[N-1];
__syncthreads();
}
}


__global__ void eigenvalue(float *d_x , float *d_temp , float *d_result)
{


int idx = threadIdx.x + blockDim.x * blockIdx.x;


__syncthreads();
if (idx == 0 )
{
float numerator = 0.0f;
float denominator = 0.0f;

#pragma unroll
for (int i = 0; i < N; i++)
{
numerator += (d_temp[i] * d_x[i]);
denominator += (d_x[i] * d_x[i]);
}

*d_result = numerator/denominator;

}

}

__global__ void copy(float *d_x , float *d_temp)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if(idx<N)
{
d_temp[idx] = d_x[idx];
__syncthreads();
}
}

void power_iteration(float *h_A , float *h_x)
{

float *d_A ;

cudaMalloc((void **)&d_A, N * N * sizeof(float));


float *d_x , *d_temp;

cudaMalloc((void **)&d_x, N * sizeof(float));
cudaMalloc((void **)&d_temp, N * sizeof(float));


cudaMemcpy(d_A, h_A, N*N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_temp, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);

float  *d_result;
cudaMalloc((void **)&d_result, sizeof(float));


int numBlocks = (N+THREADS_PER_BLOCK-1) / (THREADS_PER_BLOCK);
int numThreads= (THREADS_PER_BLOCK);

float tolerance = 1e-4;
int iterations = 0;
float old = 0.0f;
float error = 0.0f;
float *h_result = (float *) malloc(sizeof(float));


while(error>tolerance || iterations ==0)
{
iterations++;

matrix_mul<<<numBlocks, numThreads>>>(d_A,d_x);
normalize<<<numBlocks, numThreads>>>(d_x);
eigenvalue<<<numBlocks, numThreads>>>(d_x , d_temp , d_result);

cudaMemcpy(h_result , d_result , sizeof(float) , cudaMemcpyDeviceToHost);

error = *h_result-old;
old = *h_result;

copy<<<numBlocks, numThreads>>>(d_x,d_temp);
}

printf("%d\n" ,iterations);


cudaDeviceSynchronize();

cudaMemcpy(h_x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

for(int i = 0 ; i<N ; i++)
{
printf("%f ", h_x[i]);
}
printf("\n");


cudaFree(d_x);
cudaFree(d_temp);
cudaFree(d_result);
free(h_result);

cudaFree(d_A);
}



int main() {



float *h_A;
h_A = (float *)malloc(N*N*sizeof(float));



for(int i = 0 ; i<N ; i++)
{
for(int j =0 ; j< N ; j++)
h_A[i*N+j] = (float) i/N;
}


float *h_x;
h_x = (float *)malloc(N*sizeof(float));




for (int i = 0; i < N; ++i) {
h_x[i] = 1.0;
}

power_iteration(h_A , h_x);



free(h_x);
free(h_A);

return 0;
}

Here I have divided the procedure into several kernels and launched them , this code works perfect for N<=256 but for N > 256 say 257 it fails , I am unable to find why can any one help