How to find the maximum number in an array in GPU and CPU and calculate the time for the processes.

I want to find the maximum number in an array and get the time for this process of GPU and CPU. I have attached my code for this. But the result is wrong (Specifically, the maximum number and minimum number are same, and the calculation time for GPU is 0). Can anyone help me how to change the code to get the expected result?
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
float cpu1;
float cpu2;
float gpu1;
float gpu2;
int max1;
int min1;
int m1;
int m2;
int m3;
int m4;

global void arradd( float *A, int N)
{
extern shared int sdata;

unsigned int tid = threadIdx.x;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

sdata[tid] = A[i];

__syncthreads();
for (unsigned int s = blockDim.x/2; s>=1; s=s/2)
{
if (tid< s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if(tid ==0) A[blockIdx.x] = sdata[0];
}

int helper(int N){
cudaError_t err = cudaSuccess;
cudaEvent_t start1, stop1;
cudaEvent_t start2, stop2;
cudaEvent_t start3, stop3;
cudaEvent_t start4, stop4;
float time1;
float time2;
float time3;
float time4;
N = N * 1048576;

size_t size = N *sizeof(float);

float *h_A = (float *)malloc(size);

for (int i = 0; i < N; i++)
{
srand((unsigned)time(NULL));
h_A[i] = rand();
}

float *d_A = NULL;

err = cudaMalloc((void **)&d_A, size);

int max = h_A[0];
int min = h_A[0];

err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

cudaEventCreate(&start1);
cudaEventRecord(start1,0);

for (int j = 1; j < N; j++)
{
if (h_A[j] > max)
{
max = h_A[j];
}
}

cudaEventCreate(&stop1);
cudaEventRecord(stop1,0);
cudaEventSynchronize(stop1);
cudaEventElapsedTime(&time1, start1, stop1);
cpu1 = time1;
m1 = max;

cudaEventCreate(&start2);
cudaEventRecord(start2,0);

for (int k = 1; k < N; k++)
{
if (h_A[k] < min)
{
min = h_A[k];
}
}
cudaEventCreate(&stop2);
cudaEventRecord(stop2,0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time2,start2,stop2);
//printf(“The time for kernal is %fms\n”,time2);
cpu2 = time2;
m2 = min;

cudaEventCreate(&start3);
cudaEventRecord(start3,0);
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
arradd<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);
err = cudaGetLastError();

cudaEventCreate(&stop3);
cudaEventRecord(stop3,0);
cudaEventSynchronize(stop3);
cudaEventElapsedTime(&time3,start3,stop3);
gpu1 = time3;
//m3 = max1;

err = cudaFree(d_A);

free(h_A);

err = cudaDeviceReset();

return 0;

}

int main(void){
int a[3] = {2, 8, 32};
float element1;
printf(“N GPUmax CPUmax GPUtime CPUtime GPUSpeedup \n”);
for (int i=0; i<=2; i++){
helper(a[i]);
element1 = a[i];
printf(“%6f “, element1);
printf(”%12f “, m3);
printf(”%16f “, m1);
printf(”%20f “, gpu1);
printf(”%25f \n”, cpu1);
}
}