compare 2 array - stack

when I run the following code:

#define THREADS_BLOCKS_IN_THE_GRID 8
#define NUM_OF_THREADS 1024
#define PIXELS 1024

__global__ void Compare (unsigned char *sub_dm_ORG , unsigned char *dm_SMP , float *successPercent , int rows_SMP)
{

__shared__ float temp[NUM_OF_THREADS];

int index = threadIdx.x + blockIdx.x * blockDim.x;

.
.
.
}

In the main:

int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS) / (NUM_OF_THREADS+1) ;
hashCompare<<<threadBlockInTheGrid , NUM_OF_THREADS >>> (Image_ORG , Image_SMP , d_successPercent , rows_SMP);

For PIXELS = 1024 the code works fine.
For PIXELS greater than 1024, the code compiles but does not run correctly.
Can someone help?

In the failing case, try running your code with cuda-memcheck.

int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS) / (NUM_OF_THREADS+1) ;

should be:

int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS -1) / NUM_OF_THREADS ;

I tried changing the code to this,
but it still does not work for more than 1024 points:

my code is:

#include <stdio.h>
#include <time.h>

#define THREAD_NUMBER 1024 // Number of Threads inside the block
#define ELEMENT_NUMBER 1024

__global__ void Compare (unsigned char *sub_dm_ORG , unsigned char *dm_SMP , float *successPercent)
{

__shared__ float temp[THREAD_NUMBER];

int index = threadIdx.x + blockIdx.x * blockDim.x;

.
.
.
}

int main()
{
clock_t startTime = clock();

.

.
.
int BlockNumber = ELEMENT_NUMBER / THREAD_NUMBER;
//int ThreadBlockInTheGrid = (ELEMENT_NUMBER + THREAD_NUMBER) / (THREAD_NUMBER+1) ;
hashCompare<<<BlockNumber , THREAD_NUMBER >>> (Image_ORG , Image_SMP , d_successPercent);

.
.
.

return 0;

}
If I have more than 1 block,
do I need to control how the work is divided among the threads? And if so, how?
I cannot understand how to use this kernel call.
I need 50,000 elements.

int BlockNumber = (ELEMENT_NUMBER + THREAD_NUMBER -1) / THREAD_NUMBER; // round-up div.

Unfortunately, if I change ELEMENT_NUMBER to 10240 (for example), it’s still not working )-:

If I change ELEMENT_NUMBER to 10240 (for example), it’s still not working )-:

See the following sample; it works.

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

/**
 * Counts the positions at which two device arrays differ.
 *
 * Each thread handles at most one element: thread i compares a[i] with b[i]
 * and, on a mismatch, atomically increments *diff_count.
 *
 * Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= size.
 * Surplus threads in the last block are masked off by the bounds check.
 * No shared memory is used.
 *
 * Precondition: the host must zero *diff_count (e.g. with cudaMemset) before
 * the launch. The counter cannot be reset safely from inside the kernel:
 * __syncthreads() is a barrier for a single block only, so a reset performed
 * by one thread could execute after other blocks have already added their
 * mismatches, silently discarding them.
 *
 * @param a          device pointer to the first array (size elements)
 * @param b          device pointer to the second array (size elements)
 * @param diff_count device pointer to the mismatch counter (accumulated into)
 * @param size       number of elements to compare
 */
__global__ void kernel_compare(const int *a, const int *b, int* diff_count, unsigned int size) {
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  // Bounds check: the grid is rounded up, so the tail block may overshoot.
  if (i < size && a[i] != b[i]) {
    atomicAdd(diff_count, 1);
  }
}

#include <iostream>

// Evaluates a CUDA runtime call; on failure prints the error string with its
// source location and makes the enclosing function (main) return 1.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \
                << cudaGetErrorString(err_) << "\n";                       \
      return 1;                                                            \
    }                                                                      \
  } while (0)

/**
 * Demo driver: builds two device arrays of n ints that differ in their first
 * half, counts the differing positions on the GPU and prints the result.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main() {
  const int n = 10240;
  const int threadsPerBlock = 256;
  // Round-up division so every element gets a thread even when n is not a
  // multiple of the block size.
  const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

  int* a;
  CUDA_CHECK(cudaMalloc(&a, n * sizeof(int)));
  CUDA_CHECK(cudaMemset(a, 0, n * sizeof(int))); // fill 0
  int* b;
  CUDA_CHECK(cudaMalloc(&b, n * sizeof(int)));
  CUDA_CHECK(cudaMemset(b, 0, n * sizeof(int))); // fill 0
  // cudaMemset writes bytes: each int in the first half becomes 0x01010101,
  // which is non-zero and therefore differs from a[].
  CUDA_CHECK(cudaMemset(b, 1, (n / 2) * sizeof(int)));

  int* count;
  CUDA_CHECK(cudaMalloc(&count, sizeof(int)));
  // The kernel only accumulates; the counter must start at zero.
  CUDA_CHECK(cudaMemset(count, 0, sizeof(int)));

  kernel_compare<<<blocks, threadsPerBlock>>>(a, b, count, n);
  // A launch does not return an error itself; bad configurations show up here.
  CUDA_CHECK(cudaGetLastError());

  int result = 0;
  // Blocking copy: waits for the kernel and reports any execution error.
  CUDA_CHECK(cudaMemcpy(&result, count, sizeof(int), cudaMemcpyDeviceToHost));

  std::cout << result << " of " << n << " are different\n";

  CUDA_CHECK(cudaFree(count));
  CUDA_CHECK(cudaFree(b));
  CUDA_CHECK(cudaFree(a));

  return 0;
}