I wrote this minimal version of the code, which reproduces the same error:
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <stdio.h>
#define NoOfthread 800
#define NoOfBlock 3
/**
 * Copies each thread's slice of the input state to the output buffers,
 * adding a thread-dependent offset to element 6 of the q slice.
 *
 * Layout: one thread owns 19 consecutive doubles of q_global/qf_global and
 * 18 consecutive doubles of u_global/uf_global, addressed by its flat
 * global thread index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Launch: 1D grid of 1D blocks. There is no bounds guard, so every buffer
 * must hold at least gridDim.x * blockDim.x slices.
 *
 * @param q_global  input,  19 doubles per launched thread
 * @param u_global  input,  18 doubles per launched thread
 * @param qf_global output, 19 doubles per launched thread
 * @param uf_global output, 18 doubles per launched thread
 *
 * __launch_bounds__(1024) tells the compiler the kernel may be launched with
 * up to 1024 threads per block, so it limits per-thread register usage
 * accordingly instead of producing a kernel that only launches with small
 * blocks.
 * NOTE(review): presumably register pressure is why large blocks failed for
 * the caller; confirm by checking cudaGetLastError() after the launch.
 */
__global__ void __launch_bounds__(1024)
simulate(double* q_global, double* u_global, double* qf_global, double* uf_global) {
    // Flat thread index computed in size_t so the multiplication by the slice
    // width cannot wrap in 32-bit arithmetic on large launches.
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t qBase = tid * 19;
    const size_t uBase = tid * 18;

    double q[19], u[18];

    // Load this thread's q slice into a per-thread working copy.
    for (int i = 0; i < 19; i++)
        q[i] = q_global[qBase + i];

    // Thread-dependent modification: threads 0, 1, 2, ... add 10, 15, 20, ...
    q[6] += threadIdx.x * 5.0 + 10;

    // Load this thread's u slice.
    for (int i = 0; i < 18; i++)
        u[i] = u_global[uBase + i];

    // Write both slices back out; all loads above precede all stores here,
    // as in the original statement order.
    for (int i = 0; i < 19; i++)
        qf_global[qBase + i] = q[i];
    for (int i = 0; i < 18; i++)
        uf_global[uBase + i] = u[i];
}
// Terminates the program with a readable message when a CUDA runtime call
// fails, so a failed allocation, copy, or kernel launch is reported instead
// of silently leaving stale data in the result buffers.
static void checkCuda(cudaError_t status, const char* what, const char* file, int line) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",
                file, line, what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
#define CUDA_CHECK(call) checkCuda((call), #call, __FILE__, __LINE__)

/**
 * Host driver: uploads zero-filled q/u state for NoOfBlock * NoOfthread
 * threads, runs the simulate kernel, downloads the results, and prints the
 * q slices of the first two threads.
 *
 * Every CUDA call is checked. In particular the launch is followed by
 * cudaGetLastError() (launch-configuration / resource errors) and
 * cudaDeviceSynchronize() (errors raised while the kernel runs), because a
 * kernel launch itself returns no status.
 */
int main() {
    const size_t qPerThread = 19;  // doubles of q state owned by each thread
    const size_t uPerThread = 18;  // doubles of u state owned by each thread
    const size_t totalThreads = (size_t)NoOfthread * NoOfBlock;
    const size_t qCount = qPerThread * totalThreads;
    const size_t uCount = uPerThread * totalThreads;
    const size_t qBytes = qCount * sizeof(double);
    const size_t uBytes = uCount * sizeof(double);

    // Host buffers: a/b are inputs, c/d receive the kernel output. All are
    // zero-initialised so nothing uninitialised can ever be printed.
    std::vector<double> a(qCount, 0.0);
    std::vector<double> b(uCount, 0.0);
    std::vector<double> c(qCount, 0.0);
    std::vector<double> d(uCount, 0.0);

    double *d_a = NULL, *d_b = NULL;
    double *d_c = NULL, *d_d = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, qBytes));
    CUDA_CHECK(cudaMalloc(&d_b, uBytes));
    CUDA_CHECK(cudaMalloc(&d_c, qBytes));
    CUDA_CHECK(cudaMalloc(&d_d, uBytes));

    CUDA_CHECK(cudaMemcpy(d_a, a.data(), qBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b.data(), uBytes, cudaMemcpyHostToDevice));

    simulate<<<NoOfBlock, NoOfthread>>>(d_a, d_b, d_c, d_d);
    CUDA_CHECK(cudaGetLastError());       // launch rejected (bad config, too many resources)?
    CUDA_CHECK(cudaDeviceSynchronize());  // fault while the kernel was executing?

    CUDA_CHECK(cudaMemcpy(c.data(), d_c, qBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(d.data(), d_d, uBytes, cudaMemcpyDeviceToHost));

    // q slice of global thread 0.
    for (int i = 0; i < 19; i++)
        printf("q0[%d]=%f ", i, c[i]);
    printf("\n");
    // q slice of global thread 1 (printed with its flat index 19..37).
    for (int i = 19; i < 38; i++)
        printf("q1[%d]=%f ", i, c[i]);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFree(d_d));
    return 0;
}
Basically, I send two kinds of data to global memory, modify them on the device, and copy them back to host memory. It works perfectly with fewer than 200 threads per block, but when I use more than that, I get corrupted data. I am using a single Titan X and Nsight Eclipse Edition.