cuda-memcheck reports a different error on every run

I am getting CUDA memcheck memory errors. What is strange is that the program executes without problems in VS (although it does not if I scale it up). But if I run the CUDA debugger with memcheck, without changing the code, it sometimes runs without problems (usually the first few runs - am I somehow not clearing some memory on the GPU?); other times I get either a misaligned-memory or an invalid-address error, on a different line every time. Sometimes my PC even freezes completely during debugging and a hard reset is required. It seems random to me: every run is different.

Here is the most minimal verifiable example I am able to provide (I am an amateur, free-time programmer):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "time.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>

#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std;

// Sequentially scans `lines` rows and counts how many "trades" end on each of
// two exit conditions.
//
// For each row n, an entry signal fires when all three indicator columns
// exceed their thresholds (state1..state3). If no earlier trade is still open
// (m <= n), the rows after n are walked until either
//   hi[m]    - close[n] >= sla   (counted in `sl`), or
//   close[n] - low[m]   >= pta   (counted in `pt`),
// or the data runs out.
//
// Parameters:
//   column1..column3  device arrays of `lines` floats, the indicator columns
//   hi, low, close    device arrays of `lines` floats
//   lines             number of valid elements in every array
//
// All six pointers must be device pointers covering at least `lines` floats;
// every index used below stays inside [0, lines).
//
// NOTE(review): the kernel never reads threadIdx/blockIdx, so every launched
// thread performs the same full scan. The counters `sl` and `pt` are never
// stored anywhere, so the kernel has no observable result; an output parameter
// is needed to get the counts back to the host.
__global__ void addKernel(const float* column1, const float* column2, const float* column3,
                          const float* hi, const float* low, const float* close, int lines)
{
    // Exit counters (currently not written back, see the note above).
    int sl = 0;
    int pt = 0;

    // Price distances that close an open trade.
    const float sla = 11.0f;
    const float pta = 11.0f;

    // Entry thresholds: a positive value requires column > threshold, a
    // negative value requires column < threshold, zero never triggers.
    const float state1 = 10.0f;
    const float state2 = 10.0f;
    const float state3 = 10.0f;

    // One past the last row examined by the most recent forward walk. It is
    // kept across iterations of n so that rows covered by an open trade do
    // not start a second one.
    int m = 0;

    for (int n = 0; n < lines; n++) {
        const bool condition1 = (state1 > 0.0f && column1[n] > state1) ||
                                (state1 < 0.0f && column1[n] < state1);
        const bool condition2 = (state2 > 0.0f && column2[n] > state2) ||
                                (state2 < 0.0f && column2[n] < state2);
        const bool condition3 = (state3 > 0.0f && column3[n] > state3) ||
                                (state3 < 0.0f && column3[n] < state3);

        if (m <= n) {
            // The loop initialiser runs even when no signal fired, which
            // moves m to n + 1 exactly as the original code did.
            bool tradeOpen = condition1 && condition2 && condition3;

            for (m = n + 1; m < lines && tradeOpen; m++) {
                if (hi[m] - close[n] >= sla) {
                    tradeOpen = false;
                    sl++;
                }
                else if (close[n] - low[m] >= pta) {
                    tradeOpen = false;
                    pt++;
                }
            }
        }
    }

    // Silence "set but never used" diagnostics until the counters are exported.
    (void)sl;
    (void)pt;
}

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates a device buffer of `count` floats, copies `host` into it and
// returns the device pointer. The caller releases it with cudaFree.
static float* uploadToDevice(const float* host, int count)
{
    float* dev = 0;
    const size_t bytes = (size_t)count * sizeof(float);

    checkCuda(cudaMalloc((void**)&dev, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(dev, host, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    return dev;
}

// Fills six host arrays with pseudo-random values in [0, 999], uploads them
// to the GPU, runs addKernel over them and releases all resources.
// Returns 0 on success; any CUDA failure terminates with EXIT_FAILURE.
int main()
{
    const int lines = 25200;

    float* column1 = new float[lines];
    float* column2 = new float[lines];
    float* column3 = new float[lines];

    // The three columns are filled interleaved so the rand() sequence matches
    // the original program.
    for (int i = 0; i < lines; i++) {
        column1[i] = rand() % 1000;
        column2[i] = rand() % 1000;
        column3[i] = rand() % 1000;
    }

    float* h_ind1 = new float[lines];
    float* h_ind2 = new float[lines];
    float* h_ind3 = new float[lines];

    for (int i = 0; i < lines; i++) {
        h_ind1[i] = rand() % 1000;
        h_ind2[i] = rand() % 1000;
        h_ind3[i] = rand() % 1000;
    }

    checkCuda(cudaSetDevice(0), "cudaSetDevice");

    float* dev_ind1 = uploadToDevice(h_ind1, lines);
    float* dev_ind2 = uploadToDevice(h_ind2, lines);
    float* dev_ind3 = uploadToDevice(h_ind3, lines);

    // column1/2/3 are passed to the kernel as hi/low/close.
    float* dev_hi = uploadToDevice(column1, lines);
    float* dev_low = uploadToDevice(column2, lines);
    float* dev_close = uploadToDevice(column3, lines);

    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (before launch)");

    // Launch configuration kept from the original: 64 blocks of 10 threads.
    addKernel<<<64, 10>>>(dev_ind1, dev_ind2, dev_ind3, dev_hi, dev_low, dev_close, lines);

    // A launch returns no status: configuration errors are reported by
    // cudaGetLastError, faults during execution by the following synchronize.
    checkCuda(cudaGetLastError(), "addKernel launch");
    checkCuda(cudaDeviceSynchronize(), "addKernel execution");

    checkCuda(cudaFree(dev_ind1), "cudaFree");
    checkCuda(cudaFree(dev_ind2), "cudaFree");
    checkCuda(cudaFree(dev_ind3), "cudaFree");
    checkCuda(cudaFree(dev_hi), "cudaFree");
    checkCuda(cudaFree(dev_low), "cudaFree");
    checkCuda(cudaFree(dev_close), "cudaFree");

    // The host buffers were previously leaked.
    delete[] column1;
    delete[] column2;
    delete[] column3;
    delete[] h_ind1;
    delete[] h_ind2;
    delete[] h_ind3;

    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    cout << "\n";
    cout << "DONE";
    cout << "\n";

    return 0;
}

Basically, the program should check every "line" in columns 1, 2 and 3. If they meet the condition, it should run some code from that line onwards (but not start that code if one such run is already in progress).

This is just a minimal verifiable example; in the original code the parallelism makes sense - for example, the variable state is changed by thread.

Thank you for any help; I am completely lost as to where the problem might be.