Hi. im writing a code for accelerated pulse compression and im having trouble with just printig the results. everything else works just fine. the code used to work all fine before i turned off windows WDDM TDR. the problem was execution time limitation since this code can go on for hours because of the while(1) loop. when windows terminated the process the already printed result were correct so the code works just fine. i disabled TDR through regedit as instructed here : https://docs.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys

but now just nothing is printed. neither the process is terminated nor anything is printed.

i suspect something is wrong with the printf being inside a loop. i think it wont print until the buffer is full or process is being terminated. any idea why is this happening or how it can be avoided? heres the code:

```
__global__ void chk() {
int id = threadIdx.x + blockDim.x*blockIdx.x;
int NBITS = 96;
int RPSL = 9;
unsigned long long int CFF = 0xffffffffffffffff;
unsigned long long int C33 = 0x3333333333333333;
unsigned long long int C55 = 0x5555555555555555;
unsigned long long int C0F = 0x0f0f0f0f0f0f0f0f;
unsigned long long int C01 = 0x0101010101010101;
unsigned long long int ZERO64 = 0x0000000000000000;
unsigned long long int LSB64 = 0x0000000000000001;
unsigned long long int MSB64 = 0x8000000000000000;
unsigned long long int n0, n1;
unsigned long long int b0, b1;
unsigned long long int c0, c1;
unsigned long long int ones0;
unsigned long long int ones1;
unsigned long long int cnt;
int PSL = 0;
int i;
int SL;
unsigned long long int msb;
unsigned long long int partialset;
curandState s;
partialset = CFF >> (128 - NBITS);
curand_init(id , 0ULL, 0ULL, &s);
n0 = 0x1234567812345678ULL;
n1 = n0;
while (1) {
n0++;
n1++;
n0 += curand(&s);
n1 += curand(&s);
//printf("%x \n%llx BEFORE PARTIALSET thread id is: %d\n",n1 , n0 , id);
n1 &= partialset;
//printf("%x \n%llx thread id is: %d\n",n1 , n0 , id);
b0 = n0;
b1 = n1;
ones0 = CFF;
ones1 = partialset;
PSL = 0;
i = 1;
while ((PSL <= RPSL) && (i <= (NBITS - 64))) {
b0 >>= 1;
msb = (b1 & LSB64) ? MSB64 : ZERO64;
b0 |= msb;
b1 >>= 1;
ones1 >>= 1;
c0 = n0 ^ b0;
c1 = n1 ^ b1;
c1 &= ones1;
c0 -= (c0 >> 1) & C55; // put count of each 2 bits into those 2 bits
c0 = (c0 & C33) + ((c0 >> 2) & C33);// put count of each 4 bits into those 4 bits
c0 = (c0 + (c0 >> 4)) & C0F; // put count of each 8 bits into those 8 bits
cnt = (c0 * C01) >> 56; // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
c1 -= (c1 >> 1) & C55; // put count of each 2 bits into those 2 bits
c1 = (c1 & C33) + ((c1 >> 2) & C33);// put count of each 4 bits into those 4 bits
c1 = (c1 + (c1 >> 4)) & C0F; // put count of each 8 bits into those 8 bits
cnt += (c1 * C01) >> 56; // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
SL = NBITS - i - 2 * cnt;
SL = SL > 0 ? SL : -SL;
if (SL > PSL) {
PSL = SL;
}
i++;
}
while ((PSL <= RPSL) && (i < (NBITS - RPSL))) {
b0 >>= 1;
ones0 >>= 1;
c0 = n0 ^ b0;
c0 &= ones0;
c0 -= (c0 >> 1) & C55; // put count of each 2 bits into those 2 bits
c0 = (c0 & C33) + ((c0 >> 2) & C33);// put count of each 4 bits into those 4 bits
c0 = (c0 + (c0 >> 4)) & C0F; // put count of each 8 bits into those 8 bits
cnt = (c0 * C01) >> 56; // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
SL = NBITS - i - 2 * cnt;
SL = SL > 0 ? SL : -SL;
if (SL > PSL) {
PSL = SL;
}
i++;
}
if (PSL <= RPSL) {
printf("%x%llx , ", n1, n0);
// cout << hex << setw(16) << n1 << n0 << ", ";
printf("PSL = %d , thread id = %d \n", PSL, threadIdx.x);
//cout << "PSL = " << dec << PSL << ", thread id = " << id << endl;
cudaDeviceSynchronize();
}
}
}
void main(int argc, char **argv) {
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1024*1024*80);
chk<<<32, 64 >>>();
cudaDeviceSynchronize();
}
```