This has been a rather irritating issue. When the execution of this kernel reaches a certain point in the code, the kernel crashes and resets my display driver. I’m not sure why this is happening. I’ve investigated possible memory corruption, but I didn’t come across anything that looked suspicious.

In any case, here is the code. It should compile and run. Note that I am compiling a 64-bit build for compute capability > 2. (This is a shortened version of my program that replicates the exact issue.)

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <math.h>
#include <stdlib.h>
/*
 * example: every working thread enumerates all base^diff suffixes that can
 * follow a fixed prefix of `offset` characters, writing each candidate into a
 * per-thread scratch buffer.
 *
 * Launch layout: 1D grid of 1D blocks. Threads whose flat index is not below
 * reqThreads exit immediately, so the grid may be rounded up to a whole
 * number of blocks. No shared memory is used.
 *
 * Parameters:
 *   offset     - number of prefix characters already fixed (0..12)
 *   reqThreads - number of threads that should actually do work
 *   length     - total length of the generated string
 *   base       - size of the character set in use (1..26)
 *
 * Arguments that would index outside the scratch buffer or the character set
 * (or divide by zero) make the thread return without doing any work.
 *
 * The generated strings are never written to global memory; this kernel only
 * reproduces the computation for demonstration purposes.
 *
 * NOTE(review): each thread performs nComb * diff 64-bit modulo/divide pairs.
 * On a GPU that also drives a display, a kernel that runs too long may be
 * killed by the operating system's display watchdog, which would match the
 * reported driver reset - confirm by timing the kernel or shrinking the grid.
 */
__global__ void example(int offset, int reqThreads, int length, int base){
    //Max size of string is 12 characters + null terminator
    const int kMaxChars = 12;
    //Random character set for example
    const int kCharSetSize = 26;
    const char charSet[27] = "abcdefghijklmnopqrstuvwxyz";

    //Fixed size char array initialized to all 0's
    char comb[kMaxChars + 1] = {0};
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

    //Thread Check - more threads were spawned than will actually do work
    if(reqThreads <= 0 || idx >= (unsigned int)reqThreads) return;

    //Length of the part that still has to be generated
    int diff = length - offset;
    if(diff == 0) diff = 2; //Again, just for demonstration

    //Reject arguments that would run past comb[] or charSet[], or divide by zero
    if(offset < 0 || offset > kMaxChars) return;
    if(diff > kMaxChars - offset) return;
    if(base < 1 || base > kCharSetSize) return;

    //Fill in the fixed prefix
    for(int i = 0; i < offset; i++){
        comb[i] = charSet[i % kCharSetSize];
    }

    //Compute the number of distinct combinations ((size of charset)^diff).
    //With base <= 26 and diff <= 12 this stays below 2^63.
    long long nComb = 1;
    for(int j = 0; j < diff; j++) nComb *= base;

    //Begin generating combinations: t is decomposed digit by digit in `base`
    for(long long t = 0; t < nComb; t++){
        long long quotient = t;
        for(int j = 0; j < diff; j++){
            //The original post reported the failure at this modulo and at the division below
            const long long placeHolder = quotient % base; //Maps to character in charset
            comb[offset + j] = charSet[placeHolder];       //Copy the indicated character
            quotient /= base;                              //Move on to the next digit
        }
    }
}
/*
 * Host driver: for every string length from minLength to maxLength, launches
 * the `example` kernel with one working thread per base^length combination
 * and reports any CUDA error for that launch.
 *
 * The grid is rounded up to a whole number of maxThreads-sized blocks; the
 * kernel receives reqThreads so the surplus threads can exit early.
 * Returns 0 regardless of launch errors, which are only printed.
 */
int main(void){
    //Declarations
    const int minLength = 1;
    const int maxLength = 4;
    const int maxThreads = 512; //Threads per block
    const char charSet[27] = "abcdefghijklmnopqrstuvwxyz";
    const int base = (int)(sizeof(charSet) - 1); //Character count without the terminator

    for(int i = minLength; i <= maxLength; i++){
        const int length = i;
        int offset = i;
        if(offset > 2) offset = 2; //My main program does this

        //Calculate parameters: base^length in integer arithmetic, so the
        //result cannot be truncated by a floating-point rounding error
        int reqThreads = 1;
        for(int k = 0; k < length; k++) reqThreads *= base;

        //Round up to a whole number of blocks (a multiple of maxThreads)
        const int blocks = (reqThreads + maxThreads - 1) / maxThreads;
        const int totalThreads = blocks * maxThreads;

        system("pause");
        std::cout << "Total Threads: " << totalThreads << " Blocks: " << blocks << "\n\n";

        //Call the kernel; flush first so the output is visible even if the launch hangs
        std::cout << "\n\n" << "Length: " << i << std::flush;
        example<<<blocks, maxThreads>>>(offset, reqThreads, length, base);

        //Launch-configuration errors show up immediately, execution errors at the sync
        cudaError_t error = cudaGetLastError();
        if(error == cudaSuccess) error = cudaDeviceSynchronize();
        if(error != cudaSuccess) std::cout << cudaGetErrorString(error);
        std::cout << "\n\n";
    }
    system("pause");
    return 0;
}
```

The program generates distinct combinations via a non-recursive method. In fact, I wrote an equivalent C program prior to writing a version in CUDA:

```
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
using namespace std; //Yes it's not C, but it's so much easier to print a string to stdout
typedef unsigned long long ull;

/*
 * CPU reference: prints every string made of the fixed prefix "aa" followed
 * by `max` characters drawn from charSet, one per line, then the number of
 * strings printed. Waits for a key press before exiting.
 */
int main(void){
    //Declarations
    const int init = 2;   //Length of the fixed prefix
    const ull max = 2;    //Number of generated characters - modify as need be
    const char charSet[27] = "abcdefghijklmnopqrstuvwxyz";
    const int base = strlen(charSet);

    //Prefix + generated part + null terminator; sized from init and max so
    //that changing max cannot overflow the buffer. Unset elements are zero.
    char final[init + max + 1] = {'a', 'a'};

    //Compute the number of distinct combinations ((size of charset)^max)
    ull nComb = 1;
    for(ull j = 0; j < max; j++) nComb *= base;

    //Begin computing combinations: i is decomposed digit by digit in `base`
    ull c = 0; //Number of combinations printed
    for(ull i = 0; i < nComb; i++){
        ull quotient = i;
        for(ull j = 0; j < max; j++){ //No need to check whether the quotient is zero
            const int placeHolder = quotient % base; //Maps to character in charset
            final[init + j] = charSet[placeHolder];  //Copy the indicated character
            quotient /= base;                        //Move on to the next digit
        }
        c++;
        //Print combinations
        cout << final << "\n";
    }
    cout << "\n\n" << c << " combinations calculated";
    getchar();
    return 0;
}
```

The C code works without issue. I was hoping cudaGetLastError() would be more insightful. Unfortunately, all I get is “Unknown Error” or “Kernel Launch Failure.” My display driver says it resets after my displays go black. So, it’s obvious something is definitely going wrong.

Any constructive input is appreciated.