When this CUDA Kernel is executed, it appears to crash at a specific point in the code...

This has been a rather irritating issue. When the execution of this kernel reaches a certain point in the code, the kernel crashes and resets my display driver. I’m not sure why this is happening. I’ve investigated possible memory corruption, but I didn’t come across anything that looked suspicious.

In any case, here is the code. It should compile and run. Note that I am compiling it as a 64-bit build for compute capability 2.0 or higher. (This is a shortened version of my program that replicates the exact issue.)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <math.h>
#include <stdlib.h>

/*
 * Demonstration kernel: every working thread fills a private buffer with a
 * prefix of `offset` characters and then walks through all base^diff suffixes
 * of `diff` characters, decoding a counter digit by digit (modulo / divide by
 * `base`) into characters of a fixed lowercase alphabet.
 *
 * Launch layout: 1D grid of 1D blocks. Threads whose global index is not below
 * `reqThreads` return immediately.
 *
 * Parameters:
 *   offset     - number of prefix characters written before the enumeration;
 *                must be >= 0
 *   reqThreads - number of threads that should do work
 *   length     - total string length; the suffix length is length - offset
 *                (a suffix length of 0 is replaced by 2 for demonstration)
 *   base       - radix used to decode the counter; must be in 1..26 so every
 *                digit indexes a valid alphabet character
 *
 * Arguments that would index outside the 12-character buffer or the alphabet
 * make the thread return without doing any work.
 *
 * The generated strings live only in the thread-private buffer, so the kernel
 * produces no observable output.
 *
 * NOTE(review): each working thread performs diff * base^diff 64-bit
 * divisions and modulos. A long-running kernel on a GPU that also drives a
 * display can be terminated by the OS watchdog, which would match the
 * reported driver reset -- confirm by timing the kernel.
 */
__global__ void example(int offset, int reqThreads, int length, int base){

    const int kMaxChars = 12;    // Longest string the buffer can hold (plus terminator)
    const int kCharsetLen = 26;  // Number of characters in charSet

    char comb[kMaxChars + 1] = {0}; // Zero-initialized so the buffer is always terminated
    const char charSet[kCharsetLen + 1] = "abcdefghijklmnopqrstuvwxyz";

    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Thread check - more threads are spawned than will actually do work.
    // A negative reqThreads must not be converted to a huge unsigned value.
    if (reqThreads <= 0 || idx >= (unsigned int)reqThreads) return;

    // Reject arguments that would read outside charSet.
    if (offset < 0 || base < 1 || base > kCharsetLen) return;

    // Length of the part that is enumerated.
    long long diff = (long long)length - offset;
    if (diff == 0) diff = 2; // Again, just for demonstration

    // Reject arguments that would write outside comb.
    if (diff < 0 || offset + diff > kMaxChars) return;

    // Fixed prefix.
    for (int i = 0; i < offset; i++) {
        comb[i] = charSet[i % kCharsetLen];
    }

    // Number of distinct combinations: base^diff (at most 26^12, fits in 64 bits).
    long long nComb = 1;
    for (long long j = 0; j < diff; j++) nComb *= base;

    // Generate every combination by decoding the counter t in radix `base`.
    for (long long t = 0; t < nComb; t++) {
        long long quotient = t;

        for (int j = 0; j < diff; j++) {
            // The 64-bit modulo and division below are the statements the
            // original report flagged as the point where the failure shows up.
            const int digit = (int)(quotient % base);
            comb[offset + j] = charSet[digit]; // Copy the indicated character
            quotient /= base;                  // Move on to the next digit
        }
    }
}

/*
 * Host driver: for every string length from minLength to maxLength, launches
 * the `example` kernel with one working thread per base^length combination,
 * rounded up to whole blocks of maxThreads threads, and reports any CUDA
 * error from the launch or from the kernel's execution.
 *
 * Returns 0 when every launch succeeded and 1 after the first CUDA error.
 */
int main(void){
    const int minLength = 1;
    const int maxLength = 4;
    const int maxThreads = 512; // Threads per block
    const char charSet[27] = "abcdefghijklmnopqrstuvwxyz";
    const int base = (int)(sizeof(charSet) - 1); // Alphabet size without the terminator
    int status = 0;

    for (int i = minLength; i <= maxLength; i++) {
        const int length = i;
        int offset = i;

        if (offset > 2) offset = 2; // My main program does this

        // base^length computed in integers; pow() on doubles followed by a
        // cast can truncate to one less than the exact value.
        int reqThreads = 1;
        for (int k = 0; k < length; k++) reqThreads *= base;

        // Round up to a whole number of blocks.
        const int blocks = (reqThreads + maxThreads - 1) / maxThreads;
        const int totalThreads = blocks * maxThreads;

        system("pause");
        std::cout << "Total Threads: " << totalThreads << " Blocks: " << blocks << "\n\n";

        // Call the kernel
        std::cout << "\n\n" << "Length: " << i;
        example<<<blocks, maxThreads>>>(offset, reqThreads, length, base);

        // Launch-configuration errors are reported by cudaGetLastError();
        // faults during execution are reported by the synchronizing call.
        cudaError_t error = cudaGetLastError();
        if (error == cudaSuccess) error = cudaDeviceSynchronize();

        if (error != cudaSuccess) {
            std::cout << cudaGetErrorString(error);
            std::cout << "\n\n";
            status = 1;
            break; // Later launches would only repeat the same failure
        }

        std::cout << "\n\n";
    }

    system("pause");
    return status;
}

The program generates distinct combinations via a non-recursive method. In fact, I wrote an equivalent C program prior to writing a version in CUDA:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>

using namespace std; //Yes it's not C, but it's so much easier to print a string to stdout
typedef unsigned long long ull;

/*
 * Reference (CPU) version of the combination generator.
 *
 * Keeps the first `init` characters of `final` fixed and prints every
 * combination of `max` further characters drawn from `charSet`, one per line,
 * followed by the number of combinations printed. Each combination is obtained
 * by decoding the loop counter in radix `base`.
 *
 * Returns 0 on success, or 1 if `init + max` characters plus the terminator do
 * not fit in `final`.
 */
int main(void){

    //Declarations
    const int init = 2;                  // Number of fixed leading characters
    const ull max = 2;                   // Number of generated characters; modify as need be
    char final[12] = {'a', 'a'};         // Remaining bytes are zero, so the string stays terminated
    const char charSet[27] = "abcdefghijklmnopqrstuvwxyz";
    const int base = (int)strlen(charSet);
    ull nComb = 1;                       // Number of combinations
    ull count = 0;                       // Combinations printed so far

    // The generated characters and the terminator must fit in the buffer.
    if (init + max > sizeof(final) - 1) {
        cerr << "Combination length does not fit in the output buffer\n";
        return 1;
    }

    //Compute the number of distinct combinations ((size of charset)^length)
    for (ull j = 0; j < max; j++) nComb *= base;

    //Begin computing combinations
    for (ull i = 0; i < nComb; i++) {
        ull quotient = i;

        for (ull j = 0; j < max; j++) { //No need to check whether the quotient is zero
            const int placeHolder = (int)(quotient % base); //Maps to character in charset
            final[init + j] = charSet[placeHolder];         //Copy the indicated character
            quotient /= base; //Divide the number by its base to calculate the next character
        }

        count++;
        //Print combinations
        cout << final << "\n";
    }
    cout << "\n\n" << count << " combinations calculated";
    getchar();
    return 0;
}

The C code works without issue. I was hoping cudaGetLastError() would be more insightful. Unfortunately, all I get is “Unknown Error” or “Kernel Launch Failure.” My display driver says it resets after my displays go black. So, it’s obvious something is definitely going wrong.

Any constructive input is appreciated.