The variable 'cudaStatus' is being used without being initialized

tiputa · October 27, 2022, 9:27am

What I am doing wrong in the code below, adapted from Linux? I get: Run-Time Check Failure #3 - The variable ‘cudaStatus’ is being used without being initialized at the begging of function code cudaError_t launchGPUHandlerThread(void) in the row if (cudaStatus != cudaSuccess) { after cudaSetDevice(0);


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <winsock2.h> // struct timeval
#include <windows.h> // timeGetTime()

#pragma comment(lib, "winmm.lib") // timeGetTime()

#include <stdlib.h>
#include <locale.h>
#include "sha256.cuh"

#define TEXT_TXT "Caster"
#define TEXT_LEN 6
#define THREADS 1500
#define BLOCKS 256
#define GPUS 1
#define DIFFICULTY 4
#define RANDOM_LEN 20

typedef unsigned long DWORD;

int gettimeofday(struct timeval* tp, void* tzp);
long long timems(void);

cudaError_t launchGPUHandlerThread(void);

__constant__ BYTE characterSet[63] = { "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890" };

__global__ void initSolutionMemory(int* blockContainsSolution) {
    *blockContainsSolution = -1;
}

__device__ uint64_t deviceRandomGen(uint64_t x) {
    x ^= (x << 21);
    x ^= (x >> 35);
    x ^= (x << 4);
    return x;
}

__global__ void sha256_cuda(BYTE* prefix, BYTE* solution, int* blockContainsSolution, uint64_t baseSeed) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SHA256_CTX ctx;
    BYTE digest[32];
    BYTE random[RANDOM_LEN];
    uint64_t seed = baseSeed;
    seed += (uint64_t)i;
    for (int j = 0; j < RANDOM_LEN; j++) {
        seed = deviceRandomGen(seed);
        int randomIdx = (int)(seed % 62);
        random[j] = characterSet[randomIdx];
    }
    sha256_init(&ctx);
    sha256_update(&ctx, prefix, TEXT_LEN);
    sha256_update(&ctx, random, RANDOM_LEN);
    sha256_final(&ctx, digest);
    for (int j = 0; j < DIFFICULTY; j++)
        if (digest[j] > 0)
            return;
    if ((digest[DIFFICULTY] & 0xF0) > 0)
        return;
    if (*blockContainsSolution == 1)
        return;
    *blockContainsSolution = 1;
    for (int j = 0; j < RANDOM_LEN; j++)
        solution[j] = random[j];
}

void hostRandomGen(uint64_t* x) {
    *x ^= (*x << 21);
    *x ^= (*x >> 35);
    *x ^= (*x << 4);
}

void pre_sha256() {
    cudaMemcpyToSymbol(dev_k, host_k, sizeof(host_k), 0, cudaMemcpyHostToDevice);
}

int gettimeofday(struct timeval* tp, void* tzp) {
    DWORD t;

    t = timeGetTime();
    tp->tv_sec = t / 1000;
    tp->tv_usec = t % 1000;

    return 0;
}

long long timems(void) {
    struct timeval end;
    gettimeofday(&end, NULL);
    return end.tv_sec * 1000LL + end.tv_usec / 1000;
}

uint64_t hashesProcessed;

//pthread_mutex_t solutionLock;
BYTE* solution;

cudaError_t launchGPUHandlerThread(void) {

    cudaError_t cudaStatus;
    cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    pre_sha256();

    BYTE cpuPrefix[] = { TEXT_TXT };
    BYTE* d_prefix;
    // Allocate GPU buffer A
    cudaStatus = cudaMalloc(&d_prefix, TEXT_LEN);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy data from host memory to GPU buffer A
    cudaStatus = cudaMemcpy(d_prefix, cpuPrefix, TEXT_LEN, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    BYTE* blockSolution = (BYTE*)malloc(sizeof(BYTE) * RANDOM_LEN);
    BYTE* d_solution;
    // Allocate GPU buffer B
    cudaStatus = cudaMalloc(&d_solution, sizeof(BYTE) * RANDOM_LEN);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    int* blockContainsSolution = (int*)malloc(sizeof(int));
    int* d_blockContainsSolution;
        // Allocate GPU buffer C
    cudaStatus = cudaMalloc(&d_blockContainsSolution, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    uint64_t rngSeed = timems();

    // Launch a kernel on the GPU with one thread and one element.
    initSolutionMemory << <1, 1 >> > (d_blockContainsSolution);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "initSolutionMemory launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }


    while (1) {
        hostRandomGen(&rngSeed);

        /*hi->hashesProcessed += THREADS * BLOCKS;*/
        hashesProcessed += THREADS * BLOCKS;
        sha256_cuda << <THREADS, BLOCKS >> > (d_prefix, d_solution, d_blockContainsSolution, rngSeed);

        // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "sha256_cuda launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }

        // Copy output vector from GPU buffer to host memory.
        cudaStatus = cudaMemcpy(blockContainsSolution, d_blockContainsSolution, sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        if (*blockContainsSolution == 1) {
            cudaStatus = cudaMemcpy(blockSolution, d_solution, sizeof(BYTE) * RANDOM_LEN, cudaMemcpyDeviceToHost);
            if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaMemcpy failed!");
                goto Error;
            }
            solution = blockSolution;
            break;
        }

        if (solution) {
            break;
        }
    }

Error:
    cudaDeviceReset();
    /*return NULL;*/
    return cudaStatus;
}

int main()
{
    setlocale(LC_NUMERIC, "");

    //uint64_t** processedPtrs = (uint64_t**)malloc(sizeof(uint64_t*) * GPUS);
    uint64_t processedPtrs;
    long long start = timems();

    hashesProcessed = 0;
    processedPtrs = hashesProcessed;
    cudaError_t cudaStatus = launchGPUHandlerThread();


    while (1) {
        uint64_t totalProcessed = 0;
        totalProcessed += processedPtrs;
        long long elapsed = timems() - start;
        printf("Hashes %llu Seconds %f Hashes/sec %llu\r", totalProcessed, ((float)elapsed) / 1000.0, (uint64_t)((double)totalProcessed / (double)elapsed) * 1000);
        if (solution) {
            break;
        }
    }
    printf("\n");

    long long end = timems();
    long long elapsed = end - start;


    uint64_t totalProcessed = 0;
    totalProcessed += processedPtrs;

    printf("Solution: %.20s\n", solution);
    printf("Hashes processed: %llu\n", totalProcessed);
    printf("Time: %llu\n", elapsed);
    printf("Hashes/sec: %llu\n", (uint64_t)((double)totalProcessed / (double)elapsed) * 1000);

    //// cudaDeviceReset must be called before exiting in order for profiling and
    //// tracing tools such as Nsight and Visual Profiler to show complete traces.
    //cudaStatus = cudaDeviceReset();
    //if (cudaStatus != cudaSuccess) {
    //    fprintf(stderr, "cudaDeviceReset failed!");
    //    return 1;
    //}

    return 0;
}

njuffa · October 27, 2022, 10:32am

Well, the variable is uninitialized. I think you meant to write

cudaStatus = cudaSetDevice(0);

tiputa · October 27, 2022, 2:36pm

Thank you, you are right I forgot to inicializace the cudaStatus variable.

Now I have another problem with:

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "initSolutionMemory launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

I got the error:

initSolutionMemory launch failed: no kernel image is available for execution on the device

tiputa · October 27, 2022, 3:29pm

I had to change the Project Properties, CUDA C/C++, Device, Code Generation: compute_50,sm_50 from 52. The error 29 is gone.