cudaMalloc hang when building x64 version binary

Hi there,

I’m new to cuda computing. I’m using cuda SDK version 8 on windows10 with K20c. After I installed SDK on OS, I try to run a test code which just do add of two array in visual studio. However, I found if I build with x64, the code will hang on cudamalloc, nothing error but just hang there, but if I build it with x86 configuration, the code works and finish successfully.

I will appreciate that for any help.

Thanks,
Yuting

Your version of Visual Studio may be set up incorrectly. Try building from command line (an example program, zcopy.cu, is shown below that you can try).

C:\Users\Norbert\My Programs>nvcc -arch=sm_35 -o zcopy.exe zcopy.cu
nvcc warning : nvcc support for Microsoft Visual Studio 2010 and earlier has been deprecated and is no longer being maintaine
zcopy.cu
support for Microsoft Visual Studio 2010 has been deprecated!
   Creating library zcopy.lib and object zcopy.exp

C:\Users\Norbert\My Programs>dumpbin /HEADERS zcopy.exe | grep machine
            8664 machine (x64)

C:\Users\Norbert\My Programs>zcopy
zcopy: operating on vectors of 10000000 double2s (= 1.600e+008 bytes)
zcopy: using 128 threads per block, 65520 blocks
zcopy: mintime = 6.220 msec  throughput = 51.44 GB/sec

Note that dumpbin headers reveal the executable is for an x64 target.

#include <stdlib.h>
#include <stdio.h>

#define ZCOPY_THREADS  128
#define ZCOPY_DEFLEN   10000000
#define ZCOPY_ITER     10           // as in STREAM benchmark

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

__global__ void zcopy (const double2 * __restrict__ src, 
                       double2 * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}    

struct zcopyOpts {
    int len;
};

static int processArgs (int argc, char *argv[], struct zcopyOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n':
                opts->len = atol(argv[0]+2);
                break;
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}

int main (int argc, char *argv[])
{
    double start, stop, elapsed, mintime;
    double2 *d_a, *d_b;
    int errors;
    struct zcopyOpts opts;

    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : ZCOPY_DEFLEN;

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * opts.len));
    
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * opts.len)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * opts.len)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(ZCOPY_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    
    printf ("zcopy: operating on vectors of %d double2s (= %.3e bytes)\n", 
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("zcopy: using %d threads per block, %d blocks\n", 
            dimBlock.x, dimGrid.x);

    mintime = fabs(log(0.0));
    for (int k = 0; k < ZCOPY_ITER; k++) {
        start = second();
        zcopy<<<dimGrid,dimBlock>>>(d_a, d_b, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("zcopy: mintime = %.3f msec  throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (2.0e-9 * sizeof(d_a[0]) * opts.len) / mintime);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    return EXIT_SUCCESS;
}

Hi njuffa,

Thanks so much for your reply. It’s really a good idea to build with command line. I did a test as you said, but program still hang there even the dumpbin output is x64:

C:\>dumpbin /HEADERS zcopy.exe
Microsoft (R) COFF/PE Dumper Version 14.00.24215.1
Copyright (C) Microsoft Corporation.  All rights reserved.


Dump of file zcopy.exe

PE signature found

File Type: EXECUTABLE IMAGE

FILE HEADER VALUES
            8664 machine (x64)
               8 number of sections
        58E9C157 time date stamp Sun Apr  9 13:06:31 2017
               0 file pointer to symbol table
               0 number of symbols
              F0 size of optional header
              22 characteristics
                   Executable
                   Application can handle large (>2GB) addresses

OPTIONAL HEADER VALUES
             20B magic # (PE32+)
           14.00 linker version
           21800 size of code
           1D200 size of initialized data
               0 size of uninitialized data
            D614 entry point (000000014000D614)
            1000 base of code
       140000000 image base (0000000140000000 to 0000000140042FFF)
            1000 section alignment
             200 file alignment
            6.00 operating system version
            0.00 image version
            6.00 subsystem version
               0 Win32 version
           43000 size of image
             400 size of headers
               0 checksum
               3 subsystem (Windows CUI)
            8160 DLL characteristics
                   High Entropy Virtual Addresses
                   Dynamic base
                   NX compatible
                   Terminal Server Aware
          100000 size of stack reserve
            1000 size of stack commit
          100000 size of heap reserve
            1000 size of heap commit
               0 loader flags
              10 number of directories
           38D90 [      54] RVA  of Export Directory
           38DE4 [      28] RVA  of Import Directory
               0 [       0] RVA  of Resource Directory
           3D000 [    1C14] RVA  of Exception Directory
               0 [       0] RVA  of Certificates Directory
           42000 [     9F8] RVA  of Base Relocation Directory
           37220 [      1C] RVA  of Debug Directory
               0 [       0] RVA  of Architecture Directory
               0 [       0] RVA  of Global Pointer Directory
               0 [       0] RVA  of Thread Storage Directory
           37240 [      94] RVA  of Load Configuration Directory
               0 [       0] RVA  of Bound Import Directory
           23000 [     270] RVA  of Import Address Table Directory
               0 [       0] RVA  of Delay Import Directory
               0 [       0] RVA  of COM Descriptor Directory
               0 [       0] RVA  of Reserved Directory


SECTION HEADER #1
   .text name
   216DD virtual size
    1000 virtual address (0000000140001000 to 00000001400226DC)
   21800 size of raw data
     400 file pointer to raw data (00000400 to 00021BFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
60000020 flags
         Code
         Execute Read

SECTION HEADER #2
  .rdata name
   16636 virtual size
   23000 virtual address (0000000140023000 to 0000000140039635)
   16800 size of raw data
   21C00 file pointer to raw data (00021C00 to 000383FF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
40000040 flags
         Initialized Data
         Read Only

  Debug Directories

        Time Type        Size      RVA  Pointer
    -------- ------- -------- -------- --------
    58E9C157 coffgrp      35C 000372D8    35ED8

SECTION HEADER #3
   .data name
    2E70 virtual size
   3A000 virtual address (000000014003A000 to 000000014003CE6F)
     A00 size of raw data
   38400 file pointer to raw data (00038400 to 00038DFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
C0000040 flags
         Initialized Data
         Read Write

SECTION HEADER #4
  .pdata name
    1C14 virtual size
   3D000 virtual address (000000014003D000 to 000000014003EC13)
    1E00 size of raw data
   38E00 file pointer to raw data (00038E00 to 0003ABFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
40000040 flags
         Initialized Data
         Read Only

SECTION HEADER #5
.nv_fatb name
     D68 virtual size
   3F000 virtual address (000000014003F000 to 000000014003FD67)
     E00 size of raw data
   3AC00 file pointer to raw data (0003AC00 to 0003B9FF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
C0000040 flags
         Initialized Data
         Read Write

SECTION HEADER #6
.nvFatBi name
      30 virtual size
   40000 virtual address (0000000140040000 to 000000014004002F)
     200 size of raw data
   3BA00 file pointer to raw data (0003BA00 to 0003BBFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
C0000040 flags
         Initialized Data
         Read Write

SECTION HEADER #7
  .gfids name
      A0 virtual size
   41000 virtual address (0000000140041000 to 000000014004109F)
     200 size of raw data
   3BC00 file pointer to raw data (0003BC00 to 0003BDFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
40000040 flags
         Initialized Data
         Read Only

SECTION HEADER #8
  .reloc name
     9F8 virtual size
   42000 virtual address (0000000140042000 to 00000001400429F7)
     A00 size of raw data
   3BE00 file pointer to raw data (0003BE00 to 0003C7FF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
42000040 flags
         Initialized Data
         Discardable
         Read Only

  Summary

        3000 .data
        1000 .gfids
        1000 .nvFatBi
        1000 .nv_fatb
        2000 .pdata
       17000 .rdata
        1000 .reloc
       22000 .text

Then I try to execute the binary file zcopy.exe, and it hang without any error:

C:\>zcopy.exe

I’m really confused about why x64 version binary will hang, but build in VS with x86 works. Do we have some limitations for K20c?

BTW, I’m running a 64bit windows10 OS.

Thanks,
Yuting

One update:

I sent zcopy.exe to my friend which use another GPU, he can successfully execute this binary.

Just to confirm, when you run the zcopy executable, it does not print any error messages, it just hangs without producing any output at all? The program has proper error checking, so any CUDA errors should be caught.

Honestly, I have no idea what could be causing an error-less hang. I am running Windows 7 Professional, with CUDA 8.0 and driver version 376.84, which is a recent WHQL driver. Not sure whether the Windows 10 drivers are numbered the same, but I think they are.

Can you post the output of nvidia-smi -q? I wonder whether there is something wrong with the K20c. Did this GPU come with the machine, or did you install it yourself? Did you buy it new or used?

Hi njuffa,

Yes, the program has no any error message here, just hang and stopped(Just for x64 version). The driver version I used is 376.84 same as yours.

Thanks again, I will search and do more test… or maybe contact nvidia.

Yuting

It’s the weekend now, which means very light traffic on this site. If you wait a couple of days, someone else might come by who has an idea as to what may be going on here.

This is (I believe) a cross posting of this:

[url]windows 10 - cuda simple application working for 32 bit not for 64 bit - Stack Overflow

I also believe that this may be related to a previous posting of a similar issue with K40c:

[url]windows 10 - cuda simple application working for 32 bit not for 64 bit - Stack Overflow

As you can see from the 2nd posting, there is a NVIDIA bug opened to track this issue (it has not been resolved yet) and I have added a comment to that bug linking this as another possible related observation.

As a temporary workaround, you might try placing the K20c in WDDM mode.

How would a hang in cudaMalloc() on Kepler-class GPUs, reproducable with trivial apps, be missed during regression testing at NVIDIA? Very puzzling …

Well, of course ideally it should not have been, and this is a typical question that gets asked during internal post-mortem of such bugs. However, to pick just a few examples, it appears that:

  • it happens in TCC but not WDDM mode
  • it happens on Win10 but not Win 8 or 7.
  • it happens in 64-bit but not 32-bit mode

I’m not going to dump the whole bug report here, but there has been a bunch of testing to also understand how this got missed. A number of similar configs were tested, all of which passed.

Initial triage was slowed down by the fact that (for whatever reason) initial repro attempt in QA was successful but in dev it was not, so there may be additional configuration factors that have not been uncovered yet.

So there is clearly a QA hole here, but the exact description of the hole is not fully known yet. I believe that once the bug is root caused, we will attempt to address this concern as well - that is a typical process step.

Hi txbob,

Thanks for your reply, the existing stackoverflow link(windows 10 - cuda simple application working for 32 bit not for 64 bit - Stack Overflow) looks very similar as mine. And it mentioned there is a Nvidia bug(#1855074) for this issue. Do we have access to some pages which show the bug status?

BTW, I will try the work around and update here. Thanks.

Strange, because TCC mode should be more stable and have less issues in windows (like the issue with slow cudaMalloc in never drivers - https://devtalk.nvidia.com/default/topic/963440/cudamalloc-pitch-_significantly_-slower-on-windows-with-geforce-drivers-gt-350-12/ etc.).

With WDDM mode, we had a couple of times the cuda program hanging at the first cudaMalloc call, usually due to one of the followign reasons:

  • Driver too old
  • Multiple driverse installed. Check in the system settings menu in the list of installed programs, if multiple Geforce drivers with different version numbers are appearing.
  • A huge third-party library is linked (like CUDPP) and JIT compilation is invoked automatically by driver (e.g. occuring for Pascal GPUs if cuda toolkit 7.0 is used).

Re #11: There is often much sensitive customer information captured in bug reports, so bug reports are confidential and only visible to the bug filer and relevant NVIDIA engineers. So mentioning bug numbers in a public venue is primarily useful to NVIDIA engineers who can use these to cross reference other issues that have been reported.

Because there is no access to other customer’s bug reports, I always encourage CUDA users to file their own bug reports, that way they will be appraised of forward progress on their issue. And not every issue that seems similar, or even identical, to an issue mentioned by someone else has necessarily the same root cause. Worst case, you might file a redundant bug report, but even that carries some helpful information in that it is an indication to NVIDIA how frequently customers hit that issue. So on balance, it is always best to file your own bug report.

Hi HannestF99,

Thanks for your share about potential reasons, I will check and update here.

Thanks njuffa,

Since I’m new to CUDA family, could you please send me a link to Nvidia bug track site like bugzilla?

There’s nothing like bugzilla

You can file bugs at developer.nvidia.com

You’ll need to become a registered developer.

Hi txbob,

Looks like my k20c does not support WDDM mode, seems workaround disappeared and now I have to wait for the fix.

C:\Program Files\NVIDIA Corporation\NVSMI>nvidia-smi.exe -L
GPU 0: Tesla K20c (UUID: GPU-7ca176ee-e7ae-e99a-5ab4-f14d27d97c49)

C:\Program Files\NVIDIA Corporation\NVSMI>nvidia-smi.exe -g 0 -dm 0
Unable to set driver model for GPU 0000:06:00.0: Not Supported
Treating as warning and moving on.
All done.

Hi njuffa,

I installed this K20c myself, and someone use it before and give it to me. Show the nvida-smi -q info here:

C:\Program Files\NVIDIA Corporation\NVSMI>nvidia-smi -q

==============NVSMI LOG==============

Timestamp                           : Tue Apr 11 20:43:41 2017
Driver Version                      : 376.84

Attached GPUs                       : 1
GPU 0000:06:00.0
    Product Name                    : Tesla K20c
    Product Brand                   : Tesla
    Display Mode                    : Disabled
    Display Active                  : Disabled
    Persistence Mode                : N/A
    Accounting Mode                 : Disabled
    Accounting Mode Buffer Size     : 1920
    Driver Model
        Current                     : TCC
        Pending                     : TCC
    Serial Number                   : 0325112010618
    GPU UUID                        : GPU-7ca176ee-e7ae-e99a-5ab4-f14d27d97c49
    Minor Number                    : N/A
    VBIOS Version                   : 80.10.14.00.02
    MultiGPU Board                  : No
    Board ID                        : 0x600
    GPU Part Number                 : 900-22081-2220-000
    Inforom Version
        Image Version               : 2081.0204.00.07
        OEM Object                  : 1.1
        ECC Object                  : 3.0
        Power Management Object     : N/A
    GPU Operation Mode
        Current                     : N/A
        Pending                     : N/A
    GPU Virtualization Mode
        Virtualization mode         : None
    PCI
        Bus                         : 0x06
        Device                      : 0x00
        Domain                      : 0x0000
        Device Id                   : 0x102210DE
        Bus Id                      : 0000:06:00.0
        Sub System Id               : 0x098210DE
        GPU Link Info
            PCIe Generation
                Max                 : 2
                Current             : 1
            Link Width
                Max                 : 16x
                Current             : 4x
        Bridge Chip
            Type                    : N/A
            Firmware                : N/A
        Replays since reset         : 0
        Tx Throughput               : N/A
        Rx Throughput               : N/A
    Fan Speed                       : 30 %
    Performance State               : P8
    Clocks Throttle Reasons
        Idle                        : Active
        Applications Clocks Setting : Not Active
        SW Power Cap                : Not Active
        HW Slowdown                 : Not Active
        Sync Boost                  : Not Active
        Unknown                     : Not Active
    FB Memory Usage
        Total                       : 4736 MiB
        Used                        : 0 MiB
        Free                        : 4736 MiB
    BAR1 Memory Usage
        Total                       : 256 MiB
        Used                        : 2 MiB
        Free                        : 254 MiB
    Compute Mode                    : Default
    Utilization
        Gpu                         : 0 %
        Memory                      : 0 %
        Encoder                     : 0 %
        Decoder                     : 0 %
    Ecc Mode
        Current                     : Enabled
        Pending                     : Enabled
    ECC Errors
        Volatile
            Single Bit
                Device Memory       : 0
                Register File       : 0
                L1 Cache            : 0
                L2 Cache            : 0
                Texture Memory      : 0
                Texture Shared      : N/A
                Total               : 0
            Double Bit
                Device Memory       : 0
                Register File       : 0
                L1 Cache            : 0
                L2 Cache            : 0
                Texture Memory      : 0
                Texture Shared      : N/A
                Total               : 0
        Aggregate
            Single Bit
                Device Memory       : 0
                Register File       : 0
                L1 Cache            : 0
                L2 Cache            : 0
                Texture Memory      : 0
                Texture Shared      : N/A
                Total               : 0
            Double Bit
                Device Memory       : 0
                Register File       : 0
                L1 Cache            : 0
                L2 Cache            : 0
                Texture Memory      : 0
                Texture Shared      : N/A
                Total               : 0
    Retired Pages
        Single Bit ECC              : 0
        Double Bit ECC              : 0
        Pending                     : No
    Temperature
        GPU Current Temp            : 23 C
        GPU Shutdown Temp           : 95 C
        GPU Slowdown Temp           : 90 C
    Power Readings
        Power Management            : Supported
        Power Draw                  : 15.45 W
        Power Limit                 : 225.00 W
        Default Power Limit         : 225.00 W
        Enforced Power Limit        : 225.00 W
        Min Power Limit             : 150.00 W
        Max Power Limit             : 225.00 W
    Clocks
        Graphics                    : 324 MHz
        SM                          : 324 MHz
        Memory                      : 324 MHz
        Video                       : 405 MHz
    Applications Clocks
        Graphics                    : 705 MHz
        Memory                      : 2600 MHz
    Default Applications Clocks
        Graphics                    : 705 MHz
        Memory                      : 2600 MHz
    Max Clocks
        Graphics                    : 758 MHz
        SM                          : 758 MHz
        Memory                      : 2600 MHz
        Video                       : 540 MHz
    Clock Policy
        Auto Boost                  : N/A
        Auto Boost Default          : N/A
    Processes                       : None

I asked for nvidia-smi output before txbob revealed that there is a known issue with cudaMalloc() hanging on Windows 10 with Tesla Kepler parts, something I did not expect (nightly regression testing checks hundreds of CUDA apps, and with few exceptions they all use cudaMalloc(); how could a basic issue with that function be missed?). I suspected that maybe the hardware isn’t fully operational, which is why I asked for the log.

Nothing about the log above looks unusual to me.

Yes, I had forgotten that K20c does not support WDDM.