NVC++-F-0000-Internal compiler error. no type conversion available 0 (FDTD3dGPU.cpp: 127)

Hello, I am using nvc++ (HPC SDK 23.3.0) to compile C++ code, and I received this message:
NVC++-F-0000-Internal compiler error. no type conversion available 0 (FDTD3dGPU.cpp: 127)
NVC++/x86-64 Linux 23.3-0: compilation aborted
make: *** [FDTD3dGPU.o] Error 2

The code in question is:

mtml@gpnpusc3000003[pts/1]fdtd3d-omp $ cat -n   FDTD3dGPU.cpp
     1	/*
     2	 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
     3	 *
     4	 * Please refer to the NVIDIA end user license agreement (EULA) associated
     5	 * with this source code for terms and conditions that govern your use of
     6	 * this software. Any use, reproduction, disclosure, or distribution of
     7	 * this software and related documentation outside the terms of the EULA
     8	 * is strictly prohibited.
     9	 *
    10	 */
    11	
    12	#include <iostream>
    13	#include <algorithm>
    14	#include <cstring>
    15	#include <cmath>
    16	#include <chrono>
    17	#include <omp.h>
    18	#include "FDTD3dGPU.h"
    19	#include "shrUtils.h"
    20	
    21	bool fdtdGPU(float *output, float *input, const float *coeff, 
    22	             const int dimx, const int dimy, const int dimz, const int radius, 
    23	             const int timesteps, const int argc, const char **argv)
    24	{
    25	    bool ok = true;
    26	    const int         outerDimx  = dimx + 2 * radius;
    27	    const int         outerDimy  = dimy + 2 * radius;
    28	    const int         outerDimz  = dimz + 2 * radius;
    29	    const size_t      volumeSize = outerDimx * outerDimy * outerDimz;
    30	    size_t            teamSize[2];
    31	    size_t            threadSize[2];
    32	
    33	    // Ensure that the inner data starts on a 128B boundary
    34	    const int padding = (128 / sizeof(float)) - radius;
    35	    const size_t paddedVolumeSize = volumeSize + padding;
    36	
    37	    float* bufferOut = (float*) malloc (paddedVolumeSize * sizeof(float));
    38	    float* bufferIn = (float*) malloc (paddedVolumeSize * sizeof(float));
    39	
    40	    memcpy(bufferIn + padding, input, volumeSize * sizeof(float));
    41	    memcpy(bufferOut + padding, input, volumeSize * sizeof(float)); 
    42	
    43	    // Get the maximum work group size
    44	    size_t userWorkSize = 256;
    45	
    46	    // Set the work group size
    47	    threadSize[0] = k_localWorkX;
    48	    threadSize[1] = userWorkSize / k_localWorkX;
    49	    teamSize[0] = (unsigned int)ceil((float)dimx / threadSize[0]);
    50	    teamSize[1] = (unsigned int)ceil((float)dimy / threadSize[1]);
    51	
    52	    int teamX = teamSize[0];
    53	    int teamY = teamSize[1];
    54	    int numTeam = teamX * teamY;
    55	
    56	    shrLog(" set thread size to %dx%d\n", threadSize[0], threadSize[1]);
    57	    shrLog(" set team size to %dx%d\n", teamSize[0], teamSize[1]);
    58	
    59	    // Execute the FDTD
    60	    shrLog(" GPU FDTD loop\n");
    61	
    62	    #pragma omp target data map(to: bufferIn[0:paddedVolumeSize], \
    63	                                    bufferOut[0:paddedVolumeSize], \
    64	                                    coeff[0:radius+1]) 
    65	    {
    66	    auto start = std::chrono::steady_clock::now();
    67	
    68	    for (int it = 0 ; it < timesteps ; it++)
    69	    {
    70	      #pragma omp target teams num_teams(numTeam) thread_limit(userWorkSize)
    71	      {
    72	        float tile [localWorkMaxY + 2*k_radius_default][localWorkMaxX + 2*k_radius_default];
    73	        #pragma omp parallel 
    74		{
    75	          bool valid = true;
    76	          const int ltidx = omp_get_thread_num() % k_localWorkX;
    77	          const int ltidy = omp_get_thread_num() / k_localWorkX;
    78	          const int workx = k_localWorkX;
    79	          const int worky = userWorkSize / k_localWorkX;
    80	          const int gtidx = (omp_get_team_num() % teamX) * workx + ltidx;
    81	          const int gtidy = (omp_get_team_num() / teamX) * worky + ltidy;
    82	          
    83	          const int stride_y = dimx + 2 * k_radius_default;
    84	          const int stride_z = stride_y * (dimy + 2 * k_radius_default);
    85	
    86	          int inputIndex  = 0;
    87	          int outputIndex = 0;
    88	
    89	          // Advance inputIndex to start of inner volume
    90	          inputIndex += k_radius_default * stride_y + k_radius_default + padding;
    91	          
    92	          // Advance inputIndex to target element
    93	          inputIndex += gtidy * stride_y + gtidx;
    94	
    95	          float infront[k_radius_default];
    96	          float behind[k_radius_default];
    97	          float current;
    98	
    99	          const int tx = ltidx + k_radius_default;
   100	          const int ty = ltidy + k_radius_default;
   101	
   102	          if (gtidx >= dimx) valid = false;
   103	          if (gtidy >= dimy) valid = false;
   104	
   105	          // For simplicity we assume that the global size is equal to the actual
   106	          // problem size; since the global size must be a multiple of the local size
   107	          // this means the problem size must be a multiple of the local size (or
   108	          // padded to meet this constraint).
   109	          // Preload the "infront" and "behind" data
   110	          for (int i = k_radius_default - 2 ; i >= 0 ; i--)
   111	          {
   112	              behind[i] = bufferIn[inputIndex];
   113	              inputIndex += stride_z;
   114	          }
   115	
   116	          current = bufferIn[inputIndex];
   117	          outputIndex = inputIndex;
   118	          inputIndex += stride_z;
   119	
   120	          for (int i = 0 ; i < k_radius_default ; i++)
   121	          {
   122	              infront[i] = bufferIn[inputIndex];
   123	              inputIndex += stride_z;
   124	          }
   125	
   126	          // Step through the xy-planes
   127	          for (int iz = 0 ; iz < dimz ; iz++)
   128	          {
   129	              // Advance the slice (move the thread-front)
   130	              for (int i = k_radius_default - 1 ; i > 0 ; i--)
   131	                  behind[i] = behind[i - 1];
   132	              behind[0] = current;
   133	              current = infront[0];
   134	              for (int i = 0 ; i < k_radius_default - 1 ; i++)
   135	                  infront[i] = infront[i + 1];
   136	              infront[k_radius_default - 1] = bufferIn[inputIndex];
   137	
   138	              inputIndex  += stride_z;
   139	              outputIndex += stride_z;
   140	              #pragma omp barrier
   141	
   142	              // Note that for the work items on the boundary of the problem, the
   143	              // supplied index when reading the halo (below) may wrap to the
   144	              // previous/next row or even the previous/next xy-plane. This is
   145	              // acceptable since a) we disable the output write for these work
   146	              // items and b) there is at least one xy-plane before/after the
   147	              // current plane, so the access will be within bounds.
   148	
   149	              // Update the data slice in the local tile
   150	              // Halo above & below
   151	              if (ltidy < k_radius_default)
   152	              {
   153	                  tile[ltidy][tx]                  = bufferIn[outputIndex - k_radius_default * stride_y];
   154	                  tile[ltidy + worky + k_radius_default][tx] = bufferIn[outputIndex + worky * stride_y];
   155	              }
   156	              // Halo left & right
   157	              if (ltidx < k_radius_default)
   158	              {
   159	                  tile[ty][ltidx]                  = bufferIn[outputIndex - k_radius_default];
   160	                  tile[ty][ltidx + workx + k_radius_default] = bufferIn[outputIndex + workx];
   161	              }
   162	              tile[ty][tx] = current;
   163	              #pragma omp barrier
   164	
   165	              // Compute the output value
   166	              float value = coeff[0] * current;
   167	              for (int i = 1 ; i <= k_radius_default ; i++)
   168	              {
   169	                  value += coeff[i] * (infront[i-1] + behind[i-1] + tile[ty - i][tx] + 
   170	                           tile[ty + i][tx] + tile[ty][tx - i] + tile[ty][tx + i]);
   171	              }
   172	
   173	              // Store the output value
   174	              if (valid) bufferOut[outputIndex] = value;
   175	          }
   176	        }
   177	      }
   178	
   179	      // Toggle the buffers
   180	      float* tmp = bufferIn;
   181	      bufferIn = bufferOut;
   182	      bufferOut = tmp;
   183	    }
   184	
   185	    auto end = std::chrono::steady_clock::now();
   186	    auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
   187	    printf("Average kernel execution time %f (s)\n", (time * 1e-9f) / timesteps);
   188	
   189	    #pragma omp target update from (bufferIn[0:paddedVolumeSize])
   190	    }
   191	
   192	    memcpy(output, bufferIn+padding, volumeSize*sizeof(float));
   193	    free(bufferIn);
   194	    free(bufferOut);
   195	    return ok;
   196	}

Thanks for the report.

Can you please also provide the two header files, “FDTD3dGPU.h” and “shrUtils.h”, so I can try to recreate the error?

Also, please don’t include the line numbers in the output, and put the code into a preformatted text block (i.e. highlight the code and then select the “</>” symbol). This way I can copy and paste the code without having to edit out the line numbers and fix the angle-bracketed text (such as “<iostream>”), which is treated as HTML unless it is put in the text block.

Alternately, you can attach the files to the post.

Finally, can you please provide the compile flags you’re using?

-Mat

Sure, I missed the markup buttons… Sorry for the extra work!

I am attaching the 2 include files:

FDTD3dGPU.h (2.2 KB)
shrUtils.h (31.9 KB)

$ nvc++ -v
Export NVCOMPILER=/data/saet/mtml/software/x86_64/nvidia/hpc_sdk/Linux_x86_64/23.3
Export PGI=/data/saet/mtml/software/x86_64/nvidia/hpc_sdk

thanks!

Thanks, I was able to reproduce the error. It seems to be coming from line 133 (“current = infront[0];”), since the ICE no longer occurs if I comment that line out. Likely an optimization is doing something to “current” that’s causing the error.

The workaround would be to lower the optimization level to “-O1”.

The error doesn’t occur in our development compiler, but I’m not sure that means this is a known issue. We do have another report with the same ICE, so it’s possible. In case this is different, I filed a problem report, TPR #33678, and sent it to engineering for investigation.

-Mat

1 Like

Mat, many thanks for the prompt investigation!

I’ll try the workaround. The error message was not that informative.

regards

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.