It should be added to the output of cudaGetDeviceProperties, because that function already returns a LOT of information:
The properties I am dumping from the call:
I would suggest stepping through this code in a debugger: the structure might hold a tensor core parameter in the latest toolkit if you stepped through it with a card that has Tensor cores. The 1660 does not.
/**
 * Fill `props` with the properties of every CUDA device the runtime reports.
 *
 * @param props        Caller-provided array; entry t receives the properties
 *                     of device ordinal t.
 * @param device_count Passed by value and overwritten by cudaGetDeviceCount,
 *                     so the incoming value is not used.
 *                     NOTE(review): `props` must hold at least as many entries
 *                     as the runtime reports -- confirm at the call sites.
 *
 * Terminates the process with exit(-1) when no device is found or when a CUDA
 * runtime call fails.
 */
void Cuda_device_poll(cudaDeviceProp* props, int device_count)
{
    cudaError_t status = cudaGetDeviceCount(&device_count);

    // "No device" keeps the original message, whether the runtime reports it
    // as an error code or as a zero count.
    if (status == cudaErrorNoDevice || (status == cudaSuccess && device_count <= 0))
    {
        printf("No cuda devices found exiting...\n");
        exit(-1);
    }
    if (status != cudaSuccess)
    {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        exit(-1);
    }

    // Report the count once rather than once per device.
    printf("Found %d cuda device...\n", device_count);

    for (int t = 0; t < device_count; t++)
    {
        // Query ordinal t; the previous code always queried device 0, so every
        // slot of `props` described the same card.
        status = cudaGetDeviceProperties(&props[t], t);
        if (status != cudaSuccess)
        {
            printf("cudaGetDeviceProperties failed for device %d: %s\n", t, cudaGetErrorString(status));
            exit(-1);
        }
    }
}
/**
 * Print `label`, then `count` bytes as two-digit lowercase hex with a '-'
 * between every group of four bytes, then a newline.
 */
static void Cuda_print_hex_bytes(const char* label, const char* bytes, int count)
{
    printf("%s", label);
    for (int i = 0; i < count; i++)
    {
        if (i > 0 && i % 4 == 0)
        {
            printf("-");
        }
        // Cast through unsigned char so bytes >= 0x80 are not sign-extended,
        // and zero-pad so bytes < 0x10 keep both digits.
        printf("%02x", (unsigned char)bytes[i]);
    }
    printf("\n");
}

/**
 * Dump selected cudaDeviceProp fields to stdout for every device.
 *
 * @param prop Array of device properties, indexed by device ordinal. It must
 *             hold at least Cuda_device_count() entries. A null pointer is
 *             ignored.
 *
 * One block of "label: value" lines is printed per device, followed by a
 * blank line.
 */
void Cuda_print_properties(cudaDeviceProp* prop)
{
    if (prop == NULL)
    {
        return;
    }

    int card_count = Cuda_device_count();
    for (int t = 0; t < card_count; t++)
    {
        // Bind by reference: cudaDeviceProp is large and is only read here.
        const cudaDeviceProp& a = prop[t];

        printf("Device: %d [%s] \n", t, a.name);
        Cuda_print_hex_bytes(" UUID: ", a.uuid.bytes, 16);
        // NOTE(review): the LUID fields are presumably only meaningful on
        // Windows -- confirm against the CUDA runtime documentation.
        Cuda_print_hex_bytes(" LUID: ", a.luid, 8);
        printf(" luidDeviceNodeMask: %u\n", a.luidDeviceNodeMask);

        // NOTE(review): `metric` is filled by char_to_NB but never printed;
        // the call is kept as-is because its side effects are not visible here.
        char metric[20] = {0};
        size_t total_mem = a.totalGlobalMem;
        char_to_NB(total_mem, metric);

        // totalGlobalMem is a size_t, so it needs %zu (not %ld).
        printf(" totalGlobalMem: %zu\n", a.totalGlobalMem);
        printf(" sharedMemPerBlock: %zu\n", a.sharedMemPerBlock);
        printf(" regsPerBlock: %d\n", a.regsPerBlock);
        printf(" warpSize: %d\n", a.warpSize);
        printf(" memPitch: %zu\n", a.memPitch);
        printf(" clockRate: %d\n", a.clockRate);
        printf(" memoryClockRate: %d\n", a.memoryClockRate);
        printf(" memoryBusWidth: %d\n", a.memoryBusWidth);
        printf(" maxThreadsPerBlock: %d\n", a.maxThreadsPerBlock);
        printf("maxThreadsPerMultiP: %d\n", a.maxThreadsPerMultiProcessor);
        printf("ShrdMemPerMultiProc: %zu\n", a.sharedMemPerMultiprocessor);
        printf("regsPerMultiProcess: %d\n", a.regsPerMultiprocessor);
        printf(" maxThreadsDim(x): %d\n", a.maxThreadsDim[0]);
        printf(" maxThreadsDim(y): %d\n", a.maxThreadsDim[1]);
        printf(" maxThreadsDim(z): %d\n", a.maxThreadsDim[2]);
        printf(" maxGridSize(x): %d\n", a.maxGridSize[0]);
        printf(" maxGridSize(y): %d\n", a.maxGridSize[1]);
        // Index 2 is the z dimension; it was previously mislabelled as (y).
        printf(" maxGridSize(z): %d\n", a.maxGridSize[2]);
        printf(" totalConstMem: %zu\n", a.totalConstMem);
        printf(" major: %d\n", a.major);
        printf(" minor: %d\n", a.minor);
        printf(" textureAlignment: %zu\n", a.textureAlignment);
        printf(" deviceOverlap: %d\n", a.deviceOverlap);
        printf("multiProcessorCount: %d\n", a.multiProcessorCount);
        printf("kernelExecTimeOEnab: %d\n", a.kernelExecTimeoutEnabled);
        printf(" integrated: %d\n", a.integrated);
        printf(" canMapHostMemory: %d\n", a.canMapHostMemory);
        printf(" computeMode: %d\n", a.computeMode);
        printf(" maxTexture1D: %d\n", a.maxTexture1D);
        printf(" maxTexture1DMipmap: %d\n", a.maxTexture1DMipmap);
        printf(" maxTexture1DLinear: %d\n", a.maxTexture1DLinear);
        printf(" surfaceAlignment: %zu\n", a.surfaceAlignment);
        printf(" concurrentKernels: %d\n", a.concurrentKernels);
        printf(" ECCEnabled: %d\n", a.ECCEnabled);
        printf(" pciBusID: %d\n", a.pciBusID);
        printf(" pciDeviceID: %d\n", a.pciDeviceID);
        printf(" pciDomainID: %d\n", a.pciDomainID);
        printf(" tccDriver: %d\n", a.tccDriver);
        printf(" asyncEngineCount: %d\n", a.asyncEngineCount);
        printf(" streamPrioritiesSp: %d\n", a.streamPrioritiesSupported);
        printf("globalL1CacheSupprt: %d\n", a.globalL1CacheSupported);
        printf(" localL1CacheSupprt: %d\n", a.localL1CacheSupported);
        printf("\n");
    }
}