I’m running your code on linux, CUDA 13.0, L4 GPU, so I have made some modifications along the lines you suggest, such as modifying file paths. I also fixed a few compiler warnings.
When I run the code it reports using 32 threads. Running under compute-sanitizer as well as the subtools reports no issue.
When I diff the output files against each other (InterImg000.raw against each of the others InterImg001.raw to InterImg049.raw), diff reports no differences in any case.
So I assume I am not witnessing the issue?
My test case:
# ls
cuSZp_kernels_f32.cu II000.raw multiCuSZp.h
cuSZp_kernels_f32.cubin multi.cpp OriData.raw
# cat multi.cpp
#define _CRT_SECURE_NO_WARNINGS
#include <cuda_fp16.h>
#include <cuda.h>
#include <malloc.h>
#include <vector>
#include <omp.h>
#include <iostream>
#include <fstream>
#include "multiCuSZp.h"
using namespace std;
//#define WRITEALL
const size_t MAX_NBELE = 2048 * 2048 * 4;
const int DEC_TBLOCK_SIZE = 32;
const int DEC_CHUNK = 1024;
const char* CUBIN_PATH = "./cuSZp_kernels_f32.cubin";
int nThreadsNum = 0;
static int init_thread_cuda_resources() {
CUresult status;
status = cuInit(0);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuInit Err: %s\n", err_msg);
return -1;
}
CUdevice device;
status = cuDeviceGet(&device, 0);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuDeviceGet Err: %s\n", err_msg);
return -1;
}
CUctxCreateParams ctxCreateParams = {};
status = cuCtxCreate(&m_cuContext, &ctxCreateParams, 0, device);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuCtxCreate Err: %s\n", err_msg);
return -1;
}
status = cuModuleLoad(&m_cuModuleDecompress, CUBIN_PATH);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuModuleLoad Err: %s\n", err_msg);
return -1;
}
int gsize = (MAX_NBELE + DEC_TBLOCK_SIZE * DEC_CHUNK - 1) / (DEC_TBLOCK_SIZE * DEC_CHUNK);
m_nCmpOffset = gsize + 1;
for (int i = 0; i < nThreadsNum; i++)
{
status = cuMemAlloc(&m_vecDecompressDataIn_d[i], MAX_NBELE * sizeof(short));
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuMemAlloc d_in Err: %s\n", err_msg);
return -1;
}
status = cuMemAlloc(&m_vecDecompressDataOut_d[i], MAX_NBELE * sizeof(short));
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuMemAlloc d_out Err: %s\n", err_msg);
return -1;
}
status = cuMemAlloc(&m_vecCmpOffset_d[i], sizeof(unsigned int) * m_nCmpOffset);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuMemAlloc d_cmpOffset Err: %s\n", err_msg);
return -1;
}
status = cuMemAlloc(&m_vecLocOffset_d[i], sizeof(unsigned int) * m_nCmpOffset);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuMemAlloc d_locOffset Err: %s\n", err_msg);
return -1;
}
status = cuMemAlloc(&m_vecFlag_d[i], sizeof(int) * m_nCmpOffset);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuMemAlloc d_flag Err: %s\n", err_msg);
return -1;
}
}
printf("Thread %d CUDA resources initialized successfully\n", omp_get_thread_num());
return 0;
}
int multi_cuSZp_decompress(char* inputData, size_t m_DataNum, size_t cmpSize, int nThreadId, int nIdx)
{
CUresult status;
const char* kernel_name = "cuSZp_decompress_kernel_outlier_short";
status = cuCtxSetCurrent(m_cuContext);
CUfunction kernel;
status = cuModuleGetFunction(&kernel, m_cuModuleDecompress, kernel_name);
if (status != CUDA_SUCCESS) {
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuModuleGetFunction Err: %s, Thread %d\n", err_msg, nThreadId);
return -1;
}
char filename[255] = { 0 };
FILE* out_file = NULL;
size_t written_bytes = 0;
int dec_tblock_size = 32;
int bsize = dec_tblock_size;//
int dec_chunk = 1024;
size_t nbEle = m_DataNum;
int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk);
status = cuMemsetD16(m_vecDecompressDataIn_d[nThreadId], 0, nbEle);
status = cuMemsetD16(m_vecDecompressDataOut_d[nThreadId], 0, nbEle);
status = cuMemcpyHtoD(m_vecDecompressDataIn_d[nThreadId], inputData, cmpSize);
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecDecompressDataIn_d cuMemcpyHtoD Err : %s., nThreadId = % d\n", err_msg, nThreadId);
return -1;
}
float errorBound = 1.0;
unsigned int glob_sync = 0;
status = cuMemsetD32(m_vecCmpOffset_d[nThreadId], 0, m_nCmpOffset);
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecCmpOffset_d cuMemsetD32 Err : %s.,nThreadId=%d\n", err_msg, nThreadId);
return -1;
}
status = cuMemsetD32(m_vecLocOffset_d[nThreadId], 0, m_nCmpOffset);
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecLocOffset_d cuMemsetD32 Err : %s.,nThreadId=%d\n", err_msg, nThreadId);
return -1;
}
status = cuMemsetD32(m_vecFlag_d[nThreadId], 0, m_nCmpOffset);
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecFlag_d cuMemsetD32 Err : %s.,nThreadId=%d\n", err_msg, nThreadId);
return -1;
}
dim3 blockSize(bsize);
dim3 gridSize(gsize);
void* kernel_args[] = { &m_vecDecompressDataOut_d[nThreadId],&m_vecDecompressDataIn_d[nThreadId] ,&m_vecCmpOffset_d[nThreadId],&m_vecLocOffset_d[nThreadId],&m_vecFlag_d[nThreadId],&errorBound,&nbEle };
status = cuLaunchKernel(kernel, gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z, 0, NULL, kernel_args, NULL);
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("cuLaunchKernel Err:%s.,nThreadId=%d\n", err_msg, nThreadId);
return -1;
}
#ifdef WRITEALL
status = cuMemcpyDtoH(inputData, m_vecCmpOffset_d[nThreadId], m_nCmpOffset * sizeof(unsigned int));
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecCmpOffset_d cuMemcpyDtoH Err : %s.,nThreadId=%d", err_msg, nThreadId);
return -1;
}
sprintf(filename, "./InterImg%03d.raw", nIdx);
out_file = fopen(filename, "wb");
if (out_file == nullptr) {
printf("fopen error, nThreadId = % d", nThreadId);
}
written_bytes = fwrite(
inputData,
1,
m_nCmpOffset * sizeof(unsigned int),
out_file
);
fclose(out_file);
status = cuMemcpyDtoH(inputData, m_vecLocOffset_d[nThreadId], m_nCmpOffset * sizeof(unsigned int));
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecLocOffset_d cuMemcpyDtoH Err : %s.,nThreadId=%d", err_msg, nThreadId);
return -1;
}
sprintf(filename, "./InterImg%03d.raw", nIdx);
out_file = fopen(filename, "wb");
if (out_file == nullptr) {
printf("fopen error, nThreadId = % d", nThreadId);
}
written_bytes = fwrite(
inputData,
1,
m_nCmpOffset * sizeof(unsigned int),
out_file
);
fclose(out_file);
status = cuMemcpyDtoH(inputData, m_vecFlag_d[nThreadId], m_nCmpOffset * sizeof(unsigned int));
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecFlag_d cuMemcpyDtoH Err : %s.,nThreadId=%d", err_msg, nThreadId);
return -1;
}
sprintf(filename, "./InterImg%03d.raw", nIdx);
out_file = fopen(filename, "wb");
if (out_file == nullptr) {
printf("fopen error, nThreadId = % d", nThreadId);
}
written_bytes = fwrite(
inputData,
1,
m_nCmpOffset * sizeof(unsigned int),
out_file
);
fclose(out_file);
#endif // !WRITEALL
//status = cuCtxSynchronize();
status = cuMemcpyDtoH(inputData, m_vecDecompressDataOut_d[nThreadId], nbEle * sizeof(short));
if (status != CUDA_SUCCESS)
{
const char* err_msg;
cuGetErrorString(status, &err_msg);
printf("m_vecDecompressDataOut_d cuMemcpyDtoH Err : %s.,nThreadId=%d", err_msg, nThreadId);
return -1;
}
sprintf(filename, "./InterImg%03d.raw", nIdx);
out_file = fopen(filename, "wb");
if (out_file == nullptr) {
printf("fopen error, nThreadId = % d", nThreadId);
}
written_bytes = fwrite(
inputData,
1,
nbEle * sizeof(short),
out_file
);
fclose(out_file);
printf("decompress success., nThreadId = % d\n", nThreadId);
return 0;
}
long long get_file_size(FILE* pFile) {
if (pFile == NULL) {
printf("Error: File pointer is NULL!\n");
return -1;
}
long long nOriginalPos;
long long nFileSize;
nOriginalPos = (long long)ftell(pFile);
if (nOriginalPos == -1LL) {
printf("Error: ftell failed to get original position!\n");
return -1;
}
if (fseek(pFile, 0, SEEK_END) != 0) {
printf("Error: fseek failed to move to file end!\n");
return -1;
}
nFileSize = (long long)ftell(pFile);
if (nFileSize == -1LL) {
printf("Error: ftell failed to get file size!\n");
return -1;
}
if (fseek(pFile, (long)nOriginalPos, SEEK_SET) != 0) {
printf("Error: fseek failed to restore original position!\n");
return -1;
}
return nFileSize;
}
int ReadImages(char* TempBuffer, size_t ImageSize, int imgNum, vector<size_t>& allCmpSize)
{
char filename[255];
FILE* pf = NULL;
for (int j = 0; j < imgNum; j++)
{
int i = 0;
sprintf(filename, "./II%03d.raw", i);
pf = fopen(filename, "rb");
if (pf == NULL)
{
printf("read data error.\n");
return -1;
}
size_t cmpSize = get_file_size(pf);
allCmpSize[j] = cmpSize;
size_t nReadBytes = fread(TempBuffer + ImageSize * j, 1, cmpSize, pf);
fclose(pf);
pf = NULL;
}
return 0;
}
int main()
{
nThreadsNum = omp_get_max_threads();
omp_set_num_threads(nThreadsNum);
printf("nThreadsNum = %d\n", nThreadsNum);
CUresult result = cuInit(0);
CUdevice device;
result = cuDeviceGet(&device, 0);
CUctxCreateParams ctxCreateParams = {};
result = cuCtxCreate(&m_cuContext, &ctxCreateParams, 0, device);
size_t nbEle = MAX_NBELE;
int dec_tblock_size = 32;
int bsize = dec_tblock_size;
int dec_chunk = 1024;
int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk);
m_nCmpOffset = gsize + 1;
m_vecDecompressDataIn_d.resize(nThreadsNum);
m_vecDecompressDataOut_d.resize(nThreadsNum);
m_vecCmpOffset_d.resize(nThreadsNum);
m_vecLocOffset_d.resize(nThreadsNum);
m_vecFlag_d.resize(nThreadsNum);
init_thread_cuda_resources();
char* pcTempBuffer = NULL;
int imgNum = 50;
pcTempBuffer = new(nothrow)char[nbEle * sizeof(short) * imgNum];
char* pcInterTemp = NULL;
vector<size_t> allCmpSize;
allCmpSize.resize(imgNum);
ReadImages(pcTempBuffer, nbEle * sizeof(short), imgNum, allCmpSize);
int i = 0;
int nOmpThreadNum = 0;
#pragma omp parallel for private(i,nOmpThreadNum,pcInterTemp) shared(pcTempBuffer)
for (i = 0; i < imgNum; i++)
{
nOmpThreadNum = omp_get_thread_num();
pcInterTemp = pcTempBuffer + i * nbEle * sizeof(short);
multi_cuSZp_decompress(pcInterTemp, nbEle, allCmpSize[i], nOmpThreadNum, i);
}
return 0;
}
# g++ -I/usr/local/cuda/include multi.cpp -o test -L/usr/local/cuda/lib64 -lcuda -fopenmp -lgomp
# ./test
nThreadsNum = 32
Thread 0 CUDA resources initialized successfully
decompress success., nThreadId = 10
decompress success., nThreadId = 9
decompress success., nThreadId = 30
decompress success., nThreadId = 8
decompress success., nThreadId = 6
decompress success., nThreadId = 25
decompress success., nThreadId = 23
decompress success., nThreadId = 17
decompress success., nThreadId = 20
decompress success., nThreadId = 0
decompress success., nThreadId = 28
decompress success., nThreadId = 1
decompress success., nThreadId = 2
decompress success., nThreadId = 4
decompress success., nThreadId = 5
decompress success., nThreadId = 14
decompress success., nThreadId = 8
decompress success., nThreadId = 29
decompress success., nThreadId = 9
decompress success., nThreadId = 31
decompress success., nThreadId = 15
decompress success., nThreadId = 6
decompress success., nThreadId = 13
decompress success., nThreadId = 21
decompress success., nThreadId = 10
decompress success., nThreadId = 24
decompress success., nThreadId = 18
decompress success., nThreadId = 4
decompress success., nThreadId = 7
decompress success., nThreadId = 2
decompress success., nThreadId = 27
decompress success., nThreadId = 22
decompress success., nThreadId = 1
decompress success., nThreadId = 12
decompress success., nThreadId = 16
decompress success., nThreadId = 15
decompress success., nThreadId = 3
decompress success., nThreadId = 26
decompress success., nThreadId = 14
decompress success., nThreadId = 7
decompress success., nThreadId = 17
decompress success., nThreadId = 19
decompress success., nThreadId = 5
decompress success., nThreadId = 11
decompress success., nThreadId = 16
decompress success., nThreadId = 0
decompress success., nThreadId = 13
decompress success., nThreadId = 12
decompress success., nThreadId = 3
decompress success., nThreadId = 11
# diff InterImg000.raw InterImg001.raw
# diff InterImg000.raw InterImg002.raw
# diff InterImg000.raw InterImg003.raw
# diff InterImg000.raw InterImg004.raw
# diff InterImg000.raw InterImg005.raw
# diff InterImg000.raw InterImg006.raw
# diff InterImg000.raw InterImg007.raw
# diff InterImg000.raw InterImg008.raw
# diff InterImg000.raw InterImg009.raw
# diff InterImg000.raw InterImg010.raw
...
# diff InterImg000.raw InterImg049.raw
#
(II000.raw is the InterImg000.raw file from your zip file. I have renamed it so the output is not overwriting the input file.)