When my program is compiled and run, the output result is normal, when I put the program execution into rc.local, the output to the txt file is all 0, and I find the code with zero output in the middle like this:
void copy_zero(cublasHandle_t handle,float* input,int nfft,int M_Array,int fs,float* output)
{
for(int i=0;i<M_Array;i++)
{
cudaDeviceSynchronize();
#pragma acc host_data use_device(input,output)
cublasScopy(handle, fs, input+i*fs,1,output+i*nfft,1);
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
//fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(cudaStatus));
string sss=cudaGetErrorString(cudaStatus);
logWrite("sss"+sss);
}
cudaDeviceSynchronize();
}
}
Call in the main function:
copy_zero(handle1,process_recv_data,NFFT,M_Array,Fs,process_recv_data32k);
#pragma acc update host(process_recv_data32k[:10])
for(int f=0;f<10;f++)
logWrite("process_recv_data32k"+to_string(f)+"="+to_string(process_recv_data32k[f]));
In the txt file process_recv_data32k it’s all zeros, of course, the back ones are also zeros, and when I don’t boot up, it process_recv_data32k’s not zero. (Because of the limited equipment, I use the ssh remote vim to open the txt, I think this does not affect), I suspect that the boot self-starting linear algebra library is not configured well, but I see in the copy_zero(handle1,process_recv_data,NFFT,M_Array,Fs,process_recv_data32k); In the previous code, there was cuda’s fft function and the dot multiplication function cublasSscal, which ran without any problems:
#pragma acc host_data use_device(input_ifft)
{
cublasSscal(handle1, M_Array * FsJ2, &alpha, input_ifft, 1);
}
It doesn’t feel like a library issue, I tried changing cublasScopy to cudaMemcpy code
void copy_zero(cublasHandle_t handle,float* input,int nfft,int M_Array,int fs,float* output)
{
for(int i=0;i<M_Array;i++)
{
cudaDeviceSynchronize();
#pragma acc host_data use_device(input,output)
cudaMemcpy(output+i*nfft,input+i*fs,fs*sizeof(float),cudaMemcpyDeviceToDevice);
cudaDeviceSynchronize();
}
}
In this case, the function can be run with results, but many places in the code after copy_zero use CUDA’s functions and code, obviously my solution is not good.So I don’t have a clue right now.
I use echo $LD_LIBRARY_PATH to check the environment and display /usr/local/cuda-11.4/lib64: I want to start the environment has been configured, I also tried to write the environment configuration to rc-local.service, add it before running the program, but the operation still has no result.
Here’s my boot auto-start script:
# SPDX-License-Identifier: LGPL-2.1+
#
# This file is part of systemd.
#
# systemd is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
# This unit gets pulled automatically into multi-user.target by
# systemd-rc-local-generator if /etc/rc.local is executable.
[Unit]
Description=/etc/rc.local Compatibility
Documentation=man:systemd-rc-local-generator(8)
ConditionFileIsExecutable=/etc/rc.local
After=network.target
[Service]
Type=simple
ExecStart=/etc/rc.local start
TimeoutSec=0
RemainAfterExit=yes
Environment="LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH"
Environment="CUDA_HOME=/usr/local/cuda-11.4"
GuessMainPID=no
[Install]
WantedBy=multi-user.target
Alias=rc-local.service
rc.local
#!/bin/bash
sleep 5
export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH
export CUDA_HOME=/usr/local/cuda-11.4
su - orin<<!
cd /home/orin/Desktop
(sleep 40 && ./daemonnode) &
exit
!
exit 0
Here’s how the program is compiled:
process:process.o cudaCode.o
nvc++ -acc -gpu=cc87 -fast -cuda -cudalib -std=c++17 -Minfo=accel -lnppig -lnppc -lnppisu -lnppidei -o process process.o cudaCode.o
process.o:process.c
nvc++ -fast -acc -gpu=cc87 -std=c++17 -cudalib -c process.c
cudaCode.o:cudaCode.cu
nvcc -rdc=true -c cudaCode.cu
datapack.o:datapack.c
nvc++ -c datapack.c -o datapack.o
CircularBuffer.o:CircularBuffer.c
nvc++ -c CircularBuffer.c -o CircularBuffer.o
CircularBufferSpliceData.o:CircularBufferSpliceData.c
nvc++ -c CircularBufferSpliceData.c -o CircularBufferSpliceData.o
Resample.o:Resample.cu
nvc++ -c Resample.cu -o Resample.o
FFTuse.o:FFTuse.c
nvc++ -c -cuda -cudalib FFTuse.c -o FFTuse.o
macros.o:macros.c
nvc++ -c macros.h -o macros.o
base_function.o:base_function.c
nvc++ -c base_function.c -o base_function.o
clean:
rm -f process.o cudaCode.o