Hello MatColgrove, thank you very much for your reply
Here is the example:
main.cc:
#include <mpi.h>
#include <iostream>
#include <cmath>
#include <numeric>
#include <algorithm>
#include <execution>
#include <thrust/iterator/counting_iterator.h>
#include "lbm_d2q9.h"
template <typename Func>
void exec(bool use_gpu, int n, Func func)
{
std::for_each_n(std::execution::par_unseq, thrust::counting_iterator<int>(0), n, [=](int i){
func(i);
});
}
template <typename FLOAT>
void fill(bool use_gpu, int n, FLOAT *f, double value)
{
exec(use_gpu, n, [=](int i) {
f[i] = (FLOAT)value;
});
}
template <typename FLOAT>
int run()
{
int nprocs = 1;
int rank = 0;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int max_ngpus = 0;
cudaGetDeviceCount(&max_ngpus);
cudaSetDevice(rank % max_ngpus);
// global
const int mgn0 = 3;
const int nx0 = 800;//if you change the value from 800 to 80000, the memory access error will happen.
const int ny0 = 800;
const int lnx0 = nx0 + 2*mgn0;
const int lny0 = ny0 + 2*mgn0;
const int mgn = mgn0;
const int nx = nx0;
const int ny = ny0;
const int lnx = nx + 2*mgn;
const int lny = ny + 2*mgn;
const int ln = lnx*lny;
FLOAT *f = nullptr;
FLOAT *r = nullptr;
FLOAT *u = nullptr;
FLOAT *v = nullptr;
f = new FLOAT[ln*lbm_d2q9::NQ];
r = new FLOAT[ln];
u = new FLOAT[ln];
v = new FLOAT[ln];
fill(true, ln*lbm_d2q9::NQ, f , 0.0);
fill(true, ln, r, 0.0);
fill(true, ln, u, 0.0);
fill(true, ln, v, 0.0);
lbm_d2q9::rho_velocity_to_lbm_vdf(true, nx, ny, mgn, r, u, v, f);
delete [] f;
delete [] r;
delete [] u;
delete [] v;
f = nullptr;
r = nullptr;
u = nullptr;
v = nullptr;
return 0;
}
int main(int argc, char **argv)
{
MPI_Init(&argc, &argv);
int nprocs = 1;
int rank = 0;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
run<float>();
MPI_Finalize();
return 0;
}
If you change the data size from 800x800 to 80000x80000, the illegal memory access error will happen.
Next is lbm_d2q9.cc
#include "lbm_d2q9.h"
namespace lbm_d2q9 {
// 0 1 2 3 4 5 6 7 8
constexpr int exs[] = {0, 1, 0, -1, 0, 1, -1, -1, 1 };
constexpr int eys[] = {0, 0, 1, 0, -1, 1, 1, -1, -1 };
constexpr unsigned int odirs[] = {0, 3, 4, 1, 2, 7, 8, 5, 6 };
namespace {
constexpr int ex(unsigned int id) { return exs[id]; }
constexpr int ey(unsigned int id) { return eys[id]; }
constexpr float w(unsigned int id) { return id == 0 ? 4.0/9.0 :
id <= 4 ? 1.0/9.0 :
1.0/36.0; }
template <unsigned int ID, typename FLOAT>
FLOAT cdot(FLOAT u, FLOAT v) { return ex(ID)*u + ey(ID)*v; }
template <unsigned int ID, typename FLOAT>
FLOAT cdot2(FLOAT u, FLOAT v) { return cdot<ID>(u, v) * cdot<ID>(u, v); }
template <typename FLOAT>
void lbm_feq(FLOAT rho, FLOAT u, FLOAT v, FLOAT feq[])
{
const FLOAT vel2 = u*u + v*v;
feq[ 0] = w(0) * rho * (1.0 + 3.0*cdot<0>(u, v) + 4.5*cdot2<0>(u, v) - 1.5*vel2);
feq[ 1] = w(1) * rho * (1.0 + 3.0*cdot<1>(u, v) + 4.5*cdot2<1>(u, v) - 1.5*vel2);
feq[ 2] = w(2) * rho * (1.0 + 3.0*cdot<2>(u, v) + 4.5*cdot2<2>(u, v) - 1.5*vel2);
feq[ 3] = w(3) * rho * (1.0 + 3.0*cdot<3>(u, v) + 4.5*cdot2<3>(u, v) - 1.5*vel2);
feq[ 4] = w(4) * rho * (1.0 + 3.0*cdot<4>(u, v) + 4.5*cdot2<4>(u, v) - 1.5*vel2);
feq[ 5] = w(5) * rho * (1.0 + 3.0*cdot<5>(u, v) + 4.5*cdot2<5>(u, v) - 1.5*vel2);
feq[ 6] = w(6) * rho * (1.0 + 3.0*cdot<6>(u, v) + 4.5*cdot2<6>(u, v) - 1.5*vel2);
feq[ 7] = w(7) * rho * (1.0 + 3.0*cdot<7>(u, v) + 4.5*cdot2<7>(u, v) - 1.5*vel2);
feq[ 8] = w(8) * rho * (1.0 + 3.0*cdot<8>(u, v) + 4.5*cdot2<8>(u, v) - 1.5*vel2);
}
} // namespace
template <typename Func>
void exec(bool use_gpu, int nx, int ny, Func func)
{
std::for_each_n(std::execution::par_unseq, thrust::counting_iterator<int>(0), nx*ny, [=](int c){
const int j = c / nx;
const int i = c - j*nx;
func(i, j);
});
}
template <typename FLOAT>
void rho_velocity_to_lbm_vdf(bool use_gpu, int nx, int ny, int mgn, const FLOAT *r, const FLOAT *u, const FLOAT *v, FLOAT *f)
{
const int lnx = nx + 2*mgn;
const int lny = ny + 2*mgn;
const int ln = lnx*lny;
exec(use_gpu, lnx, lny, [=] (int i, int j) {
FLOAT feq[NQ];
const int ix = i + j*lnx;
lbm_feq(r[ix], u[ix], v[ix], feq);
for (unsigned int l=0; l<NQ; l++) {
f[ln*l + ix] = feq[l];
}
});
}
#define FUNC(FLOAT) \
template \
void rho_velocity_to_lbm_vdf(bool use_gpu, int nx, int ny, int mgn, const FLOAT *r, const FLOAT *u, const FLOAT *v, FLOAT *f)
FUNC(float);
FUNC(double);
#undef FUNC
} // namespace lbm_d2q9
the last one is lbm_d2q9.h
#include "lbm_d2q9.h"
namespace lbm_d2q9 {
// 0 1 2 3 4 5 6 7 8
constexpr int exs[] = {0, 1, 0, -1, 0, 1, -1, -1, 1 };
constexpr int eys[] = {0, 0, 1, 0, -1, 1, 1, -1, -1 };
constexpr unsigned int odirs[] = {0, 3, 4, 1, 2, 7, 8, 5, 6 };
namespace {
constexpr int ex(unsigned int id) { return exs[id]; }
constexpr int ey(unsigned int id) { return eys[id]; }
constexpr float w(unsigned int id) { return id == 0 ? 4.0/9.0 :
id <= 4 ? 1.0/9.0 :
1.0/36.0; }
template <unsigned int ID, typename FLOAT>
FLOAT cdot(FLOAT u, FLOAT v) { return ex(ID)*u + ey(ID)*v; }
template <unsigned int ID, typename FLOAT>
FLOAT cdot2(FLOAT u, FLOAT v) { return cdot<ID>(u, v) * cdot<ID>(u, v); }
template <typename FLOAT>
void lbm_feq(FLOAT rho, FLOAT u, FLOAT v, FLOAT feq[])
{
const FLOAT vel2 = u*u + v*v;
feq[ 0] = w(0) * rho * (1.0 + 3.0*cdot<0>(u, v) + 4.5*cdot2<0>(u, v) - 1.5*vel2);
feq[ 1] = w(1) * rho * (1.0 + 3.0*cdot<1>(u, v) + 4.5*cdot2<1>(u, v) - 1.5*vel2);
feq[ 2] = w(2) * rho * (1.0 + 3.0*cdot<2>(u, v) + 4.5*cdot2<2>(u, v) - 1.5*vel2);
feq[ 3] = w(3) * rho * (1.0 + 3.0*cdot<3>(u, v) + 4.5*cdot2<3>(u, v) - 1.5*vel2);
feq[ 4] = w(4) * rho * (1.0 + 3.0*cdot<4>(u, v) + 4.5*cdot2<4>(u, v) - 1.5*vel2);
feq[ 5] = w(5) * rho * (1.0 + 3.0*cdot<5>(u, v) + 4.5*cdot2<5>(u, v) - 1.5*vel2);
feq[ 6] = w(6) * rho * (1.0 + 3.0*cdot<6>(u, v) + 4.5*cdot2<6>(u, v) - 1.5*vel2);
feq[ 7] = w(7) * rho * (1.0 + 3.0*cdot<7>(u, v) + 4.5*cdot2<7>(u, v) - 1.5*vel2);
feq[ 8] = w(8) * rho * (1.0 + 3.0*cdot<8>(u, v) + 4.5*cdot2<8>(u, v) - 1.5*vel2);
}
} // namespace
template <typename Func>
void exec(bool use_gpu, int nx, int ny, Func func)
{
std::for_each_n(std::execution::par_unseq, thrust::counting_iterator<int>(0), nx*ny, [=](int c){
const int j = c / nx;
const int i = c - j*nx;
func(i, j);
});
}
template <typename FLOAT>
void rho_velocity_to_lbm_vdf(bool use_gpu, int nx, int ny, int mgn, const FLOAT *r, const FLOAT *u, const FLOAT *v, FLOAT *f)
{
const int lnx = nx + 2*mgn;
const int lny = ny + 2*mgn;
const int ln = lnx*lny;
exec(use_gpu, lnx, lny, [=] (int i, int j) {
FLOAT feq[NQ];
const int ix = i + j*lnx;
lbm_feq(r[ix], u[ix], v[ix], feq);
for (unsigned int l=0; l<NQ; l++) {
f[ln*l + ix] = feq[l];
}
});
}
#define FUNC(FLOAT) \
template \
void rho_velocity_to_lbm_vdf(bool use_gpu, int nx, int ny, int mgn, const FLOAT *r, const FLOAT *u, const FLOAT *v, FLOAT *f)
FUNC(float);
FUNC(double);
#undef FUNC
} // namespace lbm_d2q9
The Makefile is:
CC = nvcc
CXX = nvc++
GCC = gcc
RM = rm -f
MAKEDEPEND = makedepend
USE_BUFFER ?= 1
include ./include.mk
CFLAGS = -Wall -O3 -I $(MPI_PATH)/include
CXXFLAGS = -std=c++20 -stdpar=gpu $(CFLAGS)
GCCFLAGS = $(CFLAGS) -std=c++17
LDFLAGS = -L${MPI_PATH}/lib -lmpi
SRCS = main.cc lbm_d2q9.cc
TARGET = run
DISTTARGET = $(TARGET)_1.0.0
OBJS += $(filter %.o,$(SRCS:%.c=%.o))
OBJS += $(filter %.o,$(SRCS:%.cc=%.o))
OBJS += $(filter %.o,$(SRCS:%.cpp=%.o))
OBJS += $(filter %.o,$(SRCS:%.cu=%.o))
CFLAGS += -DUSE_BUFFER
SRCS += $(GPUSRCS)
LDFLAGS += -L $(CUDA_PATH)/lib64 -lcuda -lcudart
DEPENDENCIES = $(subst .o,.d,$(OBJS))
.PHONY: all
all : $(TARGET)
$(TARGET) : $(OBJS)
$(CXX) $(CXXFLAGS) $(TARGET_ARCH) $(OBJS) -o $@ $(LDFLAGS)
%.o : %.c
$(call make-depend,$<,$@,$(subst .o,.d,$@))
$(CC) $(CFLAGS) $(TARGET_ARCH)-c $<
%.o : %.cc
$(call make-depend,$<,$@,$(subst .o,.d,$@))
$(CXX) $(CXXFLAGS) $(TARGET_ARCH) -c $<
%.o : %.cpp
$(call make-depend,$<,$@,$(subst .o,.d,$@))
$(CXX) $(CXXFLAGS) $(TARGET_ARCH) -c $<
.PHONY: dist
dist :
mkdir -p $(DISTTARGET)
@for h in `makedepend -Y -f- -- $(CXXFLAGS) -- $(SRCS) | grep -e ":" | sed -e "s/.*: //" | tr " " "\n" | sort | uniq` ; \
do \
cp -p $$h $(DISTTARGET); \
done
cp -p $(SRCS) $(DISTTARGET)
cp -p Makefile $(DISTTARGET)
tar -zcvf $(DISTTARGET).tar.gz $(DISTTARGET)
rm -rf $(DISTTARGET)
.PHONY: clean
clean :
$(RM) $(TARGET)
$(RM) $(OBJS)
$(RM) $(DEPENDENCIES)
$(RM) *~
ifneq "$(MAKECMDGOALS)" "clean"
-include $(DEPENDENCIES)
endif
# $(call make-depend,source-file,object-file,depend-file)
define make-depend
@$(GCC) -MM \
-MF $3 \
-MP \
-MT $2 \
$(CFLAGS) \
$(GCCFLAGS) \
$(TARGET_ARCH) \
$1
endef
modules required for compilation:
module load nvidia/22.7 cuda/11.4 ompi-cuda/4.1.4-11.4
execution command is:
mpiexec -n 1 ./run
Originally, this program was designed for multi-GPU execution on A100 GPUs. However, this error occurs in both multi-GPU and single-GPU cases. Therefore, I modified the execution command.
I hope this information helps you reproduce the error.