MPI FFTW #997

base: master

Conversation
PR Reviewer Guide 🔍

Here are some key observations to aid the review process:

src/post_process/m_start_up.fpp (Outdated)
```fortran
        end do

        call s_mpi_transpose_x2y !! Change pencil from data_cmplx to data_cmplx_y

        do l = 1, Nzloc
            do k = 1, Nxloc
                do j = 1, Ny
                    data_out(j + (k-1)*Ny + (l-1)*Ny*Nxloc) = data_cmplx_y(k, j, l)
                end do
            end do
        end do

        call fftw_execute_dft(fwd_plan_y, data_out, data_in)

        do l = 1, Nzloc
            do k = 1, Nxloc
                do j = 1, Ny
                    data_cmplx_y(k, j, l) = data_in(j + (k-1)*Ny + (l-1)*Ny*Nxloc)
                end do
            end do
        end do

        call s_mpi_transpose_y2z !! Change pencil from data_cmplx_y to data_cmplx_z

        do l = 1, Nyloc2
            do k = 1, Nxloc
                do j = 1, Nz
                    data_in(j + (k-1)*Nz + (l-1)*Nz*Nxloc) = data_cmplx_z(k, l, j)
                end do
            end do
        end do

        call fftw_execute_dft(fwd_plan_z, data_in, data_out)

        do l = 1, Nyloc2
            do k = 1, Nxloc
                do j = 1, Nz
                    data_cmplx_z(k, l, j) = data_out(j + (k-1)*Nz + (l-1)*Nz*Nxloc)
                end do
            end do
        end do

    end subroutine s_mpi_FFT_fwd

    subroutine s_mpi_transpose_x2y
        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
        integer :: dest_rank, src_rank
        integer :: i, j, k, l

        allocate(sendbuf(Nx*Nyloc*Nzloc))
        allocate(recvbuf(Nx*Nyloc*Nzloc))

        do dest_rank = 0, num_procs_y - 1
            do l = 1, Nzloc
                do k = 1, Nyloc
                    do j = 1, Nxloc
                        sendbuf(j + (k-1)*Nxloc + (l-1)*Nxloc*Nyloc + dest_rank*Nxloc*Nyloc*Nzloc) = data_cmplx(j + dest_rank*Nxloc, k, l)
                    end do
                end do
            end do
        end do

        call MPI_Alltoall(sendbuf, Nxloc*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, &
                          recvbuf, Nxloc*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_CART12, ierr)

        do src_rank = 0, num_procs_y - 1
            do l = 1, Nzloc
                do k = 1, Nyloc
                    do j = 1, Nxloc
                        data_cmplx_y(j, k + src_rank*Nyloc, l) = recvbuf(j + (k-1)*Nxloc + (l-1)*Nxloc*Nyloc + src_rank*Nxloc*Nyloc*Nzloc)
                    end do
                end do
            end do
        end do

        deallocate(sendbuf)
        deallocate(recvbuf)

    end subroutine s_mpi_transpose_x2y

    subroutine s_mpi_transpose_y2z
        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
        integer :: dest_rank, src_rank
        integer :: j, k, l

        allocate(sendbuf(Ny*Nxloc*Nzloc))
        allocate(recvbuf(Ny*Nxloc*Nzloc))

        do dest_rank = 0, num_procs_z - 1
            do l = 1, Nzloc
                do j = 1, Nxloc
                    do k = 1, Nyloc2
                        sendbuf(k + (j-1)*Nyloc2 + (l-1)*(Nyloc2*Nxloc) + dest_rank*Nyloc2*Nxloc*Nzloc) = data_cmplx_y(j, k + dest_rank*Nyloc2, l)
                    end do
                end do
            end do
        end do

        call MPI_Alltoall(sendbuf, Nyloc2*Nxloc*Nzloc, MPI_DOUBLE_COMPLEX, &
                          recvbuf, Nyloc2*Nxloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_CART13, ierr)
```
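In outline, each stage of the forward transform above packs the current pencil into a linear scratch buffer, applies a batched 1D FFTW transform along the contiguous direction, unpacks the result, and then rotates the pencil orientation (x → y → z) via an all-to-all exchange before the next stage.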
✅ Suggestion: The current Alltoall uses 2D Cartesian sub-communicators, so sendcount*comm_size does not match the packed buffer size, risking out-of-bounds access. Create 1D sub-communicators along Y and Z only, and use the MPI datatype that matches complex(c_double_complex). This fixes the communicator-size mismatch and avoids undefined behavior from a datatype mismatch. [possible issue, importance: 10]
Existing code:

```fortran
            call MPI_Cart_sub(MPI_COMM_CART, (/.true., .true., .false./), MPI_COMM_CART12, ierr)
            call MPI_Comm_rank(MPI_COMM_CART12, proc_rank12, ierr)
            call MPI_Cart_coords(MPI_COMM_CART12, proc_rank12, 2, cart2d12_coords, ierr)

            call MPI_Cart_sub(MPI_COMM_CART, (/.true., .false., .true./), MPI_COMM_CART13, ierr)
            call MPI_Comm_rank(MPI_COMM_CART13, proc_rank13, ierr)
            call MPI_Cart_coords(MPI_COMM_CART13, proc_rank13, 2, cart2d13_coords, ierr)
        end if
    end subroutine s_initialize_modules

    subroutine s_mpi_FFT_fwd
        integer :: j, k, l

        do l = 1, Nzloc
            do k = 1, Nyloc
                do j = 1, Nx
                    data_in(j + (k-1)*Nx + (l-1)*Nx*Nyloc) = data_cmplx(j, k, l)
                end do
            end do
        end do

        call fftw_execute_dft(fwd_plan_x, data_in, data_out)

        do l = 1, Nzloc
            do k = 1, Nyloc
                do j = 1, Nx
                    data_cmplx(j, k, l) = data_out(j + (k-1)*Nx + (l-1)*Nx*Nyloc)
                end do
            end do
        end do

        ! ... (remainder of s_mpi_FFT_fwd and the transpose subroutines as quoted in the diff above) ...
```

Suggested change:

```fortran
call MPI_Cart_sub(MPI_COMM_CART, (/.false., .true., .false./), MPI_COMM_CART12, ierr)
call MPI_Cart_sub(MPI_COMM_CART, (/.false., .false., .true./), MPI_COMM_CART13, ierr)
...
call MPI_Alltoall(sendbuf, Nxloc*Nyloc*Nzloc, MPI_C_DOUBLE_COMPLEX, &
                  recvbuf, Nxloc*Nyloc*Nzloc, MPI_C_DOUBLE_COMPLEX, MPI_COMM_CART12, ierr)
...
call MPI_Alltoall(sendbuf, Nyloc2*Nxloc*Nzloc, MPI_C_DOUBLE_COMPLEX, &
                  recvbuf, Nyloc2*Nxloc*Nzloc, MPI_C_DOUBLE_COMPLEX, MPI_COMM_CART13, ierr)
```
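To make the count arithmetic concrete, here is a minimal, standalone sketch (not MFC code; all names are illustrative) of the Alltoall contract the suggestion relies on: each rank sends sendcount elements to every rank in the communicator, so the packed buffer must hold exactly comm_size * sendcount elements.

```fortran
! Standalone illustration of the Alltoall-based transpose pattern:
! pack per-destination blocks contiguously, exchange, unpack by source.
program alltoall_transpose_sketch
    use mpi
    implicit none
    integer, parameter :: nloc = 2               ! elements per destination rank
    integer :: ierr, rank, nprocs, p, i
    complex(kind(1.d0)), allocatable :: sendbuf(:), recvbuf(:)

    call MPI_Init(ierr)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

    ! Buffers must hold nprocs*nloc elements: nloc for each peer rank
    allocate(sendbuf(nloc*nprocs), recvbuf(nloc*nprocs))

    ! Pack: the p-th block of sendbuf is destined for rank p
    do p = 0, nprocs - 1
        do i = 1, nloc
            sendbuf(i + p*nloc) = cmplx(rank, p, kind(1.d0))
        end do
    end do

    call MPI_Alltoall(sendbuf, nloc, MPI_DOUBLE_COMPLEX, &
                      recvbuf, nloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)

    ! Unpack: the p-th block of recvbuf arrived from rank p
    call MPI_Finalize(ierr)
end program alltoall_transpose_sketch
```

If the communicator were the full 2D grid rather than a 1D line of ranks, nprocs would be larger and Alltoall would read past size(sendbuf), which is exactly the mismatch the suggestion fixes.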
```fortran
j_glb = j + proc_coords(2)*Nxloc
k_glb = k + proc_coords(3)*Nyloc2
```
✅ Suggestion: proc_coords is undefined in this scope, which will fail compilation. Use the computed 3D Cartesian coordinates returned by MPI_CART_COORDS. [possible issue, importance: 10]
Suggested change:

```fortran
j_glb = j + cart3d_coords(2)*Nxloc
k_glb = k + cart3d_coords(3)*Nyloc2
```
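For completeness, a minimal sketch of obtaining those coordinates (assuming, as elsewhere in this PR, that MPI_COMM_CART is the 3D Cartesian communicator and that proc_rank is this rank's rank within it):

```fortran
integer :: cart3d_coords(3)

! Query this rank's coordinates in the 3D Cartesian topology; entries
! 2 and 3 then supply the y/z offsets used in the suggestion above.
call MPI_Cart_coords(MPI_COMM_CART, proc_rank, 3, cart3d_coords, ierr)
```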
src/post_process/m_start_up.fpp (Outdated)
```fortran
        Nf = max(Nx, Ny, Nz)

        @:ALLOCATE(data_in(Nx*Nyloc*Nzloc))
        @:ALLOCATE(data_out(Nx*Nyloc*Nzloc))

        @:ALLOCATE(data_cmplx(Nx, Nyloc, Nzloc))
        @:ALLOCATE(data_cmplx_y(Nxloc, Ny, Nzloc))
        @:ALLOCATE(data_cmplx_z(Nxloc, Nyloc2, Nz))

        @:ALLOCATE(En_real(Nxloc, Nyloc2, Nz))
        @:ALLOCATE(En(Nf))
```
Suggestion: The spectrum bin index kf can exceed Nf, causing out-of-bounds access. Size Nf to the maximum possible radial wavenumber and guard the accumulation to stay within bounds. [possible issue, importance: 9]
Suggested change:

```fortran
Nf = int(sqrt(real((Nx/2)**2 + (Ny/2)**2 + (Nz/2)**2, wp))) + 1
@:ALLOCATE(En(Nf))
...
kf = int(nint(sqrt(real(kx, wp)**2 + real(ky, wp)**2 + real(kz, wp)**2))) + 1
if (kf <= Nf) then
    En(kf) = En(kf) + En_real(j, k, l)
end if
```
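A side note on why the guard is still useful after resizing Nf: the allocation uses int (truncation) while the bin index uses nint (round-to-nearest), so a corner mode can land one bin past Nf. For example, on a 256^3 grid, sqrt(3)*128 ≈ 221.7 gives Nf = 221 + 1 = 222, while the corner bin is nint(221.7) + 1 = 223; the if (kf <= Nf) check absorbs exactly this case.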
Codecov Report

❌ Patch coverage is …

Additional details and impacted files

```
@@            Coverage Diff             @@
##           master     #997      +/-   ##
==========================================
- Coverage   40.92%   40.62%   -0.30%
==========================================
  Files          70       70
  Lines       20299    20464     +165
  Branches     2521     2530       +9
==========================================
+ Hits         8307     8314       +7
- Misses      10454    10609     +155
- Partials     1538     1541       +3
```

☔ View full report in Codecov by Sentry.
User description

Description

Fast Fourier transform for the energy cascade, implemented across multiple ranks. This improves upon Conrad's implementation by using a pencil (2D) decomposition instead of slabs (1D) in post_process, which required Cartesian sub-communicators.

This PR is in the draft stage: the code ran successfully on 1-8 ranks, but I have yet to test for correctness. I will subsequently test for correctness on the TGV problem before merging.
Fixes #(issue) [optional]
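As background for the pencil layout described above, here is a minimal, standalone sketch (not MFC code; all names are illustrative) of building a 2D process grid and the two 1D sub-communicators that drive the transposes:

```fortran
! A pencil decomposition keeps the data contiguous along one axis per
! stage: ranks form a 2D grid over (y,z), and a 1D sub-communicator
! along each grid axis carries the corresponding transpose.
program pencil_grid_sketch
    use mpi
    implicit none
    integer :: ierr, nprocs, comm_cart, comm_y, comm_z
    integer :: dims(2)
    logical :: periods(2), keep(2)

    call MPI_Init(ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

    dims = 0                      ! let MPI factor nprocs into a 2D grid
    call MPI_Dims_create(nprocs, 2, dims, ierr)
    periods = .false.

    call MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, .true., comm_cart, ierr)

    keep = (/ .true., .false. /)  ! 1D sub-communicator along the first grid axis
    call MPI_Cart_sub(comm_cart, keep, comm_y, ierr)
    keep = (/ .false., .true. /)  ! 1D sub-communicator along the second grid axis
    call MPI_Cart_sub(comm_cart, keep, comm_z, ierr)

    call MPI_Finalize(ierr)
end program pencil_grid_sketch
```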
Type of change
Please delete options that are not relevant.
Scope
If you cannot check the above box, please split your PR into multiple PRs that each have a common goal.
How Has This Been Tested?
Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration.
Test Configuration:
Checklist

- I have made corresponding changes to the documentation (docs/)
- I have added example cases in examples/ that demonstrate my new feature performing as expected. They run to completion and demonstrate "interesting physics"
- I ran ./mfc.sh format before committing my code

If your code changes any code source files (anything in src/simulation):

To make sure the code is performing as expected on GPU devices, I have:

- Enclosed the new feature via nvtx ranges so that they can be identified in profiles
- Ran ./mfc.sh run XXXX --gpu -t simulation --nsys, and have attached the output file (.nsys-rep) and plain text results to this PR
- Ran ./mfc.sh run XXXX --gpu -t simulation --rsys --hip-trace, and have attached the output file and plain text results to this PR

PR Type
Enhancement
Description

- Implement MPI-based 3D FFT for energy cascade analysis
- Add pencil decomposition using Cartesian sub-communicators
- Integrate FFTW library with MPI transpose operations
- Add input validation for FFT requirements
File Walkthrough

- m_mpi_common.fpp (src/common/m_mpi_common.fpp): Add FFT-specific processor topology optimization
- m_checker.fpp (src/post_process/m_checker.fpp): Add FFT input validation constraints; add the s_check_inputs_fft subroutine for FFT parameter validation and disallow file_per_process when FFT is enabled (a hypothetical sketch of such a check follows this list)
- m_global_parameters.fpp (src/post_process/m_global_parameters.fpp): Add FFT write parameter; add the fft_wrt logical parameter for FFT output control and set fft_wrt to false in default settings
- m_mpi_proxy.fpp (src/post_process/m_mpi_proxy.fpp): Include FFT parameter in MPI broadcasts; add fft_wrt to the MPI broadcast variable list
- m_start_up.fpp (src/post_process/m_start_up.fpp): Implement complete MPI-based 3D FFT system
- case_dicts.py (toolchain/mfc/run/case_dicts.py): Add FFT parameter to toolchain configuration; add the fft_wrt parameter to the post-processing parameter dictionary
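The m_checker.fpp entry above only names the constraint, so here is a hypothetical sketch of what such a validation could look like (only s_check_inputs_fft, fft_wrt, and file_per_process appear in this PR's walkthrough; the abort helper and message wording are assumptions):

```fortran
! Hypothetical sketch, not the PR's actual implementation.
subroutine s_check_inputs_fft
    ! Reject a per-process file layout when FFT output is requested
    ! (assumed form of the constraint named in the walkthrough above)
    if (fft_wrt .and. file_per_process) then
        call s_mpi_abort('file_per_process is incompatible with fft_wrt. Exiting.')
    end if
end subroutine s_check_inputs_fft
```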