Runge Kutta 4 With CUDA Fortran
I am trying to convert this FORTRAN (pendulum movement) program to CUDA FORTRAN, but I can only use 1 block with 2 threads. Is there a way to use more than 2 threads ...
MODULE CB
REAL :: Q,B,W
END MODULE CB
PROGRAM PENDULUM
USE CB
IMPLICIT NONE
INTEGER, PARAMETER :: N=10,L=100,M=1
INTEGER :: I,count_rate,count_max,count(2)
REAL :: PI,H,T,Y1,Y2,G1,G1F,G2,G2F
REAL :: DK11,DK21,DK12,DK22,DK13,DK23,DK14,DK24
REAL, DIMENSION (2,N) :: Y
PI = 4.0*ATAN(1.0)
H = 3.0*PI/L
Q = 0.5
B = 0.9
W = 2.0/3.0
Y(1,1) = 0.0
Y(2,1) = 2.0
DO I = 1, N-1
T = H*I
Y1 = Y(1,I)
Y2 = Y(2,I)
DK11 = H*G1F(Y1,Y2,T)
DK21 = H*G2F(Y1,Y2,T)
DK12 = H*G1F((Y1+DK11/2.0),(Y2+DK21/2.0),(T+H/2.0))
DK22 = H*G2F((Y1+DK11/2.0),(Y2+DK21/2.0),(T+H/2.0))
DK13 = H*G1F((Y1+DK12/2.0),(Y2+DK22/2.0),(T+H/2.0))
DK23 = H*G2F((Y1+DK12/2.0),(Y2+DK22/2.0),(T+H/2.0))
DK14 = H*G1F((Y1+DK13),(Y2+DK23),(T+H))
DK24 = H*G2F((Y1+DK13),(Y2+DK23),(T+H))
Y(1,I+1) = Y(1,I)+(DK11+2.0*(DK12+DK13)+DK14)/6.0
Y(2,I+1) = Y(2,I)+(DK21+2.0*(DK22+DK23)+DK24)/6.0
! Bring theta back to the region [-pi,pi]
Y(1,I+1) = Y(1,I+1)-2.0*PI*NINT(Y(1,I+1)/(2.0*PI))
END DO
call system_clock ( count(2), count_rate, count_max )
WRITE (6,"(2F16.8)") (Y(1,I),Y(2,I),I=1,N,M)
END PROGRAM PENDULUM
FUNCTION G1F (Y1,Y2,T) RESULT (G1)
USE CB
IMPLICIT NONE
REAL :: Y1,Y2,T,G1
G1 = Y2
END FUNCTION G1F
FUNCTION G2F (Y1,Y2,T) RESULT (G2)
USE CB
IMPLICIT NONE
REAL :: Y1,Y2,T,G2
G2 = -Q*Y2-SIN(Y1)+B*COS(W*T)
END FUNCTION G2F
CUDA FORTRAN SOFTWARE VERSION
MODULE KERNEL
CONTAINS
attributes(global) subroutine mykernel(Y_d,N,L,M)
INTEGER,value:: N,L,M
INTEGER ::tid
REAL:: Y_d(:,:)
REAL :: PI,H,T,G1,G1F,G2,G2F
REAL,shared :: DK11,DK21,DK12,DK22,DK13,DK23,DK14,DK24,Y1,Y2
PI = 4.0*ATAN(1.0)
H = 3.0*PI/L
Y_d(1,1) = 0.0
Y_d(2,1) = 2.0
tid=threadidx%x
DO I = 1, N-1
T = H*I
Y1 = Y_d(1,I)
Y2 = Y_d(2,I)
if(tid==1)then
DK11 = H*G1F(Y1,Y2,T)
else
DK21 = H*G2F(Y1,Y2,T)
endif
call syncthreads ()
if(tid==1)then
DK12 = H*G1F((Y1+DK11/2.0),(Y2+DK21/2.0),(T+H/2.0))
else
DK22 = H*G2F((Y1+DK11/2.0),(Y2+DK21/2.0),(T+H/2.0))
endif
call syncthreads ()
if(tid==1)then
DK13 = H*G1F((Y1+DK12/2.0),(Y2+DK22/2.0),(T+H/2.0))
else
DK23 = H*G2F((Y1+DK12/2.0),(Y2+DK22/2.0),(T+H/2.0))
endif
call syncthreads ()
if(tid==1)then
DK14 = H*G1F((Y1+DK13),(Y2+DK23),(T+H))
else
DK24 = H*G2F((Y1+DK13),(Y2+DK23),(T+H))
endif
call syncthreads ()
if(tid==1)then
Y_d(1,I+1) = Y1+(DK11+2.0*(DK12+DK13)+DK14)/6.0
else
Y_d(2,I+1) = Y2+(DK21+2.0*(DK22+DK23)+DK24)/6.0
endif
Y_d(1,I+1) = Y_d(1,I+1)-2.0*PI*NINT(Y_d(1,I+1)/(2.0*PI))
call syncthreads ()
END DO
end subroutine mykernel
attributes(device) FUNCTION G1F (Y1,Y2,T) RESULT (G1)
IMPLICIT NONE
REAL :: Y1,Y2,T,G1
G1 = Y2
END FUNCTION G1F
attributes(device) FUNCTION G2F (Y1,Y2,T) RESULT (G2)
IMPLICIT NONE
REAL :: Y1,Y2,T,G2
G2 = -0.5*Y2-SIN(Y1)+0.9*COS((2.0/3.0)*T)
END FUNCTION G2F
END MODULE KERNEL
PROGRAM PENDULUM
use cudafor
use KERNEL
IMPLICIT NONE
INTEGER, PARAMETER :: N=100000,L=1000,M=1
INTEGER :: I,d,count_max,count_rate
REAL,device :: Y_d(2,N)
REAL, DIMENSION (2,N) :: Y
INTEGER :: count(2)
call mykernel<<<1,2>>>(Y_d,N,L,M)
Y=Y_d
WRITE (6,"(2F16.8)") (Y(1,I),Y(2,I),I=1,N,M)
END PROGRAM PENDULUM
source to share
You can see that only two independent threads of execution are possible by analyzing the data dependencies of the source code. It is easiest to think of this as the "outside" and "inside" part.
The "outside" part is dependence Y(1:2,i+1)
on Y(1:2,i)
. At each time step, you need to use the values Y(1:2,i)
to calculate Y(1:2,i+1)
, so it is not possible to perform calculations for multiple time steps in parallel, simply because of the sequential dependency structure - you need to know what happens in time i
to calculate what happens at a point in time i+1
, you need to know what happens at the moment i+1
to calculate what happens at the moment i+2
, and so on. The best thing you can hope to do is calculate Y(1,i+1)
and Y(2,i+1)
in parallel exactly what you are doing.
The "inner" part is based on the relationship between intermediate values ββin the Runge-Kutta scheme, values DK11
, DK12
etc. in your code. When calculating, Y(1:2,i+1)
each of the DK[n,m]
depends on Y(1:2,i)
, and for m > 1
each of it DK[n,m]
depends both on DK[1,m-1]
and on DK[2,m-1]
. If you plot these dependencies (which my ASCII art skills are not very good at!), You will see that at each step of the computation there are only two possible sub-computations that can be performed in parallel.
The result of all this is that you can't do better than two parallel threads for this computation. As one of the commenters above said, you can certainly do much better if you simulate a particle system or some other mechanical system with several independent degrees of freedom that can then be integrated in parallel.
source to share