root / synthbench / euroben-dm / mod2f / mod2f.f @ 0:839f52ef7657
History | View | Annotate | Download (7.7 kB)
1 |
Program mod2f |
---|---|
2 |
! ********************************************************************** |
3 |
! *** This program is part of the EuroBen Benchmark *** |
4 |
! *** *** |
5 |
! *** Copyright: EuroBen Benchmark Group p/o *** |
6 |
! *** Academic Computing Centre Utrecht *** |
7 |
! *** P.O. Box 80.011 *** |
8 |
! *** 3508 TA Utrecht *** |
9 |
! *** The Netherlands *** |
10 |
! *** *** |
11 |
! *** Author of this program: Aad van der Steen *** |
12 |
! *** Date Version 1.0 Summer 1996 *** |
13 |
! *** Date Version 2.0 Spring 1998 *** |
14 |
! *** Date Version 2.1 Autumn 1998 *** |
15 |
! *** Date Version 3.0 Autumn 2005 *** |
16 |
! ********************************************************************** |
17 |
! =============== |
18 |
! MPI Version 3.0 |
19 |
! =============== |
20 |
! ---------------------------------------------------------------------- |
21 |
! --- Program 'mod2f' measures the speed of a parallel 1-D |
22 |
! complex-to-complex FFT on 'nodes' processors. |
23 |
! |
24 |
! The FFT is factorised into 'nc' 1-D FFTs of length 'nr'. |
25 |
! After the first sequence of small FFTs the Real and Imaginary |
26 |
! parts of the sequences that are transformed are multiplied |
27 |
! with the appropriate twiddle factors. |
28 |
! Next the sequences are transposed and 'nr' 1-D FFTs of length |
29 |
! 'nc' are done to complete the transformation process. |
30 |
! See for a description of this kind of implementation: |
31 |
! |
32 |
! R.C. Agarwal, F.G. Gustavson, M. Zubair, "A High Performance |
33 |
! Parallel Algorithm for 1-D FFT", Proc. Supercomputing '94, |
34 |
! IEEE Press, 1994, 34--40. |
35 |
! |
36 |
! --- From version 2.0 on the number of columns 'nc' and the number of |
37 |
! rows 'nr' need not be divided exactly by the number of processors |
38 |
! anymore. |
39 |
! ---------------------------------------------------------------------- |
40 |
! Parameters: |
41 |
! |
42 |
! NC*NR -- Total length of FFT. |
43 |
! NC -- No. FFTs of length NR. |
44 |
! NODES -- No. of processors used. |
45 |
! M1 -- No. of rows of the arrays holding the NR-length FFTs. |
46 |
! M2 -- No. of rows of the arrays holding the NC-length FFTs. |
47 |
! ---------------------------------------------------------------------- |
48 |
Use mpi_module |
49 |
Use numerics |
50 |
Implicit None |
51 |
|
52 |
Real(l_), Allocatable :: arr1r(:,:), arr1i(:,:), |
53 |
& arr2r(:,:), arr2i(:,:), |
54 |
& carr(:,:), cari(:,:), |
55 |
& warr(:,:), wari(:,:), |
56 |
& ur(:), ui(:), |
57 |
& actsiz(:,:), base(:,:) |
58 |
Real(l_) :: time1, time2, timx1, timx2 |
59 |
Integer :: ml, nl, ok |
60 |
Real(l_) :: corr, ctime, err, frac, wclock |
61 |
Real(l_) :: rmflint, rmfltrn, mflops |
62 |
Integer :: i, irep, mc, mr, m1, m2, mflint, |
63 |
& mfltrn, mfltot, nc, nr, ncase, nrep |
64 |
! ---------------------------------------------------------------------- |
65 |
! --- Set up communication. |
66 |
|
67 |
Call csetup |
68 |
If ( me == 0 ) Then |
69 |
Call state( 'mod2f ' ) |
70 |
Print 1000, nodes |
71 |
End If |
72 |
! ---------------------------------------------------------------------- |
73 |
! --- Read problem sizes and initialise arrays. |
74 |
|
75 |
Open ( 1, File = 'mod2f.in' ) |
76 |
10 Read ( 1, *, End = 20 ) m1, m2, nrep |
77 |
nc = 2**m2 |
78 |
nr = 2**m1 |
79 |
mr = nr/nodes |
80 |
mc = nc/nodes |
81 |
! ----------------------------------------------------------------------- |
82 |
! --- Check that the number of processors is a power of 2. |
83 |
|
84 |
If ( mr*nodes /= nr .OR. mc*nodes /= nc ) Then |
85 |
Print *, 'Stop: number of processors should be a power of 2.' |
86 |
Stop |
87 |
End If |
88 |
ml = m1+m2 |
89 |
nl = nc*nr |
90 |
err = ( 10.0_l_*nl*ml*nodes ) * 1.0e-10_l_ |
91 |
Call sizoff( nr, nc ) |
92 |
Allocate( carr(nr,mc), cari(nr,mc), |
93 |
& arr1r(nr,mc), arr1i(nr,mc), |
94 |
& arr2r(nc,mr), arr2i(nc,mr), |
95 |
& warr(nc,mr), wari(nc,mr), |
96 |
& ur(nr), ui(nr) ) |
97 |
Call datgen( carr, cari, nr, mc ) |
98 |
! ---------------------------------------------------------------------- |
99 |
! --- Repeat FFT 'nrep' times for this problem size. |
100 |
|
101 |
Call MPI_Barrier( comm, ierr ) |
102 |
time2 = 0.0_l_ |
103 |
time1 = MPI_Wtime() |
104 |
Do irep = 1, nrep |
105 |
arr1r = carr |
106 |
arr1i = cari |
107 |
Call cfft4( 0, m1, ur, ui, arr1r, arr1i, warr, wari ) |
108 |
Do i = 1, mc |
109 |
Call cfft4( 1, m1, ur, ui, arr1r(1,i), arr1i(1,i), warr, |
110 |
& wari ) |
111 |
End Do |
112 |
! ---------------------------------------------------------------------- |
113 |
! --- Multiply with twiddle factors. |
114 |
|
115 |
Call twiddle( arr1r, arr1i, nr, mc ) |
116 |
! ---------------------------------------------------------------------- |
117 |
! --- Do transposition. |
118 |
|
119 |
! ---------------------------------------------------------------------- |
120 |
! --- Do global block transpose. |
121 |
|
122 |
Call gtrans( arr1r, arr1i, arr2r, arr2i, nr, nc, mr, mc, time2) |
123 |
! ---------------------------------------------------------------------- |
124 |
! --- Do second pass of M1 NC-length FFTs per processor. |
125 |
|
126 |
Call cfft4( 0, m2, ur, ui, arr2r, arr2i, warr, wari ) |
127 |
Do i = 1, mr |
128 |
Call cfft4( 1, m2, ur, ui, arr2r(1,i), arr2i(1,i), warr, |
129 |
& wari) |
130 |
End Do |
131 |
End Do |
132 |
time1 = MPI_Wtime() - time1 |
133 |
! ---------------------------------------------------------------------- |
134 |
! --- Check for errors and correct timing for filling of arrays. |
135 |
|
136 |
Call errchk( arr2r, arr2i, nc, mr, err, ok ) |
137 |
corr = MPI_Wtime() |
138 |
Do irep = 1, nrep |
139 |
arr1r = carr |
140 |
arr1i = cari |
141 |
End Do |
142 |
corr = MPI_Wtime() - corr |
143 |
time1 = time1 - corr |
144 |
Call MPI_Reduce( time1, timx1, 1, rtyp, MPI_Max, 0, comm, ierr ) |
145 |
time1 = timx1/ Real( nrep, l_ ) |
146 |
Call MPI_Reduce( time2, timx2, 1, rtyp, MPI_Max, 0, comm, ierr ) |
147 |
time2 = timx2/ Real( nrep, l_ ) |
148 |
If ( me == 0 ) Then |
149 |
! ---------------------------------------------------------------------- |
150 |
! --- Calculate Mflop rates. |
151 |
|
152 |
Call nflops( ml, mflint, mfltrn ) |
153 |
mflops = 1.0E-6_l_*( mfltrn + mflint ) |
154 |
& / timx1 |
155 |
frac = 100.0_l_*( time2/time1 ) |
156 |
Print 1010, nl, time1, mflops, time2, frac, ok |
157 |
End If |
158 |
Deallocate( carr, cari, arr1r, arr1i, arr2r, arr2i, |
159 |
& warr, wari, ur, ui ) |
160 |
! ---------------------------------------------------------------------- |
161 |
! --- Get new problem. |
162 |
|
163 |
Go To 10 |
164 |
|
165 |
! ---------------------------------------------------------------------- |
166 |
! --- End of measurements: report results. |
167 |
! Execution time is maximum time over all nodes. |
168 |
|
169 |
20 If ( me == 0 ) Print 1020 |
170 |
Call MPI_bye |
171 |
! ---------------------------------------------------------------------- |
172 |
1000 Format( /'Program mod2f computes FFT: No. of procs. = ', i3/ |
173 |
& 73('-')/ |
174 |
& ' FFT results, Radix-4 algorithm',/ |
175 |
& 73('-')/ |
176 |
& ' Length | Total Time | Speed | Comm. Time |', |
177 |
& ' Frac. | Check |'/ |
178 |
& ' N = | (sec) | (Mflop/s) | (sec) |', |
179 |
& ' (%) | OK |'/ |
180 |
& 73('-') ) |
181 |
1010 Format ( i8, '| ', g13.5, '| ', g13.5, '| ', g13.5, '| ', f7.3, |
182 |
& ' | ', i6, 1x, '|' ) |
183 |
1020 Format ( 73('-') ) |
184 |
! ---------------------------------------------------------------------- |
185 |
End Program mod2f |