program test

  implicit none

! Show cache influence on code performance using loop fusion.
! "bad"  version: two separate loop nests - the first fills a, the second
!                 reads a again to compute d.
! "good" version: a single fused loop nest - a(i,j) is computed and used
!                 for d(i+1,j) in the same iteration.
! Both versions run the inner loop over the first array index.

! Build/run notes (source file name as used in the original notes):

! module load intel
! ifort 1-loop-interch.f90 -O0 -g -o loop-interch
! ifort 1-loop-interch.f90 -o loop-interch
! ifort 1-loop-interch.f90 -O3 -o loop-interch
! ./loop-interch
! time ./loop-interch

! ifort 1-loop-interch.f90 -O0 -g -pg -o loop-interch
! gprof ./loop-interch gmon.out > loop.gprof
! gprof -l ./loop-interch gmon.out > loop.gprof

! #########################################################

! HOMEPC
! gfortran 1-loop-interch.f90 -O0 -g -o loop-interch
! gfortran 1-loop-interch.f90 -o loop-interch
! gfortran 1-loop-interch.f90 -O3 -o loop-interch
! time ./loop-interch

! gfortran 1-loop-interch.f90 -O0 -g -pg -o loop-interch
! gprof ./loop-interch gmon.out > loop.gprof
! gprof -l ./loop-interch gmon.out > loop.gprof

  ! Integer kind with at least 18 decimal digits (64-bit on common
  ! compilers). Passing counters of this kind to system_clock typically
  ! yields a finer tick rate than default integers and makes counter
  ! wrap-around during a run impractical.
  integer, parameter :: i8 = selected_int_kind(18)

  ! Matrix order (each array is n x n) and number of timed repetitions.
  integer, parameter :: n = 14000
  integer, parameter :: niter = 2

  real, allocatable :: a(:,:), b(:,:), c(:,:), d(:,:)
  real :: alpha
  integer :: i, j, t, ierr
  integer(i8) :: t1, t2, tr

  alpha = sqrt(1.)

  ! Four n x n default-real arrays; report a clear message if the
  ! allocation cannot be satisfied.
  allocate(a(n, n), b(n, n), c(n, n), d(n, n), stat=ierr)
  if (ierr /= 0) then
     write(*,*) "allocation failed, stat = ", ierr
     stop 1
  endif

  ! Initialise every array, including d, before starting the clock so
  ! that the cost of touching the memory for the first time is not
  ! charged to the first timed variant only. This also defines d(1,:),
  ! which neither variant writes.
  a = 0.
  b = 1.5
  c = 2.34
  d = 0.

!!!!!!!!!!!!!!!!!!!!!!!!! BAD START !!!!!!!!!!!!!!!!!!!!!!!
  write(*,*) "bad start - two separate loops"
  call system_clock(t1, tr)
  do t = 1, niter

     ! First pass: fill a.
     do j = 1, n
        do i = 1, n
           a(i,j) = b(i,j) + c(i,j) + alpha
        enddo
     enddo

     ! Second pass: read a again to build d.
     do j = 1, n
        do i = 2, n
           d(i,j) = sqrt(a(i-1,j))
        enddo
     enddo

  enddo
  call system_clock(t2)
  write(*,*) "bad end - time ", real(t2-t1)/real(tr)
!!!!!!!!!!!!!!!!!!!!!!!!! BAD END !!!!!!!!!!!!!!!!!!!!!!!

  ! Never true at run time (alpha is 1). Keeps a reference to the arrays
  ! after the timed region, intended to discourage the optimizer from
  ! discarding the loops above as dead code.
  if (alpha < -1) write(*,*) a, b, c, d

  ! Same starting state for the second variant.
  a = 0.
  b = 1.5
  c = 2.34
  d = 0.

!!!!!!!!!!!!!!!!!!!!!!!!! GOOD START !!!!!!!!!!!!!!!!!!!!!!!
  write(*,*) "good start - loop fusion"
  call system_clock(t1, tr)
  do t = 1, niter

     do j = 1, n
        ! Fused form of the two passes above: d(i+1,j) = sqrt(a(i,j)) for
        ! i = 1..n-1 is the same assignment as d(i,j) = sqrt(a(i-1,j))
        ! for i = 2..n, shifted so a(i,j) is used right after it is set.
        do i = 1, n - 1
           a(i,j) = b(i,j) + c(i,j) + alpha
           d(i+1,j) = sqrt(a(i,j))
        enddo
        ! Last row of a has no matching d element; finish it separately.
        a(n,j) = b(n,j) + c(n,j) + alpha
     enddo

  enddo
  call system_clock(t2)
  write(*,*) "good end - time ", real(t2-t1)/real(tr)
!!!!!!!!!!!!!!!!!!!!!!!!! GOOD END !!!!!!!!!!!!!!!!!!!!!!!

  ! Same keep-alive reference as after the first variant.
  if (alpha < -1) write(*,*) a, b, c, d

  deallocate(a)
  deallocate(b)
  deallocate(c)
  deallocate(d)

end program test
