/*****************************************************************************/
/* Laplace Equation Numerically Solved by Jacobi Iteration Method on PSC T3E */ 
/* Temperature T is initially Tinit everywhere except at boundaries.         */
/* Revision for Correcting Errors and Stopping for Convergence.              */
/* Laplace Code in row panel block for C with source in cpgm.c and data      */
/* Fall00: Revised for MPT (MPI) upgrade: interlace Sends & Recvs            */
/* Fall00: Correction for New MPI from MPT (Message Passsing ToolKit), NPACI */
/* Fall00: Thanks to NPACI Consultant Kent MilFeld for MPT Changes Fix.      */
/*                                                                           */
/*      T=300.0+100*x/LX                      0<x<LX                         */
/*         _________                LY  ___________________   Y,NR+1-I       */
/*         |       |                   |*      VPE0        |   |             */
/*         |       |              3LY/4|___________________|   |             */
/* T=100.0 |T=Tinit| T=100.0    0<y<LY |       VPE1        |   |             */
/*+200*y/LY|initial| +300*y/LY     LY/2|___________________|   |             */
/*         |    -ly|                   |       VPE2        |   |             */
/*         |       |               LY/4|___________________|   |             */
/*         |       |                   |       VPE3        |   |             */
/*         |_______|                0.0|___________________|   v______>X,J   */
/*          T=100.0                   0.0                  LX                */
/* Use Central Differencing Method with 4 PEs.                               */
/* Rectangular Domain decomposed into 4 vertical panels for Rowwise C.  */
/* Each process gets its own copy of the code to execute,                    */
/* with parallel synchronization handled by MPI in mpprun environment.       */
/* Each process only has subgrid for VPE=0,1,2,...,npes-1=3 here,            */
/* where VPE=VirtualOrUserProcessingElement.                                 */
/* * Note: Geometric Origin is LowerLeft, But Array T[0][0] is UpperLeft     */
/* Note: Array T[i][j] is Only Local to Each PE and its Panel!               */
/* Each Processor works sub grid and then sends its Boundaries to neighbours */
/* Artificial, Interior Boundary OverLap, Using N = NR4, NumberYRows, Local  */
/*             and p = npes, NumberVirtualProcessors:                        */
/*           0 1  ...  N N+1     0 1  ...  N N+1     0 1  ...  N N+1 (local) */
/*     VPE0  | - - - - - |  VPE2 | - - - - - | ....  | - - - - - |           */
/* | - - - - - |  VPE1 | - - - - - |  VPE3 | - ....  - | VPE[p-1]            */
/* 0 1  ...  N N+1     0 1  ...  N N+1     0 1  ...  N N+1           (local) */
/*                                                                           */
/* 0 1       N N       2 2       3 3       4 4                 q q  (global  */
/*             +       * *       * *       * *                 * *    node   */
/*             1       N N       N N       N N                 N N numbering)*/
/*                       +         +         +                   +   (q=p-1) */
/*                       1         1         1                   1           */
/*97: FH/MCS572F97  interactive run suggestions:                             */
/* compile: cc -O3 -h scalar2 -h report=isvf -Xm -o [executable] [source].c &*/
/* execute: mpprun -n4 [executable] < [data] >& [source].output &            */
/* Data input for maximum number of iterations niter: [data]                 */
/* Try ps Process Status Command While Running & Should See 4 Jobs Executing!*/
/* Try jobs Jobs Status Command to Check for Running, Exit or Done.          */
/*****************************************************************************/
#define NPES     4                       /* Number of Processing Elements    */
#define NYI    201                       /* Number of Y-Intervals            */
#define NXI    201                       /* Number of X-Intervals            */
#define NC       NXI-1                   /* Number of Interior Cols          */
#define NR       NYI-1                   /* Number of Interior Rows          */
#define NR4   NYI/NPES                   /* Number of Local Interior Rows    */
#define MXITER    5000                   /* Max num of Iterations            */
#define DOWN     100                     /* MsgTag for messages down         */
#define UP       101                     /* MsgTag for messages up           */
#define ROOT     0                       /* Root PE                          */
#define MAX(x,y) ( ((x) > (y)) ? x : y ) /* Define Max Function              */
#define LY  1.e+0                        /* Y-Length                         */
#define LX  1.e+0                        /* X-Length                         */
#define HY  LY/NYI                       /* Y-StepSize                       */
#define HX  LX/NXI                       /* X-StepSize                       */
#define stoptol  0.5E-2                  /* stopping tolerance: Convergence  */
#define Tinit    225.0                   /* Starting T Iteration             */
#define NC2      NXI/2                   /* Central Root Sample Print Column */
#define NR0      NR4/25                 /* First  Root Sample Print Row     */
#define NRE      NR4-NR0+1              /* End    Root Sample Print Row     */
#define NRL2     NR4/2                  /* Middle Root Sample Print Row     */
#define DNR      NRL2-NR0                /* Step   Root Sample Print Row     */
#define NCO0     0                       /* First All Sample Print Column    */
#define NCOE     NXI                     /* End   All Sample Print Column:unu*/
#define DNCO     NXI/20                  /* Step  All Sample Print Column    */
#define NRO0     0                       /* First All Sample Print Row       */
#define NROE     NR4                    /* End   All Sample Print Row:unused*/
#define DNRO     NR4/5                  /* Step  All Sample Print Row       */
/*****************************************************************************/
#include <stdio.h>
#include <math.h>
#include <mpi.h>   /* MPI Step: Include MPI Fortran Header Defns. and Decls. */


void initialize( double t[NR4+2][NC+2] );
void set_bcs   ( double t[NR4+2][NC+2], int mype, int npes );

int main( int argc, char **argv ){      /* Main Procedure                    */
  
  int        npes;                      /* Actual Number of PEs       */
  int        mype;                      /* My PE number               */
  int        stat;                      /* Error Status/Return Code   */
  int        niter;                     /* Number of Iterations       */
  int        nstop;                     /* Interation Stop Parameter  */
  MPI_Status status;        /* MPI Step: Declare status structure     */

  double     t[NR4+2][NC+2], told[NR4+2][NC+2];  /* Temp. Arrays: Old & New  */
  double     dt;                      /* Delta t                             */
  double     dtg;                     /* Delta t global                      */
  double     starttime,stoptime,timerres,totalwalltime;/* Wall Time Variable */
  int        i, j, k, l;
  int        iter;                    /* Current Number of Iterations        */
/* MPI Step: Initialize MPI, giving each PE copy of code;                    */
/* MPI required subroutine using programming model:SPMD=SglPgmMultiData.     */
  MPI_Init(&argc, &argv);
/* MPI Step: Start MPI Wall Timer and get Tick Resolution:                   */
      starttime = MPI_Wtime();
      timerres = MPI_Wtick();
/* MPI Step: Get T3E assigned npes=#PEs or Communication_size:               */
  MPI_Comm_size(MPI_COMM_WORLD, &npes );
/*        where MPI_COMM_WORLD=MPI_Communication_World=UserSetOfPEs.         */
/* MPI Step: Get local (this PE) mype=PE#, also called Communication_rank:   */
  MPI_Comm_rank(MPI_COMM_WORLD, &mype );
  if ( npes != NPES ){                /* Currently FoolProofCheck Hardcoded  */
    MPI_Finalize();
    if( mype == 0 )
      fprintf(stderr, "The example is only for %d PEs\n", NPES);
    exit(1);
  }
/* Laplace Step: Set Local Initial Iteration Conditions for Laplace Equation:*/
  initialize(t);                  
/* Laplace Step: Set Local Boundary Conditions for Laplace Equation:         */
  set_bcs(t, mype, npes);         /* Set the Boundary values   */
/* Initialize Iteration Stopping Parameter                                   */
      nstop = 0;
/* Laplace Step: Root Reads NumberIterations from "data" Input File:         */
  if( mype == ROOT ){
       scanf("%d", &niter); /*  */
    if( niter>MXITER ) niter = MXITER;
/* Correct Root Local Interior Point Count so Sum= NC= #TotalInteriorPoints  */
  }
/* MPI Step: All VPES Get Collective Communication of niter from Root:       */
  MPI_Bcast(&niter, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
/*      where count is 1 and datatype is MPI_INT.                        */
/* Laplace Step: Save Old Iteration:                                         */
  for( i=0; i<=NR4+1; i++ )       /* Copy the values into told */
    for( j=0; j<=NC+1; j++ )
      told[i][j] = t[i][j];
/*-------------------------------------------------------------------------*
 |  Do Computation on Local Sub-grid for Niter 5-Star Jacobi iterations.   |
 |    Corrected for Arbitrary (LY,LX)Lengths with (HY,HX)Steps.            |
 *-------------------------------------------------------------------------*/
  for( iter=1; iter<=niter; iter++ ) {
    for( i=1; i<=NR4; i++ )
      for( j=1; j<=NC; j++ )
        t[i][j] = 0.5 * ((told[i+1][j] + told[i-1][j])*HX*HX
                + (told[i][j+1] + told[i][j-1])*HY*HY)/(HX*HX+HY*HY);
/* Get dt=LocalMaxAbsChange for this PE and Save Old Temperatures:         */
    dt = 0.;
    for( i=1; i<=NR4; i++ )       
      for( j=1; j<=NC; j++ ){
        dt = MAX( fabs(t[i][j]-told[i][j]), dt);
        told[i][j] = t[i][j];
      }
/* MPI Step: mype Sends Down Boundary Data to Down Neighbor mype+1,         */
/*   using Point_to_Point Send Communication for mype=0,...,npes-2:         */
    if( mype < npes-1 )    /*   Sending Down; Only npes-1 do this           */
      MPI_Send( &t[NR4][1],  NC, MPI_DOUBLE, mype+1, DOWN, MPI_COMM_WORLD);
/* starting at pointer &t[NR4][1] for NC cols using datatype MPI_DOUBLE.    */
/*                                                                          */
/* MPI Step: mype Receives its UP Boundary Data from Up Neighbor mype-1     */
/* (here using wildcard name MPI_ANY_SOURCE; but note MsgTag=Down),         */
/* using Point_to_Point Receive Communication for mype=1,...,npes-1:        */
    if( mype != 0 )        /*   Receive  Msg DOWN mype-1 above              */
      MPI_Recv( &t[0][1], NC, MPI_DOUBLE, MPI_ANY_SOURCE, DOWN,
                             MPI_COMM_WORLD, &status);
/* starting at pointer &t[0][1] for NC cols using datatype MPI_DOUBLE,      */
/*                                                                          */
/* MPI Step: mype Sends Up Boundary Data to Up Neighbor mype-1,             */
/* using Point_to_Point Send Communication for mype=1,...,npes-1:           */
    if( mype != 0 )        /*   Sending Msg Up  ; Only mype+1 does this     */
      MPI_Send( &t[1][1],    NC, MPI_DOUBLE, mype-1, UP,   MPI_COMM_WORLD);
/* starting at pointer &t[1][1] for NC cols using datatype MPI_DOUBLE.      */
/* with Additional Vector Structure status Argument.                        */
/*                                                                          */
/* MPI Step: mype Receives Down Boundary Data from Down Neighbor mype+1     */
/* (here using wildcard name MPI_ANY_SOURCE; but note MsgTag=UP),           */
/* using Point_to_Point Receive Communication for mype=1,...,npes-1:        */
    if( mype != npes-1 )        /*   Receive Msg Up from mype+1 below       */
      MPI_Recv( &t[NR4+1][1],NC, MPI_DOUBLE, MPI_ANY_SOURCE, UP  ,
                                MPI_COMM_WORLD, &status);
/* starting at address &t[NR4+1][1] for NC rows using datatype MPI_DOUBLE,  */
/* with Additional Vector Structure status Argument.                        */
/*                                                                          */
/* MPI Step: mype Sends to RootPE LocalMaxAbsChange=dt using Collective     */
/* (Global) Reduction Maximum Function MPI_MAX with Result Eventually       */
/* Collected in GlobalMaxAbsChange=dtg of count 1 and datatype MPI_DoUBLE:  */
    MPI_Reduce(&dt, &dtg, 1, MPI_DOUBLE, MPI_MAX, ROOT, MPI_COMM_WORLD);
/* Root Print Selected Values at Every 100th Iteration:                     */
/* Important that Writes are in Serial Sections on DMP T3E:                 */
/* Root Also Check Stopping Criteria Tolerance                              */
    if( mype == 0 ) {
       if( (iter%100) == 0 ) {
          printf("\nMYPE = %4d; ITER = %5d; GlobalMaxAbsChange:dtg = %10.3e\n", 
             mype, iter, dtg); 
/* VPE=0 Writes Out a Sample Central Values:                                */
 printf("MYPE = %4d; ITER = %5d; t[%3d:%3d:%3d][%3d] = %9.4f %9.4f %9.4f\n",
             mype, iter, NR0, NRL2, NRE, DNR,
	     t[NR0][NC2], t[NRL2][NC2], t[NRE][NC2]); 
       }

/* If Iteration Stopping, set Stopping Flag:                                */
       if(dtg < stoptol) nstop = 1;
    }
/* Global Call Broadcast of Possible Stopping Value By All.                 */
/*                                                                          */
      MPI_Bcast(&nstop, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
/*                                                                          */
/* MPI Step:  Barrier is Final Call for all PEs to Stop and Wait for Rest   */
/* in MPI_COMM_WORLD to Finish Before Continuing on to Next Iteration:      */
    MPI_Barrier( MPI_COMM_WORLD );
/* (Note old version also required the generic barrier _barrier();)         */
/*                                                                          */
/*  If Convergence for Small dtg, Then Break Out of For Loop!               */
    if(nstop == 1) break;
/*                                                                          */
  }  /* End of Big Iteration Loop */
/* Stopping Criterion Stop:                                                 */
      if( mype == 0 )  {
/* MPI Step: Stop MPI Wall Timer:                                           */
/*                                                                          */
       stoptime = MPI_Wtime();
/*                                                                          */
       totalwalltime = stoptime - starttime;
       printf("\nStoptol=%10.3e; GlobalMaxAbsChange:dtg=%10.3e;",stoptol,dtg);
 printf("\nMYPE = %4d; TotalWallTime = %12.4e secs; WallTick = %12.4e secs;\n", 
        mype, totalwalltime, timerres);                   
       if(dtg>stoptol){
          printf("\nMore Iterations Needed Since dtg > stoptol;");}
       else {
          printf("\nJacobi Iterations Converged Since dtg < stoptol;");}
     }
/* Print Final Sample Output Temperatures for Each mype's Panel:            */
  printf("\nSample Output: MYPE =%4d; ITER =%5d\n J/I ",mype, iter);
  for( k=0; k<=5; k++ ){
     i=NRO0+ k*DNRO;
     printf("%7d",i);}
     printf("%7d",NR4+1);
  for( l=0; l<=20; l++ ){
     j=NCO0+ l*DNCO;
     printf("\n%4d ",j);
     for( k=0; k<=5; k++ ){
     i=NRO0+ k*DNRO;
     printf("%7.2f",t[i][j]);}
     printf("%7.2f",t[NR4+1][j]);}
/* MPI Step: MPI Required Finalization Step:                                */
/*                                                                          */
  MPI_Finalize();
/*                                                                          */
}    /* End of Main Procedure  */

/********************************************************************
 *                                                                  *
 * Initialize all the values to Tinit as a starting iteration       *
 *                                                                  *
 ********************************************************************/

void initialize( double t[NR4+2][NC+2] ){

  int        i, j, iter;
  
  for( i=0; i<=NR4+1; i++ )        /* Initialize Iterations */
    for ( j=0; j<=NC+1; j++ )
      t[i][j] = Tinit;
}

/********************************************************************
 *                                                                  *
 * Set the values at the boundary.  Values at the boundary do not   *
 * Change through out the execution of the program                  *
 *                                                                  *
 ********************************************************************/

void set_bcs( double t[NR4+2][NC+2], int mype, int npes ){

  int i, j;

/* Reset Parameters for LocalInteriorRows to Preserve #GlobalInteriorPoints  */

  if( mype == 0 ) {                  
    for( j=1; j<NC+1; j++ )
      t[0][j] = 300.0 + 100.0*j*HX;   /* Top boundary */
      }

  if( mype == npes-1 ) 
    for( j=1; j<NC+1; j++ )
      t[NR4+1][j] = 100.0;              /* Bottom boundary */

  for( i=0; i<=NR4+1; i++ ) {      
    t[i][0]    = 300.0 - 200.0*(i+NR4*mype)*HY/LY; /* Set Left Boundary     */
/*                                                                          */
    t[i][NC+1] = 400.0 - 300.0*(i+NR4*mype)*HY/LY; /* Set Right Boundary    */
/* Note that 300.0=100.0+200*LY/LY for LBC; 400.0=100.0*+200LY/LY for RBC   */
  }
}
