Last commit for pardy.c: 826fd4b5e0c6c98a6ff5f960afbfc81e235e4dbb

Ramses van Zon [2018-03-27 22:49:55]
made macros and ternary operators better
/*
 *  pardy.c
 *
 *  Example of a PARallel molecular DYnamics simulation in C, using
 *  MPI and OpenMP.
 *
 *  Input (from standard input or filename given on command line):
 *   N = <number of particles>
 *   rho = <density of the system>
 *   T = <initial temperature (standard deviation of the velocities)>
 *   runtime = <runtime>
 *   dt = <time step>
 *   seed = <random number generator seed>
 *   equil = <equilibration time>
 *   usecells
 *   (when the latter flag is not present, cells will not be used
 *   within processes; an example input file is given below this header)
 *
 *  Output (from standard output): lines containing:
 *   time, energy E, pot. en. U, kin. en. K, temperature T, fluctuations, walltime-per-step, walltime-overall
 *
 *  Notes:
 *  - To compile: make
 *  - To run: ./pardy input.ini
 *    where input parameters are listed in the file "input.ini", and
 *    output is sent to stdout.
 *  - Appropriate values for the equilibration time are best found by
 *    doing a short run and seeing when the potential energy has reached
 *    a stationary value.
 *  - All reported energy values are divided by the number of particles N.
 *  - Fluctuations are the root mean square of E-<E>, with <E> the mean energy.
 *
 *  Ramses van Zon, 13 November 2008
 *  - In Nov 25 2009:
 *    + condensed code
 *    + OpenMP version
 *  - In Aug 2016:
 *    + Renamed to ljhpc, included mpi header (not used yet)
 *  - In Sep 2016:
 *    + Added cells to determine interacting pairs.
 *    + Added MPI
 *    + Renamed to pardy
 *    + Changed input format and modularized
 */
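
/*
 *  Example input file (a minimal sketch; the values below are merely
 *  illustrative, not recommended settings):
 *
 *      N = 8000
 *      rho = 0.8
 *      T = 1.0
 *      runtime = 10.0
 *      dt = 0.001
 *      seed = 13
 *      equil = 2.0
 *      usecells
 */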

#include <assert.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <mpi.h>
#include <ndmalloc.h>
#include "lcg.h"
#include "lattice.h"
#include "debug.h"
#include "system.h"
#include "atom.h"
#include "inifile.h"
#include "global.h"
#include "estimates.h"
#include "parallelwork.h"
#include "forces.h"
#include "cells.h"
#include "mpicommunication.h"

void sortParticlesAndComputeForces(int *N, atom_t atoms[], interaction_pairs_t* p, system_t* sys, parallel_work_t* work)
{
    /* sort particles into cells */
    cellDivide(atoms, sys);
    /* send ghost particles */
    int         bufmax = estimateMaxNGhostPerFaceNeighbor(sys->rho, sys->localL, sys->cellsize);
    MPI_Request send_requests[SENDNUM];
    MPI_Request recv_requests[RECVNUM];
    int         sendnum = 0;
    int         recvnum = 0;
    assert(ndsize(work->recv_buffer_atoms,0) >= RECVNUM);
    assert(ndsize(work->recv_buffer_atoms,1) >= bufmax);
    detect_and_send_ghost_particles(atoms, bufmax,
                                    work->recv_buffer_atoms,
                                    send_requests,
                                    recv_requests,
                                    &sendnum, &recvnum,
                                    sys, work);
    /* Part of the force calculation involving local particles
       (computation overlaps with communication) */
    findPairsFromCells(p, sys);
    sys->U = computeForces(*N, atoms, p, sys->L, false, work);
    /* Collect neighbors */
    atom_t* ghost_atoms = atoms + *N;
    size_t  ghost_count = wait_for_particles(sendnum, send_requests,
                                             recvnum, recv_requests,
                                             work->recv_buffer_atoms, ghost_atoms);
    size_t NselfPlusGhosts = *N+ghost_count;
    #ifndef NDEBUG
    size_t atomsCapacity = ndsize(atoms,0);
    assert_le(NselfPlusGhosts, atomsCapacity);
    #endif
    /* Neighbor part of force calculation; each local-ghost pair is
       also evaluated on the rank that owns the ghost, so only half of
       its potential energy is added here to avoid double counting
       when U is summed over ranks */
    findGhostPairsFromCells(p, ghost_count, ghost_atoms, *N, sys);
    sys->U += 0.5*computeForces(NselfPlusGhosts, atoms, p, sys->L, true, work);
}

void initialize(atom_t atoms[], interaction_pairs_t* p, system_t* sys, parallel_work_t* work)
{
    double scale;
    int i;
    /* generate positions */
    /* initialize lattice position generator, then pick at most Ntot (will be less if parallel) */
    lattice_t lat = init_partial_lattice(sys->L, sys->Ntot, sys->origin, sys->localL);
    for (i = 0; i < sys->Ntot; ++i) {
        long long index = make_lattice_position(&lat, &(atoms[i].rx), &(atoms[i].ry), &(atoms[i].rz));
        if (index != NO_LATTICE_POSITION) {
            atoms[i].index = index;
        } else {
            break; /* no more points */
        }
    }
    sys->N = i;
    /* report the number of particles held by each process */
    int Nall[global_size];
    MPI_Allgather(&(sys->N), 1, MPI_INT, Nall, 1, MPI_INT, MPI_COMM_WORLD);
    long long checkNtot = 0;
    for (int i = 0; i < global_size; ++i) {
        if (i_am_root)
            printf("# N[%d]=%d\n", i, Nall[i]);
        checkNtot += Nall[i];
    }
    /* sanity check */
    if (checkNtot != sys->Ntot) {
        if (i_am_root)
            fprintf(stderr, "\nPARDY PARALLEL CONSISTENCY ERROR: Combined number of particles in all processes (%lld) does not match parameter N (%lld)\n", checkNtot, sys->Ntot);
        MPI_Abort(sys->comm, 7);
    }
    /* generate momenta */
    lcg_t rng = lcg_init_rand48(sys->seed);  /* initialize random number
                                                generator used in gaussian
                                                instead of "srand48(seed)" */
    scale = sqrt(sys->T0);
    sys->K = 0;
    /* skip ahead in the random stream (4 random numbers per particle),
       so that each particle's momenta depend only on its global lattice
       index, independent of the domain decomposition */
    lcg_skip(&rng, 4*atoms[0].index);
    for (int i = 0; i < sys->N; ++i){
        atoms[i].px = scale*lcg_normal(&rng);
        atoms[i].py = scale*lcg_normal(&rng);
        atoms[i].pz = scale*lcg_normal(&rng);
        lcg_normal(&rng); /* must have even number of normal random numbers */
        sys->K += atoms[i].px*atoms[i].px + atoms[i].py*atoms[i].py + atoms[i].pz*atoms[i].pz;
        if (i<(sys->N)-1) {
            int index_diff = atoms[i+1].index - atoms[i].index;
            if (index_diff > 1)
                lcg_skip(&rng, 4*(index_diff-1));
        }
    }
    /* compute instantaneous kinetic temperature and energy */
    sys->K *= 0.5;
    sortParticlesAndComputeForces(&(sys->N), atoms, p, sys, work);
    double Utot, Ktot;
    MPI_Reduce(&(sys->U), &Utot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&(sys->K), &Ktot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    /* report results */
    if (i_am_root) {
        printf("#  step    time    E       U       K       T    <[E-<E>]^2>  walltime(s) cum.walltime(s)\n");
        printf("%7d %7.3f %7.3f %7.3f %7.3f %7.3f   .....         0.000      0.000\n",
               0, 0., (Ktot+Utot) / (sys->Ntot), Utot / (sys->Ntot), Ktot / (sys->Ntot), (2*Ktot)/(DIM*(sys->Ntot)));
    }
}
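
/* Note on the reported temperature: the T column printed here and in
   run() below is the instantaneous kinetic temperature obtained from
   equipartition (assuming the usual reduced units with kB = 1 and unit
   mass, as is conventional for Lennard-Jones codes):

       T = 2 K / (DIM * Ntot)
*/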

/* Verlet integration step */
void integrateStep(system_t* sys, atom_t atoms[], interaction_pairs_t* p, parallel_work_t* work)
{
    /* first momentum half step (forces are assumed to have been
       computed during initialization or in the previous step) */
    #pragma omp parallel for
    for (int i = 0; i < sys->N; ++i) {
        atoms[i].px += 0.5*sys->dt*atoms[i].fx;
        atoms[i].py += 0.5*sys->dt*atoms[i].fy;
        atoms[i].pz += 0.5*sys->dt*atoms[i].fz;
    }
    /* full free motion step */
    #pragma omp parallel for
    for (int i = 0; i<sys->N; ++i) {
        atoms[i].rx = atoms[i].rx + sys->dt*atoms[i].px;
        atoms[i].ry = atoms[i].ry + sys->dt*atoms[i].py;
        atoms[i].rz = atoms[i].rz + sys->dt*atoms[i].pz;
    }
    /* send atoms that have strayed outside the local domain to other
       processes (this also enforces periodic boundary conditions) */
    exchangeParticles(atoms, sys, work);
    /* positions were changed, so recompute the forces */
    sortParticlesAndComputeForces(&(sys->N), atoms, p, sys, work);
    /* final momentum half-step */
    double K2 = 0;
    #pragma omp parallel for reduction(+:K2)
    for (int i = 0; i < sys->N; ++i) {
        atoms[i].px+=0.5*sys->dt*atoms[i].fx;
        atoms[i].py+=0.5*sys->dt*atoms[i].fy;
        atoms[i].pz+=0.5*sys->dt*atoms[i].fz;
        K2 += atoms[i].px*atoms[i].px + atoms[i].py*atoms[i].py + atoms[i].pz*atoms[i].pz;
    }
    /* finish computing K */
    sys->K = 0.5*K2;
}
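
/* The integration above is the standard velocity Verlet scheme,
   written here for momenta with unit mass:

       p(t+dt/2) = p(t)      + (dt/2) f(t)
       r(t+dt)   = r(t)      + dt     p(t+dt/2)
       p(t+dt)   = p(t+dt/2) + (dt/2) f(t+dt)

   The kinetic energy K = (1/2) sum p^2 is accumulated during the final
   half-step, so no extra pass over the particles is needed. */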

/* integration and measurement */
void run(system_t* sys)
{
    const int Nmax = estimateNmax(sys->N, sys->rho, sys->localL, sys->cellsize);
    const int bufmax_exchange = estimateMaxNCrossThroughFace(sys->rho, sys->localL, sys->T0, sys->dt, 0.5*rcp);
    const int bufmax_ghost = estimateMaxNGhostPerFaceNeighbor(sys->rho, sys->localL, sys->cellsize);
    const int bufmax = bufmax_exchange>bufmax_ghost?bufmax_exchange:bufmax_ghost;
    const int maxsendcells = max3(sys->nc[0]*sys->nc[1], sys->nc[0]*sys->nc[2], sys->nc[1]*sys->nc[2]);
    const long long Npairsmax = estimateNpairsmax(sys->N, sys->rho, sys->localL, sys->cellsize);
    interaction_pairs_t p;
    parallel_work_t work;
    atom_t*   atoms  = ndmalloc(sizeof(atom_t), 1, Nmax);
    int       numSteps = (int)(sys->runtime/sys->dt+0.5);
    int       equilSteps = (int)(sys->equil/sys->dt+0.5);
    int       count;                 /* counts time steps */
    int       numPoints = 0;         /* counts measurements */
    double    sumE = 0;              /* total energy accumulated over steps */
    double    sumE2 = 0;             /* total energy squared accumulated */
    double    avgE, avgE2, fluctE;   /* average energy, its square, and its fluctuations */
    interaction_pairs_alloc(&p, Npairsmax);
    work_alloc(&work, Nmax, SENDNUM, bufmax, maxsendcells);
    /* draw initial conditions */
    initialize(atoms, &p, sys, &work);
    double  walltimestart = MPI_Wtime();
    double  walltimelast = walltimestart;
    /* perform equilibration steps */
    for (count = 0; count < equilSteps; ++count) {
        integrateStep(sys, atoms, &p, &work);
        double Utot, Ktot;
        MPI_Reduce(&(sys->U), &Utot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&(sys->K), &Ktot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (i_am_root) {
            double t = MPI_Wtime();
            printf("%7d %7.3f %7.3f %7.3f %7.3f %7.3f   .....    %10.3lf %10.3lf\n",
                   count+1, (count+1)*sys->dt, (Ktot+Utot)/sys->Ntot, Utot/sys->Ntot, Ktot/sys->Ntot, (2*Ktot)/(DIM*sys->Ntot), t-walltimelast, t-walltimestart );
            walltimelast = t;
        }
    }
    /* perform rest of time steps */
    for (count = equilSteps; count<numSteps; ++count) {
        integrateStep(sys, atoms, &p, &work);
        double Utot, Ktot;
        MPI_Reduce(&(sys->U), &Utot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&(sys->K), &Ktot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (i_am_root) {
            double Etot = Ktot + Utot;
            sumE  += Etot;
            sumE2 += Etot * Etot;
            ++numPoints;
            avgE   = sumE/numPoints;
            avgE2  = sumE2/numPoints;
            if (avgE2 > avgE*avgE) {
                fluctE = sqrt(avgE2-avgE*avgE);
            } else {  /* correct for round-off */
                fluctE = 0.0;
            }
            double t = MPI_Wtime();
            printf("%7d %7.3f %7.3f %7.3f %7.3f %7.3f %10.2e %10.3lf %10.3lf\n",
                   count+1, (count+1)*sys->dt, Etot/sys->Ntot, Utot/sys->Ntot, Ktot/sys->Ntot, (2*Ktot)/(DIM*sys->Ntot), fluctE/sys->Ntot, t-walltimelast, t-walltimestart);
            walltimelast = t;
        }
    }
    interaction_pairs_free(&p);
    work_free(&work);
    ndfree(atoms);
    ndfree(sys->n_in_cell);
    ndfree(sys->start_of_cell);
}
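
/* The fluctuation column computed in run() above uses the variance
   identity: with <E> = sumE/numPoints and <E^2> = sumE2/numPoints,

       fluctE = sqrt(<E^2> - <E>^2) = rms(E - <E>),

   accumulated over the post-equilibration steps only and clamped to
   zero when round-off would make the variance negative. */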

/* program start */
int main(int argc, char* argv[])
{
    /* start mpi with thread support */
    int required_support_for_threads = MPI_THREAD_FUNNELED, provided_support_for_threads;
    MPI_Init_thread(&argc, &argv, required_support_for_threads, &provided_support_for_threads);
    /* thread-support levels are ordered, so require at least the
       requested level */
    assert_le(required_support_for_threads, provided_support_for_threads);
    /* who am i and how many of us are there? */
    MPI_Comm_size(MPI_COMM_WORLD, &global_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &global_rank);
    /* new mpi data types */
    define_MPI_ATOM();
    define_MPI_PARAMETERS();
    /* debug? */
    ENTERDEBUGGER;
    /* report parallel setup */
    if (i_am_root) {
        printf("#");
        if (global_size != 1)
            printf("Number of processes = %d, ", global_size);
        /* get number of threads available:  */
        #pragma omp parallel default(none)
        #pragma omp single
        printf("Number of threads = %d\n", omp_get_num_threads());
    }
    /* read input parameters on the root process and broadcast them */
    system_t sys;
    if (i_am_root) {
        FILE* file = ((argc>1)?fopen(argv[1],"r"):stdin);
        if (file == NULL)
            MPI_Abort(MPI_COMM_WORLD, 1);
        key_value_table_t* ini = calloc(1,sizeof(*ini));
        kvt_read(ini, file);
        sys.Ntot     = kvt_lookup_long_long(ini, "N");
        sys.rho      = kvt_lookup_double(ini, "rho");
        sys.T0       = kvt_lookup_double(ini, "T");
        sys.runtime  = kvt_lookup_double(ini, "runtime");
        sys.dt       = kvt_lookup_double(ini, "dt");
        sys.seed     = kvt_lookup_long(ini, "seed");
        sys.equil    = kvt_lookup_double(ini, "equil");
        sys.usecells = kvt_lookup_entry(ini, "usecells") != NULL;
        sys.L        = cbrt(sys.Ntot/sys.rho);
        if (argc > 1)  /* file is known to be non-NULL here */
            fclose(file);
        free(ini);
        printf("#Ntot=%lld rho=%f L=%f T=%f runtime=%f dt=%f seed=%ld equil=%f usecells=%d\n",
               sys.Ntot,sys.rho,sys.L,sys.T0,sys.runtime,sys.dt,sys.seed,sys.equil,sys.usecells);
    }
    MPI_Bcast(&sys, 1, MPI_PARAMETERS, 0, MPI_COMM_WORLD);
    sys.comm = MPI_COMM_WORLD;
    sys.nprocs = global_size;
    sys.rank = global_rank;
    if (sanity_check(&sys, rc, i_am_root)) {
        distribute(&sys);
        run(&sys);
        MPI_Finalize();
    } else {
        MPI_Finalize();
        return 1;
    }
    return 0;
}