pdelab/master/repartition_8hh_source.html

// SPDX-FileCopyrightText: Copyright © DUNE Project contributors, see file LICENSE.md in module root

// SPDX-License-Identifier: LicenseRef-GPL-2.0-only-with-DUNE-exception

// -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-

// vi: set et ts=4 sw=2 sts=2:

#ifndef DUNE_ISTL_REPARTITION_HH

#define DUNE_ISTL_REPARTITION_HH


#include <cassert>

#include <map>

#include <utility>

#include <cmath>


#if HAVE_PARMETIS

// Explicitly use C linkage as scotch does not extern "C" in its headers.

// Works because ParMETIS/METIS checks whether compiler is C++ and otherwise

// does not use extern "C". Therefore no nested extern "C" will be created.

extern "C"

{

#include <parmetis.h>

}

#endif


#include <dune/common/timer.hh>

#include <dune/common/enumset.hh>

#include <dune/common/stdstreams.hh>

#include <dune/common/parallel/mpitraits.hh>

#include <dune/common/parallel/communicator.hh>

#include <dune/common/parallel/indexset.hh>

#include <dune/common/parallel/indicessyncer.hh>

#include <dune/common/parallel/remoteindices.hh>

#include <dune/common/rangeutilities.hh>


#include <dune/istl/owneroverlapcopy.hh>

#include <dune/istl/paamg/graph.hh>


namespace Dune

{

  namespace Metis

  {

    // Explicitly specify a real_t and idx_t for older (Par)METIS versions that do not

    // provide these typedefs

#if HAVE_PARMETIS && defined(REALTYPEWIDTH)

    using real_t = ::real_t;

#else

    using real_t = float;

#endif


#if HAVE_PARMETIS && defined(IDXTYPEWIDTH)

    using idx_t = ::idx_t;

#elif HAVE_PARMETIS && defined(HAVE_SCOTCH_NUM_TYPE)

    using idx_t = SCOTCH_Num;

#elif HAVE_PARMETIS

    using idx_t = int;

#else

    using idx_t = std::size_t;

#endif

  }


#if HAVE_MPI

  template<class G, class T1, class T2>

  void fillIndexSetHoles(const G& graph, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm)

  {

    typedef typename Dune::OwnerOverlapCopyCommunication<T1,T2>::ParallelIndexSet IndexSet;

    typedef typename IndexSet::LocalIndex::Attribute Attribute;


    IndexSet& indexSet = oocomm.indexSet();

    const typename Dune::OwnerOverlapCopyCommunication<T1,T2>::GlobalLookupIndexSet& lookup =oocomm.globalLookup();


    std::size_t sum=0, needed = graph.noVertices()-indexSet.size();

    std::vector<std::size_t> neededall(oocomm.communicator().size(), 0);


    MPI_Allgather(&needed, 1, MPITraits<std::size_t>::getType() , &(neededall[0]), 1, MPITraits<std::size_t>::getType(), oocomm.communicator());

    for(int i=0; i<oocomm.communicator().size(); ++i)

      sum=sum+neededall[i];   // MAke this for generic


    if(sum==0)

      // Nothing to do

      return;


    //Compute Maximum Global Index

    T1 maxgi=0;

    auto end = indexSet.end();

    for(auto it = indexSet.begin(); it != end; ++it)

      maxgi=std::max(maxgi,it->global());


    //Process p creates global indices consecutively

    //starting atmaxgi+\sum_{i=1}^p neededall[i]

    // All created indices are owned by the process

    maxgi=oocomm.communicator().max(maxgi);

    ++maxgi;  // Start with the next free index.


    for(int i=0; i<oocomm.communicator().rank(); ++i)

      maxgi=maxgi+neededall[i];   // TODO: make this more generic


    // Store the global index information for repairing the remote index information

    std::map<int,SLList<std::pair<T1,Attribute> > > globalIndices;

    storeGlobalIndicesOfRemoteIndices(globalIndices, oocomm.remoteIndices());

    indexSet.beginResize();


    for(auto vertex = graph.begin(), vend=graph.end(); vertex != vend; ++vertex) {

      const typename IndexSet::IndexPair* pair=lookup.pair(*vertex);

      if(pair==0) {

        // No index yet, add new one

        indexSet.add(maxgi, typename IndexSet::LocalIndex(*vertex, OwnerOverlapCopyAttributeSet::owner, false));

        ++maxgi;

      }

    }


    indexSet.endResize();


    repairLocalIndexPointers(globalIndices, oocomm.remoteIndices(), indexSet);


    oocomm.freeGlobalLookup();

    oocomm.buildGlobalLookup();

#ifdef DEBUG_REPART

    std::cout<<"Holes are filled!"<<std::endl;

    std::cout<<oocomm.communicator().rank()<<": "<<oocomm.indexSet()<<std::endl;

#endif

  }


  namespace

  {


    class ParmetisDuneIndexMap

    {

    public:

      template<class Graph, class OOComm>

      ParmetisDuneIndexMap(const Graph& graph, const OOComm& com);

      int toParmetis(int i) const

      {

        return duneToParmetis[i];

      }

      int toLocalParmetis(int i) const

      {

        return duneToParmetis[i]-base_;

      }

      int operator[](int i) const

      {

        return duneToParmetis[i];

      }

      int toDune(int i) const

      {

        return parmetisToDune[i];

      }

      std::vector<int>::size_type numOfOwnVtx() const

      {

        return parmetisToDune.size();

      }

      Metis::idx_t* vtxDist()

      {

        return &vtxDist_[0];

      }

      int globalOwnerVertices;

    private:

      int base_;

      std::vector<int> duneToParmetis;

      std::vector<int> parmetisToDune;

      // range of vertices for processor i: vtxdist[i] to vtxdist[i+1] (parmetis global)

      std::vector<Metis::idx_t> vtxDist_;

    };


    template<class G, class OOComm>

    ParmetisDuneIndexMap::ParmetisDuneIndexMap(const G& graph, const OOComm& oocomm)

      : duneToParmetis(graph.noVertices(), -1), vtxDist_(oocomm.communicator().size()+1)

    {

      int npes=oocomm.communicator().size(), mype=oocomm.communicator().rank();


      typedef typename OOComm::OwnerSet OwnerSet;


      int numOfOwnVtx=0;

      auto end = oocomm.indexSet().end();

      for(auto index = oocomm.indexSet().begin(); index != end; ++index) {

        if (OwnerSet::contains(index->local().attribute())) {

          numOfOwnVtx++;

        }

      }

      parmetisToDune.resize(numOfOwnVtx);

      std::vector<int> globalNumOfVtx(npes);

      // make this number available to all processes

      MPI_Allgather(&numOfOwnVtx, 1, MPI_INT, &(globalNumOfVtx[0]), 1, MPI_INT, oocomm.communicator());


      int base=0;

      vtxDist_[0] = 0;

      for(int i=0; i<npes; i++) {

        if (i<mype) {

          base += globalNumOfVtx[i];

        }

        vtxDist_[i+1] = vtxDist_[i] + globalNumOfVtx[i];

      }

      globalOwnerVertices=vtxDist_[npes];

      base_=base;


#ifdef DEBUG_REPART

      std::cout << oocomm.communicator().rank()<<" vtxDist: ";

      for(int i=0; i<= npes; ++i)

        std::cout << vtxDist_[i]<<" ";

      std::cout<<std::endl;

#endif


      // Traverse the graph and assign a new consecutive number/index

      // starting by "base" to all owner vertices.

      // The new index is used as the ParMETIS global index and is

      // stored in the vector "duneToParmetis"

      auto vend = graph.end();

      for(auto vertex = graph.begin(); vertex != vend; ++vertex) {

        const typename OOComm::ParallelIndexSet::IndexPair* index=oocomm.globalLookup().pair(*vertex);

        assert(index);

        if (OwnerSet::contains(index->local().attribute())) {

          // assign and count the index

          parmetisToDune[base-base_]=index->local();

          duneToParmetis[index->local()] = base++;

        }

      }


      // At this point, every process knows the ParMETIS global index

      // of it's owner vertices. The next step is to get the

      // ParMETIS global index of the overlap vertices from the

      // associated processes. To do this, the Dune::Interface class

      // is used.

#ifdef DEBUG_REPART

      std::cout <<oocomm.communicator().rank()<<": before ";

      for(std::size_t i=0; i<duneToParmetis.size(); ++i)

        std::cout<<duneToParmetis[i]<<" ";

      std::cout<<std::endl;

#endif

      oocomm.copyOwnerToAll(duneToParmetis,duneToParmetis);

#ifdef DEBUG_REPART

      std::cout <<oocomm.communicator().rank()<<": after ";

      for(std::size_t i=0; i<duneToParmetis.size(); ++i)

        std::cout<<duneToParmetis[i]<<" ";

      std::cout<<std::endl;

#endif

    }

  }


  struct RedistributeInterface

    : public Interface

  {

    void setCommunicator(MPI_Comm comm)

    {

      communicator_=comm;

    }

    template<class Flags,class IS>

    void buildSendInterface(const std::vector<int>& toPart, const IS& idxset)

    {

      std::map<int,int> sizes;


      for(auto i=idxset.begin(), end=idxset.end(); i!=end; ++i)

        if(Flags::contains(i->local().attribute()))

          ++sizes[toPart[i->local()]];


      // Allocate the necessary space

      for(auto i=sizes.begin(), end=sizes.end(); i!=end; ++i)

        interfaces()[i->first].first.reserve(i->second);


      //Insert the interface information

      for(auto i=idxset.begin(), end=idxset.end(); i!=end; ++i)

        if(Flags::contains(i->local().attribute()))

          interfaces()[toPart[i->local()]].first.add(i->local());

    }


    void reserveSpaceForReceiveInterface(int proc, int size)

    {

      interfaces()[proc].second.reserve(size);

    }

    void addReceiveIndex(int proc, std::size_t idx)

    {

      interfaces()[proc].second.add(idx);

    }

    template<typename TG>

    void buildReceiveInterface(std::vector<std::pair<TG,int> >& indices)

    {

      std::size_t i=0;

      for(auto idx=indices.begin(); idx!= indices.end(); ++idx) {

        interfaces()[idx->second].second.add(i++);

      }

    }


  };


  namespace

  {

    template<class GI>

    void createSendBuf(std::vector<GI>& ownerVec, std::set<GI>& overlapVec, std::set<int>& neighbors, char *sendBuf, int buffersize, MPI_Comm comm) {

      // Pack owner vertices

      std::size_t s=ownerVec.size();

      int pos=0;

      if(s==0)

        ownerVec.resize(1); // otherwise would read beyond the memory bound

      MPI_Pack(&s, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &pos, comm);

      MPI_Pack(&(ownerVec[0]), s, MPITraits<GI>::getType(), sendBuf, buffersize, &pos, comm);

      s = overlapVec.size();

      MPI_Pack(&s, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &pos, comm);

      for(auto i=overlapVec.begin(), end= overlapVec.end(); i != end; ++i)

        MPI_Pack(const_cast<GI*>(&(*i)), 1, MPITraits<GI>::getType(), sendBuf, buffersize, &pos, comm);


      s=neighbors.size();

      MPI_Pack(&s, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &pos, comm);


      for(auto i=neighbors.begin(), end= neighbors.end(); i != end; ++i)

        MPI_Pack(const_cast<int*>(&(*i)), 1, MPI_INT, sendBuf, buffersize, &pos, comm);

    }

    template<class GI>

    void saveRecvBuf(char *recvBuf, int bufferSize, std::vector<std::pair<GI,int> >& ownerVec,

                     std::set<GI>& overlapVec, std::set<int>& neighbors, RedistributeInterface& inf, int from, MPI_Comm comm) {

      std::size_t size;

      int pos=0;

      // unpack owner vertices

      MPI_Unpack(recvBuf, bufferSize, &pos, &size, 1, MPITraits<std::size_t>::getType(), comm);

      inf.reserveSpaceForReceiveInterface(from, size);

      ownerVec.reserve(ownerVec.size()+size);

      for(; size!=0; --size) {

        GI gi;

        MPI_Unpack(recvBuf, bufferSize, &pos, &gi, 1, MPITraits<GI>::getType(), comm);

        ownerVec.push_back(std::make_pair(gi,from));

      }

      // unpack overlap vertices

      MPI_Unpack(recvBuf, bufferSize, &pos, &size, 1, MPITraits<std::size_t>::getType(), comm);

      typename std::set<GI>::iterator ipos = overlapVec.begin();

      Dune::dverb << "unpacking "<<size<<" overlap"<<std::endl;

      for(; size!=0; --size) {

        GI gi;

        MPI_Unpack(recvBuf, bufferSize, &pos, &gi, 1, MPITraits<GI>::getType(), comm);

        ipos=overlapVec.insert(ipos, gi);

      }

      //unpack neighbors

      MPI_Unpack(recvBuf, bufferSize, &pos, &size, 1,  MPITraits<std::size_t>::getType(), comm);

      Dune::dverb << "unpacking "<<size<<" neighbors"<<std::endl;

      typename std::set<int>::iterator npos = neighbors.begin();

      for(; size!=0; --size) {

        int n;

        MPI_Unpack(recvBuf, bufferSize, &pos, &n, 1, MPI_INT, comm);

        npos=neighbors.insert(npos, n);

      }

    }


    template<typename T>

    void getDomain(const MPI_Comm& comm, T *part, int numOfOwnVtx, int nparts, int *myDomain, std::vector<int> &domainMapping) {

      int npes, mype;

      MPI_Comm_size(comm, &npes);

      MPI_Comm_rank(comm, &mype);

      MPI_Status status;


      *myDomain = -1;


      std::vector<int> domain(nparts, 0);

      std::vector<int> assigned(npes, 0);

      // init domain Mapping

      domainMapping.assign(domainMapping.size(), -1);


      // count the occurrence of domains

      for (int i = 0; i < numOfOwnVtx; i++) {

        domain[part[i]]++;

      }


      std::vector<int> domainMatrix(npes * nparts, -1);


      // init buffer with the own domain

      int *buf = new int[nparts];

      for (int i = 0; i < nparts; i++) {

        buf[i] = domain[i];

        domainMatrix[mype*nparts+i] = domain[i];

      }

      int pe=0;

      int src = (mype-1+npes)%npes;

      int dest = (mype+1)%npes;

      // ring communication, we need n-1 communications for n processors

      for (int i = 0; i < npes-1; i++) {

        MPI_Sendrecv_replace(buf, nparts, MPI_INT, dest, 0, src, 0, comm, &status);

        // pe is the process of the actual received buffer

        pe = ((mype-1-i)+npes)%npes;

        for(int j = 0; j < nparts; j++) {

          // save the values to the domain matrix

          domainMatrix[pe*nparts+j] = buf[j];

        }

      }

      delete[] buf;


      // Start the domain calculation.

      // The process which contains the maximum number of vertices of a

      // particular domain is selected to choose it's favorate domain

      int maxOccurance = 0;

      pe = -1;

      std::set<std::size_t> unassigned;


      for (int i = 0; i < nparts; i++) {

        for (int j = 0; j < npes; j++) {

          // process has no domain assigned

          if (assigned[j]==0) {

            if (maxOccurance < domainMatrix[j*nparts+i]) {

              maxOccurance = domainMatrix[j*nparts+i];

              pe = j;

            }

          }


        }

        if (pe!=-1) {

          // process got a domain, ...

          domainMapping[i] = pe;

          // ...mark as assigned

          assigned[pe] = 1;

          if (pe==mype) {

            *myDomain = i;

          }

          pe = -1;

        }

        else

        {

          unassigned.insert(i);

        }

        maxOccurance = 0;

      }


      typename std::vector<int>::iterator next_free = assigned.begin();


      for(auto udomain = unassigned.begin(),

            end = unassigned.end(); udomain != end; ++udomain)

      {

        next_free = std::find_if(next_free, assigned.end(), std::bind(std::less<int>(), std::placeholders::_1, 1));

        assert(next_free !=  assigned.end());

        domainMapping[*udomain] = next_free-assigned.begin();

        *next_free = 1;

      }

    }


    struct SortFirst

    {

      template<class T>

      bool operator()(const T& t1, const T& t2) const

      {

        return t1<t2;

      }

    };


    template<class GI>

    void mergeVec(std::vector<std::pair<GI, int> >& ownerVec, std::set<GI>& overlapSet) {


#ifdef DEBUG_REPART

      // Safety check for duplicates.

      if(ownerVec.size()>0)

      {

        auto old=ownerVec.begin();

        for(auto i=old+1, end=ownerVec.end(); i != end; old=i++)

        {

          if(i->first==old->first)

          {

            std::cerr<<"Value at index "<<old-ownerVec.begin()<<" is the same as at index "

                     <<i-ownerVec.begin()<<" ["<<old->first<<","<<old->second<<"]==["

                     <<i->first<<","<<i->second<<"]"<<std::endl;

            throw "Huch!";

          }

        }

      }


#endif


      auto v=ownerVec.begin(), vend=ownerVec.end();

      for(auto s=overlapSet.begin(), send=overlapSet.end(); s!=send;)

      {

        while(v!=vend && v->first<*s) ++v;

        if(v!=vend && v->first==*s) {

          // Move to the next element before erasing

          // thus s stays valid!

          auto tmp=s;

          ++s;

          overlapSet.erase(tmp);

        }else

          ++s;

      }

    }


    template<class OwnerSet, class Graph, class IS, class GI>

    void getNeighbor(const Graph& g, std::vector<int>& part,

                     typename Graph::VertexDescriptor vtx, const IS& indexSet,

                     int toPe, std::set<GI>& neighbor, std::set<int>& neighborProcs) {

      for(auto edge=g.beginEdges(vtx), end=g.endEdges(vtx); edge!=end; ++edge)

      {

        const typename IS::IndexPair* pindex = indexSet.pair(edge.target());

        assert(pindex);

        if(part[pindex->local()]!=toPe || !OwnerSet::contains(pindex->local().attribute()))

        {

          // is sent to another process and therefore becomes overlap

          neighbor.insert(pindex->global());

          neighborProcs.insert(part[pindex->local()]);

        }

      }

    }


    template<class T, class I>

    void my_push_back(std::vector<T>& ownerVec, const I& index, [[maybe_unused]] int proc)

    {

      ownerVec.push_back(index);

    }


    template<class T, class I>

    void my_push_back(std::vector<std::pair<T,int> >& ownerVec, const I& index, int proc)

    {

      ownerVec.push_back(std::make_pair(index,proc));

    }

    template<class T>

    void reserve(std::vector<T>&, RedistributeInterface&, int)

    {}

    template<class T>

    void reserve(std::vector<std::pair<T,int> >& ownerVec, RedistributeInterface& redist, int proc)

    {

      redist.reserveSpaceForReceiveInterface(proc, ownerVec.size());

    }


    template<class OwnerSet, class G, class IS, class T, class GI>

    void getOwnerOverlapVec(const G& graph, std::vector<int>& part, IS& indexSet,

                            [[maybe_unused]] int myPe, int toPe, std::vector<T>& ownerVec, std::set<GI>& overlapSet,

                            RedistributeInterface& redist, std::set<int>& neighborProcs) {

      for(auto index = indexSet.begin(); index != indexSet.end(); ++index) {

        // Only Process owner vertices, the others are not in the parmetis graph.

        if(OwnerSet::contains(index->local().attribute()))

        {

          if(part[index->local()]==toPe)

          {

            getNeighbor<OwnerSet>(graph, part, index->local(), indexSet,

                                  toPe, overlapSet, neighborProcs);

            my_push_back(ownerVec, index->global(), toPe);

          }

        }

      }

      reserve(ownerVec, redist, toPe);


    }


    template<class F, class IS>

    inline bool isOwner(IS& indexSet, int index) {


      const typename IS::IndexPair* pindex=indexSet.pair(index);


      assert(pindex);

      return F::contains(pindex->local().attribute());

    }


    class BaseEdgeFunctor

    {

    public:

      BaseEdgeFunctor(Metis::idx_t* adj,const ParmetisDuneIndexMap& data)

        : i_(), adj_(adj), data_(data)

      {}


      template<class T>

      void operator()(const T& edge)

      {

        // Get the edge weight

        // const Weight& weight=edge.weight();

        adj_[i_] = data_.toParmetis(edge.target());

        i_++;

      }

      std::size_t index()

      {

        return i_;

      }


    private:

      std::size_t i_;

      Metis::idx_t* adj_;

      const ParmetisDuneIndexMap& data_;

    };


    template<typename G>

    struct EdgeFunctor

      : public BaseEdgeFunctor

    {

      EdgeFunctor(Metis::idx_t* adj, const ParmetisDuneIndexMap& data, std::size_t)

        : BaseEdgeFunctor(adj, data)

      {}


      Metis::idx_t* getWeights()

      {

        return NULL;

      }

      void free(){}

    };


    template<class G, class V, class E, class VM, class EM>

    class EdgeFunctor<Dune::Amg::PropertiesGraph<G,V,E,VM,EM> >

      :  public BaseEdgeFunctor

    {

    public:

      EdgeFunctor(Metis::idx_t* adj, const ParmetisDuneIndexMap& data, std::size_t s)

        : BaseEdgeFunctor(adj, data)

      {

        weight_=new Metis::idx_t[s];

      }


      template<class T>

      void operator()(const T& edge)

      {

        weight_[index()]=edge.properties().depends() ? 3 : 1;

        BaseEdgeFunctor::operator()(edge);

      }

      Metis::idx_t* getWeights()

      {

        return weight_;

      }

      void free(){

        delete[] weight_;

        weight_ = nullptr;

      }

    private:

      Metis::idx_t* weight_;

    };


    template<class F, class G, class IS, class EW>

    void getAdjArrays(G& graph, IS& indexSet, Metis::idx_t *xadj,

                      EW& ew)

    {

      int j=0;

      auto vend = graph.end();


      for(auto vertex = graph.begin(); vertex != vend; ++vertex) {

        if (isOwner<F>(indexSet,*vertex)) {

          // The type of const edge iterator.

          auto eend = vertex.end();

          xadj[j] = ew.index();

          j++;

          for(auto edge = vertex.begin(); edge != eend; ++edge) {

            ew(edge);

          }

        }

      }

      xadj[j] = ew.index();

    }

  } // end anonymous namespace


  template<class G, class T1, class T2>

  bool buildCommunication(const G& graph, std::vector<int>& realparts,

                          Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm,

                          std::shared_ptr<Dune::OwnerOverlapCopyCommunication<T1,T2>>& outcomm,

                          RedistributeInterface& redistInf,

                          bool verbose=false);

#if HAVE_PARMETIS

#ifndef METIS_VER_MAJOR

  extern "C"

  {

    // backwards compatibility to parmetis < 4.0.0

    void METIS_PartGraphKway(int *nvtxs, Metis::idx_t *xadj, Metis::idx_t *adjncy, Metis::idx_t *vwgt,

                             Metis::idx_t *adjwgt, int *wgtflag, int *numflag, int *nparts,

                             int *options, int *edgecut, Metis::idx_t *part);


    void METIS_PartGraphRecursive(int *nvtxs, Metis::idx_t *xadj, Metis::idx_t *adjncy, Metis::idx_t *vwgt,

                                  Metis::idx_t *adjwgt, int *wgtflag, int *numflag, int *nparts,

                                  int *options, int *edgecut, Metis::idx_t *part);

  }

#endif

#endif // HAVE_PARMETIS


  template<class S, class T>

  inline void print_carray(S& os, T* array, std::size_t l)

  {

    for(T *cur=array, *end=array+l; cur!=end; ++cur)

      os<<*cur<<" ";

  }


  template<class S, class T>

  inline bool isValidGraph(std::size_t noVtx, std::size_t gnoVtx, S noEdges, T* xadj,

                           T* adjncy, bool checkSymmetry)

  {

    bool correct=true;


    using std::signbit;

    for(Metis::idx_t vtx=0; vtx<(Metis::idx_t)noVtx; ++vtx) {

      if(static_cast<S>(xadj[vtx])>noEdges || signbit(xadj[vtx])) {

        std::cerr <<"Check graph: xadj["<<vtx<<"]="<<xadj[vtx]<<" (>"

                  <<noEdges<<") out of range!"<<std::endl;

        correct=false;

      }

      if(static_cast<S>(xadj[vtx+1])>noEdges || signbit(xadj[vtx+1])) {

        std::cerr <<"Check graph: xadj["<<vtx+1<<"]="<<xadj[vtx+1]<<" (>"

                  <<noEdges<<") out of range!"<<std::endl;

        correct=false;

      }

      // Check numbers in adjncy

      for(Metis::idx_t i=xadj[vtx]; i< xadj[vtx+1]; ++i) {

        if(signbit(adjncy[i]) || ((std::size_t)adjncy[i])>gnoVtx) {

          std::cerr<<" Edge "<<adjncy[i]<<" out of range ["<<0<<","<<noVtx<<")"

                   <<std::endl;

          correct=false;

        }

      }

      if(checkSymmetry) {

        for(Metis::idx_t i=xadj[vtx]; i< xadj[vtx+1]; ++i) {

          Metis::idx_t target=adjncy[i];

          // search for symmetric edge

          int found=0;

          for(Metis::idx_t j=xadj[target]; j< xadj[target+1]; ++j)

            if(adjncy[j]==vtx)

              found++;

          if(found!=1) {

            std::cerr<<"Edge ("<<target<<","<<vtx<<") "<<i<<" time"<<std::endl;

            correct=false;

          }

        }

      }

    }

    return correct;

  }


  template<class M, class T1, class T2>

  bool commGraphRepartition(const M& mat, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm,

                            Metis::idx_t nparts,

                            std::shared_ptr<Dune::OwnerOverlapCopyCommunication<T1,T2>>& outcomm,

                            RedistributeInterface& redistInf,

                            bool verbose=false)

  {

    if(verbose && oocomm.communicator().rank()==0)

      std::cout<<"Repartitioning from "<<oocomm.communicator().size()

               <<" to "<<nparts<<" parts"<<std::endl;

    Timer time;

    int rank = oocomm.communicator().rank();

#if !HAVE_PARMETIS

    int* part = new int[1];

    part[0]=0;

#else

    Metis::idx_t* part = new Metis::idx_t[1]; // where all our data moves to


    if(nparts>1) {


      part[0]=rank;


      { // sublock for automatic memory deletion


        // Build the graph of the communication scheme and create an appropriate indexset.

        // calculate the neighbour vertices

        int noNeighbours = oocomm.remoteIndices().neighbours();


        for(auto n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();

            ++n)

          if(n->first==rank) {

            //do not include ourselves.

            --noNeighbours;

            break;

          }


        // A parmetis graph representing the communication graph.

        // The diagonal entries are the number of nodes on the process.

        // The offdiagonal entries are the number of edges leading to other processes.


        Metis::idx_t *xadj=new Metis::idx_t[2];

        Metis::idx_t *vtxdist=new Metis::idx_t[oocomm.communicator().size()+1];

        Metis::idx_t *adjncy=new Metis::idx_t[noNeighbours];

#ifdef USE_WEIGHTS

        Metis::idx_t *vwgt = 0;

        Metis::idx_t *adjwgt = 0;

#endif


        // each process has exactly one vertex!

        for(int i=0; i<oocomm.communicator().size(); ++i)

          vtxdist[i]=i;

        vtxdist[oocomm.communicator().size()]=oocomm.communicator().size();


        xadj[0]=0;

        xadj[1]=noNeighbours;


        // count edges to other processor

        // a vector mapping the index to the owner

        // std::vector<int> owner(mat.N(), oocomm.communicator().rank());

        // for(NeighbourIterator n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();

        //     ++n)

        //   {

        //     if(n->first!=oocomm.communicator().rank()){

        //       typedef typename RemoteIndices::RemoteIndexList RIList;

        //       const RIList& rlist = *(n->second.first);

        //       typedef typename RIList::const_iterator LIter;

        //       for(LIter entry=rlist.begin(); entry!=rlist.end(); ++entry){

        //         if(entry->attribute()==OwnerOverlapCopyAttributeSet::owner)

        //           owner[entry->localIndexPair().local()] = n->first;

        //       }

        //     }

        //   }


        // std::map<int,Metis::idx_t> edgecount; // edges to other processors

        // typedef typename M::ConstRowIterator RIter;

        // typedef typename M::ConstColIterator CIter;


        // // calculate edge count

        // for(RIter row=mat.begin(), endr=mat.end(); row != endr; ++row)

        //   if(owner[row.index()]==OwnerOverlapCopyAttributeSet::owner)

        //     for(CIter entry= row->begin(), end = row->end(); entry != end; ++entry)

        //       ++edgecount[owner[entry.index()]];


        // setup edge and weight pattern


        Metis::idx_t* adjp=adjncy;


#ifdef USE_WEIGHTS

        vwgt   = new Metis::idx_t[1];

        vwgt[0]= mat.N(); // weight is number of rows TODO: Should actually be the nonzeros.


        adjwgt = new Metis::idx_t[noNeighbours];

        Metis::idx_t* adjwp=adjwgt;

#endif


        for(auto n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();

            ++n)

          if(n->first != rank) {

            *adjp=n->first;

            ++adjp;

#ifdef USE_WEIGHTS

            *adjwp=1; //edgecount[n->first];

            ++adjwp;

#endif

          }

        assert(isValidGraph(vtxdist[rank+1]-vtxdist[rank],

                            vtxdist[oocomm.communicator().size()],

                            noNeighbours, xadj, adjncy, false));


        [[maybe_unused]] Metis::idx_t wgtflag=0;

        Metis::idx_t numflag=0;

        Metis::idx_t edgecut;

#ifdef USE_WEIGHTS

        wgtflag=3;

#endif

        Metis::real_t *tpwgts = new Metis::real_t[nparts];

        for(int i=0; i<nparts; ++i)

          tpwgts[i]=1.0/nparts;

        MPI_Comm comm=oocomm.communicator();


        Dune::dinfo<<rank<<" vtxdist: ";

        print_carray(Dune::dinfo, vtxdist, oocomm.communicator().size()+1);

        Dune::dinfo<<std::endl<<rank<<" xadj: ";

        print_carray(Dune::dinfo, xadj, 2);

        Dune::dinfo<<std::endl<<rank<<" adjncy: ";

        print_carray(Dune::dinfo, adjncy, noNeighbours);


#ifdef USE_WEIGHTS

        Dune::dinfo<<std::endl<<rank<<" vwgt: ";

        print_carray(Dune::dinfo, vwgt, 1);

        Dune::dinfo<<std::endl<<rank<<" adwgt: ";

        print_carray(Dune::dinfo, adjwgt, noNeighbours);

#endif

        Dune::dinfo<<std::endl;

        oocomm.communicator().barrier();

        if(verbose && oocomm.communicator().rank()==0)

          std::cout<<"Creating comm graph took "<<time.elapsed()<<std::endl;

        time.reset();


#ifdef PARALLEL_PARTITION

        Metis::real_t ubvec = 1.15;

        int ncon=1;

        int options[5] ={ 0,1,15,0,0};


        //=======================================================

        // ParMETIS_V3_PartKway

        //=======================================================

        ParMETIS_V3_PartKway(vtxdist, xadj, adjncy,

                             vwgt, adjwgt, &wgtflag,

                             &numflag, &ncon, &nparts, tpwgts, &ubvec, options, &edgecut, part,

                             &comm);

        if(verbose && oocomm.communicator().rank()==0)

          std::cout<<"ParMETIS took "<<time.elapsed()<<std::endl;

        time.reset();

#else

        Timer time1;

        std::size_t gnoedges=0;

        int* noedges = 0;

        noedges = new int[oocomm.communicator().size()];

        Dune::dverb<<"noNeighbours: "<<noNeighbours<<std::endl;

        // gather number of edges for each vertex.

        MPI_Allgather(&noNeighbours,1,MPI_INT,noedges,1, MPI_INT,oocomm.communicator());


        if(verbose && oocomm.communicator().rank()==0)

          std::cout<<"Gathering noedges took "<<time1.elapsed()<<std::endl;

        time1.reset();


        Metis::idx_t noVertices = vtxdist[oocomm.communicator().size()];

        Metis::idx_t *gxadj = 0;

        Metis::idx_t *gvwgt = 0;

        Metis::idx_t *gadjncy = 0;

        Metis::idx_t *gadjwgt = 0;

        Metis::idx_t *gpart = 0;

        int* displ = 0;

        int* noxs = 0;

        int* xdispl = 0;  // displacement for xadj

        int* novs = 0;

        int* vdispl=0; // real vertex displacement

#ifdef USE_WEIGHTS

        std::size_t localNoVtx=vtxdist[rank+1]-vtxdist[rank];

#endif

        std::size_t gxadjlen = vtxdist[oocomm.communicator().size()]-vtxdist[0]+oocomm.communicator().size();


        {

          Dune::dinfo<<"noedges: ";

          print_carray(Dune::dinfo, noedges, oocomm.communicator().size());

          Dune::dinfo<<std::endl;

          displ = new int[oocomm.communicator().size()];

          xdispl = new int[oocomm.communicator().size()];

          noxs = new int[oocomm.communicator().size()];

          vdispl = new int[oocomm.communicator().size()];

          novs = new int[oocomm.communicator().size()];


          for(int i=0; i < oocomm.communicator().size(); ++i) {

            noxs[i]=vtxdist[i+1]-vtxdist[i]+1;

            novs[i]=vtxdist[i+1]-vtxdist[i];

          }


          Metis::idx_t *so= vtxdist;

          int offset = 0;

          for(int *xcurr = xdispl, *vcurr = vdispl, *end=vdispl+oocomm.communicator().size();

              vcurr!=end; ++vcurr, ++xcurr, ++so, ++offset) {

            *vcurr = *so;

            *xcurr = offset + *so;

          }


          int *pdispl =displ;

          int cdispl = 0;

          *pdispl = 0;

          for(int *curr=noedges, *end=noedges+oocomm.communicator().size()-1;

              curr!=end; ++curr) {

            ++pdispl; // next displacement

            cdispl += *curr; // next value

            *pdispl = cdispl;

          }

          Dune::dinfo<<"displ: ";

          print_carray(Dune::dinfo, displ, oocomm.communicator().size());

          Dune::dinfo<<std::endl;


          // calculate global number of edges

          // It is bigger than the actual one as we habe size-1 additional end entries

          for(int *curr=noedges, *end=noedges+oocomm.communicator().size();

              curr!=end; ++curr)

            gnoedges += *curr;


          // allocate global graph

          Dune::dinfo<<"gxadjlen: "<<gxadjlen<<" noVertices: "<<noVertices

                     <<" gnoedges: "<<gnoedges<<std::endl;

          gxadj = new Metis::idx_t[gxadjlen];

          gpart = new Metis::idx_t[noVertices];

#ifdef USE_WEIGHTS

          gvwgt = new Metis::idx_t[noVertices];

          gadjwgt = new Metis::idx_t[gnoedges];

#endif

          gadjncy = new Metis::idx_t[gnoedges];

        }


        if(verbose && oocomm.communicator().rank()==0)

          std::cout<<"Preparing global graph took "<<time1.elapsed()<<std::endl;

        time1.reset();

        // Communicate data


        MPI_Allgatherv(xadj,2,MPITraits<Metis::idx_t>::getType(),

                       gxadj,noxs,xdispl,MPITraits<Metis::idx_t>::getType(),

                       comm);

        MPI_Allgatherv(adjncy,noNeighbours,MPITraits<Metis::idx_t>::getType(),

                       gadjncy,noedges,displ,MPITraits<Metis::idx_t>::getType(),

                       comm);

#ifdef USE_WEIGHTS

        MPI_Allgatherv(adjwgt,noNeighbours,MPITraits<Metis::idx_t>::getType(),

                       gadjwgt,noedges,displ,MPITraits<Metis::idx_t>::getType(),

                       comm);

        MPI_Allgatherv(vwgt,localNoVtx,MPITraits<Metis::idx_t>::getType(),

                       gvwgt,novs,vdispl,MPITraits<Metis::idx_t>::getType(),

                       comm);

#endif

        if(verbose && oocomm.communicator().rank()==0)

          std::cout<<"Gathering global graph data took "<<time1.elapsed()<<std::endl;

        time1.reset();


        {

          // create the real gxadj array

          // i.e. shift entries and add displacements.


          print_carray(Dune::dinfo, gxadj, gxadjlen);


          int offset = 0;

          Metis::idx_t increment = vtxdist[1];

          Metis::idx_t *start=gxadj+1;

          for(int i=1; i<oocomm.communicator().size(); ++i) {

            offset+=1;

            int lprev = vtxdist[i]-vtxdist[i-1];

            int l = vtxdist[i+1]-vtxdist[i];

            start+=lprev;

            assert((start+l+offset)-gxadj<=static_cast<Metis::idx_t>(gxadjlen));

            increment = *(start-1);

            std::transform(start+offset, start+l+offset, start, std::bind(std::plus<Metis::idx_t>(), std::placeholders::_1, increment));

          }

          Dune::dinfo<<std::endl<<"shifted xadj:";

          print_carray(Dune::dinfo, gxadj, noVertices+1);

          Dune::dinfo<<std::endl<<" gadjncy: ";

          print_carray(Dune::dinfo, gadjncy, gnoedges);

#ifdef USE_WEIGHTS

          Dune::dinfo<<std::endl<<" gvwgt: ";

          print_carray(Dune::dinfo, gvwgt, noVertices);

          Dune::dinfo<<std::endl<<"adjwgt: ";

          print_carray(Dune::dinfo, gadjwgt, gnoedges);

          Dune::dinfo<<std::endl;

#endif

          // everything should be fine now!!!

          if(verbose && oocomm.communicator().rank()==0)

            std::cout<<"Postprocessing global graph data took "<<time1.elapsed()<<std::endl;

          time1.reset();

#ifndef NDEBUG

          assert(isValidGraph(noVertices, noVertices, gnoedges,

                              gxadj, gadjncy, true));

#endif


          if(verbose && oocomm.communicator().rank()==0)

            std::cout<<"Creating grah one 1 process took "<<time.elapsed()<<std::endl;

          time.reset();

#if METIS_VER_MAJOR >= 5

          Metis::idx_t ncon = 1;

          Metis::idx_t moptions[METIS_NOPTIONS];

          METIS_SetDefaultOptions(moptions);

          moptions[METIS_OPTION_NUMBERING] = numflag;

          METIS_PartGraphRecursive(&noVertices, &ncon, gxadj, gadjncy, gvwgt, NULL, gadjwgt,

                         &nparts, NULL, NULL, moptions, &edgecut, gpart);

#else

          int options[5] = {0, 1, 1, 3, 3};

          // Call metis

          METIS_PartGraphRecursive(&noVertices, gxadj, gadjncy, gvwgt, gadjwgt, &wgtflag,

                                   &numflag, &nparts, options, &edgecut, gpart);

#endif


          if(verbose && oocomm.communicator().rank()==0)

            std::cout<<"METIS took "<<time.elapsed()<<std::endl;

          time.reset();


          Dune::dinfo<<std::endl<<"part:";

          print_carray(Dune::dinfo, gpart, noVertices);


          delete[] gxadj;

          delete[] gadjncy;

#ifdef USE_WEIGHTS

          delete[] gvwgt;

          delete[] gadjwgt;

#endif

        }

        // Scatter result

        MPI_Scatter(gpart, 1, MPITraits<Metis::idx_t>::getType(), part, 1,

                    MPITraits<Metis::idx_t>::getType(), 0, comm);


        {

          // release remaining memory

          delete[] gpart;

          delete[] noedges;

          delete[] displ;

        }


#endif

        delete[] xadj;

        delete[] vtxdist;

        delete[] adjncy;

#ifdef USE_WEIGHTS

        delete[] vwgt;

        delete[] adjwgt;

#endif

        delete[] tpwgts;

      }

    }else{

      part[0]=0;

    }

#endif

    Dune::dinfo<<" repart "<<rank <<" -> "<< part[0]<<std::endl;


    std::vector<int> realpart(mat.N(), part[0]);

    delete[] part;


    oocomm.copyOwnerToAll(realpart, realpart);


    if(verbose && oocomm.communicator().rank()==0)

      std::cout<<"Scattering repartitioning took "<<time.elapsed()<<std::endl;

    time.reset();


    oocomm.buildGlobalLookup(mat.N());

    Dune::Amg::MatrixGraph<M> graph(const_cast<M&>(mat));

    fillIndexSetHoles(graph, oocomm);

    if(verbose && oocomm.communicator().rank()==0)

      std::cout<<"Filling index set took "<<time.elapsed()<<std::endl;

    time.reset();


    if(verbose) {

      int noNeighbours=oocomm.remoteIndices().neighbours();

      noNeighbours = oocomm.communicator().sum(noNeighbours)

                     / oocomm.communicator().size();

      if(oocomm.communicator().rank()==0)

        std::cout<<"Average no neighbours was "<<noNeighbours<<std::endl;

    }

    bool ret = buildCommunication(graph, realpart, oocomm, outcomm, redistInf,

                                  verbose);

    if(verbose && oocomm.communicator().rank()==0)

      std::cout<<"Building index sets took "<<time.elapsed()<<std::endl;

    time.reset();


    return ret;


  }


  template<class G, class T1, class T2>

  bool graphRepartition(const G& graph, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm, Metis::idx_t nparts,

                        std::shared_ptr<Dune::OwnerOverlapCopyCommunication<T1,T2>>& outcomm,

                        RedistributeInterface& redistInf,

                        bool verbose=false)

  {

    Timer time;


    MPI_Comm comm=oocomm.communicator();

    oocomm.buildGlobalLookup(graph.noVertices());

    fillIndexSetHoles(graph, oocomm);


    if(verbose && oocomm.communicator().rank()==0)

      std::cout<<"Filling holes took "<<time.elapsed()<<std::endl;

    time.reset();


    // simple precondition checks


#ifdef PERF_REPART

    // Profiling variables

    double t1=0.0, t2=0.0, t3=0.0, t4=0.0, tSum=0.0;

#endif


    // MPI variables

    int mype = oocomm.communicator().rank();


    assert(nparts<=static_cast<Metis::idx_t>(oocomm.communicator().size()));


    int myDomain = -1;


    //

    // 1) Prepare the required parameters for using ParMETIS

    //    Especially the arrays that represent the graph must be

    //    generated by the DUNE Graph and IndexSet input variables.

    //    These are the arrays:

    //    - vtxdist

    //    - xadj

    //    - adjncy

    //

    //

#ifdef PERF_REPART

    // reset timer for step 1)

    t1=MPI_Wtime();

#endif


    typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2> OOComm;

    typedef typename  OOComm::OwnerSet OwnerSet;


    // Create the vtxdist array and parmetisVtxMapping.

    // Global communications are necessary

    // The parmetis global identifiers for the owner vertices.

    ParmetisDuneIndexMap indexMap(graph,oocomm);

    Metis::idx_t *part = new Metis::idx_t[indexMap.numOfOwnVtx()];

    for(std::size_t i=0; i < indexMap.numOfOwnVtx(); ++i)

      part[i]=mype;


#if !HAVE_PARMETIS

    if(oocomm.communicator().rank()==0 && nparts>1)

      std::cerr<<"ParMETIS not activated. Will repartition to 1 domain instead of requested "

               <<nparts<<" domains."<<std::endl;

    nparts=1; // No parmetis available, fallback to agglomerating to 1 process


#else


    if(nparts>1) {

      // Create the xadj and adjncy arrays

      Metis::idx_t *xadj = new  Metis::idx_t[indexMap.numOfOwnVtx()+1];

      Metis::idx_t *adjncy = new Metis::idx_t[graph.noEdges()];

      EdgeFunctor<G> ef(adjncy, indexMap, graph.noEdges());

      getAdjArrays<OwnerSet>(graph, oocomm.globalLookup(), xadj, ef);


      //

      // 2) Call ParMETIS

      //

      //

      Metis::idx_t numflag=0, wgtflag=0, options[3], edgecut=0, ncon=1;

      //float *tpwgts = NULL;

      Metis::real_t *tpwgts = new Metis::real_t[nparts];

      for(int i=0; i<nparts; ++i)

        tpwgts[i]=1.0/nparts;

      Metis::real_t ubvec[1];

      options[0] = 0; // 0=default, 1=options are defined in [1]+[2]

#ifdef DEBUG_REPART

      options[1] = 3; // show info: 0=no message

#else

      options[1] = 0; // show info: 0=no message

#endif

      options[2] = 1; // random number seed, default is 15

      wgtflag = (ef.getWeights()!=NULL) ? 1 : 0;

      numflag = 0;

      edgecut = 0;

      ncon=1;

      ubvec[0]=1.05; // recommended by ParMETIS


#ifdef DEBUG_REPART

      if (mype == 0) {

        std::cout<<std::endl;

        std::cout<<"Testing ParMETIS_V3_PartKway with options[1-2] = {"

                 <<options[1]<<" "<<options[2]<<"}, Ncon: "

                 <<ncon<<", Nparts: "<<nparts<<std::endl;

      }

#endif

#ifdef PERF_REPART

      // stop the time for step 1)

      t1=MPI_Wtime()-t1;

      // reset timer for step 2)

      t2=MPI_Wtime();

#endif


      if(verbose) {

        oocomm.communicator().barrier();

        if(oocomm.communicator().rank()==0)

          std::cout<<"Preparing for parmetis took "<<time.elapsed()<<std::endl;

      }

      time.reset();


      //=======================================================

      // ParMETIS_V3_PartKway

      //=======================================================

      ParMETIS_V3_PartKway(indexMap.vtxDist(), xadj, adjncy,

                           NULL, ef.getWeights(), &wgtflag,

                           &numflag, &ncon, &nparts, tpwgts, ubvec, options, &edgecut, part, &const_cast<MPI_Comm&>(comm));


      delete[] xadj;

      delete[] adjncy;

      delete[] tpwgts;


      ef.free();


#ifdef DEBUG_REPART

      if (mype == 0) {

        std::cout<<std::endl;

        std::cout<<"ParMETIS_V3_PartKway reported a cut of "<<edgecut<<std::endl;

        std::cout<<std::endl;

      }

      std::cout<<mype<<": PARMETIS-Result: ";

      for(int i=0; i < indexMap.vtxDist()[mype+1]-indexMap.vtxDist()[mype]; ++i) {

        std::cout<<part[i]<<" ";

      }

      std::cout<<std::endl;

      std::cout<<"Testing ParMETIS_V3_PartKway with options[1-2] = {"

               <<options[1]<<" "<<options[2]<<"}, Ncon: "

               <<ncon<<", Nparts: "<<nparts<<std::endl;

#endif

#ifdef PERF_REPART

      // stop the time for step 2)

      t2=MPI_Wtime()-t2;

      // reset timer for step 3)

      t3=MPI_Wtime();

#endif


      if(verbose) {

        oocomm.communicator().barrier();

        if(oocomm.communicator().rank()==0)

          std::cout<<"Parmetis took "<<time.elapsed()<<std::endl;

      }

      time.reset();

    }else

#endif

    {

      // Everything goes to process 0!

      for(std::size_t i=0; i<indexMap.numOfOwnVtx(); ++i)

        part[i]=0;

    }


    //

    // 3) Find a optimal domain based on the ParMETIS repartitioning

    //    result

    //


    std::vector<int> domainMapping(nparts);

    if(nparts>1)

      getDomain(comm, part, indexMap.numOfOwnVtx(), nparts, &myDomain, domainMapping);

    else

      domainMapping[0]=0;


#ifdef DEBUG_REPART

    std::cout<<mype<<": myDomain: "<<myDomain<<std::endl;

    std::cout<<mype<<": DomainMapping: ";

    for(auto j : range(nparts)) {

      std::cout<<" do: "<<j<<" pe: "<<domainMapping[j]<<" ";

    }

    std::cout<<std::endl;

#endif


    // Make a domain mapping for the indexset and translate

    //domain number to real process number

    // domainMapping is the one of parmetis, that is without

    // the overlap/copy vertices

    std::vector<int> setPartition(oocomm.indexSet().size(), -1);


    std::size_t i=0; // parmetis index

    for(auto index = oocomm.indexSet().begin(); index != oocomm.indexSet().end(); ++index)

      if(OwnerSet::contains(index->local().attribute())) {

        setPartition[index->local()]=domainMapping[part[i++]];

      }


    delete[] part;

    oocomm.copyOwnerToAll(setPartition, setPartition);

    // communication only needed for ALU

    // (ghosts with same global id as owners on the same process)

    if (SolverCategory::category(oocomm) ==

        static_cast<int>(SolverCategory::nonoverlapping))

      oocomm.copyCopyToAll(setPartition, setPartition);

    bool ret = buildCommunication(graph, setPartition, oocomm, outcomm, redistInf,

                                  verbose);

    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<"Creating indexsets took "<<time.elapsed()<<std::endl;

    }

    return ret;

  }


  template<class G, class T1, class T2>

  bool buildCommunication(const G& graph,

                          std::vector<int>& setPartition, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm,

                          std::shared_ptr<Dune::OwnerOverlapCopyCommunication<T1,T2>>& outcomm,

                          RedistributeInterface& redistInf,

                          bool verbose)

  {

    typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2> OOComm;

    typedef typename  OOComm::OwnerSet OwnerSet;


    Timer time;


    // Build the send interface

    redistInf.buildSendInterface<OwnerSet>(setPartition, oocomm.indexSet());


#ifdef PERF_REPART

    // stop the time for step 3)

    t3=MPI_Wtime()-t3;

    // reset timer for step 4)

    t4=MPI_Wtime();

#endif


    //

    // 4) Create the output IndexSet and RemoteIndices

    //    4.1) Determine the "send to" and "receive from" relation

    //         according to the new partition using a MPI ring

    //         communication.

    //

    //    4.2) Depends on the "send to" and "receive from" vector,

    //         the processes will exchange the vertices each other

    //

    //    4.3) Create the IndexSet, RemoteIndices and the new MPI

    //         communicator

    //


    //

    // 4.1) Let's start...

    //

    int npes = oocomm.communicator().size();

    int *sendTo = 0;

    int noSendTo = 0;

    std::set<int> recvFrom;


    // the max number of vertices is stored in the sendTo buffer,

    // not the number of vertices to send! Because the max number of Vtx

    // is used as the fixed buffer size by the MPI send/receive calls


    int mype = oocomm.communicator().rank();


    {

      std::set<int> tsendTo;

      for(auto i=setPartition.begin(), iend = setPartition.end(); i!=iend; ++i)

        tsendTo.insert(*i);


      noSendTo = tsendTo.size();

      sendTo = new int[noSendTo];

      int idx=0;

      for(auto i=tsendTo.begin(); i != tsendTo.end(); ++i, ++idx)

        sendTo[idx]=*i;

    }


    //

    int* gnoSend= new int[oocomm.communicator().size()];

    int* gsendToDispl =  new int[oocomm.communicator().size()+1];


    MPI_Allgather(&noSendTo, 1, MPI_INT, gnoSend, 1,

                  MPI_INT, oocomm.communicator());


    // calculate total receive message size

    int totalNoRecv = 0;

    for(int i=0; i<npes; ++i)

      totalNoRecv += gnoSend[i];


    int *gsendTo = new int[totalNoRecv];


    // calculate displacement for allgatherv

    gsendToDispl[0]=0;

    for(int i=0; i<npes; ++i)

      gsendToDispl[i+1]=gsendToDispl[i]+gnoSend[i];


    // gather the data

    MPI_Allgatherv(sendTo, noSendTo, MPI_INT, gsendTo, gnoSend, gsendToDispl,

                   MPI_INT, oocomm.communicator());


    // Extract from which processes we will receive data

    for(int proc=0; proc < npes; ++proc)

      for(int i=gsendToDispl[proc]; i < gsendToDispl[proc+1]; ++i)

        if(gsendTo[i]==mype)

          recvFrom.insert(proc);


    bool existentOnNextLevel = recvFrom.size()>0;


    // Delete memory

    delete[] gnoSend;

    delete[] gsendToDispl;

    delete[] gsendTo;


#ifdef DEBUG_REPART

    if(recvFrom.size()) {

      std::cout<<mype<<": recvFrom: ";

      for(auto i=recvFrom.begin(); i!= recvFrom.end(); ++i) {

        std::cout<<*i<<" ";

      }

    }


    std::cout<<std::endl<<std::endl;

    std::cout<<mype<<": sendTo: ";

    for(int i=0; i<noSendTo; i++) {

      std::cout<<sendTo[i]<<" ";

    }

    std::cout<<std::endl<<std::endl;

#endif


    if(verbose)

      if(oocomm.communicator().rank()==0)

        std::cout<<" Communicating the receive information took "<<

        time.elapsed()<<std::endl;

    time.reset();


    //

    // 4.2) Start the communication

    //


    // Get all the owner and overlap vertices for myself and save

    // it in the vectors myOwnerVec and myOverlapVec.

    // The received vertices from the other processes are simple

    // added to these vector.

    //


    typedef typename OOComm::ParallelIndexSet::GlobalIndex GI;

    typedef std::vector<GI> GlobalVector;

    std::vector<std::pair<GI,int> > myOwnerVec;

    std::set<GI> myOverlapSet;

    GlobalVector sendOwnerVec;

    std::set<GI> sendOverlapSet;

    std::set<int> myNeighbors;


    //    getOwnerOverlapVec<OwnerSet>(graph, setPartition, oocomm.globalLookup(),

    //                           mype, mype, myOwnerVec, myOverlapSet, redistInf, myNeighbors);


    char **sendBuffers=new char*[noSendTo];

    MPI_Request *requests = new MPI_Request[noSendTo];


    // Create all messages to be sent

    for(int i=0; i < noSendTo; ++i) {

      // clear the vector for sending

      sendOwnerVec.clear();

      sendOverlapSet.clear();

      // get all owner and overlap vertices for process j and save these

      // in the vectors sendOwnerVec and sendOverlapSet

      std::set<int> neighbors;

      getOwnerOverlapVec<OwnerSet>(graph, setPartition, oocomm.globalLookup(),

                                   mype, sendTo[i], sendOwnerVec, sendOverlapSet, redistInf,

                                   neighbors);

      // +2, we need 2 integer more for the length of each part

      // (owner/overlap) of the array

      int buffersize=0;

      int tsize;

      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &buffersize);

      MPI_Pack_size(sendOwnerVec.size(), MPITraits<GI>::getType(), oocomm.communicator(), &tsize);

      buffersize +=tsize;

      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &tsize);

      buffersize +=tsize;

      MPI_Pack_size(sendOverlapSet.size(), MPITraits<GI>::getType(), oocomm.communicator(), &tsize);

      buffersize += tsize;

      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &tsize);

      buffersize += tsize;

      MPI_Pack_size(neighbors.size(), MPI_INT, oocomm.communicator(), &tsize);

      buffersize += tsize;


      sendBuffers[i] = new char[buffersize];


#ifdef DEBUG_REPART

      std::cout<<mype<<" sending "<<sendOwnerVec.size()<<" owner and "<<

      sendOverlapSet.size()<<" overlap to "<<sendTo[i]<<" buffersize="<<buffersize<<std::endl;

#endif

      createSendBuf(sendOwnerVec, sendOverlapSet, neighbors, sendBuffers[i], buffersize, oocomm.communicator());

      MPI_Issend(sendBuffers[i], buffersize, MPI_PACKED, sendTo[i], 99, oocomm.communicator(), requests+i);

    }


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Creating sends took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    // Receive Messages

    int noRecv = recvFrom.size();

    int oldbuffersize=0;

    char* recvBuf = 0;

    while(noRecv>0) {

      // probe for an incoming message

      MPI_Status stat;

      MPI_Probe(MPI_ANY_SOURCE, 99,  oocomm.communicator(), &stat);

      int buffersize;

      MPI_Get_count(&stat, MPI_PACKED, &buffersize);


      if(oldbuffersize<buffersize) {

        // buffer too small, reallocate

        delete[] recvBuf;

        recvBuf = new char[buffersize];

        oldbuffersize = buffersize;

      }

      MPI_Recv(recvBuf, buffersize, MPI_PACKED, stat.MPI_SOURCE, 99, oocomm.communicator(), &stat);

      saveRecvBuf(recvBuf, buffersize, myOwnerVec, myOverlapSet, myNeighbors, redistInf,

                  stat.MPI_SOURCE, oocomm.communicator());

      --noRecv;

    }


    if(recvBuf)

      delete[] recvBuf;


    time.reset();

    // Wait for sending messages to complete

    MPI_Status *statuses = new MPI_Status[noSendTo];

    int send = MPI_Waitall(noSendTo, requests, statuses);


    // check for errors

    if(send==MPI_ERR_IN_STATUS) {

      std::cerr<<mype<<": Error in sending :"<<std::endl;

      // Search for the error

      for(int i=0; i< noSendTo; i++)

        if(statuses[i].MPI_ERROR!=MPI_SUCCESS) {

          char message[300];

          int messageLength;

          MPI_Error_string(statuses[i].MPI_ERROR, message, &messageLength);

          std::cerr<<" source="<<statuses[i].MPI_SOURCE<<" message: ";

          for(int j = 0; j < messageLength; j++)

            std::cout<<message[j];

        }

      std::cerr<<std::endl;

    }


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Receiving and saving took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    for(int i=0; i < noSendTo; ++i)

      delete[] sendBuffers[i];


    delete[] sendBuffers;

    delete[] statuses;

    delete[] requests;


    redistInf.setCommunicator(oocomm.communicator());


    //

    // 4.2) Create the IndexSet etc.

    //


    // build the new outputIndexSet


    int color=0;


    if (!existentOnNextLevel) {

      // this process is not used anymore

      color= MPI_UNDEFINED;

    }

    MPI_Comm outputComm;


    MPI_Comm_split(oocomm.communicator(), color, oocomm.communicator().rank(), &outputComm);

    outcomm = std::make_shared<OOComm>(outputComm,SolverCategory::category(oocomm),true);


    // translate neighbor ranks.

    int newrank=outcomm->communicator().rank();

    int *newranks=new int[oocomm.communicator().size()];

    std::vector<int> tneighbors;

    tneighbors.reserve(myNeighbors.size());


    typename OOComm::ParallelIndexSet& outputIndexSet = outcomm->indexSet();


    MPI_Allgather(&newrank, 1, MPI_INT, newranks, 1,

                  MPI_INT, oocomm.communicator());


#ifdef DEBUG_REPART

    std::cout<<oocomm.communicator().rank()<<" ";

    for(auto i=myNeighbors.begin(), end=myNeighbors.end();

        i!=end; ++i) {

      assert(newranks[*i]>=0);

      std::cout<<*i<<"->"<<newranks[*i]<<" ";

      tneighbors.push_back(newranks[*i]);

    }

    std::cout<<std::endl;

#else

    for(auto i=myNeighbors.begin(), end=myNeighbors.end();

        i!=end; ++i) {

      tneighbors.push_back(newranks[*i]);

    }

#endif

    delete[] newranks;

    myNeighbors.clear();


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Calculating new neighbours ("<<tneighbors.size()<<") took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    outputIndexSet.beginResize();

    // 1) add the owner vertices

    // Sort the owners

    std::sort(myOwnerVec.begin(), myOwnerVec.end(), SortFirst());

    // The owners are sorted according to there global index

    // Therefore the entries of ownerVec are the same as the

    // ones in the resulting index set.

    int i=0;

    using LocalIndexT = typename OOComm::ParallelIndexSet::LocalIndex;

    for(auto g=myOwnerVec.begin(), end =myOwnerVec.end(); g!=end; ++g, ++i ) {

      outputIndexSet.add(g->first,LocalIndexT(i, OwnerOverlapCopyAttributeSet::owner, true));

      redistInf.addReceiveIndex(g->second, i);

    }


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Adding owner indices took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    // After all the vertices are received, the vectors must

    // be "merged" together to create the final vectors.

    // Because some vertices that are sent as overlap could now

    // already included as owner vertiecs in the new partition

    mergeVec(myOwnerVec, myOverlapSet);


    // Trick to free memory

    myOwnerVec.clear();

    myOwnerVec.swap(myOwnerVec);


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Merging indices took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    // 2) add the overlap vertices

    for(auto g=myOverlapSet.begin(), end=myOverlapSet.end(); g!=end; ++g, i++) {

      outputIndexSet.add(*g,LocalIndexT(i, OwnerOverlapCopyAttributeSet::copy, true));

    }

    myOverlapSet.clear();

    outputIndexSet.endResize();


#ifdef DUNE_ISTL_WITH_CHECKING

    int numOfOwnVtx =0;

    auto end = outputIndexSet.end();

    for(auto index = outputIndexSet.begin(); index != end; ++index) {

      if (OwnerSet::contains(index->local().attribute())) {

        numOfOwnVtx++;

      }

    }

    numOfOwnVtx = oocomm.communicator().sum(numOfOwnVtx);

    // if(numOfOwnVtx!=indexMap.globalOwnerVertices)

    //   {

    //     std::cerr<<numOfOwnVtx<<"!="<<indexMap.globalOwnerVertices<<" owners missing or additional ones!"<<std::endl;

    //     DUNE_THROW(ISTLError, numOfOwnVtx<<"!="<<indexMap.globalOwnerVertices<<" owners missing or additional ones"

    //             <<" during repartitioning.");

    //   }

    if (!std::is_sorted(outputIndexSet.begin(), outputIndexSet.end(),

                   [](const auto& v1, const auto& v2){ return v1.global() < v2.global();}))

      DUNE_THROW(ISTLError, "OutputIndexSet is not sorted.");

#endif

    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Adding overlap indices took "<<

        time.elapsed()<<std::endl;

    }

    time.reset();


    if(color != MPI_UNDEFINED) {

      outcomm->remoteIndices().setNeighbours(tneighbors);

      outcomm->remoteIndices().template rebuild<true>();


    }


    // release the memory

    delete[] sendTo;


    if(verbose) {

      oocomm.communicator().barrier();

      if(oocomm.communicator().rank()==0)

        std::cout<<" Storing indexsets took "<<

        time.elapsed()<<std::endl;

    }


#ifdef PERF_REPART

    // stop the time for step 4) and print the results

    t4=MPI_Wtime()-t4;

    tSum = t1 + t2 + t3 + t4;

    std::cout<<std::endl

             <<mype<<": WTime for step 1): "<<t1

             <<" 2): "<<t2

             <<" 3): "<<t3

             <<" 4): "<<t4

             <<" total: "<<tSum

             <<std::endl;

#endif


    return color!=MPI_UNDEFINED;


  }

#else

  template<class G, class P,class T1, class T2, class R>

  bool graphRepartition(const G& graph, P& oocomm, int nparts,

                        std::shared_ptr<P>& outcomm,

                        R& redistInf,

                        bool v=false)

  {

    if(nparts!=oocomm.size())

      DUNE_THROW(NotImplemented, "only available for MPI programs");

  }


  template<class G, class P,class T1, class T2, class R>

  bool commGraphRepartition(const G& graph, P& oocomm, int nparts,

                            std::shared_ptr<P>& outcomm,

                            R& redistInf,

                            bool v=false)

  {

    if(nparts!=oocomm.size())

      DUNE_THROW(NotImplemented, "only available for MPI programs");

  }

#endif // HAVE_MPI

} // end of namespace Dune

#endif

Dune::Amg::MatrixGraph
The (undirected) graph of a matrix.
Definition: graph.hh:51

Dune::Communication< MPI_Comm >::max
T max(const T &in) const
Compute the maximum of the argument over all processes and return the result in every process....
Definition: mpicommunication.hh:257

Dune::Communication< MPI_Comm >::barrier
int barrier() const
Wait until all processes have arrived at this point in the program.
Definition: mpicommunication.hh:272

Dune::Communication< MPI_Comm >::rank
int rank() const
Return rank, is between 0 and size()-1.
Definition: mpicommunication.hh:133

Dune::Communication< MPI_Comm >::sum
T sum(const T &in) const
Compute the sum of the argument over all processes and return the result in every process....
Definition: mpicommunication.hh:208

Dune::Communication< MPI_Comm >::size
int size() const
Number of processes in set, is greater than 0.
Definition: mpicommunication.hh:139

Dune::GlobalLookupIndexSet< ParallelIndexSet >

Dune::IndexSet
Index Set Interface base class.
Definition: indexidset.hh:78

Dune::IndexSet::size
auto size(GeometryType type) const
Return total number of entities of given geometry type in entity set .
Definition: indexidset.hh:223

Dune::Interface::communicator_
MPI_Comm communicator_
The MPI communicator we use.
Definition: interface.hh:325

Dune::Matrix::N
size_type N() const
Return the number of rows.
Definition: matrix.hh:691

Dune::OwnerOverlapCopyCommunication
A class setting up standard communication for a two-valued attribute set with owner/overlap/copy sema...
Definition: owneroverlapcopy.hh:174

Dune::OwnerOverlapCopyCommunication::indexSet
const ParallelIndexSet & indexSet() const
Get the underlying parallel index set.
Definition: owneroverlapcopy.hh:462

Dune::OwnerOverlapCopyCommunication::copyCopyToAll
void copyCopyToAll(const T &source, T &dest) const
Communicate values from copy data points to all other data points.
Definition: owneroverlapcopy.hh:328

Dune::OwnerOverlapCopyCommunication::copyOwnerToAll
void copyOwnerToAll(const T &source, T &dest) const
Communicate values from owner data points to all other data points.
Definition: owneroverlapcopy.hh:311

Dune::OwnerOverlapCopyCommunication::remoteIndices
const RemoteIndices & remoteIndices() const
Get the underlying remote indices.
Definition: owneroverlapcopy.hh:471

Dune::ParallelIndexSet< GlobalIdType, LI, 512 >

Dune::RemoteIndices::Attribute
LocalIndex::Attribute Attribute
The type of the attribute.
Definition: remoteindices.hh:227

Dune::Timer
A simple stop watch.
Definition: timer.hh:31

Dune::Timer::reset
void reset() noexcept
Reset timer while keeping the running/stopped state.
Definition: timer.hh:47

Dune::Timer::elapsed
double elapsed() const noexcept
Get elapsed user-time from last reset until now/last stop in seconds.
Definition: timer.hh:67

communicator.hh
Provides utility classes for syncing distributed data via MPI communication.

enumset.hh
Classes for building sets out of enumeration values.

graph.hh
Provides classes for building the matrix graph.

Dune::ParallelIndexSet::size
size_t size() const
Get the total number (public and nonpublic) indices.

Dune::repairLocalIndexPointers
void repairLocalIndexPointers(std::map< int, SLList< std::pair< typename T::GlobalIndex, typename T::LocalIndex::Attribute >, A > > &globalMap, RemoteIndices< T, A1 > &remoteIndices, const T &indexSet)
Repair the pointers to the local indices in the remote indices.
Definition: indicessyncer.hh:495

Dune::ParallelIndexSet::begin
iterator begin()
Get an iterator over the indices positioned at the first index.

Dune::ParallelIndexSet::end
iterator end()
Get an iterator over the indices positioned after the last index.

Dune::RemoteIndices::end
const_iterator end() const
Get an iterator over all remote index lists.
Definition: remoteindices.hh:1528

Dune::Interface::interfaces
const InformationMap & interfaces() const
Get information about the interfaces.
Definition: interface.hh:433

Dune::RemoteIndices::neighbours
int neighbours() const
Get the number of processors we share indices with.
Definition: remoteindices.hh:1445

Dune::GlobalLookupIndexSet::pair
const IndexPair * pair(const std::size_t &local) const
Get the index pair corresponding to a local index.

Dune::storeGlobalIndicesOfRemoteIndices
void storeGlobalIndicesOfRemoteIndices(std::map< int, SLList< std::pair< typename T::GlobalIndex, typename T::LocalIndex::Attribute >, A > > &globalMap, const RemoteIndices< T, A1 > &remoteIndices)
Stores the corresponding global indices of the remote index information.
Definition: indicessyncer.hh:469

Dune::RemoteIndices::begin
const_iterator begin() const
Get an iterator over all remote index lists.
Definition: remoteindices.hh:1521

Dune::Indices::_1
constexpr index_constant< 1 > _1
Compile time index with value 1.
Definition: indices.hh:55

Dune::real_t
typename FieldTraits< Type >::real_type real_t
Convenient access to FieldTraits<Type>::real_type.
Definition: typetraits.hh:301

DUNE_THROW
#define DUNE_THROW(E,...)
Definition: exceptions.hh:314

Dune::GeometryTypes::vertex
constexpr GeometryType vertex
GeometryType representing a vertex.
Definition: type.hh:492

Dune::Hybrid::max
constexpr auto max
Function object that returns the greater of the given values.
Definition: hybridutilities.hh:485

Dune::range
static constexpr IntegralRange< std::decay_t< T > > range(T &&from, U &&to) noexcept
free standing function for setting up a range based for loop over an integer range for (auto i: range...
Definition: rangeutilities.hh:288

Dune::dinfo
DInfoType dinfo(std::cout)
Stream for informative output.
Definition: stdstreams.hh:141

Dune::dverb
DVerbType dverb(std::cout)
Singleton of verbose debug stream.
Definition: stdstreams.hh:117

indexset.hh
Provides a map between global and local indices.

indicessyncer.hh
Class for adding missing indices of a distributed index set in a local communication.

mpitraits.hh
Traits classes for mapping types onto MPI_Datatype.

Dune
Dune namespace.
Definition: alignedallocator.hh:13

Dune::size
constexpr std::integral_constant< std::size_t, sizeof...(II)> size(std::integer_sequence< T, II... >)
Return the size of the sequence.
Definition: integersequence.hh:75

Dune::fillIndexSetHoles
void fillIndexSetHoles(const G &graph, Dune::OwnerOverlapCopyCommunication< T1, T2 > &oocomm)
Fills the holes in an index set.
Definition: repartition.hh:83

Dune::graphRepartition
bool graphRepartition(const G &graph, Dune::OwnerOverlapCopyCommunication< T1, T2 > &oocomm, Metis::idx_t nparts, std::shared_ptr< Dune::OwnerOverlapCopyCommunication< T1, T2 > > &outcomm, RedistributeInterface &redistInf, bool verbose=false)
execute a graph repartition for a giving graph and indexset.
Definition: repartition.hh:1228

owneroverlapcopy.hh
Classes providing communication interfaces for overlapping Schwarz methods.

rangeutilities.hh
Utilities for reduction like operations on ranges.

remoteindices.hh
Classes describing a distributed indexset.

stdstreams.hh
Standard Dune debug streams.

Dune::MPITraits
A traits class describing the mapping of types onto MPI_Datatypes.
Definition: mpitraits.hh:41

Dune::SolverCategory::nonoverlapping
@ nonoverlapping
Category for non-overlapping solvers.
Definition: solvercategory.hh:27

Dune::SolverCategory::category
static Category category(const OP &op, decltype(op.category()) *=nullptr)
Helperfunction to extract the solver category either from an enum, or from the newly introduced virtu...
Definition: solvercategory.hh:34

timer.hh
A simple timing class.