ejr / matchpres (public) (License: BSD) (since 2018-08-29) (hash sha1)
Ye olde thesis code implementing an auction-based weighted bipartite matching algorithm over MPI.

/stripesmut.c (5389671d60b8e2ddd1f9360551377d35c8ccdbd2) (39548 bytes) (mode 100644) (type blob)

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#include <assert.h>

#include <mpi.h>

#define SMUT_NO_IO
#include "smut.h"
#undef SMUT_NO_IO
#include "stripesmut.h"

void
gcsr_init_clear (struct gcsr_t *gA)
{
  if (!gA) return;
  gA->nr = gA->nc  = 0;
  spcsr_init_clear (&gA->local);
  gA->rowmap.p = gA->rowmap.np = 0;
  gA->rowmap.lindstart = gA->rowmap.lnind = 0;
  gA->rowmap.indstart = NULL;
  gA->colmap.l2g = NULL;
}

void
gcsr_free (struct gcsr_t *gA, MPI_Comm comm)
{
  if (gA->colmap.l2g) free (gA->colmap.l2g);
  if (gA->rowmap.indstart) free (gA->rowmap.indstart);
  spcsr_free (&gA->local);
  gcsr_init_clear (gA);
#if !defined(NDEBUG)
  MPI_Barrier (comm);
#endif
}

void
gcsr_expand_rowmap (struct gcsr_t *gA, MPI_Comm comm)
{
  int rank, csize;
  int *indstart;
#if !defined(NDEBUG)
  int p;
#endif

  if (!gA || gA->rowmap.indstart) return;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  indstart = malloc ((csize+1) * sizeof(int));
  if (!indstart) MPI_Abort(comm, -8922);

  MPI_Allgather (&gA->rowmap.lindstart, 1, MPI_INT,
		 indstart, 1, MPI_INT,
		 comm);
  indstart[csize] = gA->nr;
#if !defined(NDEBUG)
  assert(0 == indstart[0]);
  for (p = 1; p < csize; ++p) {
    assert(indstart[p] >= 0);
    assert(indstart[p] <= indstart[p+1]);
  }
#endif
  gA->rowmap.indstart = indstart;
}

int
gcsr_take_csr_root (struct spcsr_t *A, struct gcsr_t *gA, MPI_Comm comm)
{
  int rank, csize, p;
  int errflg = 0;
  int tmpdata[2];

  if (!gA) return 1;
  {
    int initflg = 0;
    errflg = MPI_Initialized(&initflg);
    if (MPI_SUCCESS != errflg) return errflg;
    if (!initflg) return 1;
  }

  MPI_Comm_size (comm, &csize);
  MPI_Comm_rank (comm, &rank);

  if (0 == rank && !A) MPI_Abort(comm, EINVAL);

  gA->rowmap.p = rank;
  gA->rowmap.np = csize;
  gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
  if (!gA->rowmap.indstart) MPI_Abort(comm, ENOMEM);

  if (0 == rank) {
    gA->local = *A;
    tmpdata[0] = A->nr;
    tmpdata[1] = A->nc;
  }
  else {
    spcsr_init_clear (&gA->local);
  }
  MPI_Bcast (tmpdata, 2, MPI_INT, 0, comm);
  gA->nr = tmpdata[0];
  gA->nc = tmpdata[1];

  gA->rowmap.lindstart = (rank == 0? 0 : gA->nr);
  gA->rowmap.lnind = (rank == 0? gA->nr : 0);
  gA->rowmap.indstart[0] = 0;
  for (p = 1; p <= csize; ++p)
    gA->rowmap.indstart[p] = gA->nr;

  gA->colmap.l2g = NULL;

  spcsr_init_clear (A);
  return 0;
}

int
gcsr_give_csr_root (struct gcsr_t *gA, struct spcsr_t *A, MPI_Comm comm)
{
  int errflg;

  errflg = gcsr_redist_root (gA, comm);
  if (errflg) return errflg;

  *A = gA->local;
  spcsr_init_clear (&gA->local);
  gcsr_free (gA, comm);

  return 0;
}

void
gcsr_unpack_colinds (struct gcsr_t *gA)
{
  struct spcsr_t local;
  int *l2g, k;
  if (!gA || !gA->colmap.l2g)
    return;
  l2g = gA->colmap.l2g;
  local=gA->local;

  for (k = 0; k < local.nent; ++k) {
    const int j = local.colind[k];
    assert (j >= 0);
    assert (j < local.nc);
    local.colind[k] = l2g[j];
  }

  gA->colmap.l2g = NULL;
  free(l2g);
  gA->local.nc = gA->nc;
}

int
gcsr_redist_root (struct gcsr_t* gA, MPI_Comm comm)
{
  int rank, csize, am_root;
  struct spcsr_t local;
  int *counts = NULL;
  int *disps = NULL;
  int gnent;
  struct spcsr_t newlocal;

  int p;

  /* get # entries per proc */
  /* realloc local to fit */
  /* MPI_Gatherv */
  /* local freeing */

  if (!gA || !gA->nr || !gA->nc) return 0;

  spcsr_init_clear (&newlocal);

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -22);
    if (mpicsize != csize) MPI_Abort(comm, -23);
  }
#endif

  if (am_root) {
    counts = malloc (csize * sizeof(int));
    disps = malloc (csize * sizeof(int));
    if (!counts || !disps) MPI_Abort(comm, -398202);
  }

  /*
  fprintf (stderr, "%d: foo1\n", rank);
  */

  local = gA->local;

  gcsr_unpack_colinds (gA);

  /*
  fprintf (stderr, "%d: foo2\n", rank);
  */

  /* start setting up root's newlocal */
  if (am_root) {
    gnent = 0;
    newlocal.nr = gA->nr;
    newlocal.nc = gA->nc;
    newlocal.rowoff = malloc ( (newlocal.nr+1) * sizeof(int) );
    if (!newlocal.rowoff) MPI_Abort(comm, -348362);
  }

  /*
  fprintf (stderr, "%d: foo3\n", rank);
  */

#if !defined(NDEBUG)
  /* Double-check the number of entries in a few places*/
  MPI_Reduce (&local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);
#endif

  /*
  fprintf (stderr, "%d: foo4\n", rank);
  */

  /* gather row offsets and clear any row mapping */
  if (gA->rowmap.indstart) {
    if (am_root) {
      for (p = 0; p < csize; ++p) {
	counts[p] = gA->rowmap.indstart[p+1] - gA->rowmap.indstart[p];
      }
    }
  } else {
    MPI_Gather (&gA->rowmap.lnind, 1, MPI_INT,
		counts, 1, MPI_INT,
		0, comm);
  }
  if (am_root) {
    disps[0] = 0;
    for (p = 1; p < csize; ++p)
      disps[p] = disps[p-1] + counts[p-1];
  }
  gA->rowmap.lindstart = (am_root? 0 : gA->nr);
  gA->rowmap.lnind = (am_root? gA->nr : 0);
  if (gA->rowmap.indstart) {
    gA->rowmap.indstart[0] = 0;
    for (p = 1; p <= csize; ++p)
      gA->rowmap.indstart[p] = gA->nr;
  }

  /*
  fprintf (stderr, "%d: foo5\n", rank);
  */

  MPI_Gatherv ((local.rowoff? &local.rowoff[1] : NULL), local.nr, MPI_INT,
	       newlocal.rowoff, counts, disps, MPI_INT,
	       0, comm);

  /*
  fprintf (stderr, "%d: foo6\n", rank);
  */

  /* convert row offsets while collecting num. entry info */
  if (am_root) {
    int k;
    int cumsum, nents_p;

    /*
    fprintf (stderr, "%d: foo6.1 %p %p %d %p\n",
	     rank, counts, disps, newlocal.nr, newlocal.rowoff);
    */

    cumsum = 0;
    for (p = 0; p < csize; ++p) {
      const int cnt = counts[p];
      const int off = disps[p];
      nents_p = 0;

      /*
      fprintf (stderr, "%d: foo6.1 cnt=%d  off=%d span=[%d,%d)  nents_p=%d  cumsum=%d\n",
	       rank, cnt, off, off, off+cnt, nents_p, cumsum);
      fprintf (stderr, "%d: foo6.1 newlocal.rowoff[%d and +1] = %d %d\n",
	       rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
      */
      if (cnt > 0) nents_p = newlocal.rowoff[off + cnt - 1];
      for (k = 0; k < cnt; ++k) {
	newlocal.rowoff[off + k] += cumsum;
      }
      counts[p] = nents_p;
      cumsum += nents_p;
      /*
      fprintf (stderr, "%d: foo6.1.1 cnt=%d  off=%d span=[%d,%d)  nents_p=%d  cumsum=%d\n",
	       rank, cnt, off, off, off+cnt, nents_p, cumsum);
      fprintf (stderr, "%d: foo6.1.1 newlocal.rowoff[%d and +1] = %d %d\n",
	       rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
      */
    }
    assert(cumsum == gnent);

    /*
    fprintf (stderr, "%d: foo6.5\n", rank);
    */

    for (k = newlocal.nr; k > 0; --k)
      newlocal.rowoff[k] = newlocal.rowoff[k-1];
    newlocal.rowoff[0] = 0;
    newlocal.nent = newlocal.rowoff[newlocal.nr];
    assert(newlocal.nent = gnent);
    gnent = newlocal.nent;

    disps[0] = 0;
    for (p = 1; p < csize; ++p)
      disps[p] = disps[p-1] + counts[p-1];

    newlocal.colind = malloc (gnent * sizeof(int));
    newlocal.entry = malloc (gnent * sizeof(double));
    if (!newlocal.colind || !newlocal.entry) MPI_Abort(comm, -222018);
  }

  /*
  fprintf (stderr, "%d: foo7 %d %p %p\n", rank, local.nent, counts, disps);
  if (counts) {
    for (p = 0; p < csize; ++p)
      fprintf (stderr, "%d: counts[%d] = %d, disps[%d] = %d\n",
	       rank, p, counts[p], p, disps[p]);
  }
  */

  MPI_Gatherv (local.colind, local.nent, MPI_INT,
	       newlocal.colind, counts, disps, MPI_INT,
	       0, comm);

  MPI_Gatherv (local.entry, local.nent, MPI_DOUBLE,
	       newlocal.entry, counts, disps, MPI_DOUBLE,
	       0, comm);

  /*
  fprintf (stderr, "%d: foo8\n", rank);
  */

  spcsr_free (&gA->local);
  gA->local = newlocal;
  if (counts) free(counts);
  if (disps) free(disps);
  return 0;
}

/* XXX: there must be a prettier way */
static void
make_row_sr_counts (const int rank, const int csize,
		    const int lindstart, const int lnr,
		    const int *indstart,
		    const int *new_indstart,
		    int *scounts, int *sdisps,
		    int *rcounts, int *rdisps,
		    MPI_Comm comm)
{
  int p;
  int total_rows_to_send = lnr;
  int cur_local_row = 0;
  int cumsum;

  for (p = 0; p < csize; ++p) {
    int lrow_incr, nrows_req;
    sdisps[p] = cur_local_row;

    lrow_incr = new_indstart[p+1] - lindstart;

    if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
    if (0 == total_rows_to_send) continue; /* past all recipients */
    assert(total_rows_to_send > 0);

    /* otherwise there's an intersection */
    nrows_req = new_indstart[p+1] - new_indstart[p];
    if (lrow_incr > nrows_req)
      lrow_incr = nrows_req;
    if (lrow_incr > total_rows_to_send)
      lrow_incr = total_rows_to_send;
    total_rows_to_send -= lrow_incr;
    cur_local_row += lrow_incr;
  }
  sdisps[p] = cur_local_row;
  assert(cur_local_row == lnr);
  for (p = 0; p < csize; ++p) scounts[p] = sdisps[p+1] - sdisps[p];

  if (!indstart) {
    MPI_Alltoall (scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm);
    cumsum = 0;
    for (p = 0; p < csize; ++p) {
      rdisps[p] = cumsum;
      cumsum += rcounts[p];
    }
    rdisps[p] = cumsum;
    assert(cumsum == new_indstart[rank+1] - new_indstart[rank]);
  }
  else {
    int new_lindstart = new_indstart[rank];
    int total_rows_to_recv = new_indstart[rank+1] - new_indstart[rank];
    int cur_new_local_row = 0;

    for (p = 0; p < csize; ++p) {
      int lrow_incr, nrows_req;
      rdisps[p] = cur_new_local_row;

      lrow_incr = indstart[p+1] - new_lindstart;

      if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
      if (0 == total_rows_to_recv) continue; /* past all recipients */
      assert(total_rows_to_recv > 0);

      /* otherwise there's an intersection */
      nrows_req = indstart[p+1] - indstart[p];
      if (lrow_incr > nrows_req)
	lrow_incr = nrows_req;
      if (lrow_incr > total_rows_to_recv)
	lrow_incr = total_rows_to_recv;
      total_rows_to_recv -= lrow_incr;
      cur_new_local_row += lrow_incr;
    }
    rdisps[p] = cur_new_local_row;
    assert(cur_new_local_row == new_indstart[rank+1] - new_indstart[rank]);
    for (p = 0; p < csize; ++p) rcounts[p] = rdisps[p+1] - rdisps[p];
  }
}

static void
make_nent_sr_counts (const int rank, const int csize,
		     const int *sdisps,
		     const struct spcsr_t local,
		     int *nent_send, int *nent_recv,
		     MPI_Comm comm)
{
  int p;

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 <= csize; ++p2) {
	fprintf (stderr, "%d: sdisps[%d] = %d\n",
		 rank, p2, sdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  for (p = 0; p < csize; ++p) {
    if (sdisps[p+1] == sdisps[p]) /* local.rowoff may be null */
      nent_send[p] = 0;
    else
      nent_send[p] = local.rowoff[sdisps[p+1]] - local.rowoff[sdisps[p]];
    assert(nent_send[p] >= 0);
    assert(nent_send[p] <= local.nent);
  }

  MPI_Alltoall (nent_send, 1, MPI_INT, nent_recv, 1, MPI_INT, comm);
}

int
gcsr_redist_rowinds (struct gcsr_t *gA, const int *new_indstart, MPI_Comm comm)
{
  int rank, csize, am_root;
  int lindstart, lindend;
  int new_lindstart, new_lindend;
  struct spcsr_t local, new_local;

  int cumsum, p, i;

  int *scounts, *sdisps, *rcounts, *rdisps;

  if (!gA || !new_indstart) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  if (new_indstart[csize] != gA->nr) return -1;

  local = gA->local;

  gcsr_unpack_colinds (gA);

  lindstart = gA->rowmap.lindstart;
  lindend = lindstart + gA->rowmap.lnind;

  new_lindstart = new_indstart[rank];
  new_lindend = new_indstart[rank+1];

  spcsr_init_clear (&new_local);
  new_local.nr = new_indstart[rank+1] - new_indstart[rank];
  new_local.nc = (new_local.nr? gA->nc : 0);
  if (0 != new_local.nr) {
    new_local.rowoff = malloc ((new_local.nr + 1) * sizeof(int));
    if (!new_local.rowoff) MPI_Abort(comm, -3939821);
  } else
    new_local.rowoff = NULL;

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      fprintf(stderr, "%d: local nr old: %d, new: %d\n", rank,
	      local.nr, new_local.nr);
    }
    MPI_Barrier(comm);
  }
  */

  scounts = malloc (4 * (csize+1) * sizeof(int));
  sdisps = scounts + csize + 1;
  rcounts = sdisps + csize + 1;
  rdisps = rcounts + csize + 1;

  make_row_sr_counts (rank, csize, lindstart, local.nr,
		      gA->rowmap.indstart, new_indstart,
		      scounts, sdisps, rcounts, rdisps, comm);

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 < csize; ++p2) {
	fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
		 rank, p2, scounts[p2], sdisps[p2]);
	fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
		 rank, p2, rcounts[p2], rdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  MPI_Alltoallv (local.rowoff, scounts, sdisps, MPI_INT,
		 new_local.rowoff, rcounts, rdisps, MPI_INT,
		 comm);

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  make_nent_sr_counts (rank, csize, sdisps, local, scounts, rcounts, comm);
  /* now scounts and rcounts count _entries_, not rows */

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  /* fixup offsets */
  cumsum = 0;
  for (p = 0; p < csize; ++p) {
    if (rdisps[p] != rdisps[p+1]) { /* else new_local.rowoff may be NULL */
      int firstoff = new_local.rowoff[rdisps[p]];
      for (i = rdisps[p]; i < rdisps[p+1]; ++i) {
	new_local.rowoff[i] -= firstoff;
	new_local.rowoff[i] += cumsum;
      }
      cumsum += rcounts[p];
    }
  }
  new_local.nent = cumsum;
  if (new_local.rowoff) new_local.rowoff[new_local.nr] = cumsum;
  assert( !new_local.rowoff || (0 == new_local.rowoff[0]) );

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  if (cumsum > 0) {
    new_local.colind = malloc (cumsum * sizeof(int));
    new_local.entry = malloc (cumsum * sizeof(double));
    if (!new_local.colind || !new_local.entry) MPI_Abort(comm, -9083);
  } else {
    new_local.colind = NULL;
    new_local.entry = NULL;
  }

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  for (p = 0; p < csize; ++p) {
    if (local.rowoff) sdisps[p] = local.rowoff[sdisps[p]];
    assert(local.rowoff || (sdisps[p] == 0));
    if (new_local.rowoff) rdisps[p] = new_local.rowoff[rdisps[p]];
    assert(new_local.rowoff || (rdisps[p] == 0));
  }

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 < csize; ++p2) {
	fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
		 rank, p2, scounts[p2], sdisps[p2]);
	fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
		 rank, p2, rcounts[p2], rdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  /*
  fprintf (stderr, "%d: here a\n", rank); MPI_Barrier(comm);
  */

  MPI_Alltoallv (local.colind, scounts, sdisps, MPI_INT,
		 new_local.colind, rcounts, rdisps, MPI_INT,
		 comm);

  /*
  fprintf (stderr, "%d: here b\n", rank); MPI_Barrier(comm);
  */

  MPI_Alltoallv (local.entry, scounts, sdisps, MPI_DOUBLE,
		 new_local.entry, rcounts, rdisps, MPI_DOUBLE,
		 comm);

  /*
  fprintf (stderr, "%d: here c\n", rank); MPI_Barrier(comm);
  */

  spcsr_free (&local);
  gA->local = new_local;
  gA->rowmap.lindstart = new_lindstart;
  gA->rowmap.lnind = new_lindend - new_lindstart;
  if (!gA->rowmap.indstart)
    gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
  if (gA->rowmap.indstart) /* if malloc fails, no big deal */
    memcpy (gA->rowmap.indstart, new_indstart, (csize+1)*sizeof(int));
  assert(gA->rowmap.lnind == gA->local.nr);
  free(scounts);

  /*
  fprintf (stderr, "%d: done!\n", rank); MPI_Barrier(comm);
  */

  /*
  for (p = 0; p < csize; ++p) {
    MPI_Barrier(comm);
    if (rank == p) {
      fprintf (stderr, "%d: local (%d, %d; %d)\n",
	       rank, gA->local.nr, gA->local.nc, gA->local.nent);
    }
    MPI_Barrier(comm);
  }
  */

  return 0;
}

int
gcsr_redist_nrows (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart;
  int errflg;

  int p, nr, nrows_per;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  nr = gA->nr;

  new_indstart = malloc ((csize+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303030);

  nrows_per = (nr + csize - 1) / csize;
  if (nrows_per < minrows) nrows_per = minrows;
  new_indstart[0] = 0;
  for (p = 1; p <= csize; ++p) {
    new_indstart[p] = new_indstart[p-1] + nrows_per;
    if (new_indstart[p] > nr) new_indstart[p] = nr;
  }

  errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
  free(new_indstart);
  return errflg;
}

int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart = NULL, *rowdeg = NULL;
  int *rcounts = NULL, *rdisps = NULL;
  int errflg;

  int ents_target, ents_so_far, rows_so_far;

  int i, p, nr;

  int gnent;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2238);
    if (mpicsize != csize) MPI_Abort(comm, -2339);
  }
#endif
  /*
  fprintf (stderr, "%d: baz0\n", rank);
  */

  nr = gA->nr;

  new_indstart = malloc ((csize+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303031);

  /*
  fprintf (stderr, "%d: baz1\n", rank);
  */

  if (am_root) {
    rowdeg = malloc ((1 + gA->nr + 2*csize + 2) * sizeof(int));
    /* XXX: too much memory */
    if (!rowdeg) MPI_Abort(comm, -22292);
    rcounts = rowdeg + 1 + gA->nr;
    rdisps = rcounts + csize;
  }

  /*
  fprintf (stderr, "%d: baz2 %p %p %p\n", rank, rowdeg, rcounts, rdisps);
  */

  MPI_Gather (&gA->local.nr, 1, MPI_INT, rcounts, 1, MPI_INT, 0, comm);
  /* fprintf (stderr, "%d: baz2.1\n", rank); */
  if (am_root) {
    rdisps[0] = 0;
    for (p = 1; p <= csize; ++p) {
      rdisps[p] = rdisps[p-1] + rcounts[p-1];
    }
  }
  /* fprintf (stderr, "%d: baz3\n", rank); */

  MPI_Gatherv (gA->local.rowoff + 1, gA->local.nr, MPI_INT,
	       rowdeg, rcounts, rdisps, MPI_INT,
	       0, comm);
  MPI_Reduce (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);

  /* fprintf (stderr, "%d: baz4\n", rank); */

  if (am_root) {
    /* turn into coldegs */
    for (p = 0; p < csize; ++p) {
      int prevoff, disp, k;
      /* fprintf (stderr, "%d: baz5 rcounts[%d] = %d\n", rank, p, rcounts[p]); */
      if (rcounts[p] <= 1) continue;
      disp = rdisps[p];
      prevoff = rowdeg[disp];
      for (k = 1; k < rcounts[p]; ++k) {
	int tmp = rowdeg[disp+k];
	rowdeg[disp+k] -= prevoff;
	prevoff = tmp;
      }
    }

    /* now split */
    ents_target = (gnent + csize - 1) / csize;

    i = 0;
    for (p = 0; p < csize; ++p) {
      new_indstart[p] = i;
      ents_so_far = rows_so_far = 0;
      for (; i < nr; ++i) {
	ents_so_far += rowdeg[i];
	++rows_so_far;

	if (rows_so_far > minrows && ents_so_far > ents_target)
	  break;
      }
    }
    new_indstart[p] = nr;
    /*
    for (p = 0; p <= csize; ++p) {
      fprintf (stderr, "%d: new_indstart[%d] = %d\n",
	       rank, p, new_indstart[p]);
    }
    */
  }

  /*
  fprintf (stderr, "%d: baz6\n", rank);
  */

  MPI_Bcast (new_indstart, csize+1, MPI_INT, 0, comm);

  /*
  fprintf (stderr, "%d: baz7\n", rank);
  */

  errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
  if (rowdeg) free(rowdeg);
  free(new_indstart);

  /*
  fprintf (stderr, "%d: baz8\n", rank);
  */

  return errflg;
}



#if 0
int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart;
  int errflg;

  int p, nr, nrows_per, nent_per;

  int gnent, nent_start, nent_finish;
  int lneigh, rneigh;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  nr = gA->nr;

  new_indstart = malloc ((p+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303031);

  /* Make an exclusive scan.
     Could use MPI_Exscan; then add local.nent to gnent. */
  MPI_Scan (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, comm);
  lneigh = rank-1;
  if (lneigh < 0) lneigh = MPI_PROC_NULL;
  rneigh = rank+1;
  if (rneigh >= csize) rneigh = MPI_PROC_NULL;
  nent_start = 0;
  MPI_Sendrecv (&gnent, 1, MPI_INT, rneigh, 99992,
		&nent_start, 1, MPI_INT, lneigh, 99992,
		comm, MPI_STATUS_IGNORE);

#if !defined(NDEBUG)
  {
    int gnent_reduct;
    MPI_Allreduce (&gA->local.nent, &gnent_reduct, 1, MPI_INT,
		   MPI_SUM, comm);
    if (rank == csize-1)
      assert(gnent == gnent_reduct);
  }
#endif
  MPI_Bcast (&gnent, 1, MPI_INT, csize-1, comm);

  nent_per = (gnent + csize - 1) / csize;

  /*AUUUGH!  minrows kills everything quickly! */
}
#endif /* until I figure out how to do it in parallel */


static int
compind (const void *p1, const void *p2)
{
  const int a = *(const int*) p1;
  const int b = *(const int*) p2;
  return b - a;
}


void
gcsr_pack_colinds (struct gcsr_t* gA)
{
  struct spcsr_t local;
  int k, nent, nc, lnc;
  int *g2l = NULL, *l2g = NULL, *tmp = NULL;

  if (!gA) return;
  if (0 == gA->local.nr || 0 == gA->local.nc || 0 == gA->local.nent) return;
  /*return;*/

  nc = gA->nc;
  l2g = malloc (nc * sizeof(int));
  g2l = malloc (nc * sizeof(int));
  if (!l2g || !g2l) goto memfree;

  gcsr_unpack_colinds (gA);

  local = gA->local;
  nent = local.nent;

  memset (g2l, -1, nc * sizeof(int));

  lnc = 0;
  for (k = 0; k < nent; ++k) {
    const int j = local.colind[k];
    assert(j >= 0);
    assert(j < nc);
    if (g2l[j] < 0) {
      l2g[lnc] = j;
      g2l[j] = 1;
      ++lnc;
    }
  }

  if (lnc >= (3*nc)/4) goto memfree; /* Don't bother. */

  /*
    Keep in same order as original columns.
    Makes transpose-like ops easier, shouldn't cost much.
  */
  qsort (l2g, lnc, sizeof(int), compind);
  for (k = 0; k < lnc; ++k) {
    const int gj = l2g[k];
    assert(gj >= 0);
    assert(gj < gA->nc);
    g2l[gj] = k;
  }

  tmp = realloc(l2g, lnc * sizeof(int));
  if (tmp) l2g = tmp;

  for (k = 0; k < nent; ++k) {
    const int j = local.colind[k];
    const int new_j = g2l[j];
    assert(j >= 0);
    assert(j < nc);
    assert(new_j >= 0);
    assert(new_j < lnc);
    local.colind[k] = new_j;
  }

  gA->local.nc = lnc;
  gA->colmap.l2g = l2g;
  free (g2l);
  return;

 memfree:
  if (g2l) free (g2l);
  if (l2g) free (l2g);
}


int
gcsr_take_csr_locally (const int gnr, const int gnc,
		       struct spcsr_t *local, struct gcsr_t *gA,
		       int **indstart, MPI_Comm comm)
{
  int rank, csize;
  if (!local || !gA || !indstart || !*indstart) return 1;

  gcsr_init_clear (gA);
  gA->nr = gnr;
  gA->nc = gnc;
  gA->local = *local;
  spcsr_init_clear (local);

  MPI_Comm_size (comm, &csize);
  MPI_Comm_rank (comm, &rank);
  gA->rowmap.p = rank;
  gA->rowmap.np = csize;
  gA->rowmap.indstart = *indstart;
  gA->rowmap.lindstart = gA->rowmap.indstart[rank];
  gA->rowmap.lnind = gA->rowmap.indstart[rank+1] - gA->rowmap.lindstart;
  *indstart = NULL;

  /*
  fprintf (stderr, "%d: locally (%d, %d) (%d, %d; %d)\n", rank,
	   gnr, gnc, gA->local.nr, gA->local.nc, gA->local.nent);
  */

  gcsr_pack_colinds (gA);

  return 0;
}


static int
transpose_matrix_chunks (const struct gcsr_t *gA, const int *indstartT,
			 int *p_rows, int **p_rowind_, int **p_rowoff_,
			 int **p_colind_, double **p_entry_,
			 MPI_Comm comm)
{
  int rank, csize, p;
  int *p_rowind, *p_rowoff, *p_colind;
  double *p_entry;

  int *scounts, *sdisps, *rcounts, *rdisps, ndata_req;

  struct spcsr_t local;
  struct spcsr_t localT;
  int iT, nent, cum_nent;
  int *l2g, *l2g_own = NULL;

  spcsr_init_clear (&localT);

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  local = gA->local;
  scounts = malloc (4 * (csize + 1) * sizeof(int));
  if (!scounts) MPI_Abort(comm, -389762);
  sdisps = scounts + csize + 1;
  rcounts = sdisps + csize + 1;
  rdisps = rcounts + csize + 1;

  l2g = gA->colmap.l2g;
  if (!l2g) {
    int j;
    l2g_own = malloc (local.nc * sizeof(int));
    for (j = 0; j < local.nc; ++j) l2g_own[j] = j;
    l2g = l2g_own;
  }

  /* get p_rows, the map from p -> [rows) */
  memset (scounts, 0, (csize+1) * sizeof(int)); /* rows to send */
  p = 0;
  assert(gA->nc == indstartT[csize]);
  for (iT = 0; iT < local.nc; ++iT) {
    const int giT = (l2g? l2g[iT] : iT);
    assert (giT < gA->nc);
    while (giT >= indstartT[p+1]) ++p; /* the giT are sorted... */
    ++scounts[p];
  }

  /* fprintf (stderr, "%d: boing1\n", rank); */
  MPI_Alltoall (scounts, 1, MPI_INT,   rcounts, 1, MPI_INT,
		comm);
  sdisps[0] = 0;
  for (p = 0; p < csize; ++p) {
    sdisps[p+1] = sdisps[p] + scounts[p];
  }

  ndata_req = 0;
  for (p = 0; p < csize; ++p) {
    p_rows[p] = ndata_req;
    ndata_req += rcounts[p];
  }
  p_rows[csize] = ndata_req; /* total number of rows to be received */




  /*
    now get
      p_rowind: map of packed row -> _global_ row index
      p_rowoff: map of p_row[p] -> [offs), shifted
  */

  p_rowind = malloc ((2 * ndata_req + 1) * sizeof(int));
  if (!p_rowind) MPI_Abort (comm, -467474);
  p_rowoff = p_rowind + ndata_req;
  *p_rowind_ = p_rowind;
  *p_rowoff_ = p_rowoff;

  /* fprintf (stderr, "%d: boing2 %p %p\n", rank, l2g, p_rowind); */
  MPI_Alltoallv (l2g, scounts, sdisps, MPI_INT,
		 p_rowind, rcounts, p_rows, MPI_INT, comm);

  spcsr_transpose (&local, &localT); /* ideally during above */
  /* Make localT's column indices (local's rows) globally numbered */
  assert(local.nent == localT.nent);
  {
    int k;
    const int rowbase = gA->rowmap.lindstart;
    for (k = 0; k < localT.nent; ++k) {
      const int j = localT.colind[k];
      assert(j >= 0);
      assert(j < localT.nc);
      localT.colind[k] = j + rowbase;
      assert(localT.colind[k] < gA->nr);
      /*
      if (localT.entry[k] == 0.0)
	fprintf(stderr, "%d: found zero off %d\n", rank, k);
      */
    }
    localT.nc = gA->nr;
  }

  /* fprintf (stderr, "%d: boing3\n", rank); */
  MPI_Alltoallv (localT.rowoff, scounts, sdisps, MPI_INT,
		 p_rowoff, rcounts, p_rows, MPI_INT, comm);

  /* shift row indices into local space */ /* ideally during above */
  {
    const int lindstart = indstartT[rank];
#if !defined(NDEBUG)
    const int lindend = indstartT[rank+1];
#endif
    int k;
    for (k = 0; k < p_rows[csize]; ++k) {
      assert(p_rowind[k] >= lindstart);
      assert(p_rowind[k] < lindend);
      p_rowind[k] -= lindstart;
    }
  }
  /* fprintf (stderr, "%d: boing4\n", rank); */



  /* need # of entries to unshift p_rowoff and recv colinds and entries */

  /* scounts[p] = # entries to send to p */
  for (p = 0; p < csize; ++p) {
    if (localT.rowoff)
      scounts[p] = localT.rowoff[sdisps[p+1]] - localT.rowoff[sdisps[p]];
    else
      scounts[p] = 0;
    /*
    fprintf (stderr, "%d: sending %d entries (of %d) to %d\n", rank,
	     scounts[p], localT.nent, p);
    */
  }

  /* swap # entries */
  MPI_Alltoall (scounts, 1, MPI_INT,   rcounts, 1, MPI_INT,
		comm);
  sdisps[0] = 0; /* disps for sending colinds and entries */
  for (p = 0; p < csize; ++p) {
    sdisps[p+1] = sdisps[p] + scounts[p];
  }

  /* Note: rdisps is not the same as p_rowoff.  rdisps covers the whole of
     a proc's contributions, p_rowoff is broken into each row within each
     proc.
  */
  nent = 0;
  for (p = 0; p < csize; ++p) {
    rdisps[p] = nent;
    nent += rcounts[p];
  }
  rdisps[csize] = nent;

  /*
  for (p = 0; p < csize; ++p) {
    fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
	     p, scounts[p], sdisps[p]);
    fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
	     p, rcounts[p], rdisps[p]);
  }
  */

  cum_nent = 0;
  for (p = 0; p < csize; ++p) {
    const int firstoff = p_rowoff[p_rows[p]];
    int k;
    assert(cum_nent == rdisps[p]); /* consistent with rdisps */
    for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
      p_rowoff[k] -= firstoff;
      p_rowoff[k] += cum_nent;
    }
    cum_nent += rcounts[p];
  }
  p_rowoff[p_rows[csize]] = cum_nent;
  assert(cum_nent == nent); /* trivially true */

#if !defined(NDEBUG)
  /* check that p_rowoff is self-consistent */
  {
    int k;
    assert(0 == p_rows[0]);
    assert(0 == p_rowoff[0]);
    for (p = 0; p < csize; ++p) {
      assert(p_rows[p] <= p_rows[p+1]);
      for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
	assert(p_rowoff[p] <= p_rowoff[p+1]);
      }
    }
    assert(k == p_rows[csize]);
    assert(cum_nent == p_rowoff[k]);
  }
#endif

  /* now set up packed colind and entry arrays */
  /* fprintf (stderr, "%d: allocing for %d entries\n", rank, nent); */
  p_colind = malloc (nent * sizeof(int));
  p_entry = malloc (nent * sizeof(double));
  if (!p_colind || !p_entry) MPI_Abort(comm, -74423);
  *p_colind_ = p_colind;
  *p_entry_ = p_entry;

  /*
  for (p = 0; p < csize; ++p) {
    fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
	     p, scounts[p], sdisps[p]);
    fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
	     p, rcounts[p], rdisps[p]);
  }
  */

  MPI_Alltoallv (localT.colind, scounts, sdisps, MPI_INT,
		 p_colind, rcounts, rdisps, MPI_INT,
		 comm);

  MPI_Alltoallv (localT.entry, scounts, sdisps, MPI_DOUBLE,
		 p_entry, rcounts, rdisps, MPI_DOUBLE,
		 comm);

  /*memcpy (p_entry, localT.entry, nent*sizeof(double));*/

  /*
  {
    int zz;
    fprintf (stderr, "%d: nent = %d\n", rank, nent);
    for (zz = 0; zz < nent; ++zz) {
      if (p_entry[zz] == 0.0)
	fprintf (stderr, "%d: zero entry off %d\n", rank, zz);
    }
  }
  */

  if (l2g_own) free (l2g_own);
  if (scounts) free (scounts);
  spcsr_free (&localT);
  return 0;
}

static int
collapse_matrix_chunks (const int csize,
			const int *p_rows, const int *p_rowind,
			const int *p_rowoff,
			const int *p_colind, const double *p_entry,
			struct spcsr_t *local)
{
  int *first = NULL;
  const int nr = local->nr;
#if !defined(NDEBUG)
  const int nc = local->nc;
#endif
  int i, rdeg, p;

  int *rowoff = local->rowoff;
  int *colind = local->colind;
  double *entry = local->entry;

#if !defined(NDEBUG)
  int *ci = NULL;
#endif

  /*
  {
    int rank, k, zz;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    fprintf (stderr, "%d: p_rows[%d]: [%d, %d)\n", rank,
	     0, p_rows[0], p_rows[1]);
    fprintf (stderr, "%d:   p_rowind: ", rank);
    zz = 0;
    for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
      if (zz >= 10) { fprintf (stderr, "..."); break; }
      fprintf(stderr, " %d", p_rowind[k]);
    }
    fprintf(stderr, "\n%d:   p_rowoff: ", rank);
    zz = 0;
    for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
      if (zz >= 10) { fprintf (stderr, "..."); break; }
      fprintf(stderr, " %d", p_rowoff[k]);
    }
    fprintf (stderr, "\n");
  }
  */


  first = malloc ( csize * sizeof(int));
  if (!first) return -1;
  for (p = 0; p < csize; ++p) first[p] = p_rows[p];

#if !defined (NDEBUG)
  ci = malloc ( nc * sizeof(int) );
  memset (ci, -1, nc * sizeof(int));
#endif

  rdeg = 0;
  for (i = 0; i < nr; ++i) {
    int p;
    rowoff[i] = rdeg;
    for (p = 0; p < csize; ++p) {
      const int lastp = p_rows[p+1];
      int rioff = first[p], k;

      while (rioff < lastp && i < p_rowind[rioff]) ++rioff;
      first[p] = rioff;
      if (lastp == rioff || i != p_rowind[rioff]) continue;
      ++first[p];

      /* Now at row i, so scatter row it */
      /* scatter is trivial as there's no overlap */
      /*  debug var ci checks for overlap */
      for (k = p_rowoff[rioff]; k < p_rowoff[rioff+1]; ++k) {
	const int j = p_colind[k];
	const double e = p_entry[k];

	assert (j >= 0);
	assert (j < nc);
	assert (ci[j] < 0);

#if !defined(NDEBUG)
	ci[j] = rdeg;
#endif
	colind[rdeg] = j;
	/*
	if (0.0 == e) {
	  int rank;
	  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	  fprintf(stderr, "%d: moving a zero (%d,%d) off %d\n", rank, i, j, rdeg);
	}
	*/
	entry[rdeg] = e;
	++rdeg;
      }
    }
#if !defined(NDEBUG)
    { /* clear ci */
      int k;
      for (k = rowoff[i]; k < rdeg; ++k) {
	const int j = colind[k];
	assert (j >= 0);
	assert (j < nc);
	assert (ci[j] >= 0);
	ci[j] = -1;
      }
    }
#endif
  }
  rowoff[i] = rdeg;
  /*
  {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    fprintf (stderr, "%d: rowoff[%d] = %d (%d)\n", rank, i, rdeg, local->nent);
  }
  */

#if !defined(NDEBUG)
  {
    int rank, csize;
    int *seenr = NULL, *seenc = NULL;
    int i, k;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &csize);
    if (1 == csize) {
      seenr = calloc(nr+nc, sizeof(int));
      seenc = seenr + nr;
    }
    assert(rowoff[0] == 0);
    assert(rowoff[nr] == local->nent);
    for (i = 0; i < nr; ++i) {
      assert(rowoff[i] >= 0);
      assert(rowoff[i] <= rowoff[i+1]);
      if (seenr) if (rowoff[i] != rowoff[i+1]) seenr[i] = 1;
      for (k = rowoff[i]; k < rowoff[i+1]; ++k) {
	const int j = colind[k];
	if (seenc) seenc[j] = 1;
	assert(j >= 0);
	assert(j < nc);
      }
    }
    if (seenr)
      for (i = 0; i < nr; ++i) {
	if (!seenr[i])
	  fprintf (stderr, "%d: seenr[%i] missing\n", rank, i);
      }
    if (seenc)
      for (i = 0; i < nc; ++i) {
	if (!seenc[i])
	  fprintf (stderr, "%d: seenc[%i] missing\n", rank, i);
      }
    if (seenr) free (seenr);
    /* fprintf (stderr, "%d: done with (%d, %d)\n", rank, nr, nc);*/
  }
#endif

  if (first) free (first);
#if !defined(NDEBUG)
  if (ci) free (ci);
#endif
  return 0;
}


void
gcsr_transpose (struct gcsr_t *gA, struct gcsr_t *gAT,
		const int *indstartT_in, MPI_Comm comm)
{
  int rank, csize;
  struct spcsr_t localT;
  const int *indstartT = NULL;
  int *indstartT_own = NULL;

  int *p_rows = NULL, *p_rowoff = NULL, *p_rowind = NULL, *p_colind = NULL;
  double *p_entry = NULL;

  int nent;

  if (!gA || !gAT) return;
  spcsr_init_clear (&localT);
  rank = gA->rowmap.p;
  csize = gA->rowmap.np;

  /* build indstartT, and indstartT_own for transpose's map */
  gcsr_expand_rowmap (gA, comm);
  indstartT_own = malloc ((csize+1) * sizeof(int));
  if (!indstartT_own) MPI_Abort(comm, -393927);
  if (indstartT_in) {
    indstartT = indstartT_in;
    memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
  } else {
    indstartT = gA->rowmap.indstart;
    memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
  }

  p_rows = malloc ((csize+1) * sizeof(int));
  if (!p_rows) MPI_Abort (comm, 643443);


  if (transpose_matrix_chunks (gA, indstartT,
			       p_rows, &p_rowind, &p_rowoff,
			       &p_colind, &p_entry, comm))
    MPI_Abort (comm, 101028);

  nent = p_rowoff[p_rows[csize]];
#if !defined(NDEBUG)
  {
    int gnentin[2];
    int gnent[2];
    gnentin[0] = gA->local.nent;
    gnentin[1] = nent;
    MPI_Allreduce (&gnentin, &gnent, 2, MPI_INT, MPI_SUM, comm);
    assert(gnent[0] == gnent[1]);
  }
#endif

  localT.nc = gA->nr;
  localT.nr = indstartT[rank+1] - indstartT[rank];
  localT.nent = nent;
  if (spcsr_alloc_ (localT.nr, localT.nc, localT.nent,
		    &localT.rowoff, &localT.colind, &localT.entry,
		    NULL))
    MPI_Abort (comm, 653747);

  if (collapse_matrix_chunks (csize, p_rows, p_rowind, p_rowoff,
			      p_colind, p_entry, &localT))
    MPI_Abort (comm, 456746);

  /*
  {
    int p;
    for (p = 0; p < csize; ++p) {
      MPI_Barrier(MPI_COMM_WORLD);
      if (p == rank) {
	printf("%d: gA(%d, %d), lA(%d, %d; %d) rows [%d, %d)\n",
	       rank, gA->nr, gA->nc, localT.nr, localT.nc, localT.nent,
	       indstartT[rank], indstartT[rank+1]);
	printf ("%d: %d %d %d\n", rank, localT.rowoff[0], localT.rowoff[1], localT.rowoff[2]);
      }
      MPI_Barrier(MPI_COMM_WORLD);
    }
  }
  */

  if (gcsr_take_csr_locally (gA->nc, gA->nr, &localT, gAT,
			     &indstartT_own, comm))
    MPI_Abort (comm, 89372);

#if !defined(NDEBUG)
  {
    int i, j, k;
    int *seenr, *seenc;
    int *rootseenr = NULL, *rootseenc;
    seenr = calloc (gA->nr + gA->nc, sizeof(int));
    seenc = seenr + gA->nr;
    if (0 == rank) {
      rootseenr = calloc (gA->nr + gA->nc, sizeof(int));
      rootseenc = rootseenr + gA->nr;
    }
    for (i = 0; i < gA->local.nr; ++i) {
      const int gi = i + gA->rowmap.lindstart;
      if (gA->local.rowoff[i] != gA->local.rowoff[i+1])
	seenr[gi] = 1;
      for (k = gA->local.rowoff[i]; k < gA->local.rowoff[i+1]; ++k) {
	j = gA->local.colind[k];
	if (gA->colmap.l2g)
	  seenc[gA->colmap.l2g[j]] = 1;
	else
	  seenc[j] = 1;
      }
    }

    MPI_Reduce (seenr, rootseenr, gA->nr + gA->nc, MPI_INT, MPI_MAX, 0, comm);
    if (0 == rank) {
      for (i = 0; i < gA->nr; ++i)
	if (!rootseenr[i])
	  fprintf (stderr, "%d:: no one sees row %d\n", rank, i);
      for (j = 0; j < gA->nc; ++j)
	if (!rootseenc[j])
	  fprintf (stderr, "%d:: no one sees col %d\n", rank, j);
    }
    free(seenr);
    if (rootseenr) free(rootseenr);
  }
#endif

  if (p_rows) free (p_rows);
  if (p_rowind) free (p_rowind);
  if (p_colind) free (p_colind);
  if (p_entry) free (p_entry);
}

int
gcsr_copy (const struct gcsr_t *gA, struct gcsr_t *gAcopy, MPI_Comm comm)
{
  gcsr_init_clear (gAcopy);
  spcsr_copy (&gA->local, &gAcopy->local);
  gAcopy->nr = gA->nr;
  gAcopy->nc = gA->nc;
  gAcopy->rowmap.p = gA->rowmap.p;
  gAcopy->rowmap.np = gA->rowmap.np;
  gAcopy->rowmap.lindstart = gA->rowmap.lindstart;
  gAcopy->rowmap.lnind = gA->rowmap.lnind;
  gAcopy->rowmap.indstart = NULL;
  if (gA->rowmap.indstart) {
    gAcopy->rowmap.indstart = malloc ((gA->rowmap.np + 1) * sizeof(int));
    if (gAcopy->rowmap.indstart)
      memcpy (gAcopy->rowmap.indstart, gA->rowmap.indstart,
	      (gA->rowmap.np + 1) * sizeof(int));
  }
  if (gA->colmap.l2g) {
    gAcopy->colmap.l2g = malloc (gA->nc * sizeof(int));
    if (gAcopy->colmap.l2g)
      memcpy(gAcopy->colmap.l2g, gA->colmap.l2g, gA->nc * sizeof(int));
    else {
      MPI_Abort (comm, 934743); /* could just reindex to global */
      return ENOMEM;
    }
  }
  return 0;
}

void
gcsr_transpose_onroot (struct gcsr_t *gA, struct gcsr_t *gAT,
		       const int *indstartT_in, MPI_Comm comm)
{
  if (gcsr_copy (gA, gAT, comm)) MPI_Abort(comm, 998701);
  if (gcsr_redist_root (gAT, comm)) MPI_Abort(comm, 998702);

  if (0 == gA->rowmap.p) {
    struct spcsr_t localT;
    if (spcsr_transpose (&gAT->local, &localT)) MPI_Abort(comm, 998703);
    spcsr_free(&gAT->local);
    gAT->local = localT;
  }

  gAT->nr = gA->nc;
  gAT->nc = gA->nr;

  if (indstartT_in) {
    gcsr_redist_rowinds (gAT, indstartT_in, comm);
  }
  else {
    gcsr_expand_rowmap (gA, comm);
    gcsr_redist_rowinds (gAT, gA->rowmap.indstart, comm);
  }
}


Mode Type Size Ref File
100644 blob 442 86dd3c214f1a734e4a55303fc785d6eb63ed23d9 .gitignore
100644 blob 733 f571749ef02b737310e27e981218650fbcce8be6 Make.ejr-citris
100644 blob 478 5fb839dab7306b59d49e9a8626e1ca89dcea26db Make.ejr-home-gcc
100644 blob 205 a9b00f1db5934084c2751081636cf80d078b858b Make.seaborg
100644 blob 3307 f715c36bf2ae4a4f455cc7177350dba21c21c1ca Makefile
100644 blob 43442 c184552cceaac75622af1b01c27fd73172448ff6 auction.c
100644 blob 3016 a574b362b4d966599d6720ca0439d385a92084e1 auction.h
100644 blob 302 6995992eb49553aadebe4dddd1a3a74eacf4f332 cacheblast.c
100644 blob 3386 2b76fb3e0a5bb941d13e1ff6f4b2cf152af929e3 colcomp.c
100644 blob 582 3afbfafab8b4b83786e48a66cf20df7eb1feaf8c convmats.m
100644 blob 2189 fba47c6f3908699da08dfac9469d296af62dc234 lsame.c
100644 blob 6357 de79a078bc59834980be9a99d01eb72f3f39cab0 matchmtx.c
040000 tree - 3fcccf0fc51adf74ea4fc6c501e65306ada6fdd1 mc64
040000 tree - c1a5dab1b4b8999153b6d04dae4624af7038dcec mlb
100644 blob 6344 74992c1d4736ff84fdcdcf2b1be7f4eaf7de3b63 mpitransposebin.c
100644 blob 6351 f4beee0452662b6f5bc6d79b839faec02ce6802e mpitransposebin_root.c
100644 blob 1092028 89dad69d7b2de3e183bfc343e98e1943a9d34e16 orani678.bin
100644 blob 19852 8ab7953ae5f8d31b10c483ceae57098f7531abb5 parauction.c
100644 blob 19456 3da978418d76db4c6cc57635646b473e8c5b9445 parauction_allgather.c
100644 blob 14875 102e4d81d979f10c3b3ac8cdac1b257527b324ff parauction_allreduce.c
100644 blob 19123 581100799f474a87d2a1745e09fe07ea7475a84b parauction_root.c
100644 blob 1154 2fc93276b1723c805e82d0e9eb4781184bfcca2e readbin.c
100644 blob 864 5f791479f6982f22101a5094efb0fa0947b54310 readbin.m
100644 blob 696 6151ee6074636c5bdd0d5f34ee4d9b0c76e0501d readmatch.m
100644 blob 1182 8b0bfcc7b6bac0c7f701ae317cbeeff5feec512a redist1.c
100644 blob 1412 4dae58f785498844725f72c26f70621b9287ad1e redist2.c
100644 blob 3002 f41bc38e57d9c613a6426045d0ea7ac22bd339de redist3.c
100644 blob 2903 857ee46542a70759476e2a7437e900c32f12001c redist4.c
100644 blob 6659 ea56d6005d1762a46ff330eebe26e575d9f7a2d9 seqauction.c
100644 blob 15392 9fac5aac4170c6cd7d9c2327911a531ea7950dad smut.c
100644 blob 2338 06f2519a40af810ecfaa5e250459109aed6ee642 smut.h
100644 blob 13867 4344501525923bf457618dcdba1993607caba923 stripe_auction.c
100644 blob 1771 f709206b1e373cf8c7a3c3d3fb702b4bae72d435 stripe_auction.h
100644 blob 39548 5389671d60b8e2ddd1f9360551377d35c8ccdbd2 stripesmut.c
100644 blob 1402 075d97c492be1e6546e24d4a330566981616ac0e stripesmut.h
100644 blob 627 9909b624922275b657ab5d66774f4c40c8beb760 timer.c
100644 blob 405 46eb6c267780afd6b1ee7816a42332cdf79d2683 timer.h
100644 blob 942 d254d606053141bc0a6ffdf714a436980900c028 transposebin.c
100644 blob 697 3728ae018c10cf4308bce2388b75bafe4be41942 writebin.c
100644 blob 1159 87b57c3343da1b9296045ce1d2b2441b490f81d7 writebin.m
Hints:
Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/ejr/matchpres

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/ejr/matchpres

Clone this repository using git:
git clone git://git.rocketgit.com/user/ejr/matchpres

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main