ejr / matchpres (public) (License: BSD) (since 2018-08-29) (hash sha1)
Ye olde thesis code implementing an auction-based weighted bipartite matching algorithm over MPI.

/stripesmut.c (5389671d60b8e2ddd1f9360551377d35c8ccdbd2) (39548 bytes) (mode 100644) (type blob)

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#include <assert.h>

#include <mpi.h>

#define SMUT_NO_IO
#include "smut.h"
#undef SMUT_NO_IO
#include "stripesmut.h"

void
gcsr_init_clear (struct gcsr_t *gA)
{
  if (!gA) return;
  gA->nr = gA->nc  = 0;
  spcsr_init_clear (&gA->local);
  gA->rowmap.p = gA->rowmap.np = 0;
  gA->rowmap.lindstart = gA->rowmap.lnind = 0;
  gA->rowmap.indstart = NULL;
  gA->colmap.l2g = NULL;
}

void
gcsr_free (struct gcsr_t *gA, MPI_Comm comm)
{
  if (gA->colmap.l2g) free (gA->colmap.l2g);
  if (gA->rowmap.indstart) free (gA->rowmap.indstart);
  spcsr_free (&gA->local);
  gcsr_init_clear (gA);
#if !defined(NDEBUG)
  MPI_Barrier (comm);
#endif
}

void
gcsr_expand_rowmap (struct gcsr_t *gA, MPI_Comm comm)
{
  int rank, csize;
  int *indstart;
#if !defined(NDEBUG)
  int p;
#endif

  if (!gA || gA->rowmap.indstart) return;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  indstart = malloc ((csize+1) * sizeof(int));
  if (!indstart) MPI_Abort(comm, -8922);

  MPI_Allgather (&gA->rowmap.lindstart, 1, MPI_INT,
		 indstart, 1, MPI_INT,
		 comm);
  indstart[csize] = gA->nr;
#if !defined(NDEBUG)
  assert(0 == indstart[0]);
  for (p = 1; p < csize; ++p) {
    assert(indstart[p] >= 0);
    assert(indstart[p] <= indstart[p+1]);
  }
#endif
  gA->rowmap.indstart = indstart;
}

int
gcsr_take_csr_root (struct spcsr_t *A, struct gcsr_t *gA, MPI_Comm comm)
{
  int rank, csize, p;
  int errflg = 0;
  int tmpdata[2];

  if (!gA) return 1;
  {
    int initflg = 0;
    errflg = MPI_Initialized(&initflg);
    if (MPI_SUCCESS != errflg) return errflg;
    if (!initflg) return 1;
  }

  MPI_Comm_size (comm, &csize);
  MPI_Comm_rank (comm, &rank);

  if (0 == rank && !A) MPI_Abort(comm, EINVAL);

  gA->rowmap.p = rank;
  gA->rowmap.np = csize;
  gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
  if (!gA->rowmap.indstart) MPI_Abort(comm, ENOMEM);

  if (0 == rank) {
    gA->local = *A;
    tmpdata[0] = A->nr;
    tmpdata[1] = A->nc;
  }
  else {
    spcsr_init_clear (&gA->local);
  }
  MPI_Bcast (tmpdata, 2, MPI_INT, 0, comm);
  gA->nr = tmpdata[0];
  gA->nc = tmpdata[1];

  gA->rowmap.lindstart = (rank == 0? 0 : gA->nr);
  gA->rowmap.lnind = (rank == 0? gA->nr : 0);
  gA->rowmap.indstart[0] = 0;
  for (p = 1; p <= csize; ++p)
    gA->rowmap.indstart[p] = gA->nr;

  gA->colmap.l2g = NULL;

  spcsr_init_clear (A);
  return 0;
}

int
gcsr_give_csr_root (struct gcsr_t *gA, struct spcsr_t *A, MPI_Comm comm)
{
  int errflg;

  errflg = gcsr_redist_root (gA, comm);
  if (errflg) return errflg;

  *A = gA->local;
  spcsr_init_clear (&gA->local);
  gcsr_free (gA, comm);

  return 0;
}

void
gcsr_unpack_colinds (struct gcsr_t *gA)
{
  struct spcsr_t local;
  int *l2g, k;
  if (!gA || !gA->colmap.l2g)
    return;
  l2g = gA->colmap.l2g;
  local=gA->local;

  for (k = 0; k < local.nent; ++k) {
    const int j = local.colind[k];
    assert (j >= 0);
    assert (j < local.nc);
    local.colind[k] = l2g[j];
  }

  gA->colmap.l2g = NULL;
  free(l2g);
  gA->local.nc = gA->nc;
}

int
gcsr_redist_root (struct gcsr_t* gA, MPI_Comm comm)
{
  int rank, csize, am_root;
  struct spcsr_t local;
  int *counts = NULL;
  int *disps = NULL;
  int gnent;
  struct spcsr_t newlocal;

  int p;

  /* get # entries per proc */
  /* realloc local to fit */
  /* MPI_Gatherv */
  /* local freeing */

  if (!gA || !gA->nr || !gA->nc) return 0;

  spcsr_init_clear (&newlocal);

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -22);
    if (mpicsize != csize) MPI_Abort(comm, -23);
  }
#endif

  if (am_root) {
    counts = malloc (csize * sizeof(int));
    disps = malloc (csize * sizeof(int));
    if (!counts || !disps) MPI_Abort(comm, -398202);
  }

  /*
  fprintf (stderr, "%d: foo1\n", rank);
  */

  local = gA->local;

  gcsr_unpack_colinds (gA);

  /*
  fprintf (stderr, "%d: foo2\n", rank);
  */

  /* start setting up root's newlocal */
  if (am_root) {
    gnent = 0;
    newlocal.nr = gA->nr;
    newlocal.nc = gA->nc;
    newlocal.rowoff = malloc ( (newlocal.nr+1) * sizeof(int) );
    if (!newlocal.rowoff) MPI_Abort(comm, -348362);
  }

  /*
  fprintf (stderr, "%d: foo3\n", rank);
  */

#if !defined(NDEBUG)
  /* Double-check the number of entries in a few places*/
  MPI_Reduce (&local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);
#endif

  /*
  fprintf (stderr, "%d: foo4\n", rank);
  */

  /* gather row offsets and clear any row mapping */
  if (gA->rowmap.indstart) {
    if (am_root) {
      for (p = 0; p < csize; ++p) {
	counts[p] = gA->rowmap.indstart[p+1] - gA->rowmap.indstart[p];
      }
    }
  } else {
    MPI_Gather (&gA->rowmap.lnind, 1, MPI_INT,
		counts, 1, MPI_INT,
		0, comm);
  }
  if (am_root) {
    disps[0] = 0;
    for (p = 1; p < csize; ++p)
      disps[p] = disps[p-1] + counts[p-1];
  }
  gA->rowmap.lindstart = (am_root? 0 : gA->nr);
  gA->rowmap.lnind = (am_root? gA->nr : 0);
  if (gA->rowmap.indstart) {
    gA->rowmap.indstart[0] = 0;
    for (p = 1; p <= csize; ++p)
      gA->rowmap.indstart[p] = gA->nr;
  }

  /*
  fprintf (stderr, "%d: foo5\n", rank);
  */

  MPI_Gatherv ((local.rowoff? &local.rowoff[1] : NULL), local.nr, MPI_INT,
	       newlocal.rowoff, counts, disps, MPI_INT,
	       0, comm);

  /*
  fprintf (stderr, "%d: foo6\n", rank);
  */

  /* convert row offsets while collecting num. entry info */
  if (am_root) {
    int k;
    int cumsum, nents_p;

    /*
    fprintf (stderr, "%d: foo6.1 %p %p %d %p\n",
	     rank, counts, disps, newlocal.nr, newlocal.rowoff);
    */

    cumsum = 0;
    for (p = 0; p < csize; ++p) {
      const int cnt = counts[p];
      const int off = disps[p];
      nents_p = 0;

      /*
      fprintf (stderr, "%d: foo6.1 cnt=%d  off=%d span=[%d,%d)  nents_p=%d  cumsum=%d\n",
	       rank, cnt, off, off, off+cnt, nents_p, cumsum);
      fprintf (stderr, "%d: foo6.1 newlocal.rowoff[%d and +1] = %d %d\n",
	       rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
      */
      if (cnt > 0) nents_p = newlocal.rowoff[off + cnt - 1];
      for (k = 0; k < cnt; ++k) {
	newlocal.rowoff[off + k] += cumsum;
      }
      counts[p] = nents_p;
      cumsum += nents_p;
      /*
      fprintf (stderr, "%d: foo6.1.1 cnt=%d  off=%d span=[%d,%d)  nents_p=%d  cumsum=%d\n",
	       rank, cnt, off, off, off+cnt, nents_p, cumsum);
      fprintf (stderr, "%d: foo6.1.1 newlocal.rowoff[%d and +1] = %d %d\n",
	       rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
      */
    }
    assert(cumsum == gnent);

    /*
    fprintf (stderr, "%d: foo6.5\n", rank);
    */

    for (k = newlocal.nr; k > 0; --k)
      newlocal.rowoff[k] = newlocal.rowoff[k-1];
    newlocal.rowoff[0] = 0;
    newlocal.nent = newlocal.rowoff[newlocal.nr];
    assert(newlocal.nent = gnent);
    gnent = newlocal.nent;

    disps[0] = 0;
    for (p = 1; p < csize; ++p)
      disps[p] = disps[p-1] + counts[p-1];

    newlocal.colind = malloc (gnent * sizeof(int));
    newlocal.entry = malloc (gnent * sizeof(double));
    if (!newlocal.colind || !newlocal.entry) MPI_Abort(comm, -222018);
  }

  /*
  fprintf (stderr, "%d: foo7 %d %p %p\n", rank, local.nent, counts, disps);
  if (counts) {
    for (p = 0; p < csize; ++p)
      fprintf (stderr, "%d: counts[%d] = %d, disps[%d] = %d\n",
	       rank, p, counts[p], p, disps[p]);
  }
  */

  MPI_Gatherv (local.colind, local.nent, MPI_INT,
	       newlocal.colind, counts, disps, MPI_INT,
	       0, comm);

  MPI_Gatherv (local.entry, local.nent, MPI_DOUBLE,
	       newlocal.entry, counts, disps, MPI_DOUBLE,
	       0, comm);

  /*
  fprintf (stderr, "%d: foo8\n", rank);
  */

  spcsr_free (&gA->local);
  gA->local = newlocal;
  if (counts) free(counts);
  if (disps) free(disps);
  return 0;
}

/* XXX: there must be a prettier way */
static void
make_row_sr_counts (const int rank, const int csize,
		    const int lindstart, const int lnr,
		    const int *indstart,
		    const int *new_indstart,
		    int *scounts, int *sdisps,
		    int *rcounts, int *rdisps,
		    MPI_Comm comm)
{
  int p;
  int total_rows_to_send = lnr;
  int cur_local_row = 0;
  int cumsum;

  for (p = 0; p < csize; ++p) {
    int lrow_incr, nrows_req;
    sdisps[p] = cur_local_row;

    lrow_incr = new_indstart[p+1] - lindstart;

    if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
    if (0 == total_rows_to_send) continue; /* past all recipients */
    assert(total_rows_to_send > 0);

    /* otherwise there's an intersection */
    nrows_req = new_indstart[p+1] - new_indstart[p];
    if (lrow_incr > nrows_req)
      lrow_incr = nrows_req;
    if (lrow_incr > total_rows_to_send)
      lrow_incr = total_rows_to_send;
    total_rows_to_send -= lrow_incr;
    cur_local_row += lrow_incr;
  }
  sdisps[p] = cur_local_row;
  assert(cur_local_row == lnr);
  for (p = 0; p < csize; ++p) scounts[p] = sdisps[p+1] - sdisps[p];

  if (!indstart) {
    MPI_Alltoall (scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm);
    cumsum = 0;
    for (p = 0; p < csize; ++p) {
      rdisps[p] = cumsum;
      cumsum += rcounts[p];
    }
    rdisps[p] = cumsum;
    assert(cumsum == new_indstart[rank+1] - new_indstart[rank]);
  }
  else {
    int new_lindstart = new_indstart[rank];
    int total_rows_to_recv = new_indstart[rank+1] - new_indstart[rank];
    int cur_new_local_row = 0;

    for (p = 0; p < csize; ++p) {
      int lrow_incr, nrows_req;
      rdisps[p] = cur_new_local_row;

      lrow_incr = indstart[p+1] - new_lindstart;

      if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
      if (0 == total_rows_to_recv) continue; /* past all recipients */
      assert(total_rows_to_recv > 0);

      /* otherwise there's an intersection */
      nrows_req = indstart[p+1] - indstart[p];
      if (lrow_incr > nrows_req)
	lrow_incr = nrows_req;
      if (lrow_incr > total_rows_to_recv)
	lrow_incr = total_rows_to_recv;
      total_rows_to_recv -= lrow_incr;
      cur_new_local_row += lrow_incr;
    }
    rdisps[p] = cur_new_local_row;
    assert(cur_new_local_row == new_indstart[rank+1] - new_indstart[rank]);
    for (p = 0; p < csize; ++p) rcounts[p] = rdisps[p+1] - rdisps[p];
  }
}

static void
make_nent_sr_counts (const int rank, const int csize,
		     const int *sdisps,
		     const struct spcsr_t local,
		     int *nent_send, int *nent_recv,
		     MPI_Comm comm)
{
  int p;

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 <= csize; ++p2) {
	fprintf (stderr, "%d: sdisps[%d] = %d\n",
		 rank, p2, sdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  for (p = 0; p < csize; ++p) {
    if (sdisps[p+1] == sdisps[p]) /* local.rowoff may be null */
      nent_send[p] = 0;
    else
      nent_send[p] = local.rowoff[sdisps[p+1]] - local.rowoff[sdisps[p]];
    assert(nent_send[p] >= 0);
    assert(nent_send[p] <= local.nent);
  }

  MPI_Alltoall (nent_send, 1, MPI_INT, nent_recv, 1, MPI_INT, comm);
}

int
gcsr_redist_rowinds (struct gcsr_t *gA, const int *new_indstart, MPI_Comm comm)
{
  int rank, csize, am_root;
  int lindstart, lindend;
  int new_lindstart, new_lindend;
  struct spcsr_t local, new_local;

  int cumsum, p, i;

  int *scounts, *sdisps, *rcounts, *rdisps;

  if (!gA || !new_indstart) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  if (new_indstart[csize] != gA->nr) return -1;

  local = gA->local;

  gcsr_unpack_colinds (gA);

  lindstart = gA->rowmap.lindstart;
  lindend = lindstart + gA->rowmap.lnind;

  new_lindstart = new_indstart[rank];
  new_lindend = new_indstart[rank+1];

  spcsr_init_clear (&new_local);
  new_local.nr = new_indstart[rank+1] - new_indstart[rank];
  new_local.nc = (new_local.nr? gA->nc : 0);
  if (0 != new_local.nr) {
    new_local.rowoff = malloc ((new_local.nr + 1) * sizeof(int));
    if (!new_local.rowoff) MPI_Abort(comm, -3939821);
  } else
    new_local.rowoff = NULL;

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      fprintf(stderr, "%d: local nr old: %d, new: %d\n", rank,
	      local.nr, new_local.nr);
    }
    MPI_Barrier(comm);
  }
  */

  scounts = malloc (4 * (csize+1) * sizeof(int));
  sdisps = scounts + csize + 1;
  rcounts = sdisps + csize + 1;
  rdisps = rcounts + csize + 1;

  make_row_sr_counts (rank, csize, lindstart, local.nr,
		      gA->rowmap.indstart, new_indstart,
		      scounts, sdisps, rcounts, rdisps, comm);

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 < csize; ++p2) {
	fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
		 rank, p2, scounts[p2], sdisps[p2]);
	fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
		 rank, p2, rcounts[p2], rdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  MPI_Alltoallv (local.rowoff, scounts, sdisps, MPI_INT,
		 new_local.rowoff, rcounts, rdisps, MPI_INT,
		 comm);

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  make_nent_sr_counts (rank, csize, sdisps, local, scounts, rcounts, comm);
  /* now scounts and rcounts count _entries_, not rows */

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  /* fixup offsets */
  cumsum = 0;
  for (p = 0; p < csize; ++p) {
    if (rdisps[p] != rdisps[p+1]) { /* else new_local.rowoff may be NULL */
      int firstoff = new_local.rowoff[rdisps[p]];
      for (i = rdisps[p]; i < rdisps[p+1]; ++i) {
	new_local.rowoff[i] -= firstoff;
	new_local.rowoff[i] += cumsum;
      }
      cumsum += rcounts[p];
    }
  }
  new_local.nent = cumsum;
  if (new_local.rowoff) new_local.rowoff[new_local.nr] = cumsum;
  assert( !new_local.rowoff || (0 == new_local.rowoff[0]) );

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  if (cumsum > 0) {
    new_local.colind = malloc (cumsum * sizeof(int));
    new_local.entry = malloc (cumsum * sizeof(double));
    if (!new_local.colind || !new_local.entry) MPI_Abort(comm, -9083);
  } else {
    new_local.colind = NULL;
    new_local.entry = NULL;
  }

  /*
  fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
  */

  for (p = 0; p < csize; ++p) {
    if (local.rowoff) sdisps[p] = local.rowoff[sdisps[p]];
    assert(local.rowoff || (sdisps[p] == 0));
    if (new_local.rowoff) rdisps[p] = new_local.rowoff[rdisps[p]];
    assert(new_local.rowoff || (rdisps[p] == 0));
  }

  /*
  for (p = 0; p < csize; ++p) {
    int p2;
    MPI_Barrier(comm);
    if (rank == p) {
      for (p2 = 0; p2 < csize; ++p2) {
	fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
		 rank, p2, scounts[p2], sdisps[p2]);
	fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
		 rank, p2, rcounts[p2], rdisps[p2]);
      }
    }
    MPI_Barrier(comm);
  }
  */

  /*
  fprintf (stderr, "%d: here a\n", rank); MPI_Barrier(comm);
  */

  MPI_Alltoallv (local.colind, scounts, sdisps, MPI_INT,
		 new_local.colind, rcounts, rdisps, MPI_INT,
		 comm);

  /*
  fprintf (stderr, "%d: here b\n", rank); MPI_Barrier(comm);
  */

  MPI_Alltoallv (local.entry, scounts, sdisps, MPI_DOUBLE,
		 new_local.entry, rcounts, rdisps, MPI_DOUBLE,
		 comm);

  /*
  fprintf (stderr, "%d: here c\n", rank); MPI_Barrier(comm);
  */

  spcsr_free (&local);
  gA->local = new_local;
  gA->rowmap.lindstart = new_lindstart;
  gA->rowmap.lnind = new_lindend - new_lindstart;
  if (!gA->rowmap.indstart)
    gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
  if (gA->rowmap.indstart) /* if malloc fails, no big deal */
    memcpy (gA->rowmap.indstart, new_indstart, (csize+1)*sizeof(int));
  assert(gA->rowmap.lnind == gA->local.nr);
  free(scounts);

  /*
  fprintf (stderr, "%d: done!\n", rank); MPI_Barrier(comm);
  */

  /*
  for (p = 0; p < csize; ++p) {
    MPI_Barrier(comm);
    if (rank == p) {
      fprintf (stderr, "%d: local (%d, %d; %d)\n",
	       rank, gA->local.nr, gA->local.nc, gA->local.nent);
    }
    MPI_Barrier(comm);
  }
  */

  return 0;
}

int
gcsr_redist_nrows (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart;
  int errflg;

  int p, nr, nrows_per;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  nr = gA->nr;

  new_indstart = malloc ((csize+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303030);

  nrows_per = (nr + csize - 1) / csize;
  if (nrows_per < minrows) nrows_per = minrows;
  new_indstart[0] = 0;
  for (p = 1; p <= csize; ++p) {
    new_indstart[p] = new_indstart[p-1] + nrows_per;
    if (new_indstart[p] > nr) new_indstart[p] = nr;
  }

  errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
  free(new_indstart);
  return errflg;
}

int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart = NULL, *rowdeg = NULL;
  int *rcounts = NULL, *rdisps = NULL;
  int errflg;

  int ents_target, ents_so_far, rows_so_far;

  int i, p, nr;

  int gnent;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2238);
    if (mpicsize != csize) MPI_Abort(comm, -2339);
  }
#endif
  /*
  fprintf (stderr, "%d: baz0\n", rank);
  */

  nr = gA->nr;

  new_indstart = malloc ((csize+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303031);

  /*
  fprintf (stderr, "%d: baz1\n", rank);
  */

  if (am_root) {
    rowdeg = malloc ((1 + gA->nr + 2*csize + 2) * sizeof(int));
    /* XXX: too much memory */
    if (!rowdeg) MPI_Abort(comm, -22292);
    rcounts = rowdeg + 1 + gA->nr;
    rdisps = rcounts + csize;
  }

  /*
  fprintf (stderr, "%d: baz2 %p %p %p\n", rank, rowdeg, rcounts, rdisps);
  */

  MPI_Gather (&gA->local.nr, 1, MPI_INT, rcounts, 1, MPI_INT, 0, comm);
  /* fprintf (stderr, "%d: baz2.1\n", rank); */
  if (am_root) {
    rdisps[0] = 0;
    for (p = 1; p <= csize; ++p) {
      rdisps[p] = rdisps[p-1] + rcounts[p-1];
    }
  }
  /* fprintf (stderr, "%d: baz3\n", rank); */

  MPI_Gatherv (gA->local.rowoff + 1, gA->local.nr, MPI_INT,
	       rowdeg, rcounts, rdisps, MPI_INT,
	       0, comm);
  MPI_Reduce (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);

  /* fprintf (stderr, "%d: baz4\n", rank); */

  if (am_root) {
    /* turn into coldegs */
    for (p = 0; p < csize; ++p) {
      int prevoff, disp, k;
      /* fprintf (stderr, "%d: baz5 rcounts[%d] = %d\n", rank, p, rcounts[p]); */
      if (rcounts[p] <= 1) continue;
      disp = rdisps[p];
      prevoff = rowdeg[disp];
      for (k = 1; k < rcounts[p]; ++k) {
	int tmp = rowdeg[disp+k];
	rowdeg[disp+k] -= prevoff;
	prevoff = tmp;
      }
    }

    /* now split */
    ents_target = (gnent + csize - 1) / csize;

    i = 0;
    for (p = 0; p < csize; ++p) {
      new_indstart[p] = i;
      ents_so_far = rows_so_far = 0;
      for (; i < nr; ++i) {
	ents_so_far += rowdeg[i];
	++rows_so_far;

	if (rows_so_far > minrows && ents_so_far > ents_target)
	  break;
      }
    }
    new_indstart[p] = nr;
    /*
    for (p = 0; p <= csize; ++p) {
      fprintf (stderr, "%d: new_indstart[%d] = %d\n",
	       rank, p, new_indstart[p]);
    }
    */
  }

  /*
  fprintf (stderr, "%d: baz6\n", rank);
  */

  MPI_Bcast (new_indstart, csize+1, MPI_INT, 0, comm);

  /*
  fprintf (stderr, "%d: baz7\n", rank);
  */

  errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
  if (rowdeg) free(rowdeg);
  free(new_indstart);

  /*
  fprintf (stderr, "%d: baz8\n", rank);
  */

  return errflg;
}



#if 0
int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
  int rank, csize, am_root;
  int *new_indstart;
  int errflg;

  int p, nr, nrows_per, nent_per;

  int gnent, nent_start, nent_finish;
  int lneigh, rneigh;

  if (!gA) return -1;

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
  am_root = (0 == rank);
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  nr = gA->nr;

  new_indstart = malloc ((p+1) * sizeof(int));
  if (!new_indstart) MPI_Abort(comm, -303031);

  /* Make an exclusive scan.
     Could use MPI_Exscan; then add local.nent to gnent. */
  MPI_Scan (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, comm);
  lneigh = rank-1;
  if (lneigh < 0) lneigh = MPI_PROC_NULL;
  rneigh = rank+1;
  if (rneigh >= csize) rneigh = MPI_PROC_NULL;
  nent_start = 0;
  MPI_Sendrecv (&gnent, 1, MPI_INT, rneigh, 99992,
		&nent_start, 1, MPI_INT, lneigh, 99992,
		comm, MPI_STATUS_IGNORE);

#if !defined(NDEBUG)
  {
    int gnent_reduct;
    MPI_Allreduce (&gA->local.nent, &gnent_reduct, 1, MPI_INT,
		   MPI_SUM, comm);
    if (rank == csize-1)
      assert(gnent == gnent_reduct);
  }
#endif
  MPI_Bcast (&gnent, 1, MPI_INT, csize-1, comm);

  nent_per = (gnent + csize - 1) / csize;

  /*AUUUGH!  minrows kills everything quickly! */
}
#endif /* until I figure out how to do it in parallel */


static int
compind (const void *p1, const void *p2)
{
  const int a = *(const int*) p1;
  const int b = *(const int*) p2;
  return b - a;
}


void
gcsr_pack_colinds (struct gcsr_t* gA)
{
  struct spcsr_t local;
  int k, nent, nc, lnc;
  int *g2l = NULL, *l2g = NULL, *tmp = NULL;

  if (!gA) return;
  if (0 == gA->local.nr || 0 == gA->local.nc || 0 == gA->local.nent) return;
  /*return;*/

  nc = gA->nc;
  l2g = malloc (nc * sizeof(int));
  g2l = malloc (nc * sizeof(int));
  if (!l2g || !g2l) goto memfree;

  gcsr_unpack_colinds (gA);

  local = gA->local;
  nent = local.nent;

  memset (g2l, -1, nc * sizeof(int));

  lnc = 0;
  for (k = 0; k < nent; ++k) {
    const int j = local.colind[k];
    assert(j >= 0);
    assert(j < nc);
    if (g2l[j] < 0) {
      l2g[lnc] = j;
      g2l[j] = 1;
      ++lnc;
    }
  }

  if (lnc >= (3*nc)/4) goto memfree; /* Don't bother. */

  /*
    Keep in same order as original columns.
    Makes transpose-like ops easier, shouldn't cost much.
  */
  qsort (l2g, lnc, sizeof(int), compind);
  for (k = 0; k < lnc; ++k) {
    const int gj = l2g[k];
    assert(gj >= 0);
    assert(gj < gA->nc);
    g2l[gj] = k;
  }

  tmp = realloc(l2g, lnc * sizeof(int));
  if (tmp) l2g = tmp;

  for (k = 0; k < nent; ++k) {
    const int j = local.colind[k];
    const int new_j = g2l[j];
    assert(j >= 0);
    assert(j < nc);
    assert(new_j >= 0);
    assert(new_j < lnc);
    local.colind[k] = new_j;
  }

  gA->local.nc = lnc;
  gA->colmap.l2g = l2g;
  free (g2l);
  return;

 memfree:
  if (g2l) free (g2l);
  if (l2g) free (l2g);
}


int
gcsr_take_csr_locally (const int gnr, const int gnc,
		       struct spcsr_t *local, struct gcsr_t *gA,
		       int **indstart, MPI_Comm comm)
{
  int rank, csize;
  if (!local || !gA || !indstart || !*indstart) return 1;

  gcsr_init_clear (gA);
  gA->nr = gnr;
  gA->nc = gnc;
  gA->local = *local;
  spcsr_init_clear (local);

  MPI_Comm_size (comm, &csize);
  MPI_Comm_rank (comm, &rank);
  gA->rowmap.p = rank;
  gA->rowmap.np = csize;
  gA->rowmap.indstart = *indstart;
  gA->rowmap.lindstart = gA->rowmap.indstart[rank];
  gA->rowmap.lnind = gA->rowmap.indstart[rank+1] - gA->rowmap.lindstart;
  *indstart = NULL;

  /*
  fprintf (stderr, "%d: locally (%d, %d) (%d, %d; %d)\n", rank,
	   gnr, gnc, gA->local.nr, gA->local.nc, gA->local.nent);
  */

  gcsr_pack_colinds (gA);

  return 0;
}


static int
transpose_matrix_chunks (const struct gcsr_t *gA, const int *indstartT,
			 int *p_rows, int **p_rowind_, int **p_rowoff_,
			 int **p_colind_, double **p_entry_,
			 MPI_Comm comm)
{
  int rank, csize, p;
  int *p_rowind, *p_rowoff, *p_colind;
  double *p_entry;

  int *scounts, *sdisps, *rcounts, *rdisps, ndata_req;

  struct spcsr_t local;
  struct spcsr_t localT;
  int iT, nent, cum_nent;
  int *l2g, *l2g_own = NULL;

  spcsr_init_clear (&localT);

  rank = gA->rowmap.p;
  csize = gA->rowmap.np;
#if !defined(NDEBUG)
  {
    int mpirank, mpicsize;
    MPI_Comm_size(comm, &mpicsize);
    MPI_Comm_rank(comm, &mpirank);
    if (mpirank != rank) MPI_Abort(comm, -2233);
    if (mpicsize != csize) MPI_Abort(comm, -2333);
  }
#endif

  local = gA->local;
  scounts = malloc (4 * (csize + 1) * sizeof(int));
  if (!scounts) MPI_Abort(comm, -389762);
  sdisps = scounts + csize + 1;
  rcounts = sdisps + csize + 1;
  rdisps = rcounts + csize + 1;

  l2g = gA->colmap.l2g;
  if (!l2g) {
    int j;
    l2g_own = malloc (local.nc * sizeof(int));
    for (j = 0; j < local.nc; ++j) l2g_own[j] = j;
    l2g = l2g_own;
  }

  /* get p_rows, the map from p -> [rows) */
  memset (scounts, 0, (csize+1) * sizeof(int)); /* rows to send */
  p = 0;
  assert(gA->nc == indstartT[csize]);
  for (iT = 0; iT < local.nc; ++iT) {
    const int giT = (l2g? l2g[iT] : iT);
    assert (giT < gA->nc);
    while (giT >= indstartT[p+1]) ++p; /* the giT are sorted... */
    ++scounts[p];
  }

  /* fprintf (stderr, "%d: boing1\n", rank); */
  MPI_Alltoall (scounts, 1, MPI_INT,   rcounts, 1, MPI_INT,
		comm);
  sdisps[0] = 0;
  for (p = 0; p < csize; ++p) {
    sdisps[p+1] = sdisps[p] + scounts[p];
  }

  ndata_req = 0;
  for (p = 0; p < csize; ++p) {
    p_rows[p] = ndata_req;
    ndata_req += rcounts[p];
  }
  p_rows[csize] = ndata_req; /* total number of rows to be received */




  /*
    now get
      p_rowind: map of packed row -> _global_ row index
      p_rowoff: map of p_row[p] -> [offs), shifted
  */

  p_rowind = malloc ((2 * ndata_req + 1) * sizeof(int));
  if (!p_rowind) MPI_Abort (comm, -467474);
  p_rowoff = p_rowind + ndata_req;
  *p_rowind_ = p_rowind;
  *p_rowoff_ = p_rowoff;

  /* fprintf (stderr, "%d: boing2 %p %p\n", rank, l2g, p_rowind); */
  MPI_Alltoallv (l2g, scounts, sdisps, MPI_INT,
		 p_rowind, rcounts, p_rows, MPI_INT, comm);

  spcsr_transpose (&local, &localT); /* ideally during above */
  /* Make localT's column indices (local's rows) globally numbered */
  assert(local.nent == localT.nent);
  {
    int k;
    const int rowbase = gA->rowmap.lindstart;
    for (k = 0; k < localT.nent; ++k) {
      const int j = localT.colind[k];
      assert(j >= 0);
      assert(j < localT.nc);
      localT.colind[k] = j + rowbase;
      assert(localT.colind[k] < gA->nr);
      /*
      if (localT.entry[k] == 0.0)
	fprintf(stderr, "%d: found zero off %d\n", rank, k);
      */
    }
    localT.nc = gA->nr;
  }

  /* fprintf (stderr, "%d: boing3\n", rank); */
  MPI_Alltoallv (localT.rowoff, scounts, sdisps, MPI_INT,
		 p_rowoff, rcounts, p_rows, MPI_INT, comm);

  /* shift row indices into local space */ /* ideally during above */
  {
    const int lindstart = indstartT[rank];
#if !defined(NDEBUG)
    const int lindend = indstartT[rank+1];
#endif
    int k;
    for (k = 0; k < p_rows[csize]; ++k) {
      assert(p_rowind[k] >= lindstart);
      assert(p_rowind[k] < lindend);
      p_rowind[k] -= lindstart;
    }
  }
  /* fprintf (stderr, "%d: boing4\n", rank); */



  /* need # of entries to unshift p_rowoff and recv colinds and entries */

  /* scounts[p] = # entries to send to p */
  for (p = 0; p < csize; ++p) {
    if (localT.rowoff)
      scounts[p] = localT.rowoff[sdisps[p+1]] - localT.rowoff[sdisps[p]];
    else
      scounts[p] = 0;
    /*
    fprintf (stderr, "%d: sending %d entries (of %d) to %d\n", rank,
	     scounts[p], localT.nent, p);
    */
  }

  /* swap # entries */
  MPI_Alltoall (scounts, 1, MPI_INT,   rcounts, 1, MPI_INT,
		comm);
  sdisps[0] = 0; /* disps for sending colinds and entries */
  for (p = 0; p < csize; ++p) {
    sdisps[p+1] = sdisps[p] + scounts[p];
  }

  /* Note: rdisps is not the same as p_rowoff.  rdisps covers the whole of
     a proc's contributions, p_rowoff is broken into each row within each
     proc.
  */
  nent = 0;
  for (p = 0; p < csize; ++p) {
    rdisps[p] = nent;
    nent += rcounts[p];
  }
  rdisps[csize] = nent;

  /*
  for (p = 0; p < csize; ++p) {
    fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
	     p, scounts[p], sdisps[p]);
    fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
	     p, rcounts[p], rdisps[p]);
  }
  */

  cum_nent = 0;
  for (p = 0; p < csize; ++p) {
    const int firstoff = p_rowoff[p_rows[p]];
    int k;
    assert(cum_nent == rdisps[p]); /* consistent with rdisps */
    for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
      p_rowoff[k] -= firstoff;
      p_rowoff[k] += cum_nent;
    }
    cum_nent += rcounts[p];
  }
  p_rowoff[p_rows[csize]] = cum_nent;
  assert(cum_nent == nent); /* trivially true */

#if !defined(NDEBUG)
  /* check that p_rowoff is self-consistent */
  {
    int k;
    assert(0 == p_rows[0]);
    assert(0 == p_rowoff[0]);
    for (p = 0; p < csize; ++p) {
      assert(p_rows[p] <= p_rows[p+1]);
      for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
	assert(p_rowoff[p] <= p_rowoff[p+1]);
      }
    }
    assert(k == p_rows[csize]);
    assert(cum_nent == p_rowoff[k]);
  }
#endif

  /* now set up packed colind and entry arrays */
  /* fprintf (stderr, "%d: allocing for %d entries\n", rank, nent); */
  p_colind = malloc (nent * sizeof(int));
  p_entry = malloc (nent * sizeof(double));
  if (!p_colind || !p_entry) MPI_Abort(comm, -74423);
  *p_colind_ = p_colind;
  *p_entry_ = p_entry;

  /*
  for (p = 0; p < csize; ++p) {
    fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
	     p, scounts[p], sdisps[p]);
    fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
	     p, rcounts[p], rdisps[p]);
  }
  */

  MPI_Alltoallv (localT.colind, scounts, sdisps, MPI_INT,
		 p_colind, rcounts, rdisps, MPI_INT,
		 comm);

  MPI_Alltoallv (localT.entry, scounts, sdisps, MPI_DOUBLE,
		 p_entry, rcounts, rdisps, MPI_DOUBLE,
		 comm);

  /*memcpy (p_entry, localT.entry, nent*sizeof(double));*/

  /*
  {
    int zz;
    fprintf (stderr, "%d: nent = %d\n", rank, nent);
    for (zz = 0; zz < nent; ++zz) {
      if (p_entry[zz] == 0.0)
	fprintf (stderr, "%d: zero entry off %d\n", rank, zz);
    }
  }
  */

  if (l2g_own) free (l2g_own);
  if (scounts) free (scounts);
  spcsr_free (&localT);
  return 0;
}

static int
collapse_matrix_chunks (const int csize,
			const int *p_rows, const int *p_rowind,
			const int *p_rowoff,
			const int *p_colind, const double *p_entry,
			struct spcsr_t *local)
{
  int *first = NULL;
  const int nr = local->nr;
#if !defined(NDEBUG)
  const int nc = local->nc;
#endif
  int i, rdeg, p;

  int *rowoff = local->rowoff;
  int *colind = local->colind;
  double *entry = local->entry;

#if !defined(NDEBUG)
  int *ci = NULL;
#endif

  /*
  {
    int rank, k, zz;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    fprintf (stderr, "%d: p_rows[%d]: [%d, %d)\n", rank,
	     0, p_rows[0], p_rows[1]);
    fprintf (stderr, "%d:   p_rowind: ", rank);
    zz = 0;
    for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
      if (zz >= 10) { fprintf (stderr, "..."); break; }
      fprintf(stderr, " %d", p_rowind[k]);
    }
    fprintf(stderr, "\n%d:   p_rowoff: ", rank);
    zz = 0;
    for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
      if (zz >= 10) { fprintf (stderr, "..."); break; }
      fprintf(stderr, " %d", p_rowoff[k]);
    }
    fprintf (stderr, "\n");
  }
  */


  first = malloc ( csize * sizeof(int));
  if (!first) return -1;
  for (p = 0; p < csize; ++p) first[p] = p_rows[p];

#if !defined (NDEBUG)
  ci = malloc ( nc * sizeof(int) );
  memset (ci, -1, nc * sizeof(int));
#endif

  rdeg = 0;
  for (i = 0; i < nr; ++i) {
    int p;
    rowoff[i] = rdeg;
    for (p = 0; p < csize; ++p) {
      const int lastp = p_rows[p+1];
      int rioff = first[p], k;

      while (rioff < lastp && i < p_rowind[rioff]) ++rioff;
      first[p] = rioff;
      if (lastp == rioff || i != p_rowind[rioff]) continue;
      ++first[p];

      /* Now at row i, so scatter row it */
      /* scatter is trivial as there's no overlap */
      /*  debug var ci checks for overlap */
      for (k = p_rowoff[rioff]; k < p_rowoff[rioff+1]; ++k) {
	const int j = p_colind[k];
	const double e = p_entry[k];

	assert (j >= 0);
	assert (j < nc);
	assert (ci[j] < 0);

#if !defined(NDEBUG)
	ci[j] = rdeg;
#endif
	colind[rdeg] = j;
	/*
	if (0.0 == e) {
	  int rank;
	  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	  fprintf(stderr, "%d: moving a zero (%d,%d) off %d\n", rank, i, j, rdeg);
	}
	*/
	entry[rdeg] = e;
	++rdeg;
      }
    }
#if !defined(NDEBUG)
    { /* clear ci */
      int k;
      for (k = rowoff[i]; k < rdeg; ++k) {
	const int j = colind[k];
	assert (j >= 0);
	assert (j < nc);
	assert (ci[j] >= 0);
	ci[j] = -1;
      }
    }
#endif
  }
  rowoff[i] = rdeg;
  /*
  {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    fprintf (stderr, "%d: rowoff[%d] = %d (%d)\n", rank, i, rdeg, local->nent);
  }
  */

#if !defined(NDEBUG)
  {
    int rank, csize;
    int *seenr = NULL, *seenc = NULL;
    int i, k;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &csize);
    if (1 == csize) {
      seenr = calloc(nr+nc, sizeof(int));
      seenc = seenr + nr;
    }
    assert(rowoff[0] == 0);
    assert(rowoff[nr] == local->nent);
    for (i = 0; i < nr; ++i) {
      assert(rowoff[i] >= 0);
      assert(rowoff[i] <= rowoff[i+1]);
      if (seenr) if (rowoff[i] != rowoff[i+1]) seenr[i] = 1;
      for (k = rowoff[i]; k < rowoff[i+1]; ++k) {
	const int j = colind[k];
	if (seenc) seenc[j] = 1;
	assert(j >= 0);
	assert(j < nc);
      }
    }
    if (seenr)
      for (i = 0; i < nr; ++i) {
	if (!seenr[i])
	  fprintf (stderr, "%d: seenr[%i] missing\n", rank, i);
      }
    if (seenc)
      for (i = 0; i < nc; ++i) {
	if (!seenc[i])
	  fprintf (stderr, "%d: seenc[%i] missing\n", rank, i);
      }
    if (seenr) free (seenr);
    /* fprintf (stderr, "%d: done with (%d, %d)\n", rank, nr, nc);*/
  }
#endif

  if (first) free (first);
#if !defined(NDEBUG)
  if (ci) free (ci);
#endif
  return 0;
}


void
gcsr_transpose (struct gcsr_t *gA, struct gcsr_t *gAT,
		const int *indstartT_in, MPI_Comm comm)
{
  int rank, csize;
  struct spcsr_t localT;
  const int *indstartT = NULL;
  int *indstartT_own = NULL;

  int *p_rows = NULL, *p_rowoff = NULL, *p_rowind = NULL, *p_colind = NULL;
  double *p_entry = NULL;

  int nent;

  if (!gA || !gAT) return;
  spcsr_init_clear (&localT);
  rank = gA->rowmap.p;
  csize = gA->rowmap.np;

  /* build indstartT, and indstartT_own for transpose's map */
  gcsr_expand_rowmap (gA, comm);
  indstartT_own = malloc ((csize+1) * sizeof(int));
  if (!indstartT_own) MPI_Abort(comm, -393927);
  if (indstartT_in) {
    indstartT = indstartT_in;
    memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
  } else {
    indstartT = gA->rowmap.indstart;
    memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
  }

  p_rows = malloc ((csize+1) * sizeof(int));
  if (!p_rows) MPI_Abort (comm, 643443);


  if (transpose_matrix_chunks (gA, indstartT,
			       p_rows, &p_rowind, &p_rowoff,
			       &p_colind, &p_entry, comm))
    MPI_Abort (comm, 101028);

  nent = p_rowoff[p_rows[csize]];
#if !defined(NDEBUG)
  {
    int gnentin[2];
    int gnent[2];
    gnentin[0] = gA->local.nent;
    gnentin[1] = nent;
    MPI_Allreduce (&gnentin, &gnent, 2, MPI_INT, MPI_SUM, comm);
    assert(gnent[0] == gnent[1]);
  }
#endif

  localT.nc = gA->nr;
  localT.nr = indstartT[rank+1] - indstartT[rank];
  localT.nent = nent;
  if (spcsr_alloc_ (localT.nr, localT.nc, localT.nent,
		    &localT.rowoff, &localT.colind, &localT.entry,
		    NULL))
    MPI_Abort (comm, 653747);

  if (collapse_matrix_chunks (csize, p_rows, p_rowind, p_rowoff,
			      p_colind, p_entry, &localT))
    MPI_Abort (comm, 456746);

  /*
  {
    int p;
    for (p = 0; p < csize; ++p) {
      MPI_Barrier(MPI_COMM_WORLD);
      if (p == rank) {
	printf("%d: gA(%d, %d), lA(%d, %d; %d) rows [%d, %d)\n",
	       rank, gA->nr, gA->nc, localT.nr, localT.nc, localT.nent,
	       indstartT[rank], indstartT[rank+1]);
	printf ("%d: %d %d %d\n", rank, localT.rowoff[0], localT.rowoff[1], localT.rowoff[2]);
      }
      MPI_Barrier(MPI_COMM_WORLD);
    }
  }
  */

  if (gcsr_take_csr_locally (gA->nc, gA->nr, &localT, gAT,
			     &indstartT_own, comm))
    MPI_Abort (comm, 89372);

#if !defined(NDEBUG)
  {
    int i, j, k;
    int *seenr, *seenc;
    int *rootseenr = NULL, *rootseenc;
    seenr = calloc (gA->nr + gA->nc, sizeof(int));
    seenc = seenr + gA->nr;
    if (0 == rank) {
      rootseenr = calloc (gA->nr + gA->nc, sizeof(int));
      rootseenc = rootseenr + gA->nr;
    }
    for (i = 0; i < gA->local.nr; ++i) {
      const int gi = i + gA->rowmap.lindstart;
      if (gA->local.rowoff[i] != gA->local.rowoff[i+1])
	seenr[gi] = 1;
      for (k = gA->local.rowoff[i]; k < gA->local.rowoff[i+1]; ++k) {
	j = gA->local.colind[k];
	if (gA->colmap.l2g)
	  seenc[gA->colmap.l2g[j]] = 1;
	else
	  seenc[j] = 1;
      }
    }

    MPI_Reduce (seenr, rootseenr, gA->nr + gA->nc, MPI_INT, MPI_MAX, 0, comm);
    if (0 == rank) {
      for (i = 0; i < gA->nr; ++i)
	if (!rootseenr[i])
	  fprintf (stderr, "%d:: no one sees row %d\n", rank, i);
      for (j = 0; j < gA->nc; ++j)
	if (!rootseenc[j])
	  fprintf (stderr, "%d:: no one sees col %d\n", rank, j);
    }
    free(seenr);
    if (rootseenr) free(rootseenr);
  }
#endif

  if (p_rows) free (p_rows);
  if (p_rowind) free (p_rowind);
  if (p_colind) free (p_colind);
  if (p_entry) free (p_entry);
}

int
gcsr_copy (const struct gcsr_t *gA, struct gcsr_t *gAcopy, MPI_Comm comm)
{
  gcsr_init_clear (gAcopy);
  spcsr_copy (&gA->local, &gAcopy->local);
  gAcopy->nr = gA->nr;
  gAcopy->nc = gA->nc;
  gAcopy->rowmap.p = gA->rowmap.p;
  gAcopy->rowmap.np = gA->rowmap.np;
  gAcopy->rowmap.lindstart = gA->rowmap.lindstart;
  gAcopy->rowmap.lnind = gA->rowmap.lnind;
  gAcopy->rowmap.indstart = NULL;
  if (gA->rowmap.indstart) {
    gAcopy->rowmap.indstart = malloc ((gA->rowmap.np + 1) * sizeof(int));
    if (gAcopy->rowmap.indstart)
      memcpy (gAcopy->rowmap.indstart, gA->rowmap.indstart,
	      (gA->rowmap.np + 1) * sizeof(int));
  }
  if (gA->colmap.l2g) {
    gAcopy->colmap.l2g = malloc (gA->nc * sizeof(int));
    if (gAcopy->colmap.l2g)
      memcpy(gAcopy->colmap.l2g, gA->colmap.l2g, gA->nc * sizeof(int));
    else {
      MPI_Abort (comm, 934743); /* could just reindex to global */
      return ENOMEM;
    }
  }
  return 0;
}

void
gcsr_transpose_onroot (struct gcsr_t *gA, struct gcsr_t *gAT,
		       const int *indstartT_in, MPI_Comm comm)
{
  if (gcsr_copy (gA, gAT, comm)) MPI_Abort(comm, 998701);
  if (gcsr_redist_root (gAT, comm)) MPI_Abort(comm, 998702);

  if (0 == gA->rowmap.p) {
    struct spcsr_t localT;
    if (spcsr_transpose (&gAT->local, &localT)) MPI_Abort(comm, 998703);
    spcsr_free(&gAT->local);
    gAT->local = localT;
  }

  gAT->nr = gA->nc;
  gAT->nc = gA->nr;

  if (indstartT_in) {
    gcsr_redist_rowinds (gAT, indstartT_in, comm);
  }
  else {
    gcsr_expand_rowmap (gA, comm);
    gcsr_redist_rowinds (gAT, gA->rowmap.indstart, comm);
  }
}


Mode Type Size Ref File
100644 blob 713 399ea10dab40d3201aa0e9f6eb62c51b099d970f Make.ejr-citris
100644 blob 486 dcf1d527e538d4ca5ebe2951800b3a3b70474872 Make.ejr-home-gcc
100644 blob 175 3592281b201df903859285e50c914c2f8b618b4b Make.seaborg
100644 blob 3307 f715c36bf2ae4a4f455cc7177350dba21c21c1ca Makefile
100644 blob 42677 f3dd0961175560130247e5c2288f6f5164790456 auction.c
100644 blob 3016 a574b362b4d966599d6720ca0439d385a92084e1 auction.h
100644 blob 302 6995992eb49553aadebe4dddd1a3a74eacf4f332 cacheblast.c
100644 blob 3386 2b76fb3e0a5bb941d13e1ff6f4b2cf152af929e3 colcomp.c
100644 blob 582 3afbfafab8b4b83786e48a66cf20df7eb1feaf8c convmats.m
100644 blob 2189 fba47c6f3908699da08dfac9469d296af62dc234 lsame.c
100644 blob 5244 cb06aa63b1f49d49da789a8feeaf3a19a3155b36 matchmtx.c
040000 tree - 3fcccf0fc51adf74ea4fc6c501e65306ada6fdd1 mc64
040000 tree - c1a5dab1b4b8999153b6d04dae4624af7038dcec mlb
100644 blob 6344 74992c1d4736ff84fdcdcf2b1be7f4eaf7de3b63 mpitransposebin.c
100644 blob 6351 f4beee0452662b6f5bc6d79b839faec02ce6802e mpitransposebin_root.c
100644 blob 1092028 89dad69d7b2de3e183bfc343e98e1943a9d34e16 orani678.bin
100644 blob 19048 d3d22a8af49f546fd09aaded2d7256d312d3b838 parauction.c
100644 blob 18896 7134aa840f527455d5bf617ee9e54c91ae2161d7 parauction_allgather.c
100644 blob 14315 4f475a8317c54d70c33d7d3744b7c47943d26883 parauction_allreduce.c
100644 blob 18564 37c1e950e97bd0fb9aafec2db7fdeeaf9339a09b parauction_root.c
100644 blob 1154 2fc93276b1723c805e82d0e9eb4781184bfcca2e readbin.c
100644 blob 864 5f791479f6982f22101a5094efb0fa0947b54310 readbin.m
100644 blob 696 6151ee6074636c5bdd0d5f34ee4d9b0c76e0501d readmatch.m
100644 blob 1182 8b0bfcc7b6bac0c7f701ae317cbeeff5feec512a redist1.c
100644 blob 1412 4dae58f785498844725f72c26f70621b9287ad1e redist2.c
100644 blob 3002 f41bc38e57d9c613a6426045d0ea7ac22bd339de redist3.c
100644 blob 2903 857ee46542a70759476e2a7437e900c32f12001c redist4.c
100644 blob 6598 f5eed2c9d95ff09dde048b97e8d11e14253c8bf1 seqauction.c
100644 blob 15392 9fac5aac4170c6cd7d9c2327911a531ea7950dad smut.c
100644 blob 2338 06f2519a40af810ecfaa5e250459109aed6ee642 smut.h
100644 blob 13779 5ac297a1fcfeeadc5d107470f1fcf6215c9521bc stripe_auction.c
100644 blob 1771 f709206b1e373cf8c7a3c3d3fb702b4bae72d435 stripe_auction.h
100644 blob 39548 5389671d60b8e2ddd1f9360551377d35c8ccdbd2 stripesmut.c
100644 blob 1402 075d97c492be1e6546e24d4a330566981616ac0e stripesmut.h
100644 blob 627 9909b624922275b657ab5d66774f4c40c8beb760 timer.c
100644 blob 405 46eb6c267780afd6b1ee7816a42332cdf79d2683 timer.h
100644 blob 942 d254d606053141bc0a6ffdf714a436980900c028 transposebin.c
100644 blob 697 3728ae018c10cf4308bce2388b75bafe4be41942 writebin.c
100644 blob 940 9643894cce30bc371dc4549a1fc1af4fa3dde2ac writebin.m
Hints:
Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/ejr/matchpres

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/ejr/matchpres

Clone this repository using git:
git clone git://git.rocketgit.com/user/ejr/matchpres

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main