#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <mpi.h>
#define SMUT_NO_IO
#include "smut.h"
#undef SMUT_NO_IO
#include "stripesmut.h"
void
gcsr_init_clear (struct gcsr_t *gA)
{
if (!gA) return;
gA->nr = gA->nc = 0;
spcsr_init_clear (&gA->local);
gA->rowmap.p = gA->rowmap.np = 0;
gA->rowmap.lindstart = gA->rowmap.lnind = 0;
gA->rowmap.indstart = NULL;
gA->colmap.l2g = NULL;
}
void
gcsr_free (struct gcsr_t *gA, MPI_Comm comm)
{
if (gA->colmap.l2g) free (gA->colmap.l2g);
if (gA->rowmap.indstart) free (gA->rowmap.indstart);
spcsr_free (&gA->local);
gcsr_init_clear (gA);
#if !defined(NDEBUG)
MPI_Barrier (comm);
#endif
}
void
gcsr_expand_rowmap (struct gcsr_t *gA, MPI_Comm comm)
{
int rank, csize;
int *indstart;
#if !defined(NDEBUG)
int p;
#endif
if (!gA || gA->rowmap.indstart) return;
rank = gA->rowmap.p;
csize = gA->rowmap.np;
indstart = malloc ((csize+1) * sizeof(int));
if (!indstart) MPI_Abort(comm, -8922);
MPI_Allgather (&gA->rowmap.lindstart, 1, MPI_INT,
indstart, 1, MPI_INT,
comm);
indstart[csize] = gA->nr;
#if !defined(NDEBUG)
assert(0 == indstart[0]);
for (p = 1; p < csize; ++p) {
assert(indstart[p] >= 0);
assert(indstart[p] <= indstart[p+1]);
}
#endif
gA->rowmap.indstart = indstart;
}
int
gcsr_take_csr_root (struct spcsr_t *A, struct gcsr_t *gA, MPI_Comm comm)
{
int rank, csize, p;
int errflg = 0;
int tmpdata[2];
if (!gA) return 1;
{
int initflg = 0;
errflg = MPI_Initialized(&initflg);
if (MPI_SUCCESS != errflg) return errflg;
if (!initflg) return 1;
}
MPI_Comm_size (comm, &csize);
MPI_Comm_rank (comm, &rank);
if (0 == rank && !A) MPI_Abort(comm, EINVAL);
gA->rowmap.p = rank;
gA->rowmap.np = csize;
gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
if (!gA->rowmap.indstart) MPI_Abort(comm, ENOMEM);
if (0 == rank) {
gA->local = *A;
tmpdata[0] = A->nr;
tmpdata[1] = A->nc;
}
else {
spcsr_init_clear (&gA->local);
}
MPI_Bcast (tmpdata, 2, MPI_INT, 0, comm);
gA->nr = tmpdata[0];
gA->nc = tmpdata[1];
gA->rowmap.lindstart = (rank == 0? 0 : gA->nr);
gA->rowmap.lnind = (rank == 0? gA->nr : 0);
gA->rowmap.indstart[0] = 0;
for (p = 1; p <= csize; ++p)
gA->rowmap.indstart[p] = gA->nr;
gA->colmap.l2g = NULL;
spcsr_init_clear (A);
return 0;
}
int
gcsr_give_csr_root (struct gcsr_t *gA, struct spcsr_t *A, MPI_Comm comm)
{
int errflg;
errflg = gcsr_redist_root (gA, comm);
if (errflg) return errflg;
*A = gA->local;
spcsr_init_clear (&gA->local);
gcsr_free (gA, comm);
return 0;
}
void
gcsr_unpack_colinds (struct gcsr_t *gA)
{
struct spcsr_t local;
int *l2g, k;
if (!gA || !gA->colmap.l2g)
return;
l2g = gA->colmap.l2g;
local=gA->local;
for (k = 0; k < local.nent; ++k) {
const int j = local.colind[k];
assert (j >= 0);
assert (j < local.nc);
local.colind[k] = l2g[j];
}
gA->colmap.l2g = NULL;
free(l2g);
gA->local.nc = gA->nc;
}
int
gcsr_redist_root (struct gcsr_t* gA, MPI_Comm comm)
{
int rank, csize, am_root;
struct spcsr_t local;
int *counts = NULL;
int *disps = NULL;
int gnent;
struct spcsr_t newlocal;
int p;
/* get # entries per proc */
/* realloc local to fit */
/* MPI_Gatherv */
/* local freeing */
if (!gA || !gA->nr || !gA->nc) return 0;
spcsr_init_clear (&newlocal);
rank = gA->rowmap.p;
csize = gA->rowmap.np;
am_root = (0 == rank);
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -22);
if (mpicsize != csize) MPI_Abort(comm, -23);
}
#endif
if (am_root) {
counts = malloc (csize * sizeof(int));
disps = malloc (csize * sizeof(int));
if (!counts || !disps) MPI_Abort(comm, -398202);
}
/*
fprintf (stderr, "%d: foo1\n", rank);
*/
local = gA->local;
gcsr_unpack_colinds (gA);
/*
fprintf (stderr, "%d: foo2\n", rank);
*/
/* start setting up root's newlocal */
if (am_root) {
gnent = 0;
newlocal.nr = gA->nr;
newlocal.nc = gA->nc;
newlocal.rowoff = malloc ( (newlocal.nr+1) * sizeof(int) );
if (!newlocal.rowoff) MPI_Abort(comm, -348362);
}
/*
fprintf (stderr, "%d: foo3\n", rank);
*/
#if !defined(NDEBUG)
/* Double-check the number of entries in a few places*/
MPI_Reduce (&local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);
#endif
/*
fprintf (stderr, "%d: foo4\n", rank);
*/
/* gather row offsets and clear any row mapping */
if (gA->rowmap.indstart) {
if (am_root) {
for (p = 0; p < csize; ++p) {
counts[p] = gA->rowmap.indstart[p+1] - gA->rowmap.indstart[p];
}
}
} else {
MPI_Gather (&gA->rowmap.lnind, 1, MPI_INT,
counts, 1, MPI_INT,
0, comm);
}
if (am_root) {
disps[0] = 0;
for (p = 1; p < csize; ++p)
disps[p] = disps[p-1] + counts[p-1];
}
gA->rowmap.lindstart = (am_root? 0 : gA->nr);
gA->rowmap.lnind = (am_root? gA->nr : 0);
if (gA->rowmap.indstart) {
gA->rowmap.indstart[0] = 0;
for (p = 1; p <= csize; ++p)
gA->rowmap.indstart[p] = gA->nr;
}
/*
fprintf (stderr, "%d: foo5\n", rank);
*/
MPI_Gatherv ((local.rowoff? &local.rowoff[1] : NULL), local.nr, MPI_INT,
newlocal.rowoff, counts, disps, MPI_INT,
0, comm);
/*
fprintf (stderr, "%d: foo6\n", rank);
*/
/* convert row offsets while collecting num. entry info */
if (am_root) {
int k;
int cumsum, nents_p;
/*
fprintf (stderr, "%d: foo6.1 %p %p %d %p\n",
rank, counts, disps, newlocal.nr, newlocal.rowoff);
*/
cumsum = 0;
for (p = 0; p < csize; ++p) {
const int cnt = counts[p];
const int off = disps[p];
nents_p = 0;
/*
fprintf (stderr, "%d: foo6.1 cnt=%d off=%d span=[%d,%d) nents_p=%d cumsum=%d\n",
rank, cnt, off, off, off+cnt, nents_p, cumsum);
fprintf (stderr, "%d: foo6.1 newlocal.rowoff[%d and +1] = %d %d\n",
rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
*/
if (cnt > 0) nents_p = newlocal.rowoff[off + cnt - 1];
for (k = 0; k < cnt; ++k) {
newlocal.rowoff[off + k] += cumsum;
}
counts[p] = nents_p;
cumsum += nents_p;
/*
fprintf (stderr, "%d: foo6.1.1 cnt=%d off=%d span=[%d,%d) nents_p=%d cumsum=%d\n",
rank, cnt, off, off, off+cnt, nents_p, cumsum);
fprintf (stderr, "%d: foo6.1.1 newlocal.rowoff[%d and +1] = %d %d\n",
rank, off, newlocal.rowoff[off], newlocal.rowoff[off+1]);
*/
}
assert(cumsum == gnent);
/*
fprintf (stderr, "%d: foo6.5\n", rank);
*/
for (k = newlocal.nr; k > 0; --k)
newlocal.rowoff[k] = newlocal.rowoff[k-1];
newlocal.rowoff[0] = 0;
newlocal.nent = newlocal.rowoff[newlocal.nr];
assert(newlocal.nent = gnent);
gnent = newlocal.nent;
disps[0] = 0;
for (p = 1; p < csize; ++p)
disps[p] = disps[p-1] + counts[p-1];
newlocal.colind = malloc (gnent * sizeof(int));
newlocal.entry = malloc (gnent * sizeof(double));
if (!newlocal.colind || !newlocal.entry) MPI_Abort(comm, -222018);
}
/*
fprintf (stderr, "%d: foo7 %d %p %p\n", rank, local.nent, counts, disps);
if (counts) {
for (p = 0; p < csize; ++p)
fprintf (stderr, "%d: counts[%d] = %d, disps[%d] = %d\n",
rank, p, counts[p], p, disps[p]);
}
*/
MPI_Gatherv (local.colind, local.nent, MPI_INT,
newlocal.colind, counts, disps, MPI_INT,
0, comm);
MPI_Gatherv (local.entry, local.nent, MPI_DOUBLE,
newlocal.entry, counts, disps, MPI_DOUBLE,
0, comm);
/*
fprintf (stderr, "%d: foo8\n", rank);
*/
spcsr_free (&gA->local);
gA->local = newlocal;
if (counts) free(counts);
if (disps) free(disps);
return 0;
}
/* XXX: there must be a prettier way */
static void
make_row_sr_counts (const int rank, const int csize,
const int lindstart, const int lnr,
const int *indstart,
const int *new_indstart,
int *scounts, int *sdisps,
int *rcounts, int *rdisps,
MPI_Comm comm)
{
int p;
int total_rows_to_send = lnr;
int cur_local_row = 0;
int cumsum;
for (p = 0; p < csize; ++p) {
int lrow_incr, nrows_req;
sdisps[p] = cur_local_row;
lrow_incr = new_indstart[p+1] - lindstart;
if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
if (0 == total_rows_to_send) continue; /* past all recipients */
assert(total_rows_to_send > 0);
/* otherwise there's an intersection */
nrows_req = new_indstart[p+1] - new_indstart[p];
if (lrow_incr > nrows_req)
lrow_incr = nrows_req;
if (lrow_incr > total_rows_to_send)
lrow_incr = total_rows_to_send;
total_rows_to_send -= lrow_incr;
cur_local_row += lrow_incr;
}
sdisps[p] = cur_local_row;
assert(cur_local_row == lnr);
for (p = 0; p < csize; ++p) scounts[p] = sdisps[p+1] - sdisps[p];
if (!indstart) {
MPI_Alltoall (scounts, 1, MPI_INT, rcounts, 1, MPI_INT, comm);
cumsum = 0;
for (p = 0; p < csize; ++p) {
rdisps[p] = cumsum;
cumsum += rcounts[p];
}
rdisps[p] = cumsum;
assert(cumsum == new_indstart[rank+1] - new_indstart[rank]);
}
else {
int new_lindstart = new_indstart[rank];
int total_rows_to_recv = new_indstart[rank+1] - new_indstart[rank];
int cur_new_local_row = 0;
for (p = 0; p < csize; ++p) {
int lrow_incr, nrows_req;
rdisps[p] = cur_new_local_row;
lrow_incr = indstart[p+1] - new_lindstart;
if (lrow_incr <= 0) continue; /* haven't reached recipients yet */
if (0 == total_rows_to_recv) continue; /* past all recipients */
assert(total_rows_to_recv > 0);
/* otherwise there's an intersection */
nrows_req = indstart[p+1] - indstart[p];
if (lrow_incr > nrows_req)
lrow_incr = nrows_req;
if (lrow_incr > total_rows_to_recv)
lrow_incr = total_rows_to_recv;
total_rows_to_recv -= lrow_incr;
cur_new_local_row += lrow_incr;
}
rdisps[p] = cur_new_local_row;
assert(cur_new_local_row == new_indstart[rank+1] - new_indstart[rank]);
for (p = 0; p < csize; ++p) rcounts[p] = rdisps[p+1] - rdisps[p];
}
}
static void
make_nent_sr_counts (const int rank, const int csize,
const int *sdisps,
const struct spcsr_t local,
int *nent_send, int *nent_recv,
MPI_Comm comm)
{
int p;
/*
for (p = 0; p < csize; ++p) {
int p2;
MPI_Barrier(comm);
if (rank == p) {
for (p2 = 0; p2 <= csize; ++p2) {
fprintf (stderr, "%d: sdisps[%d] = %d\n",
rank, p2, sdisps[p2]);
}
}
MPI_Barrier(comm);
}
*/
for (p = 0; p < csize; ++p) {
if (sdisps[p+1] == sdisps[p]) /* local.rowoff may be null */
nent_send[p] = 0;
else
nent_send[p] = local.rowoff[sdisps[p+1]] - local.rowoff[sdisps[p]];
assert(nent_send[p] >= 0);
assert(nent_send[p] <= local.nent);
}
MPI_Alltoall (nent_send, 1, MPI_INT, nent_recv, 1, MPI_INT, comm);
}
int
gcsr_redist_rowinds (struct gcsr_t *gA, const int *new_indstart, MPI_Comm comm)
{
int rank, csize, am_root;
int lindstart, lindend;
int new_lindstart, new_lindend;
struct spcsr_t local, new_local;
int cumsum, p, i;
int *scounts, *sdisps, *rcounts, *rdisps;
if (!gA || !new_indstart) return -1;
rank = gA->rowmap.p;
csize = gA->rowmap.np;
am_root = (0 == rank);
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -2233);
if (mpicsize != csize) MPI_Abort(comm, -2333);
}
#endif
if (new_indstart[csize] != gA->nr) return -1;
local = gA->local;
gcsr_unpack_colinds (gA);
lindstart = gA->rowmap.lindstart;
lindend = lindstart + gA->rowmap.lnind;
new_lindstart = new_indstart[rank];
new_lindend = new_indstart[rank+1];
spcsr_init_clear (&new_local);
new_local.nr = new_indstart[rank+1] - new_indstart[rank];
new_local.nc = (new_local.nr? gA->nc : 0);
if (0 != new_local.nr) {
new_local.rowoff = malloc ((new_local.nr + 1) * sizeof(int));
if (!new_local.rowoff) MPI_Abort(comm, -3939821);
} else
new_local.rowoff = NULL;
/*
for (p = 0; p < csize; ++p) {
int p2;
MPI_Barrier(comm);
if (rank == p) {
fprintf(stderr, "%d: local nr old: %d, new: %d\n", rank,
local.nr, new_local.nr);
}
MPI_Barrier(comm);
}
*/
scounts = malloc (4 * (csize+1) * sizeof(int));
sdisps = scounts + csize + 1;
rcounts = sdisps + csize + 1;
rdisps = rcounts + csize + 1;
make_row_sr_counts (rank, csize, lindstart, local.nr,
gA->rowmap.indstart, new_indstart,
scounts, sdisps, rcounts, rdisps, comm);
/*
for (p = 0; p < csize; ++p) {
int p2;
MPI_Barrier(comm);
if (rank == p) {
for (p2 = 0; p2 < csize; ++p2) {
fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
rank, p2, scounts[p2], sdisps[p2]);
fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
rank, p2, rcounts[p2], rdisps[p2]);
}
}
MPI_Barrier(comm);
}
*/
MPI_Alltoallv (local.rowoff, scounts, sdisps, MPI_INT,
new_local.rowoff, rcounts, rdisps, MPI_INT,
comm);
/*
fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
*/
make_nent_sr_counts (rank, csize, sdisps, local, scounts, rcounts, comm);
/* now scounts and rcounts count _entries_, not rows */
/*
fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
*/
/* fixup offsets */
cumsum = 0;
for (p = 0; p < csize; ++p) {
if (rdisps[p] != rdisps[p+1]) { /* else new_local.rowoff may be NULL */
int firstoff = new_local.rowoff[rdisps[p]];
for (i = rdisps[p]; i < rdisps[p+1]; ++i) {
new_local.rowoff[i] -= firstoff;
new_local.rowoff[i] += cumsum;
}
cumsum += rcounts[p];
}
}
new_local.nent = cumsum;
if (new_local.rowoff) new_local.rowoff[new_local.nr] = cumsum;
assert( !new_local.rowoff || (0 == new_local.rowoff[0]) );
/*
fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
*/
if (cumsum > 0) {
new_local.colind = malloc (cumsum * sizeof(int));
new_local.entry = malloc (cumsum * sizeof(double));
if (!new_local.colind || !new_local.entry) MPI_Abort(comm, -9083);
} else {
new_local.colind = NULL;
new_local.entry = NULL;
}
/*
fprintf (stderr, "%d: here\n", rank); MPI_Barrier(comm);
*/
for (p = 0; p < csize; ++p) {
if (local.rowoff) sdisps[p] = local.rowoff[sdisps[p]];
assert(local.rowoff || (sdisps[p] == 0));
if (new_local.rowoff) rdisps[p] = new_local.rowoff[rdisps[p]];
assert(new_local.rowoff || (rdisps[p] == 0));
}
/*
for (p = 0; p < csize; ++p) {
int p2;
MPI_Barrier(comm);
if (rank == p) {
for (p2 = 0; p2 < csize; ++p2) {
fprintf (stderr, "%d: sending to %d: (%d, %d)\n",
rank, p2, scounts[p2], sdisps[p2]);
fprintf (stderr, "%d: recving from %d: (%d, %d)\n",
rank, p2, rcounts[p2], rdisps[p2]);
}
}
MPI_Barrier(comm);
}
*/
/*
fprintf (stderr, "%d: here a\n", rank); MPI_Barrier(comm);
*/
MPI_Alltoallv (local.colind, scounts, sdisps, MPI_INT,
new_local.colind, rcounts, rdisps, MPI_INT,
comm);
/*
fprintf (stderr, "%d: here b\n", rank); MPI_Barrier(comm);
*/
MPI_Alltoallv (local.entry, scounts, sdisps, MPI_DOUBLE,
new_local.entry, rcounts, rdisps, MPI_DOUBLE,
comm);
/*
fprintf (stderr, "%d: here c\n", rank); MPI_Barrier(comm);
*/
spcsr_free (&local);
gA->local = new_local;
gA->rowmap.lindstart = new_lindstart;
gA->rowmap.lnind = new_lindend - new_lindstart;
if (!gA->rowmap.indstart)
gA->rowmap.indstart = malloc ((csize+1) * sizeof(int));
if (gA->rowmap.indstart) /* if malloc fails, no big deal */
memcpy (gA->rowmap.indstart, new_indstart, (csize+1)*sizeof(int));
assert(gA->rowmap.lnind == gA->local.nr);
free(scounts);
/*
fprintf (stderr, "%d: done!\n", rank); MPI_Barrier(comm);
*/
/*
for (p = 0; p < csize; ++p) {
MPI_Barrier(comm);
if (rank == p) {
fprintf (stderr, "%d: local (%d, %d; %d)\n",
rank, gA->local.nr, gA->local.nc, gA->local.nent);
}
MPI_Barrier(comm);
}
*/
return 0;
}
int
gcsr_redist_nrows (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
int rank, csize, am_root;
int *new_indstart;
int errflg;
int p, nr, nrows_per;
if (!gA) return -1;
rank = gA->rowmap.p;
csize = gA->rowmap.np;
am_root = (0 == rank);
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -2233);
if (mpicsize != csize) MPI_Abort(comm, -2333);
}
#endif
nr = gA->nr;
new_indstart = malloc ((csize+1) * sizeof(int));
if (!new_indstart) MPI_Abort(comm, -303030);
nrows_per = (nr + csize - 1) / csize;
if (nrows_per < minrows) nrows_per = minrows;
new_indstart[0] = 0;
for (p = 1; p <= csize; ++p) {
new_indstart[p] = new_indstart[p-1] + nrows_per;
if (new_indstart[p] > nr) new_indstart[p] = nr;
}
errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
free(new_indstart);
return errflg;
}
int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
int rank, csize, am_root;
int *new_indstart = NULL, *rowdeg = NULL;
int *rcounts = NULL, *rdisps = NULL;
int errflg;
int ents_target, ents_so_far, rows_so_far;
int i, p, nr;
int gnent;
if (!gA) return -1;
rank = gA->rowmap.p;
csize = gA->rowmap.np;
am_root = (0 == rank);
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -2238);
if (mpicsize != csize) MPI_Abort(comm, -2339);
}
#endif
/*
fprintf (stderr, "%d: baz0\n", rank);
*/
nr = gA->nr;
new_indstart = malloc ((csize+1) * sizeof(int));
if (!new_indstart) MPI_Abort(comm, -303031);
/*
fprintf (stderr, "%d: baz1\n", rank);
*/
if (am_root) {
rowdeg = malloc ((1 + gA->nr + 2*csize + 2) * sizeof(int));
/* XXX: too much memory */
if (!rowdeg) MPI_Abort(comm, -22292);
rcounts = rowdeg + 1 + gA->nr;
rdisps = rcounts + csize;
}
/*
fprintf (stderr, "%d: baz2 %p %p %p\n", rank, rowdeg, rcounts, rdisps);
*/
MPI_Gather (&gA->local.nr, 1, MPI_INT, rcounts, 1, MPI_INT, 0, comm);
/* fprintf (stderr, "%d: baz2.1\n", rank); */
if (am_root) {
rdisps[0] = 0;
for (p = 1; p <= csize; ++p) {
rdisps[p] = rdisps[p-1] + rcounts[p-1];
}
}
/* fprintf (stderr, "%d: baz3\n", rank); */
MPI_Gatherv (gA->local.rowoff + 1, gA->local.nr, MPI_INT,
rowdeg, rcounts, rdisps, MPI_INT,
0, comm);
MPI_Reduce (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, 0, comm);
/* fprintf (stderr, "%d: baz4\n", rank); */
if (am_root) {
/* turn into coldegs */
for (p = 0; p < csize; ++p) {
int prevoff, disp, k;
/* fprintf (stderr, "%d: baz5 rcounts[%d] = %d\n", rank, p, rcounts[p]); */
if (rcounts[p] <= 1) continue;
disp = rdisps[p];
prevoff = rowdeg[disp];
for (k = 1; k < rcounts[p]; ++k) {
int tmp = rowdeg[disp+k];
rowdeg[disp+k] -= prevoff;
prevoff = tmp;
}
}
/* now split */
ents_target = (gnent + csize - 1) / csize;
i = 0;
for (p = 0; p < csize; ++p) {
new_indstart[p] = i;
ents_so_far = rows_so_far = 0;
for (; i < nr; ++i) {
ents_so_far += rowdeg[i];
++rows_so_far;
if (rows_so_far > minrows && ents_so_far > ents_target)
break;
}
}
new_indstart[p] = nr;
/*
for (p = 0; p <= csize; ++p) {
fprintf (stderr, "%d: new_indstart[%d] = %d\n",
rank, p, new_indstart[p]);
}
*/
}
/*
fprintf (stderr, "%d: baz6\n", rank);
*/
MPI_Bcast (new_indstart, csize+1, MPI_INT, 0, comm);
/*
fprintf (stderr, "%d: baz7\n", rank);
*/
errflg = gcsr_redist_rowinds (gA, new_indstart, comm);
if (rowdeg) free(rowdeg);
free(new_indstart);
/*
fprintf (stderr, "%d: baz8\n", rank);
*/
return errflg;
}
#if 0
int
gcsr_redist_ents (struct gcsr_t *gA, int minrows, MPI_Comm comm)
{
int rank, csize, am_root;
int *new_indstart;
int errflg;
int p, nr, nrows_per, nent_per;
int gnent, nent_start, nent_finish;
int lneigh, rneigh;
if (!gA) return -1;
rank = gA->rowmap.p;
csize = gA->rowmap.np;
am_root = (0 == rank);
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -2233);
if (mpicsize != csize) MPI_Abort(comm, -2333);
}
#endif
nr = gA->nr;
new_indstart = malloc ((p+1) * sizeof(int));
if (!new_indstart) MPI_Abort(comm, -303031);
/* Make an exclusive scan.
Could use MPI_Exscan; then add local.nent to gnent. */
MPI_Scan (&gA->local.nent, &gnent, 1, MPI_INT, MPI_SUM, comm);
lneigh = rank-1;
if (lneigh < 0) lneigh = MPI_PROC_NULL;
rneigh = rank+1;
if (rneigh >= csize) rneigh = MPI_PROC_NULL;
nent_start = 0;
MPI_Sendrecv (&gnent, 1, MPI_INT, rneigh, 99992,
&nent_start, 1, MPI_INT, lneigh, 99992,
comm, MPI_STATUS_IGNORE);
#if !defined(NDEBUG)
{
int gnent_reduct;
MPI_Allreduce (&gA->local.nent, &gnent_reduct, 1, MPI_INT,
MPI_SUM, comm);
if (rank == csize-1)
assert(gnent == gnent_reduct);
}
#endif
MPI_Bcast (&gnent, 1, MPI_INT, csize-1, comm);
nent_per = (gnent + csize - 1) / csize;
/*AUUUGH! minrows kills everything quickly! */
}
#endif /* until I figure out how to do it in parallel */
static int
compind (const void *p1, const void *p2)
{
const int a = *(const int*) p1;
const int b = *(const int*) p2;
return b - a;
}
void
gcsr_pack_colinds (struct gcsr_t* gA)
{
struct spcsr_t local;
int k, nent, nc, lnc;
int *g2l = NULL, *l2g = NULL, *tmp = NULL;
if (!gA) return;
if (0 == gA->local.nr || 0 == gA->local.nc || 0 == gA->local.nent) return;
/*return;*/
nc = gA->nc;
l2g = malloc (nc * sizeof(int));
g2l = malloc (nc * sizeof(int));
if (!l2g || !g2l) goto memfree;
gcsr_unpack_colinds (gA);
local = gA->local;
nent = local.nent;
memset (g2l, -1, nc * sizeof(int));
lnc = 0;
for (k = 0; k < nent; ++k) {
const int j = local.colind[k];
assert(j >= 0);
assert(j < nc);
if (g2l[j] < 0) {
l2g[lnc] = j;
g2l[j] = 1;
++lnc;
}
}
if (lnc >= (3*nc)/4) goto memfree; /* Don't bother. */
/*
Keep in same order as original columns.
Makes transpose-like ops easier, shouldn't cost much.
*/
qsort (l2g, lnc, sizeof(int), compind);
for (k = 0; k < lnc; ++k) {
const int gj = l2g[k];
assert(gj >= 0);
assert(gj < gA->nc);
g2l[gj] = k;
}
tmp = realloc(l2g, lnc * sizeof(int));
if (tmp) l2g = tmp;
for (k = 0; k < nent; ++k) {
const int j = local.colind[k];
const int new_j = g2l[j];
assert(j >= 0);
assert(j < nc);
assert(new_j >= 0);
assert(new_j < lnc);
local.colind[k] = new_j;
}
gA->local.nc = lnc;
gA->colmap.l2g = l2g;
free (g2l);
return;
memfree:
if (g2l) free (g2l);
if (l2g) free (l2g);
}
int
gcsr_take_csr_locally (const int gnr, const int gnc,
struct spcsr_t *local, struct gcsr_t *gA,
int **indstart, MPI_Comm comm)
{
int rank, csize;
if (!local || !gA || !indstart || !*indstart) return 1;
gcsr_init_clear (gA);
gA->nr = gnr;
gA->nc = gnc;
gA->local = *local;
spcsr_init_clear (local);
MPI_Comm_size (comm, &csize);
MPI_Comm_rank (comm, &rank);
gA->rowmap.p = rank;
gA->rowmap.np = csize;
gA->rowmap.indstart = *indstart;
gA->rowmap.lindstart = gA->rowmap.indstart[rank];
gA->rowmap.lnind = gA->rowmap.indstart[rank+1] - gA->rowmap.lindstart;
*indstart = NULL;
/*
fprintf (stderr, "%d: locally (%d, %d) (%d, %d; %d)\n", rank,
gnr, gnc, gA->local.nr, gA->local.nc, gA->local.nent);
*/
gcsr_pack_colinds (gA);
return 0;
}
static int
transpose_matrix_chunks (const struct gcsr_t *gA, const int *indstartT,
int *p_rows, int **p_rowind_, int **p_rowoff_,
int **p_colind_, double **p_entry_,
MPI_Comm comm)
{
int rank, csize, p;
int *p_rowind, *p_rowoff, *p_colind;
double *p_entry;
int *scounts, *sdisps, *rcounts, *rdisps, ndata_req;
struct spcsr_t local;
struct spcsr_t localT;
int iT, nent, cum_nent;
int *l2g, *l2g_own = NULL;
spcsr_init_clear (&localT);
rank = gA->rowmap.p;
csize = gA->rowmap.np;
#if !defined(NDEBUG)
{
int mpirank, mpicsize;
MPI_Comm_size(comm, &mpicsize);
MPI_Comm_rank(comm, &mpirank);
if (mpirank != rank) MPI_Abort(comm, -2233);
if (mpicsize != csize) MPI_Abort(comm, -2333);
}
#endif
local = gA->local;
scounts = malloc (4 * (csize + 1) * sizeof(int));
if (!scounts) MPI_Abort(comm, -389762);
sdisps = scounts + csize + 1;
rcounts = sdisps + csize + 1;
rdisps = rcounts + csize + 1;
l2g = gA->colmap.l2g;
if (!l2g) {
int j;
l2g_own = malloc (local.nc * sizeof(int));
for (j = 0; j < local.nc; ++j) l2g_own[j] = j;
l2g = l2g_own;
}
/* get p_rows, the map from p -> [rows) */
memset (scounts, 0, (csize+1) * sizeof(int)); /* rows to send */
p = 0;
assert(gA->nc == indstartT[csize]);
for (iT = 0; iT < local.nc; ++iT) {
const int giT = (l2g? l2g[iT] : iT);
assert (giT < gA->nc);
while (giT >= indstartT[p+1]) ++p; /* the giT are sorted... */
++scounts[p];
}
/* fprintf (stderr, "%d: boing1\n", rank); */
MPI_Alltoall (scounts, 1, MPI_INT, rcounts, 1, MPI_INT,
comm);
sdisps[0] = 0;
for (p = 0; p < csize; ++p) {
sdisps[p+1] = sdisps[p] + scounts[p];
}
ndata_req = 0;
for (p = 0; p < csize; ++p) {
p_rows[p] = ndata_req;
ndata_req += rcounts[p];
}
p_rows[csize] = ndata_req; /* total number of rows to be received */
/*
now get
p_rowind: map of packed row -> _global_ row index
p_rowoff: map of p_row[p] -> [offs), shifted
*/
p_rowind = malloc ((2 * ndata_req + 1) * sizeof(int));
if (!p_rowind) MPI_Abort (comm, -467474);
p_rowoff = p_rowind + ndata_req;
*p_rowind_ = p_rowind;
*p_rowoff_ = p_rowoff;
/* fprintf (stderr, "%d: boing2 %p %p\n", rank, l2g, p_rowind); */
MPI_Alltoallv (l2g, scounts, sdisps, MPI_INT,
p_rowind, rcounts, p_rows, MPI_INT, comm);
spcsr_transpose (&local, &localT); /* ideally during above */
/* Make localT's column indices (local's rows) globally numbered */
assert(local.nent == localT.nent);
{
int k;
const int rowbase = gA->rowmap.lindstart;
for (k = 0; k < localT.nent; ++k) {
const int j = localT.colind[k];
assert(j >= 0);
assert(j < localT.nc);
localT.colind[k] = j + rowbase;
assert(localT.colind[k] < gA->nr);
/*
if (localT.entry[k] == 0.0)
fprintf(stderr, "%d: found zero off %d\n", rank, k);
*/
}
localT.nc = gA->nr;
}
/* fprintf (stderr, "%d: boing3\n", rank); */
MPI_Alltoallv (localT.rowoff, scounts, sdisps, MPI_INT,
p_rowoff, rcounts, p_rows, MPI_INT, comm);
/* shift row indices into local space */ /* ideally during above */
{
const int lindstart = indstartT[rank];
#if !defined(NDEBUG)
const int lindend = indstartT[rank+1];
#endif
int k;
for (k = 0; k < p_rows[csize]; ++k) {
assert(p_rowind[k] >= lindstart);
assert(p_rowind[k] < lindend);
p_rowind[k] -= lindstart;
}
}
/* fprintf (stderr, "%d: boing4\n", rank); */
/* need # of entries to unshift p_rowoff and recv colinds and entries */
/* scounts[p] = # entries to send to p */
for (p = 0; p < csize; ++p) {
if (localT.rowoff)
scounts[p] = localT.rowoff[sdisps[p+1]] - localT.rowoff[sdisps[p]];
else
scounts[p] = 0;
/*
fprintf (stderr, "%d: sending %d entries (of %d) to %d\n", rank,
scounts[p], localT.nent, p);
*/
}
/* swap # entries */
MPI_Alltoall (scounts, 1, MPI_INT, rcounts, 1, MPI_INT,
comm);
sdisps[0] = 0; /* disps for sending colinds and entries */
for (p = 0; p < csize; ++p) {
sdisps[p+1] = sdisps[p] + scounts[p];
}
/* Note: rdisps is not the same as p_rowoff. rdisps covers the whole of
a proc's contributions, p_rowoff is broken into each row within each
proc.
*/
nent = 0;
for (p = 0; p < csize; ++p) {
rdisps[p] = nent;
nent += rcounts[p];
}
rdisps[csize] = nent;
/*
for (p = 0; p < csize; ++p) {
fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
p, scounts[p], sdisps[p]);
fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
p, rcounts[p], rdisps[p]);
}
*/
cum_nent = 0;
for (p = 0; p < csize; ++p) {
const int firstoff = p_rowoff[p_rows[p]];
int k;
assert(cum_nent == rdisps[p]); /* consistent with rdisps */
for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
p_rowoff[k] -= firstoff;
p_rowoff[k] += cum_nent;
}
cum_nent += rcounts[p];
}
p_rowoff[p_rows[csize]] = cum_nent;
assert(cum_nent == nent); /* trivially true */
#if !defined(NDEBUG)
/* check that p_rowoff is self-consistent */
{
int k;
assert(0 == p_rows[0]);
assert(0 == p_rowoff[0]);
for (p = 0; p < csize; ++p) {
assert(p_rows[p] <= p_rows[p+1]);
for (k = p_rows[p]; k < p_rows[p+1]; ++k) {
assert(p_rowoff[p] <= p_rowoff[p+1]);
}
}
assert(k == p_rows[csize]);
assert(cum_nent == p_rowoff[k]);
}
#endif
/* now set up packed colind and entry arrays */
/* fprintf (stderr, "%d: allocing for %d entries\n", rank, nent); */
p_colind = malloc (nent * sizeof(int));
p_entry = malloc (nent * sizeof(double));
if (!p_colind || !p_entry) MPI_Abort(comm, -74423);
*p_colind_ = p_colind;
*p_entry_ = p_entry;
/*
for (p = 0; p < csize; ++p) {
fprintf (stderr, "%d: sending to %d: %d entries , disp %d\n", rank,
p, scounts[p], sdisps[p]);
fprintf (stderr, "%d: recving from %d: %d entries , disp %d\n", rank,
p, rcounts[p], rdisps[p]);
}
*/
MPI_Alltoallv (localT.colind, scounts, sdisps, MPI_INT,
p_colind, rcounts, rdisps, MPI_INT,
comm);
MPI_Alltoallv (localT.entry, scounts, sdisps, MPI_DOUBLE,
p_entry, rcounts, rdisps, MPI_DOUBLE,
comm);
/*memcpy (p_entry, localT.entry, nent*sizeof(double));*/
/*
{
int zz;
fprintf (stderr, "%d: nent = %d\n", rank, nent);
for (zz = 0; zz < nent; ++zz) {
if (p_entry[zz] == 0.0)
fprintf (stderr, "%d: zero entry off %d\n", rank, zz);
}
}
*/
if (l2g_own) free (l2g_own);
if (scounts) free (scounts);
spcsr_free (&localT);
return 0;
}
static int
collapse_matrix_chunks (const int csize,
const int *p_rows, const int *p_rowind,
const int *p_rowoff,
const int *p_colind, const double *p_entry,
struct spcsr_t *local)
{
int *first = NULL;
const int nr = local->nr;
#if !defined(NDEBUG)
const int nc = local->nc;
#endif
int i, rdeg, p;
int *rowoff = local->rowoff;
int *colind = local->colind;
double *entry = local->entry;
#if !defined(NDEBUG)
int *ci = NULL;
#endif
/*
{
int rank, k, zz;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf (stderr, "%d: p_rows[%d]: [%d, %d)\n", rank,
0, p_rows[0], p_rows[1]);
fprintf (stderr, "%d: p_rowind: ", rank);
zz = 0;
for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
if (zz >= 10) { fprintf (stderr, "..."); break; }
fprintf(stderr, " %d", p_rowind[k]);
}
fprintf(stderr, "\n%d: p_rowoff: ", rank);
zz = 0;
for (k = p_rows[0]; k < p_rows[1]; ++k, ++zz) {
if (zz >= 10) { fprintf (stderr, "..."); break; }
fprintf(stderr, " %d", p_rowoff[k]);
}
fprintf (stderr, "\n");
}
*/
first = malloc ( csize * sizeof(int));
if (!first) return -1;
for (p = 0; p < csize; ++p) first[p] = p_rows[p];
#if !defined (NDEBUG)
ci = malloc ( nc * sizeof(int) );
memset (ci, -1, nc * sizeof(int));
#endif
rdeg = 0;
for (i = 0; i < nr; ++i) {
int p;
rowoff[i] = rdeg;
for (p = 0; p < csize; ++p) {
const int lastp = p_rows[p+1];
int rioff = first[p], k;
while (rioff < lastp && i < p_rowind[rioff]) ++rioff;
first[p] = rioff;
if (lastp == rioff || i != p_rowind[rioff]) continue;
++first[p];
/* Now at row i, so scatter row it */
/* scatter is trivial as there's no overlap */
/* debug var ci checks for overlap */
for (k = p_rowoff[rioff]; k < p_rowoff[rioff+1]; ++k) {
const int j = p_colind[k];
const double e = p_entry[k];
assert (j >= 0);
assert (j < nc);
assert (ci[j] < 0);
#if !defined(NDEBUG)
ci[j] = rdeg;
#endif
colind[rdeg] = j;
/*
if (0.0 == e) {
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr, "%d: moving a zero (%d,%d) off %d\n", rank, i, j, rdeg);
}
*/
entry[rdeg] = e;
++rdeg;
}
}
#if !defined(NDEBUG)
{ /* clear ci */
int k;
for (k = rowoff[i]; k < rdeg; ++k) {
const int j = colind[k];
assert (j >= 0);
assert (j < nc);
assert (ci[j] >= 0);
ci[j] = -1;
}
}
#endif
}
rowoff[i] = rdeg;
/*
{
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf (stderr, "%d: rowoff[%d] = %d (%d)\n", rank, i, rdeg, local->nent);
}
*/
#if !defined(NDEBUG)
{
int rank, csize;
int *seenr = NULL, *seenc = NULL;
int i, k;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &csize);
if (1 == csize) {
seenr = calloc(nr+nc, sizeof(int));
seenc = seenr + nr;
}
assert(rowoff[0] == 0);
assert(rowoff[nr] == local->nent);
for (i = 0; i < nr; ++i) {
assert(rowoff[i] >= 0);
assert(rowoff[i] <= rowoff[i+1]);
if (seenr) if (rowoff[i] != rowoff[i+1]) seenr[i] = 1;
for (k = rowoff[i]; k < rowoff[i+1]; ++k) {
const int j = colind[k];
if (seenc) seenc[j] = 1;
assert(j >= 0);
assert(j < nc);
}
}
if (seenr)
for (i = 0; i < nr; ++i) {
if (!seenr[i])
fprintf (stderr, "%d: seenr[%i] missing\n", rank, i);
}
if (seenc)
for (i = 0; i < nc; ++i) {
if (!seenc[i])
fprintf (stderr, "%d: seenc[%i] missing\n", rank, i);
}
if (seenr) free (seenr);
/* fprintf (stderr, "%d: done with (%d, %d)\n", rank, nr, nc);*/
}
#endif
if (first) free (first);
#if !defined(NDEBUG)
if (ci) free (ci);
#endif
return 0;
}
void
gcsr_transpose (struct gcsr_t *gA, struct gcsr_t *gAT,
const int *indstartT_in, MPI_Comm comm)
{
int rank, csize;
struct spcsr_t localT;
const int *indstartT = NULL;
int *indstartT_own = NULL;
int *p_rows = NULL, *p_rowoff = NULL, *p_rowind = NULL, *p_colind = NULL;
double *p_entry = NULL;
int nent;
if (!gA || !gAT) return;
spcsr_init_clear (&localT);
rank = gA->rowmap.p;
csize = gA->rowmap.np;
/* build indstartT, and indstartT_own for transpose's map */
gcsr_expand_rowmap (gA, comm);
indstartT_own = malloc ((csize+1) * sizeof(int));
if (!indstartT_own) MPI_Abort(comm, -393927);
if (indstartT_in) {
indstartT = indstartT_in;
memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
} else {
indstartT = gA->rowmap.indstart;
memcpy (indstartT_own, indstartT, (csize+1)*sizeof(int));
}
p_rows = malloc ((csize+1) * sizeof(int));
if (!p_rows) MPI_Abort (comm, 643443);
if (transpose_matrix_chunks (gA, indstartT,
p_rows, &p_rowind, &p_rowoff,
&p_colind, &p_entry, comm))
MPI_Abort (comm, 101028);
nent = p_rowoff[p_rows[csize]];
#if !defined(NDEBUG)
{
int gnentin[2];
int gnent[2];
gnentin[0] = gA->local.nent;
gnentin[1] = nent;
MPI_Allreduce (&gnentin, &gnent, 2, MPI_INT, MPI_SUM, comm);
assert(gnent[0] == gnent[1]);
}
#endif
localT.nc = gA->nr;
localT.nr = indstartT[rank+1] - indstartT[rank];
localT.nent = nent;
if (spcsr_alloc_ (localT.nr, localT.nc, localT.nent,
&localT.rowoff, &localT.colind, &localT.entry,
NULL))
MPI_Abort (comm, 653747);
if (collapse_matrix_chunks (csize, p_rows, p_rowind, p_rowoff,
p_colind, p_entry, &localT))
MPI_Abort (comm, 456746);
/*
{
int p;
for (p = 0; p < csize; ++p) {
MPI_Barrier(MPI_COMM_WORLD);
if (p == rank) {
printf("%d: gA(%d, %d), lA(%d, %d; %d) rows [%d, %d)\n",
rank, gA->nr, gA->nc, localT.nr, localT.nc, localT.nent,
indstartT[rank], indstartT[rank+1]);
printf ("%d: %d %d %d\n", rank, localT.rowoff[0], localT.rowoff[1], localT.rowoff[2]);
}
MPI_Barrier(MPI_COMM_WORLD);
}
}
*/
if (gcsr_take_csr_locally (gA->nc, gA->nr, &localT, gAT,
&indstartT_own, comm))
MPI_Abort (comm, 89372);
#if !defined(NDEBUG)
{
int i, j, k;
int *seenr, *seenc;
int *rootseenr = NULL, *rootseenc;
seenr = calloc (gA->nr + gA->nc, sizeof(int));
seenc = seenr + gA->nr;
if (0 == rank) {
rootseenr = calloc (gA->nr + gA->nc, sizeof(int));
rootseenc = rootseenr + gA->nr;
}
for (i = 0; i < gA->local.nr; ++i) {
const int gi = i + gA->rowmap.lindstart;
if (gA->local.rowoff[i] != gA->local.rowoff[i+1])
seenr[gi] = 1;
for (k = gA->local.rowoff[i]; k < gA->local.rowoff[i+1]; ++k) {
j = gA->local.colind[k];
if (gA->colmap.l2g)
seenc[gA->colmap.l2g[j]] = 1;
else
seenc[j] = 1;
}
}
MPI_Reduce (seenr, rootseenr, gA->nr + gA->nc, MPI_INT, MPI_MAX, 0, comm);
if (0 == rank) {
for (i = 0; i < gA->nr; ++i)
if (!rootseenr[i])
fprintf (stderr, "%d:: no one sees row %d\n", rank, i);
for (j = 0; j < gA->nc; ++j)
if (!rootseenc[j])
fprintf (stderr, "%d:: no one sees col %d\n", rank, j);
}
free(seenr);
if (rootseenr) free(rootseenr);
}
#endif
if (p_rows) free (p_rows);
if (p_rowind) free (p_rowind);
if (p_colind) free (p_colind);
if (p_entry) free (p_entry);
}
int
gcsr_copy (const struct gcsr_t *gA, struct gcsr_t *gAcopy, MPI_Comm comm)
{
gcsr_init_clear (gAcopy);
spcsr_copy (&gA->local, &gAcopy->local);
gAcopy->nr = gA->nr;
gAcopy->nc = gA->nc;
gAcopy->rowmap.p = gA->rowmap.p;
gAcopy->rowmap.np = gA->rowmap.np;
gAcopy->rowmap.lindstart = gA->rowmap.lindstart;
gAcopy->rowmap.lnind = gA->rowmap.lnind;
gAcopy->rowmap.indstart = NULL;
if (gA->rowmap.indstart) {
gAcopy->rowmap.indstart = malloc ((gA->rowmap.np + 1) * sizeof(int));
if (gAcopy->rowmap.indstart)
memcpy (gAcopy->rowmap.indstart, gA->rowmap.indstart,
(gA->rowmap.np + 1) * sizeof(int));
}
if (gA->colmap.l2g) {
gAcopy->colmap.l2g = malloc (gA->nc * sizeof(int));
if (gAcopy->colmap.l2g)
memcpy(gAcopy->colmap.l2g, gA->colmap.l2g, gA->nc * sizeof(int));
else {
MPI_Abort (comm, 934743); /* could just reindex to global */
return ENOMEM;
}
}
return 0;
}
void
gcsr_transpose_onroot (struct gcsr_t *gA, struct gcsr_t *gAT,
const int *indstartT_in, MPI_Comm comm)
{
if (gcsr_copy (gA, gAT, comm)) MPI_Abort(comm, 998701);
if (gcsr_redist_root (gAT, comm)) MPI_Abort(comm, 998702);
if (0 == gA->rowmap.p) {
struct spcsr_t localT;
if (spcsr_transpose (&gAT->local, &localT)) MPI_Abort(comm, 998703);
spcsr_free(&gAT->local);
gAT->local = localT;
}
gAT->nr = gA->nc;
gAT->nc = gA->nr;
if (indstartT_in) {
gcsr_redist_rowinds (gAT, indstartT_in, comm);
}
else {
gcsr_expand_rowmap (gA, comm);
gcsr_redist_rowinds (gAT, gA->rowmap.indstart, comm);
}
}