#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include "Cnames.h"
#include "supermatrix.h"
#include "util_dist.h"
#include "psymbfact.h"
Go to the source code of this file.
Data Structures | |
struct | superlu_scope_t |
struct | gridinfo_t |
struct | Glu_persist_t |
struct | Glu_freeable_t |
struct | ScalePermstruct_t |
struct | superlu_options_t |
struct | mem_usage_t |
Defines | |
#define | mpi_int_t MPI_INT |
#define | BC_HEADER 2 |
#define | LB_DESCRIPTOR 2 |
#define | BR_HEADER 3 |
#define | UB_DESCRIPTOR 2 |
#define | NBUFFERS 5 |
#define | NTAGS 10000 |
#define | UjROW 10 |
#define | UkSUB 11 |
#define | UkVAL 12 |
#define | LkSUB 13 |
#define | LkVAL 14 |
#define | LkkDIAG 15 |
#define | XK_H 1 |
#define | LSUM_H 1 |
#define | GSUM 20 |
#define | Xk 21 |
#define | Yk 22 |
#define | LSUM 23 |
#define | COMM_ALL 100 |
#define | COMM_COLUMN 101 |
#define | COMM_ROW 102 |
#define | SUPER_LINEAR 11 |
#define | SUPER_BLOCK 12 |
#define | NO_MARKER 3 |
#define | IAM(comm) { int rank; MPI_Comm_rank ( comm, &rank ); rank}; |
#define | MYROW(iam, grid) ( (iam) / grid->npcol ) |
#define | MYCOL(iam, grid) ( (iam) % grid->npcol ) |
#define | BlockNum(i) ( supno[i] ) |
#define | FstBlockC(bnum) ( xsup[bnum] ) |
#define | SuperSize(bnum) ( xsup[bnum+1]-xsup[bnum] ) |
#define | LBi(bnum, grid) ( (bnum)/grid->nprow ) |
#define | LBj(bnum, grid) ( (bnum)/grid->npcol ) |
#define | PROW(bnum, grid) ( (bnum) % grid->nprow ) |
#define | PCOL(bnum, grid) ( (bnum) % grid->npcol ) |
#define | PNUM(i, j, grid) ( (i)*grid->npcol + j ) |
#define | CEILING(a, b) ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) |
#define | RHS_ITERATE(i) for (i = 0; i < nrhs; ++i) |
#define | X_BLK(i) ilsum[i] * nrhs + (i+1) * XK_H |
#define | LSUM_BLK(i) ilsum[i] * nrhs + (i+1) * LSUM_H |
#define | SuperLU_timer_ SuperLU_timer_dist_ |
#define | LOG2(x) (log10((double) x) / log10(2.0)) |
#define | VT_TRACEON |
#define | VT_TRACEOFF |
Typedefs | |
typedef int | int_t |
Enumerations | |
enum | yes_no_t { NO, YES } |
enum | fact_t { DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED } |
enum | rowperm_t { NOROWPERM, LargeDiag, MY_PERMR } |
enum | colperm_t { NATURAL, MMD_AT_PLUS_A, MMD_ATA, METIS_AT_PLUS_A, PARMETIS, MY_PERMC } |
enum | trans_t { NOTRANS, TRANS, CONJ } |
enum | DiagScale_t { NOEQUIL, ROW, COL, BOTH } |
enum | IterRefine_t { NOREFINE, SINGLE = 1, DOUBLE, EXTRA } |
enum | MemType { LUSUP, UCOL, LSUB, USUB } |
enum | stack_end_t { HEAD, TAIL } |
enum | LU_space_t { SYSTEM, USER } |
Functions | |
void | set_default_options_dist (superlu_options_t *) |
Set the default values for the options argument. | |
void | print_options_dist (superlu_options_t *) |
Print the options setting. | |
void | Destroy_CompCol_Matrix_dist (SuperMatrix *) |
void | Destroy_SuperNode_Matrix_dist (SuperMatrix *) |
void | Destroy_SuperMatrix_Store_dist (SuperMatrix *) |
Deallocate the structure pointing to the actual storage of the matrix. | |
void | Destroy_CompCol_Permuted_dist (SuperMatrix *) |
A is of type Stype==NCP. | |
void | Destroy_CompRowLoc_Matrix_dist (SuperMatrix *) |
void | Destroy_CompRow_Matrix_dist (SuperMatrix *) |
void | sp_colorder (superlu_options_t *, SuperMatrix *, int_t *, int_t *, SuperMatrix *) |
int_t | sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *) |
Nonsymmetric elimination tree. | |
void | countnz_dist (const int_t, int_t *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *) |
int_t | fixupL_dist (const int_t, const int_t *, Glu_persist_t *, Glu_freeable_t *) |
int_t * | TreePostorder_dist (int_t, int_t *) |
float | slamch_ (char *) |
double | dlamch_ (char *) |
void * | superlu_malloc_dist (size_t) |
void | superlu_free_dist (void *) |
int_t * | intMalloc_dist (int_t) |
int_t * | intCalloc_dist (int_t) |
double | SuperLU_timer_ () |
void | superlu_abort_and_exit_dist (char *) |
int_t | sp_ienv_dist (int_t) |
Purpose ======= | |
int | lsame_ (char *, char *) |
int | xerbla_ (char *, int *) |
void | ifill_dist (int_t *, int_t, int_t) |
Fills an integer array with a given value. | |
void | super_stats_dist (int_t, int_t *) |
void | ScalePermstructInit (const int_t, const int_t, ScalePermstruct_t *) |
Allocate storage in ScalePermstruct. | |
void | ScalePermstructFree (ScalePermstruct_t *) |
Deallocate ScalePermstruct. | |
void | superlu_gridinit (MPI_Comm, int_t, int_t, gridinfo_t *) |
All processes in the MPI communicator must call this routine. | |
void | superlu_gridmap (MPI_Comm, int_t, int_t, int_t[], int_t, gridinfo_t *) |
All processes in the MPI communicator must call this routine. | |
void | superlu_gridexit (gridinfo_t *) |
void | get_perm_c_dist (int_t, int_t, SuperMatrix *, int_t *) |
void | a_plus_at_dist (const int_t, const int_t, int_t *, int_t *, int_t *, int_t **, int_t **) |
void | bcast_tree (void *, int, MPI_Datatype, int, int, gridinfo_t *, int, int *) |
int_t | symbfact (superlu_options_t *, int, SuperMatrix *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *) |
int_t | symbfact_SubInit (fact_t, void *, int_t, int_t, int_t, int_t, Glu_persist_t *, Glu_freeable_t *) |
int_t | symbfact_SubXpand (int_t, int_t, int_t, MemType, int_t *, Glu_freeable_t *) |
int_t | symbfact_SubFree (Glu_freeable_t *) |
void | get_diag_procs (int_t, Glu_persist_t *, gridinfo_t *, int_t *, int_t **, int_t **) |
int_t | QuerySpace_dist (int_t, int_t, Glu_freeable_t *, mem_usage_t *) |
void | pxerbla (char *, gridinfo_t *, int_t) |
void | PStatInit (SuperLUStat_t *) |
void | PStatFree (SuperLUStat_t *) |
void | PStatPrint (superlu_options_t *, SuperLUStat_t *, gridinfo_t *) |
float | symbfact_dist (int, int, SuperMatrix *, int_t *, int_t *, int_t *, int_t *, Pslu_freeable_t *, MPI_Comm *, MPI_Comm *, mem_usage_t *) |
float | get_perm_c_parmetis (SuperMatrix *, int_t *, int_t *, int, int, int_t **, int_t **, gridinfo_t *, MPI_Comm *) |
int_t | psymbfact_LUXpandMem (int_t, int_t, int_t, int_t, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_LUXpand (int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_LUXpand_RL (int_t, int_t, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_prLUXpand (int_t, int_t, MemType, Llu_symbfact_t *, psymbfact_stat_t *) |
void | print_panel_seg_dist (int_t, int_t, int_t, int_t, int_t *, int_t *) |
Diagnostic print of segment info after panel_dfs(). | |
void | check_repfnz_dist (int_t, int_t, int_t, int_t *) |
Check whether repfnz[] == EMPTY after reset. | |
int_t | CheckZeroDiagonal (int_t, int_t *, int_t *, int_t *) |
void | PrintDouble5 (char *, int_t, double *) |
void | PrintInt10 (char *, int_t, int_t *) |
int | file_PrintInt10 (FILE *, char *, int_t, int_t *) |
-- Distributed SuperLU routine (version 2.2) -- Lawrence Berkeley National Lab, Univ. of California Berkeley. November 1, 2007 Feburary 20, 2008
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Purpose ======= Broadcast an array of *dtype* numbers. The communication pattern is a tree with number of branches equal to NBRANCHES. The process ranks are between 0 and Np-1.
The following two pairs of graphs give different ways of viewing the same algorithm. The first pair shows the trees as they should be visualized when examining the algorithm. The second pair are isomorphic graphs of of the first, which show the actual pattern of data movement. Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a hypercube broadcast (however, it does not require the nodes be a power of two to work).
TREE BROADCAST, NBRANCHES = 2 * TREE BROADCAST, NBRANCHES = 3
root=2 i=4 &______________ * | \ * root=2 i=2 &______ &______ * i=3 &______________________ | \ | \ * | \ \ i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ | \ | \ | \ | \ * | \ \ | \ \ | \ 2 3 4 5 6 7 0 1 * 2 3 4 5 6 7 0 1
ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS:
2 2 _________|_________ ___________|____________ / | \ / | | \ 6 4 3 5 0 3 4 / \ | / \ | 0 7 5 6 7 1 | 1
Arguments =========
scope |
|
|
|
|
|
Count the total number of nonzeros in factors L and U, and in the symmetrically reduced L. |
|
|
|
|
|
|
|
|
|
|
|
|
|
Purpose =======
DLAMCH determines double precision machine parameters.
Arguments =========
CMACH (input) CHARACTER*1 Specifies the value to be returned by DLAMCH: = 'E' or 'e', DLAMCH := eps = 'S' or 's , DLAMCH := sfmin = 'B' or 'b', DLAMCH := base = 'P' or 'p', DLAMCH := eps*base = 'N' or 'n', DLAMCH := t = 'R' or 'r', DLAMCH := rnd = 'M' or 'm', DLAMCH := emin = 'U' or 'u', DLAMCH := rmin = 'L' or 'l', DLAMCH := emax = 'O' or 'o', DLAMCH := rmax
where
eps = relative machine precision sfmin = safe minimum, such that 1/sfmin does not overflow base = base of the machine prec = eps*base t = number of (base) digits in the mantissa rnd = 1.0 when rounding occurs in addition, 0.0 otherwise emin = minimum exponent before (gradual) underflow rmin = underflow threshold - base**(emin-1) emax = largest exponent before overflow rmax = overflow threshold - (base**emax)*(1-eps)
===================================================================== |
|
|
|
Fix up the data storage lsub for L-subscripts. It removes the subscript sets for structural pruning, and applies permuation to the remaining subscripts. |
|
|
|
Purpose =======
GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple minimum degree ordering code by Joseph Liu to matrix A'*A or A+A', or using approximate minimum degree column ordering by Davis et. al. The LU factorization of A*Pc tends to have less fill than the LU factorization of A.
Arguments =========
ispec (input) colperm_t Specifies what type of column permutation to use to reduce fill. = NATURAL: natural ordering (i.e., Pc = I) = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A = MMD_ATA: minimum degree ordering on structure of A'*A = METIS_AT_PLUS_A: MeTis on A'+A
A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. In the future, more general A can be handled.
perm_c (output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. |
|
Purpose =======
GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a graph partitioning algorithm to the symmetrized graph A+A'. The multilevel graph partitioning algorithm used is the ParMETIS_V3_NodeND routine available in the parallel graph partitioning package parMETIS.
The number of independent sub-domains noDomains computed by this algorithm has to be a power of 2. Hence noDomains is the larger number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow * npcol is the number of processors used in SuperLU_DIST.
Arguments =========
A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format.
perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A.
perm_c (output) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc.
nprocs_i (input) int* Number of processors the input matrix is distributed on in a block row format. It corresponds to number of processors used in SuperLU_DIST.
noDomains (input) int*, must be power of 2 Number of independent domains to be computed by the graph partitioning algorithm. ( noDomains <= nprocs_i )
sizes (output) int_t**, of size 2 * noDomains Returns pointer to an array containing the number of nodes for each sub-domain and each separator. Separators are stored from left to right. Memory for the array is allocated in this routine.
fstVtxSep (output) int_t**, of size 2 * noDomains Returns pointer to an array containing first node for each sub-domain and each separator. Memory for the array is allocated in this routine.
Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. > 0, number of bytes allocated when out of memory. |
|
|
|
|
|
|
|
Purpose =======
LSAME returns .TRUE. if CA is the same letter as CB regardless of case.
Arguments =========
CA (input) CHARACTER*1 CB (input) CHARACTER*1 CA and CB specify the single characters to be compared.
===================================================================== |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Expand the data structures for L and U during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error due to a memory alocation failure |
|
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space |
|
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space |
|
Expand the data structures for L and U pruned during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error when run out of space |
|
|
|
mem_usage consists of the following fields:
|
|
|
|
|
|
|
|
Purpose =======
SLAMCH determines single precision machine parameters.
Arguments =========
CMACH (input) CHARACTER*1 Specifies the value to be returned by SLAMCH: = 'E' or 'e', SLAMCH := eps = 'S' or 's , SLAMCH := sfmin = 'B' or 'b', SLAMCH := base = 'P' or 'p', SLAMCH := eps*base = 'N' or 'n', SLAMCH := t = 'R' or 'r', SLAMCH := rnd = 'M' or 'm', SLAMCH := emin = 'U' or 'u', SLAMCH := rmin = 'L' or 'l', SLAMCH := emax = 'O' or 'o', SLAMCH := rmax
where
eps = relative machine precision sfmin = safe minimum, such that 1/sfmin does not overflow base = base of the machine prec = eps*base t = number of (base) digits in the mantissa rnd = 1.0 when rounding occurs in addition, 0.0 otherwise emin = minimum exponent before (gradual) underflow rmin = underflow threshold - base**(emin-1) emax = largest exponent before overflow rmax = overflow threshold - (base**emax)*(1-eps)
===================================================================== |
|
Find the elimination tree for A'*A. This uses something similar to Liu's algorithm. It runs in time O(nz(A)*log n) and does not form A'*A.
Input: Sparse matrix A. Numeric values are ignored, so any explicit zeros are treated as nonzero. Output: Integer array of parents representing the elimination tree of the symbolic product A'*A. Each vertex is a column of A, and nc means a root of the elimination forest.
John R. Gilbert, Xerox, 10 Dec 1990 Based on code by JRG dated 1987, 1988, and 1990. |
|
Purpose =======
sp_colorder() permutes the columns of the original matrix. It performs the following steps:
1. Apply column permutation perm_c[] to A's column pointers to form AC;
2. If options->Fact = DOFACT, then (1) Compute column elimination tree etree[] of AC'AC; (2) Post order etree[] to get a postordered elimination tree etree[], and a postorder permutation post[]; (3) Apply post[] permutation to columns of AC; (4) Overwrite perm_c[] with the product perm_c * post.
Arguments =========
options (input) superlu_options_t* Specifies whether or not the elimination tree will be re-used. If options->Fact == DOFACT, this means first time factor A, etree is computed and output. Otherwise, re-factor A, etree is input, unchanged on exit.
A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC or SLU_NCP; Dtype = SLU__D; Mtype = SLU_GE. In the future, more general A can be handled.
perm_c (input/output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. If options->Fact == DOFACT, perm_c is both input and output. On output, it is changed according to a postorder of etree. Otherwise, perm_c is input.
etree (input/output) int* Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol. If options->Fact == DOFACT, etree is an output argument, otherwise it is an input argument. Note: etree is a vector of parent pointers for a forest whose vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
AC (output) SuperMatrix* The resulting matrix after applied the column permutation perm_c[] to matrix A. The type of AC can be: Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE. |
|
sp_ienv_dist() is inquired to choose machine-dependent parameters for the local environment. See ISPEC for a description of the parameters. This version provides a set of parameters which should give good, but not optimal, performance on many of the currently available computers. Users are encouraged to modify this subroutine to set the tuning parameters for their particular machine using the option and problem size information in the arguments. Arguments ========= ISPEC (input) int Specifies the parameter to be returned as the value of SP_IENV_DIST. = 1: the panel size w; a panel consists of w consecutive columns of matrix A in the process of Gaussian elimination. The best value depends on machine's cache characters. = 2: the relaxation parameter relax; if the number of nodes (columns) in a subtree of the elimination tree is less than relax, this subtree is considered as one supernode, regardless of the their row structures. = 3: the maximum size for a supernode, which must be greater than or equal to relaxation parameter (see case 2); = 4: the minimum row dimension for 2-D blocking to be used; = 5: the minimum column dimension for 2-D blocking to be used; = 6: the estimated fills factor for the adjacency structures of L and U, compared with A; (SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC < 0: if SP_IENV_DIST = -k, the k-th argument had an illegal value. ===================================================================== |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Purpose ======= symbfact() performs a symbolic factorization on matrix A and sets up the nonzero data structures which are suitable for supernodal Gaussian elimination with no pivoting (GENP). This routine features: o depth-first search (DFS) o supernodes o symmetric structure pruning
Return value ============ < 0, number of bytes needed for LSUB. = 0, matrix dimension is 1. > 0, number of bytes allocated when out of memory. |
|
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GEPP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning
Arguments =========
nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on.
nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains idenfied in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to number of leaves in the separator tree.
A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c.
perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc.
perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means column i of A is in position j in Pr*A.
sizes (input) int_t* Contains the number of vertices in each separator.
fstVtxSep (input) int_t* Contains first vertex for each separator.
Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization.
num_comm (input) MPI_Comm* Communicator for numerical factorization
symb_comm (input) MPI_Comm* Communicator for symbolic factorization
symb_mem_usage (input) mem_usage_t * Statistics on memory usage.
Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. > 0, number of bytes allocated when out of memory.
Sketch of the algorithm =======================
Distrbute the vertices on the processors using a subtree to subcube algorithm.
Redistribute the structure of the input matrix A according to the subtree to subcube computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors.
Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach. |
|
Deallocate storage of the data structures common to symbolic factorization routines. |
|
Allocate storage for the data structures common to symbolic factorization routines. For those unpredictable size, make a guess as FILL * nnz(A). Return value: If lwork = -1, return the estimated amount of space required, plus n; otherwise, return the amount of space actually allocated when memory allocation failure occurred. |
|
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space |
|
|
|
Purpose =======
XERBLA is an error handler for the LAPACK routines. It is called by an LAPACK routine if an input parameter has an invalid value. A message is printed and execution stops.
Installers may consider modifying the STOP statement in order to call system-specific exception-handling facilities.
Arguments =========
SRNAME (input) CHARACTER*6 The name of the routine which called XERBLA.
INFO (input) INT The position of the invalid parameter in the parameter list
of the calling routine.
===================================================================== |