Mpi Matrix

MPI Sample Program
Parallel Matrix Multiplication

RMUTL 1st
Preeyakorn Tipwai
High Performance Computing
Workshop
Feb 2-3, 2009
Matrix Multiplication
P0
P1
P1
P2
P3
Pprocess_count-1
P4
Implementation
1. Master (process 0) reads data
2. Master sends size of data to slaves
3. Slaves allocate memory
4. Master broadcasts second matrix to all other processes
5. Master sends respective parts of first matrix to all other processes
6. Every process performs its local multiplication
7. All slave processes send back their result.
Data partitioning
- First, partition the data.
- Distribute the relevant pieces of data to each of the processors.
- Processors perform their operations on the data.
- Results are either sent to other processors or used for further operations.
Matrix Multiplication
- We have to multiply each row of A with all of B to obtain the same row of C
- The number of columns in A must equal the number of rows in B
- Let's try n*n matrices
$$C_{00} = A_{00} B_{00} + A_{01} B_{10} + A_{02} B_{20} + \dots + A_{0n} B_{n0}$$

$$C_{01} = A_{00} B_{01} + A_{01} B_{11} + A_{02} B_{21} + \dots + A_{0n} B_{n1}$$

$$C_{0n} = A_{00} B_{0n} + A_{01} B_{1n} + A_{02} B_{2n} + \dots + A_{0n} B_{nn}$$
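$$
A = \begin{pmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{pmatrix}
\qquad
B = \begin{pmatrix}
B_{00} & B_{01} & \cdots & B_{0n} \\
B_{10} & B_{11} & \cdots & B_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
B_{n0} & B_{n1} & \cdots & B_{nn}
\end{pmatrix}
\qquad
C = \begin{pmatrix}
C_{00} & C_{01} & \cdots & C_{0n} \\
C_{10} & C_{11} & \cdots & C_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
C_{n0} & C_{n1} & \cdots & C_{nn}
\end{pmatrix}
$$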
Matrix definition
// MPI message tag attached to every point-to-point transfer of a matrix
// partition (MPI_Send / MPI_Recv of rows of A and rows of C).
// NOTE(review): the numeric value was lost when the slide was extracted;
// 0x4560 is the stray literal found beside this definition -- confirm against
// the original handout. Any non-negative tag not exceeding MPI_TAG_UB works,
// provided every rank is built with the same value.
#define TAG_MATRIX_PARTITION 0x4560

// Dense matrix of doubles kept in one buffer, with a table of row pointers
// so that elements can be addressed as rows[i][j].
typedef struct
{
    unsigned int m, n;    // Rows, cols
    double      *data;    // Data, ordered by row, then by col
    double     **rows;    // Pointers to rows in data
} TMatrix;
rows[0]
rows[1]
rows[n]
A00
A
10
..
An1
0x4560
// Rows, cols
// Data, ordered by row, then by col
// Pointers to rows in data
A01 .. A0 n
A11 .. A1n
.. .. ..
An 2 .. Ann
data
Matrix operations
// Matrix operations -- prototypes.
// (Return types and names were separated by the slide layout; they are paired
// here in the order in which they were listed.)

// Allocate a rows-by-cols matrix; the result must be checked with validMatrix.
TMatrix createMatrix     (const unsigned int rows, const unsigned int cols);
// Read a matrix from a text file into *A; returns 1 on success, 0 on failure.
// The slide listed this as "TMatrix readMatrix(char filename[128])", which
// conflicts with the definition and with the calls readMatrix(argv[1], &A).
int     readMatrix       (char *filename, TMatrix *A);
// Release the buffers of *matrix and reset it to the empty state.
void    freeMatrix       (TMatrix *matrix);
// Non-zero when both buffers are allocated and both dimensions are non-zero.
int     validMatrix      (TMatrix matrix);
// Return an empty matrix (0 x 0, NULL buffers).
TMatrix initMatrix       (void);
// Return a newly allocated A * B, or an empty matrix when that is impossible.
TMatrix matrixMultiply   (TMatrix A, TMatrix B);
// Compute A * B into the already allocated matrix C.
void    doMatrixMultiply (TMatrix A, TMatrix B, TMatrix C);
// Print a heading followed by the elements of A.
void    printMatrix      (char name[128], TMatrix A);
createMatrix
// Allocate a rows-by-cols matrix of doubles.
//
// The elements live in one contiguous buffer (matrix.data) ordered by row,
// and matrix.rows[i] points at the first element of row i inside that buffer.
// The element values are NOT initialised.
//
// Returns the new matrix. On any failure (a zero dimension, a size that does
// not fit in size_t, or malloc returning NULL) the result is the empty matrix
// (m == n == 0, data == rows == NULL), so callers should test the result
// with validMatrix(). The caller releases the matrix with freeMatrix().
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    const size_t size_max = (size_t) -1;
    TMatrix      matrix;
    size_t       r;

    matrix.m    = rows;
    matrix.n    = cols;
    matrix.data = NULL;
    matrix.rows = NULL;

    // Only allocate when the shape is non-empty and neither byte count
    // overflows size_t; otherwise leave both pointers NULL so that the
    // validity test below rejects the matrix.
    if (rows != 0 && cols != 0 &&
        rows <= size_max / sizeof(double *) &&
        rows <= size_max / sizeof(double) / cols)
    {
        matrix.data = malloc(sizeof(double) * rows * cols);
        matrix.rows = malloc(sizeof(double *) * rows);
    }

    if (!validMatrix(matrix))
    {
        // Releases whichever buffer did get allocated and zeroes m and n.
        freeMatrix(&matrix);
        return matrix;
    }

    // The offset is computed in size_t: an unsigned int product r * cols
    // would wrap around once the matrix holds 2^32 or more elements.
    for (r = 0; r < rows; r++)
    {
        matrix.rows[r] = matrix.data + r * cols;
    }
    return matrix;
}
freeMatrix
// Release both buffers owned by *matrix and reset it to the empty state
// (NULL pointers, zero dimensions). A NULL argument is ignored, and calling
// this on an already empty matrix is harmless.
void freeMatrix (TMatrix *matrix)
{
    if (matrix != NULL)
    {
        // free(NULL) is a no-op, so the pointers need no individual guards.
        free(matrix->data);
        free(matrix->rows);
        matrix->data = NULL;
        matrix->rows = NULL;
        matrix->m    = 0;
        matrix->n    = 0;
    }
}
validMatrix
// Report whether a matrix is usable: returns 1 when both buffers are
// non-NULL and both dimensions are non-zero, and 0 otherwise.
int validMatrix (TMatrix matrix)
{
    return (matrix.data != NULL)
        && (matrix.rows != NULL)
        && (matrix.m != 0)
        && (matrix.n != 0);
}
initMatrix
TMatrix initMatrix()
{
TMatrix matrix;
matrix.m = 0;
matrix.n = 0;
matrix.data = NULL;
matrix.rows = NULL;
return matrix;
}
matrixMultiply
// Return a newly allocated matrix holding A * B.
//
// The result is the empty matrix (see initMatrix) when A or B is not valid,
// when the column count of A differs from the row count of B, or when the
// result could not be allocated. The caller releases it with freeMatrix().
TMatrix matrixMultiply(TMatrix A, TMatrix B)
{
    TMatrix product = initMatrix();

    // Guard: both operands usable and their inner dimensions agree.
    if (!validMatrix(A) || !validMatrix(B) || (A.n != B.m))
    {
        return product;
    }

    product = createMatrix(A.m, B.n);
    if (validMatrix(product))
    {
        doMatrixMultiply(A, B, product);
    }
    return product;
}
doMatrixMultiply
// Compute A * B into the rows of C.
//
// Performs no validation or allocation: C must already provide at least A.m
// rows of B.n elements, and B at least A.n rows. C is passed by value, but
// the results are written through C.rows, so the caller's buffer is updated.
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)
{
    unsigned int row, col, inner;

    for (row = 0; row < A.m; row++)           // Rows of A / C
    {
        for (col = 0; col < B.n; col++)       // Cols of B / C
        {
            // Dot product of row `row` of A with column `col` of B.
            double acc = 0;

            for (inner = 0; inner < A.n; inner++)
            {
                acc += A.rows[row][inner] * B.rows[inner][col];
            }
            C.rows[row][col] = acc;
        }
    }
}
printMatrix
// Print "<name>:" on its own line, followed by one line per row of A with
// every element formatted as %7.3f. For a matrix that validMatrix() rejects,
// only the heading is printed.
void printMatrix(char name[128], TMatrix A)
{
    unsigned int r, c;

    printf("%s:\n", name);
    if (!validMatrix(A))
    {
        return;
    }

    for (r = 0; r < A.m; r++)
    {
        for (c = 0; c < A.n; c++)
        {
            printf ("%7.3f ", A.rows[r][c]);
        }
        printf ("\n");
    }
}
readMatrix
// Read a matrix from a text file.
//
// File layout: two unsigned integers (row count m, column count n) followed
// by m * n numbers in row order, all separated by whitespace.
//
// filename  path of the file to read
// A         receives the newly created matrix; any previous contents of *A
//           are overwritten without being freed, so pass an empty matrix
//
// Returns 1 when the whole matrix was read, 0 otherwise (NULL argument, file
// not opened, bad or zero dimensions, allocation failure, missing element).
// If reading fails after the matrix was allocated, it is released again, so
// on failure *A never holds a half-filled matrix.
int readMatrix(char *filename, TMatrix *A)
{
    FILE        *fp;
    unsigned int m, n, i, j;
    double       value;
    int          result = 0;

    if (filename == NULL || A == NULL) return 0;
    if ((fp = fopen(filename, "r")) == NULL) return 0;

    // %u matches the unsigned int destinations (%d would not).
    if (fscanf(fp, "%u%u", &m, &n) != 2) goto done;
    if ((m == 0) || (n == 0)) goto done;

    *A = createMatrix(m, n);
    if (!validMatrix(*A)) goto done;

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
        {
            // Read straight into a double: going through a float would
            // round every element to single precision.
            if (fscanf(fp, "%lf", &value) != 1)
            {
                freeMatrix(A);
                goto done;
            }
            A->rows[i][j] = value;
        }
    }
    result = 1;

done:
    fclose(fp);
    return result;
}
Write your main function

// Program entry point (template).
// Initialises four empty matrices and the MPI environment, queries the number
// of processes and this process's rank, and -- after the part that is still
// to be written -- frees the matrices, synchronises all processes and shuts
// MPI down. Always returns 0.
int main (int argc, char *argv[])
{
// Defaults; overwritten by MPI_Comm_rank / MPI_Comm_size below
int processor_rank = 0;
int processor_count = 1;
MPI_Status
status;
TMatrix
A,B,C,D;
unsigned int m, n= 4, i, j, offset;
double
time0, time1;
// Start from empty matrices so that freeMatrix is safe on every one of them
A = initMatrix(); B = initMatrix();
C = initMatrix(); D = initMatrix();
MPI_Init(&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank );
// Placeholder from the slide, not C code: the rank-specific work goes here
// (presumably the "rank = 0" / "other ranks" branches shown further on).
WORK HERE
// Free matrix data
// NOTE(review): D is initialised above but not freed here; harmless as long
// as nothing is ever allocated into it.
freeMatrix(&A); freeMatrix(&B); freeMatrix(&C);
// Wait for everyone to stop
MPI_Barrier(MPI_COMM_WORLD);
// Always use MPI_Finalize as the last instruction of the program
MPI_Finalize();
return 0;
}
Processes
Rank = 0:
- Time stamp
- Read the matrices A, B
- Allocate memory for matrix C
- Broadcast (send) size of matrix
- Broadcast (send) matrix B
- Split A into parts
- Send each process a part of A
- Multiply first part here, result in C
- Receive other parts of C
- Time stamp

Other ranks:
- Broadcast (receive) size of matrix
- Allocate memory for matrices
- Broadcast (receive) matrix B
- Receive a part of A
- Multiply their part of matrix, result in C
- Send the result back to 0
partitioning
n columns
P0
n rows
P1
Pprocessor_count-1
number of rows for each process (m) = n / processor_count

start row for each process i = m * i
amount of data for each process = m * n
For example, 8*8 matrices

4 processors
8 columns
P0
8 rows
P1
P3
m= 8 / 4 = 2
start row for each process i = 2 * i
amount of data for each process = 2 * 8
when rank = 0
// Master (rank 0), first half: read the input, publish the matrix size and
// matrix B to every process, then hand each other rank its block of rows of A.
if (processor_rank == 0)
{ time0 = MPI_Wtime();
// Input files are taken from the first two command-line arguments.
// NOTE(review): neither the presence of argv[1] / argv[2] nor the return
// value of readMatrix is checked here.
readMatrix(argv[1], &A);
readMatrix(argv[2], &B);
// The size is taken from A's column count only and used for every dimension
// below, so A and B are presumably expected to be square n x n -- confirm.
n = A.n;
// Rows per process (integer division). NOTE(review): when n is not a multiple
// of processor_count, the last n % processor_count rows are assigned to nobody.
m = n / processor_count;
C = createMatrix(n,n);
// Broadcast (send) size of matrix
// NOTE(review): n is declared unsigned int in main but is transferred as MPI_INT.
MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
// Broadcast (send) B matrix

MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Send each process its own part of A: rank i gets the m rows starting at
// row m*i, i.e. m*n doubles beginning at A.rows[m*i]
for (i = 1; i < processor_count; i++)
MPI_Send((void *)A.rows[m*i], m*n, MPI_DOUBLE,
i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD);
...
}
when rank = 0
// Master (rank 0), second half: multiply its own block of rows, collect the
// blocks computed by the other ranks into C, then print the matrices and the
// elapsed time.
if (processor_rank == 0)
{
...
// Multiply own part of matrix A with B into already existing matrix C

// Temporarily shrink A to its first m rows so that doMatrixMultiply only
// computes rows 0 .. m-1 of C, then set the row count back to n.
A.m = m;
doMatrixMultiply(A,B,C);
A.m = n;
// Receive part of C matrix from each process
// Rank i's block of m*n doubles is stored starting at row m*i of C.
for (i = 1; i < processor_count; i++)
MPI_Recv((void *)C.rows[m*i], m*n, MPI_DOUBLE,
i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD, &status);
// Record finish time
time1 = MPI_Wtime();
printMatrix("A",A);
printMatrix("B",B);
printMatrix("C",C);
// Print time statistics
printf ("Total time using [%2d] processors : [%f] seconds\n",
processor_count, time1 - time0);
}
Other ranks
// Every rank other than 0: receive the matrix size and matrix B, receive this
// rank's block of rows of A, multiply it with B and send the product back to
// rank 0.
else
{
    // Broadcast (receive) size of matrix
    // NOTE(review): n is declared unsigned int in main but is transferred as
    // MPI_INT; the datatype has to stay identical to the one used by rank 0.
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Allocate memory for matrices
    m = n / processor_count;     // rows handled by this process
    A = createMatrix(m, n);      // this rank's block of rows of A
    B = createMatrix(n ,n);      // full copy of B

    // Broadcast (receive) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Receive this rank's m rows of A (m*n doubles) from rank 0
    MPI_Recv((void *)A.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD, &status);

    // Multiply local matrices
    C = matrixMultiply(A,B);

    // Send back result
    MPI_Send((void *)C.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD);
}
Multiplication 1000 x 1000

1000 x 1000 Matrix multiplication
140
120
Time (s)
100
80
60
40
20
0
0
10
20
30
40
Processors
Tp
T1 / p
50
60
Multiplication 5000 x 5000

5000 x 5000 Matrix multiplication
90000
80000
70000
Time (s)
60000
50000
40000
30000
20000
10000
0
0
10
15
20
Processors
Tp
T1 / p
25
30
35

Mpi Matrix

Transféré par

Informations du document

Copyright

Formats disponibles

Partager ce document

Partager ou intégrer le document

Options de partage

Avez-vous trouvé ce document utile ?

Ce contenu est-il inapproprié ?

Droits d'auteur :

Formats disponibles

Mpi Matrix

Transféré par

Droits d'auteur :

Formats disponibles

MPI Sample Program

Parallel Matrix Multiplication

Master (process 0) reads data

first partition the data.

C00 = A00 * B00 + A02 * B20 + A03 * B30 + ... + A0 n * Bn 0

void freeMatrix (TMatrix *matrix)

int validMatrix (TMatrix matrix)

TMatrix matrixMultiply(TMatrix A, TMatrix B)

void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)

void printMatrix(char name[128], TMatrix A)

Write your main function

Broadcast(receive) size of matrix

Send the result back to 0

number of rows for each process (m) = n / processor_count

For examples, 8*8 matrices

// Broadcast (send) B matrix

// Multiply own part of matrix A with B into already existing matrix C

Allocate memory for matrices

// Broadcast (receive) B matrix

Multiplication 1000 x 1000

Multiplication 5000 x 5000

Vous aimerez peut-être aussi