Vous êtes sur la page 1sur 24

MPI Sample Program

Parallel Matrix Multiplication


RMUTL 1st

Preeyakorn Tipwai
High Performance Computing
Workshop
Feb 2-3, 2009

Matrix Multiplication

P0

P1

P1

P2

P3

Pprocess_count-1

P4

Implementation
1. Master (process 0) reads data
2. Master sends size of data to slaves
3. Slaves allocate memory
4. Master broadcasts the second matrix to all other processes
5. Master sends the respective parts of the first matrix to all other processes
6. Every process performs its local multiplication
7. All slave processes send back their result

Data partitioning
- First, partition the data.
- Distribute the relevant pieces of data to each of the processors.
- Processors perform their operations on the data.
- Results are either sent to other processors or used for further operations.

Matrix Multiplication
- We have to multiply each row of A with all of B to obtain the same row of C.
- The number of columns in A must equal the number of rows in B.
- Let's try n*n matrices.

$$C_{00} = A_{00} B_{00} + A_{01} B_{10} + A_{02} B_{20} + \dots + A_{0n} B_{n0}$$
$$C_{01} = A_{00} B_{01} + A_{01} B_{11} + A_{02} B_{21} + \dots + A_{0n} B_{n1}$$
$$\vdots$$
$$C_{0n} = A_{00} B_{0n} + A_{01} B_{1n} + A_{02} B_{2n} + \dots + A_{0n} B_{nn}$$
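$$
\begin{pmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{pmatrix}
\times
\begin{pmatrix}
B_{00} & B_{01} & \cdots & B_{0n} \\
B_{10} & B_{11} & \cdots & B_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
B_{n0} & B_{n1} & \cdots & B_{nn}
\end{pmatrix}
=
\begin{pmatrix}
C_{00} & C_{01} & \cdots & C_{0n} \\
C_{10} & C_{11} & \cdots & C_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
C_{n0} & C_{n1} & \cdots & C_{nn}
\end{pmatrix}
$$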

Matrix definition

/* MPI message tag passed to the MPI_Send/MPI_Recv calls that move matrix partitions. */
#define TAG_MATRIX_PARTITION 0x4560

/*
 * Dense matrix of doubles.
 *
 * All elements live in one contiguous buffer (data); rows[] holds one
 * pointer per row into that buffer, so an element can be addressed as
 * rows[i][j].
 */
typedef struct
{
    unsigned int m, n;  // Rows, cols
    double *data;       // Data, ordered by row, then by col
    double **rows;      // Pointers to rows in data
} TMatrix;

/*
 * Memory layout:
 *
 *   rows[0] ---> A00 A01 .. A0n    <--- data
 *   rows[1] ---> A10 A11 .. A1n
 *     ..          ..  ..  ..  ..
 *   rows[n] ---> An0 An1 .. Ann
 */

Matrix operations

/* Matrix API: prototypes for the functions defined below. */

/* Allocate a rows-by-cols matrix; the result is invalid (see validMatrix) on failure. */
TMatrix createMatrix     (const unsigned int rows, const unsigned int cols);
/* Read a matrix from a text file into *A; returns 1 on success, 0 on failure. */
int     readMatrix       (char *filename, TMatrix *A);
/* Release a matrix's storage and reset it to the empty state. */
void    freeMatrix       (TMatrix *matrix);
/* Return 1 when the matrix has storage and non-zero dimensions, else 0. */
int     validMatrix      (TMatrix matrix);
/* Return an empty matrix (0 x 0, no storage). */
TMatrix initMatrix       (void);
/* Return a newly allocated A * B, or an invalid matrix when that is not possible. */
TMatrix matrixMultiply   (TMatrix A, TMatrix B);
/* Compute A * B into the already allocated matrix C. */
void    doMatrixMultiply (TMatrix A, TMatrix B, TMatrix C);
/* Print the matrix to stdout under the given name. */
void    printMatrix      (char name[128], TMatrix A);

createMatrix
/*
 * Allocate a rows-by-cols matrix.
 *
 * The element buffer and the row-pointer table are allocated separately,
 * and every rows[i] is pointed at the start of row i inside data.
 *
 * Returns a matrix for which validMatrix() is true on success. When either
 * dimension is zero, the byte count would overflow, or an allocation fails,
 * the returned matrix has been passed through freeMatrix() (no storage,
 * m == n == 0).
 */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    TMatrix matrix;
    const size_t m = rows;
    const size_t n = cols;
    size_t i;

    matrix.m = rows;
    matrix.n = cols;
    matrix.data = NULL;
    matrix.rows = NULL;

    /* Only allocate when both byte counts are representable in size_t. */
    if (m != 0 && n != 0 &&
        n <= ((size_t)-1) / sizeof(double) / m &&
        m <= ((size_t)-1) / sizeof(double *))
    {
        matrix.data = malloc(sizeof(double) * m * n);
        matrix.rows = malloc(sizeof(double *) * m);
    }

    if (validMatrix(matrix))
    {
        /* Offsets are computed in size_t so i * n cannot wrap in unsigned int. */
        for (i = 0; i < m; i++)
        {
            matrix.rows[i] = matrix.data + (i * n);
        }
    }
    else
    {
        /* Releases whichever allocation succeeded and zeroes the dimensions. */
        freeMatrix(&matrix);
    }
    return matrix;
}

freeMatrix

/*
 * Release the storage owned by *matrix and reset it to the empty state
 * (NULL pointers, zero dimensions). A NULL argument is ignored.
 */
void freeMatrix (TMatrix *matrix)
{
    if (matrix != NULL)
    {
        /* free(NULL) is a no-op, so the pointers need no individual guard. */
        free(matrix->data);
        free(matrix->rows);
        matrix->data = NULL;
        matrix->rows = NULL;
        matrix->m = 0;
        matrix->n = 0;
    }
}

validMatrix

/*
 * Report whether a matrix is usable: both buffers are present and both
 * dimensions are non-zero. Returns 1 when valid, 0 otherwise.
 */
int validMatrix (TMatrix matrix)
{
    const int hasStorage = (matrix.data != NULL) && (matrix.rows != NULL);
    const int hasExtent  = (matrix.m != 0) && (matrix.n != 0);

    return hasStorage && hasExtent;
}

initMatrix
/*
 * Return an empty matrix: zero rows, zero columns, no storage.
 * validMatrix() is false for the result, and freeMatrix() on it is harmless.
 */
TMatrix initMatrix(void)
{
    /* Field order follows the struct: m, n, data, rows. */
    TMatrix matrix = { 0, 0, NULL, NULL };

    return matrix;
}

matrixMultiply

/*
 * Multiply A by B into a newly created matrix.
 *
 * Returns the A.m-by-B.n product. When either operand is invalid or the
 * inner dimensions differ (A.n != B.m), the empty matrix from initMatrix()
 * is returned; when creating the result fails, whatever createMatrix()
 * returned is passed back unchanged.
 */
TMatrix matrixMultiply(TMatrix A, TMatrix B)
{
    TMatrix product = initMatrix();

    if (!validMatrix(A) || !validMatrix(B))
        return product;
    if (A.n != B.m)
        return product;

    product = createMatrix(A.m, B.n);
    if (validMatrix(product))
        doMatrixMultiply(A, B, product);

    return product;
}

doMatrixMultiply

/*
 * Compute C = A * B into an already allocated C.
 *
 * Performs no validation: it iterates A.m rows, B.n columns and A.n inner
 * terms, and overwrites C.rows[row][col] for each of them.
 */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)
{
    unsigned int row, col, inner;

    for (row = 0; row < A.m; ++row)          /* rows of A and C */
        for (col = 0; col < B.n; ++col)      /* columns of B and C */
        {
            double acc = 0;

            /* Dot product of row `row` of A with column `col` of B. */
            for (inner = 0; inner < A.n; ++inner)
                acc += A.rows[row][inner] * B.rows[inner][col];

            C.rows[row][col] = acc;
        }
}

printMatrix

/*
 * Print "<name>:" followed by the matrix, one row per line, each element
 * formatted with "%7.3f ". For an invalid matrix only the name line is
 * printed.
 */
void printMatrix(char name[128], TMatrix A)
{
    unsigned int r, c;

    printf("%s:\n", name);
    if (!validMatrix(A))
        return;

    for (r = 0; r < A.m; r++)
    {
        for (c = 0; c < A.n; c++)
            printf ("%7.3f ", A.rows[r][c]);
        printf ("\n");
    }
}

readMatrix
/*
 * Read a matrix from a text file.
 *
 * Expected file content: two unsigned integers (rows, then columns)
 * followed by rows*cols floating-point values in row-major order.
 *
 * filename  path of the file to read
 * A         receives the newly created matrix; any previous content of *A
 *           is overwritten without being freed
 *
 * Returns 1 on success and 0 on failure (NULL argument, file cannot be
 * opened, bad header, zero dimension, allocation failure, or too few
 * values). When reading the values fails, the matrix allocated here is
 * freed again so a half-filled matrix is never handed back.
 */
int readMatrix(char *filename, TMatrix *A)
{
    FILE *fp;
    unsigned int m, n, i, j;
    double d;
    int result = 0;

    if ((filename == NULL) || (A == NULL)) return 0;
    if ((fp = fopen (filename, "r")) == NULL) return 0;

    do
    {
        /* m and n are unsigned, so the conversion must be %u, not %d. */
        if (fscanf (fp, "%u%u", &m, &n) != 2) break;
        if ((m == 0) || (n == 0)) break;

        *A = createMatrix(m, n);
        if (!validMatrix(*A)) break;

        for (i = 0; i < m; i++)
        {
            for (j = 0; j < n; j++)
            {
                /* Read straight into a double to keep full precision. */
                if (fscanf (fp, "%lf", &d) != 1) break;
                A -> rows[i][j] = d;
            }
            if (j != n) break;      /* inner loop stopped early */
        }
        if (i != m)
        {
            freeMatrix(A);          /* incomplete data: discard it */
            break;
        }
        result = 1;
    } while (0);

    fclose (fp);
    return result;
}

Write your main function


// Program skeleton: initialises MPI and four empty matrices, leaves a
// placeholder for the actual work, then frees A, B and C and shuts MPI down.
int main (int argc, char *argv[])
{
// Overwritten by MPI_Comm_rank / MPI_Comm_size below
int processor_rank = 0;
int processor_count = 1;
MPI_Status
status;
TMatrix
A,B,C,D;
// None of these are used inside this skeleton; presumably they are for the
// code that replaces the placeholder below
unsigned int m, n= 4, i, j, offset;
double
time0, time1;
// Start every matrix in the empty state so the freeMatrix calls below are safe
A = initMatrix(); B = initMatrix();
C = initMatrix(); D = initMatrix();
MPI_Init(&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank );

// Placeholder from the slides, not C code; presumably to be replaced by the
// rank-dependent work
WORK HERE
// Free matrix data
// NOTE(review): D is initialised above but not freed here; nothing in this
// block allocates it
freeMatrix(&A); freeMatrix(&B); freeMatrix(&C);
// Wait for everyone to stop
MPI_Barrier(MPI_COMM_WORLD);
// Always use MPI_Finalize as the last instruction of the program
MPI_Finalize();
return 0;
}

Processes
Rank = 0:
- Time stamp
- Read the matrices A, B
- Allocate memory for matrix C
- Broadcast (send) size of matrix
- Broadcast (send) matrix B
- Split A into parts
- Send each process a part of A
- Multiply first part here, result in C
- Receive other parts of C
- Time stamp

Others:
- Broadcast (receive) size of matrix
- Allocate memory for matrices
- Broadcast (receive) matrix B
- Receive a part of A
- Multiply their part of the matrix, result in C
- Send the result back to 0

partitioning
n columns

P0
n rows

P1

Pprocessor_count-1

number of rows for each process (m) = n / processor_count


start row for each process i = m * i
amount of data for each process = m * n

For example, 8*8 matrices


4 processors
8 columns

P0
8 rows

P1

P3

m= 8 / 4 = 2
start row for each process i = 2 * i
amount of data for each process = 2 * 8

when rank = 0
if (processor_rank == 0)
{
    // Master, part 1: read the operands, validate them, then hand out the
    // matrix size, matrix B and each rank's slice of A.

    // Record start time
    time0 = MPI_Wtime();

    // Without both input files there is nothing to distribute; abort the
    // whole job so the other ranks do not wait in MPI_Bcast forever.
    if ((argc < 3) || !readMatrix(argv[1], &A) || !readMatrix(argv[2], &B))
    {
        fprintf(stderr, "Cannot read the two input matrix files given on the command line\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    n = A.n;

    // Everything below assumes n x n operands whose rows divide evenly among
    // the ranks; otherwise the transfers would run past the buffers or leave
    // rows of C uncomputed.
    if ((A.m != n) || (B.m != n) || (B.n != n) || ((n % processor_count) != 0))
    {
        fprintf(stderr, "Matrices must be square, of equal size, with a row count divisible by the number of processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    m = n / processor_count;        // rows handled by each rank
    C = createMatrix(n,n);
    if (!validMatrix(C))
    {
        fprintf(stderr, "Cannot allocate the result matrix\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    // Broadcast (send) size of matrix
    // NOTE(review): n is unsigned int but is sent as MPI_INT; left unchanged
    // so it keeps matching the receiving side's MPI_Bcast.
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Broadcast (send) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Send each process its own part of A: rank i gets m rows starting at
    // row m*i (rank 0 keeps the first m rows for itself)
    for (i = 1; i < processor_count; i++)
        MPI_Send((void *)A.rows[m*i], m*n, MPI_DOUBLE,
                 i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD);
...
}

when rank = 0
if (processor_rank == 0)
{
...

    // Master, part 2: compute its own slice, gather the other slices of C,
    // then report.

    // Multiply own part of matrix A with B into already existing matrix C.
    // A.m is lowered to m so only the first m rows are multiplied, and is
    // set to n again afterwards.
    A.m = m;
    doMatrixMultiply(A,B,C);
    A.m = n;

    // Receive part of C matrix from each process: rank i delivers the m rows
    // that start at row m*i
    i = 1;
    while (i < processor_count)
    {
        MPI_Recv(C.rows[m*i], m*n, MPI_DOUBLE,
                 i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD, &status);
        i++;
    }

    // Record finish time
    time1 = MPI_Wtime();

    printMatrix("A",A);
    printMatrix("B",B);
    printMatrix("C",C);

    // Print time statistics
    printf ("Total time using [%2d] processors : [%f] seconds\n",
            processor_count, time1 - time0);
}

Other ranks
else
{
    // Worker ranks: receive the size, matrix B and a slice of A, multiply,
    // and send the resulting slice of C back to rank 0.

    // Broadcast (receive) size of matrix
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Allocate memory for matrices
    m = n / processor_count;        // rows handled by this rank
    A = createMatrix(m, n);
    B = createMatrix(n, n);
    if (!validMatrix(A) || !validMatrix(B))
    {
        // MPI would otherwise write through NULL buffers; abort the whole
        // job so rank 0 does not wait for a result that never arrives.
        fprintf(stderr, "Rank %d: cannot allocate matrices\n", processor_rank);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    // Broadcast (receive) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Receive this rank's m rows of A
    MPI_Recv((void *)A.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD, &status);

    // Multiply local matrices
    C = matrixMultiply(A,B);
    if (!validMatrix(C))
    {
        fprintf(stderr, "Rank %d: cannot compute local product\n", processor_rank);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    // Send back result
    MPI_Send((void *)C.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD);
}

Multiplication 1000 x 1000


1000 x 1000 Matrix multiplication
140
120

Time (s)

100
80
60
40
20
0
0

10

20

30

40

Processors
Tp

T1 / p

50

60

Multiplication 5000 x 5000


5000 x 5000 Matrix multiplication
90000
80000
70000

Time (s)

60000
50000
40000
30000
20000
10000
0
0

10

15

20

Processors
Tp

T1 / p

25

30

35

Vous aimerez peut-être aussi