Académique Documents
Professionnel Documents
Culture Documents
Preeyakorn Tipwai
High Performance Computing
Workshop
Feb 2-3, 2009
Matrix Multiplication
P0
P1
P1
P2
P3
Pprocess_count-1
P4
Implementation
1.
2.
3.
4.
5.
6.
7.
Data partitioning
z
z
Matrix Multiplication
- Each row of A is multiplied by all of B (every column) to produce the corresponding row of C.
- The number of columns in A must equal the number of rows in B.
- Let's try n×n matrices.
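$$
\begin{pmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{pmatrix}
\times
\begin{pmatrix}
B_{00} & B_{01} & \cdots & B_{0n} \\
B_{10} & B_{11} & \cdots & B_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
B_{n0} & B_{n1} & \cdots & B_{nn}
\end{pmatrix}
=
\begin{pmatrix}
C_{00} & C_{01} & \cdots & C_{0n} \\
C_{10} & C_{11} & \cdots & C_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
C_{n0} & C_{n1} & \cdots & C_{nn}
\end{pmatrix}
$$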
Matrix definition
#define TAG_MATRIX_PARTITION
/* Dense matrix of doubles with a per-row pointer table. */
typedef struct
{
    unsigned int m;   /* Number of rows */
    unsigned int n;   /* Number of columns */
    double *data;     /* Data, ordered by row, then by column */
    double **rows;    /* Pointers to rows in data */
} TMatrix;
rows[0]
rows[1]
rows[n]
A00
A
10
..
An1
0x4560
// Rows, cols
// Data, ordered by row, then by col
// Pointers to rows in data
A01 .. A0 n
A11 .. A1n
.. .. ..
An 2 .. Ann
data
Matrix operations
/* Matrix operations. */

/* Allocate a rows-by-cols matrix. */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols);

/* Read a matrix from a file into *A; returns 1 on success, 0 on failure. */
int readMatrix(char *filename, TMatrix *A);

/* Release the storage held by a matrix. */
void freeMatrix(TMatrix *matrix);

/* Non-zero when the matrix is usable. */
int validMatrix(TMatrix matrix);

/* Return an empty matrix (zero dimensions, NULL pointers). */
TMatrix initMatrix(void);

/* Multiply A by B and return the product. */
TMatrix matrixMultiply(TMatrix A, TMatrix B);

/* Multiply A by B, storing the product in C. */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C);

/* Print matrix A, labelled with name. */
void printMatrix(char name[128], TMatrix A);
createMatrix
/*
 * Allocate a rows-by-cols matrix of doubles.
 *
 * The cells live in one contiguous block (matrix.data, ordered by row, then
 * by column) and matrix.rows[i] points at the first cell of row i.
 *
 * If the requested size does not fit in size_t, nothing is allocated and both
 * pointers stay NULL, so the request takes the same path as a failed malloc:
 * the result is passed to validMatrix() and, if rejected, to freeMatrix().
 *
 * Returns the matrix by value; callers should check it with validMatrix().
 */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    TMatrix matrix;
    const size_t m = rows;
    const size_t n = cols;
    const size_t max_size = (size_t)-1;
    size_t i;

    matrix.m = rows;
    matrix.n = cols;
    matrix.data = NULL;
    matrix.rows = NULL;

    /* Guard both size computations against wrap-around before allocating. */
    if (m <= max_size / sizeof *matrix.rows &&
        (n == 0 || m <= max_size / sizeof *matrix.data / n))
    {
        matrix.data = malloc(sizeof *matrix.data * m * n);
        matrix.rows = malloc(sizeof *matrix.rows * m);
    }

    if (validMatrix(matrix))
    {
        /* Offsets are computed in size_t so i * n cannot wrap in unsigned int. */
        for (i = 0; i < m; i++)
        {
            matrix.rows[i] = matrix.data + i * n;
        }
    }
    else
    {
        freeMatrix(&matrix);
    }
    return matrix;
}
freeMatrix
validMatrix
initMatrix
TMatrix initMatrix()
{
TMatrix matrix;
matrix.m = 0;
matrix.n = 0;
matrix.data = NULL;
matrix.rows = NULL;
return matrix;
}
matrixMultiply
doMatrixMultiply
printMatrix
readMatrix
/*
 * Read a matrix from a text file.
 *
 * The file must start with two unsigned integers (row count, then column
 * count), followed by rows * cols floating-point values ordered by row, then
 * by column.
 *
 * filename: path of the file to read.
 * A:        output; receives the matrix built with createMatrix().
 *
 * Returns 1 on success and 0 on failure (file cannot be opened, bad or zero
 * dimensions, matrix creation failed, or too few values). If reading the
 * values fails after the matrix was created, the matrix is released with
 * freeMatrix() and *A is reset to initMatrix(), so the caller is not left
 * holding a half-filled matrix.
 */
int readMatrix(char *filename, TMatrix *A)
{
    FILE *fp;
    unsigned int m, n, i, j;
    double d;
    int result = 0;

    fp = fopen(filename, "r");
    if (fp == NULL)
    {
        return 0;
    }

    /* %u matches the unsigned int dimensions. */
    if (fscanf(fp, "%u%u", &m, &n) != 2 || m == 0 || n == 0)
    {
        goto done;
    }

    *A = createMatrix(m, n);
    if (!validMatrix(*A))
    {
        goto done;
    }

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
        {
            /* Read straight into a double so no precision is lost. */
            if (fscanf(fp, "%lf", &d) != 1)
            {
                freeMatrix(A);
                *A = initMatrix();
                goto done;
            }
            A->rows[i][j] = d;
        }
    }
    result = 1;

done:
    fclose(fp);
    return result;
}
WORK HERE
// Free matrix data
freeMatrix(&A); freeMatrix(&B); freeMatrix(&C);
// Wait for everyone to stop
MPI_Barrier(MPI_COMM_WORLD);
// Always use MPI_Finalize as the last instruction of the program
MPI_Finalize();
return 0;
}
Processes
Rank = 0
others
Time stamp
Read the matrices A, B
Allocate memories for matrix C
Broadcast(send) size of matrix
Broadcast(send) matrix B
Split A into parts
Send each process a part of A
Multiply first part here, result in C
Receive other parts of C
Time stamp
partitioning
n columns
P0
n rows
P1
Pprocessor_count-1
P0
8rows
P1
P3
m= 8 / 4 = 2
start row for each process i = 2 * i
amount of data for each process = 2 * 8
when rank = 0
if (processor_rank == 0)
{ time0 = MPI_Wtime();
readMatrix(argv[1], &A);
readMatrix(argv[2], &B);
n = A.n;
m = n / processor_count;
C = createMatrix(n,n);
// Broadcast (send) size of matrix
MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
when rank = 0
if (processor_rank == 0)
{
...
Other ranks
else
{
// Broadcast (receive) size of matrix
MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
//
m
A
B
Time (s)
100
80
60
40
20
0
0
10
20
30
40
Processors
Tp
T1 / p
50
60
Time (s)
60000
50000
40000
30000
20000
10000
0
0
10
15
20
Processors
Tp
T1 / p
25
30
35