blob: ade07d079bffb0794345dccd46ea877888608dcc [file] [log] [blame]
/* Problem dimensions: C has M rows and N columns; K is the extent of the
 * index that is summed over when accumulating into C. */
#define M 1024
#define N 1024
#define K 1024
/* Left factor, read as A[k][i]: first index spans K, second spans M. */
float A[K][M];
/* Right factor, read as B[j][k]: first index spans N, second spans K. */
float B[N][K];
/* Accumulator, updated as C[i][j]. Being a file-scope object without an
 * initializer, it starts out all zeros. */
float C[M][N];
/*
 * Sketch of the loop nest a vectorizer is expected to produce. This is
 * pseudo-code kept for reference only; it is never compiled.
 *
 * void matmul_vec(void) {
 *   int i, j, k, jj, kk;
 *
 *   With much unrolling:
 *
 *     for (i=0;i<M;i++)
 *       for (j=0;j<N;j+=4)
 *         for (k=0;k<K;k+=8)
 *           for (kk=k;kk<=k+7;kk++)
 *             for (jj=j;jj<=j+3;jj++)
 *               C[i][jj] += A[kk][i] * B[jj][kk];
 *               vec_load    splat      scalar_load
 *
 *   Without unrolling:
 *
 *     for (i=0;i<M;i++)
 *       for (j=0;j<N;j+=4)
 *         for (k=0;k<K;k++)
 *           for (jj=j;jj<=j+3;jj++)
 *             C[i][jj] += A[k][i] * B[jj][k];
 *             vec_load    splat     scalar_load
 * }
 */
/*
 * Accumulates C[i][j] += A[k][i] * B[j][k] over all i, j, k, with the row
 * loop (i) unrolled by hand four times.
 *
 * Each group of four consecutive rows of C is handled together: for a fixed
 * k the four A values are loaded once and then applied across a full sweep
 * of j.
 *
 * Returns A[42][42] converted to int. A is never written here, so this is
 * the zero-initialized value, i.e. 0.
 */
int main(void)
{
  int i, j, k;

  /* Step by 4 so the row groups i..i+3 tile C without overlapping. The
   * `i + 3 < M` bound keeps every access in range; a trailing M % 4 rows
   * would be left untouched (M is currently a multiple of 4). */
  for (i = 0; i + 3 < M; i += 4) {
    for (k = 0; k < K; k++) {
      /* These do not depend on j; load them once per (i, k). */
      const float a0 = A[k][i + 0];
      const float a1 = A[k][i + 1];
      const float a2 = A[k][i + 2];
      const float a3 = A[k][i + 3];

      /* The braces matter: all four row updates belong to the j loop.
       * Without them only the first update iterates, and the other three
       * run once with j == N, indexing past the end of B and C. */
      for (j = 0; j < N; j++) {
        const float b = B[j][k];

        C[i + 0][j] += a0 * b;
        C[i + 1][j] += a1 * b;
        C[i + 2][j] += a2 * b;
        C[i + 3][j] += a3 * b;
      }
    }
  }

  return (int)A[42][42];
}