fork download
  1. #include <benchmark/benchmark.h>
  2. #include <vector>
  3. #include <random>
  4.  
  5. using namespace std;
  6.  
  7. const int MOD = 998'244'353;
  8. int add (int a, int b) { return a + b - (a + b < MOD ? 0 : MOD); }
  9. int sub (int a, int b) { return a - b + (a - b >= 0 ? 0 : MOD); }
  10. int mul (int a, int b) { return 1LL * a * b % MOD; }
  11.  
  12. struct Matrix : vector<int> {
  13. // initialization
  14. int n, m;
  15. Matrix (int n, int m) :
  16. vector<int>(n * m), n(n), m(m) {}
  17. Matrix (initializer_list<int> init, int row) :
  18. n(row), m(init.size() / n), vector<int>(init.begin(), init.end()) {}
  19.  
  20. // access operators for different scenarios
  21. int* operator[] (int i) { return data() + i * m; }
  22. const int* operator[] (int i) const { return const_cast<int*>(data()) + i * m; }
  23. };
  24.  
  25. static void matMulOriginal (benchmark::State &state) {
  26. int n = state.range(0);
  27. Matrix a(n, n), b(n, n);
  28.  
  29. mt19937 rng(21);
  30. for (int i = 0; i < n; i++)
  31. for (int j = 0; j < n; j++)
  32. a[i][j] = rng() % MOD, b[i][j] = rng() % MOD;
  33.  
  34. for (auto _ : state) {
  35. Matrix c(n, n);
  36. for (int i = 0; i < n; i++)
  37. for (int j = 0; j < n; j++)
  38. for (int k = 0; k < n; k++)
  39. c[i][j] = add(c[i][j], mul(a[i][k], b[k][j]));
  40.  
  41. benchmark::DoNotOptimize(c.data());
  42. benchmark::ClobberMemory();
  43. }
  44. }
  45. BENCHMARK(matMulOriginal)
  46. ->RangeMultiplier(2)
  47. ->Range(1 << 1, 1 << 10);
  48.  
  49. static void matMulTranspose (benchmark::State &state) {
  50. int n = state.range(0);
  51. Matrix a(n, n), b(n, n);
  52.  
  53. mt19937 rng(21);
  54. for (int i = 0; i < n; i++)
  55. for (int j = 0; j < n; j++)
  56. a[i][j] = rng() % MOD, b[i][j] = rng() % MOD;
  57.  
  58. for (auto _ : state) {
  59. Matrix bT(n, n), c(n, n);
  60. for (int i = 0; i < n; i++)
  61. for (int j = 0; j < n; j++) bT[i][j] = b[j][i];
  62. for (int i = 0; i < n; i++)
  63. for (int j = 0; j < n; j++)
  64. for (int k = 0; k < n; k++)
  65. c[i][j] = add(c[i][j], mul(a[i][k], bT[j][k]));
  66.  
  67. benchmark::DoNotOptimize(c.data());
  68. benchmark::ClobberMemory();
  69. }
  70. }
  71. BENCHMARK(matMulTranspose)
  72. ->RangeMultiplier(2)
  73. ->Range(1 << 1, 1 << 10);
  74.  
  75. const int TILESIZE = 16;
  76. int bCached[TILESIZE][TILESIZE];
  77.  
  78. static void matMulTiling (benchmark::State &state) {
  79. int n = state.range(0);
  80. Matrix a(n, n), b(n, n);
  81.  
  82. mt19937 rng(21);
  83. for (int i = 0; i < n; i++)
  84. for (int j = 0; j < n; j++)
  85. a[i][j] = rng() % MOD, b[i][j] = rng() % MOD;
  86.  
  87. for (auto _ : state) {
  88. Matrix c(a.n, b.m);
  89. for (int iTile = 0; iTile < a.n; iTile += TILESIZE) {
  90. int iSize = min(TILESIZE, a.n - iTile);
  91. for (int jTile = 0; jTile < b.m; jTile += TILESIZE) {
  92. int jSize = min(TILESIZE, b.m - jTile);
  93. for (int kTile = 0; kTile < a.m; kTile += TILESIZE) {
  94. int kSize = min(TILESIZE, a.m - kTile);
  95. // transfer data to be cached for b + in-place transpose
  96. for (int k = 0; k < kSize; k++)
  97. for (int j = 0; j < jSize; j++)
  98. bCached[j][k] = b[k + kTile][j + jTile];
  99.  
  100. // perform matrix multiplication for current block
  101. for (int i = 0; i < iSize; i++) {
  102. // dot product between 2 cached rows
  103. for (int j = 0; j < jSize; j++) {
  104. unsigned long long hold = c[i + iTile][j + jTile];
  105. for (int k = 0; k < kSize; k++)
  106. hold += 1ULL * a[i + iTile][k + kTile] * bCached[j][k];
  107. hold %= MOD, c[i + iTile][j + jTile] = hold;
  108. }
  109. }
  110. }
  111. }
  112. }
  113.  
  114. benchmark::DoNotOptimize(c.data());
  115. benchmark::ClobberMemory();
  116. }
  117. }
  118. BENCHMARK(matMulTiling)
  119. ->RangeMultiplier(2)
  120. ->Range(1 << 1, 1 << 10);
  121.  
  122. BENCHMARK_MAIN();
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.cpp:1:10: fatal error: benchmark/benchmark.h: No such file or directory
 #include <benchmark/benchmark.h>
          ^~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
stdout
Standard output is empty