
Mit Mathematica überprüfte Modellierung der nichtlinearen Laufzeit \(T_n^b = \mathcal{O}({\ell}^2)\, n^a\)
Das folgende C++-Programm ergibt für jede \(n\times n\)-Matrix mit der Schleifenzahl \(a \in \{2, 3\}\), der Bitlänge \(b\) und der ganzzahligen (!) Rekursionstiefe \(\ell = \log_2 n\) speziell die Laufzeiten \(T_c^b = \mathcal{O}(1)\, n^3\) und \(T_f^b = \mathcal{O}({\ell}^2)\, n^2\).
// FastShift algorithm as fastintopt.cpp by Boris Haase (01.01.2026)
// Compilation: g++ -O3 -std=gnu++17 -march=native -fopenmp fastintopt.cpp -o fastintopt -lgmpxx -lgmp
#include <algorithm>
#include <chrono>
#include <iostream>
#include <vector>
#include <gmpxx.h>
#include <omp.h>
using namespace std;
// Hilfsfunktionen für GMP-Arithmetik
// In-place fused multiply-add on mpz_class: rop += op1 * op2.
// Thin C++ wrapper over GMP's mpz_addmul, which avoids the temporary an
// operator-based `rop += op1 * op2` would allocate.
static inline void mpz_addmul_cpp(mpz_class &rop, const mpz_class &op1, const mpz_class &op2) {
mpz_addmul(rop.get_mpz_t(), op1.get_mpz_t(), op2.get_mpz_t());
}
// Floor division with remainder on mpz_class: q = floor(a / b), r = a - q*b.
// With b > 0 the remainder satisfies 0 <= r < b (GMP's mpz_fdiv_qr contract).
static inline void fdiv_qr(mpz_class &q, mpz_class &r, const mpz_class &a, const mpz_class &b) {
mpz_fdiv_qr(q.get_mpz_t(), r.get_mpz_t(), a.get_mpz_t(), b.get_mpz_t());
}
// Kompakte Matrix-Struktur für flache Speicherlayouts
// Compact matrix handle over a flat row-major buffer of mpz_class.
// Two modes: an owning n x n matrix (backed by `data`) or a non-owning
// view into another FlatM (via submatrix); ptr/row_stride address both
// cases uniformly.
struct FlatM {
    mpz_class* ptr = nullptr;     // first element (owned buffer or borrowed)
    std::size_t row_stride = 0;   // elements per row of the underlying buffer
    std::vector<mpz_class> data;  // backing storage iff this FlatM owns it
    FlatM() = default;
    // Owning n x n matrix; mpz_class default-constructs to 0, so C starts zeroed.
    explicit FlatM(int n) : row_stride((std::size_t)n), data((std::size_t)n * n) {
        ptr = data.data();
    }
    // Non-owning view into an existing buffer (used for quadrant access).
    FlatM(mpz_class* raw_ptr, std::size_t stride) : ptr(raw_ptr), row_stride(stride) {}
    // Rule of Five: the implicit copy would duplicate `data` while leaving
    // `ptr` pointing into the *source's* buffer — forbid copying outright.
    FlatM(const FlatM&) = delete;
    FlatM& operator=(const FlatM&) = delete;
    // Moves are safe: std::vector's move transfers the heap allocation, so a
    // ptr that points into `data` remains valid in the destination object.
    FlatM(FlatM&&) noexcept = default;
    FlatM& operator=(FlatM&&) noexcept = default;
    inline mpz_class& operator()(int i, int j) { return *(ptr + i * row_stride + j); }
    inline const mpz_class& operator()(int i, int j) const { return *(ptr + i * row_stride + j); }
    // View of the sub-block whose top-left corner is (ro, co); same stride.
    inline FlatM submatrix(int ro, int co) const { return FlatM(ptr + ro * row_stride + co, row_stride); }
};
// Standard-Blockmultiplikation (optimiert für Cache-Efficiency)
// Cache-blocked schoolbook multiplication: returns C = A * B for n x n
// matrices of mpz_class. The 64-wide tiling keeps each (i,j,k) tile's
// working set inside L1/L2; accumulation uses GMP's in-place addmul.
static FlatM multiply_blocked(const FlatM& A, const FlatM& B, int n) {
    FlatM C(n);
    const int blk = 64; // tile edge tuned for L1/L2 cache
    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel())
    for (int bi = 0; bi < n; bi += blk) {
        for (int bj = 0; bj < n; bj += blk) {
            const int i_end = min(bi + blk, n);
            const int j_end = min(bj + blk, n);
            for (int bk = 0; bk < n; bk += blk) {
                const int k_end = min(bk + blk, n);
                for (int i = bi; i < i_end; ++i) {
                    for (int k = bk; k < k_end; ++k) {
                        // Hoist A(i,k): it is reused across the whole j-run.
                        const mpz_class& a_ik = A(i, k);
                        for (int j = bj; j < j_end; ++j) {
                            mpz_addmul_cpp(C(i, j), a_ik, B(k, j));
                        }
                    }
                }
            }
        }
    }
    return C;
}
// FastShift: Add & Shift Operation (parallelisiert)
// FastShift packing step: elementwise R = (C << r_bits) + D, i.e. each
// entry of C and the matching entry of D are packed into one integer.
static FlatM add_shiR(const FlatM& C, const FlatM& D, int n, int r_bits) {
    FlatM R(n);
    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel() && n >= 256)
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            // Shift into the destination, then add D in place — no temporaries.
            mpz_class& dst = R(row, col);
            mpz_mul_2exp(dst.get_mpz_t(), C(row, col).get_mpz_t(), r_bits);
            mpz_add(dst.get_mpz_t(), dst.get_mpz_t(), D(row, col).get_mpz_t());
        }
    }
    return R;
}
// FastShift: Balancierte Division & Modulo (Kernstück der Rekursion)
// FastShift unpacking step: each packed sum S = q * 2^s_bits + r is split
// into a balanced quotient/remainder pair with r in (-2^(s_bits-1), 2^(s_bits-1)],
// writing q into the left and r into the right half of the m x m quadrant grid.
static FlatM div_modR(const FlatM& S11, const FlatM& S12, const FlatM& S21, const FlatM& S22, int n, int s_bits) {
    const int m = n / 2;
    FlatM R(n);
    const mpz_class base = mpz_class(1) << s_bits;
    const mpz_class half = base >> 1;
    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel() && m >= 256)
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < m; ++j) {
            // Balanced floor division: nudge r into (-half, half], adjusting q.
            auto split = [&](const mpz_class& S, mpz_class& q, mpz_class& r) {
                fdiv_qr(q, r, S, base);
                if (r > half) { ++q; r -= base; }
                else if (r <= -half) { --q; r += base; }
            };
            mpz_class q, r;
            split(S11(i, j) + S12(i, j), q, r);
            R(i, j) = q;
            R(i, j + m) = r;
            split(S21(i, j) + S22(i, j), q, r);
            R(i + m, j) = q;
            R(i + m, j + m) = r;
        }
    }
    return R;
}
// Rekursive FastShift-Steuerung mit Task-Parallelismus
// Recursive FastShift driver: Z's quadrants are packed pairwise (add_shiR),
// the four half-size products are computed recursively with a doubled shift
// width, and div_modR unpacks the packed sums into the four quadrants of the
// result. Leaves (n <= leaf_cutoff) fall back to blocked multiplication.
static FlatM fastshift_rec(const FlatM& Y, const FlatM& Z, int n, int s_bits, int leaf_cutoff, int depth) {
    if (n <= leaf_cutoff) return multiply_blocked(Y, Z, n);
    // BUGFIX: OpenMP tasks are deferred (i.e. actually run in parallel) only
    // inside an active parallel region. The original call site never opened
    // one, so every `#pragma omp task` below executed serially (undeferred).
    // Open a parallel/single region once at the top level; the
    // `if(!omp_in_parallel())` clauses in the helpers already anticipate
    // running nested inside it.
    if (depth == 0 && !omp_in_parallel()) {
        FlatM R;
        #pragma omp parallel
        #pragma omp single
        R = fastshift_rec(Y, Z, n, s_bits, leaf_cutoff, 0);
        return R;
    }
    int m = n / 2, t_bits = s_bits * 2; // shift width doubles per level
    FlatM A = add_shiR(Z.submatrix(0, 0), Z.submatrix(0, m), m, s_bits);
    FlatM B = add_shiR(Z.submatrix(m, 0), Z.submatrix(m, m), m, s_bits);
    FlatM U, V, W, X;
    if (depth < 3) { // task parallelism only for the upper recursion levels
        #pragma omp taskgroup
        {
            #pragma omp task shared(U)
            U = fastshift_rec(Y.submatrix(0, 0), A, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task shared(V)
            V = fastshift_rec(Y.submatrix(0, m), B, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task shared(W)
            W = fastshift_rec(Y.submatrix(m, 0), A, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task shared(X)
            X = fastshift_rec(Y.submatrix(m, m), B, m, t_bits, leaf_cutoff, depth + 1);
        }
    } else {
        U = fastshift_rec(Y.submatrix(0, 0), A, m, t_bits, leaf_cutoff, depth + 1);
        V = fastshift_rec(Y.submatrix(0, m), B, m, t_bits, leaf_cutoff, depth + 1);
        W = fastshift_rec(Y.submatrix(m, 0), A, m, t_bits, leaf_cutoff, depth + 1);
        X = fastshift_rec(Y.submatrix(m, m), B, m, t_bits, leaf_cutoff, depth + 1);
    }
    return div_modR(U, V, W, X, n, s_bits);
}
// Benchmark driver.
// Usage: fastintopt [threads] [bits] [m_min] [m_max] [leaf_cutoff]
// For each m in [m_min, m_max] it multiplies two random n x n matrices
// (n = 2^m) of `bits`-bit entries with FastShift and with the blocked
// reference algorithm, reports both times, and verifies the results match.
int main(int argc, char** argv) {
    // Defaults chosen for the AMD EPYC benchmark runs.
    int threads = 8, bits = 256, m_min = 8, m_max = 13, leaf_cutoff = 2048;
    if (argc > 1) threads = max(1, atoi(argv[1]));
    if (argc > 2) bits = max(1, atoi(argv[2]));
    if (argc > 3) m_min = max(1, atoi(argv[3]));
    if (argc > 4) m_max = max(m_min, atoi(argv[4]));
    if (argc > 5) leaf_cutoff = max(16, atoi(argv[5]));
    omp_set_num_threads(threads);
    for (int m = m_min; m <= m_max; ++m) {
        // s_bits = 2*bits + m: a column sum of n = 2^m products of two
        // `bits`-bit numbers stays below 2^(2*bits + m), so the packed
        // halves never overflow into each other during the recursion.
        int n = 1 << m, s_bits = bits * 2 + m;
        cout << "FastShift " << bits << " Bits | n=2^" << m << " | Threads=" << threads << endl;
        // Deterministic per-size seed so runs are reproducible.
        gmp_randclass rng(gmp_randinit_default); rng.seed(42u + m);
        FlatM A(n), B(n);
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                A(i, j) = rng.get_z_bits(bits);
                B(i, j) = rng.get_z_bits(bits);
            }
        }
        auto t1 = chrono::steady_clock::now();
        FlatM Cf = fastshift_rec(A, B, n, s_bits, leaf_cutoff, 0);
        auto t2 = chrono::steady_clock::now();
        FlatM Cs = multiply_blocked(A, B, n);
        auto t3 = chrono::steady_clock::now();
        auto ms_fs = chrono::duration_cast<chrono::milliseconds>(t2 - t1).count();
        auto ms_std = chrono::duration_cast<chrono::milliseconds>(t3 - t2).count();
        // BUGFIX: compare the full matrices, not just element (0,0) — a
        // single-entry check can report "OK" on an incorrect result.
        bool ok = true;
        for (int i = 0; ok && i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                if (Cf(i, j) != Cs(i, j)) { ok = false; break; }
            }
        }
        cout << "n = " << n << " | FS: " << ms_fs << " ms | Std: " << ms_std << " ms | "
             << (ok ? "OK" : "Error") << endl;
    }
    return 0;
}
| \(\ell\) | \(T_c^{64}\) | \(T_f^{64}\) | \(T_c^{64}/T_f^{64}\) | \(T_c^{128}\) | \(T_f^{128}\) | \(T_c^{128}/T_f^{128}\) |
|---|---|---|---|---|---|---|
| 10 | 938 | 881 | 1,06 | 1096 | 1151 | 0,95 |
| 11 | 6157 | 6170 | 1,00 | 8295 | 8440 | 0,98 |
| 12 | 41525 | 27483 | 1,51 | 62070 | 42443 | 1,46 |
| 13 | 322066 | 131767 | 2,44 | 484108 | 257820 | 1,88 |
Für 256 Bit nivellieren sich die Vorteile auf dieser Architektur nahezu.
© 2026 by Boris Haase