Matrixmultiplikation

Matrixmultiplikation mit C++
Matrixmultiplikation mit C++

Mit Mathematica überprüfte Modellierung der nichtlinearen Laufzeit \(T_n^b = \mathcal{O}({\ell}^2)\, n^a\)

Das folgende C++-Programm ergibt für jede \(n\times n\)-Matrix mit der Schleifenzahl \(a \in \{2, 3\}\), der Bitlänge \(b\) und der ganzzahligen (!) Rekursionstiefe \(\ell = \log_2 n\) speziell die Laufzeiten \(T_c^b = \mathcal{O}(1)\, n^3\) und \(T_f^b = \mathcal{O}({\ell}^2)\, n^2\).

// FastShift algorithm as fastintopt.cpp by Boris Haase (01.01.2026)
// Compilation: g++ -O3 -std=gnu++17 -march=native -fopenmp fastintopt.cpp -o fastintopt -lgmpxx -lgmp

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <utility>
#include <vector>

#include <gmpxx.h>
#include <omp.h>

using namespace std;

// Hilfsfunktionen für GMP-Arithmetik
static inline void mpz_addmul_cpp(mpz_class &rop, const mpz_class &op1, const mpz_class &op2) {
    mpz_addmul(rop.get_mpz_t(), op1.get_mpz_t(), op2.get_mpz_t());
}

static inline void fdiv_qr(mpz_class &q, mpz_class &r, const mpz_class &a, const mpz_class &b) {
    mpz_fdiv_qr(q.get_mpz_t(), r.get_mpz_t(), a.get_mpz_t(), b.get_mpz_t());
}

// Kompakte Matrix-Struktur für flache Speicherlayouts
struct FlatM {
    mpz_class* ptr = nullptr;
    std::size_t row_stride = 0;
    std::vector<mpz_class> data;

    FlatM() = default;
    explicit FlatM(int n) : row_stride((std::size_t)n), data((std::size_t)n * n) { 
        ptr = data.data(); 
    }
    
    FlatM(mpz_class* raw_ptr, std::size_t stride) : ptr(raw_ptr), row_stride(stride) {}

    inline mpz_class& operator()(int i, int j) { return *(ptr + i * row_stride + j); }
    inline const mpz_class& operator()(int i, int j) const { return *(ptr + i * row_stride + j); }
    inline FlatM submatrix(int ro, int co) const { return FlatM(ptr + ro * row_stride + co, row_stride); }
};

// Standard-Blockmultiplikation (optimiert für Cache-Efficiency)
// Classic O(n^3) schoolbook multiplication, tiled for cache locality.
// Returns C = A * B for n x n operands. Parallelizes over tile pairs only
// when not already inside an OpenMP region (so it stays serial inside tasks).
static FlatM multiply_blocked(const FlatM& A, const FlatM& B, int n) {
    FlatM C(n);
    constexpr int kTile = 64; // tile edge chosen for L1/L2 cache

    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel())
    for (int bi = 0; bi < n; bi += kTile) {
        for (int bj = 0; bj < n; bj += kTile) {
            const int iEnd = std::min(bi + kTile, n);
            const int jEnd = std::min(bj + kTile, n);
            for (int bk = 0; bk < n; bk += kTile) {
                const int kEnd = std::min(bk + kTile, n);
                // i-k-j order keeps A(i,k) hoisted and walks B/C rows linearly.
                for (int i = bi; i < iEnd; ++i) {
                    for (int k = bk; k < kEnd; ++k) {
                        const mpz_class& a_ik = A(i, k);
                        for (int j = bj; j < jEnd; ++j) {
                            mpz_addmul_cpp(C(i, j), a_ik, B(k, j));
                        }
                    }
                }
            }
        }
    }
    return C;
}

// FastShift: Add & Shift Operation (parallelisiert)
// FastShift packing step: R(i,j) = (C(i,j) << r_bits) + D(i,j) elementwise.
// Parallelized only for large matrices and when not nested in a region.
static FlatM add_shiR(const FlatM& C, const FlatM& D, int n, int r_bits) {
    FlatM R(n);
    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel() && n >= 256)
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            mpz_class& out = R(row, col);
            // Shift directly into the result slot, then accumulate D.
            mpz_mul_2exp(out.get_mpz_t(), C(row, col).get_mpz_t(), r_bits);
            out += D(row, col);
        }
    }
    return R;
}

// FastShift: Balancierte Division & Modulo (Kernstück der Rekursion)
// FastShift unpacking step (heart of the recursion): each packed sum is
// split by 2^s_bits into a balanced (quotient, remainder) pair, i.e. the
// remainder is pulled into (-B/2, B/2]. Quotients land in the left half of
// R, remainders in the right half, for both the top and bottom quadrants.
static FlatM div_modR(const FlatM& S11, const FlatM& S12, const FlatM& S21, const FlatM& S22, int n, int s_bits) {
    const int half = n / 2;
    FlatM R(n);
    const mpz_class base = mpz_class(1) << s_bits;  // B = 2^s_bits
    const mpz_class mid  = base >> 1;               // B/2, balancing threshold

    #pragma omp parallel for collapse(2) schedule(static) if(!omp_in_parallel() && half >= 256)
    for (int i = 0; i < half; ++i) {
        for (int j = 0; j < half; ++j) {
            mpz_class quot, rem;

            // Top quadrants: balanced split of S11 + S12.
            mpz_class top = S11(i, j) + S12(i, j);
            fdiv_qr(quot, rem, top, base);
            if (rem > mid)        { ++quot; rem -= base; }
            else if (rem <= -mid) { --quot; rem += base; }
            R(i, j) = quot;
            R(i, j + half) = rem;

            // Bottom quadrants: balanced split of S21 + S22.
            mpz_class bottom = S21(i, j) + S22(i, j);
            fdiv_qr(quot, rem, bottom, base);
            if (rem > mid)        { ++quot; rem -= base; }
            else if (rem <= -mid) { --quot; rem += base; }
            R(i + half, j) = quot;
            R(i + half, j + half) = rem;
        }
    }
    return R;
}

// Rekursive FastShift-Steuerung mit Task-Parallelismus
// Recursive FastShift driver with task parallelism.
// Y: left operand (or quadrant view); Z: packed right operand; s_bits: current
// packing width (doubles per level); leaf_cutoff: size at which to fall back
// to blocked multiplication; depth: recursion level (0 at the top call).
//
// FIX: the original spawned `omp task`s without any enclosing `parallel`
// region (main never opens one), so per the OpenMP rules every task was
// undeferred and ran sequentially. The top-level call now opens a
// parallel/single region and re-enters itself, which makes the tasks
// actually run concurrently. Tasks are `default(shared)` so the local
// matrices A/B are shared (the taskgroup keeps them alive) instead of being
// firstprivate-copied into each task.
static FlatM fastshift_rec(const FlatM& Y, const FlatM& Z, int n, int s_bits, int leaf_cutoff, int depth) {
    if (n <= leaf_cutoff) return multiply_blocked(Y, Z, n);

    // Top-level bootstrap: create the team, then re-enter on one thread.
    // Inside the region omp_in_parallel() is true, so this branch is skipped.
    if (depth == 0 && !omp_in_parallel()) {
        FlatM result;
        #pragma omp parallel
        #pragma omp single
        result = fastshift_rec(Y, Z, n, s_bits, leaf_cutoff, depth);
        return result;
    }

    const int m = n / 2, t_bits = s_bits * 2;
    // Pack the halves of Z: A = Z11<<s + Z12, B = Z21<<s + Z22.
    FlatM A = add_shiR(Z.submatrix(0, 0), Z.submatrix(0, m), m, s_bits);
    FlatM B = add_shiR(Z.submatrix(m, 0), Z.submatrix(m, m), m, s_bits);
    FlatM U, V, W, X;

    if (depth < 3) { // task parallelism for the upper recursion levels only
        #pragma omp taskgroup
        {
            #pragma omp task default(shared)
            U = fastshift_rec(Y.submatrix(0, 0), A, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task default(shared)
            V = fastshift_rec(Y.submatrix(0, m), B, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task default(shared)
            W = fastshift_rec(Y.submatrix(m, 0), A, m, t_bits, leaf_cutoff, depth + 1);
            #pragma omp task default(shared)
            X = fastshift_rec(Y.submatrix(m, m), B, m, t_bits, leaf_cutoff, depth + 1);
        } // taskgroup waits: U..X complete before unpacking
    } else {
        U = fastshift_rec(Y.submatrix(0, 0), A, m, t_bits, leaf_cutoff, depth + 1);
        V = fastshift_rec(Y.submatrix(0, m), B, m, t_bits, leaf_cutoff, depth + 1);
        W = fastshift_rec(Y.submatrix(m, 0), A, m, t_bits, leaf_cutoff, depth + 1);
        X = fastshift_rec(Y.submatrix(m, m), B, m, t_bits, leaf_cutoff, depth + 1);
    }
    // Unpack via balanced div/mod by 2^s_bits into the four result quadrants.
    return div_modR(U, V, W, X, n, s_bits);
}

int main(int argc, char** argv) {
    // Benchmark driver. Optional argv overrides:
    //   [1] thread count, [2] operand bit length, [3] min exponent m,
    //   [4] max exponent m (n = 2^m), [5] leaf cutoff of the recursion.
    // Defaults tuned for AMD EPYC benchmarks.
    int threads = 8, bits = 256, m_min = 8, m_max = 13, leaf_cutoff = 2048;

    if (argc > 1) threads     = max(1, atoi(argv[1]));
    if (argc > 2) bits        = max(1, atoi(argv[2]));
    if (argc > 3) m_min       = max(1, atoi(argv[3]));
    if (argc > 4) m_max       = max(m_min, atoi(argv[4]));
    if (argc > 5) leaf_cutoff = max(16, atoi(argv[5]));

    omp_set_num_threads(threads);

    for (int m = m_min; m <= m_max; ++m) {
        // s_bits = 2*bits + m bounds each packed sum (product + carry bits).
        int n = 1 << m, s_bits = bits * 2 + m;
        cout << "FastShift " << bits << " Bits | n=2^" << m << " | Threads=" << threads << endl;

        // Deterministic per-size seed so runs are reproducible.
        gmp_randclass rng(gmp_randinit_default); rng.seed(42u + m);
        FlatM A(n), B(n);
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) { 
                A(i, j) = rng.get_z_bits(bits);
                B(i, j) = rng.get_z_bits(bits); 
            }
        }

        auto t1 = chrono::steady_clock::now();
        FlatM Cf = fastshift_rec(A, B, n, s_bits, leaf_cutoff, 0);
        auto t2 = chrono::steady_clock::now();
        FlatM Cs = multiply_blocked(A, B, n);
        auto t3 = chrono::steady_clock::now();

        auto ms_fs = chrono::duration_cast<chrono::milliseconds>(t2 - t1).count();
        auto ms_std = chrono::duration_cast<chrono::milliseconds>(t3 - t2).count();

        // FIX: verify the WHOLE result matrix. The original compared only
        // element (0,0) and could print "OK" for an otherwise wrong result.
        bool ok = true;
        for (int i = 0; ok && i < n; ++i)
            for (int j = 0; j < n; ++j)
                if (Cf(i, j) != Cs(i, j)) { ok = false; break; }

        cout << "n = " << n << " | FS: " << ms_fs << " ms | Std: " << ms_std << " ms | "
             << (ok ? "OK" : "Error") << endl;
    }
    return 0;
}

 

Laufzeit-Mittelwerte auf AMD EPYC™ 9645 (24 Kerne, 128 GB DDR5). Vergleich von \(T_c\) (Standard), \(T_f\) (FastShift) und Quotienten \(T_c/T_f\) bei \(64\) und \(128\) Bit mit \(n = 2^{\ell}\) in Millisekunden
| \(\ell\) | \(T_c^{64}\) | \(T_f^{64}\) | \(T_c^{64}/T_f^{64}\) | \(T_c^{128}\) | \(T_f^{128}\) | \(T_c^{128}/T_f^{128}\) |
| 10 | 938 | 881 | 1,06 | 1096 | 1151 | 0,95 |
| 11 | 6157 | 6170 | 1,00 | 8295 | 8440 | 0,98 |
| 12 | 41525 | 27483 | 1,51 | 62070 | 42443 | 1,46 |
| 13 | 322066 | 131767 | 2,44 | 484108 | 257820 | 1,88 |

Für 256 Bit nivellieren sich die Vorteile auf dieser Architektur nahezu.

© 2026 by Boris Haase

Seitenbeginn