Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,37 @@
name: Documentation Validation

# Fires on pushes to the main development branches and on PR lifecycle
# events, but only when documentation sources, the validation scripts,
# or the mkdocs configs actually change (identical path filter for both).
on:
  push:
    branches:
      - main
      - develop
      - 'release/**'
    paths:
      - 'docs/**'
      - 'compendium/**'
      - 'aql/**'
      - 'scripts/docs-lint.py'
      - 'scripts/link-check.py'
      - 'scripts/toc-check.py'
      - 'scripts/doc-header-check.py'
      - 'scripts/drift-detector.py'
      - 'scripts/validate-docs.sh'
      - 'mkdocs.yml'
      - 'mkdocs-nopdf.yml'
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - 'docs/**'
      - 'compendium/**'
      - 'aql/**'
      - 'scripts/docs-lint.py'
      - 'scripts/link-check.py'
      - 'scripts/toc-check.py'
      - 'scripts/doc-header-check.py'
      - 'scripts/drift-detector.py'
      - 'scripts/validate-docs.sh'
      - 'mkdocs.yml'
      - 'mkdocs-nopdf.yml'
# Called by docs/docs-pipeline.yml as Stage 1 (validate).
# All push/PR events are funnelled through docs-pipeline.yml so this
# workflow is never triggered directly by those events, preventing
Expand Down
50 changes: 50 additions & 0 deletions .github/workflows/docs/docs-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,56 @@ concurrency:

permissions:
contents: read

# ---------------------------------------------------------------------------
# Stage 1 – Validate (lint + links + TOC)
# Runs on every PR and push so problems are caught early.
# ---------------------------------------------------------------------------
jobs:
  validate:
    name: Validate (lint + links + TOC)
    runs-on: ubuntu-latest

    # Read-only token: this job never writes to the repository.
    permissions:
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML doesn't parse the version as the float 3.11.
          python-version: '3.11'
          cache: pip

      - name: Install validation dependencies
        run: pip install pyyaml requests

      # Each check gets a step id so the summary below can report its outcome.
      - name: Lint documentation
        id: lint
        run: python3 scripts/docs-lint.py

      - name: Check internal links
        id: link-check
        run: python3 scripts/link-check.py --internal-only

      - name: Validate TOC
        id: toc
        run: python3 scripts/toc-check.py

      # Always emit the summary table, even when an earlier check failed —
      # `steps.<id>.outcome` reflects per-step success/failure either way.
      - name: Write job summary
        if: always()
        run: |
          echo "## 📋 Docs Validation" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Check | Status |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-------|--------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| Lint | ${{ steps.lint.outcome }} |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Internal links | ${{ steps.link-check.outcome }} |" >> "$GITHUB_STEP_SUMMARY"
          echo "| TOC | ${{ steps.toc.outcome }} |" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Branch:** \`${{ github.ref_name }}\` | **Event:** \`${{ github.event_name }}\`" >> "$GITHUB_STEP_SUMMARY"
issues: write # required by the drift-detection job in documentation-validation.yml

# ---------------------------------------------------------------------------
Expand Down
117 changes: 117 additions & 0 deletions .github/workflows/predictive-prefetcher-ml-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
name: Predictive Prefetcher ML CI

# Triggered on every push/PR that touches the PredictivePrefetcher sources,
# its test file, or this workflow itself.
# Targets v1.8.0 (roadmap:cache:v1.8.0:predictive-prefetcher-ml-based-access-pattern-model).
on:
  push:
    branches:
      - main
      - develop
    paths:
      - 'include/cache/predictive_prefetcher.h'
      - 'src/cache/predictive_prefetcher.cpp'
      - 'include/cache/adaptive_query_cache.h'
      - 'src/cache/adaptive_query_cache.cpp'
      - 'tests/test_predictive_prefetcher_markov.cpp'
      - 'tests/CMakeLists.txt'
      - '.github/workflows/predictive-prefetcher-ml-ci.yml'
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - 'include/cache/predictive_prefetcher.h'
      - 'src/cache/predictive_prefetcher.cpp'
      - 'include/cache/adaptive_query_cache.h'
      - 'src/cache/adaptive_query_cache.cpp'
      - 'tests/test_predictive_prefetcher_markov.cpp'
      - 'tests/CMakeLists.txt'
      - '.github/workflows/predictive-prefetcher-ml-ci.yml'
  workflow_dispatch:

# One active run per ref; superseded runs are cancelled to save CI minutes.
concurrency:
  group: predictive-prefetcher-ml-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Reusable classifier decides whether the changed files warrant a build.
  ci-scope-classifier:
    permissions:
      contents: read
    uses: ./.github/workflows/01-core/ci-scope-classifier.yml

  # ---------------------------------------------------------------------------
  # Build and run the PredictivePrefetcherMarkovTests suite.
  # Tests cover all four acceptance criteria:
  #   AC-1 Markov chain + ToD bucketing (time-of-day weighted candidates)
  #   AC-2 RocksDB persistence (save/load round-trip via prefetch_model:: prefix)
  #   AC-3 MetricsCollector emission (cache.prefetch.hit_rate + overhead_bytes)
  #   AC-4 A/B test toggle (50/50 tenant split, per-group counters)
  # ---------------------------------------------------------------------------
  predictive-prefetcher-ml-unit-tests:
    needs: ci-scope-classifier
    if: needs.ci-scope-classifier.outputs.has_code_changes == 'true'
    name: Predictive Prefetcher ML tests (${{ matrix.os }} / ${{ matrix.compiler }})
    runs-on: ${{ matrix.os }}
    permissions:
      contents: read

    strategy:
      # Run every compiler/OS combination to completion even if one fails.
      fail-fast: false
      matrix:
        include:
          - os: ubuntu-22.04
            compiler: gcc-12
            cc: gcc-12
            cxx: g++-12
          - os: ubuntu-22.04
            compiler: clang-15
            cc: clang-15
            cxx: clang++-15
          - os: ubuntu-24.04
            compiler: gcc-13
            cc: gcc-13
            cxx: g++-13

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Configure and build (predictive prefetcher ML test target)
        uses: ./.github/actions/configure-themis
        with:
          cc: ${{ matrix.cc }}
          cxx: ${{ matrix.cxx }}
          build-target: test_predictive_prefetcher_markov_focused

      - name: Run Predictive Prefetcher ML unit tests
        run: |
          # FIX: enable pipefail. GitHub's default Linux shell for `run` is
          # `bash -e` WITHOUT `-o pipefail`, so `ctest ... | tee` would report
          # tee's exit code (0) and a failing test run would still go green.
          set -o pipefail
          cd build
          ctest --test-dir . \
            --tests-regex PredictivePrefetcherMarkovTests \
            --output-on-failure \
            --timeout 60 \
            2>&1 | tee predictive_prefetcher_ml_test_output.txt

      # Keep the raw ctest log for post-mortem even when the step fails.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: predictive-prefetcher-ml-results-${{ matrix.os }}-${{ matrix.compiler }}
          path: |
            build/predictive_prefetcher_ml_test_output.txt
          retention-days: 14

      - name: Write job summary
        if: always()
        run: |
          echo "## 🧠 Predictive Prefetcher ML – Unit Tests" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| **OS** | \`${{ matrix.os }}\` |" >> "$GITHUB_STEP_SUMMARY"
          echo "| **Compiler** | \`${{ matrix.compiler }}\` |" >> "$GITHUB_STEP_SUMMARY"
          echo "| **Event** | \`${{ github.event_name }}\` |" >> "$GITHUB_STEP_SUMMARY"
          echo "| **Branch** | \`${{ github.ref_name }}\` |" >> "$GITHUB_STEP_SUMMARY"
          echo "| **Commit** | \`${{ github.sha }}\` |" >> "$GITHUB_STEP_SUMMARY"
          echo "| **Triggered by** | ${{ github.actor }} |" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "PredictivePrefetcher: Markov chain, time-of-day bucketing, RocksDB persistence, MetricsCollector, A/B test toggle." >> "$GITHUB_STEP_SUMMARY"
27 changes: 27 additions & 0 deletions include/cache/adaptive_query_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class AdaptiveQueryCache {
size_t prefetch_max_predictions = 3; // Max candidate fingerprints per prediction
uint32_t prefetch_min_transition_count = 2; // Min observed transitions for a candidate
double prefetch_min_confidence = 0.0; // Min transition confidence (0.0 = disabled)
bool prefetch_enable_time_of_day_weighting = false; // Weight predictions by hour-of-day
bool prefetch_enable_ab_test = false; // Route 50% tenants to Markov, 50% to baseline
// Phase 4: Cache replication for high-availability multi-node deployments
bool enable_replication = false; // Enable cache replication via coordinator

Expand Down Expand Up @@ -535,6 +537,31 @@ class AdaptiveQueryCache {
* false.
*/
nlohmann::json getPrefetchStats() const;

/**
* @brief Account for prefetch overhead bytes (entries fetched but never hit).
*
* Callers should invoke this when a prefetched cache entry is evicted or
* expires without having been accessed. The accumulated total is exported
* via the `cache.prefetch.overhead_bytes` metric.
*
* @param bytes Estimated byte size of the wasted prefetch.
*/
void recordPrefetchOverheadBytes(uint64_t bytes);

/**
* @brief Persist the prefetch Markov model to the L3 RocksDB instance.
*
* No-op when the prefetcher is disabled or L3 is unavailable.
*/
void savePrefetchModel();

/**
* @brief Restore the prefetch Markov model from the L3 RocksDB instance.
*
* No-op when the prefetcher is disabled or L3 is unavailable.
*/
void loadPrefetchModel();
// Phase 4: Cache Replication for High-Availability
// ========================================================================

Expand Down
93 changes: 89 additions & 4 deletions include/cache/predictive_prefetcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,16 @@
#include <string>
#include <vector>
#include <unordered_map>
#include <array>
#include <mutex>
#include <cstdint>
#include <ctime>
#include <nlohmann/json.hpp>

namespace themis {
// Forward declaration – avoids pulling in all of rocksdb_wrapper.h
class RocksDBWrapper;

namespace cache {

/**
Expand Down Expand Up @@ -81,6 +86,15 @@ class PredictivePrefetcher {
/// from the source key that must lead to the successor.
double min_confidence = 0.0;

/// When true, prediction scores are weighted by time-of-day access
/// frequency (24 one-hour buckets).
bool enable_time_of_day_weighting = false;

/// When true, exactly 50 % of tenants are routed to the Markov model
/// (with time-of-day weighting) and the other 50 % to the frequency
/// baseline. The split is deterministic: hash(tenant_id) % 2.
bool enable_ab_test = false;

static Config defaults() { return {}; }
};

Expand Down Expand Up @@ -122,16 +136,58 @@ class PredictivePrefetcher {
*
* Used externally by AdaptiveQueryCache to track effective prefetch hits
* for metrics purposes.
*
* @param tenant_id Optional tenant identifier; used to attribute the hit
* to the correct A/B group when `enable_ab_test` is true.
*/
void recordPrefetchHit();
void recordPrefetchHit(const std::string& tenant_id = "");

/**
* @brief Record that prefetch candidates were generated for a key.
*
* Called automatically by getPrefetchCandidates() when at least one
* candidate is returned.
* Called by getPrefetchCandidates() when at least one candidate is returned.
* Also emits the current `cache.prefetch.hit_rate` gauge to MetricsCollector
* so the gauge stays fresh even when hits are sparse.
*
* @param count Number of candidates generated (1..max_predictions).
* @param tenant_id Optional tenant identifier; used to attribute the
* generation event to the correct A/B group.
*/
void recordCandidatesGenerated(size_t count = 1,
const std::string& tenant_id = "");

/**
* @brief Track bytes fetched via prefetch that were never subsequently hit.
*
* The caller (e.g. AdaptiveQueryCache) should call this when a prefetched
* entry expires or is evicted before being accessed. Used to report the
* `cache.prefetch.overhead_bytes` metric.
*
* @param bytes Number of overhead bytes to record.
*/
void recordCandidatesGenerated();
void recordOverheadBytes(uint64_t bytes);

/**
* @brief Persist the Markov transition matrix to RocksDB.
*
* Keys are written under the prefix `prefetch_model::`. Each key encodes
* the (from, to) fingerprint pair; the value is a JSON object containing
* the raw transition count and the 24-bucket time-of-day histogram.
*
* @param db Open RocksDBWrapper instance. If null this is a no-op.
*/
void saveModel(RocksDBWrapper* db);

/**
* @brief Restore the Markov transition matrix from RocksDB.
*
* Scans `prefetch_model::` prefix and populates the in-memory transition
* table. Existing in-memory state is merged (not replaced) so that
* concurrent learning is not lost.
*
* @param db Open RocksDBWrapper instance. If null this is a no-op.
*/
void loadModel(RocksDBWrapper* db);

/**
* @brief Clear all transition state and reset counters.
Expand All @@ -155,12 +211,41 @@ class PredictivePrefetcher {
// Per-tenant (or global if empty) last-seen fingerprint for session tracking.
std::unordered_map<std::string, std::string> last_fingerprint_;

// Time-of-day access counts: from -> to -> hour[0..23].
// Tracked when config_.enable_time_of_day_weighting is true.
std::unordered_map<std::string,
std::unordered_map<std::string,
std::array<uint32_t, 24>>> tod_buckets_;

mutable std::mutex mutex_;

// Metrics
uint64_t total_transitions_recorded_ = 0;
uint64_t candidates_generated_ = 0;
uint64_t prefetch_hits_ = 0;
uint64_t overhead_bytes_ = 0;

// A/B group hit-rate counters (only meaningful when enable_ab_test is true).
// Declared mutable so they can be updated from the const getPrefetchCandidates().
mutable uint64_t ab_markov_hits_ = 0;
mutable uint64_t ab_markov_generated_ = 0;
mutable uint64_t ab_baseline_hits_ = 0;
mutable uint64_t ab_baseline_generated_ = 0;

// Internal helpers
/// Returns true if ToD weighting should be applied for this tenant.
/// When enable_ab_test is true: group 0 (fnv1a(tenant_id) % 2 == 0) uses
/// Markov + ToD; group 1 uses raw Markov frequency without ToD weighting.
bool useToDWeighting(const std::string& tenant_id) const;

/// Stable FNV-1a hash of a string – used for deterministic A/B routing.
static uint64_t fnv1aHash(const std::string& s);

/// Return the current wall-clock hour in [0, 23].
static int currentHour();

/// Emit `cache.prefetch.hit_rate` and `cache.prefetch.overhead_bytes` via MetricsCollector.
void emitMetrics() const;
};

} // namespace cache
Expand Down
Loading
Loading