num42
diff --git a/‎.github/workflows/blocks.yml‎
Lines changed: 18 additions & 0 deletions b/‎.github/workflows/blocks.yml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 18 additions & 2 deletions b/‎README.md‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎action.yml‎
Lines changed: 1 addition & 1 deletion b/‎action.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/domain_tagger_demo.exs‎
Lines changed: 134 additions & 0 deletions b/‎scripts/domain_tagger_demo.exs‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎scripts/inspect_blocks.exs‎
Lines changed: 134 additions & 0 deletions b/‎scripts/inspect_blocks.exs‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎scripts/run.sh‎
Lines changed: 5 additions & 1 deletion b/‎scripts/run.sh‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎test/support/block_matcher.ex‎
Lines changed: 17 additions & 0 deletions b/‎test/support/block_matcher.ex‎
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,18 @@
+# CI workflow that runs this repository's own action with the `blocks` command.
+name: Extract Code Blocks
+
+# Trigger: pull requests targeting the `main` branch.
+on:
+  pull_request:
+    branches: [main]
+
+# Grants the workflow token read-only access to repository contents.
+permissions:
+  contents: read
+
+jobs:
+  blocks:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      # `./` runs the action defined at the root of the checked-out repository.
+      - uses: ./
+        with:
+          command: blocks
+          # NOTE(review): presumably builds codeqa from source instead of using a prebuilt release — confirm against action.yml.
+          build: source
@@ -2,10 +2,11 @@
 
 A GitHub Action for running [codeqa](https://github.com/num42/n42-agentic-helpers/tree/main/code-quality-analyzer-ex) code quality analysis on your repository.
 
-Supports three commands:
+Supports four commands:
 - **health-report** — Graded health report with worst offenders
 - **compare** — Metric comparison between git refs (e.g. PR vs base)
 - **analyze** — Raw JSON metrics output
+- **blocks** — Extract natural code blocks as JSON
 
 ## Usage
 
@@ -56,11 +57,26 @@ The base ref is auto-detected from the PR context. Override with `base-ref` if n
         run: cat ${{ steps.analysis.outputs.report-file }}
 ```
 
+### Extract Code Blocks
+
+```yaml
+      - uses: num42/codeqa-action@v1
+        id: blocks
+        with:
+          command: blocks
+          extra-args: --sub-blocks
+
+      - name: Use blocks
+        run: cat ${{ steps.blocks.outputs.report-file }}
+```
+
+Produces a JSON array of block objects. Each object includes `file`, `line_from`, `line_to`, `token_count`, and `depth` by default. Pass `--fields file,line_from,line_to,tokens` to customise. Add `--stream` for NDJSON output. Use `--workers N` for parallel file processing.
+
 ## Inputs
 
 | Input | Required | Default | Description |
 |-------|----------|---------|-------------|
-| `command` | yes | — | `health-report`, `compare`, or `analyze` |
+| `command` | yes | — | `health-report`, `compare`, `analyze`, or `blocks` |
 | `path` | no | `.` | Directory to analyze |
 | `comment` | no | `false` | Post result as sticky PR comment |
 | `fail-grade` | no | — | Minimum grade for health-report (e.g. `C`). Fails if below |
 
@@ -8,7 +8,7 @@ branding:
 
 inputs:
   command:
-    description: "Command to run: health-report, compare, or analyze"
+    description: "Command to run: health-report, compare, analyze, or blocks"
     required: true
   path:
     description: "Directory to analyze"
 
@@ -0,0 +1,134 @@
+# Domain tagger demo v3
+#   - noun extraction (no lookup table)
+#   - block-unique signal: block domains minus sub-block domains
+#   - language stopwords: extracted via Stopwords.find_stopwords over the
+#     repository's own lib/**/*.ex files — infra words found there are subtracted
+#
+# run with: mix run scripts/domain_tagger_demo.exs
+
+alias CodeQA.Metrics.{BlockAnalyzer, BlockDetector, TokenNormalizer}
+alias CodeQA.Stopwords
+
+defmodule DomainTagger do
+  @moduledoc """
+  Derives "domain" tags for a list of tokens by splitting identifiers into
+  words and discarding every word listed in `@verbs`.
+
+  Demo-only helper: `tag/1` converts each surviving word with
+  `String.to_atom/1`. That is unbounded atom creation, so do not copy this
+  into long-running code without switching to strings.
+  """
+
+  # Words that never count as nouns. Despite the attribute name, the list also
+  # holds prepositions, quantifiers, articles and literals such as nil/true.
+  @verbs MapSet.new(~w[
+    get fetch find list search query count
+    create build make add insert
+    update patch put set save store upsert
+    delete remove destroy drop clear purge flush
+    send receive deliver emit broadcast publish
+    upload download read write stream parse format render print log
+    check validate verify assert ensure guard
+    handle process compute calculate apply run exec call invoke dispatch
+    init start stop reset open close connect disconnect reload
+    is has can will do did should would may
+    by with for from to into on at of in out new
+    all some many each every any
+    a an the ok error nil true false
+  ])
+
+  @doc """
+  Splits an identifier into lowercase words.
+
+  Word boundaries are camelCase transitions, acronym-to-word transitions
+  (`HTTPServer` becomes `http`, `server`) and the characters `_`, `!` and `?`.
+  Words of one character or less are dropped.
+  """
+  @spec split(String.t()) :: [String.t()]
+  def split(content) do
+    camel_marked = String.replace(content, ~r/([a-z])([A-Z])/, "\\1_\\2")
+    acronym_marked = String.replace(camel_marked, ~r/([A-Z]+)([A-Z][a-z])/, "\\1_\\2")
+
+    for part <- String.split(acronym_marked, ~r/[_!?]/, trim: true),
+        word = String.downcase(part),
+        String.length(word) > 1,
+        do: word
+  end
+
+  @doc """
+  Returns the words of `content` (see `split/1`) that are not listed in `@verbs`.
+  """
+  @spec nouns(String.t()) :: [String.t()]
+  def nouns(content) do
+    for word <- split(content), not MapSet.member?(@verbs, word), do: word
+  end
+
+  @doc """
+  Returns the domain tags of `tokens` as atoms, most frequent first.
+
+  Only tokens whose `value` is `"<ID>"` contribute; their `content` is split
+  with `nouns/1`. Words contained in `BlockAnalyzer.bound_variables(tokens)`
+  are excluded before counting.
+  """
+  @spec tag(Enumerable.t()) :: [atom()]
+  def tag(tokens) do
+    bound = BlockAnalyzer.bound_variables(tokens)
+
+    candidates =
+      for token <- tokens,
+          token.value == "<ID>",
+          noun <- nouns(token.content),
+          not MapSet.member?(bound, noun),
+          do: noun
+
+    candidates
+    |> Enum.frequencies()
+    |> Enum.sort_by(fn {_noun, count} -> -count end)
+    |> Enum.map(fn {noun, _count} -> String.to_atom(noun) end)
+  end
+end
+
+# ---------------------------------------------------------------------------
+# Step 1: derive language stopwords from the repository's own .ex files.
+#
+# Stopwords.find_stopwords counts how often each noun appears across files.
+# Nouns present in ≥15% of repo files are treated as infrastructure words.
+# ---------------------------------------------------------------------------
+
+# Map of path => file contents for every .ex file under lib/.
+repo_files =
+  "lib/**/*.ex"
+  |> Path.wildcard()
+  |> Map.new(&{&1, File.read!(&1)})
+
+# Adapts DomainTagger.tag/1 (which returns atoms) to the string-returning
+# extractor handed to Stopwords.find_stopwords/2.
+noun_extractor = fn content ->
+  tags = content |> TokenNormalizer.normalize_structural() |> DomainTagger.tag()
+  Enum.map(tags, &Atom.to_string/1)
+end
+
+IO.puts("=== Deriving stopwords from #{map_size(repo_files)} repo files ===")
+
+lang_stopwords = Stopwords.find_stopwords(repo_files, noun_extractor)
+
+sorted_stopwords = lang_stopwords |> MapSet.to_list() |> Enum.sort()
+
+IO.puts("=== Language stopwords extracted (#{MapSet.size(lang_stopwords)} terms) ===")
+IO.puts(inspect(sorted_stopwords))
+IO.puts("")
+
+# ---------------------------------------------------------------------------
+# Step 2: tag the blocks of a single target file
+#
+# Only `target` is analysed; add further `{label, content}` tuples to
+# `sources` to cover more files.
+# ---------------------------------------------------------------------------
+
+target = "lib/codeqa/git.ex"
+sources = [{target, File.read!(target)}]
+
+# Drops every domain whose name is a language stopword. Atoms are compared by
+# name and passed through unchanged, so no atom is re-created here.
+apply_stopwords = fn domains ->
+  Enum.reject(domains, &MapSet.member?(lang_stopwords, Atom.to_string(&1)))
+end
+
+# Returns the content of the first "<ID>" token that follows a `def`/`defp`
+# identifier token, or "(unknown)" when the block has none.
+fn_name_of = fn block ->
+  block.tokens
+  |> Enum.drop_while(&(&1.value != "<ID>" or &1.content not in ["def", "defp"]))
+  |> Enum.drop(1)
+  |> Enum.find(&(&1.value == "<ID>"))
+  |> case do
+    nil -> "(unknown)"
+    token -> token.content
+  end
+end
+
+Enum.each(sources, fn {source_label, content} ->
+  IO.puts("=== #{source_label} ===\n")
+
+  tokens = TokenNormalizer.normalize_structural(content)
+  blocks = BlockDetector.detect_blocks(tokens, language: :unknown)
+
+  Enum.each(blocks, fn block ->
+    all_domains = apply_stopwords.(DomainTagger.tag(block.tokens))
+
+    # Tag each sub-block exactly once; the result feeds both the uniqueness
+    # check and the per-sub-block listing below.
+    sub_domains =
+      Enum.map(block.sub_blocks, fn sb ->
+        {sb, apply_stopwords.(DomainTagger.tag(sb.tokens))}
+      end)
+
+    sub_domain_set =
+      sub_domains
+      |> Enum.flat_map(fn {_sb, domains} -> domains end)
+      |> MapSet.new()
+
+    # Block-unique signal: domains of the block that no sub-block carries.
+    unique = Enum.reject(all_domains, &MapSet.member?(sub_domain_set, &1))
+
+    IO.puts("#{fn_name_of.(block)}/? (lines #{block.start_line}–#{block.end_line})")
+    IO.puts("  all     : #{inspect(all_domains)}")
+    IO.puts("  unique  : #{inspect(unique)}")
+
+    Enum.each(sub_domains, fn {sb, domains} ->
+      if domains != [], do: IO.puts("  sub:#{sb.start_line} : #{inspect(domains)}")
+    end)
+
+    IO.puts("")
+  end)
+end)
@@ -0,0 +1,134 @@
+alias CodeQA.Metrics.{TokenNormalizer, BlockDetector, Block}
+
+# ─── sample source ───────────────────────────────────────────────────────────
+
+source = ~S"""
+defmodule Greeter do
+  def hello(name) do
+    "Hello, #{name}!"
+  end
+
+  def goodbye(name) do
+    "Goodbye, #{name}!"
+  end
+end
+
+
+defmodule Calculator do
+  def add(a, b), do: a + b
+
+  def subtract(a, b), do: a - b
+end
+"""
+
+# ─── helpers ─────────────────────────────────────────────────────────────────
+
+# Prints `char` repeated `n` times on its own line.
+sep = fn char, n ->
+  char |> String.duplicate(n) |> IO.puts()
+end
+
+# Prints `label` framed by two 60-character rules.
+title = fn label ->
+  sep.("─", 60)
+  IO.puts("  #{label}")
+  sep.("─", 60)
+end
+
+# Rebuilds source text from a token stream, using each token's `col` to decide
+# how many spaces precede it.
+# <WS> tokens are ignored — horizontal spacing comes solely from `col`.
+# <NL> tokens emit a newline and reset the running column to 0.
+reconstruct = fn tokens ->
+  tokens
+  |> Enum.reject(&(&1.value == "<WS>"))
+  |> Enum.reduce({"", 0}, fn
+    %{value: "<NL>"}, {text, _cursor} ->
+      {text <> "\n", 0}
+
+    token, {text, cursor} ->
+      # max/2 keeps the padding at zero when a token starts before the cursor.
+      gap = String.duplicate(" ", max(0, token.col - cursor))
+      next_cursor = token.col + String.length(token.content)
+      {text <> gap <> token.content, next_cursor}
+  end)
+  |> elem(0)
+end
+
+# ─── 1. original source ──────────────────────────────────────────────────────
+
+title.("1. ORIGINAL SOURCE")
+IO.puts(source)
+
+# ─── 2. token stream ─────────────────────────────────────────────────────────
+
+title.("2. TOKEN STREAM  (value | content | line:col)")
+
+tokens = TokenNormalizer.normalize_structural(source)
+
+# One row per token: normalized value, inspected raw content, then position.
+Enum.each(tokens, fn token ->
+  value_cell = String.pad_trailing(token.value, 8)
+  content_cell = token.content |> inspect() |> String.pad_trailing(16)
+  IO.puts("  #{value_cell}  #{content_cell}  #{token.line}:#{token.col}")
+end)
+
+# ─── 3. blocks + sub-blocks ──────────────────────────────────────────────────
+
+title.("3. BLOCKS + SUB-BLOCKS")
+
+blocks = BlockDetector.detect_blocks(tokens, language: :unknown)
+
+# Prints `text` inside a box: `top`, every line prefixed with `gutter`, `bottom`.
+print_boxed = fn text, top, gutter, bottom ->
+  IO.puts(top)
+  for line <- String.split(text, "\n"), do: IO.puts(gutter <> line)
+  IO.puts(bottom)
+end
+
+for {block, i} <- Enum.with_index(blocks, 1) do
+  IO.puts("")
+
+  IO.puts(
+    "  Block #{i}  lines #{block.start_line}–#{block.end_line}" <>
+      "  tokens=#{Block.token_count(block)}" <>
+      "  sub_blocks=#{Block.sub_block_count(block)}"
+  )
+
+  print_boxed.(
+    reconstruct.(block.tokens),
+    "  ┌─ reconstructed ────────────────────────────",
+    "  │ ",
+    "  └────────────────────────────────────────────"
+  )
+
+  # An empty sub-block list simply yields no iterations.
+  for {sub, j} <- Enum.with_index(block.sub_blocks, 1) do
+    IO.puts("")
+
+    IO.puts(
+      "    Sub-block #{i}.#{j}  lines #{sub.start_line}–#{sub.end_line}" <>
+        "  tokens=#{Block.token_count(sub)}"
+    )
+
+    print_boxed.(
+      reconstruct.(sub.tokens),
+      "    ┌─ reconstructed ──────────────────────",
+      "    │ ",
+      "    └──────────────────────────────────────"
+    )
+  end
+end
+
+# ─── 4. full reconstruction + match check ────────────────────────────────────
+
+title.("4. FULL RECONSTRUCTION")
+
+reconstructed = reconstruct.(tokens)
+IO.puts(reconstructed)
+
+title.("5. RECONSTRUCTION vs ORIGINAL")
+
+if reconstructed == source do
+  IO.puts("  ✓ exact match")
+else
+  IO.puts("  ✗ differs (whitespace normalisation expected)")
+  IO.puts("")
+
+  source_lines = String.split(source, "\n")
+  reconstructed_lines = String.split(reconstructed, "\n")
+
+  # Pad the shorter side with "" so surplus lines on EITHER side are listed.
+  # Walking the original's lines alone would hide extra reconstructed lines
+  # even though the texts were just reported as different.
+  line_count = max(length(source_lines), length(reconstructed_lines))
+  pad = fn lines -> lines ++ List.duplicate("", line_count - length(lines)) end
+
+  pad.(source_lines)
+  |> Enum.zip(pad.(reconstructed_lines))
+  |> Enum.with_index(1)
+  |> Enum.each(fn {{orig, recon}, n} ->
+    marker = if orig == recon, do: "  ", else: "!!"
+    IO.puts("  #{marker} #{String.pad_leading(to_string(n), 2)}  orig : #{inspect(orig)}")
+
+    if orig != recon do
+      IO.puts("         recon: #{inspect(recon)}")
+    end
+  end)
+end
@@ -38,8 +38,9 @@ case "$INPUT_COMMAND" in
     fi
     ;;
   analyze) OUTPUT_FILE="${OUTPUT_FILE}.json" ;;
+  blocks) OUTPUT_FILE="${OUTPUT_FILE}.json" ;;
   *)
-    echo "::error::Unknown command: $INPUT_COMMAND. Must be health-report, compare, or analyze."
+    echo "::error::Unknown command: $INPUT_COMMAND. Must be health-report, compare, analyze, or blocks."
     exit 1
     ;;
 esac
@@ -82,6 +83,9 @@ case "$INPUT_COMMAND" in
   analyze)
     ARGS+=("--output" "$OUTPUT_FILE")
     ;;
+  blocks)
+    ARGS+=("--output" "$OUTPUT_FILE")
+    ;;
 esac
 
 # Parse ignore-paths YAML list into --ignore-paths flag
 
@@ -0,0 +1,17 @@
+defmodule Test.BlockMatcher do
+  @moduledoc """
+  Helpers for asserting on tokens within `CompoundBlock` structures.
+
+  Each helper builds a tagged tuple describing how a token field should be
+  matched:
+
+  - `exact(:content, "add")` — token whose `content` equals `"add"` exactly
+  - `partial(:content, "@doc")` — token whose `content` contains `"@doc"` as a substring
+  - `:value` targets the normalized token value instead of raw source content
+
+  This module only constructs the tuples; the matching itself is performed by
+  whatever consumes them.
+  """
+
+  # The only token fields a matcher may target.
+  defguardp is_token_field(field) when field == :content or field == :value
+
+  @doc """
+  Builds an `{:exact, field, value}` matcher tuple.
+
+  Raises `FunctionClauseError` when `field` is neither `:content` nor `:value`.
+  """
+  @spec exact(:content | :value, String.t()) :: {:exact, :content | :value, String.t()}
+  def exact(field, value) when is_token_field(field) do
+    {:exact, field, value}
+  end
+
+  @doc """
+  Builds a `{:partial, field, value}` matcher tuple.
+
+  Raises `FunctionClauseError` when `field` is neither `:content` nor `:value`.
+  """
+  @spec partial(:content | :value, String.t()) :: {:partial, :content | :value, String.t()}
+  def partial(field, value) when is_token_field(field) do
+    {:partial, field, value}
+  end
+end