diff --git a/.codeqa.yml b/.codeqa.yml index e6d5d46..26f39be 100644 --- a/.codeqa.yml +++ b/.codeqa.yml @@ -4,4 +4,32 @@ # Patterns here are merged with any --ignore-paths passed on the command line. ignore_paths: - - priv/combined_metrics/samples/** + - priv/** + - tools/** + - scripts/** + - docs/** + - plans/** + - test/** + - devenv* + - direnv* + - README.md + - action.yml + +# Impact weights for overall score calculation. +# Combined metric categories default to 1 (can be overridden here). +impact: + complexity: 5 + file_structure: 4 + function_design: 4 + code_smells: 3 + naming_conventions: 2 + error_handling: 2 + consistency: 2 + documentation: 1 + testing: 1 + # combined categories override example: + # variable_naming: 2 + +combined_top: 5 # worst offender files per combined-metric behavior + +cosine_significance_threshold: 0.25 # threshold for cosine similarity calculation in behavior categories diff --git a/.dialyzer_ignore.exs b/.dialyzer_ignore.exs new file mode 100644 index 0000000..9722072 --- /dev/null +++ b/.dialyzer_ignore.exs @@ -0,0 +1,14 @@ +[ + # Dialyzer specializes analyze/2 for the codebase call-site where include_pairs + # is always true, making the false branch appear unreachable. Both branches are + # valid and reachable at runtime from the file-level and codebase callers. + {"lib/codeqa/metrics/file/near_duplicate_blocks.ex", :pattern_match}, + # Mix module type information is not available in the PLT; these are valid + # Mix.Task callbacks and standard Mix module calls. + {"lib/mix/tasks/codeqa/sample_report.ex", :callback_info_missing}, + {"lib/mix/tasks/codeqa/signal_debug.ex", :callback_info_missing}, + {"lib/mix/tasks/codeqa/sample_report.ex", :unknown_function}, + {"lib/mix/tasks/codeqa/signal_debug.ex", :unknown_function}, + # CodeQA.Engine.Registry.t/0 is defined via a macro; type is available at runtime. 
+ {"lib/codeqa/analysis/file_metrics_server.ex", :unknown_type} +] diff --git a/.github/workflows/bootstrap-labels.yml b/.github/workflows/bootstrap-labels.yml index a865335..52c644a 100644 --- a/.github/workflows/bootstrap-labels.yml +++ b/.github/workflows/bootstrap-labels.yml @@ -10,7 +10,7 @@ jobs: bootstrap: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Bootstrap labels uses: actions/github-script@v7 diff --git a/.github/workflows/compare.yml b/.github/workflows/compare.yml index fa13ef0..5a672ad 100644 --- a/.github/workflows/compare.yml +++ b/.github/workflows/compare.yml @@ -12,10 +12,19 @@ jobs: compare: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Cache Mix deps and build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-1.19-27.3-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-1.19-27.3- + - name: Get fork point id: fork-point run: | diff --git a/.github/workflows/dialyzer.yml b/.github/workflows/dialyzer.yml index 0674398..dfaca60 100644 --- a/.github/workflows/dialyzer.yml +++ b/.github/workflows/dialyzer.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Checkout PR - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 @@ -45,9 +45,9 @@ jobs: uses: actions/cache@v4 with: path: _build - key: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + key: build-${{ env.MIX_ENV }}-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} restore-keys: | - build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + build-${{ env.MIX_ENV }}-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- - name: Cache PLT uses: actions/cache@v4 diff --git a/.github/workflows/health-report.yml b/.github/workflows/health-report.yml index 55e38f0..2e0b896 100644 --- 
a/.github/workflows/health-report.yml +++ b/.github/workflows/health-report.yml @@ -12,7 +12,17 @@ jobs: health-report: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 + + - name: Cache Mix deps and build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-1.19-27.3-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-1.19-27.3- + - uses: ./ with: command: health-report diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d1ebf66..13300bd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,15 +17,33 @@ jobs: build: runs-on: ubuntu-latest + env: + ELIXIR_VERSION: "1.19" + OTP_VERSION: "27.3" + steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 with: - otp-version: "27.3" - elixir-version: "1.19" + otp-version: ${{ env.OTP_VERSION }} + elixir-version: ${{ env.ELIXIR_VERSION }} + + - name: Cache deps + uses: actions/cache@v4 + with: + path: deps + key: deps-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: deps-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - name: Cache build + uses: actions/cache@v4 + with: + path: _build + key: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- - name: Build escript run: | diff --git a/.github/workflows/sync-behavior-coverage.yml b/.github/workflows/sync-behavior-coverage.yml new file mode 100644 index 0000000..dc1cc4f --- /dev/null +++ b/.github/workflows/sync-behavior-coverage.yml @@ -0,0 +1,65 @@ +name: Sync Behavior Coverage + +on: + pull_request: + branches: [main] + +permissions: + contents: write + +jobs: + sync: + runs-on: ubuntu-latest + if: github.event.pull_request.head.repo.full_name == github.repository + + 
steps: + - name: Checkout PR branch + uses: actions/checkout@v6 + with: + ref: ${{ github.head_ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Elixir + uses: erlef/setup-beam@v1 + with: + otp-version: "27.3" + elixir-version: "1.19" + + - name: Cache deps + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-dev-otp27.3-elixir1.19-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-dev-otp27.3-elixir1.19- + + - name: Install dependencies + run: mix deps.get + + - name: Compile + run: mix compile --warnings-as-errors + + - name: Regenerate language coverage + run: mix codeqa.sample_report --apply-languages + + - name: Regenerate scalar vectors + run: mix codeqa.sample_report --apply-scalars + + - name: Check for YAML drift + id: diff + run: | + if git diff --quiet priv/combined_metrics/; then + echo "changed=false" >> $GITHUB_OUTPUT + else + echo "changed=true" >> $GITHUB_OUTPUT + fi + + - name: Commit and push updated YAMLs + if: steps.diff.outputs.changed == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add priv/combined_metrics/*.yml + git commit -m "chore(combined-metrics): sync language coverage and scalar vectors [skip ci]" + git push diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d560a17..04ba9b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,24 +10,27 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir + id: beam uses: erlef/setup-beam@v1 with: otp-version: "27.3" elixir-version: "1.19" - name: Cache deps + id: mix-cache uses: actions/cache@v4 with: path: | deps _build - key: ${{ runner.os }}-mix-${{ hashFiles('mix.lock') }} - restore-keys: ${{ runner.os }}-mix- + key: ${{ runner.os }}-mix-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}-${{ 
hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}- - name: Install dependencies + if: steps.mix-cache.outputs.cache-hit != 'true' run: mix deps.get - name: Compile diff --git a/.github/workflows/validate-issue-links.yml b/.github/workflows/validate-issue-links.yml index e366437..5960df4 100644 --- a/.github/workflows/validate-issue-links.yml +++ b/.github/workflows/validate-issue-links.yml @@ -12,7 +12,7 @@ jobs: validate: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Validate issue template links run: | diff --git a/.gitignore b/.gitignore index ad2603a..4ef9df3 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,13 @@ devenv.lock # Git worktrees .worktrees/ +docs/plans/ +docs/superpowers/ +plans/ +scripts/*.exs + +# Claude Code +.claude/ + +# Node +node_modules/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d008842 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: +- repo: local + hooks: + - id: mix-precommit + name: Mix precommit + entry: devenv shell precommit + language: system + pass_filenames: false diff --git a/README.md b/README.md index 259ee49..1eab2d4 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,14 @@ Works with Python, Ruby, JavaScript, TypeScript, Elixir, C#, Java, C++, Go, Rust - [CLI Reference](#cli-reference) - [analyze](#analyze) - [health-report](#health-report) + - [diagnose](#diagnose) - [compare](#compare) - [history](#history) - [correlate](#correlate) - - [stopwords](#stopwords) - [Metrics Reference](#metrics-reference) - [Raw Metrics](#raw-metrics) - [Health Report Categories](#health-report-categories) - - [Behavior Checks](#behavior-checks) + - [Behavior Categories](#behavior-categories) - [Output Formats](#output-formats) - [Grading](#grading) @@ -76,7 +76,7 @@ jobs: health-report: runs-on: ubuntu-latest 
steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: num42/codeqa-action@v1 with: command: health-report @@ -95,7 +95,7 @@ jobs: compare: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Get fork point @@ -112,7 +112,7 @@ jobs: | Input | Required | Default | Description | |-------|----------|---------|-------------| -| `command` | yes | — | CLI command to run: `health-report`, `compare`, or `analyze` | +| `command` | yes | — | CLI command to run: `health-report`, `compare`, `analyze`, `history`, `correlate`, or `diagnose` | | `path` | no | `.` | Directory to analyze | | `comment` | no | `false` | Post results as a sticky PR comment | | `fail-grade` | no | — | Fail the action if overall grade is below this (e.g. `C`) | @@ -153,22 +153,83 @@ ignore_paths: ```yaml categories: - - name: Naming - weight: 1.5 + Naming: + name: Naming metrics: - name: vowel_density - good: 0.4 - thresholds: [0.35, 0.3, 0.25] + weight: 1.5 + good: "high" + thresholds: + a: 0.42 + b: 0.38 + c: 0.32 + d: 0.25 ``` +Category-level keys: `name` (display name), `metrics` (list of metric overrides), `top` (worst-offender count override). + +Metric-level keys: `name` (metric key), `weight` (relative weight within the category), `good` (`"high"` or `"low"` — direction where higher values are better or worse), `source` (metric path), `thresholds` (map of letter-grade cutoffs: `a`, `b`, `c`, `d`). + ### Grade scale override ```yaml grade_scale: - - [90, "A"] - - [80, "B"] - - [70, "C"] - - [0, "F"] + - min: 90 + grade: "A" + - min: 80 + grade: "B" + - min: 70 + grade: "C" + - min: 0 + grade: "F" +``` + +### impact + +Impact weights used when computing the overall score. The 9 keys below are the built-in defaults; any category not listed falls back to `1`. These weights apply to both primary and behavior categories. 
+ +```yaml +impact: + complexity: 5 + file_structure: 4 + function_design: 4 + code_smells: 3 + naming_conventions: 2 + error_handling: 2 + consistency: 2 + documentation: 1 + testing: 1 + # override any category key: + # variable_naming: 2 +``` + +### combined_top + +Controls how many worst-offender files are shown per behavior category in `health-report` (default: `2`). + +```yaml +combined_top: 3 +``` + +### near_duplicate_blocks + +Configures codebase-level near-duplicate block detection (used by `analyze`). + +```yaml +near_duplicate_blocks: + max_pairs_per_bucket: 50 +``` + +| Key | Description | +|-----|-------------| +| `max_pairs_per_bucket` | Maximum duplicate pairs reported per similarity bucket (default: unlimited) | + +### cosine_significance_threshold + +Minimum cosine similarity required for a behavior category match to be considered significant. Matches below this threshold are treated as noise and excluded from scoring. Default: `0.15`. + +```yaml +cosine_significance_threshold: 0.25 ``` ## CLI Reference @@ -228,6 +289,31 @@ Produces a graded quality report grouped into behavior categories with worst-off ./codeqa health-report --detail full --top 10 --format github ./lib ``` +### diagnose + +Identifies likely code quality issues by scoring behavior profiles using cosine similarity. Useful for understanding *why* a codebase scores poorly without running a full health report. + +```sh +./codeqa diagnose --path [OPTIONS] +``` + +`--path` is **required**. Note: unlike `health-report`, the path is passed as a named flag (`--path`), not a positional argument. 
+ +| Option | Description | +|--------|-------------| +| `--path PATH` | **(Required)** File or directory to analyze | +| `--mode MODE` | `aggregate` (default) or `per-file` | +| `--top N` | Number of top issues to show (default: `15`) | +| `--format FORMAT` | Output format: `plain` or `json` (default: `plain`) | +| `--combined-top N` | Worst-offender files per behavior in per-file mode (default: `2`) | + +**Example:** + +```sh +./codeqa diagnose --path ./lib --mode aggregate --top 10 +./codeqa diagnose --path ./lib --mode per-file --format json +``` + ### compare Compares code quality metrics between two git refs. Designed for PR workflows. @@ -246,6 +332,16 @@ Compares code quality metrics between two git refs. Designed for PR workflows. | `--output MODE` | Output mode: `auto`, `summary`, or `changes` (default: `auto`) | | `--changes-only` | Only analyze files changed between refs | | `--all-files` | Analyze all source files at both refs (default) | +| `--workers N` | Parallel worker count | +| `--progress` | Show per-file progress | +| `--cache` | Cache computed metrics to disk | +| `--cache-dir PATH` | Directory for cached metrics (default: `.codeqa_cache`) | +| `--timeout MS` | Per-file timeout in milliseconds (default: `5000`) | +| `--show-ncd` | Include NCD similarity matrix | +| `--ncd-top N` | Top similar pairs per file | +| `--ncd-paths PATHS` | Comma-separated paths to compare for NCD | +| `--show-files` | Include per-file metrics in output | +| `--show-file-paths PATHS` | Comma-separated list of specific file paths to include | | `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | **Example:** @@ -269,6 +365,16 @@ Tracks codebase metrics across multiple commits, writing per-commit JSON snapsho | `--output-dir PATH` | **(Required)** Directory to write JSON snapshots | | `--commits N` | Number of recent commits to analyze | | `--commit-list SHAS` | Comma-separated list of explicit commit SHAs | +| `--workers N` | Parallel worker count | +| 
`--progress` | Show per-file progress | +| `--cache` | Cache computed metrics to disk | +| `--cache-dir PATH` | Directory for cached metrics (default: `.codeqa_cache`) | +| `--timeout MS` | Per-file timeout in milliseconds (default: `5000`) | +| `--show-ncd` | Include NCD similarity matrix | +| `--ncd-top N` | Top similar pairs per file | +| `--ncd-paths PATHS` | Comma-separated paths to compare for NCD | +| `--show-files` | Include per-file metrics in output | +| `--show-file-paths PATHS` | Comma-separated list of specific file paths to include | | `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | ### correlate @@ -282,27 +388,12 @@ Finds metric correlations across history snapshots produced by `history`. Run `h | Option | Description | |--------|-------------| | `--top N` | Number of top correlations to show (default: `20`) | -| `--hide-exact` | Hide perfect 1.0 correlations | -| `--all-groups` | Show all metric groups | +| `--hide-exact` | Hide perfect 1.0 and -1.0 correlations | +| `--all-groups` | Include correlations between metrics in the same group | | `--min FLOAT` | Minimum correlation threshold | | `--max FLOAT` | Maximum correlation threshold | | `--combined-only` | Show only combined-metric correlations | -| `--max-steps N` | Limit history steps used | - -### stopwords - -Extracts codebase-specific vocabulary stopwords and fingerprints. Use the output to reduce noise from project-specific boilerplate tokens in subsequent metric analysis. 
- -```sh -./codeqa stopwords [OPTIONS] -``` - -| Option | Description | -|--------|-------------| -| `--workers N` | Parallel worker count | -| `--stopwords-threshold FLOAT` | Minimum frequency ratio (default: `0.01`) | -| `--progress` | Show per-file progress | -| `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | +| `--max-steps N` | Maximum number of correlation pairs to evaluate | ## Metrics Reference @@ -329,10 +420,17 @@ All metrics are computed per file and aggregated at the codebase level. | **Magic number density** | Ratio of numeric literals that appear to be unnamed constants | | **Function metrics** | Function count, average and maximum function line count, average and maximum parameter count | | **Cross-file similarity** | `cross_file_density`: overall codebase redundancy via combined compression ratio. `ncd_pairs` (opt-in via `--show-ncd`): Normalized Compression Distance between similar file pairs using winnowing fingerprints | +| **Near-duplicate blocks** | Codebase-level detection of near- and exact-duplicate code blocks using token-based similarity. Reports duplicate pairs grouped by bucket, with source locations. Configurable via `near_duplicate_blocks:` in `.codeqa.yml`. | +| **Block impact & refactoring potentials** | Per-file node tree enriched with leave-one-out impact scores and refactoring potentials. Added to each file entry as `"nodes"` in `analyze` JSON output. Surfaces the highest-impact blocks to refactor. | ### Health Report Categories -The `health-report` command grades your codebase against 6 primary categories. Each category aggregates raw metrics using configurable weights and thresholds. 
+The `health-report` command evaluates your codebase using two complementary scoring models: + +- **6 primary categories** — graded using configurable thresholds against raw metrics (Readability, Complexity, Structure, Duplication, Naming, Magic Numbers) +- **12 behavior categories** — graded using cosine similarity against behavior profiles (see [Behavior Categories](#behavior-categories)) + +The overall score is a weighted average of all 18 categories. Primary category weights are set via `weight:` in `.codeqa.yml`; behavior category weights are configured via [`impact:`](#impact). | Category | What it measures | |----------|-----------------| @@ -343,11 +441,21 @@ The `health-report` command grades your codebase against 6 primary categories. E | **Naming** | Casing entropy, identifier length variance, avg sub-words per identifier | | **Magic Numbers** | Magic number density | +**Cosine scoring breakpoints** (used for behavior categories): + +| Cosine similarity | Score | Approx. grade | +|-------------------|-------|---------------| +| ≥ 0.5 | 90–100 | A | +| ≥ 0.2 | 70–90 | B–A- | +| ≥ 0.0 | 50–70 | C–B- | +| ≥ −0.3 | 30–50 | D–C- | +| ≥ −1.0 | 0–30 | F–D- | + > Category definitions and thresholds are configurable via `.codeqa.yml`. -### Behavior Checks +### Behavior Categories -In addition to the 6 graded categories, `health-report` evaluates additional behavior check categories using a separate multiplicative scoring model. These appear in the report as "Top Issues" diagnostics. +In addition to the 6 primary categories, `health-report` grades 12 behavior categories using cosine similarity against behavior profiles. These contribute to the overall score alongside the primary categories. 
| Category | Checks | |----------|--------| @@ -364,13 +472,15 @@ In addition to the 6 graded categories, `health-report` evaluates additional beh | **Dependencies** | Import and dependency patterns | | **Error Handling** | Error handling completeness | +> These categories are graded in the `health-report` output using cosine similarity scoring and contribute to the overall score. + ## Output Formats | Format | Commands | Description | |--------|----------|-------------| -| `json` | `analyze`, `compare` | Full metrics structure, suitable for tooling | -| `markdown` | `compare`, `health-report` | GitHub-flavored markdown tables | -| `plain` | `health-report` | Human-readable terminal output (Markdown) | +| `json` | `analyze`, `compare`, `diagnose` | Full metrics structure, suitable for tooling | +| `markdown` | `compare` | GitHub-flavored markdown tables | +| `plain` | `health-report`, `diagnose` | Human-readable terminal output | | `github` | `health-report`, `compare` | Markdown optimized for GitHub PR comments | ## Grading @@ -397,6 +507,8 @@ In addition to the 6 graded categories, `health-report` evaluates additional beh | E- | ≥ 6 | | F | < 6 | +The overall score is a weighted average across all categories. Primary category weights use the `weight:` field inside each category definition in `.codeqa.yml`. Behavior category weights are configured via `impact:` (defaults range from 1–5; categories not listed fall back to `1`). See [Configuration](#configuration) for examples. + The `fail-grade` action input causes a non-zero exit when the overall grade falls below the specified threshold. 
## Contributing & Issues diff --git a/action.yml b/action.yml index 6be6078..ebee062 100644 --- a/action.yml +++ b/action.yml @@ -93,15 +93,10 @@ runs: INPUT_VERSION: ${{ inputs.version }} INPUT_BUILD: ${{ inputs.build }} GITHUB_ACTION_PATH: ${{ github.action_path }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_TOKEN: ${{ github.token }} run: ${{ github.action_path }}/scripts/run.sh - - name: Post PR comment - if: inputs.comment == 'true' && github.event_name == 'pull_request' - uses: marocchino/sticky-pull-request-comment@v2 - with: - header: codeqa-${{ inputs.command }} - path: ${{ steps.run.outputs.report-file }} - - name: Check grade threshold if: inputs.fail-grade != '' && inputs.command == 'health-report' shell: bash diff --git a/lib/codeqa/analysis/behavior_config_server.ex b/lib/codeqa/analysis/behavior_config_server.ex new file mode 100644 index 0000000..04cc9ba --- /dev/null +++ b/lib/codeqa/analysis/behavior_config_server.ex @@ -0,0 +1,119 @@ +defmodule CodeQA.Analysis.BehaviorConfigServer do + @moduledoc """ + Per-run GenServer that loads all YAML behavior configs once and serves them + from an anonymous ETS table. + + Eliminates repeated disk reads in `SampleRunner.diagnose_aggregate/2` by + loading `priv/combined_metrics/*.yml` on startup and keeping data in memory + for the duration of the analysis run. + + ETS layout: `{category, behavior} => behavior_data` + where `behavior_data` is the raw YAML map for that behavior. + """ + + use GenServer + + @yaml_dir "priv/combined_metrics" + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." + @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Returns all behaviors grouped by category. 
+ + %{"function_design" => [{"no_boolean_parameter", behavior_data}, ...], ...} + """ + @spec get_all_behaviors(pid()) :: %{String.t() => [{String.t(), map()}]} + def get_all_behaviors(pid) do + tid = get_tid(pid) + + tid + |> :ets.tab2list() + |> Enum.reduce(%{}, fn {{cat, beh}, data}, acc -> + Map.update(acc, cat, [{beh, data}], &[{beh, data} | &1]) + end) + end + + @doc "Returns the scalar weight map for a given category + behavior." + @spec get_scalars(pid(), String.t(), String.t()) :: %{{String.t(), String.t()} => float()} + def get_scalars(pid, category, behavior) do + tid = get_tid(pid) + + case :ets.lookup(tid, {category, behavior}) do + [{_, data}] -> scalars_from_behavior_data(data) + [] -> %{} + end + end + + @doc "Returns the `_log_baseline` value for a given category + behavior." + @spec get_log_baseline(pid(), String.t(), String.t()) :: float() + def get_log_baseline(pid, category, behavior) do + tid = get_tid(pid) + + case :ets.lookup(tid, {category, behavior}) do + [{_, data}] -> Map.get(data, "_log_baseline", 0.0) / 1.0 + [] -> 0.0 + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:behavior_config, [:set, :public, read_concurrency: true]) + load_configs(tid) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp load_configs(tid) do + case File.ls(@yaml_dir) do + {:ok, files} -> + files + |> Enum.filter(&String.ends_with?(&1, ".yml")) + |> Enum.each(&load_yml_file(&1, tid)) + + {:error, _} -> + :ok + end + end + + defp load_yml_file(yml_file, tid) do + category = String.trim_trailing(yml_file, ".yml") + yaml_path = Path.join(@yaml_dir, yml_file) + {:ok, data} = YamlElixir.read_from_file(yaml_path) + + data + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.each(fn {behavior, behavior_data} -> + :ets.insert(tid, {{category, behavior}, behavior_data}) + end) + end + + @doc false + def 
scalars_from_behavior_data(behavior_data) do + behavior_data + |> Enum.flat_map(fn + {group, keys} when is_map(keys) -> + Enum.map(keys, fn {key, scalar} -> {{group, key}, scalar / 1.0} end) + + _ -> + [] + end) + |> Map.new() + end +end diff --git a/lib/codeqa/analysis/file_context_server.ex b/lib/codeqa/analysis/file_context_server.ex new file mode 100644 index 0000000..987595f --- /dev/null +++ b/lib/codeqa/analysis/file_context_server.ex @@ -0,0 +1,87 @@ +defmodule CodeQA.Analysis.FileContextServer do + @moduledoc """ + Per-run GenServer that memoizes `Pipeline.build_file_context/2` by + `{MD5(content), language_name}`. + + Cache key includes the resolved language name because different languages + produce different keyword/operator sets, yielding different identifiers from + the same content. + + ETS layout: `{md5_binary, language_name} => FileContext.t()` + + On a cache miss, the calling process builds the context directly and inserts + it into the shared ETS table — no GenServer mailbox round-trip for the + computation itself. + """ + + use GenServer + + alias CodeQA.Engine.{FileContext, Pipeline} + alias CodeQA.Language + alias CodeQA.Languages.Unknown + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." + @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Returns a cached (or freshly built) `FileContext` for `content`. + + The language is resolved from `opts` (`:language` or `:path`); defaults to + `Unknown`. 
+ """ + @spec get(pid(), String.t(), keyword()) :: FileContext.t() + def get(pid, content, opts \\ []) do + tid = get_tid(pid) + language_name = resolve_language_name(opts) + key = {md5(content), language_name} + + case :ets.lookup(tid, key) do + [{_, ctx}] -> + ctx + + [] -> + ctx = Pipeline.build_file_context(content, opts) + :ets.insert(tid, {key, ctx}) + ctx + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:file_context, [:set, :public, read_concurrency: true]) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp md5(content), do: :crypto.hash(:md5, content) + + defp resolve_language_name(opts) do + cond do + lang = Keyword.get(opts, :language) -> + mod = Language.find(lang) || Unknown + mod.name() + + path = Keyword.get(opts, :path) -> + Language.detect(path).name() + + true -> + Unknown.name() + end + end +end diff --git a/lib/codeqa/analysis/file_metrics_server.ex b/lib/codeqa/analysis/file_metrics_server.ex new file mode 100644 index 0000000..579a63d --- /dev/null +++ b/lib/codeqa/analysis/file_metrics_server.ex @@ -0,0 +1,107 @@ +defmodule CodeQA.Analysis.FileMetricsServer do + @moduledoc """ + Per-run GenServer that caches `Registry.run_file_metrics/2` results. + + Pre-populated from `pipeline_result` before block analysis starts so baseline + metrics are served directly from ETS without recomputation. + + ETS layout: + - `{:path, path}` => metrics map (baseline for existing files) + - `{:hash, md5_binary}` => metrics map (computed on demand for reconstructed content) + """ + + use GenServer + + alias CodeQA.Engine.Pipeline + alias CodeQA.Engine.Registry + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." 
+ @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Bulk-inserts all baseline metrics from `pipeline_result` and cross-indexes by + content hash for each path present in `files_map`. + + Call once after starting the supervisor, before beginning block analysis. + """ + @spec populate(pid(), map(), map()) :: :ok + def populate(pid, pipeline_result, files_map) do + tid = get_tid(pid) + files_data = Map.get(pipeline_result, "files", %{}) + + Enum.each(files_data, fn {path, file_data} -> + metrics = Map.get(file_data, "metrics", %{}) + :ets.insert(tid, {{:path, path}, metrics}) + end) + + Enum.each(files_map, fn {path, content} -> + hash = md5(content) + + case :ets.lookup(tid, {:path, path}) do + [{_, metrics}] -> :ets.insert(tid, {{:hash, hash}, metrics}) + [] -> :ok + end + end) + + :ok + end + + @doc "Returns pre-populated baseline metrics for `path`, or `nil` if not found." + @spec get_by_path(pid(), String.t()) :: map() | nil + def get_by_path(pid, path) do + tid = get_tid(pid) + + case :ets.lookup(tid, {:path, path}) do + [{_, metrics}] -> metrics + [] -> nil + end + end + + @doc """ + Returns metrics for `content`, using the hash cache. + + On a cache miss, builds the file context and runs metrics in the calling + process, then inserts the result into ETS for future lookups. 
+ """ + @spec get_for_content(pid(), Registry.t(), String.t(), keyword()) :: map() + def get_for_content(pid, registry, content, opts \\ []) do + tid = get_tid(pid) + hash = md5(content) + + case :ets.lookup(tid, {:hash, hash}) do + [{_, metrics}] -> + metrics + + [] -> + ctx = Pipeline.build_file_context(content, opts) + metrics = Registry.run_file_metrics(registry, ctx) + :ets.insert(tid, {{:hash, hash}, metrics}) + metrics + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:file_metrics, [:set, :public, read_concurrency: true]) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp md5(content), do: :crypto.hash(:md5, content) +end diff --git a/lib/codeqa/analysis/run_context.ex b/lib/codeqa/analysis/run_context.ex new file mode 100644 index 0000000..e0e9d52 --- /dev/null +++ b/lib/codeqa/analysis/run_context.ex @@ -0,0 +1,15 @@ +defmodule CodeQA.Analysis.RunContext do + @moduledoc """ + Holds PIDs for the per-run GenServers started under `RunSupervisor`. + + Passed through the analysis call chain so all callers can access + cached state without named process registration. + """ + + defstruct [:behavior_config_pid, :file_context_pid] + + @type t :: %__MODULE__{ + behavior_config_pid: pid(), + file_context_pid: pid() + } +end diff --git a/lib/codeqa/analysis/run_supervisor.ex b/lib/codeqa/analysis/run_supervisor.ex new file mode 100644 index 0000000..ab6bb10 --- /dev/null +++ b/lib/codeqa/analysis/run_supervisor.ex @@ -0,0 +1,52 @@ +defmodule CodeQA.Analysis.RunSupervisor do + @moduledoc """ + One-shot supervisor for the per-analysis-run GenServers. + + Started at the top of `Analyzer.with_run_context/2` and stopped (via + `Supervisor.stop/1`) in an `after` block when the run completes. + + Servers are not registered by name, preventing collisions when multiple + analysis runs share the same BEAM node (e.g. 
# Returns the pid of the child whose id is `module`.
#
# `children` is the list returned by `Supervisor.which_children/1`, i.e.
# `{id, child, type, modules}` tuples. The `child` element is a pid only while
# the child is alive; it may also be `:restarting` or `:undefined`. Those
# states, and a missing child, raise an `ArgumentError` naming the module
# instead of failing with an opaque `MatchError` or leaking a non-pid atom
# into `RunContext`.
defp find_pid(children, module) do
  case List.keyfind(children, module, 0) do
    {_id, pid, _type, _modules} when is_pid(pid) ->
      pid

    {_id, status, _type, _modules} ->
      raise ArgumentError,
            "child #{inspect(module)} has no live pid (status: #{inspect(status)})"

    nil ->
      raise ArgumentError, "no child with id #{inspect(module)} under the run supervisor"
  end
end
# Builds the registry: every per-file metric is registered in the order
# listed below, followed by the single codebase-level metric.
def build_registry do
  file_metrics = [
    Metrics.Entropy,
    Metrics.Compression,
    Metrics.Zipf,
    Metrics.Heaps,
    Metrics.Vocabulary,
    Metrics.Ngram,
    Metrics.Halstead,
    Metrics.Readability,
    Metrics.CasingEntropy,
    Metrics.IdentifierLengthVariance,
    Metrics.Indentation,
    Metrics.Branching,
    Metrics.FunctionMetrics,
    Metrics.MagicNumberDensity,
    Metrics.SymbolDensity,
    Metrics.VowelDensity
  ]

  with_file_metrics =
    Enum.reduce(file_metrics, Registry.new(), fn metric, registry ->
      Registry.register_file_metric(registry, metric)
    end)

  Registry.register_codebase_metric(with_file_metrics, Metrics.Similarity)
end
# Summarises a list of numbers as mean, population standard deviation, min
# and max, each coerced to float and rounded to 4 decimal places.
# An empty list yields all zeros.
defp compute_stats(values) do
  if values == [] do
    %{mean: 0.0, std: 0.0, min: 0.0, max: 0.0}
  else
    count = length(values)
    average = Enum.sum(values) / count

    squared_deviation_total =
      Enum.reduce(values, 0.0, fn value, total -> total + (value - average) ** 2 end)

    # Population (not sample) standard deviation: divide by n, not n - 1.
    spread = :math.sqrt(squared_deviation_total / count)
    {lowest, highest} = Enum.min_max(values)

    # `* 1.0` turns integers into floats so `Float.round/2` accepts them.
    round4 = fn number -> Float.round(number * 1.0, 4) end

    %{
      mean: round4.(average),
      std: round4.(spread),
      min: round4.(lowest),
      max: round4.(highest)
    }
  end
end
/dev/null +++ b/lib/codeqa/ast/classification/node_classifier.ex @@ -0,0 +1,124 @@ +defmodule CodeQA.AST.Classification.NodeClassifier do + @moduledoc """ + Classifies a Node into a typed struct by running classification signals + over its tokens and weighing their votes. + + ## How it works + + Six classification signals scan the node's token stream in parallel via + `SignalStream`. Each signal emits weighted votes (e.g. `{:function_vote, 3}`) + when it detects a pattern indicating a node type. The classifier sums weights + per type and picks the winner. Ties and no-votes fall back to `:code`. + + ## Signals and votes + + | Signal | Vote key | Patterns detected | + |---|---|---| + | `DocSignal` | `:doc_vote` | `` token anywhere | + | `AttributeSignal` | `:attribute_vote` | `@name` at indent 0 | + | `FunctionSignal` | `:function_vote` | `def`, `func`, `fn`, etc. at indent 0 | + | `ModuleSignal` | `:module_vote` | `defmodule`, `class`, `module`, etc. at indent 0 | + | `ImportSignal` | `:import_vote` | `import`, `use`, `alias`, etc. at indent 0 | + | `TestSignal` | `:test_vote` | `test`, `describe`, `it`, etc. at indent 0 | + + ## Weights + + Weight 3 = first keyword seen (strong match); weight 1 = keyword later in + block (weak match, e.g. after a leading comment). `DocSignal` always emits + weight 3 and wins when a `` token is present, since triple-quoted strings + are unambiguous. + + ## Type-specific fields + + `FunctionNode.name/arity/visibility`, `ModuleNode.name/kind`, etc. all default + to `nil`. Population of those fields is left to a future enrichment pass. 
# Picks the node type from the tallied votes (`%{vote_name => summed_weight}`).
#
# No votes at all → `:code`.
defp winner(votes) when map_size(votes) == 0, do: :code

# Otherwise the vote with the largest summed weight wins and is translated
# to a type atom by `vote_to_type/1`.
#
# NOTE(review): the moduledoc states that ties fall back to `:code`, but no
# tie check is performed here — `Enum.max_by/2` returns the first maximal
# entry it encounters while enumerating the map. Confirm which behaviour is
# intended (the moduledoc also says `DocSignal` "wins" at weight 3, which
# would itself be a tie against another weight-3 vote).
defp winner(votes) do
  {vote_name, _weight} = Enum.max_by(votes, fn {_, w} -> w end)
  vote_to_type(vote_name)
end
defprotocol CodeQA.AST.Classification.NodeProtocol do
  @moduledoc """
  Common interface for all typed AST node structs.

  All node struct types (CodeNode, DocNode, FunctionNode, etc.) implement this
  protocol, allowing downstream code to work with any node type uniformly.
  """

  # The node's own token list (the `tokens` field in the `Node` implementation).
  @spec tokens(t()) :: [term()]
  def tokens(node)

  # Leaf-level tokens. In the `Node` implementation: the node's own `tokens`
  # when it has no children, otherwise the concatenation of each child's
  # `flat_tokens/1`.
  @spec flat_tokens(t()) :: [term()]
  def flat_tokens(node)

  # Number of source lines spanned by the node.
  @spec line_count(t()) :: non_neg_integer()
  def line_count(node)

  # Nested child nodes; an empty list for leaf nodes.
  @spec children(t()) :: [term()]
  def children(node)

  # 1-based first line of the node, or `nil` when no line metadata is present.
  @spec start_line(t()) :: non_neg_integer() | nil
  def start_line(node)

  # 1-based last line of the node, or `nil` when no line metadata is present.
  @spec end_line(t()) :: non_neg_integer() | nil
  def end_line(node)

  # Caller-attached label, or `nil` when none was set.
  @spec label(t()) :: term() | nil
  def label(node)
end
defmodule CodeQA.AST.Enrichment.CompoundNode do
  @moduledoc """
  Groups semantically related typed nodes together.

  A compound node represents a complete "unit" in source code — combining
  documentation, type annotations, and implementation:

  - `docs` — [DocNode.t()] (triple-quoted docstrings)
  - `typespecs` — [AttributeNode.t()] (@spec, @type, etc.)
  - `code` — [Node.t()] with type :code (implementation clauses)

  Boundaries span all constituent nodes in source order (docs → typespecs →
  code), with leading/trailing whitespace tokens stripped. Column values are
  read from the `col` field of the relevant Token structs — Node has no col
  fields.

  A bare code node with no preceding docs/typespecs is still wrapped in a
  CompoundNode (with empty `docs` and `typespecs`).
  """

  alias CodeQA.AST.Enrichment.Node
  alias CodeQA.AST.Nodes.AttributeNode

  # The three lists default to empty; the four boundary fields stay `nil`
  # until `CompoundNodeBuilder.finalize/1` fills them from the first and last
  # non-whitespace tokens.
  defstruct docs: [],
            typespecs: [],
            code: [],
            start_line: nil,
            start_col: nil,
            end_line: nil,
            end_col: nil

  # NOTE(review): `docs` is typed `[Node.t()]` here while the moduledoc lists
  # it as `[DocNode.t()]`. `CompoundNodeBuilder` appends both `DocNode` structs
  # and promoted children whose `type` is `:doc`, so the element type may need
  # to be a union — confirm against the builder and the typed node structs.
  @type t :: %__MODULE__{
          docs: [Node.t()],
          typespecs: [AttributeNode.t()],
          code: [Node.t()],
          start_line: non_neg_integer() | nil,
          start_col: non_neg_integer() | nil,
          end_line: non_neg_integer() | nil,
          end_col: non_neg_integer() | nil
        }
end
Blank-line boundaries are detected on the previous node's + # trailing whitespace — BlankLineRule places blank-line tokens at the + # END of the node that precedes the split, not at the start of the new one. + {current, _, compounds} = + Enum.reduce(blocks, {empty_compound(), [], []}, fn block, + {current, prev_trailing_ws, acc} -> + {content_tokens, trailing_ws} = split_trailing_whitespace(block.tokens) + clean_block = %{block | tokens: content_tokens} + # Check the PREVIOUS node's trailing whitespace for blank-line boundary + blank_boundary = blank_line_boundary?(prev_trailing_ws) + + cond do + # Rule 1: doc/typespec after code → flush and start new compound + (is_struct(block, DocNode) or is_struct(block, AttributeNode)) and current.code != [] -> + {start_compound(clean_block), trailing_ws, [finalize(current) | acc]} + + # Rule 2: blank-line boundary on previous node → flush and start fresh + blank_boundary and not empty_compound?(current) -> + {start_compound(clean_block), trailing_ws, [finalize(current) | acc]} + + # No boundary — accumulate into current + true -> + {add_block(current, clean_block), trailing_ws, acc} + end + end) + + compounds + |> then(fn acc -> + if empty_compound?(current), do: acc, else: [finalize(current) | acc] + end) + |> Enum.reverse() + end + + defp empty_compound, do: %CompoundNode{} + + defp empty_compound?(%CompoundNode{docs: [], typespecs: [], code: []}), do: true + defp empty_compound?(_), do: false + + defp add_block(%CompoundNode{} = compound, block) when is_struct(block, DocNode) do + %CompoundNode{compound | docs: compound.docs ++ [block]} + end + + defp add_block(%CompoundNode{} = compound, block) when is_struct(block, AttributeNode) do + %CompoundNode{compound | typespecs: compound.typespecs ++ [block]} + end + + defp add_block(%CompoundNode{} = compound, block) do + {promoted_docs, promoted_specs, clean_children} = promote_sub_blocks(block.children) + clean_block = %{block | children: clean_children} + + %CompoundNode{ + 
# Splits a token list into `{content_tokens, trailing_blank_tokens}`.
#
# The trailing part is the longest suffix made up solely of whitespace and
# newline tokens; everything before it (up to and including the last
# non-blank token) is the content part. A list consisting only of blank
# tokens yields `{[], tokens}`.
defp split_trailing_whitespace(tokens) do
  blank_kinds = [WhitespaceToken.kind(), NewlineToken.kind()]

  {trailing_reversed, content_reversed} =
    tokens
    |> Enum.reverse()
    |> Enum.split_while(fn token -> is_map(token) and token.kind in blank_kinds end)

  {Enum.reverse(content_reversed), Enum.reverse(trailing_reversed)}
end

# True when the trailing blank run holds at least three newline tokens,
# i.e. two or more blank lines. A single blank line (two newline tokens) does
# not split a compound.
#
# NOTE(review): the module doc describes this boundary as "2+" tokens, which
# does not match the threshold of 3 used here — confirm which is intended.
defp blank_line_boundary?(trailing_ws) do
  newline_kind = NewlineToken.kind()

  newline_total =
    Enum.count(trailing_ws, fn token -> token.kind == newline_kind end)

  newline_total >= 3
end
# Fills in the compound's start/end line and column.
#
# Tokens are gathered from docs, then typespecs, then code (via
# `NodeProtocol.flat_tokens/1`). The first and last tokens that are neither
# whitespace nor newline tokens define the boundaries; when no such token
# exists all four fields are `nil`.
defp finalize(%CompoundNode{} = compound) do
  blank_kinds = [WhitespaceToken.kind(), NewlineToken.kind()]
  content? = fn token -> is_map(token) and token.kind not in blank_kinds end

  ordered_tokens =
    [compound.docs, compound.typespecs, compound.code]
    |> Enum.concat()
    |> Enum.flat_map(&NodeProtocol.flat_tokens/1)

  # `nil` (no content token found) maps to a `{nil, nil}` position.
  position = fn
    nil -> {nil, nil}
    token -> {token.line, token.col}
  end

  {start_line, start_col} = ordered_tokens |> Enum.find(content?) |> position.()

  {end_line, end_col} =
    ordered_tokens |> Enum.reverse() |> Enum.find(content?) |> position.()

  %CompoundNode{
    compound
    | start_line: start_line,
      start_col: start_col,
      end_line: end_line,
      end_col: end_col
  }
end
defimpl CodeQA.AST.Classification.NodeProtocol, for: CodeQA.AST.Enrichment.Node do
  # Plain field accessors: each callback returns the struct field of the
  # same name.
  def tokens(%{tokens: own}), do: own
  def line_count(%{line_count: lines}), do: lines
  def children(%{children: kids}), do: kids
  def start_line(%{start_line: first}), do: first
  def end_line(%{end_line: last}), do: last
  def label(%{label: tag}), do: tag

  # Leaf nodes return their own tokens; nodes with children return the
  # concatenated flat tokens of those children (recursively, through the
  # protocol so typed child structs dispatch to their own implementation).
  def flat_tokens(%{children: kids, tokens: own}) do
    case Enum.empty?(kids) do
      true -> own
      false -> Enum.flat_map(kids, &CodeQA.AST.Classification.NodeProtocol.flat_tokens/1)
    end
  end
end
@doc """
Returns the set of lowercased identifier names that the given tokens bind
locally.

The result is the union of two scans:

  - assignment — an identifier token immediately followed by a bare `=`
    token (multi-character operators such as `==`, `=>`, `=~`, `!=`, `<=`
    and `>=` are not a bare `=` and therefore do not count);
  - arrow binding — identifier tokens seen on the same line before a `<-`
    token (`with` / `for` generators).

Function parameters are not collected here. The previous doc pointed to
`param_variables/1` for that; no such function is defined in this module —
TODO confirm where it lives.
"""
@spec bound_variables([Token.t()]) :: MapSet.t(String.t())
def bound_variables(tokens) do
  tokens
  |> assignment_bindings()
  |> MapSet.union(arrow_bindings(tokens))
end
+ - `line` — 1-based line number of the line that ends here. + - `col` — 0-based byte offset of the newline within that line. + """ + + @kind "" + + defstruct [:content, :line, :col, kind: @kind] + + @doc "Returns the normalized kind string for newline tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @type t :: %__MODULE__{ + kind: String.t(), + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/lexing/string_token.ex b/lib/codeqa/ast/lexing/string_token.ex new file mode 100644 index 0000000..04fc296 --- /dev/null +++ b/lib/codeqa/ast/lexing/string_token.ex @@ -0,0 +1,49 @@ +defmodule CodeQA.AST.Lexing.StringToken do + @moduledoc """ + A string token emitted by `TokenNormalizer` for all string literals, + including triple-quoted heredocs. + + ## Fields + + - `kind` — `""` for single-line strings, `""` for + triple-quoted heredoc delimiters. + - `content` — original source text (the full quoted literal or delimiter). + - `line`, `col` — source location. + - `interpolations` — list of interpolation expressions (`nil` for plain strings). + - `multiline` — `true` for triple-quoted (`\"\"\"` / `'''`) tokens. + - `quotes` — `:double`, `:single`, or `:backtick`. + """ + + @kind "" + @doc_kind "" + + defstruct [ + :content, + :line, + :col, + kind: @kind, + interpolations: nil, + multiline: false, + quotes: :double + ] + + @doc "Returns the normalized kind string for single-line string tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @doc "Returns the normalized kind string for triple-quoted doc string tokens." 
defmodule CodeQA.AST.Lexing.Token do
  @moduledoc """
  A single token emitted by `TokenNormalizer.normalize_structural/1`.

  ## Fields

  - `kind`    — normalized form used for structural comparison: a placeholder
                tag for identifiers, numbers and similar literals, or the
                literal character(s) for punctuation and operators.
  - `content` — original source text before normalization. Identical to `kind`
                for punctuation/structural tokens; differs for identifiers
                and numbers. Enables source reconstruction and is the
                correct field to check when matching declaration keywords.
  - `line`    — 1-based line number in the source file.
  - `col`     — 0-based byte offset from the start of the line.

  String literals are emitted as `StringToken` structs, not `Token`, so that
  the `interpolations` field does not pollute the common token shape.

  ## Design notes (from tree-sitter, ctags, lizard)

  - **kind vs content split** — mirrors tree-sitter's distinction between a
    node's `type` (structural kind) and its `text` (original source). `kind`
    is used for pattern matching and comparison; `content` is the original
    text used for reporting and reconstruction.
  - **Normalization lives in kind, not content** — `content` is never modified.
    This means two tokens with different `content` but the same `kind` (e.g.
    the identifiers `foo` and `bar`) are structurally equivalent for duplicate
    detection but distinguishable for reporting.
  - **Line + col for precise location** — ctags records line numbers; tree-sitter
    records byte ranges. We store both line (for human-readable reporting) and
    col (for IDE navigation and sub-block start/end precision).
  - **No enforcement on line/col** — synthetic tokens created in tests may omit
    line/col. Consumers that need location data should guard for nil.
  """

  # No `@enforce_keys`: every field defaults to `nil`, which is what lets
  # synthetic tokens omit `line`/`col`.
  defstruct [:kind, :content, :line, :col]

  @type t :: %__MODULE__{
          kind: String.t(),
          content: String.t(),
          line: non_neg_integer() | nil,
          col: non_neg_integer() | nil
        }
end
# Tokenizes one source line and, for every line except the last, appends the
# newline token that terminates it.
#
# `idx` is the 0-based index of the line and `last_idx` the index of the final
# line; the emitted tokens carry the 1-based line number `idx + 1`.
defp tokens_with_newline(line, idx, last_idx) do
  line_number = idx + 1
  {line_tokens, final_token} = tokenize_line(line, line_number)

  if idx >= last_idx do
    # The final line has no terminating newline.
    line_tokens
  else
    newline = %NewlineToken{
      content: "\n",
      line: line_number,
      col: newline_col(final_token)
    }

    line_tokens ++ [newline]
  end
end

# Column of the newline: 0 on a line without tokens, otherwise directly after
# the last token (its column plus the length of its content). Using the
# tracked last token avoids walking the token list.
defp newline_col(nil), do: 0
defp newline_col(token), do: token.col + String.length(token.content)
+ # Tagged `:literal` so `next_token` uses the matched text as both value and content + # (unlike ``, ``, `` which normalise content away). + @operator_regex ~r/^(?:===|!==|<=>|==|!=|<=|>=|\|>|<>|<-|->|=>|=~|!~|&&|\|\||\?\?|\?\.|:=|::|\.\.\.|\.\.|--|\+\+|\*\*|\/\/|\+=|-=|\*=|\/=|%=)/ + + # --- Individual rule atoms so dispatch groups can reference them directly --- + @skip_rule {:skip, ~r/^\s+/} + @operator_rule {:literal, @operator_regex} + @trip_quotes_rule {"", ~r/^"""|^'''/} + @str_interp_rule {"", ~r/^"(?=[^"]*#\{)(?:[^"\\#]|\\.|#(?!\{)|#\{[^}]*\})*"/} + @str_dollar_interp_rule {"", + ~r/^"(?=[^"]*\$\{)(?:[^"\\$]|\\.|\\$(?!\{)|\$\{[^}]*\})*"/} + @str_swift_interp_rule {"", ~r/^"(?=[^"]*\\\()(?:[^"\\]|\\.)*"/} + @str_rule {"", ~r/^"(?:[^"\\]|\\.)*"|^'(?:[^'\\]|\\.)*'/} + @backtick_interp_rule {"", + ~r/^`(?=[^`]*\$\{)(?:[^`\\$]|\\.|\\$(?!\{)|\$\{[^}]*\})*`/} + @backtick_str_rule {"", ~r/^`(?:[^`\\]|\\.)*`/} + @num_rule {"", ~r/^\d+(?:\.\d+)?/} + @id_rule {"", ~r/^[a-zA-Z_]\w*/} + + # Dispatch rule subsets by first character so the common cases (identifiers, + # numbers, whitespace, operators) skip irrelevant regex attempts entirely. + @double_quote_rules [ + @trip_quotes_rule, + @str_interp_rule, + @str_dollar_interp_rule, + @str_swift_interp_rule, + @str_rule + ] + @single_quote_rules [@trip_quotes_rule, @str_rule] + @backtick_rules [@backtick_interp_rule, @backtick_str_rule] + + # Returns the rule subset for the given first byte (ASCII codepoint). 
# Selects the candidate scan rules for a token from its first byte, so that
# `next_token/4` only tries regexes that could match. Clauses are tried
# top to bottom; the first matching clause decides the rule list.

# Quote characters: each maps to its own ordered list of string rules.
defp dispatch_rules(?"), do: @double_quote_rules
defp dispatch_rules(?'), do: @single_quote_rules
defp dispatch_rules(?`), do: @backtick_rules

# ASCII digit → number rule only.
defp dispatch_rules(c) when c >= ?0 and c <= ?9, do: [@num_rule]

# ASCII letter or underscore → identifier rule only.
defp dispatch_rules(c)
     when (c >= ?a and c <= ?z) or (c >= ?A and c <= ?Z) or c == ?_,
     do: [@id_rule]

# Any byte that can start a multi-character operator. When the operator
# regex does not match (e.g. a lone `=`), the caller emits a single-char token.
defp dispatch_rules(c)
     when c in [?=, ?!, ?<, ?>, ?|, ?&, ??, ?:, ?., ?-, ?+, ?*, ?/, ?%],
     do: [@operator_rule]

# Space (32) and every lower byte value → whitespace-skip rule.
defp dispatch_rules(c) when c <= 32, do: [@skip_rule]

# Unknown first char — no rule applies; caller falls through to single-char token.
defp dispatch_rules(_), do: []
+  # Returns `{token_or_:skip, remaining_text, width}` where `width` is the number
+  # of characters consumed from the front of `text`.
+  defp next_token(first, text, line, col) do
+    matched =
+      first
+      |> dispatch_rules()
+      |> Enum.find_value(fn {tag, pattern} ->
+        # A failed `with` yields Regex.run's nil, so find_value moves to the next rule.
+        with [hit | _] <- Regex.run(pattern, text), do: {tag, hit}
+      end)
+
+    if matched == nil do
+      # No rule matched — emit the first character as a literal single-char token.
+      char = String.first(text)
+      {%Token{kind: char, content: char, line: line, col: col}, String.slice(text, 1..-1//1), 1}
+    else
+      {tag, hit} = matched
+      width = String.length(hit)
+      remainder = String.slice(text, width..-1//1)
+
+      case tag do
+        :skip ->
+          {:skip, remainder, width}
+
+        :literal ->
+          {%Token{kind: hit, content: hit, line: line, col: col}, remainder, width}
+
+        value ->
+          token = postprocess(value, %Token{kind: value, content: hit, line: line, col: col})
+          {token, remainder, width}
+      end
+    end
+  end
+
+  # Extract #{...} interpolation expressions into `interpolations` and strip
+  # them from `content` so downstream consumers see only the static string parts.
+  # Nested braces (e.g. #{foo(%{a: 1})}) are left as-is in content — the
+  # lookahead in the scan rule ensures a match only when simple interpolations
+  # are present.
+  # postprocess/2 turns the generic %Token{} built by next_token/4 into a
+  # %StringToken{} for string-like kinds and returns every other token unchanged.
+  # Clauses are tried top to bottom, so their order is part of the behaviour.
+  # NOTE(review): every tagged clause head below shows an empty-string kind in this
+  # copy of the source; read literally, only the first `""` clause could ever
+  # match — confirm the intended kind tags before relying on this listing.
+
+  # `#{...}` interpolation inside a double-quoted string.
+  defp postprocess("", token),
+    do: extract_interpolations(token, ~r/#\{([^}]*)\}/, ~r/#\{[^}]*\}/, quotes: :double)
+
+  # `${...}` interpolation inside a double-quoted string.
+  defp postprocess("", token),
+    do: extract_interpolations(token, ~r/\$\{([^}]*)\}/, ~r/\$\{[^}]*\}/, quotes: :double)
+
+  # `\(...)` interpolation inside a double-quoted string.
+  defp postprocess("", token),
+    do: extract_interpolations(token, ~r/\\\(([^)]*)\)/, ~r/\\\([^)]*\)/, quotes: :double)
+
+  # `${...}` interpolation inside a backtick string.
+  defp postprocess("", token),
+    do: extract_interpolations(token, ~r/\$\{([^}]*)\}/, ~r/\$\{[^}]*\}/, quotes: :backtick)
+
+  # Triple-quote delimiter whose text is exactly `"""` — tagged :double.
+  defp postprocess("", %Token{content: ~s(""")} = token),
+    do: %StringToken{
+      kind: StringToken.doc_kind(),
+      content: token.content,
+      line: token.line,
+      col: token.col,
+      multiline: true,
+      quotes: :double
+    }
+
+  # Any other content under the same head is tagged :single.
+  defp postprocess("", token),
+    do: %StringToken{
+      kind: StringToken.doc_kind(),
+      content: token.content,
+      line: token.line,
+      col: token.col,
+      multiline: true,
+      quotes: :single
+    }
+
+  # Plain string tagged :backtick.
+  defp postprocess("", token),
+    do: %StringToken{
+      kind: StringToken.kind(),
+      content: token.content,
+      line: token.line,
+      col: token.col,
+      quotes: :backtick
+    }
+
+  # Plain quoted string: the quote style is read off the first character of the
+  # matched text (`"` → :double, anything else → :single).
+  defp postprocess("", token) do
+    quotes = if String.starts_with?(token.content, "\""), do: :double, else: :single
+
+    %StringToken{
+      kind: StringToken.kind(),
+      content: token.content,
+      line: token.line,
+      col: token.col,
+      quotes: quotes
+    }
+  end
+
+  # Every other kind passes through untouched.
+  defp postprocess(_value, token), do: token
+
+  # Builds a %StringToken{} from `token`:
+  #   * `capture_regex` — scanned over the content; the single capture group of
+  #     each hit is trimmed and collected, in order, into `interpolations`
+  #   * `strip_regex`   — every match is deleted from `content`
+  #   * `opts[:quotes]` — quote style to record (defaults to :double)
+  # `kind` is not set here, so the struct's own default applies.
+  defp extract_interpolations(token, capture_regex, strip_regex, opts) do
+    quotes = Keyword.get(opts, :quotes, :double)
+
+    interpolations =
+      Regex.scan(capture_regex, token.content, capture: :all_but_first)
+      |> Enum.map(fn [expr] -> String.trim(expr) end)
+
+    %StringToken{
+      content: String.replace(token.content, strip_regex, ""),
+      line: token.line,
+      col: token.col,
+      interpolations: interpolations,
+      quotes: quotes
+    }
+  end
+end
diff --git a/lib/codeqa/ast/lexing/token_protocol.ex b/lib/codeqa/ast/lexing/token_protocol.ex
new file mode 100644
index 0000000..e38458d
--- /dev/null
+++
b/lib/codeqa/ast/lexing/token_protocol.ex @@ -0,0 +1,59 @@ +defprotocol CodeQA.AST.Lexing.TokenProtocol do + @moduledoc """ + Protocol for token structs emitted by `TokenNormalizer`. + + Both `Token` and `StringToken` implement this protocol, so code that + processes token streams does not need to branch on the concrete struct type. + + ## Functions + + - `kind/1` — normalized structural kind (``, ``, ``, …) + - `content/1` — original source text before normalization + - `line/1` — 1-based line number in the source file (`nil` for synthetic tokens) + - `col/1` — 0-based byte offset from the start of the line (`nil` for synthetic tokens) + """ + + @doc "Returns the normalized structural kind of the token." + @spec kind(t) :: String.t() + def kind(token) + + @doc "Returns the original source text of the token." + @spec content(t) :: String.t() + def content(token) + + @doc "Returns the 1-based line number of the token, or `nil` for synthetic tokens." + @spec line(t) :: non_neg_integer() | nil + def line(token) + + @doc "Returns the 0-based column offset of the token, or `nil` for synthetic tokens." 
+ @spec col(t) :: non_neg_integer() | nil + def col(token) +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.Token do + def kind(%CodeQA.AST.Lexing.Token{kind: k}), do: k + def content(%CodeQA.AST.Lexing.Token{content: c}), do: c + def line(%CodeQA.AST.Lexing.Token{line: l}), do: l + def col(%CodeQA.AST.Lexing.Token{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.StringToken do + def kind(%CodeQA.AST.Lexing.StringToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.StringToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.StringToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.StringToken{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.NewlineToken do + def kind(%CodeQA.AST.Lexing.NewlineToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.NewlineToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.NewlineToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.NewlineToken{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.WhitespaceToken do + def kind(%CodeQA.AST.Lexing.WhitespaceToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.WhitespaceToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.WhitespaceToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.WhitespaceToken{col: c}), do: c +end diff --git a/lib/codeqa/ast/lexing/whitespace_token.ex b/lib/codeqa/ast/lexing/whitespace_token.ex new file mode 100644 index 0000000..cb23082 --- /dev/null +++ b/lib/codeqa/ast/lexing/whitespace_token.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.AST.Lexing.WhitespaceToken do + @moduledoc """ + A whitespace/indentation token emitted by `TokenNormalizer.normalize_structural/1`. + + Represents one indentation unit (2 spaces or 1 tab) at the start of a line. + + ## Fields + + - `kind` — always `""`. + - `content` — the original source text for this indentation unit (`" "`). + - `line` — 1-based line number in the source file. 
+ - `col` — 0-based byte offset from the start of the line. + """ + + @kind "" + + defstruct [:content, :line, :col, kind: @kind] + + @doc "Returns the normalized kind string for whitespace tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @type t :: %__MODULE__{ + kind: String.t(), + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/nodes/attribute_node.ex b/lib/codeqa/ast/nodes/attribute_node.ex new file mode 100644 index 0000000..7dd106c --- /dev/null +++ b/lib/codeqa/ast/nodes/attribute_node.ex @@ -0,0 +1,67 @@ +defmodule CodeQA.AST.Nodes.AttributeNode do + @moduledoc """ + AST node for fields, constants, decorators, annotations, and typespecs. + Subsumes the previous :typespec node type (kind: :typespec). + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :name, :kind] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + kind: :field | :constant | :decorator | :annotation | :typespec | nil + } + + @typespec_attrs MapSet.new(~w[spec type typep opaque callback macrocallback]) + + @doc "Build an AttributeNode from a raw %Node{}, detecting :typespec kind from tokens." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label, + kind: detect_kind(node.tokens) + } + end + + defp detect_kind(tokens) do + tokens + |> Enum.drop_while(&(&1.kind in [WhitespaceToken.kind(), NewlineToken.kind()])) + |> case do + [%{kind: "@"}, %{kind: "", content: name} | _] -> + if MapSet.member?(@typespec_attrs, name), do: :typespec, else: nil + + _ -> + nil + end + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/code_node.ex b/lib/codeqa/ast/nodes/code_node.ex new file mode 100644 index 0000000..b7dfd9d --- /dev/null +++ b/lib/codeqa/ast/nodes/code_node.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.AST.Nodes.CodeNode do + @moduledoc "Catch-all AST node for unclassified code blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil + } + + @doc "Build a CodeNode from a raw %Node{}, copying all base fields." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/doc_node.ex b/lib/codeqa/ast/nodes/doc_node.ex new file mode 100644 index 0000000..5e011ca --- /dev/null +++ b/lib/codeqa/ast/nodes/doc_node.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.AST.Nodes.DocNode do + @moduledoc "AST node for documentation strings and comment blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil + } + + @doc "Build a DocNode from a raw %Node{}, copying all base fields." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/function_node.ex b/lib/codeqa/ast/nodes/function_node.ex new file mode 100644 index 0000000..48c6a5d --- /dev/null +++ b/lib/codeqa/ast/nodes/function_node.ex @@ -0,0 +1,59 @@ +defmodule CodeQA.AST.Nodes.FunctionNode do + @moduledoc "AST node for function, method, or callable definitions." + + alias CodeQA.AST.Enrichment.Node + + defstruct [ + :tokens, + :line_count, + :children, + :start_line, + :end_line, + :label, + :name, + :arity, + :visibility + ] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + arity: non_neg_integer() | nil, + visibility: :public | :private | nil + } + + @doc "Build a FunctionNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/import_node.ex b/lib/codeqa/ast/nodes/import_node.ex new file mode 100644 index 0000000..3730370 --- /dev/null +++ b/lib/codeqa/ast/nodes/import_node.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.AST.Nodes.ImportNode do + @moduledoc "AST node for import, require, use, alias, or include statements." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :target] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + target: String.t() | nil + } + + @doc "Build an ImportNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/module_node.ex b/lib/codeqa/ast/nodes/module_node.ex new file mode 100644 index 0000000..c8d5072 --- /dev/null +++ b/lib/codeqa/ast/nodes/module_node.ex @@ -0,0 +1,48 @@ +defmodule CodeQA.AST.Nodes.ModuleNode do + @moduledoc "AST node for module, class, namespace, or struct definitions." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :name, :kind] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + kind: :class | :module | :namespace | :struct | nil + } + + @doc "Build a ModuleNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/test_node.ex b/lib/codeqa/ast/nodes/test_node.ex new file mode 100644 index 0000000..b3460cf --- /dev/null +++ b/lib/codeqa/ast/nodes/test_node.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.AST.Nodes.TestNode do + @moduledoc "AST node for test cases, describe blocks, and it blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :description] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + description: String.t() | nil + } + + @doc "Build a TestNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/parsing/parser.ex b/lib/codeqa/ast/parsing/parser.ex new file mode 100644 index 0000000..2615bb9 --- /dev/null +++ b/lib/codeqa/ast/parsing/parser.ex @@ -0,0 +1,234 @@ +defmodule CodeQA.AST.Parsing.Parser do + @moduledoc """ + Recursively parses a token stream into a nested node tree. + + Top-level nodes are found by splitting on blank lines and declaration keywords. + Each node is then recursively subdivided using enclosure rules (brackets, + colon-indentation) until no further subdivision is possible — forming an + arbitrarily-deep tree rather than a fixed two-level hierarchy. + + ## Recursive parsing algorithm + + `parse_block/3` is the recursive core: + + 1. Immediately create a `Node` spanning the whole token stream. + 2. Apply enclosure rules to find sub-candidate streams. + 3. **Idempotency check** — reject any enclosure that spans the entire stream + (e.g. `BracketRule` re-emitting its own input). This is the termination + condition: the node is a leaf when no strictly-smaller sub-candidates exist. + 4. Recursively call `parse_block/3` on each sub-candidate to produce children. + 5. Return the node with its children attached as `children`. 
+
+  ## Design notes (from tree-sitter, ctags, lizard)
+
+  - **Recursive hierarchy** — replaces the old two-level (top + one level of sub-blocks)
+    model with an N-level tree. Each call to `parse_block/3` mirrors tree-sitter's
+    recursive descent: emit the node, then recurse into its contents.
+  - **Language detection by extension** — `language_from_path/1` follows ctags'
+    convention of inferring language from file extension.
+  - **Rule extensibility** — enclosure rules are selected per language via
+    `sub_block_rules/1`. Rules are composable and order-independent.
+  - **Error recovery** — mismatched brackets and malformed indentation are silently
+    skipped by individual rules. The parser emits partial nodes rather than failing,
+    consistent with tree-sitter's error-recovery philosophy.
+  """
+
+  alias CodeQA.AST.Enrichment.Node
+  alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken}
+  alias CodeQA.AST.Parsing.SignalStream
+
+  alias CodeQA.AST.Signals.Structural.{
+    BlankLineSignal,
+    BracketSignal,
+    ColonIndentSignal,
+    KeywordSignal,
+    TripleQuoteSignal
+  }
+
+  alias CodeQA.Language
+
+  @doc """
+  Splits `tokens` into top-level slices and parses each slice into a `Node` tree.
+
+  Split indices come from the triple-quote, blank-line and keyword signals.
+  Blank-line and keyword splits that fall strictly between a pair of triple-quote
+  splits are ignored. Empty and whitespace-only slices are dropped, and adjacent
+  slices that meet on the same source line are merged before parsing.
+  """
+  @spec detect_blocks([CodeQA.AST.Lexing.Token.t()], module()) :: [Node.t()]
+  def detect_blocks([], _lang_mod), do: []
+
+  def detect_blocks(tokens, lang_mod) do
+    splitters = [%TripleQuoteSignal{}, %BlankLineSignal{}, %KeywordSignal{}]
+
+    emissions =
+      tokens
+      |> SignalStream.run(splitters, lang_mod)
+      |> List.flatten()
+
+    triple_splits =
+      Enum.flat_map(emissions, fn
+        {_, :split, :triple_split, idx} -> [idx]
+        _ -> []
+      end)
+
+    protected = compute_protected_ranges(triple_splits)
+
+    ordinary_splits =
+      Enum.flat_map(emissions, fn
+        {_, :split, name, idx} when name in [:blank_split, :keyword_split] ->
+          if inside_protected?(idx, protected), do: [], else: [idx]
+
+        _ ->
+          []
+      end)
+
+    split_points =
+      (ordinary_splits ++ triple_splits)
+      |> Enum.uniq()
+      |> Enum.sort()
+
+    tokens
+    |> split_at(split_points)
+    |> Enum.reject(&(Enum.empty?(&1) or whitespace_only?(&1)))
+    |> merge_same_line_slices()
+    |> Enum.map(fn slice -> parse_block(slice, lang_mod) end)
+  end
+
+  @doc "Returns the language module that `CodeQA.Language.detect/1` selects for `path`."
+  @spec language_module_for_path(String.t()) :: module()
+  def language_module_for_path(path), do: Language.detect(path)
+
+  @doc "Returns the detected language's `name/0` for `path`, converted to an atom."
+  @spec language_from_path(String.t()) :: atom()
+  def language_from_path(path) do
+    lang_mod = Language.detect(path)
+    String.to_atom(lang_mod.name())
+  end
+
+  # Recursively parse a token stream into a Node with nested children.
+  # The node always spans the whole stream; its children are the parsed
+  # sub-candidates, and recursion bottoms out when find_sub_candidates/2
+  # returns no strictly-smaller candidates (children stays []).
+  defp parse_block(tokens, lang_mod) do
+    first_line = block_start_line(tokens)
+    last_line = block_end_line(tokens)
+
+    # Fall back to a single line when either boundary is unknown.
+    span = if first_line && last_line, do: last_line - first_line + 1, else: 1
+
+    children =
+      tokens
+      |> find_sub_candidates(lang_mod)
+      |> Enum.map(fn candidate -> parse_block(candidate, lang_mod) end)
+
+    %Node{
+      tokens: tokens,
+      line_count: span,
+      children: children,
+      start_line: first_line,
+      end_line: last_line
+    }
+  end
+
+  # Collect enclosure regions from rules.
+  #
+  # If the token stream is itself a bracket pair (e.g. the stream IS `(foo, bar)`),
+  # we unwrap the outer brackets before running rules. Without this, BracketRule
+  # would only find the whole stream as a single enclosure — filtered by the
+  # idempotency check — and recursion would stop prematurely at every bracket level.
+  # Unwrapping lets us see the *inner* structure and keeps the tree growing deeper.
+  #
+  # Idempotency check: after unwrapping, reject any enclosure that still spans the
+  # entire search window (0..n-1), which would produce an infinite loop.
+  # Returns the token slices of every enclosure found in `tokens` (after
+  # unwrapping an outer bracket pair), sorted by position, excluding the
+  # enclosure that covers the whole search window and whitespace-only slices.
+  defp find_sub_candidates(tokens, lang_mod) do
+    {search_tokens, _} = maybe_unwrap_bracket(tokens)
+    n = length(search_tokens)
+
+    # Colon-indent enclosures are only meaningful for languages that opt in.
+    enclosure_signals =
+      if lang_mod.uses_colon_indent?() do
+        [%BracketSignal{}, %ColonIndentSignal{}]
+      else
+        [%BracketSignal{}]
+      end
+
+    SignalStream.run(search_tokens, enclosure_signals, lang_mod)
+    |> List.flatten()
+    |> Enum.filter(fn {_, group, _, _} -> group == :enclosure end)
+    |> Enum.map(fn {_, _, _, {s, e}} -> {s, e} end)
+    |> Enum.uniq()
+    |> Enum.sort()
+    |> Enum.reject(fn {s, e} -> s == 0 and e == n - 1 end)
+    |> Enum.map(fn {s, e} -> Enum.slice(search_tokens, s..e//1) end)
+    |> Enum.reject(&whitespace_only?/1)
+  end
+
+  @open_brackets MapSet.new(["(", "[", "{"])
+  @matching_close %{"(" => ")", "[" => "]", "{" => "}"}
+
+  # If the stream is a single balanced bracket pair — it opens with a bracket and
+  # that bracket's own partner is the very last token — return `{inner_tokens, 1}`.
+  # Otherwise return `{tokens, 0}` unchanged.
+  #
+  # Comparing only the first and last token is not enough: `(a) + (b)` starts
+  # with `(` and ends with `)`, but they belong to different pairs, and stripping
+  # them would leave the unbalanced stream `a) + (b`.
+  defp maybe_unwrap_bracket([first | rest] = tokens) do
+    if MapSet.member?(@open_brackets, first.kind) and
+         closes_at_end?(rest, first.kind, Map.fetch!(@matching_close, first.kind), 1) do
+      {Enum.drop(rest, -1), 1}
+    else
+      {tokens, 0}
+    end
+  end
+
+  defp maybe_unwrap_bracket([]), do: {[], 0}
+
+  # Walks the tokens after an opening bracket, tracking nesting depth of that
+  # bracket type only. True when the bracket closes exactly on the final token.
+  defp closes_at_end?([], _open, _close, _depth), do: false
+
+  defp closes_at_end?([%{kind: close} | rest], _open, close, 1), do: rest == []
+
+  defp closes_at_end?([%{kind: close} | rest], open, close, depth),
+    do: closes_at_end?(rest, open, close, depth - 1)
+
+  defp closes_at_end?([%{kind: open} | rest], open, close, depth),
+    do: closes_at_end?(rest, open, close, depth + 1)
+
+  defp closes_at_end?([_other | rest], open, close, depth),
+    do: closes_at_end?(rest, open, close, depth)
+
+  # Pairs consecutive triple-quote split indices into protected interior ranges.
+  # Uses chunk_every with :discard to safely handle odd counts (malformed input).
+  # Each pair [a, b] protects the indices strictly between the two splits.
+  defp compute_protected_ranges(split_indices) do
+    split_indices
+    |> Enum.chunk_every(2, 2, :discard)
+    |> Enum.map(fn [a, b] -> {a + 1, b - 1} end)
+  end
+
+  # True when `idx` lies inside any inclusive `{lo, hi}` range.
+  defp inside_protected?(idx, ranges) do
+    Enum.any?(ranges, fn {lo, hi} -> idx >= lo and idx <= hi end)
+  end
+
+  # When TripleQuoteSignal splits `@doc """` mid-line, the tokens before the
+  # triple-quote land in one slice and the heredoc in the next — both on the same
+  # starting line. Merge adjacent slices that share a line boundary so `@doc """..."""`
+  # becomes a single token stream fed to parse_block rather than two separate nodes.
+  defp merge_same_line_slices([]), do: []
+  defp merge_same_line_slices([single]), do: [single]
+
+  # Compares the line of the last significant token of the left slice with the
+  # line of the first significant token of the right slice. When both exist and
+  # are equal, the slices are concatenated and the merged slice is compared
+  # against the next one.
+  defp merge_same_line_slices([left, right | tail]) do
+    left_line = left |> Enum.reverse() |> first_significant_line()
+    right_line = first_significant_line(right)
+
+    if left_line && right_line && left_line == right_line do
+      merge_same_line_slices([left ++ right | tail])
+    else
+      [left | merge_same_line_slices([right | tail])]
+    end
+  end
+
+  # Line of the first token that is neither whitespace nor newline, or nil when
+  # the list holds no such token.
+  defp first_significant_line(tokens) do
+    case Enum.find(tokens, &(not layout_token?(&1))) do
+      nil -> nil
+      token -> token.line
+    end
+  end
+
+  defp layout_token?(token),
+    do: token.kind in [WhitespaceToken.kind(), NewlineToken.kind()]
+
+  # Cuts `tokens` at every index in `split_points`; each split index becomes the
+  # first element of the following slice.
+  defp split_at(tokens, []), do: [tokens]
+
+  defp split_at(tokens, split_points) do
+    starts = [0 | split_points]
+    stops = split_points ++ [length(tokens)]
+
+    Enum.zip_with(starts, stops, fn from, to ->
+      # The explicit step keeps `from > to - 1` an empty slice.
+      Enum.slice(tokens, from..(to - 1)//1)
+    end)
+  end
+
+  defp whitespace_only?(tokens), do: Enum.all?(tokens, &layout_token?/1)
+
+  defp block_start_line([%{line: line} | _]), do: line
+  defp block_start_line([]), do: nil
+
+  # Line of the last significant token; when the slice holds only layout tokens,
+  # the line of its final token is used instead.
+  defp block_end_line([]), do: nil
+
+  defp block_end_line(tokens) do
+    trailing = tokens |> Enum.reverse() |> Enum.find(&(not layout_token?(&1)))
+
+    case trailing do
+      nil -> tokens |> List.last() |> Map.get(:line)
+      token -> token.line
+    end
+  end
+end
diff --git a/lib/codeqa/ast/parsing/signal.ex b/lib/codeqa/ast/parsing/signal.ex
new file mode 100644
index 0000000..dc2d19e
--- /dev/null
+++ b/lib/codeqa/ast/parsing/signal.ex
@@ -0,0 +1,44 @@
+defprotocol CodeQA.AST.Parsing.Signal do
+  @moduledoc """
+  Protocol for token-stream signal emitters.
+
+  A signal is a stateful detector that receives one token at a time and emits
+  zero or more named values. All signals run independently over the same token
+  stream — each gets its own full pass, carrying its own state.
+ + ## Protocol functions + + - `source/1` — the implementing module; used for debugging emission traces + - `group/1` — atom grouping this signal's emissions (e.g. `:split`, `:enclosure`) + - `init/2` — returns initial state; called once before the token stream starts + - `emit/3` — called per token; returns `{MapSet.t({name, value}), new_state}` + + ## State + + State is owned externally (in `SignalStream`) as a positionally-aligned list. + The signal defines the shape; the orchestrator threads it through unchanged. + + ## No-op emission + + To emit nothing for a token, return `{MapSet.new(), state}`. + """ + + @doc "The module that implements this signal — for debugging traces." + @spec source(t) :: module() + def source(signal) + + @doc "Group atom for all emissions from this signal (e.g. :split, :enclosure)." + @spec group(t) :: atom() + def group(signal) + + @doc "Returns the initial state for this signal." + @spec init(t, module()) :: term() + def init(signal, lang_mod) + + @doc """ + Called once per token. Returns a MapSet of `{name, value}` emission pairs + and the updated state. + """ + @spec emit(t, token :: term(), state :: term()) :: {MapSet.t(), term()} + def emit(signal, token, state) +end diff --git a/lib/codeqa/ast/parsing/signal_registry.ex b/lib/codeqa/ast/parsing/signal_registry.ex new file mode 100644 index 0000000..0f4a752 --- /dev/null +++ b/lib/codeqa/ast/parsing/signal_registry.ex @@ -0,0 +1,94 @@ +defmodule CodeQA.AST.Parsing.SignalRegistry do + @moduledoc """ + Registry for structural and classification signals. + + Use `default/0` for the standard signal set. Compose custom registries + with `register_structural/2` and `register_classification/2` for + language-specific or analysis-specific configurations. 
+ """ + + alias CodeQA.AST.Signals.Structural.{ + AccessModifierSignal, + AssignmentFunctionSignal, + BlankLineSignal, + BracketSignal, + BranchSplitSignal, + ColonIndentSignal, + CommentDividerSignal, + DecoratorSignal, + DedentToZeroSignal, + DocCommentLeadSignal, + KeywordSignal, + SQLBlockSignal, + TripleQuoteSignal + } + + alias CodeQA.AST.Signals.Classification.{ + AttributeSignal, + CommentDensitySignal, + ConfigSignal, + DataSignal, + DocSignal, + FunctionSignal, + ImportSignal, + ModuleSignal, + TestSignal, + TypeSignal + } + + defstruct structural: [], classification: [] + + @type t :: %__MODULE__{ + structural: [term()], + classification: [term()] + } + + @spec new() :: t() + def new, do: %__MODULE__{} + + @spec register_structural(t(), term()) :: t() + def register_structural(%__MODULE__{} = r, signal), + do: %{r | structural: r.structural ++ [signal]} + + @spec register_classification(t(), term()) :: t() + def register_classification(%__MODULE__{} = r, signal), + do: %{r | classification: r.classification ++ [signal]} + + @spec default() :: t() + def default do + new() + |> register_structural(%TripleQuoteSignal{}) + |> register_structural(%BlankLineSignal{}) + |> register_structural(%KeywordSignal{}) + |> register_structural(%AccessModifierSignal{}) + |> register_structural(%DecoratorSignal{}) + |> register_structural(%CommentDividerSignal{}) + |> register_structural(%DocCommentLeadSignal{}) + |> register_structural(%AssignmentFunctionSignal{}) + |> register_structural(%DedentToZeroSignal{}) + |> register_structural(%BranchSplitSignal{}) + |> register_structural(%BracketSignal{}) + |> register_classification(%DocSignal{}) + |> register_classification(%TestSignal{}) + |> register_classification(%FunctionSignal{}) + |> register_classification(%ModuleSignal{}) + |> register_classification(%ImportSignal{}) + |> register_classification(%AttributeSignal{}) + |> register_classification(%TypeSignal{}) + |> register_classification(%ConfigSignal{}) + |> 
register_classification(%DataSignal{}) + |> register_classification(%CommentDensitySignal{}) + end + + @spec python() :: t() + def python do + r = default() + %{r | structural: r.structural ++ [%ColonIndentSignal{}]} + end + + @spec sql() :: t() + def sql do + r = default() + %{r | structural: r.structural ++ [%SQLBlockSignal{}]} + end +end diff --git a/lib/codeqa/ast/parsing/signal_stream.ex b/lib/codeqa/ast/parsing/signal_stream.ex new file mode 100644 index 0000000..8b6f451 --- /dev/null +++ b/lib/codeqa/ast/parsing/signal_stream.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.AST.Parsing.SignalStream do + @moduledoc """ + Runs a list of `Signal` implementations over a token stream. + + Each signal receives its own full pass over the token stream and accumulates + its own state. Signals are independent — no shared state, no cross-signal + coordination. + + ## Return value + + Returns a list of emission lists, one per signal, in the same order as the + input signal list. Each emission is a 4-tuple: + + {source, group, name, value} + + ## Usage + + SignalStream.run(tokens, [%BlankLineSignal{}, %KeywordSignal{}], []) + # => [[{BlankLineSignal, :split, :blank_split, 5}, ...], [...]] + """ + + alias CodeQA.AST.Parsing.Signal + + @spec run([term()], [term()], module()) :: [list()] + def run(tokens, signals, lang_mod) do + prevs = [nil | tokens] + nexts = Enum.drop(tokens, 1) ++ [nil] + triples = Enum.zip_with([prevs, tokens, nexts], fn [p, c, n] -> {p, c, n} end) + + Enum.map(signals, fn signal -> + init_state = Signal.init(signal, lang_mod) + source = Signal.source(signal) + group = Signal.group(signal) + + {_final_state, emissions} = + Enum.reduce_while(triples, {init_state, []}, fn triple, {state, acc} -> + emit_step(signal, triple, state, acc, source, group) + end) + + Enum.reverse(emissions) + end) + end + + defp emit_step(signal, triple, state, acc, source, group) do + {emitted, new_state} = Signal.emit(signal, triple, state) + + new_acc = + emitted + |> Enum.map(fn 
{name, value} -> {source, group, name, value} end) + |> Enum.reduce(acc, fn e, a -> [e | a] end) + + if new_state == :halt do + {:halt, {new_state, new_acc}} + else + {:cont, {new_state, new_acc}} + end + end +end diff --git a/lib/codeqa/ast/signals/classification/attribute_signal.ex b/lib/codeqa/ast/signals/classification/attribute_signal.ex new file mode 100644 index 0000000..aaaa640 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/attribute_signal.ex @@ -0,0 +1,68 @@ +defmodule CodeQA.AST.Signals.Classification.AttributeSignal do + @moduledoc """ + Classification signal — votes `:attribute` when an `@identifier` pattern + appears at indent 0. + + Weights: + - 3 for Elixir typespec attributes (@spec, @type, @typep, @opaque, @callback, @macrocallback) + - 2 for all other @name attributes + + Skips @doc and @moduledoc — those nodes contain tokens and are handled by DocSignal. + Emits at most one vote per token stream. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + @typespec_attrs MapSet.new(~w[spec type typep opaque callback macrocallback]) + @skip_attrs MapSet.new(~w[doc moduledoc]) + + def source(_), do: CodeQA.AST.Signals.Classification.AttributeSignal + def group(_), do: :classification + + def init(_, _lang_mod), + do: %{at_line_start: true, indent: 0, saw_at: false, voted: false} + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit(_, {_prev, token, _next}, %{at_line_start: als, indent: ind, saw_at: saw_at} = state) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0, saw_at: false}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + "@" when ind == 0 -> + {MapSet.new(), %{state | saw_at: true, at_line_start: false}} + + "" when saw_at -> + emit_attribute(token.content, state) + + _ -> + {MapSet.new(), 
%{state | saw_at: false, at_line_start: false}} + end + end + + defp emit_attribute(name, state) do + base_state = %{state | saw_at: false, at_line_start: false, voted: true} + + cond do + MapSet.member?(@skip_attrs, name) -> + # @doc/@moduledoc: let DocSignal handle via tokens + {MapSet.new(), base_state} + + MapSet.member?(@typespec_attrs, name) -> + {MapSet.new([{:attribute_vote, 3}]), base_state} + + true -> + {MapSet.new([{:attribute_vote, 2}]), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/comment_density_signal.ex b/lib/codeqa/ast/signals/classification/comment_density_signal.ex new file mode 100644 index 0000000..ceb4c23 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/comment_density_signal.ex @@ -0,0 +1,64 @@ +defmodule CodeQA.AST.Signals.Classification.CommentDensitySignal do + @moduledoc """ + Classification signal — votes `:comment` when more than 60% of non-blank + lines begin with a comment prefix. + + Requires `comment_prefixes: [String.t()]` in opts (from the language + module). Returns no vote if no prefixes are configured. + + Emits at the end of the stream. 
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.CommentDensitySignal
+    def group(_), do: :classification
+
+    # Builds the initial state: the language module's comment prefixes as a
+    # MapSet, plus counters for content-bearing lines and comment-led lines.
+    def init(_, lang_mod) do
+      prefixes = MapSet.new(lang_mod.comment_prefixes())
+      %{prefixes: prefixes, at_line_start: true, comment_lines: 0, total_lines: 0}
+    end
+
+    # Processes one {prev, token, next} triple. The first non-whitespace token
+    # of each line bumps `total_lines`, and `comment_lines` as well when its
+    # content is a known comment prefix. The vote is decided on the last
+    # token of the stream (`next == nil`).
+    def emit(_, {_prev, token, next}, state) do
+      %{prefixes: prefixes, at_line_start: als} = state
+
+      state =
+        case token.kind do
+          @nl ->
+            %{state | at_line_start: true}
+
+          @ws ->
+            state
+
+          _ when als ->
+            is_comment = MapSet.member?(prefixes, token.content)
+
+            %{
+              state
+              | at_line_start: false,
+                total_lines: state.total_lines + 1,
+                comment_lines: state.comment_lines + if(is_comment, do: 1, else: 0)
+            }
+
+          _ ->
+            %{state | at_line_start: false}
+        end
+
+      maybe_emit_vote(next, prefixes, state)
+    end
+
+    # End of stream: votes `{:comment_vote, 2}` and halts when prefixes are
+    # configured and more than 60% of the counted lines are comment-led.
+    #
+    # The prefix check uses `MapSet.size/1` in the body: `map_size/1` on a
+    # MapSet counts the struct's fields rather than its elements, so it can
+    # never detect an empty prefix set.
+    defp maybe_emit_vote(nil, prefixes, %{total_lines: total} = state) when total > 0 do
+      if MapSet.size(prefixes) > 0 and state.comment_lines / total > 0.6 do
+        {MapSet.new([{:comment_vote, 2}]), :halt}
+      else
+        {MapSet.new(), state}
+      end
+    end
+
+    # Mid-stream, or no content lines counted: nothing to emit.
+    defp maybe_emit_vote(_next, _prefixes, state), do: {MapSet.new(), state}
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/config_signal.ex b/lib/codeqa/ast/signals/classification/config_signal.ex
new file mode 100644
index 0000000..43b5872
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/config_signal.ex
@@ -0,0 +1,58 @@
+defmodule CodeQA.AST.Signals.Classification.ConfigSignal do
+  @moduledoc """
+  Classification signal — votes `:config` when a configuration keyword
+  appears at indent 0 and bracket depth 0.
+
+  Matches `config` (Elixir Mix.Config), `configure`, `settings`, `options`,
+  `defaults`. Emits at most one vote.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    @config_keywords MapSet.new(["config", "configure", "settings", "options", "defaults"])
+    def source(_), do: CodeQA.AST.Signals.Classification.ConfigSignal
+    def group(_), do: :classification
+
+    # Starting state: at the start of a line, no indentation, outside any
+    # bracket, and no content token seen yet.
+    def init(_, _lang_mod) do
+      %{at_line_start: true, indent: 0, bracket_depth: 0, is_first: true}
+    end
+
+    # Tracks line position, leading whitespace and bracket nesting token by
+    # token; any other token is checked against the config keyword set.
+    def emit(_, {_prev, token, _next}, state) do
+      %{at_line_start: line_start?, indent: indent, bracket_depth: depth, is_first: _} = state
+
+      case token.kind do
+        @nl ->
+          no_vote(%{state | at_line_start: true, indent: 0})
+
+        @ws when line_start? ->
+          no_vote(%{state | indent: indent + 1, at_line_start: true})
+
+        @ws ->
+          no_vote(state)
+
+        open when open in ["(", "[", "{"] ->
+          no_vote(%{state | bracket_depth: depth + 1, at_line_start: false, is_first: false})
+
+        close when close in [")", "]", "}"] ->
+          # Depth is clamped at zero so a stray closer cannot go negative.
+          no_vote(%{
+            state
+            | bracket_depth: max(0, depth - 1),
+              at_line_start: false,
+              is_first: false
+          })
+
+        _other ->
+          vote_on_content(token, state)
+      end
+    end
+
+    # Pairs a state with an empty emission set.
+    defp no_vote(state), do: {MapSet.new(), state}
+
+    # A config keyword at indent 0 and bracket depth 0 votes once and halts
+    # the stream: weight 3 when it is the first content token, otherwise 1.
+    defp vote_on_content(token, %{indent: indent, bracket_depth: depth, is_first: first} = state) do
+      top_level? = indent == 0 and depth == 0
+
+      if top_level? and MapSet.member?(@config_keywords, token.content) do
+        weight = if first, do: 3, else: 1
+        {MapSet.new([{:config_vote, weight}]), :halt}
+      else
+        no_vote(%{state | at_line_start: false, is_first: false})
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/data_signal.ex b/lib/codeqa/ast/signals/classification/data_signal.ex
new file mode 100644
index 0000000..1d6aa77
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/data_signal.ex
@@ -0,0 +1,67 @@
+defmodule CodeQA.AST.Signals.Classification.DataSignal do
+  @moduledoc """
+  Classification signal — votes `:data` when a token stream consists primarily
+  of literal values (``, ``) with no control-flow keywords.
+
+  Emits at the end of the stream (when `next == nil`).
Votes only when
+  literal ratio > 0.6 and no control-flow keywords were seen.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @str CodeQA.AST.Lexing.StringToken.kind()
+    @control_flow MapSet.new([
+                    "if",
+                    "else",
+                    "elsif",
+                    "elif",
+                    "unless",
+                    "for",
+                    "while",
+                    "do",
+                    "case",
+                    "when",
+                    "cond",
+                    "switch",
+                    "loop",
+                    "until"
+                  ])
+    def source(_), do: CodeQA.AST.Signals.Classification.DataSignal
+    def group(_), do: :classification
+
+    # Initial state: the `literal_count` and `id_count` counters at zero and
+    # the `has_control_flow` flag cleared.
+    def init(_, _lang_mod),
+      do: %{literal_count: 0, id_count: 0, has_control_flow: false}
+
+    # Updates the counters for the current token, then — only on the last
+    # token of the stream (`next == nil`) — votes `{:data_vote, 2}` and halts
+    # when no control-flow keyword was seen and `literal_count` is more than
+    # 60% of `literal_count + id_count`.
+    def emit(_, {_prev, token, next}, state) do
+      state =
+        case token.kind do
+          kind when kind in [@str, ""] ->
+            %{state | literal_count: state.literal_count + 1}
+
+          # NOTE(review): as written, a kind of "" is already matched by the
+          # clause above, so this clause cannot be reached — confirm the
+          # intended token kinds for both clauses against the repository.
+          "" ->
+            if MapSet.member?(@control_flow, token.content) do
+              %{state | has_control_flow: true, id_count: state.id_count + 1}
+            else
+              %{state | id_count: state.id_count + 1}
+            end
+
+          _ ->
+            state
+        end
+
+      if next == nil do
+        total = state.literal_count + state.id_count
+
+        # `total > 0` is checked first so the division below never sees zero.
+        if total > 0 and not state.has_control_flow and
+             state.literal_count / total > 0.6 do
+          {MapSet.new([{:data_vote, 2}]), :halt}
+        else
+          {MapSet.new(), state}
+        end
+      else
+        {MapSet.new(), state}
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/doc_signal.ex b/lib/codeqa/ast/signals/classification/doc_signal.ex
new file mode 100644
index 0000000..615cf55
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/doc_signal.ex
@@ -0,0 +1,29 @@
+defmodule CodeQA.AST.Signals.Classification.DocSignal do
+  @moduledoc """
+  Classification signal — votes `:doc` when a `` (triple-quoted string) token
+  is found anywhere in the node's token stream.
+
+  Weight: 3 (unambiguous — triple-quoted strings are documentation).
+  Emits at most one vote per token stream.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @doc_kind CodeQA.AST.Lexing.StringToken.doc_kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.DocSignal
+    def group(_), do: :classification
+
+    # No vote has been cast at the start of a stream.
+    def init(_, _lang_mod) do
+      %{voted: false}
+    end
+
+    # Once a vote has been cast, every later token is a no-op.
+    def emit(_, _, %{voted: true} = already_voted) do
+      {MapSet.new(), already_voted}
+    end
+
+    # The first token whose kind equals the doc kind casts `{:doc_vote, 3}`
+    # and marks the state as voted.
+    def emit(_, {_prev, token, _next}, state) do
+      case token.kind == @doc_kind do
+        true -> {MapSet.new([{:doc_vote, 3}]), %{state | voted: true}}
+        false -> {MapSet.new(), state}
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/function_signal.ex b/lib/codeqa/ast/signals/classification/function_signal.ex
new file mode 100644
index 0000000..62d3f48
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/function_signal.ex
@@ -0,0 +1,76 @@
+defmodule CodeQA.AST.Signals.Classification.FunctionSignal do
+  @moduledoc """
+  Classification signal — votes `:function` when a function definition keyword
+  appears at indent 0 and bracket depth 0.
+
+  Weights:
+  - 3 when it is the first content token of the block (strong match)
+  - 1 when found later in the block (weak match, e.g. after a leading comment)
+
+  Does NOT include module/class/namespace keywords (handled by ModuleSignal) or
+  test macros like `test`/`describe` (handled by TestSignal).
+  Emits at most one vote per token stream.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.FunctionSignal
+    def group(_), do: :classification
+
+    # Starting state; the keyword set comes from
+    # `CodeQA.Language.function_keywords/1` for the given language module.
+    def init(_, lang_mod) do
+      keywords = CodeQA.Language.function_keywords(lang_mod)
+
+      %{
+        at_line_start: true,
+        indent: 0,
+        bracket_depth: 0,
+        is_first: true,
+        voted: false,
+        keywords: keywords
+      }
+    end
+
+    # After the single vote, remaining tokens pass through untouched.
+    def emit(_, _, %{voted: true} = done), do: {MapSet.new(), done}
+
+    # Tracks line position, leading whitespace and bracket nesting; any other
+    # token is checked against the keyword set.
+    def emit(
+          _,
+          {_prev, token, _next},
+          %{at_line_start: line_start?, indent: indent, bracket_depth: depth, is_first: _} = state
+        ) do
+      case token.kind do
+        @nl ->
+          no_vote(%{state | at_line_start: true, indent: 0})
+
+        @ws when line_start? ->
+          no_vote(%{state | indent: indent + 1, at_line_start: true})
+
+        @ws ->
+          no_vote(state)
+
+        open when open in ["(", "[", "{"] ->
+          no_vote(%{state | bracket_depth: depth + 1, is_first: false, at_line_start: false})
+
+        close when close in [")", "]", "}"] ->
+          # Depth is clamped at zero so a stray closer cannot go negative.
+          no_vote(%{
+            state
+            | bracket_depth: max(0, depth - 1),
+              is_first: false,
+              at_line_start: false
+          })
+
+        _other ->
+          vote_on_content(token, state)
+      end
+    end
+
+    # Pairs a state with an empty emission set.
+    defp no_vote(state), do: {MapSet.new(), state}
+
+    # A keyword at indent 0 and bracket depth 0 casts the vote: weight 3 when
+    # it is the first content token, otherwise 1.
+    defp vote_on_content(token, %{indent: indent, bracket_depth: depth, is_first: first} = state) do
+      seen = %{state | is_first: false, at_line_start: false}
+      top_level? = indent == 0 and depth == 0
+
+      if top_level? and MapSet.member?(state.keywords, token.content) do
+        weight = if first, do: 3, else: 1
+        {MapSet.new([{:function_vote, weight}]), %{seen | voted: true}}
+      else
+        no_vote(seen)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/import_signal.ex b/lib/codeqa/ast/signals/classification/import_signal.ex
new file mode 100644
index 0000000..e27ed8a
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/import_signal.ex
@@ -0,0 +1,66 @@
+defmodule CodeQA.AST.Signals.Classification.ImportSignal do
+  @moduledoc """
+  Classification signal — votes `:import` when an import/require/use/alias
keyword
+  appears at indent 0.
+
+  Weights:
+  - 3 when it is the first content token of the block (strong match)
+  - 1 when found later in the block
+
+  Covers: Elixir (import, require, use, alias), Python (import, from),
+  JavaScript/Go (import, package), C# (using), Ruby/Lua (require, include).
+  Emits at most one vote per token stream.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.ImportSignal
+    def group(_), do: :classification
+
+    # Starting state; the keyword set comes from
+    # `CodeQA.Language.import_keywords/1` for the given language module.
+    def init(_, lang_mod) do
+      keywords = CodeQA.Language.import_keywords(lang_mod)
+      %{at_line_start: true, indent: 0, is_first: true, voted: false, keywords: keywords}
+    end
+
+    # After the single vote, remaining tokens pass through untouched.
+    def emit(_, _, %{voted: true} = done), do: {MapSet.new(), done}
+
+    # Tracks line position and leading whitespace; any other token is checked
+    # against the keyword set.
+    def emit(
+          _,
+          {_prev, token, _next},
+          %{at_line_start: line_start?, indent: indent, is_first: _} = state
+        ) do
+      case token.kind do
+        @nl -> {MapSet.new(), %{state | at_line_start: true, indent: 0}}
+        @ws when line_start? -> {MapSet.new(), %{state | indent: indent + 1, at_line_start: true}}
+        @ws -> {MapSet.new(), state}
+        _content -> classify_content(token, state)
+      end
+    end
+
+    # A keyword at indent 0 casts the vote: weight 3 when it is the first
+    # content token, otherwise 1.
+    defp classify_content(token, %{indent: indent, is_first: first} = state) do
+      seen = %{state | is_first: false, at_line_start: false}
+      keyword? = indent == 0 and MapSet.member?(state.keywords, token.content)
+
+      if keyword? do
+        weight = if first, do: 3, else: 1
+        {MapSet.new([{:import_vote, weight}]), %{seen | voted: true}}
+      else
+        {MapSet.new(), seen}
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/module_signal.ex b/lib/codeqa/ast/signals/classification/module_signal.ex
new file mode 100644
index 0000000..4e9ca98
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/module_signal.ex
@@ -0,0 +1,75 @@
+defmodule CodeQA.AST.Signals.Classification.ModuleSignal do
+  @moduledoc """
+  Classification signal — votes `:module` when a module/class/namespace
definition
+  keyword appears at indent 0 and bracket depth 0.
+
+  Weights:
+  - 3 when it is the first content token of the block (strong match)
+  - 1 when found later in the block
+
+  Keyword set is disjoint from FunctionSignal and TestSignal to avoid conflicts.
+  Emits at most one vote per token stream.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.ModuleSignal
+    def group(_), do: :classification
+
+    # Starting state; the keyword set comes from
+    # `CodeQA.Language.module_keywords/1` for the given language module.
+    def init(_, lang_mod) do
+      keywords = CodeQA.Language.module_keywords(lang_mod)
+
+      %{
+        at_line_start: true,
+        indent: 0,
+        bracket_depth: 0,
+        is_first: true,
+        voted: false,
+        keywords: keywords
+      }
+    end
+
+    # After the single vote, remaining tokens pass through untouched.
+    def emit(_, _, %{voted: true} = done), do: {MapSet.new(), done}
+
+    # Tracks line position, leading whitespace and bracket nesting; any other
+    # token is checked against the keyword set.
+    def emit(
+          _,
+          {_prev, token, _next},
+          %{at_line_start: line_start?, indent: indent, bracket_depth: depth, is_first: _} = state
+        ) do
+      case token.kind do
+        @nl ->
+          no_vote(%{state | at_line_start: true, indent: 0})
+
+        @ws when line_start? ->
+          no_vote(%{state | indent: indent + 1, at_line_start: true})
+
+        @ws ->
+          no_vote(state)
+
+        open when open in ["(", "[", "{"] ->
+          no_vote(%{state | bracket_depth: depth + 1, is_first: false, at_line_start: false})
+
+        close when close in [")", "]", "}"] ->
+          # Depth is clamped at zero so a stray closer cannot go negative.
+          no_vote(%{
+            state
+            | bracket_depth: max(0, depth - 1),
+              is_first: false,
+              at_line_start: false
+          })
+
+        _other ->
+          vote_on_content(token, state)
+      end
+    end
+
+    # Pairs a state with an empty emission set.
+    defp no_vote(state), do: {MapSet.new(), state}
+
+    # A keyword at indent 0 and bracket depth 0 casts the vote: weight 3 when
+    # it is the first content token, otherwise 1.
+    defp vote_on_content(token, %{indent: indent, bracket_depth: depth, is_first: first} = state) do
+      seen = %{state | is_first: false, at_line_start: false}
+      top_level? = indent == 0 and depth == 0
+
+      if top_level? and MapSet.member?(state.keywords, token.content) do
+        weight = if first, do: 3, else: 1
+        {MapSet.new([{:module_vote, weight}]), %{seen | voted: true}}
+      else
+        no_vote(seen)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/test_signal.ex b/lib/codeqa/ast/signals/classification/test_signal.ex
new file mode 100644
index 0000000..de6abe5
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/test_signal.ex
@@ -0,0 +1,67 @@
+defmodule CodeQA.AST.Signals.Classification.TestSignal do
+  @moduledoc """
+  Classification signal — votes `:test` when a test block keyword appears at
+  indent 0.
+
+  Weights:
+  - 3 when it is the first content token of the block (strong match)
+  - 1 when found later in the block
+
+  Covers: ExUnit (test, describe), RSpec/Jest/Mocha (it, context, describe),
+  Cucumber (scenario, given, feature). `test` takes priority over
+  FunctionSignal — Elixir test macros look like function calls but are test blocks.
+  Emits at most one vote per token stream.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    def source(_), do: CodeQA.AST.Signals.Classification.TestSignal
+    def group(_), do: :classification
+
+    # Starting state; the keyword set comes from
+    # `CodeQA.Language.test_keywords/1` for the given language module.
+    def init(_, lang_mod) do
+      keywords = CodeQA.Language.test_keywords(lang_mod)
+      %{at_line_start: true, indent: 0, is_first: true, voted: false, keywords: keywords}
+    end
+
+    # After the single vote, remaining tokens pass through untouched.
+    def emit(_, _, %{voted: true} = done), do: {MapSet.new(), done}
+
+    # Tracks line position and leading whitespace; any other token is checked
+    # against the keyword set.
+    def emit(
+          _,
+          {_prev, token, _next},
+          %{at_line_start: line_start?, indent: indent, is_first: _} = state
+        ) do
+      case token.kind do
+        @nl -> {MapSet.new(), %{state | at_line_start: true, indent: 0}}
+        @ws when line_start? -> {MapSet.new(), %{state | indent: indent + 1, at_line_start: true}}
+        @ws -> {MapSet.new(), state}
+        _content -> classify_content(token, state)
+      end
+    end
+
+    # A keyword at indent 0 casts the vote: weight 3 when it is the first
+    # content token, otherwise 1.
+    defp classify_content(token, %{indent: indent, is_first: first} = state) do
+      seen = %{state | is_first: false, at_line_start: false}
+      keyword? = indent == 0 and MapSet.member?(state.keywords, token.content)
+
+      if keyword? do
+        weight = if first, do: 3, else: 1
+        {MapSet.new([{:test_vote, weight}]), %{seen | voted: true}}
+      else
+        {MapSet.new(), seen}
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/classification/type_signal.ex
b/lib/codeqa/ast/signals/classification/type_signal.ex
new file mode 100644
index 0000000..fc4440f
--- /dev/null
+++ b/lib/codeqa/ast/signals/classification/type_signal.ex
@@ -0,0 +1,53 @@
+defmodule CodeQA.AST.Signals.Classification.TypeSignal do
+  @moduledoc """
+  Classification signal — votes `:type` when an Elixir type definition
+  attribute (`@type`, `@typep`, `@opaque`) appears at indent 0.
+
+  Emits at most one vote. Complements `AttributeSignal`, which handles
+  `@spec`, `@doc`, and other attributes.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    @nl CodeQA.AST.Lexing.NewlineToken.kind()
+    @ws CodeQA.AST.Lexing.WhitespaceToken.kind()
+    @type_attrs MapSet.new(["type", "typep", "opaque"])
+    def source(_), do: CodeQA.AST.Signals.Classification.TypeSignal
+    def group(_), do: :classification
+
+    # Starting state: at the start of a line, no indentation, no pending `@`,
+    # and no content token seen yet.
+    def init(_, _lang_mod) do
+      %{at_line_start: true, indent: 0, saw_at: false, is_first: true}
+    end
+
+    # Remembers an `@` seen at indent 0 and inspects the token right after it;
+    # any other content token clears the pending `@`.
+    def emit(_, {_prev, token, _next}, state) do
+      case token.kind do
+        @nl ->
+          quiet(%{state | at_line_start: true, indent: 0, saw_at: false})
+
+        @ws when state.at_line_start ->
+          quiet(%{state | indent: state.indent + 1, at_line_start: true})
+
+        @ws ->
+          quiet(state)
+
+        "@" when state.indent == 0 ->
+          quiet(%{state | saw_at: true, at_line_start: false})
+
+        _ when state.saw_at and state.indent == 0 ->
+          vote_if_type_attr(token, state)
+
+        _ ->
+          quiet(clear_at(state))
+      end
+    end
+
+    # Pairs a state with an empty emission set.
+    defp quiet(state), do: {MapSet.new(), state}
+
+    # Marks that content has been seen and drops any pending `@`.
+    defp clear_at(state), do: %{state | saw_at: false, is_first: false, at_line_start: false}
+
+    # A type attribute name after `@` votes once and halts the stream: weight
+    # 3 while `is_first` is still set, otherwise 1.
+    defp vote_if_type_attr(token, state) do
+      case MapSet.member?(@type_attrs, token.content) do
+        true ->
+          weight = if state.is_first, do: 3, else: 1
+          {MapSet.new([{:type_vote, weight}]), :halt}
+
+        false ->
+          quiet(clear_at(state))
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/structural/access_modifier_signal.ex b/lib/codeqa/ast/signals/structural/access_modifier_signal.ex
new file mode 100644
index 0000000..43ed068
--- /dev/null
+++
b/lib/codeqa/ast/signals/structural/access_modifier_signal.ex @@ -0,0 +1,80 @@ +defmodule CodeQA.AST.Signals.Structural.AccessModifierSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:access_modifier_split` when an access modifier keyword appears at line + start with bracket_depth == 0. + + Unlike `KeywordSignal`, this does NOT require indentation level 0, so it + detects class members inside bracket enclosures (e.g. `public void foo()` inside + a `class Foo { ... }` body). + + When `opts[:language_module]` is set, uses that language's + `access_modifiers/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.AccessModifierSignal + def group(_), do: :split + + def init(_, lang_mod) do + modifiers = CodeQA.Language.access_modifiers(lang_mod) + %{idx: 0, bracket_depth: 0, at_line_start: true, seen_content: false, modifiers: modifiers} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, seen_content: true, at_line_start: false} + + 
emissions = + if modifier_split?(state, token), + do: MapSet.new([{:access_modifier_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp modifier_split?( + %{seen_content: true, bracket_depth: 0, at_line_start: true, modifiers: m}, + %{content: c} + ), + do: MapSet.member?(m, c) + + defp modifier_split?(_, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/assignment_function_signal.ex b/lib/codeqa/ast/signals/structural/assignment_function_signal.ex new file mode 100644 index 0000000..a778d55 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/assignment_function_signal.ex @@ -0,0 +1,135 @@ +defmodule CodeQA.AST.Signals.Structural.AssignmentFunctionSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:assignment_function_split` when a top-level assignment to a function + is detected at indent 0 and bracket depth 0. + + Covers patterns such as: + - `identifier = function(...) {}` + - `identifier = async function(...) {}` + - `identifier = (...) 
=> {}` + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.AssignmentFunctionSignal + def group(_), do: :split + + def init(_, _lang_mod) do + %{ + idx: 0, + indent: 0, + bracket_depth: 0, + at_line_start: true, + seen_content: false, + phase: :idle + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: 0, at_line_start: true, phase: :idle}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, indent: i, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: i + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd, phase: phase} = state) + when k in ["(", "[", "{"] do + new_bd = bd + 1 + new_phase = advance_phase_open(phase, k) + + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: new_bd, + at_line_start: false, + seen_content: true, + phase: new_phase + }} + end + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd, phase: phase} = state) + when k in [")", "]", "}"] do + new_bd = max(0, bd - 1) + new_phase = advance_phase_close(phase, k) + + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: new_bd, + at_line_start: false, + seen_content: true, + phase: new_phase + }} + end + + def emit( + _, + {_, token, _}, + %{ + idx: idx, + seen_content: sc, + indent: i, + bracket_depth: bd, + at_line_start: als, + phase: phase + } = state + ) do + {emissions, new_phase} = advance_phase(phase, token, idx, sc, i, bd, als) + + {emissions, + %{state | idx: idx + 1, at_line_start: false, seen_content: true, phase: new_phase}} + end + + defp advance_phase_open({:in_parens, id_idx, pd}, "("), do: {:in_parens, id_idx, pd + 1} + defp advance_phase_open({:in_parens, id_idx, pd}, _), do: {:in_parens, id_idx, pd} + defp 
advance_phase_open({:saw_eq, id_idx}, "("), do: {:in_parens, id_idx, 1} + defp advance_phase_open(_, _), do: :idle + + defp advance_phase_close({:in_parens, id_idx, 1}, ")"), do: {:saw_close_paren, id_idx} + + defp advance_phase_close({:in_parens, id_idx, pd}, ")") when pd > 1, + do: {:in_parens, id_idx, pd - 1} + + defp advance_phase_close({:in_parens, id_idx, pd}, _), do: {:in_parens, id_idx, pd} + defp advance_phase_close(_, _), do: :idle + + defp advance_phase(:idle, %{kind: ""}, idx, true, 0, 0, true), + do: {MapSet.new(), {:saw_id, idx}} + + defp advance_phase(:idle, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_id, id_idx}, %{kind: "="}, _, _, _, _, _), + do: {MapSet.new(), {:saw_eq, id_idx}} + + defp advance_phase({:saw_id, _}, %{kind: ""}, idx, _, _, _, _), + do: {MapSet.new(), {:saw_id, idx}} + + defp advance_phase({:saw_id, id_idx}, %{kind: "."}, _, _, _, _, _), + do: {MapSet.new(), {:saw_id, id_idx}} + + defp advance_phase({:saw_id, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_eq, id_idx}, %{kind: "", content: "function"}, _, _, _, _, _), + do: {MapSet.new([{:assignment_function_split, id_idx}]), :idle} + + defp advance_phase({:saw_eq, id_idx}, %{kind: "", content: "async"}, _, _, _, _, _), + do: {MapSet.new(), {:saw_eq, id_idx}} + + defp advance_phase({:saw_eq, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_close_paren, id_idx}, %{kind: "=>"}, _, _, _, _, _), + do: {MapSet.new([{:assignment_function_split, id_idx}]), :idle} + + defp advance_phase({:saw_close_paren, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase(_, _, _, _, _, _, _), do: {MapSet.new(), :idle} + end +end diff --git a/lib/codeqa/ast/signals/structural/blank_line_signal.ex b/lib/codeqa/ast/signals/structural/blank_line_signal.ex new file mode 100644 index 0000000..c484e1a --- /dev/null +++ b/lib/codeqa/ast/signals/structural/blank_line_signal.ex @@ -0,0 +1,45 @@ +defmodule 
CodeQA.AST.Signals.Structural.BlankLineSignal do
+  alias CodeQA.AST.Lexing.NewlineToken
+  alias CodeQA.AST.Lexing.WhitespaceToken
+
+  @moduledoc """
+  Emits `:blank_split` at the first non-whitespace token that follows a run of
+  two or more newline tokens, provided the last content token before that run
+  is a known block-end token.
+
+  Block-end tokens are obtained from `CodeQA.Language.block_end_tokens/1` for
+  the language module passed to `init/2`.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    def source(_), do: CodeQA.AST.Signals.Structural.BlankLineSignal
+    def group(_), do: :split
+
+    # Starting state: token index 0, no newline run, no content seen yet.
+    def init(_, lang_mod) do
+      %{
+        idx: 0,
+        nl_run: 0,
+        seen_content: false,
+        last_content: nil,
+        block_end_tokens: CodeQA.Language.block_end_tokens(lang_mod)
+      }
+    end
+
+    # A newline extends the current newline run.
+    def emit(_, {_, %NewlineToken{}, _}, %{idx: position, nl_run: run} = state) do
+      {MapSet.new(), %{state | idx: position + 1, nl_run: run + 1}}
+    end
+
+    # Whitespace only advances the index; it does not break a newline run.
+    def emit(_, {_, %WhitespaceToken{}, _}, %{idx: position} = state) do
+      {MapSet.new(), %{state | idx: position + 1}}
+    end
+
+    # A content token resets the run and becomes the new `last_content`. The
+    # split decision is made against the state as it was before this token.
+    def emit(_, {_, token, _}, %{idx: position} = state) do
+      advanced = %{
+        state
+        | idx: position + 1,
+          nl_run: 0,
+          seen_content: true,
+          last_content: token.content
+      }
+
+      case split_due?(state) do
+        true -> {MapSet.new([{:blank_split, position}]), advanced}
+        false -> {MapSet.new(), advanced}
+      end
+    end
+
+    # True when content was seen earlier, at least two newline tokens have
+    # accumulated, and the previous content token is a block-end token.
+    defp split_due?(state) do
+      case state do
+        %{seen_content: true, nl_run: run, block_end_tokens: enders, last_content: previous}
+        when run >= 2 ->
+          MapSet.member?(enders, previous)
+
+        _ ->
+          false
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/structural/bracket_signal.ex b/lib/codeqa/ast/signals/structural/bracket_signal.ex
new file mode 100644
index 0000000..201f66e
--- /dev/null
+++ b/lib/codeqa/ast/signals/structural/bracket_signal.ex
@@ -0,0 +1,51 @@
+defmodule CodeQA.AST.Signals.Structural.BracketSignal do
+  @moduledoc """
+  Emits `:bracket_enclosure` for each outermost bracket pair `()`, `[]`, `{}`.
+
+  Replaces `ParseRules.BracketRule`.
State tracks: token index, bracket depth,
+  start index of current open bracket, and a stack of open bracket kinds for
+  mismatch detection.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    # Maps each closing bracket kind to the opening kind it must pair with.
+    @close %{")" => "(", "]" => "[", "}" => "{"}
+
+    def source(_), do: CodeQA.AST.Signals.Structural.BracketSignal
+    def group(_), do: :enclosure
+
+    # Starting state: token index 0, depth 0, no open enclosure, empty stack.
+    def init(_, _lang_mod), do: %{idx: 0, depth: 0, start_idx: nil, stack: []}
+
+    # Opening bracket at depth 0: starts an outermost enclosure and records
+    # its token index as `start_idx`.
+    def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: 0, stack: stack} = state)
+        when k in ["(", "[", "{"],
+        do: {MapSet.new(), %{state | idx: idx + 1, depth: 1, start_idx: idx, stack: [k | stack]}}
+
+    # Opening bracket while already nested: only deepens the nesting.
+    def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: d, stack: stack} = state)
+        when k in ["(", "[", "{"],
+        do: {MapSet.new(), %{state | idx: idx + 1, depth: d + 1, stack: [k | stack]}}
+
+    # Closing bracket with a non-empty stack: closes the top entry when the
+    # kinds pair up. On a mismatch only the index advances; depth and stack
+    # are left unchanged.
+    def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: d, stack: [top | rest]} = state)
+        when k in [")", "]", "}"] do
+      base = %{state | idx: idx + 1}
+
+      if @close[k] == top,
+        do: close_match(base, d, state.start_idx, idx, rest),
+        else: {MapSet.new(), base}
+    end
+
+    # Closing bracket with an empty stack: ignored apart from the index.
+    def emit(_, {_, %{kind: k}, _}, %{idx: idx} = state) when k in [")", "]", "}"],
+      do: {MapSet.new(), %{state | idx: idx + 1}}
+
+    # Any other token only advances the index.
+    def emit(_, {_, _, _}, %{idx: idx} = state),
+      do: {MapSet.new(), %{state | idx: idx + 1}}
+
+    # Closing at depth 1 completes an outermost pair: emits the
+    # `{start_idx, idx}` range and resets depth and `start_idx`.
+    defp close_match(state, 1, start_idx, idx, rest),
+      do:
+        {MapSet.new([{:bracket_enclosure, {start_idx, idx}}]),
+         %{state | depth: 0, start_idx: nil, stack: rest}}
+
+    # Closing a nested pair: pops the stack and reduces depth, no emission.
+    defp close_match(state, d, _start_idx, _idx, rest),
+      do: {MapSet.new(), %{state | depth: d - 1, stack: rest}}
+  end
+end
diff --git a/lib/codeqa/ast/signals/structural/branch_split_signal.ex b/lib/codeqa/ast/signals/structural/branch_split_signal.ex
new file mode 100644
index 0000000..1d6d264
--- /dev/null
+++ b/lib/codeqa/ast/signals/structural/branch_split_signal.ex
@@ -0,0 +1,58 @@
+defmodule CodeQA.AST.Signals.Structural.BranchSplitSignal do
+  alias CodeQA.AST.Lexing.NewlineToken
+  alias
CodeQA.AST.Lexing.WhitespaceToken
+
+  @moduledoc """
+  Emits `:branch_split` when a branch keyword appears at bracket depth 0
+  and at least one token has been seen before it.
+
+  Unlike `KeywordSignal`, there is no indentation constraint — branches inside
+  functions are intentionally split into sibling child blocks by the parser's
+  recursive phase.
+
+  Branch keywords are obtained from `CodeQA.Language.branch_keywords/1` for the
+  language module passed to `init/2`.
+  """
+
+  defstruct []
+
+  defimpl CodeQA.AST.Parsing.Signal do
+    def source(_), do: CodeQA.AST.Signals.Structural.BranchSplitSignal
+    def group(_), do: :branch_split
+
+    # Starting state: token index 0, outside any bracket, no content seen.
+    def init(_, lang_mod) do
+      %{
+        idx: 0,
+        bracket_depth: 0,
+        seen_content: false,
+        keywords: CodeQA.Language.branch_keywords(lang_mod)
+      }
+    end
+
+    # Newlines and whitespace only advance the token index.
+    def emit(_, {_, %NewlineToken{}, _}, %{idx: position} = state) do
+      {MapSet.new(), %{state | idx: position + 1}}
+    end
+
+    def emit(_, {_, %WhitespaceToken{}, _}, %{idx: position} = state) do
+      {MapSet.new(), %{state | idx: position + 1}}
+    end
+
+    # Opening bracket: one level deeper; counts as content.
+    def emit(_, {_, %{kind: kind}, _}, %{idx: position, bracket_depth: depth} = state)
+        when kind in ["(", "[", "{"] do
+      {MapSet.new(), %{state | idx: position + 1, bracket_depth: depth + 1, seen_content: true}}
+    end
+
+    # Closing bracket: one level shallower, clamped at zero; counts as content.
+    def emit(_, {_, %{kind: kind}, _}, %{idx: position, bracket_depth: depth} = state)
+        when kind in [")", "]", "}"] do
+      shallower = max(0, depth - 1)
+      {MapSet.new(), %{state | idx: position + 1, bracket_depth: shallower, seen_content: true}}
+    end
+
+    # Any other token: emits a split at its index when it is a branch keyword
+    # judged against the state as it was before this token.
+    def emit(_, {_, token, _}, %{idx: position} = state) do
+      advanced = %{state | idx: position + 1, seen_content: true}
+
+      case branch_keyword?(state, token) do
+        true -> {MapSet.new([{:branch_split, position}]), advanced}
+        false -> {MapSet.new(), advanced}
+      end
+    end
+
+    # True only when content was seen earlier, bracket depth is 0, and the
+    # token's content is in the keyword set.
+    defp branch_keyword?(state, token) do
+      case {state, token} do
+        {%{seen_content: true, bracket_depth: 0, keywords: keywords}, %{content: content}} ->
+          MapSet.member?(keywords, content)
+
+        _ ->
+          false
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/ast/signals/structural/colon_indent_signal.ex b/lib/codeqa/ast/signals/structural/colon_indent_signal.ex
new file mode 100644
index 0000000..9189b79
---
/dev/null +++ b/lib/codeqa/ast/signals/structural/colon_indent_signal.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.AST.Signals.Structural.ColonIndentSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:colon_indent_enclosure` for colon-indented blocks (Python). + + Only active when `opts[:language_module]` returns true for `uses_colon_indent?/0`. Replaces + `ParseRules.ColonIndentationRule`. + + ## Limitation + + The original rule flushes open blocks at EOF via `close_all_open/1`. Since + `emit/3` has no end-of-stream callback, open blocks are instead flushed at + each `NewlineToken` token. This correctly handles single-statement blocks; multi-line + blocks are closed at the first newline (conservative). + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.ColonIndentSignal + def group(_), do: :enclosure + + def init(_, lang_mod) do + %{ + enabled: lang_mod.uses_colon_indent?(), + idx: 0, + ci: 0, + last_colon_indent: nil, + stack: [] + } + end + + def emit(_, _, %{enabled: false} = state), + do: {MapSet.new(), %{state | idx: state.idx + 1}} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state) do + {emissions, _} = flush_stack(state.stack) + {emissions, %{state | idx: idx + 1, ci: 0, stack: []}} + end + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, ci: ci} = state), + do: {MapSet.new(), %{state | idx: idx + 1, ci: ci + 1}} + + def emit(_, {_, %{kind: ":"}, _}, %{idx: idx, ci: ci} = state), + do: {MapSet.new(), %{state | idx: idx + 1, last_colon_indent: ci}} + + def emit(_, {_, _, _}, %{idx: idx, ci: ci} = state) do + {dedent_emissions, remaining} = close_dedented(state.stack, ci) + new_stack = maybe_open_block(remaining, state.last_colon_indent, ci, idx) + + {dedent_emissions, + %{state | idx: idx + 1, last_colon_indent: nil, stack: update_top(new_stack, idx)}} + end + + defp close_dedented(stack, ci) do + {to_close, keep} =
Enum.split_while(stack, fn e -> ci <= e.colon_indent end) + {build_emissions(to_close), keep} + end + + defp flush_stack(stack), do: {build_emissions(stack), []} + + defp maybe_open_block(stack, colon_indent, ci, idx) + when colon_indent != nil and ci > colon_indent, + do: [%{colon_indent: colon_indent, sub_start: idx, last_content_idx: idx} | stack] + + defp maybe_open_block(stack, _, _, _), do: stack + + defp build_emissions(entries) do + Enum.reduce(entries, MapSet.new(), fn + %{sub_start: s, last_content_idx: e}, acc when e != nil -> + MapSet.put(acc, {:colon_indent_enclosure, {s, e}}) + + _entry, acc -> + acc + end) + end + + defp update_top([], _idx), do: [] + defp update_top([top | rest], idx), do: [Map.put(top, :last_content_idx, idx) | rest] + end +end diff --git a/lib/codeqa/ast/signals/structural/comment_divider_signal.ex b/lib/codeqa/ast/signals/structural/comment_divider_signal.ex new file mode 100644 index 0000000..d01e5e8 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/comment_divider_signal.ex @@ -0,0 +1,76 @@ +defmodule CodeQA.AST.Signals.Structural.CommentDividerSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:comment_divider_split` when a line is a "visual divider" comment — + a comment prefix at line start followed immediately by repetitive non-word + punctuation characters. + + Used to detect section separators like `# ---`, `// ===`, `-- ---`. + No split is emitted for the first such line (seen_content must be true). + + When `opts[:language_module]` is set, uses that language's + `comment_prefixes/0` callback. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.CommentDividerSignal + def group(_), do: :split + + def init(_, lang_mod) do + comment_prefixes = MapSet.new(lang_mod.comment_prefixes()) + divider_indicators = CodeQA.Language.divider_indicators(lang_mod) + + %{ + idx: 0, + at_line_start: true, + seen_content: false, + indent: 0, + comment_prefixes: comment_prefixes, + divider_indicators: divider_indicators + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true, indent: 0}} + + def emit( + _, + {_, %WhitespaceToken{}, _}, + %{idx: idx, at_line_start: true, indent: indent} = state + ), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true, indent: indent + 1}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, token, next}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, at_line_start: false, seen_content: true} + + emissions = + if divider_split?(state, token, next), + do: MapSet.new([{:comment_divider_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp divider_split?( + %{ + seen_content: true, + at_line_start: true, + indent: 0, + comment_prefixes: cp, + divider_indicators: di + }, + %{kind: k}, + next + ), + do: MapSet.member?(cp, k) and next != nil and MapSet.member?(di, next.kind) + + defp divider_split?(_, _, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/decorator_signal.ex b/lib/codeqa/ast/signals/structural/decorator_signal.ex new file mode 100644 index 0000000..0dc1f5b --- /dev/null +++ b/lib/codeqa/ast/signals/structural/decorator_signal.ex @@ -0,0 +1,81 @@ +defmodule CodeQA.AST.Signals.Structural.DecoratorSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:decorator_split` when a 
decorator/annotation marker appears at line + start with bracket_depth == 0. + + Detects two patterns: + - `@` at line start (Python, TypeScript, Java, Elixir decorators/annotations) + - `#[` at line start (Rust attribute syntax) + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DecoratorSignal + def group(_), do: :split + + def init(_, _lang_mod), + do: %{idx: 0, bracket_depth: 0, at_line_start: true, seen_content: false} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit( + _, + {_, %{kind: "@"}, _}, + %{idx: idx, seen_content: true, bracket_depth: 0, at_line_start: true} = state + ), + do: + {MapSet.new([{:decorator_split, idx}]), + %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + + def emit( + _, + {_, %{kind: "#"}, next}, + %{idx: idx, seen_content: true, bracket_depth: 0, at_line_start: true} = state + ) do + emissions = + if next != nil and next.kind == "[", + do: MapSet.new([{:decorator_split, idx}]), + else: MapSet.new() + + {emissions, %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + end + + def emit(_, {_, _, _}, %{idx: idx} = 
state), + do: {MapSet.new(), %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + end +end diff --git a/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex b/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex new file mode 100644 index 0000000..d644dad --- /dev/null +++ b/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex @@ -0,0 +1,87 @@ +defmodule CodeQA.AST.Signals.Structural.DedentToZeroSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:dedent_split` when code returns to indent level 0 after having been + at indent > 0 on the previous line. + + This is the primary split mechanism for Python and other indentation-significant + languages. The split fires at the first substantive token on a line that has no + leading `WhitespaceToken`, when the previous line did have leading `WhitespaceToken`. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DedentToZeroSignal + def group(_), do: :split + + def init(_, _lang_mod) do + %{ + idx: 0, + at_line_start: true, + seen_content: false, + current_line_has_indent: false, + current_line_has_content: false, + prev_line_had_indent: false + } + end + + def emit( + _, + {_, %NewlineToken{}, _}, + %{ + idx: idx, + current_line_has_content: clhc, + current_line_has_indent: clhi, + prev_line_had_indent: plhi + } = state + ) do + new_plhi = if clhc, do: clhi, else: plhi + + {MapSet.new(), + %{ + state + | idx: idx + 1, + at_line_start: true, + prev_line_had_indent: new_plhi, + current_line_has_indent: false, + current_line_has_content: false + }} + end + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: + {MapSet.new(), + %{state | idx: idx + 1, current_line_has_indent: true, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, _, _}, %{idx: idx} =
state) do + base = %{ + state + | idx: idx + 1, + at_line_start: false, + seen_content: true, + current_line_has_content: true + } + + emissions = + if dedent_split?(state), do: MapSet.new([{:dedent_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + defp dedent_split?(%{ + at_line_start: true, + current_line_has_indent: false, + prev_line_had_indent: true, + seen_content: true + }), + do: true + + defp dedent_split?(_), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex b/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex new file mode 100644 index 0000000..c5e5c4e --- /dev/null +++ b/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex @@ -0,0 +1,65 @@ +defmodule CodeQA.AST.Signals.Structural.DocCommentLeadSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:doc_comment_split` when a doc-comment opener appears at line start. + + Detects: + - `///` — Rust/C# XML doc comments: `//` token immediately followed by `/` + - `/**` — Java/JS JSDoc: `/` token at line start immediately followed by `*` + + No split is emitted for the first such line (seen_content must be true). 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DocCommentLeadSignal + def group(_), do: :split + + def init(_, _lang_mod), do: %{idx: 0, at_line_start: true, seen_content: false} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit( + _, + {_, %{kind: "//"}, next}, + %{idx: idx, at_line_start: true, seen_content: true} = state + ) do + base = %{state | idx: idx + 1, at_line_start: false} + + emissions = + if next != nil and next.kind == "/", + do: MapSet.new([{:doc_comment_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + def emit( + _, + {_, %{kind: "/"}, next}, + %{idx: idx, at_line_start: true, seen_content: true} = state + ) do + base = %{state | idx: idx + 1, at_line_start: false} + + emissions = + if next != nil and next.kind in ["*", "**"], + do: MapSet.new([{:doc_comment_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: false, seen_content: true}} + end +end diff --git a/lib/codeqa/ast/signals/structural/keyword_signal.ex b/lib/codeqa/ast/signals/structural/keyword_signal.ex new file mode 100644 index 0000000..c13d3cf --- /dev/null +++ b/lib/codeqa/ast/signals/structural/keyword_signal.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.AST.Signals.Structural.KeywordSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:keyword_split` when a declaration keyword appears at bracket depth 0 + and indentation level 0. 
+ + When `opts[:language_module]` is set, uses that language's + `declaration_keywords/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.KeywordSignal + def group(_), do: :split + + def init(_, lang_mod) do + keywords = CodeQA.Language.declaration_keywords(lang_mod) + + %{ + idx: 0, + bracket_depth: 0, + indent: 0, + at_line_start: true, + seen_content: false, + keywords: keywords + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: 0, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, indent: i, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: i + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, seen_content: true, at_line_start: false} + + emissions = + if keyword_split?(state, token), + do: MapSet.new([{:keyword_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp keyword_split?(%{seen_content: true, bracket_depth: 0, indent: 0, keywords: kw}, %{ + content: c + }), + do: MapSet.member?(kw, c) + + defp keyword_split?(_, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/sql_block_signal.ex b/lib/codeqa/ast/signals/structural/sql_block_signal.ex new file mode 100644 index 
0000000..1e376f5 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/sql_block_signal.ex @@ -0,0 +1,55 @@ +defmodule CodeQA.AST.Signals.Structural.SQLBlockSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:sql_block_split` when a SQL DDL or DML statement keyword appears + at line start after prior content has been seen. + + Recognises uppercase and lowercase SQL statement starters: + DDL: CREATE, DROP, ALTER, TRUNCATE + DML: INSERT, UPDATE, DELETE, SELECT + Procedures/transactions: BEGIN, COMMIT, ROLLBACK, CALL, EXECUTE + + When `opts[:language_module]` is set, uses that language's + `statement_keywords/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.SQLBlockSignal + def group(_), do: :split + + def init(_, lang_mod) do + keywords = CodeQA.Language.statement_keywords(lang_mod) + %{idx: 0, at_line_start: true, seen_content: false, keywords: keywords} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: ""} = token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, at_line_start: false, seen_content: true} + + emissions = + if sql_split?(state, token), do: MapSet.new([{:sql_block_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: false, seen_content: true}} + + defp sql_split?(%{seen_content: true, at_line_start: true, keywords: kw}, %{content: c}), + do: MapSet.member?(kw, String.downcase(c)) + + defp sql_split?(_, _), 
do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/triple_quote_signal.ex b/lib/codeqa/ast/signals/structural/triple_quote_signal.ex new file mode 100644 index 0000000..ac5808d --- /dev/null +++ b/lib/codeqa/ast/signals/structural/triple_quote_signal.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.AST.Signals.Structural.TripleQuoteSignal do + @moduledoc """ + Emits `:triple_split` at each `StringToken.doc_kind/0` token boundary. + + The first of each pair marks the opening of a heredoc; the second marks the + token after the closing delimiter. These split values are used by the Parser + to compute protected ranges, preventing other signals' splits from being + applied inside heredoc content. + + Replaces `ParseRules.TripleQuoteRule`. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @doc_kind CodeQA.AST.Lexing.StringToken.doc_kind() + def source(_), do: CodeQA.AST.Signals.Structural.TripleQuoteSignal + def group(_), do: :split + + def init(_, _lang_mod), do: %{idx: 0, inside: false} + + def emit(_, {_, %{kind: @doc_kind}, _}, %{idx: idx, inside: false} = state), + do: {MapSet.new([{:triple_split, idx}]), %{state | idx: idx + 1, inside: true}} + + def emit(_, {_, %{kind: @doc_kind}, _}, %{idx: idx, inside: true} = state), + do: {MapSet.new([{:triple_split, idx + 1}]), %{state | idx: idx + 1, inside: false}} + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + end +end diff --git a/lib/codeqa/block_impact/codebase_impact.ex b/lib/codeqa/block_impact/codebase_impact.ex new file mode 100644 index 0000000..50fa5ba --- /dev/null +++ b/lib/codeqa/block_impact/codebase_impact.ex @@ -0,0 +1,22 @@ +defmodule CodeQA.BlockImpact.CodebaseImpact do + @moduledoc """ + Leave-one-out codebase aggregate: reconstruct file content without a target node, + replace the file in the files map, and re-run the codebase aggregate.
+ """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.BlockImpact.FileImpact + alias CodeQA.Engine.Analyzer + + @doc """ + Returns the codebase aggregate after removing the target node from the given file. + """ + @spec compute(String.t(), String.t(), Node.t(), map()) :: map() + def compute(path, content, node, files_map) do + root_tokens = TokenNormalizer.normalize_structural(content) + reconstructed = FileImpact.reconstruct_without(root_tokens, node) + updated_files = Map.put(files_map, path, reconstructed) + Analyzer.analyze_codebase_aggregate(updated_files) + end +end diff --git a/lib/codeqa/block_impact/file_impact.ex b/lib/codeqa/block_impact/file_impact.ex new file mode 100644 index 0000000..10bd1f9 --- /dev/null +++ b/lib/codeqa/block_impact/file_impact.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.BlockImpact.FileImpact do + @moduledoc """ + Leave-one-out file metrics: reconstruct file content without a target node's tokens + and return the re-run file metrics map. + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.Engine.Analyzer + + @min_tokens 10 + + @doc """ + Computes file metrics for the content with the target node's tokens removed. + + Returns `nil` if the node has fewer than `#{@min_tokens}` tokens. + Returns a raw `%{"group" => %{"key" => value}}` metrics map otherwise. 
+ """ + @spec compute(String.t(), Node.t()) :: map() | nil + def compute(_content, %Node{tokens: tokens}) when length(tokens) < @min_tokens, do: nil + + def compute(content, node) do + root_tokens = TokenNormalizer.normalize_structural(content) + reconstructed = reconstruct_without(root_tokens, node) + Analyzer.analyze_file("", reconstructed) + end + + @spec reconstruct_without([CodeQA.AST.Lexing.Token.t()], Node.t()) :: String.t() + def reconstruct_without(root_tokens, %Node{tokens: []}) do + Enum.map_join(root_tokens, "", & &1.content) + end + + def reconstruct_without(root_tokens, node) do + first = List.first(node.tokens) + + case Enum.find_index(root_tokens, fn t -> t.line == first.line and t.col == first.col end) do + nil -> + Enum.map_join(root_tokens, "", & &1.content) + + start_idx -> + end_idx = start_idx + length(node.tokens) + remaining = Enum.take(root_tokens, start_idx) ++ Enum.drop(root_tokens, end_idx) + Enum.map_join(remaining, "", & &1.content) + end + end +end diff --git a/lib/codeqa/block_impact/refactoring_potentials.ex b/lib/codeqa/block_impact/refactoring_potentials.ex new file mode 100644 index 0000000..0dc1738 --- /dev/null +++ b/lib/codeqa/block_impact/refactoring_potentials.ex @@ -0,0 +1,111 @@ +defmodule CodeQA.BlockImpact.RefactoringPotentials do + @moduledoc """ + Computes named refactoring potentials for a code block using leave-one-out cosine deltas. + + Given baseline and without-node metrics at both file scope and codebase scope, + computes the cosine delta per behavior, merges the two scopes via max(), and + returns the top N behaviors sorted by delta descending. + + Positive delta = removing the block improved that behavior's cosine → the block + is a contributor to that anti-pattern. + """ + + alias CodeQA.CombinedMetrics.FileScorer + alias CodeQA.CombinedMetrics.SampleRunner + + @doc """ + Returns top N refactoring potentials for a code block. 
+ + ## Parameters + + - `baseline_file_cosines` — pre-computed cosines list from `SampleRunner.diagnose_aggregate/2` for the baseline file + - `without_file_metrics` — raw `%{"group" => %{"key" => val}}` with the node's tokens removed + - `baseline_codebase_cosines` — pre-computed cosines list for the full codebase baseline + - `without_codebase_agg` — `%{"group" => %{"mean_key" => val}}` with the node removed from the codebase + + ## Options + + - `:top` — number of potentials to return (default 3) + + ## Result shape + + [%{"category" => "function_design", "behavior" => "cyclomatic_complexity_under_10", "cosine_delta" => 0.41}] + """ + @spec compute([map()], map(), [map()], map(), keyword()) :: [map()] + def compute( + baseline_file_cosines, + without_file_metrics, + baseline_codebase_cosines, + without_codebase_agg, + opts \\ [] + ) do + top_n = Keyword.get(opts, :top, 3) + language = Keyword.get(opts, :language) + languages = Keyword.get(opts, :languages) + behavior_map = Keyword.get(opts, :behavior_map) + + file_delta = + compute_file_delta(baseline_file_cosines, without_file_metrics, language, behavior_map) + + codebase_delta = + compute_codebase_delta( + baseline_codebase_cosines, + without_codebase_agg, + languages, + behavior_map + ) + + all_keys = Enum.uniq(Map.keys(file_delta) ++ Map.keys(codebase_delta)) + + all_keys + |> Enum.map(fn {category, behavior} -> + file_d = Map.get(file_delta, {category, behavior}, 0.0) + codebase_d = Map.get(codebase_delta, {category, behavior}, 0.0) + merged = max(file_d, codebase_d) + {category, behavior, merged} + end) + |> Enum.sort_by(fn {_, _, delta} -> delta end, :desc) + |> Enum.take(top_n) + |> Enum.map(fn {category, behavior, delta} -> + %{ + "category" => category, + "behavior" => behavior, + "cosine_delta" => Float.round(delta / 1.0, 4) + } + end) + end + + defp compute_file_delta(baseline_cosines, without_metrics, language, behavior_map) do + without_agg = FileScorer.file_to_aggregate(without_metrics) + + 
without_cosines = + SampleRunner.diagnose_aggregate(without_agg, + top: 99_999, + language: language, + behavior_map: behavior_map + ) + + cosines_to_delta(baseline_cosines, without_cosines) + end + + defp compute_codebase_delta(baseline_cosines, without_agg, languages, behavior_map) do + without_cosines = + SampleRunner.diagnose_aggregate(without_agg, + top: 99_999, + languages: languages, + behavior_map: behavior_map + ) + + cosines_to_delta(baseline_cosines, without_cosines) + end + + defp cosines_to_delta(baseline_cosines, without_cosines) do + without_map = + Map.new(without_cosines, fn %{category: c, behavior: b, cosine: cos} -> {{c, b}, cos} end) + + Map.new(baseline_cosines, fn %{category: c, behavior: b, cosine: cos} -> + without_cos = Map.get(without_map, {c, b}, 0.0) + {{c, b}, without_cos - cos} + end) + end +end diff --git a/lib/codeqa/block_impact_analyzer.ex b/lib/codeqa/block_impact_analyzer.ex new file mode 100644 index 0000000..aaebdf6 --- /dev/null +++ b/lib/codeqa/block_impact_analyzer.ex @@ -0,0 +1,429 @@ +defmodule CodeQA.BlockImpactAnalyzer do + @moduledoc """ + Orchestrates block impact analysis across all files in a pipeline result. + + For each file, tokenizes its content, parses it into a node tree, and for each + node (recursively including children) computes refactoring potentials via + leave-one-out impact scoring at both file scope and codebase scope. + + The pipeline result is returned with a `"nodes"` key added to each file entry. + All other keys in the result are preserved unchanged. 
+ + ## Telemetry + + Emits the following events (all durations in microseconds): + + - `[:codeqa, :block_impact, :analyze]` — full run + measurements: `%{duration: us}` + metadata: `%{file_count: n}` + + - `[:codeqa, :block_impact, :codebase_cosines]` — codebase baseline cosine computation + measurements: `%{duration: us}` + metadata: `%{behavior_count: n}` + + - `[:codeqa, :block_impact, :file]` — per-file node computation + measurements: `%{duration: us, tokenize_us: us, parse_us: us, file_cosines_us: us, node_count: n}` + metadata: `%{path: string}` + + - `[:codeqa, :block_impact, :node]` — per-node leave-one-out computation + measurements: `%{duration: us, reconstruct_us: us, analyze_file_us: us, aggregate_us: us, refactoring_us: us}` + metadata: `%{path: string, token_count: n}` + """ + + alias CodeQA.Analysis.BehaviorConfigServer + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.BlockImpact.{FileImpact, RefactoringPotentials} + alias CodeQA.CombinedMetrics.{FileScorer, SampleRunner} + alias CodeQA.Engine.Analyzer + alias CodeQA.Languages.Unknown + + @min_tokens 10 + + @doc """ + Analyzes all files in the pipeline result, adding `"nodes"` to each file entry. 
+ + ## Parameters + + - `pipeline_result` — direct return value of `Engine.Analyzer.analyze_codebase/2`, + containing `"files"` and `"codebase"` keys + - `files_map` — raw `%{path => content}` map used for file-scope leave-one-out + - `opts` — keyword options + + ## Options + + - `:nodes_top` — number of refactoring potentials per node (default 3) + - `:workers` — parallelism for `Task.async_stream` (default `System.schedulers_online()`) + - `:baseline_codebase_agg` — pre-computed codebase aggregate (skips redundant analysis) + """ + @spec analyze(map(), map(), keyword()) :: map() + def analyze(pipeline_result, files_map, opts \\ []) do + nodes_top = Keyword.get(opts, :nodes_top, 3) + workers = Keyword.get(opts, :workers, System.schedulers_online()) + + t0 = now() + + baseline_codebase_agg = + Keyword.get_lazy(opts, :baseline_codebase_agg, fn -> + Analyzer.analyze_codebase_aggregate(files_map) + end) + + cached_behaviors = + case Keyword.get(opts, :behavior_config_pid) do + nil -> nil + pid -> BehaviorConfigServer.get_all_behaviors(pid) + end + + project_langs = project_languages(files_map) + + filtered_behaviors = + if cached_behaviors && project_langs != [] do + filter_behaviors_by_languages(cached_behaviors, project_langs) + else + cached_behaviors + end + + {baseline_codebase_cosines, cosines_us} = + timed(fn -> + SampleRunner.diagnose_aggregate(baseline_codebase_agg, + top: 99_999, + languages: project_langs, + behavior_map: filtered_behaviors + ) + end) + + :telemetry.execute( + [:codeqa, :block_impact, :codebase_cosines], + %{duration: cosines_us}, + %{behavior_count: length(baseline_codebase_cosines)} + ) + + file_results = pipeline_result["files"] + + updated_files = + file_results + |> Task.async_stream( + fn {path, file_data} -> + content = Map.get(files_map, path, "") + baseline_file_metrics = Map.get(file_data, "metrics", %{}) + + {nodes, file_measurements} = + compute_nodes_timed( + path, + content, + baseline_file_metrics, + file_results, + 
baseline_codebase_cosines, + nodes_top, + filtered_behaviors + ) + + :telemetry.execute( + [:codeqa, :block_impact, :file], + file_measurements, + %{path: path} + ) + + {path, Map.put(file_data, "nodes", nodes)} + end, + max_concurrency: workers, + ordered: false, + timeout: :infinity + ) + |> Enum.reduce(%{}, fn {:ok, {path, data}}, acc -> Map.put(acc, path, data) end) + + :telemetry.execute( + [:codeqa, :block_impact, :analyze], + %{duration: now() - t0}, + %{file_count: map_size(file_results)} + ) + + Map.put(pipeline_result, "files", updated_files) + end + + defp compute_nodes_timed( + path, + content, + baseline_file_metrics, + file_results, + baseline_codebase_cosines, + nodes_top, + cached_behaviors + ) do + if content == "" do + {[], %{duration: 0, tokenize_us: 0, parse_us: 0, file_cosines_us: 0, node_count: 0}} + else + t0 = now() + + {root_tokens, tokenize_us} = timed(fn -> TokenNormalizer.normalize_structural(content) end) + {top_level_nodes, parse_us} = timed(fn -> Parser.detect_blocks(root_tokens, Unknown) end) + + baseline_file_agg = FileScorer.file_to_aggregate(baseline_file_metrics) + language = CodeQA.Language.detect(path).name() + + {baseline_file_cosines, file_cosines_us} = + timed(fn -> + SampleRunner.diagnose_aggregate(baseline_file_agg, + top: 99_999, + language: language, + behavior_map: cached_behaviors + ) + end) + + inc_agg = build_incremental_agg(file_results) + old_file_triples = file_metrics_to_triples(baseline_file_metrics) + project_langs = project_languages(file_results) + + node_ctx = %{ + inc_agg: inc_agg, + old_file_triples: old_file_triples, + project_langs: project_langs, + cached_behaviors: cached_behaviors + } + + nodes = + top_level_nodes + |> Enum.map(fn node -> + serialize_node( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) + end) + |> Enum.sort_by(fn n -> {n["start_line"], n["column_start"]} end) + + measurements = %{ + duration: now() - t0, + 
tokenize_us: tokenize_us, + parse_us: parse_us, + file_cosines_us: file_cosines_us, + node_count: length(top_level_nodes) + } + + {nodes, measurements} + end + end + + defp serialize_node( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) do + potentials = + if length(node.tokens) < @min_tokens do + [] + else + compute_potentials_timed( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) + end + + children = + node.children + |> Enum.map(fn child -> + serialize_node( + child, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) + end) + |> Enum.sort_by(fn n -> {n["start_line"], n["column_start"]} end) + + first_token = List.first(node.tokens) + char_length = Enum.reduce(node.tokens, 0, fn t, acc -> acc + byte_size(t.content) end) + + %{ + "start_line" => node.start_line, + "end_line" => node.end_line, + "column_start" => (first_token && first_token.col) || 0, + "char_length" => char_length, + "type" => Atom.to_string(node.type), + "token_count" => length(node.tokens), + "refactoring_potentials" => potentials, + "children" => children + } + end + + defp compute_potentials_timed( + %Node{} = node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) do + t0 = now() + + {reconstructed, reconstruct_us} = + timed(fn -> FileImpact.reconstruct_without(root_tokens, node) end) + + {without_file_metrics, analyze_file_us} = + timed(fn -> Analyzer.analyze_file_for_loo(path, reconstructed) end) + + {without_codebase_agg, aggregate_us} = + timed(fn -> + new_triples = file_metrics_to_triples(without_file_metrics) + + node_ctx.inc_agg + |> swap_file_in_agg(node_ctx.old_file_triples, new_triples) + |> incremental_agg_to_aggregate() + end) + + {potentials, refactoring_us} = + timed(fn -> + 
RefactoringPotentials.compute( + baseline_file_cosines, + without_file_metrics, + baseline_codebase_cosines, + without_codebase_agg, + top: nodes_top, + language: language, + languages: node_ctx.project_langs, + behavior_map: node_ctx.cached_behaviors + ) + end) + + :telemetry.execute( + [:codeqa, :block_impact, :node], + %{ + duration: now() - t0, + reconstruct_us: reconstruct_us, + analyze_file_us: analyze_file_us, + aggregate_us: aggregate_us, + refactoring_us: refactoring_us + }, + %{path: path, token_count: length(node.tokens)} + ) + + potentials + end + + defp file_metrics_to_triples(metrics) when is_map(metrics) do + metrics + |> Enum.flat_map(fn + {metric_name, metric_data} when is_map(metric_data) -> + metric_data + |> Enum.filter(fn {_k, v} -> is_number(v) end) + |> Enum.map(fn {key, value} -> {metric_name, key, value / 1} end) + + _ -> + [] + end) + end + + defp build_incremental_agg(file_results) do + file_results + |> Map.values() + |> Enum.flat_map(fn file_data -> + file_data |> Map.get("metrics", %{}) |> file_metrics_to_triples() + end) + |> Enum.group_by(fn {metric, key, _val} -> {metric, key} end, fn {_, _, val} -> val end) + |> Map.new(fn {{metric, key}, values} -> + n = length(values) + sum = Enum.sum(values) + sum_sq = Enum.reduce(values, 0.0, fn v, acc -> acc + v * v end) + + {{metric, key}, + %{sum: sum, sum_sq: sum_sq, min: Enum.min(values), max: Enum.max(values), count: n}} + end) + end + + defp swap_file_in_agg(inc_agg, old_triples, new_triples) do + old_map = Map.new(old_triples, fn {metric, key, val} -> {{metric, key}, val} end) + new_map = Map.new(new_triples, fn {metric, key, val} -> {{metric, key}, val} end) + all_keys = Enum.uniq(Map.keys(old_map) ++ Map.keys(new_map)) + + Enum.reduce(all_keys, inc_agg, fn mk, acc -> + case Map.get(acc, mk) do + nil -> + acc + + state -> + old_val = Map.get(old_map, mk, 0.0) + new_val = Map.get(new_map, mk, 0.0) + + Map.put(acc, mk, %{ + sum: state.sum - old_val + new_val, + sum_sq: state.sum_sq - 
old_val * old_val + new_val * new_val, + min: min(state.min, new_val), + max: max(state.max, new_val), + count: state.count + }) + end + end) + end + + defp incremental_agg_to_aggregate(inc_agg) do + Enum.reduce(inc_agg, %{}, fn {{metric, key}, state}, acc -> + n = state.count + mean = if n > 0, do: state.sum / n, else: 0.0 + variance = if n > 0, do: max(state.sum_sq / n - mean * mean, 0.0), else: 0.0 + std = :math.sqrt(variance) + + metric_agg = Map.get(acc, metric, %{}) + + updated = + Map.merge(metric_agg, %{ + "mean_#{key}" => Float.round(mean * 1.0, 4), + "std_#{key}" => Float.round(std * 1.0, 4), + "min_#{key}" => Float.round(state.min * 1.0, 4), + "max_#{key}" => Float.round(state.max * 1.0, 4) + }) + + Map.put(acc, metric, updated) + end) + end + + defp filter_behaviors_by_languages(behaviors_map, project_langs) do + Map.new(behaviors_map, fn {category, behaviors} -> + filtered = + Enum.filter(behaviors, fn {_behavior, behavior_data} -> + behavior_langs = Map.get(behavior_data, "_languages", []) + behavior_langs == [] or Enum.any?(behavior_langs, &(&1 in project_langs)) + end) + + {category, filtered} + end) + end + + defp project_languages(path_keyed_map) do + path_keyed_map + |> Map.keys() + |> Enum.map(&CodeQA.Language.detect(&1).name()) + |> Enum.reject(&(&1 == "unknown")) + |> Enum.uniq() + end + + defp timed(fun) do + t = now() + result = fun.() + {result, now() - t} + end + + defp now, do: System.monotonic_time(:microsecond) +end diff --git a/lib/codeqa/cli.ex b/lib/codeqa/cli.ex index 210654d..3e36d57 100644 --- a/lib/codeqa/cli.ex +++ b/lib/codeqa/cli.ex @@ -3,27 +3,32 @@ defmodule CodeQA.CLI do @commands %{ "analyze" => CodeQA.CLI.Analyze, - "compare" => CodeQA.CLI.Compare, "history" => CodeQA.CLI.History, "correlate" => CodeQA.CLI.Correlate, - "stopwords" => CodeQA.CLI.Stopwords, - "health-report" => CodeQA.CLI.HealthReport + "health-report" => CodeQA.CLI.HealthReport, + "diagnose" => CodeQA.CLI.Diagnose } def main(args) do case args do - [cmd | 
rest] when is_map_key(@commands, cmd) -> @commands[cmd].run(rest) - _ -> print_usage() + [cmd | rest] when is_map_key(@commands, cmd) -> + output = @commands[cmd].run(rest) + unless output == "", do: IO.puts(output) + output + + _ -> + output = build_usage() + IO.puts(output) + output end end - defp print_usage do + defp build_usage do command_usages = @commands |> Enum.sort_by(fn {name, _} -> name end) - |> Enum.map(fn {_name, mod} -> mod.usage() end) - |> Enum.join("\n") + |> Enum.map_join("\n", fn {_name, mod} -> mod.usage() end) - IO.puts("Usage: codeqa [options]\n\n" <> command_usages) + "Usage: codeqa [options]\n\n" <> command_usages end end diff --git a/lib/codeqa/cli/analyze.ex b/lib/codeqa/cli/analyze.ex index 4473011..9c1f840 100644 --- a/lib/codeqa/cli/analyze.ex +++ b/lib/codeqa/cli/analyze.ex @@ -4,6 +4,9 @@ defmodule CodeQA.CLI.Analyze do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector @version "0.1.0" @@ -32,19 +35,18 @@ defmodule CodeQA.CLI.Analyze do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do {opts, [path], _} = - Options.parse(args, [output: :string], [o: :output]) - - if opts[:telemetry], do: CodeQA.Telemetry.setup() + Options.parse(args, [output: :string], o: :output) Options.validate_dir!(path) + Config.load(path) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) + files = + Collector.collect_files(path, Options.parse_ignore_paths(opts[:ignore_paths])) if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found in '#{path}'") @@ -53,10 +55,11 @@ defmodule CodeQA.CLI.Analyze do print_progress(opts, files) - analyze_opts = Options.build_analyze_opts(opts) + analyze_opts = + Options.build_analyze_opts(opts) ++ 
Config.near_duplicate_blocks_opts() start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) IO.puts(:stderr, "Analysis completed in #{end_time - start_time}ms") @@ -80,14 +83,13 @@ defmodule CodeQA.CLI.Analyze do case opts[:output] do nil -> - IO.puts(json) + json file -> File.write!(file, json) IO.puts(:stderr, "Report written to #{file}") + "" end - - if opts[:telemetry], do: CodeQA.Telemetry.print_report() end defp print_progress(opts, files) do diff --git a/lib/codeqa/cli/command.ex b/lib/codeqa/cli/command.ex index e2702a1..c6cd4a1 100644 --- a/lib/codeqa/cli/command.ex +++ b/lib/codeqa/cli/command.ex @@ -1,6 +1,6 @@ defmodule CodeQA.CLI.Command do @moduledoc "Behaviour for CLI commands." - @callback run([String.t()]) :: :ok + @callback run([String.t()]) :: String.t() @callback usage() :: String.t() end diff --git a/lib/codeqa/cli/compare.ex b/lib/codeqa/cli/compare.ex deleted file mode 100644 index b86bc32..0000000 --- a/lib/codeqa/cli/compare.ex +++ /dev/null @@ -1,242 +0,0 @@ -defmodule CodeQA.CLI.Compare do - @moduledoc false - - @behaviour CodeQA.CLI.Command - - alias CodeQA.CLI.Options - - @version "0.1.0" - - @impl CodeQA.CLI.Command - def usage do - """ - Usage: codeqa compare [options] - - Compare code quality metrics between two git refs. 
- - Options: - --base-ref REF Base git ref to compare from (required) - --head-ref REF Head git ref to compare to (default: HEAD) - --changes-only Only analyze changed files - --all-files Analyze all source files (default) - --format FORMAT Output format: json, markdown, or github (default: json) - --output MODE Output mode: auto, summary, or changes (default: auto) - --progress Show per-file progress on stderr - -w, --workers N Number of parallel workers - --cache Enable caching file metrics - --cache-dir DIR Directory to store cache (default: .codeqa_cache) - -t, --timeout MS Timeout for similarity analysis (default: 5000) - --show-ncd Compute and show NCD similarity metric - --ncd-top N Number of top similar files to show per file - --ncd-paths PATHS Comma-separated list of paths to compute NCD for - --show-files Include individual file metrics in the output - --show-file-paths P Comma-separated list of paths to include in the output - --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. 
"test/*,docs/*") - """ - end - - @impl CodeQA.CLI.Command - def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) - end - - def run(args) do - {opts, [path], _} = - Options.parse(args, - [ - base_ref: :string, - head_ref: :string, - changes_only: :boolean, - all_files: :boolean, - format: :string, - output: :string - ], - [] - ) - - if opts[:telemetry], do: CodeQA.Telemetry.setup() - - base_ref = opts[:base_ref] || raise "Missing --base-ref" - head_ref = opts[:head_ref] || "HEAD" - changes_only = if opts[:changes_only], do: true, else: false - format = opts[:format] || "json" - output_mode = opts[:output] || "auto" - - Options.validate_dir!(path) - - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - opts = Keyword.put(opts, :ignore_patterns, ignore_patterns) - - {base_result, head_result, changes} = - run_comparison(path, base_ref, head_ref, changes_only, opts) - - comparison = - CodeQA.Comparator.compare_results(base_result, head_result, changes) - |> enrich_metadata(base_ref, head_ref, changes_only) - |> filter_files_for_output(opts, format) - - output_comparison(comparison, format, output_mode) - - if opts[:telemetry], do: CodeQA.Telemetry.print_report() - end - - defp run_comparison(path, base_ref, head_ref, changes_only, opts) do - ignore_patterns = opts[:ignore_patterns] || [] - changes = CodeQA.Git.changed_files(path, base_ref, head_ref) - changes = CodeQA.Collector.reject_ignored(changes, ignore_patterns, & &1.path) - - file_paths = - if changes_only do - IO.puts(:stderr, "Comparing #{length(changes)} changed files...") - Enum.map(changes, & &1.path) - else - IO.puts(:stderr, "Comparing all source files...") - nil - end - - empty = %{"files" => %{}, "codebase" => %{"aggregate" => %{}, "similarity" => %{}}} - - if changes_only and length(changes) == 0 do - IO.puts(:stderr, "No source files changed — nothing to compare.") - {empty, empty, []} - else - base_files = 
CodeQA.Git.collect_files_at_ref(path, base_ref, file_paths) - head_files = CodeQA.Git.collect_files_at_ref(path, head_ref, file_paths) - base_files = CodeQA.Collector.reject_ignored_map(base_files, ignore_patterns) - head_files = CodeQA.Collector.reject_ignored_map(head_files, ignore_patterns) - - if map_size(base_files) == 0 and map_size(head_files) == 0 do - IO.puts(:stderr, "Warning: no source files found at either ref") - exit({:shutdown, 1}) - end - - print_progress(opts, base_files, head_files) - - analyze_opts = Options.build_analyze_opts(opts) - - base_result = - if map_size(base_files) > 0, - do: CodeQA.Analyzer.analyze_codebase(base_files, analyze_opts), - else: empty - - head_result = - if map_size(head_files) > 0, - do: CodeQA.Analyzer.analyze_codebase(head_files, analyze_opts), - else: empty - - changes = if changes_only, do: changes, else: synthesize_changes(base_files, head_files) - - {base_result, head_result, changes} - end - end - - defp print_progress(opts, base_files, head_files) do - if opts[:progress] do - step_prefix = if opts[:show_ncd], do: "1/5 ", else: "1/1 " - - IO.puts( - :stderr, - " #{step_prefix}Analyzing base (#{map_size(base_files)} files) and head (#{map_size(head_files)} files)..." - ) - else - IO.puts( - :stderr, - "Analyzing base (#{map_size(base_files)} files) and head (#{map_size(head_files)} files)..." 
- ) - end - end - - defp enrich_metadata(comparison, base_ref, head_ref, changes_only) do - comparison - |> put_in(["metadata", "base_ref"], base_ref) - |> put_in(["metadata", "head_ref"], head_ref) - |> put_in(["metadata", "changes_only"], changes_only) - |> put_in(["metadata", "version"], @version) - |> put_in(["metadata", "timestamp"], DateTime.utc_now() |> DateTime.to_iso8601()) - end - - defp output_comparison(comparison, "markdown", output_mode) do - IO.puts(CodeQA.Formatter.format_markdown(comparison, output_mode)) - end - - defp output_comparison(comparison, "github", output_mode) do - IO.puts(CodeQA.Formatter.format_github(comparison, output_mode)) - end - - defp output_comparison(comparison, _format, output_mode) do - codebase_summary = CodeQA.Summarizer.summarize_codebase(comparison) - - file_summaries = - Map.new(Map.get(comparison, "files", %{}), fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - IO.puts( - Jason.encode!(build_json_output(comparison, codebase_summary, file_summaries, output_mode), - pretty: true - ) - ) - end - - defp build_json_output(comparison, codebase_summary, file_summaries, output_mode) do - result = %{"metadata" => comparison["metadata"]} - - result = - if output_mode in ["auto", "summary"] do - result - |> Map.put("summary", codebase_summary) - |> Map.put("codebase", comparison["codebase"]) - else - result - end - - if output_mode in ["auto", "changes"] and Map.has_key?(comparison, "files") do - files_with_summaries = - Map.new(comparison["files"], fn {path, data} -> - {path, Map.put(data, "summary", Map.get(file_summaries, path, %{}))} - end) - - Map.put(result, "files", files_with_summaries) - else - result - end - end - - defp synthesize_changes(base_files, head_files) do - all_paths = MapSet.union(MapSet.new(Map.keys(base_files)), MapSet.new(Map.keys(head_files))) - - all_paths - |> Enum.sort() - |> Enum.map(fn path -> - status = - cond do - Map.has_key?(base_files, path) and 
Map.has_key?(head_files, path) -> "modified" - Map.has_key?(head_files, path) -> "added" - true -> "deleted" - end - - %CodeQA.Git.ChangedFile{path: path, status: status} - end) - end - - defp filter_files_for_output(results, _opts, format) when format in ["github", "markdown"], - do: results - - defp filter_files_for_output(results, opts, _format) do - cond do - opts[:show_files] -> - results - - opts[:show_file_paths] -> - target_paths = String.split(opts[:show_file_paths], ",") |> MapSet.new() - - filtered = - Map.filter(results["files"], fn {path, _} -> MapSet.member?(target_paths, path) end) - - Map.put(results, "files", filtered) - - true -> - Map.delete(results, "files") - end - end -end diff --git a/lib/codeqa/cli/correlate.ex b/lib/codeqa/cli/correlate.ex index a3fd2f7..c38a248 100644 --- a/lib/codeqa/cli/correlate.ex +++ b/lib/codeqa/cli/correlate.ex @@ -4,6 +4,7 @@ defmodule CodeQA.CLI.Correlate do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.CLI.UI @impl CodeQA.CLI.Command def usage do @@ -25,7 +26,7 @@ defmodule CodeQA.CLI.Correlate do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do @@ -82,7 +83,7 @@ defmodule CodeQA.CLI.Correlate do sorted = Enum.sort_by(correlations, &abs(&1["correlation"]), :desc) top = Enum.take(sorted, top_n) - IO.puts(Jason.encode!(top, pretty: true)) + Jason.encode!(top, pretty: true) end defp extract_metric_series(path, files) do @@ -204,7 +205,16 @@ defmodule CodeQA.CLI.Correlate do pairs_stream |> Task.async_stream( - &correlate_pair(&1, counter, total_pairs, update_interval, total_start, series, category_map, opts), + &correlate_pair( + &1, + counter, + total_pairs, + update_interval, + total_start, + series, + category_map, + opts + ), max_concurrency: System.schedulers_online(), timeout: :infinity ) @@ -257,9 +267,7 @@ defmodule CodeQA.CLI.Correlate do eta_ms = round((total_pairs - current) * avg_time) output = - 
CodeQA.CLI.UI.progress_bar(current, total_pairs, - eta: CodeQA.CLI.UI.format_eta(eta_ms) - ) + UI.progress_bar(current, total_pairs, eta: UI.format_eta(eta_ms)) IO.write(:stderr, "\r" <> output) if current == total_pairs, do: IO.puts(:stderr, "") diff --git a/lib/codeqa/cli/diagnose.ex b/lib/codeqa/cli/diagnose.ex new file mode 100644 index 0000000..93c2e8d --- /dev/null +++ b/lib/codeqa/cli/diagnose.ex @@ -0,0 +1,71 @@ +defmodule CodeQA.CLI.Diagnose do + @moduledoc false + + @behaviour CodeQA.CLI.Command + + @impl CodeQA.CLI.Command + def usage do + """ + Usage: codeqa diagnose [options] + + Diagnose likely code quality issues using cosine similarity against behavior profiles. + + Options: + --path PATH File or directory path to analyze (required) + --mode MODE Output mode: aggregate (default) or per-file + --top N Number of top issues to display (default: 15) + --format FORMAT Output format: plain (default) or json + --combined-top N Number of worst offender files per behavior (default: 2) + """ + end + + @impl CodeQA.CLI.Command + def run(args) when args in [["--help"], ["-h"]] do + usage() + end + + def run(args) do + {opts, _, _} = + OptionParser.parse(args, + strict: [ + path: :string, + mode: :string, + top: :integer, + format: :string, + combined_top: :integer + ] + ) + + path = opts[:path] + + unless path do + IO.puts(:stderr, "Error: --path required") + exit({:shutdown, 1}) + end + + unless File.exists?(path) do + IO.puts(:stderr, "Error: '#{path}' does not exist") + exit({:shutdown, 1}) + end + + mode = + case opts[:mode] do + "per-file" -> :per_file + _ -> :aggregate + end + + format = + case opts[:format] do + "json" -> :json + _ -> :plain + end + + CodeQA.Diagnostics.run( + path: path, + mode: mode, + top: opts[:top] || 15, + format: format, + combined_top: opts[:combined_top] || 2 + ) + end +end diff --git a/lib/codeqa/cli/health_report.ex b/lib/codeqa/cli/health_report.ex index 8f39186..a21a743 100644 --- a/lib/codeqa/cli/health_report.ex +++ 
b/lib/codeqa/cli/health_report.ex @@ -4,6 +4,11 @@ defmodule CodeQA.CLI.HealthReport do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.Git + alias CodeQA.HealthReport @impl CodeQA.CLI.Command def usage do @@ -24,33 +29,40 @@ defmodule CodeQA.CLI.HealthReport do --cache-dir DIR Directory to store cache (default: .codeqa_cache) -t, --timeout MS Timeout for similarity analysis (default: 5000) --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. "test/*,docs/*") + --base-ref REF Base git ref for PR comparison (enables delta and block scoping) + --head-ref REF Head git ref (default: HEAD) + --comment Multi-part mode: writes numbered part files to TMPDIR for PR comments """ end @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end - def run(args) do - {opts, [path], _} = - Options.parse(args, - [ - output: :string, - config: :string, - detail: :string, - top: :integer, - format: :string - ], - [o: :output] - ) - - if opts[:telemetry], do: CodeQA.Telemetry.setup() + @command_options [ + output: :string, + config: :string, + detail: :string, + top: :integer, + format: :string, + ignore_paths: :string, + base_ref: :string, + head_ref: :string, + telemetry: :boolean, + comment: :boolean + ] + def run(args) do + {opts, [path], _} = Options.parse(args, @command_options, o: :output) Options.validate_dir!(path) + extra_ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) + base_ref = opts[:base_ref] + head_ref = opts[:head_ref] || "HEAD" + + files = + Collector.collect_files(path, extra_ignore_patterns) if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found 
in '#{path}'") @@ -59,14 +71,20 @@ defmodule CodeQA.CLI.HealthReport do IO.puts(:stderr, "Analyzing #{map_size(files)} files for health report...") - analyze_opts = Options.build_analyze_opts(opts) + telemetry_pid = if opts[:telemetry], do: attach_block_impact_telemetry() + + analyze_opts = + Options.build_analyze_opts(opts) ++ + Config.near_duplicate_blocks_opts() ++ [compute_nodes: true] start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) IO.puts(:stderr, "Analysis completed in #{end_time - start_time}ms") + if telemetry_pid, do: print_block_impact_telemetry(telemetry_pid) + total_bytes = results["files"] |> Map.values() |> Enum.map(& &1["bytes"]) |> Enum.sum() results = @@ -77,29 +95,91 @@ defmodule CodeQA.CLI.HealthReport do "total_bytes" => total_bytes }) + {base_results, changed_files, diff_line_ranges} = + if base_ref do + IO.puts(:stderr, "Collecting base snapshot at #{base_ref}...") + base_files = Git.collect_files_at_ref(path, base_ref) + changed = Git.changed_files(path, base_ref, head_ref) + + diff_ranges = + case Git.diff_line_ranges(path, base_ref, head_ref) do + {:ok, ranges} -> + ranges + + {:error, reason} -> + IO.puts(:stderr, "Warning: failed to parse diff line ranges: #{inspect(reason)}") + IO.puts(:stderr, "Block scoping disabled - showing all blocks in changed files") + %{} + end + + IO.puts(:stderr, "Analyzing base snapshot (#{map_size(base_files)} files)...") + base_res = Analyzer.analyze_codebase(base_files, analyze_opts) + + {base_res, changed, diff_ranges} + else + {nil, [], %{}} + end + detail = parse_detail(opts[:detail]) format = parse_format(opts[:format]) top_n = opts[:top] || 5 report = - CodeQA.HealthReport.generate(results, + HealthReport.generate(results, config: opts[:config], detail: detail, - top: top_n + top: top_n, + base_results: base_results, + 
changed_files: changed_files, + diff_line_ranges: diff_line_ranges ) - markdown = CodeQA.HealthReport.to_markdown(report, detail, format) + if opts[:comment] do + write_comment_parts(report, detail) + else + markdown = HealthReport.to_markdown(report, detail, format) - case opts[:output] do - nil -> - IO.puts(markdown) + case opts[:output] do + nil -> + markdown - file -> - File.write!(file, markdown) - IO.puts(:stderr, "Health report written to #{file}") + file -> + File.write!(file, markdown) + IO.puts(:stderr, "Health report written to #{file}") + "" + end end + end - if opts[:telemetry], do: CodeQA.Telemetry.print_report() + defp write_comment_parts(report, detail) do + tmpdir = System.get_env("TMPDIR", "/tmp") + parts = HealthReport.Formatter.render_parts(report, detail: detail) + + # Write each part to a numbered file + Enum.with_index(parts, 1) + |> Enum.each(fn {content, n} -> + path = Path.join(tmpdir, "codeqa-part-#{n}.md") + File.write!(path, content) + IO.puts(:stderr, "Part #{n} written to #{path} (#{byte_size(content)} bytes)") + end) + + # Ensure at least 3 parts exist for stale cleanup + actual_count = length(parts) + padded_count = max(actual_count, 3) + + for n <- (actual_count + 1)..padded_count//1 do + path = Path.join(tmpdir, "codeqa-part-#{n}.md") + placeholder = "> _No content for this section._\n\n" + File.write!(path, placeholder) + IO.puts(:stderr, "Part #{n} (placeholder) written to #{path}") + end + + # Write part count for run.sh to read + count_path = Path.join(tmpdir, "codeqa-part-count.txt") + File.write!(count_path, to_string(padded_count)) + IO.puts(:stderr, "Part count (#{padded_count}) written to #{count_path}") + + "" end defp parse_detail(nil), do: :default @@ -120,4 +200,119 @@ defmodule CodeQA.CLI.HealthReport do IO.puts(:stderr, "Warning: unknown format '#{other}', using 'plain'") :plain end + + # --------------------------------------------------------------------------- + # Block impact telemetry + # 
--------------------------------------------------------------------------- + + defp attach_block_impact_telemetry do + {:ok, pid} = Agent.start_link(fn -> %{nodes: [], files: [], codebase_cosines_us: 0} end) + + :telemetry.attach_many( + "block-impact-reporter", + [ + [:codeqa, :block_impact, :codebase_cosines], + [:codeqa, :block_impact, :file], + [:codeqa, :block_impact, :node] + ], + &handle_block_impact_event(&1, &2, &3, &4), + pid + ) + + pid + end + + defp handle_block_impact_event( + [:codeqa, :block_impact, :codebase_cosines], + measurements, + _metadata, + pid + ) do + Agent.update(pid, &Map.put(&1, :codebase_cosines_us, measurements.duration)) + end + + defp handle_block_impact_event([:codeqa, :block_impact, :file], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :files, &[{metadata.path, measurements} | &1]) + end) + end + + defp handle_block_impact_event([:codeqa, :block_impact, :node], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :nodes, &[{metadata.path, measurements} | &1]) + end) + end + + defp print_block_impact_telemetry(pid) do + state = Agent.get(pid, & &1) + Agent.stop(pid) + :telemetry.detach("block-impact-reporter") + + nodes = state.nodes + files = state.files + + total_nodes = length(nodes) + total_files = length(files) + + node_totals = Enum.map(nodes, fn {_, m} -> m end) + file_totals = Enum.map(files, fn {_, m} -> m end) + + IO.puts(:stderr, """ + + ── Block Impact Telemetry ────────────────────────────── + Codebase cosines: #{us(state.codebase_cosines_us)} + Files processed: #{total_files} + Nodes processed: #{total_nodes} + + Per-file breakdown (avg across #{total_files} files): + tokenize: #{avg_us(file_totals, :tokenize_us)} + parse blocks: #{avg_us(file_totals, :parse_us)} + file cosines: #{avg_us(file_totals, :file_cosines_us)} + total/file: #{avg_us(file_totals, :duration)} + + Per-node breakdown (avg across #{total_nodes} nodes): + reconstruct: 
#{avg_us(node_totals, :reconstruct_us)} + analyze_file: #{avg_us(node_totals, :analyze_file_us)} + aggregate: #{avg_us(node_totals, :aggregate_us)} + refactoring cosine: #{avg_us(node_totals, :refactoring_us)} + total/node: #{avg_us(node_totals, :duration)} + + Top 5 slowest files (total node time): + #{top_slow_files(files, nodes)} + ──────────────────────────────────────────────────────── + """) + end + + defp top_slow_files(files, nodes) do + node_time_by_file = + nodes + |> Enum.group_by(fn {path, _} -> path end, fn {_, m} -> m.duration end) + |> Map.new(fn {path, durations} -> {path, Enum.sum(durations)} end) + + files + |> Enum.map(fn {path, fm} -> + node_time = Map.get(node_time_by_file, path, 0) + {path, fm.node_count, node_time} + end) + |> Enum.sort_by(fn {_, _, t} -> -t end) + |> Enum.take(5) + |> Enum.map_join("\n", fn {path, node_count, node_time} -> + " #{path} (#{node_count} nodes, #{us(node_time)} node time)" + end) + end + + defp avg_us([], _key), do: "n/a" + + defp avg_us(measurements, key) do + total = Enum.sum(Enum.map(measurements, &Map.get(&1, key, 0))) + us(div(total, length(measurements))) + end + + defp us(microseconds) when microseconds >= 1_000_000, + do: "#{Float.round(microseconds / 1_000_000, 2)}s" + + defp us(microseconds) when microseconds >= 1_000, + do: "#{Float.round(microseconds / 1_000, 1)}ms" + + defp us(microseconds), do: "#{microseconds}µs" end diff --git a/lib/codeqa/cli/history.ex b/lib/codeqa/cli/history.ex index 4c73ace..ca40669 100644 --- a/lib/codeqa/cli/history.ex +++ b/lib/codeqa/cli/history.ex @@ -4,6 +4,11 @@ defmodule CodeQA.CLI.History do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.CLI.Progress + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.Git @version "0.1.0" @@ -34,18 +39,20 @@ defmodule CodeQA.CLI.History do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do 
{opts, [path], _} = - Options.parse(args, + Options.parse( + args, [ commits: :integer, commit_list: :string, output_dir: :string ], - [n: :commits, o: :output_dir] + n: :commits, + o: :output_dir ) output_dir = opts[:output_dir] || raise "Missing --output-dir" @@ -56,14 +63,19 @@ defmodule CodeQA.CLI.History do commits = resolve_commits(opts, path) IO.puts(:stderr, "Found #{length(commits)} commits to analyze.") - analyze_opts = Options.build_analyze_opts(opts) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) + Config.load(path) + + analyze_opts = + Options.build_analyze_opts(opts) ++ Config.near_duplicate_blocks_opts() + + ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) commits |> Enum.with_index(1) |> Enum.each(&analyze_commit(&1, path, output_dir, analyze_opts, ignore_patterns, opts)) IO.puts(:stderr, "Done writing history to #{output_dir}") + "" end defp resolve_commits(opts, path) do @@ -90,14 +102,13 @@ defmodule CodeQA.CLI.History do current_opts = if opts[:progress], do: [ - {:on_progress, - fn c, t, p, _tt -> CodeQA.CLI.Progress.callback(c, t, p, start_time_progress) end} + {:on_progress, fn c, t, p, _tt -> Progress.callback(c, t, p, start_time_progress) end} | analyze_opts ], else: analyze_opts - files = CodeQA.Git.collect_files_at_ref(path, commit) - files = CodeQA.Collector.reject_ignored_map(files, ignore_patterns) + files = Git.collect_files_at_ref(path, commit) + files = Collector.reject_ignored_map(files, ignore_patterns) if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found at commit #{commit}") @@ -108,7 +119,7 @@ defmodule CodeQA.CLI.History do defp write_commit_result(commit, path, output_dir, files, analyze_opts) do start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) 
IO.puts(:stderr, " Analysis completed in #{end_time - start_time}ms") diff --git a/lib/codeqa/cli/options.ex b/lib/codeqa/cli/options.ex index c735d56..199a95d 100644 --- a/lib/codeqa/cli/options.ex +++ b/lib/codeqa/cli/options.ex @@ -1,6 +1,8 @@ defmodule CodeQA.CLI.Options do @moduledoc false + alias CodeQA.CLI.Progress + @common_strict [ workers: :integer, cache: :boolean, @@ -10,13 +12,11 @@ defmodule CodeQA.CLI.Options do ncd_top: :integer, ncd_paths: :string, combinations: :boolean, - telemetry: :boolean, - experimental_stopwords: :boolean, - stopwords_threshold: :float, show_files: :boolean, show_file_paths: :string, ignore_paths: :string, - progress: :boolean + progress: :boolean, + nodes_top: :integer ] @common_aliases [w: :workers, t: :timeout] @@ -27,7 +27,7 @@ defmodule CodeQA.CLI.Options do @spec common_aliases() :: keyword() def common_aliases, do: @common_aliases - @spec parse(list(String.t()), keyword()) :: {keyword(), list(String.t()), list()} + @spec parse(list(String.t()), keyword(), keyword()) :: {keyword(), list(String.t()), list()} def parse(args, extra_strict \\ [], extra_aliases \\ []) do OptionParser.parse(args, strict: Keyword.merge(@common_strict, extra_strict), @@ -54,22 +54,6 @@ defmodule CodeQA.CLI.Options do |> Enum.map(&String.trim/1) end - @spec load_config_ignore_paths(String.t()) :: [String.t()] - def load_config_ignore_paths(path) do - config_file = Path.join(path, ".codeqa.yml") - - case File.read(config_file) do - {:ok, contents} -> - case YamlElixir.read_from_string(contents) do - {:ok, %{"ignore_paths" => patterns}} when is_list(patterns) -> patterns - _ -> [] - end - - {:error, _} -> - [] - end - end - @spec build_analyze_opts(keyword()) :: keyword() def build_analyze_opts(opts) do start_time_progress = System.monotonic_time(:millisecond) @@ -79,17 +63,14 @@ defmodule CodeQA.CLI.Options do :show_ncd, :ncd_top, :combinations, - :telemetry, - :experimental_stopwords, - :stopwords_threshold + :nodes_top ] base = [{:timeout, 
opts[:timeout] || 5000}] |> maybe_add( opts[:progress], - {:on_progress, - fn c, t, p, _tt -> CodeQA.CLI.Progress.callback(c, t, p, start_time_progress) end} + {:on_progress, fn c, t, p, _tt -> Progress.callback(c, t, p, start_time_progress) end} ) |> maybe_add(opts[:cache], {:cache_dir, opts[:cache_dir] || ".codeqa_cache"}) |> maybe_add( diff --git a/lib/codeqa/cli/progress.ex b/lib/codeqa/cli/progress.ex index 6ffdd14..aa09b05 100644 --- a/lib/codeqa/cli/progress.ex +++ b/lib/codeqa/cli/progress.ex @@ -1,6 +1,8 @@ defmodule CodeQA.CLI.Progress do @moduledoc false + alias CodeQA.CLI.UI + @spec callback(integer(), integer(), String.t(), integer()) :: :ok def callback(completed, total, path, start_time) do now = System.monotonic_time(:millisecond) @@ -11,8 +13,8 @@ defmodule CodeQA.CLI.Progress do label = if String.length(path) > 30, do: "..." <> String.slice(path, -27..-1), else: path output = - CodeQA.CLI.UI.progress_bar(completed, total, - eta: CodeQA.CLI.UI.format_eta(eta_ms), + UI.progress_bar(completed, total, + eta: UI.format_eta(eta_ms), label: label ) diff --git a/lib/codeqa/cli/stopwords.ex b/lib/codeqa/cli/stopwords.ex deleted file mode 100644 index f79027b..0000000 --- a/lib/codeqa/cli/stopwords.ex +++ /dev/null @@ -1,97 +0,0 @@ -defmodule CodeQA.CLI.Stopwords do - @moduledoc false - - @behaviour CodeQA.CLI.Command - - alias CodeQA.CLI.Options - - @impl CodeQA.CLI.Command - def usage do - """ - Usage: codeqa stopwords [options] - - Print codebase-specific stopwords based on frequency analysis. - - Options: - --stopwords-threshold FLOAT Frequency threshold for stopword detection - --progress Show per-file progress on stderr - -w, --workers N Number of parallel workers - --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. 
"test/*,docs/*") - """ - end - - @impl CodeQA.CLI.Command - def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) - end - - def run(args) do - {opts, [path], _} = - OptionParser.parse(args, - strict: [ - workers: :integer, - stopwords_threshold: :float, - progress: :boolean, - ignore_paths: :string - ], - aliases: [w: :workers] - ) - - Options.validate_dir!(path) - - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) - - if map_size(files) == 0 do - IO.puts(:stderr, "Warning: no source files found in '#{path}'") - exit({:shutdown, 1}) - end - - IO.puts(:stderr, "Extracting stopwords for #{map_size(files)} files...") - start_time = System.monotonic_time(:millisecond) - - word_stopwords = find_word_stopwords(files, opts) - fp_stopwords = find_fingerprint_stopwords(files, opts) - - end_time = System.monotonic_time(:millisecond) - - IO.puts(:stderr, "\nAnalysis completed in #{end_time - start_time}ms") - print_word_stopwords(word_stopwords) - IO.puts(:stderr, "\n--- Fingerprint Stopwords (#{MapSet.size(fp_stopwords)}) ---") - IO.puts(:stderr, "Found #{MapSet.size(fp_stopwords)} structural k-gram hashes.") - end - - defp find_word_stopwords(files, opts) do - word_extractor = fn content -> - Regex.scan(~r/\b[a-zA-Z_]\w*\b/u, content) |> List.flatten() - end - - CodeQA.Stopwords.find_stopwords( - files, - word_extractor, - Keyword.put(opts, :progress_label, "Words") - ) - end - - defp find_fingerprint_stopwords(files, opts) do - fp_extractor = fn content -> - CodeQA.Metrics.TokenNormalizer.normalize(content) |> CodeQA.Metrics.Winnowing.kgrams(5) - end - - CodeQA.Stopwords.find_stopwords( - files, - fp_extractor, - Keyword.put(opts, :progress_label, "Fingerprints") - ) - end - - defp print_word_stopwords(word_stopwords) do - IO.puts(:stderr, "\n--- Word Stopwords (#{MapSet.size(word_stopwords)}) ---") - - word_stopwords - 
|> MapSet.to_list() - |> Enum.sort() - |> Enum.chunk_every(10) - |> Enum.each(fn chunk -> IO.puts(Enum.join(chunk, ", ")) end) - end -end diff --git a/lib/codeqa/collector.ex b/lib/codeqa/collector.ex deleted file mode 100644 index 02e6f34..0000000 --- a/lib/codeqa/collector.ex +++ /dev/null @@ -1,99 +0,0 @@ -defmodule CodeQA.Collector do - @moduledoc false - - @source_extensions MapSet.new(~w[ - .py .js .ts .jsx .tsx .java .rs .go .c .cpp .h .hpp .rb .ex .exs - .swift .kt .scala .sh .css .scss .html .vue .svelte .zig .lua .pl - .pm .r .jl .cs .fs .ml .hs .erl .clj .dart - ]) - - @skip_dirs MapSet.new(~w[ - .git .hg .svn node_modules __pycache__ _build dist build vendor - .tox .venv venv target .mypy_cache .pytest_cache deps .elixir_ls - .next coverage - ]) - - @spec collect_files(String.t(), keyword()) :: %{String.t() => String.t()} - def collect_files(root, opts \\ []) do - root_path = Path.expand(root) - ignore_patterns = Keyword.get(opts, :ignore_patterns, []) - - unless File.dir?(root_path) do - raise File.Error, reason: :enoent, path: root, action: "find directory" - end - - root_path - |> walk_directory() - |> Map.new(fn path -> - rel = Path.relative_to(path, root_path) - {rel, File.read!(path)} - end) - |> reject_ignored_map(ignore_patterns) - end - - def source_extensions, do: @source_extensions - - @doc false - def ignored?(path, patterns) do - Enum.any?(patterns, fn pattern -> - match_pattern?(path, pattern) - end) - end - - @doc false - def reject_ignored_map(files_map, []), do: files_map - - def reject_ignored_map(files_map, patterns) do - Map.reject(files_map, fn {path, _} -> ignored?(path, patterns) end) - end - - @doc false - def reject_ignored(list, [], _key_fn), do: list - - def reject_ignored(list, patterns, key_fn) do - Enum.reject(list, fn item -> ignored?(key_fn.(item), patterns) end) - end - - defp match_pattern?(path, pattern) do - # Convert glob pattern to regex: - # - ** matches any number of directories - # - * matches anything except / - 
# - ? matches a single character except / - regex_str = - pattern - |> String.replace(".", "\\.") - |> String.replace("**", "\0GLOBSTAR\0") - |> String.replace("*", "[^/]*") - |> String.replace("?", "[^/]") - |> String.replace("\0GLOBSTAR\0", ".*") - - case Regex.compile("^#{regex_str}$") do - {:ok, regex} -> Regex.match?(regex, path) - _ -> false - end - end - - defp walk_directory(dir) do - dir - |> File.ls!() - |> Enum.flat_map(fn entry -> - full_path = Path.join(dir, entry) - - cond do - File.dir?(full_path) and not skip_dir?(entry) -> - walk_directory(full_path) - - File.regular?(full_path) and source_file?(entry) -> - [full_path] - - true -> - [] - end - end) - end - - defp skip_dir?(name), do: MapSet.member?(@skip_dirs, name) or String.starts_with?(name, ".") - - defp source_file?(name), - do: MapSet.member?(@source_extensions, Path.extname(name) |> String.downcase()) -end diff --git a/lib/codeqa/combined_metrics/category.ex b/lib/codeqa/combined_metrics/category.ex new file mode 100644 index 0000000..def09ad --- /dev/null +++ b/lib/codeqa/combined_metrics/category.ex @@ -0,0 +1,40 @@ +defmodule CodeQA.CombinedMetrics.Category do + @moduledoc """ + Macro helper for defining combined-metric category modules. + + Each category module (e.g. `VariableNaming`, `Documentation`) calls + `use CodeQA.CombinedMetrics.Category, yaml_path: "priv/..."`. + + This injects: + - `@callback score(metrics :: map()) :: float()` — making the caller a behaviour + - `compute_score/2` — delegates to `Scorer` with the baked-in yaml path + + ## Example + + defmodule CodeQA.CombinedMetrics.VariableNaming do + use CodeQA.CombinedMetrics.Category, + yaml_path: "priv/combined_metrics/variable_naming.yml" + end + + Leaf modules then declare `@behaviour CodeQA.CombinedMetrics.VariableNaming` + and call `VariableNaming.compute_score("key", metrics)`. 
+ """ + + defmacro __using__(yaml_path: yaml_path) do + quote do + alias CodeQA.CombinedMetrics.Scorer + + @callback score(metrics :: map()) :: float() + + @doc """ + Computes the score for `metric_name` using scalars from this category's YAML file. + + Delegates to `CodeQA.CombinedMetrics.Scorer.compute_score/3`. + """ + @spec compute_score(String.t(), map()) :: float() + def compute_score(metric_name, metrics) do + Scorer.compute_score(unquote(yaml_path), metric_name, metrics) + end + end + end +end diff --git a/lib/codeqa/combined_metrics/code_smells.ex b/lib/codeqa/combined_metrics/code_smells.ex new file mode 100644 index 0000000..13586ba --- /dev/null +++ b/lib/codeqa/combined_metrics/code_smells.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.CodeSmells do + @moduledoc """ + Behaviour and submodule registry for code smell detection metrics. + + Scalar weights are defined in `priv/combined_metrics/code_smells.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/code_smells.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.CodeSmells, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.CodeSmells + @moduledoc doc + @behaviour CodeSmells + @score_key key + @impl true + def score(metrics), + do: CodeSmells.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/consistency.ex b/lib/codeqa/combined_metrics/consistency.ex new file mode 100644 index 0000000..1c4af0c --- /dev/null +++ b/lib/codeqa/combined_metrics/consistency.ex @@ -0,0 +1,30 @@ +defmodule CodeQA.CombinedMetrics.Consistency do + @moduledoc """ + Behaviour and submodule registry for codebase consistency metrics. 
+ + Covers naming style uniformity, structural patterns, and cross-file coherence. + Scalar weights are defined in `priv/combined_metrics/consistency.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/consistency.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Consistency, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Consistency + @moduledoc doc + @behaviour Consistency + @score_key key + @impl true + def score(metrics), + do: Consistency.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/cosine_vector.ex b/lib/codeqa/combined_metrics/cosine_vector.ex new file mode 100644 index 0000000..36bbe23 --- /dev/null +++ b/lib/codeqa/combined_metrics/cosine_vector.ex @@ -0,0 +1,90 @@ +defmodule CodeQA.CombinedMetrics.CosineVector do + @moduledoc """ + Computes cosine similarity between a behavior's scalar weight vector and a + log-metric vector derived from an aggregate. + + Pure math — no I/O, no YAML loading. Intended for internal use by `SampleRunner`. + """ + + alias CodeQA.CombinedMetrics.Scorer + + @doc """ + Builds the cosine result entry for a single behavior against the given aggregate. + + Returns a one-element list `[result_map]` on success or `[]` when the behavior + has no non-zero scalars (no sample data) and should be excluded. + + ## Options + + * `:log_metrics` - precomputed log-metric map `%{group => %{key => log_val}}`. + When present, values are looked up directly instead of being recomputed via + `:math.log/1`. Falls back to inline computation when absent or when a key is + not found in the map. 
+ """ + @spec compute(String.t(), String.t(), map(), map(), String.t(), keyword()) :: [map()] + def compute(yaml_path, behavior, behavior_data, aggregate, category, opts \\ []) do + scalars = Scorer.scalars_for(yaml_path, behavior) + + if map_size(scalars) == 0 do + [] + else + build_result(yaml_path, behavior, behavior_data, aggregate, category, scalars, opts) + end + end + + # --- Internal helpers --- + + defp build_result(yaml_path, behavior, behavior_data, aggregate, category, scalars, opts) do + log_baseline = Map.get(behavior_data, "_log_baseline", 0.0) / 1.0 + log_metrics = Keyword.get(opts, :log_metrics) + + {dot, norm_s_sq, norm_v_sq, contributions} = + Enum.reduce(scalars, {0.0, 0.0, 0.0, []}, fn {{group, key}, scalar}, + {d, ns, nv, contribs} -> + log_m = lookup_log_metric(log_metrics, aggregate, group, key) + contrib = scalar * log_m + + {d + contrib, ns + scalar * scalar, nv + log_m * log_m, + [{:"#{group}.#{key}", contrib} | contribs]} + end) + + cos_sim = + if norm_s_sq > 0 and norm_v_sq > 0, + do: dot / (:math.sqrt(norm_s_sq) * :math.sqrt(norm_v_sq)), + else: 0.0 + + raw_score = Scorer.compute_score(yaml_path, behavior, aggregate) + calibrated = :math.log(max(raw_score, 1.0e-300)) - log_baseline + + top_metrics = + contributions + |> Enum.sort_by(fn {_, c} -> c end) + |> Enum.take(5) + |> Enum.map(fn {metric, contribution} -> + %{metric: to_string(metric), contribution: Float.round(contribution, 4)} + end) + + [ + %{ + category: category, + behavior: behavior, + cosine: Float.round(cos_sim, 4), + score: Float.round(calibrated, 4), + top_metrics: top_metrics + } + ] + end + + # Returns a precomputed log value when available, otherwise computes inline. + # Both paths apply the same max(val, 1.0e-300) floor guard to ensure identical + # results regardless of whether log_metrics was precomputed or not. 
+ defp lookup_log_metric(nil, aggregate, group, key), + do: :math.log(max(Scorer.get(aggregate, group, key) / 1.0, 1.0e-300)) + + defp lookup_log_metric(log_metrics, aggregate, group, key) do + case get_in(log_metrics, [group, key]) do + nil -> :math.log(max(Scorer.get(aggregate, group, key) / 1.0, 1.0e-300)) + log_val -> log_val + end + end +end diff --git a/lib/codeqa/combined_metrics/dependencies.ex b/lib/codeqa/combined_metrics/dependencies.ex new file mode 100644 index 0000000..f0b25aa --- /dev/null +++ b/lib/codeqa/combined_metrics/dependencies.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.Dependencies do + @moduledoc """ + Behaviour and submodule registry for dependency and coupling quality metrics. + + Scalar weights are defined in `priv/combined_metrics/dependencies.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/dependencies.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Dependencies, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Dependencies + @moduledoc doc + @behaviour Dependencies + @score_key key + @impl true + def score(metrics), + do: Dependencies.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/documentation.ex b/lib/codeqa/combined_metrics/documentation.ex new file mode 100644 index 0000000..94f8a95 --- /dev/null +++ b/lib/codeqa/combined_metrics/documentation.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.Documentation do + @moduledoc """ + Behaviour and submodule registry for documentation quality metrics. + + Scalar weights are defined in `priv/combined_metrics/documentation.yml`. 
+ See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/documentation.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Documentation, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Documentation + @moduledoc doc + @behaviour Documentation + @score_key key + @impl true + def score(metrics), + do: Documentation.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/error_handling.ex b/lib/codeqa/combined_metrics/error_handling.ex new file mode 100644 index 0000000..9039ef6 --- /dev/null +++ b/lib/codeqa/combined_metrics/error_handling.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.ErrorHandling do + @moduledoc """ + Behaviour and submodule registry for error handling quality metrics. + + Scalar weights are defined in `priv/combined_metrics/error_handling.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/error_handling.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.ErrorHandling, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.ErrorHandling + @moduledoc doc + @behaviour ErrorHandling + @score_key key + @impl true + def score(metrics), + do: ErrorHandling.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/file_scorer.ex b/lib/codeqa/combined_metrics/file_scorer.ex new file mode 100644 index 0000000..e7479b0 --- /dev/null +++ b/lib/codeqa/combined_metrics/file_scorer.ex @@ -0,0 +1,109 @@ +defmodule CodeQA.CombinedMetrics.FileScorer do + @moduledoc """ + Scores individual files against combined metric behaviors. + + Converts per-file raw metric maps to aggregate-compatible format and + identifies which behaviors each file most likely exhibits. + """ + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Config + alias CodeQA.HealthReport.Grader + alias CodeQA.Language + + @doc """ + Converts a single file's raw metric map to aggregate format. + + Wraps each key in each group with the `mean_` prefix so the resulting + map is compatible with `SampleRunner.diagnose_aggregate/2`. + + ## Example + + iex> CodeQA.CombinedMetrics.FileScorer.file_to_aggregate(%{"halstead" => %{"tokens" => 42.0}}) + %{"halstead" => %{"mean_tokens" => 42.0}} + """ + @spec file_to_aggregate(map()) :: map() + def file_to_aggregate(metrics) do + Map.new(metrics, fn {group, keys} -> + prefixed_keys = Map.new(keys, fn {key, value} -> {"mean_" <> key, value} end) + {group, prefixed_keys} + end) + end + + @doc """ + Identifies the worst files per combined metric behavior. 
+ + For each file in `files_map`, converts its metrics to aggregate format and + runs `SampleRunner.diagnose_aggregate/2`. The results are collected per + behavior and sorted ascending by cosine similarity (most negative = worst first), + then truncated to `combined_top` entries. + + ## Options + + * `:combined_top` - number of worst files to keep per behavior (default: 2) + + ## Result shape + + %{ + "function_design.no_boolean_parameter" => [ + %{file: "lib/foo/bar.ex", cosine: -0.71}, + %{file: "lib/foo/baz.ex", cosine: -0.44} + ], + ... + } + """ + @spec worst_files_per_behavior(map(), keyword()) :: + %{ + String.t() => [ + %{file: String.t(), cosine: float(), top_metrics: list(), top_nodes: list()} + ] + } + def worst_files_per_behavior(files_map, opts \\ []) do + # NOTE: cosine similarity is computed at file level; a line-level mapping would require computing a separate + # cosine score for each AST node by projecting that node's metric vector against the behavior's + # feature-weight vector. This is not currently possible because serialized nodes do not carry their own + # metric values. + top_n = Keyword.get(opts, :combined_top, 2) + + files_map + |> Enum.reject(fn {_path, file_data} -> + file_data |> Map.get("metrics", %{}) |> map_size() == 0 + end) + |> Enum.reduce(%{}, fn {path, file_data}, acc -> + accumulate_file_behaviors(path, file_data, acc) + end) + |> Map.new(fn {key, entries} -> + threshold = Config.cosine_significance_threshold() + + sorted = + entries + |> Enum.filter(fn e -> e.cosine <= -threshold end) + |> Enum.sort_by(& &1.cosine) + |> Enum.take(top_n) + + {key, sorted} + end) + end + + # Diagnoses a single file's metrics and merges per-behavior entries into the accumulator. 
+ defp accumulate_file_behaviors(path, file_data, acc) do + top_nodes = Grader.top_3_nodes(Map.get(file_data, "nodes")) + language = Language.detect(path).name() + + file_data + |> Map.get("metrics", %{}) + |> file_to_aggregate() + |> SampleRunner.diagnose_aggregate(top: 99_999, language: language) + |> Enum.reduce(acc, fn %{ + category: category, + behavior: behavior, + cosine: cosine, + top_metrics: top_metrics + }, + inner_acc -> + key = "#{category}.#{behavior}" + entry = %{file: path, cosine: cosine, top_metrics: top_metrics, top_nodes: top_nodes} + Map.update(inner_acc, key, [entry], &[entry | &1]) + end) + end +end diff --git a/lib/codeqa/combined_metrics/file_structure.ex b/lib/codeqa/combined_metrics/file_structure.ex new file mode 100644 index 0000000..aa6f153 --- /dev/null +++ b/lib/codeqa/combined_metrics/file_structure.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.FileStructure do + @moduledoc """ + Behaviour and submodule registry for file structure quality metrics. + + Scalar weights are defined in `priv/combined_metrics/file_structure.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/file_structure.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.FileStructure, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.FileStructure + @moduledoc doc + @behaviour FileStructure + @score_key key + @impl true + def score(metrics), + do: FileStructure.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/function_design.ex b/lib/codeqa/combined_metrics/function_design.ex new file mode 100644 index 0000000..3eab5f7 --- /dev/null +++ b/lib/codeqa/combined_metrics/function_design.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.FunctionDesign do + @moduledoc """ + Behaviour and submodule registry for function design quality metrics. + + Scalar weights are defined in `priv/combined_metrics/function_design.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/function_design.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.FunctionDesign, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.FunctionDesign + @moduledoc doc + @behaviour FunctionDesign + @score_key key + @impl true + def score(metrics), + do: FunctionDesign.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/naming_conventions.ex b/lib/codeqa/combined_metrics/naming_conventions.ex new file mode 100644 index 0000000..eafb5dc --- /dev/null +++ b/lib/codeqa/combined_metrics/naming_conventions.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.CombinedMetrics.NamingConventions do + @moduledoc """ + Behaviour and submodule registry for broader naming convention metrics. + + Covers class, file, and function naming patterns not captured by + `VariableNaming`. Scalar weights are defined in + `priv/combined_metrics/naming_conventions.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/naming_conventions.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.NamingConventions, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.NamingConventions + @moduledoc doc + @behaviour NamingConventions + @score_key key + @impl true + def score(metrics), + do: NamingConventions.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/sample_runner.ex b/lib/codeqa/combined_metrics/sample_runner.ex new file mode 100644 index 0000000..f652c83 --- /dev/null +++ b/lib/codeqa/combined_metrics/sample_runner.ex @@ -0,0 +1,471 @@ +defmodule CodeQA.CombinedMetrics.SampleRunner do + @moduledoc """ + Discovers sample directories, analyzes them, and scores each behavior formula. + + Returns structured results suitable for rendering a separation table, enabling + manual scalar tuning of combined metric formulas. + """ + + alias CodeQA.CombinedMetrics.{CosineVector, ScalarApplier, Scorer} + alias CodeQA.Engine.{Analyzer, Collector} + + @samples_root "priv/combined_metrics/samples" + + # --------------------------------------------------------------------------- + # Public API + # --------------------------------------------------------------------------- + + @doc """ + Runs all behaviors found in sample directories, optionally filtered by category. + + ## Options + + * `:category` - restrict to one category (e.g. `"variable_naming"`) + * `:verbose` - when `true`, populates `:metric_detail` in each result + + ## Result shape + + %{ + category: "variable_naming", + behavior: "name_is_generic", + bad_score: 0.074, + good_score: 0.550, + ratio: 7.43, + direction_ok: true, + metric_detail: [...] 
# empty unless verbose: true + } + """ + @spec run(keyword()) :: [map()] + def run(opts \\ []) do + filter_category = opts[:category] + + @samples_root + |> list_behaviors() + |> Enum.filter(fn {category, behavior} -> + (filter_category == nil or category == filter_category) and + has_both_dirs?(category, behavior) + end) + |> Enum.map(fn {category, behavior} -> + score_behavior(category, behavior, opts) + end) + end + + @doc """ + Builds a per-behavior metric correlation report for scalar tuning. + + For each behavior with sample data, computes all `mean_*` metric values for + both good and bad samples, then suggests normalized scalars in [-2, 2] using + the log-linear method: + + log_diff = log(good_val) - log(bad_val) + suggested_scalar = 2.0 * log_diff / max(|all log_diffs| for this behavior) + + The strongest signal for each behavior maps to ±2.0; all others scale + proportionally. This lets you paste the suggested scalars into the YAML as a + starting point and refine from there. + + ## Result shape (keyed by "category.behavior") + + %{ + "variable_naming.name_is_generic" => %{ + "identifier_length_variance.mean_variance" => %{ + bad: 5.131, good: 25.109, + log_bad: 1.635, log_good: 3.224, + ratio: 4.895, + suggested_scalar: 2.0 + }, + ... + } + } + """ + @spec build_metric_report(keyword()) :: map() + def build_metric_report(opts \\ []) do + filter_category = opts[:category] + + @samples_root + |> list_behaviors() + |> Enum.filter(fn {category, behavior} -> + (filter_category == nil or category == filter_category) and + has_both_dirs?(category, behavior) + end) + |> Map.new(fn {category, behavior} -> + {"#{category}.#{behavior}", behavior_metric_table(category, behavior)} + end) + end + + @doc """ + Scores all combined metric behaviors against the given codebase aggregate map. + + Reads all YAML config files from `priv/combined_metrics/` and returns one entry + per YAML category, each containing the scores for all behaviors within it. 
+ Behaviors are sorted ascending by score so the lowest-scoring (worst) appear first. + + ## Result shape + + [ + %{ + category: "variable_naming", + name: "Variable Naming", + behaviors: [ + %{behavior: "name_is_generic", score: 3.45}, + ... + ] + }, + ... + ] + """ + @spec score_aggregate(map(), keyword()) :: [map()] + def score_aggregate(aggregate, opts \\ []) do + languages = Keyword.get(opts, :languages) + + Scorer.all_yamls() + |> Enum.sort_by(fn {path, _} -> path end) + |> Enum.map(fn {yaml_path, data} -> + category = yaml_path |> Path.basename() |> String.trim_trailing(".yml") + + behaviors = + data + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.reject(fn {_behavior, behavior_data} -> + behavior_langs = Map.get(behavior_data, "_languages", []) + not behavior_language_applies?(behavior_langs, nil, languages) + end) + |> Enum.map(fn {behavior, behavior_data} -> + log_baseline = Map.get(behavior_data, "_log_baseline", 0.0) / 1.0 + raw_score = Scorer.compute_score(yaml_path, behavior, aggregate) + calibrated = :math.log(max(raw_score, 1.0e-300)) - log_baseline + %{behavior: behavior, score: Float.round(calibrated, 4)} + end) + |> Enum.sort_by(& &1.score) + + %{category: category, name: humanize(category), behaviors: behaviors} + end) + end + + @doc """ + Identifies the most likely code quality issues in an aggregate by cosine similarity. + + For each behavior, computes the cosine similarity between its scalar weight vector + `s` and the file's log-metric vector `v`: + + cos_sim = (s · v) / (|s| × |v|) + + A negative cosine means the file's metric profile anti-aligns with what good code + looks like for that behavior — i.e. the file likely exhibits that anti-pattern. + + Results are sorted by cosine similarity ascending (most negative = most likely + issue). Behaviors with no non-zero scalars (no sample data) are excluded. 
+ + ## Options + + * `:top` - number of results to return (default 15) + * `:language` - single language string for per-file filtering; when set, only + behaviors whose `_languages` list includes this language are scored + * `:languages` - list of language strings for project-level filtering; when set, only + behaviors whose `_languages` list overlaps with this list are scored + + ## Result shape + + %{ + category: "function_design", + behavior: "no_boolean_parameter", + cosine: -0.83, + score: -13.54, + top_metrics: [%{metric: "branching.mean_branching_density", contribution: -4.1}, ...] + } + """ + @spec diagnose_aggregate(map(), keyword()) :: [map()] + def diagnose_aggregate(aggregate, opts \\ []) do + top_n = Keyword.get(opts, :top, 15) + language = Keyword.get(opts, :language) + languages = Keyword.get(opts, :languages) + behavior_map = Keyword.get(opts, :behavior_map) + + log_metrics = precompute_log_metrics(aggregate) + cosine_opts = [log_metrics: log_metrics] + + behaviors_stream = + if behavior_map do + behavior_map + |> Enum.sort_by(fn {category, _} -> category end) + |> Enum.flat_map( + &diagnose_from_behavior_map_entry(&1, aggregate, language, languages, cosine_opts) + ) + else + Scorer.all_yamls() + |> Enum.sort_by(fn {path, _} -> path end) + |> Enum.flat_map(&diagnose_from_yaml(&1, aggregate, language, languages, cosine_opts)) + end + + behaviors_stream + |> Enum.sort_by(& &1.cosine) + |> Enum.take(top_n) + end + + @doc """ + Applies suggested scalars from sample analysis back to the YAML config files. + + For each behavior that has sample data, rewrites its scalar entries using the + log-linear suggestion method. Metrics whose ratio falls in the deadzone are + excluded. All non-deadzoned metrics are written, even if they were not + previously present in the YAML. + + Behaviors without sample data are left unchanged. + + Returns a list of per-category stats maps. 
+ """ + @spec apply_scalars(keyword()) :: [map()] + def apply_scalars(opts \\ []) do + report = build_metric_report(opts) + ScalarApplier.apply_scalars(report, opts) + end + + @doc """ + Updates only the `_languages` field in YAML config files based on sample data. + + Scans `bad/` and `good/` sample directories for each behavior, detects languages + from file extensions via `CodeQA.Language.detect/1`, and writes the intersection + as `_languages` to the YAML. Behaviors without sample data are left without a + `_languages` key (treated as applying to all languages at scoring time). + All existing scalars and baselines are preserved. + + Returns a list of `%{category: String.t(), behaviors_with_languages: non_neg_integer()}`. + """ + @spec apply_languages(keyword()) :: [map()] + def apply_languages(opts \\ []) do + ScalarApplier.apply_languages(opts) + end + + # --------------------------------------------------------------------------- + # Sample discovery + # --------------------------------------------------------------------------- + + defp list_behaviors(samples_root) do + samples_root + |> File.ls!() + |> Enum.flat_map(fn category -> + Path.join([samples_root, category]) + |> File.ls!() + |> Enum.map(&{category, &1}) + end) + end + + defp has_both_dirs?(category, behavior) do + File.dir?(sample_path(category, behavior, "bad")) and + File.dir?(sample_path(category, behavior, "good")) + end + + defp sample_path(category, behavior, kind) do + Path.join([@samples_root, category, behavior, kind]) + end + + defp analyze(dir) do + dir + |> Collector.collect_files() + |> Analyzer.analyze_codebase() + |> get_in(["codebase", "aggregate"]) + end + + # --------------------------------------------------------------------------- + # Sample scoring + # --------------------------------------------------------------------------- + + defp score_behavior(category, behavior, opts) do + yaml_path = "priv/combined_metrics/#{category}.yml" + bad_agg = analyze(sample_path(category, 
behavior, "bad")) + good_agg = analyze(sample_path(category, behavior, "good")) + + bad_score = Scorer.compute_score(yaml_path, behavior, bad_agg) + good_score = Scorer.compute_score(yaml_path, behavior, good_agg) + ratio = if bad_score > 0, do: good_score / bad_score, else: 0.0 + + base = %{ + category: category, + behavior: behavior, + bad_score: bad_score, + good_score: good_score, + ratio: Float.round(ratio, 2), + direction_ok: good_score >= bad_score + } + + if opts[:verbose] do + Map.put(base, :metric_detail, metric_detail(yaml_path, behavior, bad_agg, good_agg)) + else + Map.put(base, :metric_detail, []) + end + end + + defp metric_detail(yaml_path, behavior, bad_agg, good_agg) do + Scorer.scalars_for(yaml_path, behavior) + |> Enum.map(fn {{group, key}, scalar} -> + bad_val = Scorer.get(bad_agg, group, key) + good_val = Scorer.get(good_agg, group, key) + ratio = if bad_val > 0, do: Float.round(good_val / bad_val, 2), else: 0.0 + %{group: group, key: key, scalar: scalar, bad: bad_val, good: good_val, ratio: ratio} + end) + |> Enum.sort_by(&abs(&1.ratio - 1.0), :desc) + end + + # --------------------------------------------------------------------------- + # Metric report (vector building) + # --------------------------------------------------------------------------- + + defp behavior_metric_table(category, behavior) do + bad_agg = analyze(sample_path(category, behavior, "bad")) + good_agg = analyze(sample_path(category, behavior, "good")) + + entries = + Scorer.default_scalars() + |> Map.keys() + |> Enum.map(fn {group, key} -> + bad_val = Scorer.get(bad_agg, group, key) + good_val = Scorer.get(good_agg, group, key) + log_bad = :math.log(bad_val) + log_good = :math.log(good_val) + ratio = good_val / bad_val + log_diff = log_good - log_bad + {"#{group}.#{key}", bad_val, good_val, log_bad, log_good, ratio, log_diff} + end) + + max_abs_log_diff = + entries + |> Enum.map(fn {_, _, _, _, _, _, ld} -> abs(ld) end) + |> Enum.max(fn -> 1.0 end) + |> max(1.0e-10) + + 
# Precomputes natural logs for every numeric entry of the aggregate:
# %{group => %{key => :math.log(max(val, 1.0e-300))}}.
#
# Top-level entries whose value is not a map are dropped, as are non-numeric
# values inside a group. Values are clamped to 1.0e-300 before taking the log
# so zero and negative inputs do not raise.
defp precompute_log_metrics(aggregate) do
  for {group, metrics} <- aggregate, is_map(metrics), into: %{} do
    logs =
      for {key, value} <- metrics, is_number(value), into: %{} do
        {key, :math.log(max(value / 1.0, 1.0e-300))}
      end

    {group, logs}
  end
end
# Runs the cosine computation for one behavior, but only when the behavior's
# "_languages" list (empty when absent) passes the language filter.
# Returns [] for behaviors that are filtered out.
defp maybe_diagnose_behavior(
       yaml_path,
       behavior,
       behavior_data,
       aggregate,
       category,
       language,
       languages,
       cosine_opts
     ) do
  applies? =
    behavior_data
    |> Map.get("_languages", [])
    |> behavior_language_applies?(language, languages)

  case applies? do
    true ->
      CosineVector.compute(yaml_path, behavior, behavior_data, aggregate, category, cosine_opts)

    false ->
      []
  end
end
# Turns an underscore-separated slug into space-separated capitalized words,
# e.g. "variable_naming" -> "Variable Naming".
defp humanize(slug) do
  words = String.split(slug, "_")
  capitalized = Enum.map(words, &String.capitalize/1)
  Enum.join(capitalized, " ")
end
@doc """
Applies suggested scalars from `report` (a `build_metric_report/1` result) to
the YAML files under `priv/combined_metrics/`.

Every `*.yml` file in that directory is rewritten in place, in file-name
order. Pass `category: name` in `opts` to restrict the rewrite to `name.yml`.

Returns a list of per-category stats maps with `:category`, `:updated`,
`:deadzoned`, and `:skipped` keys.

Raises if the directory cannot be listed, a file cannot be written, or a file
does not parse as YAML (the `{:ok, _}` match fails).
"""
@spec apply_scalars(map(), keyword()) :: [map()]
def apply_scalars(report, opts \\ []) do
  filter_category = opts[:category]

  @yaml_dir
  |> File.ls!()
  |> Enum.filter(&String.ends_with?(&1, ".yml"))
  |> Enum.sort()
  |> Enum.map(fn yml_file ->
    # Strip the extension exactly once. `String.trim_trailing/2` removes every
    # repeated trailing ".yml", so "a.yml.yml" would be reported as "a".
    {Path.basename(yml_file, ".yml"), yml_file}
  end)
  |> Enum.filter(fn {category, _yml_file} ->
    filter_category == nil or category == filter_category
  end)
  |> Enum.map(fn {category, yml_file} ->
    yaml_path = Path.join(@yaml_dir, yml_file)
    {:ok, existing} = YamlElixir.read_from_file(yaml_path)

    {updated_yaml, stats} = apply_to_category(existing, category, report)
    File.write!(yaml_path, YamlFormatter.format(updated_yaml))

    Map.put(stats, :category, category)
  end)
end
# Walks every behavior (map-valued entry) of one category YAML and rebuilds it
# from the report.
#
# Returns `{new_yaml, stats}`. A behavior with no report entry under
# "<category>.<behavior>" keeps its current groups (plus the sample doc, when
# one exists) and is counted as skipped; otherwise `apply_metrics/6` decides
# the new groups and updates the counters.
defp apply_to_category(existing, category, report) do
  initial = {%{}, %{updated: 0, deadzoned: 0, skipped: 0}}

  for {behavior, current_groups} <- existing, is_map(current_groups), reduce: initial do
    {yaml_acc, stats} ->
      doc = read_behavior_doc(category, behavior)

      case Map.get(report, "#{category}.#{behavior}") do
        nil ->
          kept = maybe_put_doc(current_groups, doc)
          {Map.put(yaml_acc, behavior, kept), %{stats | skipped: stats.skipped + 1}}

        metrics ->
          apply_metrics(yaml_acc, stats, behavior, current_groups, metrics, doc)
      end
  end
end
# Records one non-deadzoned metric in the accumulator.
#
# Stores `data.suggested_scalar` under `groups[group][key]`, adds
# `scalar * log(geometric mean of bad and good)` to the running log baseline
# (both values clamped to 1.0e-10 first), and bumps the updated counter.
defp accumulate_metric(groups, log_baseline, n_updated, n_deadzoned, group, key, data) do
  scalar = data.suggested_scalar

  group_scalars =
    groups
    |> Map.get(group, %{})
    |> Map.put(key, scalar)

  bad = max(data.bad, 1.0e-10)
  good = max(data.good, 1.0e-10)
  baseline = log_baseline + scalar * :math.log(:math.sqrt(bad * good))

  {Map.put(groups, group, group_scalars), baseline, n_updated + 1, n_deadzoned}
end
# Returns the sorted list of language names detected in both the "bad" and the
# "good" sample directory of a behavior, excluding "unknown".
defp languages_for_behavior(category, behavior) do
  [bad_langs, good_langs] =
    Enum.map(["bad", "good"], fn kind ->
      dir_languages(sample_path(category, behavior, kind))
    end)

  shared = MapSet.intersection(bad_langs, good_langs)

  shared
  |> Enum.reject(fn name -> name == "unknown" end)
  |> Enum.sort()
end
@doc """
Computes the score for `metric_name` using scalars from `yaml_path`.

The score is the product of `value ^ scalar` over every registered metric key;
keys without a scalar in the YAML use the default scalar from
`default_scalars/0`.

`metrics` is the `codebase.aggregate` map returned by `codeqa analyze`.
"""
@spec compute_score(String.t(), String.t(), map()) :: float()
def compute_score(yaml_path, metric_name, metrics) do
  weights = Map.merge(default_scalars(), scalars_for(yaml_path, metric_name))

  for {{group, key}, exponent} <- weights, reduce: 1.0 do
    product -> product * pow(get(metrics, group, key), exponent)
  end
end
@doc """
Looks up `metrics[group][key]` and returns it as a float.

Returns 1.0 when the entry is missing, is not a number, or is zero or negative.
"""
@spec get(map(), String.t(), String.t()) :: float()
def get(metrics, group, key) do
  value = get_in(metrics, [group, key])

  if is_number(value) and value > 0 do
    value / 1.0
  else
    1.0
  end
end
defmodule CodeQA.CombinedMetrics.Testing do
  @moduledoc """
  Behaviour and submodule registry for test quality metrics.

  Scalar weights are defined in `priv/combined_metrics/testing.yml`.
  See `CodeQA.CombinedMetrics.Category` for the scoring model.
  """

  @yaml_path "priv/combined_metrics/testing.yml"

  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path

  # Read the category YAML at compile time and keep one `{behavior_key, doc}`
  # pair per top-level entry whose value is a map. `doc` is the entry's "_doc"
  # value, or nil when the entry has none.
  @behaviors @yaml_path
             |> YamlElixir.read_from_file!()
             |> Enum.filter(fn {_k, v} -> is_map(v) end)
             |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end)

  # Generate one nested module per behavior, named after the camelized YAML key.
  # Each generated module's `score/1` forwards to `Testing.compute_score/2` with
  # its own key (presumably defined by `use ...Category` — TODO confirm).
  for {key, doc} <- @behaviors do
    defmodule Module.concat(CodeQA.CombinedMetrics.Testing, Macro.camelize(key)) do
      alias CodeQA.CombinedMetrics.Testing
      @moduledoc doc
      @behaviour Testing
      @score_key key
      @impl true
      def score(metrics),
        do: Testing.compute_score(@score_key, metrics)
    end
  end
end
defmodule CodeQA.CombinedMetrics.VariableNaming do
  @moduledoc """
  Behaviour and submodule registry for variable naming quality metrics.

  Scalar weights are defined in `priv/combined_metrics/variable_naming.yml`.
  See `CodeQA.CombinedMetrics.Category` for the scoring model.
  """

  @yaml_path "priv/combined_metrics/variable_naming.yml"

  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path

  # Read the category YAML at compile time and keep one `{behavior_key, doc}`
  # pair per top-level entry whose value is a map. `doc` is the entry's "_doc"
  # value, or nil when the entry has none.
  @behaviors @yaml_path
             |> YamlElixir.read_from_file!()
             |> Enum.filter(fn {_k, v} -> is_map(v) end)
             |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end)

  # Generate one nested module per behavior, named after the camelized YAML key.
  # Each generated module's `score/1` forwards to
  # `VariableNaming.compute_score/2` with its own key (presumably defined by
  # `use ...Category` — TODO confirm).
  for {key, doc} <- @behaviors do
    defmodule Module.concat(CodeQA.CombinedMetrics.VariableNaming, Macro.camelize(key)) do
      alias CodeQA.CombinedMetrics.VariableNaming
      @moduledoc doc
      @behaviour VariableNaming
      @score_key key
      @impl true
      def score(metrics),
        do: VariableNaming.compute_score(@score_key, metrics)
    end
  end
end
# Renders a scalar for the YAML output: floats with exactly four decimal
# places, integers as "<n>.0". Any other value has no matching clause.
defp fmt_scalar(value) when is_float(value) do
  :erlang.float_to_binary(value, [{:decimals, 4}])
end

defp fmt_scalar(value) when is_integer(value) do
  Integer.to_string(value) <> ".0"
end
compute metric deltas." - - def compare_results(base_result, head_result, changes) do - base_files = Map.get(base_result, "files", %{}) - head_files = Map.get(head_result, "files", %{}) - - {file_comparisons, status_counts} = - changes - |> Enum.reduce({%{}, %{"added" => 0, "modified" => 0, "deleted" => 0}}, fn change, - {files, counts} -> - base_data = Map.get(base_files, change.path) - head_data = Map.get(head_files, change.path) - delta = compute_file_delta(base_data, head_data) - - file_entry = %{ - "status" => change.status, - "base" => base_data, - "head" => head_data, - "delta" => delta - } - - {Map.put(files, change.path, file_entry), Map.update!(counts, change.status, &(&1 + 1))} - end) - - base_agg = get_in(base_result, ["codebase", "aggregate"]) || %{} - head_agg = get_in(head_result, ["codebase", "aggregate"]) || %{} - agg_delta = compute_aggregate_delta(base_agg, head_agg) - - summary = build_summary(status_counts) - - %{ - "metadata" => %{ - "total_files_compared" => length(changes), - "summary" => summary - }, - "files" => file_comparisons, - "codebase" => %{ - "base" => %{"aggregate" => base_agg}, - "head" => %{"aggregate" => head_agg}, - "delta" => %{"aggregate" => agg_delta} - } - } - end - - defp compute_file_delta(nil, _head), do: nil - defp compute_file_delta(_base, nil), do: nil - - defp compute_file_delta(base_data, head_data) do - top_delta = - ["bytes", "lines"] - |> Enum.reduce(%{}, fn key, acc -> - case {Map.get(base_data, key), Map.get(head_data, key)} do - {b, h} when is_number(b) and is_number(h) -> Map.put(acc, key, h - b) - _ -> acc - end - end) - - base_metrics = Map.get(base_data, "metrics", %{}) - head_metrics = Map.get(head_data, "metrics", %{}) - - metrics_delta = - MapSet.new(Map.keys(base_metrics) ++ Map.keys(head_metrics)) - |> Enum.reduce(%{}, fn metric_name, acc -> - base_m = Map.get(base_metrics, metric_name, %{}) - head_m = Map.get(head_metrics, metric_name, %{}) - delta = compute_numeric_delta(base_m, head_m) - if delta 
== %{}, do: acc, else: Map.put(acc, metric_name, delta) - end) - - Map.put(top_delta, "metrics", metrics_delta) - end - - defp compute_aggregate_delta(base_agg, head_agg) do - MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) - |> Enum.reduce(%{}, fn metric_name, acc -> - base_m = Map.get(base_agg, metric_name, %{}) - head_m = Map.get(head_agg, metric_name, %{}) - delta = compute_numeric_delta(base_m, head_m) - if delta == %{}, do: acc, else: Map.put(acc, metric_name, delta) - end) - end - - defp compute_numeric_delta(base, head) do - MapSet.new(Map.keys(base) ++ Map.keys(head)) - |> Enum.reduce(%{}, fn key, acc -> - case {Map.get(base, key), Map.get(head, key)} do - {b, h} when is_number(b) and is_number(h) -> - Map.put(acc, key, Float.round((h - b) / 1, 4)) - - _ -> - acc - end - end) - end - - defp build_summary(counts) do - parts = - [ - {"added", counts["added"]}, - {"modified", counts["modified"]}, - {"deleted", counts["deleted"]} - ] - |> Enum.filter(fn {_, c} -> c > 0 end) - |> Enum.map(fn {status, count} -> "#{count} #{status}" end) - - if parts == [], do: "no changes", else: Enum.join(parts, ", ") - end -end diff --git a/lib/codeqa/config.ex b/lib/codeqa/config.ex new file mode 100644 index 0000000..5171eac --- /dev/null +++ b/lib/codeqa/config.ex @@ -0,0 +1,99 @@ +defmodule CodeQA.Config do + @moduledoc "Loads and caches .codeqa.yml configuration via :persistent_term." 
# Reads `<path>/.codeqa.yml` and converts it into a config struct.
#
# Falls back to the default struct when the file cannot be read, when the YAML
# fails to parse, or when the parsed document is not a mapping (for example a
# top-level list or scalar). `from_yaml/1` calls `Map.get/3` on the document
# and would raise `BadMapError` for such input, so it is guarded here.
defp parse(path) do
  config_file = Path.join(path, ".codeqa.yml")

  with {:ok, contents} <- File.read(config_file),
       {:ok, yaml} when is_map(yaml) <- YamlElixir.read_from_string(contents) do
    from_yaml(yaml)
  else
    _ -> %__MODULE__{}
  end
end
@doc """
Runs diagnostics on the given path and returns results as a string.

Collects the files under `:path`, analyzes them as a codebase, and renders
either an aggregate report or a per-file report depending on `:mode`.

## Options

  * `:path` - file or directory path (required)
  * `:mode` - `:aggregate` (default) or `:per_file`; any value other than
    `:per_file` produces the aggregate report
  * `:top` - number of top issues to display (default 15)
  * `:format` - `:plain` or `:json` (default `:plain`); any value other than
    `:json` produces plain text
  * `:combined_top` - worst offender files per behavior (default 2).
    NOTE(review): this function does not read `:combined_top` — confirm
    whether the option still applies here.
"""
@spec run(keyword()) :: String.t()
def run(opts) do
  path = opts[:path]
  mode = opts[:mode] || :aggregate
  top = opts[:top] || 15
  format = opts[:format] || :plain

  files = Collector.collect_files(path)
  result = Analyzer.analyze_codebase(files, [])

  # Only :per_file selects the per-file report; everything else is aggregate.
  case mode do
    :per_file -> run_per_file(result, top, format)
    _ -> run_aggregate(result, top, format)
  end
end
# Renders the aggregate issues as a Markdown table: a header, a separator, and
# one row per issue, each line terminated by a newline. Cosine and score are
# printed with two decimal places.
defp issues_table(issues) do
  header = ["| Behavior | Cosine | Score |", "|----------|--------|-------|"]
  lines = header ++ Enum.map(issues, &issue_row/1)

  Enum.map_join(lines, "", fn line -> line <> "\n" end)
end

# Formats a single issue as one table row.
defp issue_row(%{category: category, behavior: behavior, cosine: cosine, score: score}) do
  cosine_text = :erlang.float_to_binary(cosine / 1.0, decimals: 2)
  score_text = :erlang.float_to_binary(score / 1.0, decimals: 2)

  "| #{category}.#{behavior} | #{cosine_text} | #{score_text} |"
end
cosine_str = :erlang.float_to_binary(cosine / 1.0, decimals: 2) + cosine_score = Grader.score_cosine(cosine) + "| #{file_path} | #{behavior_key} | #{cosine_str} | #{cosine_score} |" + end) + + Enum.join( + ["| File | Behavior | Cosine | Score |", "|------|----------|--------|-------|"] ++ + data_rows, + "\n" + ) + end +end diff --git a/lib/codeqa/engine/analyzer.ex b/lib/codeqa/engine/analyzer.ex new file mode 100644 index 0000000..bd55e11 --- /dev/null +++ b/lib/codeqa/engine/analyzer.ex @@ -0,0 +1,155 @@ +defmodule CodeQA.Engine.Analyzer do + @moduledoc "Orchestrates metric computation across files." + + alias CodeQA.Analysis.RunSupervisor + alias CodeQA.BlockImpactAnalyzer + alias CodeQA.Engine.Parallel + alias CodeQA.Engine.Pipeline + alias CodeQA.Engine.Registry + alias CodeQA.Metrics.Codebase, as: CodebaseMetrics + alias CodeQA.Metrics.File, as: Metrics + + @registry Registry.new() + |> Registry.register_file_metric(Metrics.Entropy) + |> Registry.register_file_metric(Metrics.Compression) + |> Registry.register_file_metric(Metrics.Zipf) + |> Registry.register_file_metric(Metrics.Heaps) + |> Registry.register_file_metric(Metrics.Vocabulary) + |> Registry.register_file_metric(Metrics.Ngram) + |> Registry.register_file_metric(Metrics.Halstead) + |> Registry.register_file_metric(Metrics.Readability) + |> Registry.register_file_metric(Metrics.CasingEntropy) + |> Registry.register_file_metric(Metrics.IdentifierLengthVariance) + |> Registry.register_file_metric(Metrics.Indentation) + |> Registry.register_file_metric(Metrics.Branching) + |> Registry.register_file_metric(Metrics.FunctionMetrics) + |> Registry.register_file_metric(Metrics.MagicNumberDensity) + |> Registry.register_file_metric(Metrics.SymbolDensity) + |> Registry.register_file_metric(Metrics.VowelDensity) + |> Registry.register_file_metric(Metrics.Brevity) + |> Registry.register_file_metric(Metrics.PunctuationDensity) + |> Registry.register_file_metric(Metrics.CommentStructure) + |> 
Registry.register_file_metric(Metrics.LinePatterns) + |> Registry.register_codebase_metric(CodebaseMetrics.Similarity) + |> Registry.register_file_metric(Metrics.NearDuplicateBlocksFile) + |> Registry.register_codebase_metric(CodebaseMetrics.NearDuplicateBlocksCodebase) + + def build_registry, do: @registry + + @spec analyze_file(String.t(), String.t()) :: map() + def analyze_file(_path, content) do + ctx = Pipeline.build_file_context(content) + Registry.run_file_metrics(@registry, ctx, []) + end + + @spec analyze_file_for_loo(String.t(), String.t()) :: map() + def analyze_file_for_loo(_path, content) do + ctx = Pipeline.build_file_context(content, skip_structural: true) + Registry.run_file_metrics(@registry, ctx, []) + end + + @spec analyze_codebase_aggregate(map(), keyword()) :: map() + def analyze_codebase_aggregate(files_map, opts \\ []) do + with_run_context(opts, fn opts -> + file_results = Parallel.analyze_files(files_map, opts) + aggregate_file_metrics(file_results) + end) + end + + def analyze_codebase(files, opts \\ []) do + with_run_context(opts, &do_analyze_codebase(files, &1)) + end + + defp with_run_context(opts, fun) do + {:ok, sup} = RunSupervisor.start_link() + run_ctx = RunSupervisor.run_context(sup) + opts = Keyword.put(opts, :file_context_pid, run_ctx.file_context_pid) + opts = Keyword.put(opts, :behavior_config_pid, run_ctx.behavior_config_pid) + + try do + fun.(opts) + after + Supervisor.stop(sup) + end + end + + defp do_analyze_codebase(files, opts) do + registry = @registry + file_results = Parallel.analyze_files(files, opts) + aggregate = aggregate_file_metrics(file_results) + + if Keyword.get(opts, :compute_nodes, false) do + nodes_opts = + [baseline_codebase_agg: aggregate] ++ + Keyword.take(opts, [:nodes_top, :workers, :behavior_config_pid]) + + pipeline_result = %{ + "files" => file_results, + "codebase" => %{"aggregate" => aggregate} + } + + updated_pipeline_result = BlockImpactAnalyzer.analyze(pipeline_result, files, nodes_opts) + 
codebase_metrics = Registry.run_codebase_metrics(registry, files, opts) + + updated_codebase = + Map.merge(codebase_metrics, updated_pipeline_result["codebase"]) + + Map.put(updated_pipeline_result, "codebase", updated_codebase) + else + codebase_metrics = Registry.run_codebase_metrics(registry, files, opts) + + %{ + "files" => file_results, + "codebase" => Map.put(codebase_metrics, "aggregate", aggregate) + } + end + end + + defp metric_data_to_triples({metric_name, metric_data}) do + metric_data + |> Enum.filter(fn {_k, v} -> is_number(v) end) + |> Enum.map(fn {key, value} -> {metric_name, key, value / 1} end) + end + + def aggregate_file_metrics(file_results) do + file_results + |> Map.values() + |> Enum.flat_map(fn file_data -> + file_data + |> Map.get("metrics", %{}) + |> Enum.flat_map(&metric_data_to_triples/1) + end) + |> Enum.group_by(fn {metric, key, _val} -> {metric, key} end, fn {_, _, val} -> val end) + |> Enum.reduce(%{}, fn {{metric, key}, values}, acc -> + stats = compute_stats(values) + metric_agg = Map.get(acc, metric, %{}) + + updated = + Map.merge(metric_agg, %{ + "mean_#{key}" => stats.mean, + "std_#{key}" => stats.std, + "min_#{key}" => stats.min, + "max_#{key}" => stats.max + }) + + Map.put(acc, metric, updated) + end) + end + + defp compute_stats([]), do: %{mean: 0.0, std: 0.0, min: 0.0, max: 0.0} + + defp compute_stats(values) do + n = length(values) + mean = Enum.sum(values) / n + sum_squares = Enum.reduce(values, 0.0, fn v, acc -> acc + (v - mean) ** 2 end) + variance = sum_squares / n + std = :math.sqrt(variance) + + %{ + mean: Float.round(mean * 1.0, 4), + std: Float.round(std * 1.0, 4), + min: Float.round(Enum.min(values) * 1.0, 4), + max: Float.round(Enum.max(values) * 1.0, 4) + } + end +end diff --git a/lib/codeqa/engine/collector.ex b/lib/codeqa/engine/collector.ex new file mode 100644 index 0000000..3d1b8b4 --- /dev/null +++ b/lib/codeqa/engine/collector.ex @@ -0,0 +1,112 @@ +defmodule CodeQA.Engine.Collector do + @moduledoc false + 
+ @skip_dirs MapSet.new(~w[ + .git .hg .svn node_modules __pycache__ _build dist build vendor + .tox .venv venv target .mypy_cache .pytest_cache deps .elixir_ls + .next coverage + ]) + + @default_ignore_patterns ~w[**/*.md **/*.mdx] + + @spec source_extensions() :: MapSet.t() + def source_extensions do + CodeQA.Language.all() + |> Enum.flat_map(& &1.extensions()) + |> Enum.map(&".#{&1}") + |> MapSet.new() + end + + @spec collect_files(String.t(), [String.t()]) :: %{String.t() => String.t()} + def collect_files(root, extra_ignore_patterns \\ []) do + root_path = Path.expand(root) + CodeQA.Config.load(root_path) + patterns = all_ignore_patterns(extra_ignore_patterns) + extensions = source_extensions() + + unless File.dir?(root_path) do + raise File.Error, reason: :enoent, path: root, action: "find directory" + end + + files_map = + root_path + |> walk_directory(extensions) + |> Map.new(fn path -> + rel = Path.relative_to(path, root_path) + {rel, File.read!(path)} + end) + |> do_reject_ignored_map(patterns) + + gitignored = CodeQA.Git.gitignored_files(root_path, Map.keys(files_map)) + Map.reject(files_map, fn {path, _} -> MapSet.member?(gitignored, path) end) + end + + @doc false + def ignored?(path, patterns) do + Enum.any?(patterns, fn pattern -> + match_pattern?(path, pattern) + end) + end + + @doc false + def reject_ignored_map(files_map, extra_patterns \\ []) do + do_reject_ignored_map(files_map, all_ignore_patterns(extra_patterns)) + end + + @doc false + def reject_ignored(list, key_fn, extra_patterns \\ []) do + patterns = all_ignore_patterns(extra_patterns) + Enum.reject(list, fn item -> ignored?(key_fn.(item), patterns) end) + end + + defp all_ignore_patterns(extra), + do: extra ++ @default_ignore_patterns ++ CodeQA.Config.ignore_paths() + + defp do_reject_ignored_map(files_map, patterns) do + Map.reject(files_map, fn {path, _} -> ignored?(path, patterns) end) + end + + defp match_pattern?(path, pattern) do + # Convert glob pattern to regex: + # - ** matches 
any number of directories + # - * matches anything except / + # - ? matches a single character except / + regex_str = + pattern + |> String.replace(".", "\\.") + |> String.replace("**", "\0GLOBSTAR\0") + |> String.replace("*", "[^/]*") + |> String.replace("?", "[^/]") + |> String.replace("\0GLOBSTAR\0", ".*") + + case Regex.compile("^#{regex_str}$") do + {:ok, regex} -> Regex.match?(regex, path) + _ -> false + end + end + + defp walk_directory(dir, extensions) do + dir + |> File.ls!() + |> Enum.flat_map(fn entry -> + full_path = Path.join(dir, entry) + + cond do + File.dir?(full_path) and not skip_dir?(entry) -> + walk_directory(full_path, extensions) + + File.regular?(full_path) and source_file?(entry, extensions) and + not String.starts_with?(entry, ".") -> + [full_path] + + true -> + [] + end + end) + end + + defp skip_dir?(name), do: MapSet.member?(@skip_dirs, name) or String.starts_with?(name, ".") + + defp source_file?(name, extensions), + do: MapSet.member?(extensions, Path.extname(name) |> String.downcase()) +end diff --git a/lib/codeqa/engine/file_context.ex b/lib/codeqa/engine/file_context.ex new file mode 100644 index 0000000..6e1da6b --- /dev/null +++ b/lib/codeqa/engine/file_context.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.Engine.FileContext do + @moduledoc "Immutable pre-computed data shared across all file metrics." 
+ @enforce_keys [ + :content, + :tokens, + :token_counts, + :words, + :identifiers, + :lines, + :encoded, + :byte_count, + :line_count + ] + defstruct @enforce_keys ++ [:path, :blocks] + + @type t :: %__MODULE__{ + content: String.t(), + tokens: [CodeQA.Engine.Pipeline.Token.t()], + token_counts: map(), + words: list(), + identifiers: list(), + lines: list(), + encoded: String.t(), + byte_count: non_neg_integer(), + line_count: non_neg_integer(), + path: String.t() | nil, + blocks: [CodeQA.AST.Enrichment.Node.t()] | nil + } +end diff --git a/lib/codeqa/parallel.ex b/lib/codeqa/engine/parallel.ex similarity index 66% rename from lib/codeqa/parallel.ex rename to lib/codeqa/engine/parallel.ex index 0e2cc46..f5a8da1 100644 --- a/lib/codeqa/parallel.ex +++ b/lib/codeqa/engine/parallel.ex @@ -1,4 +1,8 @@ -defmodule CodeQA.Parallel do +defmodule CodeQA.Engine.Parallel do + alias CodeQA.Analysis.FileContextServer + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Registry + @moduledoc "Parallel file analysis using Flow (GenStage-based)." 
def analyze_files(files, opts \\ []) when is_map(files) do @@ -22,7 +26,7 @@ defmodule CodeQA.Parallel do |> Flow.map(fn {path, content} -> start_time = System.monotonic_time(:millisecond) - result = maybe_cached_analyze(content, cache_dir, opts) + result = maybe_cached_analyze(path, content, cache_dir, opts) end_time = System.monotonic_time(:millisecond) time_taken = end_time - start_time @@ -38,9 +42,10 @@ defmodule CodeQA.Parallel do |> Enum.into(%{}) end - defp maybe_cached_analyze(content, nil, opts), do: analyze_single_file(content, opts) + defp maybe_cached_analyze(path, content, nil, opts), + do: analyze_single_file(path, content, opts) - defp maybe_cached_analyze(content, cache_dir, opts) do + defp maybe_cached_analyze(path, content, cache_dir, opts) do hash = :crypto.hash(:sha256, content) |> Base.encode16(case: :lower) cache_file = Path.join(cache_dir, hash <> ".json") @@ -51,30 +56,25 @@ defmodule CodeQA.Parallel do data _ -> - data = analyze_single_file(content, opts) + data = analyze_single_file(path, content, opts) File.write!(cache_file, Jason.encode!(data)) data end _ -> - data = analyze_single_file(content, opts) + data = analyze_single_file(path, content, opts) File.write!(cache_file, Jason.encode!(data)) data end end - defp analyze_single_file(content, opts) do - registry = CodeQA.Analyzer.build_registry() - - ctx = - CodeQA.Telemetry.time(:pipeline_build_context, fn -> - CodeQA.Pipeline.build_file_context(content, opts) - end) + defp analyze_single_file(path, content, opts) do + registry = Analyzer.build_registry() + file_opts = Keyword.put(opts, :path, path) + pid = Keyword.fetch!(opts, :file_context_pid) - metrics = - CodeQA.Telemetry.time(:registry_run_metrics, fn -> - CodeQA.Registry.run_file_metrics(registry, ctx, opts) - end) + ctx = FileContextServer.get(pid, content, file_opts) + metrics = Registry.run_file_metrics(registry, ctx, opts) %{ "bytes" => ctx.byte_count, diff --git a/lib/codeqa/engine/pipeline.ex 
b/lib/codeqa/engine/pipeline.ex new file mode 100644 index 0000000..8a697a0 --- /dev/null +++ b/lib/codeqa/engine/pipeline.ex @@ -0,0 +1,98 @@ +defmodule CodeQA.Engine.Pipeline do + @moduledoc "Pre-computed shared context for file-level metrics." + + defmodule Token do + @moduledoc "A lexical token with its string content, kind tag, and 1-based source line." + defstruct [:content, :kind, :line] + + @type t :: %__MODULE__{ + content: String.t(), + kind: String.t(), + line: pos_integer() + } + end + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Engine.FileContext + alias CodeQA.Language + + @word_re ~r/\b[a-zA-Z_]\w*\b/u + + @spec build_file_context(String.t(), keyword()) :: FileContext.t() + def build_file_context(content, opts \\ []) when is_binary(content) do + tokens = tokenize(content) + token_counts = tokens |> Enum.map(& &1.content) |> Enum.frequencies() + + keywords = MapSet.new(Language.all_keywords()) + + words = + Regex.scan(@word_re, content) + |> List.flatten() + + identifiers = Enum.reject(words, &MapSet.member?(keywords, &1)) + lines = content |> String.split("\n") |> trim_trailing_empty() + encoded = content + + skip_structural = Keyword.get(opts, :skip_structural, false) + + {path, blocks} = + case Keyword.get(opts, :path) do + nil -> + {nil, nil} + + p when skip_structural -> + {p, nil} + + p -> + lang_mod = Language.detect(p) + structural_tokens = TokenNormalizer.normalize_structural(content) + {p, Parser.detect_blocks(structural_tokens, lang_mod)} + end + + %FileContext{ + content: content, + tokens: tokens, + token_counts: token_counts, + words: words, + identifiers: identifiers, + lines: lines, + encoded: encoded, + byte_count: byte_size(content), + line_count: length(lines), + path: path, + blocks: blocks + } + end + + # Matches identifiers, integer/float literals, and single non-whitespace chars. 
+ @token_re ~r/[a-zA-Z_]\w*|[0-9]+(?:\.[0-9]+)?|[^\s]/u + + defp tokenize(content) do + content + |> String.split("\n") + |> Enum.with_index(1) + |> Enum.flat_map(fn {line, line_num} -> + @token_re + |> Regex.scan(line) + |> List.flatten() + |> Enum.map(&%Token{content: &1, kind: classify(&1), line: line_num}) + end) + end + + defp classify(tok) do + cond do + Regex.match?(~r/^[a-zA-Z_]\w*$/, tok) -> "" + Regex.match?(~r/^[0-9]/, tok) -> "" + true -> "" + end + end + + defp trim_trailing_empty(lines) do + # Match Python's str.splitlines() behavior + case List.last(lines) do + "" -> List.delete_at(lines, -1) + _ -> lines + end + end +end diff --git a/lib/codeqa/registry.ex b/lib/codeqa/engine/registry.ex similarity index 71% rename from lib/codeqa/registry.ex rename to lib/codeqa/engine/registry.ex index 76dfe23..5f9be25 100644 --- a/lib/codeqa/registry.ex +++ b/lib/codeqa/engine/registry.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Registry do +defmodule CodeQA.Engine.Registry do @moduledoc "Metric registration and execution." 
defstruct file_metrics: [], codebase_metrics: [] @@ -14,17 +14,11 @@ defmodule CodeQA.Registry do end def run_file_metrics(%__MODULE__{} = reg, ctx, opts \\ []) do - base_metrics = - Map.new(reg.file_metrics, fn mod -> - {mod.name(), - CodeQA.Telemetry.time(String.to_atom("metric_" <> mod.name()), fn -> mod.analyze(ctx) end)} - end) + base_metrics = Map.new(reg.file_metrics, fn mod -> {mod.name(), mod.analyze(ctx)} end) if Keyword.get(opts, :combinations, false) do - CodeQA.Telemetry.time(:registry_combinations, fn -> - combinations = generate_combinations(flat_numeric_metrics(base_metrics), []) - Map.merge(base_metrics, Map.new(combinations)) - end) + combinations = generate_combinations(flat_numeric_metrics(base_metrics), []) + Map.merge(base_metrics, Map.new(combinations)) else base_metrics end @@ -60,6 +54,16 @@ defmodule CodeQA.Registry do end def run_codebase_metrics(%__MODULE__{} = reg, files, opts \\ []) do - Map.new(reg.codebase_metrics, fn mod -> {mod.name(), mod.analyze(files, opts)} end) + has_progress = Keyword.has_key?(opts, :on_progress) + total = length(reg.codebase_metrics) + + reg.codebase_metrics + |> Enum.with_index(1) + |> Map.new(fn {mod, idx} -> + if has_progress, + do: IO.puts(:stderr, "\nCODEBASE #{idx}/#{total}: #{mod.name()}...") + + {mod.name(), mod.analyze(files, opts)} + end) end end diff --git a/lib/codeqa/formatter.ex b/lib/codeqa/formatter.ex deleted file mode 100644 index 55ba6ef..0000000 --- a/lib/codeqa/formatter.ex +++ /dev/null @@ -1,344 +0,0 @@ -defmodule CodeQA.Formatter do - @moduledoc false - - @summary_metrics [ - {"entropy", "char_entropy", "Entropy"}, - {"halstead", "volume", "Halstead Vol."}, - {"halstead", "difficulty", "Difficulty"}, - {"readability", "flesch_adapted", "Readability"}, - {"compression", "redundancy", "Redundancy"} - ] - - @bar_width 20 - @filled "█" - @empty "░" - - def format_github(comparison, output_mode \\ "auto") do - metadata = comparison["metadata"] - files = comparison["files"] || %{} - 
codebase = comparison["codebase"] || %{} - - if metadata["total_files_compared"] == 0 do - "## Code Quality: PR Comparison\n\nNo file changes detected." - else - build_github_report(metadata, files, codebase, output_mode) - end - end - - defp build_github_report(metadata, files, codebase, output_mode) do - categories = CodeQA.HealthReport.Categories.defaults() - scale = CodeQA.HealthReport.Categories.default_grade_scale() - - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - head_agg = get_in(codebase, ["head", "aggregate"]) || %{} - - base_grades = CodeQA.HealthReport.Grader.grade_aggregate(categories, base_agg, scale) - head_grades = CodeQA.HealthReport.Grader.grade_aggregate(categories, head_agg, scale) - - paired = Enum.zip(base_grades, head_grades) - - lines = - [ - "## Code Quality: PR Comparison", - "", - "**#{metadata["total_files_compared"]} files compared** (#{metadata["summary"]})", - "" - ] ++ - mermaid_chart(head_grades) ++ - progress_bars(paired) ++ - [""] ++ - file_details(files, codebase, output_mode) ++ - aggregate_details(codebase) - - Enum.join(lines, "\n") - end - - defp mermaid_chart(head_grades) do - names = Enum.map(head_grades, fn g -> ~s("#{g.name}") end) |> Enum.join(", ") - scores = Enum.map(head_grades, fn g -> to_string(g.score) end) |> Enum.join(", ") - - [ - "```mermaid", - "%%{init: {'theme': 'neutral'}}%%", - "xychart-beta", - " title \"Code Health After PR\"", - " x-axis [#{names}]", - " y-axis \"Score\" 0 --> 100", - " bar [#{scores}]", - "```", - "" - ] - end - - defp progress_bars(paired) do - max_name_len = - Enum.reduce(paired, 0, fn {_base, head}, acc -> - max(acc, String.length(head.name)) - end) - - rows = - Enum.map(paired, fn {base, head} -> - name = String.pad_trailing(head.name, max_name_len) - base_bar = build_bar(base.score) - head_bar = build_bar(head.score) - emoji = grade_emoji(head.grade) - delta = head.score - base.score - delta_str = if delta >= 0, do: "+#{delta}", else: to_string(delta) - "#{name} 
#{base_bar} #{base.score} → #{head_bar} #{head.score} #{emoji} #{delta_str}" - end) - - ["```"] ++ rows ++ ["```"] - end - - defp file_details(files, codebase, _output_mode) do - codebase_summary = CodeQA.Summarizer.summarize_codebase(%{"files" => files, "codebase" => codebase}) - - file_summaries = - Map.new(files, fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - inner = - (format_file_table(files, file_summaries) ++ [""]) - |> Enum.join("\n") - - [ - "
", - "File changes — #{codebase_summary["gist"]}", - "", - inner, - "
", - "" - ] - end - - defp aggregate_details(codebase) do - inner = - format_aggregate_table(codebase, build_direction_map()) - |> Enum.join("\n") - - if inner == "" do - [] - else - [ - "
", - "Aggregate metrics", - "", - inner, - "", - "
", - "" - ] - end - end - - defp build_bar(score) do - filled = round(score / 100 * @bar_width) - filled = min(max(filled, 0), @bar_width) - empty = @bar_width - filled - String.duplicate(@filled, filled) <> String.duplicate(@empty, empty) - end - - defp grade_emoji(grade) do - cond do - grade in ["A", "A-"] -> "🟢" - grade in ["B+", "B", "B-"] -> "🟡" - grade in ["C+", "C", "C-"] -> "🟠" - true -> "🔴" - end - end - - def format_markdown(comparison, output_mode \\ "auto") do - metadata = comparison["metadata"] - files = comparison["files"] || %{} - codebase = comparison["codebase"] - - if metadata["total_files_compared"] == 0 do - "## Code Quality: PR Comparison\n\nNo file changes detected." - else - build_report(metadata, files, codebase, output_mode) - end - end - - defp build_report(metadata, files, codebase, output_mode) do - codebase_summary = - CodeQA.Summarizer.summarize_codebase(%{"files" => files, "codebase" => codebase}) - - lines = [ - "## Code Quality: PR Comparison", - "", - "**#{metadata["total_files_compared"]} files compared** (#{metadata["summary"]})", - "" - ] - - lines = - if output_mode in ["auto", "summary"] do - lines ++ ["> #{codebase_summary["gist"]}", ""] - else - lines - end - - lines = - if output_mode in ["auto", "changes"] do - file_summaries = - Map.new(files, fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - lines ++ format_file_table(files, file_summaries) ++ [""] - else - lines - end - - lines = - if output_mode in ["auto", "summary"] do - lines ++ format_aggregate_table(codebase) - else - lines - end - - Enum.join(lines, "\n") - end - - defp format_file_table(files, file_summaries) do - columns = detect_columns(files) - - if columns == [], - do: ["No metric data available."], - else: build_file_rows(files, file_summaries, columns) - end - - defp build_file_rows(files, file_summaries, columns) do - header = - "| File | Status | Summary | " <> - Enum.map_join(columns, " | ", fn {_, _, label} -> label 
end) <> " |" - - separator = - "|------|--------|---------|" <> Enum.map_join(columns, "", fn _ -> "--------|" end) - - rows = - files - |> Enum.sort_by(fn {path, _} -> path end) - |> Enum.map(fn {path, data} -> - gist = get_in(file_summaries, [path, "gist"]) || "" - cells = format_file_row(data, columns) - "| `#{path}` | #{data["status"]} | #{gist} | " <> Enum.join(cells, " | ") <> " |" - end) - - [header, separator | rows] - end - - defp format_file_row(data, columns) do - Enum.map(columns, fn {metric_name, key, _label} -> - case data["status"] do - "modified" -> format_modified_cell(data, metric_name, key) - "added" -> format_added_cell(data, metric_name, key) - "deleted" -> format_deleted_cell(data, metric_name, key) - _ -> "—" - end - end) - end - - defp format_modified_cell(data, metric_name, key) do - case get_in(data, ["delta", "metrics", metric_name, key]) do - nil -> "—" - val -> format_delta(val) - end - end - - defp format_added_cell(data, metric_name, key) do - case get_in(data, ["head", "metrics", metric_name, key]) do - nil -> "—" - val -> "*#{format_value(val)}*" - end - end - - defp format_deleted_cell(data, metric_name, key) do - case get_in(data, ["base", "metrics", metric_name, key]) do - nil -> "—" - val -> "~~#{format_value(val)}~~" - end - end - - defp format_aggregate_table(codebase, direction_map \\ %{}) do - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - head_agg = get_in(codebase, ["head", "aggregate"]) || %{} - delta_agg = get_in(codebase, ["delta", "aggregate"]) || %{} - - if base_agg == %{} and head_agg == %{}, - do: [], - else: build_aggregate_rows(base_agg, head_agg, delta_agg, direction_map) - end - - defp build_aggregate_rows(base_agg, head_agg, delta_agg, direction_map) do - header = [ - "### Aggregate Metrics", - "", - "| Metric | Base | Head | Delta |", - "|--------|------|------|-------|" - ] - - rows = - MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) - |> Enum.sort() - |> Enum.flat_map(fn metric_name -> - 
base_m = Map.get(base_agg, metric_name, %{}) - head_m = Map.get(head_agg, metric_name, %{}) - delta_m = Map.get(delta_agg, metric_name, %{}) - - MapSet.new(Map.keys(base_m) ++ Map.keys(head_m)) - |> Enum.sort() - |> Enum.map(fn key -> - direction = Map.get(direction_map, "#{metric_name}.#{key}") - delta_cell = format_delta_with_direction(delta_m[key], direction) - "| #{metric_name}.#{key} | #{format_value(base_m[key])} | #{format_value(head_m[key])} | #{delta_cell} |" - end) - end) - - header ++ rows - end - - defp build_direction_map do - CodeQA.HealthReport.Categories.defaults() - |> Enum.flat_map(fn cat -> - Enum.map(cat.metrics, fn m -> {"#{m.source}.mean_#{m.name}", m.good} end) - end) - |> Map.new() - end - - defp format_delta_with_direction(nil, _direction), do: "—" - - defp format_delta_with_direction(value, direction) do - formatted = format_delta(value) - emoji = delta_emoji(value, direction) - if emoji, do: "#{emoji} #{formatted}", else: formatted - end - - defp delta_emoji(_value, nil), do: nil - defp delta_emoji(value, :high) when value > 0, do: "🟢" - defp delta_emoji(value, :high) when value < 0, do: "🔴" - defp delta_emoji(value, :low) when value < 0, do: "🟢" - defp delta_emoji(value, :low) when value > 0, do: "🔴" - defp delta_emoji(_value, _direction), do: nil - - defp detect_columns(files) do - Enum.filter(@summary_metrics, fn {metric_name, key, _label} -> - Enum.any?(files, fn {_path, data} -> - source = data["head"] || data["base"] - source && get_in(source, ["metrics", metric_name, key]) != nil - end) - end) - end - - defp format_delta(nil), do: "—" - - defp format_delta(value) when value > 0, - do: "+#{:erlang.float_to_binary(value / 1, decimals: 2)}" - - defp format_delta(value) when value < 0, do: :erlang.float_to_binary(value / 1, decimals: 2) - defp format_delta(_), do: "0.00" - - defp format_value(nil), do: "—" - defp format_value(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_value(value), do: 
to_string(value) -end diff --git a/lib/codeqa/git.ex b/lib/codeqa/git.ex index 78c0bdb..4489205 100644 --- a/lib/codeqa/git.ex +++ b/lib/codeqa/git.ex @@ -8,8 +8,26 @@ defmodule CodeQA.Git do defstruct @enforce_keys end + alias CodeQA.Engine.Collector + @status_map %{"A" => "added", "M" => "modified", "D" => "deleted"} + @spec gitignored_files(String.t(), [String.t()]) :: MapSet.t() + def gitignored_files(_repo_path, []), do: MapSet.new() + + def gitignored_files(repo_path, paths) do + {output, _exit_code} = + System.cmd("git", ["check-ignore", "--no-index" | paths], + cd: repo_path, + stderr_to_stdout: false + ) + + output + |> String.trim() + |> String.split("\n", trim: true) + |> MapSet.new() + end + def changed_files(repo_path, base_ref, head_ref) do {output, 0} = System.cmd( @@ -25,6 +43,78 @@ defmodule CodeQA.Git do |> Enum.flat_map(&parse_change_line/1) end + @doc """ + Returns a map of file paths to lists of changed line ranges in the head version. + + Each range is a tuple `{start_line, end_line}` representing lines that were + added or modified in the diff between base_ref and head_ref. 
+ """ + @spec diff_line_ranges(String.t(), String.t(), String.t()) :: + {:ok, %{String.t() => [{pos_integer(), pos_integer()}]}} | {:error, term()} + def diff_line_ranges(repo_path, base_ref, head_ref) do + case System.cmd( + "git", + ["diff", "-U0", "#{base_ref}..#{head_ref}"], + cd: repo_path, + stderr_to_stdout: false + ) do + {output, 0} -> + {:ok, parse_diff_hunks(output)} + + {_output, code} -> + {:error, "git diff exited with code #{code}"} + end + end + + @typep parse_state :: {String.t() | nil, %{String.t() => [{pos_integer(), pos_integer()}]}} + + @spec parse_diff_hunks(String.t()) :: %{String.t() => [{pos_integer(), pos_integer()}]} + defp parse_diff_hunks(diff_output) do + diff_output + |> String.split("\n") + |> Enum.reduce({nil, %{}}, &parse_diff_line/2) + |> elem(1) + |> Map.new(fn {path, ranges} -> {path, Enum.reverse(ranges)} end) + end + + @spec parse_diff_line(String.t(), parse_state()) :: parse_state() + defp parse_diff_line("diff --git a/" <> rest, {_current_file, acc}) do + # Extract the "b/..." 
path from the diff header + case Regex.run(~r/ b\/(.+)$/, rest) do + [_, path] -> {path, acc} + nil -> {nil, acc} + end + end + + defp parse_diff_line("@@ " <> rest, {current_file, acc}) when is_binary(current_file) do + # Parse hunk header: @@ -old_start,old_count +new_start,new_count @@ + case Regex.run(~r/\+(\d+)(?:,(\d+))?/, rest) do + [_, start_str] -> + # Single line change (no count means 1 line) + start = String.to_integer(start_str) + updated = Map.update(acc, current_file, [{start, start}], &[{start, start} | &1]) + {current_file, updated} + + [_, start_str, count_str] -> + start = String.to_integer(start_str) + count = String.to_integer(count_str) + + if count == 0 do + # Deletion only, no new lines + {current_file, acc} + else + end_line = start + count - 1 + updated = Map.update(acc, current_file, [{start, end_line}], &[{start, end_line} | &1]) + {current_file, updated} + end + + nil -> + {current_file, acc} + end + end + + defp parse_diff_line(_line, state), do: state + def read_file_at_ref(repo_path, ref, path) do case System.cmd("git", ["show", "#{ref}:#{path}"], cd: repo_path, stderr_to_stdout: true) do {output, 0} -> output @@ -66,6 +156,6 @@ defmodule CodeQA.Git do defp source_file?(path) do ext = path |> Path.extname() |> String.downcase() - MapSet.member?(CodeQA.Collector.source_extensions(), ext) + MapSet.member?(Collector.source_extensions(), ext) end end diff --git a/lib/codeqa/health_report.ex b/lib/codeqa/health_report.ex index 982b469..183b737 100644 --- a/lib/codeqa/health_report.ex +++ b/lib/codeqa/health_report.ex @@ -1,46 +1,116 @@ defmodule CodeQA.HealthReport do @moduledoc "Orchestrates health report generation from analysis results." 
- alias CodeQA.HealthReport.{Config, Grader, Formatter} + alias CodeQA.CombinedMetrics.{FileScorer, SampleRunner} + alias CodeQA.HealthReport.{Config, Delta, Formatter, Grader, TopBlocks} @spec generate(map(), keyword()) :: map() def generate(analysis_results, opts \\ []) do config_path = Keyword.get(opts, :config) - detail = Keyword.get(opts, :detail, :default) - top_n = Keyword.get(opts, :top, 5) + base_results = Keyword.get(opts, :base_results) + changed_files = Keyword.get(opts, :changed_files, []) + diff_line_ranges = Keyword.get(opts, :diff_line_ranges, %{}) + + %{ + categories: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top, + block_min_lines: block_min_lines, + block_max_lines: block_max_lines + } = + Config.load(config_path) - %{categories: categories, grade_scale: grade_scale} = Config.load(config_path) aggregate = get_in(analysis_results, ["codebase", "aggregate"]) || %{} files = Map.get(analysis_results, "files", %{}) + project_langs = project_languages(files) - category_grades = Grader.grade_aggregate(categories, aggregate, grade_scale) - - category_grades = - Enum.zip(categories, category_grades) - |> Enum.map(fn {cat_def, graded} -> + threshold_grades = + categories + |> Grader.grade_aggregate(aggregate, grade_scale) + |> Enum.zip(categories) + |> Enum.map(fn {graded, _cat_def} -> summary = build_category_summary(graded) - cat_top = Map.get(cat_def, :top, top_n) + graded + |> Map.put(:type, :threshold) + |> Map.merge(%{summary: summary, worst_offenders: []}) + end) + + worst_files_map = FileScorer.worst_files_per_behavior(files, combined_top: combined_top) + + all_cosines = + SampleRunner.diagnose_aggregate(aggregate, top: 99_999, languages: project_langs) - worst = - case detail do - :summary -> [] - :full -> Grader.worst_offenders(cat_def, files, map_size(files), grade_scale) - _default -> Grader.worst_offenders(cat_def, files, cat_top, grade_scale) - end + cosines_by_category = 
Enum.group_by(all_cosines, & &1.category) - Map.merge(graded, %{summary: summary, worst_offenders: worst}) + cosine_grades = + Grader.grade_cosine_categories(cosines_by_category, worst_files_map, grade_scale) + + all_categories = + (threshold_grades ++ cosine_grades) + |> Enum.map(fn cat -> + Map.put(cat, :impact, Map.get(impact_map, to_string(cat.key), 1)) end) - {overall_score, overall_grade} = Grader.overall_score(category_grades, grade_scale) + {overall_score, overall_grade} = Grader.overall_score(all_categories, grade_scale, impact_map) metadata = build_metadata(analysis_results) + top_issues = Enum.take(all_cosines, 10) + + codebase_cosine_lookup = + Map.new(all_cosines, fn i -> {{i.category, i.behavior}, i.cosine} end) + + block_opts = [ + block_min_lines: block_min_lines, + block_max_lines: block_max_lines, + diff_line_ranges: diff_line_ranges + ] + + top_blocks = + TopBlocks.build(analysis_results, changed_files, codebase_cosine_lookup, block_opts) + + worst_blocks_by_category = + TopBlocks.worst_per_category( + analysis_results, + changed_files, + codebase_cosine_lookup, + block_opts + ) + + grading_cfg = %{ + category_defs: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top + } + + {codebase_delta, pr_summary} = + if base_results do + build_delta_and_summary( + base_results, + analysis_results, + overall_score, + overall_grade, + grading_cfg, + changed_files, + top_blocks + ) + else + {nil, nil} + end + %{ metadata: metadata, + pr_summary: pr_summary, overall_score: overall_score, overall_grade: overall_grade, - categories: category_grades + codebase_delta: codebase_delta, + categories: all_categories, + top_issues: top_issues, + top_blocks: top_blocks, + worst_blocks_by_category: worst_blocks_by_category } end @@ -49,6 +119,77 @@ defmodule CodeQA.HealthReport do Formatter.format_markdown(report, detail, format) end + defp build_delta_and_summary( + base_results, + head_results, + head_score, + head_grade, + %{ 
+ category_defs: category_defs, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top + }, + changed_files, + top_blocks + ) do + delta = Delta.compute(base_results, head_results) + + base_aggregate = get_in(base_results, ["codebase", "aggregate"]) || %{} + base_files = Map.get(base_results, "files", %{}) + base_project_langs = project_languages(base_files) + + base_threshold_grades = + category_defs + |> Grader.grade_aggregate(base_aggregate, grade_scale) + |> Enum.zip(category_defs) + |> Enum.map(fn {graded, _cat_def} -> + graded + |> Map.put(:type, :threshold) + |> Map.merge(%{summary: "", worst_offenders: []}) + end) + + base_worst_files_map = + FileScorer.worst_files_per_behavior(base_files, combined_top: combined_top) + + base_cosines_by_category = + SampleRunner.diagnose_aggregate(base_aggregate, top: 99_999, languages: base_project_langs) + |> Enum.group_by(& &1.category) + + base_cosine_grades = + Grader.grade_cosine_categories( + base_cosines_by_category, + base_worst_files_map, + grade_scale + ) + + base_all_categories = + (base_threshold_grades ++ base_cosine_grades) + |> Enum.map(fn cat -> + Map.put(cat, :impact, Map.get(impact_map, to_string(cat.key), 1)) + end) + + {base_score, base_grade} = Grader.overall_score(base_all_categories, grade_scale, impact_map) + + blocks_flagged = length(top_blocks) + files_added = Enum.count(changed_files, &(&1.status == "added")) + files_modified = Enum.count(changed_files, &(&1.status == "modified")) + + summary = %{ + base_score: base_score, + head_score: head_score, + score_delta: head_score - base_score, + base_grade: base_grade, + head_grade: head_grade, + blocks_flagged: blocks_flagged, + files_changed: length(changed_files), + files_added: files_added, + files_modified: files_modified + } + + {delta, summary} + end + defp build_metadata(analysis_results) do meta = Map.get(analysis_results, "metadata", %{}) @@ -59,6 +200,16 @@ defmodule CodeQA.HealthReport do } end + defp 
project_languages(files_map) do + files_map + |> Map.keys() + |> Enum.map(&CodeQA.Language.detect(&1).name()) + |> Enum.reject(&(&1 == "unknown")) + |> Enum.uniq() + end + + defp build_category_summary(%{type: :cosine}), do: "" + defp build_category_summary(graded) do low_scorers = graded.metric_scores diff --git a/lib/codeqa/health_report/categories.ex b/lib/codeqa/health_report/categories.ex index 69970be..98b2e97 100644 --- a/lib/codeqa/health_report/categories.ex +++ b/lib/codeqa/health_report/categories.ex @@ -36,28 +36,36 @@ defmodule CodeQA.HealthReport.Categories do source: "readability", weight: 0.4, good: :high, - thresholds: %{a: 70, b: 50, c: 35, d: 20} + thresholds: %{a: 70, b: 50, c: 35, d: 20}, + fix_hint: + "Low readability score — simplify sentences, prefer short identifiers, avoid deeply nested expressions" }, %{ name: "fog_adapted", source: "readability", weight: 0.3, good: :low, - thresholds: %{a: 6, b: 10, c: 15, d: 22} + thresholds: %{a: 6, b: 10, c: 15, d: 22}, + fix_hint: + "High fog index — reduce complex multi-word identifiers and long compound expressions" }, %{ name: "avg_tokens_per_line", source: "readability", weight: 0.2, good: :low, - thresholds: %{a: 6, b: 10, c: 14, d: 20} + thresholds: %{a: 6, b: 10, c: 14, d: 20}, + fix_hint: + "Too many tokens per line — break long lines into multiple shorter statements" }, %{ name: "avg_line_length", source: "readability", weight: 0.1, good: :low, - thresholds: %{a: 40, b: 60, c: 80, d: 100} + thresholds: %{a: 40, b: 60, c: 80, d: 100}, + fix_hint: + "Lines too long — wrap at 80–120 characters and extract intermediate variables" } ] }, @@ -70,28 +78,35 @@ defmodule CodeQA.HealthReport.Categories do source: "halstead", weight: 0.35, good: :low, - thresholds: %{a: 10, b: 20, c: 35, d: 50} + thresholds: %{a: 10, b: 20, c: 35, d: 50}, + fix_hint: + "High operator/operand ratio — extract repeated sub-expressions into named variables" }, %{ name: "effort", source: "halstead", weight: 0.30, good: 
:low, - thresholds: %{a: 5000, b: 20000, c: 50000, d: 100_000} + thresholds: %{a: 5000, b: 20_000, c: 50_000, d: 100_000}, + fix_hint: + "High implementation effort — simplify logic by extracting helpers and reducing branching" }, %{ name: "volume", source: "halstead", weight: 0.20, good: :low, - thresholds: %{a: 300, b: 1000, c: 3000, d: 8000} + thresholds: %{a: 300, b: 1000, c: 3000, d: 8000}, + fix_hint: + "High token volume — extract helper functions to reduce the total operation count" }, %{ name: "estimated_bugs", source: "halstead", weight: 0.15, good: :low, - thresholds: %{a: 0.1, b: 0.5, c: 1.0, d: 3.0} + thresholds: %{a: 0.1, b: 0.5, c: 1.0, d: 3.0}, + fix_hint: "High defect estimate — reduce complexity; simpler code has fewer bugs" } ] }, @@ -104,56 +119,69 @@ defmodule CodeQA.HealthReport.Categories do source: "branching", weight: 0.25, good: :low, - thresholds: %{a: 0.08, b: 0.17, c: 0.30, d: 0.45} + thresholds: %{a: 0.08, b: 0.17, c: 0.30, d: 0.45}, + fix_hint: + "Too many branches per line — flatten conditionals using guard clauses or early returns" }, %{ name: "mean_depth", source: "indentation", weight: 0.2, good: :low, - thresholds: %{a: 3.5, b: 7, c: 10, d: 15} + thresholds: %{a: 3.5, b: 7, c: 10, d: 15}, + fix_hint: "High average nesting — extract inner blocks into helper functions" }, %{ name: "avg_function_lines", source: "function_metrics", weight: 0.2, good: :low, - thresholds: %{a: 8, b: 15, c: 30, d: 65} + thresholds: %{a: 8, b: 15, c: 30, d: 65}, + fix_hint: + "Functions too long on average — split into smaller single-purpose functions" }, %{ name: "max_depth", source: "indentation", weight: 0.1, good: :low, - thresholds: %{a: 8, b: 16, c: 25, d: 35} + thresholds: %{a: 8, b: 16, c: 25, d: 35}, + fix_hint: "Deep nesting — restructure using early returns or extract nested logic" }, %{ name: "max_function_lines", source: "function_metrics", weight: 0.1, good: :low, - thresholds: %{a: 20, b: 50, c: 100, d: 200} + thresholds: %{a: 20, b: 50, 
c: 100, d: 200}, + fix_hint: + "Largest function too long — decompose the longest function into focused helpers" }, %{ name: "variance", source: "indentation", weight: 0.1, good: :low, - thresholds: %{a: 7, b: 20, c: 40, d: 65} + thresholds: %{a: 7, b: 20, c: 40, d: 65}, + fix_hint: + "Inconsistent indentation depth — standardize nesting by flattening or restructuring" }, %{ name: "avg_param_count", source: "function_metrics", weight: 0.03, good: :low, - thresholds: %{a: 2, b: 3, c: 5, d: 7} + thresholds: %{a: 2, b: 3, c: 5, d: 7}, + fix_hint: "Too many parameters on average — group related params into a struct or map" }, %{ name: "max_param_count", source: "function_metrics", weight: 0.02, good: :low, - thresholds: %{a: 3, b: 5, c: 7, d: 10} + thresholds: %{a: 3, b: 5, c: 7, d: 10}, + fix_hint: + "Function has too many parameters — introduce a parameter object or options map" } ] }, @@ -166,21 +194,27 @@ defmodule CodeQA.HealthReport.Categories do source: "compression", weight: 0.5, good: :low, - thresholds: %{a: 0.3, b: 0.5, c: 0.65, d: 0.8} + thresholds: %{a: 0.3, b: 0.5, c: 0.65, d: 0.8}, + fix_hint: + "High redundancy — extract repeated patterns into shared helpers or abstractions" }, %{ name: "bigram_repetition_rate", source: "ngram", weight: 0.3, good: :low, - thresholds: %{a: 0.15, b: 0.30, c: 0.45, d: 0.60} + thresholds: %{a: 0.15, b: 0.30, c: 0.45, d: 0.60}, + fix_hint: + "Repeated two-token sequences — consolidate duplicated patterns into named functions" }, %{ name: "trigram_repetition_rate", source: "ngram", weight: 0.2, good: :low, - thresholds: %{a: 0.05, b: 0.15, c: 0.30, d: 0.45} + thresholds: %{a: 0.05, b: 0.15, c: 0.30, d: 0.45}, + fix_hint: + "Repeated three-token sequences — extract duplicated logic into reusable abstractions" } ] }, @@ -193,28 +227,34 @@ defmodule CodeQA.HealthReport.Categories do source: "casing_entropy", weight: 0.3, good: :low, - thresholds: %{a: 1.0, b: 1.5, c: 2.0, d: 2.3} + thresholds: %{a: 1.0, b: 1.5, c: 2.0, d: 2.3}, 
+ fix_hint: + "Mixed casing styles — use a single consistent casing convention throughout the file" }, %{ name: "mean", source: "identifier_length_variance", weight: 0.25, good: :low, - thresholds: %{a: 12, b: 18, c: 25, d: 35} + thresholds: %{a: 12, b: 18, c: 25, d: 35}, + fix_hint: "Identifiers too long on average — prefer concise, intent-revealing names" }, %{ name: "variance", source: "identifier_length_variance", weight: 0.25, good: :low, - thresholds: %{a: 15, b: 30, c: 50, d: 80} + thresholds: %{a: 15, b: 30, c: 50, d: 80}, + fix_hint: "High identifier length variance — standardize name length conventions" }, %{ name: "avg_sub_words_per_id", source: "readability", weight: 0.2, good: :low, - thresholds: %{a: 3, b: 4, c: 5, d: 7} + thresholds: %{a: 3, b: 4, c: 5, d: 7}, + fix_hint: + "Identifiers have too many sub-words — simplify to 2–3 word names where possible" } ] }, @@ -227,7 +267,8 @@ defmodule CodeQA.HealthReport.Categories do source: "magic_number_density", weight: 1.0, good: :low, - thresholds: %{a: 0.02, b: 0.05, c: 0.10, d: 0.20} + thresholds: %{a: 0.02, b: 0.05, c: 0.10, d: 0.20}, + fix_hint: "Too many magic numbers — replace literal values with named constants" } ] } diff --git a/lib/codeqa/health_report/config.ex b/lib/codeqa/health_report/config.ex index 15bf125..7c457b2 100644 --- a/lib/codeqa/health_report/config.ex +++ b/lib/codeqa/health_report/config.ex @@ -3,9 +3,24 @@ defmodule CodeQA.HealthReport.Config do alias CodeQA.HealthReport.Categories - @spec load(String.t() | nil) :: %{categories: [map()], grade_scale: [{number(), String.t()}]} - def load(nil), - do: %{categories: Categories.defaults(), grade_scale: Categories.default_grade_scale()} + @spec load(String.t() | nil) :: %{ + categories: [map()], + grade_scale: [{number(), String.t()}], + impact_map: %{String.t() => pos_integer()}, + combined_top: pos_integer(), + block_min_lines: pos_integer(), + block_max_lines: pos_integer() + } + def load(nil) do + %{ + categories: 
Categories.defaults(), + grade_scale: Categories.default_grade_scale(), + impact_map: CodeQA.Config.impact_map(), + combined_top: CodeQA.Config.combined_top(), + block_min_lines: 3, + block_max_lines: 20 + } + end def load(path) do yaml = YamlElixir.read_from_file!(path) @@ -30,8 +45,26 @@ defmodule CodeQA.HealthReport.Config do end) grade_scale = parse_grade_scale(Map.get(yaml, "grade_scale")) + impact_map = parse_impact(Map.get(yaml, "impact")) + combined_top = Map.get(yaml, "combined_top", 2) + block_min_lines = Map.get(yaml, "block_min_lines", 3) + block_max_lines = Map.get(yaml, "block_max_lines", 20) + + %{ + categories: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top, + block_min_lines: block_min_lines, + block_max_lines: block_max_lines + } + end + + defp parse_impact(nil), do: CodeQA.Config.impact_map() - %{categories: categories, grade_scale: grade_scale} + defp parse_impact(overrides) when is_map(overrides) do + string_overrides = Map.new(overrides, fn {k, v} -> {to_string(k), v} end) + Map.merge(CodeQA.Config.impact_map(), string_overrides) end defp parse_grade_scale(nil), do: Categories.default_grade_scale() diff --git a/lib/codeqa/health_report/delta.ex b/lib/codeqa/health_report/delta.ex new file mode 100644 index 0000000..52b0085 --- /dev/null +++ b/lib/codeqa/health_report/delta.ex @@ -0,0 +1,42 @@ +defmodule CodeQA.HealthReport.Delta do + @moduledoc "Computes aggregate metric delta between two codebase analysis results." 
+ + @spec compute(map(), map()) :: %{ + base: %{aggregate: map()}, + head: %{aggregate: map()}, + delta: %{aggregate: map()} + } + def compute(base_results, head_results) do + base_agg = get_in(base_results, ["codebase", "aggregate"]) || %{} + head_agg = get_in(head_results, ["codebase", "aggregate"]) || %{} + + %{ + base: %{aggregate: base_agg}, + head: %{aggregate: head_agg}, + delta: %{aggregate: compute_aggregate_delta(base_agg, head_agg)} + } + end + + defp compute_aggregate_delta(base_agg, head_agg) do + MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) + |> Enum.reduce(%{}, fn metric_name, acc -> + base_m = Map.get(base_agg, metric_name, %{}) + head_m = Map.get(head_agg, metric_name, %{}) + delta = compute_numeric_delta(base_m, head_m) + if delta == %{}, do: acc, else: Map.put(acc, metric_name, delta) + end) + end + + defp compute_numeric_delta(base, head) do + MapSet.new(Map.keys(base) ++ Map.keys(head)) + |> Enum.reduce(%{}, fn key, acc -> + case {Map.get(base, key), Map.get(head, key)} do + {b, h} when is_number(b) and is_number(h) -> + Map.put(acc, key, Float.round((h - b) * 1.0, 4)) + + _ -> + acc + end + end) + end +end diff --git a/lib/codeqa/health_report/formatter.ex b/lib/codeqa/health_report/formatter.ex index df17d8d..d166f14 100644 --- a/lib/codeqa/health_report/formatter.ex +++ b/lib/codeqa/health_report/formatter.ex @@ -8,4 +8,23 @@ defmodule CodeQA.HealthReport.Formatter do def format_markdown(report, detail, :plain, _opts), do: Plain.render(report, detail) def format_markdown(report, detail, :github, opts), do: Github.render(report, detail, opts) + + @doc """ + Renders the report as multiple parts for GitHub PR comments. + Returns a flat list of strings: [part_1, part_2, part_3, ...]. + + Part 1: Header, summary, PR summary, delta, chart, progress bars + Part 2: Top issues, category detail sections + Part 3+: Blocks section (`Github.render_parts_3/2`, which currently returns a single part) + + Each part ends with a sentinel comment for sticky comment identification. 
+ """ + @spec render_parts(map(), keyword()) :: [String.t()] + def render_parts(report, opts \\ []) do + part_1 = Github.render_part_1(report, opts) + part_2 = Github.render_part_2(report, opts) + parts_3 = Github.render_parts_3(report, opts) + + [part_1, part_2 | parts_3] + end end diff --git a/lib/codeqa/health_report/formatter/github.ex b/lib/codeqa/health_report/formatter/github.ex index 72bb9ee..dd4a696 100644 --- a/lib/codeqa/health_report/formatter/github.ex +++ b/lib/codeqa/health_report/formatter/github.ex @@ -8,18 +8,125 @@ defmodule CodeQA.HealthReport.Formatter.Github do @spec render(map(), atom(), keyword()) :: String.t() def render(report, detail, opts \\ []) do chart? = Keyword.get(opts, :chart, true) + display_categories = merge_cosine_categories(report.categories) + worst_blocks = Map.get(report, :worst_blocks_by_category, %{}) [ + pr_summary_section(Map.get(report, :pr_summary)), header(report), - if(chart?, do: mermaid_chart(report.categories), else: []), - progress_bars(report.categories), - category_sections(report.categories, detail), + cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), + if(chart?, do: mermaid_chart(display_categories), else: []), + progress_bars(display_categories), + top_issues_section(Map.get(report, :top_issues, []), detail), + blocks_section(Map.get(report, :top_blocks, [])), + category_sections(display_categories, detail, worst_blocks), footer() ] |> List.flatten() |> Enum.join("\n") end + @doc """ + Renders Part 1: header, summary table, PR summary, delta, mermaid chart, progress bars. + Each part ends with a sentinel HTML comment for sticky comment identification. + """ + @spec render_part_1(map(), keyword()) :: String.t() + def render_part_1(report, opts \\ []) do + chart? 
= Keyword.get(opts, :chart, true) + display_categories = merge_cosine_categories(report.categories) + + [ + pr_summary_section(Map.get(report, :pr_summary)), + header(report), + cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), + if(chart?, do: mermaid_chart(display_categories), else: []), + progress_bars(display_categories), + sentinel(1) + ] + |> List.flatten() + |> Enum.join("\n") + end + + @doc """ + Renders Part 2: top issues + all category detail sections. + """ + @spec render_part_2(map(), keyword()) :: String.t() + def render_part_2(report, opts \\ []) do + detail = Keyword.get(opts, :detail, :default) + display_categories = merge_cosine_categories(report.categories) + worst_blocks = Map.get(report, :worst_blocks_by_category, %{}) + + [ + top_issues_section(Map.get(report, :top_issues, []), detail), + category_sections(display_categories, detail, worst_blocks), + sentinel(2) + ] + |> List.flatten() + |> Enum.join("\n") + end + + @doc """ + Renders Part 3: blocks section (top 10 blocks with code). + Returns a list with a single part since blocks are now limited to top 10. 
+ """ + @spec render_parts_3(map(), keyword()) :: [String.t()] + def render_parts_3(report, _opts \\ []) do + top_blocks = Map.get(report, :top_blocks, []) + + if top_blocks == [] do + ["> _No near-duplicate blocks detected._\n\n" <> sentinel_str(3)] + else + blocks_content = blocks_section(top_blocks) |> List.flatten() |> Enum.join("\n") + [blocks_content <> "\n\n" <> sentinel_str(3)] + end + end + + defp sentinel(n), do: [sentinel_str(n)] + + defp sentinel_str(n), do: "" + + defp merge_cosine_categories(categories) do + {cosine, threshold} = Enum.split_with(categories, &(&1.type == :cosine)) + + case cosine do + [] -> + threshold + + _ -> + total_impact = Enum.sum(Enum.map(cosine, & &1.impact)) + + combined_score = + round(Enum.sum(Enum.map(cosine, &(&1.score * &1.impact))) / max(total_impact, 1)) + + combined = %{ + type: :cosine_group, + key: "combined_metrics", + name: "Combined Metrics", + score: combined_score, + grade: grade_letter_from_score(combined_score), + categories: cosine + } + + threshold ++ [combined] + end + end + + defp grade_letter_from_score(score) when score >= 97, do: "A+" + defp grade_letter_from_score(score) when score >= 93, do: "A" + defp grade_letter_from_score(score) when score >= 90, do: "A-" + defp grade_letter_from_score(score) when score >= 87, do: "B+" + defp grade_letter_from_score(score) when score >= 83, do: "B" + defp grade_letter_from_score(score) when score >= 80, do: "B-" + defp grade_letter_from_score(score) when score >= 77, do: "C+" + defp grade_letter_from_score(score) when score >= 73, do: "C" + defp grade_letter_from_score(score) when score >= 70, do: "C-" + defp grade_letter_from_score(score) when score >= 67, do: "D+" + defp grade_letter_from_score(score) when score >= 63, do: "D" + defp grade_letter_from_score(score) when score >= 60, do: "D-" + defp grade_letter_from_score(_score), do: "F" + defp header(report) do emoji = grade_emoji(report.overall_grade) @@ -31,9 +138,16 @@ defmodule 
CodeQA.HealthReport.Formatter.Github do ] end + defp cosine_legend do + [ + "> *Combined metric scores use cosine similarity: +1 = metric profile perfectly matches healthy pattern for this behavior, 0 = no signal, −1 = anti-pattern detected. Mapped to 0–100 using breakpoints (approx: ≥0.5→A, ≥0.2→B, ≥0.0→C, ≥−0.3→D, <−0.3→F); actual letter grades use the full 15-step scale.*", + "" + ] + end + defp mermaid_chart(categories) do - names = Enum.map(categories, fn c -> ~s("#{c.name}") end) |> Enum.join(", ") - scores = Enum.map(categories, fn c -> to_string(c.score) end) |> Enum.join(", ") + names = Enum.map_join(categories, ", ", fn c -> ~s("#{c.name}") end) + scores = Enum.map_join(categories, ", ", fn c -> to_string(c.score) end) [ "```mermaid", @@ -74,35 +188,159 @@ defmodule CodeQA.HealthReport.Formatter.Github do String.duplicate(@filled, filled) <> String.duplicate(@empty, empty) end - defp category_sections(_categories, :summary), do: [] + defp category_sections(_categories, :summary, _worst_blocks), do: [] + + defp category_sections(categories, detail, worst_blocks) do + Enum.flat_map(categories, &render_category(&1, detail, worst_blocks)) + end + + defp render_category(%{type: :cosine_group} = group, detail, worst_blocks) do + emoji = grade_emoji(group.grade) + summary_line = "#{emoji} #{group.name} — #{group.grade} (#{group.score}/100)" + + inner = + cosine_group_content(group, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp render_category(%{type: :cosine} = cat, detail, worst_blocks) do + emoji = grade_emoji(cat.grade) + summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + + inner = + cosine_section_content(cat, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp render_category(cat, detail, _worst_blocks) do + emoji = grade_emoji(cat.grade) + summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + + inner = + section_content(cat, detail) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp cosine_group_content(group, detail, worst_blocks) do + rows = + Enum.map(group.categories, fn cat -> + emoji = grade_emoji(cat.grade) + "| #{cat.name} | #{cat.score} | #{emoji} #{cat.grade} |" + end) + + summary_table = [ + "| Category | Score | Grade |", + "|----------|-------|-------|" + | rows + ] + + sub_sections = + Enum.flat_map(group.categories, fn cat -> + emoji = grade_emoji(cat.grade) + + inner = + cosine_section_content(cat, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)", + "", + inner, + "", + "
", + "" + ] + end) + + summary_table ++ [""] ++ sub_sections + end + + defp cosine_section_content(cat, _detail, worst_blocks) do + n = length(cat.behaviors) + category_key = to_string(cat.key) + + behaviors_rows = + Enum.map(cat.behaviors, fn b -> + "| #{b.behavior} | #{format_num(b.cosine)} | #{b.score} | #{b.grade} |" + end) + + behaviors_table = [ + "> Cosine similarity scores for #{n} behaviors.", + "", + "| Behavior | Cosine | Score | Grade |", + "|----------|--------|-------|-------|" + | behaviors_rows + ] + + worst_block_section = + case Map.get(worst_blocks, category_key) do + nil -> [] + block -> render_worst_block(block) + end + + behaviors_table ++ [""] ++ worst_block_section + end - defp category_sections(categories, detail) do - Enum.flat_map(categories, fn cat -> - emoji = grade_emoji(cat.grade) - summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + defp render_worst_block(block) do + line_count = (block.end_line || block.start_line) - block.start_line + 1 + location = "#{block.path}:#{block.start_line}-#{block.end_line || block.start_line}" - inner = - section_content(cat, detail) - |> List.flatten() - |> Enum.join("\n") + if line_count >= 1 and line_count <= 15 and block.source do + lang = block.language || "" [ - "
", - "#{summary_line}", - "", - inner, - "", - "
", + "> **Worst offender** (`#{location}`):", + "> ```#{lang}", + block.source |> String.split("\n") |> Enum.map(&"> #{&1}") |> Enum.join("\n"), + "> ```", "" ] - end) + else + [ + "> **Worst offender**: `#{location}` (#{line_count} lines)", + "" + ] + end end defp section_content(cat, _detail) do metric_summary = - cat.metric_scores - |> Enum.map(fn m -> "#{m.name}=#{format_num(m.value)}" end) - |> Enum.join(", ") + Enum.map_join(cat.metric_scores, ", ", fn m -> "#{m.name}=#{format_num(m.value)}" end) metrics_table = if cat.metric_scores != [] do @@ -124,42 +362,35 @@ defmodule CodeQA.HealthReport.Formatter.Github do "Codebase averages: #{metric_summary}", "" | metrics_table - ] ++ [""] ++ worst_offenders(cat) + ] ++ [""] end - defp worst_offenders(cat) do - offenders = Map.get(cat, :worst_offenders, []) + defp top_issues_section([], _detail), do: [] + defp top_issues_section(_issues, :summary), do: [] - if offenders == [] do - [] - else - averages = Map.new(cat.metric_scores, &{&1.name, &1.value}) - - rows = - Enum.map(offenders, fn f -> - issues = - f.metric_scores - |> Enum.map(fn m -> - avg = Map.get(averages, m.name) - avg_str = if avg, do: " (avg: #{format_num(avg)})", else: "" - "#{direction(m.good)}#{m.name}=#{format_num(m.value)}#{avg_str}" - end) - |> Enum.join("
") - - "| #{format_path(f.path)}
#{format_lines(f[:lines])} lines · #{format_size(f[:bytes])} | #{f.grade} (#{f.score}) | #{issues} |" - end) + defp top_issues_section(issues, _detail) do + rows = + Enum.map_join(issues, "\n", fn i -> + "| `#{i.category}.#{i.behavior}` | #{format_num(i.cosine)} | #{format_num(i.score)} |" + end) - [ - "**Worst Offenders**", - "", - "| File | Grade | Issues |", - "|------|-------|--------|" - | rows - ] - end + table = "| Behavior | Cosine | Score |\n|----------|--------|-------|\n#{rows}" + + [ + "
", + "🔍 Top Likely Issues (cosine similarity)", + "", + "> Most negative cosine = file's metric profile best matches this anti-pattern.", + "", + table, + "", + "
", + "" + ] end defp footer do + # Legacy footer for single-part render/3 (used by --output file mode) ["", ""] end @@ -179,29 +410,148 @@ defmodule CodeQA.HealthReport.Formatter.Github do defp extract_project_name(_), do: "unknown" - defp format_path(path) when byte_size(path) < 80, do: "`#{path}`" + defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) + defp format_num(value) when is_integer(value), do: to_string(value) + defp format_num(value), do: to_string(value) - defp format_path(path) do - case String.split(path, "/") do - [file] -> "`#{file}`" - parts -> Enum.join(Enum.drop(parts, -1), "/") <> "/
`#{List.last(parts)}`" + defp format_date(timestamp) when is_binary(timestamp), do: String.slice(timestamp, 0, 10) + defp format_date(_), do: "unknown" + + defp pr_summary_section(nil), do: [] + + defp pr_summary_section(summary) do + delta_str = + if summary.score_delta >= 0, + do: "+#{summary.score_delta}", + else: "#{summary.score_delta}" + + status_str = "#{summary.files_modified} modified, #{summary.files_added} added" + + [ + "> **Score:** #{summary.base_grade} → #{summary.head_grade} | **Δ** #{delta_str} pts | **#{summary.blocks_flagged}** blocks flagged across #{summary.files_changed} files | #{status_str}", + "" + ] + end + + defp delta_section(nil), do: [] + + defp delta_section(delta) do + base_agg = delta.base.aggregate + head_agg = delta.head.aggregate + + metrics = [ + {"Readability", "readability", "mean_flesch_adapted"}, + {"Complexity", "halstead", "mean_difficulty"}, + {"Duplication", "compression", "mean_redundancy"}, + {"Structure", "branching", "mean_branch_count"} + ] + + rows = Enum.flat_map(metrics, &format_metric_row(&1, base_agg, head_agg)) + + if rows == [] do + [] + else + [ + "## Metric Changes", + "", + "| Category | Base | Head | Δ |", + "|----------|------|------|---|" + | rows + ] ++ [""] end end - defp direction(:high), do: "↑ " - defp direction(_), do: "↓ " + defp format_metric_row({label, group, key}, base_agg, head_agg) do + base_val = get_in(base_agg, [group, key]) + head_val = get_in(head_agg, [group, key]) - defp format_lines(nil), do: "—" - defp format_lines(n), do: to_string(n) + if is_number(base_val) and is_number(head_val) do + diff = Float.round((head_val - base_val) * 1.0, 2) # Float.round/2 only accepts floats + diff_str = if diff >= 0, do: "+#{format_num(diff)}", else: "#{format_num(diff)}" + ["| #{label} | #{format_num(base_val)} | #{format_num(head_val)} | #{diff_str} |"] + else + [] + end + end - defp format_size(nil), do: "—" - defp format_size(bytes) when bytes < 1024, do: "#{bytes} B" - defp format_size(bytes), do: "#{Float.round(bytes / 1024, 1)} KB" + 
defp blocks_section([]), do: [] - defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_num(value) when is_integer(value), do: to_string(value) - defp format_num(value), do: to_string(value) + defp blocks_section(top_blocks) do + block_cards = Enum.flat_map(top_blocks, &format_block_card/1) - defp format_date(timestamp) when is_binary(timestamp), do: String.slice(timestamp, 0, 10) - defp format_date(_), do: "unknown" + [ + "## 🔍 Top #{length(top_blocks)} Code Blocks by Impact", + "", + "> Ranked by cosine delta — highest anti-pattern signal first.", + "" + | block_cards + ] + end + + defp format_block_card(block) do + end_line = block.end_line || block.start_line + top_potential = List.first(block.potentials) + icon = severity_icon(top_potential.severity) + delta_str = format_num(top_potential.cosine_delta) + status_str = if block.status, do: " [#{block.status}]", else: "" + + summary_line = + "#{icon} #{block.path}:#{block.start_line}-#{end_line}#{status_str} — #{block.type} (#{block.token_count} tokens) — Δ#{delta_str}" + + issues = format_block_issues(block.potentials) + code_block = format_code_block(block) + + [ + "
", + "#{summary_line}", + "", + "**Issues:**", + "" + | issues + ] ++ ["", code_block, "", "
", ""] + end + + defp format_block_issues(potentials) do + Enum.flat_map(potentials, fn p -> + icon = severity_icon(p.severity) + label = String.upcase(to_string(p.severity)) + delta_str = format_num(p.cosine_delta) + line = "- #{icon} **#{label}** `#{p.category}/#{p.behavior}` (Δ #{delta_str})" + fix = if p.fix_hint, do: [" > #{p.fix_hint}"], else: [] + [line | fix] + end) + end + + defp format_code_block(%{source: nil}), do: "_Source code not available_" + + defp format_code_block(%{source: source, language: lang, start_line: start_line}) do + lang_hint = code_fence_lang(lang) + # Add line number comments for context + lines = String.split(source, "\n") + + numbered_lines = + lines + |> Enum.with_index(start_line) + |> Enum.map(fn {line, num} -> "#{String.pad_leading(to_string(num), 4)} │ #{line}" end) + |> Enum.join("\n") + + "```#{lang_hint}\n#{numbered_lines}\n```" + end + + defp code_fence_lang("elixir"), do: "elixir" + defp code_fence_lang("ruby"), do: "ruby" + defp code_fence_lang("javascript"), do: "javascript" + defp code_fence_lang("typescript"), do: "typescript" + defp code_fence_lang("python"), do: "python" + defp code_fence_lang("swift"), do: "swift" + defp code_fence_lang("kotlin"), do: "kotlin" + defp code_fence_lang("java"), do: "java" + defp code_fence_lang("go"), do: "go" + defp code_fence_lang("rust"), do: "rust" + defp code_fence_lang(_), do: "" + + defp severity_icon(:critical), do: "🔴" + defp severity_icon(:high), do: "🟠" + defp severity_icon(:medium), do: "🟡" + defp severity_icon(_), do: "⚪" end diff --git a/lib/codeqa/health_report/formatter/plain.ex b/lib/codeqa/health_report/formatter/plain.ex index 8471aef..3576ed6 100644 --- a/lib/codeqa/health_report/formatter/plain.ex +++ b/lib/codeqa/health_report/formatter/plain.ex @@ -4,8 +4,13 @@ defmodule CodeQA.HealthReport.Formatter.Plain do @spec render(map(), atom()) :: String.t() def render(report, detail) do [ + pr_summary_section(Map.get(report, :pr_summary)), header(report), + 
cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), overall_table(report), + top_issues_section(Map.get(report, :top_issues, []), detail), + blocks_section(Map.get(report, :top_blocks, [])), category_sections(report.categories, detail) ] |> List.flatten() @@ -23,16 +28,24 @@ defmodule CodeQA.HealthReport.Formatter.Plain do ] end + defp cosine_legend do + [ + "> *Combined metric scores use cosine similarity: +1 = metric profile perfectly matches healthy pattern for this behavior, 0 = no signal, −1 = anti-pattern detected. Mapped to 0–100 using breakpoints (approx: ≥0.5→A, ≥0.2→B, ≥0.0→C, ≥−0.3→D, <−0.3→F); actual letter grades use the full 15-step scale.*", + "" + ] + end + defp overall_table(report) do rows = Enum.map(report.categories, fn cat -> summary = Map.get(cat, :summary, "") - "| #{cat.name} | #{cat.grade} | #{cat.score} | #{summary} |" + impact = Map.get(cat, :impact, "") + "| #{cat.name} | #{cat.grade} | #{cat.score} | #{impact} | #{summary} |" end) [ - "| Category | Grade | Score | Summary |", - "|----------|-------|-------|---------|" + "| Category | Grade | Score | Impact | Summary |", + "|----------|-------|-------|--------|---------|" | rows ] ++ [""] end @@ -41,15 +54,45 @@ defmodule CodeQA.HealthReport.Formatter.Plain do defp category_sections(categories, detail) do Enum.flat_map(categories, fn cat -> - section_header(cat) ++ metric_detail(cat) ++ worst_offenders_section(cat, detail) + render_category(cat, detail) end) end + defp render_category(%{type: :cosine} = cat, _detail) do + cosine_section_header(cat) ++ cosine_behaviors_table(cat) + end + + defp render_category(cat, _detail) do + section_header(cat) ++ metric_detail(cat) + end + + defp cosine_section_header(cat) do + n = length(cat.behaviors) + + [ + "## #{cat.name} — #{cat.grade}", + "", + "> Cosine similarity scores for #{n} behaviors.", + "" + ] + end + + defp cosine_behaviors_table(cat) do + rows = + Enum.map(cat.behaviors, fn b -> + "| #{b.behavior} | 
#{format_num(b.cosine)} | #{b.score} | #{b.grade} |" + end) + + [ + "| Behavior | Cosine | Score | Grade |", + "|----------|--------|-------|-------|" + | rows + ] ++ [""] + end + defp section_header(cat) do metric_summary = - cat.metric_scores - |> Enum.map(fn m -> "#{m.name}=#{format_num(m.value)}" end) - |> Enum.join(", ") + Enum.map_join(cat.metric_scores, ", ", fn m -> "#{m.name}=#{format_num(m.value)}" end) [ "## #{cat.name} — #{cat.grade}", @@ -76,66 +119,144 @@ defmodule CodeQA.HealthReport.Formatter.Plain do end end - defp worst_offenders_section(_cat, :summary), do: [] + defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) + defp format_num(value) when is_integer(value), do: to_string(value) + defp format_num(value), do: to_string(value) + + defp format_date(timestamp) when is_binary(timestamp) do + timestamp |> String.slice(0, 10) + end + + defp format_date(_), do: "unknown" + + defp top_issues_section([], _detail), do: [] + defp top_issues_section(_issues, :summary), do: [] + + defp top_issues_section(issues, _detail) do + rows = + Enum.map(issues, fn i -> + "| #{i.category}.#{i.behavior} | #{format_num(i.cosine)} | #{format_num(i.score)} |" + end) + + [ + "## Top Likely Issues", + "", + "> Ranked by cosine similarity — most negative means the file's metric profile best matches this anti-pattern.", + "", + "| Behavior | Cosine | Score |", + "|----------|--------|-------|" + | rows + ] ++ [""] + end + + defp pr_summary_section(nil), do: [] + + defp pr_summary_section(summary) do + delta_str = + if summary.score_delta >= 0, + do: "+#{summary.score_delta}", + else: "#{summary.score_delta}" + + status_str = "#{summary.files_modified} modified, #{summary.files_added} added" + + [ + "> **Score:** #{summary.base_grade} → #{summary.head_grade} | **Δ** #{delta_str} pts | **#{summary.blocks_flagged}** blocks flagged across #{summary.files_changed} files | #{status_str}", + "" + ] + end + + defp delta_section(nil), do: [] + 
+ defp delta_section(delta) do + base_agg = delta.base.aggregate + head_agg = delta.head.aggregate - defp worst_offenders_section(cat, _detail) do - offenders = Map.get(cat, :worst_offenders, []) + metrics = [ + {"Readability", "readability", "mean_flesch_adapted"}, + {"Complexity", "halstead", "mean_difficulty"}, + {"Duplication", "compression", "mean_redundancy"}, + {"Structure", "branching", "mean_branch_count"} + ] + + rows = Enum.flat_map(metrics, &format_metric_row(&1, base_agg, head_agg)) - if offenders == [] do + if rows == [] do [] else - averages = Map.new(cat.metric_scores, &{&1.name, &1.value}) - - rows = - Enum.map(offenders, fn f -> - issues = - f.metric_scores - |> Enum.map(fn m -> - avg = Map.get(averages, m.name) - avg_str = if avg, do: " (avg: #{format_num(avg)})", else: "" - "#{direction(m.good)}#{m.name}=#{format_num(m.value)}#{avg_str}" - end) - |> Enum.join("
") - - "| #{format_path(f.path)}
#{format_lines(f[:lines])} lines · #{format_size(f[:bytes])} | #{f.grade} | #{issues} |" - end) - [ - "### Worst Offenders", + "## Metric Changes", "", - "| File | Grade | Issues |", - "|------|-------|--------|" + "| Category | Base | Head | Δ |", + "|----------|------|------|---|" | rows ] ++ [""] end end - defp format_path(path) when byte_size(path) < 80, do: "`#{path}`" + defp format_metric_row({label, group, key}, base_agg, head_agg) do + base_val = get_in(base_agg, [group, key]) + head_val = get_in(head_agg, [group, key]) - defp format_path(path) do - case String.split(path, "/") do - [file] -> "`#{file}`" - parts -> Enum.join(Enum.drop(parts, -1), "/") <> "/
`#{List.last(parts)}`" + if is_number(base_val) and is_number(head_val) do + diff = Float.round(head_val - base_val, 2) + diff_str = if diff >= 0, do: "+#{format_num(diff)}", else: "#{format_num(diff)}" + ["| #{label} | #{format_num(base_val)} | #{format_num(head_val)} | #{diff_str} |"] + else + [] end end - defp direction(:high), do: "↑ " - defp direction(_), do: "↓ " + defp blocks_section([]), do: [] - defp format_lines(nil), do: "—" - defp format_lines(n), do: to_string(n) + defp blocks_section(top_blocks) do + block_parts = Enum.flat_map(top_blocks, &format_block/1) - defp format_size(nil), do: "—" - defp format_size(bytes) when bytes < 1024, do: "#{bytes} B" - defp format_size(bytes), do: "#{Float.round(bytes / 1024, 1)} KB" + [ + "## Top #{length(top_blocks)} Code Blocks by Impact", + "" + | block_parts + ] + end - defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_num(value) when is_integer(value), do: to_string(value) - defp format_num(value), do: to_string(value) + defp format_block(block) do + end_line = block.end_line || block.start_line + status_str = if block.status, do: " [#{block.status}]", else: "" - defp format_date(timestamp) when is_binary(timestamp) do - timestamp |> String.slice(0, 10) + header = + "### #{block.path}:#{block.start_line}-#{end_line}#{status_str}" + + subheader = + "#{block.type} · #{block.token_count} tokens" + + potential_lines = Enum.flat_map(block.potentials, &format_potential/1) + code_lines = format_code_block(block) + [header, subheader, "" | potential_lines] ++ ["" | code_lines] ++ [""] end - defp format_date(_), do: "unknown" + defp format_code_block(%{source: nil}), do: ["_Source code not available_"] + + defp format_code_block(%{source: source, start_line: start_line}) do + lines = String.split(source, "\n") + + numbered_lines = + lines + |> Enum.with_index(start_line) + |> Enum.map(fn {line, num} -> " #{String.pad_leading(to_string(num), 4)} │ #{line}" end) + + 
["```" | numbered_lines] ++ ["```"] + end + + defp format_potential(p) do + icon = severity_icon(p.severity) + delta_str = format_num(p.cosine_delta) + label = String.upcase(to_string(p.severity)) + line = " #{icon} #{label} #{p.category} / #{p.behavior} (Δ #{delta_str})" + fix = if p.fix_hint, do: [" → #{p.fix_hint}"], else: [] + [line | fix] + end + + defp severity_icon(:critical), do: "🔴" + defp severity_icon(:high), do: "🟠" + defp severity_icon(:medium), do: "🟡" + defp severity_icon(_), do: "⚪" end diff --git a/lib/codeqa/health_report/grader.ex b/lib/codeqa/health_report/grader.ex index 864cad3..d671a0b 100644 --- a/lib/codeqa/health_report/grader.ex +++ b/lib/codeqa/health_report/grader.ex @@ -1,6 +1,9 @@ defmodule CodeQA.HealthReport.Grader do @moduledoc "Scores metrics and assigns letter grades." + alias CodeQA.Config + alias CodeQA.HealthReport.Categories + @doc """ Score a single metric value (0-100) based on thresholds and direction. @@ -9,35 +12,60 @@ defmodule CodeQA.HealthReport.Grader do """ @spec score_metric(map(), number()) :: integer() def score_metric(%{good: :high, thresholds: t}, value) do - value |> score_high_is_good(t) |> clamp(0, 100) + score_by_direction(:high, value, t) |> clamp(0, 100) end def score_metric(%{good: _, thresholds: t}, value) do - value |> score_low_is_good(t) |> clamp(0, 100) + score_by_direction(:low, value, t) |> clamp(0, 100) end - # Lower values are better: below A = 100, A = 90, A-B = 70-90, etc. - defp score_low_is_good(val, t) do + @doc """ + Maps cosine similarity [-1, +1] to a score [0, 100] with linear interpolation + within each band. Result is clamped to [0, 100] and rounded to an integer. 
+ + | Cosine range | Score range | + |---------------|-------------| + | [0.5, 1.0] | [90, 100] | + | [0.2, 0.5) | [70, 90) | + | [0.0, 0.2) | [50, 70) | + | [-0.3, 0.0) | [30, 50) | + | [-1.0, -0.3) | [0, 30) | + """ + @spec score_cosine(float()) :: integer() + def score_cosine(cosine) do + cosine + |> cosine_to_score() + |> clamp(0, 100) + |> round() + end + + defp cosine_to_score(c) when c >= 0.5, do: interpolate_between(c, 0.5, 90, 1.0, 100) + defp cosine_to_score(c) when c >= 0.2, do: interpolate_between(c, 0.2, 70, 0.5, 90) + defp cosine_to_score(c) when c >= 0.0, do: interpolate_between(c, 0.0, 50, 0.2, 70) + defp cosine_to_score(c) when c >= -0.3, do: interpolate_between(c, -0.3, 30, 0.0, 50) + defp cosine_to_score(c), do: interpolate_between(c, -1.0, 0, -0.3, 30) + + # :low — lower values are better (t.a < t.b < t.c < t.d); below t.a = 100 + # :high — higher values are better (t.a > t.b > t.c > t.d); above t.a = 100 + defp score_by_direction(:low, val, t) do cond do val < t.a -> 100 val == t.a -> 90 val <= t.b -> interpolate_between(val, t.a, 90, t.b, 70) val <= t.c -> interpolate_between(val, t.b, 70, t.c, 50) val <= t.d -> interpolate_between(val, t.c, 50, t.d, 30) - true -> interpolate_below_d(val, t.d, 30) + true -> interpolate_beyond_d(val, t.d, 30) end end - # Higher values are better: above A = 100, A = 90, A-B = 70-90, etc. 
- # Thresholds are in descending order (a > b > c > d) - defp score_high_is_good(val, t) do + defp score_by_direction(:high, val, t) do cond do val > t.a -> 100 val == t.a -> 90 val >= t.b -> interpolate_between(val, t.a, 90, t.b, 70) val >= t.c -> interpolate_between(val, t.b, 70, t.c, 50) val >= t.d -> interpolate_between(val, t.c, 50, t.d, 30) - true -> interpolate_below_d_high(val, t.d, 30) + true -> interpolate_beyond_d(val, t.d, 30) end end @@ -52,27 +80,22 @@ defmodule CodeQA.HealthReport.Grader do end end - # Value beyond D threshold (low is good): score degrades below 30 - defp interpolate_below_d(_val, threshold_d, _score_at_d) when threshold_d == 0, do: 0 + # Score degrades below 30 when value is beyond the D threshold in either direction. + # abs(val - threshold_d) captures overshoot for :low and undershoot for :high uniformly. + defp interpolate_beyond_d(_val, 0, _score_at_d), do: 0 - defp interpolate_below_d(val, threshold_d, score_at_d) do - overshoot = (val - threshold_d) / threshold_d - round(Kernel.max(0, score_at_d - overshoot * score_at_d)) + defp interpolate_beyond_d(val, threshold_d, score_at_d) do + deviation = abs(val - threshold_d) / threshold_d + round(Kernel.max(0, score_at_d - deviation * score_at_d)) end - # Value below D threshold (high is good): score degrades below 30 - defp interpolate_below_d_high(_val, threshold_d, _score_at_d) when threshold_d == 0, do: 0 - - defp interpolate_below_d_high(val, threshold_d, score_at_d) do - undershoot = (threshold_d - val) / threshold_d - round(Kernel.max(0, score_at_d - undershoot * score_at_d)) + defp clamp(val, min_val, max_val) do + val |> Kernel.max(min_val) |> Kernel.min(max_val) end - defp clamp(val, min_val, max_val), do: val |> Kernel.max(min_val) |> Kernel.min(max_val) - @doc "Convert a numeric score (0-100) to a letter grade using the given scale." 
@spec grade_letter(number(), [{number(), String.t()}]) :: String.t() - def grade_letter(score, scale \\ CodeQA.HealthReport.Categories.default_grade_scale()) do + def grade_letter(score, scale \\ Categories.default_grade_scale()) do Enum.find_value(scale, "F", fn {min, letter} -> if score >= min, do: letter end) @@ -86,35 +109,14 @@ defmodule CodeQA.HealthReport.Grader do def grade_category( category, file_metrics, - scale \\ CodeQA.HealthReport.Categories.default_grade_scale() + scale \\ Categories.default_grade_scale() ) do scored = category.metrics - |> Enum.map(fn metric_def -> - value = get_in(file_metrics, [metric_def.source, metric_def.name]) - - if value do - %{ - name: metric_def.name, - source: metric_def.source, - weight: metric_def.weight, - good: metric_def.good, - value: value, - score: score_metric(metric_def, value) - } - end - end) + |> Enum.map(&score_metric_entry(&1, file_metrics)) |> Enum.reject(&is_nil/1) - total_weight = Enum.reduce(scored, 0.0, fn s, acc -> acc + s.weight end) - - score = - if total_weight > 0 do - weighted = Enum.reduce(scored, 0.0, fn s, acc -> acc + s.score * s.weight end) - round(weighted / total_weight) - else - 0 - end + score = weighted_category_score(scored) %{ key: category.key, @@ -125,6 +127,34 @@ defmodule CodeQA.HealthReport.Grader do } end + defp score_metric_entry(metric_def, file_metrics) do + value = get_in(file_metrics, [metric_def.source, metric_def.name]) + + if value do + %{ + name: metric_def.name, + source: metric_def.source, + weight: metric_def.weight, + good: metric_def.good, + value: value, + score: score_metric(metric_def, value) + } + end + end + + defp weighted_category_score([]), do: 0 + + defp weighted_category_score(scored) do + total_weight = Enum.reduce(scored, 0.0, fn s, acc -> acc + s.weight end) + + if total_weight > 0 do + weighted = Enum.reduce(scored, 0.0, fn s, acc -> acc + s.score * s.weight end) + round(weighted / total_weight) + else + 0 + end + end + @doc """ Grade a file's 
metrics against all categories. `file_metrics` is the `%{"entropy" => %{...}, "halstead" => %{...}}` map from analysis. @@ -133,7 +163,7 @@ defmodule CodeQA.HealthReport.Grader do def grade_file( categories, file_metrics, - scale \\ CodeQA.HealthReport.Categories.default_grade_scale() + scale \\ Categories.default_grade_scale() ) do Enum.map(categories, &grade_category(&1, file_metrics, scale)) end @@ -145,7 +175,7 @@ defmodule CodeQA.HealthReport.Grader do def grade_aggregate( categories, aggregate, - scale \\ CodeQA.HealthReport.Categories.default_grade_scale() + scale \\ Categories.default_grade_scale() ) do # Convert aggregate format (mean_X keys) to file-metric-like format file_like = @@ -161,23 +191,114 @@ defmodule CodeQA.HealthReport.Grader do Enum.map(categories, &grade_category(&1, file_like, scale)) end - @doc "Compute overall score as average of category scores." - @spec overall_score(list(), [{number(), String.t()}]) :: {integer(), String.t()} + @doc """ + Compute overall score as a weighted average of category scores. + + Each category's weight is looked up from `impact_map` by converting + `category.key` (atom) to string. Defaults to `1` if the key is absent. + + Backward compatible: calling with two arguments (empty `impact_map`) produces + the same arithmetic mean as the old `/2` signature. 
+ """ + @spec overall_score( + categories :: [map()], + grade_scale :: [{number(), String.t()}], + impact_map :: %{String.t() => pos_integer()} + ) :: {integer(), String.t()} def overall_score( category_grades, - scale \\ CodeQA.HealthReport.Categories.default_grade_scale() + scale \\ Categories.default_grade_scale(), + impact_map \\ %{} ) do if category_grades == [] do {0, "F"} else - avg = - Enum.reduce(category_grades, 0, fn g, acc -> acc + g.score end) - |> div(length(category_grades)) + {weighted_sum, total_impact} = + Enum.reduce(category_grades, {0, 0}, fn g, {ws, ti} -> + impact = Map.get(impact_map, to_string(g.key), 1) + {ws + g.score * impact, ti + impact} + end) + avg = round(weighted_sum / total_impact) {avg, grade_letter(avg, scale)} end end + @doc """ + Grade codebase aggregate metrics using cosine similarity. + + Accepts `cosines_by_category`, a map of category string keys to lists of + behavior cosine entries as returned by + `Enum.group_by(SampleRunner.diagnose_aggregate(...), & &1.category)`. + + The caller is responsible for computing `cosines_by_category` so that + `diagnose_aggregate/2` is invoked only once across the report pipeline. + + Categories with zero behaviors are skipped. 
+ """ + @spec grade_cosine_categories( + cosines_by_category :: %{String.t() => [map()]}, + worst_files :: %{String.t() => [map()]}, + grade_scale :: [{number(), String.t()}] + ) :: [map()] + def grade_cosine_categories( + cosines_by_category, + worst_files, + scale \\ Categories.default_grade_scale() + ) do + threshold = Config.cosine_significance_threshold() + + cosines_by_category + |> Enum.map(fn {category, behaviors} -> + behavior_entries = + score_behavior_entries(behaviors, threshold, worst_files, scale, category) + + category_score = average_behavior_score(behavior_entries) + build_cosine_category(category, category_score, behavior_entries, scale) + end) + end + + defp score_behavior_entries(behaviors, threshold, worst_files, scale, category) do + behaviors + |> Enum.reject(fn b -> abs(b.cosine) < threshold end) + |> Enum.map(&score_behavior_entry(&1, worst_files, scale, category)) + end + + defp score_behavior_entry(b, worst_files, scale, category) do + cosine_score = score_cosine(b.cosine) + + %{ + behavior: b.behavior, + cosine: b.cosine, + score: cosine_score, + grade: grade_letter(cosine_score, scale), + worst_offenders: Map.get(worst_files, "#{category}.#{b.behavior}", []) + } + end + + defp average_behavior_score([]), do: 50 + + defp average_behavior_score(entries) do + round(Enum.sum(Enum.map(entries, & &1.score)) / length(entries)) + end + + defp build_cosine_category(category, category_score, behavior_entries, scale) do + %{ + type: :cosine, + key: category, + name: humanize_category(category), + score: category_score, + grade: grade_letter(category_score, scale), + behaviors: behavior_entries + } + end + + defp humanize_category(slug) do + slug + |> String.split("_") + |> Enum.map_join(" ", &String.capitalize/1) + end + @doc """ Find worst offender files for a category. Returns top N files sorted by worst score. `all_file_metrics` is `%{"path" => %{"metrics" => %{...}}}` from analysis results. 
@@ -187,23 +308,51 @@ defmodule CodeQA.HealthReport.Grader do category, all_file_metrics, top_n, - scale \\ CodeQA.HealthReport.Categories.default_grade_scale() + scale \\ Categories.default_grade_scale() ) do + # NOTE: threshold metric scores are file-level aggregates; line-level attribution would require + # each AST node to carry its own per-metric values so that the node with the highest + # contribution to the bad metric score could be identified and reported directly. all_file_metrics |> Enum.map(fn {path, file_data} -> metrics = Map.get(file_data, "metrics", %{}) graded = grade_category(category, metrics, scale) + %{ path: path, score: graded.score, grade: graded.grade, metric_scores: graded.metric_scores, lines: file_data["lines"], - bytes: file_data["bytes"] + bytes: file_data["bytes"], + top_nodes: top_3_nodes(Map.get(file_data, "nodes")) } end) |> Enum.filter(fn f -> f.metric_scores != [] end) |> Enum.sort_by(& &1.score, :asc) |> Enum.take(top_n) end + + @doc """ + Returns the top 3 nodes by refactoring potential impact, ranked by cosine_delta sum. + + Only considers top-level nodes; children are not traversed. Returns an empty list + if input is nil, empty, or nodes lack refactoring_potentials data. 
+ """ + @spec top_3_nodes(list() | nil) :: list() + def top_3_nodes(nil), do: [] + def top_3_nodes([]), do: [] + + def top_3_nodes(nodes) when is_list(nodes) do + nodes + |> Enum.sort_by(&node_impact_score/1, :desc) + |> Enum.take(3) + end + + defp node_impact_score(%{"refactoring_potentials" => potentials}) + when is_list(potentials) and potentials != [] do + Enum.sum(Enum.map(potentials, & &1["cosine_delta"])) + end + + defp node_impact_score(_), do: 0.0 end diff --git a/lib/codeqa/health_report/top_blocks.ex b/lib/codeqa/health_report/top_blocks.ex new file mode 100644 index 0000000..5ceddf5 --- /dev/null +++ b/lib/codeqa/health_report/top_blocks.ex @@ -0,0 +1,222 @@ +defmodule CodeQA.HealthReport.TopBlocks do + @moduledoc "Assembles the top_blocks report section from analysis node data." + + alias CodeQA.CombinedMetrics.Scorer + + @min_tokens 10 + @severity_critical 0.50 + @severity_high 0.25 + @severity_medium 0.10 + @gap_floor 0.01 + @top_n 10 + @default_min_lines 3 + @default_max_lines 20 + + defp build_fix_hint_lookup do + Scorer.all_yamls() + |> Enum.flat_map(fn {yaml_path, data} -> + category = yaml_path |> Path.basename() |> String.trim_trailing(".yml") + Enum.flat_map(data, &hints_for_behavior(category, &1)) + end) + |> Map.new() + end + + defp hints_for_behavior(category, {behavior, behavior_data}) when is_map(behavior_data) do + case Map.get(behavior_data, "_fix_hint") do + nil -> [] + hint -> [{{category, behavior}, hint}] + end + end + + defp hints_for_behavior(_category, _entry), do: [] + + @spec build(map(), [struct()], map(), keyword()) :: [map()] + def build(analysis_results, changed_files, codebase_cosine_lookup, opts \\ []) do + base_path = get_in(analysis_results, ["metadata", "path"]) || "." 
+ + analysis_results + |> collect_enriched_blocks(changed_files, codebase_cosine_lookup, opts) + # Rank by highest cosine_delta and take top N + |> Enum.sort_by(&(-max_delta(&1))) + |> Enum.take(@top_n) + # Add source code for each block + |> Enum.map(&add_source_code(&1, base_path)) + end + + @doc """ + Returns a map of category => worst offending block for that category. + Only includes blocks that overlap with the diff (if diff_line_ranges provided). + """ + @spec worst_per_category(map(), [struct()], map(), keyword()) :: %{String.t() => map()} + def worst_per_category(analysis_results, changed_files, codebase_cosine_lookup, opts \\ []) do + base_path = get_in(analysis_results, ["metadata", "path"]) || "." + + all_blocks = + collect_enriched_blocks(analysis_results, changed_files, codebase_cosine_lookup, opts) + + # Group blocks by category, finding the worst block per category + all_blocks + |> Enum.flat_map(fn block -> + # Each block may contribute to multiple categories via its potentials + block.potentials + |> Enum.map(fn potential -> + {potential.category, block, potential.cosine_delta} + end) + end) + |> Enum.group_by(&elem(&1, 0), fn {_cat, block, delta} -> {block, delta} end) + |> Enum.map(fn {category, block_deltas} -> + # Find the block with highest cosine_delta for this category + {worst_block, _delta} = Enum.max_by(block_deltas, fn {_block, delta} -> delta end) + {category, add_source_code(worst_block, base_path)} + end) + |> Map.new() + end + + # Shared logic for collecting and enriching blocks + defp collect_enriched_blocks(analysis_results, changed_files, codebase_cosine_lookup, opts) do + files = Map.get(analysis_results, "files", %{}) + fix_hints = build_fix_hint_lookup() + + min_lines = Keyword.get(opts, :block_min_lines, @default_min_lines) + max_lines = Keyword.get(opts, :block_max_lines, @default_max_lines) + diff_line_ranges = Keyword.get(opts, :diff_line_ranges, %{}) + + file_entries = + if changed_files == [] do + Enum.map(files, fn 
{path, data} -> {path, nil, data} end) + else + changed_index = Map.new(changed_files, &{&1.path, &1.status}) + + files + |> Enum.filter(fn {path, _} -> Map.has_key?(changed_index, path) end) + |> Enum.map(fn {path, data} -> {path, Map.get(changed_index, path), data} end) + end + + # Flatten all blocks across all files, enrich with path + file_entries + |> Enum.flat_map(fn {path, status, file_data} -> + path_diff_ranges = Map.get(diff_line_ranges, path, []) + + file_data + |> Map.get("nodes", []) + |> Enum.flat_map(&collect_nodes/1) + |> Enum.filter(&(&1["token_count"] >= @min_tokens)) + |> Enum.filter(&block_in_line_range?(&1, min_lines, max_lines)) + |> filter_by_diff_overlap(path_diff_ranges, diff_line_ranges) + |> Enum.map(&enrich_block(&1, codebase_cosine_lookup, fix_hints)) + |> Enum.reject(&(&1.potentials == [])) + |> Enum.map(&Map.merge(&1, %{path: path, status: status})) + end) + end + + @spec block_in_line_range?(map(), pos_integer(), pos_integer()) :: boolean() + defp block_in_line_range?(node, min_lines, max_lines) do + start_line = node["start_line"] || 1 + end_line = node["end_line"] || start_line + line_count = end_line - start_line + 1 + line_count >= min_lines and line_count <= max_lines + end + + # When no diff_line_ranges provided (empty map), show all blocks - no filtering needed + @spec filter_by_diff_overlap([map()], [{pos_integer(), pos_integer()}], map()) :: [map()] + defp filter_by_diff_overlap(blocks, _path_ranges, diff_line_ranges) + when map_size(diff_line_ranges) == 0, + do: blocks + + # When diff_line_ranges provided, filter blocks by overlap + defp filter_by_diff_overlap(blocks, path_ranges, _diff_line_ranges) do + Enum.filter(blocks, &block_overlaps_diff?(&1, path_ranges)) + end + + @spec block_overlaps_diff?(map(), [{pos_integer(), pos_integer()}]) :: boolean() + defp block_overlaps_diff?(_node, []), do: false + + defp block_overlaps_diff?(node, path_ranges) do + block_start = node["start_line"] || 1 + block_end = node["end_line"] 
|| block_start + + Enum.any?(path_ranges, fn {diff_start, diff_end} -> + ranges_overlap?(block_start, block_end, diff_start, diff_end) + end) + end + + @spec ranges_overlap?(pos_integer(), pos_integer(), pos_integer(), pos_integer()) :: boolean() + defp ranges_overlap?(start1, end1, start2, end2) do + start1 <= end2 and start2 <= end1 + end + + defp collect_nodes(node) do + children = node |> Map.get("children", []) |> Enum.flat_map(&collect_nodes/1) + [node | children] + end + + defp enrich_block(node, cosine_lookup, fix_hints) do + potentials = + node + |> Map.get("refactoring_potentials", []) + |> Enum.map(&enrich_potential(&1, cosine_lookup, fix_hints)) + |> Enum.reject(&is_nil/1) + |> Enum.sort_by(& &1.cosine_delta, :desc) + + %{ + start_line: node["start_line"], + end_line: node["end_line"], + type: node["type"], + token_count: node["token_count"], + potentials: potentials + } + end + + defp enrich_potential(p, cosine_lookup, fix_hints) do + category = p["category"] + behavior = p["behavior"] + cosine_delta = p["cosine_delta"] + + codebase_cosine = Map.get(cosine_lookup, {category, behavior}, 0.0) + gap = max(@gap_floor, 1.0 - codebase_cosine) + severity = classify(cosine_delta / gap) + + if severity == :filtered do + nil + else + %{ + category: category, + behavior: behavior, + cosine_delta: cosine_delta, + severity: severity, + fix_hint: Map.get(fix_hints, {category, behavior}) + } + end + end + + defp classify(ratio) when ratio > @severity_critical, do: :critical + defp classify(ratio) when ratio > @severity_high, do: :high + defp classify(ratio) when ratio > @severity_medium, do: :medium + defp classify(_ratio), do: :filtered + + defp max_delta(%{potentials: []}), do: 0.0 + + defp max_delta(%{potentials: potentials}), + do: Enum.max_by(potentials, & &1.cosine_delta).cosine_delta + + defp add_source_code(block, base_path) do + full_path = Path.join(base_path, block.path) + start_line = block.start_line + end_line = block.end_line || start_line + + source = 
+ case File.read(full_path) do + {:ok, content} -> + content + |> String.split("\n") + |> Enum.slice((start_line - 1)..(end_line - 1)//1) + |> Enum.join("\n") + + {:error, _} -> + nil + end + + lang = CodeQA.Language.detect(block.path).name() + Map.merge(block, %{source: source, language: lang}) + end +end diff --git a/lib/codeqa/languages/code/native/cpp.ex b/lib/codeqa/languages/code/native/cpp.ex new file mode 100644 index 0000000..31cbb4e --- /dev/null +++ b/lib/codeqa/languages/code/native/cpp.ex @@ -0,0 +1,49 @@ +defmodule CodeQA.Languages.Code.Native.Cpp do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "cpp" + + @impl true + def extensions, do: ~w[c cpp cc cxx hpp h hh] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do class struct namespace using include template typename + return new delete this public private protected virtual override static + const constexpr inline extern try catch throw switch case break continue + default auto void true false nullptr + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= -> :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # * + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class struct namespace template] + + @impl true + def branch_keywords, do: ~w[else catch case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected static virtual override inline] + + @impl true + def module_keywords, do: ~w[class struct namespace enum] +end diff --git a/lib/codeqa/languages/code/native/go.ex b/lib/codeqa/languages/code/native/go.ex new file mode 100644 index 0000000..b728aab --- /dev/null +++ b/lib/codeqa/languages/code/native/go.ex @@ -0,0 +1,51 @@ +defmodule CodeQA.Languages.Code.Native.Go do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "go" + + @impl true + def extensions, do: ~w[go] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for func type struct interface package import return var const + map chan go defer select switch case break continue default fallthrough + range make new append len cap close nil true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= := + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[func type struct interface] + + @impl true + def branch_keywords, do: ~w[else case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[func] + + @impl true + def import_keywords, do: ~w[import package] +end diff --git a/lib/codeqa/languages/code/native/haskell.ex b/lib/codeqa/languages/code/native/haskell.ex new file mode 100644 index 0000000..48cd646 --- /dev/null +++ b/lib/codeqa/languages/code/native/haskell.ex @@ -0,0 +1,57 @@ +defmodule CodeQA.Languages.Code.Native.Haskell do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "haskell" + + @impl true + def extensions, do: ~w[hs lhs] + + @impl true + def comment_prefixes, do: ~w[--] + + @impl true + def block_comments, do: [{"{-", "-}"}] + + @impl true + def keywords, do: ~w[ + if else then for do let in where module import data type newtype class + instance deriving case of return True False Nothing Just do + infixl infixr infix qualified as hiding + ] + + @impl true + def operators, do: ~w[ + == /= <= >= + - * / ^ && || ! $ . <$> <*> >>= >> -> <- :: = | @ ~ + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | @ -> <- :: + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[data type newtype class instance] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: [] + + @impl true + def function_keywords, do: ~w[where let] + + @impl true + def module_keywords, do: ~w[module class instance] + + @impl true + def import_keywords, do: ~w[import] + + @impl true + def test_keywords, do: ~w[test it describe prop] + + @impl true + def uses_colon_indent?, do: true +end diff --git a/lib/codeqa/languages/code/native/ocaml.ex b/lib/codeqa/languages/code/native/ocaml.ex new file mode 100644 index 0000000..d1e8b21 --- /dev/null +++ b/lib/codeqa/languages/code/native/ocaml.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Ocaml do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "ocaml" + + @impl true + def extensions, do: ~w[ml mli] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [{"(*", "*)"}] + + @impl true + def keywords, do: ~w[ + let rec fun if then else for while do done begin end match with type module + open struct sig functor val mutable exception raise try when and or not in + of as include class object method inherit new virtual + ] + + @impl true + def operators, do: ~w[ + == = != <> <= >= + - * / mod << >> & | ^ ~ && || @ :: |> -> <- := ! + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | @ -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[let type module class] + + @impl true + def branch_keywords, do: ~w[else with when] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: ~w[mutable virtual] + + @impl true + def function_keywords, do: ~w[let fun] + + @impl true + def module_keywords, do: ~w[module struct functor class] + + @impl true + def import_keywords, do: ~w[open include] +end diff --git a/lib/codeqa/languages/code/native/rust.ex b/lib/codeqa/languages/code/native/rust.ex new file mode 100644 index 0000000..0616834 --- /dev/null +++ b/lib/codeqa/languages/code/native/rust.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Rust do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "rust" + + @impl true + def extensions, do: ~w[rs] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while loop fn struct enum trait impl use mod pub let mut const + static return match type where as in ref move async await dyn unsafe extern + crate self super true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= -> => :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fn struct enum trait impl mod] + + @impl true + def branch_keywords, do: ~w[else match] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[pub] + + @impl true + def function_keywords, do: ~w[fn] + + @impl true + def module_keywords, do: ~w[impl trait struct enum] + + @impl true + def import_keywords, do: ~w[use extern] +end diff --git a/lib/codeqa/languages/code/native/swift.ex b/lib/codeqa/languages/code/native/swift.ex new file mode 100644 index 0000000..0422528 --- /dev/null +++ b/lib/codeqa/languages/code/native/swift.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Swift do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "swift" + + @impl true + def extensions, do: ~w[swift] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while repeat func class struct enum protocol extension import + return let var guard defer do try catch throw switch case break continue + default in as is init self super nil true false async await + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || ?? = += -= *= /= %= -> => + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[func class struct enum protocol extension] + + @impl true + def branch_keywords, do: ~w[else catch case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private internal fileprivate open] + + @impl true + def function_keywords, do: ~w[func] + + @impl true + def module_keywords, do: ~w[class struct protocol extension enum] + + @impl true + def import_keywords, do: ~w[import] +end diff --git a/lib/codeqa/languages/code/native/zig.ex b/lib/codeqa/languages/code/native/zig.ex new file mode 100644 index 0000000..f3e13f8 --- /dev/null +++ b/lib/codeqa/languages/code/native/zig.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Zig do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "zig" + + @impl true + def extensions, do: ~w[zig] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + const var fn if else for while switch return pub try catch error defer errdefer + comptime inline struct enum union test break continue null undefined unreachable + async await suspend resume orelse anytype anyerror bool void noreturn type + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= orelse catch + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fn struct enum union] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[pub inline comptime] + + @impl true + def function_keywords, do: ~w[fn] + + @impl true + def module_keywords, do: ~w[struct enum union] + + @impl true + def test_keywords, do: ~w[test] +end diff --git a/lib/codeqa/languages/code/scripting/julia.ex b/lib/codeqa/languages/code/scripting/julia.ex new file mode 100644 index 0000000..8f859d1 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/julia.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Scripting.Julia do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "julia" + + @impl true + def extensions, do: ~w[jl] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [{"#=", "=#"}] + + @impl true + def keywords, do: ~w[ + if else elseif for while do end function return module import using export + struct mutable abstract type primitive begin let local global const try catch + finally throw macro quote true false nothing + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ^ << >> & | ~ && || = += -= *= /= ÷ → ← |> + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function struct macro module] + + @impl true + def branch_keywords, do: ~w[else elseif catch finally] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[function macro] + + @impl true + def module_keywords, do: ~w[module struct] + + @impl true + def import_keywords, do: ~w[import using] + + @impl true + def test_keywords, do: ~w[@test @testset] +end diff --git a/lib/codeqa/languages/code/scripting/lua.ex b/lib/codeqa/languages/code/scripting/lua.ex new file mode 100644 index 0000000..7ae8e9d --- /dev/null +++ b/lib/codeqa/languages/code/scripting/lua.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.Languages.Code.Scripting.Lua do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "lua" + + @impl true + def extensions, do: ~w[lua] + + @impl true + def comment_prefixes, do: ~w[--] + + @impl true + def block_comments, do: [{"--[[", "]]"}] + + @impl true + def keywords, do: ~w[ + and break do else elseif end false for function goto if in local nil not or + repeat return then true until while + ] + + @impl true + def operators, do: ~w[ + == ~= <= >= + - * / % ^ # & | ~ << >> // .. = and or not + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function local] + + @impl true + def branch_keywords, do: ~w[else elseif] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[function] + + @impl true + def import_keywords, do: ~w[require] +end diff --git a/lib/codeqa/languages/code/scripting/perl.ex b/lib/codeqa/languages/code/scripting/perl.ex new file mode 100644 index 0000000..3155f1c --- /dev/null +++ b/lib/codeqa/languages/code/scripting/perl.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Scripting.Perl do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "perl" + + @impl true + def extensions, do: ~w[pl pm t] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elsif unless for foreach while do until sub my our local use require + package return last next redo goto print say die warn eval and or not defined + undef true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= eq ne lt gt le ge + - * / % ** . x = += -= *= /= .= && || ! ~ & | + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ $ % + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[sub package] + + @impl true + def branch_keywords, do: ~w[else elsif] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def function_keywords, do: ~w[sub] + + @impl true + def module_keywords, do: ~w[package] + + @impl true + def import_keywords, do: ~w[use require] + + @impl true + def test_keywords, do: ~w[ok is isnt like unlike cmp_ok] +end diff --git a/lib/codeqa/languages/code/scripting/php.ex b/lib/codeqa/languages/code/scripting/php.ex new file mode 100644 index 0000000..294b9a1 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/php.ex @@ -0,0 +1,55 @@ +defmodule CodeQA.Languages.Code.Scripting.PHP do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "php" + + @impl true + def extensions, do: ~w[php phtml php3 php4 php5 php7 php8] + + @impl true + def comment_prefixes, do: ~w[// #] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else elseif for foreach while do function class interface trait namespace + use return new echo print public private protected static abstract final + try catch finally throw switch case break continue default include require + include_once require_once extends implements null true false + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % ** << >> & | ^ ~ && || ?? = += -= *= /= %= -> :: => + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # $ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function class interface trait namespace] + + @impl true + def branch_keywords, do: ~w[else elseif catch finally case default] + + @impl true + def block_end_tokens, do: ~w[} endif endfor endforeach endwhile endswitch] + + @impl true + def access_modifiers, do: ~w[public private protected static abstract final] + + @impl true + def function_keywords, do: ~w[function fn] + + @impl true + def module_keywords, do: ~w[class interface trait namespace] + + @impl true + def import_keywords, do: ~w[use namespace] +end diff --git a/lib/codeqa/languages/code/scripting/python.ex b/lib/codeqa/languages/code/scripting/python.ex new file mode 100644 index 0000000..e1c4bb4 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/python.ex @@ -0,0 +1,57 @@ +defmodule CodeQA.Languages.Code.Scripting.Python do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "python" + + @impl true + def extensions, do: ~w[py pyi] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elif for while def class import from return pass break continue + not and or in is lambda with as try except finally raise yield async await + global nonlocal del assert True False None + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ** // << >> & | ^ ~ = += -= *= /= %= **= //= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class async] + + @impl true + def branch_keywords, do: ~w[elif else except finally] + + @impl true + def block_end_tokens, do: [] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def async] + + @impl true + def module_keywords, do: ~w[class] + + @impl true + def import_keywords, do: ~w[import from] + + @impl true + def uses_colon_indent?, do: true +end diff --git a/lib/codeqa/languages/code/scripting/r.ex b/lib/codeqa/languages/code/scripting/r.ex new file mode 100644 index 0000000..d735d2b --- /dev/null +++ b/lib/codeqa/languages/code/scripting/r.ex @@ -0,0 +1,49 @@ +defmodule CodeQA.Languages.Code.Scripting.R do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "r" + + @impl true + def extensions, do: ~w[r R Rmd rmd] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else for while repeat break next return function TRUE FALSE NULL NA Inf NaN + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / ^ %% %/% %in% <- -> = & | ! && || ~ : :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def function_keywords, do: ~w[function] + + @impl true + def import_keywords, do: ~w[library require source] + + @impl true + def test_keywords, do: ~w[test_that expect_equal expect_true describe it] +end diff --git a/lib/codeqa/languages/code/scripting/ruby.ex b/lib/codeqa/languages/code/scripting/ruby.ex new file mode 100644 index 0000000..d1e9761 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/ruby.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Scripting.Ruby do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "ruby" + + @impl true + def extensions, do: ~w[rb rake gemspec] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elsif unless for while until def class module do end return begin + rescue ensure raise yield include extend require require_relative + attr_accessor attr_reader attr_writer then case when next break in + and or not true false nil self super + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ** << >> & | ^ ~ = += -= *= /= %= **= <=> === =~ + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; @ | # ? 
+ ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class module] + + @impl true + def branch_keywords, do: ~w[else elsif rescue ensure when] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def] + + @impl true + def module_keywords, do: ~w[class module] + + @impl true + def import_keywords, do: ~w[require require_relative include] + + @impl true + def test_keywords, do: ~w[it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/code/scripting/shell.ex b/lib/codeqa/languages/code/scripting/shell.ex new file mode 100644 index 0000000..710d28a --- /dev/null +++ b/lib/codeqa/languages/code/scripting/shell.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.Languages.Code.Scripting.Shell do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "shell" + + @impl true + def extensions, do: ~w[sh bash zsh fish] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elif fi for while do done case esac function return then in until + select break continue exit local export readonly unset + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % && || | & > < >> << = += -= *= /= %= -eq -ne -lt -gt -le -ge + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; @ # $ ! ? 
| + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function] + + @impl true + def branch_keywords, do: ~w[else elif case] + + @impl true + def block_end_tokens, do: ~w[fi done esac] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[function] +end diff --git a/lib/codeqa/languages/code/vm/clojure.ex b/lib/codeqa/languages/code/vm/clojure.ex new file mode 100644 index 0000000..5dd149b --- /dev/null +++ b/lib/codeqa/languages/code/vm/clojure.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.Clojure do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "clojure" + + @impl true + def extensions, do: ~w[clj cljs cljc edn] + + @impl true + def comment_prefixes, do: ~w[;] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + def defn defmacro let fn if do when cond case for loop recur ns require use + import try catch finally throw quote defprotocol defrecord deftype reify + extend-type extend-protocol nil true false and or not + ] + + @impl true + def operators, do: ~w[ + = == not= < > <= >= + - * / mod rem quot and or not + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; # @ ^ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def defn defmacro defprotocol defrecord deftype] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[)] + + @impl true + def function_keywords, do: ~w[defn fn] + + @impl true + def module_keywords, do: ~w[ns defprotocol defrecord] + + @impl true + def import_keywords, do: ~w[ns require use import] + + @impl true + def test_keywords, do: ~w[deftest is testing] +end diff --git a/lib/codeqa/languages/code/vm/csharp.ex b/lib/codeqa/languages/code/vm/csharp.ex new file mode 100644 index 0000000..85edce7 --- /dev/null +++ b/lib/codeqa/languages/code/vm/csharp.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.CSharp do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "csharp" + + @impl true + def extensions, do: ~w[cs csx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for foreach while do class interface struct enum namespace using + return var new this base public private protected internal static abstract + virtual override sealed async await try catch finally throw switch case + break continue default in out ref void true false null readonly const + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class interface struct enum namespace] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: + ~w[public private protected internal static abstract virtual override sealed readonly const async] + + @impl true + def module_keywords, do: ~w[class interface struct enum namespace] + + @impl true + def import_keywords, do: ~w[using namespace] +end diff --git a/lib/codeqa/languages/code/vm/dart.ex b/lib/codeqa/languages/code/vm/dart.ex new file mode 100644 index 0000000..e821e22 --- /dev/null +++ b/lib/codeqa/languages/code/vm/dart.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Vm.Dart do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "dart" + + @impl true + def extensions, do: ~w[dart] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do switch case break continue return class extends implements + with new final const var void null true false import export part library + abstract static dynamic async await yield try catch finally throw rethrow + enum typedef mixin factory is as in + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ~/ << >> & | ^ ~ && || ?? = += -= *= /= %= ??= -> => + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class enum typedef mixin] + + @impl true + def branch_keywords, do: ~w[else catch finally case] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[static final const abstract] + + @impl true + def function_keywords, do: ~w[void async] + + @impl true + def module_keywords, do: ~w[class enum mixin] + + @impl true + def import_keywords, do: ~w[import export] + + @impl true + def test_keywords, do: ~w[test group setUp tearDown expect] +end diff --git a/lib/codeqa/languages/code/vm/elixir.ex b/lib/codeqa/languages/code/vm/elixir.ex new file mode 100644 index 0000000..2eab027 --- /dev/null +++ b/lib/codeqa/languages/code/vm/elixir.ex @@ -0,0 +1,59 @@ +defmodule CodeQA.Languages.Code.Vm.Elixir do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "elixir" + + @impl true + def extensions, do: ~w[ex exs] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else unless for do end def defp defmodule defmacro defmacrop defprotocol + defimpl defguard defdelegate defstruct case cond with when fn try rescue + catch raise receive in not and or true false nil + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % << >> & | ^ ~ && || |> <> <- -> = ! not and or in + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, + do: + ~w[def defp defmodule defmacro defmacrop defprotocol defimpl defdelegate defoverridable defguard] + + @impl true + def branch_keywords, do: ~w[else rescue catch ensure cond when case] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def defp defmacro defmacrop defdelegate defguard] + + @impl true + def module_keywords, do: ~w[defmodule defprotocol defimpl] + + @impl true + def import_keywords, do: ~w[import require use alias] + + @impl true + def test_keywords, do: ~w[test describe] +end diff --git a/lib/codeqa/languages/code/vm/erlang.ex b/lib/codeqa/languages/code/vm/erlang.ex new file mode 100644 index 0000000..c835dd6 --- /dev/null +++ b/lib/codeqa/languages/code/vm/erlang.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.Erlang do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "erlang" + + @impl true + def extensions, do: ~w[erl hrl] + + @impl true + def comment_prefixes, do: ~w[%] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if case when of begin end receive after fun try catch throw error exit + module export import define record true false ok undefined andalso orelse + not band bor bxor bnot bsl bsr div rem + ] + + @impl true + def operators, do: ~w[ + == /= =< >= =:= =/= + - * / ! <- -> :: | . , ; : + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[-module -record -define] + + @impl true + def branch_keywords, do: ~w[of after catch] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[fun] + + @impl true + def module_keywords, do: ~w[-module] + + @impl true + def import_keywords, do: ~w[-import -include] + + @impl true + def test_keywords, do: ~w[_test_ _test] +end diff --git a/lib/codeqa/languages/code/vm/fsharp.ex b/lib/codeqa/languages/code/vm/fsharp.ex new file mode 100644 index 0000000..9c7792f --- /dev/null +++ b/lib/codeqa/languages/code/vm/fsharp.ex @@ -0,0 +1,60 @@ +defmodule CodeQA.Languages.Code.Vm.Fsharp do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "fsharp" + + @impl true + def extensions, do: ~w[fs fsi fsx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"(*", "*)"}] + + @impl true + def keywords, do: ~w[ + let rec if then else for while do match with type module open namespace val + mutable abstract member override new return yield async await try finally + raise true false null and or not in when downto to + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = |> <| >> << -> <- :: @ ? + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | @ # -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[let type module] + + @impl true + def branch_keywords, do: ~w[else with] + + @impl true + def block_end_tokens, do: [] + + @impl true + def access_modifiers, do: ~w[public private protected internal static abstract override] + + @impl true + def function_keywords, do: ~w[let fun] + + @impl true + def module_keywords, do: ~w[module namespace type] + + @impl true + def import_keywords, do: ~w[open] + + @impl true + def test_keywords, do: ~w[testCase test testProperty] + + @impl true + def uses_colon_indent?, do: true +end diff --git a/lib/codeqa/languages/code/vm/java.ex b/lib/codeqa/languages/code/vm/java.ex new file mode 100644 index 0000000..fa018e0 --- /dev/null +++ b/lib/codeqa/languages/code/vm/java.ex @@ -0,0 +1,52 @@ +defmodule CodeQA.Languages.Code.Vm.Java do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "java" + + @impl true + def extensions, do: ~w[java] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do class interface extends implements import package + return new this super public private protected static abstract final + synchronized volatile try catch finally throw throws switch case break + continue default void true false null instanceof + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> >>> & | ^ ~ && || = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class interface] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected static abstract final synchronized] + + @impl true + def module_keywords, do: ~w[class interface enum] + + @impl true + def import_keywords, do: ~w[import package] +end diff --git a/lib/codeqa/languages/code/vm/kotlin.ex b/lib/codeqa/languages/code/vm/kotlin.ex new file mode 100644 index 0000000..4c286c2 --- /dev/null +++ b/lib/codeqa/languages/code/vm/kotlin.ex @@ -0,0 +1,55 @@ +defmodule CodeQA.Languages.Code.Vm.Kotlin do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "kotlin" + + @impl true + def extensions, do: ~w[kt kts] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do fun class object interface data sealed abstract enum + companion import package return val var when is as in out by override open + final private protected public internal suspend inline reified crossinline + noinline try catch finally throw break continue null true false this super init + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % << >> & | ^ ~ && || ?: = += -= *= /= %= -> => :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fun class object interface data sealed abstract enum] + + @impl true + def branch_keywords, do: ~w[else when catch finally] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected internal override open abstract final] + + @impl true + def function_keywords, do: ~w[fun] + + @impl true + def module_keywords, do: ~w[class interface object] + + @impl true + def import_keywords, do: ~w[import package] +end diff --git a/lib/codeqa/languages/code/vm/scala.ex b/lib/codeqa/languages/code/vm/scala.ex new file mode 100644 index 0000000..08ac7ab --- /dev/null +++ b/lib/codeqa/languages/code/vm/scala.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Vm.Scala do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "scala" + + @impl true + def extensions, do: ~w[scala sc] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do def class object trait extends with new return import + package val var type match case sealed abstract override final protected + private implicit lazy yield try catch finally throw true false null this super + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= => <- <: >: : + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class object trait type] + + @impl true + def branch_keywords, do: ~w[else catch case finally] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: ~w[public private protected override abstract final sealed implicit lazy] + + @impl true + def function_keywords, do: ~w[def] + + @impl true + def module_keywords, do: ~w[class object trait package] + + @impl true + def import_keywords, do: ~w[import package] + + @impl true + def test_keywords, do: ~w[test it describe should] +end diff --git a/lib/codeqa/languages/code/web/javascript.ex b/lib/codeqa/languages/code/web/javascript.ex new file mode 100644 index 0000000..87f48f5 --- /dev/null +++ b/lib/codeqa/languages/code/web/javascript.ex @@ -0,0 +1,57 @@ +defmodule CodeQA.Languages.Code.Web.JavaScript do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "javascript" + + @impl true + def extensions, do: ~w[js mjs cjs jsx vue svelte] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while function class return var let const import export from + new this typeof instanceof try catch finally throw switch case break + continue default delete in of async await yield true false null undefined + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % ** << >> >>> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function class async] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[export static] + + @impl true + def function_keywords, do: ~w[function async] + + @impl true + def module_keywords, do: ~w[class] + + @impl true + def import_keywords, do: ~w[import] + + @impl true + def test_keywords, do: ~w[test it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/code/web/typescript.ex b/lib/codeqa/languages/code/web/typescript.ex new file mode 100644 index 0000000..b8a422a --- /dev/null +++ b/lib/codeqa/languages/code/web/typescript.ex @@ -0,0 +1,60 @@ +defmodule CodeQA.Languages.Code.Web.TypeScript do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "typescript" + + @impl true + def extensions, do: ~w[ts tsx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while function class return var let const import export from + new this typeof instanceof try catch finally throw switch case break + continue default delete in of async await yield true false null undefined + type interface enum namespace declare abstract override readonly implements + extends satisfies as keyof typeof infer never unknown any void + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % ** << >> >>> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => < + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function class async interface enum namespace type declare] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: ~w[export public private protected static abstract override readonly sealed] + + @impl true + def function_keywords, do: ~w[function async] + + @impl true + def module_keywords, do: ~w[class interface enum namespace] + + @impl true + def import_keywords, do: ~w[import] + + @impl true + def test_keywords, do: ~w[test it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/config/dockerfile.ex b/lib/codeqa/languages/config/dockerfile.ex new file mode 100644 index 0000000..e1ed3a6 --- /dev/null +++ b/lib/codeqa/languages/config/dockerfile.ex @@ -0,0 +1,35 @@ +defmodule CodeQA.Languages.Config.Dockerfile do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "dockerfile" + + @impl true + def extensions, do: ~w[Dockerfile] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + FROM RUN CMD LABEL EXPOSE ENV ADD COPY ENTRYPOINT VOLUME USER WORKDIR ARG + ONBUILD STOPSIGNAL HEALTHCHECK SHELL AS + ] + + @impl true + def operators, do: ~w[ + = \ + ] + + @impl true + def delimiters, do: ~w[ + ( ) , : # + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[FROM] +end diff --git a/lib/codeqa/languages/config/makefile.ex b/lib/codeqa/languages/config/makefile.ex new file mode 100644 index 0000000..ffb4522 --- /dev/null +++ b/lib/codeqa/languages/config/makefile.ex @@ -0,0 +1,32 @@ +defmodule CodeQA.Languages.Config.Makefile do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "makefile" + + @impl true + def extensions, do: ~w[Makefile GNUmakefile mk] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + 
defmodule CodeQA.Languages.Config.Terraform do
  @moduledoc false
  # Lexical vocabulary for Terraform/HCL sources (`.tf`, `.tfvars`).
  use CodeQA.Language

  # Vocabulary tables live in module attributes so every callback below is a
  # plain lookup.
  @reserved_words ~w(
    resource data variable output locals module provider terraform
    required_providers backend for_each count depends_on lifecycle
    source version true false null for if
  )
  @operator_tokens ["=", "==", "!=", "<=", ">=", "&&", "||", "!", "?", ":"]
  @delimiter_tokens ["{", "}", "(", ")", ",", ".", ":", "=", "\"", "#", "//", "[", "]"]

  @impl true
  def name, do: "terraform"

  @impl true
  def extensions, do: ["tf", "tfvars"]

  # Both `#` and `//` start a line comment; `/* ... */` is the block form.
  @impl true
  def comment_prefixes, do: ["#", "//"]

  @impl true
  def block_comments, do: [{"/*", "*/"}]

  @impl true
  def keywords, do: @reserved_words

  @impl true
  def operators, do: @operator_tokens

  @impl true
  def delimiters, do: @delimiter_tokens
end
defmodule CodeQA.Languages.Data.Json do
  @moduledoc false
  # Lexical vocabulary for JSON documents (`.json`, `.jsonc`).
  use CodeQA.Language

  @impl true
  def name, do: "json"

  @impl true
  def extensions, do: ["json", "jsonc"]

  # No comment syntax is declared for this language.
  @impl true
  def comment_prefixes, do: []

  @impl true
  def block_comments, do: []

  # The three literal names.
  @impl true
  def keywords, do: ["true", "false", "null"]

  # The key/value separator is the only operator.
  @impl true
  def operators, do: [":"]

  @impl true
  def delimiters, do: ["{", "}", ",", "\"", "'", "[", "]"]
end
defmodule CodeQA.Languages.Data.Toml do
  @moduledoc false
  # Lexical vocabulary for TOML files.
  use CodeQA.Language

  @impl true
  def name, do: "toml"

  @impl true
  def extensions, do: ["toml"]

  # `#` starts a line comment; no block comment form is declared.
  @impl true
  def comment_prefixes, do: ["#"]

  @impl true
  def block_comments, do: []

  @impl true
  def keywords, do: ["true", "false"]

  @impl true
  def operators, do: ["="]

  @impl true
  def delimiters, do: ["{", "}", ",", ".", ":", "#", "\"", "'", "[", "]"]
end
defmodule CodeQA.Language do
  @moduledoc false

  # Behaviour describing the lexical vocabulary of one language, plus helpers
  # for discovering implementations, resolving one by name or file path, and
  # stripping comments from source text.
  #
  # The first seven callbacks are required. The rest are optional and receive
  # default implementations from `use CodeQA.Language`.

  @callback name() :: String.t()
  @callback extensions() :: [String.t()]
  @callback comment_prefixes() :: [String.t()]
  @callback block_comments() :: [{String.t(), String.t()}]
  @callback keywords() :: [String.t()]
  @callback operators() :: [String.t()]
  @callback delimiters() :: [String.t()]

  @callback declaration_keywords() :: [String.t()]
  @callback branch_keywords() :: [String.t()]
  @callback block_end_tokens() :: [String.t()]
  @callback access_modifiers() :: [String.t()]
  @callback statement_keywords() :: [String.t()]

  @callback function_keywords() :: [String.t()]
  @callback module_keywords() :: [String.t()]
  @callback import_keywords() :: [String.t()]
  @callback test_keywords() :: [String.t()]
  @callback uses_colon_indent?() :: boolean()
  @callback divider_indicators() :: [String.t()]

  @optional_callbacks [
    declaration_keywords: 0,
    branch_keywords: 0,
    block_end_tokens: 0,
    access_modifiers: 0,
    statement_keywords: 0,
    function_keywords: 0,
    module_keywords: 0,
    import_keywords: 0,
    test_keywords: 0,
    uses_colon_indent?: 0,
    divider_indicators: 0
  ]

  # Declares the behaviour and injects overridable defaults for every
  # optional callback, so language modules only define what they need.
  defmacro __using__(_opts) do
    quote do
      @behaviour CodeQA.Language
      def declaration_keywords, do: []
      def branch_keywords, do: []
      def block_end_tokens, do: []
      def access_modifiers, do: []
      def statement_keywords, do: []
      def function_keywords, do: []
      def module_keywords, do: []
      def import_keywords, do: []
      def test_keywords, do: []
      def uses_colon_indent?, do: false
      def divider_indicators, do: ~w[-- - == === ~ * ** # // / =]

      defoverridable declaration_keywords: 0,
                     branch_keywords: 0,
                     block_end_tokens: 0,
                     access_modifiers: 0,
                     statement_keywords: 0,
                     function_keywords: 0,
                     module_keywords: 0,
                     import_keywords: 0,
                     test_keywords: 0,
                     uses_colon_indent?: 0,
                     divider_indicators: 0
    end
  end

  @doc """
  Returns every module of the `:codeqa` application that declares the
  `CodeQA.Language` behaviour.

  Returns `[]` when the application spec is not loaded, so `find/1` and
  `detect/1` fall back to `CodeQA.Languages.Unknown` instead of crashing.
  """
  @spec all() :: [module()]
  def all do
    case :application.get_key(:codeqa, :modules) do
      {:ok, modules} -> Enum.filter(modules, &implements?/1)
      :undefined -> []
    end
  end

  @doc "Returns the de-duplicated keywords of all known languages."
  @spec all_keywords() :: [String.t()]
  def all_keywords do
    all()
    |> Enum.flat_map(& &1.keywords())
    |> Enum.uniq()
  end

  # `find/1` always returns a module (it falls back to Unknown), so the three
  # name-based lookups below need no nil handling.

  @doc "Keyword set of the language called `language` (Unknown's set if no match)."
  @spec keywords(atom() | String.t()) :: MapSet.t()
  def keywords(language), do: MapSet.new(find(language).keywords())

  @doc "Operator set of the language called `language` (Unknown's set if no match)."
  @spec operators(atom() | String.t()) :: MapSet.t()
  def operators(language), do: MapSet.new(find(language).operators())

  @doc "Delimiter set of the language called `language` (Unknown's set if no match)."
  @spec delimiters(atom() | String.t()) :: MapSet.t()
  def delimiters(language), do: MapSet.new(find(language).delimiters())

  # The accessors below take a language *module* (not a name) and wrap the
  # corresponding callback's list in a MapSet.

  @spec declaration_keywords(module()) :: MapSet.t()
  def declaration_keywords(mod), do: MapSet.new(mod.declaration_keywords())

  @spec branch_keywords(module()) :: MapSet.t()
  def branch_keywords(mod), do: MapSet.new(mod.branch_keywords())

  @spec block_end_tokens(module()) :: MapSet.t()
  def block_end_tokens(mod), do: MapSet.new(mod.block_end_tokens())

  @spec access_modifiers(module()) :: MapSet.t()
  def access_modifiers(mod), do: MapSet.new(mod.access_modifiers())

  @spec statement_keywords(module()) :: MapSet.t()
  def statement_keywords(mod), do: MapSet.new(mod.statement_keywords())

  @spec function_keywords(module()) :: MapSet.t()
  def function_keywords(mod), do: MapSet.new(mod.function_keywords())

  @spec module_keywords(module()) :: MapSet.t()
  def module_keywords(mod), do: MapSet.new(mod.module_keywords())

  @spec import_keywords(module()) :: MapSet.t()
  def import_keywords(mod), do: MapSet.new(mod.import_keywords())

  @spec test_keywords(module()) :: MapSet.t()
  def test_keywords(mod), do: MapSet.new(mod.test_keywords())

  @spec divider_indicators(module()) :: MapSet.t()
  def divider_indicators(mod), do: MapSet.new(mod.divider_indicators())

  @doc """
  Resolves a language module by its `name/0`, given as an atom or a string.

  Falls back to `CodeQA.Languages.Unknown` when no language matches.
  """
  @spec find(atom() | String.t()) :: module()
  def find(language) do
    name = to_string(language)
    Enum.find(all(), fn mod -> mod.name() == name end) || CodeQA.Languages.Unknown
  end

  @doc """
  Resolves a language module from a file path.

  The extension (without the leading dot) is matched against `extensions/0`.
  Paths without an extension are matched by basename instead, which is how
  files such as `Dockerfile` or `Makefile` are recognised. Falls back to
  `CodeQA.Languages.Unknown`.
  """
  @spec detect(String.t()) :: module()
  def detect(path) do
    basename = Path.basename(path)
    ext = path |> Path.extname() |> String.trim_leading(".")

    Enum.find(all(), fn mod ->
      ext in mod.extensions() or (ext == "" and basename in mod.extensions())
    end) || CodeQA.Languages.Unknown
  end

  @doc """
  Removes block comments, then line comments, as declared by `language_mod`.

  Newlines inside block comments are kept, so line numbers of the remaining
  code do not shift. Matching is purely textual: a comment marker inside a
  string literal is treated as a comment too.
  """
  @spec strip_comments(String.t(), module()) :: String.t()
  def strip_comments(content, language_mod) do
    content
    |> strip_block_comments(language_mod.block_comments())
    |> strip_line_comments(language_mod.comment_prefixes())
  end

  defp strip_block_comments(content, []), do: content

  defp strip_block_comments(content, pairs) do
    Enum.reduce(pairs, content, fn {open, close}, acc ->
      # Non-greedy and dot-all: match the shortest open..close span, across lines.
      regex = Regex.compile!(Regex.escape(open) <> ".*?" <> Regex.escape(close), [:dotall])

      # Blank the comment but keep its newlines.
      Regex.replace(regex, acc, fn match ->
        String.replace(match, ~r/[^\n]/, "")
      end)
    end)
  end

  defp strip_line_comments(content, []), do: content

  defp strip_line_comments(content, prefixes) do
    pattern = Enum.map_join(prefixes, "|", &Regex.escape/1)
    # Multiline mode makes `$` match at every line end, so each comment is cut
    # from its prefix to the end of its own line.
    Regex.replace(Regex.compile!("(#{pattern}).*$", [:multiline]), content, "")
  end

  # A module may carry several `behaviour` attribute entries (one per
  # declaration). Collect them all rather than reading only the first, so a
  # module that declares another behaviour before this one is still found.
  defp implements?(module) do
    Code.ensure_loaded?(module) and
      CodeQA.Language in declared_behaviours(module)
  end

  defp declared_behaviours(module) do
    :attributes
    |> module.module_info()
    |> Keyword.get_values(:behaviour)
    |> List.flatten()
  end
end
defmodule CodeQA.Languages.Markup.Html do
  @moduledoc false
  # Lexical vocabulary for HTML and the HTML-based template formats listed in
  # extensions/0.
  use CodeQA.Language

  @impl true
  def name, do: "html"

  @impl true
  def extensions, do: ~w[html htm heex eex leex erb htmlbars hbs mustache jinja jinja2 njk liquid]

  # No line-comment syntax is declared; only the block form below.
  @impl true
  def comment_prefixes, do: []

  # Must be {open, close} pairs: CodeQA.Language.strip_comments/2 destructures
  # each entry as a 2-tuple when building its regex.
  @impl true
  def block_comments, do: [{"<!--", "-->"}]

  # Common tag names followed by common attribute names.
  @impl true
  def keywords, do: ~w[
    html head body div span p a img input form button select option textarea
    script style link meta title h1 h2 h3 h4 h5 h6 ul ol li table tr td th
    header footer nav main section article aside figure figcaption
    class id href src type name value rel action method placeholder
  ]

  @impl true
  def operators, do: ~w[
    < > / = &
  ]

  @impl true
  def delimiters, do: ~w[
    ( ) { } , . : ; " ' # ! ?
  ] ++ ~w( [ ] )
end
defmodule CodeQA.Languages.Markup.Xml do
  @moduledoc false
  # Lexical vocabulary for XML and the XML-based formats listed in extensions/0.
  use CodeQA.Language

  @impl true
  def name, do: "xml"

  @impl true
  def extensions, do: ~w[xml svg xsl xslt xsd wsdl plist]

  # No line-comment syntax is declared; only the block form below.
  @impl true
  def comment_prefixes, do: []

  # Must be {open, close} pairs: CodeQA.Language.strip_comments/2 destructures
  # each entry as a 2-tuple when building its regex.
  @impl true
  def block_comments, do: [{"<!--", "-->"}]

  @impl true
  def keywords, do: ~w[
    xmlns version encoding standalone
  ]

  @impl true
  def operators, do: ~w[
    < > / = &
  ]

  @impl true
  def delimiters, do: ~w[
    ( ) , . : ; " ' # ! ?
  ] ++ ~w( [ ] )
end
defmodule CodeQA.Metrics.Codebase.NearDuplicateBlocksCodebase do
  @moduledoc """
  Codebase-level count of near-duplicate and exact-duplicate natural code blocks.

  Blocks are collected per file, pooled into one list, and compared across all
  files. Pair source lists are included in the result, capped by
  `max_pairs_per_bucket`.

  Configure in .codeqa.yml:
    near_duplicate_blocks:
      max_pairs_per_bucket: 50
  """

  @behaviour CodeQA.Metrics.Codebase.CodebaseMetric

  alias CodeQA.Analysis.FileContextServer
  alias CodeQA.Metrics.File.NearDuplicateBlocks

  # Per-file counters that are removed from the codebase-level result.
  @dropped_keys ["block_count", "sub_block_count"]

  @impl true
  def name, do: "near_duplicate_blocks_codebase"

  @impl true
  def analyze(files, opts \\ []) do
    user_config = Keyword.get(opts, :near_duplicate_blocks, [])

    # Pairs are always requested here; the cap and worker count come from opts,
    # and an :on_progress entry is forwarded when the caller supplied one.
    block_opts =
      [
        include_pairs: true,
        max_pairs_per_bucket: Keyword.get(user_config, :max_pairs_per_bucket),
        workers: Keyword.get(opts, :workers, System.schedulers_online())
      ] ++ Keyword.take(opts, [:on_progress])

    # Required option: raises KeyError when the caller did not pass it.
    context_pid = Keyword.fetch!(opts, :file_context_pid)

    pooled_blocks =
      Enum.flat_map(files, fn {path, content} ->
        file_ctx = FileContextServer.get(context_pid, content, path: path)
        NearDuplicateBlocks.label_blocks(file_ctx.blocks, path)
      end)

    pooled_blocks
    |> NearDuplicateBlocks.analyze_from_blocks(block_opts)
    |> Map.drop(@dropped_keys)
  end
end
910e631..e20e556 100644 --- a/lib/codeqa/metrics/similarity.ex +++ b/lib/codeqa/metrics/codebase/similarity.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Similarity do +defmodule CodeQA.Metrics.Codebase.Similarity do @moduledoc """ Detects cross-file code duplication at the codebase level. @@ -9,23 +9,19 @@ defmodule CodeQA.Metrics.Similarity do See [winnowing](https://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf), [locality-sensitive hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing), and [normalized compression distance](https://en.wikipedia.org/wiki/Normalized_compression_distance). - - ## Options - - - `:show_ncd` — boolean, whether to compute per-pair NCD scores (default: `false`) - - `:ncd_paths` — list of file paths to compute similarity for (default: all files) - - `:ncd_top` — integer, max similar files to return per path (default: all) - - `:ncd_threshold` — minimum Jaccard similarity to consider as candidate (default: `0.20`) - - `:workers` — number of parallel workers (default: `System.schedulers_online/0`) - - `:on_progress` — include this key (any value) to enable progress output to stderr - - `:fp_stopwords` — `MapSet` of fingerprint hashes to exclude (default: empty) """ - @behaviour CodeQA.Metrics.CodebaseMetric + @behaviour CodeQA.Metrics.Codebase.CodebaseMetric + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.CLI.UI + alias CodeQA.Metrics.File.Winnowing @impl true def name, do: "similarity" + def keys, do: ["ncd_pairs", "cross_file_density"] + @spec analyze(map(), keyword()) :: map() @impl true def analyze(files, opts \\ []) @@ -37,6 +33,9 @@ defmodule CodeQA.Metrics.Similarity do def analyze(files, opts) do names = Map.keys(files) contents = Map.values(files) + has_progress = Keyword.has_key?(opts, :on_progress) + + if has_progress, do: IO.puts(:stderr, " Computing cross-file density...") result = %{ "cross_file_density" => cross_file_density(contents) @@ -80,38 +79,45 @@ defmodule 
CodeQA.Metrics.Similarity do if has_progress, do: IO.puts(:stderr, " 2/5 Computing Winnowing fingerprints...") result = - CodeQA.Telemetry.time(:ncd_fingerprinting, fn -> - contents - |> Enum.with_index() - |> Task.async_stream( - fn {content, i} -> - fp = compute_fingerprints(content, opts) - {i, fp} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.map(fn {:ok, {i, fp}} -> - print_bar_progress(has_progress, i, length(contents), "Fingerprinting") + contents + |> Enum.with_index() + |> Task.async_stream( + fn {content, i} -> + fp = compute_fingerprints(content, opts) {i, fp} - end) - |> Map.new() + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.map(fn {:ok, {i, fp}} -> + maybe_print_fingerprint_progress(has_progress, i, length(contents)) + {i, fp} end) + |> Map.new() if has_progress, do: IO.puts(:stderr, "") result end + defp maybe_print_fingerprint_progress(false, _i, _total), do: :ok + + defp maybe_print_fingerprint_progress(true, i, total) do + if rem(i + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(i + 1, total, label: "Fingerprinting")) + end + end + defp build_inverted_index(fingerprints_by_id, has_progress) do if has_progress, do: IO.puts(:stderr, " 3/5 Building inverted index...") total = map_size(fingerprints_by_id) result = - CodeQA.Telemetry.time(:ncd_build_index, fn -> - fingerprints_by_id - |> Enum.with_index() - |> Enum.reduce(%{}, fn {{i, set}, idx}, acc -> - print_bar_progress(has_progress, idx, total, "Indexing") - index_fingerprint_set(set, i, acc) - end) + fingerprints_by_id + |> Enum.with_index() + |> Enum.reduce(%{}, fn {{i, set}, idx}, acc -> + maybe_print_index_progress(has_progress, idx, total) + index_fingerprint_set(set, i, acc) end) if has_progress, do: IO.puts(:stderr, "") @@ -124,6 +130,14 @@ defmodule CodeQA.Metrics.Similarity do end) end + defp maybe_print_index_progress(false, _idx, _total), do: :ok + + defp maybe_print_index_progress(true, idx, total) do + if 
rem(idx + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(idx + 1, total, label: "Indexing")) + end + end + defp find_candidate_pairs( fingerprints_by_id, inverted_index, @@ -136,37 +150,37 @@ defmodule CodeQA.Metrics.Similarity do if has_progress, do: IO.puts(:stderr, " 4/5 Identifying candidate pairs...") total = map_size(fingerprints_by_id) - names_tuple = List.to_tuple(names) candidates = - CodeQA.Telemetry.time(:ncd_lsh_filter, fn -> - fingerprints_by_id - |> Enum.with_index() - |> Task.async_stream( - fn {{i, set}, idx} -> - valid_pairs = - collect_valid_pairs( - i, - set, - inverted_index, - fingerprints_by_id, - names_tuple, - target_set, - threshold - ) - - {idx, valid_pairs} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.reduce(%{}, fn {:ok, {idx, valid_pairs}}, acc -> - print_bar_progress(has_progress, idx, total, "LSH Filter") - merge_valid_pairs(valid_pairs, acc) - end) + fingerprints_by_id + |> Enum.with_index() + |> Task.async_stream( + fn {{i, set}, idx} -> + valid_pairs = + collect_valid_pairs( + i, + set, + inverted_index, + fingerprints_by_id, + names, + target_set, + threshold + ) + + {idx, valid_pairs} + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.reduce(%{}, fn {:ok, {idx, valid_pairs}}, acc -> + maybe_print_lsh_progress(has_progress, idx, total) + merge_valid_pairs(valid_pairs, acc) end) if has_progress, do: IO.puts(:stderr, "") Enum.map(candidates, fn {{i, j}, jaccard} -> - {elem(names_tuple, i), i, elem(names_tuple, j), j, jaccard} + {Enum.at(names, i), i, Enum.at(names, j), j, jaccard} end) end @@ -175,19 +189,19 @@ defmodule CodeQA.Metrics.Similarity do set, inverted_index, fingerprints_by_id, - names_tuple, + names, target_set, threshold ) do collisions = count_collisions(set, inverted_index, i) size_a = MapSet.size(set) - name_a = elem(names_tuple, i) + name_a = Enum.at(names, i) is_target_a = MapSet.member?(target_set, name_a) collisions - |> Enum.filter(fn {j, _} 
-> is_target_a or MapSet.member?(target_set, elem(names_tuple, j)) end) + |> Enum.filter(fn {j, _} -> is_target_a or MapSet.member?(target_set, Enum.at(names, j)) end) |> Enum.reduce([], fn {j, intersection}, acc_pairs -> jaccard = compute_jaccard(size_a, MapSet.size(Map.get(fingerprints_by_id, j)), intersection) if jaccard >= threshold, do: [{{i, j}, jaccard} | acc_pairs], else: acc_pairs @@ -217,11 +231,11 @@ defmodule CodeQA.Metrics.Similarity do end) end - defp print_bar_progress(false, _current, _total, _label), do: :ok + defp maybe_print_lsh_progress(false, _idx, _total), do: :ok - defp print_bar_progress(true, current, total, label) do - if rem(current + 1, max(1, div(total, 20))) == 0 do - IO.write(:stderr, "\r" <> CodeQA.CLI.UI.progress_bar(current + 1, total, label: label)) + defp maybe_print_lsh_progress(true, idx, total) do + if rem(idx + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(idx + 1, total, label: "LSH Filter")) end end @@ -240,26 +254,25 @@ defmodule CodeQA.Metrics.Similarity do counter = :counters.new(1, [:atomics]) start_time_ncd = System.monotonic_time(:millisecond) - CodeQA.Telemetry.time(:ncd_exact_compression_phase, fn -> - filtered_pairs - |> Task.async_stream( - fn {name_a, i, name_b, j, _jaccard} -> - ncd = compute_single_ncd(precomputed, i, j) - maybe_print_ncd_progress(has_progress, counter, total_pairs, start_time_ncd) - {name_a, name_b, ncd} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.map(fn {:ok, res} -> res end) - end) + filtered_pairs + |> Task.async_stream( + fn {name_a, i, name_b, j, _jaccard} -> + ncd = compute_single_ncd(precomputed, i, j) + maybe_print_ncd_progress(has_progress, counter, total_pairs, start_time_ncd) + {name_a, name_b, ncd} + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.map(fn {:ok, res} -> res end) end defp compute_single_ncd(precomputed, i, j) do - CodeQA.Telemetry.time(:ncd_single_compression, fn -> - {a, ca} = elem(precomputed, 
i) - {b, cb} = elem(precomputed, j) - cab = byte_size(:zlib.compress([a, b])) - ncd = if max(ca, cb) > 0, do: (cab - min(ca, cb)) / max(ca, cb), else: 0.0 - Float.round(ncd, 4) - end) + {a, ca} = elem(precomputed, i) + {b, cb} = elem(precomputed, j) + cab = byte_size(:zlib.compress([a, b])) + ncd = if max(ca, cb) > 0, do: (cab - min(ca, cb)) / max(ca, cb), else: 0.0 + Float.round(ncd, 4) end defp maybe_print_ncd_progress(false, _counter, _total_pairs, _start_time), do: :ok @@ -275,8 +288,8 @@ defmodule CodeQA.Metrics.Similarity do eta_ms = round((total_pairs - c) * avg_time) output = - CodeQA.CLI.UI.progress_bar(c, total_pairs, - eta: CodeQA.CLI.UI.format_eta(eta_ms), + UI.progress_bar(c, total_pairs, + eta: UI.format_eta(eta_ms), label: "NCD Compression" ) @@ -316,13 +329,11 @@ defmodule CodeQA.Metrics.Similarity do end end - defp compute_fingerprints(content, opts) do - fp_stopwords = Keyword.get(opts, :fp_stopwords, MapSet.new()) - + defp compute_fingerprints(content, _opts) do content - |> CodeQA.Metrics.TokenNormalizer.normalize() - |> CodeQA.Metrics.Winnowing.kgrams(5) - |> Enum.reject(&MapSet.member?(fp_stopwords, &1)) + |> TokenNormalizer.normalize_structural() + |> Enum.map(& &1.kind) + |> Winnowing.kgrams(5) |> MapSet.new() end diff --git a/lib/codeqa/metrics/codebase_metric.ex b/lib/codeqa/metrics/codebase_metric.ex deleted file mode 100644 index 0b1284d..0000000 --- a/lib/codeqa/metrics/codebase_metric.ex +++ /dev/null @@ -1,42 +0,0 @@ -defmodule CodeQA.Metrics.CodebaseMetric do - @moduledoc """ - Behaviour for metrics that operate across an entire codebase. - - Unlike `FileMetric`, which analyzes a single file, codebase metrics receive - a map of all source files and can compute cross-file properties such as - duplication or structural similarity. 
defmodule CodeQA.Metrics.File.Bradford do
  @moduledoc """
  Applies Bradford's concentration law to token density across lines.

  Lines are ranked by token count, densest first, and walked in that order to
  form three zones. Each of the first two zones takes lines until it has
  collected one third of all tokens; the third zone is whatever remains. The
  reported values compare the number of lines in consecutive zones:

      k1      = zone2_lines / zone1_lines   (core → middle)
      k2      = zone3_lines / zone2_lines   (middle → tail)
      k_ratio = k2 / k1

  Reading the values:

      k ≈ 1    uniform density — tokens spread evenly across lines
      k = 3–5  Bradford-like — a small dense core, long sparse tail
      k >> 5   extreme concentration — a few lines carry almost all tokens

  A perfect Bradford distribution has k1 ≈ k2, so k_ratio ≈ 1. A k_ratio above
  1 means the tail is stretched more than the core is concentrated; below 1
  means the opposite.

  All three values are 0.0 when there are fewer than 3 tokens or fewer than 3
  distinct lines. A ratio is also 0.0 when its denominator is 0.

  See [Bradford's law](https://en.wikipedia.org/wiki/Bradford%27s_law).
  """

  @behaviour CodeQA.Metrics.File.FileMetric

  # Returned whenever the input is too small to split into three zones.
  @no_zones %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0}

  @impl true
  def name, do: "bradford"

  @impl true
  def keys, do: ["k1", "k2", "k_ratio"]

  @spec analyze(map()) :: map()
  @impl true
  def analyze(%{tokens: []}), do: @no_zones

  def analyze(%{tokens: tokens}) do
    # Tokens per line (via each token's `.line`), densest line first.
    line_yields =
      tokens
      |> Enum.frequencies_by(& &1.line)
      |> Map.values()
      |> Enum.sort(:desc)

    line_total = length(line_yields)
    token_total = Enum.sum(line_yields)

    if token_total >= 3 and line_total >= 3 do
      zone_ratios(line_yields, line_total, token_total / 3)
    else
      @no_zones
    end
  end

  # Splits the ranked lines into core / middle / tail and derives the ratios.
  # Each zone starts counting from zero: tokens by which the core overshoots
  # its target are not credited to the middle zone.
  defp zone_ratios(line_yields, line_total, zone_target) do
    {core_lines, after_core} = take_zone(line_yields, zone_target)
    {middle_lines, _after_middle} = take_zone(after_core, zone_target)
    tail_lines = line_total - core_lines - middle_lines

    k1 = ratio(middle_lines, core_lines)
    # k2 has no lower bound of 1: it is 0.0 when the middle zone uses up every
    # remaining line and the tail is empty.
    k2 = ratio(tail_lines, middle_lines)

    %{"k1" => k1, "k2" => k2, "k_ratio" => ratio(k2, k1)}
  end

  # Quotient rounded to 4 decimals; 0.0 when the denominator is not positive.
  defp ratio(numerator, denominator) when denominator > 0,
    do: Float.round(numerator / denominator, 4)

  defp ratio(_numerator, _denominator), do: 0.0

  # Consumes lines from the front of the ranked list until their running token
  # total reaches `target`. Returns {lines_consumed, remaining_lines}; when the
  # list runs out first, every line is consumed and the remainder is [].
  defp take_zone(line_yields, target) do
    running_totals = Enum.scan(line_yields, &+/2)

    case Enum.find_index(running_totals, &(&1 >= target)) do
      nil -> {length(line_yields), []}
      idx -> {idx + 1, Enum.drop(line_yields, idx + 1)}
    end
  end
end
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric # Python: if elif else for while try except finally with match case # Ruby: if elsif else unless for while until case when begin rescue ensure @@ -43,18 +43,14 @@ defmodule CodeQA.Metrics.Branching do @impl true def name, do: "branching" - @spec analyze(map()) :: map() @impl true - def analyze(%{lines: lines, tokens: tokens}) do - non_blank_count = - lines - |> Tuple.to_list() - |> Enum.count(&(String.trim(&1) != "")) + def keys, do: ["branching_density", "branch_count", "non_blank_count", "max_nesting_depth"] - branch_count = - tokens - |> Tuple.to_list() - |> Enum.count(&MapSet.member?(@branching_keywords, &1)) + @spec analyze(CodeQA.Engine.FileContext.t()) :: map() + @impl true + def analyze(%{lines: lines, tokens: tokens, content: content}) do + non_blank_count = Enum.count(lines, &(String.trim(&1) != "")) + branch_count = Enum.count(tokens, &MapSet.member?(@branching_keywords, &1.content)) density = if non_blank_count > 0, @@ -64,7 +60,19 @@ defmodule CodeQA.Metrics.Branching do %{ "branching_density" => density, "branch_count" => branch_count, - "non_blank_count" => non_blank_count + "non_blank_count" => non_blank_count, + "max_nesting_depth" => max_nesting_depth(content) } end + + defp max_nesting_depth(content) do + content + |> String.graphemes() + |> Enum.reduce({0, 0}, fn + c, {depth, max} when c in ["(", "[", "{"] -> {depth + 1, max(depth + 1, max)} + c, {depth, max} when c in [")", "]", "}"] -> {max(depth - 1, 0), max} + _, acc -> acc + end) + |> elem(1) + end end diff --git a/lib/codeqa/metrics/file/brevity.ex b/lib/codeqa/metrics/file/brevity.ex new file mode 100644 index 0000000..bc0d9a6 --- /dev/null +++ b/lib/codeqa/metrics/file/brevity.ex @@ -0,0 +1,50 @@ +defmodule CodeQA.Metrics.File.Brevity do + @moduledoc """ + Measures how well Brevity law holds in the token distribution. + + Computes the Pearson correlation between token length and token frequency. 
+ A negative value indicates shorter tokens appear more often (law holds). + A positive value indicates longer tokens appear more often (law violated). + Also fits a log-log regression to capture the power-law slope. + + See [Brevity law](https://en.wikipedia.org/wiki/Brevity_law). + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "brevity" + + @impl true + def keys, do: ["correlation", "slope", "sample_size"] + + @spec analyze(map()) :: map() + @impl true + def analyze(%{token_counts: token_counts}) when map_size(token_counts) < 3 do + %{"correlation" => 0.0, "slope" => 0.0, "sample_size" => map_size(token_counts)} + end + + def analyze(%{token_counts: token_counts}) do + pairs = Enum.map(token_counts, fn {token, freq} -> {String.length(token), freq} end) + lengths = Enum.map(pairs, &elem(&1, 0)) + freqs = Enum.map(pairs, &elem(&1, 1)) + + %{ + "correlation" => CodeQA.Math.pearson_correlation_list(lengths, freqs), + "slope" => log_log_slope(lengths, freqs), + "sample_size" => map_size(token_counts) + } + end + + defp log_log_slope(lengths, freqs) do + log_lengths = lengths |> Enum.map(&:math.log(max(&1, 1))) |> Nx.tensor(type: :f64) + log_freqs = freqs |> Enum.map(&:math.log(max(&1, 1))) |> Nx.tensor(type: :f64) + + {slope, _intercept, _r_squared} = CodeQA.Math.linear_regression(log_lengths, log_freqs) + + case Nx.to_number(slope) do + val when is_float(val) -> Float.round(val, 4) + _ -> 0.0 + end + end +end diff --git a/lib/codeqa/metrics/casing_entropy.ex b/lib/codeqa/metrics/file/casing_entropy.ex similarity index 60% rename from lib/codeqa/metrics/casing_entropy.ex rename to lib/codeqa/metrics/file/casing_entropy.ex index cb38011..4256e0e 100644 --- a/lib/codeqa/metrics/casing_entropy.ex +++ b/lib/codeqa/metrics/file/casing_entropy.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.CasingEntropy do +defmodule CodeQA.Metrics.File.CasingEntropy do @moduledoc """ Measures Shannon entropy of identifier casing styles in a file. 
@@ -12,31 +12,45 @@ defmodule CodeQA.Metrics.CasingEntropy do - `"pascal_case_count"`, `"camel_case_count"`, `"snake_case_count"`, `"macro_case_count"`, `"kebab_case_count"`, `"other_count"` — per-style counts (only keys for styles that appear are included) + - `"screaming_snake_density"` — ratio of MACRO_CASE identifiers to total identifiers See [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) and [naming conventions](https://en.wikipedia.org/wiki/Naming_convention_(programming)). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric + + alias CodeQA.Metrics.File.Inflector @impl true def name, do: "casing_entropy" + @impl true + def keys, + do: [ + "entropy", + "pascal_case_count", + "camel_case_count", + "snake_case_count", + "macro_case_count", + "kebab_case_count", + "other_count", + "screaming_snake_density" + ] + @spec analyze(map()) :: map() @impl true - def analyze(%{identifiers: identifiers}) when tuple_size(identifiers) == 0 do - %{"entropy" => 0.0} + def analyze(%{identifiers: []}) do + %{"entropy" => 0.0, "screaming_snake_density" => 0.0} end def analyze(%{identifiers: identifiers}) do - identifiers_list = Tuple.to_list(identifiers) - counts = - identifiers_list - |> Enum.map(&CodeQA.Metrics.Inflector.detect_casing/1) + identifiers + |> Enum.map(&Inflector.detect_casing/1) |> Enum.frequencies() - total = length(identifiers_list) + total = length(identifiers) entropy = counts @@ -46,7 +60,10 @@ defmodule CodeQA.Metrics.CasingEntropy do acc - p * :math.log2(p) end) - %{"entropy" => Float.round(entropy, 4)} + macro_count = Map.get(counts, :macro_case, 0) + screaming_density = Float.round(macro_count / total, 4) + + %{"entropy" => Float.round(entropy, 4), "screaming_snake_density" => screaming_density} |> Map.merge(counts_to_output(counts)) end diff --git a/lib/codeqa/metrics/file/comment_structure.ex b/lib/codeqa/metrics/file/comment_structure.ex new file mode 100644 index 0000000..65bc0e0 --- 
/dev/null +++ b/lib/codeqa/metrics/file/comment_structure.ex @@ -0,0 +1,45 @@ +defmodule CodeQA.Metrics.File.CommentStructure do + @moduledoc """ + Measures comment density and annotation patterns. + + Counts lines that begin with a comment marker (language-agnostic: `#`, `//`, + `/*`, ` *`) relative to non-blank lines. Also counts TODO/FIXME/HACK/XXX + markers which indicate deferred work or known issues. + + ## Output keys + + - `"comment_line_ratio"` — comment lines / non-blank lines + - `"comment_line_count"` — raw count of comment lines + - `"todo_fixme_count"` — occurrences of TODO, FIXME, HACK, or XXX + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "comment_structure" + + @impl true + def keys, do: ["comment_line_ratio", "comment_line_count", "todo_fixme_count"] + + @comment_line ~r/^\s*(?:#|\/\/|\/\*|\*)/ + @todo_marker ~r/\b(?:TODO|FIXME|HACK|XXX)\b/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, lines: lines}) do + non_blank = Enum.reject(lines, &(String.trim(&1) == "")) + non_blank_count = length(non_blank) + + comment_count = Enum.count(non_blank, &Regex.match?(@comment_line, &1)) + todo_count = @todo_marker |> Regex.scan(content) |> length() + + comment_ratio = + if non_blank_count > 0, do: Float.round(comment_count / non_blank_count, 4), else: 0.0 + + %{ + "comment_line_ratio" => comment_ratio, + "comment_line_count" => comment_count, + "todo_fixme_count" => todo_count + } + end +end diff --git a/lib/codeqa/metrics/compression.ex b/lib/codeqa/metrics/file/compression.ex similarity index 56% rename from lib/codeqa/metrics/compression.ex rename to lib/codeqa/metrics/file/compression.ex index fe68705..9f0981b 100644 --- a/lib/codeqa/metrics/compression.ex +++ b/lib/codeqa/metrics/file/compression.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Compression do +defmodule CodeQA.Metrics.File.Compression do @moduledoc """ Measures file redundancy via zlib compression ratio. 
@@ -6,34 +6,49 @@ defmodule CodeQA.Metrics.Compression do original. A high compression ratio signals repetitive or boilerplate-heavy code. - `ctx.encoded` is the binary representation of the file content used for - compression, distinct from `ctx.content` which is the UTF-8 string. - See [Kolmogorov complexity](https://en.wikipedia.org/wiki/Kolmogorov_complexity) and [data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "compression" + @impl true + def keys, do: ["raw_bytes", "zlib_bytes", "zlib_ratio", "redundancy", "unique_line_ratio"] + @spec analyze(map()) :: map() @impl true def analyze(%{content: "", byte_count: 0}) do - %{"raw_bytes" => 0, "zlib_bytes" => 0, "zlib_ratio" => 0.0, "redundancy" => 0.0} + %{ + "raw_bytes" => 0, + "zlib_bytes" => 0, + "zlib_ratio" => 0.0, + "redundancy" => 0.0, + "unique_line_ratio" => 0.0 + } end def analyze(ctx) do raw_size = ctx.byte_count - zlib_data = :zlib.compress(ctx.encoded) + zlib_data = :zlib.compress(ctx.content) zlib_size = byte_size(zlib_data) + non_blank = ctx.lines |> Enum.reject(&(String.trim(&1) == "")) + + unique_line_ratio = + case length(non_blank) do + 0 -> 0.0 + n -> Float.round(length(Enum.uniq(non_blank)) / n, 4) + end + %{ "raw_bytes" => raw_size, "zlib_bytes" => zlib_size, "zlib_ratio" => Float.round(raw_size / max(1, zlib_size), 4), - "redundancy" => Float.round(max(0.0, 1.0 - zlib_size / raw_size), 4) + "redundancy" => Float.round(max(0.0, 1.0 - zlib_size / raw_size), 4), + "unique_line_ratio" => unique_line_ratio } end end diff --git a/lib/codeqa/metrics/entropy.ex b/lib/codeqa/metrics/file/entropy.ex similarity index 82% rename from lib/codeqa/metrics/entropy.ex rename to lib/codeqa/metrics/file/entropy.ex index 4756471..6533a21 100644 --- a/lib/codeqa/metrics/entropy.ex +++ b/lib/codeqa/metrics/file/entropy.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Entropy do 
+defmodule CodeQA.Metrics.File.Entropy do @moduledoc """ Computes Shannon entropy at both character and token levels. @@ -11,11 +11,24 @@ defmodule CodeQA.Metrics.Entropy do See [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "entropy" + @impl true + def keys, + do: [ + "char_entropy", + "char_max_entropy", + "char_normalized", + "token_entropy", + "token_max_entropy", + "token_normalized", + "vocab_size", + "total_tokens" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do @@ -30,13 +43,12 @@ defmodule CodeQA.Metrics.Entropy do compute_entropy(counts, total, "char") end - defp token_entropy(%{tokens: tokens, token_counts: _token_counts}) - when tuple_size(tokens) == 0 do + defp token_entropy(%{tokens: [], token_counts: _token_counts}) do Map.merge(zero_entropy_map("token"), %{"vocab_size" => 0, "total_tokens" => 0}) end defp token_entropy(%{tokens: tokens, token_counts: token_counts}) do - total = tuple_size(tokens) + total = length(tokens) vocab_size = map_size(token_counts) entropy_map = compute_entropy(token_counts, total, "token") diff --git a/lib/codeqa/metrics/file_metric.ex b/lib/codeqa/metrics/file/file_metric.ex similarity index 77% rename from lib/codeqa/metrics/file_metric.ex rename to lib/codeqa/metrics/file/file_metric.ex index 75a6f61..9488c7b 100644 --- a/lib/codeqa/metrics/file_metric.ex +++ b/lib/codeqa/metrics/file/file_metric.ex @@ -1,8 +1,8 @@ -defmodule CodeQA.Metrics.FileMetric do +defmodule CodeQA.Metrics.File.FileMetric do @moduledoc """ Behaviour for metrics that analyze a single source file. - Implementations receive a `CodeQA.Pipeline.FileContext` struct containing + Implementations receive a `CodeQA.Engine.FileContext` struct containing pre-parsed data (tokens, identifiers, lines, etc.) and return a map of metric key-value pairs. 
On error, return an empty map `%{}` rather than raising. @@ -25,7 +25,10 @@ defmodule CodeQA.Metrics.FileMetric do """ @callback name() :: String.t() - @callback analyze(CodeQA.Pipeline.FileContext.t()) :: map() + @callback analyze(CodeQA.Engine.FileContext.t()) :: map() + + @doc "List of metric keys returned by analyze/1." + @callback keys() :: [String.t()] @doc "Human-readable description of what this metric measures." @callback description() :: String.t() diff --git a/lib/codeqa/metrics/function_metrics.ex b/lib/codeqa/metrics/file/function_metrics.ex similarity index 92% rename from lib/codeqa/metrics/function_metrics.ex rename to lib/codeqa/metrics/file/function_metrics.ex index 7fd2262..6a9bb0c 100644 --- a/lib/codeqa/metrics/function_metrics.ex +++ b/lib/codeqa/metrics/file/function_metrics.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.FunctionMetrics do +defmodule CodeQA.Metrics.File.FunctionMetrics do @moduledoc """ Estimates function-level structure metrics from source text. @@ -13,7 +13,7 @@ defmodule CodeQA.Metrics.FunctionMetrics do - C#: lines starting with access modifiers (`public`, `private`, etc.) 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric # Python, Ruby, Elixir: `def` family # JavaScript: `function` @@ -39,14 +39,23 @@ defmodule CodeQA.Metrics.FunctionMetrics do @impl true def name, do: "function_metrics" + @impl true + def keys, + do: [ + "function_count", + "avg_function_lines", + "max_function_lines", + "avg_param_count", + "max_param_count" + ] + @spec analyze(map()) :: map() @impl true def analyze(%{lines: lines}) do - lines_list = Tuple.to_list(lines) - total = length(lines_list) + total = length(lines) {func_indices, param_counts} = - lines_list + lines |> Enum.with_index() |> Enum.filter(fn {line, _} -> Regex.match?(@func_keyword_re, line) or Regex.match?(@csharp_method_re, line) diff --git a/lib/codeqa/metrics/halstead.ex b/lib/codeqa/metrics/file/halstead.ex similarity index 92% rename from lib/codeqa/metrics/halstead.ex rename to lib/codeqa/metrics/file/halstead.ex index ca38665..157f67b 100644 --- a/lib/codeqa/metrics/halstead.ex +++ b/lib/codeqa/metrics/file/halstead.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Halstead do +defmodule CodeQA.Metrics.File.Halstead do @moduledoc """ Implements Halstead software-science complexity metrics. @@ -9,11 +9,27 @@ defmodule CodeQA.Metrics.Halstead do See [Halstead complexity measures](https://en.wikipedia.org/wiki/Halstead_complexity_measures). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "halstead" + @impl true + def keys, + do: [ + "n1_unique_operators", + "n2_unique_operands", + "N1_total_operators", + "N2_total_operands", + "vocabulary", + "length", + "volume", + "difficulty", + "effort", + "estimated_bugs", + "time_to_implement_seconds" + ] + # Keyword operators for: # Python, Ruby, JavaScript, Elixir, C#, # Java, C++, Go, Rust, PHP, Swift, Shell, Kotlin diff --git a/lib/codeqa/metrics/heaps.ex b/lib/codeqa/metrics/file/heaps.ex similarity index 84% rename from lib/codeqa/metrics/heaps.ex rename to lib/codeqa/metrics/file/heaps.ex index edc390b..b7cae9c 100644 --- a/lib/codeqa/metrics/heaps.ex +++ b/lib/codeqa/metrics/file/heaps.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Heaps do +defmodule CodeQA.Metrics.File.Heaps do @moduledoc """ Fits Heaps' law to vocabulary growth in a file. @@ -9,25 +9,27 @@ defmodule CodeQA.Metrics.Heaps do See [Heaps' law](https://en.wikipedia.org/wiki/Heaps%27_law). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "heaps" + @impl true + def keys, do: ["k", "beta", "r_squared"] + @max_samples 50 @spec analyze(map()) :: map() @impl true - def analyze(%{tokens: tokens}) when tuple_size(tokens) == 0 do + def analyze(%{tokens: []}) do %{"k" => 0.0, "beta" => 0.0, "r_squared" => 0.0} end def analyze(%{tokens: tokens}) do - token_list = Tuple.to_list(tokens) - total = length(token_list) + total = length(tokens) interval = max(1, div(total, @max_samples)) - data_points = sample_vocabulary_growth(token_list, interval) + data_points = sample_vocabulary_growth(tokens, interval) if length(data_points) < 5 do %{"k" => 0.0, "beta" => 0.0, "r_squared" => 0.0} @@ -40,7 +42,7 @@ defmodule CodeQA.Metrics.Heaps do tokens |> Enum.with_index(1) |> Enum.reduce({MapSet.new(), []}, fn {token, i}, {seen, points} -> - seen = MapSet.put(seen, token) + seen = MapSet.put(seen, token.content) if rem(i, interval) == 0 do {seen, [{i, MapSet.size(seen)} | points]} diff --git a/lib/codeqa/metrics/identifier_length_variance.ex b/lib/codeqa/metrics/file/identifier_length_variance.ex similarity index 81% rename from lib/codeqa/metrics/identifier_length_variance.ex rename to lib/codeqa/metrics/file/identifier_length_variance.ex index 2203b10..424b95b 100644 --- a/lib/codeqa/metrics/identifier_length_variance.ex +++ b/lib/codeqa/metrics/file/identifier_length_variance.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.IdentifierLengthVariance do +defmodule CodeQA.Metrics.File.IdentifierLengthVariance do @moduledoc """ Measures the mean, variance, and maximum length of identifiers. @@ -11,20 +11,22 @@ defmodule CodeQA.Metrics.IdentifierLengthVariance do and [variance](https://en.wikipedia.org/wiki/Variance). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "identifier_length_variance" + @impl true + def keys, do: ["mean", "variance", "std_dev", "max"] + @spec analyze(map()) :: map() @impl true - def analyze(%{identifiers: identifiers}) when tuple_size(identifiers) == 0 do + def analyze(%{identifiers: []}) do %{"mean" => 0.0, "variance" => 0.0, "std_dev" => 0.0, "max" => 0} end def analyze(%{identifiers: identifiers}) do - list = Tuple.to_list(identifiers) - lengths = Enum.map(list, &String.length/1) + lengths = Enum.map(identifiers, &String.length/1) n = length(lengths) mean = Enum.sum(lengths) / n diff --git a/lib/codeqa/metrics/indentation.ex b/lib/codeqa/metrics/file/indentation.ex similarity index 60% rename from lib/codeqa/metrics/indentation.ex rename to lib/codeqa/metrics/file/indentation.ex index ab44743..75923b9 100644 --- a/lib/codeqa/metrics/indentation.ex +++ b/lib/codeqa/metrics/file/indentation.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Indentation do +defmodule CodeQA.Metrics.File.Indentation do @moduledoc """ Analyzes indentation depth patterns across non-blank lines. @@ -10,20 +10,27 @@ defmodule CodeQA.Metrics.Indentation do See [indentation style](https://en.wikipedia.org/wiki/Indentation_style). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "indentation" + @impl true + def keys, do: ["mean_depth", "variance", "max_depth", "uses_tabs", "blank_line_ratio"] + @spec analyze(map()) :: map() @impl true def analyze(%{lines: lines}) do - lines_list = Tuple.to_list(lines) + uses_tabs = Enum.any?(lines, &String.match?(&1, ~r/^\t/)) + + total_lines = length(lines) + blank_count = Enum.count(lines, &(String.trim(&1) == "")) - uses_tabs = Enum.any?(lines_list, &String.match?(&1, ~r/^\t/)) + blank_line_ratio = + if total_lines > 0, do: Float.round(blank_count / total_lines, 4), else: 0.0 depths = - lines_list + lines |> Enum.reject(&(String.trim(&1) == "")) |> Enum.map(fn line -> [leading] = Regex.run(~r/^\s*/, line) @@ -31,7 +38,13 @@ defmodule CodeQA.Metrics.Indentation do end) if depths == [] do - %{"mean_depth" => 0.0, "max_depth" => 0, "variance" => 0.0, "uses_tabs" => uses_tabs} + %{ + "mean_depth" => 0.0, + "max_depth" => 0, + "variance" => 0.0, + "uses_tabs" => uses_tabs, + "blank_line_ratio" => blank_line_ratio + } else n = length(depths) mean = Enum.sum(depths) / n @@ -45,7 +58,8 @@ defmodule CodeQA.Metrics.Indentation do "mean_depth" => Float.round(mean, 4), "variance" => Float.round(variance, 4), "max_depth" => Enum.max(depths), - "uses_tabs" => uses_tabs + "uses_tabs" => uses_tabs, + "blank_line_ratio" => blank_line_ratio } end end diff --git a/lib/codeqa/metrics/inflector.ex b/lib/codeqa/metrics/file/inflector.ex similarity index 89% rename from lib/codeqa/metrics/inflector.ex rename to lib/codeqa/metrics/file/inflector.ex index 7c49531..04e732c 100644 --- a/lib/codeqa/metrics/inflector.ex +++ b/lib/codeqa/metrics/file/inflector.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Inflector do +defmodule CodeQA.Metrics.File.Inflector do @moduledoc """ Utility for detecting identifier casing styles. 
@@ -30,7 +30,8 @@ defmodule CodeQA.Metrics.Inflector do iex> CodeQA.Metrics.Inflector.detect_casing("FOO_BAR") :macro_case """ - @spec detect_casing(String.t()) :: :pascal_case | :camel_case | :snake_case | :macro_case | :kebab_case | :other + @spec detect_casing(String.t()) :: + :pascal_case | :camel_case | :snake_case | :macro_case | :kebab_case | :other def detect_casing(identifier) do cond do identifier =~ ~r/^[A-Z][a-zA-Z0-9]*$/ -> :pascal_case diff --git a/lib/codeqa/metrics/file/line_patterns.ex b/lib/codeqa/metrics/file/line_patterns.ex new file mode 100644 index 0000000..e8b2b45 --- /dev/null +++ b/lib/codeqa/metrics/file/line_patterns.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.Metrics.File.LinePatterns do + @moduledoc """ + Structural line-level and nesting metrics. + + ## Output keys + + - `"blank_line_ratio"` — blank lines / total lines (spacing/organisation signal) + - `"unique_line_ratio"` — distinct non-blank trimmed lines / total non-blank lines + (low values indicate repetition or boilerplate) + - `"max_nesting_depth"` — maximum bracket nesting depth across `()`, `[]`, `{}` + (complexity proxy independent of branching keywords) + - `"string_literal_ratio"` — quoted string literal spans / total tokens + (high values may indicate magic strings or hardcoded data) + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "line_patterns" + + @impl true + def keys, + do: ["blank_line_ratio", "unique_line_ratio", "max_nesting_depth", "string_literal_ratio"] + + @string_literal ~r/(?:"[^"]*"|'[^']*')/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, lines: lines, tokens: tokens}) do + total_lines = length(lines) + total_tokens = length(tokens) + + if total_lines == 0 do + %{ + "blank_line_ratio" => 0.0, + "unique_line_ratio" => 1.0, + "max_nesting_depth" => 0, + "string_literal_ratio" => 0.0 + } + else + blank_count = Enum.count(lines, &(String.trim(&1) == "")) + blank_ratio = Float.round(blank_count / 
total_lines, 4) + + non_blank = lines |> Enum.map(&String.trim/1) |> Enum.reject(&(&1 == "")) + + unique_ratio = + if non_blank == [], + do: 1.0, + else: Float.round(length(Enum.uniq(non_blank)) / length(non_blank), 4) + + string_count = @string_literal |> Regex.scan(content) |> length() + + string_ratio = + if total_tokens == 0, + do: 0.0, + else: Float.round(string_count / total_tokens, 4) + + %{ + "blank_line_ratio" => blank_ratio, + "unique_line_ratio" => unique_ratio, + "max_nesting_depth" => max_nesting_depth(content), + "string_literal_ratio" => string_ratio + } + end + end + + defp max_nesting_depth(content) do + content + |> String.graphemes() + |> Enum.reduce({0, 0}, fn + char, {depth, max_d} when char in ["(", "[", "{"] -> + new_depth = depth + 1 + {new_depth, max(max_d, new_depth)} + + char, {depth, max_d} when char in [")", "]", "}"] -> + {max(depth - 1, 0), max_d} + + _, acc -> + acc + end) + |> elem(1) + end +end diff --git a/lib/codeqa/metrics/magic_number_density.ex b/lib/codeqa/metrics/file/magic_number_density.ex similarity index 51% rename from lib/codeqa/metrics/magic_number_density.ex rename to lib/codeqa/metrics/file/magic_number_density.ex index 3e28bb4..20428df 100644 --- a/lib/codeqa/metrics/magic_number_density.ex +++ b/lib/codeqa/metrics/file/magic_number_density.ex @@ -1,10 +1,10 @@ -defmodule CodeQA.Metrics.MagicNumberDensity do +defmodule CodeQA.Metrics.File.MagicNumberDensity do @moduledoc """ - Measures the density of magic numbers in source code. + Measures the density of magic numbers and string literals in source code. - Counts numeric literals (excluding common constants 0, 1, 0.0, 1.0) as a - proportion of total tokens. A high density suggests unexplained constants - that should be extracted into named values. + Counts numeric literals (excluding common constants 0, 1, 0.0, 1.0) and + double-quoted string literals as proportions of total tokens. 
High densities + suggest unexplained constants or hardcoded values that should be extracted. Note: negative numbers (e.g. `-42`) are not detected since the minus sign is a separate token. @@ -12,22 +12,25 @@ defmodule CodeQA.Metrics.MagicNumberDensity do See [magic number](). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "magic_number_density" + @impl true + def keys, do: ["density", "magic_number_count", "string_literal_ratio"] + @number_re ~r/\b\d+\.?\d*(?:[eE][+-]?\d+)?\b/ @idiomatic_constants ~w[0 1 2 0.0 1.0 0.5] + @string_literal_re ~r/"(?:[^"\\]|\\.)*"/ @spec analyze(map()) :: map() @impl true def analyze(%{content: content, tokens: tokens}) do - token_list = Tuple.to_list(tokens) - total_tokens = length(token_list) + total_tokens = length(tokens) if total_tokens == 0 do - %{"density" => 0.0, "magic_number_count" => 0} + %{"density" => 0.0, "magic_number_count" => 0, "string_literal_ratio" => 0.0} else numbers = @number_re @@ -36,10 +39,12 @@ defmodule CodeQA.Metrics.MagicNumberDensity do |> Enum.reject(&(&1 in @idiomatic_constants)) magic_count = length(numbers) + string_count = @string_literal_re |> Regex.scan(content) |> length() %{ "density" => Float.round(magic_count / total_tokens, 4), - "magic_number_count" => magic_count + "magic_number_count" => magic_count, + "string_literal_ratio" => Float.round(string_count / total_tokens, 4) } end end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks.ex b/lib/codeqa/metrics/file/near_duplicate_blocks.ex new file mode 100644 index 0000000..e1e0c08 --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks.ex @@ -0,0 +1,198 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks do + @moduledoc """ + Near-duplicate block detection using natural code blocks. + + Detects blocks via blank-line boundaries and sub-blocks via bracket/indentation rules. 
+ Compares structurally similar blocks by token-level edit distance, bucketed as a + percentage of the smaller block's token count. + + Distance buckets: + d0 = exact (0%), d1 ≤ 5%, d2 ≤ 10%, d3 ≤ 15%, d4 ≤ 20%, + d5 ≤ 25%, d6 ≤ 30%, d7 ≤ 40%, d8 ≤ 50% + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Language + alias CodeQA.Metrics.File.NearDuplicateBlocks.Candidates + alias CodeQA.Metrics.File.NearDuplicateBlocks.Distance + + @max_bucket 8 + + # --------------------------------------------------------------------------- + # Public API — distance helpers delegated to Distance submodule + # --------------------------------------------------------------------------- + + @doc "Standard Levenshtein distance between two token lists." + @spec token_edit_distance([String.t()], [String.t()]) :: non_neg_integer() + defdelegate token_edit_distance(a, b), to: Distance + + @doc "Map an edit distance and min token count to a percentage bucket 0–8, or nil if > 50%." + @spec percent_bucket(non_neg_integer(), non_neg_integer()) :: 0..8 | nil + defdelegate percent_bucket(ed, min_count), to: Distance + + # --------------------------------------------------------------------------- + # Public API — analysis entry points + # --------------------------------------------------------------------------- + + @doc """ + Analyze a list of `{path, content}` pairs for near-duplicate blocks. + Returns count keys `near_dup_block_d0..d8`, `block_count`, `sub_block_count`. + With `include_pairs: true` in opts, also returns `_pairs` keys. 
+ """ + @dialyzer {:nowarn_function, analyze: 2} + @spec analyze([{String.t(), String.t()}], keyword()) :: map() + def analyze(labeled_content, opts) do + all_blocks = + Enum.flat_map(labeled_content, fn {path, content} -> + lang_mod = Language.detect(path) + tokens = TokenNormalizer.normalize_structural(content) + + Parser.detect_blocks(tokens, lang_mod) + |> label_blocks(path) + end) + + analyze_from_blocks(all_blocks, opts) + end + + @doc """ + Analyze a pre-built list of labeled `Node.t()` structs for near-duplicate blocks. + Skips tokenization and block detection — use when blocks are already available. + Returns the same keys as `analyze/2`. + """ + @dialyzer {:nowarn_function, analyze_from_blocks: 2} + @spec analyze_from_blocks([Node.t()], keyword()) :: map() + def analyze_from_blocks(all_blocks, opts) do + workers = Keyword.get(opts, :workers, System.schedulers_online()) + max_pairs = Keyword.get(opts, :max_pairs_per_bucket, nil) + include_pairs = Keyword.get(opts, :include_pairs, false) + + block_count = length(all_blocks) + + find_pairs_opts = + [workers: workers, max_pairs_per_bucket: max_pairs] ++ + Keyword.take(opts, [:on_progress, :idf_max_freq]) + + # do_find_pairs computes sub_block_count from the decorated list it already + # builds, eliminating the separate NodeProtocol.children pass. + {buckets, sub_block_count} = do_find_pairs(all_blocks, find_pairs_opts) + + result = + for d <- 0..@max_bucket, into: %{} do + {"near_dup_block_d#{d}", Map.get(buckets, d, %{count: 0}).count} + end + + result = + Map.merge(result, %{"block_count" => block_count, "sub_block_count" => sub_block_count}) + + case include_pairs do + true -> + pairs_result = + for d <- 0..@max_bucket, into: %{} do + {"near_dup_block_d#{d}_pairs", + Map.get(buckets, d, %{pairs: []}).pairs |> format_pairs()} + end + + Map.merge(result, pairs_result) + + false -> + result + end + end + + @doc "Find near-duplicate pairs across a list of %Node{} structs." 
+ @spec find_pairs([Node.t()], keyword()) :: map() + def find_pairs(blocks, opts) do + {buckets, _sub_block_count} = do_find_pairs(blocks, opts) + buckets + end + + @doc false + def label_blocks(blocks, path) do + Enum.map(blocks, fn block -> + label = if block.start_line, do: "#{path}:#{block.start_line}", else: path + %{block | label: label} + end) + end + + # --------------------------------------------------------------------------- + # Internal pair-finding pipeline + # --------------------------------------------------------------------------- + + # Internal implementation returning {buckets, sub_block_count} so that + # analyze_from_blocks gets both without a redundant NodeProtocol.children pass. + defp do_find_pairs(blocks, opts) do + workers = Keyword.get(opts, :workers, System.schedulers_online()) + max_pairs = Keyword.get(opts, :max_pairs_per_bucket, nil) + idf_max_freq = Keyword.get(opts, :idf_max_freq, 1.0) + has_progress = Keyword.has_key?(opts, :on_progress) + + if length(blocks) < 2 do + {%{}, 0} + else + decorated = Candidates.decorate(blocks) + + # sub_block_count derived from the already-computed children_count in decorated. + sub_block_count = + Enum.reduce(decorated, 0, fn {_, _, _, _, _, cc, _, _}, acc -> acc + cc end) + + # IDF: prune bigrams that appear in more than idf_max_freq fraction of blocks. + # These are structural noise (e.g. "end nil", "return false") that inflate the + # candidate set without helping identify true duplicates. + pruned = Candidates.compute_frequent_bigrams(decorated, idf_max_freq) + + decorated = + if MapSet.size(pruned) > 0 do + Enum.map(decorated, &Candidates.prune_bigrams(&1, pruned)) + else + decorated + end + + {exact_index, shingle_index} = Candidates.build_indexes(decorated) + + total = length(decorated) + # Convert to tuple for O(1) indexed lookup inside the hot comparison loop. 
+ decorated_arr = List.to_tuple(decorated) + + if has_progress, + do: IO.puts(:stderr, " Comparing #{total} blocks for near-duplicates...") + + raw_pairs = + decorated + |> Flow.from_enumerable(max_demand: 10, stages: workers) + |> Flow.flat_map( + &Candidates.find_pairs_for_block(&1, decorated_arr, exact_index, shingle_index) + ) + |> Enum.to_list() + + {bucket_pairs(raw_pairs, max_pairs), sub_block_count} + end + end + + defp bucket_pairs(raw_pairs, max_pairs) do + Enum.reduce(raw_pairs, %{}, fn {bucket, pair}, acc -> + Map.update( + acc, + bucket, + %{count: 1, pairs: maybe_append([], pair, max_pairs, 0)}, + fn existing -> + %{ + count: existing.count + 1, + pairs: maybe_append(existing.pairs, pair, max_pairs, existing.count) + } + end + ) + end) + end + + # Uses the already-tracked count instead of length(list) to avoid an O(n) walk. + defp maybe_append(list, _pair, max, count) when is_integer(max) and count >= max, do: list + defp maybe_append(list, pair, _max, _count), do: [pair | list] + + defp format_pairs(pairs) do + Enum.map(pairs, fn {label_a, label_b} -> + %{"source_a" => label_a, "source_b" => label_b} + end) + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex b/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex new file mode 100644 index 0000000..01fa53e --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex @@ -0,0 +1,214 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks.Candidates do + @moduledoc """ + Block fingerprinting, indexing, and candidate-pair matching for near-duplicate detection. 
+ + Handles: + - Canonical token-value extraction (stripping leading/trailing whitespace tokens) + - Exact-hash and shingle indexes for fast candidate lookup + - IDF-based bigram pruning to reduce structural-noise candidates + - Structural compatibility checks (child-count and line-ratio guards) + - Pair scoring and bucketing + """ + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + alias CodeQA.Metrics.File.NearDuplicateBlocks.Distance + + # Pre-compute token kind strings to avoid repeated function calls in the hot path. + @nl_kind NewlineToken.kind() + @ws_kind WhitespaceToken.kind() + + @doc """ + Decorate a list of blocks with pre-computed canonical values, hashes, bigrams, and + structural metadata. Each entry is an 8-tuple: + + {index, block, values, hash, len_values, children_count, newline_count, bigrams} + """ + @spec decorate([term()]) :: [tuple()] + def decorate(blocks) do + blocks + |> Enum.with_index() + |> Enum.map(fn {block, i} -> + values = canonical_values(NodeProtocol.flat_tokens(block)) + children_count = length(NodeProtocol.children(block)) + newline_count = Enum.count(values, &(&1 == @nl_kind)) + bigrams = Enum.chunk_every(values, 2, 1, :discard) + + {i, block, values, :erlang.phash2(values), length(values), children_count, newline_count, + bigrams} + end) + end + + @doc """ + Build both exact (hash → [idx]) and shingle (bigram_hash → [idx]) indexes in one pass, + using the pre-computed values from the decorated list. 
+ """ + @spec build_indexes([tuple()]) :: {map(), map()} + def build_indexes(decorated) do + Enum.reduce(decorated, {%{}, %{}}, fn {idx, _block, _values, hash, _len, _children, _newlines, + bigrams}, + {exact_acc, shingle_acc} -> + exact_acc = Map.update(exact_acc, hash, [idx], &[idx | &1]) + + shingle_acc = + bigrams + |> Enum.reduce(shingle_acc, fn bigram, sh_acc -> + h = :erlang.phash2(bigram) + Map.update(sh_acc, h, [idx], &[idx | &1]) + end) + + {exact_acc, shingle_acc} + end) + end + + @doc """ + Returns the set of bigram hashes that appear in more than `max_freq` fraction of blocks. + + Minimum threshold of 2 so a bigram must appear in 3+ blocks before being pruned — + prevents over-pruning when the total block count is very small. + """ + @spec compute_frequent_bigrams([tuple()], float()) :: MapSet.t() + def compute_frequent_bigrams(decorated, max_freq) do + total = length(decorated) + threshold = max(2, round(total * max_freq)) + + decorated + |> Enum.reduce(%{}, fn {_, _, _, _, _, _, _, bigrams}, acc -> + bigrams + |> Enum.uniq_by(&:erlang.phash2/1) + |> Enum.reduce(acc, fn bigram, a -> + Map.update(a, :erlang.phash2(bigram), 1, &(&1 + 1)) + end) + end) + |> Enum.filter(fn {_, count} -> count > threshold end) + |> Enum.map(&elem(&1, 0)) + |> MapSet.new() + end + + @doc "Remove bigrams whose hash is in the pruned set from a decorated tuple." + @spec prune_bigrams(tuple(), MapSet.t()) :: tuple() + def prune_bigrams({i, b, v, h, l, c, n, bigrams}, pruned) do + {i, b, v, h, l, c, n, Enum.reject(bigrams, &MapSet.member?(pruned, :erlang.phash2(&1)))} + end + + @doc """ + Find all near-duplicate pairs for a single block against the full decorated array. + Returns a list of `{bucket, {label_a, label_b}}` pairs. 
+ """ + @spec find_pairs_for_block(tuple(), tuple(), map(), map()) :: list() + def find_pairs_for_block( + {i, block_a, values_a, hash_a, len_a, children_a, newlines_a, bigrams_a}, + decorated_arr, + exact_index, + shingle_index + ) do + # For small exact-match lists (typically 0–3 entries) a plain list membership + # check avoids the overhead of constructing a MapSet. + exact_list = Map.get(exact_index, hash_a, []) + + # For d0 (exact), find hash-matching blocks and confirm with value equality + # to guard against phash2 collisions. + exact_pairs = + exact_list + |> Enum.filter(&(&1 > i)) + |> Enum.map(fn j -> + {_j, block_b, values_b, _hash_b, _len_b, children_b, newlines_b, _bigrams_b} = + elem(decorated_arr, j) + + if values_b == values_a and + structure_compatible?(children_a, newlines_a, children_b, newlines_b) do + {0, {block_a.label, block_b.label}} + else + nil + end + end) + |> Enum.reject(&is_nil/1) + + # For d1-d8 (near), use shingle index to find candidates. + min_shared = max(0, round(len_a * 0.5) - 1) + + near_pairs = + bigrams_a + |> Enum.reduce(%{}, fn bigram, acc -> + h = :erlang.phash2(bigram) + Map.get(shingle_index, h, []) |> Enum.reduce(acc, &count_candidate(&1, &2, i)) + end) + |> Enum.filter(fn {_, count} -> count >= min_shared end) + |> Enum.map(&elem(&1, 0)) + |> Enum.reject(fn j -> j in exact_list end) + |> Enum.flat_map(fn j -> + near_pair_for_candidate( + j, + decorated_arr, + block_a, + values_a, + len_a, + children_a, + newlines_a + ) + end) + + exact_pairs ++ near_pairs + end + + # --------------------------------------------------------------------------- + # Private helpers + # --------------------------------------------------------------------------- + + # Strip leading/trailing newline and whitespace tokens and extract kind values as strings. + # Optimised to 3 passes: one reduce (skip leading NL/WS + collect reversed kinds), + # one drop_while (strip trailing), one :lists.reverse.
+  defp canonical_values(tokens) do
+    # Newline and whitespace kinds are the only ones trimmed from either end.
+    trimmable? = fn kind -> kind == @nl_kind or kind == @ws_kind end
+
+    # Pass 1: walk the tokens once, ignoring trimmable kinds until the first
+    # other token is seen, and collect every kind from then on (newest first).
+    {kinds_newest_first, _started?} =
+      Enum.reduce(tokens, {[], false}, fn token, {collected, started?} ->
+        kind = token.kind
+
+        cond do
+          started? -> {[kind | collected], true}
+          trimmable?.(kind) -> {collected, false}
+          true -> {[kind | collected], true}
+        end
+      end)
+
+    # Pass 2 drops the trailing trimmable kinds (they sit at the head of the
+    # reversed list); pass 3 restores source order.
+    trailing_trimmed = Enum.drop_while(kinds_newest_first, trimmable?)
+    :lists.reverse(trailing_trimmed)
+  end
+
+  # Bumps the tally for candidate index `j` in `cnt`, but only when `j` is
+  # greater than the current block index `i`; otherwise `cnt` is returned
+  # unchanged.
+  defp count_candidate(j, cnt, i) do
+    if j > i do
+      Map.update(cnt, j, 1, fn seen -> seen + 1 end)
+    else
+      cnt
+    end
+  end
+
+  # Scores candidate `j` against block A and returns a zero- or one-element
+  # list of `{bucket, {label_a, label_b}}`.
+  defp near_pair_for_candidate(j, decorated_arr, block_a, values_a, len_a, children_a, newlines_a) do
+    {_j, block_b, values_b, _hash_b, len_b, children_b, newlines_b, _bigrams_b} =
+      elem(decorated_arr, j)
+
+    shorter_len = min(len_a, len_b)
+    # Edit budget: half of the shorter block's token count, rounded.
+    edit_budget = round(shorter_len * 0.5)
+
+    cond do
+      not structure_compatible?(children_a, newlines_a, children_b, newlines_b) ->
+        []
+
+      abs(len_a - len_b) > edit_budget ->
+        []
+
+      true ->
+        distance = Distance.token_edit_distance_bounded(values_a, values_b, edit_budget)
+        bucket = Distance.percent_bucket(distance, shorter_len)
+
+        # A `nil` bucket and bucket 0 both yield nothing; ed=0 is handled by
+        # the exact-match pairs in find_pairs_for_block. The is_integer/1
+        # check is required because `nil > 0` is true under term ordering.
+        if is_integer(bucket) and bucket > 0 do
+          [{bucket, {block_a.label, block_b.label}}]
+        else
+          []
+        end
+    end
+  end
+
+  # Uses pre-computed children counts and newline counts from the decorated tuple
+  # so NodeProtocol.children/1 and Enum.count/2 are not called per candidate pair.
+ defp structure_compatible?(children_a, newlines_a, children_b, newlines_b) do + sub_diff = abs(children_a - children_b) + lines_a = newlines_a + 1 + lines_b = newlines_b + 1 + max_lines = max(lines_a, lines_b) + line_ratio = if max_lines > 0, do: abs(lines_a - lines_b) / max_lines, else: 0.0 + sub_diff <= 1 and line_ratio <= 0.30 + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex b/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex new file mode 100644 index 0000000..475aa3e --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex @@ -0,0 +1,114 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks.Distance do + @moduledoc """ + Token-level edit distance and percentage-bucket classification for near-duplicate detection. + + Provides standard Levenshtein distance, a bounded variant that short-circuits + when the distance already exceeds a threshold, and a bucket classifier that maps + an edit distance + minimum token count to a similarity bucket (d0–d8). + + Distance buckets: + d0 = exact (0%), d1 ≤ 5%, d2 ≤ 10%, d3 ≤ 15%, d4 ≤ 20%, + d5 ≤ 25%, d6 ≤ 30%, d7 ≤ 40%, d8 ≤ 50% + """ + + @bucket_thresholds [ + {0, 0.0}, + {1, 0.05}, + {2, 0.10}, + {3, 0.15}, + {4, 0.20}, + {5, 0.25}, + {6, 0.30}, + {7, 0.40}, + {8, 0.50} + ] + + @doc "Standard Levenshtein distance between two token lists." 
+  @spec token_edit_distance([String.t()], [String.t()]) :: non_neg_integer()
+  # When either list is empty the distance is the length of the other list.
+  def token_edit_distance([], b), do: length(b)
+  def token_edit_distance(a, []), do: length(a)
+
+  def token_edit_distance(a, b) do
+    # Both lists become tuples so the row/column helpers can index with elem/2.
+    a_arr = List.to_tuple(a)
+    b_arr = List.to_tuple(b)
+    lb = tuple_size(b_arr)
+    # Row 0 of the DP table: 0..lb, i.e. the cost of building each prefix of
+    # `b` from an empty prefix of `a`.
+    init_row = List.to_tuple(Enum.to_list(0..lb))
+    result_row = levenshtein_rows(a_arr, b_arr, tuple_size(a_arr), lb, init_row, 1)
+    # The last cell of the last row is the distance between the full lists.
+    elem(result_row, lb)
+  end
+
+  # Row driver: `i` is the 1-based row index into `a`, `prev` is the previous
+  # row as a tuple. Once `i` passes `la` the last computed row is returned.
+  defp levenshtein_rows(_a, _b, la, _lb, prev, i) when i > la, do: prev
+
+  defp levenshtein_rows(a, b, la, lb, prev, i) do
+    ai = elem(a, i - 1)
+    # Column 0 of row i is `i`; the row is accumulated newest-first and
+    # reversed before being turned back into a tuple.
+    curr_reversed = levenshtein_cols(b, lb, prev, ai, [i], 1)
+    curr = List.to_tuple(:lists.reverse(curr_reversed))
+    levenshtein_rows(a, b, la, lb, curr, i + 1)
+  end
+
+  # Column driver: `j` is the 1-based column index into `b`; `acc` holds the
+  # cells of the current row in reverse, so its head is the cell to the left.
+  defp levenshtein_cols(_b, lb, _prev, _ai, acc, j) when j > lb, do: acc
+
+  defp levenshtein_cols(b, lb, prev, ai, [last_val | _] = acc, j) do
+    # Substitution is free when the two tokens are equal.
+    cost = if ai == elem(b, j - 1), do: 0, else: 1
+    # Minimum of: cell above + 1, cell to the left + 1, diagonal cell + cost.
+    val = min(elem(prev, j) + 1, min(last_val + 1, elem(prev, j - 1) + cost))
+    levenshtein_cols(b, lb, prev, ai, [val | acc], j + 1)
+  end
+
+  # Bounded Levenshtein: returns the edit distance, or max_distance + 1 if the
+  # distance would exceed max_distance. Bails after each row when the row minimum
+  # already exceeds max_distance — the final distance can only grow from there.
+  @doc false
+  # Bounded Levenshtein distance between two token lists.
+  #
+  # Returns the exact edit distance when it is <= max_distance, and exactly
+  # max_distance + 1 otherwise. Every exit path is clamped, including the
+  # empty-list shortcuts and the final row, so callers can rely on the result
+  # never exceeding max_distance + 1.
+  @spec token_edit_distance_bounded([String.t()], [String.t()], non_neg_integer()) ::
+          non_neg_integer()
+  def token_edit_distance_bounded([], b, max_distance), do: min(length(b), max_distance + 1)
+  def token_edit_distance_bounded(a, [], max_distance), do: min(length(a), max_distance + 1)
+
+  def token_edit_distance_bounded(a, b, max_distance) do
+    # Both lists become tuples so the row/column helpers can index with elem/2.
+    a_arr = List.to_tuple(a)
+    b_arr = List.to_tuple(b)
+    lb = tuple_size(b_arr)
+    # Row 0 of the DP table: 0..lb.
+    init_row = List.to_tuple(Enum.to_list(0..lb))
+    levenshtein_rows_bounded(a_arr, b_arr, tuple_size(a_arr), lb, init_row, max_distance, 1)
+  end
+
+  # All rows processed without bailing: the last cell is the true distance,
+  # which can still be above the bound even though some cell in the row was
+  # within it, so it is clamped here as well.
+  defp levenshtein_rows_bounded(_a, _b, la, lb, prev, max_distance, i) when i > la,
+    do: min(elem(prev, lb), max_distance + 1)
+
+  defp levenshtein_rows_bounded(a, b, la, lb, prev, max_distance, i) do
+    ai = elem(a, i - 1)
+    # levenshtein_cols_with_min tracks the row minimum as it builds, avoiding
+    # a separate O(lb) pass to find the min after the row is complete.
+    {curr_reversed, row_min} = levenshtein_cols_with_min(b, lb, prev, ai, {[i], i}, 1)
+    curr = List.to_tuple(:lists.reverse(curr_reversed))
+
+    # Once every cell of a row is above the bound the final distance must be
+    # too, so stop early.
+    if row_min > max_distance do
+      max_distance + 1
+    else
+      levenshtein_rows_bounded(a, b, la, lb, curr, max_distance, i + 1)
+    end
+  end
+
+  # Column driver: the accumulator pairs the reversed row built so far with
+  # the smallest cell value seen in this row.
+  defp levenshtein_cols_with_min(_b, lb, _prev, _ai, acc_and_min, j) when j > lb,
+    do: acc_and_min
+
+  defp levenshtein_cols_with_min(b, lb, prev, ai, {[last_val | _] = acc, min_val}, j) do
+    # Substitution is free when the two tokens are equal.
+    cost = if ai == elem(b, j - 1), do: 0, else: 1
+    # Minimum of: cell above + 1, cell to the left + 1, diagonal cell + cost.
+    val = min(elem(prev, j) + 1, min(last_val + 1, elem(prev, j - 1) + cost))
+    levenshtein_cols_with_min(b, lb, prev, ai, {[val | acc], min(min_val, val)}, j + 1)
+  end
+
+  @doc "Map an edit distance and min token count to a percentage bucket 0–8, or nil if > 50%."
+ @spec percent_bucket(non_neg_integer(), non_neg_integer()) :: 0..8 | nil + def percent_bucket(_ed, 0), do: nil + def percent_bucket(0, _min_count), do: 0 + + def percent_bucket(ed, min_count) do + pct = ed / min_count + + @bucket_thresholds + |> Enum.find(fn {bucket, threshold} -> bucket > 0 and pct <= threshold end) + |> case do + {bucket, _} -> bucket + nil -> nil + end + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex b/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex new file mode 100644 index 0000000..7a15e74 --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex @@ -0,0 +1,39 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocksFile do + @moduledoc """ + Counts near-duplicate and exact-duplicate natural code blocks within a single file. + + Blocks are detected at blank-line boundaries with sub-block detection via bracket rules. + Distance is a percentage of the smaller block's token count, bucketed d0–d8. + Also reports block_count and sub_block_count as standalone metrics. 
+ """ + + @behaviour CodeQA.Metrics.File.FileMetric + + alias CodeQA.Metrics.File.NearDuplicateBlocks + + @impl true + def name, do: "near_duplicate_blocks_file" + + @impl true + def keys do + ["block_count", "sub_block_count"] ++ for(d <- 0..8, do: "near_dup_block_d#{d}") + end + + @impl true + def analyze(%{blocks: nil}), do: Map.new(keys(), fn k -> {k, 0} end) + + def analyze(%{path: path, blocks: blocks}) when is_list(blocks) do + NearDuplicateBlocks.analyze_from_blocks( + NearDuplicateBlocks.label_blocks(blocks, path || "unknown"), + [] + ) + |> Map.reject(fn {k, _} -> String.ends_with?(k, "_pairs") end) + end + + def analyze(ctx) do + path = ctx.path || "unknown" + + NearDuplicateBlocks.analyze([{path, ctx.content}], []) + |> Map.reject(fn {k, _} -> String.ends_with?(k, "_pairs") end) + end +end diff --git a/lib/codeqa/metrics/ngram.ex b/lib/codeqa/metrics/file/ngram.ex similarity index 71% rename from lib/codeqa/metrics/ngram.ex rename to lib/codeqa/metrics/file/ngram.ex index fb2b44b..b100513 100644 --- a/lib/codeqa/metrics/ngram.ex +++ b/lib/codeqa/metrics/file/ngram.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Ngram do +defmodule CodeQA.Metrics.File.Ngram do @moduledoc """ Computes bigram and trigram statistics over the token stream. @@ -10,15 +10,30 @@ defmodule CodeQA.Metrics.Ngram do and [hapax legomenon](https://en.wikipedia.org/wiki/Hapax_legomenon). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "ngram" + @impl true + def keys, + do: [ + "bigram_total", + "bigram_unique", + "bigram_repetition_rate", + "bigram_hapax_fraction", + "bigram_repeated_unique", + "trigram_total", + "trigram_unique", + "trigram_repetition_rate", + "trigram_hapax_fraction", + "trigram_repeated_unique" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do - tokens = Tuple.to_list(ctx.tokens) + tokens = Enum.map(ctx.tokens, & &1.content) bigram_stats = ngram_stats(tokens, 2) |> rename_keys("bigram") trigram_stats = ngram_stats(tokens, 3) |> rename_keys("trigram") @@ -27,7 +42,13 @@ defmodule CodeQA.Metrics.Ngram do end defp ngram_stats(tokens, n) when length(tokens) < n do - %{"total" => 0, "unique" => 0, "repetition_rate" => 0.0, "hapax_fraction" => 0.0, "repeated_unique" => 0} + %{ + "total" => 0, + "unique" => 0, + "repetition_rate" => 0.0, + "hapax_fraction" => 0.0, + "repeated_unique" => 0 + } end defp ngram_stats(tokens, n) do diff --git a/lib/codeqa/metrics/file/punctuation_density.ex b/lib/codeqa/metrics/file/punctuation_density.ex new file mode 100644 index 0000000..8b42ee4 --- /dev/null +++ b/lib/codeqa/metrics/file/punctuation_density.ex @@ -0,0 +1,96 @@ +defmodule CodeQA.Metrics.File.PunctuationDensity do + @moduledoc """ + Character-level punctuation and structural pattern metrics. + + Captures signals that character-level metrics miss: naming conventions using + `?`/`!` suffixes, chained method calls (dots), non-standard bracket adjacency, + and numeric bracket pair patterns. 
+ """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "punctuation_density" + + @impl true + def keys do + [ + "question_mark_density", + "exclamation_density", + "dot_count", + "id_nonalpha_suffix_density", + "bracket_nonalpha_prefix_count", + "bracket_nonalpha_suffix_count", + "bracket_number_pair_count", + "arrow_density", + "colon_suffix_density" + ] + end + + # identifier-like token (starts with letter/underscore) ending with non-alphanumeric non-whitespace + @id_nonalpha_suffix ~r/[a-zA-Z_]\w*[^\w\s]/ + # opening bracket immediately preceded by non-alphanumeric non-whitespace (e.g. `?(`, `==[`) + @bracket_nonalpha_prefix ~r/[^\w\s\(\[\{][\(\[\{]/ + # closing bracket immediately followed by non-alphanumeric non-whitespace (e.g. `}.`, `)?`) + @bracket_nonalpha_suffix ~r/[\)\]\}][^\w\s\)\]\}]/ + # number (with optional underscores) wrapped in brackets: (42), [1_000], (3.14) + @bracket_number_pair ~r/[\(\[]\d[\d_]*(?:\.\d+)?[\)\]]/ + # arrow operators: -> and => + @arrow ~r/->|=>/ + # identifier immediately followed by colon (keyword args, dict keys, labels) + @colon_suffix ~r/[a-zA-Z_]\w*:/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, tokens: tokens}) do + total_chars = String.length(content) + total_tokens = length(tokens) + + if total_chars == 0 do + %{ + "question_mark_density" => 0.0, + "exclamation_density" => 0.0, + "dot_count" => 0, + "id_nonalpha_suffix_density" => 0.0, + "bracket_nonalpha_prefix_count" => 0, + "bracket_nonalpha_suffix_count" => 0, + "bracket_number_pair_count" => 0, + "arrow_density" => 0.0, + "colon_suffix_density" => 0.0 + } + else + qmarks = count_char(content, "?") + excls = count_char(content, "!") + dots = count_char(content, ".") + + id_suffix_count = count_matches(content, @id_nonalpha_suffix) + bracket_prefix = count_matches(content, @bracket_nonalpha_prefix) + bracket_suffix = count_matches(content, @bracket_nonalpha_suffix) + bracket_num = count_matches(content, 
@bracket_number_pair) + + id_denom = max(total_tokens, 1) + arrows = count_matches(content, @arrow) + colon_suffixes = count_matches(content, @colon_suffix) + + %{ + "question_mark_density" => Float.round(qmarks / total_chars, 6), + "exclamation_density" => Float.round(excls / total_chars, 6), + "dot_count" => dots, + "id_nonalpha_suffix_density" => Float.round(id_suffix_count / id_denom, 4), + "bracket_nonalpha_prefix_count" => bracket_prefix, + "bracket_nonalpha_suffix_count" => bracket_suffix, + "bracket_number_pair_count" => bracket_num, + "arrow_density" => Float.round(arrows / id_denom, 4), + "colon_suffix_density" => Float.round(colon_suffixes / id_denom, 4) + } + end + end + + defp count_char(content, char) do + content |> String.graphemes() |> Enum.count(&(&1 == char)) + end + + defp count_matches(content, regex) do + regex |> Regex.scan(content) |> length() + end +end diff --git a/lib/codeqa/metrics/readability.ex b/lib/codeqa/metrics/file/readability.ex similarity index 89% rename from lib/codeqa/metrics/readability.ex rename to lib/codeqa/metrics/file/readability.ex index 5ffa9e1..3e1bd2c 100644 --- a/lib/codeqa/metrics/readability.ex +++ b/lib/codeqa/metrics/file/readability.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Readability do +defmodule CodeQA.Metrics.File.Readability do @moduledoc """ Computes adapted Flesch and Fog readability indices for source code. @@ -10,17 +10,27 @@ defmodule CodeQA.Metrics.Readability do and [Gunning fog index](https://en.wikipedia.org/wiki/Gunning_fog_index). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "readability" + @impl true + def keys, + do: [ + "avg_tokens_per_line", + "avg_line_length", + "avg_sub_words_per_id", + "flesch_adapted", + "fog_adapted", + "total_lines" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do lines = ctx.lines - |> Tuple.to_list() |> Enum.filter(fn line -> trimmed = String.trim(line) trimmed != "" and not String.starts_with?(trimmed, "#") @@ -42,11 +52,11 @@ defmodule CodeQA.Metrics.Readability do defp compute_readability(ctx, lines) do total_lines = length(lines) - total_tokens = tuple_size(ctx.tokens) + total_tokens = length(ctx.tokens) avg_tokens = total_tokens / total_lines avg_line_length = lines |> Enum.map(&String.length/1) |> Enum.sum() |> Kernel./(total_lines) - words = Tuple.to_list(ctx.words) + words = ctx.words {avg_sub_words, complex_fraction} = if words != [] do diff --git a/lib/codeqa/metrics/file/rfc.ex b/lib/codeqa/metrics/file/rfc.ex new file mode 100644 index 0000000..5416c68 --- /dev/null +++ b/lib/codeqa/metrics/file/rfc.ex @@ -0,0 +1,81 @@ +defmodule CodeQA.Metrics.File.RFC do + @moduledoc """ + Response For a Class (RFC) — a coupling metric from the Chidamber & Kemerer suite. + + RFC ≈ number of distinct methods/functions reachable from this file, counting + both locally-defined functions and distinct external call targets. + + Formula: `RFC = function_def_count + |distinct_call_targets|` + + Computed from the token stream without requiring a real AST: + - Function definitions are detected by function-keyword tokens (`def`, `fn`, etc.) + followed by an identifier token. + - Call targets are detected by identifier tokens immediately followed by `(`. + Duplicates are collapsed to a set. + + Higher RFC values indicate a module with more responsibility and more external + coupling, correlating empirically with higher fault density.
+ + See [CK metrics suite](https://en.wikipedia.org/wiki/Programming_complexity#Chidamber_and_Kemerer_metrics). + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @func_keywords MapSet.new(~w[ + def defp defmacro defmacrop defguard defdelegate + function func fun fn + sub proc method + ]) + + @impl true + def name, do: "rfc" + + @impl true + def keys, do: ["rfc_count", "rfc_density", "function_def_count", "distinct_call_count"] + + @impl true + def description, + do: "Response For a Class: function definitions + distinct call targets (CK suite)" + + @spec analyze(CodeQA.Engine.FileContext.t()) :: map() + @impl true + def analyze(%{tokens: tokens, line_count: line_count}) do + {func_def_count, call_targets} = scan_tokens(tokens) + + distinct_call_count = MapSet.size(call_targets) + rfc_count = func_def_count + distinct_call_count + + density = + if line_count > 0, + do: Float.round(rfc_count / line_count, 4), + else: 0.0 + + %{ + "rfc_count" => rfc_count, + "rfc_density" => density, + "function_def_count" => func_def_count, + "distinct_call_count" => distinct_call_count + } + end + + # Single pass: detect function definitions and call sites simultaneously. + # Uses a sliding window of two adjacent tokens. 
+ defp scan_tokens(tokens) do + tokens + |> Enum.zip(Enum.drop(tokens, 1)) + |> Enum.reduce({0, MapSet.new()}, fn {tok, next}, {defs, calls} -> + cond do + # Function definition: keyword followed by an identifier + MapSet.member?(@func_keywords, tok.content) and next.kind == "" -> + {defs + 1, calls} + + # Call site: identifier followed by open paren + tok.kind == "" and next.content == "(" -> + {defs, MapSet.put(calls, tok.content)} + + true -> + {defs, calls} + end + end) + end +end diff --git a/lib/codeqa/metrics/symbol_density.ex b/lib/codeqa/metrics/file/symbol_density.ex similarity index 85% rename from lib/codeqa/metrics/symbol_density.ex rename to lib/codeqa/metrics/file/symbol_density.ex index 67459a0..3e71bf3 100644 --- a/lib/codeqa/metrics/symbol_density.ex +++ b/lib/codeqa/metrics/file/symbol_density.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.SymbolDensity do +defmodule CodeQA.Metrics.File.SymbolDensity do @moduledoc """ Measures the density of non-word, non-whitespace symbols in source code. @@ -9,11 +9,14 @@ defmodule CodeQA.Metrics.SymbolDensity do See [code readability](https://en.wikipedia.org/wiki/Computer_programming#Readability_of_source_code). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "symbol_density" + @impl true + def keys, do: ["density", "symbol_count", "distinct_symbol_types"] + @spec analyze(map()) :: map() @impl true def analyze(%{content: content}) do diff --git a/lib/codeqa/metrics/vocabulary.ex b/lib/codeqa/metrics/file/vocabulary.ex similarity index 91% rename from lib/codeqa/metrics/vocabulary.ex rename to lib/codeqa/metrics/file/vocabulary.ex index d9ef637..496cc68 100644 --- a/lib/codeqa/metrics/vocabulary.ex +++ b/lib/codeqa/metrics/file/vocabulary.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Vocabulary do +defmodule CodeQA.Metrics.File.Vocabulary do @moduledoc """ Analyzes vocabulary diversity using type-token ratio (TTR) and MATTR. 
@@ -14,19 +14,22 @@ defmodule CodeQA.Metrics.Vocabulary do and [MATTR](https://doi.org/10.3758/BRM.42.2.381). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "vocabulary" + @impl true + def keys, do: ["raw_ttr", "mattr", "unique_identifiers", "total_identifiers", "vocabulary"] + @window_size 100 @spec analyze(map()) :: map() @impl true def analyze(ctx) do - identifiers = Tuple.to_list(ctx.identifiers) + identifiers = ctx.identifiers total = length(identifiers) - vocabulary = ctx.words |> Tuple.to_list() |> Enum.uniq() |> Enum.sort() + vocabulary = ctx.words |> Enum.uniq() |> Enum.sort() if total == 0 do %{ diff --git a/lib/codeqa/metrics/vowel_density.ex b/lib/codeqa/metrics/file/vowel_density.ex similarity index 86% rename from lib/codeqa/metrics/vowel_density.ex rename to lib/codeqa/metrics/file/vowel_density.ex index 84ea39e..f3f53de 100644 --- a/lib/codeqa/metrics/vowel_density.ex +++ b/lib/codeqa/metrics/file/vowel_density.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.VowelDensity do +defmodule CodeQA.Metrics.File.VowelDensity do @moduledoc """ Measures the density of vowels in identifiers. @@ -9,17 +9,20 @@ defmodule CodeQA.Metrics.VowelDensity do See [identifier naming](https://en.wikipedia.org/wiki/Identifier_(computer_languages)). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @vowels MapSet.new(~c"aeiouyAEIOUY") @impl true def name, do: "vowel_density" + @impl true + def keys, do: ["density", "vowel_count", "total_chars"] + @spec analyze(map()) :: map() @impl true def analyze(%{identifiers: identifiers}) do - list = Tuple.to_list(identifiers) + list = identifiers if list == [] do %{"density" => 0.0, "vowel_count" => 0, "total_chars" => 0} diff --git a/lib/codeqa/metrics/winnowing.ex b/lib/codeqa/metrics/file/winnowing.ex similarity index 96% rename from lib/codeqa/metrics/winnowing.ex rename to lib/codeqa/metrics/file/winnowing.ex index 9c8961c..d725a38 100644 --- a/lib/codeqa/metrics/winnowing.ex +++ b/lib/codeqa/metrics/file/winnowing.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Winnowing do +defmodule CodeQA.Metrics.File.Winnowing do @moduledoc """ Generates structural fingerprints using k-grams. diff --git a/lib/codeqa/metrics/zipf.ex b/lib/codeqa/metrics/file/zipf.ex similarity index 86% rename from lib/codeqa/metrics/zipf.ex rename to lib/codeqa/metrics/file/zipf.ex index 4948c3d..b03a07c 100644 --- a/lib/codeqa/metrics/zipf.ex +++ b/lib/codeqa/metrics/file/zipf.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Zipf do +defmodule CodeQA.Metrics.File.Zipf do @moduledoc """ Fits Zipf's law to the token frequency distribution. @@ -9,21 +9,24 @@ defmodule CodeQA.Metrics.Zipf do See [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "zipf" + @impl true + def keys, do: ["exponent", "r_squared", "vocab_size", "total_tokens"] + @spec analyze(map()) :: map() @impl true - def analyze(%{tokens: tokens, token_counts: _token_counts}) when tuple_size(tokens) == 0 do + def analyze(%{tokens: [], token_counts: _token_counts}) do %{"exponent" => 0.0, "r_squared" => 0.0, "vocab_size" => 0, "total_tokens" => 0} end def analyze(%{tokens: tokens, token_counts: token_counts}) do frequencies = token_counts |> Map.values() |> Enum.sort(:desc) vocab_size = length(frequencies) - total_tokens = tuple_size(tokens) + total_tokens = length(tokens) if vocab_size < 3 do %{ diff --git a/lib/codeqa/metrics/post_processing/menzerath.ex b/lib/codeqa/metrics/post_processing/menzerath.ex new file mode 100644 index 0000000..4b5b10c --- /dev/null +++ b/lib/codeqa/metrics/post_processing/menzerath.ex @@ -0,0 +1,282 @@ +defmodule CodeQA.Metrics.PostProcessing.Menzerath do + @moduledoc """ + Measures structural hierarchy conformance using Menzerath's law. + + ## Block-level score + + For each parsed block in a file, computes: + + ratio = block.line_count / parent.line_count + + Root blocks use the file's line count as parent. Ratio close to 1.0 means the block + dominates its parent (poor decomposition). Low ratio means the block is small relative + to its parent (good decomposition). + + For internal nodes that have children, also computes `avg_child_ratio` — the mean ratio + of direct children. High `avg_child_ratio` means this node failed to decompose its + children into small enough pieces. 
+ + ## Codebase-level score + + Collects `{function_count, avg_function_lines}` pairs from all files and computes: + - Pearson correlation (negative = law holds across the codebase) + - Power-law exponent `b` from `y = a · x^b` fit on log-log scale + - R² of the fit + """ + + @behaviour CodeQA.Metrics.PostProcessing.PostProcessingMetric + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Languages.Unknown + + @violation_threshold 0.6 + + @impl true + def name, do: "menzerath" + + @impl true + def analyze(pipeline_result, files_map, _opts) do + file_scores = + Map.new(files_map, fn {path, content} -> + {path, %{"menzerath" => score_file(content)}} + end) + + codebase_score = compute_codebase_score(pipeline_result) + + %{ + "files" => file_scores, + "codebase" => %{"menzerath" => codebase_score} + } + end + + # --- file-level scoring --- + + defp score_file("") do + %{ + "blocks" => [], + "mean_ratio" => 0.0, + "max_ratio" => 0.0, + "violation_count" => 0, + "insight" => "Empty file." + } + end + + defp score_file(content) do + file_lines = content |> String.split("\n") |> length() + root_tokens = TokenNormalizer.normalize_structural(content) + top_nodes = Parser.detect_blocks(root_tokens, Unknown) + + blocks = Enum.map(top_nodes, &score_node(&1, file_lines)) + all_ratios = collect_ratios(blocks) + n = length(all_ratios) + + mean_ratio = if(n == 0, do: 0.0, else: round4(Enum.sum(all_ratios) / n)) + max_ratio = if(n == 0, do: 0.0, else: round4(Enum.max(all_ratios))) + violation_count = Enum.count(all_ratios, &(&1 >= @violation_threshold)) + + %{ + "blocks" => blocks, + "mean_ratio" => mean_ratio, + "max_ratio" => max_ratio, + "violation_count" => violation_count, + "insight" => file_insight(mean_ratio, max_ratio, violation_count, length(top_nodes)) + } + end + + defp file_insight(_mean, _max, _violations, 0), + do: "No blocks detected." 
+ + defp file_insight(_mean, _max, 0, _block_count), + do: "Well decomposed — all blocks are small relative to their parents." + + defp file_insight(_mean, max_ratio, violations, _block_count) when max_ratio >= 0.9, + do: + "#{violations} block(s) nearly span the entire file — the file is not decomposed into meaningful pieces." + + defp file_insight(mean_ratio, _max, violations, _block_count) when mean_ratio >= 0.5, + do: + "#{violations} violation(s); blocks are large on average (mean ratio #{mean_ratio}) — the file likely needs to be split or its blocks extracted." + + defp file_insight(_mean, _max, violations, _block_count), + do: + "#{violations} block(s) dominate their parent context — consider extracting those into separate functions or modules." + + defp score_node(node, parent_lines) do + ratio = if parent_lines > 0, do: round4(node.line_count / parent_lines), else: 0.0 + + children = Enum.map(node.children, &score_node(&1, node.line_count)) + + base = %{ + "start_line" => node.start_line, + "end_line" => node.end_line, + "line_count" => node.line_count, + "parent_lines" => parent_lines, + "ratio" => ratio, + "insight" => block_insight(ratio, []), + "children" => children + } + + case children do + [] -> + base + + kids -> + child_ratios = Enum.map(kids, & &1["ratio"]) + avg = round4(Enum.sum(child_ratios) / length(child_ratios)) + + base + |> Map.put("avg_child_ratio", avg) + |> Map.put("insight", block_insight(ratio, avg_child_ratio: avg)) + end + end + + defp block_insight(ratio, opts) do + avg_child_ratio = Keyword.get(opts, :avg_child_ratio) + + cond do + ratio >= 0.9 -> + "Block spans nearly the entire parent — no meaningful decomposition at this level." + + (ratio >= @violation_threshold and avg_child_ratio) && + avg_child_ratio >= @violation_threshold -> + "Block is large relative to its parent and its own children are also large — nested decomposition failure." 
+ + ratio >= @violation_threshold -> + "Block is large relative to its parent — consider splitting or extracting." + + avg_child_ratio && avg_child_ratio >= @violation_threshold -> + "Block is reasonably sized but its children are too large — this block should be broken down further." + + true -> + nil + end + end + + defp collect_ratios(blocks) do + Enum.flat_map(blocks, fn block -> + [block["ratio"] | collect_ratios(block["children"])] + end) + end + + # --- codebase-level scoring --- + + defp compute_codebase_score(pipeline_result) do + pairs = + pipeline_result + |> Map.get("files", %{}) + |> Enum.flat_map(fn {_path, file_data} -> + fm = get_in(file_data, ["metrics", "function_metrics"]) || %{} + count = fm["function_count"] + avg = fm["avg_function_lines"] + + if is_number(count) and is_number(avg) and count > 0 do + [{count * 1.0, avg * 1.0}] + else + [] + end + end) + + n = length(pairs) + + if n < 3 do + %{ + "correlation" => nil, + "exponent" => nil, + "r_squared" => nil, + "sample_size" => n, + "insight" => + "Not enough files with function data to compute Menzerath conformance (need ≥ 3, got #{n})." + } + else + xs = Enum.map(pairs, &elem(&1, 0)) + ys = Enum.map(pairs, &elem(&1, 1)) + correlation = round4(pearson(xs, ys)) + {exponent, r_squared} = fit_power_law(xs, ys) + + %{ + "correlation" => correlation, + "exponent" => if(exponent, do: round4(exponent), else: nil), + "r_squared" => if(r_squared, do: round4(r_squared), else: nil), + "sample_size" => n, + "insight" => codebase_insight(correlation, r_squared) + } + end + end + + defp codebase_insight(correlation, r_squared) do + fit_quality = if r_squared && r_squared >= 0.5, do: " (strong fit, R²=#{r_squared})", else: "" + + cond do + correlation <= -0.3 -> + "Menzerath's law holds#{fit_quality} — larger files tend to have shorter functions, indicating healthy decomposition." + + correlation >= 0.3 -> + "Menzerath's law violated#{fit_quality} — larger files have longer functions. 
Files are growing without being decomposed; consider splitting large files or extracting functions." + + true -> + "Weak Menzerath signal (correlation #{correlation}) — no clear relationship between file size and function length. Decomposition patterns are inconsistent across the codebase." + end + end + + defp pearson(xs, ys) do + n = length(xs) + sum_x = Enum.sum(xs) + sum_y = Enum.sum(ys) + sum_xy = Enum.zip(xs, ys) |> Enum.reduce(0.0, fn {x, y}, acc -> acc + x * y end) + sum_x2 = Enum.reduce(xs, 0.0, fn x, acc -> acc + x * x end) + sum_y2 = Enum.reduce(ys, 0.0, fn y, acc -> acc + y * y end) + + num = n * sum_xy - sum_x * sum_y + den = :math.sqrt((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)) + + if den == 0.0, do: 0.0, else: num / den + end + + defp fit_power_law(xs, ys) do + # Linearize: log(y) = log(a) + b * log(x), fit via OLS on log-log scale + pairs = + Enum.zip(xs, ys) + |> Enum.filter(fn {x, y} -> x > 0 and y > 0 end) + + if length(pairs) < 2 do + {nil, nil} + else + log_xs = Enum.map(pairs, fn {x, _} -> :math.log(x) end) + log_ys = Enum.map(pairs, fn {_, y} -> :math.log(y) end) + + n = length(pairs) + sum_lx = Enum.sum(log_xs) + sum_ly = Enum.sum(log_ys) + sum_lx2 = Enum.reduce(log_xs, 0.0, fn x, acc -> acc + x * x end) + sum_lxly = Enum.zip(log_xs, log_ys) |> Enum.reduce(0.0, fn {x, y}, acc -> acc + x * y end) + + denom = n * sum_lx2 - sum_lx * sum_lx + + if denom == 0.0 do + {nil, nil} + else + fit_power_law_coefficients(log_xs, log_ys, sum_lx, sum_ly, sum_lxly, n, denom) + end + end + end + + defp fit_power_law_coefficients(log_xs, log_ys, sum_lx, sum_ly, sum_lxly, n, denom) do + b = (n * sum_lxly - sum_lx * sum_ly) / denom + log_a = (sum_ly - b * sum_lx) / n + mean_ly = sum_ly / n + + ss_tot = Enum.reduce(log_ys, 0.0, fn ly, acc -> acc + (ly - mean_ly) ** 2 end) + + ss_res = + Enum.zip(log_xs, log_ys) + |> Enum.reduce(0.0, fn {lx, ly}, acc -> + acc + (ly - (log_a + b * lx)) ** 2 + end) + + r_squared = if ss_tot == 0.0, do: 0.0, else: 
defmodule CodeQA.Metrics.PostProcessing.PostProcessingMetric do
  @moduledoc """
  Behaviour for post-processing metrics that derive values from the full pipeline result.

  Post-processing metrics run after both file and codebase metrics complete. They receive
  the full result tree and the raw files map, and return a partial result map that is
  deep-merged into the pipeline result.
  """

  # Implementations return the key under which their output is stored.
  @doc "Unique name used as the key in the output."
  @callback name() :: String.t()

  # Arguments, as described in the moduledoc:
  #   pipeline_result - the full result tree
  #   files_map       - the raw files map
  #   opts            - keyword options; which keys are passed is not specified
  #                     in this module (NOTE(review): confirm against the caller)
  @doc """
  Analyze the pipeline result and return a partial result map to be deep-merged.

  The returned map should use the same top-level structure as the pipeline result:
  `%{"files" => %{path => additions}, "codebase" => additions}`.
  Only keys present in the return value are merged; absent keys are left unchanged.
  """
  @callback analyze(pipeline_result :: map(), files_map :: map(), opts :: keyword()) :: map()
end
- - Replaces string literals with ``, numeric literals with ``, - and identifiers/keywords with ``. Remaining punctuation is split into - individual tokens, with common multi-character operators kept together. - - ## Examples - - iex> CodeQA.Metrics.TokenNormalizer.normalize("x = 42") - ["", "=", ""] - - """ - @spec normalize(String.t()) :: [String.t()] - def normalize(code) do - code - # 1. Strings (single and double quotes, handling escaped quotes) - |> String.replace(~r/"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'/, " ") - # 2. Numbers (integers and floats) - |> String.replace(~r/\b\d+(\.\d+)?\b/, " ") - # 3. Identifiers/Keywords (negative lookbehind/ahead to avoid clobbering // tags) - |> String.replace(~r/(?)/, " ") - # 4. Split by whitespace to extract the tokens and remaining structural punctuation - |> String.split(~r/\s+/, trim: true) - # 5. Further split punctuation, keeping common multi-char operators together - |> Enum.flat_map(&split_punctuation/1) - end - - defp split_punctuation(token) when token in ["", "", ""], do: [token] - - defp split_punctuation(text) do - Regex.scan(~r/->|=>|<>|\|>|::|\.\.\.|<-|!=|==|<=|>=|\+\+|--|&&|\|\||[^\w\s]/, text) - |> List.flatten() - end -end diff --git a/lib/codeqa/pipeline.ex b/lib/codeqa/pipeline.ex deleted file mode 100644 index bcd256c..0000000 --- a/lib/codeqa/pipeline.ex +++ /dev/null @@ -1,109 +0,0 @@ -defmodule CodeQA.Pipeline do - @moduledoc "Pre-computed shared context for file-level metrics." - - defmodule FileContext do - @moduledoc "Immutable pre-computed data shared across all file metrics." 
- @enforce_keys [ - :content, - :tokens, - :token_counts, - :words, - :identifiers, - :lines, - :encoded, - :byte_count, - :line_count - ] - defstruct @enforce_keys - - @type t :: %__MODULE__{ - content: String.t(), - tokens: tuple(), - token_counts: map(), - words: tuple(), - identifiers: tuple(), - lines: tuple(), - encoded: String.t(), - byte_count: non_neg_integer(), - line_count: non_neg_integer() - } - end - - @word_re ~r/\b[a-zA-Z_]\w*\b/u - - # Reserved words and keywords for: - # Python, Ruby, JavaScript, Elixir, C#, - # Java, C++, Go, Rust, PHP, Swift, Shell, Kotlin - @keywords MapSet.new(~w[ - if else elif elsif unless - for foreach while until do - return break continue yield pass - try except finally rescue ensure after catch throw raise begin end throws - case when switch cond match default fallthrough - with as and or not in is - import from require use using alias namespace package - class def defp defmodule defmacro defmacrop defprotocol defimpl defguard defdelegate - module interface struct enum delegate event protocol extension - function fn func fun new delete typeof instanceof void - var let val const static public private protected internal - sealed override virtual abstract final readonly open - async await receive suspend - self super this Self - extends implements - null undefined nil None nullptr - true false True False - bool int float double long short byte char boolean string decimal object dynamic - ref out params get set value inout - lambda del global nonlocal assert - type typealias - synchronized volatile transient native strictfp - auto register extern signed unsigned typedef sizeof union - template typename operator inline friend explicit mutable constexpr decltype noexcept - func chan go select defer range - mut impl trait pub mod crate dyn unsafe loop where move - echo print array list mixed never - actor init deinit lazy open some any rethrows willSet didSet - then fi done esac local export source unset declare - fun val 
object data companion reified infix vararg expect actual - ]) - - @spec build_file_context(String.t(), keyword()) :: FileContext.t() - def build_file_context(content, opts \\ []) when is_binary(content) do - stopwords = Keyword.get(opts, :word_stopwords, MapSet.new()) - - tokens = content |> String.split() |> List.to_tuple() - token_list = Tuple.to_list(tokens) - token_counts = Enum.frequencies(token_list) - - words = - Regex.scan(@word_re, content) - |> List.flatten() - |> Enum.reject(&MapSet.member?(stopwords, &1)) - |> List.to_tuple() - - word_list = Tuple.to_list(words) - identifiers = word_list |> Enum.reject(&MapSet.member?(@keywords, &1)) |> List.to_tuple() - lines = content |> String.split("\n") |> trim_trailing_empty() |> List.to_tuple() - encoded = content - - %FileContext{ - content: content, - tokens: tokens, - token_counts: token_counts, - words: words, - identifiers: identifiers, - lines: lines, - encoded: encoded, - byte_count: byte_size(content), - line_count: tuple_size(lines) - } - end - - defp trim_trailing_empty(lines) do - # Match Python's str.splitlines() behavior - case List.last(lines) do - "" -> List.delete_at(lines, -1) - _ -> lines - end - end -end diff --git a/lib/codeqa/stopwords.ex b/lib/codeqa/stopwords.ex deleted file mode 100644 index bd33374..0000000 --- a/lib/codeqa/stopwords.ex +++ /dev/null @@ -1,63 +0,0 @@ -defmodule CodeQA.Stopwords do - @moduledoc "Finds highly frequent items across a codebase to act as stopwords." - - @doc """ - Finds items that appear in more than the specified threshold of files. - `extractor` is a function that takes a file's content and returns an Enumerable of items. 
- """ - def find_stopwords(files, extractor, opts \\ []) do - threshold_ratio = Keyword.get(opts, :stopwords_threshold, 0.15) - total_docs = map_size(files) - min_docs = max(1, round(total_docs * threshold_ratio)) - workers = Keyword.get(opts, :workers, System.schedulers_online()) - has_progress = Keyword.get(opts, :progress, false) - label = Keyword.get(opts, :progress_label, "") - - counter = :counters.new(1, [:atomics]) - start_time = System.monotonic_time(:millisecond) - - files - |> Task.async_stream( - fn {_path, content} -> - res = - content - |> extractor.() - |> MapSet.new() - - if has_progress do - :counters.add(counter, 1, 1) - completed = :counters.get(counter, 1) - print_progress(completed, total_docs, start_time, label) - end - - res - end, max_concurrency: workers, timeout: :infinity) - |> Enum.reduce(%{}, fn {:ok, unique_items_in_file}, doc_freqs -> - Enum.reduce(unique_items_in_file, doc_freqs, fn item, acc -> - Map.update(acc, item, 1, &(&1 + 1)) - end) - end) - |> Enum.filter(fn {_item, count} -> count >= min_docs end) - |> Enum.map(fn {item, _count} -> item end) - |> MapSet.new() - end - - defp print_progress(completed, total, start_time, label) do - now = System.monotonic_time(:millisecond) - elapsed = max(now - start_time, 1) - avg_time = elapsed / completed - eta_ms = round((total - completed) * avg_time) - - output = - CodeQA.CLI.UI.progress_bar(completed, total, - eta: CodeQA.CLI.UI.format_eta(eta_ms), - label: label - ) - - IO.write(:stderr, "\r" <> output) - - if completed == total do - IO.puts(:stderr, "") - end - end -end diff --git a/lib/codeqa/summarizer.ex b/lib/codeqa/summarizer.ex deleted file mode 100644 index d6d9c92..0000000 --- a/lib/codeqa/summarizer.ex +++ /dev/null @@ -1,126 +0,0 @@ -defmodule CodeQA.Summarizer do - @moduledoc false - - @codebase_direction_metrics [ - {"complexity", "halstead", "mean_volume"}, - {"readability", "readability", "mean_flesch_adapted"}, - {"entropy", "entropy", "mean_char_entropy"}, - 
{"redundancy", "compression", "mean_redundancy"} - ] - - @file_direction_metrics [ - {"complexity", "halstead", "volume"}, - {"readability", "readability", "flesch_adapted"}, - {"entropy", "entropy", "char_entropy"}, - {"redundancy", "compression", "redundancy"} - ] - - @threshold_stable 0.05 - @threshold_slight 0.20 - - def summarize_codebase(comparison) do - files = Map.get(comparison, "files", %{}) - codebase = Map.get(comparison, "codebase", %{}) - - file_counts = count_statuses(files) - directions = compute_codebase_directions(codebase) - gist = build_codebase_gist(file_counts, directions) - - %{"gist" => gist, "file_counts" => file_counts, "directions" => directions} - end - - def summarize_file(_path, %{"status" => "added"} = data) do - lines = get_in(data, ["head", "lines"]) || 0 - %{"gist" => "new file (#{lines} lines)", "status" => "added", "lines" => lines} - end - - def summarize_file(_path, %{"status" => "deleted"} = data) do - lines = get_in(data, ["base", "lines"]) || 0 - %{"gist" => "removed (#{lines} lines)", "status" => "deleted", "lines" => lines} - end - - def summarize_file(_path, %{"status" => "modified"} = data) do - directions = compute_file_directions(data) - gist = build_file_gist(directions) - %{"gist" => gist, "status" => "modified", "directions" => directions} - end - - defp count_statuses(files) do - Enum.reduce(files, %{"added" => 0, "modified" => 0, "deleted" => 0}, fn {_path, data}, acc -> - status = Map.get(data, "status", "modified") - Map.update!(acc, status, &(&1 + 1)) - end) - end - - defp compute_codebase_directions(codebase) do - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - delta_agg = get_in(codebase, ["delta", "aggregate"]) || %{} - - Map.new(@codebase_direction_metrics, fn {dir_key, metric, agg_key} -> - base_val = get_in(base_agg, [metric, agg_key]) - delta_val = get_in(delta_agg, [metric, agg_key]) - {dir_key, classify_change(base_val, delta_val)} - end) - end - - defp compute_file_directions(file_data) do 
- base_metrics = get_in(file_data, ["base", "metrics"]) || %{} - delta_metrics = get_in(file_data, ["delta", "metrics"]) || %{} - - Map.new(@file_direction_metrics, fn {dir_key, metric, key} -> - base_val = get_in(base_metrics, [metric, key]) - delta_val = get_in(delta_metrics, [metric, key]) - {dir_key, classify_change(base_val, delta_val)} - end) - end - - defp classify_change(nil, _), do: "stable" - defp classify_change(_, nil), do: "stable" - defp classify_change(0, _), do: "stable" - defp classify_change(+0.0, _), do: "stable" - - defp classify_change(base_val, delta_val) do - ratio = abs(delta_val) / abs(base_val) - - cond do - ratio < @threshold_stable -> "stable" - ratio < @threshold_slight and delta_val > 0 -> "increased slightly" - ratio < @threshold_slight -> "decreased slightly" - delta_val > 0 -> "increased" - true -> "decreased" - end - end - - defp build_file_gist(directions) do - parts = - directions - |> Enum.reject(fn {_, d} -> d == "stable" end) - |> Enum.map(fn {k, d} -> "#{k} #{d}" end) - - if parts == [], do: "all metrics stable", else: Enum.join(parts, ", ") - end - - defp build_codebase_gist(file_counts, directions) do - file_parts = - [ - {"added", file_counts["added"]}, - {"modified", file_counts["modified"]}, - {"deleted", file_counts["deleted"]} - ] - |> Enum.filter(fn {_, c} -> c > 0 end) - |> Enum.map(fn {s, c} -> "#{c} #{s}" end) - - file_summary = if file_parts == [], do: "no changes", else: Enum.join(file_parts, ", ") - - dir_parts = - directions - |> Enum.reject(fn {_, d} -> d == "stable" end) - |> Enum.map(fn {k, d} -> "#{k} #{d}" end) - - if dir_parts == [] do - "#{file_summary} — all metrics stable" - else - "#{file_summary} — #{Enum.join(dir_parts, ", ")}" - end - end -end diff --git a/lib/codeqa/telemetry.ex b/lib/codeqa/telemetry.ex deleted file mode 100644 index 3f5d22d..0000000 --- a/lib/codeqa/telemetry.ex +++ /dev/null @@ -1,68 +0,0 @@ -defmodule CodeQA.Telemetry do - @moduledoc "Simple concurrent telemetry tracker using 
ETS." - - def setup do - if :ets.info(:codeqa_telemetry) == :undefined do - :ets.new(:codeqa_telemetry, [:named_table, :public, :set, write_concurrency: true]) - end - - :ok - end - - def time(metric_name, fun) do - if :ets.info(:codeqa_telemetry) != :undefined do - start_time = System.monotonic_time(:microsecond) - result = fun.() - end_time = System.monotonic_time(:microsecond) - duration = end_time - start_time - - :ets.update_counter(:codeqa_telemetry, metric_name, {2, duration}, {metric_name, 0}) - - :ets.update_counter( - :codeqa_telemetry, - "#{metric_name}_count", - {2, 1}, - {"#{metric_name}_count", 0} - ) - - result - else - fun.() - end - end - - defp format_metric_line({name, total_time_us}) do - count = - case :ets.lookup(:codeqa_telemetry, "#{name}_count") do - [{_, c}] -> c - _ -> 1 - end - - total_ms = Float.round(total_time_us / 1000, 2) - avg_ms = Float.round(total_ms / count, 2) - - String.pad_trailing(to_string(name), 30) <> - " | Total: #{String.pad_trailing(to_string(total_ms) <> "ms", 12)}" <> - " | Count: #{String.pad_trailing(to_string(count), 6)}" <> - " | Avg: #{avg_ms}ms" - end - - def print_report do - if :ets.info(:codeqa_telemetry) != :undefined do - IO.puts(:stderr, " ---- Telemetry Report (Wall-clock times) ---") - metrics = :ets.tab2list(:codeqa_telemetry) - - # Group totals and counts - totals = - Enum.filter(metrics, fn {k, _} -> not String.ends_with?(to_string(k), "_count") end) - - totals - |> Enum.sort_by(fn {_, time} -> time end, :desc) - |> Enum.each(&IO.puts(:stderr, format_metric_line(&1))) - - IO.puts(:stderr, "------------------------------------------- -") - end - end -end diff --git a/lib/mix/tasks/codeqa/sample_report.ex b/lib/mix/tasks/codeqa/sample_report.ex new file mode 100644 index 0000000..1bc5cf0 --- /dev/null +++ b/lib/mix/tasks/codeqa/sample_report.ex @@ -0,0 +1,210 @@ +defmodule Mix.Tasks.Codeqa.SampleReport do + use Mix.Task + + @shortdoc "Evaluates combined metric formulas against good/bad sample code" + + 
@moduledoc """ + Runs combined metric formulas against sample files and prints a separation table. + + mix codeqa.sample_report + mix codeqa.sample_report --category variable_naming + mix codeqa.sample_report --verbose + mix codeqa.sample_report --output results.json + mix codeqa.sample_report --apply-scalars + mix codeqa.sample_report --apply-languages + mix codeqa.sample_report --apply-languages --category variable_naming + mix codeqa.sample_report --file path/to/file.ex + + A ratio ≥ 2x means the formula meaningfully separates good from bad code. + A ratio < 1.5x is flagged as weak; < 1.0x is marked ✗ (wrong direction). + + `--apply-scalars` rewrites the YAML config files with suggested scalars derived + from the sample data. Metrics with ratio in the deadzone (0.995–1.005) are + excluded. All non-deadzoned metrics are written, including ones not previously + in the YAML. + + `--file` analyzes a single file or directory and prints all combined metric + behavior scores, grouped by category, sorted worst-first. 
+ """ + + @switches [ + category: :string, + verbose: :boolean, + output: :string, + report: :string, + apply_scalars: :boolean, + apply_languages: :boolean, + file: :string, + top: :integer + ] + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + + def run(args) do + Mix.Task.run("app.start") + {opts, _, _} = OptionParser.parse(args, switches: @switches) + + results = SampleRunner.run(opts) + + results + |> Enum.group_by(& &1.category) + |> Enum.each(&print_category(&1, opts)) + + if path = opts[:output] do + File.write!(path, Jason.encode!(results, pretty: true)) + IO.puts("\nResults written to #{path}") + end + + if path = opts[:report] do + report = SampleRunner.build_metric_report(opts) + File.write!(path, Jason.encode!(report, pretty: true)) + IO.puts("\nMetric report written to #{path}") + end + + if opts[:apply_scalars] do + stats = SampleRunner.apply_scalars(opts) + IO.puts("\nApplied scalars to YAML configs:") + Enum.each(stats, &print_scalar_stats/1) + end + + if opts[:apply_languages] do + stats = SampleRunner.apply_languages(opts) + IO.puts("\nApplied language coverage to YAML configs:") + + Enum.each(stats, fn %{category: cat, behaviors_with_languages: n} -> + IO.puts(" #{cat}: #{n} behaviors with language coverage") + end) + end + + if path = opts[:file] do + print_file_scores(path, opts) + end + end + + defp print_category({category, results}, opts) do + IO.puts("\n#{category}") + IO.puts(String.duplicate("-", 75)) + + IO.puts( + " " <> + pad("behavior", 35) <> + pad("bad", 9) <> + pad("good", 9) <> + pad("ratio", 13) <> + "ok?" 
+ ) + + Enum.each(results, &print_row(&1, opts)) + end + + defp print_row(r, opts) do + ratio_str = + "#{r.ratio}x" <> + cond do + not r.direction_ok -> "" + r.ratio < 1.5 -> " (weak)" + true -> "" + end + + ok = if r.direction_ok, do: "✓", else: "✗" + + IO.puts( + " " <> + pad(r.behavior, 35) <> + pad(fmt(r.bad_score), 9) <> + pad(fmt(r.good_score), 9) <> + pad(ratio_str, 13) <> + ok + ) + + if opts[:verbose] do + Enum.each(r.metric_detail, &print_metric_detail/1) + end + end + + defp print_metric_detail(m) do + scalar_str = if m.scalar >= 0, do: "+#{m.scalar}", else: "#{m.scalar}" + + IO.puts( + " " <> + pad("#{m.group}.#{m.key}", 45) <> + pad(scalar_str, 7) <> + pad(fmt(m.bad), 8) <> + pad(fmt(m.good), 8) <> + "#{m.ratio}x" + ) + end + + defp print_file_scores(path, opts) do + expanded = Path.expand(path) + + files = + cond do + File.dir?(expanded) -> + Collector.collect_files(expanded) + + File.regular?(expanded) -> + %{Path.basename(expanded) => File.read!(expanded)} + + true -> + IO.puts("\nPath not found: #{path}") + nil + end + + if files && map_size(files) > 0 do + IO.puts("\nAnalyzing #{map_size(files)} file(s) at: #{path}") + + aggregate = + files + |> Analyzer.analyze_codebase() + |> get_in(["codebase", "aggregate"]) + + top_n = opts[:top] || 15 + issues = SampleRunner.diagnose_aggregate(aggregate, top: top_n) + IO.puts("\nTop #{top_n} likely issues (by cosine similarity):") + IO.puts(String.duplicate("-", 75)) + IO.puts(" " <> pad("behavior", 38) <> pad("cosine", 9) <> "score") + Enum.each(issues, &print_issue_row/1) + + IO.puts("\nFull breakdown by category:") + combined = SampleRunner.score_aggregate(aggregate) + IO.puts("") + Enum.each(combined, &print_combined_category/1) + else + IO.puts("\nNo supported files found at: #{path}") + end + end + + defp print_issue_row(%{category: cat, behavior: b, cosine: cos, score: s, top_metrics: metrics}) do + IO.puts(" " <> pad("#{cat}.#{b}", 38) <> pad(fmt(cos), 9) <> fmt(s)) + + Enum.each(metrics, fn %{metric: 
m, contribution: c} -> + IO.puts(" " <> pad(m, 44) <> fmt(c)) + end) + end + + defp print_combined_category(%{name: name, behaviors: behaviors}) do + IO.puts(name) + IO.puts(String.duplicate("-", 60)) + + IO.puts(" " <> pad("behavior", 40) <> "score") + + behaviors + |> Enum.sort_by(& &1.score) + |> Enum.each(fn %{behavior: b, score: s} -> + flag = if s < 0.0, do: " ⚠", else: "" + IO.puts(" " <> pad(b, 40) <> fmt(s) <> flag) + end) + + IO.puts("") + end + + defp print_scalar_stats(%{category: cat, updated: u, deadzoned: d, skipped: s}) do + IO.puts(" #{pad(cat, 30)} #{u} written #{d} deadzoned #{s} skipped (no samples)") + end + + defp fmt(f), do: :erlang.float_to_binary(f / 1, decimals: 4) + defp pad(s, n), do: String.pad_trailing(to_string(s), n) +end diff --git a/lib/mix/tasks/codeqa/signal_debug.ex b/lib/mix/tasks/codeqa/signal_debug.ex new file mode 100644 index 0000000..3852dec --- /dev/null +++ b/lib/mix/tasks/codeqa/signal_debug.ex @@ -0,0 +1,183 @@ +defmodule Mix.Tasks.Codeqa.SignalDebug do + use Mix.Task + + @shortdoc "Shows structural signal emissions when splitting a file into blocks" + + @moduledoc """ + Runs each structural signal over a file and prints its emissions step by step. + + mix codeqa.signal_debug path/to/file.ex + mix codeqa.signal_debug path/to/file.py --signal keyword + mix codeqa.signal_debug path/to/file.ex --show-tokens + + Options: + --signal Only show a specific signal (e.g. 
keyword, blank, bracket) + --show-tokens Print the full token list before signal output + """ + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.Language + + alias CodeQA.AST.Signals.Structural.{ + AccessModifierSignal, + BlankLineSignal, + BracketSignal, + BranchSplitSignal, + ColonIndentSignal, + CommentDividerSignal, + KeywordSignal, + SQLBlockSignal, + TripleQuoteSignal + } + + @switches [signal: :string, show_tokens: :boolean] + + @all_signals [ + %TripleQuoteSignal{}, + %BlankLineSignal{}, + %KeywordSignal{}, + %BranchSplitSignal{}, + %AccessModifierSignal{}, + %CommentDividerSignal{}, + %SQLBlockSignal{}, + %BracketSignal{}, + %ColonIndentSignal{} + ] + + @impl Mix.Task + def run(args) do + {opts, positional, _} = OptionParser.parse(args, strict: @switches) + + path = + case positional do + [p | _] -> p + [] -> Mix.raise("Usage: mix codeqa.signal_debug [--signal ] [--show-tokens]") + end + + unless File.exists?(path), do: Mix.raise("File not found: #{path}") + + content = File.read!(path) + lang_mod = Language.detect(path) + tokens = TokenNormalizer.normalize_structural(content) + lines = String.split(content, "\n") + + Mix.shell().info("File: #{path}") + Mix.shell().info("Language: #{lang_mod.name()}") + Mix.shell().info("Tokens: #{length(tokens)}") + Mix.shell().info("Lines: #{length(lines)}") + Mix.shell().info("") + + if opts[:show_tokens] do + print_tokens(tokens) + end + + signals = filter_signals(@all_signals, opts[:signal]) + + emissions_per_signal = + SignalStream.run(tokens, signals, lang_mod) + + Enum.zip(signals, emissions_per_signal) + |> Enum.each(fn {signal, emissions} -> + print_signal_section(signal, emissions, tokens, lines) + end) + end + + defp filter_signals(signals, nil), do: signals + + defp filter_signals(signals, name_filter) do + Enum.filter(signals, fn signal -> + module_name = + signal.__struct__ + |> Module.split() + |> List.last() + |> String.downcase() + + 
String.contains?(module_name, String.downcase(name_filter)) + end) + end + + defp print_tokens(tokens) do + Mix.shell().info("=== TOKEN LIST ===") + + tokens + |> Enum.with_index() + |> Enum.each(fn {token, idx} -> + Mix.shell().info( + " [#{idx}] line #{token.line} col #{token.col} #{inspect(token.kind)} #{inspect(token.content)}" + ) + end) + + Mix.shell().info("") + end + + defp print_signal_section(signal, emissions, tokens, lines) do + name = signal.__struct__ |> Module.split() |> List.last() + separator = String.duplicate("─", 60) + + Mix.shell().info(separator) + Mix.shell().info("SIGNAL: #{name}") + Mix.shell().info("Emissions: #{length(emissions)}") + Mix.shell().info("") + + if Enum.empty?(emissions) do + Mix.shell().info(" (no emissions)") + else + Enum.each(emissions, fn {_source, group, emission_name, value} -> + print_emission(group, emission_name, value, tokens, lines) + end) + end + + Mix.shell().info("") + end + + defp print_emission(:split, name, token_idx, tokens, lines) do + token = Enum.at(tokens, token_idx) + + line_num = token && token.line + line_src = line_num && Enum.at(lines, line_num - 1) + + Mix.shell().info(" [SPLIT :#{name}] token[#{token_idx}] → line #{line_num}") + + if line_src do + Mix.shell().info(" #{String.trim_trailing(line_src)}") + end + + if token do + Mix.shell().info(" ^ #{inspect(token.kind)} #{inspect(token.content)}") + end + + Mix.shell().info("") + end + + defp print_emission(:enclosure, name, {start_idx, end_idx}, tokens, lines) do + start_token = Enum.at(tokens, start_idx) + end_token = Enum.at(tokens, end_idx) + + start_line = start_token && start_token.line + end_line = end_token && end_token.line + + Mix.shell().info( + " [ENCLOSURE :#{name}] tokens[#{start_idx}..#{end_idx}] lines #{start_line}–#{end_line}" + ) + + if start_line do + Mix.shell().info( + " open: #{inspect(Enum.at(lines, start_line - 1) |> String.trim_trailing())}" + ) + end + + if end_line && end_line != start_line do + Mix.shell().info( + " 
close: #{inspect(Enum.at(lines, end_line - 1) |> String.trim_trailing())}" + ) + end + + Mix.shell().info("") + end + + defp print_emission(group, name, value, _tokens, _lines) do + Mix.shell().info(" [:#{group} :#{name}] #{inspect(value)}") + Mix.shell().info("") + end +end diff --git a/mix.exs b/mix.exs index 2e55bfe..cb2f413 100644 --- a/mix.exs +++ b/mix.exs @@ -11,7 +11,13 @@ defmodule CodeQA.MixProject do escript: [main_module: CodeQA.CLI], elixirc_paths: elixirc_paths(Mix.env()), preferred_envs: [precommit: :test], - aliases: aliases() + aliases: aliases(), + dialyzer: [ + ignore_warnings: ".dialyzer_ignore.exs", + plt_local_path: "priv/plts", + plt_core_path: "priv/plts" + ], + consolidate_protocols: Mix.env() != :test ] end @@ -30,6 +36,12 @@ defmodule CodeQA.MixProject do "compile --warnings-as-errors", "deps.unlock --unused", "format" + ], + health: [ + "run -e 'CodeQA.CLI.main([\"health-report\", \".\", \"--ignore-paths\", \"test/**\"])'" + ], + "health.progress": [ + "run -e 'CodeQA.CLI.main([\"health-report\", \".\", \"--ignore-paths\", \"test/**\", \"--progress\"])'" ] ] end diff --git a/priv/combined_metrics/code_smells.yml b/priv/combined_metrics/code_smells.yml new file mode 100644 index 0000000..f1c73c5 --- /dev/null +++ b/priv/combined_metrics/code_smells.yml @@ -0,0 +1,550 @@ +consistent_string_quote_style: + _doc: "Files should use a single, consistent string quoting style throughout." 
+ _log_baseline: -18.2553 + branching: + mean_branching_density: 0.0243 + mean_non_blank_count: -0.0248 + brevity: + mean_sample_size: -0.0656 + casing_entropy: + mean_entropy: -0.0405 + mean_pascal_case_count: -0.1743 + mean_snake_case_count: -0.0505 + compression: + mean_raw_bytes: -0.0672 + mean_redundancy: 0.0207 + mean_unique_line_ratio: -0.0338 + mean_zlib_bytes: -0.1085 + mean_zlib_ratio: 0.0413 + entropy: + mean_char_max_entropy: -0.0077 + mean_token_entropy: -0.0178 + mean_token_max_entropy: -0.0141 + mean_total_tokens: -0.0783 + mean_vocab_size: -0.0656 + function_metrics: + mean_avg_function_lines: -0.0226 + mean_max_function_lines: -0.0644 + halstead: + mean_N1_total_operators: -0.1087 + mean_N2_total_operands: -0.2297 + mean_difficulty: -0.1017 + mean_effort: -0.2949 + mean_estimated_bugs: -0.1934 + mean_length: -0.1541 + mean_n1_unique_operators: -0.0790 + mean_n2_unique_operands: -0.2071 + mean_time_to_implement_seconds: -0.2949 + mean_vocabulary: -0.1721 + mean_volume: -0.1933 + heaps: + mean_k: -0.0368 + mean_r_squared: -0.0080 + identifier_length_variance: + mean_mean: 0.0059 + mean_std_dev: 0.0235 + mean_variance: 0.0470 + indentation: + mean_blank_line_ratio: 0.0205 + mean_variance: 0.0246 + line_patterns: + mean_blank_line_ratio: 0.0205 + mean_string_literal_ratio: -0.1616 + mean_unique_line_ratio: -0.0365 + magic_number_density: + mean_string_literal_ratio: 0.3018 + near_duplicate_blocks_file: + mean_near_dup_block_d2: 0.9542 + mean_near_dup_block_d3: -0.6021 + mean_near_dup_block_d4: -0.9542 + mean_near_dup_block_d5: 0.9542 + mean_near_dup_block_d6: -0.6021 + mean_sub_block_count: -0.1804 + ngram: + mean_bigram_hapax_fraction: -0.0101 + mean_bigram_repeated_unique: -0.0915 + mean_bigram_repetition_rate: 0.0277 + mean_bigram_total: -0.0785 + mean_bigram_unique: -0.1146 + mean_trigram_repeated_unique: -0.1104 + mean_trigram_repetition_rate: 0.0499 + mean_trigram_total: -0.0787 + mean_trigram_unique: -0.1182 + punctuation_density: + 
mean_arrow_density: 0.0674 + mean_bracket_nonalpha_prefix_count: 1.0103 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 0.0458 + mean_dot_count: -0.1743 + mean_exclamation_density: 0.0424 + mean_id_nonalpha_suffix_density: 0.0783 + readability: + mean_avg_line_length: -0.0444 + mean_avg_tokens_per_line: -0.0535 + mean_flesch_adapted: 0.0046 + mean_fog_adapted: -0.0301 + mean_total_lines: -0.0248 + symbol_density: + mean_density: -0.0325 + mean_distinct_symbol_types: -0.0966 + mean_symbol_count: -0.0999 + vocabulary: + mean_mattr: -0.0187 + mean_raw_ttr: -0.0050 + mean_total_identifiers: -0.0666 + mean_unique_identifiers: -0.0714 + vowel_density: + mean_total_chars: -0.0607 + zipf: + mean_total_tokens: -0.0783 + mean_vocab_size: -0.0656 + +no_dead_code_after_return: + _doc: "There should be no unreachable statements after a return or early exit." + _log_baseline: -55.8435 + branching: + mean_branch_count: -2.0000 + mean_branching_density: -1.4201 + mean_non_blank_count: -0.5815 + brevity: + mean_sample_size: -0.2610 + casing_entropy: + mean_entropy: -0.2430 + mean_other_count: -0.8708 + mean_pascal_case_count: -0.5752 + mean_snake_case_count: -0.3559 + compression: + mean_raw_bytes: -0.4531 + mean_redundancy: -0.0467 + mean_zlib_bytes: -0.3558 + mean_zlib_ratio: -0.0974 + entropy: + mean_char_entropy: 0.0250 + mean_char_max_entropy: -0.0205 + mean_char_normalized: 0.0455 + mean_token_entropy: -0.0475 + mean_token_max_entropy: -0.0575 + mean_token_normalized: 0.0099 + mean_total_tokens: -0.3093 + mean_vocab_size: -0.2610 + function_metrics: + mean_avg_function_lines: -0.4255 + mean_avg_param_count: 0.1143 + mean_function_count: -0.1143 + mean_max_function_lines: -0.5062 + halstead: + mean_N1_total_operators: -0.2185 + mean_N2_total_operands: -0.4051 + mean_difficulty: -0.1769 + mean_effort: -0.5126 + mean_estimated_bugs: -0.3357 + mean_length: -0.2795 + mean_n1_unique_operators: -0.0857 + mean_n2_unique_operands: -0.3139 + 
mean_time_to_implement_seconds: -0.5126 + mean_vocabulary: -0.2525 + mean_volume: -0.3357 + heaps: + mean_k: -0.1169 + identifier_length_variance: + mean_max: -0.4367 + mean_mean: -0.0159 + mean_std_dev: -0.2804 + mean_variance: -0.5607 + indentation: + mean_blank_line_ratio: 0.2883 + mean_mean_depth: -0.4448 + mean_variance: -0.6173 + line_patterns: + mean_blank_line_ratio: 0.2883 + mean_string_literal_ratio: -0.8289 + mean_unique_line_ratio: -0.0289 + magic_number_density: + mean_density: 0.2821 + mean_string_literal_ratio: -0.8289 + near_duplicate_blocks_file: + mean_block_count: -0.1083 + mean_near_dup_block_d0: 1.1292 + mean_near_dup_block_d5: 1.1292 + mean_near_dup_block_d7: -0.7124 + mean_near_dup_block_d8: 1.1292 + mean_sub_block_count: -0.3612 + ngram: + mean_bigram_hapax_fraction: 0.0142 + mean_bigram_repeated_unique: -0.3335 + mean_bigram_repetition_rate: -0.0114 + mean_bigram_total: -0.3100 + mean_bigram_unique: -0.3022 + mean_trigram_hapax_fraction: -0.0576 + mean_trigram_repeated_unique: -0.0894 + mean_trigram_repetition_rate: 0.0890 + mean_trigram_total: -0.3107 + mean_trigram_unique: -0.3313 + punctuation_density: + mean_arrow_density: -1.1156 + mean_bracket_nonalpha_prefix_count: 1.0397 + mean_bracket_nonalpha_suffix_count: -0.4541 + mean_colon_suffix_density: 0.3588 + mean_dot_count: -1.0081 + mean_id_nonalpha_suffix_density: 0.0111 + readability: + mean_avg_line_length: 0.1309 + mean_avg_sub_words_per_id: -0.0415 + mean_avg_tokens_per_line: 0.2722 + mean_flesch_adapted: 0.0243 + mean_fog_adapted: -0.3299 + mean_total_lines: -0.5815 + symbol_density: + mean_density: 0.2141 + mean_symbol_count: -0.2386 + vocabulary: + mean_mattr: -0.0424 + mean_raw_ttr: 0.0435 + mean_total_identifiers: -0.4061 + mean_unique_identifiers: -0.3626 + vowel_density: + mean_total_chars: -0.4220 + zipf: + mean_exponent: -0.0067 + mean_total_tokens: -0.3093 + mean_vocab_size: -0.2610 + +no_debug_print_statements: + _doc: "Debug output (`console.log`, `IO.inspect`, 
`fmt.Println`) must not be left in committed code." + _log_baseline: -88.0844 + branching: + mean_branch_count: -0.3540 + mean_branching_density: 0.0181 + mean_max_nesting_depth: -0.4025 + mean_non_blank_count: -0.3719 + brevity: + mean_sample_size: -0.2128 + casing_entropy: + mean_entropy: 0.0841 + mean_other_count: 0.1169 + mean_pascal_case_count: -1.0141 + mean_snake_case_count: -0.6773 + compression: + mean_raw_bytes: -0.4490 + mean_redundancy: -0.0518 + mean_unique_line_ratio: -0.0864 + mean_zlib_bytes: -0.3414 + mean_zlib_ratio: -0.1076 + entropy: + mean_char_entropy: -0.0058 + mean_char_normalized: -0.0059 + mean_token_entropy: -0.0215 + mean_token_max_entropy: -0.0460 + mean_token_normalized: 0.0245 + mean_total_tokens: -0.5168 + mean_vocab_size: -0.2128 + function_metrics: + mean_avg_function_lines: -0.4292 + mean_max_function_lines: -0.4243 + halstead: + mean_N1_total_operators: -0.3780 + mean_N2_total_operands: -0.4000 + mean_difficulty: -0.2494 + mean_effort: -0.6911 + mean_estimated_bugs: -0.4418 + mean_length: -0.3857 + mean_n1_unique_operators: -0.1473 + mean_n2_unique_operands: -0.2979 + mean_time_to_implement_seconds: -0.6911 + mean_vocabulary: -0.2559 + mean_volume: -0.4417 + heaps: + mean_beta: 0.0307 + mean_k: 0.0324 + mean_r_squared: -0.0077 + identifier_length_variance: + mean_mean: 0.1804 + mean_std_dev: 0.0496 + mean_variance: 0.0993 + indentation: + mean_blank_line_ratio: -0.0499 + mean_max_depth: -0.2215 + mean_mean_depth: -0.1903 + mean_variance: -0.2218 + line_patterns: + mean_blank_line_ratio: -0.0499 + mean_max_nesting_depth: -0.4025 + mean_string_literal_ratio: -1.0798 + mean_unique_line_ratio: -0.0602 + magic_number_density: + mean_density: 0.5227 + mean_string_literal_ratio: -1.0798 + near_duplicate_blocks_file: + mean_block_count: -0.2856 + mean_sub_block_count: -0.4114 + ngram: + mean_bigram_hapax_fraction: -0.0183 + mean_bigram_repeated_unique: -0.3069 + mean_bigram_repetition_rate: -0.0758 + mean_bigram_total: -0.5178 + 
mean_bigram_unique: -0.3466 + mean_trigram_hapax_fraction: -0.0225 + mean_trigram_repeated_unique: -0.3221 + mean_trigram_repetition_rate: -0.0929 + mean_trigram_total: -0.5188 + mean_trigram_unique: -0.4061 + punctuation_density: + mean_arrow_density: 0.4321 + mean_bracket_nonalpha_prefix_count: -2.0000 + mean_bracket_nonalpha_suffix_count: -1.2715 + mean_colon_suffix_density: -0.1696 + mean_dot_count: -0.9095 + mean_id_nonalpha_suffix_density: -0.0078 + readability: + mean_avg_line_length: -0.0795 + mean_avg_sub_words_per_id: 0.0617 + mean_avg_tokens_per_line: -0.1449 + mean_flesch_adapted: -0.0473 + mean_fog_adapted: -0.0298 + mean_total_lines: -0.3719 + symbol_density: + mean_density: -0.0878 + mean_distinct_symbol_types: -0.1242 + mean_symbol_count: -0.5369 + vocabulary: + mean_mattr: 0.3160 + mean_raw_ttr: 0.4438 + mean_total_identifiers: -0.6654 + mean_unique_identifiers: -0.2215 + vowel_density: + mean_total_chars: -0.4850 + zipf: + mean_exponent: -0.1196 + mean_r_squared: 0.0127 + mean_total_tokens: -0.5168 + mean_vocab_size: -0.2128 + +no_fixme_comments: + _doc: "FIXME, XXX, and HACK comments indicate known problems that should be resolved before merging." 
+ _log_baseline: -2.0233 + branching: + mean_branch_count: 0.1755 + mean_branching_density: 0.1504 + mean_non_blank_count: 0.0252 + brevity: + mean_sample_size: -0.0887 + casing_entropy: + mean_entropy: 0.0286 + mean_pascal_case_count: 0.0129 + mean_snake_case_count: -0.0450 + comment_structure: + mean_comment_line_count: -0.6667 + mean_comment_line_ratio: 0.5464 + mean_todo_fixme_count: -0.7195 + compression: + mean_raw_bytes: 0.0176 + mean_redundancy: 0.0275 + mean_unique_line_ratio: -0.0049 + mean_zlib_bytes: -0.0211 + mean_zlib_ratio: 0.0387 + entropy: + mean_char_entropy: 0.0020 + mean_char_max_entropy: -0.0054 + mean_char_normalized: 0.0074 + mean_token_entropy: -0.0266 + mean_token_max_entropy: -0.0180 + mean_token_normalized: -0.0086 + mean_total_tokens: 0.0600 + mean_vocab_size: -0.0887 + function_metrics: + mean_avg_function_lines: 0.0209 + mean_avg_param_count: 0.0155 + mean_function_count: 0.0232 + mean_max_function_lines: 0.1330 + halstead: + mean_N1_total_operators: 0.1156 + mean_N2_total_operands: 0.0046 + mean_difficulty: 0.0964 + mean_effort: 0.1556 + mean_estimated_bugs: 0.0592 + mean_length: 0.0745 + mean_n1_unique_operators: -0.0082 + mean_n2_unique_operands: -0.1000 + mean_time_to_implement_seconds: 0.1556 + mean_vocabulary: -0.0750 + mean_volume: 0.0591 + heaps: + mean_beta: -0.0665 + mean_k: 0.1089 + identifier_length_variance: + mean_mean: 0.0159 + mean_std_dev: 0.0203 + mean_variance: 0.0407 + indentation: + mean_blank_line_ratio: 0.0697 + mean_max_depth: 0.0646 + mean_mean_depth: 0.0482 + mean_variance: 0.1889 + line_patterns: + mean_blank_line_ratio: 0.0697 + mean_string_literal_ratio: -0.0060 + mean_unique_line_ratio: -0.0041 + magic_number_density: + mean_density: -0.0646 + mean_string_literal_ratio: 0.0564 + near_duplicate_blocks_file: + mean_block_count: 0.0305 + mean_sub_block_count: 0.1090 + ngram: + mean_bigram_hapax_fraction: -0.0378 + mean_bigram_repeated_unique: 0.0858 + mean_bigram_repetition_rate: 0.0747 + mean_bigram_total: 
0.0601 + mean_bigram_unique: -0.0026 + mean_trigram_hapax_fraction: -0.0202 + mean_trigram_repeated_unique: 0.1423 + mean_trigram_repetition_rate: 0.1085 + mean_trigram_total: 0.0602 + mean_trigram_unique: 0.0286 + punctuation_density: + mean_arrow_density: 0.0123 + mean_bracket_nonalpha_prefix_count: 0.1755 + mean_bracket_nonalpha_suffix_count: 0.2840 + mean_colon_suffix_density: -0.1540 + mean_dot_count: 0.1361 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0200 + mean_question_mark_density: 0.2981 + readability: + mean_avg_line_length: 0.0494 + mean_avg_sub_words_per_id: 0.0089 + mean_avg_tokens_per_line: -0.0127 + mean_flesch_adapted: -0.0070 + mean_fog_adapted: -0.0103 + mean_total_lines: 0.0728 + symbol_density: + mean_density: 0.1041 + mean_symbol_count: 0.1218 + vocabulary: + mean_mattr: -0.0826 + mean_raw_ttr: -0.0781 + mean_total_identifiers: -0.0352 + mean_unique_identifiers: -0.1133 + vowel_density: + mean_total_chars: -0.0193 + zipf: + mean_exponent: 0.0584 + mean_r_squared: 0.0015 + mean_total_tokens: 0.0600 + mean_vocab_size: -0.0887 + +no_nested_ternary: + _doc: "Nested conditional expressions (ternary-within-ternary) are harder to read than a plain if-else." 
+ _log_baseline: 7.6475 + branching: + mean_branch_count: -0.5662 + mean_branching_density: -0.3441 + mean_max_nesting_depth: 0.1824 + mean_non_blank_count: -0.2221 + brevity: + mean_sample_size: 0.0486 + casing_entropy: + mean_entropy: 0.2311 + mean_other_count: 0.7455 + mean_pascal_case_count: 0.1237 + mean_snake_case_count: 0.1138 + compression: + mean_raw_bytes: -0.0141 + mean_redundancy: -0.0117 + mean_unique_line_ratio: 0.1154 + mean_zlib_bytes: 0.0170 + mean_zlib_ratio: -0.0312 + entropy: + mean_char_entropy: 0.0689 + mean_char_max_entropy: 0.0024 + mean_char_normalized: 0.0665 + mean_token_entropy: -0.0014 + mean_token_max_entropy: 0.0110 + mean_token_normalized: -0.0124 + mean_total_tokens: 0.1324 + mean_vocab_size: 0.0486 + function_metrics: + mean_avg_function_lines: -0.7403 + mean_avg_param_count: -0.0277 + mean_function_count: 0.5579 + mean_max_function_lines: -0.4954 + halstead: + mean_N1_total_operators: 0.1382 + mean_N2_total_operands: 0.1252 + mean_difficulty: 0.0773 + mean_effort: 0.2218 + mean_estimated_bugs: 0.1445 + mean_length: 0.1335 + mean_n1_unique_operators: 0.0128 + mean_n2_unique_operands: 0.0608 + mean_time_to_implement_seconds: 0.2218 + mean_vocabulary: 0.0480 + mean_volume: 0.1445 + heaps: + mean_beta: -0.0334 + mean_k: 0.0563 + identifier_length_variance: + mean_max: 0.0170 + mean_mean: -0.0112 + mean_std_dev: -0.0060 + mean_variance: -0.0120 + indentation: + mean_blank_line_ratio: 0.3825 + mean_max_depth: -0.2891 + mean_mean_depth: -0.2922 + mean_variance: -0.5254 + line_patterns: + mean_blank_line_ratio: 0.3825 + mean_max_nesting_depth: 0.1824 + mean_string_literal_ratio: 0.0146 + mean_unique_line_ratio: 0.1591 + magic_number_density: + mean_density: -0.1634 + mean_magic_number_count: -0.0310 + mean_string_literal_ratio: 0.0146 + near_duplicate_blocks_file: + mean_block_count: 0.0885 + mean_near_dup_block_d7: -0.1824 + mean_sub_block_count: 0.5472 + ngram: + mean_bigram_hapax_fraction: -0.0464 + mean_bigram_repeated_unique: 0.1405 
+ mean_bigram_repetition_rate: 0.0564 + mean_bigram_total: 0.1327 + mean_bigram_unique: 0.0600 + mean_trigram_hapax_fraction: -0.0321 + mean_trigram_repeated_unique: 0.1699 + mean_trigram_repetition_rate: 0.1003 + mean_trigram_total: 0.1331 + mean_trigram_unique: 0.0704 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.5781 + mean_bracket_nonalpha_suffix_count: 0.7295 + mean_colon_suffix_density: -0.6851 + mean_dot_count: -0.1824 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.2589 + readability: + mean_avg_line_length: 0.2148 + mean_avg_sub_words_per_id: 0.0173 + mean_avg_tokens_per_line: 0.3545 + mean_flesch_adapted: -0.0367 + mean_fog_adapted: 0.3545 + mean_total_lines: -0.2221 + symbol_density: + mean_density: 0.2615 + mean_distinct_symbol_types: 0.0377 + mean_symbol_count: 0.2475 + vocabulary: + mean_mattr: -0.0587 + mean_raw_ttr: -0.0515 + mean_total_identifiers: 0.1551 + mean_unique_identifiers: 0.1036 + vowel_density: + mean_total_chars: 0.1439 + zipf: + mean_exponent: 0.0240 + mean_r_squared: 0.0111 + mean_total_tokens: 0.1324 + mean_vocab_size: 0.0486 + diff --git a/priv/combined_metrics/consistency.yml b/priv/combined_metrics/consistency.yml new file mode 100644 index 0000000..25026ea --- /dev/null +++ b/priv/combined_metrics/consistency.yml @@ -0,0 +1,312 @@ +consistent_casing_within_file: + _doc: "A file should use one naming convention throughout — no mixing of camelCase and snake_case for the same kind of identifier." 
+ _log_baseline: -2.4826 + brevity: + mean_sample_size: -0.0471 + casing_entropy: + mean_camel_case_count: -2.0000 + mean_entropy: -0.4254 + mean_snake_case_count: 0.2663 + compression: + mean_raw_bytes: 0.0213 + mean_redundancy: 0.0219 + mean_zlib_bytes: -0.0194 + mean_zlib_ratio: 0.0407 + entropy: + mean_char_entropy: -0.0126 + mean_char_max_entropy: -0.0170 + mean_char_normalized: 0.0044 + mean_token_entropy: -0.0090 + mean_token_max_entropy: -0.0101 + mean_vocab_size: -0.0471 + halstead: + mean_difficulty: 0.0629 + mean_effort: 0.0530 + mean_estimated_bugs: -0.0099 + mean_n2_unique_operands: -0.0629 + mean_time_to_implement_seconds: 0.0530 + mean_vocabulary: -0.0456 + mean_volume: -0.0099 + heaps: + mean_beta: -0.0232 + mean_k: 0.0253 + identifier_length_variance: + mean_mean: 0.0337 + mean_std_dev: 0.0139 + mean_variance: 0.0278 + ngram: + mean_bigram_hapax_fraction: -0.0071 + mean_bigram_repetition_rate: 0.0267 + mean_bigram_unique: -0.0197 + mean_trigram_hapax_fraction: -0.0122 + mean_trigram_repeated_unique: 0.0698 + mean_trigram_repetition_rate: 0.0874 + mean_trigram_unique: -0.0172 + readability: + mean_avg_line_length: 0.0221 + symbol_density: + mean_density: -0.0214 + vocabulary: + mean_mattr: -0.0680 + mean_raw_ttr: -0.0735 + mean_unique_identifiers: -0.0735 + vowel_density: + mean_total_chars: 0.0337 + zipf: + mean_exponent: 0.0265 + mean_vocab_size: -0.0471 + +consistent_error_return_shape: + _doc: "All functions in a module should return errors in the same shape — mixed `nil`, `false`, and `{:error, _}` returns are confusing." 
+ _log_baseline: 30.7874 + branching: + mean_branch_count: -0.2178 + mean_branching_density: -0.1258 + mean_non_blank_count: -0.0921 + brevity: + mean_sample_size: 0.0231 + casing_entropy: + mean_entropy: 0.0245 + mean_snake_case_count: -0.0406 + compression: + mean_raw_bytes: -0.0234 + mean_redundancy: 0.0058 + mean_unique_line_ratio: 0.0576 + mean_zlib_bytes: -0.0378 + mean_zlib_ratio: 0.0144 + entropy: + mean_char_entropy: 0.0159 + mean_char_normalized: 0.0160 + mean_token_entropy: -0.0277 + mean_token_normalized: -0.0328 + mean_total_tokens: 0.0361 + mean_vocab_size: 0.0231 + function_metrics: + mean_avg_function_lines: -0.0914 + halstead: + mean_N1_total_operators: 0.1217 + mean_N2_total_operands: 0.0275 + mean_difficulty: 0.0936 + mean_effort: 0.1918 + mean_estimated_bugs: 0.0982 + mean_length: 0.0900 + mean_n1_unique_operators: 0.0835 + mean_n2_unique_operands: 0.0174 + mean_time_to_implement_seconds: 0.1918 + mean_vocabulary: 0.0368 + mean_volume: 0.0982 + heaps: + mean_beta: -0.0402 + mean_k: 0.0903 + identifier_length_variance: + mean_mean: 0.0455 + mean_std_dev: 0.0308 + mean_variance: 0.0616 + indentation: + mean_blank_line_ratio: 0.0185 + mean_mean_depth: -0.0638 + mean_variance: -0.0831 + line_patterns: + mean_blank_line_ratio: 0.0185 + mean_string_literal_ratio: -2.0000 + mean_unique_line_ratio: 0.0559 + magic_number_density: + mean_density: -0.0557 + mean_string_literal_ratio: -2.0000 + near_duplicate_blocks_file: + mean_near_dup_block_d7: 0.7522 + mean_near_dup_block_d8: -0.4400 + mean_sub_block_count: 0.1553 + ngram: + mean_bigram_hapax_fraction: -0.0430 + mean_bigram_repeated_unique: 0.0136 + mean_bigram_repetition_rate: 0.0601 + mean_bigram_total: 0.0362 + mean_bigram_unique: -0.0667 + mean_trigram_hapax_fraction: -0.0176 + mean_trigram_repeated_unique: 0.0150 + mean_trigram_repetition_rate: 0.0920 + mean_trigram_total: 0.0363 + mean_trigram_unique: -0.0443 + punctuation_density: + mean_arrow_density: -0.3821 + 
mean_bracket_nonalpha_suffix_count: 0.0804 + mean_colon_suffix_density: -0.0384 + mean_id_nonalpha_suffix_density: 0.0720 + readability: + mean_avg_line_length: 0.0716 + mean_avg_sub_words_per_id: 0.0302 + mean_avg_tokens_per_line: 0.1283 + mean_flesch_adapted: -0.0449 + mean_fog_adapted: 0.0827 + mean_total_lines: -0.0921 + symbol_density: + mean_density: 0.1360 + mean_symbol_count: 0.1121 + vocabulary: + mean_mattr: 0.0877 + mean_raw_ttr: 0.0733 + mean_total_identifiers: -0.0353 + mean_unique_identifiers: 0.0381 + vowel_density: + mean_total_chars: 0.0102 + zipf: + mean_exponent: 0.0181 + mean_total_tokens: 0.0361 + mean_vocab_size: 0.0231 + +consistent_function_style: + _doc: "A module should not mix one-liner and multi-clause function definitions for the same concern." + _log_baseline: 0.1374 + branching: + mean_branch_count: -0.1610 + mean_branching_density: -0.3349 + mean_max_nesting_depth: -0.1610 + mean_non_blank_count: 0.1738 + brevity: + mean_sample_size: 0.0028 + casing_entropy: + mean_entropy: -0.0534 + mean_other_count: -0.2753 + mean_pascal_case_count: -0.0379 + mean_snake_case_count: 0.0199 + compression: + mean_raw_bytes: 0.0313 + mean_redundancy: 0.0188 + mean_unique_line_ratio: -0.0440 + mean_zlib_bytes: 0.0037 + mean_zlib_ratio: 0.0276 + entropy: + mean_char_entropy: -0.0072 + mean_char_normalized: -0.0071 + mean_token_entropy: 0.0058 + mean_token_normalized: 0.0052 + mean_vocab_size: 0.0028 + function_metrics: + mean_avg_function_lines: 0.0608 + mean_avg_param_count: -0.0099 + mean_function_count: 0.0939 + mean_max_function_lines: -0.0797 + halstead: + mean_N2_total_operands: 0.0471 + mean_difficulty: 0.0186 + mean_effort: 0.0362 + mean_estimated_bugs: 0.0176 + mean_length: 0.0157 + mean_n1_unique_operators: -0.0122 + mean_n2_unique_operands: 0.0162 + mean_time_to_implement_seconds: 0.0362 + mean_vocabulary: 0.0091 + mean_volume: 0.0176 + heaps: + mean_beta: 0.0024 + identifier_length_variance: + mean_mean: 0.0076 + mean_variance: 0.0038 + 
indentation: + mean_blank_line_ratio: -0.0991 + mean_max_depth: -0.1143 + mean_mean_depth: -0.0203 + mean_variance: -0.1730 + line_patterns: + mean_blank_line_ratio: -0.0991 + mean_max_nesting_depth: -0.1610 + mean_unique_line_ratio: -0.0456 + near_duplicate_blocks_file: + mean_block_count: 0.2753 + mean_near_dup_block_d0: 0.9145 + mean_near_dup_block_d7: 0.1610 + mean_near_dup_block_d8: 0.5506 + mean_sub_block_count: 0.0594 + ngram: + mean_bigram_hapax_fraction: 0.0037 + mean_bigram_repeated_unique: -0.0041 + mean_bigram_repetition_rate: -0.0091 + mean_bigram_unique: 0.0065 + mean_trigram_repeated_unique: -0.0058 + punctuation_density: + mean_arrow_density: 2.0000 + mean_bracket_nonalpha_suffix_count: -0.0781 + mean_colon_suffix_density: -0.1318 + mean_dot_count: -0.0204 + mean_id_nonalpha_suffix_density: -0.0132 + readability: + mean_avg_line_length: -0.1471 + mean_avg_sub_words_per_id: 0.0030 + mean_avg_tokens_per_line: -0.1751 + mean_flesch_adapted: 0.0147 + mean_fog_adapted: -0.1412 + mean_total_lines: 0.1738 + symbol_density: + mean_density: -0.0473 + mean_symbol_count: -0.0159 + vocabulary: + mean_mattr: -0.0025 + mean_raw_ttr: -0.0051 + mean_total_identifiers: 0.0098 + mean_unique_identifiers: 0.0047 + vowel_density: + mean_total_chars: 0.0175 + zipf: + mean_exponent: -0.0054 + mean_vocab_size: 0.0028 + +same_concept_same_name: + _doc: "The same domain concept should use the same name throughout a file — mixing `user`, `usr`, and `u` for the same thing harms readability." 
+ _log_baseline: -12.4109 + brevity: + mean_sample_size: -1.3457 + compression: + mean_raw_bytes: 0.1773 + mean_redundancy: 0.3935 + mean_unique_line_ratio: -0.3251 + mean_zlib_bytes: -0.8263 + mean_zlib_ratio: 1.0033 + entropy: + mean_char_entropy: -0.1808 + mean_char_normalized: -0.1800 + mean_token_entropy: -0.3546 + mean_token_max_entropy: -0.2899 + mean_vocab_size: -1.3457 + halstead: + mean_difficulty: 1.8665 + mean_effort: 1.5662 + mean_estimated_bugs: -0.2997 + mean_n2_unique_operands: -1.8665 + mean_time_to_implement_seconds: 1.5662 + mean_vocabulary: -1.3857 + mean_volume: -0.3003 + heaps: + mean_beta: -0.5870 + mean_k: 0.5102 + identifier_length_variance: + mean_mean: 0.3431 + mean_std_dev: -0.4791 + mean_variance: -0.9580 + line_patterns: + mean_unique_line_ratio: -0.6939 + ngram: + mean_bigram_hapax_fraction: -0.6466 + mean_bigram_repeated_unique: -0.2091 + mean_bigram_repetition_rate: 0.6530 + mean_bigram_unique: -1.1746 + mean_trigram_hapax_fraction: -0.6625 + mean_trigram_repeated_unique: 1.2887 + mean_trigram_repetition_rate: 1.6149 + mean_trigram_unique: -0.9875 + readability: + mean_avg_line_length: 0.1837 + mean_avg_sub_words_per_id: -0.1771 + mean_flesch_adapted: 0.2348 + symbol_density: + mean_density: -0.1807 + vocabulary: + mean_mattr: -1.8899 + mean_raw_ttr: -1.9969 + mean_unique_identifiers: -2.0000 + vowel_density: + mean_total_chars: 0.3432 + zipf: + mean_exponent: 0.7698 + mean_vocab_size: -1.3457 + diff --git a/priv/combined_metrics/dependencies.yml b/priv/combined_metrics/dependencies.yml new file mode 100644 index 0000000..7bcfacf --- /dev/null +++ b/priv/combined_metrics/dependencies.yml @@ -0,0 +1,323 @@ +import_count_under_10: + _doc: "Files should import fewer than 10 modules; high import counts signal excessive coupling." 
+ _log_baseline: 7.1916 + branching: + mean_branch_count: 0.2110 + mean_branching_density: -1.0683 + mean_max_nesting_depth: 0.1234 + mean_non_blank_count: -0.0219 + brevity: + mean_sample_size: 0.0119 + casing_entropy: + mean_entropy: -0.0389 + mean_pascal_case_count: -0.1657 + mean_snake_case_count: -0.0025 + comment_structure: + mean_comment_line_ratio: -1.2802 + compression: + mean_raw_bytes: -0.0133 + mean_redundancy: -0.0135 + mean_unique_line_ratio: -0.0046 + mean_zlib_bytes: 0.0144 + mean_zlib_ratio: -0.0277 + entropy: + mean_char_entropy: 0.0035 + mean_char_max_entropy: 0.0088 + mean_char_normalized: -0.0053 + mean_token_entropy: -0.0040 + mean_token_max_entropy: 0.0026 + mean_token_normalized: -0.0066 + mean_total_tokens: -0.0251 + mean_vocab_size: 0.0119 + function_metrics: + mean_avg_function_lines: -0.0688 + mean_avg_param_count: -0.0555 + mean_function_count: 0.1234 + mean_max_function_lines: 0.0944 + mean_max_param_count: -0.1234 + halstead: + mean_N1_total_operators: -0.0138 + mean_N2_total_operands: -0.0464 + mean_difficulty: -0.0353 + mean_effort: -0.0606 + mean_estimated_bugs: -0.0253 + mean_length: -0.0260 + mean_n1_unique_operators: 0.0111 + mean_time_to_implement_seconds: -0.0606 + mean_vocabulary: 0.0032 + mean_volume: -0.0253 + heaps: + mean_beta: -0.0893 + mean_k: 0.3293 + mean_r_squared: 0.0101 + identifier_length_variance: + mean_max: 0.0679 + mean_mean: 0.0648 + mean_std_dev: 0.0688 + mean_variance: 0.1375 + indentation: + mean_blank_line_ratio: 0.1478 + mean_max_depth: -0.0876 + mean_mean_depth: -0.0397 + mean_variance: -0.2328 + line_patterns: + mean_blank_line_ratio: 0.1478 + mean_max_nesting_depth: 0.1234 + mean_string_literal_ratio: 0.0265 + mean_unique_line_ratio: -0.0050 + magic_number_density: + mean_density: 0.5219 + mean_magic_number_count: 0.4898 + mean_string_literal_ratio: 0.0265 + near_duplicate_blocks_file: + mean_block_count: 0.0765 + mean_sub_block_count: 0.1110 + ngram: + mean_bigram_repeated_unique: 0.0034 + 
mean_bigram_repetition_rate: -0.0129 + mean_bigram_total: -0.0252 + mean_bigram_unique: 0.0024 + mean_trigram_hapax_fraction: -0.0051 + mean_trigram_repeated_unique: 0.0257 + mean_trigram_repetition_rate: -0.0296 + mean_trigram_total: -0.0252 + mean_trigram_unique: 0.0062 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.5922 + mean_bracket_nonalpha_suffix_count: 0.1086 + mean_colon_suffix_density: -0.1389 + mean_dot_count: -0.1234 + mean_id_nonalpha_suffix_density: -0.0141 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0110 + mean_avg_sub_words_per_id: 0.0173 + mean_flesch_adapted: -0.0204 + mean_fog_adapted: 0.2028 + mean_total_lines: -0.0265 + symbol_density: + mean_density: 0.0223 + mean_distinct_symbol_types: 0.0643 + mean_symbol_count: 0.0087 + vocabulary: + mean_mattr: -0.0031 + mean_raw_ttr: 0.0573 + mean_total_identifiers: -0.0573 + vowel_density: + mean_total_chars: 0.0075 + zipf: + mean_exponent: -0.0152 + mean_r_squared: 0.0050 + mean_total_tokens: -0.0251 + mean_vocab_size: 0.0119 + +low_coupling: + _doc: "Modules should depend on few external symbols — a low unique-operand count relative to total is a proxy for tight coupling." 
+ _log_baseline: -38.2335 + branching: + mean_branch_count: 0.0745 + mean_branching_density: 0.2097 + mean_max_nesting_depth: -0.1353 + mean_non_blank_count: -0.1353 + brevity: + mean_sample_size: -0.1276 + casing_entropy: + mean_entropy: -0.0870 + mean_pascal_case_count: -0.3722 + mean_snake_case_count: -0.1302 + compression: + mean_raw_bytes: -0.1657 + mean_redundancy: 0.0126 + mean_unique_line_ratio: -0.0296 + mean_zlib_bytes: -0.1918 + mean_zlib_ratio: 0.0262 + entropy: + mean_char_entropy: -0.0044 + mean_char_max_entropy: -0.0152 + mean_char_normalized: 0.0108 + mean_token_entropy: -0.0215 + mean_token_max_entropy: -0.0285 + mean_token_normalized: 0.0070 + mean_total_tokens: -0.1602 + mean_vocab_size: -0.1276 + function_metrics: + mean_avg_function_lines: -0.3103 + mean_function_count: 0.1353 + mean_max_function_lines: -0.3573 + halstead: + mean_N1_total_operators: -0.1645 + mean_N2_total_operands: -0.1785 + mean_difficulty: -0.1429 + mean_effort: -0.3500 + mean_estimated_bugs: -0.2072 + mean_length: -0.1700 + mean_n1_unique_operators: -0.1406 + mean_n2_unique_operands: -0.1763 + mean_time_to_implement_seconds: -0.3500 + mean_vocabulary: -0.1655 + mean_volume: -0.2072 + heaps: + mean_beta: -0.0557 + mean_k: 0.1362 + mean_r_squared: -0.0234 + identifier_length_variance: + mean_max: -0.0427 + mean_mean: 0.0133 + mean_std_dev: -0.0321 + mean_variance: -0.0642 + indentation: + mean_blank_line_ratio: -0.0752 + mean_max_depth: -0.0352 + mean_mean_depth: -0.1381 + mean_variance: -0.2519 + line_patterns: + mean_blank_line_ratio: -0.0752 + mean_max_nesting_depth: -0.1353 + mean_string_literal_ratio: 0.1282 + mean_unique_line_ratio: -0.0312 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: 0.1282 + near_duplicate_blocks_file: + mean_block_count: 0.1123 + mean_near_dup_block_d4: 0.2314 + mean_near_dup_block_d8: 0.2314 + mean_sub_block_count: -0.0902 + ngram: + mean_bigram_hapax_fraction: -0.0247 + mean_bigram_repeated_unique: -0.1792 + 
mean_bigram_repetition_rate: 0.0301 + mean_bigram_total: -0.1605 + mean_bigram_unique: -0.2135 + mean_trigram_hapax_fraction: -0.0265 + mean_trigram_repeated_unique: -0.1784 + mean_trigram_repetition_rate: 0.0750 + mean_trigram_total: -0.1608 + mean_trigram_unique: -0.2352 + punctuation_density: + mean_arrow_density: -0.0373 + mean_bracket_nonalpha_prefix_count: -0.4412 + mean_bracket_nonalpha_suffix_count: 0.2314 + mean_colon_suffix_density: -0.0705 + mean_dot_count: -0.2609 + mean_exclamation_density: 1.8877 + mean_id_nonalpha_suffix_density: -0.0113 + readability: + mean_avg_line_length: -0.0307 + mean_avg_sub_words_per_id: 0.0032 + mean_avg_tokens_per_line: -0.0248 + mean_fog_adapted: 0.0082 + mean_total_lines: -0.1353 + symbol_density: + mean_density: -0.0137 + mean_distinct_symbol_types: -0.0960 + mean_symbol_count: -0.1794 + vocabulary: + mean_mattr: -0.0823 + mean_raw_ttr: 0.0349 + mean_total_identifiers: -0.1801 + mean_unique_identifiers: -0.1453 + vowel_density: + mean_total_chars: -0.1669 + zipf: + mean_exponent: 0.0065 + mean_r_squared: -0.0205 + mean_total_tokens: -0.1602 + mean_vocab_size: -0.1276 + +no_wildcard_imports: + _doc: "Wildcard imports (`import *`, `using Module`) pollute the local namespace and hide dependencies." 
+ _log_baseline: -8.9685 + branching: + mean_branching_density: 0.0249 + mean_non_blank_count: -0.0268 + brevity: + mean_sample_size: -0.0077 + casing_entropy: + mean_entropy: -0.0054 + mean_snake_case_count: 0.0163 + compression: + mean_raw_bytes: 0.0310 + mean_unique_line_ratio: -0.0046 + mean_zlib_bytes: 0.0331 + entropy: + mean_total_tokens: 0.0131 + mean_vocab_size: -0.0077 + function_metrics: + mean_avg_function_lines: -0.0263 + halstead: + mean_N1_total_operators: 0.0202 + mean_N2_total_operands: 0.0271 + mean_difficulty: 0.0600 + mean_effort: 0.0778 + mean_estimated_bugs: 0.0179 + mean_length: 0.0228 + mean_n2_unique_operands: -0.0329 + mean_time_to_implement_seconds: 0.0778 + mean_vocabulary: -0.0230 + mean_volume: 0.0178 + heaps: + mean_beta: -0.0537 + mean_k: 0.1998 + mean_r_squared: -0.0155 + identifier_length_variance: + mean_mean: 0.0438 + mean_std_dev: 0.0473 + mean_variance: 0.0945 + indentation: + mean_blank_line_ratio: 0.0763 + mean_mean_depth: -0.0117 + mean_variance: 0.0042 + line_patterns: + mean_blank_line_ratio: 0.0763 + mean_string_literal_ratio: -0.3463 + mean_unique_line_ratio: -0.0053 + magic_number_density: + mean_density: 1.1035 + mean_magic_number_count: 1.1312 + mean_string_literal_ratio: -0.3463 + near_duplicate_blocks_file: + mean_near_dup_block_d6: -0.3309 + mean_near_dup_block_d7: 0.3309 + mean_near_dup_block_d8: 0.3309 + mean_sub_block_count: 0.0355 + ngram: + mean_bigram_hapax_fraction: 0.0182 + mean_bigram_repeated_unique: -0.0089 + mean_bigram_repetition_rate: -0.0149 + mean_bigram_total: 0.0131 + mean_bigram_unique: 0.0308 + mean_trigram_hapax_fraction: 0.0094 + mean_trigram_repeated_unique: -0.0263 + mean_trigram_repetition_rate: -0.0255 + mean_trigram_total: 0.0132 + mean_trigram_unique: 0.0274 + punctuation_density: + mean_arrow_density: -0.0139 + mean_bracket_nonalpha_prefix_count: -0.5656 + mean_bracket_nonalpha_suffix_count: -0.0908 + mean_colon_suffix_density: 2.0000 + mean_dot_count: -0.0137 + 
mean_id_nonalpha_suffix_density: 0.0143 + readability: + mean_avg_line_length: 0.0591 + mean_avg_sub_words_per_id: 0.0084 + mean_avg_tokens_per_line: 0.0399 + mean_flesch_adapted: -0.0142 + mean_fog_adapted: 0.0290 + mean_total_lines: -0.0268 + symbol_density: + mean_density: -0.0266 + mean_distinct_symbol_types: -0.0817 + mean_symbol_count: 0.0042 + vocabulary: + mean_mattr: 0.0259 + mean_raw_ttr: -0.0117 + mean_total_identifiers: 0.0116 + vowel_density: + mean_total_chars: 0.0554 + zipf: + mean_exponent: -0.0270 + mean_total_tokens: 0.0131 + mean_vocab_size: -0.0077 + diff --git a/priv/combined_metrics/documentation.yml b/priv/combined_metrics/documentation.yml new file mode 100644 index 0000000..4e3333c --- /dev/null +++ b/priv/combined_metrics/documentation.yml @@ -0,0 +1,631 @@ +docstring_is_nonempty: + _doc: "Docstrings must contain meaningful content, not just a placeholder or empty string." + _log_baseline: 28.4942 + branching: + mean_branch_count: 0.3089 + mean_branching_density: 0.2652 + mean_non_blank_count: 0.0437 + brevity: + mean_sample_size: 0.1931 + casing_entropy: + mean_entropy: 0.0676 + mean_other_count: 0.0709 + mean_pascal_case_count: 0.3089 + mean_snake_case_count: 0.1382 + compression: + mean_raw_bytes: 0.1245 + mean_redundancy: -0.0198 + mean_unique_line_ratio: 0.0053 + mean_zlib_bytes: 0.1557 + mean_zlib_ratio: -0.0312 + entropy: + mean_char_entropy: 0.0065 + mean_char_max_entropy: 0.0102 + mean_char_normalized: -0.0036 + mean_token_entropy: 0.0408 + mean_token_max_entropy: 0.0400 + mean_total_tokens: 0.1038 + mean_vocab_size: 0.1931 + function_metrics: + mean_avg_function_lines: 0.0357 + mean_avg_param_count: 0.0131 + mean_function_count: -0.0290 + mean_max_function_lines: 0.0329 + halstead: + mean_N1_total_operators: 0.0456 + mean_N2_total_operands: -0.0027 + mean_difficulty: 0.0706 + mean_effort: 0.1098 + mean_estimated_bugs: 0.0392 + mean_length: 0.0289 + mean_n1_unique_operators: 0.0913 + mean_n2_unique_operands: 0.0179 + 
mean_time_to_implement_seconds: 0.1098 + mean_vocabulary: 0.0465 + mean_volume: 0.0392 + heaps: + mean_beta: 0.0242 + mean_k: 0.0556 + identifier_length_variance: + mean_mean: 0.0042 + mean_std_dev: -0.0168 + mean_variance: -0.0336 + indentation: + mean_blank_line_ratio: 0.0413 + mean_mean_depth: -0.0330 + mean_variance: -0.0309 + line_patterns: + mean_blank_line_ratio: 0.0413 + mean_string_literal_ratio: 0.1078 + mean_unique_line_ratio: 0.0072 + magic_number_density: + mean_density: 0.0693 + mean_magic_number_count: 0.1709 + mean_string_literal_ratio: 0.1078 + near_duplicate_blocks_file: + mean_block_count: 0.0907 + mean_near_dup_block_d5: -0.2709 + mean_near_dup_block_d8: 0.1000 + mean_sub_block_count: -0.0061 + ngram: + mean_bigram_hapax_fraction: 0.0378 + mean_bigram_repeated_unique: 0.0767 + mean_bigram_repetition_rate: -0.0528 + mean_bigram_total: 0.1039 + mean_bigram_unique: 0.1635 + mean_trigram_hapax_fraction: 0.0158 + mean_trigram_repeated_unique: 0.0692 + mean_trigram_repetition_rate: -0.0615 + mean_trigram_total: 0.1041 + mean_trigram_unique: 0.1386 + punctuation_density: + mean_arrow_density: -0.0651 + mean_bracket_nonalpha_prefix_count: 0.0450 + mean_bracket_nonalpha_suffix_count: 0.1000 + mean_colon_suffix_density: -0.0260 + mean_dot_count: 0.1435 + mean_exclamation_density: -2.0000 + mean_id_nonalpha_suffix_density: -0.0474 + readability: + mean_avg_line_length: 0.0834 + mean_avg_sub_words_per_id: -0.0071 + mean_avg_tokens_per_line: 0.0601 + mean_fog_adapted: 0.0452 + mean_total_lines: 0.0437 + symbol_density: + mean_density: -0.0578 + mean_distinct_symbol_types: 0.0505 + mean_symbol_count: 0.0664 + vocabulary: + mean_mattr: 0.1382 + mean_raw_ttr: 0.0976 + mean_total_identifiers: 0.1534 + mean_unique_identifiers: 0.2510 + vowel_density: + mean_total_chars: 0.1576 + zipf: + mean_exponent: -0.0353 + mean_r_squared: 0.0037 + mean_total_tokens: 0.1038 + mean_vocab_size: 0.1931 + +file_has_license_header: + _doc: "Source files should begin with a license 
or copyright header." + _log_baseline: 5.8777 + branching: + mean_branching_density: -0.0081 + mean_non_blank_count: 0.0080 + brevity: + mean_sample_size: 0.0263 + casing_entropy: + mean_entropy: 0.0296 + mean_pascal_case_count: 0.0957 + mean_snake_case_count: 0.0039 + comment_structure: + mean_comment_line_ratio: -2.0000 + compression: + mean_raw_bytes: 0.0104 + mean_redundancy: -0.0059 + mean_zlib_bytes: 0.0200 + mean_zlib_ratio: -0.0095 + entropy: + mean_char_entropy: 0.0028 + mean_char_max_entropy: 0.0052 + mean_token_entropy: 0.0042 + mean_token_max_entropy: 0.0054 + mean_total_tokens: 0.0091 + mean_vocab_size: 0.0263 + halstead: + mean_N1_total_operators: 0.0051 + mean_N2_total_operands: 0.0185 + mean_difficulty: -0.0273 + mean_effort: -0.0113 + mean_estimated_bugs: 0.0159 + mean_length: 0.0095 + mean_n2_unique_operands: 0.0458 + mean_time_to_implement_seconds: -0.0113 + mean_vocabulary: 0.0306 + mean_volume: 0.0160 + heaps: + mean_beta: -0.0113 + mean_k: 0.0614 + identifier_length_variance: + mean_mean: -0.0048 + indentation: + mean_blank_line_ratio: 0.0206 + mean_mean_depth: -0.0080 + mean_variance: 0.0154 + line_patterns: + mean_blank_line_ratio: 0.0206 + mean_string_literal_ratio: -0.0104 + magic_number_density: + mean_density: 0.1920 + mean_magic_number_count: 0.1973 + mean_string_literal_ratio: -0.0104 + near_duplicate_blocks_file: + mean_block_count: 0.0650 + mean_sub_block_count: 0.0089 + ngram: + mean_bigram_hapax_fraction: 0.0086 + mean_bigram_repetition_rate: -0.0091 + mean_bigram_total: 0.0091 + mean_bigram_unique: 0.0182 + mean_trigram_hapax_fraction: 0.0031 + mean_trigram_repetition_rate: -0.0091 + mean_trigram_total: 0.0091 + mean_trigram_unique: 0.0133 + punctuation_density: + mean_arrow_density: -0.0105 + mean_colon_suffix_density: -0.0104 + mean_dot_count: 0.0423 + readability: + mean_avg_tokens_per_line: 0.0091 + mean_fog_adapted: 0.0060 + symbol_density: + mean_density: -0.0042 + mean_symbol_count: 0.0065 + vocabulary: + mean_mattr: 0.0108 
+ mean_raw_ttr: 0.0207 + mean_total_identifiers: 0.0187 + mean_unique_identifiers: 0.0395 + vowel_density: + mean_total_chars: 0.0139 + zipf: + mean_exponent: -0.0055 + mean_total_tokens: 0.0091 + mean_vocab_size: 0.0263 + +file_has_module_docstring: + _doc: "Files should have a module-level docstring explaining purpose and usage." + _log_baseline: 24.1681 + branching: + mean_branch_count: 0.3854 + mean_branching_density: -2.0000 + mean_non_blank_count: 0.0908 + brevity: + mean_sample_size: 0.2219 + casing_entropy: + mean_entropy: -0.0210 + mean_pascal_case_count: 0.0929 + mean_snake_case_count: 0.1544 + compression: + mean_raw_bytes: 0.1161 + mean_redundancy: -0.0256 + mean_unique_line_ratio: 0.0122 + mean_zlib_bytes: 0.1676 + mean_zlib_ratio: -0.0514 + entropy: + mean_char_max_entropy: 0.0126 + mean_char_normalized: -0.0120 + mean_token_entropy: 0.0441 + mean_token_max_entropy: 0.0457 + mean_total_tokens: 0.0837 + mean_vocab_size: 0.2219 + function_metrics: + mean_avg_function_lines: 0.0166 + mean_max_function_lines: 0.1014 + halstead: + mean_N1_total_operators: 0.0448 + mean_N2_total_operands: 0.0268 + mean_difficulty: 0.0971 + mean_effort: 0.1486 + mean_estimated_bugs: 0.0515 + mean_length: 0.0387 + mean_n1_unique_operators: 0.1116 + mean_n2_unique_operands: 0.0412 + mean_time_to_implement_seconds: 0.1486 + mean_vocabulary: 0.0602 + mean_volume: 0.0515 + heaps: + mean_beta: -0.0925 + mean_k: 0.5760 + mean_r_squared: -0.0049 + identifier_length_variance: + mean_mean: -0.0101 + mean_std_dev: -0.0477 + mean_variance: -0.0954 + indentation: + mean_blank_line_ratio: 0.0686 + mean_mean_depth: -0.0240 + mean_variance: -0.0634 + line_patterns: + mean_blank_line_ratio: 0.0686 + mean_string_literal_ratio: 0.1425 + mean_unique_line_ratio: 0.0141 + magic_number_density: + mean_density: 0.0812 + mean_magic_number_count: 0.1599 + mean_string_literal_ratio: 0.1425 + near_duplicate_blocks_file: + mean_block_count: 0.0586 + mean_sub_block_count: 0.0098 + ngram: + 
mean_bigram_hapax_fraction: 0.0500 + mean_bigram_repeated_unique: 0.0539 + mean_bigram_repetition_rate: -0.0497 + mean_bigram_total: 0.0838 + mean_bigram_unique: 0.1493 + mean_trigram_hapax_fraction: 0.0283 + mean_trigram_repeated_unique: 0.0225 + mean_trigram_repetition_rate: -0.0657 + mean_trigram_total: 0.0839 + mean_trigram_unique: 0.1235 + punctuation_density: + mean_colon_suffix_density: 0.0341 + mean_dot_count: 0.0777 + mean_exclamation_density: -0.1014 + mean_id_nonalpha_suffix_density: -0.0339 + readability: + mean_avg_line_length: 0.0257 + mean_avg_sub_words_per_id: -0.0181 + mean_avg_tokens_per_line: -0.0071 + mean_flesch_adapted: 0.0205 + mean_fog_adapted: -0.0266 + mean_total_lines: 0.0908 + symbol_density: + mean_density: -0.0727 + mean_distinct_symbol_types: 0.0618 + mean_symbol_count: 0.0433 + vocabulary: + mean_mattr: 0.0532 + mean_raw_ttr: 0.1353 + mean_total_identifiers: 0.1326 + mean_unique_identifiers: 0.2679 + vowel_density: + mean_total_chars: 0.1226 + zipf: + mean_exponent: -0.0467 + mean_total_tokens: 0.0837 + mean_vocab_size: 0.2219 + +file_has_no_commented_out_code: + _doc: "Files should not contain commented-out code blocks left from development." 
+ _log_baseline: -8.5677 + branching: + mean_branching_density: 0.0368 + mean_non_blank_count: -0.0367 + brevity: + mean_sample_size: -0.0046 + casing_entropy: + mean_entropy: -0.0091 + mean_pascal_case_count: -0.0597 + mean_snake_case_count: -0.0126 + comment_structure: + mean_comment_line_count: -0.9901 + mean_comment_line_ratio: 0.3578 + compression: + mean_raw_bytes: -0.0068 + mean_redundancy: 0.0077 + mean_zlib_bytes: -0.0179 + mean_zlib_ratio: 0.0111 + entropy: + mean_char_entropy: -0.0026 + mean_char_max_entropy: -0.0061 + mean_char_normalized: 0.0035 + mean_total_tokens: -0.0158 + mean_vocab_size: -0.0046 + function_metrics: + mean_avg_function_lines: -0.0992 + mean_function_count: 0.0686 + mean_max_function_lines: -0.1247 + halstead: + mean_N1_total_operators: -0.0058 + mean_N2_total_operands: -0.0546 + mean_difficulty: 0.0608 + mean_effort: 0.0253 + mean_estimated_bugs: -0.0355 + mean_length: -0.0224 + mean_n1_unique_operators: 0.0171 + mean_n2_unique_operands: -0.0984 + mean_time_to_implement_seconds: 0.0253 + mean_vocabulary: -0.0628 + mean_volume: -0.0356 + heaps: + mean_beta: -0.0499 + mean_k: 0.1958 + mean_r_squared: -0.0200 + identifier_length_variance: + mean_mean: 0.0169 + mean_std_dev: 0.0264 + mean_variance: 0.0527 + indentation: + mean_blank_line_ratio: 0.0551 + mean_max_depth: 0.0324 + mean_mean_depth: 0.0564 + mean_variance: 0.0552 + line_patterns: + mean_blank_line_ratio: 0.0551 + mean_string_literal_ratio: -0.0818 + mean_unique_line_ratio: -0.0077 + magic_number_density: + mean_density: 2.0000 + mean_string_literal_ratio: -0.0818 + near_duplicate_blocks_file: + mean_block_count: -0.0474 + mean_sub_block_count: -0.0454 + ngram: + mean_bigram_hapax_fraction: 0.0101 + mean_bigram_repeated_unique: -0.0414 + mean_bigram_repetition_rate: -0.0019 + mean_bigram_total: -0.0158 + mean_bigram_unique: -0.0223 + mean_trigram_hapax_fraction: -0.0019 + mean_trigram_repeated_unique: -0.0273 + mean_trigram_repetition_rate: 0.0258 + mean_trigram_total: 
-0.0159 + mean_trigram_unique: -0.0338 + punctuation_density: + mean_arrow_density: 0.1869 + mean_bracket_nonalpha_prefix_count: -0.1247 + mean_bracket_nonalpha_suffix_count: -0.0885 + mean_colon_suffix_density: -0.1285 + mean_dot_count: -0.0411 + mean_exclamation_density: -0.1956 + mean_id_nonalpha_suffix_density: 0.0028 + mean_question_mark_density: -0.2494 + readability: + mean_avg_line_length: 0.0371 + mean_avg_sub_words_per_id: -0.0018 + mean_avg_tokens_per_line: -0.0943 + mean_flesch_adapted: 0.0114 + mean_fog_adapted: -0.0779 + mean_total_lines: 0.0785 + symbol_density: + mean_density: -0.0172 + mean_symbol_count: -0.0237 + vocabulary: + mean_mattr: -0.0327 + mean_raw_ttr: 0.0060 + mean_total_identifiers: -0.0246 + mean_unique_identifiers: -0.0186 + vowel_density: + mean_total_chars: -0.0077 + zipf: + mean_exponent: -0.0043 + mean_total_tokens: -0.0158 + mean_vocab_size: -0.0046 + +function_has_docstring: + _doc: "Public functions should have a docstring describing behaviour, params, and return value." 
+ _log_baseline: 41.6283 + branching: + mean_branch_count: 0.5279 + mean_branching_density: 0.3832 + mean_non_blank_count: 0.1446 + brevity: + mean_sample_size: 0.2608 + casing_entropy: + mean_entropy: -0.0026 + mean_other_count: 0.3105 + mean_pascal_case_count: 0.1852 + mean_snake_case_count: 0.2708 + comment_structure: + mean_comment_line_ratio: -2.0000 + compression: + mean_raw_bytes: 0.2251 + mean_redundancy: -0.0242 + mean_unique_line_ratio: -0.0264 + mean_zlib_bytes: 0.2718 + mean_zlib_ratio: -0.0468 + entropy: + mean_char_entropy: 0.0081 + mean_char_max_entropy: 0.0163 + mean_char_normalized: -0.0082 + mean_token_entropy: 0.0517 + mean_token_max_entropy: 0.0557 + mean_token_normalized: -0.0040 + mean_total_tokens: 0.2284 + mean_vocab_size: 0.2608 + function_metrics: + mean_avg_function_lines: 0.0289 + mean_avg_param_count: 0.0202 + mean_function_count: 0.0999 + mean_max_function_lines: 0.1368 + halstead: + mean_N1_total_operators: 0.1175 + mean_N2_total_operands: 0.0799 + mean_difficulty: 0.0232 + mean_effort: 0.1555 + mean_estimated_bugs: 0.1324 + mean_length: 0.1035 + mean_n1_unique_operators: 0.0939 + mean_n2_unique_operands: 0.1507 + mean_time_to_implement_seconds: 0.1555 + mean_vocabulary: 0.1288 + mean_volume: 0.1324 + heaps: + mean_beta: 0.0660 + mean_k: -0.0612 + mean_r_squared: -0.0041 + identifier_length_variance: + mean_mean: -0.0191 + mean_std_dev: -0.0493 + mean_variance: -0.0985 + indentation: + mean_blank_line_ratio: 0.1003 + mean_max_depth: -0.1288 + mean_mean_depth: -0.0904 + mean_variance: -0.2118 + line_patterns: + mean_blank_line_ratio: 0.1003 + mean_string_literal_ratio: 0.5931 + mean_unique_line_ratio: -0.0135 + magic_number_density: + mean_density: 0.1744 + mean_magic_number_count: 0.4104 + mean_string_literal_ratio: 0.5931 + near_duplicate_blocks_file: + mean_block_count: 0.2288 + mean_near_dup_block_d6: -0.3105 + mean_near_dup_block_d7: 0.3105 + mean_near_dup_block_d8: -0.1816 + mean_sub_block_count: 0.0349 + ngram: + 
mean_bigram_hapax_fraction: 0.0560 + mean_bigram_repeated_unique: 0.1917 + mean_bigram_repetition_rate: -0.0476 + mean_bigram_total: 0.2288 + mean_bigram_unique: 0.2856 + mean_trigram_hapax_fraction: 0.0480 + mean_trigram_repeated_unique: 0.1175 + mean_trigram_repetition_rate: -0.1025 + mean_trigram_total: 0.2292 + mean_trigram_unique: 0.2807 + punctuation_density: + mean_arrow_density: -0.3619 + mean_bracket_nonalpha_prefix_count: 0.0999 + mean_bracket_nonalpha_suffix_count: 0.2024 + mean_colon_suffix_density: -0.0297 + mean_dot_count: 0.1816 + mean_exclamation_density: -0.3105 + mean_id_nonalpha_suffix_density: -0.1019 + mean_question_mark_density: -0.2377 + readability: + mean_avg_line_length: 0.0861 + mean_avg_sub_words_per_id: -0.0113 + mean_avg_tokens_per_line: 0.0890 + mean_flesch_adapted: 0.0026 + mean_fog_adapted: 0.0948 + mean_total_lines: 0.1394 + symbol_density: + mean_density: -0.0353 + mean_distinct_symbol_types: 0.0427 + mean_symbol_count: 0.1896 + vocabulary: + mean_mattr: 0.1769 + mean_raw_ttr: 0.0666 + mean_total_identifiers: 0.2541 + mean_unique_identifiers: 0.3207 + vowel_density: + mean_total_chars: 0.2350 + zipf: + mean_exponent: 0.0025 + mean_total_tokens: 0.2284 + mean_vocab_size: 0.2608 + +function_todo_comment_in_body: + _doc: "Functions should not contain TODO/FIXME comments indicating unfinished work." 
+ _log_baseline: 7.2394 + branching: + mean_branch_count: -0.0287 + mean_branching_density: -0.0435 + mean_non_blank_count: 0.0147 + brevity: + mean_sample_size: -0.0084 + casing_entropy: + mean_entropy: 0.0157 + mean_pascal_case_count: 0.0410 + mean_snake_case_count: -0.0125 + comment_structure: + mean_comment_line_count: -0.5392 + mean_comment_line_ratio: 0.7796 + mean_todo_fixme_count: -0.5392 + compression: + mean_raw_bytes: 0.0082 + mean_unique_line_ratio: 0.0028 + mean_zlib_bytes: 0.0074 + entropy: + mean_char_entropy: 0.0026 + mean_char_normalized: 0.0026 + mean_token_max_entropy: -0.0017 + mean_token_normalized: 0.0020 + mean_total_tokens: 0.0157 + mean_vocab_size: -0.0084 + function_metrics: + mean_avg_function_lines: -0.0250 + mean_avg_param_count: -0.0354 + mean_function_count: 0.0354 + mean_max_function_lines: -0.0182 + halstead: + mean_N1_total_operators: 0.0224 + mean_N2_total_operands: -0.0309 + mean_difficulty: 0.0451 + mean_effort: 0.0375 + mean_estimated_bugs: -0.0076 + mean_length: 0.0035 + mean_n2_unique_operands: -0.0761 + mean_time_to_implement_seconds: 0.0375 + mean_vocabulary: -0.0540 + mean_volume: -0.0076 + heaps: + mean_beta: -0.0498 + mean_k: 0.1608 + mean_r_squared: -0.0095 + identifier_length_variance: + mean_mean: 0.0061 + mean_std_dev: 0.0128 + mean_variance: 0.0257 + indentation: + mean_blank_line_ratio: 0.0593 + mean_mean_depth: -0.0184 + mean_variance: -0.0277 + line_patterns: + mean_blank_line_ratio: 0.0593 + mean_string_literal_ratio: -0.0151 + mean_unique_line_ratio: 0.0033 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: -0.0151 + near_duplicate_blocks_file: + mean_block_count: 0.0317 + mean_sub_block_count: 0.0281 + ngram: + mean_bigram_hapax_fraction: -0.0187 + mean_bigram_repeated_unique: 0.0464 + mean_bigram_repetition_rate: 0.0098 + mean_bigram_total: 0.0157 + mean_bigram_unique: 0.0136 + mean_trigram_hapax_fraction: -0.0109 + mean_trigram_repeated_unique: 0.0479 + mean_trigram_repetition_rate: 
0.0123 + mean_trigram_total: 0.0157 + mean_trigram_unique: 0.0149 + punctuation_density: + mean_arrow_density: -0.0161 + mean_bracket_nonalpha_prefix_count: -0.0287 + mean_colon_suffix_density: -0.0293 + mean_dot_count: 0.0485 + mean_id_nonalpha_suffix_density: 0.0062 + mean_question_mark_density: -0.0287 + readability: + mean_avg_line_length: 0.0123 + mean_avg_sub_words_per_id: 0.0073 + mean_avg_tokens_per_line: -0.0224 + mean_flesch_adapted: -0.0053 + mean_fog_adapted: -0.0109 + mean_total_lines: 0.0381 + symbol_density: + mean_density: 0.0116 + mean_distinct_symbol_types: -0.0140 + mean_symbol_count: 0.0200 + vocabulary: + mean_mattr: -0.0525 + mean_raw_ttr: -0.0250 + mean_unique_identifiers: -0.0236 + vowel_density: + mean_total_chars: 0.0076 + zipf: + mean_total_tokens: 0.0157 + mean_vocab_size: -0.0084 + diff --git a/priv/combined_metrics/error_handling.yml b/priv/combined_metrics/error_handling.yml new file mode 100644 index 0000000..ce7d040 --- /dev/null +++ b/priv/combined_metrics/error_handling.yml @@ -0,0 +1,332 @@ +does_not_swallow_errors: + _doc: "Errors must be handled or re-raised — empty rescue/catch blocks silently hide failures." 
+ _log_baseline: 86.0584 + branching: + mean_branch_count: -0.1041 + mean_branching_density: -0.2095 + mean_max_nesting_depth: 0.5405 + mean_non_blank_count: 0.1054 + brevity: + mean_sample_size: 0.2830 + casing_entropy: + mean_entropy: -0.1412 + mean_other_count: -1.6214 + mean_pascal_case_count: 0.8391 + mean_snake_case_count: 0.4785 + compression: + mean_raw_bytes: 0.3818 + mean_redundancy: 0.0202 + mean_unique_line_ratio: 0.1028 + mean_zlib_bytes: 0.3399 + mean_zlib_ratio: 0.0419 + entropy: + mean_char_entropy: 0.0445 + mean_char_max_entropy: 0.0347 + mean_char_normalized: 0.0098 + mean_token_entropy: 0.0223 + mean_token_max_entropy: 0.0620 + mean_token_normalized: -0.0397 + mean_total_tokens: 0.4926 + mean_vocab_size: 0.2830 + function_metrics: + mean_avg_function_lines: 0.1005 + mean_max_function_lines: 0.2243 + halstead: + mean_N1_total_operators: 0.4699 + mean_N2_total_operands: 0.2900 + mean_difficulty: 0.2395 + mean_effort: 0.6960 + mean_estimated_bugs: 0.4564 + mean_length: 0.4072 + mean_n1_unique_operators: 0.1859 + mean_n2_unique_operands: 0.2364 + mean_time_to_implement_seconds: 0.6960 + mean_vocabulary: 0.2190 + mean_volume: 0.4565 + heaps: + mean_beta: -0.0869 + mean_k: 0.2466 + identifier_length_variance: + mean_std_dev: -0.1168 + mean_variance: -0.2335 + indentation: + mean_blank_line_ratio: 0.0451 + mean_max_depth: 0.1740 + mean_mean_depth: 0.1043 + mean_variance: 0.3416 + line_patterns: + mean_blank_line_ratio: 0.0451 + mean_max_nesting_depth: 0.5405 + mean_string_literal_ratio: 0.2524 + mean_unique_line_ratio: 0.1413 + magic_number_density: + mean_string_literal_ratio: 0.2524 + near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.5405 + mean_near_dup_block_d7: -0.3162 + mean_near_dup_block_d8: 0.8566 + mean_sub_block_count: 0.3065 + ngram: + mean_bigram_hapax_fraction: -0.0373 + mean_bigram_repeated_unique: 0.4011 + mean_bigram_repetition_rate: 0.0995 + mean_bigram_total: 0.4937 + mean_bigram_unique: 0.3266 + mean_trigram_hapax_fraction: 
-0.0651 + mean_trigram_repeated_unique: 0.5672 + mean_trigram_repetition_rate: 0.2299 + mean_trigram_total: 0.4949 + mean_trigram_unique: 0.3376 + punctuation_density: + mean_arrow_density: -0.3177 + mean_bracket_nonalpha_prefix_count: 0.7888 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 0.2150 + mean_dot_count: 0.5172 + mean_exclamation_density: -0.5217 + mean_id_nonalpha_suffix_density: 0.0181 + mean_question_mark_density: -0.4364 + readability: + mean_avg_line_length: 0.2905 + mean_avg_sub_words_per_id: 0.0219 + mean_avg_tokens_per_line: 0.3872 + mean_flesch_adapted: -0.0373 + mean_fog_adapted: 0.3019 + mean_total_lines: 0.1054 + symbol_density: + mean_density: 0.2563 + mean_distinct_symbol_types: 0.0400 + mean_symbol_count: 0.6378 + vocabulary: + mean_mattr: 0.0350 + mean_raw_ttr: -0.0769 + mean_total_identifiers: 0.4896 + mean_unique_identifiers: 0.4127 + vowel_density: + mean_total_chars: 0.4927 + zipf: + mean_exponent: 0.0933 + mean_total_tokens: 0.4926 + mean_vocab_size: 0.2830 + +error_message_is_descriptive: + _doc: "Error values should carry a meaningful message, not just a bare atom or empty string." 
+ _log_baseline: 52.7053 + branching: + mean_branch_count: 0.0664 + mean_branching_density: -0.0540 + mean_max_nesting_depth: 0.3900 + mean_non_blank_count: 0.1204 + brevity: + mean_sample_size: 0.3136 + casing_entropy: + mean_entropy: 0.1147 + mean_pascal_case_count: 2.0000 + mean_snake_case_count: 0.5117 + compression: + mean_raw_bytes: 0.3028 + mean_redundancy: 0.0104 + mean_unique_line_ratio: -0.0126 + mean_zlib_bytes: 0.2771 + mean_zlib_ratio: 0.0257 + entropy: + mean_char_entropy: 0.0161 + mean_char_max_entropy: 0.0487 + mean_char_normalized: -0.0326 + mean_token_entropy: 0.0596 + mean_token_max_entropy: 0.0685 + mean_token_normalized: -0.0089 + mean_total_tokens: 0.3002 + mean_vocab_size: 0.3136 + function_metrics: + mean_avg_function_lines: 0.1160 + mean_max_function_lines: 0.0713 + halstead: + mean_N1_total_operators: 0.1787 + mean_N2_total_operands: 0.0463 + mean_difficulty: 0.0136 + mean_effort: 0.1822 + mean_estimated_bugs: 0.1686 + mean_length: 0.1370 + mean_n1_unique_operators: 0.1179 + mean_n2_unique_operands: 0.1506 + mean_time_to_implement_seconds: 0.1822 + mean_vocabulary: 0.1415 + mean_volume: 0.1686 + heaps: + mean_beta: 0.0120 + mean_k: 0.1259 + mean_r_squared: 0.0073 + identifier_length_variance: + mean_mean: -0.0908 + mean_std_dev: -0.0799 + mean_variance: -0.1597 + indentation: + mean_blank_line_ratio: -0.1098 + mean_max_depth: 0.1754 + mean_mean_depth: 0.1108 + mean_variance: 0.1967 + line_patterns: + mean_blank_line_ratio: -0.1098 + mean_max_nesting_depth: 0.3900 + mean_string_literal_ratio: 0.3673 + mean_unique_line_ratio: 0.0304 + magic_number_density: + mean_string_literal_ratio: 0.3673 + near_duplicate_blocks_file: + mean_near_dup_block_d6: -1.0566 + mean_near_dup_block_d8: -0.6667 + mean_sub_block_count: 0.0621 + ngram: + mean_bigram_hapax_fraction: -0.0059 + mean_bigram_repeated_unique: 0.3150 + mean_bigram_total: 0.3008 + mean_bigram_unique: 0.3055 + mean_trigram_hapax_fraction: -0.0298 + mean_trigram_repeated_unique: 0.4104 + 
mean_trigram_repetition_rate: 0.0227 + mean_trigram_total: 0.3014 + mean_trigram_unique: 0.3075 + punctuation_density: + mean_arrow_density: -0.0591 + mean_bracket_nonalpha_prefix_count: 1.4250 + mean_bracket_nonalpha_suffix_count: 0.3350 + mean_colon_suffix_density: -0.0854 + mean_id_nonalpha_suffix_density: -0.0694 + readability: + mean_avg_line_length: 0.1895 + mean_avg_tokens_per_line: 0.1798 + mean_flesch_adapted: -0.0175 + mean_fog_adapted: 0.1420 + mean_total_lines: 0.1204 + symbol_density: + mean_distinct_symbol_types: 0.0664 + mean_symbol_count: 0.3056 + vocabulary: + mean_mattr: -0.0179 + mean_raw_ttr: -0.1153 + mean_total_identifiers: 0.5114 + mean_unique_identifiers: 0.3962 + vowel_density: + mean_total_chars: 0.4207 + zipf: + mean_r_squared: 0.0056 + mean_total_tokens: 0.3002 + mean_vocab_size: 0.3136 + +returns_typed_error: + _doc: "Functions should signal failure via a typed return (e.g. `{:error, reason}`) rather than returning `nil` or `false`." + _log_baseline: 120.8554 + branching: + mean_branch_count: -0.1286 + mean_branching_density: -0.1895 + mean_max_nesting_depth: 1.1292 + mean_non_blank_count: 0.0608 + brevity: + mean_sample_size: 0.2322 + casing_entropy: + mean_entropy: -0.3072 + mean_other_count: -0.2697 + mean_pascal_case_count: 0.7124 + mean_snake_case_count: 0.6125 + compression: + mean_raw_bytes: 0.4375 + mean_redundancy: 0.0334 + mean_unique_line_ratio: 0.1471 + mean_zlib_bytes: 0.3486 + mean_zlib_ratio: 0.0889 + entropy: + mean_char_entropy: 0.0854 + mean_char_max_entropy: 0.0427 + mean_char_normalized: 0.0426 + mean_token_entropy: -0.0120 + mean_token_max_entropy: 0.0531 + mean_token_normalized: -0.0651 + mean_total_tokens: 0.6727 + mean_vocab_size: 0.2322 + function_metrics: + mean_avg_function_lines: 0.0904 + mean_avg_param_count: 0.0054 + mean_function_count: -0.0556 + mean_max_function_lines: 0.0823 + halstead: + mean_N1_total_operators: 0.7914 + mean_N2_total_operands: 0.5495 + mean_difficulty: 0.4516 + mean_effort: 1.2300 + 
mean_estimated_bugs: 0.7784 + mean_length: 0.7139 + mean_n1_unique_operators: 0.2105 + mean_n2_unique_operands: 0.3084 + mean_time_to_implement_seconds: 1.2300 + mean_vocabulary: 0.2747 + mean_volume: 0.7785 + heaps: + mean_beta: -0.2332 + mean_k: 0.4822 + mean_r_squared: 0.0110 + identifier_length_variance: + mean_max: 0.1996 + mean_mean: 0.1313 + mean_std_dev: 0.2519 + mean_variance: 0.5039 + indentation: + mean_blank_line_ratio: -0.1515 + mean_mean_depth: -0.0287 + mean_variance: 0.0372 + line_patterns: + mean_blank_line_ratio: -0.1515 + mean_max_nesting_depth: 1.1292 + mean_string_literal_ratio: -0.6750 + mean_unique_line_ratio: 0.1454 + magic_number_density: + mean_string_literal_ratio: -0.6750 + near_duplicate_blocks_file: + mean_block_count: -0.0980 + mean_near_dup_block_d0: -1.4248 + mean_near_dup_block_d6: 0.7124 + mean_near_dup_block_d7: -1.0081 + mean_near_dup_block_d8: -2.0000 + mean_sub_block_count: 0.7384 + ngram: + mean_bigram_hapax_fraction: -0.1410 + mean_bigram_repeated_unique: 0.4891 + mean_bigram_repetition_rate: 0.1302 + mean_bigram_total: 0.6740 + mean_bigram_unique: 0.3101 + mean_trigram_hapax_fraction: -0.0547 + mean_trigram_repeated_unique: 0.5003 + mean_trigram_repetition_rate: 0.2370 + mean_trigram_total: 0.6753 + mean_trigram_unique: 0.3580 + punctuation_density: + mean_arrow_density: -0.8033 + mean_bracket_nonalpha_prefix_count: -0.1874 + mean_colon_suffix_density: -0.8583 + mean_dot_count: 1.1292 + mean_id_nonalpha_suffix_density: 0.0810 + mean_question_mark_density: -0.6568 + readability: + mean_avg_line_length: 0.3955 + mean_avg_sub_words_per_id: 0.0948 + mean_avg_tokens_per_line: 0.6118 + mean_flesch_adapted: -0.1272 + mean_fog_adapted: 0.6637 + mean_total_lines: 0.0608 + symbol_density: + mean_density: 0.5813 + mean_distinct_symbol_types: 0.2134 + mean_symbol_count: 1.0187 + vocabulary: + mean_mattr: -0.2229 + mean_raw_ttr: -0.2020 + mean_total_identifiers: 0.4979 + mean_unique_identifiers: 0.2957 + vowel_density: + 
mean_total_chars: 0.6292 + zipf: + mean_exponent: 0.1047 + mean_r_squared: 0.0253 + mean_total_tokens: 0.6727 + mean_vocab_size: 0.2322 + diff --git a/priv/combined_metrics/file_structure.yml b/priv/combined_metrics/file_structure.yml new file mode 100644 index 0000000..b38fc00 --- /dev/null +++ b/priv/combined_metrics/file_structure.yml @@ -0,0 +1,510 @@ +has_consistent_indentation: + _doc: "Files should use a single, consistent indentation style with no mixed tabs and spaces." + _log_baseline: -12.7016 + branching: + mean_branching_density: 0.1994 + mean_non_blank_count: -0.2001 + compression: + mean_raw_bytes: -0.1617 + mean_redundancy: -0.0442 + mean_unique_line_ratio: -0.1671 + mean_zlib_bytes: -0.0805 + mean_zlib_ratio: -0.0812 + entropy: + mean_char_entropy: 0.0876 + mean_char_normalized: 0.0877 + function_metrics: + mean_avg_function_lines: -0.1622 + mean_max_function_lines: -0.1528 + indentation: + mean_blank_line_ratio: 0.3569 + mean_max_depth: -0.9827 + mean_mean_depth: -0.6193 + mean_variance: -2.0000 + line_patterns: + mean_blank_line_ratio: 0.3569 + mean_unique_line_ratio: -0.0830 + readability: + mean_avg_line_length: 0.0378 + mean_avg_tokens_per_line: 0.2001 + mean_flesch_adapted: -0.0112 + mean_fog_adapted: 0.1818 + mean_total_lines: -0.2001 + symbol_density: + mean_density: 0.1620 + +line_count_under_300: + _doc: "Files should be under 300 lines; longer files typically violate single responsibility." 
+ _log_baseline: -45.8565 + branching: + mean_branch_count: -0.4508 + mean_branching_density: -0.2446 + mean_non_blank_count: -0.2063 + brevity: + mean_sample_size: -0.2062 + casing_entropy: + mean_entropy: 0.0413 + mean_other_count: -0.6011 + mean_pascal_case_count: 0.1036 + mean_snake_case_count: -0.2080 + compression: + mean_raw_bytes: -0.2263 + mean_redundancy: -0.0026 + mean_unique_line_ratio: 0.0519 + mean_zlib_bytes: -0.2194 + mean_zlib_ratio: -0.0069 + entropy: + mean_char_entropy: -0.0072 + mean_char_max_entropy: -0.0245 + mean_char_normalized: 0.0173 + mean_token_entropy: -0.0264 + mean_token_max_entropy: -0.0433 + mean_token_normalized: 0.0169 + mean_total_tokens: -0.1807 + mean_vocab_size: -0.2062 + function_metrics: + mean_avg_function_lines: 0.1338 + mean_avg_param_count: -0.0931 + mean_function_count: -0.3274 + mean_max_function_lines: 0.0222 + mean_max_param_count: -0.1036 + halstead: + mean_N1_total_operators: -0.1746 + mean_N2_total_operands: -0.1868 + mean_difficulty: 0.0070 + mean_effort: -0.2194 + mean_estimated_bugs: -0.2264 + mean_length: -0.1785 + mean_n1_unique_operators: -0.0814 + mean_n2_unique_operands: -0.2752 + mean_time_to_implement_seconds: -0.2194 + mean_vocabulary: -0.2238 + mean_volume: -0.2264 + heaps: + mean_beta: -0.0687 + mean_k: 0.0978 + mean_r_squared: -0.0094 + identifier_length_variance: + mean_max: -0.0671 + mean_mean: -0.0614 + mean_std_dev: 0.0205 + mean_variance: 0.0411 + indentation: + mean_blank_line_ratio: -0.4899 + mean_max_depth: 0.0301 + mean_mean_depth: 0.0114 + mean_variance: 0.1685 + line_patterns: + mean_blank_line_ratio: -0.4899 + mean_string_literal_ratio: 0.0039 + mean_unique_line_ratio: 0.0561 + magic_number_density: + mean_density: 1.4051 + mean_magic_number_count: -0.4114 + mean_string_literal_ratio: 0.0039 + near_duplicate_blocks_file: + mean_block_count: 0.5617 + mean_near_dup_block_d7: 0.1772 + mean_near_dup_block_d8: 0.1772 + mean_sub_block_count: 1.0591 + ngram: + mean_bigram_hapax_fraction: 
-0.0655 + mean_bigram_repeated_unique: -0.1356 + mean_bigram_repetition_rate: 0.0296 + mean_bigram_total: -0.1809 + mean_bigram_unique: -0.2260 + mean_trigram_hapax_fraction: -0.0366 + mean_trigram_repeated_unique: -0.1208 + mean_trigram_repetition_rate: 0.0506 + mean_trigram_total: -0.1812 + mean_trigram_unique: -0.2220 + punctuation_density: + mean_arrow_density: -0.2511 + mean_bracket_nonalpha_prefix_count: -0.2342 + mean_bracket_nonalpha_suffix_count: -0.3472 + mean_bracket_number_pair_count: -0.1772 + mean_colon_suffix_density: -0.2045 + mean_dot_count: -0.0341 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: -0.0125 + readability: + mean_avg_line_length: -0.0217 + mean_avg_sub_words_per_id: -0.0148 + mean_avg_tokens_per_line: 0.0256 + mean_flesch_adapted: 0.0146 + mean_fog_adapted: 0.0323 + mean_total_lines: -0.2063 + symbol_density: + mean_density: 0.0758 + mean_distinct_symbol_types: -0.0604 + mean_symbol_count: -0.1504 + vocabulary: + mean_mattr: -0.1396 + mean_raw_ttr: -0.0669 + mean_total_identifiers: -0.1838 + mean_unique_identifiers: -0.2507 + vowel_density: + mean_total_chars: -0.2452 + zipf: + mean_exponent: 0.0102 + mean_r_squared: -0.0067 + mean_total_tokens: -0.1807 + mean_vocab_size: -0.2062 + +line_length_under_120: + _doc: "Lines should be under 120 characters to avoid horizontal scrolling." 
+ _log_baseline: -6.2041 + branching: + mean_branching_density: -0.1942 + mean_non_blank_count: 0.1944 + brevity: + mean_sample_size: -0.0200 + casing_entropy: + mean_entropy: -0.0047 + mean_snake_case_count: 0.0074 + compression: + mean_raw_bytes: 0.0170 + mean_redundancy: 0.0140 + mean_unique_line_ratio: 0.0133 + mean_zlib_bytes: -0.0077 + mean_zlib_ratio: 0.0247 + entropy: + mean_char_entropy: -0.0087 + mean_char_normalized: -0.0076 + mean_token_entropy: -0.0022 + mean_token_max_entropy: -0.0041 + mean_token_normalized: 0.0019 + mean_total_tokens: -0.0030 + mean_vocab_size: -0.0200 + function_metrics: + mean_avg_function_lines: 0.2084 + mean_avg_param_count: -0.0276 + mean_max_function_lines: 0.2570 + mean_max_param_count: -0.0944 + halstead: + mean_N1_total_operators: -0.0033 + mean_N2_total_operands: 0.0022 + mean_difficulty: 0.0219 + mean_effort: 0.0160 + mean_estimated_bugs: -0.0059 + mean_n1_unique_operators: -0.0081 + mean_n2_unique_operands: -0.0278 + mean_time_to_implement_seconds: 0.0160 + mean_vocabulary: -0.0228 + mean_volume: -0.0059 + heaps: + mean_beta: -0.0068 + mean_k: 0.0086 + identifier_length_variance: + mean_mean: -0.0207 + mean_std_dev: -0.0480 + mean_variance: -0.0960 + indentation: + mean_blank_line_ratio: -0.0420 + mean_max_depth: 0.1137 + mean_mean_depth: 0.1254 + mean_variance: 0.2595 + line_patterns: + mean_blank_line_ratio: -0.0420 + mean_string_literal_ratio: -0.0264 + mean_unique_line_ratio: 0.0181 + magic_number_density: + mean_density: 0.0052 + mean_string_literal_ratio: -0.0264 + near_duplicate_blocks_file: + mean_sub_block_count: 0.0477 + ngram: + mean_bigram_hapax_fraction: -0.0141 + mean_bigram_repeated_unique: 0.0257 + mean_bigram_repetition_rate: 0.0141 + mean_bigram_total: -0.0030 + mean_bigram_unique: -0.0113 + mean_trigram_hapax_fraction: 0.0017 + mean_trigram_repeated_unique: -0.0134 + mean_trigram_total: -0.0030 + mean_trigram_unique: -0.0043 + punctuation_density: + mean_bracket_nonalpha_prefix_count: -0.0807 + 
mean_bracket_nonalpha_suffix_count: -0.1362 + mean_colon_suffix_density: 0.0705 + mean_dot_count: -0.0069 + mean_id_nonalpha_suffix_density: 0.0093 + mean_question_mark_density: 2.0000 + readability: + mean_avg_line_length: -0.1816 + mean_avg_sub_words_per_id: -0.0066 + mean_avg_tokens_per_line: -0.1974 + mean_flesch_adapted: 0.0402 + mean_fog_adapted: -0.2009 + mean_total_lines: 0.1944 + symbol_density: + mean_density: -0.0247 + mean_distinct_symbol_types: -0.0130 + mean_symbol_count: -0.0078 + vocabulary: + mean_mattr: -0.0231 + mean_raw_ttr: -0.0300 + mean_total_identifiers: 0.0067 + mean_unique_identifiers: -0.0232 + vowel_density: + mean_total_chars: -0.0140 + zipf: + mean_exponent: 0.0039 + mean_total_tokens: -0.0030 + mean_vocab_size: -0.0200 + +no_magic_numbers: + _doc: "Numeric literals should be extracted to named constants rather than used inline." + _log_baseline: 105.2910 + branching: + mean_branch_count: -0.4352 + mean_branching_density: -0.9103 + mean_non_blank_count: 0.4762 + brevity: + mean_sample_size: 0.3955 + casing_entropy: + mean_entropy: -0.5234 + mean_snake_case_count: 0.9072 + compression: + mean_raw_bytes: 0.7713 + mean_redundancy: 0.1328 + mean_unique_line_ratio: 0.1073 + mean_zlib_bytes: 0.5072 + mean_zlib_ratio: 0.2642 + entropy: + mean_char_entropy: 0.0481 + mean_char_normalized: 0.0481 + mean_token_entropy: 0.0769 + mean_token_max_entropy: 0.0825 + mean_total_tokens: 0.4877 + mean_vocab_size: 0.3955 + function_metrics: + mean_avg_function_lines: -0.5888 + mean_avg_param_count: -0.1339 + mean_function_count: 0.5327 + mean_max_function_lines: -0.2655 + halstead: + mean_N1_total_operators: 0.1749 + mean_N2_total_operands: 0.4966 + mean_difficulty: -0.0338 + mean_effort: 0.3387 + mean_estimated_bugs: 0.3723 + mean_length: 0.3056 + mean_n1_unique_operators: -0.0901 + mean_n2_unique_operands: 0.4402 + mean_time_to_implement_seconds: 0.3387 + mean_vocabulary: 0.3159 + mean_volume: 0.3724 + heaps: + mean_beta: -0.1294 + mean_k: 0.7952 + 
mean_r_squared: -0.0645 + identifier_length_variance: + mean_max: 0.2172 + mean_mean: 0.4886 + mean_std_dev: 0.4918 + mean_variance: 0.9835 + indentation: + mean_blank_line_ratio: 0.3137 + mean_mean_depth: -0.4612 + mean_variance: -0.5503 + line_patterns: + mean_blank_line_ratio: 0.3137 + mean_string_literal_ratio: -0.5060 + mean_unique_line_ratio: 0.1502 + magic_number_density: + mean_density: -1.2903 + mean_magic_number_count: -0.8032 + mean_string_literal_ratio: -0.5060 + near_duplicate_blocks_file: + mean_block_count: -0.1911 + mean_near_dup_block_d0: -1.6546 + mean_near_dup_block_d7: -1.0789 + mean_sub_block_count: 0.3466 + ngram: + mean_bigram_hapax_fraction: -0.1520 + mean_bigram_repeated_unique: 0.7630 + mean_bigram_repetition_rate: 0.1469 + mean_bigram_total: 0.4887 + mean_bigram_unique: 0.4248 + mean_trigram_hapax_fraction: 0.0849 + mean_trigram_repeated_unique: 0.0415 + mean_trigram_repetition_rate: -0.2233 + mean_trigram_total: 0.4896 + mean_trigram_unique: 0.5215 + punctuation_density: + mean_arrow_density: -1.4573 + mean_bracket_nonalpha_suffix_count: 0.5999 + mean_colon_suffix_density: 0.5811 + mean_id_nonalpha_suffix_density: -0.1238 + mean_question_mark_density: -0.8032 + readability: + mean_avg_line_length: 0.3048 + mean_avg_sub_words_per_id: 0.3883 + mean_flesch_adapted: -0.7069 + mean_fog_adapted: 2.0000 + mean_total_lines: 0.4762 + symbol_density: + mean_density: -0.3071 + mean_symbol_count: 0.4654 + vocabulary: + mean_mattr: 0.3553 + mean_raw_ttr: -0.0669 + mean_total_identifiers: 0.7640 + mean_unique_identifiers: 0.6968 + vowel_density: + mean_total_chars: 1.2526 + zipf: + mean_exponent: -0.1353 + mean_r_squared: -0.0320 + mean_total_tokens: 0.4877 + mean_vocab_size: 0.3955 + +single_responsibility: + _doc: "Each file should have one primary concern — low complexity spread across few, focused functions." 
+ _log_baseline: -36.0617 + branching: + mean_branch_count: -0.0678 + mean_branching_density: 0.1364 + mean_max_nesting_depth: -0.1093 + mean_non_blank_count: -0.2043 + brevity: + mean_sample_size: -0.0864 + casing_entropy: + mean_entropy: -0.0206 + mean_other_count: -0.7475 + mean_pascal_case_count: 0.0470 + mean_snake_case_count: -0.1543 + compression: + mean_raw_bytes: -0.1908 + mean_redundancy: -0.0351 + mean_unique_line_ratio: 0.0316 + mean_zlib_bytes: -0.1293 + mean_zlib_ratio: -0.0616 + entropy: + mean_char_entropy: 0.0078 + mean_char_max_entropy: -0.0021 + mean_char_normalized: 0.0099 + mean_token_entropy: 0.0014 + mean_token_max_entropy: -0.0182 + mean_token_normalized: 0.0196 + mean_total_tokens: -0.1489 + mean_vocab_size: -0.0864 + function_metrics: + mean_avg_function_lines: 0.1696 + mean_avg_param_count: -0.0805 + mean_function_count: -0.4114 + mean_max_param_count: -0.2962 + halstead: + mean_N1_total_operators: -0.1395 + mean_N2_total_operands: -0.1701 + mean_difficulty: 0.0527 + mean_effort: -0.1183 + mean_estimated_bugs: -0.1710 + mean_length: -0.1498 + mean_n1_unique_operators: 0.0541 + mean_n2_unique_operands: -0.1687 + mean_time_to_implement_seconds: -0.1183 + mean_vocabulary: -0.0965 + mean_volume: -0.1710 + heaps: + mean_beta: -0.0154 + mean_k: 0.0801 + mean_r_squared: -0.0163 + identifier_length_variance: + mean_max: -0.0836 + mean_mean: -0.0508 + mean_std_dev: -0.0865 + mean_variance: -0.1729 + indentation: + mean_blank_line_ratio: 0.0458 + mean_mean_depth: -0.0476 + mean_variance: -0.0931 + line_patterns: + mean_blank_line_ratio: 0.0458 + mean_max_nesting_depth: -0.1093 + mean_string_literal_ratio: -0.1759 + mean_unique_line_ratio: 0.0324 + magic_number_density: + mean_density: 0.1469 + mean_string_literal_ratio: -0.1759 + near_duplicate_blocks_file: + mean_block_count: -0.2284 + mean_near_dup_block_d0: -0.2962 + mean_near_dup_block_d7: -0.3737 + mean_sub_block_count: -0.1348 + ngram: + mean_bigram_hapax_fraction: 0.0075 + 
mean_bigram_repeated_unique: -0.1303 + mean_bigram_repetition_rate: -0.0207 + mean_bigram_total: -0.1492 + mean_bigram_unique: -0.1162 + mean_trigram_hapax_fraction: 0.0132 + mean_trigram_repeated_unique: -0.1793 + mean_trigram_repetition_rate: -0.0466 + mean_trigram_total: -0.1495 + mean_trigram_unique: -0.1273 + punctuation_density: + mean_arrow_density: -0.1462 + mean_bracket_nonalpha_prefix_count: -0.0859 + mean_bracket_nonalpha_suffix_count: -0.4201 + mean_colon_suffix_density: -0.4720 + mean_dot_count: -0.0630 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: -0.0499 + mean_question_mark_density: 0.4596 + readability: + mean_avg_line_length: 0.0137 + mean_avg_sub_words_per_id: -0.0377 + mean_avg_tokens_per_line: 0.0553 + mean_flesch_adapted: 0.0348 + mean_fog_adapted: -0.0587 + mean_total_lines: -0.2043 + symbol_density: + mean_density: 0.0683 + mean_distinct_symbol_types: 0.0284 + mean_symbol_count: -0.1225 + vocabulary: + mean_mattr: -0.0285 + mean_raw_ttr: 0.0110 + mean_total_identifiers: -0.1419 + mean_unique_identifiers: -0.1309 + vowel_density: + mean_total_chars: -0.1927 + zipf: + mean_exponent: -0.0209 + mean_r_squared: -0.0043 + mean_total_tokens: -0.1489 + mean_vocab_size: -0.0864 + +uses_standard_indentation_width: + _doc: "Indentation should use consistent multiples of 2 or 4 spaces throughout the file." 
+ _log_baseline: -17.9172 + compression: + mean_raw_bytes: -0.2512 + mean_redundancy: -0.0906 + mean_zlib_bytes: -0.0351 + mean_zlib_ratio: -0.2161 + entropy: + mean_char_entropy: 0.1510 + mean_char_normalized: 0.1510 + function_metrics: + mean_avg_function_lines: 0.0361 + indentation: + mean_blank_line_ratio: 0.2077 + mean_max_depth: -1.0000 + mean_mean_depth: -1.0000 + mean_variance: -2.0000 + line_patterns: + mean_blank_line_ratio: 0.2077 + near_duplicate_blocks_file: + mean_near_dup_block_d3: -1.0000 + mean_near_dup_block_d4: 1.0000 + punctuation_density: + mean_exclamation_density: 0.2630 + mean_question_mark_density: 0.2630 + readability: + mean_avg_line_length: -0.2644 + symbol_density: + mean_density: 0.2512 + diff --git a/priv/combined_metrics/function_design.yml b/priv/combined_metrics/function_design.yml new file mode 100644 index 0000000..cb1d808 --- /dev/null +++ b/priv/combined_metrics/function_design.yml @@ -0,0 +1,832 @@ +boolean_function_has_question_mark: + _doc: "Functions returning a boolean should end with `?` (Elixir/Ruby) or start with `is_`/`has_` (JS/Python)." 
+ _log_baseline: 7.0991 + brevity: + mean_sample_size: 0.0085 + compression: + mean_raw_bytes: 0.0063 + mean_zlib_bytes: 0.0085 + entropy: + mean_char_entropy: 0.0045 + mean_char_max_entropy: 0.0028 + mean_token_entropy: 0.0034 + mean_total_tokens: 0.0248 + mean_vocab_size: 0.0085 + heaps: + mean_beta: -0.0104 + mean_k: 0.0270 + ngram: + mean_bigram_hapax_fraction: -0.0039 + mean_bigram_repeated_unique: 0.0190 + mean_bigram_repetition_rate: 0.0261 + mean_bigram_total: 0.0250 + mean_bigram_unique: 0.0039 + mean_trigram_repeated_unique: 0.0300 + mean_trigram_repetition_rate: 0.0180 + mean_trigram_total: 0.0251 + mean_trigram_unique: 0.0207 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 1.0697 + mean_exclamation_density: -0.0344 + mean_id_nonalpha_suffix_density: -0.0248 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0067 + mean_avg_tokens_per_line: 0.0248 + mean_fog_adapted: 0.0248 + symbol_density: + mean_density: 0.0490 + mean_distinct_symbol_types: 0.0404 + mean_symbol_count: 0.0554 + zipf: + mean_exponent: 0.0072 + mean_total_tokens: 0.0248 + mean_vocab_size: 0.0085 + +cyclomatic_complexity_under_10: + _doc: "Functions should have a cyclomatic complexity under 10." 
+ _log_baseline: -1.4896 + branching: + mean_branch_count: -0.2373 + mean_branching_density: -0.1952 + mean_non_blank_count: -0.0421 + casing_entropy: + mean_entropy: 0.0964 + mean_other_count: 0.3306 + mean_snake_case_count: 0.0321 + compression: + mean_raw_bytes: -0.0162 + mean_redundancy: -0.0172 + mean_unique_line_ratio: -0.0305 + mean_zlib_bytes: 0.0186 + mean_zlib_ratio: -0.0347 + entropy: + mean_char_entropy: 0.0349 + mean_char_max_entropy: 0.0016 + mean_char_normalized: 0.0333 + mean_token_entropy: -0.0050 + mean_token_normalized: -0.0050 + mean_total_tokens: 0.0437 + function_metrics: + mean_avg_function_lines: -0.4757 + mean_function_count: 0.4636 + mean_max_function_lines: -0.5038 + halstead: + mean_N1_total_operators: 0.0708 + mean_N2_total_operands: 0.0358 + mean_difficulty: 0.0472 + mean_effort: 0.1087 + mean_estimated_bugs: 0.0615 + mean_length: 0.0605 + mean_n1_unique_operators: 0.0114 + mean_time_to_implement_seconds: 0.1087 + mean_vocabulary: 0.0043 + mean_volume: 0.0615 + heaps: + mean_beta: -0.0367 + mean_k: 0.0672 + mean_r_squared: 0.0049 + identifier_length_variance: + mean_mean: 0.0130 + mean_std_dev: 0.0120 + mean_variance: 0.0240 + indentation: + mean_blank_line_ratio: 0.1655 + mean_max_depth: -0.2086 + mean_mean_depth: -0.2901 + mean_variance: -0.4637 + line_patterns: + mean_blank_line_ratio: 0.1655 + mean_string_literal_ratio: -0.0439 + mean_unique_line_ratio: 0.0055 + magic_number_density: + mean_density: -0.0329 + mean_string_literal_ratio: -0.0439 + near_duplicate_blocks_file: + mean_block_count: 0.1013 + mean_near_dup_block_d8: -0.2086 + mean_sub_block_count: 0.0994 + ngram: + mean_bigram_hapax_fraction: -0.0068 + mean_bigram_repeated_unique: 0.0301 + mean_bigram_repetition_rate: 0.0115 + mean_bigram_total: 0.0438 + mean_bigram_unique: 0.0192 + mean_trigram_hapax_fraction: -0.0027 + mean_trigram_repeated_unique: 0.0456 + mean_trigram_repetition_rate: 0.0055 + mean_trigram_total: 0.0440 + mean_trigram_unique: 0.0388 + 
punctuation_density: + mean_arrow_density: -0.4960 + mean_bracket_nonalpha_prefix_count: 0.2488 + mean_bracket_nonalpha_suffix_count: 0.3306 + mean_colon_suffix_density: 0.2760 + mean_dot_count: -0.3005 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0616 + readability: + mean_avg_line_length: 0.0257 + mean_avg_sub_words_per_id: 0.0067 + mean_avg_tokens_per_line: 0.0858 + mean_flesch_adapted: -0.0140 + mean_fog_adapted: 0.1035 + mean_total_lines: -0.0421 + symbol_density: + mean_density: 0.0552 + mean_distinct_symbol_types: -0.0172 + mean_symbol_count: 0.0391 + vocabulary: + mean_mattr: -0.0361 + mean_raw_ttr: -0.0361 + mean_total_identifiers: 0.0441 + mean_unique_identifiers: 0.0080 + vowel_density: + mean_total_chars: 0.0572 + zipf: + mean_exponent: 0.0120 + mean_r_squared: 0.0057 + mean_total_tokens: 0.0437 + +has_verb_in_name: + _doc: "Function names should contain a verb describing the action performed." + _log_baseline: 14.8350 + compression: + mean_raw_bytes: 0.0816 + mean_redundancy: -0.0390 + mean_zlib_bytes: 0.2011 + mean_zlib_ratio: -0.1195 + identifier_length_variance: + mean_max: 0.7747 + mean_mean: 0.2058 + mean_std_dev: 1.0000 + mean_variance: 2.0000 + punctuation_density: + mean_exclamation_density: -0.1076 + readability: + mean_avg_line_length: 0.0846 + mean_avg_sub_words_per_id: 0.1330 + mean_flesch_adapted: -0.1324 + mean_fog_adapted: 1.3261 + symbol_density: + mean_density: -0.0828 + vowel_density: + mean_total_chars: 0.2058 + +is_less_than_20_lines: + _doc: "Functions should be 20 lines or fewer." 
+ _log_baseline: 23.9658 + branching: + mean_branch_count: -0.0820 + mean_branching_density: -0.1010 + mean_max_nesting_depth: -0.1156 + mean_non_blank_count: 0.0188 + brevity: + mean_sample_size: 0.0165 + casing_entropy: + mean_entropy: 0.0577 + mean_other_count: 0.6266 + mean_pascal_case_count: 0.0440 + mean_snake_case_count: 0.0910 + compression: + mean_raw_bytes: 0.0746 + mean_redundancy: 0.0227 + mean_unique_line_ratio: -0.0334 + mean_zlib_bytes: 0.0366 + mean_zlib_ratio: 0.0379 + entropy: + mean_char_entropy: 0.0020 + mean_token_entropy: -0.0041 + mean_token_max_entropy: 0.0035 + mean_token_normalized: -0.0076 + mean_total_tokens: 0.0759 + mean_vocab_size: 0.0165 + function_metrics: + mean_avg_function_lines: -0.3598 + mean_avg_param_count: 0.1156 + mean_function_count: 0.3705 + mean_max_function_lines: -0.4532 + mean_max_param_count: 0.0820 + halstead: + mean_N1_total_operators: 0.0857 + mean_N2_total_operands: 0.0965 + mean_difficulty: 0.0624 + mean_effort: 0.1550 + mean_estimated_bugs: 0.0926 + mean_length: 0.0895 + mean_n1_unique_operators: -0.0097 + mean_n2_unique_operands: 0.0245 + mean_time_to_implement_seconds: 0.1550 + mean_vocabulary: 0.0143 + mean_volume: 0.0926 + heaps: + mean_k: -0.0254 + identifier_length_variance: + mean_mean: 0.0122 + mean_std_dev: 0.0297 + mean_variance: 0.0593 + indentation: + mean_blank_line_ratio: -0.0440 + mean_mean_depth: -0.0962 + mean_variance: -0.1115 + line_patterns: + mean_blank_line_ratio: -0.0440 + mean_max_nesting_depth: -0.1156 + mean_string_literal_ratio: -0.0774 + mean_unique_line_ratio: -0.0188 + magic_number_density: + mean_density: 0.0389 + mean_magic_number_count: 0.1156 + mean_string_literal_ratio: -0.0774 + near_duplicate_blocks_file: + mean_block_count: 0.2797 + mean_near_dup_block_d8: 0.3133 + mean_sub_block_count: 0.1886 + ngram: + mean_bigram_hapax_fraction: -0.0508 + mean_bigram_repeated_unique: 0.1067 + mean_bigram_repetition_rate: 0.0562 + mean_bigram_total: 0.0760 + mean_bigram_unique: 0.0228 + 
mean_trigram_hapax_fraction: -0.0300 + mean_trigram_repeated_unique: 0.1516 + mean_trigram_repetition_rate: 0.1014 + mean_trigram_total: 0.0761 + mean_trigram_unique: 0.0386 + punctuation_density: + mean_arrow_density: -0.3892 + mean_bracket_nonalpha_prefix_count: 0.0418 + mean_bracket_nonalpha_suffix_count: 0.0476 + mean_colon_suffix_density: 0.0941 + mean_dot_count: 0.0717 + mean_exclamation_density: -0.0820 + mean_id_nonalpha_suffix_density: 0.0518 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0576 + mean_avg_sub_words_per_id: -0.0034 + mean_avg_tokens_per_line: 0.0570 + mean_flesch_adapted: -0.0039 + mean_fog_adapted: 0.0868 + mean_total_lines: 0.0188 + symbol_density: + mean_density: -0.0084 + mean_distinct_symbol_types: 0.0127 + mean_symbol_count: 0.0662 + vocabulary: + mean_mattr: -0.0390 + mean_raw_ttr: -0.0717 + mean_total_identifiers: 0.0965 + mean_unique_identifiers: 0.0248 + vowel_density: + mean_total_chars: 0.1087 + zipf: + mean_exponent: 0.0225 + mean_r_squared: 0.0030 + mean_total_tokens: 0.0759 + mean_vocab_size: 0.0165 + +nesting_depth_under_4: + _doc: "Code should not nest deeper than 4 levels." 
+ _log_baseline: 1.0611 + branching: + mean_branch_count: -0.3267 + mean_branching_density: -0.2061 + mean_max_nesting_depth: 0.2061 + mean_non_blank_count: -0.1206 + brevity: + mean_sample_size: 0.0178 + casing_entropy: + mean_entropy: -0.0207 + mean_other_count: 0.2917 + mean_pascal_case_count: -0.2725 + mean_snake_case_count: 0.0787 + compression: + mean_raw_bytes: -0.0069 + mean_redundancy: -0.0076 + mean_unique_line_ratio: -0.0028 + mean_zlib_bytes: 0.0117 + mean_zlib_ratio: -0.0186 + entropy: + mean_char_entropy: 0.0715 + mean_char_max_entropy: -0.0017 + mean_char_normalized: 0.0732 + mean_token_entropy: -0.0118 + mean_token_max_entropy: 0.0042 + mean_token_normalized: -0.0161 + mean_total_tokens: 0.1047 + mean_vocab_size: 0.0178 + function_metrics: + mean_avg_function_lines: -0.6349 + mean_function_count: 0.5787 + mean_max_function_lines: -0.3375 + halstead: + mean_N1_total_operators: 0.1525 + mean_N2_total_operands: 0.0950 + mean_difficulty: 0.0991 + mean_effort: 0.2385 + mean_estimated_bugs: 0.1394 + mean_length: 0.1347 + mean_n1_unique_operators: 0.0229 + mean_n2_unique_operands: 0.0188 + mean_time_to_implement_seconds: 0.2385 + mean_vocabulary: 0.0202 + mean_volume: 0.1394 + heaps: + mean_beta: -0.0464 + mean_k: 0.0845 + identifier_length_variance: + mean_mean: 0.0770 + mean_std_dev: 0.1858 + mean_variance: 0.3716 + indentation: + mean_blank_line_ratio: 0.5622 + mean_max_depth: -0.3155 + mean_mean_depth: -0.3651 + mean_variance: -0.6050 + line_patterns: + mean_blank_line_ratio: 0.5622 + mean_max_nesting_depth: 0.2061 + mean_string_literal_ratio: -0.1046 + mean_unique_line_ratio: 0.0786 + magic_number_density: + mean_string_literal_ratio: -0.1046 + near_duplicate_blocks_file: + mean_block_count: 0.0856 + mean_sub_block_count: 0.1999 + ngram: + mean_bigram_hapax_fraction: -0.0645 + mean_bigram_repeated_unique: 0.1420 + mean_bigram_repetition_rate: 0.0534 + mean_bigram_total: 0.1049 + mean_bigram_unique: 0.0292 + mean_trigram_hapax_fraction: -0.0170 + 
mean_trigram_repeated_unique: 0.1274 + mean_trigram_repetition_rate: 0.0457 + mean_trigram_total: 0.1052 + mean_trigram_unique: 0.0686 + punctuation_density: + mean_arrow_density: 0.9701 + mean_bracket_nonalpha_prefix_count: 0.1748 + mean_bracket_nonalpha_suffix_count: 0.9451 + mean_colon_suffix_density: 0.8804 + mean_dot_count: -0.2520 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0946 + mean_question_mark_density: -0.1977 + readability: + mean_avg_line_length: 0.1152 + mean_avg_sub_words_per_id: 0.0220 + mean_avg_tokens_per_line: 0.2252 + mean_flesch_adapted: -0.0374 + mean_fog_adapted: 0.2252 + mean_total_lines: -0.1206 + symbol_density: + mean_density: 0.1426 + mean_symbol_count: 0.1355 + vocabulary: + mean_mattr: -0.0269 + mean_raw_ttr: -0.0269 + mean_total_identifiers: 0.0774 + mean_unique_identifiers: 0.0505 + vowel_density: + mean_total_chars: 0.1544 + zipf: + mean_exponent: 0.0250 + mean_r_squared: 0.0156 + mean_total_tokens: 0.1047 + mean_vocab_size: 0.0178 + +no_boolean_parameter: + _doc: "Functions should not take boolean parameters — a flag usually means the function does two things." 
+ _log_baseline: 13.6290 + branching: + mean_branch_count: -2.0000 + mean_branching_density: 1.0271 + mean_max_nesting_depth: -0.3263 + mean_non_blank_count: -0.0383 + brevity: + mean_sample_size: -0.0253 + casing_entropy: + mean_entropy: 0.0049 + mean_pascal_case_count: 0.1180 + mean_snake_case_count: 0.0931 + compression: + mean_raw_bytes: 0.0435 + mean_redundancy: 0.0777 + mean_unique_line_ratio: -0.0656 + mean_zlib_bytes: -0.1055 + mean_zlib_ratio: 0.1490 + entropy: + mean_char_entropy: 0.0152 + mean_char_normalized: 0.0153 + mean_token_entropy: -0.0129 + mean_token_max_entropy: -0.0055 + mean_token_normalized: -0.0073 + mean_total_tokens: 0.0692 + mean_vocab_size: -0.0253 + function_metrics: + mean_avg_function_lines: -0.3850 + mean_avg_param_count: -0.2935 + mean_function_count: 0.4338 + mean_max_function_lines: -0.5579 + halstead: + mean_N1_total_operators: 0.0393 + mean_N2_total_operands: 0.0832 + mean_difficulty: 0.0207 + mean_effort: 0.0660 + mean_estimated_bugs: 0.0453 + mean_length: 0.0543 + mean_n1_unique_operators: -0.0806 + mean_n2_unique_operands: -0.0181 + mean_time_to_implement_seconds: 0.0660 + mean_vocabulary: -0.0374 + mean_volume: 0.0453 + heaps: + mean_beta: -0.0314 + mean_k: 0.0620 + identifier_length_variance: + mean_mean: 0.0125 + mean_std_dev: 0.1858 + mean_variance: 0.3715 + indentation: + mean_blank_line_ratio: 0.4402 + mean_max_depth: -0.5579 + mean_mean_depth: -0.2880 + mean_variance: -0.8414 + line_patterns: + mean_blank_line_ratio: 0.4402 + mean_max_nesting_depth: -0.3263 + mean_string_literal_ratio: 0.0206 + mean_unique_line_ratio: 0.0101 + magic_number_density: + mean_string_literal_ratio: 0.0206 + near_duplicate_blocks_file: + mean_block_count: 0.4338 + mean_near_dup_block_d0: 1.7685 + mean_near_dup_block_d2: 1.1158 + mean_near_dup_block_d4: 1.6737 + mean_near_dup_block_d5: 1.6737 + mean_near_dup_block_d6: 1.7685 + mean_near_dup_block_d7: -0.8842 + mean_near_dup_block_d8: 0.5579 + mean_sub_block_count: 0.2775 + ngram: + 
mean_bigram_hapax_fraction: -0.1940 + mean_bigram_repeated_unique: 0.1467 + mean_bigram_repetition_rate: 0.1504 + mean_bigram_total: 0.0694 + mean_bigram_unique: -0.1127 + mean_trigram_hapax_fraction: -0.2208 + mean_trigram_repeated_unique: 0.3783 + mean_trigram_repetition_rate: 0.3150 + mean_trigram_total: 0.0695 + mean_trigram_unique: -0.1019 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.1152 + mean_bracket_nonalpha_suffix_count: 0.1075 + mean_colon_suffix_density: -0.2793 + mean_dot_count: 0.1538 + mean_exclamation_density: -0.0435 + mean_id_nonalpha_suffix_density: 0.0357 + readability: + mean_avg_line_length: 0.0817 + mean_avg_sub_words_per_id: 0.0160 + mean_avg_tokens_per_line: 0.1075 + mean_flesch_adapted: -0.0254 + mean_fog_adapted: 0.2928 + mean_total_lines: -0.0383 + symbol_density: + mean_density: 0.0479 + mean_symbol_count: 0.0916 + vocabulary: + mean_mattr: -0.0916 + mean_raw_ttr: -0.1091 + mean_total_identifiers: 0.0962 + mean_unique_identifiers: -0.0129 + vowel_density: + mean_total_chars: 0.1087 + zipf: + mean_exponent: 0.0374 + mean_total_tokens: 0.0692 + mean_vocab_size: -0.0253 + +no_magic_numbers: + _doc: "Numeric literals should be named constants, not inline magic numbers." 
+ _log_baseline: 45.8808 + branching: + mean_branch_count: -0.2708 + mean_branching_density: -0.1682 + mean_non_blank_count: -0.1029 + brevity: + mean_sample_size: 0.1527 + casing_entropy: + mean_entropy: -0.2908 + mean_snake_case_count: 0.4279 + compression: + mean_raw_bytes: 0.3823 + mean_redundancy: 0.0584 + mean_unique_line_ratio: 0.2269 + mean_zlib_bytes: 0.2473 + mean_zlib_ratio: 0.1350 + entropy: + mean_char_entropy: 0.0661 + mean_char_normalized: 0.0624 + mean_token_entropy: 0.0148 + mean_token_max_entropy: 0.0355 + mean_token_normalized: -0.0207 + mean_total_tokens: 0.2834 + mean_vocab_size: 0.1527 + function_metrics: + mean_avg_function_lines: -0.8758 + mean_function_count: 0.4111 + halstead: + mean_N1_total_operators: 0.1953 + mean_N2_total_operands: 0.2960 + mean_difficulty: 0.0408 + mean_effort: 0.3105 + mean_estimated_bugs: 0.2698 + mean_length: 0.2359 + mean_n1_unique_operators: -0.0413 + mean_n2_unique_operands: 0.2139 + mean_time_to_implement_seconds: 0.3105 + mean_vocabulary: 0.1447 + mean_volume: 0.2697 + heaps: + mean_beta: -0.1129 + mean_k: 0.5236 + mean_r_squared: -0.0256 + identifier_length_variance: + mean_max: 0.0987 + mean_mean: 0.3721 + mean_std_dev: 0.3878 + mean_variance: 0.7757 + indentation: + mean_blank_line_ratio: 0.2374 + mean_mean_depth: -0.3518 + mean_variance: -0.4760 + line_patterns: + mean_blank_line_ratio: 0.2374 + mean_string_literal_ratio: -0.2880 + mean_unique_line_ratio: 0.2337 + magic_number_density: + mean_density: -0.2831 + mean_string_literal_ratio: -0.2880 + near_duplicate_blocks_file: + mean_block_count: -0.7894 + mean_near_dup_block_d0: -1.1158 + mean_near_dup_block_d7: -1.1158 + mean_sub_block_count: 0.2708 + ngram: + mean_bigram_hapax_fraction: -0.1437 + mean_bigram_repeated_unique: 0.4787 + mean_bigram_repetition_rate: 0.1545 + mean_bigram_total: 0.2844 + mean_bigram_unique: 0.1437 + mean_trigram_hapax_fraction: -0.0207 + mean_trigram_repeated_unique: 0.2787 + mean_trigram_repetition_rate: 0.1465 + 
mean_trigram_total: 0.2854 + mean_trigram_unique: 0.1843 + punctuation_density: + mean_arrow_density: -1.1699 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 1.9476 + mean_id_nonalpha_suffix_density: 0.3448 + readability: + mean_avg_line_length: 0.5035 + mean_avg_sub_words_per_id: 0.2699 + mean_avg_tokens_per_line: 0.3863 + mean_flesch_adapted: -0.3819 + mean_fog_adapted: 1.0656 + mean_total_lines: -0.1029 + symbol_density: + mean_density: -0.0314 + mean_distinct_symbol_types: 0.0644 + mean_symbol_count: 0.3512 + vocabulary: + mean_mattr: 0.0058 + mean_raw_ttr: -0.0081 + mean_total_identifiers: 0.3908 + mean_unique_identifiers: 0.3826 + vowel_density: + mean_total_chars: 0.7629 + zipf: + mean_exponent: 0.0164 + mean_r_squared: 0.0321 + mean_total_tokens: 0.2834 + mean_vocab_size: 0.1527 + +parameter_count_under_4: + _doc: "Functions should take fewer than 4 parameters." + _log_baseline: 1.9637 + branching: + mean_non_blank_count: 0.0967 + brevity: + mean_sample_size: 0.0261 + casing_entropy: + mean_entropy: 0.5731 + mean_other_count: 0.5408 + mean_pascal_case_count: 0.2329 + mean_snake_case_count: -0.0351 + compression: + mean_raw_bytes: -0.0343 + mean_redundancy: -0.0308 + mean_unique_line_ratio: -0.0166 + mean_zlib_bytes: 0.0291 + mean_zlib_ratio: -0.0634 + entropy: + mean_char_entropy: 0.0082 + mean_char_max_entropy: 0.0175 + mean_char_normalized: -0.0093 + mean_token_entropy: 0.0206 + mean_token_max_entropy: 0.0063 + mean_token_normalized: 0.0144 + mean_total_tokens: -0.0335 + mean_vocab_size: 0.0261 + function_metrics: + mean_avg_function_lines: 0.1262 + mean_avg_param_count: -0.3179 + mean_function_count: -0.0320 + mean_max_function_lines: 0.2037 + mean_max_param_count: -0.2847 + halstead: + mean_N1_total_operators: -0.0264 + mean_N2_total_operands: -0.0507 + mean_difficulty: 0.0180 + mean_effort: -0.0147 + mean_estimated_bugs: -0.0326 + mean_length: -0.0353 + mean_n1_unique_operators: 0.0613 + mean_n2_unique_operands: -0.0074 + 
mean_time_to_implement_seconds: -0.0147 + mean_vocabulary: 0.0108 + mean_volume: -0.0327 + heaps: + mean_beta: 0.0179 + mean_k: -0.0082 + mean_r_squared: -0.0062 + identifier_length_variance: + mean_mean: -0.0239 + mean_std_dev: -0.0185 + mean_variance: -0.0371 + indentation: + mean_blank_line_ratio: 0.0518 + mean_max_depth: 0.1362 + mean_mean_depth: 0.0506 + mean_variance: 0.1451 + line_patterns: + mean_blank_line_ratio: 0.0518 + mean_string_literal_ratio: 0.1674 + mean_unique_line_ratio: -0.0137 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: 0.1674 + near_duplicate_blocks_file: + mean_block_count: 0.0967 + mean_near_dup_block_d7: -0.4658 + ngram: + mean_bigram_hapax_fraction: 0.0479 + mean_bigram_repeated_unique: -0.0222 + mean_bigram_repetition_rate: -0.0480 + mean_bigram_total: -0.0336 + mean_bigram_unique: 0.0376 + mean_trigram_hapax_fraction: 0.0610 + mean_trigram_repeated_unique: -0.1263 + mean_trigram_repetition_rate: -0.1619 + mean_trigram_total: -0.0337 + mean_trigram_unique: 0.0524 + punctuation_density: + mean_bracket_nonalpha_prefix_count: -0.0967 + mean_bracket_nonalpha_suffix_count: -0.1131 + mean_colon_suffix_density: 0.0056 + mean_dot_count: 0.9099 + mean_id_nonalpha_suffix_density: -0.0434 + readability: + mean_avg_line_length: -0.1345 + mean_avg_sub_words_per_id: -0.0093 + mean_avg_tokens_per_line: -0.1302 + mean_flesch_adapted: 0.0271 + mean_fog_adapted: -0.1290 + mean_total_lines: 0.0967 + symbol_density: + mean_density: 0.0124 + mean_distinct_symbol_types: 0.1042 + mean_symbol_count: -0.0218 + vocabulary: + mean_mattr: 0.0150 + mean_raw_ttr: 0.0153 + mean_total_identifiers: -0.0153 + vowel_density: + mean_total_chars: -0.0393 + zipf: + mean_exponent: 0.0101 + mean_r_squared: -0.0074 + mean_total_tokens: -0.0335 + mean_vocab_size: 0.0261 + +uses_ternary_expression: + _doc: "Simple conditional assignments should use inline expressions rather than full if-blocks." 
+ _log_baseline: -4.5289 + branching: + mean_branch_count: -0.4160 + mean_branching_density: 0.1134 + mean_non_blank_count: -0.5296 + brevity: + mean_sample_size: 0.0095 + casing_entropy: + mean_entropy: 0.0068 + mean_snake_case_count: -0.0141 + compression: + mean_raw_bytes: -0.0819 + mean_redundancy: -0.0629 + mean_unique_line_ratio: 0.1604 + mean_zlib_bytes: 0.0148 + mean_zlib_ratio: -0.0967 + entropy: + mean_char_entropy: 0.0664 + mean_char_normalized: 0.0636 + mean_token_entropy: -0.0078 + mean_token_normalized: -0.0101 + mean_total_tokens: 0.0859 + mean_vocab_size: 0.0095 + function_metrics: + mean_avg_function_lines: -0.6785 + mean_function_count: 0.2434 + mean_max_function_lines: -0.4160 + halstead: + mean_N1_total_operators: 0.1567 + mean_N2_total_operands: 0.0551 + mean_difficulty: 0.0844 + mean_effort: 0.2135 + mean_estimated_bugs: 0.1291 + mean_length: 0.1267 + mean_n1_unique_operators: 0.0293 + mean_time_to_implement_seconds: 0.2135 + mean_vocabulary: 0.0101 + mean_volume: 0.1291 + heaps: + mean_beta: -0.0301 + mean_k: 0.0594 + identifier_length_variance: + mean_mean: 0.0749 + mean_std_dev: 0.0535 + mean_variance: 0.1070 + indentation: + mean_blank_line_ratio: 0.5054 + mean_max_depth: -0.2434 + mean_mean_depth: -0.3243 + mean_variance: -0.5454 + line_patterns: + mean_blank_line_ratio: 0.5054 + mean_string_literal_ratio: -0.0855 + mean_unique_line_ratio: 0.1630 + magic_number_density: + mean_density: -0.0859 + mean_string_literal_ratio: -0.0855 + near_duplicate_blocks_file: + mean_block_count: -0.2821 + mean_near_dup_block_d0: -2.0000 + mean_sub_block_count: 0.2434 + ngram: + mean_bigram_hapax_fraction: -0.0546 + mean_bigram_repeated_unique: 0.2141 + mean_bigram_repetition_rate: 0.0785 + mean_bigram_total: 0.0863 + mean_bigram_unique: 0.0432 + mean_trigram_hapax_fraction: -0.0165 + mean_trigram_repeated_unique: 0.1339 + mean_trigram_repetition_rate: 0.1178 + mean_trigram_total: 0.0866 + mean_trigram_unique: 0.0400 + punctuation_density: + 
mean_bracket_nonalpha_prefix_count: 0.2713 + mean_bracket_nonalpha_suffix_count: 0.5255 + mean_bracket_number_pair_count: 0.4160 + mean_colon_suffix_density: 1.7729 + mean_dot_count: -1.1679 + mean_id_nonalpha_suffix_density: 0.1908 + readability: + mean_avg_line_length: 0.4657 + mean_avg_sub_words_per_id: 0.0312 + mean_avg_tokens_per_line: 0.6155 + mean_flesch_adapted: -0.0682 + mean_fog_adapted: 0.5360 + mean_total_lines: -0.5296 + symbol_density: + mean_density: 0.3167 + mean_distinct_symbol_types: 0.0364 + mean_symbol_count: 0.2350 + vocabulary: + mean_mattr: -0.0068 + mean_raw_ttr: -0.0068 + mean_total_identifiers: -0.0117 + mean_unique_identifiers: -0.0185 + vowel_density: + mean_total_chars: 0.0632 + zipf: + mean_exponent: 0.0320 + mean_r_squared: 0.0133 + mean_total_tokens: 0.0859 + mean_vocab_size: 0.0095 + diff --git a/priv/combined_metrics/naming_conventions.yml b/priv/combined_metrics/naming_conventions.yml new file mode 100644 index 0000000..aa6c6d5 --- /dev/null +++ b/priv/combined_metrics/naming_conventions.yml @@ -0,0 +1,264 @@ +class_name_is_noun: + _doc: "Class and module names should be nouns describing what they represent, not verbs or gerunds." 
+ _log_baseline: 2.9861 + brevity: + mean_sample_size: 0.7106 + compression: + mean_raw_bytes: 0.1346 + mean_redundancy: -0.0605 + mean_zlib_bytes: 0.2139 + mean_zlib_ratio: -0.0794 + entropy: + mean_token_entropy: 0.1236 + mean_token_max_entropy: 0.1716 + mean_token_normalized: -0.0484 + mean_vocab_size: 0.7106 + halstead: + mean_difficulty: -1.1493 + mean_effort: -0.9669 + mean_estimated_bugs: 0.1818 + mean_n2_unique_operands: 1.1492 + mean_time_to_implement_seconds: -0.9669 + mean_vocabulary: 0.7462 + mean_volume: 0.1823 + heaps: + mean_beta: 0.4086 + mean_k: -0.6266 + identifier_length_variance: + mean_max: -0.4031 + mean_mean: 0.3287 + mean_std_dev: -0.8347 + mean_variance: -1.6695 + ngram: + mean_bigram_hapax_fraction: 0.2542 + mean_bigram_repeated_unique: -0.5967 + mean_bigram_repetition_rate: -0.9599 + mean_bigram_unique: 0.6173 + mean_trigram_hapax_fraction: 0.2449 + mean_trigram_repeated_unique: -2.0000 + mean_trigram_repetition_rate: -1.9547 + mean_trigram_unique: 0.6002 + punctuation_density: + mean_exclamation_density: -0.3314 + readability: + mean_avg_line_length: 0.1418 + symbol_density: + mean_density: -0.1381 + vocabulary: + mean_mattr: 1.4020 + mean_raw_ttr: 1.4020 + mean_unique_identifiers: 1.4020 + vowel_density: + mean_total_chars: 0.3287 + zipf: + mean_exponent: -0.2180 + mean_vocab_size: 0.7106 + +file_name_matches_primary_export: + _doc: "The file name should match the primary class or module it exports (e.g. `user.js` exports `User`)." + _fix_hint: "Rename the file to match the primary module it defines" + _languages: [elixir] + _log_baseline: 0.0000 + casing_entropy: + mean_pascal_case_count: 0.0000 + vocabulary: + mean_unique_identifiers: 0.0000 + +function_name_is_not_single_word: + _doc: "Single-word function names like `run`, `process`, or `handle` are too vague to convey intent." 
+ _log_baseline: 17.8470 + compression: + mean_raw_bytes: 0.2434 + mean_redundancy: 0.0776 + mean_zlib_bytes: 0.1029 + mean_zlib_ratio: 0.1405 + entropy: + mean_char_entropy: 0.0241 + mean_char_normalized: 0.0241 + identifier_length_variance: + mean_max: 0.7685 + mean_mean: 0.5825 + mean_std_dev: 1.0000 + mean_variance: 2.0000 + readability: + mean_avg_line_length: 0.2559 + mean_avg_sub_words_per_id: 0.3083 + mean_flesch_adapted: -0.3181 + mean_fog_adapted: 1.3258 + symbol_density: + mean_density: -0.2431 + vowel_density: + mean_total_chars: 0.5825 + +function_name_matches_return_type: + _doc: "Functions prefixed with `get_`, `fetch_`, or `find_` should return the thing they name." + _log_baseline: 7.5638 + branching: + mean_max_nesting_depth: 0.1335 + brevity: + mean_sample_size: 0.0257 + casing_entropy: + mean_entropy: 0.0310 + mean_other_count: 0.0347 + mean_snake_case_count: -0.0296 + compression: + mean_raw_bytes: -0.0190 + mean_redundancy: -0.0180 + mean_unique_line_ratio: -0.0104 + mean_zlib_bytes: 0.0143 + mean_zlib_ratio: -0.0332 + entropy: + mean_char_entropy: 0.0079 + mean_char_max_entropy: 0.0071 + mean_token_max_entropy: 0.0059 + mean_token_normalized: -0.0045 + mean_total_tokens: 0.0030 + mean_vocab_size: 0.0257 + halstead: + mean_N1_total_operators: 0.0392 + mean_N2_total_operands: -0.0539 + mean_difficulty: 0.0029 + mean_effort: 0.0080 + mean_estimated_bugs: 0.0050 + mean_n1_unique_operators: 0.0629 + mean_n2_unique_operands: 0.0060 + mean_time_to_implement_seconds: 0.0080 + mean_vocabulary: 0.0218 + mean_volume: 0.0050 + heaps: + mean_beta: 0.0291 + mean_k: -0.0519 + mean_r_squared: 0.0038 + identifier_length_variance: + mean_max: 0.1082 + mean_std_dev: 0.0326 + mean_variance: 0.0653 + line_patterns: + mean_max_nesting_depth: 0.1335 + mean_string_literal_ratio: -0.0027 + mean_unique_line_ratio: -0.0108 + magic_number_density: + mean_density: -0.0108 + mean_string_literal_ratio: -0.0027 + near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.5899 
+ mean_near_dup_block_d5: -0.2282 + mean_near_dup_block_d7: 0.2282 + mean_sub_block_count: 0.0314 + ngram: + mean_bigram_hapax_fraction: 0.0106 + mean_bigram_repeated_unique: 0.0095 + mean_bigram_repetition_rate: -0.0167 + mean_bigram_total: 0.0030 + mean_bigram_unique: 0.0261 + mean_trigram_hapax_fraction: 0.0174 + mean_trigram_repeated_unique: -0.0297 + mean_trigram_repetition_rate: -0.0444 + mean_trigram_total: 0.0030 + mean_trigram_unique: 0.0245 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.1048 + mean_colon_suffix_density: -0.0027 + mean_dot_count: 0.1335 + mean_id_nonalpha_suffix_density: 0.0266 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0175 + mean_avg_sub_words_per_id: 0.0087 + mean_avg_tokens_per_line: 0.0030 + mean_flesch_adapted: -0.0107 + mean_fog_adapted: 0.0058 + symbol_density: + mean_density: 0.0633 + mean_distinct_symbol_types: 0.0639 + mean_symbol_count: 0.0442 + vocabulary: + mean_mattr: 0.0350 + mean_raw_ttr: 0.0299 + mean_total_identifiers: -0.0225 + mean_unique_identifiers: 0.0074 + vowel_density: + mean_total_chars: -0.0235 + zipf: + mean_exponent: -0.0047 + mean_r_squared: 0.0105 + mean_total_tokens: 0.0030 + mean_vocab_size: 0.0257 + +test_name_starts_with_verb: + _doc: "Test descriptions should start with a verb: `creates`, `raises`, `returns`, not a noun phrase." 
+ _log_baseline: 7.8915 + branching: + mean_branch_count: 1.9977 + mean_branching_density: 2.0000 + brevity: + mean_sample_size: 0.0694 + casing_entropy: + mean_entropy: -0.0711 + mean_snake_case_count: 0.1381 + compression: + mean_raw_bytes: 0.0914 + mean_redundancy: 0.0182 + mean_zlib_bytes: 0.0482 + mean_zlib_ratio: 0.0431 + entropy: + mean_char_max_entropy: 0.0064 + mean_char_normalized: -0.0121 + mean_token_entropy: 0.0259 + mean_token_max_entropy: 0.0155 + mean_token_normalized: 0.0104 + mean_total_tokens: 0.0600 + mean_vocab_size: 0.0694 + halstead: + mean_N1_total_operators: 0.0411 + mean_difficulty: 0.0577 + mean_effort: 0.0855 + mean_estimated_bugs: 0.0277 + mean_length: 0.0240 + mean_n1_unique_operators: 0.0577 + mean_time_to_implement_seconds: 0.0855 + mean_vocabulary: 0.0164 + mean_volume: 0.0278 + heaps: + mean_beta: -0.0149 + mean_k: 0.0795 + mean_r_squared: -0.0081 + identifier_length_variance: + mean_std_dev: -0.0192 + mean_variance: -0.0384 + line_patterns: + mean_string_literal_ratio: -0.0611 + magic_number_density: + mean_string_literal_ratio: -0.0611 + ngram: + mean_bigram_hapax_fraction: -0.0506 + mean_bigram_repeated_unique: 0.1209 + mean_bigram_repetition_rate: 0.0150 + mean_bigram_total: 0.0602 + mean_bigram_unique: 0.0621 + mean_trigram_hapax_fraction: -0.0206 + mean_trigram_repeated_unique: 0.0961 + mean_trigram_repetition_rate: 0.0117 + mean_trigram_total: 0.0603 + mean_trigram_unique: 0.0596 + punctuation_density: + mean_arrow_density: -0.1129 + mean_colon_suffix_density: -0.0591 + mean_id_nonalpha_suffix_density: -0.0602 + readability: + mean_avg_line_length: 0.0943 + mean_avg_tokens_per_line: 0.0600 + mean_fog_adapted: 0.0600 + symbol_density: + mean_density: -0.0912 + vocabulary: + mean_mattr: 0.0463 + mean_total_identifiers: 0.1129 + mean_unique_identifiers: 0.1161 + vowel_density: + mean_total_chars: 0.1122 + zipf: + mean_exponent: -0.0239 + mean_total_tokens: 0.0600 + mean_vocab_size: 0.0694 + diff --git 
a/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/bad/notifications.ex b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/bad/notifications.ex new file mode 100644 index 0000000..71bbb89 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/bad/notifications.ex @@ -0,0 +1,84 @@ +defmodule Notifications do + @moduledoc 'Handles sending notifications and emails to users' + + @default_sender 'noreply@example.com' + @support_email "support@example.com" + + def send_welcome_email(user) do + subject = "Welcome to the platform" + body = 'Hi ' <> user.name <> ", welcome aboard!" + sender = @default_sender + + deliver_email(%{ + to: user.email, + from: sender, + subject: subject, + body: body, + reply_to: 'noreply@example.com' + }) + end + + def send_password_reset(user, token) do + link = "https://example.com/reset/" <> token + subject = 'Reset your password' + body = "Click the link to reset your password: " <> link + + deliver_email(%{ + to: user.email, + from: @default_sender, + subject: subject, + body: body + }) + end + + def send_order_confirmation(user, order) do + subject = 'Order #' <> Integer.to_string(order.id) <> " confirmed" + body = "Thank you for your order, " <> user.name <> "!" + + deliver_email(%{ + to: user.email, + from: "orders@example.com", + subject: subject, + body: body, + cc: 'orders@example.com' + }) + end + + def send_invoice(user, invoice) do + subject = "Invoice " <> invoice.number + body = 'Please find your invoice attached.' 
+ + deliver_email(%{ + to: user.email, + from: @default_sender, + subject: subject, + body: body, + attachment: invoice.pdf_path + }) + end + + def send_support_reply(ticket, message) do + subject = 'Re: Support Ticket #' <> Integer.to_string(ticket.id) + body = "Hello,\n\n" <> message <> '\n\nBest regards,\nSupport Team' + + deliver_email(%{ + to: ticket.user_email, + from: @support_email, + subject: subject, + body: body + }) + end + + def format_greeting(name, locale) do + case locale do + "en" -> 'Hello, ' <> name <> "!" + "de" -> "Hallo, " <> name <> '!' + _ -> "Hi, " <> name + end + end + + defp deliver_email(params) do + IO.puts("Sending email to: " <> params.to) + {:ok, params} + end +end diff --git a/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/config.yml b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/config.yml new file mode 100644 index 0000000..ffdf597 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/config.yml @@ -0,0 +1 @@ +doc: "Files should use a single, consistent string quoting style throughout." diff --git a/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/good/notifications.ex b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/good/notifications.ex new file mode 100644 index 0000000..2b8d576 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/consistent_string_quote_style/good/notifications.ex @@ -0,0 +1,82 @@ +defmodule Notifications do + @moduledoc "Handles sending notifications and emails to users" + + @default_sender "noreply@example.com" + @support_email "support@example.com" + + def send_welcome_email(user) do + subject = "Welcome to the platform" + body = "Hi #{user.name}, welcome aboard!" 
+ + deliver_email(%{ + to: user.email, + from: @default_sender, + subject: subject, + body: body, + reply_to: "noreply@example.com" + }) + end + + def send_password_reset(user, token) do + link = "https://example.com/reset/#{token}" + subject = "Reset your password" + body = "Click the link to reset your password: #{link}" + + deliver_email(%{ + to: user.email, + from: @default_sender, + subject: subject, + body: body + }) + end + + def send_order_confirmation(user, order) do + subject = "Order ##{order.id} confirmed" + body = "Thank you for your order, #{user.name}!" + + deliver_email(%{ + to: user.email, + from: "orders@example.com", + subject: subject, + body: body, + cc: "orders@example.com" + }) + end + + def send_invoice(user, invoice) do + subject = "Invoice #{invoice.number}" + body = "Please find your invoice attached." + + deliver_email(%{ + to: user.email, + from: @default_sender, + subject: subject, + body: body, + attachment: invoice.pdf_path + }) + end + + def send_support_reply(ticket, message) do + subject = "Re: Support Ticket ##{ticket.id}" + body = "Hello,\n\n#{message}\n\nBest regards,\nSupport Team" + + deliver_email(%{ + to: ticket.user_email, + from: @support_email, + subject: subject, + body: body + }) + end + + def format_greeting(name, locale) do + case locale do + "en" -> "Hello, #{name}!" + "de" -> "Hallo, #{name}!" + _ -> "Hi, #{name}" + end + end + + defp deliver_email(params) do + {:ok, params} + end +end diff --git a/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/bad/fetcher.go b/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/bad/fetcher.go new file mode 100644 index 0000000..fb8b2cc --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/bad/fetcher.go @@ -0,0 +1,64 @@ +package fetcher + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// Article represents a remote article resource. 
+type Article struct { + ID string + Title string + Body string +} + +// ArticleFetcher retrieves articles from a remote API. +// Context is stored as a struct field — an anti-pattern that ties the instance +// to a single request lifetime and makes cancellation hard to reason about. +type ArticleFetcher struct { + base string + client *http.Client + ctx context.Context // anti-pattern: context stored in struct +} + +func New(ctx context.Context, base string) *ArticleFetcher { + return &ArticleFetcher{base: base, client: &http.Client{}, ctx: ctx} +} + +// FetchByID retrieves an article by ID using the stored context. +// Callers cannot supply per-call cancellation. +func (f *ArticleFetcher) FetchByID(id string) (*Article, error) { + url := fmt.Sprintf("%s/articles/%s", f.base, id) + // Uses f.ctx from the struct — callers cannot override it. + req, err := http.NewRequestWithContext(f.ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("fetch article %q: %w", id, err) + } + + resp, err := f.client.Do(req) + if err != nil { + return nil, fmt.Errorf("fetch article %q: %w", id, err) + } + defer resp.Body.Close() + + var a Article + if err := json.NewDecoder(resp.Body).Decode(&a); err != nil { + return nil, fmt.Errorf("fetch article %q: decode: %w", id, err) + } + return &a, nil +} + +// FetchAll retrieves multiple articles using the struct's stored context. 
+func (f *ArticleFetcher) FetchAll(ids []string) ([]*Article, error) { + out := make([]*Article, 0, len(ids)) + for _, id := range ids { + a, err := f.FetchByID(id) + if err != nil { + return nil, err + } + out = append(out, a) + } + return out, nil +} diff --git a/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/good/fetcher.go b/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/good/fetcher.go new file mode 100644 index 0000000..ce3182e --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/context_not_stored_in_struct/good/fetcher.go @@ -0,0 +1,61 @@ +package fetcher + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// Article represents a remote article resource. +type Article struct { + ID string + Title string + Body string +} + +// ArticleFetcher retrieves articles from a remote API. +// Context is accepted as a function parameter, not stored in the struct. +type ArticleFetcher struct { + base string + client *http.Client +} + +func New(base string) *ArticleFetcher { + return &ArticleFetcher{base: base, client: &http.Client{}} +} + +// FetchByID retrieves an article by ID using the provided context for cancellation. +// Context is passed explicitly — it is not stored on ArticleFetcher. +func (f *ArticleFetcher) FetchByID(ctx context.Context, id string) (*Article, error) { + url := fmt.Sprintf("%s/articles/%s", f.base, id) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("fetch article %q: build request: %w", id, err) + } + + resp, err := f.client.Do(req) + if err != nil { + return nil, fmt.Errorf("fetch article %q: %w", id, err) + } + defer resp.Body.Close() + + var a Article + if err := json.NewDecoder(resp.Body).Decode(&a); err != nil { + return nil, fmt.Errorf("fetch article %q: decode: %w", id, err) + } + return &a, nil +} + +// FetchAll retrieves multiple articles using the provided context. 
+func (f *ArticleFetcher) FetchAll(ctx context.Context, ids []string) ([]*Article, error) { + out := make([]*Article, 0, len(ids)) + for _, id := range ids { + a, err := f.FetchByID(ctx, id) + if err != nil { + return nil, err + } + out = append(out, a) + } + return out, nil +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/bad/worker.rs b/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/bad/worker.rs new file mode 100644 index 0000000..b8b6111 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/bad/worker.rs @@ -0,0 +1,51 @@ +use std::fmt; + +pub trait Task: fmt::Debug + Send { + fn name(&self) -> &str; + fn execute(&self) -> Result; +} + +#[derive(Debug)] +pub struct EmailTask { + pub recipient: String, + pub subject: String, +} + +impl Task for EmailTask { + fn name(&self) -> &str { "email" } + fn execute(&self) -> Result { + Ok(format!("sent email to {}", self.recipient)) + } +} + +#[derive(Debug)] +pub struct ReportTask { + pub report_id: u64, +} + +impl Task for ReportTask { + fn name(&self) -> &str { "report" } + fn execute(&self) -> Result { + Ok(format!("generated report #{}", self.report_id)) + } +} + +// Bad: &Box forces callers to have an owned Box — cannot pass a +// plain reference; also adds an extra level of indirection unnecessarily. 
+pub fn run_task(task: &Box) -> Result { + println!("running task: {}", task.name()); + task.execute() +} + +pub fn run_all(tasks: &[Box]) -> Vec> { + tasks.iter().map(run_task).collect() +} + +// Same anti-pattern with a concrete generic type +pub fn log_task_name(task: &Box) { + println!("[worker] task name: {}", task.name()); +} + +pub fn describe(task: &Box) -> String { + format!("Task({})", task.name()) +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/good/worker.rs b/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/good/worker.rs new file mode 100644 index 0000000..5979a98 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_accept_box_ref_parameter/good/worker.rs @@ -0,0 +1,51 @@ +use std::fmt; + +pub trait Task: fmt::Debug + Send { + fn name(&self) -> &str; + fn execute(&self) -> Result; +} + +#[derive(Debug)] +pub struct EmailTask { + pub recipient: String, + pub subject: String, +} + +impl Task for EmailTask { + fn name(&self) -> &str { + "email" + } + fn execute(&self) -> Result { + Ok(format!("sent email to {} re: {}", self.recipient, self.subject)) + } +} + +#[derive(Debug)] +pub struct ReportTask { + pub report_id: u64, +} + +impl Task for ReportTask { + fn name(&self) -> &str { + "report" + } + fn execute(&self) -> Result { + Ok(format!("generated report #{}", self.report_id)) + } +} + +// Accept &dyn Task (or &T) rather than &Box — works with +// both owned Box and references to stack-allocated types +pub fn run_task(task: &dyn Task) -> Result { + println!("running task: {}", task.name()); + task.execute() +} + +pub fn run_all(tasks: &[Box]) -> Vec> { + // Dereference each Box to get &dyn Task — clean, no Box leaking into API + tasks.iter().map(|t| run_task(t.as_ref())).collect() +} + +pub fn log_task_name(task: &dyn Task) { + println!("[worker] task name: {}", task.name()); +} diff --git 
a/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/bad/cache.rs b/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/bad/cache.rs new file mode 100644 index 0000000..46fe6b8 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/bad/cache.rs @@ -0,0 +1,59 @@ +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +#[derive(Debug, Clone)] +pub struct CacheEntry { + pub value: V, + pub inserted_at: Instant, + pub ttl: Duration, +} + +// Bad: Vec and HashMap are wrapped in Box — adds heap indirection with no benefit +pub struct TtlCache { + // Box> adds an extra pointer hop for every lookup + store: Box>>, + // Box> is redundant — Vec already lives on the heap + eviction_order: Box>, + max_size: usize, +} + +impl TtlCache +where + K: std::hash::Hash + Eq + Clone, + V: Clone, +{ + pub fn new(max_size: usize) -> Self { + Self { + store: Box::new(HashMap::new()), + eviction_order: Box::new(Vec::new()), + max_size, + } + } + + pub fn insert(&mut self, key: K, value: V, ttl: Duration) { + if self.store.len() >= self.max_size { + self.evict_oldest(); + } + let entry = CacheEntry { value, inserted_at: Instant::now(), ttl }; + self.store.insert(key.clone(), entry); + self.eviction_order.push(key); + } + + pub fn get(&self, key: &K) -> Option<&V> { + self.store.get(key).and_then(|e| { + if e.inserted_at.elapsed() > e.ttl { None } else { Some(&e.value) } + }) + } + + // Returning Box> — caller must dereference to get slice behavior + pub fn snapshot_keys(&self) -> Box> { + Box::new(self.eviction_order.iter().cloned().collect()) + } + + fn evict_oldest(&mut self) { + if let Some(oldest) = self.eviction_order.first().cloned() { + self.store.remove(&oldest); + self.eviction_order.remove(0); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/good/cache.rs 
b/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/good/cache.rs new file mode 100644 index 0000000..2e5ceb6 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_box_collections_unnecessarily/good/cache.rs @@ -0,0 +1,61 @@ +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +#[derive(Debug, Clone)] +pub struct CacheEntry { + pub value: V, + pub inserted_at: Instant, + pub ttl: Duration, +} + +impl CacheEntry { + pub fn is_expired(&self) -> bool { + self.inserted_at.elapsed() > self.ttl + } +} + +// Vec and HashMap are used directly — no unnecessary Box wrapping +pub struct TtlCache { + store: HashMap>, + eviction_order: Vec, + max_size: usize, +} + +impl TtlCache +where + K: std::hash::Hash + Eq + Clone, +{ + pub fn new(max_size: usize) -> Self { + Self { + store: HashMap::new(), + eviction_order: Vec::new(), + max_size, + } + } + + pub fn insert(&mut self, key: K, value: V, ttl: Duration) { + if self.store.len() >= self.max_size { + self.evict_oldest(); + } + let entry = CacheEntry { value, inserted_at: Instant::now(), ttl }; + self.store.insert(key.clone(), entry); + self.eviction_order.push(key); + } + + pub fn get(&self, key: &K) -> Option<&V> { + self.store.get(key).and_then(|e| { + if e.is_expired() { None } else { Some(&e.value) } + }) + } + + pub fn keys(&self) -> Vec<&K> { + self.eviction_order.iter().collect() + } + + fn evict_oldest(&mut self) { + if let Some(oldest) = self.eviction_order.first().cloned() { + self.store.remove(&oldest); + self.eviction_order.remove(0); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/bad/buffer.rs b/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/bad/buffer.rs new file mode 100644 index 0000000..14d5808 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/bad/buffer.rs @@ -0,0 +1,44 @@ +/// Bad: uses raw pointer casts to mutate through shared 
references, +/// violating Rust's aliasing rules and causing undefined behavior. +pub struct RingBuffer { + data: Vec, + capacity: usize, +} + +impl RingBuffer { + pub fn new(capacity: usize) -> Self { + Self { data: Vec::with_capacity(capacity), capacity } + } + + // BAD: casts an immutable reference to a mutable pointer to bypass borrow rules. + // This is undefined behavior — multiple callers can hold &RingBuffer and mutate + // the same Vec simultaneously. + pub fn write_bypass(&self, byte: u8) { + let data_ptr = &self.data as *const Vec as *mut Vec; + unsafe { + (*data_ptr).push(byte); + } + } + + // BAD: same pattern — casting &[u8] pointer to *mut u8 to overwrite bytes + pub fn patch_byte(&self, index: usize, value: u8) { + if index < self.data.len() { + let ptr = self.data.as_ptr() as *mut u8; + unsafe { + *ptr.add(index) = value; + } + } + } + + pub fn len(&self) -> usize { + self.data.len() + } + + // BAD: capacity field mutated through a const-cast pointer + pub fn resize_limit(&self, new_capacity: usize) { + let cap_ptr = &self.capacity as *const usize as *mut usize; + unsafe { + *cap_ptr = new_capacity; + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/good/buffer.rs b/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/good/buffer.rs new file mode 100644 index 0000000..0119ace --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_cast_ref_to_mut_ptr/good/buffer.rs @@ -0,0 +1,61 @@ +use std::sync::Mutex; + +/// A shared ring buffer with interior mutability via Mutex. +/// Mutations go through &mut self or Mutex — no raw pointer casting. 
+pub struct RingBuffer { + data: Mutex>, + capacity: usize, +} + +impl RingBuffer { + pub fn new(capacity: usize) -> Self { + Self { + data: Mutex::new(Vec::with_capacity(capacity)), + capacity, + } + } + + pub fn write(&self, chunk: &[u8]) -> usize { + let mut data = self.data.lock().expect("mutex poisoned"); + let remaining = self.capacity.saturating_sub(data.len()); + let to_write = chunk.len().min(remaining); + data.extend_from_slice(&chunk[..to_write]); + to_write + } + + pub fn read_all(&self) -> Vec { + let mut data = self.data.lock().expect("mutex poisoned"); + std::mem::take(&mut *data) + } + + pub fn len(&self) -> usize { + self.data.lock().expect("mutex poisoned").len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +pub struct OwnedBuffer { + data: Vec, +} + +impl OwnedBuffer { + pub fn new() -> Self { + Self { data: Vec::new() } + } + + // Mutation via &mut self — safe, no raw pointers + pub fn append(&mut self, bytes: &[u8]) { + self.data.extend_from_slice(bytes); + } + + pub fn clear(&mut self) { + self.data.clear(); + } + + pub fn as_slice(&self) -> &[u8] { + &self.data + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/bad/registry.rs b/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/bad/registry.rs new file mode 100644 index 0000000..8b55b41 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/bad/registry.rs @@ -0,0 +1,56 @@ +use std::collections::HashMap; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ServiceId(pub String); + +#[derive(Debug, Clone)] +pub struct ServiceEntry { + pub id: ServiceId, + pub endpoint: String, + pub healthy: bool, +} + +pub struct Registry { + entries: HashMap, + primary_id: Option, +} + +impl Registry { + pub fn new() -> Self { + Self { entries: HashMap::new(), primary_id: None } + } + + pub fn register(&mut self, entry: ServiceEntry) { + self.entries.insert(entry.id.clone(), 
entry); + } + + pub fn set_primary(&mut self, id: ServiceId) { + self.primary_id = Some(id); + } + + // Bad: clones primary_id solely to compare — &ServiceId comparison suffices + pub fn is_primary(&self, id: &ServiceId) -> bool { + self.primary_id.clone() == Some(id.clone()) + } + + // Bad: clones candidate endpoint just to compare strings + pub fn find_by_endpoint(&self, endpoint: &str) -> Option<&ServiceEntry> { + self.entries + .values() + .find(|e| e.endpoint.clone() == endpoint.to_string()) + } + + pub fn healthy_ids(&self) -> Vec { + self.entries + .values() + .filter(|e| e.healthy) + // Bad: clones every healthy id even though callers may only iterate + .map(|e| e.id.clone()) + .collect() + } + + // Bad: clones the key just to check membership + pub fn contains(&self, id: &ServiceId) -> bool { + self.entries.contains_key(&id.clone()) + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/good/registry.rs b/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/good/registry.rs new file mode 100644 index 0000000..1bfb365 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_clone_for_comparison/good/registry.rs @@ -0,0 +1,52 @@ +use std::collections::HashMap; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ServiceId(pub String); + +#[derive(Debug, Clone)] +pub struct ServiceEntry { + pub id: ServiceId, + pub endpoint: String, + pub healthy: bool, +} + +pub struct Registry { + entries: HashMap, + primary_id: Option, +} + +impl Registry { + pub fn new() -> Self { + Self { entries: HashMap::new(), primary_id: None } + } + + pub fn register(&mut self, entry: ServiceEntry) { + self.entries.insert(entry.id.clone(), entry); + } + + pub fn set_primary(&mut self, id: ServiceId) { + self.primary_id = Some(id); + } + + // Compare by reference — no clone needed + pub fn is_primary(&self, id: &ServiceId) -> bool { + self.primary_id.as_ref() == Some(id) + } + + // Find without cloning the 
candidate + pub fn find_by_endpoint(&self, endpoint: &str) -> Option<&ServiceEntry> { + self.entries.values().find(|e| e.endpoint == endpoint) + } + + pub fn healthy_ids(&self) -> Vec<&ServiceId> { + self.entries + .values() + .filter(|e| e.healthy) + .map(|e| &e.id) + .collect() + } + + pub fn contains(&self, id: &ServiceId) -> bool { + self.entries.contains_key(id) + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/bad/pipeline.rs b/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/bad/pipeline.rs new file mode 100644 index 0000000..83d575c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/bad/pipeline.rs @@ -0,0 +1,53 @@ +pub struct Record { + pub id: u64, + pub value: f64, + pub tags: Vec, +} + +pub struct Pipeline { + steps: Vec Record>>, +} + +impl Pipeline { + pub fn new() -> Self { + Self { steps: Vec::new() } + } + + // BAD: record is declared mut but never actually mutated — fold returns new value + pub fn run(&self, mut record: Record) -> Record { + self.steps.iter().fold(record, |acc, step| step(acc)) + } +} + +// BAD: scale is declared mut but the parameter is never reassigned +pub fn normalize(records: &mut Vec, mut scale: f64) { + for r in records.iter_mut() { + r.value *= scale; + } +} + +pub fn summarize(records: &[Record]) -> (f64, f64) { + // BAD: count is declared mut but assigned once and never incremented explicitly + let mut count: usize = records.len(); + let mut sum = 0.0f64; + + // sum is mutated — but count is used as a constant after assignment + for r in records { + sum += r.value; + } + + if count == 0 { + return (0.0, 0.0); + } + (sum, sum / count as f64) +} + +// BAD: result is declared mut but never reassigned after initialization +pub fn find_max(records: &[Record]) -> Option { + let mut result = records.iter().map(|r| r.value).fold(f64::NEG_INFINITY, f64::max); + if result == f64::NEG_INFINITY { + None + } else { + Some(result) + } 
+} diff --git a/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/good/pipeline.rs b/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/good/pipeline.rs new file mode 100644 index 0000000..8b65a19 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_declare_unused_mut/good/pipeline.rs @@ -0,0 +1,48 @@ +pub struct Record { + pub id: u64, + pub value: f64, + pub tags: Vec, +} + +pub struct Pipeline { + steps: Vec Record>>, +} + +impl Pipeline { + pub fn new() -> Self { + // steps will be mutated — mut is needed here + let mut steps: Vec Record>> = Vec::new(); + steps.push(Box::new(|r| r)); // identity step + Self { steps } + } + + // No mut needed on record — we return a new one via fold + pub fn run(&self, record: Record) -> Record { + self.steps.iter().fold(record, |acc, step| step(acc)) + } + + pub fn add_step(&mut self, step: impl Fn(Record) -> Record + 'static) { + self.steps.push(Box::new(step)); + } +} + +pub fn normalize(records: &mut Vec, scale: f64) { + // records is genuinely mutated via iter_mut + for r in records.iter_mut() { + r.value *= scale; + } +} + +pub fn summarize(records: &[Record]) -> (f64, f64) { + // sum and count are mutated by the loop + let mut sum = 0.0f64; + let mut count = 0usize; + for r in records { + sum += r.value; + count += 1; + } + if count == 0 { + return (0.0, 0.0); + } + (sum, sum / count as f64) +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/bad/queue.rs b/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/bad/queue.rs new file mode 100644 index 0000000..d52641c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/bad/queue.rs @@ -0,0 +1,52 @@ +use std::collections::VecDeque; +use std::sync::{Arc, Mutex}; + +#[derive(Debug, Clone)] +pub struct Job { + pub id: u64, + pub payload: String, +} + +pub struct JobQueue { + inner: Arc>>, +} + +impl JobQueue { + pub fn 
new() -> Self { + Self { inner: Arc::new(Mutex::new(VecDeque::new())) } + } + + pub fn push(&self, job: Job) { + self.inner.lock().expect("mutex poisoned").push_back(job); + } + + // BAD: the MutexGuard is held across an .await point. + // This can deadlock (tokio Mutex panics) or block other tasks from + // acquiring the lock while the async work runs. + pub async fn process_next_bad(&self) -> Option { + let mut q = self.inner.lock().expect("mutex poisoned"); + // MutexGuard is still live here — held across the await below + let job = q.pop_front()?; + + // Awaiting while holding the guard — deadlock risk + let result = self.handle_job(&job).await; + // Guard finally dropped when this function returns, after all awaits + Some(result) + } + + // BAD: returns while guard is in scope after an await + pub async fn peek_and_log(&self) { + let q = self.inner.lock().expect("mutex poisoned"); + if let Some(job) = q.front() { + println!("next job: {}", job.id); + } + // MutexGuard q is still in scope + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + // q dropped here — after the await + } + + async fn handle_job(&self, job: &Job) -> String { + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + format!("processed job #{}: {}", job.id, job.payload) + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/good/queue.rs b/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/good/queue.rs new file mode 100644 index 0000000..0d73408 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_hold_mutex_across_await/good/queue.rs @@ -0,0 +1,50 @@ +use std::collections::VecDeque; +use std::sync::{Arc, Mutex}; + +#[derive(Debug, Clone)] +pub struct Job { + pub id: u64, + pub payload: String, +} + +pub struct JobQueue { + inner: Arc>>, +} + +impl JobQueue { + pub fn new() -> Self { + Self { inner: Arc::new(Mutex::new(VecDeque::new())) } + } + + pub fn push(&self, job: Job) { + let 
mut q = self.inner.lock().expect("mutex poisoned"); + q.push_back(job); + // Guard dropped at end of block — not held across await + } + + // Good: release the lock before awaiting, then use the extracted value + pub async fn process_next(&self) -> Option { + // Extract the job while holding the lock... + let job = { + let mut q = self.inner.lock().expect("mutex poisoned"); + q.pop_front() + // MutexGuard dropped here — before any await point + }; + + // ...then await without holding the lock + match job { + Some(j) => Some(self.handle_job(j).await), + None => None, + } + } + + async fn handle_job(&self, job: Job) -> String { + // Simulated async work (e.g., HTTP call, DB write) + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + format!("processed job #{}: {}", job.id, job.payload) + } + + pub fn len(&self) -> usize { + self.inner.lock().expect("mutex poisoned").len() + } +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/bad/importer.go b/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/bad/importer.go new file mode 100644 index 0000000..a708ed1 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/bad/importer.go @@ -0,0 +1,53 @@ +package importer + +import ( + "context" + "fmt" + "sync" +) + +type Record struct { + ID string + Data string +} + +type Storer interface { + Store(ctx context.Context, r Record) error +} + +// BulkImporter processes records by spawning one goroutine per record. +// With a large input slice this creates an unbounded number of goroutines, +// exhausting memory and file descriptors. +type BulkImporter struct { + store Storer +} + +func New(store Storer) *BulkImporter { + return &BulkImporter{store: store} +} + +// Import spawns one goroutine per record with no concurrency limit. 
+func (b *BulkImporter) Import(ctx context.Context, records []Record) error { + errs := make(chan error, len(records)) + var wg sync.WaitGroup + + for _, r := range records { + r := r + wg.Add(1) + // One goroutine per record — can spawn thousands simultaneously. + go func() { + defer wg.Done() + if err := b.store.Store(ctx, r); err != nil { + errs <- fmt.Errorf("store record %s: %w", r.ID, err) + } + }() + } + + wg.Wait() + close(errs) + + for err := range errs { + return err + } + return nil +} diff --git a/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/good/importer.go b/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/good/importer.go new file mode 100644 index 0000000..7f118e2 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/does_not_spawn_unbounded_goroutines/good/importer.go @@ -0,0 +1,58 @@ +package importer + +import ( + "context" + "fmt" + "sync" +) + +type Record struct { + ID string + Data string +} + +type Storer interface { + Store(ctx context.Context, r Record) error +} + +// BulkImporter processes records using a fixed-size worker pool. +// The number of concurrent goroutines is bounded by concurrency. +type BulkImporter struct { + store Storer + concurrency int +} + +func New(store Storer, concurrency int) *BulkImporter { + if concurrency <= 0 { + concurrency = 4 + } + return &BulkImporter{store: store, concurrency: concurrency} +} + +// Import processes all records with at most concurrency goroutines running simultaneously. 
+func (b *BulkImporter) Import(ctx context.Context, records []Record) error { + sem := make(chan struct{}, b.concurrency) + errs := make(chan error, len(records)) + var wg sync.WaitGroup + + for _, r := range records { + r := r + sem <- struct{}{} // acquire slot + wg.Add(1) + go func() { + defer wg.Done() + defer func() { <-sem }() // release slot + if err := b.store.Store(ctx, r); err != nil { + errs <- fmt.Errorf("store record %s: %w", r.ID, err) + } + }() + } + + wg.Wait() + close(errs) + + for err := range errs { + return err // return first error + } + return nil +} diff --git a/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/bad/worker.go b/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/bad/worker.go new file mode 100644 index 0000000..fe8afab --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/bad/worker.go @@ -0,0 +1,45 @@ +package worker + +import ( + "log" + "time" +) + +type EmailJob struct { + To string + Subject string + Body string +} + +type Mailer interface { + Send(job EmailJob) error +} + +// EmailWorker drains jobs from a channel. +// The goroutine has no exit condition — it leaks forever. +type EmailWorker struct { + mailer Mailer + jobs <-chan EmailJob + logger *log.Logger +} + +func NewEmailWorker(mailer Mailer, jobs <-chan EmailJob, logger *log.Logger) *EmailWorker { + return &EmailWorker{mailer: mailer, jobs: jobs, logger: logger} +} + +// Run starts the worker in a goroutine with no way to stop it. +func (w *EmailWorker) Run() { + // No context, no stop channel — this goroutine runs forever with no exit path. 
+ go func() { + for { + select { + case job := <-w.jobs: + if err := w.mailer.Send(job); err != nil { + w.logger.Printf("failed to send email to %s: %v", job.To, err) + } + default: + time.Sleep(100 * time.Millisecond) + } + } + }() +} diff --git a/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/good/worker.go b/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/good/worker.go new file mode 100644 index 0000000..046ed7b --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/goroutine_has_clear_exit_condition/good/worker.go @@ -0,0 +1,52 @@ +package worker + +import ( + "context" + "log" + "time" +) + +type EmailJob struct { + To string + Subject string + Body string +} + +type Mailer interface { + Send(job EmailJob) error +} + +// EmailWorker drains jobs from a channel until the context is cancelled. +// The goroutine has a clear exit condition: ctx.Done(). +type EmailWorker struct { + mailer Mailer + jobs <-chan EmailJob + logger *log.Logger +} + +func NewEmailWorker(mailer Mailer, jobs <-chan EmailJob, logger *log.Logger) *EmailWorker { + return &EmailWorker{mailer: mailer, jobs: jobs, logger: logger} +} + +// Run starts the worker and blocks until ctx is cancelled or jobs is closed. +func (w *EmailWorker) Run(ctx context.Context) { + for { + select { + case <-ctx.Done(): + // Clear exit: context cancelled — drain stops. + w.logger.Println("email worker shutting down") + return + case job, ok := <-w.jobs: + if !ok { + // Clear exit: channel closed — no more work. 
+ w.logger.Println("jobs channel closed, email worker exiting") + return + } + if err := w.mailer.Send(job); err != nil { + w.logger.Printf("failed to send email to %s: %v", job.To, err) + } + case <-time.After(30 * time.Second): + w.logger.Println("email worker idle heartbeat") + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/bad/DocumentStorage.cs b/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/bad/DocumentStorage.cs new file mode 100644 index 0000000..6bc1279 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/bad/DocumentStorage.cs @@ -0,0 +1,63 @@ +using System.IO; +using System.Net.Http; +using System.Text; +using System.Threading.Tasks; + +namespace Storage +{ + public class DocumentStorage + { + private readonly HttpClient _httpClient; + private readonly string _storageRoot; + + public DocumentStorage(HttpClient httpClient, string storageRoot) + { + _httpClient = httpClient; + _storageRoot = storageRoot; + } + + // I/O-bound but wrapped in Task.Run — wastes a thread pool thread + public async Task ReadDocumentAsync(string documentId) + { + var path = BuildPath(documentId); + return await Task.Run(() => File.ReadAllText(path)); + } + + // I/O-bound write wrapped in Task.Run unnecessarily + public async Task SaveDocumentAsync(string documentId, string content) + { + var path = BuildPath(documentId); + await Task.Run(() => + { + Directory.CreateDirectory(Path.GetDirectoryName(path)!); + File.WriteAllText(path, content, Encoding.UTF8); + }); + } + + // Network I/O inside Task.Run — HttpClient is already async, Task.Run adds no value + public async Task FetchFromRemoteAsync(string url) + { + return await Task.Run(async () => + { + var response = await _httpClient.GetAsync(url); + response.EnsureSuccessStatusCode(); + return await response.Content.ReadAsStringAsync(); + }); + } + + // Streaming wrapped in Task.Run — 
unnecessary thread pool hop for I/O + public async Task DownloadToFileAsync(string url, string destinationPath) + { + await Task.Run(async () => + { + var response = await _httpClient.GetAsync(url); + response.EnsureSuccessStatusCode(); + var bytes = await response.Content.ReadAsByteArrayAsync(); + File.WriteAllBytes(destinationPath, bytes); + }); + } + + private string BuildPath(string documentId) => + Path.Combine(_storageRoot, documentId[..2], documentId + ".txt"); + } +} diff --git a/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/good/DocumentStorage.cs b/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/good/DocumentStorage.cs new file mode 100644 index 0000000..8c6a734 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/io_bound_uses_async_await_not_task_run/good/DocumentStorage.cs @@ -0,0 +1,60 @@ +using System.IO; +using System.Net.Http; +using System.Text; +using System.Threading.Tasks; + +namespace Storage +{ + public class DocumentStorage + { + private readonly HttpClient _httpClient; + private readonly string _storageRoot; + + public DocumentStorage(HttpClient httpClient, string storageRoot) + { + _httpClient = httpClient; + _storageRoot = storageRoot; + } + + // I/O-bound: reads directly with async file API, no Task.Run needed + public async Task ReadDocumentAsync(string documentId) + { + var path = BuildPath(documentId); + return await File.ReadAllTextAsync(path); + } + + // I/O-bound: writes via async file API + public async Task SaveDocumentAsync(string documentId, string content) + { + var path = BuildPath(documentId); + Directory.CreateDirectory(Path.GetDirectoryName(path)!); + await File.WriteAllTextAsync(path, content, Encoding.UTF8); + } + + // I/O-bound: uses async HttpClient, not Task.Run + public async Task FetchFromRemoteAsync(string url) + { + var response = await _httpClient.GetAsync(url); + response.EnsureSuccessStatusCode(); + return await 
response.Content.ReadAsStringAsync(); + } + + // I/O-bound: streams large file without blocking + public async Task DownloadToFileAsync(string url, string destinationPath) + { + using var response = await _httpClient.GetAsync( + url, System.Net.Http.HttpCompletionOption.ResponseHeadersRead); + response.EnsureSuccessStatusCode(); + + using var contentStream = await response.Content.ReadAsStreamAsync(); + using var fileStream = new FileStream( + destinationPath, FileMode.Create, FileAccess.Write, FileShare.None, + bufferSize: 8192, useAsync: true); + + await contentStream.CopyToAsync(fileStream); + } + + private string BuildPath(string documentId) => + Path.Combine(_storageRoot, documentId[..2], documentId + ".txt"); + } +} diff --git a/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/bad/price_calculator.py b/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/bad/price_calculator.py new file mode 100644 index 0000000..939b8de --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/bad/price_calculator.py @@ -0,0 +1,47 @@ +"""Price calculator with pluggable discount and tax strategies.""" +from __future__ import annotations + +from decimal import Decimal +from typing import Callable + + +PriceTransform = Callable[[Decimal], Decimal] + + +# lambdas assigned to names — should be def statements +apply_percentage_discount = lambda percent: ( # noqa: E731 + lambda price: price * (1 - Decimal(percent) / 100) +) + +apply_flat_discount = lambda amount: ( # noqa: E731 + lambda price: max(price - amount, Decimal(0)) +) + +apply_tax = lambda rate: ( # noqa: E731 + lambda price: price * (1 + rate) +) + + +def chain_transforms(*transforms: PriceTransform) -> PriceTransform: + # lambda assigned to a local name — should be a nested def + apply_all = lambda price: [t(price) for t in transforms][-1] # noqa: E731 + return apply_all + + +# Module-level strategy functions replaced by named lambdas 
+member_price = lambda price: price * Decimal("0.90") # noqa: E731 — use def + +vip_price = lambda price: ( # noqa: E731 — use def + price * Decimal("0.80") * Decimal("1.08") +) + +format_price = lambda price: f"${price:.2f}" # noqa: E731 — use def + + +def calculate( + base_price: Decimal, + transform: PriceTransform, +) -> str: + """Apply a transform to a base price and return the formatted result.""" + final = transform(base_price) + return format_price(final) diff --git a/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/good/price_calculator.py b/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/good/price_calculator.py new file mode 100644 index 0000000..7aa46c8 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/lambda_not_assigned_to_variable/good/price_calculator.py @@ -0,0 +1,64 @@ +"""Price calculator with pluggable discount and tax strategies.""" +from __future__ import annotations + +from decimal import Decimal +from typing import Callable + + +PriceTransform = Callable[[Decimal], Decimal] + + +def apply_percentage_discount(percent: int) -> PriceTransform: + """Return a function that applies a percentage discount to a price.""" + def discount(price: Decimal) -> Decimal: + return price * (1 - Decimal(percent) / 100) + return discount + + +def apply_flat_discount(amount: Decimal) -> PriceTransform: + """Return a function that subtracts a flat amount from a price.""" + def discount(price: Decimal) -> Decimal: + return max(price - amount, Decimal(0)) + return discount + + +def apply_tax(rate: Decimal) -> PriceTransform: + """Return a function that adds a tax rate to a price.""" + def add_tax(price: Decimal) -> Decimal: + return price * (1 + rate) + return add_tax + + +def chain_transforms(*transforms: PriceTransform) -> PriceTransform: + """Combine multiple price transforms into a single function.""" + def apply_all(price: Decimal) -> Decimal: + for transform in transforms: + price = 
transform(price) + return price + return apply_all + + +def format_price(price: Decimal) -> str: + """Format a Decimal price as a currency string.""" + return f"${price:.2f}" + + +def calculate( + base_price: Decimal, + transform: PriceTransform, +) -> str: + """Apply a transform to a base price and return the formatted result.""" + final = transform(base_price) + return format_price(final) + + +# Module-level strategies defined as proper named functions, not lambdas +def member_price(price: Decimal) -> Decimal: + """10 % member discount.""" + return price * Decimal("0.90") + + +def vip_price(price: Decimal) -> Decimal: + """20 % VIP discount followed by 8 % tax.""" + discounted = price * Decimal("0.80") + return discounted * Decimal("1.08") diff --git a/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/bad/shipping.ex b/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/bad/shipping.ex new file mode 100644 index 0000000..b658fe6 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/bad/shipping.ex @@ -0,0 +1,67 @@ +defmodule MyApp.Shipping.LabelWorker do + @moduledoc """ + Generates shipping labels asynchronously. + """ + + use GenServer + + alias MyApp.Orders.Order + alias MyApp.Shipping + + def start_link(opts) do + GenServer.start_link(__MODULE__, opts, name: __MODULE__) + end + + # Bad: enqueues the entire Order struct in the message. + # This copies the full struct (including all its associations) into the + # worker's mailbox, wasting memory and defeating the purpose of async work. 
+ @spec enqueue(Order.t()) :: :ok + def enqueue(%Order{} = order) do + GenServer.cast(__MODULE__, {:generate_label, order}) + end + + @impl true + def init(_opts), do: {:ok, %{}} + + @impl true + # Bad: entire order struct is in the message payload — large copy + def handle_cast({:generate_label, %Order{} = order}, state) do + # The worker already received a (potentially stale) full struct + Task.start(fn -> Shipping.generate_label(order) end) + {:noreply, state} + end +end + +defmodule MyApp.Shipping.BatchNotifier do + @moduledoc """ + Broadcasts shipping updates. + """ + + alias MyApp.Shipping.Shipment + + # Bad: broadcasts the full Shipment struct to all subscribers. + # If 100 processes subscribe, this full struct is copied 100 times. + @spec notify_dispatched(Shipment.t()) :: :ok | {:error, term()} + def notify_dispatched(%Shipment{} = shipment) do + Phoenix.PubSub.broadcast( + MyApp.PubSub, + "shipments", + # Bad: sending the entire struct with all fields + {:shipment_dispatched, shipment} + ) + end + + # Bad: spawning a closure that captures the full struct + @spec process_async(Shipment.t()) :: :ok + def process_async(%Shipment{} = shipment) do + # Bad: the full shipment struct is captured in the closure + # and copied into the new process's heap + spawn(fn -> + Shipping.finalize(shipment) + Shipping.archive(shipment) + Shipping.notify_customer(shipment) + end) + + :ok + end +end diff --git a/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/good/shipping.ex b/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/good/shipping.ex new file mode 100644 index 0000000..9b415e0 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/minimizes_data_in_messages/good/shipping.ex @@ -0,0 +1,61 @@ +defmodule MyApp.Shipping.LabelWorker do + @moduledoc """ + Generates shipping labels asynchronously. 
Sends only the order ID + in the message — the worker re-fetches the data it needs from the + database, avoiding large struct serialisation over process boundaries. + """ + + use GenServer + + alias MyApp.Orders + alias MyApp.Shipping + + def start_link(opts) do + GenServer.start_link(__MODULE__, opts, name: __MODULE__) + end + + @doc """ + Enqueues label generation for the given order ID. + Sends only the integer ID — not the full order struct. + """ + @spec enqueue(integer()) :: :ok + def enqueue(order_id) when is_integer(order_id) do + GenServer.cast(__MODULE__, {:generate_label, order_id}) + end + + @impl true + def init(_opts), do: {:ok, %{}} + + @impl true + # Good: message carries only the order_id. + # The handler fetches the full record inside the worker process. + def handle_cast({:generate_label, order_id}, state) do + Task.start(fn -> do_generate(order_id) end) + {:noreply, state} + end + + defp do_generate(order_id) do + # Re-fetch only the fields needed for label generation + order = Orders.get_order!(order_id) + Shipping.generate_label(order) + end +end + +defmodule MyApp.Shipping.BatchNotifier do + @moduledoc """ + Broadcasts shipping updates. Sends only the shipment ID in PubSub + messages; subscribers fetch full details on demand. + """ + + @doc """ + Publishes a shipment-dispatched event with only the shipment ID. 
+ """ + @spec notify_dispatched(integer()) :: :ok | {:error, term()} + def notify_dispatched(shipment_id) when is_integer(shipment_id) do + Phoenix.PubSub.broadcast( + MyApp.PubSub, + "shipments", + {:shipment_dispatched, shipment_id} + ) + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/bad/matrix_utils.js b/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/bad/matrix_utils.js new file mode 100644 index 0000000..e624438 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/bad/matrix_utils.js @@ -0,0 +1,59 @@ +function createMatrix(rows, cols, fillValue = 0) { + const matrix = new Array(rows); + for (let r = 0; r < rows; r++) { + matrix[r] = new Array(cols).fill(fillValue); + } + return matrix; +} + +function createRange(start, end, step = 1) { + const count = Math.ceil((end - start) / step); + const result = new Array(count); + for (let i = 0; i < count; i++) { + result[i] = start + i * step; + } + return result; +} + +function createFilledArray(length, valueFn) { + const arr = new Array(length); + for (let i = 0; i < length; i++) { + arr[i] = valueFn(i); + } + return arr; +} + +function transposeMatrix(matrix) { + const rows = matrix.length; + const cols = matrix[0].length; + const result = new Array(cols); + + for (let c = 0; c < cols; c++) { + result[c] = new Array(rows); + for (let r = 0; r < rows; r++) { + result[c][r] = matrix[r][c]; + } + } + + return result; +} + +function multiplyMatrices(a, b) { + const rows = a.length; + const cols = b[0].length; + const inner = b.length; + + const result = new Array(rows); + for (let r = 0; r < rows; r++) { + result[r] = new Array(cols).fill(0); + for (let c = 0; c < cols; c++) { + for (let k = 0; k < inner; k++) { + result[r][c] += a[r][k] * b[k][c]; + } + } + } + + return result; +} + +export { createMatrix, createRange, createFilledArray, transposeMatrix, multiplyMatrices }; diff --git 
a/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/good/matrix_utils.js b/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/good/matrix_utils.js new file mode 100644 index 0000000..d7bdf20 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_array_constructor_with_arguments/good/matrix_utils.js @@ -0,0 +1,53 @@ +function createMatrix(rows, cols, fillValue = 0) { + return Array.from({ length: rows }, () => Array.from({ length: cols }, () => fillValue)); +} + +function createRange(start, end, step = 1) { + const result = []; + for (let i = start; i < end; i += step) { + result.push(i); + } + return result; +} + +function createFilledArray(length, valueFn) { + return Array.from({ length }, (_, index) => valueFn(index)); +} + +function transposeMatrix(matrix) { + const rows = matrix.length; + const cols = matrix[0].length; + const result = Array.from({ length: cols }, () => Array.from({ length: rows }, () => 0)); + + for (let r = 0; r < rows; r++) { + for (let c = 0; c < cols; c++) { + result[c][r] = matrix[r][c]; + } + } + + return result; +} + +function multiplyMatrices(a, b) { + const rows = a.length; + const cols = b[0].length; + const inner = b.length; + + const result = Array.from({ length: rows }, () => Array.from({ length: cols }, () => 0)); + + for (let r = 0; r < rows; r++) { + for (let c = 0; c < cols; c++) { + for (let k = 0; k < inner; k++) { + result[r][c] += a[r][k] * b[k][c]; + } + } + } + + return result; +} + +function flattenMatrix(matrix) { + return matrix.flat(); +} + +export { createMatrix, createRange, createFilledArray, transposeMatrix, multiplyMatrices, flattenMatrix }; diff --git a/priv/combined_metrics/samples/code_smells/no_array_delete/bad/notification_handler.ts b/priv/combined_metrics/samples/code_smells/no_array_delete/bad/notification_handler.ts new file mode 100644 index 0000000..7a13f77 --- /dev/null +++ 
b/priv/combined_metrics/samples/code_smells/no_array_delete/bad/notification_handler.ts @@ -0,0 +1,67 @@ +interface Notification { + id: string; + userId: string; + type: string; + title: string; + readAt: string | null; +} + +function removeNotificationById( + notifications: Notification[], + id: string +): Notification[] { + const index = notifications.findIndex((n) => n.id === id); + if (index !== -1) { + // delete leaves a hole (undefined) in the array instead of removing the element + delete notifications[index]; + } + return notifications; +} + +function removeReadNotifications(notifications: Notification[]): Notification[] { + for (let i = 0; i < notifications.length; i++) { + if (notifications[i].readAt !== null) { + delete notifications[i]; + } + } + return notifications; +} + +function clearBulk(notifications: Notification[], ids: Set): Notification[] { + for (let i = 0; i < notifications.length; i++) { + if (ids.has(notifications[i].id)) { + delete notifications[i]; + } + } + return notifications; +} + +class NotificationQueue { + private items: Notification[]; + + constructor(initial: Notification[] = []) { + this.items = [...initial]; + } + + enqueue(notification: Notification): void { + this.items.push(notification); + } + + remove(id: string): void { + const index = this.items.findIndex((n) => n.id === id); + if (index !== -1) { + delete this.items[index]; + } + } + + getAll(): Notification[] { + return [...this.items]; + } + + get length(): number { + return this.items.length; + } +} + +export { removeNotificationById, removeReadNotifications, clearBulk, NotificationQueue }; +export type { Notification }; diff --git a/priv/combined_metrics/samples/code_smells/no_array_delete/good/notification_handler.ts b/priv/combined_metrics/samples/code_smells/no_array_delete/good/notification_handler.ts new file mode 100644 index 0000000..942650c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_array_delete/good/notification_handler.ts @@ -0,0 
+1,64 @@ +interface Notification { + id: string; + userId: string; + type: string; + title: string; + readAt: string | null; +} + +function removeNotificationById( + notifications: Notification[], + id: string +): Notification[] { + return notifications.filter((n) => n.id !== id); +} + +function removeNotificationsByType( + notifications: Notification[], + type: string +): Notification[] { + return notifications.filter((n) => n.type !== type); +} + +function removeReadNotifications(notifications: Notification[]): Notification[] { + return notifications.filter((n) => n.readAt === null); +} + +function removeAtIndex(notifications: Notification[], index: number): Notification[] { + return [...notifications.slice(0, index), ...notifications.slice(index + 1)]; +} + +function clearBulk(notifications: Notification[], ids: Set): Notification[] { + return notifications.filter((n) => !ids.has(n.id)); +} + +class NotificationQueue { + private items: Notification[]; + + constructor(initial: Notification[] = []) { + this.items = [...initial]; + } + + enqueue(notification: Notification): void { + this.items.push(notification); + } + + remove(id: string): void { + this.items = this.items.filter((n) => n.id !== id); + } + + removeFirst(): Notification | undefined { + return this.items.shift(); + } + + getAll(): Notification[] { + return [...this.items]; + } + + get length(): number { + return this.items.length; + } +} + +export { removeNotificationById, removeNotificationsByType, removeReadNotifications, removeAtIndex, clearBulk, NotificationQueue }; +export type { Notification }; diff --git a/priv/combined_metrics/samples/code_smells/no_auto_ptr/bad/Connection.cpp b/priv/combined_metrics/samples/code_smells/no_auto_ptr/bad/Connection.cpp new file mode 100644 index 0000000..2cbdc89 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_auto_ptr/bad/Connection.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +class Socket { +public: + explicit Socket(const 
std::string& host, int port) + : host_(host), port_(port), connected_(false) {} + + void connect() { connected_ = true; } + void disconnect() { connected_ = false; } + bool isConnected() const { return connected_; } + std::string host() const { return host_; } + int port() const { return port_; } + +private: + std::string host_; + int port_; + bool connected_; +}; + +class Connection { +public: + explicit Connection(const std::string& host, int port) + : socket_(new Socket(host, port)) // std::auto_ptr — deprecated and removed in C++17 + { + socket_->connect(); + } + + ~Connection() { + if (socket_.get() && socket_->isConnected()) + socket_->disconnect(); + // auto_ptr deletes automatically, but transfer semantics are broken + } + + // auto_ptr copy "moves" ownership silently — source becomes null after copy + // This causes bugs when connection is put into a container or passed by value + Connection(const Connection& other) : socket_(other.socket_) {} // silently steals! + + bool isAlive() const { + return socket_.get() && socket_->isConnected(); + } + + std::string endpoint() const { + return socket_->host() + ":" + std::to_string(socket_->port()); + } + +private: + std::auto_ptr socket_; // std::auto_ptr: deprecated since C++11, removed in C++17 +}; + +class ConnectionPool { +public: + explicit ConnectionPool(const std::string& host, int port, std::size_t poolSize) { + for (std::size_t i = 0; i < poolSize; ++i) { + // Storing auto_ptr in a vector is undefined behavior — + // std::vector requires copyable elements; auto_ptr's copy transfers ownership + connections_.push_back(std::auto_ptr(new Connection(host, port))); + } + } + + Connection* acquire() { + for (auto& conn : connections_) { + if (conn.get() && conn->isAlive()) + return conn.get(); + } + throw std::runtime_error("No available connections"); + } + +private: + std::vector> connections_; // undefined behavior +}; diff --git a/priv/combined_metrics/samples/code_smells/no_auto_ptr/good/Connection.cpp 
b/priv/combined_metrics/samples/code_smells/no_auto_ptr/good/Connection.cpp new file mode 100644 index 0000000..7c2ecc4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_auto_ptr/good/Connection.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +class Socket { +public: + explicit Socket(const std::string& host, int port) + : host_(host), port_(port), connected_(false) {} + + void connect() { connected_ = true; } + void disconnect() noexcept { connected_ = false; } + bool isConnected() const noexcept { return connected_; } + std::string host() const { return host_; } + int port() const { return port_; } + +private: + std::string host_; + int port_; + bool connected_; +}; + +class Connection { +public: + explicit Connection(const std::string& host, int port) + : socket_(std::make_unique(host, port)) // unique_ptr — clear ownership + { + socket_->connect(); + } + + ~Connection() { + if (socket_ && socket_->isConnected()) + socket_->disconnect(); + } + + // Move is well-defined with unique_ptr + Connection(Connection&&) noexcept = default; + Connection& operator=(Connection&&) noexcept = default; + + Connection(const Connection&) = delete; + Connection& operator=(const Connection&) = delete; + + bool isAlive() const noexcept { + return socket_ && socket_->isConnected(); + } + + std::string endpoint() const { + return socket_->host() + ":" + std::to_string(socket_->port()); + } + +private: + std::unique_ptr socket_; +}; + +class ConnectionPool { +public: + explicit ConnectionPool(const std::string& host, int port, std::size_t size) { + connections_.reserve(size); + for (std::size_t i = 0; i < size; ++i) + connections_.push_back(std::make_unique(host, port)); + } + + Connection* acquire() { + for (auto& conn : connections_) { + if (conn && conn->isAlive()) + return conn.get(); + } + throw std::runtime_error("No available connections in pool"); + } + +private: + std::vector> connections_; +}; diff --git 
a/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/bad/NotificationService.cs b/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/bad/NotificationService.cs new file mode 100644 index 0000000..7380910 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/bad/NotificationService.cs @@ -0,0 +1,74 @@ +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Notifications +{ + public class NotificationService + { + private readonly HttpClient _httpClient; + private readonly string _baseUrl; + + public NotificationService(HttpClient httpClient, string baseUrl) + { + _httpClient = httpClient; + _baseUrl = baseUrl; + } + + public void Send(Notification notification) + { + var payload = JsonSerializer.Serialize(notification); + var content = new StringContent(payload, System.Text.Encoding.UTF8, "application/json"); + + // .Result blocks the calling thread and can cause deadlocks in ASP.NET contexts + var response = _httpClient.PostAsync($"{_baseUrl}/notify", content).Result; + response.EnsureSuccessStatusCode(); + } + + public IReadOnlyList GetPending(string recipientId) + { + // .Result blocks the thread; deadlock-prone in synchronization contexts + var response = _httpClient.GetAsync( + $"{_baseUrl}/notifications/pending?recipientId={recipientId}").Result; + response.EnsureSuccessStatusCode(); + + // Another .Result to block on the content read + var json = response.Content.ReadAsStringAsync().Result; + var notifications = JsonSerializer.Deserialize>(json); + return notifications?.AsReadOnly() ?? 
new List().AsReadOnly(); + } + + public void DispatchBatch(IEnumerable notifications) + { + var tasks = new List(); + foreach (var notification in notifications) + tasks.Add(SendAsync(notification)); + + // Task.WaitAll blocks the calling thread + Task.WaitAll(tasks.ToArray()); + } + + public bool IsReachable() + { + try + { + // .Wait() blocks and can deadlock + var responseTask = _httpClient.GetAsync($"{_baseUrl}/health"); + responseTask.Wait(); + return responseTask.Result.IsSuccessStatusCode; + } + catch (AggregateException) + { + return false; + } + } + + private Task SendAsync(Notification notification) + { + var payload = JsonSerializer.Serialize(notification); + var content = new StringContent(payload, System.Text.Encoding.UTF8, "application/json"); + return _httpClient.PostAsync($"{_baseUrl}/notify", content); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/good/NotificationService.cs b/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/good/NotificationService.cs new file mode 100644 index 0000000..cd4ea22 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_blocking_on_async_code/good/NotificationService.cs @@ -0,0 +1,61 @@ +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Notifications +{ + public class NotificationService + { + private readonly HttpClient _httpClient; + private readonly string _baseUrl; + + public NotificationService(HttpClient httpClient, string baseUrl) + { + _httpClient = httpClient; + _baseUrl = baseUrl; + } + + public async Task SendAsync(Notification notification) + { + var payload = JsonSerializer.Serialize(notification); + var content = new StringContent(payload, System.Text.Encoding.UTF8, "application/json"); + + var response = await _httpClient.PostAsync($"{_baseUrl}/notify", content); + response.EnsureSuccessStatusCode(); + } + + public async Task> GetPendingAsync(string 
recipientId) + { + var response = await _httpClient.GetAsync( + $"{_baseUrl}/notifications/pending?recipientId={recipientId}"); + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadAsStringAsync(); + var notifications = JsonSerializer.Deserialize>(json); + return notifications?.AsReadOnly() ?? new List().AsReadOnly(); + } + + public async Task DispatchBatchAsync(IEnumerable notifications) + { + var tasks = new List(); + foreach (var notification in notifications) + tasks.Add(SendAsync(notification)); + + await Task.WhenAll(tasks); + } + + public async Task IsReachableAsync() + { + try + { + var response = await _httpClient.GetAsync($"{_baseUrl}/health"); + return response.IsSuccessStatusCode; + } + catch (HttpRequestException) + { + return false; + } + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_class_variables/bad/account.rb b/priv/combined_metrics/samples/code_smells/no_class_variables/bad/account.rb new file mode 100644 index 0000000..4abf9fa --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_class_variables/bad/account.rb @@ -0,0 +1,46 @@ +class Account + # @@var is shared across the entire inheritance hierarchy + @@default_plan = :free + @@max_team_size = 5 + @@registered_count = 0 + + def self.default_plan + @@default_plan + end + + def self.default_plan=(plan) + @@default_plan = plan + end + + def self.max_team_size + @@max_team_size + end + + def self.registered_count + @@registered_count + end + + attr_reader :email, :plan, :team_size + + def initialize(email:, plan: @@default_plan, team_size: 1) + @email = email + @plan = plan + @team_size = team_size + @@registered_count += 1 + end + + def upgrade_plan(new_plan) + @plan = new_plan + end + + def within_team_limit? 
+ team_size <= @@max_team_size + end +end + +class EnterpriseAccount < Account + # Attempting to set subclass defaults, but @@vars are shared with Account + # Setting @@default_plan here also changes Account.default_plan — a surprise + @@default_plan = :enterprise + @@max_team_size = 500 +end diff --git a/priv/combined_metrics/samples/code_smells/no_class_variables/good/account.rb b/priv/combined_metrics/samples/code_smells/no_class_variables/good/account.rb new file mode 100644 index 0000000..c4f4de8 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_class_variables/good/account.rb @@ -0,0 +1,47 @@ +class Account + # Use instance variables on the class object instead of @@ + @default_plan = :free + @max_team_size = 5 + @registered_count = 0 + + class << self + attr_accessor :default_plan, :max_team_size + + def registered_count + @registered_count + end + + def increment_registered_count + @registered_count += 1 + end + + def reset_registered_count + @registered_count = 0 + end + end + + attr_reader :email, :plan, :team_size + + def initialize(email:, plan: Account.default_plan, team_size: 1) + @email = email + @plan = plan + @team_size = team_size + self.class.increment_registered_count + end + + def upgrade_plan(new_plan) + @plan = new_plan + end + + def within_team_limit? 
+ team_size <= self.class.max_team_size + end +end + +class EnterpriseAccount < Account + @default_plan = :enterprise + @max_team_size = 500 + + # Enterprise subclass has its own independent class-level state + # This would not be possible with @@ which leaks across the hierarchy +end diff --git a/priv/combined_metrics/samples/code_smells/no_const_enum/bad/product_repository.ts b/priv/combined_metrics/samples/code_smells/no_const_enum/bad/product_repository.ts new file mode 100644 index 0000000..aed9efc --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_const_enum/bad/product_repository.ts @@ -0,0 +1,70 @@ +// const enum is inlined at compile time, breaking at runtime when used +// from JavaScript or with isolatedModules / bundlers +const enum ProductStatus { + Draft = "draft", + Active = "active", + Archived = "archived", + OutOfStock = "out_of_stock", +} + +const enum ProductCategory { + Electronics = "electronics", + Clothing = "clothing", + Food = "food", + Books = "books", + Other = "other", +} + +const enum SortOrder { + PriceAsc = "price_asc", + PriceDesc = "price_desc", + NameAsc = "name_asc", + Newest = "newest", +} + +interface Product { + id: string; + name: string; + price: number; + status: ProductStatus; + category: ProductCategory; + stock: number; + createdAt: string; +} + +interface ProductQuery { + category?: ProductCategory; + status?: ProductStatus; + sortOrder?: SortOrder; + page?: number; + pageSize?: number; +} + +async function fetchProducts(query: ProductQuery = {}): Promise { + const params = new URLSearchParams(); + if (query.category) params.set("category", query.category); + if (query.status) params.set("status", query.status); + if (query.sortOrder) params.set("sort", query.sortOrder); + if (query.page) params.set("page", String(query.page)); + if (query.pageSize) params.set("pageSize", String(query.pageSize)); + + const response = await fetch(`/api/products?${params}`); + if (!response.ok) throw new Error(`Failed to 
fetch products: ${response.status}`); + return response.json() as Promise; +} + +function isAvailable(product: Product): boolean { + return product.status === ProductStatus.Active && product.stock > 0; +} + +function getStatusLabel(status: ProductStatus): string { + switch (status) { + case ProductStatus.Draft: return "Draft"; + case ProductStatus.Active: return "Active"; + case ProductStatus.Archived: return "Archived"; + case ProductStatus.OutOfStock: return "Out of Stock"; + } +} + +export { fetchProducts, isAvailable, getStatusLabel, ProductStatus, ProductCategory, SortOrder }; +export type { Product, ProductQuery }; diff --git a/priv/combined_metrics/samples/code_smells/no_const_enum/good/product_repository.ts b/priv/combined_metrics/samples/code_smells/no_const_enum/good/product_repository.ts new file mode 100644 index 0000000..c68ffe9 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_const_enum/good/product_repository.ts @@ -0,0 +1,69 @@ +enum ProductStatus { + Draft = "draft", + Active = "active", + Archived = "archived", + OutOfStock = "out_of_stock", +} + +enum ProductCategory { + Electronics = "electronics", + Clothing = "clothing", + Food = "food", + Books = "books", + Other = "other", +} + +enum SortOrder { + PriceAsc = "price_asc", + PriceDesc = "price_desc", + NameAsc = "name_asc", + Newest = "newest", +} + +interface Product { + id: string; + name: string; + price: number; + status: ProductStatus; + category: ProductCategory; + stock: number; + createdAt: string; +} + +interface ProductQuery { + category?: ProductCategory; + status?: ProductStatus; + sortOrder?: SortOrder; + page?: number; + pageSize?: number; +} + +async function fetchProducts(query: ProductQuery = {}): Promise { + const params = new URLSearchParams(); + if (query.category) params.set("category", query.category); + if (query.status) params.set("status", query.status); + if (query.sortOrder) params.set("sort", query.sortOrder); + if (query.page) params.set("page", 
String(query.page)); + if (query.pageSize) params.set("pageSize", String(query.pageSize)); + + const response = await fetch(`/api/products?${params}`); + if (!response.ok) throw new Error(`Failed to fetch products: ${response.status}`); + return response.json() as Promise; +} + +function isAvailable(product: Product): boolean { + return product.status === ProductStatus.Active && product.stock > 0; +} + +function getStatusLabel(status: ProductStatus): string { + const labels: Record = { + [ProductStatus.Draft]: "Draft", + [ProductStatus.Active]: "Active", + [ProductStatus.Archived]: "Archived", + [ProductStatus.OutOfStock]: "Out of Stock", + }; + return labels[status]; +} + +export { fetchProducts, isAvailable, getStatusLabel, ProductStatus, ProductCategory, SortOrder }; +export type { Product, ProductQuery }; diff --git a/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/bad/order_service.ex b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/bad/order_service.ex new file mode 100644 index 0000000..c501350 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/bad/order_service.ex @@ -0,0 +1,96 @@ +defmodule OrderService do + @moduledoc "Handles order creation, validation, and fulfillment" + + def create_order(user, items) do + if user == nil do + {:error, :user_required} + IO.puts("This will never run") + log_attempt(nil) + end + + if items == [] do + {:error, :items_required} + notify_empty_cart(user) + end + + total = calculate_total(items) + {:ok, %{user_id: user.id, items: items, total: total}} + end + + def cancel_order(order) do + case order.status do + :pending -> + {:ok, %{order | status: :cancelled}} + send_cancellation_email(order.user_id) + update_inventory(order.items) + + :shipped -> + {:error, :already_shipped} + log_cancel_attempt(order.id) + notify_support(order) + + _ -> + {:error, :invalid_status} + IO.inspect(order, label: "unexpected order") + end + end + + def 
apply_discount(order, code) do + case lookup_discount(code) do + nil -> + {:error, :invalid_code} + track_invalid_code(code) + {:error, :not_found} + + discount -> + new_total = order.total * (1 - discount.rate) + {:ok, %{order | total: new_total}} + end + end + + def validate_address(address) do + if address.zip == nil do + {:error, :zip_required} + flag_incomplete_address(address) + end + + if address.city == nil do + {:error, :city_required} + flag_incomplete_address(address) + end + + {:ok, address} + end + + def fulfill_order(order) do + case order.payment_status do + :paid -> + {:ok, %{order | status: :fulfilling}} + schedule_shipment(order) + notify_warehouse(order) + + :pending -> + {:error, :payment_pending} + retry_payment(order) + + :failed -> + {:error, :payment_failed} + notify_user_payment_failed(order.user_id) + end + end + + defp calculate_total(items), do: Enum.sum(Enum.map(items, & &1.price)) + defp lookup_discount(_code), do: nil + defp send_cancellation_email(_user_id), do: :ok + defp update_inventory(_items), do: :ok + defp log_cancel_attempt(_id), do: :ok + defp notify_support(_order), do: :ok + defp track_invalid_code(_code), do: :ok + defp flag_incomplete_address(_address), do: :ok + defp schedule_shipment(_order), do: :ok + defp notify_warehouse(_order), do: :ok + defp retry_payment(_order), do: :ok + defp notify_user_payment_failed(_user_id), do: :ok + defp log_attempt(_user), do: :ok + defp notify_empty_cart(_user), do: :ok +end diff --git a/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/config.yml b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/config.yml new file mode 100644 index 0000000..9132452 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/config.yml @@ -0,0 +1 @@ +doc: "There should be no unreachable statements after a return or early exit." 
diff --git a/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/good/order_service.ex b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/good/order_service.ex new file mode 100644 index 0000000..8262780 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_dead_code_after_return/good/order_service.ex @@ -0,0 +1,58 @@ +defmodule OrderService do + @moduledoc "Handles order creation, validation, and fulfillment" + + def create_order(nil, _items), do: {:error, :user_required} + def create_order(_user, []), do: {:error, :items_required} + + def create_order(user, items) do + total = calculate_total(items) + {:ok, %{user_id: user.id, items: items, total: total}} + end + + def cancel_order(%{status: :pending} = order) do + {:ok, %{order | status: :cancelled}} + end + + def cancel_order(%{status: :shipped}) do + {:error, :already_shipped} + end + + def cancel_order(_order) do + {:error, :invalid_status} + end + + def apply_discount(order, code) do + case lookup_discount(code) do + nil -> + {:error, :invalid_code} + + discount -> + new_total = order.total * (1 - discount.rate) + {:ok, %{order | total: new_total}} + end + end + + def validate_address(%{zip: nil}), do: {:error, :zip_required} + def validate_address(%{city: nil}), do: {:error, :city_required} + def validate_address(address), do: {:ok, address} + + def fulfill_order(%{payment_status: :paid} = order) do + updated = %{order | status: :fulfilling} + schedule_shipment(updated) + notify_warehouse(updated) + {:ok, updated} + end + + def fulfill_order(%{payment_status: :pending}) do + {:error, :payment_pending} + end + + def fulfill_order(%{payment_status: :failed}) do + {:error, :payment_failed} + end + + defp calculate_total(items), do: Enum.sum(Enum.map(items, & &1.price)) + defp lookup_discount(_code), do: nil + defp schedule_shipment(_order), do: :ok + defp notify_warehouse(_order), do: :ok +end diff --git 
a/priv/combined_metrics/samples/code_smells/no_debug_print_statements/bad/payment.ex b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/bad/payment.ex new file mode 100644 index 0000000..2c75241 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/bad/payment.ex @@ -0,0 +1,97 @@ +defmodule Payment do + @moduledoc "Handles payment processing and refunds" + + def charge(user, amount, card) do + IO.puts("charging user: #{user.id}") + IO.inspect(card, label: "card details") + IO.inspect(amount, label: "amount") + + case validate_card(card) do + {:ok, validated} -> + IO.puts("card validated successfully") + IO.inspect(validated, label: "validated card") + result = call_payment_gateway(validated, amount) + IO.inspect(result, label: "gateway result") + result + + {:error, reason} -> + IO.puts("card validation failed: #{inspect(reason)}") + {:error, reason} + end + end + + def refund(transaction_id, amount) do + IO.puts("starting refund for transaction: #{transaction_id}") + + case fetch_transaction(transaction_id) do + {:ok, transaction} -> + IO.inspect(transaction, label: "found transaction") + + if transaction.amount < amount do + IO.puts("refund amount exceeds original") + {:error, :exceeds_original} + else + IO.puts("processing refund of #{amount}") + result = call_refund_api(transaction, amount) + IO.inspect(result, label: "refund result") + result + end + + {:error, :not_found} -> + IO.puts("transaction not found: #{transaction_id}") + {:error, :not_found} + end + end + + def calculate_fee(amount, method) do + IO.inspect({amount, method}, label: "fee calculation input") + + fee = + case method do + :credit_card -> amount * 0.029 + 0.30 + :debit_card -> amount * 0.015 + :bank_transfer -> 0.25 + _ -> amount * 0.035 + end + + IO.puts("calculated fee: #{fee}") + fee + end + + def authorize(user, amount) do + IO.inspect(user, label: "authorizing user") + IO.puts("checking balance for #{user.id}, amount: #{amount}") + 
+ cond do + user.balance >= amount -> + IO.puts("authorization approved") + {:ok, :authorized} + + true -> + IO.puts("insufficient funds: #{user.balance} < #{amount}") + {:error, :insufficient_funds} + end + end + + def apply_coupon(total, coupon_code) do + IO.puts("applying coupon: #{coupon_code}") + + case lookup_coupon(coupon_code) do + {:ok, coupon} -> + IO.inspect(coupon, label: "coupon found") + discounted = total - coupon.discount + IO.puts("new total after coupon: #{discounted}") + {:ok, discounted} + + {:error, _} -> + IO.puts("coupon not found: #{coupon_code}") + {:error, :invalid_coupon} + end + end + + defp validate_card(card), do: {:ok, card} + defp call_payment_gateway(_card, _amount), do: {:ok, %{transaction_id: "txn_123"}} + defp fetch_transaction(_id), do: {:ok, %{amount: 100.0}} + defp call_refund_api(_transaction, _amount), do: {:ok, :refunded} + defp lookup_coupon(_code), do: {:error, :not_found} +end diff --git a/priv/combined_metrics/samples/code_smells/no_debug_print_statements/config.yml b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/config.yml new file mode 100644 index 0000000..5f85825 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/config.yml @@ -0,0 +1 @@ +doc: "Debug output (`console.log`, `IO.inspect`, `fmt.Println`) must not be left in committed code." 
diff --git a/priv/combined_metrics/samples/code_smells/no_debug_print_statements/good/payment.ex b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/good/payment.ex new file mode 100644 index 0000000..b9b5b1c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_debug_print_statements/good/payment.ex @@ -0,0 +1,66 @@ +defmodule Payment do + @moduledoc "Handles payment processing and refunds" + + require Logger + + def charge(user, amount, card) do + case validate_card(card) do + {:ok, validated} -> + result = call_payment_gateway(validated, amount) + Logger.info("Payment charged", user_id: user.id, amount: amount) + result + + {:error, reason} -> + Logger.warning("Card validation failed", user_id: user.id, reason: inspect(reason)) + {:error, reason} + end + end + + def refund(transaction_id, amount) do + case fetch_transaction(transaction_id) do + {:ok, transaction} when transaction.amount >= amount -> + result = call_refund_api(transaction, amount) + Logger.info("Refund processed", transaction_id: transaction_id, amount: amount) + result + + {:ok, _transaction} -> + {:error, :exceeds_original} + + {:error, :not_found} -> + {:error, :not_found} + end + end + + def calculate_fee(amount, method) do + case method do + :credit_card -> amount * 0.029 + 0.30 + :debit_card -> amount * 0.015 + :bank_transfer -> 0.25 + _ -> amount * 0.035 + end + end + + def authorize(user, amount) do + if user.balance >= amount do + {:ok, :authorized} + else + {:error, :insufficient_funds} + end + end + + def apply_coupon(total, coupon_code) do + case lookup_coupon(coupon_code) do + {:ok, coupon} -> + {:ok, total - coupon.discount} + + {:error, _} -> + {:error, :invalid_coupon} + end + end + + defp validate_card(card), do: {:ok, card} + defp call_payment_gateway(_card, _amount), do: {:ok, %{transaction_id: "txn_123"}} + defp fetch_transaction(_id), do: {:ok, %{amount: 100.0}} + defp call_refund_api(_transaction, _amount), do: {:ok, :refunded} + defp 
lookup_coupon(_code), do: {:error, :not_found} +end diff --git a/priv/combined_metrics/samples/code_smells/no_double_negation/bad/cart.rb b/priv/combined_metrics/samples/code_smells/no_double_negation/bad/cart.rb new file mode 100644 index 0000000..44e3fb0 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_double_negation/bad/cart.rb @@ -0,0 +1,45 @@ +class CartPresenter + attr_reader :cart, :user + + def initialize(cart, user) + @cart = cart + @user = user + end + + def has_items? + !!cart.line_items.any? + end + + def has_coupon? + !!cart.coupon_code + end + + # !! is unnecessary — nil? already returns a boolean + def user_authenticated? + !!user + end + + def show_guest_prompt? + !!user.nil? + end + + def checkout_enabled? + !!(has_items? && user_authenticated?) + end + + def discount_applied? + !!(has_coupon? && cart.discount_amount.to_f > 0) + end + + def to_h + { + has_items: !!has_items?, + has_coupon: !!has_coupon?, + authenticated: !!user_authenticated?, + checkout_enabled: !!checkout_enabled?, + discount_applied: !!discount_applied?, + item_count: cart.line_items.size, + total: cart.total + } + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_double_negation/good/cart.rb b/priv/combined_metrics/samples/code_smells/no_double_negation/good/cart.rb new file mode 100644 index 0000000..42f421a --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_double_negation/good/cart.rb @@ -0,0 +1,44 @@ +class CartPresenter + attr_reader :cart, :user + + def initialize(cart, user) + @cart = cart + @user = user + end + + def has_items? + cart.line_items.any? + end + + def has_coupon? + cart.coupon_code.present? + end + + def user_authenticated? + !user.nil? + end + + def show_guest_prompt? + user.nil? + end + + def checkout_enabled? + has_items? && user_authenticated? + end + + def discount_applied? + has_coupon? 
&& cart.discount_amount.to_f > 0 + end + + def to_h + { + has_items: has_items?, + has_coupon: has_coupon?, + authenticated: user_authenticated?, + checkout_enabled: checkout_enabled?, + discount_applied: discount_applied?, + item_count: cart.line_items.size, + total: cart.total + } + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_finalize_override/bad/DatabaseConnection.java b/priv/combined_metrics/samples/code_smells/no_finalize_override/bad/DatabaseConnection.java new file mode 100644 index 0000000..823d0c1 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_finalize_override/bad/DatabaseConnection.java @@ -0,0 +1,67 @@ +package com.example.db; + +import java.sql.Connection; +import java.sql.SQLException; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class DatabaseConnection { + + private static final Logger logger = Logger.getLogger(DatabaseConnection.class.getName()); + + private final Connection connection; + private boolean closed = false; + + public DatabaseConnection(Connection connection) { + this.connection = connection; + } + + public QueryResult execute(String sql, Object... params) throws SQLException { + try (var stmt = connection.prepareStatement(sql)) { + for (int i = 0; i < params.length; i++) { + stmt.setObject(i + 1, params[i]); + } + return QueryResult.from(stmt.executeQuery()); + } + } + + public void beginTransaction() throws SQLException { + connection.setAutoCommit(false); + } + + public void commit() throws SQLException { + connection.commit(); + } + + public void rollback() throws SQLException { + connection.rollback(); + } + + public void close() throws SQLException { + if (!closed) { + closed = true; + connection.close(); + } + } + + /** + * Overrides Object.finalize() to close the connection when garbage collected. + * This is unreliable — finalize() may never run, or run too late, leaving + * database connections open indefinitely. 
+ */ + @Override + protected void finalize() throws Throwable { + try { + if (!closed) { + logger.log(Level.WARNING, "DatabaseConnection was not closed explicitly — closing in finalizer"); + connection.close(); + } + } finally { + super.finalize(); + } + } + + public boolean isClosed() { + return closed; + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_finalize_override/good/DatabaseConnection.java b/priv/combined_metrics/samples/code_smells/no_finalize_override/good/DatabaseConnection.java new file mode 100644 index 0000000..ed98a87 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_finalize_override/good/DatabaseConnection.java @@ -0,0 +1,77 @@ +package com.example.db; + +import java.io.Closeable; +import java.sql.Connection; +import java.sql.SQLException; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Wraps a JDBC connection. Use try-with-resources to ensure the connection + * is closed promptly; do not rely on garbage collection for cleanup. + */ +public class DatabaseConnection implements Closeable { + + private static final Logger logger = Logger.getLogger(DatabaseConnection.class.getName()); + + private final Connection connection; + private boolean closed = false; + + public DatabaseConnection(Connection connection) { + this.connection = connection; + } + + public QueryResult execute(String sql, Object... 
params) throws SQLException { + ensureOpen(); + try (var stmt = connection.prepareStatement(sql)) { + for (int i = 0; i < params.length; i++) { + stmt.setObject(i + 1, params[i]); + } + return QueryResult.from(stmt.executeQuery()); + } + } + + public void beginTransaction() throws SQLException { + ensureOpen(); + connection.setAutoCommit(false); + } + + public void commit() throws SQLException { + ensureOpen(); + connection.commit(); + } + + public void rollback() throws SQLException { + ensureOpen(); + try { + connection.rollback(); + } catch (SQLException e) { + logger.log(Level.WARNING, "Rollback failed", e); + throw e; + } + } + + @Override + public void close() { + if (!closed) { + closed = true; + try { + connection.close(); + } catch (SQLException e) { + // Log but do not propagate — close() must not throw checked exceptions. + // The connection resource is released regardless. + logger.log(Level.WARNING, "Error closing database connection", e); + } + } + } + + public boolean isClosed() { + return closed; + } + + private void ensureOpen() { + if (closed) { + throw new IllegalStateException("DatabaseConnection has already been closed"); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_fixme_comments/bad/importer.ex b/priv/combined_metrics/samples/code_smells/no_fixme_comments/bad/importer.ex new file mode 100644 index 0000000..6e5ea09 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_fixme_comments/bad/importer.ex @@ -0,0 +1,77 @@ +defmodule Importer do + @moduledoc "Handles importing data from CSV and external sources" + + # FIXME: this crashes on empty files, need to handle that + def import_csv(path) do + path + |> File.read!() + |> String.split("\n") + |> Enum.map(&parse_row/1) + |> Enum.reject(&is_nil/1) + end + + # TODO: FIXME - validate headers before parsing rows + def parse_row(line) do + case String.split(line, ",") do + [id, name, email] -> + %{id: id, name: name, email: email} + + # XXX: silently drops malformed 
rows, should log or collect errors + _ -> + nil + end + end + + def import_users(rows) do + # FIXME: this does N+1 inserts, wrap in a transaction + Enum.map(rows, fn row -> + insert_user(row) + end) + end + + def validate_row(%{email: email} = row) do + # XXX: email regex is wrong, doesn't handle subdomains + if String.match?(email, ~r/@/) do + {:ok, row} + else + {:error, :invalid_email} + end + end + + def deduplicate(rows) do + # FIXME: uses email as dedup key but doesn't normalize case first + rows + |> Enum.group_by(& &1.email) + |> Enum.map(fn {_email, [first | _rest]} -> first end) + end + + def import_from_api(source_url) do + # TODO: FIXME - add retry logic and timeout handling + case fetch_remote(source_url) do + {:ok, data} -> + parse_api_response(data) + + # XXX: swallows all errors, need proper error propagation + _ -> + [] + end + end + + def transform_row(row, field_map) do + # FIXME: doesn't handle nested fields or type coercion + Enum.reduce(field_map, %{}, fn {src, dst}, acc -> + Map.put(acc, dst, Map.get(row, src)) + end) + end + + def write_results(results, output_path) do + # XXX: overwrites file without backup, could lose data + content = Enum.map_join(results, "\n", &format_result/1) + File.write!(output_path, content) + end + + defp insert_user(row), do: {:ok, row} + defp fetch_remote(_url), do: {:ok, []} + defp parse_api_response(data), do: data + defp format_result(result), do: inspect(result) +end diff --git a/priv/combined_metrics/samples/code_smells/no_fixme_comments/config.yml b/priv/combined_metrics/samples/code_smells/no_fixme_comments/config.yml new file mode 100644 index 0000000..be88771 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_fixme_comments/config.yml @@ -0,0 +1 @@ +doc: "FIXME, XXX, and HACK comments indicate known problems that should be resolved before merging." 
diff --git a/priv/combined_metrics/samples/code_smells/no_fixme_comments/good/importer.ex b/priv/combined_metrics/samples/code_smells/no_fixme_comments/good/importer.ex new file mode 100644 index 0000000..2af79a9 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_fixme_comments/good/importer.ex @@ -0,0 +1,88 @@ +defmodule Importer do + @moduledoc "Handles importing data from CSV and external sources" + + def import_csv(path) do + case File.read(path) do + {:ok, ""} -> + {:error, :empty_file} + + {:ok, content} -> + rows = + content + |> String.split("\n", trim: true) + |> Enum.map(&parse_row/1) + |> Enum.reject(&is_nil/1) + + {:ok, rows} + + {:error, reason} -> + {:error, reason} + end + end + + def parse_row(line) do + case String.split(line, ",") do + [id, name, email] -> + %{id: id, name: name, email: email} + + _ -> + nil + end + end + + def import_users(rows) do + rows + |> Enum.map(&insert_user/1) + |> Enum.split_with(&match?({:ok, _}, &1)) + |> then(fn {ok, errors} -> {:ok, length(ok), length(errors)} end) + end + + def validate_row(%{email: email} = row) do + normalized = String.downcase(email) + + if String.match?(normalized, ~r/^[^\s@]+@[^\s@]+\.[^\s@]+$/) do + {:ok, %{row | email: normalized}} + else + {:error, :invalid_email} + end + end + + def deduplicate(rows) do + rows + |> Enum.map(fn row -> %{row | email: String.downcase(row.email)} end) + |> Enum.group_by(& &1.email) + |> Enum.map(fn {_email, [first | _rest]} -> first end) + end + + def import_from_api(source_url) do + with {:ok, data} <- fetch_remote(source_url), + {:ok, parsed} <- parse_api_response(data) do + {:ok, parsed} + end + end + + def transform_row(row, field_map) do + Enum.reduce(field_map, %{}, fn {src, dst}, acc -> + Map.put(acc, dst, Map.get(row, src)) + end) + end + + def write_results(results, output_path) do + backup_path = output_path <> ".bak" + + with :ok <- maybe_backup(output_path, backup_path), + content = Enum.map_join(results, "\n", &format_result/1), + :ok 
<- File.write(output_path, content) do + :ok + end + end + + defp maybe_backup(path, backup) do + if File.exists?(path), do: File.copy(path, backup), else: :ok + end + + defp insert_user(row), do: {:ok, row} + defp fetch_remote(_url), do: {:ok, []} + defp parse_api_response(data), do: {:ok, data} + defp format_result(result), do: inspect(result) +end diff --git a/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/bad/analytics_tracker.js b/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/bad/analytics_tracker.js new file mode 100644 index 0000000..02af07e --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/bad/analytics_tracker.js @@ -0,0 +1,59 @@ +window.ANALYTICS_ENDPOINT = "/api/analytics"; +window.ANALYTICS_API_KEY = process.env.ANALYTICS_API_KEY; +window.analyticsQueue = []; +window.analyticsSessionId = null; +window.analyticsFlushTimer = null; +const MAX_QUEUE_SIZE = 100; + +function initAnalytics() { + const stored = sessionStorage.getItem("session"); + window.analyticsSessionId = stored ?? 
crypto.randomUUID(); + sessionStorage.setItem("session", window.analyticsSessionId); +} + +function trackEvent(eventName, properties = {}) { + if (window.analyticsQueue.length >= MAX_QUEUE_SIZE) { + flushAnalytics(); + } + + window.analyticsQueue.push({ + event: eventName, + properties, + sessionId: window.analyticsSessionId, + timestamp: Date.now(), + }); + + scheduleAnalyticsFlush(); +} + +function identifyUser(userId, traits = {}) { + trackEvent("$identify", { userId, ...traits }); +} + +function scheduleAnalyticsFlush() { + if (window.analyticsFlushTimer) return; + window.analyticsFlushTimer = setTimeout(() => { + window.analyticsFlushTimer = null; + flushAnalytics(); + }, 2000); +} + +async function flushAnalytics() { + if (window.analyticsQueue.length === 0) return; + const events = window.analyticsQueue.splice(0, window.analyticsQueue.length); + await fetch(window.ANALYTICS_ENDPOINT, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Api-Key": window.ANALYTICS_API_KEY, + }, + body: JSON.stringify({ events }), + }); +} + +window.trackEvent = trackEvent; +window.identifyUser = identifyUser; +window.flushAnalytics = flushAnalytics; +window.initAnalytics = initAnalytics; + +initAnalytics(); diff --git a/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/good/analytics_tracker.js b/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/good/analytics_tracker.js new file mode 100644 index 0000000..d81f8ac --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_global_scope_pollution/good/analytics_tracker.js @@ -0,0 +1,68 @@ +const SESSION_KEY = "analytics_session"; +const MAX_QUEUE_SIZE = 100; + +class AnalyticsTracker { + constructor(config) { + this._endpoint = config.endpoint; + this._apiKey = config.apiKey; + this._queue = []; + this._sessionId = this._loadOrCreateSession(); + this._flushTimer = null; + } + + track(eventName, properties = {}) { + if (this._queue.length >= MAX_QUEUE_SIZE) { + 
this._flushQueue(); + } + + this._queue.push({ + event: eventName, + properties, + sessionId: this._sessionId, + timestamp: Date.now(), + }); + + this._scheduleFlush(); + } + + identify(userId, traits = {}) { + this.track("$identify", { userId, ...traits }); + } + + async flush() { + await this._flushQueue(); + } + + _scheduleFlush() { + if (this._flushTimer) return; + this._flushTimer = setTimeout(() => { + this._flushTimer = null; + this._flushQueue(); + }, 2000); + } + + async _flushQueue() { + if (this._queue.length === 0) return; + const events = this._queue.splice(0, this._queue.length); + await fetch(this._endpoint, { + method: "POST", + headers: { "Content-Type": "application/json", "X-Api-Key": this._apiKey }, + body: JSON.stringify({ events }), + }); + } + + _loadOrCreateSession() { + const stored = sessionStorage.getItem(SESSION_KEY); + if (stored) return stored; + const id = crypto.randomUUID(); + sessionStorage.setItem(SESSION_KEY, id); + return id; + } +} + +const tracker = new AnalyticsTracker({ + endpoint: "/api/analytics", + apiKey: process.env.ANALYTICS_API_KEY, +}); + +export { tracker, AnalyticsTracker }; diff --git a/priv/combined_metrics/samples/code_smells/no_library_global_application_config/bad/billing.ex b/priv/combined_metrics/samples/code_smells/no_library_global_application_config/bad/billing.ex new file mode 100644 index 0000000..7e639a1 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_library_global_application_config/bad/billing.ex @@ -0,0 +1,58 @@ +defmodule Acme.Billing do + @moduledoc """ + Billing library. + """ + + # Bad: reads configuration from the global application environment. + # This couples the library to a specific OTP app name and config key. + # The library cannot be used without configuring `:acme_billing` in config.exs, + # and it cannot be used with different configs simultaneously (e.g., multiple accounts). + + @doc """ + Creates a charge. Reads API key and base URL from Application env. 
+ """ + @spec create_charge(map()) :: {:ok, map()} | {:error, term()} + def create_charge(params) do + # Bad: tight coupling to global config + api_key = Application.get_env(:acme_billing, :api_key) || + raise "Acme.Billing: :api_key not configured" + + base_url = Application.get_env(:acme_billing, :base_url, "https://api.acmebilling.com") + timeout = Application.get_env(:acme_billing, :timeout_ms, 5_000) + + url = "#{base_url}/charges" + headers = [{"Authorization", "Bearer #{api_key}"}] + options = [timeout: timeout, recv_timeout: timeout] + + case HTTPoison.post(url, Jason.encode!(params), headers, options) do + {:ok, %{status_code: 201, body: body}} -> {:ok, Jason.decode!(body)} + {:ok, %{status_code: status, body: body}} -> {:error, {:http_error, status, body}} + {:error, reason} -> {:error, {:network_error, reason}} + end + end + + @doc """ + Lists recent charges. Also reads from Application env. + """ + @spec list_charges(keyword()) :: {:ok, [map()]} | {:error, term()} + def list_charges(opts \\ []) do + # Bad: same global config dependency repeated in every function + api_key = Application.get_env(:acme_billing, :api_key) || + raise "Acme.Billing: :api_key not configured" + + base_url = Application.get_env(:acme_billing, :base_url, "https://api.acmebilling.com") + timeout = Application.get_env(:acme_billing, :timeout_ms, 5_000) + retry = Application.get_env(:acme_billing, :retry_count, 3) + + limit = Keyword.get(opts, :limit, 20) + url = "#{base_url}/charges?limit=#{limit}" + headers = [{"Authorization", "Bearer #{api_key}"}] + + case HTTPoison.get(url, headers, timeout: timeout, recv_timeout: timeout) do + {:ok, %{status_code: 200, body: body}} -> {:ok, Jason.decode!(body)} + {:ok, %{status_code: status}} -> {:error, {:http_error, status}} + {:error, reason} when retry > 0 -> list_charges(opts) + {:error, reason} -> {:error, {:network_error, reason}} + end + end +end diff --git 
a/priv/combined_metrics/samples/code_smells/no_library_global_application_config/good/billing.ex b/priv/combined_metrics/samples/code_smells/no_library_global_application_config/good/billing.ex new file mode 100644 index 0000000..f656656 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_library_global_application_config/good/billing.ex @@ -0,0 +1,53 @@ +defmodule Acme.Billing do + @moduledoc """ + Billing library. All configuration is passed as explicit function + arguments rather than read from `Application.get_env/2`. + This makes the library usable in any application without side effects. + """ + + defmodule Config do + @moduledoc "Configuration struct for the Acme.Billing client." + @enforce_keys [:api_key, :base_url] + defstruct [:api_key, :base_url, timeout_ms: 5_000, retry_count: 3] + + @type t :: %__MODULE__{ + api_key: String.t(), + base_url: String.t(), + timeout_ms: pos_integer(), + retry_count: non_neg_integer() + } + end + + @doc """ + Creates a charge using the provided configuration. + Config is passed explicitly — the library does not read global app env. + """ + @spec create_charge(Config.t(), map()) :: {:ok, map()} | {:error, term()} + def create_charge(%Config{} = config, params) do + url = "#{config.base_url}/charges" + headers = [{"Authorization", "Bearer #{config.api_key}"}] + options = [timeout: config.timeout_ms, recv_timeout: config.timeout_ms] + + case HTTPoison.post(url, Jason.encode!(params), headers, options) do + {:ok, %{status_code: 201, body: body}} -> {:ok, Jason.decode!(body)} + {:ok, %{status_code: status, body: body}} -> {:error, {:http_error, status, body}} + {:error, reason} -> {:error, {:network_error, reason}} + end + end + + @doc """ + Lists recent charges. Configuration is explicit. 
+ """ + @spec list_charges(Config.t(), keyword()) :: {:ok, [map()]} | {:error, term()} + def list_charges(%Config{} = config, opts \\ []) do + limit = Keyword.get(opts, :limit, 20) + url = "#{config.base_url}/charges?limit=#{limit}" + headers = [{"Authorization", "Bearer #{config.api_key}"}] + + case HTTPoison.get(url, headers, timeout: config.timeout_ms) do + {:ok, %{status_code: 200, body: body}} -> {:ok, Jason.decode!(body)} + {:ok, %{status_code: status}} -> {:error, {:http_error, status}} + {:error, reason} -> {:error, {:network_error, reason}} + end + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/bad/task_runner.py b/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/bad/task_runner.py new file mode 100644 index 0000000..46f7234 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/bad/task_runner.py @@ -0,0 +1,61 @@ +"""Task runner that executes a queue of background jobs and logs results.""" +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Callable, Optional + + +@dataclass +class Task: + id: str + name: str + action: Callable[[], None] + retries: int = 0 + + +@dataclass +class TaskResult: + task_id: str + success: bool + started_at: datetime + finished_at: datetime + error: Optional[str] = None + + +_results: list[TaskResult] = [] + + +def run_task(task: Task) -> TaskResult: + started = datetime.utcnow() + try: + task.action() + return TaskResult(task_id=task.id, success=True, started_at=started, finished_at=datetime.utcnow()) + except Exception as exc: + return TaskResult(task_id=task.id, success=False, started_at=started, finished_at=datetime.utcnow(), error=str(exc)) + + +def run_all(tasks: list[Task]) -> list[TaskResult]: + """Run all tasks — comprehension used only for its side effect; returned list discarded.""" + results = [] + # list 
comprehension purely for the side effect of appending to results + [results.append(run_task(task)) for task in tasks] # bad: comprehension for side effects + return results + + +def notify_failures(results: list[TaskResult]) -> None: + """Log failures — comprehension builds a list of Nones that is immediately discarded.""" + [print(f"[ALERT] Task {r.task_id} failed: {r.error}") for r in results if not r.success] + + +def archive_results(results: list[TaskResult]) -> None: + """Persist results — comprehension is used purely to call append on _results.""" + [_results.append(result) for result in results] # bad: side-effect-only comprehension + + +def send_summary_emails(results: list[TaskResult], recipients: list[str]) -> None: + """Send emails — nested comprehension used entirely for its side effects.""" + [ + print(f"Sending summary to {email}: {len(results)} tasks run") + for email in recipients + ] # bad: comprehension discarded, used only for print side effect diff --git a/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/good/task_runner.py b/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/good/task_runner.py new file mode 100644 index 0000000..eb472eb --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_list_comprehension_for_side_effects/good/task_runner.py @@ -0,0 +1,76 @@ +"""Task runner that executes a queue of background jobs and logs results.""" +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Callable, Optional + + +@dataclass +class Task: + id: str + name: str + action: Callable[[], None] + retries: int = 0 + + +@dataclass +class TaskResult: + task_id: str + success: bool + started_at: datetime + finished_at: datetime + error: Optional[str] = None + + +_results: list[TaskResult] = [] + + +def run_task(task: Task) -> TaskResult: + """Execute a single task and return its result.""" + started = 
datetime.utcnow() + try: + task.action() + return TaskResult( + task_id=task.id, + success=True, + started_at=started, + finished_at=datetime.utcnow(), + ) + except Exception as exc: + return TaskResult( + task_id=task.id, + success=False, + started_at=started, + finished_at=datetime.utcnow(), + error=str(exc), + ) + + +def run_all(tasks: list[Task]) -> list[TaskResult]: + """Run all tasks and collect results using a plain for loop.""" + results = [] + for task in tasks: + result = run_task(task) + results.append(result) + return results + + +def notify_failures(results: list[TaskResult]) -> None: + """Log each failed result — side-effect loop, not a list comprehension.""" + for result in results: + if not result.success: + print(f"[ALERT] Task {result.task_id} failed: {result.error}") + + +def archive_results(results: list[TaskResult]) -> None: + """Persist results to the global store — explicit for loop makes intent clear.""" + for result in results: + _results.append(result) + + +def retry_failed(tasks: list[Task], results: list[TaskResult]) -> list[TaskResult]: + """Re-run tasks whose first attempt failed.""" + failed_ids = {r.task_id for r in results if not r.success} + retry_tasks = [t for t in tasks if t.id in failed_ids] + return run_all(retry_tasks) diff --git a/priv/combined_metrics/samples/code_smells/no_long_parameter_list/bad/accounts.ex b/priv/combined_metrics/samples/code_smells/no_long_parameter_list/bad/accounts.ex new file mode 100644 index 0000000..b8bbc26 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_long_parameter_list/bad/accounts.ex @@ -0,0 +1,62 @@ +defmodule MyApp.Accounts do + @moduledoc """ + User account management. + """ + + alias MyApp.Accounts.User + alias MyApp.Repo + + # Bad: eight positional parameters. Callers must remember the exact order. + # What is the difference between `role` and `plan`? Is `org_id` before or after? + # Easy to accidentally swap `team_id` and `org_id`. 
+ @spec register(String.t(), String.t(), String.t(), atom(), atom(), integer(), integer(), DateTime.t()) :: + {:ok, User.t()} | {:error, Ecto.Changeset.t()} + def register(name, email, password, role, plan, org_id, team_id, trial_expires_at) do + %User{} + |> User.registration_changeset(%{ + name: name, + email: email, + password: password, + role: role, + plan: plan, + organization_id: org_id, + team_id: team_id, + trial_expires_at: trial_expires_at + }) + |> Repo.insert() + end + + # Bad: sending an email with six individual string parameters + @spec send_welcome_email(String.t(), String.t(), String.t(), String.t(), String.t(), String.t()) :: :ok + def send_welcome_email(to_email, user_name, org_name, plan_name, support_email, login_url) do + MyApp.Mailer.deliver(%{ + to: to_email, + subject: "Welcome, #{user_name}!", + template: :welcome, + assigns: %{ + name: user_name, + org: org_name, + plan: plan_name, + support: support_email, + url: login_url + } + }) + + :ok + end + + # Bad: updating a user with many individual named fields — no grouping + @spec update_profile(User.t(), String.t(), String.t(), String.t(), String.t(), boolean()) :: + {:ok, User.t()} | {:error, Ecto.Changeset.t()} + def update_profile(%User{} = user, name, bio, website, timezone, notifications_enabled) do + user + |> User.profile_changeset(%{ + name: name, + bio: bio, + website: website, + timezone: timezone, + notifications_enabled: notifications_enabled + }) + |> Repo.update() + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_long_parameter_list/good/accounts.ex b/priv/combined_metrics/samples/code_smells/no_long_parameter_list/good/accounts.ex new file mode 100644 index 0000000..473fb53 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_long_parameter_list/good/accounts.ex @@ -0,0 +1,66 @@ +defmodule MyApp.Accounts do + @moduledoc """ + User account management. Related parameters are grouped into + structs rather than passed as long argument lists. 
+ """ + + alias MyApp.Accounts.{User, UserRegistration} + alias MyApp.Repo + + defmodule UserRegistration do + @moduledoc "Encapsulates all parameters needed to register a new user." + @enforce_keys [:email, :password, :name] + defstruct [ + :email, + :password, + :name, + :organization_id, + :role, + :plan, + :invite_token, + timezone: "UTC" + ] + end + + @doc """ + Registers a new user. Parameters are grouped in a `UserRegistration` struct + rather than passed as individual arguments. + """ + @spec register(UserRegistration.t()) :: {:ok, User.t()} | {:error, Ecto.Changeset.t()} + def register(%UserRegistration{} = registration) do + %User{} + |> User.registration_changeset(%{ + email: registration.email, + password: registration.password, + name: registration.name, + organization_id: registration.organization_id, + role: registration.role || :member, + plan: registration.plan || :free, + timezone: registration.timezone + }) + |> Repo.insert() + end + + @doc """ + Sends a welcome email. Takes the user struct rather than individual fields. + """ + @spec send_welcome_email(User.t()) :: :ok | {:error, term()} + def send_welcome_email(%User{} = user) do + MyApp.Mailer.deliver(%{ + to: user.email, + subject: "Welcome, #{user.name}!", + template: :welcome, + assigns: %{user: user} + }) + end + + @doc """ + Updates a user's profile. Groups the changeable fields in a map. 
+ """ + @spec update_profile(User.t(), map()) :: {:ok, User.t()} | {:error, Ecto.Changeset.t()} + def update_profile(%User{} = user, attrs) when is_map(attrs) do + user + |> User.profile_changeset(attrs) + |> Repo.update() + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/bad/OrderService.php b/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/bad/OrderService.php new file mode 100644 index 0000000..1186908 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/bad/OrderService.php @@ -0,0 +1,57 @@ +repository = $repository; + } + + // __get is used to make all properties dynamically readable — obscures what actually exists + public function __get(string $name): mixed + { + if (array_key_exists($name, $this->data)) { + return $this->data[$name]; + } + + // Silently returns null for any unknown property + return null; + } + + // __set makes all properties dynamically writable — no validation, no IDE support + public function __set(string $name, mixed $value): void + { + $this->data[$name] = $value; + } + + // __call forwards any method call to the repository — callers can't know what's supported + public function __call(string $name, array $arguments): mixed + { + if (method_exists($this->repository, $name)) { + return $this->repository->$name(...$arguments); + } + + // Falls through silently for unknown methods + return null; + } + + // __isset makes isset() work on magic properties — further obfuscating the interface + public function __isset(string $name): bool + { + return isset($this->data[$name]); + } + + // This is the only real method, but callers are expected to discover API via trial and error + public function place(int $customerId, array $items): array + { + $this->lastCustomer = $customerId; // Triggers __set — invisible + $this->lastItems = $items; // Triggers __set — invisible + + return $this->repository->createOrder($customerId, $items); + } +} diff --git 
a/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/good/OrderService.php b/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/good/OrderService.php new file mode 100644 index 0000000..b328e2d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_magic_method_abuse/good/OrderService.php @@ -0,0 +1,63 @@ +orders[$id])) { + return $this->orders[$id]; + } + + $order = $this->repository->findById($id); + if ($order !== null) { + $this->orders[$id] = $order; + } + + return $order; + } + + public function getStatus(int $orderId): string + { + $order = $this->findById($orderId); + if ($order === null) { + throw new OrderNotFoundException("Order {$orderId} not found"); + } + + return $order->getStatus(); + } + + public function updateStatus(int $orderId, string $status): void + { + $order = $this->findById($orderId); + if ($order === null) { + throw new OrderNotFoundException("Order {$orderId} not found"); + } + + $order->setStatus($status); + $this->repository->save($order); + unset($this->orders[$orderId]); + } + + public function listByCustomer(int $customerId): array + { + return $this->repository->findByCustomer($customerId); + } + + public function countByStatus(string $status): int + { + return $this->repository->countByStatus($status); + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/bad/catalog.ex b/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/bad/catalog.ex new file mode 100644 index 0000000..ab1c4fd --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/bad/catalog.ex @@ -0,0 +1,58 @@ +defmodule Acme.Catalog do + @moduledoc """ + Public API for the Acme product catalog library. + """ + + alias Acme.Catalog.Product + + @doc "Searches the catalog." + @spec search(String.t()) :: [Product.t()] + def search(query) do + Acme.Catalog.SearchIndex.query(query) + end +end + +# Bad: defines a module inside Elixir's standard `Enum` namespace. 
+# This will conflict with the standard library and confuse anyone reading the code. +defmodule Enum.CatalogUtils do + @moduledoc """ + Extra Enum utilities added by the Acme.Catalog library. + BAD: This module pollutes the `Enum` namespace which belongs to Elixir itself. + """ + + def filter_available(products) do + Enum.filter(products, & &1.available) + end + + def sort_by_price(products) do + Enum.sort_by(products, & &1.price_cents) + end +end + +# Bad: extends the `String` module with catalog-specific logic. +# Any library calling `String.normalize/1` will be confused. +defmodule String.Utils do + @moduledoc """ + String helpers added by the Acme.Catalog library. + BAD: Trespasses on the `String` namespace owned by Elixir. + """ + + def normalize(str) when is_binary(str) do + str + |> String.trim() + |> String.downcase() + |> String.replace(~r/[^\w\s]/, "") + end +end + +# Bad: opens a module under `Map` — a core Elixir namespace. +defmodule Map.ProductHelpers do + @moduledoc """ + Map utilities for products. + BAD: Pollutes the standard `Map` namespace. + """ + + def from_product(%{id: id, name: name, price_cents: price}) do + %{id: id, name: name, price_cents: price} + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/good/catalog.ex b/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/good/catalog.ex new file mode 100644 index 0000000..333e3bf --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_namespace_trespassing/good/catalog.ex @@ -0,0 +1,61 @@ +defmodule Acme.Catalog do + @moduledoc """ + Public API for the Acme product catalog library. + All modules defined by this library live under the `Acme` namespace. + """ + + alias Acme.Catalog.{Product, Category, SearchIndex} + + @doc """ + Searches the catalog for products matching the query. 
+ """ + @spec search(String.t(), keyword()) :: [Product.t()] + def search(query, opts \\ []) do + SearchIndex.query(query, opts) + end + + @doc """ + Lists all products in a category. + """ + @spec list_by_category(Category.t()) :: [Product.t()] + def list_by_category(%Category{} = category) do + Product.by_category(category) + end +end + +defmodule Acme.Catalog.Product do + @moduledoc """ + Product schema and query helpers. Stays within the `Acme.Catalog` namespace. + """ + + defstruct [:id, :name, :sku, :price_cents, :category_id] + + @type t :: %__MODULE__{ + id: integer(), + name: String.t(), + sku: String.t(), + price_cents: integer(), + category_id: integer() + } + + def by_category(%Acme.Catalog.Category{id: id}) do + # query implementation + [] + end +end + +defmodule Acme.Catalog.StringHelpers do + @moduledoc """ + String utilities specific to catalog use cases. + Kept under `Acme.Catalog` — not polluting the `String` namespace. + """ + + @doc "Normalizes a product name for indexing." + @spec normalize(String.t()) :: String.t() + def normalize(str) when is_binary(str) do + str + |> String.trim() + |> String.downcase() + |> String.replace(~r/[^\w\s]/, "") + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_nested_method_definition/bad/subscription.rb b/priv/combined_metrics/samples/code_smells/no_nested_method_definition/bad/subscription.rb new file mode 100644 index 0000000..767b739 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_nested_method_definition/bad/subscription.rb @@ -0,0 +1,39 @@ +class SubscriptionRenewalService + def initialize(billing, notifier, logger) + @billing = billing + @notifier = notifier + @logger = logger + end + + def renew(subscription) + # Methods defined inside another method do NOT create closures in Ruby. + # They are defined on the enclosing class/object, not scoped locally. + def renewable?(sub) + sub.active? || sub.in_grace_period? 
+ end + + def attempt_charge(sub) + @billing.charge( + sub.payment_method_id, + sub.plan.monthly_price_cents + ) + rescue Billing::DeclinedError => e + { success: false, error: :payment_declined } + end + + def extend_subscription(sub) + new_expiry = [sub.expires_at, Time.current].max + 30.days + sub.update!(expires_at: new_expiry, status: :active) + end + + return { success: false, skipped: true } unless renewable?(subscription) + + charge_result = attempt_charge(subscription) + return charge_result unless charge_result[:success] + + extend_subscription(subscription) + @notifier.send_renewal_confirmation(subscription.user, subscription) + + { success: true, renewed_until: subscription.expires_at } + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_nested_method_definition/good/subscription.rb b/priv/combined_metrics/samples/code_smells/no_nested_method_definition/good/subscription.rb new file mode 100644 index 0000000..f099fe4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_nested_method_definition/good/subscription.rb @@ -0,0 +1,48 @@ +class SubscriptionRenewalService + def initialize(billing, notifier, logger) + @billing = billing + @notifier = notifier + @logger = logger + end + + def renew(subscription) + return skip_result(:not_renewable) unless renewable?(subscription) + + charge_result = attempt_charge(subscription) + return charge_result unless charge_result[:success] + + extend_subscription(subscription) + notify_renewal(subscription) + + { success: true, renewed_until: subscription.expires_at } + end + + private + + def renewable?(subscription) + subscription.active? || subscription.in_grace_period? 
+ end + + def attempt_charge(subscription) + @billing.charge( + subscription.payment_method_id, + subscription.plan.monthly_price_cents + ) + rescue Billing::DeclinedError => e + @logger.warn("Renewal charge declined for #{subscription.id}: #{e.message}") + { success: false, error: :payment_declined } + end + + def extend_subscription(subscription) + new_expiry = [subscription.expires_at, Time.current].max + 30.days + subscription.update!(expires_at: new_expiry, status: :active) + end + + def notify_renewal(subscription) + @notifier.send_renewal_confirmation(subscription.user, subscription) + end + + def skip_result(reason) + { success: false, skipped: true, reason: reason } + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_nested_ternary/bad/pricing.ex b/priv/combined_metrics/samples/code_smells/no_nested_ternary/bad/pricing.ex new file mode 100644 index 0000000..4828b43 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_nested_ternary/bad/pricing.ex @@ -0,0 +1,105 @@ +defmodule Pricing do + @moduledoc "Calculates prices and discounts for products" + + def final_price(product, user, coupon) do + base = + if product.on_sale do + if product.sale_price > 0 do + product.sale_price + else + product.price * 0.9 + end + else + product.price + end + + with_membership = + if user.member do + if user.tier == :gold do + if base > 100 do + base * 0.75 + else + base * 0.85 + end + else + base * 0.9 + end + else + base + end + + if coupon != nil do + if coupon.type == :percent do + if coupon.value > 50 do + with_membership * 0.5 + else + with_membership * (1 - coupon.value / 100) + end + else + if with_membership - coupon.value > 0 do + with_membership - coupon.value + else + 0 + end + end + else + with_membership + end + end + + def shipping_cost(order, user) do + if order.total > 50 do + if user.member do + 0 + else + if order.express do + 9.99 + else + 4.99 + end + end + else + if user.member do + if order.express do + 5.99 + else + 2.99 + 
end + else + if order.express do + 14.99 + else + 7.99 + end + end + end + end + + def tax_rate(country, region, product_type) do + if country == "US" do + if region == "CA" do + if product_type == :food do + 0.0 + else + 0.0725 + end + else + if product_type == :food do + 0.0 + else + 0.05 + end + end + else + if country == "DE" do + if product_type == :food do + 0.07 + else + 0.19 + end + else + 0.0 + end + end + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_nested_ternary/config.yml b/priv/combined_metrics/samples/code_smells/no_nested_ternary/config.yml new file mode 100644 index 0000000..a87321f --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_nested_ternary/config.yml @@ -0,0 +1 @@ +doc: "Nested conditional expressions (ternary-within-ternary) are harder to read than a plain if-else." diff --git a/priv/combined_metrics/samples/code_smells/no_nested_ternary/good/pricing.ex b/priv/combined_metrics/samples/code_smells/no_nested_ternary/good/pricing.ex new file mode 100644 index 0000000..fbef9ef --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_nested_ternary/good/pricing.ex @@ -0,0 +1,54 @@ +defmodule Pricing do + @moduledoc "Calculates prices and discounts for products" + + def final_price(product, user, coupon) do + product + |> base_price() + |> apply_membership(user) + |> apply_coupon(coupon) + end + + defp base_price(%{on_sale: true, sale_price: sale}) when sale > 0, do: sale + defp base_price(%{on_sale: true, price: price}), do: price * 0.9 + defp base_price(%{price: price}), do: price + + defp apply_membership(price, %{member: true, tier: :gold}) when price > 100, do: price * 0.75 + defp apply_membership(price, %{member: true, tier: :gold}), do: price * 0.85 + defp apply_membership(price, %{member: true}), do: price * 0.9 + defp apply_membership(price, _user), do: price + + defp apply_coupon(price, nil), do: price + + defp apply_coupon(price, %{type: :percent, value: value}) when value > 50 do + price * 
0.5 + end + + defp apply_coupon(price, %{type: :percent, value: value}) do + price * (1 - value / 100) + end + + defp apply_coupon(price, %{type: :fixed, value: value}) do + max(price - value, 0) + end + + def shipping_cost(order, user) do + shipping_rate(order.total, user.member, order.express) + end + + defp shipping_rate(total, _member, _express) when total > 50, do: 0 + defp shipping_rate(_total, true, true), do: 5.99 + defp shipping_rate(_total, true, false), do: 2.99 + defp shipping_rate(_total, false, true), do: 14.99 + defp shipping_rate(_total, false, false), do: 7.99 + + def tax_rate(country, region, product_type) do + tax_for(country, region, product_type) + end + + defp tax_for("US", _region, :food), do: 0.0 + defp tax_for("US", "CA", _type), do: 0.0725 + defp tax_for("US", _region, _type), do: 0.05 + defp tax_for("DE", :food, _type), do: 0.07 + defp tax_for("DE", _region, _type), do: 0.19 + defp tax_for(_country, _region, _type), do: 0.0 +end diff --git a/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/bad/form_validator.js b/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/bad/form_validator.js new file mode 100644 index 0000000..40efe98 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/bad/form_validator.js @@ -0,0 +1,68 @@ +function isNonEmptyString(value) { + const s = new String(value); + return s.trim().length > 0; +} + +function isPositiveNumber(value) { + const n = new Number(value); + return isFinite(n) && n > 0; +} + +function isTruthy(value) { + const b = new Boolean(value); + return b.valueOf(); +} + +function coerceToNumber(value) { + const n = new Number(value); + return isNaN(n) ? 
null : n.valueOf(); +} + +function coerceToString(value) { + if (value === null || value === undefined) return new String(""); + return new String(value); +} + +function validateField(name, value, rules) { + const errors = []; + + for (const rule of rules) { + switch (rule.type) { + case "required": + const strVal = new String(value); + if (strVal.trim().length === 0) { + errors.push(new String(`${name} is required`).valueOf()); + } + break; + case "minLength": + if (new String(value).length < new Number(rule.value)) { + errors.push(`${name} must be at least ${rule.value} characters`); + } + break; + case "numeric": + if (isNaN(new Number(value))) { + errors.push(`${name} must be a number`); + } + break; + case "positive": + if (!isPositiveNumber(value)) { + errors.push(`${name} must be a positive number`); + } + break; + default: + break; + } + } + + return errors; +} + +function validateForm(fields) { + return Object.entries(fields).reduce((errors, [name, { value, rules }]) => { + const fieldErrors = validateField(name, value, rules); + if (fieldErrors.length > new Number(0)) errors[name] = fieldErrors; + return errors; + }, {}); +} + +export { validateForm, validateField, isNonEmptyString, isPositiveNumber }; diff --git a/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/good/form_validator.js b/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/good/form_validator.js new file mode 100644 index 0000000..0c1e42b --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_primitive_wrapper_constructors/good/form_validator.js @@ -0,0 +1,64 @@ +function isNonEmptyString(value) { + return typeof value === "string" && value.trim().length > 0; +} + +function isPositiveNumber(value) { + return typeof value === "number" && Number.isFinite(value) && value > 0; +} + +function isTruthy(value) { + return Boolean(value); +} + +function coerceToNumber(value) { + const n = Number(value); + return Number.isNaN(n) ? 
null : n; +} + +function coerceToString(value) { + if (value === null || value === undefined) return ""; + return String(value); +} + +function validateField(name, value, rules) { + const errors = []; + + for (const rule of rules) { + switch (rule.type) { + case "required": + if (!isNonEmptyString(coerceToString(value))) { + errors.push(`${name} is required`); + } + break; + case "minLength": + if (typeof value === "string" && value.length < rule.value) { + errors.push(`${name} must be at least ${rule.value} characters`); + } + break; + case "numeric": + if (coerceToNumber(value) === null) { + errors.push(`${name} must be a number`); + } + break; + case "positive": + if (!isPositiveNumber(coerceToNumber(value))) { + errors.push(`${name} must be a positive number`); + } + break; + default: + break; + } + } + + return errors; +} + +function validateForm(fields) { + return Object.entries(fields).reduce((errors, [name, { value, rules }]) => { + const fieldErrors = validateField(name, value, rules); + if (fieldErrors.length > 0) errors[name] = fieldErrors; + return errors; + }, {}); +} + +export { validateForm, validateField, isNonEmptyString, isPositiveNumber, coerceToNumber, coerceToString }; diff --git a/priv/combined_metrics/samples/code_smells/no_private_inheritance/bad/Connection.cpp b/priv/combined_metrics/samples/code_smells/no_private_inheritance/bad/Connection.cpp new file mode 100644 index 0000000..b6a5f52 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_private_inheritance/bad/Connection.cpp @@ -0,0 +1,75 @@ +#include +#include +#include + +class TcpSocket { +public: + explicit TcpSocket(const std::string& host, int port) + : host_(host), port_(port), connected_(false) {} + + void connect() { connected_ = true; } + void disconnect() noexcept { connected_ = false; } + bool isConnected() const noexcept { return connected_; } + void send(const std::string& data) { (void)data; } + std::string receive(std::size_t maxBytes) { (void)maxBytes; return 
{}; } + +protected: + std::string host_; + int port_; + bool connected_; +}; + +class RetryPolicy { +public: + explicit RetryPolicy(int maxAttempts, std::chrono::milliseconds delay) + : maxAttempts_(maxAttempts), delay_(delay) {} + + bool shouldRetry(int attempt) const noexcept { return attempt < maxAttempts_; } + std::chrono::milliseconds delay() const noexcept { return delay_; } + +protected: + int maxAttempts_; + std::chrono::milliseconds delay_; +}; + +// Private inheritance used for implementation reuse — anti-pattern. +// Connection IS-NOT-A TcpSocket; it has confusing base-class subobjects +// and makes it hard to switch the socket implementation later. +class Connection + : private TcpSocket // private inheritance for reuse — should be composition + , private RetryPolicy // same problem +{ +public: + Connection(const std::string& host, int port, int maxRetries) + : TcpSocket(host, port) + , RetryPolicy(maxRetries, std::chrono::milliseconds(500)) + {} + + void open() { + for (int attempt = 0; ; ++attempt) { + try { + TcpSocket::connect(); // must call base explicitly — tightly coupled + return; + } catch (const std::exception&) { + if (!RetryPolicy::shouldRetry(attempt)) + throw; + } + } + } + + void close() noexcept { TcpSocket::disconnect(); } + bool isOpen() const noexcept { return TcpSocket::isConnected(); } + + void send(const std::string& data) { + if (!isOpen()) throw std::runtime_error("Connection is closed"); + TcpSocket::send(data); // using base class members directly + } + + std::string receive(std::size_t maxBytes) { + if (!isOpen()) throw std::runtime_error("Connection is closed"); + return TcpSocket::receive(maxBytes); + } + + // Accesses inherited protected member directly — tight coupling + std::string connectedHost() const { return host_; } +}; diff --git a/priv/combined_metrics/samples/code_smells/no_private_inheritance/good/Connection.cpp b/priv/combined_metrics/samples/code_smells/no_private_inheritance/good/Connection.cpp new file mode 
100644 index 0000000..c4b7060 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_private_inheritance/good/Connection.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +// Reuse via composition, not private inheritance + +class TcpSocket { +public: + explicit TcpSocket(const std::string& host, int port) + : host_(host), port_(port), connected_(false) {} + + void connect() { connected_ = true; } + void disconnect() noexcept { connected_ = false; } + bool isConnected() const noexcept { return connected_; } + void send(const std::string& data) { (void)data; } + std::string receive(std::size_t maxBytes) { (void)maxBytes; return {}; } + +private: + std::string host_; + int port_; + bool connected_; +}; + +class RetryPolicy { +public: + explicit RetryPolicy(int maxAttempts, std::chrono::milliseconds delay) + : maxAttempts_(maxAttempts), delay_(delay) {} + + bool shouldRetry(int attempt) const noexcept { return attempt < maxAttempts_; } + std::chrono::milliseconds delay() const noexcept { return delay_; } + +private: + int maxAttempts_; + std::chrono::milliseconds delay_; +}; + +// Composition: Connection HAS-A TcpSocket and HAS-A RetryPolicy +// Not IS-A; private inheritance would expose implementation details +class Connection { +public: + Connection(const std::string& host, int port, int maxRetries) + : socket_(std::make_unique(host, port)) + , retryPolicy_(maxRetries, std::chrono::milliseconds(500)) + {} + + void open() { + for (int attempt = 0; ; ++attempt) { + try { + socket_->connect(); + return; + } catch (const std::exception&) { + if (!retryPolicy_.shouldRetry(attempt)) + throw; + } + } + } + + void close() noexcept { socket_->disconnect(); } + bool isOpen() const noexcept { return socket_->isConnected(); } + + void send(const std::string& data) { + if (!isOpen()) throw std::runtime_error("Connection is closed"); + socket_->send(data); + } + + std::string receive(std::size_t maxBytes) { + if (!isOpen()) throw 
std::runtime_error("Connection is closed"); + return socket_->receive(maxBytes); + } + +private: + std::unique_ptr socket_; // composed, not inherited + RetryPolicy retryPolicy_; // composed, not inherited +}; diff --git a/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/bad/Widget.cpp b/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/bad/Widget.cpp new file mode 100644 index 0000000..62209d6 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/bad/Widget.cpp @@ -0,0 +1,58 @@ +#include +#include +#include + +class Widget { +public: + Widget(int id, std::string label, bool visible) + : id_(id), label_(std::move(label)), visible_(visible), valid_(true) {} + + // Overloading && — breaks short-circuit evaluation; both operands always evaluated + bool operator&&(const Widget& rhs) const { + return valid_ && rhs.valid_; + } + + // Overloading || — same problem: no short-circuit, confusing semantics + bool operator||(const Widget& rhs) const { + return visible_ || rhs.visible_; + } + + // Overloading comma operator — evaluated left-to-right but breaks comma in function + // arguments and for-loop expressions in surprising ways + Widget& operator,(const Widget& rhs) { + (void)rhs; + return *this; + } + + // Overloading unary & — takes the address of Widget, not its actual address + // Breaks generic code that uses &widget to get a pointer + Widget* operator&() { + return nullptr; // returns something other than the real address — very surprising + } + + int id() const { return id_; } + const std::string& label() const { return label_; } + bool isVisible() const { return visible_; } + +private: + int id_; + std::string label_; + bool visible_; + bool valid_; +}; + +void processWidget(Widget* ptr); + +void demonstrate() { + Widget a(1, "Alpha", true); + Widget b(2, "Beta", false); + + // Looks like short-circuit but isn't — b.valid_ is ALWAYS evaluated + if (a && b) { /* ... 
*/ } + + // Comma operator overloaded — (a, b) returns a, not b as expected + Widget& result = (a, b); + + // &a calls overloaded operator& — does NOT return the real address of a + processWidget(&a); // silently passes nullptr +} diff --git a/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/good/Widget.cpp b/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/good/Widget.cpp new file mode 100644 index 0000000..8998506 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_problematic_operator_overloads/good/Widget.cpp @@ -0,0 +1,72 @@ +#include +#include +#include + +// Overloads only operators with predictable, well-defined semantics. +// Does NOT overload &&, ||, comma, or unary &. + +class WidgetId { +public: + explicit WidgetId(int value) : value_(value) {} + + bool operator==(const WidgetId& rhs) const noexcept { return value_ == rhs.value_; } + bool operator!=(const WidgetId& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const WidgetId& rhs) const noexcept { return value_ < rhs.value_; } + + int value() const noexcept { return value_; } + +private: + int value_; +}; + +class Widget { +public: + Widget(WidgetId id, std::string label, int priority) + : id_(id), label_(std::move(label)), priority_(priority) {} + + // Comparison by priority — clear semantic + bool operator<(const Widget& rhs) const noexcept { return priority_ < rhs.priority_; } + bool operator==(const Widget& rhs) const noexcept { return id_ == rhs.id_; } + + // Arithmetic with clear meaning: combine widget priority scores + Widget operator+(int extraPriority) const { + return Widget(id_, label_, priority_ + extraPriority); + } + + WidgetId id() const noexcept { return id_; } + const std::string& label() const noexcept { return label_; } + int priority() const noexcept { return priority_; } + + // Logical conditions are free functions using && and || naturally — + // no overloaded && or || to break short-circuit 
evaluation + static bool isHighPriority(const Widget& w) { return w.priority_ > 100; } + static bool isVisible(const Widget& w) { return !w.label_.empty(); } + +private: + WidgetId id_; + std::string label_; + int priority_; +}; + +class WidgetCollection { +public: + void add(Widget w) { items_.push_back(std::move(w)); } + + // operator[] with clear semantics + Widget& operator[](std::size_t index) { return items_[index]; } + const Widget& operator[](std::size_t index) const { return items_[index]; } + + std::size_t size() const noexcept { return items_.size(); } + bool empty() const noexcept { return items_.empty(); } + + // Natural use of && and || without overloading: + bool hasHighPriorityVisible() const { + for (const auto& w : items_) + if (Widget::isHighPriority(w) && Widget::isVisible(w)) + return true; + return false; + } + +private: + std::vector items_; +}; diff --git a/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/bad/inventory.ex b/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/bad/inventory.ex new file mode 100644 index 0000000..86da060 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/bad/inventory.ex @@ -0,0 +1,71 @@ +defmodule MyApp.Inventory do + @moduledoc """ + Inventory calculations. Unnecessarily uses a GenServer to hold a map + that could simply be passed as function arguments. The GenServer adds + overhead and serializes all access with no benefit. + """ + + use GenServer + + # Bad: using a GenServer purely to hold a map that needs no + # concurrency protection or long-lived state. + def start_link(products) do + GenServer.start_link(__MODULE__, products, name: __MODULE__) + end + + @impl true + def init(products) do + {:ok, Map.new(products, &{&1.id, &1})} + end + + # Bad: simple computation wrapped in a GenServer.call — all callers + # are serialized through a single process for no reason. 
+ @spec sufficient_stock?(integer(), pos_integer()) :: boolean() + def sufficient_stock?(product_id, quantity) do + GenServer.call(__MODULE__, {:sufficient_stock, product_id, quantity}) + end + + @spec compute_reservation([{integer(), pos_integer()}]) :: map() + def compute_reservation(items) do + GenServer.call(__MODULE__, {:compute_reservation, items}) + end + + @spec reorder_point(integer()) :: integer() + def reorder_point(product_id) do + GenServer.call(__MODULE__, {:reorder_point, product_id}) + end + + @impl true + def handle_call({:sufficient_stock, product_id, quantity}, _from, products) do + result = + case Map.get(products, product_id) do + nil -> false + product -> product.stock >= quantity + end + {:reply, result, products} + end + + @impl true + def handle_call({:compute_reservation, items}, _from, products) do + result = + Enum.reduce(items, %{available: [], unavailable: []}, fn {id, qty}, acc -> + product = Map.get(products, id) + if product && product.stock >= qty do + Map.update!(acc, :available, &[{id, qty} | &1]) + else + Map.update!(acc, :unavailable, &[{id, qty} | &1]) + end + end) + {:reply, result, products} + end + + @impl true + def handle_call({:reorder_point, product_id}, _from, products) do + result = + case Map.get(products, product_id) do + nil -> 0 + product -> ceil(product.daily_usage * product.lead_time_days * 1.2) + end + {:reply, result, products} + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/good/inventory.ex b/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/good/inventory.ex new file mode 100644 index 0000000..0f79e0d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_process_for_code_organization/good/inventory.ex @@ -0,0 +1,58 @@ +defmodule MyApp.Inventory do + @moduledoc """ + Inventory calculations. Pure module with stateless functions — + no GenServer or Agent needed because there is no mutable state + or concurrency concern. 
The "state" is just data passed around. + """ + + alias MyApp.Inventory.{Product, StockLevel} + + @doc """ + Checks whether a product has enough stock for the requested quantity. + Pure function — no process needed. + """ + @spec sufficient_stock?(Product.t(), pos_integer()) :: boolean() + def sufficient_stock?(%Product{stock: stock}, quantity), do: stock >= quantity + + @doc """ + Computes a reservation summary for a list of items. + Pure transformation — no process needed. + """ + @spec compute_reservation([{Product.t(), pos_integer()}]) :: map() + def compute_reservation(items) when is_list(items) do + items + |> Enum.reduce(%{available: [], unavailable: []}, fn {product, qty}, acc -> + if sufficient_stock?(product, qty) do + Map.update!(acc, :available, &[{product.id, qty} | &1]) + else + Map.update!(acc, :unavailable, &[{product.id, qty} | &1]) + end + end) + end + + @doc """ + Calculates the reorder point for a product based on lead time and daily usage. + Pure computation — no process involved. + """ + @spec reorder_point(float(), pos_integer()) :: integer() + def reorder_point(daily_usage, lead_time_days) when daily_usage >= 0 do + ceil(daily_usage * lead_time_days * 1.2) + end + + @doc """ + Groups stock levels by warehouse. + Pure data transformation. + """ + @spec group_by_warehouse([StockLevel.t()]) :: map() + def group_by_warehouse(levels) when is_list(levels) do + Enum.group_by(levels, & &1.warehouse_id) + end + + @doc """ + Merges two stock level maps, summing quantities for shared keys. 
+ """ + @spec merge_stock(map(), map()) :: map() + def merge_stock(stock_a, stock_b) do + Map.merge(stock_a, stock_b, fn _warehouse, qty_a, qty_b -> qty_a + qty_b end) + end +end diff --git a/priv/combined_metrics/samples/code_smells/no_prototype_modification/bad/data_store.js b/priv/combined_metrics/samples/code_smells/no_prototype_modification/bad/data_store.js new file mode 100644 index 0000000..0773eec --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_prototype_modification/bad/data_store.js @@ -0,0 +1,63 @@ +Array.prototype.groupBy = function (keyFn) { + return this.reduce((groups, item) => { + const key = keyFn(item); + if (!groups[key]) groups[key] = []; + groups[key].push(item); + return groups; + }, {}); +}; + +Array.prototype.unique = function (keyFn = (x) => x) { + const seen = new Set(); + return this.filter((item) => { + const key = keyFn(item); + if (seen.has(key)) return false; + seen.add(key); + return true; + }); +}; + +Array.prototype.sortedBy = function (keyFn, direction = "asc") { + const multiplier = direction === "asc" ? 1 : -1; + return [...this].sort((a, b) => { + const ak = keyFn(a); + const bk = keyFn(b); + return ak < bk ? -multiplier : ak > bk ? 
multiplier : 0; + }); +}; + +Object.prototype.deepClone = function () { + return JSON.parse(JSON.stringify(this)); +}; + +String.prototype.toTitleCase = function () { + return this.replace(/\b\w/g, (c) => c.toUpperCase()); +}; + +class DataStore { + constructor(records = []) { + this._records = [...records]; + } + + add(record) { + this._records.push(record); + } + + findBy(predicate) { + return this._records.filter(predicate); + } + + groupBy(keyFn) { + return this._records.groupBy(keyFn); + } + + sortedBy(keyFn, direction) { + return this._records.sortedBy(keyFn, direction); + } + + get size() { + return this._records.length; + } +} + +export { DataStore }; diff --git a/priv/combined_metrics/samples/code_smells/no_prototype_modification/good/data_store.js b/priv/combined_metrics/samples/code_smells/no_prototype_modification/good/data_store.js new file mode 100644 index 0000000..60cab9d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_prototype_modification/good/data_store.js @@ -0,0 +1,67 @@ +function groupBy(array, keyFn) { + return array.reduce((groups, item) => { + const key = keyFn(item); + if (!Object.prototype.hasOwnProperty.call(groups, key)) { + groups[key] = []; + } + groups[key].push(item); + return groups; + }, {}); +} + +function unique(array, keyFn = (x) => x) { + const seen = new Set(); + return array.filter((item) => { + const key = keyFn(item); + if (seen.has(key)) return false; + seen.add(key); + return true; + }); +} + +function sortedBy(array, keyFn, direction = "asc") { + const multiplier = direction === "asc" ? 1 : -1; + return [...array].sort((a, b) => { + const ak = keyFn(a); + const bk = keyFn(b); + return ak < bk ? -multiplier : ak > bk ? 
multiplier : 0; + }); +} + +class DataStore { + constructor(records = []) { + this._records = [...records]; + this._indexes = new Map(); + } + + add(record) { + this._records.push(record); + this._invalidateIndexes(); + } + + findBy(predicate) { + return this._records.filter(predicate); + } + + groupBy(keyFn) { + return groupBy(this._records, keyFn); + } + + sortedBy(keyFn, direction) { + return sortedBy(this._records, keyFn, direction); + } + + unique(keyFn) { + return unique(this._records, keyFn); + } + + _invalidateIndexes() { + this._indexes.clear(); + } + + get size() { + return this._records.length; + } +} + +export { DataStore, groupBy, unique, sortedBy }; diff --git a/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/bad/UserRepository.php b/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/bad/UserRepository.php new file mode 100644 index 0000000..5fb56b0 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/bad/UserRepository.php @@ -0,0 +1,58 @@ +connection->query($sql); + + return $stmt->fetchAll(PDO::FETCH_ASSOC); + } + + public function findByIds($ids): array + { + if (empty($ids)) { + return []; + } + + // Unsafe: builds IN clause by joining raw input + $idList = implode(',', $ids); + $sql = "SELECT * FROM users WHERE id IN ($idList) AND deleted_at IS NULL"; + + return $this->connection->query($sql)->fetchAll(PDO::FETCH_ASSOC); + } + + public function updateLastLogin($userId, $ip): void + { + // Direct interpolation of $ip — attacker-controlled value in SQL + $sql = "UPDATE users SET last_login_at = NOW(), last_login_ip = '" . $ip . "' WHERE id = " . $userId; + $this->connection->exec($sql); + } + + public function findByUsername($username): ?array + { + // Classic SQL injection pattern + $result = $this->connection->query( + "SELECT * FROM users WHERE username = '" . $username . 
"'" + ); + + $row = $result->fetch(PDO::FETCH_ASSOC); + return $row ?: null; + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/good/UserRepository.php b/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/good/UserRepository.php new file mode 100644 index 0000000..1d815a4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_raw_sql_string_concatenation/good/UserRepository.php @@ -0,0 +1,69 @@ +connection->prepare( + 'SELECT * FROM users + WHERE (name LIKE :query OR email LIKE :query) + AND role = :role + AND deleted_at IS NULL + ORDER BY created_at DESC + LIMIT :limit' + ); + + $stmt->bindValue(':query', "%{$query}%", PDO::PARAM_STR); + $stmt->bindValue(':role', $role, PDO::PARAM_STR); + $stmt->bindValue(':limit', $limit, PDO::PARAM_INT); + $stmt->execute(); + + return array_map([$this, 'hydrate'], $stmt->fetchAll(PDO::FETCH_ASSOC)); + } + + public function findByIds(array $ids): array + { + if (empty($ids)) { + return []; + } + + // Safe handling of IN clause with bound parameters + $placeholders = implode(',', array_fill(0, count($ids), '?')); + $stmt = $this->connection->prepare( + "SELECT * FROM users WHERE id IN ({$placeholders}) AND deleted_at IS NULL" + ); + $stmt->execute(array_values($ids)); + + return array_map([$this, 'hydrate'], $stmt->fetchAll(PDO::FETCH_ASSOC)); + } + + public function updateLastLogin(int $userId, string $ip): void + { + $stmt = $this->connection->prepare( + 'UPDATE users SET last_login_at = NOW(), last_login_ip = :ip WHERE id = :id' + ); + $stmt->execute(['ip' => $ip, 'id' => $userId]); + } + + private function hydrate(array $row): User + { + return new User( + id: (int) $row['id'], + email: $row['email'], + name: $row['name'], + role: $row['role'] + ); + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/bad/FileImportService.kt 
b/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/bad/FileImportService.kt new file mode 100644 index 0000000..a598525 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/bad/FileImportService.kt @@ -0,0 +1,54 @@ +package com.example.imports + +import kotlinx.coroutines.* +import java.io.File + +data class ImportResult(val rowsImported: Int, val errors: List) + +class FileImportService( + private val parser: CsvParser, + private val repository: ProductRepository +) { + + /** + * importFile is a suspend function — it is called from within a coroutine. + * Using runBlocking here blocks the thread that the coroutine was running on, + * which can cause deadlocks on single-threaded dispatchers and defeats the + * purpose of structured concurrency. + */ + suspend fun importFile(filePath: String): ImportResult { + val lines = File(filePath).readLines() + val errors = mutableListOf() + var count = 0 + + val batches = lines.drop(1).chunked(500) + + // runBlocking inside a suspend function — blocks the coroutine's thread + count = runBlocking { + val jobs = batches.mapIndexed { index, batch -> + async(Dispatchers.IO) { + processBatch(batch, index, errors) + } + } + jobs.sumOf { it.await() } + } + + return ImportResult(count, errors) + } + + suspend fun validateAndImport(filePath: String): ImportResult { + // runBlocking used to call another suspend function from within a suspend function + val exists = runBlocking { checkFileExists(filePath) } + if (!exists) return ImportResult(0, listOf("File not found: $filePath")) + return importFile(filePath) + } + + private suspend fun checkFileExists(path: String): Boolean { + return withContext(Dispatchers.IO) { File(path).exists() } + } + + private suspend fun processBatch(lines: List, batchIndex: Int, errors: MutableList): Int { + val products = parser.parseLines(lines, batchIndex) + return repository.upsertAll(products) + } +} diff --git 
a/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/good/FileImportService.kt b/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/good/FileImportService.kt new file mode 100644 index 0000000..4c09849 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_runblocking_in_coroutines/good/FileImportService.kt @@ -0,0 +1,51 @@ +package com.example.imports + +import kotlinx.coroutines.* +import java.io.File + +data class ImportResult(val rowsImported: Int, val errors: List) + +class FileImportService( + private val parser: CsvParser, + private val repository: ProductRepository +) { + + /** + * Top-level entry point — runBlocking here is acceptable because this is + * the boundary between the non-coroutine world (e.g. a CLI main function) + * and the coroutine world. It is NOT inside an existing coroutine. + */ + fun importFromCli(filePath: String): ImportResult = runBlocking { + importFile(filePath) + } + + /** + * The actual work is a proper suspend function. + * Any coroutine-aware caller (HTTP handler, scheduled job) calls this directly + * without incurring the overhead or blocking of runBlocking. 
+ */ + suspend fun importFile(filePath: String): ImportResult = coroutineScope { + val lines = File(filePath).readLines() + val errors = mutableListOf() + var count = 0 + + val batches = lines.drop(1).chunked(500) + val jobs = batches.mapIndexed { index, batch -> + async(Dispatchers.IO) { + processBatch(batch, index, errors) + } + } + + count = jobs.sumOf { it.await() } + ImportResult(count, errors) + } + + private suspend fun processBatch( + lines: List, + batchIndex: Int, + errors: MutableList + ): Int { + val products = parser.parseLines(lines, batchIndex) + return repository.upsertAll(products) + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/bad/UserRepository.php b/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/bad/UserRepository.php new file mode 100644 index 0000000..b8d851d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/bad/UserRepository.php @@ -0,0 +1,55 @@ +query("SELECT * FROM users WHERE id = $id")->fetch(); +} + +class UserRepository +{ + private PDO $connection; + + public function __construct(PDO $connection) + { + $this->connection = $connection; + + // Side effect: running a query in the constructor (at class use time) + $this->connection->exec("SET NAMES utf8mb4"); + } + + public function findById(int $id): ?array + { + $stmt = $this->connection->prepare('SELECT * FROM users WHERE id = :id'); + $stmt->execute(['id' => $id]); + return $stmt->fetch(PDO::FETCH_ASSOC) ?: null; + } + + public function findByEmail(string $email): ?array + { + $stmt = $this->connection->prepare('SELECT * FROM users WHERE email = :email'); + $stmt->execute(['email' => strtolower($email)]); + return $stmt->fetch(PDO::FETCH_ASSOC) ?: null; + } +} + +// Side effect: running code at include time — creates a user unconditionally +$repo = new UserRepository($GLOBALS['db']); +$adminUser = $repo->findByEmail('admin@example.com'); +if (!$adminUser) { 
+ echo "Warning: no admin user found\n"; +} diff --git a/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/good/UserRepository.php b/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/good/UserRepository.php new file mode 100644 index 0000000..228a432 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_side_effects_in_declaration_file/good/UserRepository.php @@ -0,0 +1,84 @@ +connection->prepare( + 'SELECT * FROM users WHERE id = :id' + ); + $stmt->execute(['id' => $id]); + $row = $stmt->fetch(PDO::FETCH_ASSOC); + + return $row ? $this->hydrate($row) : null; + } + + public function findByEmail(string $email): ?User + { + $stmt = $this->connection->prepare( + 'SELECT * FROM users WHERE email = :email' + ); + $stmt->execute(['email' => mb_strtolower($email)]); + $row = $stmt->fetch(PDO::FETCH_ASSOC); + + return $row ? $this->hydrate($row) : null; + } + + public function save(User $user): void + { + if ($user->getId() === null) { + $this->insert($user); + } else { + $this->update($user); + } + } + + private function hydrate(array $row): User + { + return new User( + id: (int) $row['id'], + email: $row['email'], + name: $row['name'], + role: $row['role'] + ); + } + + private function insert(User $user): void + { + $stmt = $this->connection->prepare( + 'INSERT INTO users (email, name, role, created_at) VALUES (:email, :name, :role, NOW())' + ); + $stmt->execute([ + 'email' => $user->getEmail(), + 'name' => $user->getName(), + 'role' => $user->getRole(), + ]); + } + + private function update(User $user): void + { + $stmt = $this->connection->prepare( + 'UPDATE users SET email = :email, name = :name WHERE id = :id' + ); + $stmt->execute([ + 'email' => $user->getEmail(), + 'name' => $user->getName(), + 'id' => $user->getId(), + ]); + } +} diff --git a/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/bad/order_service.ts 
b/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/bad/order_service.ts new file mode 100644 index 0000000..3c51ec9 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/bad/order_service.ts @@ -0,0 +1,68 @@ +interface Order { + id: string; + status: "pending" | "confirmed" | "shipped" | "delivered" | "cancelled"; + total: number; + currency: string; + items: Array<{ productId: string; quantity: number; price: number }>; +} + +function isOrderCancellable(order: Order): boolean { + // Always true for a string — typeof check is unnecessary here + if (typeof order.id === "string") { + return order.status === "pending" || order.status === "confirmed"; + } + return false; +} + +function canRequestRefund(order: Order): boolean { + return order.status === "delivered"; +} + +async function cancelOrder(order: Order): Promise { + // order.id is always truthy (typed as string), this check is unnecessary + if (order.id) { + if (!isOrderCancellable(order)) { + throw new Error(`Order ${order.id} cannot be cancelled in status '${order.status}'`); + } + } + + const response = await fetch(`/api/orders/${order.id}/cancel`, { method: "POST" }); + if (!response.ok) throw new Error(`Cancel failed: ${response.status}`); + return response.json() as Promise; +} + +function calculateOrderTotal(items: Order["items"]): number { + // items is typed as Array — the null check is unnecessary + if (items !== null && items !== undefined) { + return items.reduce((sum, item) => sum + item.price * item.quantity, 0); + } + return 0; +} + +function getHighValueItems(order: Order, threshold: number): Order["items"] { + return order.items.filter((item) => { + // item.price is typed as number, it's always a number + if (typeof item.price === "number") { + return item.price > threshold; + } + return false; + }); +} + +function formatOrderSummary(order: Order): string { + const itemCount = order.items.length; + // itemCount is always >= 0; the `< 0` 
branch is unreachable + if (itemCount < 0) { + return `Order #${order.id}: no items`; + } + + const total = new Intl.NumberFormat("en-US", { + style: "currency", + currency: order.currency, + }).format(order.total); + + return `Order #${order.id}: ${itemCount} item${itemCount === 1 ? "" : "s"}, ${total} (${order.status})`; +} + +export { cancelOrder, isOrderCancellable, canRequestRefund, calculateOrderTotal, getHighValueItems, formatOrderSummary }; +export type { Order }; diff --git a/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/good/order_service.ts b/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/good/order_service.ts new file mode 100644 index 0000000..e454095 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_unnecessary_conditions/good/order_service.ts @@ -0,0 +1,46 @@ +interface Order { + id: string; + status: "pending" | "confirmed" | "shipped" | "delivered" | "cancelled"; + total: number; + currency: string; + items: Array<{ productId: string; quantity: number; price: number }>; +} + +function isOrderCancellable(order: Order): boolean { + return order.status === "pending" || order.status === "confirmed"; +} + +function canRequestRefund(order: Order): boolean { + return order.status === "delivered"; +} + +async function cancelOrder(order: Order): Promise { + if (!isOrderCancellable(order)) { + throw new Error(`Order ${order.id} cannot be cancelled in status '${order.status}'`); + } + + const response = await fetch(`/api/orders/${order.id}/cancel`, { method: "POST" }); + if (!response.ok) throw new Error(`Cancel failed: ${response.status}`); + return response.json() as Promise; +} + +function calculateOrderTotal(items: Order["items"]): number { + return items.reduce((sum, item) => sum + item.price * item.quantity, 0); +} + +function getHighValueItems(order: Order, threshold: number): Order["items"] { + return order.items.filter((item) => item.price > threshold); +} + +function formatOrderSummary(order: 
Order): string { + const itemCount = order.items.length; + const total = new Intl.NumberFormat("en-US", { + style: "currency", + currency: order.currency, + }).format(order.total); + + return `Order #${order.id}: ${itemCount} item${itemCount === 1 ? "" : "s"}, ${total} (${order.status})`; +} + +export { cancelOrder, isOrderCancellable, canRequestRefund, calculateOrderTotal, getHighValueItems, formatOrderSummary }; +export type { Order }; diff --git a/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/bad/Matrix.cpp b/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/bad/Matrix.cpp new file mode 100644 index 0000000..ba1ca5d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/bad/Matrix.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +// Pollutes the global namespace — conflicts with user-defined names and other libraries +using namespace std; + +class Matrix { +public: + Matrix(size_t rows, size_t cols) // "size_t" unqualified due to using namespace std + : rows_(rows), cols_(cols), data_(rows * cols, 0.0) {} + + double& at(size_t row, size_t col) { + boundsCheck(row, col); + return data_[row * cols_ + col]; + } + + double at(size_t row, size_t col) const { + boundsCheck(row, col); + return data_[row * cols_ + col]; + } + + size_t rows() const { return rows_; } + size_t cols() const { return cols_; } + + Matrix operator+(const Matrix& rhs) const { + checkCompatible(rhs); + Matrix result(rows_, cols_); + // "transform" and "plus" silently resolved via using namespace std + transform(data_.begin(), data_.end(), + rhs.data_.begin(), result.data_.begin(), + plus()); + return result; + } + + Matrix operator*(const Matrix& rhs) const { + if (cols_ != rhs.rows_) + throw invalid_argument("Incompatible dimensions"); + + Matrix result(rows_, rhs.cols_); + for (size_t i = 0; i < rows_; ++i) + for (size_t k = 0; k < cols_; ++k) + for (size_t j = 0; j < rhs.cols_; ++j) + 
result.at(i, j) += at(i, k) * rhs.at(k, j); + return result; + } + + double frobeniusNorm() const { + double sum = 0.0; + for (double val : data_) + sum += val * val; + return sqrt(sum); // unqualified — ambiguous if a custom sqrt exists in scope + } + + void fill(double value) { + // "fill" collides with std::fill; confusing without qualification + fill(data_.begin(), data_.end(), value); + } + +private: + size_t rows_; + size_t cols_; + vector data_; + + void boundsCheck(size_t row, size_t col) const { + if (row >= rows_ || col >= cols_) + throw out_of_range("Matrix index out of range"); + } + + void checkCompatible(const Matrix& other) const { + if (rows_ != other.rows_ || cols_ != other.cols_) + throw invalid_argument("Matrix dimensions do not match"); + } +}; diff --git a/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/good/Matrix.cpp b/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/good/Matrix.cpp new file mode 100644 index 0000000..8468d0c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_using_namespace_directives/good/Matrix.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include + +// No "using namespace std;" — names are qualified explicitly + +class Matrix { +public: + Matrix(std::size_t rows, std::size_t cols) + : rows_(rows), cols_(cols), data_(rows * cols, 0.0) {} + + double& at(std::size_t row, std::size_t col) { + boundsCheck(row, col); + return data_[row * cols_ + col]; + } + + double at(std::size_t row, std::size_t col) const { + boundsCheck(row, col); + return data_[row * cols_ + col]; + } + + std::size_t rows() const noexcept { return rows_; } + std::size_t cols() const noexcept { return cols_; } + + Matrix operator+(const Matrix& rhs) const { + checkCompatible(rhs); + Matrix result(rows_, cols_); + std::transform(data_.begin(), data_.end(), + rhs.data_.begin(), result.data_.begin(), + std::plus()); + return result; + } + + Matrix operator*(const Matrix& rhs) const { + if 
(cols_ != rhs.rows_) + throw std::invalid_argument("Incompatible matrix dimensions for multiplication"); + + Matrix result(rows_, rhs.cols_); + for (std::size_t i = 0; i < rows_; ++i) + for (std::size_t k = 0; k < cols_; ++k) + for (std::size_t j = 0; j < rhs.cols_; ++j) + result.at(i, j) += at(i, k) * rhs.at(k, j); + return result; + } + + double frobeniusNorm() const { + double sum = 0.0; + for (double val : data_) + sum += val * val; + return std::sqrt(sum); + } + + void fill(double value) { + std::fill(data_.begin(), data_.end(), value); + } + +private: + std::size_t rows_; + std::size_t cols_; + std::vector data_; + + void boundsCheck(std::size_t row, std::size_t col) const { + if (row >= rows_ || col >= cols_) + throw std::out_of_range("Matrix index out of range"); + } + + void checkCompatible(const Matrix& other) const { + if (rows_ != other.rows_ || cols_ != other.cols_) + throw std::invalid_argument("Matrix dimensions do not match"); + } +}; diff --git a/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/bad/Widget.cpp b/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/bad/Widget.cpp new file mode 100644 index 0000000..a2e698b --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/bad/Widget.cpp @@ -0,0 +1,72 @@ +#include +#include +#include + +class Renderer { +public: + virtual ~Renderer() = default; + virtual void draw(const std::string& label) = 0; +}; + +class Widget { +public: + explicit Widget(std::string label) + : label_(std::move(label)) + { + // Calls virtual methods in the constructor — at this point the vtable + // points to Widget's implementations, not the derived class overrides. + // This is almost certainly a bug when derived classes override these. 
+ setupLayout(); // virtual call in ctor — dispatches to Widget::setupLayout + loadResources(); // virtual call in ctor — dispatches to Widget::loadResources + } + + virtual ~Widget() = default; + + virtual void render(Renderer& renderer) const { + renderer.draw(label_); + } + +protected: + virtual void setupLayout() { + // Base implementation — called even when derived class overrides it + minWidth_ = static_cast(label_.size()); + } + + virtual void loadResources() { + // Base implementation — derived class version is never called from ctor + } + + std::string label_; + int minWidth_ = 0; +}; + +class Button : public Widget { +public: + explicit Button(std::string label, std::string action) + : Widget(std::move(label)) // Widget ctor calls setupLayout/loadResources... + , action_(std::move(action)) + { + // ...but Button::setupLayout and Button::loadResources were NOT called above. + // The Button is not properly initialized after construction. + } + + void render(Renderer& renderer) const override { + renderer.draw("[" + label_ + "]"); + } + +protected: + void setupLayout() override { + // This override is NEVER called from Widget's constructor + minWidth_ = static_cast(label_.size()) + 4; // button-specific padding + paddingSet_ = true; + } + + void loadResources() override { + resourcesLoaded_ = true; + } + +private: + std::string action_; + bool paddingSet_ = false; // always false after construction + bool resourcesLoaded_ = false; // always false after construction +}; diff --git a/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/good/Widget.cpp b/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/good/Widget.cpp new file mode 100644 index 0000000..bac17fc --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_virtual_calls_in_constructors/good/Widget.cpp @@ -0,0 +1,77 @@ +#include +#include +#include + +// Deferred initialization pattern: virtual methods are not called from constructors. 
+// Initialization that depends on virtual behavior uses a factory or a separate init step. + +class Renderer { +public: + virtual ~Renderer() = default; + virtual void draw(const std::string& label) = 0; +}; + +class Widget { +public: + explicit Widget(std::string label) + : label_(std::move(label)), initialized_(false) + { + // Constructor only sets plain data — no virtual calls + } + + virtual ~Widget() = default; + + // Separate initialization method that can safely call virtual members + void initialize() { + if (initialized_) return; + setupLayout(); // virtual — called after construction, when vtable is correct + loadResources(); // virtual — same + initialized_ = true; + } + + virtual void render(Renderer& renderer) const { + renderer.draw(label_); + } + + const std::string& label() const noexcept { return label_; } + bool isInitialized() const noexcept { return initialized_; } + +protected: + virtual void setupLayout() {} + virtual void loadResources() {} + + std::string label_; + bool initialized_; +}; + +class Button : public Widget { +public: + explicit Button(std::string label, std::string action) + : Widget(std::move(label)), action_(std::move(action)) {} + + void render(Renderer& renderer) const override { + renderer.draw("[" + label_ + "]"); + } + +protected: + void setupLayout() override { + // Button-specific layout — runs after construction, vtable is fully set + minWidth_ = static_cast(label_.size()) + 4; + } + + void loadResources() override { + // Load button-specific resources + } + +private: + std::string action_; + int minWidth_ = 0; +}; + +// Factory ensures initialize() is called on the fully-constructed object +template +std::unique_ptr makeWidget(Args&&... 
args) { + auto widget = std::make_unique(std::forward(args)...); + widget->initialize(); // safe: called after full construction + return widget; +} diff --git a/priv/combined_metrics/samples/code_smells/no_with_statement/bad/report_builder.js b/priv/combined_metrics/samples/code_smells/no_with_statement/bad/report_builder.js new file mode 100644 index 0000000..0f4caee --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_with_statement/bad/report_builder.js @@ -0,0 +1,54 @@ +function formatCurrency(amount, currency = "USD") { + return new Intl.NumberFormat("en-US", { style: "currency", currency }).format(amount); +} + +function formatDate(date) { + return new Intl.DateTimeFormat("en-US", { + year: "numeric", + month: "short", + day: "numeric", + }).format(new Date(date)); +} + +function buildOrderRow(order) { + with (order) { + const total = formatCurrency(order.total, currency); + const date = formatDate(createdAt); + const statusLabel = status.charAt(0).toUpperCase() + status.slice(1); + + return { + id, + customer: `${customer.firstName} ${customer.lastName}`, + email: customer.email, + date, + total, + status: statusLabel, + itemCount: items.length, + }; + } +} + +function buildSummaryStats(orders) { + const totalRevenue = orders.reduce((sum, o) => sum + o.total, 0); + + with (Math) { + const averageOrderValue = totalRevenue / orders.length; + const completedCount = orders.filter((o) => o.status === "completed").length; + + return { + totalOrders: orders.length, + totalRevenue: formatCurrency(totalRevenue), + averageOrderValue: formatCurrency(round(averageOrderValue * 100) / 100), + completionRate: `${round((completedCount / orders.length) * 100)}%`, + }; + } +} + +function buildReport(orders) { + const rows = orders.map(buildOrderRow); + const summary = buildSummaryStats(orders); + + return { rows, summary, generatedAt: new Date().toISOString() }; +} + +export { buildReport, buildOrderRow, buildSummaryStats }; diff --git 
a/priv/combined_metrics/samples/code_smells/no_with_statement/good/report_builder.js b/priv/combined_metrics/samples/code_smells/no_with_statement/good/report_builder.js new file mode 100644 index 0000000..4e80900 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/no_with_statement/good/report_builder.js @@ -0,0 +1,49 @@ +function formatCurrency(amount, currency = "USD") { + return new Intl.NumberFormat("en-US", { style: "currency", currency }).format(amount); +} + +function formatDate(date) { + return new Intl.DateTimeFormat("en-US", { + year: "numeric", + month: "short", + day: "numeric", + }).format(new Date(date)); +} + +function buildOrderRow(order) { + const total = formatCurrency(order.total, order.currency); + const date = formatDate(order.createdAt); + const statusLabel = order.status.charAt(0).toUpperCase() + order.status.slice(1); + + return { + id: order.id, + customer: `${order.customer.firstName} ${order.customer.lastName}`, + email: order.customer.email, + date, + total, + status: statusLabel, + itemCount: order.items.length, + }; +} + +function buildSummaryStats(orders) { + const totalRevenue = orders.reduce((sum, o) => sum + o.total, 0); + const averageOrderValue = totalRevenue / orders.length; + const completedCount = orders.filter((o) => o.status === "completed").length; + + return { + totalOrders: orders.length, + totalRevenue: formatCurrency(totalRevenue), + averageOrderValue: formatCurrency(averageOrderValue), + completionRate: `${Math.round((completedCount / orders.length) * 100)}%`, + }; +} + +function buildReport(orders) { + const rows = orders.map(buildOrderRow); + const summary = buildSummaryStats(orders); + + return { rows, summary, generatedAt: new Date().toISOString() }; +} + +export { buildReport, buildOrderRow, buildSummaryStats, formatCurrency, formatDate }; diff --git a/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/bad/counter.go 
b/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/bad/counter.go new file mode 100644 index 0000000..ced67d4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/bad/counter.go @@ -0,0 +1,39 @@ +package counter + +import "sync" + +// PageCounter tracks page view counts using a mutex-guarded map. +// Multiple goroutines directly mutate shared state instead of communicating. +type PageCounter struct { + mu sync.Mutex + counts map[string]int +} + +func NewPageCounter() *PageCounter { + return &PageCounter{ + counts: make(map[string]int), + } +} + +// Increment records a hit for the given page. +// Multiple goroutines share the map directly, protected only by a mutex. +func (c *PageCounter) Increment(page string) { + c.mu.Lock() + defer c.mu.Unlock() + c.counts[page]++ +} + +// Count returns the current hit count for the given page. +func (c *PageCounter) Count(page string) int { + c.mu.Lock() + defer c.mu.Unlock() + return c.counts[page] +} + +// Reset clears all counters. +func (c *PageCounter) Reset() { + c.mu.Lock() + defer c.mu.Unlock() + // Direct mutation of shared map — goroutines share memory instead of communicating. + c.counts = make(map[string]int) +} diff --git a/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/good/counter.go b/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/good/counter.go new file mode 100644 index 0000000..a4bcf19 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/shares_memory_by_communicating/good/counter.go @@ -0,0 +1,55 @@ +package counter + +// PageCounter tracks page view counts by routing all mutations through a channel. +// No mutex is needed; the single goroutine that owns the map is the only writer. 
+type PageCounter struct { + inc chan string + query chan queryReq + stop chan struct{} +} + +type queryReq struct { + page string + result chan int +} + +func NewPageCounter() *PageCounter { + c := &PageCounter{ + inc: make(chan string, 64), + query: make(chan queryReq), + stop: make(chan struct{}), + } + go c.run() + return c +} + +func (c *PageCounter) run() { + counts := make(map[string]int) + for { + select { + case page := <-c.inc: + counts[page]++ + case req := <-c.query: + req.result <- counts[req.page] + case <-c.stop: + return + } + } +} + +// Increment records a hit for the given page. Safe to call from multiple goroutines. +func (c *PageCounter) Increment(page string) { + c.inc <- page +} + +// Count returns the current hit count for the given page. +func (c *PageCounter) Count(page string) int { + result := make(chan int, 1) + c.query <- queryReq{page: page, result: result} + return <-result +} + +// Stop shuts down the background goroutine. +func (c *PageCounter) Stop() { + close(c.stop) +} diff --git a/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/bad/Parser.cpp b/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/bad/Parser.cpp new file mode 100644 index 0000000..ab246bc --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/bad/Parser.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +class TokenStream { +public: + // Missing explicit: any string is silently convertible to TokenStream + TokenStream(std::string source) + : source_(std::move(source)), position_(0) {} + + bool hasNext() const { return position_ < source_.size(); } + char peek() const { return source_[position_]; } + char consume() { return source_[position_++]; } + +private: + std::string source_; + std::size_t position_; +}; + +class ParseError : public std::runtime_error { +public: + // Missing explicit: a string literal accidentally converts to 
ParseError in the wrong context + ParseError(const std::string& message) : std::runtime_error(message) {} +}; + +class Token { +public: + enum class Kind { Identifier, Number, Operator, EndOfStream }; + + Token(Kind kind, std::string value) + : kind_(kind), value_(std::move(value)) {} + + Kind kind() const { return kind_; } + const std::string& value() const { return value_; } + +private: + Kind kind_; + std::string value_; +}; + +// Accepting a TokenStream by value triggers an implicit conversion from string +void processStream(TokenStream stream); + +class Parser { +public: + // Missing explicit: Parser p = "some expression"; compiles silently + Parser(std::string input) + : stream_(std::move(input)) {} + + std::vector tokenize() { + std::vector tokens; + while (stream_.hasNext()) { + skipWhitespace(); + if (!stream_.hasNext()) break; + + char c = stream_.peek(); + if (std::isalpha(c)) + tokens.push_back(readIdentifier()); + else if (std::isdigit(c)) + tokens.push_back(readNumber()); + else + tokens.push_back(readOperator()); + } + tokens.emplace_back(Token::Kind::EndOfStream, ""); + return tokens; + } + +private: + TokenStream stream_; + + void skipWhitespace() { + while (stream_.hasNext() && std::isspace(stream_.peek())) + stream_.consume(); + } + + Token readIdentifier() { + std::string value; + while (stream_.hasNext() && std::isalnum(stream_.peek())) + value += stream_.consume(); + return Token(Token::Kind::Identifier, std::move(value)); + } + + Token readNumber() { + std::string value; + while (stream_.hasNext() && std::isdigit(stream_.peek())) + value += stream_.consume(); + return Token(Token::Kind::Number, std::move(value)); + } + + Token readOperator() { + return Token(Token::Kind::Operator, std::string(1, stream_.consume())); + } +}; + +// This compiles because Parser(std::string) is not explicit: +// Parser p = std::string("1 + 2"); // implicit conversion diff --git 
a/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/good/Parser.cpp b/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/good/Parser.cpp new file mode 100644 index 0000000..6f2a96a --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/single_argument_constructors_are_explicit/good/Parser.cpp @@ -0,0 +1,93 @@ +#include +#include +#include +#include + +class TokenStream { +public: + explicit TokenStream(std::string source) // explicit: prevents implicit string -> TokenStream + : source_(std::move(source)), position_(0) {} + + bool hasNext() const noexcept { return position_ < source_.size(); } + char peek() const { return source_[position_]; } + char consume() { return source_[position_++]; } + +private: + std::string source_; + std::size_t position_; +}; + +class ParseError : public std::runtime_error { +public: + explicit ParseError(const std::string& message) // explicit: single-arg exception ctor + : std::runtime_error(message) {} + + explicit ParseError(std::string_view message) + : std::runtime_error(std::string(message)) {} +}; + +class Token { +public: + enum class Kind { Identifier, Number, Operator, EndOfStream }; + + Token(Kind kind, std::string value) // two-arg ctor: explicit not required but consistent + : kind_(kind), value_(std::move(value)) {} + + Kind kind() const noexcept { return kind_; } + const std::string& value() const noexcept { return value_; } + +private: + Kind kind_; + std::string value_; +}; + +class Parser { +public: + // explicit: prevents accidental conversion from string to Parser + explicit Parser(std::string input) + : stream_(std::move(input)) {} + + std::vector tokenize() { + std::vector tokens; + while (stream_.hasNext()) { + skipWhitespace(); + if (!stream_.hasNext()) break; + + char c = stream_.peek(); + if (std::isalpha(c)) + tokens.push_back(readIdentifier()); + else if (std::isdigit(c)) + tokens.push_back(readNumber()); + else + 
tokens.push_back(readOperator()); + } + tokens.emplace_back(Token::Kind::EndOfStream, ""); + return tokens; + } + +private: + TokenStream stream_; + + void skipWhitespace() { + while (stream_.hasNext() && std::isspace(stream_.peek())) + stream_.consume(); + } + + Token readIdentifier() { + std::string value; + while (stream_.hasNext() && std::isalnum(stream_.peek())) + value += stream_.consume(); + return Token(Token::Kind::Identifier, std::move(value)); + } + + Token readNumber() { + std::string value; + while (stream_.hasNext() && std::isdigit(stream_.peek())) + value += stream_.consume(); + return Token(Token::Kind::Number, std::move(value)); + } + + Token readOperator() { + return Token(Token::Kind::Operator, std::string(1, stream_.consume())); + } +}; diff --git a/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/bad/analytics.ex b/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/bad/analytics.ex new file mode 100644 index 0000000..fad35d2 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/bad/analytics.ex @@ -0,0 +1,65 @@ +defmodule MyApp.Analytics.EventBuffer do + @moduledoc """ + Buffers analytics events. + """ + + @flush_interval_ms 5_000 + + # Bad: starts the background loop with a bare spawn/1 call. + # If this process crashes it will not be restarted, and no crash report + # is linked to any supervisor — it silently disappears. 
+ @spec start() :: pid() + def start do + state = %{buffer: [], count: 0} + spawn(fn -> loop(state) end) + end + + @spec push(pid(), map()) :: :ok + def push(pid, event) when is_map(event) do + send(pid, {:push, event}) + :ok + end + + defp loop(state) do + receive do + {:push, event} -> + new_state = %{state | buffer: [event | state.buffer], count: state.count + 1} + + if new_state.count >= 500 do + flush(new_state.buffer) + loop(%{buffer: [], count: 0}) + else + loop(new_state) + end + + :flush -> + flush(state.buffer) + schedule_flush(self()) + loop(%{buffer: [], count: 0}) + end + end + + defp flush(events), do: MyApp.Analytics.Store.insert_all(Enum.reverse(events)) + + defp schedule_flush(pid) do + Process.send_after(pid, :flush, @flush_interval_ms) + end +end + +defmodule MyApp.Application do + use Application + + @impl true + def start(_type, _args) do + children = [MyApp.Repo] + + result = Supervisor.start_link(children, strategy: :one_for_one) + + # Bad: EventBuffer is started with spawn/1 outside the supervision tree. + # It will not be restarted on crash and the PID is hard to track. + pid = MyApp.Analytics.EventBuffer.start() + Process.register(pid, :event_buffer) + + result + end +end diff --git a/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/good/analytics.ex b/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/good/analytics.ex new file mode 100644 index 0000000..caa83d3 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/supervised_processes_in_supervision_tree/good/analytics.ex @@ -0,0 +1,74 @@ +defmodule MyApp.Analytics.EventBuffer do + @moduledoc """ + Buffers analytics events before flushing to the database. + Started under the application supervisor — never spawned bare. 
+ """ + + use GenServer, restart: :permanent + + @flush_interval_ms 5_000 + @max_buffer_size 500 + + def start_link(opts) do + GenServer.start_link(__MODULE__, opts, name: __MODULE__) + end + + @doc "Adds an event to the buffer." + @spec push(map()) :: :ok + def push(event) when is_map(event) do + GenServer.cast(__MODULE__, {:push, event}) + end + + @impl true + def init(_opts) do + schedule_flush() + {:ok, %{buffer: [], count: 0}} + end + + @impl true + def handle_cast({:push, event}, %{buffer: buf, count: count} = state) do + new_state = %{state | buffer: [event | buf], count: count + 1} + + if new_state.count >= @max_buffer_size do + flush(new_state.buffer) + {:noreply, %{new_state | buffer: [], count: 0}} + else + {:noreply, new_state} + end + end + + @impl true + def handle_info(:flush, %{buffer: []} = state) do + schedule_flush() + {:noreply, state} + end + + def handle_info(:flush, %{buffer: buf} = state) do + flush(buf) + schedule_flush() + {:noreply, %{state | buffer: [], count: 0}} + end + + defp flush(events) do + MyApp.Analytics.Store.insert_all(Enum.reverse(events)) + end + + defp schedule_flush do + Process.send_after(self(), :flush, @flush_interval_ms) + end +end + +defmodule MyApp.Application do + use Application + + @impl true + def start(_type, _args) do + children = [ + MyApp.Repo, + # Good: EventBuffer is started as a supervised child, not with spawn/1 + MyApp.Analytics.EventBuffer + ] + + Supervisor.start_link(children, strategy: :one_for_one) + end +end diff --git a/priv/combined_metrics/samples/code_smells/switch_has_default_case/bad/notification_router.js b/priv/combined_metrics/samples/code_smells/switch_has_default_case/bad/notification_router.js new file mode 100644 index 0000000..f43c6b1 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/switch_has_default_case/bad/notification_router.js @@ -0,0 +1,58 @@ +function getNotificationChannel(userPreferences, notificationType) { + switch (notificationType) { + case 
"order_confirmed": + return userPreferences.emailEnabled ? "email" : "push"; + case "order_shipped": + return "push"; + case "order_delivered": + return userPreferences.emailEnabled ? "email" : "push"; + case "payment_failed": + return "email"; + case "account_locked": + return "email"; + case "promotional": + return userPreferences.marketingEnabled ? "email" : null; + } +} + +function formatNotificationMessage(notification) { + const { type, data } = notification; + + switch (type) { + case "order_confirmed": + return { + subject: `Order #${data.orderId} confirmed`, + body: `Your order has been confirmed and is being prepared.`, + }; + case "order_shipped": + return { + subject: `Order #${data.orderId} is on its way`, + body: `Your order has shipped. Tracking number: ${data.trackingNumber}`, + }; + case "order_delivered": + return { + subject: `Order #${data.orderId} delivered`, + body: `Your order has been delivered. Enjoy!`, + }; + case "payment_failed": + return { + subject: "Payment failed", + body: `Your payment of ${data.amount} could not be processed.`, + }; + } +} + +function getNotificationPriority(type) { + switch (type) { + case "account_locked": + return "critical"; + case "payment_failed": + return "high"; + case "order_shipped": + return "medium"; + case "promotional": + return "low"; + } +} + +export { getNotificationChannel, formatNotificationMessage, getNotificationPriority }; diff --git a/priv/combined_metrics/samples/code_smells/switch_has_default_case/good/notification_router.js b/priv/combined_metrics/samples/code_smells/switch_has_default_case/good/notification_router.js new file mode 100644 index 0000000..562cacd --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/switch_has_default_case/good/notification_router.js @@ -0,0 +1,52 @@ +import logger from "./logger.js"; + +function getNotificationChannel(userPreferences, notificationType) { + switch (notificationType) { + case "order_confirmed": + return userPreferences.emailEnabled ? 
"email" : "push"; + case "order_shipped": + return "push"; + case "order_delivered": + return userPreferences.emailEnabled ? "email" : "push"; + case "payment_failed": + return "email"; + case "account_locked": + return "email"; + case "promotional": + return userPreferences.marketingEnabled ? "email" : null; + default: + logger.warn(`Unknown notification type: '${notificationType}'`); + return null; + } +} + +function formatNotificationMessage(notification) { + const { type, data } = notification; + + switch (type) { + case "order_confirmed": + return { + subject: `Order #${data.orderId} confirmed`, + body: `Your order has been confirmed and is being prepared.`, + }; + case "order_shipped": + return { + subject: `Order #${data.orderId} is on its way`, + body: `Your order has shipped. Tracking number: ${data.trackingNumber}`, + }; + case "order_delivered": + return { + subject: `Order #${data.orderId} delivered`, + body: `Your order has been delivered. Enjoy!`, + }; + case "payment_failed": + return { + subject: "Payment failed", + body: `Your payment of ${data.amount} could not be processed.`, + }; + default: + throw new Error(`No message template for notification type: '${type}'`); + } +} + +export { getNotificationChannel, formatNotificationMessage }; diff --git a/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/bad/AnalyticsProcessor.kt b/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/bad/AnalyticsProcessor.kt new file mode 100644 index 0000000..05ef94c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/bad/AnalyticsProcessor.kt @@ -0,0 +1,54 @@ +package com.example.analytics + +import kotlinx.coroutines.* +import java.time.LocalDate + +data class DailyStats(val date: LocalDate, val totalRevenue: Double, val orderCount: Int) + +class AnalyticsProcessor( + private val repository: AnalyticsRepository, + private val fileExporter: CsvExporter +) { + + /** + * CPU-intensive 
computation incorrectly uses Dispatchers.IO. + * IO's thread pool is designed for blocking calls — using it for heavy + * CPU work starves I/O threads and misses parallelism optimisations. + */ + suspend fun computeDailyStats(events: List): List = + withContext(Dispatchers.IO) { // Wrong dispatcher for CPU work + events + .groupBy { it.occurredAt.toLocalDate() } + .map { (date, dayEvents) -> + val revenue = dayEvents.sumOf { it.amount } + val orders = dayEvents.count { it.type == "ORDER_PLACED" } + DailyStats(date, revenue, orders) + } + .sortedBy { it.date } + } + + /** + * Database access incorrectly uses Dispatchers.Default. + * Default is for CPU-bound work; blocking DB calls here will exhaust + * the limited Default thread pool and degrade all CPU-bound tasks. + */ + suspend fun loadEvents(from: LocalDate, to: LocalDate): List = + withContext(Dispatchers.Default) { // Wrong dispatcher for I/O + repository.findBetween(from, to) + } + + /** + * File write runs on Dispatchers.Default — blocking I/O on a CPU dispatcher + * ties up a thread that should be doing computation. 
+ */ + suspend fun exportToCsv(stats: List, outputPath: String) = + withContext(Dispatchers.Default) { // Wrong dispatcher for file I/O + fileExporter.write(outputPath, stats) + } + + suspend fun generateReport(from: LocalDate, to: LocalDate, outputPath: String) { + val events = loadEvents(from, to) + val stats = computeDailyStats(events) + exportToCsv(stats, outputPath) + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/good/AnalyticsProcessor.kt b/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/good/AnalyticsProcessor.kt new file mode 100644 index 0000000..57fa3b4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_appropriate_dispatcher/good/AnalyticsProcessor.kt @@ -0,0 +1,54 @@ +package com.example.analytics + +import kotlinx.coroutines.* +import java.time.LocalDate + +data class DailyStats(val date: LocalDate, val totalRevenue: Double, val orderCount: Int) + +class AnalyticsProcessor( + private val repository: AnalyticsRepository, + private val fileExporter: CsvExporter +) { + + /** + * CPU-intensive aggregation uses Dispatchers.Default — optimised for + * parallel computation on a thread pool sized to available CPU cores. + */ + suspend fun computeDailyStats(events: List): List = + withContext(Dispatchers.Default) { + events + .groupBy { it.occurredAt.toLocalDate() } + .map { (date, dayEvents) -> + val revenue = dayEvents.sumOf { it.amount } + val orders = dayEvents.count { it.type == "ORDER_PLACED" } + DailyStats(date, revenue, orders) + } + .sortedBy { it.date } + } + + /** + * Database reads and writes use Dispatchers.IO — the thread pool is sized + * for blocking I/O without starving CPU-bound work. + */ + suspend fun loadEvents(from: LocalDate, to: LocalDate): List = + withContext(Dispatchers.IO) { + repository.findBetween(from, to) + } + + /** + * File write is I/O-bound — Dispatchers.IO is appropriate. 
+ */ + suspend fun exportToCsv(stats: List, outputPath: String) = + withContext(Dispatchers.IO) { + fileExporter.write(outputPath, stats) + } + + /** + * Orchestrator — each step runs on the right dispatcher via the helpers above. + */ + suspend fun generateReport(from: LocalDate, to: LocalDate, outputPath: String) { + val events = loadEvents(from, to) // I/O dispatcher + val stats = computeDailyStats(events) // Default dispatcher + exportToCsv(stats, outputPath) // I/O dispatcher + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/bad/pool.rs b/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/bad/pool.rs new file mode 100644 index 0000000..c44f1cc --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/bad/pool.rs @@ -0,0 +1,52 @@ +use std::cell::RefCell; +use std::rc::Rc; +use std::sync::Arc; + +/// A connection that uses RefCell for interior mutability. +/// RefCell is NOT Sync — it cannot be safely accessed from multiple threads. +pub struct Connection { + pub id: u64, + pub url: String, + // RefCell is !Sync — wrapping this in Arc is unsound + pub state: RefCell, +} + +// BAD: Arc requires Connection: Send + Sync. +// Because Connection contains RefCell, it is !Sync. +// This will fail to compile when sent across threads, but the intent is wrong. +pub struct Pool { + // Arc here is misleading — it looks thread-safe but isn't + connections: Vec>, +} + +impl Pool { + pub fn new() -> Self { + Self { connections: Vec::new() } + } + + pub fn add(&mut self, url: impl Into) -> Arc { + let conn = Arc::new(Connection { + id: self.connections.len() as u64, + url: url.into(), + state: RefCell::new("idle".to_string()), + }); + self.connections.push(Arc::clone(&conn)); + conn + } +} + +// BAD: Rc is not Send, so Arc> cannot be used across threads. +// Using Arc here is misleading — the Rc inside prevents thread sharing. 
+pub struct BadSharedHandle { + inner: Arc>, +} + +impl BadSharedHandle { + pub fn new(s: impl Into) -> Self { + Self { inner: Arc::new(Rc::new(s.into())) } + } + + pub fn value(&self) -> Rc { + Rc::clone(&self.inner) + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/good/pool.rs b/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/good/pool.rs new file mode 100644 index 0000000..63865a9 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_arc_only_with_send_sync_types/good/pool.rs @@ -0,0 +1,55 @@ +use std::sync::{Arc, Mutex}; + +/// A connection that is safe to share across threads. +/// Implements Send + Sync, making it valid to wrap in Arc. +#[derive(Debug)] +pub struct Connection { + pub id: u64, + pub url: String, + pub active: bool, +} + +// Connection is Send + Sync because all its fields are Send + Sync +// Arc is therefore also Send + Sync — correct usage +pub struct Pool { + connections: Vec>>, + max_size: usize, +} + +impl Pool { + pub fn new(max_size: usize) -> Self { + Self { connections: Vec::new(), max_size } + } + + pub fn add(&mut self, url: impl Into) -> Arc> { + let conn = Arc::new(Mutex::new(Connection { + id: self.connections.len() as u64, + url: url.into(), + active: true, + })); + self.connections.push(Arc::clone(&conn)); + conn + } + + // Returns an Arc to share the connection safely between threads + pub fn get_connection(&self, id: u64) -> Option>> { + self.connections + .iter() + .find(|c| c.lock().map(|c| c.id == id).unwrap_or(false)) + .map(Arc::clone) + } + + pub fn active_count(&self) -> usize { + self.connections + .iter() + .filter(|c| c.lock().map(|c| c.active).unwrap_or(false)) + .count() + } +} + +// Shared, thread-safe state — Arc wraps a Mutex>, all Send + Sync +pub type SharedLog = Arc>>; + +pub fn create_shared_log() -> SharedLog { + Arc::new(Mutex::new(Vec::new())) +} diff --git 
a/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/bad/payment.rb b/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/bad/payment.rb new file mode 100644 index 0000000..e515c9c --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/bad/payment.rb @@ -0,0 +1,58 @@ +class PaymentMethod + def initialize(attrs = {}) + @id = attrs[:id] + @card_type = attrs[:card_type] + @last_four = attrs[:last_four] + @expires_at = attrs[:expires_at] + @billing_address = attrs[:billing_address] + @nickname = attrs[:nickname] + @is_default = attrs[:is_default] || false + end + + # Manual getters — should use attr_reader + def id + @id + end + + def card_type + @card_type + end + + def last_four + @last_four + end + + def expires_at + @expires_at + end + + def billing_address + @billing_address + end + + # Manual getter + setter pair — should use attr_accessor + def nickname + @nickname + end + + def nickname=(value) + @nickname = value + end + + def is_default + @is_default + end + + def is_default=(value) + @is_default = value + end + + def expired? + @expires_at < Date.today + end + + def display_name + base = "#{@card_type.upcase} ending in #{@last_four}" + @nickname ? 
"#{@nickname} (#{base})" : base + end +end diff --git a/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/good/payment.rb b/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/good/payment.rb new file mode 100644 index 0000000..b00ead9 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_attr_accessor_not_manual_getter_setter/good/payment.rb @@ -0,0 +1,39 @@ +class PaymentMethod + attr_reader :id, :card_type, :last_four, :expires_at, :billing_address + attr_accessor :nickname, :is_default + + def initialize(attrs = {}) + @id = attrs[:id] + @card_type = attrs[:card_type] + @last_four = attrs[:last_four] + @expires_at = attrs[:expires_at] + @billing_address = attrs[:billing_address] + @nickname = attrs[:nickname] + @is_default = attrs[:is_default] || false + end + + def expired? + expires_at < Date.today + end + + def expiring_soon? + !expired? && expires_at < 30.days.from_now + end + + def display_name + base = "#{card_type.upcase} ending in #{last_four}" + nickname ? "#{nickname} (#{base})" : base + end + + def to_h + { + id: id, + card_type: card_type, + last_four: last_four, + expires_at: expires_at, + nickname: nickname, + is_default: is_default, + expired: expired? 
+ } + end +end diff --git a/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/bad/database_manager.py b/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/bad/database_manager.py new file mode 100644 index 0000000..e3bf408 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/bad/database_manager.py @@ -0,0 +1,62 @@ +"""Database manager providing connection pooling and query helpers.""" +from __future__ import annotations + +import sqlite3 +from typing import Any, Optional + + +DB_PATH = ":memory:" + + +def get_connection(path: str = DB_PATH) -> sqlite3.Connection: + """Return a raw connection — caller is responsible for closing it.""" + return sqlite3.connect(path) + + +def execute_query( + sql: str, + params: tuple = (), + path: str = DB_PATH, +) -> list[dict[str, Any]]: + """Run a SELECT query — connection left open if an exception is raised.""" + conn = get_connection(path) # no context manager + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + cursor.execute(sql, params) # if this raises, conn is never closed + rows = [dict(row) for row in cursor.fetchall()] + conn.close() # only reached on success + return rows + + +def execute_write( + sql: str, + params: tuple = (), + path: str = DB_PATH, +) -> int: + """Execute a write — connection leaked on error; no rollback on failure.""" + conn = get_connection(path) + cursor = conn.cursor() + cursor.execute(sql, params) # exception here leaks conn + conn.commit() + conn.close() + return cursor.rowcount + + +def export_to_csv( + sql: str, + output_path: str, + path: str = DB_PATH, +) -> int: + """Export results — both file and connection are manually managed.""" + import csv + rows = execute_query(sql, path=path) + if not rows: + return 0 + + csv_file = open(output_path, "w", newline="", encoding="utf-8") # no 'with' + writer = csv.DictWriter(csv_file, fieldnames=rows[0].keys()) + writer.writeheader() + 
writer.writerows(rows) + csv_file.close() # only reached if writerows() doesn't raise + + return len(rows) diff --git a/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/good/database_manager.py b/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/good/database_manager.py new file mode 100644 index 0000000..6f0405e --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_context_manager_for_resources/good/database_manager.py @@ -0,0 +1,67 @@ +"""Database manager providing connection pooling and query helpers.""" +from __future__ import annotations + +import sqlite3 +from contextlib import contextmanager +from typing import Any, Generator, Optional + + +DB_PATH = ":memory:" + + +@contextmanager +def get_connection(path: str = DB_PATH) -> Generator[sqlite3.Connection, None, None]: + """Yield a database connection, committing on success and rolling back on error.""" + conn = sqlite3.connect(path) + try: + yield conn + conn.commit() + except Exception: + conn.rollback() + raise + finally: + conn.close() + + +def execute_query( + sql: str, + params: tuple = (), + path: str = DB_PATH, +) -> list[dict[str, Any]]: + """Run a SELECT query and return rows as dicts — connection closed automatically.""" + with get_connection(path) as conn: + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + cursor.execute(sql, params) + return [dict(row) for row in cursor.fetchall()] + + +def execute_write( + sql: str, + params: tuple = (), + path: str = DB_PATH, +) -> int: + """Execute an INSERT/UPDATE/DELETE and return the number of affected rows.""" + with get_connection(path) as conn: + cursor = conn.cursor() + cursor.execute(sql, params) + return cursor.rowcount + + +def export_to_csv( + sql: str, + output_path: str, + path: str = DB_PATH, +) -> int: + """Export query results to a CSV file using context managers for both resources.""" + import csv + rows = execute_query(sql, path=path) + if not rows: + return 0 + + 
with open(output_path, "w", newline="", encoding="utf-8") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=rows[0].keys()) + writer.writeheader() + writer.writerows(rows) + + return len(rows) diff --git a/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/bad/metrics.rs b/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/bad/metrics.rs new file mode 100644 index 0000000..72a44f8 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/bad/metrics.rs @@ -0,0 +1,48 @@ +pub struct MetricSeries { + pub timestamps: Vec, + pub values: Vec, +} + +impl MetricSeries { + pub fn new(timestamps: Vec, values: Vec) -> Self { + Self { timestamps, values } + } + + // Bad: .cloned() on u64 — u64 is Copy, .copied() is the right choice + pub fn recent_timestamps(&self, n: usize) -> Vec { + self.timestamps + .iter() + .rev() + .take(n) + .cloned() + .collect() + } + + // Bad: .cloned() on f64 — misleadingly suggests Clone behavior + pub fn max_value(&self) -> Option { + self.values.iter().cloned().reduce(f64::max) + } + + pub fn values_above(&self, threshold: f64) -> Vec { + self.values + .iter() + .cloned() + .filter(|&v| v > threshold) + .collect() + } + + pub fn timestamp_range(&self) -> Option<(u64, u64)> { + // Bad: .cloned() on a Copy type throughout + let min = self.timestamps.iter().cloned().min()?; + let max = self.timestamps.iter().cloned().max()?; + Some((min, max)) + } + + // Bad: .cloned() on i32 — i32 implements Copy, not just Clone + pub fn count_ids_above(ids: &[i32], threshold: i32) -> Vec { + ids.iter() + .cloned() + .filter(|&id| id > threshold) + .collect() + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/good/metrics.rs b/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/good/metrics.rs new file mode 100644 index 0000000..f77af03 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_copied_on_copy_types/good/metrics.rs 
@@ -0,0 +1,47 @@ +pub struct MetricSeries { + pub timestamps: Vec, + pub values: Vec, +} + +impl MetricSeries { + pub fn new(timestamps: Vec, values: Vec) -> Self { + Self { timestamps, values } + } + + // u64 is Copy — use .copied() to avoid the misleading suggestion of Clone + pub fn recent_timestamps(&self, n: usize) -> Vec { + self.timestamps + .iter() + .rev() + .take(n) + .copied() + .collect() + } + + // f64 is Copy — .copied() is correct and clear + pub fn max_value(&self) -> Option { + self.values.iter().copied().reduce(f64::max) + } + + pub fn values_above(&self, threshold: f64) -> Vec { + self.values + .iter() + .copied() + .filter(|&v| v > threshold) + .collect() + } + + pub fn timestamp_range(&self) -> Option<(u64, u64)> { + let min = self.timestamps.iter().copied().min()?; + let max = self.timestamps.iter().copied().max()?; + Some((min, max)) + } + + // i32 is Copy — .copied() communicates intent clearly + pub fn count_ids_above(ids: &[i32], threshold: i32) -> Vec { + ids.iter() + .copied() + .filter(|&id| id > threshold) + .collect() + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_mb_string_functions_for_unicode/bad/ProductCatalog.php b/priv/combined_metrics/samples/code_smells/uses_mb_string_functions_for_unicode/bad/ProductCatalog.php new file mode 100644 index 0000000..80ae1f6 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_mb_string_functions_for_unicode/bad/ProductCatalog.php @@ -0,0 +1,62 @@ + String { + // Magic number check instead of nil check + if age == -1 { + return "Age not provided" + } + return "\(age) years old" + } + + func contactSummary() -> String { + var parts: [String] = [email] + // Empty string check instead of nil check + if !phoneNumber.isEmpty { + parts.append(phoneNumber) + } + return parts.joined(separator: " | ") + } + + func loyaltySummary() -> String { + // Magic number check throughout codebase + if loyaltyPoints == -1 { + return "Not enrolled in loyalty program" + } + return 
"\(loyaltyPoints) points" + } + + func hasLocation() -> Bool { + // Magic value comparison: 0/0 is technically a real coordinate + return latitude != 0.0 || longitude != 0.0 + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_optionals_not_sentinel_values/good/UserProfile.swift b/priv/combined_metrics/samples/code_smells/uses_optionals_not_sentinel_values/good/UserProfile.swift new file mode 100644 index 0000000..9215e6a --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_optionals_not_sentinel_values/good/UserProfile.swift @@ -0,0 +1,57 @@ +import Foundation + +struct Address { + let street: String + let city: String + let postalCode: String + let country: String +} + +struct UserProfile { + let id: String + var displayName: String + var email: String + var age: Int? + var phoneNumber: String? + var bio: String? + var address: Address? + var loyaltyPoints: Int? + var lastLoginDate: Date? + + var isPhoneVerified: Bool + var isPremiumMember: Bool + + func formattedAge() -> String { + guard let age = age else { + return "Age not provided" + } + return "\(age) years old" + } + + func contactSummary() -> String { + var parts: [String] = [email] + if let phone = phoneNumber { + parts.append(phone) + } + return parts.joined(separator: " | ") + } + + func loyaltySummary() -> String { + guard let points = loyaltyPoints else { + return "Not enrolled in loyalty program" + } + return "\(points) points" + } +} + +class UserProfileRepository { + private var profiles: [String: UserProfile] = [:] + + func profile(for userID: String) -> UserProfile? 
{ + return profiles[userID] + } + + func update(_ profile: UserProfile) { + profiles[profile.id] = profile + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/bad/server.go b/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/bad/server.go new file mode 100644 index 0000000..0b451f7 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/bad/server.go @@ -0,0 +1,46 @@ +package server + +import ( + "context" + "log" +) + +type Task struct { + ID string + Payload string +} + +type Executor interface { + Execute(ctx context.Context, task Task) error +} + +// TaskServer runs tasks continuously but does not recover from panics. +// A single panic in Execute will crash the entire process. +type TaskServer struct { + executor Executor + tasks <-chan Task + logger *log.Logger +} + +func New(executor Executor, tasks <-chan Task, logger *log.Logger) *TaskServer { + return &TaskServer{executor: executor, tasks: tasks, logger: logger} +} + +// Run starts the task processing loop with no panic recovery. +func (s *TaskServer) Run(ctx context.Context) { + for { + select { + case <-ctx.Done(): + s.logger.Println("task server shutting down") + return + case task, ok := <-s.tasks: + if !ok { + return + } + // No recover — a panic inside Execute terminates the process. 
+ if err := s.executor.Execute(ctx, task); err != nil { + s.logger.Printf("task %s failed: %v", task.ID, err) + } + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/good/server.go b/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/good/server.go new file mode 100644 index 0000000..acd60f4 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_recover_in_long_running_goroutines/good/server.go @@ -0,0 +1,58 @@ +package server + +import ( + "context" + "log" + "time" +) + +type Task struct { + ID string + Payload string +} + +type Executor interface { + Execute(ctx context.Context, task Task) error +} + +// TaskServer runs tasks continuously and recovers from panics so the process stays alive. +type TaskServer struct { + executor Executor + tasks <-chan Task + logger *log.Logger +} + +func New(executor Executor, tasks <-chan Task, logger *log.Logger) *TaskServer { + return &TaskServer{executor: executor, tasks: tasks, logger: logger} +} + +// Run starts the task processing loop. It recovers from panics within each iteration. +func (s *TaskServer) Run(ctx context.Context) { + for { + select { + case <-ctx.Done(): + s.logger.Println("task server shutting down") + return + case task, ok := <-s.tasks: + if !ok { + return + } + s.processWithRecover(ctx, task) + } + } +} + +// processWithRecover wraps task execution in a deferred recover so a panic in +// Execute cannot crash the entire server process. 
+func (s *TaskServer) processWithRecover(ctx context.Context, task Task) { + defer func() { + if r := recover(); r != nil { + s.logger.Printf("panic processing task %s: %v — continuing", task.ID, r) + time.Sleep(100 * time.Millisecond) // brief back-off after panic + } + }() + + if err := s.executor.Execute(ctx, task); err != nil { + s.logger.Printf("task %s failed: %v", task.ID, err) + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/bad/AccessPolicy.cs b/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/bad/AccessPolicy.cs new file mode 100644 index 0000000..3b77421 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/bad/AccessPolicy.cs @@ -0,0 +1,56 @@ +using System; + +namespace Security +{ + public class AccessPolicy + { + public bool CanReadDocument(User user, Document document) + { + // Non-short-circuit & evaluates both sides even if user is null → NullReferenceException + return user != null & document != null + & (user.IsAdmin | document.OwnerId == user.Id | user.HasRole("reader")); + } + + public bool CanEditDocument(User user, Document document) + { + if (user == null | document == null) // | does not short-circuit + return false; + + // | evaluates both sides; IsLocked check runs even when OwnerId doesn't match + return user.IsAdmin | (document.OwnerId == user.Id & !document.IsLocked); + } + + public bool ShouldSendAlert(SystemMetrics metrics) + { + // Non-short-circuit & can throw if metrics is null + return metrics != null + & (metrics.CpuUsage > 90.0 | metrics.MemoryUsage > 85.0) + & metrics.AlertsEnabled; + } + + public string ResolveDisplayName(User user) + { + // Non-short-circuit & evaluates IsNullOrWhiteSpace even when user is null + return user != null & !string.IsNullOrWhiteSpace(user.DisplayName) + ? 
user.DisplayName + : "Anonymous"; + } + + public bool IsValidRequest(ApiRequest request) + { + // All conditions evaluated regardless; throws if request is null + return request != null + & !string.IsNullOrWhiteSpace(request.ApiKey) + & request.Timestamp > DateTimeOffset.UtcNow.AddMinutes(-5) + & request.Payload?.Length <= 1_048_576; + } + + public bool ShouldRetry(HttpResponse response, int attempt) + { + // Non-short-circuit | evaluates status codes even when response is null + return response != null + & attempt < 3 + & (response.StatusCode == 429 | response.StatusCode >= 500); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/good/AccessPolicy.cs b/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/good/AccessPolicy.cs new file mode 100644 index 0000000..1c5b6ea --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_short_circuit_operators/good/AccessPolicy.cs @@ -0,0 +1,54 @@ +using System; + +namespace Security +{ + public class AccessPolicy + { + public bool CanReadDocument(User user, Document document) + { + // Short-circuit: if user is null, the rest is never evaluated + return user != null && document != null + && (user.IsAdmin || document.OwnerId == user.Id || user.HasRole("reader")); + } + + public bool CanEditDocument(User user, Document document) + { + if (user == null || document == null) + return false; + + // Short-circuit: IsAdmin check avoids evaluating the more expensive checks + return user.IsAdmin || (document.OwnerId == user.Id && !document.IsLocked); + } + + public bool ShouldSendAlert(SystemMetrics metrics) + { + // Short-circuit: if metrics is null, subsequent property access is skipped + return metrics != null + && (metrics.CpuUsage > 90.0 || metrics.MemoryUsage > 85.0) + && metrics.AlertsEnabled; + } + + public string ResolveDisplayName(User user) + { + // Short-circuit null coalescing with && guards + return user != null && 
!string.IsNullOrWhiteSpace(user.DisplayName) + ? user.DisplayName + : "Anonymous"; + } + + public bool IsValidRequest(ApiRequest request) + { + return request != null + && !string.IsNullOrWhiteSpace(request.ApiKey) + && request.Timestamp > DateTimeOffset.UtcNow.AddMinutes(-5) + && request.Payload?.Length <= 1_048_576; + } + + public bool ShouldRetry(HttpResponse response, int attempt) + { + return response != null + && attempt < 3 + && (response.StatusCode == 429 || response.StatusCode >= 500); + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/bad/Buffer.cpp b/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/bad/Buffer.cpp new file mode 100644 index 0000000..d1ee596 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/bad/Buffer.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include + +class Buffer { +public: + explicit Buffer(std::size_t capacity) + : capacity_(capacity) + , size_(0) + { + // Raw owning pointer — must be manually deleted; leaks on exception + data_ = new uint8_t[capacity]; + } + + ~Buffer() { + delete[] data_; // relies on correct destructor call; no RAII safety + } + + // Copy constructor not implemented — double-free if copied + Buffer(const Buffer&) = delete; + Buffer& operator=(const Buffer&) = delete; + + void write(const uint8_t* src, std::size_t length) { + if (size_ + length > capacity_) + throw std::overflow_error("Buffer capacity exceeded"); + std::memcpy(data_ + size_, src, length); + size_ += length; + } + + std::size_t read(uint8_t* dst, std::size_t maxLength) const { + std::size_t toRead = std::min(maxLength, size_); + std::memcpy(dst, data_, toRead); + return toRead; + } + + void clear() { size_ = 0; } + std::size_t size() const { return size_; } + +private: + uint8_t* data_; // raw owning pointer — manual memory management + std::size_t capacity_; + std::size_t size_; +}; + +class BufferPool { 
+public: + explicit BufferPool(std::size_t bufferSize, std::size_t poolSize) + : bufferSize_(bufferSize) + { + for (std::size_t i = 0; i < poolSize; ++i) + available_.push_back(new Buffer(bufferSize)); // raw owning pointers in vector + } + + ~BufferPool() { + for (auto* buf : available_) + delete buf; // manual cleanup; leaks if exception thrown before this + } + + Buffer* acquire() { + if (available_.empty()) + return new Buffer(bufferSize_); // caller must delete — ownership unclear + Buffer* buf = available_.back(); + available_.pop_back(); + return buf; + } + + void release(Buffer* buf) { + buf->clear(); + available_.push_back(buf); + } + +private: + std::size_t bufferSize_; + std::vector available_; // vector of raw owning pointers +}; diff --git a/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/good/Buffer.cpp b/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/good/Buffer.cpp new file mode 100644 index 0000000..7dd829d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_smart_pointers_for_ownership/good/Buffer.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +class Buffer { +public: + explicit Buffer(std::size_t capacity) + : data_(std::make_unique(capacity)) + , capacity_(capacity) + , size_(0) + {} + + void write(const uint8_t* src, std::size_t length) { + if (size_ + length > capacity_) + throw std::overflow_error("Buffer capacity exceeded"); + std::copy(src, src + length, data_.get() + size_); + size_ += length; + } + + std::size_t read(uint8_t* dst, std::size_t maxLength) const { + std::size_t toRead = std::min(maxLength, size_); + std::copy(data_.get(), data_.get() + toRead, dst); + return toRead; + } + + void clear() noexcept { size_ = 0; } + std::size_t size() const noexcept { return size_; } + std::size_t capacity() const noexcept { return capacity_; } + +private: + std::unique_ptr data_; // ownership is explicit and automatic + std::size_t capacity_; + 
std::size_t size_; +}; + +class BufferPool { +public: + explicit BufferPool(std::size_t bufferSize, std::size_t poolSize) + : bufferSize_(bufferSize) + { + for (std::size_t i = 0; i < poolSize; ++i) + available_.push_back(std::make_unique(bufferSize)); + } + + std::unique_ptr acquire() { + if (available_.empty()) + return std::make_unique(bufferSize_); + auto buf = std::move(available_.back()); + available_.pop_back(); + return buf; + } + + void release(std::unique_ptr buf) { + buf->clear(); + available_.push_back(std::move(buf)); + } + +private: + std::size_t bufferSize_; + std::vector> available_; +}; diff --git a/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/bad/geometry.rs b/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/bad/geometry.rs new file mode 100644 index 0000000..22f144e --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/bad/geometry.rs @@ -0,0 +1,51 @@ +#[derive(Debug, Clone, Copy)] +pub struct Circle { + pub radius: f64, +} + +impl Circle { + pub fn new(radius: f64) -> Self { + Self { radius } + } + + pub fn area(&self) -> f64 { + // BAD: hardcoded approximation — less precise than std::f64::consts::PI + 3.14159 * self.radius * self.radius + } + + pub fn circumference(&self) -> f64 { + // BAD: 2 * pi approximated — differs from PI at the 6th decimal place + 2.0 * 3.14159 * self.radius + } + + pub fn inscribed_square_side(&self) -> f64 { + // BAD: sqrt(2) hardcoded — use std::f64::consts::SQRT_2 + 2.0 * self.radius / 1.41421 + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Sector { + pub radius: f64, + pub angle: f64, +} + +impl Sector { + pub fn area(&self) -> f64 { + 0.5 * self.radius * self.radius * self.angle + } + + pub fn from_degrees(radius: f64, degrees: f64) -> Self { + // BAD: magic constant instead of TAU or PI from std + Self { radius, angle: degrees * 6.28318 / 360.0 } + } +} + +pub fn degrees_to_radians(degrees: f64) -> f64 { + // BAD: 
literal approximation of PI + degrees * 3.14159265 / 180.0 +} + +pub fn radians_to_degrees(radians: f64) -> f64 { + radians * 180.0 / 3.14159265 +} diff --git a/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/good/geometry.rs b/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/good/geometry.rs new file mode 100644 index 0000000..ca9790f --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_standard_library_constants/good/geometry.rs @@ -0,0 +1,55 @@ +use std::f64::consts::{PI, SQRT_2, TAU}; + +#[derive(Debug, Clone, Copy)] +pub struct Circle { + pub radius: f64, +} + +impl Circle { + pub fn new(radius: f64) -> Self { + Self { radius } + } + + pub fn area(&self) -> f64 { + PI * self.radius * self.radius + } + + pub fn circumference(&self) -> f64 { + TAU * self.radius + } + + pub fn inscribed_square_diagonal(&self) -> f64 { + // Diameter * sqrt(2) / sqrt(2) == diameter, but demonstrates SQRT_2 usage + 2.0 * self.radius / SQRT_2 * SQRT_2 + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Sector { + pub radius: f64, + /// angle in radians + pub angle: f64, +} + +impl Sector { + pub fn arc_length(&self) -> f64 { + self.radius * self.angle + } + + pub fn area(&self) -> f64 { + 0.5 * self.radius * self.radius * self.angle + } + + pub fn from_degrees(radius: f64, degrees: f64) -> Self { + // TAU / 360.0 is exact; using PI * 2.0 / 360.0 would also work + Self { radius, angle: degrees * TAU / 360.0 } + } +} + +pub fn degrees_to_radians(degrees: f64) -> f64 { + degrees * PI / 180.0 +} + +pub fn radians_to_degrees(radians: f64) -> f64 { + radians * 180.0 / PI +} diff --git a/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/bad/HtmlRenderer.cs b/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/bad/HtmlRenderer.cs new file mode 100644 index 0000000..7164ee6 --- /dev/null +++ 
b/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/bad/HtmlRenderer.cs @@ -0,0 +1,61 @@ +using System.Collections.Generic; + +namespace Rendering +{ + public class HtmlRenderer + { + public string RenderTable(IEnumerable rows) + { + // String concatenation in loops creates a new string object on every iteration + string html = "\n \n"; + + foreach (var row in rows) + { + html += " "; + foreach (var cell in row.Cells) + { + html += ""; // O(n²) allocations + } + html += "\n"; + } + + html += " \n
" + Escape(cell) + "
\n"; + return html; + } + + public string RenderList(IEnumerable items, string cssClass) + { + string html = "\n"; // new string every loop + } + + html += "\n"; + return html; + } + + public string RenderReport(ReportData report) + { + string html = "\n\n\n"; + html += "

" + Escape(report.Title) + "

\n"; + + foreach (var section in report.Sections) + { + // Each += allocates a new string on the heap + html += "

" + Escape(section.Heading) + "

\n"; + html += "

" + Escape(section.Body) + "

\n"; + } + + html += "\n\n"; + return html; + } + + private static string Escape(string text) => + text?.Replace("&", "&").Replace("<", "<").Replace(">", ">") ?? string.Empty; + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/good/HtmlRenderer.cs b/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/good/HtmlRenderer.cs new file mode 100644 index 0000000..b9d892a --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_string_builder_for_loop_concatenation/good/HtmlRenderer.cs @@ -0,0 +1,72 @@ +using System.Collections.Generic; +using System.Text; + +namespace Rendering +{ + public class HtmlRenderer + { + public string RenderTable(IEnumerable rows) + { + var sb = new StringBuilder(); + sb.AppendLine(""); + sb.AppendLine(" "); + + foreach (var row in rows) + { + sb.Append(" "); + foreach (var cell in row.Cells) + { + sb.Append(""); + } + sb.AppendLine(""); + } + + sb.AppendLine(" "); + sb.AppendLine("
") + .Append(Escape(cell)) + .Append("
"); + return sb.ToString(); + } + + public string RenderList(IEnumerable items, string cssClass) + { + var sb = new StringBuilder(); + sb.Append(""); + + foreach (var item in items) + { + sb.Append("
  • ").Append(Escape(item)).AppendLine("
  • "); + } + + sb.AppendLine(""); + return sb.ToString(); + } + + public string RenderReport(ReportData report) + { + var sb = new StringBuilder(capacity: 4096); + sb.AppendLine("") + .AppendLine("") + .AppendLine(""); + + sb.Append("

    ").Append(Escape(report.Title)).AppendLine("

    "); + + foreach (var section in report.Sections) + { + sb.Append("

    ").Append(Escape(section.Heading)).AppendLine("

    "); + sb.Append("

    ").Append(Escape(section.Body)).AppendLine("

    "); + } + + sb.AppendLine("") + .AppendLine(""); + + return sb.ToString(); + } + + private static string Escape(string text) => + text?.Replace("&", "&").Replace("<", "<").Replace(">", ">") ?? string.Empty; + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/bad/DataSyncService.kt b/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/bad/DataSyncService.kt new file mode 100644 index 0000000..49db07d --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/bad/DataSyncService.kt @@ -0,0 +1,47 @@ +package com.example.sync + +import kotlinx.coroutines.* + +class DataSyncService( + private val userRepository: UserRepository, + private val orderRepository: OrderRepository, + private val inventoryClient: InventoryClient +) { + + /** + * Launches coroutines using GlobalScope — they are not tied to any lifecycle. + * If the service is destroyed or the app shuts down, these coroutines keep running + * and cannot be cancelled as a group. + */ + fun startPeriodicSync(): Job = GlobalScope.launch(Dispatchers.IO) { + while (isActive) { + syncAll() + delay(60_000) + } + } + + /** + * Each task is launched into GlobalScope independently. + * There is no parent scope to cancel them together, and exceptions + * in one do not cancel the others. 
+ */ + suspend fun syncAll(): SyncReport { + val userJob = GlobalScope.async(Dispatchers.IO) { syncUsers() } + val orderJob = GlobalScope.async(Dispatchers.IO) { syncOrders() } + val inventoryJob = GlobalScope.async(Dispatchers.IO) { syncInventory() } + + return SyncReport( + usersUpdated = userJob.await(), + ordersUpdated = orderJob.await(), + itemsUpdated = inventoryJob.await() + ) + } + + private suspend fun syncUsers(): Int = userRepository.fetchAndUpdate() + private suspend fun syncOrders(): Int = orderRepository.fetchAndUpdate() + private suspend fun syncInventory(): Int = inventoryClient.syncAll() + + fun stop() { + // No-op — nothing to cancel because GlobalScope outlives everything + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/good/DataSyncService.kt b/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/good/DataSyncService.kt new file mode 100644 index 0000000..92cbb33 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_structured_concurrency/good/DataSyncService.kt @@ -0,0 +1,49 @@ +package com.example.sync + +import kotlinx.coroutines.* + +class DataSyncService( + private val userRepository: UserRepository, + private val orderRepository: OrderRepository, + private val inventoryClient: InventoryClient +) { + + // Uses a defined CoroutineScope tied to the service lifecycle + private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO) + + /** + * Launches a background sync job within the service scope. + * When the service is stopped, all child coroutines are cancelled via scope.cancel(). + */ + fun startPeriodicSync(): Job = scope.launch { + while (isActive) { + syncAll() + delay(60_000) + } + } + + /** + * Runs all sync tasks concurrently within a single coroutine scope. + * All tasks are children of the caller's scope — cancellation propagates correctly. 
+ */ + suspend fun syncAll(): SyncReport = coroutineScope { + val userSync = async { syncUsers() } + val orderSync = async { syncOrders() } + val inventorySync = async { syncInventory() } + + SyncReport( + usersUpdated = userSync.await(), + ordersUpdated = orderSync.await(), + itemsUpdated = inventorySync.await() + ) + } + + private suspend fun syncUsers(): Int = userRepository.fetchAndUpdate() + private suspend fun syncOrders(): Int = orderRepository.fetchAndUpdate() + private suspend fun syncInventory(): Int = inventoryClient.syncAll() + + fun stop() { + // Cancels all coroutines launched in this scope + scope.cancel() + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/bad/ReportExporter.cs b/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/bad/ReportExporter.cs new file mode 100644 index 0000000..70fc219 --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/bad/ReportExporter.cs @@ -0,0 +1,81 @@ +using System; +using System.Data.SqlClient; +using System.IO; +using System.Text; + +namespace Reporting +{ + public class ReportExporter + { + private readonly string _connectionString; + + public ReportExporter(string connectionString) + { + _connectionString = connectionString; + } + + public void ExportToFile(string reportName, string outputPath) + { + // Manual resource management — Dispose never called if an exception occurs + var connection = new SqlConnection(_connectionString); + connection.Open(); + + var command = new SqlCommand( + "SELECT * FROM Reports WHERE Name = @name", connection); + command.Parameters.AddWithValue("@name", reportName); + + var reader = command.ExecuteReader(); + var writer = new StreamWriter(outputPath, append: false, Encoding.UTF8); + + writer.WriteLine($"Report: {reportName}"); + writer.WriteLine(new string('-', 40)); + + while (reader.Read()) + { + writer.WriteLine( + $"{reader["Date"]:yyyy-MM-dd} | 
{reader["Value"]:N2}"); + } + + // These Dispose calls are never reached if an exception is thrown above + reader.Dispose(); + command.Dispose(); + writer.Dispose(); + connection.Dispose(); + } + + public byte[] ExportToBytes(int reportId) + { + var memoryStream = new MemoryStream(); + var writer = new StreamWriter(memoryStream, Encoding.UTF8); + + var connection = new SqlConnection(_connectionString); + connection.Open(); + + var command = new SqlCommand( + "SELECT * FROM ReportRows WHERE ReportId = @id ORDER BY RowIndex", connection); + command.Parameters.AddWithValue("@id", reportId); + + var reader = command.ExecuteReader(); + while (reader.Read()) + writer.WriteLine(reader["Content"].ToString()); + + writer.Flush(); + var result = memoryStream.ToArray(); + + // Missing dispose calls on reader, command, connection + writer.Dispose(); + memoryStream.Dispose(); + + return result; + } + + public void CopyReport(string sourcePath, string destinationPath) + { + var source = new FileStream(sourcePath, FileMode.Open, FileAccess.Read); + var destination = new FileStream(destinationPath, FileMode.Create, FileAccess.Write); + source.CopyTo(destination); + source.Dispose(); + destination.Dispose(); // not reached if CopyTo throws + } + } +} diff --git a/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/good/ReportExporter.cs b/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/good/ReportExporter.cs new file mode 100644 index 0000000..5ac04ac --- /dev/null +++ b/priv/combined_metrics/samples/code_smells/uses_using_statement_for_disposables/good/ReportExporter.cs @@ -0,0 +1,66 @@ +using System; +using System.Data.SqlClient; +using System.IO; +using System.Text; + +namespace Reporting +{ + public class ReportExporter + { + private readonly string _connectionString; + + public ReportExporter(string connectionString) + { + _connectionString = connectionString; + } + + public void ExportToFile(string reportName, string 
outputPath) + { + using var connection = new SqlConnection(_connectionString); + connection.Open(); + + using var command = new SqlCommand( + "SELECT * FROM Reports WHERE Name = @name", connection); + command.Parameters.AddWithValue("@name", reportName); + + using var reader = command.ExecuteReader(); + using var writer = new StreamWriter(outputPath, append: false, Encoding.UTF8); + + writer.WriteLine($"Report: {reportName}"); + writer.WriteLine(new string('-', 40)); + + while (reader.Read()) + { + writer.WriteLine( + $"{reader["Date"]:yyyy-MM-dd} | {reader["Value"]:N2}"); + } + } + + public byte[] ExportToBytes(int reportId) + { + using var memoryStream = new MemoryStream(); + using var writer = new StreamWriter(memoryStream, Encoding.UTF8, leaveOpen: true); + + using var connection = new SqlConnection(_connectionString); + connection.Open(); + + using var command = new SqlCommand( + "SELECT * FROM ReportRows WHERE ReportId = @id ORDER BY RowIndex", connection); + command.Parameters.AddWithValue("@id", reportId); + + using var reader = command.ExecuteReader(); + while (reader.Read()) + writer.WriteLine(reader["Content"].ToString()); + + writer.Flush(); + return memoryStream.ToArray(); + } + + public void CopyReport(string sourcePath, string destinationPath) + { + using var source = new FileStream(sourcePath, FileMode.Open, FileAccess.Read); + using var destination = new FileStream(destinationPath, FileMode.Create, FileAccess.Write); + source.CopyTo(destination); + } + } +} diff --git a/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/bad/PaymentGateway.php b/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/bad/PaymentGateway.php new file mode 100644 index 0000000..e72c4f2 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/bad/PaymentGateway.php @@ -0,0 +1,70 @@ +apiKey = $apiKey; + $this->baseUrl = $baseUrl; + } + + // Missing visibility — defaults to public implicitly in 
PHP, but PSR-12 requires explicit declaration + function charge($amountCents, $token) + { + $this->requestCount++; + return $this->post('/v1/charges', ['amount' => $amountCents, 'source' => $token]); + } + + // Missing visibility + function refund($transactionId, $amountCents) + { + $this->requestCount++; + return $this->post('/v1/refunds', ['transaction_id' => $transactionId, 'amount' => $amountCents]); + } + + public function getRequestCount() + { + return $this->requestCount; + } + + // Missing visibility on static method + static function getTotalRequests() + { + return 0; + } + + // Missing visibility + function buildHeaders() + { + return [ + 'Authorization' => "Bearer {$this->apiKey}", + 'Content-Type' => 'application/json', + ]; + } + + // Missing visibility on private method + function post($path, $payload) + { + return []; + } + + // Missing visibility + function buildUrl($path) + { + return rtrim($this->baseUrl, '/') . $path; + } + + // Missing visibility on abstract-style helper + function validateAmount($amountCents) + { + if ($amountCents <= 0) { + throw new \InvalidArgumentException("Amount must be positive"); + } + } +} diff --git a/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/good/PaymentGateway.php b/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/good/PaymentGateway.php new file mode 100644 index 0000000..a90368d --- /dev/null +++ b/priv/combined_metrics/samples/consistency/all_methods_declare_visibility/good/PaymentGateway.php @@ -0,0 +1,73 @@ +apiKey = $apiKey; + $this->baseUrl = $baseUrl; + } + + public function charge(int $amountCents, string $token): array + { + $this->requestCount++; + self::$totalRequests++; + + return $this->post('/v1/charges', ['amount' => $amountCents, 'source' => $token]); + } + + public function refund(string $transactionId, int $amountCents): array + { + $this->requestCount++; + self::$totalRequests++; + + return $this->post('/v1/refunds', ['transaction_id' => 
$transactionId, 'amount' => $amountCents]); + } + + public function getRequestCount(): int + { + return $this->requestCount; + } + + public static function getTotalRequests(): int + { + return self::$totalRequests; + } + + protected function buildHeaders(): array + { + return [ + 'Authorization' => "Bearer {$this->apiKey}", + 'Content-Type' => 'application/json', + ]; + } + + private function post(string $path, array $payload): array + { + $url = $this->buildUrl($path); + // HTTP client logic + return []; + } + + private function buildUrl(string $path): string + { + return rtrim($this->baseUrl, '/') . $path; + } + + private function validateAmount(int $amountCents): void + { + if ($amountCents <= 0) { + throw new \InvalidArgumentException("Amount must be positive: {$amountCents}"); + } + } +} diff --git a/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/bad/OrderService.php b/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/bad/OrderService.php new file mode 100644 index 0000000..fc0fe00 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/bad/OrderService.php @@ -0,0 +1,70 @@ +orderRepository = $orderRepository; + $this->paymentService = $paymentService; + $this->notificationService = $notificationService; + self::$instanceCount++; + } + + public function place($customerId, $items) + { + if (count($items) > self::MAX_ITEMS_PER_ORDER) { + throw new \InvalidArgumentException('Too many items'); + } + + $order = new \stdClass(); + $order->customerId = $customerId; + $order->currency = $this->defaultCurrency; + $order->items = $items; + + $this->orderRepository->save($order); + + if ($this->auditEnabled) { + $this->logPlacement($order); + } + + return $order; + } + + public function cancel($orderId) + { + $order = $this->orderRepository->findById($orderId); + + if ($order === null) { + return false; + } + + $order->status = 'cancelled'; + 
$this->orderRepository->save($order); + + return true; + } + + static function getInstanceCount() // missing visibility on static method + { + return self::$instanceCount; + } + + function logPlacement($order): void // missing visibility on instance method + { + // audit logging + } +} diff --git a/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/good/OrderService.php b/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/good/OrderService.php new file mode 100644 index 0000000..233468f --- /dev/null +++ b/priv/combined_metrics/samples/consistency/all_properties_declare_visibility/good/OrderService.php @@ -0,0 +1,79 @@ +orderRepository = $orderRepository; + $this->paymentService = $paymentService; + $this->notificationService = $notificationService; + self::$instanceCount++; + } + + public function place(int $customerId, array $items): Order + { + if (count($items) > self::MAX_ITEMS_PER_ORDER) { + throw new \InvalidArgumentException( + 'Order exceeds maximum of ' . self::MAX_ITEMS_PER_ORDER . 
' items' + ); + } + + $order = new Order(customerId: $customerId, currency: $this->defaultCurrency); + foreach ($items as $item) { + $order->addItem($item['product_id'], $item['quantity'], $item['unit_price']); + } + + $this->orderRepository->save($order); + + if ($this->auditEnabled) { + $this->logPlacement($order); + } + + return $order; + } + + public function cancel(int $orderId): bool + { + $order = $this->orderRepository->findById($orderId); + + if ($order === null) { + throw new OrderNotFoundException("Order {$orderId} not found"); + } + + $order->setStatus('cancelled'); + $this->orderRepository->save($order); + $this->notificationService->notifyCancellation($order); + + return true; + } + + public static function getInstanceCount(): int + { + return self::$instanceCount; + } + + private function logPlacement(Order $order): void + { + // audit logging + } +} diff --git a/priv/combined_metrics/samples/consistency/consistent_casing_within_file/bad/analytics.ex b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/bad/analytics.ex new file mode 100644 index 0000000..372546b --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/bad/analytics.ex @@ -0,0 +1,78 @@ +defmodule Analytics do + @moduledoc "Tracks and reports on user events and analytics data" + + def track_event(userId, eventName, properties) do + timeStamp = DateTime.utc_now() + eventData = build_event(userId, eventName, properties, timeStamp) + store_event(eventData) + end + + def build_event(user_id, event_name, props, timestamp) do + %{ + userId: user_id, + eventName: event_name, + properties: props, + createdAt: timestamp + } + end + + def get_user_events(userId, opts \\ []) do + page_size = Keyword.get(opts, :pageSize, 20) + startDate = Keyword.get(opts, :start_date) + endDate = Keyword.get(opts, :end_date) + + fetch_events(userId, startDate, endDate, page_size) + end + + def aggregate_events(eventList) do + eventList + |> Enum.group_by(fn 
event -> event.eventName end) + |> Enum.map(fn {event_name, events} -> + event_count = length(events) + {event_name, event_count} + end) + |> Map.new() + end + + def compute_retention(userList, start_date, endDate) do + activeUsers = + userList + |> Enum.filter(fn u -> + last_seen = u.lastSeenAt + DateTime.compare(last_seen, start_date) == :gt and + DateTime.compare(last_seen, endDate) == :lt + end) + + totalUsers = length(userList) + activeCount = length(activeUsers) + + if totalUsers > 0 do + retentionRate = activeCount / totalUsers + {:ok, retentionRate} + else + {:error, :no_users} + end + end + + def format_report(reportData) do + event_count = reportData.totalEvents + uniqueUsers = reportData.unique_users + topEvent = reportData.topEventName + + %{ + summary: "#{event_count} events from #{uniqueUsers} users", + top_event: topEvent, + generatedAt: DateTime.utc_now() + } + end + + def filter_by_property(eventList, propertyKey, propertyVal) do + Enum.filter(eventList, fn event -> + val = Map.get(event.properties, propertyKey) + val == propertyVal + end) + end + + defp store_event(eventData), do: {:ok, eventData} + defp fetch_events(_userId, _start, _end, _pageSize), do: [] +end diff --git a/priv/combined_metrics/samples/consistency/consistent_casing_within_file/config.yml b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/config.yml new file mode 100644 index 0000000..6955881 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/config.yml @@ -0,0 +1 @@ +doc: "A file should use one naming convention throughout — no mixing of camelCase and snake_case for the same kind of identifier." 
diff --git a/priv/combined_metrics/samples/consistency/consistent_casing_within_file/good/analytics.ex b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/good/analytics.ex new file mode 100644 index 0000000..4e6adb5 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_casing_within_file/good/analytics.ex @@ -0,0 +1,78 @@ +defmodule Analytics do + @moduledoc "Tracks and reports on user events and analytics data" + + def track_event(user_id, event_name, properties) do + timestamp = DateTime.utc_now() + event_data = build_event(user_id, event_name, properties, timestamp) + store_event(event_data) + end + + def build_event(user_id, event_name, properties, timestamp) do + %{ + user_id: user_id, + event_name: event_name, + properties: properties, + created_at: timestamp + } + end + + def get_user_events(user_id, opts \\ []) do + page_size = Keyword.get(opts, :page_size, 20) + start_date = Keyword.get(opts, :start_date) + end_date = Keyword.get(opts, :end_date) + + fetch_events(user_id, start_date, end_date, page_size) + end + + def aggregate_events(event_list) do + event_list + |> Enum.group_by(fn event -> event.event_name end) + |> Enum.map(fn {event_name, events} -> + event_count = length(events) + {event_name, event_count} + end) + |> Map.new() + end + + def compute_retention(user_list, start_date, end_date) do + active_users = + user_list + |> Enum.filter(fn user -> + last_seen = user.last_seen_at + DateTime.compare(last_seen, start_date) == :gt and + DateTime.compare(last_seen, end_date) == :lt + end) + + total_users = length(user_list) + active_count = length(active_users) + + if total_users > 0 do + retention_rate = active_count / total_users + {:ok, retention_rate} + else + {:error, :no_users} + end + end + + def format_report(report_data) do + event_count = report_data.total_events + unique_users = report_data.unique_users + top_event = report_data.top_event_name + + %{ + summary: "#{event_count} events from 
#{unique_users} users", + top_event: top_event, + generated_at: DateTime.utc_now() + } + end + + def filter_by_property(event_list, property_key, property_value) do + Enum.filter(event_list, fn event -> + value = Map.get(event.properties, property_key) + value == property_value + end) + end + + defp store_event(event_data), do: {:ok, event_data} + defp fetch_events(_user_id, _start_date, _end_date, _page_size), do: [] +end diff --git a/priv/combined_metrics/samples/consistency/consistent_error_return_shape/bad/accounts.ex b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/bad/accounts.ex new file mode 100644 index 0000000..3acfccd --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/bad/accounts.ex @@ -0,0 +1,102 @@ +defmodule Accounts do + @moduledoc "Manages user accounts and authentication" + + def get_user(id) do + case fetch_user_from_db(id) do + nil -> nil + user -> user + end + end + + def create_user(attrs) do + cond do + Map.get(attrs, :email) == nil -> + "email is required" + + not valid_email?(attrs.email) -> + {:error, "invalid email format"} + + user_exists?(attrs.email) -> + false + + true -> + do_insert_user(attrs) + end + end + + def update_user(id, attrs) do + case fetch_user_from_db(id) do + nil -> + {:error, :not_found} + + user -> + case validate_attrs(attrs) do + false -> "validation failed" + true -> do_update_user(user, attrs) + end + end + end + + def delete_user(id) do + case fetch_user_from_db(id) do + nil -> false + user -> do_delete_user(user) + end + end + + def authenticate(email, password) do + case fetch_user_by_email(email) do + nil -> + {:error, :not_found} + + user -> + if check_password(user, password) do + {:ok, user} + else + "invalid password" + end + end + end + + def change_password(user, old_password, new_password) do + cond do + not check_password(user, old_password) -> + {:error, :wrong_password} + + String.length(new_password) < 8 -> + nil + + true -> + 
do_update_password(user, new_password) + end + end + + def list_users(filters \\ %{}) do + case fetch_all_users(filters) do + [] -> false + users -> users + end + end + + def verify_email(user, token) do + case validate_token(token) do + :invalid -> "token is invalid" + :expired -> {:error, :token_expired} + :ok -> do_verify_email(user) + end + end + + defp fetch_user_from_db(_id), do: nil + defp fetch_user_by_email(_email), do: nil + defp fetch_all_users(_filters), do: [] + defp do_insert_user(attrs), do: {:ok, attrs} + defp do_update_user(user, _attrs), do: {:ok, user} + defp do_delete_user(_user), do: {:ok, :deleted} + defp do_update_password(user, _pw), do: {:ok, user} + defp do_verify_email(user), do: {:ok, user} + defp valid_email?(_email), do: true + defp user_exists?(_email), do: false + defp validate_attrs(_attrs), do: true + defp check_password(_user, _pw), do: true + defp validate_token(_token), do: :ok +end diff --git a/priv/combined_metrics/samples/consistency/consistent_error_return_shape/config.yml b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/config.yml new file mode 100644 index 0000000..5b61d8f --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/config.yml @@ -0,0 +1 @@ +doc: "All functions in a module should return errors in the same shape — mixed `nil`, `false`, and `{:error, _}` returns are confusing." 
diff --git a/priv/combined_metrics/samples/consistency/consistent_error_return_shape/good/accounts.ex b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/good/accounts.ex new file mode 100644 index 0000000..045cb19 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_error_return_shape/good/accounts.ex @@ -0,0 +1,94 @@ +defmodule Accounts do + @moduledoc "Manages user accounts and authentication" + + def get_user(id) do + case fetch_user_from_db(id) do + nil -> {:error, :not_found} + user -> {:ok, user} + end + end + + def create_user(attrs) do + cond do + Map.get(attrs, :email) == nil -> + {:error, :email_required} + + not valid_email?(attrs.email) -> + {:error, :invalid_email} + + user_exists?(attrs.email) -> + {:error, :email_taken} + + true -> + do_insert_user(attrs) + end + end + + def update_user(id, attrs) do + with {:ok, user} <- get_user(id), + :ok <- validate_attrs(attrs) do + do_update_user(user, attrs) + end + end + + def delete_user(id) do + case fetch_user_from_db(id) do + nil -> {:error, :not_found} + user -> do_delete_user(user) + end + end + + def authenticate(email, password) do + case fetch_user_by_email(email) do + nil -> + {:error, :not_found} + + user -> + if check_password(user, password) do + {:ok, user} + else + {:error, :invalid_password} + end + end + end + + def change_password(user, old_password, new_password) do + cond do + not check_password(user, old_password) -> + {:error, :wrong_password} + + String.length(new_password) < 8 -> + {:error, :password_too_short} + + true -> + do_update_password(user, new_password) + end + end + + def list_users(filters \\ %{}) do + users = fetch_all_users(filters) + {:ok, users} + end + + def verify_email(user, token) do + case validate_token(token) do + :invalid -> {:error, :invalid_token} + :expired -> {:error, :token_expired} + :ok -> do_verify_email(user) + end + end + + defp fetch_user_from_db(_id), do: nil + defp fetch_user_by_email(_email), do: nil + 
defp fetch_all_users(_filters), do: [] + defp do_insert_user(attrs), do: {:ok, attrs} + defp do_update_user(user, _attrs), do: {:ok, user} + defp do_delete_user(_user), do: {:ok, :deleted} + defp do_update_password(user, _pw), do: {:ok, user} + defp do_verify_email(user), do: {:ok, user} + defp valid_email?(_email), do: true + defp user_exists?(_email), do: false + defp validate_attrs(_attrs), do: :ok + defp check_password(_user, _pw), do: true + defp validate_token(_token), do: :ok +end diff --git a/priv/combined_metrics/samples/consistency/consistent_function_style/bad/formatter.ex b/priv/combined_metrics/samples/consistency/consistent_function_style/bad/formatter.ex new file mode 100644 index 0000000..44ee61e --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_function_style/bad/formatter.ex @@ -0,0 +1,66 @@ +defmodule Formatter do + @moduledoc "Formats and serializes various data types for display and output" + + def format_name(first, last), do: "#{first} #{last}" + + def format_full_address(address) do + "#{address.street}, #{address.city}, #{address.state} #{address.zip}" + end + + def format_price(cents), do: "$#{:erlang.float_to_binary(cents / 100, decimals: 2)}" + + def format_date(date) do + "#{date.year}-#{pad(date.month)}-#{pad(date.day)}" + end + + def format_phone(digits), do: "(#{String.slice(digits, 0, 3)}) #{String.slice(digits, 3, 3)}-#{String.slice(digits, 6, 4)}" + + def format_percentage(value) do + rounded = Float.round(value * 100, 1) + "#{rounded}%" + end + + def format_bytes(bytes), do: if(bytes < 1024, do: "#{bytes} B", else: "#{Float.round(bytes / 1024, 1)} KB") + + def format_duration(seconds) do + minutes = div(seconds, 60) + remaining = rem(seconds, 60) + "#{minutes}m #{remaining}s" + end + + def serialize_user(user), do: %{id: user.id, name: format_name(user.first_name, user.last_name), email: user.email} + + def serialize_order(order) do + %{ + id: order.id, + total: format_price(order.total_cents), + placed_at: 
format_date(order.inserted_at), + items: Enum.map(order.items, &serialize_order_item/1) + } + end + + def serialize_order_item(item), do: %{name: item.name, quantity: item.quantity, unit_price: format_price(item.unit_price_cents)} + + def truncate(text, max_length) do + if String.length(text) > max_length do + String.slice(text, 0, max_length - 3) <> "..." + else + text + end + end + + def slugify(text), do: text |> String.downcase() |> String.replace(~r/[^a-z0-9]+/, "-") |> String.trim("-") + + def format_list(items) do + case length(items) do + 0 -> "none" + 1 -> hd(items) + 2 -> "#{Enum.at(items, 0)} and #{Enum.at(items, 1)}" + _ -> + all_but_last = Enum.join(Enum.drop(items, -1), ", ") + "#{all_but_last}, and #{List.last(items)}" + end + end + + defp pad(n), do: String.pad_leading(Integer.to_string(n), 2, "0") +end diff --git a/priv/combined_metrics/samples/consistency/consistent_function_style/config.yml b/priv/combined_metrics/samples/consistency/consistent_function_style/config.yml new file mode 100644 index 0000000..28e73a2 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_function_style/config.yml @@ -0,0 +1 @@ +doc: "A module should not mix one-liner and multi-clause function definitions for the same concern." 
diff --git a/priv/combined_metrics/samples/consistency/consistent_function_style/good/formatter.ex b/priv/combined_metrics/samples/consistency/consistent_function_style/good/formatter.ex new file mode 100644 index 0000000..1f43bed --- /dev/null +++ b/priv/combined_metrics/samples/consistency/consistent_function_style/good/formatter.ex @@ -0,0 +1,96 @@ +defmodule Formatter do + @moduledoc "Formats and serializes various data types for display and output" + + def format_name(first, last) do + "#{first} #{last}" + end + + def format_full_address(address) do + "#{address.street}, #{address.city}, #{address.state} #{address.zip}" + end + + def format_price(cents) do + "$#{:erlang.float_to_binary(cents / 100, decimals: 2)}" + end + + def format_date(date) do + "#{date.year}-#{pad(date.month)}-#{pad(date.day)}" + end + + def format_phone(digits) do + area = String.slice(digits, 0, 3) + prefix = String.slice(digits, 3, 3) + line = String.slice(digits, 6, 4) + "(#{area}) #{prefix}-#{line}" + end + + def format_percentage(value) do + rounded = Float.round(value * 100, 1) + "#{rounded}%" + end + + def format_bytes(bytes) when bytes < 1024 do + "#{bytes} B" + end + + def format_bytes(bytes) do + "#{Float.round(bytes / 1024, 1)} KB" + end + + def format_duration(seconds) do + minutes = div(seconds, 60) + remaining = rem(seconds, 60) + "#{minutes}m #{remaining}s" + end + + def serialize_user(user) do + %{ + id: user.id, + name: format_name(user.first_name, user.last_name), + email: user.email + } + end + + def serialize_order(order) do + %{ + id: order.id, + total: format_price(order.total_cents), + placed_at: format_date(order.inserted_at), + items: Enum.map(order.items, &serialize_order_item/1) + } + end + + def serialize_order_item(item) do + %{ + name: item.name, + quantity: item.quantity, + unit_price: format_price(item.unit_price_cents) + } + end + + def truncate(text, max_length) do + if String.length(text) > max_length do + String.slice(text, 0, max_length - 3) <> "..." 
+ else + text + end + end + + def slugify(text) do + text + |> String.downcase() + |> String.replace(~r/[^a-z0-9]+/, "-") + |> String.trim("-") + end + + def format_list([]), do: "none" + def format_list([item]), do: item + def format_list([a, b]), do: "#{a} and #{b}" + + def format_list(items) do + all_but_last = items |> Enum.drop(-1) |> Enum.join(", ") + "#{all_but_last}, and #{List.last(items)}" + end + + defp pad(n), do: String.pad_leading(Integer.to_string(n), 2, "0") +end diff --git a/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/bad/subscriptions.ex b/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/bad/subscriptions.ex new file mode 100644 index 0000000..619771b --- /dev/null +++ b/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/bad/subscriptions.ex @@ -0,0 +1,58 @@ +defmodule MyApp.Subscriptions do + # Bad: no @moduledoc at all — public module with no documentation + # The module purpose, lifecycle, and conventions are undocumented. + + alias MyApp.Subscriptions.{Subscription, Plan} + alias MyApp.Repo + + # Bad: using a plain comment instead of @doc for a public function. + # Consumers cannot use `h MyApp.Subscriptions.create/2` in IEx. + # Creates a new subscription + @spec create(integer(), Plan.t()) :: {:ok, Subscription.t()} | {:error, Ecto.Changeset.t()} + def create(customer_id, %Plan{} = plan) do + initial_status = if plan.trial_days > 0, do: :trialing, else: :active + + trial_ends_at = + if plan.trial_days > 0 do + DateTime.add(DateTime.utc_now(), plan.trial_days * 86_400, :second) + end + + %Subscription{} + |> Subscription.changeset(%{ + customer_id: customer_id, + plan_id: plan.id, + status: initial_status, + trial_ends_at: trial_ends_at + }) + |> Repo.insert() + end + + @doc """ + Cancels the subscription. + + Implementation note: we first check if the changeset is valid by calling + Subscription.changeset/2, then call Repo.update/1. 
The Subscription schema + has a :cancelled_at field that gets set here. We also emit a telemetry event + by calling :telemetry.execute/3 with the [:my_app, :subscriptions, :transitioned] + event name. The metadata map has :from and :to keys. The Repo is aliased at the + top of this module. We use DateTime.utc_now() for the timestamp. + """ + # Bad: @doc describes the implementation in exhaustive detail — not the contract. + # The doc should explain what the function does for callers, not how it works internally. + @spec cancel(Subscription.t()) :: {:ok, Subscription.t()} | {:error, Ecto.Changeset.t()} + def cancel(%Subscription{status: :cancelled} = sub), do: {:ok, sub} + + def cancel(%Subscription{} = sub) do + sub + |> Subscription.changeset(%{status: :cancelled, cancelled_at: DateTime.utc_now()}) + |> Repo.update() + end + + # Bad: no @doc on a public function — leaves callers guessing + @spec reactivate(Subscription.t()) :: {:ok, Subscription.t()} | {:error, atom()} + def reactivate(%Subscription{} = sub) do + sub + |> Subscription.changeset(%{status: :active, cancelled_at: nil}) + |> Repo.update() + end +end diff --git a/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/good/subscriptions.ex b/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/good/subscriptions.ex new file mode 100644 index 0000000..fab6629 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/doc_vs_comment_separation/good/subscriptions.ex @@ -0,0 +1,67 @@ +defmodule MyApp.Subscriptions do + @moduledoc """ + Public API for managing customer subscriptions. + + Subscriptions move through the following lifecycle: + `:trialing` -> `:active` -> `:past_due` -> `:cancelled` + + All state transitions emit a telemetry event under + `[:my_app, :subscriptions, :transitioned]`. + """ + + alias MyApp.Subscriptions.{Subscription, Plan} + alias MyApp.Repo + + @doc """ + Creates a new subscription for the given customer on the specified plan. 
+ + Returns `{:ok, subscription}` on success, or `{:error, changeset}` when + validation fails (e.g. the customer already has an active subscription). + """ + @spec create(integer(), Plan.t()) :: {:ok, Subscription.t()} | {:error, Ecto.Changeset.t()} + def create(customer_id, %Plan{} = plan) do + # Determine initial status: new customers start in a trial period + initial_status = if plan.trial_days > 0, do: :trialing, else: :active + + # Compute trial end date; nil when the plan has no trial + trial_ends_at = + if plan.trial_days > 0 do + DateTime.add(DateTime.utc_now(), plan.trial_days * 86_400, :second) + end + + %Subscription{} + |> Subscription.changeset(%{ + customer_id: customer_id, + plan_id: plan.id, + status: initial_status, + trial_ends_at: trial_ends_at + }) + |> Repo.insert() + end + + @doc """ + Cancels a subscription immediately. + + If the subscription is already cancelled this is a no-op and + `{:ok, subscription}` is still returned. + """ + @spec cancel(Subscription.t()) :: {:ok, Subscription.t()} | {:error, Ecto.Changeset.t()} + def cancel(%Subscription{status: :cancelled} = sub), do: {:ok, sub} + + def cancel(%Subscription{} = sub) do + sub + |> Subscription.changeset(%{status: :cancelled, cancelled_at: DateTime.utc_now()}) + |> Repo.update() + |> tap(fn + {:ok, updated} -> + # Emit telemetry so billing can react to the cancellation + :telemetry.execute([:my_app, :subscriptions, :transitioned], %{}, %{ + from: sub.status, + to: updated.status + }) + + _ -> + :ok + end) + end +end diff --git a/priv/combined_metrics/samples/consistency/overloads_are_contiguous/bad/PaymentProcessor.java b/priv/combined_metrics/samples/consistency/overloads_are_contiguous/bad/PaymentProcessor.java new file mode 100644 index 0000000..40e5226 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/overloads_are_contiguous/bad/PaymentProcessor.java @@ -0,0 +1,62 @@ +package com.example.payments; + +import java.math.BigDecimal; +import java.util.Currency; +import 
java.util.Locale; + +public class PaymentProcessor { + + private final PaymentGateway gateway; + private final AuditLog auditLog; + + public PaymentProcessor(PaymentGateway gateway, AuditLog auditLog) { + this.gateway = gateway; + this.auditLog = auditLog; + } + + // First process() overload + public PaymentResult process(PaymentRequest request) { + return process(request, Currency.getInstance(Locale.US)); + } + + // refund() overload interspersed between process() overloads + public RefundResult refund(String transactionId) { + return refund(transactionId, null); + } + + // Second process() overload — separated from the first by refund() + public PaymentResult process(PaymentRequest request, Currency currency) { + return process(request, currency, false); + } + + public boolean isHealthy() { + return gateway.ping(); + } + + // Third process() overload — far from the other two + public PaymentResult process(PaymentRequest request, Currency currency, boolean capture) { + ChargeRequest charge = ChargeRequest.builder() + .amount(request.getAmount()) + .currency(currency) + .capture(capture) + .token(request.getPaymentToken()) + .build(); + PaymentResult result = gateway.charge(charge); + auditLog.record(request, result); + return result; + } + + public PaymentSummary summarize(String merchantId) { + return gateway.fetchSummary(merchantId); + } + + // Second refund() overload — separated from the first by three other methods + public RefundResult refund(String transactionId, BigDecimal amount) { + RefundRequest refund = amount != null + ? 
RefundRequest.partial(transactionId, amount) + : RefundRequest.full(transactionId); + RefundResult result = gateway.refund(refund); + auditLog.record(refund, result); + return result; + } +} diff --git a/priv/combined_metrics/samples/consistency/overloads_are_contiguous/good/PaymentProcessor.java b/priv/combined_metrics/samples/consistency/overloads_are_contiguous/good/PaymentProcessor.java new file mode 100644 index 0000000..4c30cc7 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/overloads_are_contiguous/good/PaymentProcessor.java @@ -0,0 +1,59 @@ +package com.example.payments; + +import java.math.BigDecimal; +import java.util.Currency; +import java.util.Locale; + +public class PaymentProcessor { + + private final PaymentGateway gateway; + private final AuditLog auditLog; + + public PaymentProcessor(PaymentGateway gateway, AuditLog auditLog) { + this.gateway = gateway; + this.auditLog = auditLog; + } + + // All process() overloads are grouped together + public PaymentResult process(PaymentRequest request) { + return process(request, Currency.getInstance(Locale.US)); + } + + public PaymentResult process(PaymentRequest request, Currency currency) { + return process(request, currency, false); + } + + public PaymentResult process(PaymentRequest request, Currency currency, boolean capture) { + ChargeRequest charge = ChargeRequest.builder() + .amount(request.getAmount()) + .currency(currency) + .capture(capture) + .token(request.getPaymentToken()) + .build(); + PaymentResult result = gateway.charge(charge); + auditLog.record(request, result); + return result; + } + + // All refund() overloads are grouped together + public RefundResult refund(String transactionId) { + return refund(transactionId, null); + } + + public RefundResult refund(String transactionId, BigDecimal amount) { + RefundRequest refund = amount != null + ? 
RefundRequest.partial(transactionId, amount) + : RefundRequest.full(transactionId); + RefundResult result = gateway.refund(refund); + auditLog.record(refund, result); + return result; + } + + public boolean isHealthy() { + return gateway.ping(); + } + + public PaymentSummary summarize(String merchantId) { + return gateway.fetchSummary(merchantId); + } +} diff --git a/priv/combined_metrics/samples/consistency/override_annotation_present/bad/UserRepository.java b/priv/combined_metrics/samples/consistency/override_annotation_present/bad/UserRepository.java new file mode 100644 index 0000000..71ef3cd --- /dev/null +++ b/priv/combined_metrics/samples/consistency/override_annotation_present/bad/UserRepository.java @@ -0,0 +1,88 @@ +package com.example.users; + +import java.util.List; +import java.util.Objects; + +public class UserRepository extends AbstractRepository implements Auditable { + + private final DataSource dataSource; + + public UserRepository(DataSource dataSource) { + this.dataSource = dataSource; + } + + // Missing @Override — not obvious this is implementing the abstract method + public User findById(long id) { + return dataSource.query( + "SELECT * FROM users WHERE id = ?", + ps -> ps.setLong(1, id), + UserRepository::mapRow + ); + } + + // Missing @Override + public List findAll() { + return dataSource.queryList( + "SELECT * FROM users ORDER BY created_at DESC", + UserRepository::mapRow + ); + } + + // Missing @Override + public void save(User user) { + if (user.getId() == null) { + insert(user); + } else { + update(user); + } + } + + // Missing @Override + public void delete(long id) { + dataSource.execute("DELETE FROM users WHERE id = ?", ps -> ps.setLong(1, id)); + } + + // Missing @Override — implementing interface method without annotation + public String auditLabel() { + return "users"; + } + + // Missing @Override on Object methods + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof UserRepository)) return 
false; + UserRepository that = (UserRepository) o; + return Objects.equals(dataSource, that.dataSource); + } + + // Missing @Override + public int hashCode() { + return Objects.hash(dataSource); + } + + // Missing @Override + public String toString() { + return "UserRepository{dataSource=" + dataSource + "}"; + } + + private void insert(User user) { + dataSource.execute( + "INSERT INTO users (email, name, created_at) VALUES (?, ?, NOW())", + ps -> { + ps.setString(1, user.getEmail()); + ps.setString(2, user.getName()); + } + ); + } + + private void update(User user) { + dataSource.execute( + "UPDATE users SET email = ?, name = ? WHERE id = ?", + ps -> { + ps.setString(1, user.getEmail()); + ps.setString(2, user.getName()); + ps.setLong(3, user.getId()); + } + ); + } +} diff --git a/priv/combined_metrics/samples/consistency/override_annotation_present/good/UserRepository.java b/priv/combined_metrics/samples/consistency/override_annotation_present/good/UserRepository.java new file mode 100644 index 0000000..0e1124e --- /dev/null +++ b/priv/combined_metrics/samples/consistency/override_annotation_present/good/UserRepository.java @@ -0,0 +1,92 @@ +package com.example.users; + +import java.util.List; +import java.util.Objects; + +public class UserRepository extends AbstractRepository implements Auditable { + + private final DataSource dataSource; + + public UserRepository(DataSource dataSource) { + this.dataSource = dataSource; + } + + @Override + public User findById(long id) { + return dataSource.query( + "SELECT * FROM users WHERE id = ?", + ps -> ps.setLong(1, id), + UserRepository::mapRow + ); + } + + @Override + public List findAll() { + return dataSource.queryList( + "SELECT * FROM users ORDER BY created_at DESC", + UserRepository::mapRow + ); + } + + @Override + public void save(User user) { + if (user.getId() == null) { + insert(user); + } else { + update(user); + } + } + + @Override + public void delete(long id) { + dataSource.execute("DELETE FROM users WHERE 
id = ?", ps -> ps.setLong(1, id)); + } + + @Override + public String auditLabel() { + return "users"; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof UserRepository)) return false; + UserRepository that = (UserRepository) o; + return Objects.equals(dataSource, that.dataSource); + } + + @Override + public int hashCode() { + return Objects.hash(dataSource); + } + + @Override + public String toString() { + return "UserRepository{dataSource=" + dataSource + "}"; + } + + private void insert(User user) { + dataSource.execute( + "INSERT INTO users (email, name, created_at) VALUES (?, ?, NOW())", + ps -> { + ps.setString(1, user.getEmail()); + ps.setString(2, user.getName()); + } + ); + } + + private void update(User user) { + dataSource.execute( + "UPDATE users SET email = ?, name = ? WHERE id = ?", + ps -> { + ps.setString(1, user.getEmail()); + ps.setString(2, user.getName()); + ps.setLong(3, user.getId()); + } + ); + } + + private static User mapRow(ResultSet rs) throws SQLException { + return new User(rs.getLong("id"), rs.getString("email"), rs.getString("name")); + } +} diff --git a/priv/combined_metrics/samples/consistency/process_interactions_centralized/bad/sessions.ex b/priv/combined_metrics/samples/consistency/process_interactions_centralized/bad/sessions.ex new file mode 100644 index 0000000..9c67252 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/process_interactions_centralized/bad/sessions.ex @@ -0,0 +1,60 @@ +defmodule MyApp.SessionStore do + @moduledoc "Holds session state." 
+ + use GenServer + + def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__) + + @impl true + def init(_opts), do: {:ok, %{sessions: %{}}} + + @impl true + def handle_call({:put, token, session}, _from, state) do + {:reply, :ok, put_in(state, [:sessions, token], session)} + end + + def handle_call({:get, token}, _from, state) do + {:reply, Map.get(state.sessions, token), state} + end + + def handle_call(:count, _from, state) do + {:reply, map_size(state.sessions), state} + end + + @impl true + def handle_cast({:delete, token}, state) do + {:noreply, update_in(state, [:sessions], &Map.delete(&1, token))} + end +end + +# Bad: MyApp.AuthController calls GenServer directly instead of going through a facade +defmodule MyApp.AuthController do + def login(conn, %{"token" => token}) do + # Bad: direct GenServer call scattered in a controller + session = GenServer.call(MyApp.SessionStore, {:get, token}) + # ... + end +end + +# Bad: MyApp.Plugs.LoadSession also calls GenServer directly — duplication +defmodule MyApp.Plugs.LoadSession do + def call(conn, _opts) do + token = get_session_token(conn) + # Bad: same GenServer call repeated here — no single facade + session = GenServer.call(MyApp.SessionStore, {:get, token}) + assign(conn, :current_session, session) + end + + defp get_session_token(conn), do: Plug.Conn.get_req_header(conn, "x-session-token") |> List.first() + defp assign(conn, key, value), do: Map.put(conn, key, value) +end + +# Bad: yet another module talking directly to the GenServer +defmodule MyApp.SessionCleanup do + def delete_expired(tokens) do + Enum.each(tokens, fn token -> + # Bad: direct cast, not going through any facade + GenServer.cast(MyApp.SessionStore, {:delete, token}) + end) + end +end diff --git a/priv/combined_metrics/samples/consistency/process_interactions_centralized/good/sessions.ex b/priv/combined_metrics/samples/consistency/process_interactions_centralized/good/sessions.ex new file mode 100644 index 
0000000..9690d41 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/process_interactions_centralized/good/sessions.ex @@ -0,0 +1,76 @@ +defmodule MyApp.SessionStore do + @moduledoc """ + Facade for all interactions with the session cache GenServer. + All GenServer calls are centralised here — no other module calls + `GenServer.call/cast` on the session process directly. + """ + + use GenServer + + alias MyApp.Sessions.Session + + # --- Public API (the facade) --- + + def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__) + + @doc "Stores a session, returning the token." + @spec put(Session.t()) :: String.t() + def put(%Session{} = session) do + token = generate_token() + GenServer.call(__MODULE__, {:put, token, session}) + token + end + + @doc "Retrieves a session by token." + @spec get(String.t()) :: Session.t() | nil + def get(token) when is_binary(token) do + GenServer.call(__MODULE__, {:get, token}) + end + + @doc "Deletes a session by token." + @spec delete(String.t()) :: :ok + def delete(token) when is_binary(token) do + GenServer.cast(__MODULE__, {:delete, token}) + end + + @doc "Extends a session's TTL by the given number of seconds." + @spec touch(String.t(), pos_integer()) :: :ok + def touch(token, ttl_seconds) when is_binary(token) and is_integer(ttl_seconds) do + GenServer.cast(__MODULE__, {:touch, token, ttl_seconds}) + end + + @doc "Returns the number of active sessions." 
+ @spec count() :: non_neg_integer() + def count do + GenServer.call(__MODULE__, :count) + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts), do: {:ok, %{sessions: %{}, expiry: %{}}} + + @impl true + def handle_call({:put, token, session}, _from, state) do + {:reply, :ok, put_in(state, [:sessions, token], session)} + end + + def handle_call({:get, token}, _from, state) do + {:reply, Map.get(state.sessions, token), state} + end + + def handle_call(:count, _from, state) do + {:reply, map_size(state.sessions), state} + end + + @impl true + def handle_cast({:delete, token}, state) do + {:noreply, update_in(state, [:sessions], &Map.delete(&1, token))} + end + + def handle_cast({:touch, _token, _ttl}, state) do + {:noreply, state} + end + + defp generate_token, do: Base.url_encode64(:crypto.strong_rand_bytes(32), padding: false) +end diff --git a/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/bad/PaymentProcessor.swift b/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/bad/PaymentProcessor.swift new file mode 100644 index 0000000..7161470 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/bad/PaymentProcessor.swift @@ -0,0 +1,53 @@ +import Foundation + +enum PaymentMethod { + case creditCard(last4: String) + case bankTransfer(accountNumber: String) + case digitalWallet(provider: String) +} + +enum PaymentStatus { + case pending, authorized, captured, refunded, failed +} + +// All protocol conformances declared inline with the primary type, +// making it harder to locate core logic vs. protocol implementations +struct PaymentTransaction: Equatable, Hashable, CustomStringConvertible, Codable { + let id: String + let amount: Decimal + let currency: String + let method: PaymentMethod + var status: PaymentStatus + let createdAt: Date + var completedAt: Date? + var failureReason: String? 
+ + // Equatable mixed in with stored properties + static func == (lhs: PaymentTransaction, rhs: PaymentTransaction) -> Bool { + return lhs.id == rhs.id + } + + // Hashable mixed in with stored properties + func hash(into hasher: inout Hasher) { + hasher.combine(id) + } + + // CustomStringConvertible mixed in with stored properties + var description: String { + return "PaymentTransaction(id: \(id), amount: \(amount) \(currency), status: \(status))" + } + + // Codable CodingKeys mixed in with stored properties + enum CodingKeys: String, CodingKey { + case id, amount, currency, status, createdAt, completedAt, failureReason + } + + // Business logic buried alongside protocol boilerplate + func isRefundable() -> Bool { + return status == .captured && completedAt != nil + } + + func summary() -> String { + return "\(currency) \(amount) via \(method)" + } +} diff --git a/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/good/PaymentProcessor.swift b/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/good/PaymentProcessor.swift new file mode 100644 index 0000000..f7cfcad --- /dev/null +++ b/priv/combined_metrics/samples/consistency/protocol_conformance_in_separate_extension/good/PaymentProcessor.swift @@ -0,0 +1,51 @@ +import Foundation + +enum PaymentMethod { + case creditCard(last4: String) + case bankTransfer(accountNumber: String) + case digitalWallet(provider: String) +} + +enum PaymentStatus { + case pending, authorized, captured, refunded, failed +} + +// Primary type definition — only core stored properties and init +struct PaymentTransaction { + let id: String + let amount: Decimal + let currency: String + let method: PaymentMethod + var status: PaymentStatus + let createdAt: Date + var completedAt: Date? + var failureReason: String? 
+} + +// MARK: - Equatable conformance in its own extension +extension PaymentTransaction: Equatable { + static func == (lhs: PaymentTransaction, rhs: PaymentTransaction) -> Bool { + return lhs.id == rhs.id + } +} + +// MARK: - Hashable conformance in its own extension +extension PaymentTransaction: Hashable { + func hash(into hasher: inout Hasher) { + hasher.combine(id) + } +} + +// MARK: - CustomStringConvertible in its own extension +extension PaymentTransaction: CustomStringConvertible { + var description: String { + return "PaymentTransaction(id: \(id), amount: \(amount) \(currency), status: \(status))" + } +} + +// MARK: - Codable in its own extension +extension PaymentTransaction: Codable { + enum CodingKeys: String, CodingKey { + case id, amount, currency, status, createdAt, completedAt, failureReason + } +} diff --git a/priv/combined_metrics/samples/consistency/same_concept_same_name/bad/auth.ex b/priv/combined_metrics/samples/consistency/same_concept_same_name/bad/auth.ex new file mode 100644 index 0000000..3dbbbc3 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/same_concept_same_name/bad/auth.ex @@ -0,0 +1,102 @@ +defmodule Auth do + @moduledoc "Handles authentication and session management" + + def login(account, password) do + case fetch_user(account.email) do + nil -> + {:error, :not_found} + + u -> + if verify_password(u, password) do + token = generate_token(u.id) + {:ok, token} + else + {:error, :invalid_credentials} + end + end + end + + def logout(usr) do + case fetch_active_session(usr.id) do + nil -> {:error, :no_session} + session -> invalidate_session(session) + end + end + + def register(params) do + with :ok <- validate_registration(params), + {:ok, new_account} <- create_user(params), + {:ok, _} <- send_verification_email(new_account) do + {:ok, new_account} + end + end + + def verify_token(token_string) do + case decode_token(token_string) do + {:ok, claims} -> + account = fetch_user_by_id(claims["sub"]) + {:ok, account} + + 
{:error, reason} -> + {:error, reason} + end + end + + def change_password(u, old_pw, new_pw) do + if verify_password(u, old_pw) do + hashed = hash_password(new_pw) + update_user_password(u.id, hashed) + else + {:error, :wrong_password} + end + end + + def request_password_reset(email_address) do + case fetch_user(email_address) do + nil -> + {:error, :not_found} + + account -> + reset_token = generate_reset_token(account.id) + send_reset_email(account.email, reset_token) + end + end + + def reset_password(reset_token, new_pw) do + with {:ok, usr_id} <- validate_reset_token(reset_token), + account <- fetch_user_by_id(usr_id), + hashed = hash_password(new_pw), + {:ok, updated} <- update_user_password(account.id, hashed) do + {:ok, updated} + end + end + + def list_sessions(account) do + fetch_sessions_for_user(account.id) + end + + def revoke_session(u, session_id) do + case fetch_session(session_id) do + %{user_id: user_id} = session when user_id == u.id -> invalidate_session(session) + _ -> {:error, :unauthorized} + end + end + + defp fetch_user(_email), do: nil + defp fetch_user_by_id(_id), do: nil + defp fetch_active_session(_user_id), do: nil + defp fetch_session(_id), do: nil + defp fetch_sessions_for_user(_user_id), do: [] + defp verify_password(_user, _pw), do: true + defp generate_token(_user_id), do: "tok_abc" + defp generate_reset_token(_user_id), do: "rst_abc" + defp invalidate_session(_session), do: {:ok, :logged_out} + defp create_user(attrs), do: {:ok, attrs} + defp send_verification_email(_user), do: {:ok, :sent} + defp send_reset_email(_email, _token), do: {:ok, :sent} + defp decode_token(_token), do: {:ok, %{"sub" => "1"}} + defp validate_reset_token(_token), do: {:ok, "1"} + defp validate_registration(_params), do: :ok + defp hash_password(pw), do: pw + defp update_user_password(_id, _hash), do: {:ok, %{}} +end diff --git a/priv/combined_metrics/samples/consistency/same_concept_same_name/config.yml 
b/priv/combined_metrics/samples/consistency/same_concept_same_name/config.yml new file mode 100644 index 0000000..3543d7a --- /dev/null +++ b/priv/combined_metrics/samples/consistency/same_concept_same_name/config.yml @@ -0,0 +1 @@ +doc: "The same domain concept should use the same name throughout a file — mixing `user`, `usr`, and `u` for the same thing harms readability." diff --git a/priv/combined_metrics/samples/consistency/same_concept_same_name/good/auth.ex b/priv/combined_metrics/samples/consistency/same_concept_same_name/good/auth.ex new file mode 100644 index 0000000..ea578d8 --- /dev/null +++ b/priv/combined_metrics/samples/consistency/same_concept_same_name/good/auth.ex @@ -0,0 +1,102 @@ +defmodule Auth do + @moduledoc "Handles authentication and session management" + + def login(user, password) do + case fetch_user(user.email) do + nil -> + {:error, :not_found} + + user -> + if verify_password(user, password) do + token = generate_token(user.id) + {:ok, token} + else + {:error, :invalid_credentials} + end + end + end + + def logout(user) do + case fetch_active_session(user.id) do + nil -> {:error, :no_session} + session -> invalidate_session(session) + end + end + + def register(params) do + with :ok <- validate_registration(params), + {:ok, user} <- create_user(params), + {:ok, _} <- send_verification_email(user) do + {:ok, user} + end + end + + def verify_token(token_string) do + case decode_token(token_string) do + {:ok, claims} -> + user = fetch_user_by_id(claims["sub"]) + {:ok, user} + + {:error, reason} -> + {:error, reason} + end + end + + def change_password(user, old_password, new_password) do + if verify_password(user, old_password) do + hashed = hash_password(new_password) + update_user_password(user.id, hashed) + else + {:error, :wrong_password} + end + end + + def request_password_reset(email) do + case fetch_user(email) do + nil -> + {:error, :not_found} + + user -> + reset_token = generate_reset_token(user.id) + 
send_reset_email(user.email, reset_token) + end + end + + def reset_password(reset_token, new_password) do + with {:ok, user_id} <- validate_reset_token(reset_token), + user <- fetch_user_by_id(user_id), + hashed = hash_password(new_password), + {:ok, updated_user} <- update_user_password(user.id, hashed) do + {:ok, updated_user} + end + end + + def list_sessions(user) do + fetch_sessions_for_user(user.id) + end + + def revoke_session(user, session_id) do + case fetch_session(session_id) do + %{user_id: user_id} = session when user_id == user.id -> invalidate_session(session) + _ -> {:error, :unauthorized} + end + end + + defp fetch_user(_email), do: nil + defp fetch_user_by_id(_id), do: nil + defp fetch_active_session(_user_id), do: nil + defp fetch_session(_id), do: nil + defp fetch_sessions_for_user(_user_id), do: [] + defp verify_password(_user, _password), do: true + defp generate_token(_user_id), do: "tok_abc" + defp generate_reset_token(_user_id), do: "rst_abc" + defp invalidate_session(_session), do: {:ok, :logged_out} + defp create_user(attrs), do: {:ok, attrs} + defp send_verification_email(_user), do: {:ok, :sent} + defp send_reset_email(_email, _token), do: {:ok, :sent} + defp decode_token(_token), do: {:ok, %{"sub" => "1"}} + defp validate_reset_token(_token), do: {:ok, "1"} + defp validate_registration(_params), do: :ok + defp hash_password(password), do: password + defp update_user_password(_id, _hash), do: {:ok, %{}} +end diff --git a/priv/combined_metrics/samples/consistency/static_member_via_class_name/bad/ProductCatalog.java b/priv/combined_metrics/samples/consistency/static_member_via_class_name/bad/ProductCatalog.java new file mode 100644 index 0000000..3e63a8b --- /dev/null +++ b/priv/combined_metrics/samples/consistency/static_member_via_class_name/bad/ProductCatalog.java @@ -0,0 +1,63 @@ +package com.example.catalog; + +import java.math.BigDecimal; +import java.util.List; + +public class ProductCatalog { + + public static final int MAX_SEARCH_RESULTS = 100; + 
public static final BigDecimal DEFAULT_TAX_RATE = new BigDecimal("0.20"); + + private static int instanceCount = 0; + + private final String region; + private final ProductRepository repository; + + public ProductCatalog(String region, ProductRepository repository) { + this.region = region; + this.repository = repository; + // Accessing static field via `this` — misleading, looks like instance state + this.instanceCount++; + } + + public List search(String query) { + // Accessing static constant via instance reference — hides the static nature + return repository.search(query, this.MAX_SEARCH_RESULTS); + } + + public BigDecimal priceWithTax(Product product) { + // Accessing static constant via instance reference + return product.getBasePrice().multiply( + BigDecimal.ONE.add(this.DEFAULT_TAX_RATE) + ); + } + + public static ProductCatalog forRegion(String region, ProductRepository repo) { + return new ProductCatalog(region, repo); + } + + public static int getInstanceCount() { + // Fine in static context, but inconsistent with rest of file + return instanceCount; + } + + public void resetInstanceTracking() { + // Accessing static field via `this` in instance method + this.instanceCount = 0; + } + + public List findOnSale() { + ProductCatalog catalog = this; + // Accessing static member via local instance variable — very confusing + return repository.findAll().stream() + .filter(Product::isOnSale) + .limit(catalog.MAX_SEARCH_RESULTS) + .toList(); + } + + public void logStats() { + ProductCatalog temp = new ProductCatalog(region, repository); + // Accessing static field through a different instance + System.out.println("Count: " + temp.instanceCount); + } +} diff --git a/priv/combined_metrics/samples/consistency/static_member_via_class_name/good/ProductCatalog.java b/priv/combined_metrics/samples/consistency/static_member_via_class_name/good/ProductCatalog.java new file mode 100644 index 0000000..363e97a --- /dev/null +++ 
b/priv/combined_metrics/samples/consistency/static_member_via_class_name/good/ProductCatalog.java @@ -0,0 +1,62 @@ +package com.example.catalog; + +import java.math.BigDecimal; +import java.util.List; + +public class ProductCatalog { + + public static final int MAX_SEARCH_RESULTS = 100; + public static final BigDecimal DEFAULT_TAX_RATE = new BigDecimal("0.20"); + + private static int instanceCount = 0; + + private final String region; + private final ProductRepository repository; + + public ProductCatalog(String region, ProductRepository repository) { + this.region = region; + this.repository = repository; + ProductCatalog.instanceCount++; + } + + public List search(String query) { + // Accessing static constant via class name + return repository.search(query, ProductCatalog.MAX_SEARCH_RESULTS); + } + + public BigDecimal priceWithTax(Product product) { + // Accessing static constant via class name + return product.getBasePrice().multiply( + BigDecimal.ONE.add(ProductCatalog.DEFAULT_TAX_RATE) + ); + } + + public static ProductCatalog forRegion(String region, ProductRepository repo) { + return new ProductCatalog(region, repo); + } + + public static int getInstanceCount() { + // Accessing static field via class name inside static method + return ProductCatalog.instanceCount; + } + + public void resetInstanceTracking() { + // Static field accessed via class name even from instance method + ProductCatalog.instanceCount = 0; + } + + public Product findBySkuOrThrow(String sku) { + Product product = repository.findBySku(sku); + if (product == null) { + throw new ProductNotFoundException(sku); + } + return product; + } + + public List findOnSale() { + return repository.findAll().stream() + .filter(Product::isOnSale) + .limit(ProductCatalog.MAX_SEARCH_RESULTS) + .toList(); + } +} diff --git a/priv/combined_metrics/samples/consistency/switch_fallthrough_has_comment/bad/OrderService.php 
b/priv/combined_metrics/samples/consistency/switch_fallthrough_has_comment/bad/OrderService.php new file mode 100644 index 0000000..90f311c --- /dev/null +++ b/priv/combined_metrics/samples/consistency/switch_fallthrough_has_comment/bad/OrderService.php @@ -0,0 +1,71 @@ += ^from and e.date <= ^to + ) + + %Report{user_id: user_id, events: events} + end +end diff --git a/priv/combined_metrics/samples/dependencies/import_count_under_10/config.yml b/priv/combined_metrics/samples/dependencies/import_count_under_10/config.yml new file mode 100644 index 0000000..3fad5bf --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/import_count_under_10/config.yml @@ -0,0 +1 @@ +doc: "Files should import fewer than 10 modules; high import counts signal excessive coupling." diff --git a/priv/combined_metrics/samples/dependencies/import_count_under_10/good/dashboard.ex b/priv/combined_metrics/samples/dependencies/import_count_under_10/good/dashboard.ex new file mode 100644 index 0000000..8313437 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/import_count_under_10/good/dashboard.ex @@ -0,0 +1,80 @@ +defmodule MyApp.Dashboard do + @moduledoc """ + Dashboard aggregation module that compiles user-facing metrics + and summaries from all subdomains. 
+ """ + + alias MyApp.Accounts + alias MyApp.Orders + alias MyApp.Billing + alias MyApp.Shipping + alias MyApp.Notifications + + @spec summary(Accounts.User.t()) :: map() + def summary(user) do + %{ + orders: Orders.recent_for_user(user, limit: 5), + invoices: Billing.open_invoices_for_user(user), + shipments: Shipping.active_shipments_for_user(user), + notifications: Notifications.unread_for_user(user) + } + end + + @spec order_count(Accounts.User.t()) :: non_neg_integer() + def order_count(user) do + user + |> Orders.for_user() + |> length() + end + + @spec billing_status(Accounts.User.t()) :: :current | :overdue | :no_invoices + def billing_status(user) do + user + |> Billing.open_invoices_for_user() + |> determine_billing_status() + end + + @spec shipment_summary(Accounts.User.t()) :: map() + def shipment_summary(user) do + shipments = Shipping.active_shipments_for_user(user) + + %{ + in_transit: Enum.count(shipments, &(&1.status == :in_transit)), + delivered: Enum.count(shipments, &(&1.status == :delivered)), + total: length(shipments) + } + end + + @spec notification_badge(Accounts.User.t()) :: non_neg_integer() + def notification_badge(user) do + user + |> Notifications.unread_for_user() + |> length() + end + + @spec activity_feed(Accounts.User.t(), keyword()) :: [map()] + def activity_feed(user, opts \\ []) do + limit = Keyword.get(opts, :limit, 20) + + [ + Orders.recent_for_user(user, limit: limit), + Shipping.recent_events_for_user(user, limit: limit), + Notifications.unread_for_user(user) + ] + |> List.flatten() + |> Enum.sort_by(& &1.inserted_at, {:desc, DateTime}) + |> Enum.take(limit) + end + + # Private + + defp determine_billing_status([]), do: :no_invoices + + defp determine_billing_status(invoices) do + if Enum.any?(invoices, &past_due?/1), do: :overdue, else: :current + end + + defp past_due?(%{due_date: due_date}) do + Date.compare(due_date, Date.utc_today()) == :lt + end +end diff --git 
a/priv/combined_metrics/samples/dependencies/low_coupling/bad/order_controller.ex b/priv/combined_metrics/samples/dependencies/low_coupling/bad/order_controller.ex new file mode 100644 index 0000000..e52e77e --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/low_coupling/bad/order_controller.ex @@ -0,0 +1,91 @@ +defmodule MyAppWeb.OrderController do + use MyAppWeb, :controller + + alias MyApp.Repo + alias MyApp.Orders.Order + alias MyApp.Orders.OrderItem + alias MyApp.Accounts.User + alias MyApp.Billing.Invoice + alias MyApp.Notifications.Mailer + alias MyApp.Shipping.ShipmentService + + @moduledoc """ + Controller for order lifecycle management. + """ + + def index(conn, _params) do + user_id = conn.assigns.current_user.id + + orders = + Repo.all( + from o in Order, + where: o.user_id == ^user_id, + preload: [:items, :invoice] + ) + + render(conn, :index, orders: orders) + end + + def show(conn, %{"id" => id}) do + order = Repo.get!(Order, id) |> Repo.preload([:items, :invoice, :user]) + render(conn, :show, order: order) + end + + def create(conn, %{"order" => params}) do + user = Repo.get!(User, conn.assigns.current_user.id) + + changeset = Order.changeset(%Order{}, Map.put(params, "user_id", user.id)) + + case Repo.insert(changeset) do + {:ok, order} -> + items = Map.get(params, "items", []) + + Enum.each(items, fn item_params -> + %OrderItem{} + |> OrderItem.changeset(Map.put(item_params, "order_id", order.id)) + |> Repo.insert!() + end) + + invoice = Repo.insert!(%Invoice{order_id: order.id, user_id: user.id, status: :open}) + + total = Enum.reduce(items, 0, fn i, acc -> acc + i["price"] * i["quantity"] end) + Repo.update!(Invoice.changeset(invoice, %{total: total})) + + ShipmentService.create_shipment_for_order(order) + + Mailer.send_order_confirmation(user.email, order) + + conn + |> put_status(:created) + |> render(:show, order: order) + + {:error, changeset} -> + conn + |> put_status(:unprocessable_entity) + |> render(:error, changeset: 
changeset) + end + end + + def cancel(conn, %{"id" => id}) do + order = Repo.get!(Order, id) + + case Repo.update(Order.changeset(order, %{status: :cancelled})) do + {:ok, order} -> + invoice = Repo.get_by!(Invoice, order_id: order.id) + Repo.update!(Invoice.changeset(invoice, %{status: :voided})) + + shipment = ShipmentService.find_shipment(order.id) + if shipment, do: ShipmentService.cancel_shipment(shipment) + + user = Repo.get!(User, order.user_id) + Mailer.send_cancellation_notice(user.email, order) + + render(conn, :show, order: order) + + {:error, changeset} -> + conn + |> put_status(:unprocessable_entity) + |> render(:error, changeset: changeset) + end + end +end diff --git a/priv/combined_metrics/samples/dependencies/low_coupling/config.yml b/priv/combined_metrics/samples/dependencies/low_coupling/config.yml new file mode 100644 index 0000000..a84bf39 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/low_coupling/config.yml @@ -0,0 +1 @@ +doc: "Modules should depend on few external symbols — a low unique-operand count relative to total is a proxy for tight coupling." diff --git a/priv/combined_metrics/samples/dependencies/low_coupling/good/order_controller.ex b/priv/combined_metrics/samples/dependencies/low_coupling/good/order_controller.ex new file mode 100644 index 0000000..0bbb5c1 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/low_coupling/good/order_controller.ex @@ -0,0 +1,57 @@ +defmodule MyAppWeb.OrderController do + use MyAppWeb, :controller + + alias MyApp.Orders + + @moduledoc """ + Controller for order lifecycle management. + Delegates all business logic to the Orders context. 
+ """ + + action_fallback MyAppWeb.FallbackController + + def index(conn, _params) do + orders = Orders.list_orders_for_user(conn.assigns.current_user) + render(conn, :index, orders: orders) + end + + def show(conn, %{"id" => id}) do + with {:ok, order} <- Orders.get_order(id, conn.assigns.current_user) do + render(conn, :show, order: order) + end + end + + def create(conn, %{"order" => params}) do + with {:ok, order} <- Orders.place_order(conn.assigns.current_user, params) do + conn + |> put_status(:created) + |> render(:show, order: order) + end + end + + def cancel(conn, %{"id" => id}) do + with {:ok, order} <- Orders.get_order(id, conn.assigns.current_user), + {:ok, cancelled} <- Orders.cancel_order(order) do + render(conn, :show, order: cancelled) + end + end + + def update(conn, %{"id" => id, "order" => params}) do + with {:ok, order} <- Orders.get_order(id, conn.assigns.current_user), + {:ok, updated} <- Orders.update_order(order, params) do + render(conn, :show, order: updated) + end + end + + def history(conn, params) do + page = Map.get(params, "page", 1) + per_page = Map.get(params, "per_page", 20) + + orders = Orders.order_history_for_user(conn.assigns.current_user, + page: page, + per_page: per_page + ) + + render(conn, :index, orders: orders) + end +end diff --git a/priv/combined_metrics/samples/dependencies/no_default_exports/bad/payment_gateway.ts b/priv/combined_metrics/samples/dependencies/no_default_exports/bad/payment_gateway.ts new file mode 100644 index 0000000..d50dc6c --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_default_exports/bad/payment_gateway.ts @@ -0,0 +1,68 @@ +interface ChargeRequest { + amount: number; + currency: string; + paymentMethodId: string; + customerId: string; +} + +interface ChargeResult { + id: string; + status: "succeeded" | "pending" | "failed"; + amount: number; + currency: string; + createdAt: string; +} + +interface RefundRequest { + chargeId: string; + amount?: number; + reason?: string; +} + 
+interface RefundResult { + id: string; + chargeId: string; + amount: number; + status: "pending" | "succeeded"; +} + +async function createCharge(request: ChargeRequest): Promise { + const response = await fetch("/api/charges", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + if (!response.ok) throw new Error(`Charge failed: ${response.status}`); + return response.json() as Promise; +} + +async function fetchCharge(chargeId: string): Promise { + const response = await fetch(`/api/charges/${chargeId}`); + if (!response.ok) throw new Error(`Charge not found: ${chargeId}`); + return response.json() as Promise; +} + +async function refundCharge(request: RefundRequest): Promise { + const response = await fetch("/api/refunds", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + if (!response.ok) throw new Error(`Refund failed: ${response.status}`); + return response.json() as Promise; +} + +function formatChargeAmount(charge: ChargeResult): string { + return new Intl.NumberFormat("en-US", { + style: "currency", + currency: charge.currency, + }).format(charge.amount / 100); +} + +// Default export makes it hard to rename consistently across the codebase +export default { + createCharge, + fetchCharge, + refundCharge, + formatChargeAmount, +}; diff --git a/priv/combined_metrics/samples/dependencies/no_default_exports/good/payment_gateway.ts b/priv/combined_metrics/samples/dependencies/no_default_exports/good/payment_gateway.ts new file mode 100644 index 0000000..5cc1ba3 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_default_exports/good/payment_gateway.ts @@ -0,0 +1,63 @@ +interface ChargeRequest { + amount: number; + currency: string; + paymentMethodId: string; + customerId: string; +} + +interface ChargeResult { + id: string; + status: "succeeded" | "pending" | "failed"; + amount: number; + currency: string; + createdAt: string; 
+} + +interface RefundRequest { + chargeId: string; + amount?: number; + reason?: string; +} + +interface RefundResult { + id: string; + chargeId: string; + amount: number; + status: "pending" | "succeeded"; +} + +async function createCharge(request: ChargeRequest): Promise { + const response = await fetch("/api/charges", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + if (!response.ok) throw new Error(`Charge failed: ${response.status}`); + return response.json() as Promise; +} + +async function fetchCharge(chargeId: string): Promise { + const response = await fetch(`/api/charges/${chargeId}`); + if (!response.ok) throw new Error(`Charge not found: ${chargeId}`); + return response.json() as Promise; +} + +async function refundCharge(request: RefundRequest): Promise { + const response = await fetch("/api/refunds", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + if (!response.ok) throw new Error(`Refund failed: ${response.status}`); + return response.json() as Promise; +} + +function formatChargeAmount(charge: ChargeResult): string { + return new Intl.NumberFormat("en-US", { + style: "currency", + currency: charge.currency, + }).format(charge.amount / 100); +} + +export { createCharge, fetchCharge, refundCharge, formatChargeAmount }; +export type { ChargeRequest, ChargeResult, RefundRequest, RefundResult }; diff --git a/priv/combined_metrics/samples/dependencies/no_mutable_exports/bad/product_repository.ts b/priv/combined_metrics/samples/dependencies/no_mutable_exports/bad/product_repository.ts new file mode 100644 index 0000000..d1daac2 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_mutable_exports/bad/product_repository.ts @@ -0,0 +1,46 @@ +interface Product { + id: string; + name: string; + price: number; + category: string; +} + +// Exported mutable state — consumers can accidentally mutate these +export let 
productCache: Map = new Map(); +export let cacheLastFetchedAt: number | null = null; +export let isLoading = false; + +const CACHE_TTL_MS = 5 * 60 * 1000; + +function isCacheStale(): boolean { + return cacheLastFetchedAt === null || Date.now() - cacheLastFetchedAt > CACHE_TTL_MS; +} + +export async function loadProducts(): Promise { + isLoading = true; + try { + const response = await fetch("/api/products"); + if (!response.ok) throw new Error(`Failed to load products: ${response.status}`); + const products: Product[] = await response.json(); + + productCache = new Map(products.map((p) => [p.id, p])); + cacheLastFetchedAt = Date.now(); + } finally { + isLoading = false; + } +} + +export async function getProducts(): Promise { + if (isCacheStale()) await loadProducts(); + return Array.from(productCache.values()); +} + +export async function getProduct(id: string): Promise { + if (isCacheStale()) await loadProducts(); + return productCache.get(id) ?? null; +} + +export function invalidateCache(): void { + productCache = new Map(); + cacheLastFetchedAt = null; +} diff --git a/priv/combined_metrics/samples/dependencies/no_mutable_exports/good/product_repository.ts b/priv/combined_metrics/samples/dependencies/no_mutable_exports/good/product_repository.ts new file mode 100644 index 0000000..66a99a3 --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_mutable_exports/good/product_repository.ts @@ -0,0 +1,57 @@ +interface Product { + id: string; + name: string; + price: number; + category: string; +} + +interface CacheState { + products: Map; + lastFetchedAt: number | null; +} + +const state: CacheState = { + products: new Map(), + lastFetchedAt: null, +}; + +const CACHE_TTL_MS = 5 * 60 * 1000; + +function isCacheStale(): boolean { + return state.lastFetchedAt === null || Date.now() - state.lastFetchedAt > CACHE_TTL_MS; +} + +async function loadProducts(): Promise { + const response = await fetch("/api/products"); + if (!response.ok) throw new Error(`Failed to 
load products: ${response.status}`); + const products: Product[] = await response.json(); + + state.products.clear(); + for (const product of products) { + state.products.set(product.id, product); + } + state.lastFetchedAt = Date.now(); +} + +async function getProducts(): Promise { + if (isCacheStale()) await loadProducts(); + return Array.from(state.products.values()); +} + +async function getProduct(id: string): Promise { + if (isCacheStale()) await loadProducts(); + return state.products.get(id) ?? null; +} + +function invalidateCache(): void { + state.products.clear(); + state.lastFetchedAt = null; +} + +function getCacheAge(): number | null { + if (state.lastFetchedAt === null) return null; + return Date.now() - state.lastFetchedAt; +} + +export { getProducts, getProduct, invalidateCache, getCacheAge, loadProducts }; +export type { Product }; diff --git a/priv/combined_metrics/samples/dependencies/no_wildcard_imports/bad/query_helpers.ex b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/bad/query_helpers.ex new file mode 100644 index 0000000..32f1c3d --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/bad/query_helpers.ex @@ -0,0 +1,74 @@ +defmodule MyApp.QueryHelpers do + @moduledoc """ + Helpers for building common Ecto query patterns. 
+ """ + + import Ecto.Query + import Ecto.Changeset + import MyApp.QueryFilters + import MyApp.PaginationHelpers + + @spec paginate(Ecto.Queryable.t(), map()) :: Ecto.Query.t() + def paginate(query, params) do + page = Map.get(params, "page", 1) + per_page = Map.get(params, "per_page", 20) + + query + |> offset(^((page - 1) * per_page)) + |> limit(^per_page) + end + + @spec filter_by_status(Ecto.Queryable.t(), atom()) :: Ecto.Query.t() + def filter_by_status(query, status) do + where(query, [q], q.status == ^status) + end + + @spec filter_by_user(Ecto.Queryable.t(), integer()) :: Ecto.Query.t() + def filter_by_user(query, user_id) do + where(query, [q], q.user_id == ^user_id) + end + + @spec order_by_inserted(Ecto.Queryable.t(), :asc | :desc) :: Ecto.Query.t() + def order_by_inserted(query, direction \\ :desc) do + order_by(query, [q], [{^direction, q.inserted_at}]) + end + + @spec search_name(Ecto.Queryable.t(), String.t()) :: Ecto.Query.t() + def search_name(query, term) do + like_term = "%#{term}%" + where(query, [q], ilike(q.name, ^like_term)) + end + + @spec with_preloads(Ecto.Queryable.t(), list()) :: Ecto.Query.t() + def with_preloads(query, associations) do + preload(query, ^associations) + end + + @spec apply_filters(Ecto.Queryable.t(), map()) :: Ecto.Query.t() + def apply_filters(query, filters) do + Enum.reduce(filters, query, fn + {"status", status}, q -> filter_by_status(q, String.to_atom(status)) + {"user_id", id}, q -> filter_by_user(q, id) + {"search", term}, q -> search_name(q, term) + _unknown, q -> q + end) + end + + @spec validate_and_apply(Ecto.Changeset.t(), map()) :: Ecto.Changeset.t() + def validate_and_apply(changeset, attrs) do + changeset + |> cast(attrs, [:status, :name]) + |> validate_required([:status]) + |> validate_inclusion(:status, [:active, :inactive, :pending]) + end + + @spec count_query(Ecto.Queryable.t()) :: Ecto.Query.t() + def count_query(query) do + from q in query, select: count(q.id) + end + + @spec 
date_range(Ecto.Queryable.t(), Date.t(), Date.t()) :: Ecto.Query.t() + def date_range(query, from_date, to_date) do + where(query, [q], q.inserted_at >= ^from_date and q.inserted_at <= ^to_date) + end +end diff --git a/priv/combined_metrics/samples/dependencies/no_wildcard_imports/config.yml b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/config.yml new file mode 100644 index 0000000..4b85bcb --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/config.yml @@ -0,0 +1 @@ +doc: "Wildcard imports (`import *`, `using Module`) pollute the local namespace and hide dependencies." diff --git a/priv/combined_metrics/samples/dependencies/no_wildcard_imports/good/query_helpers.ex b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/good/query_helpers.ex new file mode 100644 index 0000000..ccde3df --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/no_wildcard_imports/good/query_helpers.ex @@ -0,0 +1,73 @@ +defmodule MyApp.QueryHelpers do + @moduledoc """ + Helpers for building common Ecto query patterns. 
+ """ + + import Ecto.Query, only: [from: 2, where: 3, order_by: 3, limit: 2, offset: 2, preload: 2] + import Ecto.Changeset, only: [cast: 3, validate_required: 2, validate_inclusion: 3] + + alias MyApp.QueryFilters + alias MyApp.PaginationHelpers + + @spec paginate(Ecto.Queryable.t(), map()) :: Ecto.Query.t() + def paginate(query, params) do + {page, per_page} = PaginationHelpers.extract_pagination(params) + + query + |> offset(^((page - 1) * per_page)) + |> limit(^per_page) + end + + @spec filter_by_status(Ecto.Queryable.t(), atom()) :: Ecto.Query.t() + def filter_by_status(query, status) do + where(query, [q], q.status == ^status) + end + + @spec filter_by_user(Ecto.Queryable.t(), integer()) :: Ecto.Query.t() + def filter_by_user(query, user_id) do + where(query, [q], q.user_id == ^user_id) + end + + @spec order_by_inserted(Ecto.Queryable.t(), :asc | :desc) :: Ecto.Query.t() + def order_by_inserted(query, direction \\ :desc) do + order_by(query, [q], [{^direction, q.inserted_at}]) + end + + @spec search_name(Ecto.Queryable.t(), String.t()) :: Ecto.Query.t() + def search_name(query, term) do + QueryFilters.ilike_search(query, :name, term) + end + + @spec with_preloads(Ecto.Queryable.t(), list()) :: Ecto.Query.t() + def with_preloads(query, associations) do + preload(query, ^associations) + end + + @spec apply_filters(Ecto.Queryable.t(), map()) :: Ecto.Query.t() + def apply_filters(query, filters) do + Enum.reduce(filters, query, fn + {"status", status}, q -> filter_by_status(q, String.to_atom(status)) + {"user_id", id}, q -> filter_by_user(q, id) + {"search", term}, q -> search_name(q, term) + _unknown, q -> q + end) + end + + @spec validate_and_apply(Ecto.Changeset.t(), map()) :: Ecto.Changeset.t() + def validate_and_apply(changeset, attrs) do + changeset + |> cast(attrs, [:status, :name]) + |> validate_required([:status]) + |> validate_inclusion(:status, [:active, :inactive, :pending]) + end + + @spec count_query(Ecto.Queryable.t()) :: Ecto.Query.t() + def 
count_query(query) do + from q in query, select: count(q.id) + end + + @spec date_range(Ecto.Queryable.t(), Date.t(), Date.t()) :: Ecto.Query.t() + def date_range(query, from_date, to_date) do + where(query, [q], q.inserted_at >= ^from_date and q.inserted_at <= ^to_date) + end +end diff --git a/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/bad/user_service.ts b/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/bad/user_service.ts new file mode 100644 index 0000000..a1fbecd --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/bad/user_service.ts @@ -0,0 +1,47 @@ +// Regular imports used for type-only symbols — should use `import type` +import { User, CreateUserPayload, UpdateUserPayload } from "./user_types.js"; +import { PaginatedResponse, ApiError } from "./api_types.js"; +import { buildApiUrl, handleResponse } from "./api_client.js"; +import { formatDate } from "./date_utils.js"; + +async function fetchUser(userId: string): Promise { + const url = buildApiUrl(`/users/${userId}`); + const response = await fetch(url); + return handleResponse(response); +} + +async function createUser(payload: CreateUserPayload): Promise { + const url = buildApiUrl("/users"); + const response = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + return handleResponse(response); +} + +async function updateUser(userId: string, changes: UpdateUserPayload): Promise { + const url = buildApiUrl(`/users/${userId}`); + const response = await fetch(url, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(changes), + }); + return handleResponse(response); +} + +async function listUsers(page = 1, pageSize = 20): Promise> { + const url = buildApiUrl(`/users?page=${page}&pageSize=${pageSize}`); + const response = await fetch(url); + return 
handleResponse>(response); +} + +function handleApiError(error: ApiError): void { + console.error(`API error [${error.code}]: ${error.message}`); +} + +function formatUserCreatedDate(user: User): string { + return formatDate(user.createdAt); +} + +export { fetchUser, createUser, updateUser, listUsers, handleApiError, formatUserCreatedDate }; diff --git a/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/good/user_service.ts b/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/good/user_service.ts new file mode 100644 index 0000000..78fef3b --- /dev/null +++ b/priv/combined_metrics/samples/dependencies/uses_import_type_for_type_only_imports/good/user_service.ts @@ -0,0 +1,47 @@ +import type { User, CreateUserPayload, UpdateUserPayload } from "./user_types.js"; +import type { PaginatedResponse, ApiError } from "./api_types.js"; +import { buildApiUrl, handleResponse } from "./api_client.js"; +import { formatDate } from "./date_utils.js"; + +async function fetchUser(userId: string): Promise { + const url = buildApiUrl(`/users/${userId}`); + const response = await fetch(url); + return handleResponse(response); +} + +async function createUser(payload: CreateUserPayload): Promise { + const url = buildApiUrl("/users"); + const response = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + return handleResponse(response); +} + +async function updateUser(userId: string, changes: UpdateUserPayload): Promise { + const url = buildApiUrl(`/users/${userId}`); + const response = await fetch(url, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(changes), + }); + return handleResponse(response); +} + +async function listUsers(page = 1, pageSize = 20): Promise> { + const url = buildApiUrl(`/users?page=${page}&pageSize=${pageSize}`); + const response = await fetch(url); + return 
handleResponse>(response); +} + +function formatUserCreatedDate(user: User): string { + return formatDate(user.createdAt); +} + +function getUserDisplayLabel(user: User): string { + return `${user.displayName} <${user.email}>`; +} + +export { fetchUser, createUser, updateUser, listUsers, formatUserCreatedDate, getUserDisplayLabel }; +export type { User, CreateUserPayload, UpdateUserPayload }; diff --git a/priv/combined_metrics/samples/documentation/docstring_is_nonempty/bad/cache.ex b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/bad/cache.ex new file mode 100644 index 0000000..1127e5f --- /dev/null +++ b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/bad/cache.ex @@ -0,0 +1,78 @@ +defmodule MyApp.Cache do + @moduledoc "" + + @doc "" + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + name = Keyword.get(opts, :name, __MODULE__) + ttl = Keyword.get(opts, :ttl, :timer.minutes(5)) + :ets.new(name, [:set, :public, :named_table, read_concurrency: true]) + Agent.start_link(fn -> %{name: name, ttl: ttl} end, name: :"#{name}_agent") + end + + @doc "" + @spec get(atom(), term()) :: term() | nil + def get(cache, key) do + case :ets.lookup(cache, key) do + [{^key, value, expires_at}] -> + if System.monotonic_time(:millisecond) < expires_at, do: value, else: nil + + [] -> + nil + end + end + + @doc "" + @spec put(atom(), term(), term(), keyword()) :: true + def put(cache, key, value, opts \\ []) do + ttl = Keyword.get(opts, :ttl, default_ttl(cache)) + expires_at = System.monotonic_time(:millisecond) + ttl + :ets.insert(cache, {key, value, expires_at}) + end + + @doc "" + @spec delete(atom(), term()) :: true + def delete(cache, key) do + :ets.delete(cache, key) + end + + @doc "" + @spec flush(atom()) :: true + def flush(cache) do + :ets.delete_all_objects(cache) + end + + @doc "" + @spec fetch(atom(), term(), (-> term())) :: term() + def fetch(cache, key, fun) do + case get(cache, key) do + nil -> + 
value = fun.() + put(cache, key, value) + value + + value -> + value + end + end + + @doc "" + @spec size(atom()) :: non_neg_integer() + def size(cache) do + :ets.info(cache, :size) + end + + @doc "" + @spec stats(atom()) :: map() + def stats(cache) do + %{ + size: size(cache), + memory: :ets.info(cache, :memory) + } + end + + defp default_ttl(cache) do + agent = :"#{cache}_agent" + Agent.get(agent, & &1.ttl) + end +end diff --git a/priv/combined_metrics/samples/documentation/docstring_is_nonempty/config.yml b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/config.yml new file mode 100644 index 0000000..02f3c8b --- /dev/null +++ b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/config.yml @@ -0,0 +1 @@ +doc: "Docstrings must contain meaningful content, not just a placeholder or empty string." diff --git a/priv/combined_metrics/samples/documentation/docstring_is_nonempty/good/cache.ex b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/good/cache.ex new file mode 100644 index 0000000..89bb205 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/docstring_is_nonempty/good/cache.ex @@ -0,0 +1,96 @@ +defmodule MyApp.Cache do + @moduledoc """ + ETS-backed in-memory cache with per-entry TTL support. + + Each cache instance is an ETS table created at startup via `start_link/1`. + Values are stored with an expiry timestamp and are considered stale after + their TTL has elapsed. Stale entries are not automatically evicted but are + ignored on read. + + Use `fetch/3` for the common read-through pattern to avoid redundant + computations or database calls. + """ + + @doc """ + Creates a new cache ETS table and starts its companion Agent. 
+ + Accepts the following options: + - `:name` — the atom name for the ETS table (defaults to `#{__MODULE__}`) + - `:ttl` — default time-to-live in milliseconds (defaults to 5 minutes) + """ + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + name = Keyword.get(opts, :name, __MODULE__) + ttl = Keyword.get(opts, :ttl, :timer.minutes(5)) + :ets.new(name, [:set, :public, :named_table, read_concurrency: true]) + Agent.start_link(fn -> %{name: name, ttl: ttl} end, name: :"#{name}_agent") + end + + @doc """ + Returns the cached value for `key`, or `nil` if missing or expired. + """ + @spec get(atom(), term()) :: term() | nil + def get(cache, key) do + case :ets.lookup(cache, key) do + [{^key, value, expires_at}] -> + if System.monotonic_time(:millisecond) < expires_at, do: value, else: nil + + [] -> + nil + end + end + + @doc """ + Stores `value` under `key` in the cache with an optional `:ttl` override. + + If `:ttl` is not provided, the cache's default TTL is used. + """ + @spec put(atom(), term(), term(), keyword()) :: true + def put(cache, key, value, opts \\ []) do + ttl = Keyword.get(opts, :ttl, default_ttl(cache)) + expires_at = System.monotonic_time(:millisecond) + ttl + :ets.insert(cache, {key, value, expires_at}) + end + + @doc """ + Removes the entry for `key` from the cache. + """ + @spec delete(atom(), term()) :: true + def delete(cache, key), do: :ets.delete(cache, key) + + @doc """ + Removes all entries from the cache without deleting the table itself. + """ + @spec flush(atom()) :: true + def flush(cache), do: :ets.delete_all_objects(cache) + + @doc """ + Returns the cached value for `key`, computing and storing it via `fun` on a miss. 
+ + This is the preferred read-through pattern to avoid duplicate work: + + MyApp.Cache.fetch(:my_cache, {:user, id}, fn -> Accounts.get_user!(id) end) + """ + @spec fetch(atom(), term(), (-> term())) :: term() + def fetch(cache, key, fun) do + case get(cache, key) do + nil -> + value = fun.() + put(cache, key, value) + value + + value -> + value + end + end + + @doc """ + Returns the number of entries currently stored in the cache table. + """ + @spec size(atom()) :: non_neg_integer() + def size(cache), do: :ets.info(cache, :size) + + defp default_ttl(cache) do + Agent.get(:"#{cache}_agent", & &1.ttl) + end +end diff --git a/priv/combined_metrics/samples/documentation/doctests_validate_examples/bad/billing.ex b/priv/combined_metrics/samples/documentation/doctests_validate_examples/bad/billing.ex new file mode 100644 index 0000000..a90ebe2 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/doctests_validate_examples/bad/billing.ex @@ -0,0 +1,59 @@ +defmodule MyApp.Billing.Formatter do + @moduledoc """ + Formatting utilities for billing amounts and invoice references. + """ + + @doc """ + Formats an integer amount in cents to a display string for the given currency. + + For example, passing 2999 and :usd would return a string like "$29.99". + Passing 0 for any currency returns "0.00" with the appropriate symbol. + For unknown currencies the amount is formatted without a symbol prefix. + + Note that the function expects a non-negative integer for cents. + Negative values are not supported and may produce unexpected output. + The currency atom should be one of :usd, :eur, or :gbp for proper + symbol formatting. Other atoms fall back to an uppercase string suffix. + """ + # Bad: prose-only documentation with no `iex>` examples. + # The description is vague ("a string like...") and untestable. + # A doctest would pin the exact return values and catch regressions. 
+ @spec format_amount(integer(), atom()) :: String.t() + def format_amount(cents, currency) when is_integer(cents) and cents >= 0 do + value = :erlang.float_to_binary(cents / 100, decimals: 2) + + case currency do + :usd -> "$#{value}" + :eur -> "€#{value}" + :gbp -> "£#{value}" + other -> "#{value} #{other |> Atom.to_string() |> String.upcase()}" + end + end + + @doc """ + Generates an invoice reference string. + + The format is "INV-" followed by the customer ID padded to 5 digits, + a dash, and the sequence number padded to 4 digits. + Customer IDs and sequence numbers must be positive integers. + """ + # Bad: describes the format in prose but provides no runnable example. + # No doctest means the claimed format cannot be verified automatically. + @spec invoice_ref(pos_integer(), pos_integer()) :: String.t() + def invoice_ref(customer_id, sequence) + when is_integer(customer_id) and is_integer(sequence) do + "INV-#{String.pad_leading(to_string(customer_id), 5, "0")}-#{String.pad_leading(to_string(sequence), 4, "0")}" + end + + @doc """ + Checks whether an amount is valid for charging. + Returns true for amounts between 1 and 1,000,000 (inclusive), false otherwise. + Zero is not a valid charge amount. Amounts above one million are rejected. + """ + # Bad: no iex> examples. The boundary conditions (0, 1, 1_000_000, 1_000_001) + # are described in words but never tested via doctest. 
+ @spec valid_amount?(integer()) :: boolean() + def valid_amount?(amount) when is_integer(amount) do + amount in 1..1_000_000 + end +end diff --git a/priv/combined_metrics/samples/documentation/doctests_validate_examples/good/billing.ex b/priv/combined_metrics/samples/documentation/doctests_validate_examples/good/billing.ex new file mode 100644 index 0000000..d451fb6 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/doctests_validate_examples/good/billing.ex @@ -0,0 +1,73 @@ +defmodule MyApp.Billing.Formatter do + @moduledoc """ + Formatting utilities for billing amounts and invoice references. + + All examples in this module's `@doc` blocks are run as ExUnit doctests. + See `test/my_app/billing/formatter_test.exs`. + """ + + @doc """ + Formats an integer amount in cents to a display string for the given currency. + + iex> MyApp.Billing.Formatter.format_amount(2999, :usd) + "$29.99" + + iex> MyApp.Billing.Formatter.format_amount(1050, :eur) + "€10.50" + + iex> MyApp.Billing.Formatter.format_amount(0, :gbp) + "£0.00" + + iex> MyApp.Billing.Formatter.format_amount(100, :unknown) + "1.00 UNKNOWN" + """ + @spec format_amount(integer(), atom()) :: String.t() + def format_amount(cents, currency) when is_integer(cents) and cents >= 0 do + value = :erlang.float_to_binary(cents / 100, decimals: 2) + + case currency do + :usd -> "$#{value}" + :eur -> "€#{value}" + :gbp -> "£#{value}" + other -> "#{value} #{other |> Atom.to_string() |> String.upcase()}" + end + end + + @doc """ + Generates an invoice reference from a customer ID and sequence number. 
+ + iex> MyApp.Billing.Formatter.invoice_ref(42, 7) + "INV-00042-0007" + + iex> MyApp.Billing.Formatter.invoice_ref(1, 1) + "INV-00001-0001" + + iex> MyApp.Billing.Formatter.invoice_ref(99999, 9999) + "INV-99999-9999" + """ + @spec invoice_ref(pos_integer(), pos_integer()) :: String.t() + def invoice_ref(customer_id, sequence) + when is_integer(customer_id) and is_integer(sequence) do + "INV-#{String.pad_leading(to_string(customer_id), 5, "0")}-#{String.pad_leading(to_string(sequence), 4, "0")}" + end + + @doc """ + Returns true if the amount is within the acceptable charge range. + + iex> MyApp.Billing.Formatter.valid_amount?(50) + true + + iex> MyApp.Billing.Formatter.valid_amount?(0) + false + + iex> MyApp.Billing.Formatter.valid_amount?(10_000_00) + true + + iex> MyApp.Billing.Formatter.valid_amount?(10_000_01) + false + """ + @spec valid_amount?(integer()) :: boolean() + def valid_amount?(amount) when is_integer(amount) do + amount in 1..1_000_000 + end +end diff --git a/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/bad/ShippingService.swift b/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/bad/ShippingService.swift new file mode 100644 index 0000000..1bca816 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/bad/ShippingService.swift @@ -0,0 +1,57 @@ +import Foundation + +// No doc comment on the enum +enum ShipmentStatus { + // No doc comment on cases + case pending + case inTransit(currentLocation: String) + case delivered(at: Date) + case attemptedDelivery(attemptedAt: Date) + case returned(reason: String) +} + +// No doc comment on the struct +struct Shipment { + // No doc comment on properties + let trackingNumber: String + let destinationAddress: String + let estimatedDeliveryDate: Date + var status: ShipmentStatus + let carrier: String +} + +// No doc comment on the class +class ShippingService { + private var shipments: [String: Shipment] = [:] 
+ + // No doc comment — what does this return, and what are the failure conditions? + @discardableResult + func register(_ shipment: Shipment) -> Bool { + guard shipments[shipment.trackingNumber] == nil else { return false } + shipments[shipment.trackingNumber] = shipment + return true + } + + // No doc comment — unclear whether nil means "not found" or "error" + func shipment(for trackingNumber: String) -> Shipment? { + return shipments[trackingNumber] + } + + // No doc comment — parameters and return value undocumented + @discardableResult + func updateStatus(for trackingNumber: String, to status: ShipmentStatus) -> Bool { + guard shipments[trackingNumber] != nil else { return false } + shipments[trackingNumber]?.status = status + return true + } + + // No doc comment — what counts as "overdue"? What is referenceDate for? + func overdueShipments(referenceDate: Date = Date()) -> [Shipment] { + return shipments.values + .filter { shipment in + if case .delivered = shipment.status { return false } + return shipment.estimatedDeliveryDate < referenceDate + } + .sorted { $0.estimatedDeliveryDate < $1.estimatedDeliveryDate } + } +} diff --git a/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/good/ShippingService.swift b/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/good/ShippingService.swift new file mode 100644 index 0000000..3b7a244 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/every_declaration_has_doc_comment/good/ShippingService.swift @@ -0,0 +1,85 @@ +import Foundation + +/// Represents the current status of a shipment in the delivery pipeline. +enum ShipmentStatus { + /// The order has been placed but not yet picked up by a carrier. + case pending + /// The package is in transit between facilities. + case inTransit(currentLocation: String) + /// The package has been delivered to the destination address. 
+ case delivered(at: Date) + /// Delivery was attempted but the recipient was unavailable. + case attemptedDelivery(attemptedAt: Date) + /// The shipment was returned to the sender. + case returned(reason: String) +} + +/// Encapsulates all tracking information for a single shipment. +struct Shipment { + /// The unique tracking number assigned by the carrier. + let trackingNumber: String + /// The destination address. + let destinationAddress: String + /// The estimated delivery date provided at the time of dispatch. + let estimatedDeliveryDate: Date + /// The current status of the shipment. + var status: ShipmentStatus + /// The carrier responsible for delivery (e.g., "FedEx", "UPS"). + let carrier: String +} + +/// Manages shipment creation, tracking updates, and delivery confirmation. +/// +/// Use this service as the single point of contact for all shipping operations. +/// It maintains an in-memory registry of active shipments. +class ShippingService { + private var shipments: [String: Shipment] = [:] + + /// Registers a new shipment and begins tracking it. + /// + /// - Parameter shipment: The shipment to register. The `trackingNumber` must be unique. + /// - Returns: `true` if registration succeeded; `false` if the tracking number already exists. + @discardableResult + func register(_ shipment: Shipment) -> Bool { + guard shipments[shipment.trackingNumber] == nil else { return false } + shipments[shipment.trackingNumber] = shipment + return true + } + + /// Returns the shipment with the given tracking number, if it exists. + /// + /// - Parameter trackingNumber: The carrier-assigned tracking number. + /// - Returns: The matching `Shipment`, or `nil` if not found. + func shipment(for trackingNumber: String) -> Shipment? { + return shipments[trackingNumber] + } + + /// Updates the status of an existing shipment. + /// + /// - Parameters: + /// - trackingNumber: The tracking number of the shipment to update. + /// - status: The new status to apply. 
+ /// - Returns: `true` if the update was applied; `false` if the tracking number was not found. + @discardableResult + func updateStatus(for trackingNumber: String, to status: ShipmentStatus) -> Bool { + guard shipments[trackingNumber] != nil else { return false } + shipments[trackingNumber]?.status = status + return true + } + + /// Returns all shipments that are currently overdue based on their estimated delivery date. + /// + /// A shipment is considered overdue if it has not been delivered and its estimated + /// delivery date is in the past. + /// + /// - Parameter referenceDate: The date to compare against. Defaults to the current date. + /// - Returns: An array of overdue shipments, sorted by estimated delivery date ascending. + func overdueShipments(referenceDate: Date = Date()) -> [Shipment] { + return shipments.values + .filter { shipment in + if case .delivered = shipment.status { return false } + return shipment.estimatedDeliveryDate < referenceDate + } + .sorted { $0.estimatedDeliveryDate < $1.estimatedDeliveryDate } + } +} diff --git a/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/bad/cache.go b/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/bad/cache.go new file mode 100644 index 0000000..b53ea42 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/bad/cache.go @@ -0,0 +1,67 @@ +package cache + +import ( + "sync" + "time" +) + +// no doc comment on exported var +var ErrExpired = errExpired{} + +type errExpired struct{} + +func (errExpired) Error() string { return "cache entry expired" } + +// no doc comment on exported type +type Entry struct { + Value interface{} + ExpiresAt time.Time +} + +// no doc comment on exported method +func (e Entry) IsExpired() bool { + return time.Now().After(e.ExpiresAt) +} + +// no doc comment on exported type +type Cache struct { + mu sync.RWMutex + entries map[string]Entry +} + +// no doc comment on exported 
constructor +func New() *Cache { + return &Cache{entries: make(map[string]Entry)} +} + +// no doc comment on exported method +func (c *Cache) Set(key string, value interface{}, ttl time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + if ttl <= 0 { + delete(c.entries, key) + return + } + c.entries[key] = Entry{Value: value, ExpiresAt: time.Now().Add(ttl)} +} + +// no doc comment on exported method +func (c *Cache) Get(key string) (interface{}, error) { + c.mu.RLock() + defer c.mu.RUnlock() + entry, ok := c.entries[key] + if !ok { + return nil, nil + } + if entry.IsExpired() { + return nil, ErrExpired + } + return entry.Value, nil +} + +// no doc comment on exported method +func (c *Cache) Delete(key string) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.entries, key) +} diff --git a/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/good/cache.go b/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/good/cache.go new file mode 100644 index 0000000..973bb5c --- /dev/null +++ b/priv/combined_metrics/samples/documentation/exported_symbol_has_doc_comment/good/cache.go @@ -0,0 +1,72 @@ +// Package cache provides an in-memory key-value cache with TTL-based expiry. +package cache + +import ( + "sync" + "time" +) + +// ErrExpired is returned when a requested key exists but its TTL has elapsed. +var ErrExpired = errExpired{} + +type errExpired struct{} + +func (errExpired) Error() string { return "cache entry expired" } + +// Entry holds a cached value together with its expiry time. +type Entry struct { + Value interface{} + ExpiresAt time.Time +} + +// IsExpired reports whether the entry's TTL has elapsed. +func (e Entry) IsExpired() bool { + return time.Now().After(e.ExpiresAt) +} + +// Cache is a thread-safe in-memory store with per-entry TTLs. +// The zero value is not usable; construct one with New. +type Cache struct { + mu sync.RWMutex + entries map[string]Entry +} + +// New constructs an empty Cache ready for use. 
+func New() *Cache { + return &Cache{entries: make(map[string]Entry)} +} + +// Set stores value under key with the given TTL. +// Calling Set with a non-positive TTL removes any existing entry for key. +func (c *Cache) Set(key string, value interface{}, ttl time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + if ttl <= 0 { + delete(c.entries, key) + return + } + c.entries[key] = Entry{Value: value, ExpiresAt: time.Now().Add(ttl)} +} + +// Get returns the value stored under key. +// It returns ErrExpired if the entry exists but has elapsed, and a nil error +// with a nil value if the key is absent. +func (c *Cache) Get(key string) (interface{}, error) { + c.mu.RLock() + defer c.mu.RUnlock() + entry, ok := c.entries[key] + if !ok { + return nil, nil + } + if entry.IsExpired() { + return nil, ErrExpired + } + return entry.Value, nil +} + +// Delete removes the entry for key. It is a no-op if key is not present. +func (c *Cache) Delete(key string) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.entries, key) +} diff --git a/priv/combined_metrics/samples/documentation/file_has_license_header/bad/core.ex b/priv/combined_metrics/samples/documentation/file_has_license_header/bad/core.ex new file mode 100644 index 0000000..b07eabd --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_license_header/bad/core.ex @@ -0,0 +1,74 @@ +defmodule MyApp.Core do + @moduledoc """ + Core utility functions shared across all contexts. + + Provides helpers for formatting, type coercion, and safe value + extraction that do not belong to any specific domain context. 
+ """ + + @spec format_currency(Decimal.t(), String.t()) :: String.t() + def format_currency(%Decimal{} = amount, currency \\ "USD") do + formatted = + amount + |> Decimal.round(2) + |> Decimal.to_string(:normal) + + "#{currency} #{formatted}" + end + + @spec truncate(String.t(), non_neg_integer()) :: String.t() + def truncate(string, max_length) when byte_size(string) <= max_length, do: string + + def truncate(string, max_length) do + String.slice(string, 0, max_length - 3) <> "..." + end + + @spec slugify(String.t()) :: String.t() + def slugify(string) do + string + |> String.downcase() + |> String.replace(~r/[^a-z0-9\s-]/, "") + |> String.replace(~r/\s+/, "-") + |> String.trim("-") + end + + @spec safe_to_integer(term()) :: {:ok, integer()} | :error + def safe_to_integer(value) when is_integer(value), do: {:ok, value} + + def safe_to_integer(value) when is_binary(value) do + case Integer.parse(value) do + {int, ""} -> {:ok, int} + _ -> :error + end + end + + def safe_to_integer(_), do: :error + + @spec deep_merge(map(), map()) :: map() + def deep_merge(left, right) do + Map.merge(left, right, fn _key, left_val, right_val -> + if is_map(left_val) and is_map(right_val) do + deep_merge(left_val, right_val) + else + right_val + end + end) + end + + @spec present?(term()) :: boolean() + def present?(nil), do: false + def present?(""), do: false + def present?([]), do: false + def present?(%{} = map) when map_size(map) == 0, do: false + def present?(_), do: true + + @spec blank?(term()) :: boolean() + def blank?(value), do: not present?(value) + + @spec wrap_ok(term()) :: {:ok, term()} + def wrap_ok(value), do: {:ok, value} + + @spec unwrap_ok!({:ok, term()}) :: term() + def unwrap_ok!({:ok, value}), do: value + def unwrap_ok!({:error, reason}), do: raise("Expected {:ok, _}, got {:error, #{inspect(reason)}}") +end diff --git a/priv/combined_metrics/samples/documentation/file_has_license_header/config.yml 
b/priv/combined_metrics/samples/documentation/file_has_license_header/config.yml new file mode 100644 index 0000000..5611933 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_license_header/config.yml @@ -0,0 +1 @@ +doc: "Source files should begin with a license or copyright header." diff --git a/priv/combined_metrics/samples/documentation/file_has_license_header/good/core.ex b/priv/combined_metrics/samples/documentation/file_has_license_header/good/core.ex new file mode 100644 index 0000000..93e899d --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_license_header/good/core.ex @@ -0,0 +1,76 @@ +# Copyright (c) 2024 Acme Corp. MIT License. + +defmodule MyApp.Core do + @moduledoc """ + Core utility functions shared across all contexts. + + Provides helpers for formatting, type coercion, and safe value + extraction that do not belong to any specific domain context. + """ + + @spec format_currency(Decimal.t(), String.t()) :: String.t() + def format_currency(%Decimal{} = amount, currency \\ "USD") do + formatted = + amount + |> Decimal.round(2) + |> Decimal.to_string(:normal) + + "#{currency} #{formatted}" + end + + @spec truncate(String.t(), non_neg_integer()) :: String.t() + def truncate(string, max_length) when byte_size(string) <= max_length, do: string + + def truncate(string, max_length) do + String.slice(string, 0, max_length - 3) <> "..." 
+ end + + @spec slugify(String.t()) :: String.t() + def slugify(string) do + string + |> String.downcase() + |> String.replace(~r/[^a-z0-9\s-]/, "") + |> String.replace(~r/\s+/, "-") + |> String.trim("-") + end + + @spec safe_to_integer(term()) :: {:ok, integer()} | :error + def safe_to_integer(value) when is_integer(value), do: {:ok, value} + + def safe_to_integer(value) when is_binary(value) do + case Integer.parse(value) do + {int, ""} -> {:ok, int} + _ -> :error + end + end + + def safe_to_integer(_), do: :error + + @spec deep_merge(map(), map()) :: map() + def deep_merge(left, right) do + Map.merge(left, right, fn _key, left_val, right_val -> + if is_map(left_val) and is_map(right_val) do + deep_merge(left_val, right_val) + else + right_val + end + end) + end + + @spec present?(term()) :: boolean() + def present?(nil), do: false + def present?(""), do: false + def present?([]), do: false + def present?(%{} = map) when map_size(map) == 0, do: false + def present?(_), do: true + + @spec blank?(term()) :: boolean() + def blank?(value), do: not present?(value) + + @spec wrap_ok(term()) :: {:ok, term()} + def wrap_ok(value), do: {:ok, value} + + @spec unwrap_ok!({:ok, term()}) :: term() + def unwrap_ok!({:ok, value}), do: value + def unwrap_ok!({:error, reason}), do: raise("Expected {:ok, _}, got {:error, #{inspect(reason)}}") +end diff --git a/priv/combined_metrics/samples/documentation/file_has_module_docstring/bad/shipping.ex b/priv/combined_metrics/samples/documentation/file_has_module_docstring/bad/shipping.ex new file mode 100644 index 0000000..044c4f8 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_module_docstring/bad/shipping.ex @@ -0,0 +1,73 @@ +defmodule MyApp.Shipping do + alias MyApp.Repo + alias MyApp.Shipping.Shipment + alias MyApp.Shipping.TrackingEvent + alias MyApp.Orders.Order + + @spec create_shipment(Order.t(), map()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def create_shipment(%Order{} = order, attrs) do 
+ %Shipment{} + |> Shipment.changeset(Map.put(attrs, :order_id, order.id)) + |> Repo.insert() + end + + @spec get_shipment!(integer()) :: Shipment.t() + def get_shipment!(id) do + Repo.get!(Shipment, id) + |> Repo.preload(:tracking_events) + end + + @spec update_shipment(Shipment.t(), map()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def update_shipment(%Shipment{} = shipment, attrs) do + shipment + |> Shipment.changeset(attrs) + |> Repo.update() + end + + @spec cancel_shipment(Shipment.t()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def cancel_shipment(%Shipment{} = shipment) do + update_shipment(shipment, %{status: :cancelled}) + end + + @spec add_tracking_event(Shipment.t(), map()) :: + {:ok, TrackingEvent.t()} | {:error, Ecto.Changeset.t()} + def add_tracking_event(%Shipment{} = shipment, attrs) do + %TrackingEvent{} + |> TrackingEvent.changeset(Map.put(attrs, :shipment_id, shipment.id)) + |> Repo.insert() + end + + @spec estimated_delivery(Shipment.t()) :: Date.t() | nil + def estimated_delivery(%Shipment{shipped_at: nil}), do: nil + + def estimated_delivery(%Shipment{shipped_at: shipped_at, service: service}) do + days = transit_days(service) + Date.add(DateTime.to_date(shipped_at), days) + end + + @spec active_shipments_for_user(map()) :: [Shipment.t()] + def active_shipments_for_user(%{id: user_id}) do + Repo.all( + from s in Shipment, + join: o in Order, + on: o.id == s.order_id, + where: o.user_id == ^user_id and s.status == :in_transit + ) + end + + @spec calculate_shipping_cost(map(), String.t()) :: Decimal.t() + def calculate_shipping_cost(%{weight_grams: weight}, destination_zone) do + base = base_rate(destination_zone) + weight_cost = Decimal.mult(Decimal.new(weight), Decimal.new("0.001")) + Decimal.add(base, weight_cost) + end + + defp transit_days(:standard), do: 5 + defp transit_days(:express), do: 2 + defp transit_days(:overnight), do: 1 + defp transit_days(_), do: 7 + + defp base_rate("domestic"), do: 
Decimal.new("4.99") + defp base_rate("international"), do: Decimal.new("19.99") + defp base_rate(_), do: Decimal.new("9.99") +end diff --git a/priv/combined_metrics/samples/documentation/file_has_module_docstring/config.yml b/priv/combined_metrics/samples/documentation/file_has_module_docstring/config.yml new file mode 100644 index 0000000..d3f61c3 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_module_docstring/config.yml @@ -0,0 +1 @@ +doc: "Files should have a module-level docstring explaining purpose and usage." diff --git a/priv/combined_metrics/samples/documentation/file_has_module_docstring/good/shipping.ex b/priv/combined_metrics/samples/documentation/file_has_module_docstring/good/shipping.ex new file mode 100644 index 0000000..5d7e2de --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_module_docstring/good/shipping.ex @@ -0,0 +1,88 @@ +defmodule MyApp.Shipping do + @moduledoc """ + Context module for shipment lifecycle management. + + Handles creating and tracking shipments tied to orders, recording + tracking events, and estimating delivery dates based on the selected + shipping service and destination zone. + + All public functions in this module are the sole entry point for + shipment-related operations. Internal schema modules such as + `MyApp.Shipping.Shipment` and `MyApp.Shipping.TrackingEvent` should + not be accessed directly from outside this context. 
+ """ + + alias MyApp.Repo + alias MyApp.Shipping.Shipment + alias MyApp.Shipping.TrackingEvent + alias MyApp.Orders.Order + + @spec create_shipment(Order.t(), map()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def create_shipment(%Order{} = order, attrs) do + %Shipment{} + |> Shipment.changeset(Map.put(attrs, :order_id, order.id)) + |> Repo.insert() + end + + @spec get_shipment!(integer()) :: Shipment.t() + def get_shipment!(id) do + Repo.get!(Shipment, id) + |> Repo.preload(:tracking_events) + end + + @spec update_shipment(Shipment.t(), map()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def update_shipment(%Shipment{} = shipment, attrs) do + shipment + |> Shipment.changeset(attrs) + |> Repo.update() + end + + @spec cancel_shipment(Shipment.t()) :: {:ok, Shipment.t()} | {:error, Ecto.Changeset.t()} + def cancel_shipment(%Shipment{} = shipment) do + update_shipment(shipment, %{status: :cancelled}) + end + + @spec add_tracking_event(Shipment.t(), map()) :: + {:ok, TrackingEvent.t()} | {:error, Ecto.Changeset.t()} + def add_tracking_event(%Shipment{} = shipment, attrs) do + %TrackingEvent{} + |> TrackingEvent.changeset(Map.put(attrs, :shipment_id, shipment.id)) + |> Repo.insert() + end + + @spec estimated_delivery(Shipment.t()) :: Date.t() | nil + def estimated_delivery(%Shipment{shipped_at: nil}), do: nil + + def estimated_delivery(%Shipment{shipped_at: shipped_at, service: service}) do + days = transit_days(service) + Date.add(DateTime.to_date(shipped_at), days) + end + + @spec active_shipments_for_user(map()) :: [Shipment.t()] + def active_shipments_for_user(%{id: user_id}) do + import Ecto.Query, only: [from: 2, where: 3, join: 5] + + Repo.all( + from s in Shipment, + join: o in Order, + on: o.id == s.order_id, + where: o.user_id == ^user_id and s.status == :in_transit + ) + end + + @spec calculate_shipping_cost(map(), String.t()) :: Decimal.t() + def calculate_shipping_cost(%{weight_grams: weight}, destination_zone) do + base = 
base_rate(destination_zone) + weight_cost = Decimal.mult(Decimal.new(weight), Decimal.new("0.001")) + Decimal.add(base, weight_cost) + end + + defp transit_days(:standard), do: 5 + defp transit_days(:express), do: 2 + defp transit_days(:overnight), do: 1 + defp transit_days(_), do: 7 + + defp base_rate("domestic"), do: Decimal.new("4.99") + defp base_rate("international"), do: Decimal.new("19.99") + defp base_rate(_), do: Decimal.new("9.99") +end diff --git a/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/bad/legacy_importer.ex b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/bad/legacy_importer.ex new file mode 100644 index 0000000..78f7f78 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/bad/legacy_importer.ex @@ -0,0 +1,91 @@ +defmodule MyApp.LegacyImporter do + @moduledoc """ + Handles importing records from the legacy CSV export format. + """ + + alias MyApp.Repo + alias MyApp.Accounts.User + alias MyApp.Orders.Order + + # def import_all(path) do + # path + # |> File.stream!() + # |> CSV.decode!(headers: true) + # |> Enum.map(&import_row/1) + # end + + @spec import_file(String.t()) :: {:ok, non_neg_integer()} | {:error, String.t()} + def import_file(path) do + rows = + path + |> File.stream!() + |> Stream.map(&String.trim/1) + |> Stream.reject(&(&1 == "")) + |> Stream.drop(1) + |> Enum.map(&String.split(&1, ",")) + + results = Enum.map(rows, &import_row/1) + errors = Enum.filter(results, &match?({:error, _}, &1)) + + if errors == [] do + {:ok, length(results)} + else + {:error, "#{length(errors)} rows failed"} + end + end + + # Old row import — replaced by pattern matched version below + # def import_row(row) do + # user = Repo.get_by(User, email: Enum.at(row, 2)) + # if user do + # %Order{user_id: user.id, total: String.to_float(Enum.at(row, 5))} + # |> Repo.insert() + # else + # {:error, "user not found"} + # end + # end + + defp import_row([id, _name, 
email, date, _source, total | _rest]) do + with %User{} = user <- Repo.get_by(User, email: email), + {:ok, ordered_at} <- Date.from_iso8601(date), + {amount, _} <- Float.parse(total) do + %Order{} + |> Order.changeset(%{ + legacy_id: id, + user_id: user.id, + total: amount, + ordered_at: ordered_at + }) + |> Repo.insert() + else + nil -> {:error, "unknown user: #{email}"} + {:error, reason} -> {:error, reason} + end + end + + defp import_row(_invalid), do: {:error, "malformed row"} + + # TODO: add dry_run mode + # def dry_run(path) do + # import_file(path) + # |> case do + # {:ok, count} -> IO.puts("Would import #{count} rows") + # {:error, msg} -> IO.puts("Error: #{msg}") + # end + # end + + @spec summary_stats(String.t()) :: map() + def summary_stats(path) do + rows = + path + |> File.stream!() + |> Stream.drop(1) + |> Enum.to_list() + + %{ + total_rows: length(rows), + # valid_rows: Enum.count(rows, &valid_row?/1), + file: path + } + end +end diff --git a/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/config.yml b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/config.yml new file mode 100644 index 0000000..345b09b --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/config.yml @@ -0,0 +1 @@ +doc: "Files should not contain commented-out code blocks left from development." diff --git a/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/good/legacy_importer.ex b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/good/legacy_importer.ex new file mode 100644 index 0000000..aa6d3da --- /dev/null +++ b/priv/combined_metrics/samples/documentation/file_has_no_commented_out_code/good/legacy_importer.ex @@ -0,0 +1,83 @@ +defmodule MyApp.LegacyImporter do + @moduledoc """ + Handles importing records from the legacy CSV export format. 
+ + Rows are expected to be comma-separated with the following columns: + `id, name, email, date, source, total`. + + Use `import_file/1` to run the full import and receive a success count, + or `summary_stats/1` to inspect the file without persisting any records. + """ + + alias MyApp.Repo + alias MyApp.Accounts.User + alias MyApp.Orders.Order + + @spec import_file(String.t()) :: {:ok, non_neg_integer()} | {:error, String.t()} + def import_file(path) do + rows = + path + |> File.stream!() + |> Stream.map(&String.trim/1) + |> Stream.reject(&(&1 == "")) + |> Stream.drop(1) + |> Enum.map(&String.split(&1, ",")) + + results = Enum.map(rows, &import_row/1) + failed = Enum.count(results, &match?({:error, _}, &1)) + + if failed == 0 do + {:ok, length(results)} + else + {:error, "#{failed} rows failed to import"} + end + end + + @spec dry_run(String.t()) :: {:ok, non_neg_integer()} | {:error, String.t()} + def dry_run(path) do + Repo.transaction(fn -> + case import_file(path) do + {:ok, count} -> + Repo.rollback({:dry_run, count}) + + {:error, reason} -> + Repo.rollback({:error, reason}) + end + end) + |> case do + {:error, {:dry_run, count}} -> {:ok, count} + {:error, {:error, reason}} -> {:error, reason} + end + end + + @spec summary_stats(String.t()) :: map() + def summary_stats(path) do + rows = + path + |> File.stream!() + |> Stream.drop(1) + |> Enum.to_list() + + %{total_rows: length(rows), file: path} + end + + defp import_row([id, _name, email, date, _source, total | _rest]) do + with %User{} = user <- Repo.get_by(User, email: email), + {:ok, ordered_at} <- Date.from_iso8601(date), + {amount, _} <- Float.parse(total) do + %Order{} + |> Order.changeset(%{ + legacy_id: id, + user_id: user.id, + total: amount, + ordered_at: ordered_at + }) + |> Repo.insert() + else + nil -> {:error, "unknown user: #{email}"} + {:error, reason} -> {:error, reason} + end + end + + defp import_row(_invalid), do: {:error, "malformed row"} +end diff --git 
a/priv/combined_metrics/samples/documentation/function_has_docstring/bad/tax.ex b/priv/combined_metrics/samples/documentation/function_has_docstring/bad/tax.ex new file mode 100644 index 0000000..48812b4 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_has_docstring/bad/tax.ex @@ -0,0 +1,73 @@ +defmodule MyApp.Tax do + @moduledoc """ + Tax calculation context for order totals and line items. + """ + + alias MyApp.Tax.Rate + alias MyApp.Tax.Exemption + alias MyApp.Repo + + @spec calculate(Decimal.t(), String.t()) :: Decimal.t() + def calculate(subtotal, region) do + rate = fetch_rate(region) + Decimal.mult(subtotal, rate) + end + + @spec calculate_line_items([map()], String.t()) :: [map()] + def calculate_line_items(items, region) do + rate = fetch_rate(region) + Enum.map(items, fn item -> + tax = Decimal.mult(item.price, rate) + Map.put(item, :tax, tax) + end) + end + + @spec effective_rate(String.t()) :: Decimal.t() + def effective_rate(region) do + fetch_rate(region) + end + + @spec exempt?(String.t(), String.t()) :: boolean() + def exempt?(product_category, region) do + Repo.get_by(Exemption, category: product_category, region: region) != nil + end + + @spec apply_exemptions([map()], String.t()) :: [map()] + def apply_exemptions(items, region) do + Enum.map(items, fn item -> + if exempt?(item.category, region) do + Map.put(item, :tax, Decimal.new(0)) + else + item + end + end) + end + + @spec summarize(Decimal.t(), String.t()) :: map() + def summarize(subtotal, region) do + tax = calculate(subtotal, region) + total = Decimal.add(subtotal, tax) + + %{ + subtotal: subtotal, + tax: tax, + total: total, + rate: effective_rate(region), + region: region + } + end + + @spec annual_liability([map()]) :: Decimal.t() + def annual_liability(transactions) do + transactions + |> Enum.map(& &1.tax) + |> Enum.reduce(Decimal.new(0), &Decimal.add/2) + end + + defp fetch_rate(region) do + case Repo.get_by(Rate, region: region) do + %Rate{rate: rate} -> 
rate + nil -> Decimal.new("0.10") + end + end +end diff --git a/priv/combined_metrics/samples/documentation/function_has_docstring/config.yml b/priv/combined_metrics/samples/documentation/function_has_docstring/config.yml new file mode 100644 index 0000000..bac2ee2 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_has_docstring/config.yml @@ -0,0 +1 @@ +doc: "Public functions should have a docstring describing behaviour, params, and return value." diff --git a/priv/combined_metrics/samples/documentation/function_has_docstring/good/tax.ex b/priv/combined_metrics/samples/documentation/function_has_docstring/good/tax.ex new file mode 100644 index 0000000..9767383 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_has_docstring/good/tax.ex @@ -0,0 +1,105 @@ +defmodule MyApp.Tax do + @moduledoc """ + Tax calculation context for order totals and line items. + """ + + alias MyApp.Tax.Rate + alias MyApp.Tax.Exemption + alias MyApp.Repo + + @doc """ + Calculates the tax amount for a given subtotal and region. + + Returns the tax amount as a `Decimal`, not the total. Use `summarize/2` + to get a full breakdown including subtotal, tax, and total. + + ## Examples + + iex> MyApp.Tax.calculate(Decimal.new("100.00"), "us-ca") + Decimal.new("8.25") + """ + @spec calculate(Decimal.t(), String.t()) :: Decimal.t() + def calculate(subtotal, region) do + rate = fetch_rate(region) + Decimal.mult(subtotal, rate) + end + + @doc """ + Applies per-line-item tax to a list of order items for a given region. + + Each item map must have a `:price` and `:category` key. Returns the + same list with a `:tax` key added to each item. Items with an exemption + in the given region receive a tax of zero. 
+ """ + @spec calculate_line_items([map()], String.t()) :: [map()] + def calculate_line_items(items, region) do + items + |> apply_exemptions(region) + |> Enum.map(&apply_tax_rate(&1, fetch_rate(region))) + end + + @doc """ + Returns the applicable tax rate for the given region as a `Decimal`. + + Falls back to a default rate of 10% when no specific rate is configured. + """ + @spec effective_rate(String.t()) :: Decimal.t() + def effective_rate(region) do + fetch_rate(region) + end + + @doc """ + Returns `true` if the given product category is tax-exempt in a region. + """ + @spec exempt?(String.t(), String.t()) :: boolean() + def exempt?(product_category, region) do + Repo.get_by(Exemption, category: product_category, region: region) != nil + end + + @doc """ + Returns a tax summary map for a subtotal and region. + + The map contains `:subtotal`, `:tax`, `:total`, `:rate`, and `:region`. + """ + @spec summarize(Decimal.t(), String.t()) :: map() + def summarize(subtotal, region) do + tax = calculate(subtotal, region) + total = Decimal.add(subtotal, tax) + + %{ + subtotal: subtotal, + tax: tax, + total: total, + rate: effective_rate(region), + region: region + } + end + + @doc """ + Sums the total tax liability across a list of transactions. + + Each transaction map must have a `:tax` key with a `Decimal` value. 
+ """ + @spec annual_liability([map()]) :: Decimal.t() + def annual_liability(transactions) do + transactions + |> Enum.map(& &1.tax) + |> Enum.reduce(Decimal.new(0), &Decimal.add/2) + end + + defp apply_exemptions(items, region) do + Enum.map(items, fn item -> + if exempt?(item.category, region), do: Map.put(item, :tax, Decimal.new(0)), else: item + end) + end + + defp apply_tax_rate(%{tax: _already_set} = item, _rate), do: item + defp apply_tax_rate(item, rate), do: Map.put(item, :tax, Decimal.mult(item.price, rate)) + + defp fetch_rate(region) do + case Repo.get_by(Rate, region: region) do + %Rate{rate: rate} -> rate + nil -> Decimal.new("0.10") + end + end +end diff --git a/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/bad/sync.ex b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/bad/sync.ex new file mode 100644 index 0000000..4cfb397 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/bad/sync.ex @@ -0,0 +1,95 @@ +defmodule MyApp.Sync do + @moduledoc """ + Data synchronization service for reconciling records with an external system. 
+ """ + + alias MyApp.Repo + alias MyApp.Sync.Record + alias MyApp.ExternalAPI + + @spec sync_all() :: {:ok, map()} | {:error, String.t()} + def sync_all() do + # TODO: add pagination support so we don't fetch all records at once + records = Repo.all(Record) + results = Enum.map(records, &sync_record/1) + + %{ + total: length(results), + success: Enum.count(results, &match?({:ok, _}, &1)), + failed: Enum.count(results, &match?({:error, _}, &1)) + } + |> then(&{:ok, &1}) + end + + @spec sync_record(Record.t()) :: {:ok, Record.t()} | {:error, String.t()} + def sync_record(%Record{} = record) do + # TODO: handle rate limiting from ExternalAPI + case ExternalAPI.push(record.external_id, record_payload(record)) do + {:ok, response} -> + # TODO: parse and store the response metadata + update_synced_at(record, response) + + {:error, reason} -> + {:error, "ExternalAPI error: #{reason}"} + end + end + + @spec pull_updates(DateTime.t()) :: {:ok, non_neg_integer()} | {:error, String.t()} + def pull_updates(since) do + # TODO: implement delta sync — currently fetches everything + case ExternalAPI.list_updated(since) do + {:ok, items} -> + count = + items + |> Enum.map(&upsert_from_external/1) + |> Enum.count(&match?({:ok, _}, &1)) + + {:ok, count} + + {:error, reason} -> + {:error, reason} + end + end + + @spec conflict_resolution(Record.t(), map()) :: {:ok, Record.t()} + def conflict_resolution(%Record{} = local, remote) do + # TODO: implement proper conflict resolution strategy (last-write-wins vs merge) + if DateTime.compare(local.updated_at, remote["updated_at"]) == :gt do + {:ok, local} + else + upsert_from_external(remote) + end + end + + @spec status() :: map() + def status() do + # TODO: add last_error_at tracking + total = Repo.aggregate(Record, :count) + synced = Repo.aggregate(from(r in Record, where: not is_nil(r.synced_at)), :count) + + %{ + total: total, + synced: synced, + pending: total - synced + } + end + + defp record_payload(%Record{} = record) do + %{id: 
record.external_id, data: record.payload, version: record.version} + end + + defp update_synced_at(record, _response) do + record + |> Record.changeset(%{synced_at: DateTime.utc_now()}) + |> Repo.update() + end + + defp upsert_from_external(%{"id" => ext_id} = data) do + attrs = %{external_id: ext_id, payload: data, synced_at: DateTime.utc_now()} + + case Repo.get_by(Record, external_id: ext_id) do + nil -> Repo.insert(Record.changeset(%Record{}, attrs)) + record -> Repo.update(Record.changeset(record, attrs)) + end + end +end diff --git a/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/config.yml b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/config.yml new file mode 100644 index 0000000..d1cd973 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/config.yml @@ -0,0 +1 @@ +doc: "Functions should not contain TODO/FIXME comments indicating unfinished work." diff --git a/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/good/sync.ex b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/good/sync.ex new file mode 100644 index 0000000..08778c0 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/function_todo_comment_in_body/good/sync.ex @@ -0,0 +1,104 @@ +defmodule MyApp.Sync do + @moduledoc """ + Data synchronization service for reconciling records with an external system. + + Supports full sync via `sync_all/0`, incremental pull via `pull_updates/1`, + and per-record sync via `sync_record/1`. Conflict resolution uses a + last-write-wins strategy based on `updated_at` timestamps. 
+ """ + + alias MyApp.Repo + alias MyApp.Sync.Record + alias MyApp.ExternalAPI + + @page_size 100 + + @spec sync_all() :: {:ok, map()} | {:error, String.t()} + def sync_all() do + results = + stream_all_records() + |> Enum.map(&sync_record/1) + + summary = %{ + total: length(results), + success: Enum.count(results, &match?({:ok, _}, &1)), + failed: Enum.count(results, &match?({:error, _}, &1)) + } + + {:ok, summary} + end + + @spec sync_record(Record.t()) :: {:ok, Record.t()} | {:error, String.t()} + def sync_record(%Record{} = record) do + case ExternalAPI.push(record.external_id, record_payload(record)) do + {:ok, response} -> update_synced_at(record, response) + {:error, reason} -> {:error, "ExternalAPI error: #{reason}"} + end + end + + @spec pull_updates(DateTime.t()) :: {:ok, non_neg_integer()} | {:error, String.t()} + def pull_updates(since) do + case ExternalAPI.list_updated(since) do + {:ok, items} -> + count = + items + |> Enum.map(&upsert_from_external/1) + |> Enum.count(&match?({:ok, _}, &1)) + + {:ok, count} + + {:error, reason} -> + {:error, reason} + end + end + + @spec conflict_resolution(Record.t(), map()) :: {:ok, Record.t()} + def conflict_resolution(%Record{} = local, remote) do + if DateTime.compare(local.updated_at, remote["updated_at"]) == :gt do + {:ok, local} + else + upsert_from_external(remote) + end + end + + @spec status() :: map() + def status() do + import Ecto.Query, only: [from: 2] + + total = Repo.aggregate(Record, :count) + synced = Repo.aggregate(from(r in Record, where: not is_nil(r.synced_at)), :count) + + %{ + total: total, + synced: synced, + pending: total - synced + } + end + + defp stream_all_records() do + import Ecto.Query, only: [from: 2] + + Repo.all(from r in Record, order_by: r.id) + |> Stream.chunk_every(@page_size) + |> Stream.flat_map(& &1) + end + + defp record_payload(%Record{} = record) do + %{id: record.external_id, data: record.payload, version: record.version} + end + + defp update_synced_at(record, 
_response) do + record + |> Record.changeset(%{synced_at: DateTime.utc_now()}) + |> Repo.update() + end + + defp upsert_from_external(%{"id" => ext_id} = data) do + attrs = %{external_id: ext_id, payload: data, synced_at: DateTime.utc_now()} + + case Repo.get_by(Record, external_id: ext_id) do + nil -> Repo.insert(Record.changeset(%Record{}, attrs)) + record -> Repo.update(Record.changeset(record, attrs)) + end + end +end diff --git a/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/bad/payments.ex b/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/bad/payments.ex new file mode 100644 index 0000000..2f886e0 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/bad/payments.ex @@ -0,0 +1,81 @@ +defmodule MyApp.Payments do + # Bad: no @moduledoc — the module's purpose, conventions, and usage + # are completely undiscoverable without reading the full source. + + alias MyApp.Payments.{Charge, Refund} + alias MyApp.Repo + + # Bad: no @doc on a public function. Callers cannot use `h MyApp.Payments.charge/3` + # in IEx, and documentation tools will not generate an entry for this function. 
+ @spec charge(integer(), pos_integer(), :usd | :eur | :gbp) :: + {:ok, Charge.t()} | {:error, atom()} + def charge(customer_id, amount, currency) + when is_integer(amount) and amount > 0 do + with {:ok, pm} <- fetch_default_payment_method(customer_id), + {:ok, result} <- MyApp.PaymentGateway.charge(pm.token, amount, currency) do + insert_charge(customer_id, amount, currency, result.transaction_id) + end + end + + # Bad: no @doc on this public function either + @spec refund(integer(), pos_integer()) :: {:ok, Refund.t()} | {:error, atom()} + def refund(charge_id, amount) when is_integer(amount) and amount > 0 do + with {:ok, charge} <- fetch_charge(charge_id), + :ok <- validate_refund_amount(charge, amount), + {:ok, result} <- MyApp.PaymentGateway.refund(charge.transaction_id, amount) do + insert_refund(charge, amount, result.refund_id) + end + end + + # Bad: `list_charges/2` is public but completely undocumented. + # What does `opts` accept? What order are results in? + def list_charges(customer_id, opts \\ []) do + limit = Keyword.get(opts, :limit, 20) + offset = Keyword.get(opts, :offset, 0) + + Repo.all( + from c in Charge, + where: c.customer_id == ^customer_id, + order_by: [desc: c.inserted_at], + limit: ^limit, + offset: ^offset + ) + end + + defp fetch_default_payment_method(customer_id) do + case Repo.get_by(MyApp.Payments.PaymentMethod, customer_id: customer_id, default: true) do + nil -> {:error, :no_payment_method} + pm -> {:ok, pm} + end + end + + defp fetch_charge(charge_id) do + case Repo.get(Charge, charge_id) do + nil -> {:error, :charge_not_found} + charge -> {:ok, charge} + end + end + + defp validate_refund_amount(%Charge{amount: orig}, amount) when amount > orig do + {:error, :exceeds_original} + end + + defp validate_refund_amount(_, _), do: :ok + + defp insert_charge(customer_id, amount, currency, transaction_id) do + %Charge{} + |> Charge.changeset(%{ + customer_id: customer_id, + amount: amount, + currency: currency, + transaction_id: 
transaction_id + }) + |> Repo.insert() + end + + defp insert_refund(charge, amount, refund_id) do + %Refund{} + |> Refund.changeset(%{charge_id: charge.id, amount: amount, refund_id: refund_id}) + |> Repo.insert() + end +end diff --git a/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/good/payments.ex b/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/good/payments.ex new file mode 100644 index 0000000..2d87c50 --- /dev/null +++ b/priv/combined_metrics/samples/documentation/public_api_has_moduledoc_and_doc/good/payments.ex @@ -0,0 +1,109 @@ +defmodule MyApp.Payments do + @moduledoc """ + Public API for processing payments and managing charges. + + All monetary amounts are in the smallest currency unit (e.g. cents for USD). + Currency codes follow ISO 4217 (e.g. `:usd`, `:eur`). + + ## Usage + + {:ok, charge} = MyApp.Payments.charge(customer_id, 2999, :usd) + {:ok, _} = MyApp.Payments.refund(charge.id, 2999) + """ + + alias MyApp.Payments.{Charge, Refund} + alias MyApp.Repo + + @doc """ + Creates a charge against a customer's default payment method. + + `amount` must be a positive integer in the smallest currency unit. + `currency` must be one of `:usd`, `:eur`, or `:gbp`. + + Returns `{:ok, charge}` on success, or `{:error, reason}` when the + customer has no payment method, the card is declined, or validation fails. + """ + @spec charge(integer(), pos_integer(), :usd | :eur | :gbp) :: + {:ok, Charge.t()} | {:error, atom()} + def charge(customer_id, amount, currency) + when is_integer(amount) and amount > 0 do + with {:ok, pm} <- fetch_default_payment_method(customer_id), + {:ok, result} <- MyApp.PaymentGateway.charge(pm.token, amount, currency) do + insert_charge(customer_id, amount, currency, result.transaction_id) + end + end + + @doc """ + Refunds a charge fully or partially. + + `amount` must not exceed the original charge amount. Pass the full charge + amount to issue a full refund. 
+ + Returns `{:ok, refund}` or `{:error, :exceeds_original}` when the requested + amount is greater than the charge amount. + """ + @spec refund(integer(), pos_integer()) :: {:ok, Refund.t()} | {:error, atom()} + def refund(charge_id, amount) when is_integer(amount) and amount > 0 do + with {:ok, charge} <- fetch_charge(charge_id), + :ok <- validate_refund_amount(charge, amount), + {:ok, result} <- MyApp.PaymentGateway.refund(charge.transaction_id, amount) do + insert_refund(charge, amount, result.refund_id) + end + end + + @doc """ + Lists all charges for a customer, ordered by most recent first. + + `opts` supports `:limit` (default 20) and `:offset` (default 0). + """ + @spec list_charges(integer(), keyword()) :: [Charge.t()] + def list_charges(customer_id, opts \\ []) do + limit = Keyword.get(opts, :limit, 20) + offset = Keyword.get(opts, :offset, 0) + + Repo.all( + from c in Charge, + where: c.customer_id == ^customer_id, + order_by: [desc: c.inserted_at], + limit: ^limit, + offset: ^offset + ) + end + + defp fetch_default_payment_method(customer_id) do + case Repo.get_by(MyApp.Payments.PaymentMethod, customer_id: customer_id, default: true) do + nil -> {:error, :no_payment_method} + pm -> {:ok, pm} + end + end + + defp fetch_charge(charge_id) do + case Repo.get(Charge, charge_id) do + nil -> {:error, :charge_not_found} + charge -> {:ok, charge} + end + end + + defp validate_refund_amount(%Charge{amount: orig}, amount) when amount > orig do + {:error, :exceeds_original} + end + + defp validate_refund_amount(_, _), do: :ok + + defp insert_charge(customer_id, amount, currency, transaction_id) do + %Charge{} + |> Charge.changeset(%{ + customer_id: customer_id, + amount: amount, + currency: currency, + transaction_id: transaction_id + }) + |> Repo.insert() + end + + defp insert_refund(charge, amount, refund_id) do + %Refund{} + |> Refund.changeset(%{charge_id: charge.id, amount: amount, refund_id: refund_id}) + |> Repo.insert() + end +end diff --git 
a/priv/combined_metrics/samples/error_handling/catches_specific_exception/bad/file_importer.py b/priv/combined_metrics/samples/error_handling/catches_specific_exception/bad/file_importer.py new file mode 100644 index 0000000..0ab63e0 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/catches_specific_exception/bad/file_importer.py @@ -0,0 +1,66 @@ +"""File importer that reads, parses, and ingests CSV data files.""" +from __future__ import annotations + +import csv +import io +import os +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ImportResult: + filename: str + rows_imported: int + rows_skipped: int + error: Optional[str] = None + + +def read_file(path: str) -> str: + """Read a file — catches all exceptions, masking programming errors.""" + try: + with open(path, encoding="utf-8") as fh: + return fh.read() + except Exception: # too broad: hides PermissionError, MemoryError, etc. + return "" + + +def parse_csv(content: str) -> list: + """Parse CSV text — broad catch swallows malformed-data signals.""" + try: + reader = csv.DictReader(io.StringIO(content)) + return list(reader) + except Exception as e: # catches everything including KeyboardInterrupt chain + print(f"parse error: {e}") + return [] + + +def convert_row(row: dict) -> dict: + """Convert raw string values — broad except prevents surfacing schema issues.""" + try: + return { + "id": int(row["id"]), + "name": row["name"].strip(), + "amount": float(row["amount"]), + } + except Exception: # hides KeyError (missing column) vs ValueError (bad data) + return {} + + +def import_file(path: str) -> ImportResult: + """Import a CSV file — catches everything so failures are silently swallowed.""" + filename = os.path.basename(path) + try: + content = read_file(path) + rows = parse_csv(content) + imported = 0 + skipped = 0 + for row in rows: + result = convert_row(row) + if result: + imported += 1 + else: + skipped += 1 + return ImportResult(filename=filename, 
rows_imported=imported, rows_skipped=skipped) + except Exception as e: # outermost catch hides all failures + return ImportResult(filename=filename, rows_imported=0, rows_skipped=0, error=str(e)) diff --git a/priv/combined_metrics/samples/error_handling/catches_specific_exception/good/file_importer.py b/priv/combined_metrics/samples/error_handling/catches_specific_exception/good/file_importer.py new file mode 100644 index 0000000..790db17 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/catches_specific_exception/good/file_importer.py @@ -0,0 +1,69 @@ +"""File importer that reads, parses, and ingests CSV data files.""" +from __future__ import annotations + +import csv +import os +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ImportResult: + filename: str + rows_imported: int + rows_skipped: int + error: Optional[str] = None + + +def read_file(path: str) -> str: + """Read a file and return its contents, raising descriptive errors.""" + try: + with open(path, encoding="utf-8") as fh: + return fh.read() + except FileNotFoundError: + raise FileNotFoundError(f"Import file not found: {path}") + except PermissionError: + raise PermissionError(f"No read permission for file: {path}") + except UnicodeDecodeError as exc: + raise ValueError(f"File {path} is not valid UTF-8") from exc + + +def parse_csv(content: str) -> list[dict[str, str]]: + """Parse CSV text into a list of row dicts.""" + import io + try: + reader = csv.DictReader(io.StringIO(content)) + return list(reader) + except csv.Error as exc: + raise ValueError(f"Malformed CSV content: {exc}") from exc + + +def convert_row(row: dict[str, str]) -> dict: + """Convert raw string values to typed fields.""" + try: + return { + "id": int(row["id"]), + "name": row["name"].strip(), + "amount": float(row["amount"]), + } + except KeyError as exc: + raise ValueError(f"Missing required column: {exc}") from exc + except (TypeError, ValueError) as exc: + raise 
ValueError(f"Type conversion failed for row {row}: {exc}") from exc + + +def import_file(path: str) -> ImportResult: + """Import a CSV file, skipping rows that fail conversion.""" + filename = os.path.basename(path) + content = read_file(path) + rows = parse_csv(content) + + imported, skipped = 0, 0 + for row in rows: + try: + convert_row(row) + imported += 1 + except ValueError: + skipped += 1 + + return ImportResult(filename=filename, rows_imported=imported, rows_skipped=skipped) diff --git a/priv/combined_metrics/samples/error_handling/catches_typed_exception/bad/PaymentGateway.php b/priv/combined_metrics/samples/error_handling/catches_typed_exception/bad/PaymentGateway.php new file mode 100644 index 0000000..db4449b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/catches_typed_exception/bad/PaymentGateway.php @@ -0,0 +1,62 @@ +httpClient->post('/v1/charges', [ + 'amount' => $amountCents, + 'source' => $token, + 'customer_id' => $customerId, + ]); + + return [ + 'transaction_id' => $response['id'], + 'status' => $response['status'], + ]; + } catch (\Exception $e) { + // Bare \Exception catches everything — loses all error specificity + $this->logger->error("Charge failed: {$e->getMessage()}"); + return null; + } + } + + public function refund($transactionId, $amountCents) + { + try { + $response = $this->httpClient->post('/v1/refunds', [ + 'transaction_id' => $transactionId, + 'amount' => $amountCents, + ]); + + return ['refund_id' => $response['refund_id']]; + } catch (\Throwable $e) { + // \Throwable is even broader — catches Errors and Exceptions alike + $this->logger->error("Refund failed: {$e->getMessage()}"); + return false; + } + } + + public function validateCard($cardNumber, $expiry, $cvv) + { + try { + return $this->httpClient->post('/v1/validate', [ + 'number' => $cardNumber, + 'expiry' => $expiry, + 'cvv' => $cvv, + ]); + } catch (\Exception $e) { + // Swallows all exceptions — caller gets null regardless of cause + return null; + } 
+ } +} diff --git a/priv/combined_metrics/samples/error_handling/catches_typed_exception/good/PaymentGateway.php b/priv/combined_metrics/samples/error_handling/catches_typed_exception/good/PaymentGateway.php new file mode 100644 index 0000000..5d49240 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/catches_typed_exception/good/PaymentGateway.php @@ -0,0 +1,62 @@ +httpClient->post('/v1/charges', [ + 'amount' => $amountCents, + 'source' => $token, + 'customer_id' => $customerId, + ]); + + return new ChargeResult( + transactionId: $response['id'], + status: $response['status'] + ); + } catch (CardDeclinedException $e) { + $this->logger->info("Card declined for customer {$customerId}: {$e->getDeclineCode()}"); + throw $e; + } catch (GatewayTimeoutException $e) { + $this->logger->error("Gateway timeout for customer {$customerId}: {$e->getMessage()}"); + throw new \RuntimeException("Payment service temporarily unavailable", 0, $e); + } catch (InvalidCardException $e) { + $this->logger->warning("Invalid card for customer {$customerId}: {$e->getMessage()}"); + throw $e; + } + } + + public function refund(string $transactionId, int $amountCents): RefundResult + { + try { + $response = $this->httpClient->post('/v1/refunds', [ + 'transaction_id' => $transactionId, + 'amount' => $amountCents, + ]); + + return new RefundResult(refundId: $response['refund_id']); + } catch (TransactionNotFoundException $e) { + $this->logger->error("Refund failed — transaction not found: {$transactionId}"); + throw $e; + } catch (RefundNotAllowedException $e) { + $this->logger->warning("Refund not allowed for {$transactionId}: {$e->getMessage()}"); + throw $e; + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/bad/processor.go b/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/bad/processor.go new file mode 100644 index 0000000..6e00094 --- /dev/null +++ 
b/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/bad/processor.go @@ -0,0 +1,61 @@ +package processor + +import ( + "errors" + "fmt" +) + +// ProcessingError wraps an underlying error but does NOT implement Unwrap. +// This prevents errors.Is and errors.As from traversing the chain. +type ProcessingError struct { + Stage string + JobID string + Err error +} + +func (e *ProcessingError) Error() string { + return fmt.Sprintf("processing job %s at stage %q: %v", e.JobID, e.Stage, e.Err) +} + +// Missing: func (e *ProcessingError) Unwrap() error { return e.Err } + +var ErrInvalidPayload = errors.New("invalid payload") + +type Job struct { + ID string + Payload []byte +} + +type PaymentProcessor struct{} + +func (p *PaymentProcessor) Process(job Job) error { + if len(job.Payload) == 0 { + return &ProcessingError{ + Stage: "validate", + JobID: job.ID, + Err: ErrInvalidPayload, + } + } + + if err := p.execute(job); err != nil { + return &ProcessingError{ + Stage: "execute", + JobID: job.ID, + Err: err, + } + } + return nil +} + +func (p *PaymentProcessor) execute(job Job) error { + return nil +} + +// HandleJob — errors.Is returns false here because Unwrap is missing. +func HandleJob(p *PaymentProcessor, job Job) { + err := p.Process(job) + // This will never be true; the error chain cannot be traversed. + if errors.Is(err, ErrInvalidPayload) { + fmt.Println("bad payload, skip job") + } +} diff --git a/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/good/processor.go b/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/good/processor.go new file mode 100644 index 0000000..591ad1c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/custom_error_type_implements_unwrap/good/processor.go @@ -0,0 +1,63 @@ +package processor + +import ( + "errors" + "fmt" +) + +// ProcessingError wraps an underlying error and adds the stage at which it occurred. 
+// It implements Unwrap so errors.Is and errors.As can traverse the chain. +type ProcessingError struct { + Stage string + JobID string + Err error +} + +func (e *ProcessingError) Error() string { + return fmt.Sprintf("processing job %s at stage %q: %v", e.JobID, e.Stage, e.Err) +} + +// Unwrap allows errors.Is and errors.As to inspect the wrapped error. +func (e *ProcessingError) Unwrap() error { return e.Err } + +var ErrInvalidPayload = errors.New("invalid payload") + +type Job struct { + ID string + Payload []byte +} + +type PaymentProcessor struct{} + +func (p *PaymentProcessor) Process(job Job) error { + if len(job.Payload) == 0 { + return &ProcessingError{ + Stage: "validate", + JobID: job.ID, + Err: ErrInvalidPayload, + } + } + + if err := p.execute(job); err != nil { + return &ProcessingError{ + Stage: "execute", + JobID: job.ID, + Err: err, + } + } + return nil +} + +func (p *PaymentProcessor) execute(job Job) error { + // simulate execution + return nil +} + +// HandleJob demonstrates that errors.Is works through ProcessingError.Unwrap. 
+func HandleJob(p *PaymentProcessor, job Job) { + err := p.Process(job) + if errors.Is(err, ErrInvalidPayload) { + // reachable because ProcessingError implements Unwrap + fmt.Println("bad payload, skip job") + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/bad/validator_test.rs b/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/bad/validator_test.rs new file mode 100644 index 0000000..dd0e094 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/bad/validator_test.rs @@ -0,0 +1,50 @@ +// Bad: assert!(result.is_ok()) discards the error — failures show no useful info + +fn validate_email(email: &str) -> Result<(), String> { + if email.contains('@') && email.contains('.') { + Ok(()) + } else { + Err(format!("'{email}' is not a valid email address")) + } +} + +fn parse_port(s: &str) -> Result { + s.parse::().map_err(|e| format!("invalid port '{s}': {e}")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_email_passes() { + // Failure output: "assertion failed" — no indication of what went wrong + assert!(validate_email("user@example.com").is_ok()); + } + + #[test] + fn invalid_email_returns_error() { + // Only checks that it's an Err — cannot see the actual message + assert!(validate_email("not-an-email").is_err()); + } + + #[test] + fn valid_port_parses() { + let result = parse_port("8080"); + // We know it's Ok but cannot verify the actual parsed value + assert!(result.is_ok()); + } + + #[test] + fn non_numeric_port_returns_error() { + // Cannot inspect what error was returned + assert!(parse_port("abc").is_err()); + } + + #[test] + fn edge_case_port() { + // If this fails, we see nothing about what parse_port returned + assert!(parse_port("65535").is_ok()); + assert!(parse_port("65536").is_err()); + } +} diff --git 
a/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/good/validator_test.rs b/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/good/validator_test.rs new file mode 100644 index 0000000..8619e93 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_assert_result_without_value/good/validator_test.rs @@ -0,0 +1,50 @@ +// Good: tests unwrap the Result so the actual error is shown on failure + +fn validate_email(email: &str) -> Result<(), String> { + if email.contains('@') && email.contains('.') { + Ok(()) + } else { + Err(format!("'{email}' is not a valid email address")) + } +} + +fn parse_port(s: &str) -> Result { + s.parse::().map_err(|e| format!("invalid port '{s}': {e}")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_email_passes() { + // Using unwrap() shows the error message when the assertion fails + validate_email("user@example.com").unwrap(); + } + + #[test] + fn invalid_email_returns_error() { + let err = validate_email("not-an-email").unwrap_err(); + assert!(err.contains("not-an-email"), "expected email in error, got: {err}"); + } + + #[test] + fn valid_port_parses() { + let port = parse_port("8080").unwrap(); + assert_eq!(port, 8080); + } + + #[test] + fn port_zero_is_rejected() { + // parse_port("0") succeeds (0 is a valid u16), but a higher-level + // validator would reject it — show the value to understand failures + let port = parse_port("0").unwrap(); + assert_eq!(port, 0); + } + + #[test] + fn non_numeric_port_returns_error() { + let err = parse_port("abc").unwrap_err(); + assert!(err.contains("abc"), "expected input in error, got: {err}"); + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/bad/FileImporter.swift b/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/bad/FileImporter.swift new file mode 100644 index 0000000..2a90f19 --- /dev/null +++ 
b/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/bad/FileImporter.swift @@ -0,0 +1,58 @@ +import Foundation + +struct ImportResult { + let recordsImported: Int + let sourceURL: URL +} + +class FileImporter { + + func importCSV(from url: URL) -> ImportResult? { + guard FileManager.default.fileExists(atPath: url.path) else { + return nil + } + + let contents: String + do { + contents = try String(contentsOf: url, encoding: .utf8) + } catch { + // Silently swallowed — caller has no idea what went wrong + return nil + } + + let lines = contents.components(separatedBy: .newlines).filter { !$0.isEmpty } + guard lines.count > 1 else { + return nil + } + + var successCount = 0 + + for (index, line) in lines.dropFirst().enumerated() { + do { + try processLine(line, index: index) + successCount += 1 + } catch { + // Silently skipping bad rows — data loss with no notification + continue + } + } + + // Returns a "success" even when half the rows were silently dropped + return ImportResult(recordsImported: successCount, sourceURL: url) + } + + func saveRecord(_ data: Data, to url: URL) { + do { + try data.write(to: url) + } catch { + // Error completely swallowed — caller thinks save succeeded + } + } + + private func processLine(_ line: String, index: Int) throws { + let columns = line.components(separatedBy: ",") + guard columns.count >= 3 else { + throw NSError(domain: "ImportError", code: 1) + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/good/FileImporter.swift b/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/good/FileImporter.swift new file mode 100644 index 0000000..031ad87 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_catch_and_suppress_errors/good/FileImporter.swift @@ -0,0 +1,66 @@ +import Foundation + +enum ImportError: Error { + case fileNotFound(URL) + case invalidFormat(String) + case permissionDenied + case 
partialFailure(succeeded: Int, failed: [Error]) +} + +struct ImportResult { + let recordsImported: Int + let sourceURL: URL +} + +class FileImporter { + private let logger: Logger + + init(logger: Logger = Logger(subsystem: "com.app", category: "importer")) { + self.logger = logger + } + + func importCSV(from url: URL) throws -> ImportResult { + guard FileManager.default.fileExists(atPath: url.path) else { + throw ImportError.fileNotFound(url) + } + + let contents: String + do { + contents = try String(contentsOf: url, encoding: .utf8) + } catch { + logger.error("Failed to read file at \(url.path): \(error)") + throw ImportError.permissionDenied + } + + let lines = contents.components(separatedBy: .newlines).filter { !$0.isEmpty } + guard lines.count > 1 else { + throw ImportError.invalidFormat("File contains no data rows") + } + + var errors: [Error] = [] + var successCount = 0 + + for (index, line) in lines.dropFirst().enumerated() { + do { + try processLine(line, index: index) + successCount += 1 + } catch { + logger.warning("Row \(index) failed: \(error)") + errors.append(error) + } + } + + if !errors.isEmpty { + throw ImportError.partialFailure(succeeded: successCount, failed: errors) + } + + return ImportResult(recordsImported: successCount, sourceURL: url) + } + + private func processLine(_ line: String, index: Int) throws { + let columns = line.components(separatedBy: ",") + guard columns.count >= 3 else { + throw ImportError.invalidFormat("Row \(index) has \(columns.count) columns, expected 3+") + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/bad/OrderProcessor.cs b/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/bad/OrderProcessor.cs new file mode 100644 index 0000000..12e1c1c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/bad/OrderProcessor.cs @@ -0,0 +1,73 @@ +using System; +using System.Data.SqlClient; +using System.IO; 
+ +namespace OrderService +{ + public class OrderProcessor + { + private readonly IOrderRepository _repository; + private readonly ILogger _logger; + + public OrderProcessor(IOrderRepository repository, ILogger logger) + { + _repository = repository; + _logger = logger; + } + + public void ProcessOrder(int orderId) + { + try + { + var order = _repository.GetById(orderId); + ValidateOrder(order); + _repository.MarkAsProcessed(order); + } + catch (Exception ex) + { + // Catches everything — hides programming errors, thread aborts, etc. + _logger.Error("Something went wrong: " + ex.Message); + } + } + + public bool TrySaveInvoice(Order order, string path) + { + try + { + var content = GenerateInvoiceContent(order); + File.WriteAllText(path, content); + return true; + } + catch (Exception) + { + // Swallows all exceptions silently, including OutOfMemoryException + return false; + } + } + + public void FinalizeOrders() + { + try + { + var pending = _repository.GetPendingOrders(); + foreach (var order in pending) + { + _repository.Finalize(order); + } + } + catch (Exception ex) + { + // Re-throwing System.Exception as a new Exception loses the specific type + throw new Exception("Finalization failed", ex); + } + } + + private void ValidateOrder(Order order) + { + if (order.Items.Count == 0) + throw new InvalidOrderException("Order must contain at least one item"); + } + + private string GenerateInvoiceContent(Order order) => $"Invoice for order {order.Id}"; + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/good/OrderProcessor.cs b/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/good/OrderProcessor.cs new file mode 100644 index 0000000..e9614dc --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_catch_general_exception/good/OrderProcessor.cs @@ -0,0 +1,68 @@ +using System; +using System.Data.SqlClient; +using System.IO; + +namespace OrderService +{ + public class 
OrderProcessor + { + private readonly IOrderRepository _repository; + private readonly ILogger _logger; + + public OrderProcessor(IOrderRepository repository, ILogger logger) + { + _repository = repository; + _logger = logger; + } + + public void ProcessOrder(int orderId) + { + try + { + var order = _repository.GetById(orderId); + ValidateOrder(order); + _repository.MarkAsProcessed(order); + } + catch (SqlException ex) + { + _logger.Error("Database error while processing order {orderId}", ex); + throw new OrderProcessingException("Failed to access order data", ex); + } + catch (InvalidOrderException ex) + { + _logger.Warning("Order {orderId} failed validation: {message}", ex.Message); + throw; + } + } + + public bool TrySaveInvoice(Order order, string path) + { + try + { + var content = GenerateInvoiceContent(order); + File.WriteAllText(path, content); + return true; + } + catch (UnauthorizedAccessException ex) + { + _logger.Warning("Cannot write invoice to {path}: access denied", ex); + return false; + } + catch (IOException ex) + { + _logger.Warning("IO error writing invoice to {path}", ex); + return false; + } + } + + private void ValidateOrder(Order order) + { + if (order.Items.Count == 0) + throw new InvalidOrderException("Order must contain at least one item"); + if (order.CustomerId <= 0) + throw new InvalidOrderException("Order must be associated with a valid customer"); + } + + private string GenerateInvoiceContent(Order order) => $"Invoice for order {order.Id}"; + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_discard_errors/bad/handler.go b/priv/combined_metrics/samples/error_handling/does_not_discard_errors/bad/handler.go new file mode 100644 index 0000000..9ecb167 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_discard_errors/bad/handler.go @@ -0,0 +1,50 @@ +package handler + +import ( + "encoding/json" + "log" + "net/http" +) + +type OrderRequest struct { + UserID string `json:"user_id"` + ProductID 
string `json:"product_id"` + Quantity int `json:"quantity"` + Price float64 `json:"price"` +} + +type OrderResponse struct { + OrderID string `json:"order_id"` + Status string `json:"status"` +} + +type OrderService interface { + PlaceOrder(req OrderRequest) (string, error) + NotifyUser(userID, orderID string) error +} + +type OrderHandler struct { + service OrderService + logger *log.Logger +} + +func NewOrderHandler(service OrderService, logger *log.Logger) *OrderHandler { + return &OrderHandler{service: service, logger: logger} +} + +func (h *OrderHandler) PlaceOrder(w http.ResponseWriter, r *http.Request) { + var req OrderRequest + // error from Decode is discarded + _ = json.NewDecoder(r.Body).Decode(&req) + + // error from PlaceOrder is discarded + orderID, _ := h.service.PlaceOrder(req) + + // error from NotifyUser is discarded + _ = h.service.NotifyUser(req.UserID, orderID) + + resp := OrderResponse{OrderID: orderID, Status: "confirmed"} + w.Header().Set("Content-Type", "application/json") + // error from Encode is discarded + _ = json.NewEncoder(w).Encode(resp) +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_discard_errors/good/handler.go b/priv/combined_metrics/samples/error_handling/does_not_discard_errors/good/handler.go new file mode 100644 index 0000000..3b4be17 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_discard_errors/good/handler.go @@ -0,0 +1,60 @@ +package handler + +import ( + "encoding/json" + "log" + "net/http" +) + +type OrderRequest struct { + UserID string `json:"user_id"` + ProductID string `json:"product_id"` + Quantity int `json:"quantity"` + Price float64 `json:"price"` +} + +type OrderResponse struct { + OrderID string `json:"order_id"` + Status string `json:"status"` +} + +type OrderService interface { + PlaceOrder(req OrderRequest) (string, error) + NotifyUser(userID, orderID string) error +} + +type OrderHandler struct { + service OrderService + logger *log.Logger +} + +func 
NewOrderHandler(service OrderService, logger *log.Logger) *OrderHandler { + return &OrderHandler{service: service, logger: logger} +} + +func (h *OrderHandler) PlaceOrder(w http.ResponseWriter, r *http.Request) { + var req OrderRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + h.logger.Printf("failed to decode request: %v", err) + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + orderID, err := h.service.PlaceOrder(req) + if err != nil { + h.logger.Printf("failed to place order for user %s: %v", req.UserID, err) + http.Error(w, "failed to place order", http.StatusInternalServerError) + return + } + + if err := h.service.NotifyUser(req.UserID, orderID); err != nil { + h.logger.Printf("failed to notify user %s for order %s: %v", req.UserID, orderID, err) + // notification failure is non-fatal; continue + } + + resp := OrderResponse{OrderID: orderID, Status: "confirmed"} + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + h.logger.Printf("failed to encode response: %v", err) + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/bad/gateway.go b/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/bad/gateway.go new file mode 100644 index 0000000..5394b67 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/bad/gateway.go @@ -0,0 +1,50 @@ +package gateway + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// ShipmentStatus is returned by the shipping gateway. +type ShipmentStatus struct { + TrackingID string + State string +} + +// ShippingGateway calls an external carrier API. 
+type ShippingGateway struct { + base string + client *http.Client +} + +func New(base string) *ShippingGateway { + return &ShippingGateway{base: base, client: &http.Client{}} +} + +// TrackShipment retrieves the current status of a shipment. +// Using %w exposes internal http, net/url, and json error types to callers, +// leaking implementation details across the abstraction boundary. +func (g *ShippingGateway) TrackShipment(ctx context.Context, trackingID string) (*ShipmentStatus, error) { + url := fmt.Sprintf("%s/shipments/%s", g.base, trackingID) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + // %w leaks *url.Error and http internals to callers. + return nil, fmt.Errorf("track shipment %q: build request: %w", trackingID, err) + } + + resp, err := g.client.Do(req) + if err != nil { + // %w propagates net/http transport types; callers now depend on them. + return nil, fmt.Errorf("track shipment %q: call carrier api: %w", trackingID, err) + } + defer resp.Body.Close() + + var status ShipmentStatus + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + // %w exposes *json.SyntaxError / *json.UnmarshalTypeError to callers. + return nil, fmt.Errorf("track shipment %q: decode response: %w", trackingID, err) + } + return &status, nil +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/good/gateway.go b/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/good/gateway.go new file mode 100644 index 0000000..fa7fc27 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_expose_implementation_errors/good/gateway.go @@ -0,0 +1,50 @@ +package gateway + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// ShipmentStatus is returned by the shipping gateway. +type ShipmentStatus struct { + TrackingID string + State string +} + +// ShippingGateway calls an external carrier API. 
+type ShippingGateway struct { + base string + client *http.Client +} + +func New(base string) *ShippingGateway { + return &ShippingGateway{base: base, client: &http.Client{}} +} + +// TrackShipment retrieves the current status of a shipment. +// Internal HTTP and JSON errors are wrapped with %v to avoid leaking +// implementation details to callers above this abstraction layer. +func (g *ShippingGateway) TrackShipment(ctx context.Context, trackingID string) (*ShipmentStatus, error) { + url := fmt.Sprintf("%s/shipments/%s", g.base, trackingID) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + // %v instead of %w: callers should not depend on http.Request internals. + return nil, fmt.Errorf("track shipment %q: build request: %v", trackingID, err) + } + + resp, err := g.client.Do(req) + if err != nil { + // %v prevents leaking net/url or transport error types. + return nil, fmt.Errorf("track shipment %q: call carrier api: %v", trackingID, err) + } + defer resp.Body.Close() + + var status ShipmentStatus + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + // %v hides JSON parsing internals from callers. 
+ return nil, fmt.Errorf("track shipment %q: decode response: %v", trackingID, err) + } + return &status, nil +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/bad/NetworkClient.swift b/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/bad/NetworkClient.swift new file mode 100644 index 0000000..49710b7 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/bad/NetworkClient.swift @@ -0,0 +1,57 @@ +import Foundation + +struct APIResponse: Decodable { + let data: T + let statusCode: Int +} + +enum NetworkError: Error { + case invalidURL + case noData + case decodingFailed(Error) +} + +class NetworkClient { + private let session: URLSession + private let baseURL: URL + + init(baseURL: URL, session: URLSession = .shared) { + self.session = session + self.baseURL = baseURL + } + + func fetch( + path: String, + completion: @escaping (Result) -> Void + ) { + // Force unwrap: crashes if path is not a valid URL + let url = URL(string: path, relativeTo: baseURL)! + + session.dataTask(with: url) { data, response, error in + // Force unwrap: crashes if data is nil + let responseData = data! + + do { + let decoded = try JSONDecoder().decode(T.self, from: responseData) + completion(.success(decoded)) + } catch { + completion(.failure(.decodingFailed(error))) + } + }.resume() + } + + func buildURL(for path: String) -> URL { + // Force unwrap: crashes on invalid input + return URL(string: path, relativeTo: baseURL)! + } + + func headerValue(for key: String, in response: HTTPURLResponse) -> String { + // Force unwrap: crashes if header is absent + return response.value(forHTTPHeaderField: key)! + } + + func firstComponent(of url: URL) -> String { + // Force unwrap: crashes if pathComponents is empty + return url.pathComponents.first! 
+ } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/good/NetworkClient.swift b/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/good/NetworkClient.swift new file mode 100644 index 0000000..242e54e --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_force_unwrap_optionals/good/NetworkClient.swift @@ -0,0 +1,66 @@ +import Foundation + +struct APIResponse: Decodable { + let data: T + let statusCode: Int +} + +enum NetworkError: Error { + case invalidURL + case noData + case decodingFailed(Error) + case unexpectedStatusCode(Int) +} + +class NetworkClient { + private let session: URLSession + private let baseURL: URL + + init(baseURL: URL, session: URLSession = .shared) { + self.session = session + self.baseURL = baseURL + } + + func fetch( + path: String, + completion: @escaping (Result) -> Void + ) { + guard let url = URL(string: path, relativeTo: baseURL) else { + completion(.failure(.invalidURL)) + return + } + + session.dataTask(with: url) { data, response, error in + if let error = error { + completion(.failure(.decodingFailed(error))) + return + } + + guard let data = data else { + completion(.failure(.noData)) + return + } + + if let httpResponse = response as? HTTPURLResponse, + !(200..<300).contains(httpResponse.statusCode) { + completion(.failure(.unexpectedStatusCode(httpResponse.statusCode))) + return + } + + do { + let decoded = try JSONDecoder().decode(T.self, from: data) + completion(.success(decoded)) + } catch { + completion(.failure(.decodingFailed(error))) + } + }.resume() + } + + func buildURL(for path: String) -> URL? { + return URL(string: path, relativeTo: baseURL) + } + + func headerValue(for key: String, in response: HTTPURLResponse) -> String? 
{ + return response.value(forHTTPHeaderField: key) + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/bad/UserRepository.cs b/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/bad/UserRepository.cs new file mode 100644 index 0000000..64fcb35 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/bad/UserRepository.cs @@ -0,0 +1,98 @@ +using System; +using System.Collections.Generic; +using System.Data.SqlClient; + +namespace UserService +{ + public class UserRepository + { + private readonly string _connectionString; + + public UserRepository(string connectionString) + { + _connectionString = connectionString; + } + + // Returns -1 on failure instead of throwing + public int GetById(int userId, out User user) + { + user = null; + try + { + using var connection = new SqlConnection(_connectionString); + connection.Open(); + var command = new SqlCommand("SELECT * FROM Users WHERE Id = @id", connection); + command.Parameters.AddWithValue("@id", userId); + + using var reader = command.ExecuteReader(); + if (!reader.Read()) + return -1; // not found + + user = MapUser(reader); + return 0; // success + } + catch (SqlException) + { + return -2; // database error + } + } + + // Returns false on failure — caller can't distinguish why it failed + public bool Create(User user) + { + if (user == null) return false; + if (string.IsNullOrWhiteSpace(user.Email)) return false; + + try + { + using var connection = new SqlConnection(_connectionString); + connection.Open(); + + var checkCmd = new SqlCommand( + "SELECT COUNT(1) FROM Users WHERE Email = @email", connection); + checkCmd.Parameters.AddWithValue("@email", user.Email); + if ((int)checkCmd.ExecuteScalar() > 0) + return false; // duplicate email, but caller doesn't know that + + var command = new SqlCommand( + "INSERT INTO Users (Email, Name) VALUES (@email, @name)", connection); + command.Parameters.AddWithValue("@email", 
user.Email); + command.Parameters.AddWithValue("@name", user.Name); + command.ExecuteNonQuery(); + return true; + } + catch (SqlException) + { + return false; + } + } + + // Returns null to signal "no users" or "error" — ambiguous + public List GetByRole(string role) + { + if (string.IsNullOrWhiteSpace(role)) return null; + + try + { + using var connection = new SqlConnection(_connectionString); + connection.Open(); + var command = new SqlCommand("SELECT * FROM Users WHERE Role = @role", connection); + command.Parameters.AddWithValue("@role", role); + + var users = new List(); + using var reader = command.ExecuteReader(); + while (reader.Read()) + users.Add(MapUser(reader)); + + return users; + } + catch (SqlException) + { + return null; // error or empty — caller cannot tell the difference + } + } + + private User MapUser(SqlDataReader reader) => + new User(reader.GetInt32(0), reader.GetString(1), reader.GetString(2)); + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/good/UserRepository.cs b/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/good/UserRepository.cs new file mode 100644 index 0000000..f113b5d --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_return_error_codes/good/UserRepository.cs @@ -0,0 +1,78 @@ +using System; +using System.Collections.Generic; +using System.Data.SqlClient; + +namespace UserService +{ + public class UserRepository + { + private readonly string _connectionString; + + public UserRepository(string connectionString) + { + _connectionString = connectionString; + } + + public User GetById(int userId) + { + using var connection = new SqlConnection(_connectionString); + connection.Open(); + var command = new SqlCommand("SELECT * FROM Users WHERE Id = @id", connection); + command.Parameters.AddWithValue("@id", userId); + + using var reader = command.ExecuteReader(); + if (!reader.Read()) + throw new UserNotFoundException($"User with ID {userId} does 
not exist."); + + return MapUser(reader); + } + + public void Create(User user) + { + if (user == null) throw new ArgumentNullException(nameof(user)); + if (string.IsNullOrWhiteSpace(user.Email)) + throw new ArgumentException("Email is required.", nameof(user)); + + using var connection = new SqlConnection(_connectionString); + connection.Open(); + + if (EmailExists(connection, user.Email)) + throw new DuplicateEmailException($"Email '{user.Email}' is already registered."); + + var command = new SqlCommand( + "INSERT INTO Users (Email, Name) VALUES (@email, @name)", connection); + command.Parameters.AddWithValue("@email", user.Email); + command.Parameters.AddWithValue("@name", user.Name); + command.ExecuteNonQuery(); + } + + public IReadOnlyList GetByRole(string role) + { + if (string.IsNullOrWhiteSpace(role)) + throw new ArgumentException("Role must not be empty.", nameof(role)); + + using var connection = new SqlConnection(_connectionString); + connection.Open(); + var command = new SqlCommand("SELECT * FROM Users WHERE Role = @role", connection); + command.Parameters.AddWithValue("@role", role); + + var users = new List(); + using var reader = command.ExecuteReader(); + while (reader.Read()) + users.Add(MapUser(reader)); + + return users.AsReadOnly(); + } + + private bool EmailExists(SqlConnection connection, string email) + { + var command = new SqlCommand( + "SELECT COUNT(1) FROM Users WHERE Email = @email", connection); + command.Parameters.AddWithValue("@email", email); + return (int)command.ExecuteScalar() > 0; + } + + private User MapUser(SqlDataReader reader) => + new User(reader.GetInt32(0), reader.GetString(1), reader.GetString(2)); + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/bad/file_processor.ex b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/bad/file_processor.ex new file mode 100644 index 0000000..e1c017f --- /dev/null +++ 
b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/bad/file_processor.ex @@ -0,0 +1,85 @@ +defmodule FileProcessor do + @moduledoc """ + Processes uploaded files and extracts their contents. + """ + + def process_file(path) do + try do + contents = File.read!(path) + parsed = parse_contents(contents) + {:ok, parsed} + rescue + _ -> nil + end + end + + def read_csv(path) do + try do + path + |> File.stream!() + |> Enum.map(&String.trim/1) + |> Enum.map(&parse_csv_row/1) + rescue + e -> false + end + end + + def extract_metadata(path) do + try do + stat = File.stat!(path) + %{size: stat.size, modified: stat.mtime} + catch + _, _ -> nil + end + end + + def batch_process(paths) do + Enum.map(paths, fn path -> + try do + process_file(path) + rescue + _ -> nil + end + end) + end + + def validate_and_process(path) do + try do + if File.exists?(path) do + process_file(path) + else + {:error, :not_found} + end + rescue + _ -> false + end + end + + def compress_file(path, dest) do + try do + contents = File.read!(path) + compressed = :zlib.compress(contents) + File.write!(dest, compressed) + :ok + rescue + _ -> nil + end + end + + def delete_processed(path) do + try do + File.rm!(path) + :ok + catch + _, _ -> false + end + end + + defp parse_contents(contents) do + String.split(contents, "\n") + end + + defp parse_csv_row(row) do + String.split(row, ",") + end +end diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/config.yml b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/config.yml new file mode 100644 index 0000000..9db16ed --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/config.yml @@ -0,0 +1 @@ +doc: "Errors must be handled or re-raised — empty rescue/catch blocks silently hide failures." 
diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/good/file_processor.ex b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/good/file_processor.ex new file mode 100644 index 0000000..6647609 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors/good/file_processor.ex @@ -0,0 +1,98 @@ +defmodule FileProcessor do + @moduledoc """ + Processes uploaded files and extracts their contents. + """ + + require Logger + + def process_file(path) do + try do + contents = File.read!(path) + parsed = parse_contents(contents) + {:ok, parsed} + rescue + e in File.Error -> + Logger.error("Failed to read file at #{path}: #{Exception.message(e)}") + {:error, {:read_failed, path}} + end + end + + def read_csv(path) do + try do + rows = + path + |> File.stream!() + |> Enum.map(&String.trim/1) + |> Enum.map(&parse_csv_row/1) + + {:ok, rows} + rescue + e in File.Error -> + Logger.error("CSV read failed for #{path}: #{Exception.message(e)}") + {:error, {:csv_read_failed, path}} + end + end + + def extract_metadata(path) do + try do + stat = File.stat!(path) + {:ok, %{size: stat.size, modified: stat.mtime}} + rescue + e in File.Error -> + Logger.warning("Could not stat file #{path}: #{Exception.message(e)}") + {:error, {:stat_failed, path}} + end + end + + def batch_process(paths) do + Enum.map(paths, fn path -> + case process_file(path) do + {:ok, result} -> {:ok, result} + {:error, reason} -> + Logger.warning("Skipping #{path} due to error: #{inspect(reason)}") + {:error, reason} + end + end) + end + + def validate_and_process(path) do + if File.exists?(path) do + process_file(path) + else + Logger.warning("Attempted to process non-existent file: #{path}") + {:error, {:file_not_found, path}} + end + end + + def compress_file(path, dest) do + try do + contents = File.read!(path) + compressed = :zlib.compress(contents) + File.write!(dest, compressed) + :ok + rescue + e in File.Error -> + 
Logger.error("Compression failed for #{path} -> #{dest}: #{Exception.message(e)}") + reraise e, __STACKTRACE__ + end + end + + def delete_processed(path) do + case File.rm(path) do + :ok -> + Logger.info("Deleted processed file: #{path}") + :ok + {:error, reason} -> + Logger.error("Failed to delete #{path}: #{inspect(reason)}") + {:error, {:delete_failed, reason}} + end + end + + defp parse_contents(contents) do + String.split(contents, "\n") + end + + defp parse_csv_row(row) do + String.split(row, ",") + end +end diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/bad/event_bus.js b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/bad/event_bus.js new file mode 100644 index 0000000..677dcc5 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/bad/event_bus.js @@ -0,0 +1,61 @@ +class EventBus { + constructor() { + this._handlers = new Map(); + } + + subscribe(eventName, handler) { + if (!this._handlers.has(eventName)) { + this._handlers.set(eventName, []); + } + this._handlers.get(eventName).push(handler); + } + + unsubscribe(eventName, handler) { + try { + const handlers = this._handlers.get(eventName); + const index = handlers.indexOf(handler); + handlers.splice(index, 1); + } catch (e) { + } + } + + async publish(eventName, payload) { + const handlers = this._handlers.get(eventName) ?? 
[]; + + for (const handler of handlers) { + try { + await handler(payload); + } catch (e) { + } + } + } + + async publishAll(events) { + for (const { name, payload } of events) { + try { + await this.publish(name, payload); + } catch { + } + } + } +} + +async function initializeAnalytics(bus) { + try { + await bus.publish("analytics:init", { timestamp: Date.now() }); + } catch (e) { + } +} + +async function loadUserPreferences(userId, bus) { + try { + const prefs = await fetch(`/api/users/${userId}/preferences`).then((r) => + r.json() + ); + await bus.publish("preferences:loaded", prefs); + } catch (err) { + } +} + +const bus = new EventBus(); +export { bus, EventBus, initializeAnalytics, loadUserPreferences }; diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/good/event_bus.js b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/good/event_bus.js new file mode 100644 index 0000000..2c13f6a --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_errors_silently/good/event_bus.js @@ -0,0 +1,66 @@ +import logger from "./logger.js"; + +class EventBus { + constructor() { + this._handlers = new Map(); + this._deadLetterQueue = []; + } + + subscribe(eventName, handler) { + if (!this._handlers.has(eventName)) { + this._handlers.set(eventName, []); + } + this._handlers.get(eventName).push(handler); + } + + unsubscribe(eventName, handler) { + const handlers = this._handlers.get(eventName); + if (!handlers) return; + const index = handlers.indexOf(handler); + if (index !== -1) { + handlers.splice(index, 1); + } + } + + async publish(eventName, payload) { + const handlers = this._handlers.get(eventName) ?? 
[]; + + for (const handler of handlers) { + try { + await handler(payload); + } catch (err) { + logger.error( + `EventBus: handler for '${eventName}' threw an error`, + err + ); + this._deadLetterQueue.push({ eventName, payload, error: err, ts: Date.now() }); + } + } + } + + async publishOrFail(eventName, payload) { + const handlers = this._handlers.get(eventName) ?? []; + + for (const handler of handlers) { + await handler(payload); + } + } + + drainDeadLetterQueue() { + const items = [...this._deadLetterQueue]; + this._deadLetterQueue.length = 0; + return items; + } +} + +async function initializeAnalytics(bus) { + try { + await bus.publish("analytics:init", { timestamp: Date.now() }); + } catch (err) { + // Analytics is non-critical; log and continue application startup + logger.warn("Analytics initialization failed, proceeding without it", err); + } +} + +const bus = new EventBus(); +export { bus, EventBus, initializeAnalytics }; diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/bad/OrderService.java b/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/bad/OrderService.java new file mode 100644 index 0000000..388899c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/bad/OrderService.java @@ -0,0 +1,69 @@ +package com.example.orders; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.logging.Logger; + +public class OrderService { + + private static final Logger logger = Logger.getLogger(OrderService.class.getName()); + + private final OrderRepository repository; + private final PaymentGateway paymentGateway; + + public OrderService(OrderRepository repository, PaymentGateway paymentGateway) { + this.repository = repository; + this.paymentGateway = paymentGateway; + } + + public Order placeOrder(Cart cart, PaymentDetails payment) { + Order order = Order.from(cart); + + try { + paymentGateway.charge(payment, order.totalAmount()); + } 
catch (PaymentDeclinedException e) { + // silently swallowed — caller will never know the charge failed + } catch (IOException e) { + // silently swallowed — network errors are completely hidden + } + + try { + repository.save(order); + } catch (SQLException e) { + // silently swallowed — order may not have been saved at all + } + + return order; + } + + public void cancelOrder(String orderId) { + Order order = null; + try { + order = repository.findById(orderId); + } catch (SQLException e) { + // silently swallowed — order is null but execution continues + } + + if (order == null) { + return; + } + + order.cancel(); + + try { + repository.update(order); + } catch (SQLException e) { + // silently swallowed — cancellation may not have been persisted + } + } + + public double getOrderTotal(String orderId) { + try { + Order order = repository.findById(orderId); + return order.totalAmount(); + } catch (Exception e) { + // catch-all swallowed; returns 0 as if the order doesn't exist + return 0.0; + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/good/OrderService.java b/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/good/OrderService.java new file mode 100644 index 0000000..2a9655c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_swallow_exceptions/good/OrderService.java @@ -0,0 +1,73 @@ +package com.example.orders; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class OrderService { + + private static final Logger logger = Logger.getLogger(OrderService.class.getName()); + + private final OrderRepository repository; + private final PaymentGateway paymentGateway; + + public OrderService(OrderRepository repository, PaymentGateway paymentGateway) { + this.repository = repository; + this.paymentGateway = paymentGateway; + } + + public Order placeOrder(Cart cart, PaymentDetails payment) { 
+ Order order = Order.from(cart); + + try { + paymentGateway.charge(payment, order.totalAmount()); + } catch (PaymentDeclinedException e) { + // Payment was declined by the gateway; surface this to the caller + // so they can prompt the user to retry with different details. + throw new OrderPlacementException("Payment declined: " + e.getReason(), e); + } catch (IOException e) { + // Network error communicating with the payment gateway. + // Log at ERROR level and rethrow so the caller can handle retries. + logger.log(Level.SEVERE, "Network failure while charging payment for order", e); + throw new OrderPlacementException("Payment gateway unreachable", e); + } + + try { + repository.save(order); + } catch (SQLException e) { + // Database write failed after successful payment — log with order + // context so support can reconcile the charge manually. + logger.log(Level.SEVERE, "Failed to persist order after successful payment: orderId=" + order.getId(), e); + throw new OrderPlacementException("Order could not be saved", e); + } + + return order; + } + + public void cancelOrder(String orderId) { + Order order; + try { + order = repository.findById(orderId); + } catch (SQLException e) { + // Could not load order from the database; rethrow with context. + logger.log(Level.WARNING, "Database error looking up order: " + orderId, e); + throw new OrderServiceException("Unable to retrieve order " + orderId, e); + } + + if (order == null) { + throw new OrderNotFoundException(orderId); + } + + order.cancel(); + + try { + repository.update(order); + } catch (SQLException e) { + // Persisting the cancellation status failed; rethrow so the caller + // knows the cancellation did not complete successfully. 
+ logger.log(Level.SEVERE, "Failed to persist cancellation for orderId=" + orderId, e); + throw new OrderServiceException("Cancellation could not be saved", e); + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/bad/FileProcessor.cs b/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/bad/FileProcessor.cs new file mode 100644 index 0000000..473b49a --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/bad/FileProcessor.cs @@ -0,0 +1,81 @@ +using System; +using System.IO; + +namespace FileProcessing +{ + public class FileProcessor + { + private readonly ILogger _logger; + + public FileProcessor(ILogger logger) + { + _logger = logger; + } + + public string ReadAndProcess(string filePath) + { + StreamReader reader = null; + try + { + reader = new StreamReader(filePath); + var content = reader.ReadToEnd(); + return Transform(content); + } + catch (FileNotFoundException ex) + { + _logger.Error("Input file not found", ex); + throw; + } + finally + { + reader?.Close(); + // Throws from finally — suppresses the FileNotFoundException above + if (!File.Exists(filePath + ".processed")) + throw new InvalidOperationException("Processed marker missing."); + } + } + + public void ProcessBatch(string[] filePaths) + { + FileStream lockFile = null; + try + { + lockFile = AcquireLock(); + foreach (var path in filePaths) + { + ProcessSingleFile(path); + } + } + finally + { + lockFile?.Close(); + + // Validating state inside finally and throwing — bad practice. + // Any exception thrown during batch processing is now lost. 
+ var pendingCount = CountPending(filePaths); + if (pendingCount > 0) + throw new InvalidOperationException( + $"{pendingCount} files were not processed."); + } + } + + private string Transform(string content) => content.Trim().ToUpperInvariant(); + + private void ProcessSingleFile(string path) + { + var content = File.ReadAllText(path); + File.WriteAllText(path + ".out", Transform(content)); + } + + private int CountPending(string[] filePaths) + { + int count = 0; + foreach (var p in filePaths) + if (!File.Exists(p + ".out")) count++; + return count; + } + + private FileStream AcquireLock() => + new FileStream("/tmp/processor.lock", FileMode.Create, FileAccess.Write); + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/good/FileProcessor.cs b/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/good/FileProcessor.cs new file mode 100644 index 0000000..dc5510b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_throw_from_finally_block/good/FileProcessor.cs @@ -0,0 +1,84 @@ +using System; +using System.IO; + +namespace FileProcessing +{ + public class FileProcessor + { + private readonly ILogger _logger; + + public FileProcessor(ILogger logger) + { + _logger = logger; + } + + public string ReadAndProcess(string filePath) + { + StreamReader reader = null; + try + { + reader = new StreamReader(filePath); + var content = reader.ReadToEnd(); + return Transform(content); + } + catch (FileNotFoundException ex) + { + _logger.Error("Input file not found: {path}", ex); + throw; + } + finally + { + // Finally only performs cleanup — never throws + try + { + reader?.Close(); + } + catch (IOException ex) + { + // Log but do not rethrow; we must not suppress the original exception + _logger.Warning("Failed to close reader cleanly", ex); + } + } + } + + public void ProcessBatch(string[] filePaths) + { + FileStream lockFile = null; + try + { + lockFile = AcquireLock(); + foreach (var 
path in filePaths) + { + ProcessSingleFile(path); + } + } + finally + { + // Cleanup only — releasing the lock must not throw out of finally + if (lockFile != null) + { + try + { + lockFile.Close(); + File.Delete(lockFile.Name); + } + catch (IOException ex) + { + _logger.Warning("Lock file cleanup failed", ex); + } + } + } + } + + private string Transform(string content) => content.Trim().ToUpperInvariant(); + + private void ProcessSingleFile(string path) + { + var content = File.ReadAllText(path); + File.WriteAllText(path + ".out", Transform(content)); + } + + private FileStream AcquireLock() => + new FileStream("/tmp/processor.lock", FileMode.Create, FileAccess.Write); + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/bad/ProductCatalog.cs b/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/bad/ProductCatalog.cs new file mode 100644 index 0000000..90547ea --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/bad/ProductCatalog.cs @@ -0,0 +1,94 @@ +using System; +using System.Collections.Generic; + +namespace Catalog +{ + public class ProductCatalog + { + private readonly Dictionary _products = new(); + + // Using exception to detect "not found" as normal control flow + public Product GetProduct(string sku) + { + try + { + return _products[sku]; // throws KeyNotFoundException for missing keys + } + catch (KeyNotFoundException) + { + return null; + } + } + + // Using FormatException to drive parsing logic + public int ParseQuantity(string input) + { + try + { + return int.Parse(input); + } + catch (FormatException) + { + return 0; + } + catch (OverflowException) + { + return 0; + } + } + + // Using exception to check discount applicability + public decimal ApplyDiscount(string couponCode, decimal originalPrice) + { + try + { + decimal rate = _discountMap[couponCode]; // throws if not found + return originalPrice * (1 - 
rate); + } + catch (KeyNotFoundException) + { + return originalPrice; // no discount — but this is expected, not exceptional + } + } + + public decimal CalculateTotalPrice(string sku, string quantityInput, string couponCode) + { + var product = GetProduct(sku); + if (product == null) return 0m; + + int quantity = ParseQuantity(quantityInput); + if (quantity <= 0) return 0m; + + decimal lineTotal = product.UnitPrice * quantity; + return ApplyDiscount(couponCode, lineTotal); + } + + public IReadOnlyList SearchByCategory(string category) + { + var results = new List(); + try + { + foreach (var product in _products.Values) + { + // Throwing to break from nested search — very bad pattern + if (results.Count >= 50) + throw new InvalidOperationException("limit reached"); + + if (product.Category.Equals(category, StringComparison.OrdinalIgnoreCase)) + results.Add(product); + } + } + catch (InvalidOperationException) + { + // silently stop — exception used as a loop break + } + return results.AsReadOnly(); + } + + private readonly Dictionary _discountMap = new() + { + ["SAVE10"] = 0.10m, + ["SAVE20"] = 0.20m, + }; + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/good/ProductCatalog.cs b/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/good/ProductCatalog.cs new file mode 100644 index 0000000..4c79b6b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_exceptions_for_control_flow/good/ProductCatalog.cs @@ -0,0 +1,67 @@ +using System; +using System.Collections.Generic; + +namespace Catalog +{ + public class ProductCatalog + { + private readonly Dictionary _products = new(); + + public bool TryGetProduct(string sku, out Product product) + { + return _products.TryGetValue(sku, out product); + } + + public bool TryParseQuantity(string input, out int quantity) + { + return int.TryParse(input, out quantity) && quantity > 0; + } + + public bool 
TryApplyDiscount(string couponCode, decimal originalPrice, out decimal discountedPrice) + { + discountedPrice = originalPrice; + + if (string.IsNullOrWhiteSpace(couponCode)) + return false; + + if (!_discountMap.TryGetValue(couponCode, out decimal rate)) + return false; + + discountedPrice = originalPrice * (1 - rate); + return true; + } + + public decimal CalculateTotalPrice(string sku, string quantityInput, string couponCode) + { + if (!TryGetProduct(sku, out var product)) + return 0m; + + if (!TryParseQuantity(quantityInput, out int quantity)) + return 0m; + + decimal lineTotal = product.UnitPrice * quantity; + + if (TryApplyDiscount(couponCode, lineTotal, out decimal discounted)) + return discounted; + + return lineTotal; + } + + public IReadOnlyList SearchByCategory(string category) + { + var results = new List(); + foreach (var product in _products.Values) + { + if (product.Category.Equals(category, StringComparison.OrdinalIgnoreCase)) + results.Add(product); + } + return results.AsReadOnly(); + } + + private readonly Dictionary _discountMap = new() + { + ["SAVE10"] = 0.10m, + ["SAVE20"] = 0.20m, + }; + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_force_try/bad/DataLoader.swift b/priv/combined_metrics/samples/error_handling/does_not_use_force_try/bad/DataLoader.swift new file mode 100644 index 0000000..9e973ac --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_force_try/bad/DataLoader.swift @@ -0,0 +1,49 @@ +import Foundation + +enum DataLoaderError: Error { + case fileNotFound(String) + case decodingFailed(Error) +} + +struct UserPreferences: Codable { + var theme: String + var notificationsEnabled: Bool + var language: String +} + +class DataLoader { + private let fileManager: FileManager + private let documentsURL: URL + + init(fileManager: FileManager = .default) { + self.fileManager = fileManager + self.documentsURL = fileManager.urls(for: .documentDirectory, in: .userDomainMask)[0] + } + + func 
loadPreferences(from filename: String) -> UserPreferences { + let fileURL = documentsURL.appendingPathComponent(filename) + + // try! crashes the app if the file is missing or malformed + let data = try! Data(contentsOf: fileURL) + return try! JSONDecoder().decode(UserPreferences.self, from: data) + } + + func savePreferences(_ preferences: UserPreferences, to filename: String) { + let fileURL = documentsURL.appendingPathComponent(filename) + + // try! crashes if encoding or writing fails + let data = try! JSONEncoder().encode(preferences) + try! data.write(to: fileURL, options: .atomic) + } + + func loadJSON(from url: URL, as type: T.Type) -> T { + // try! on remote or file URL will crash for any network/IO error + let data = try! Data(contentsOf: url) + return try! JSONDecoder().decode(type, from: data) + } + + func parseRegex(pattern: String) -> NSRegularExpression { + // try! will crash for invalid regex patterns + return try! NSRegularExpression(pattern: pattern, options: []) + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_force_try/good/DataLoader.swift b/priv/combined_metrics/samples/error_handling/does_not_use_force_try/good/DataLoader.swift new file mode 100644 index 0000000..d7da4fc --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_force_try/good/DataLoader.swift @@ -0,0 +1,60 @@ +import Foundation + +enum DataLoaderError: Error { + case fileNotFound(String) + case decodingFailed(Error) + case encodingFailed(Error) +} + +struct UserPreferences: Codable { + var theme: String + var notificationsEnabled: Bool + var language: String +} + +class DataLoader { + private let fileManager: FileManager + private let documentsURL: URL + + init(fileManager: FileManager = .default) { + self.fileManager = fileManager + self.documentsURL = fileManager.urls(for: .documentDirectory, in: .userDomainMask)[0] + } + + func loadPreferences(from filename: String) -> Result { + let fileURL = 
documentsURL.appendingPathComponent(filename) + + guard fileManager.fileExists(atPath: fileURL.path) else { + return .failure(.fileNotFound(filename)) + } + + do { + let data = try Data(contentsOf: fileURL) + let preferences = try JSONDecoder().decode(UserPreferences.self, from: data) + return .success(preferences) + } catch let error as DecodingError { + return .failure(.decodingFailed(error)) + } catch { + return .failure(.decodingFailed(error)) + } + } + + func savePreferences(_ preferences: UserPreferences, to filename: String) throws { + let fileURL = documentsURL.appendingPathComponent(filename) + do { + let data = try JSONEncoder().encode(preferences) + try data.write(to: fileURL, options: .atomic) + } catch { + throw DataLoaderError.encodingFailed(error) + } + } + + func loadJSON(from url: URL, as type: T.Type) throws -> T { + do { + let data = try Data(contentsOf: url) + return try JSONDecoder().decode(type, from: data) + } catch { + throw DataLoaderError.decodingFailed(error) + } + } +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/bad/store.go b/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/bad/store.go new file mode 100644 index 0000000..807d8ef --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/bad/store.go @@ -0,0 +1,53 @@ +package store + +import ( + "context" + "database/sql" + "errors" +) + +type User struct { + ID int64 + Name string + Email string +} + +type UserStore struct { + db *sql.DB +} + +func NewUserStore(db *sql.DB) *UserStore { + return &UserStore{db: db} +} + +// FindByEmail returns the user's name or empty string if not found. +// Callers must check for empty string to detect failure. 
+func (s *UserStore) FindByEmail(ctx context.Context, email string) string { + row := s.db.QueryRowContext(ctx, + `SELECT name FROM users WHERE email = $1`, email) + + var name string + if err := row.Scan(&name); err != nil { + // returns sentinel "" to signal failure — callers can't distinguish + // "not found" from a real DB error + return "" + } + return name +} + +// FindByID returns the user ID or -1 if not found. +// Callers must check for -1 to detect failure. +func (s *UserStore) FindByID(ctx context.Context, id int64) int64 { + row := s.db.QueryRowContext(ctx, + `SELECT id FROM users WHERE id = $1`, id) + + var found int64 + if err := row.Scan(&found); err != nil { + if errors.Is(err, sql.ErrNoRows) { + // returns sentinel -1 to signal "not found" + return -1 + } + return -1 + } + return found +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/good/store.go b/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/good/store.go new file mode 100644 index 0000000..eaa1e8a --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_inband_error_values/good/store.go @@ -0,0 +1,56 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" +) + +var ErrNotFound = errors.New("record not found") + +type User struct { + ID int64 + Name string + Email string +} + +type UserStore struct { + db *sql.DB +} + +func NewUserStore(db *sql.DB) *UserStore { + return &UserStore{db: db} +} + +// FindByEmail returns the user with the given email address. +// Returns ErrNotFound if no such user exists. 
+func (s *UserStore) FindByEmail(ctx context.Context, email string) (*User, error) { + row := s.db.QueryRowContext(ctx, + `SELECT id, name, email FROM users WHERE email = $1`, email) + + var u User + if err := row.Scan(&u.ID, &u.Name, &u.Email); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + return nil, fmt.Errorf("find user by email %q: %w", email, err) + } + return &u, nil +} + +// FindByID returns the user with the given ID. +// Returns ErrNotFound if no such user exists. +func (s *UserStore) FindByID(ctx context.Context, id int64) (*User, error) { + row := s.db.QueryRowContext(ctx, + `SELECT id, name, email FROM users WHERE id = $1`, id) + + var u User + if err := row.Scan(&u.ID, &u.Name, &u.Email); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + return nil, fmt.Errorf("find user by id %d: %w", id, err) + } + return &u, nil +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/bad/client.rs b/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/bad/client.rs new file mode 100644 index 0000000..f8747ae --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/bad/client.rs @@ -0,0 +1,48 @@ +use std::collections::HashMap; +use std::time::Duration; + +pub struct HttpClient { + base_url: String, + timeout: Duration, + headers: HashMap, +} + +impl HttpClient { + pub fn new(base_url: &str, timeout_secs: u64) -> Self { + // unwrap() in constructor: if base_url is empty this is confusing to debug + let parsed = base_url.strip_prefix("https://").unwrap(); + let _ = parsed; // not used, just demonstrating the unwrap + + Self { + base_url: base_url.to_string(), + timeout: Duration::from_secs(timeout_secs), + headers: HashMap::new(), + } + } + + pub fn set_auth_token(&mut self, token: Option<&str>) { + // unwrap() here panics if caller passes None — no graceful handling + let tok = token.unwrap(); + 
self.headers.insert("Authorization".to_string(), format!("Bearer {tok}")); + } + + pub fn get(&self, path: &str) -> String { + let url = format!("{}{}", self.base_url, path); + let response = self.execute(&url); + // unwrap() on production path — any error panics the whole process + response.unwrap() + } + + fn execute(&self, url: &str) -> Result { + if url.contains("unreachable") { + return Err(format!("cannot connect to {url}")); + } + Ok(format!("OK from {url}")) + } +} + +pub fn fetch_user_profile(client: &HttpClient, user_id: u64) -> String { + let path = format!("/users/{user_id}"); + // Returns a String — caller has no way to handle errors + client.get(&path) +} diff --git a/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/good/client.rs b/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/good/client.rs new file mode 100644 index 0000000..6605d25 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/does_not_use_unwrap_in_production/good/client.rs @@ -0,0 +1,64 @@ +use std::time::Duration; + +#[derive(Debug)] +pub enum ClientError { + InvalidUrl(String), + ConnectionFailed(String), + Timeout, + BadResponse { status: u16, body: String }, +} + +impl std::fmt::Display for ClientError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ClientError::InvalidUrl(u) => write!(f, "invalid URL: {u}"), + ClientError::ConnectionFailed(msg) => write!(f, "connection failed: {msg}"), + ClientError::Timeout => write!(f, "request timed out"), + ClientError::BadResponse { status, body } => { + write!(f, "unexpected status {status}: {body}") + } + } + } +} + +pub struct HttpClient { + base_url: String, + timeout: Duration, +} + +impl HttpClient { + pub fn new(base_url: impl Into, timeout_secs: u64) -> Result { + let base_url = base_url.into(); + if !base_url.starts_with("http://") && !base_url.starts_with("https://") { + return Err(ClientError::InvalidUrl(base_url)); + } 
+ Ok(Self { + base_url, + timeout: Duration::from_secs(timeout_secs), + }) + } + + pub fn get(&self, path: &str) -> Result { + let url = format!("{}{}", self.base_url, path); + // Simulated HTTP call — real impl would use reqwest or hyper + self.execute_request(&url) + } + + fn execute_request(&self, url: &str) -> Result { + if url.contains("unreachable") { + return Err(ClientError::ConnectionFailed(format!( + "host not reachable for {url}" + ))); + } + if self.timeout < Duration::from_millis(1) { + return Err(ClientError::Timeout); + } + Ok(format!("200 OK from {url}")) + } +} + +pub fn fetch_user_profile(client: &HttpClient, user_id: u64) -> Result { + let path = format!("/users/{user_id}/profile"); + let body = client.get(&path)?; + Ok(body) +} diff --git a/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/bad/billing.ex b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/bad/billing.ex new file mode 100644 index 0000000..351c5de --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/bad/billing.ex @@ -0,0 +1,83 @@ +defmodule Billing do + @moduledoc """ + Handles billing and invoice generation. 
+ """ + + def create_invoice(user_id, items) do + case fetch_user(user_id) do + nil -> {:error, :error} + user -> build_invoice(user, items) + end + end + + def charge_customer(customer_id, amount) do + if amount <= 0 do + raise "error" + end + + case find_payment_method(customer_id) do + nil -> {:error, ""} + method -> process_charge(method, amount) + end + end + + def apply_discount(invoice, code) do + case lookup_discount_code(code) do + nil -> {:error, :not_found} + discount -> + if discount.expired do + {:error, :expired} + else + {:ok, apply(invoice, discount)} + end + end + end + + def issue_refund(invoice_id, amount) do + case get_invoice(invoice_id) do + nil -> + {:error, :error} + invoice -> + if amount > invoice.total do + raise "bad amount" + else + process_refund(invoice, amount) + end + end + end + + def update_billing_address(customer_id, address) do + if address == nil or address == "" do + {:error, ""} + else + case find_customer(customer_id) do + nil -> {:error, :error} + customer -> save_address(customer, address) + end + end + end + + def send_invoice(invoice_id, email) do + case get_invoice(invoice_id) do + nil -> {:error, :missing} + invoice -> + case validate_email(email) do + false -> {:error, :bad} + true -> dispatch_email(invoice, email) + end + end + end + + defp fetch_user(_id), do: nil + defp find_payment_method(_id), do: nil + defp process_charge(_method, _amount), do: {:ok, %{}} + defp lookup_discount_code(_code), do: nil + defp apply(_invoice, _discount), do: %{} + defp get_invoice(_id), do: nil + defp process_refund(_invoice, _amount), do: {:ok, %{}} + defp find_customer(_id), do: nil + defp save_address(_customer, _address), do: {:ok, %{}} + defp validate_email(_email), do: true + defp dispatch_email(_invoice, _email), do: :ok + defp build_invoice(_user, _items), do: {:ok, %{}} +end diff --git a/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/config.yml 
b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/config.yml new file mode 100644 index 0000000..6a3115e --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/config.yml @@ -0,0 +1 @@ +doc: "Error values should carry a meaningful message, not just a bare atom or empty string." diff --git a/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/good/billing.ex b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/good/billing.ex new file mode 100644 index 0000000..270a35b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_message_is_descriptive/good/billing.ex @@ -0,0 +1,93 @@ +defmodule Billing do + @moduledoc """ + Handles billing and invoice generation. + """ + + def create_invoice(user_id, items) do + case fetch_user(user_id) do + nil -> + {:error, "User #{user_id} not found, cannot create invoice"} + user -> + build_invoice(user, items) + end + end + + def charge_customer(customer_id, amount) do + if amount <= 0 do + raise ArgumentError, "Charge amount must be positive, got: #{amount}" + end + + case find_payment_method(customer_id) do + nil -> + {:error, "No payment method on file for customer #{customer_id}"} + method -> + process_charge(method, amount) + end + end + + def apply_discount(invoice, code) do + case lookup_discount_code(code) do + nil -> + {:error, "Discount code #{inspect(code)} does not exist"} + %{expired: true, expires_at: expires_at} -> + {:error, "Discount code #{inspect(code)} expired on #{expires_at}"} + discount -> + {:ok, apply_to_invoice(invoice, discount)} + end + end + + def issue_refund(invoice_id, amount) do + case get_invoice(invoice_id) do + nil -> + {:error, "Invoice #{invoice_id} not found, cannot issue refund"} + invoice -> + if amount > invoice.total do + raise ArgumentError, + "Refund amount #{amount} exceeds invoice total #{invoice.total} for invoice #{invoice_id}" + else + process_refund(invoice, 
amount) + end + end + end + + def update_billing_address(customer_id, address) do + cond do + is_nil(address) -> + {:error, "Billing address for customer #{customer_id} cannot be nil"} + address == "" -> + {:error, "Billing address for customer #{customer_id} cannot be empty"} + true -> + case find_customer(customer_id) do + nil -> {:error, "Customer #{customer_id} not found"} + customer -> save_address(customer, address) + end + end + end + + def send_invoice(invoice_id, email) do + case get_invoice(invoice_id) do + nil -> + {:error, "Invoice #{invoice_id} not found, cannot send"} + invoice -> + case validate_email(email) do + false -> + {:error, "Cannot send invoice #{invoice_id}: #{inspect(email)} is not a valid email"} + true -> + dispatch_email(invoice, email) + end + end + end + + defp fetch_user(_id), do: nil + defp find_payment_method(_id), do: nil + defp process_charge(_method, _amount), do: {:ok, %{}} + defp lookup_discount_code(_code), do: nil + defp apply_to_invoice(_invoice, _discount), do: %{} + defp get_invoice(_id), do: nil + defp process_refund(_invoice, _amount), do: {:ok, %{}} + defp find_customer(_id), do: nil + defp save_address(_customer, _address), do: {:ok, %{}} + defp validate_email(_email), do: true + defp dispatch_email(_invoice, _email), do: :ok + defp build_invoice(_user, _items), do: {:ok, %{}} +end diff --git a/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/bad/service.go b/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/bad/service.go new file mode 100644 index 0000000..57413c6 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/bad/service.go @@ -0,0 +1,54 @@ +package service + +import ( + "errors" + "fmt" +) + +type Product struct { + ID string + Stock int + Price float64 +} + +type Inventory interface { + Get(id string) (*Product, error) + Decrement(id string, qty int) error +} + +type CartService struct { + inventory Inventory +} + 
+func NewCartService(inv Inventory) *CartService { + return &CartService{inventory: inv} +} + +var ( + // Error strings are capitalized and end with punctuation — bad practice. + ErrProductNotFound = errors.New("Product not found.") + ErrInsufficientStock = errors.New("Insufficient stock.") +) + +func (s *CartService) AddToCart(productID string, qty int) error { + if qty <= 0 { + // Capitalized and ends with period — will look odd when embedded in larger messages. + return fmt.Errorf("Quantity must be positive, got %d.", qty) + } + + product, err := s.inventory.Get(productID) + if err != nil { + // Capitalized start and trailing period break embedding. + return fmt.Errorf("Failed to get product %q: %w.", productID, err) + } + + if product.Stock < qty { + return fmt.Errorf("Not enough stock for product %q. Has %d, requested %d.", + productID, product.Stock, qty) + } + + if err := s.inventory.Decrement(productID, qty); err != nil { + return fmt.Errorf("Could not decrement stock for %q: %w.", productID, err) + } + return nil +} diff --git a/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/good/service.go b/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/good/service.go new file mode 100644 index 0000000..5c2c49f --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_string_not_capitalized/good/service.go @@ -0,0 +1,51 @@ +package service + +import ( + "errors" + "fmt" +) + +type Product struct { + ID string + Stock int + Price float64 +} + +type Inventory interface { + Get(id string) (*Product, error) + Decrement(id string, qty int) error +} + +type CartService struct { + inventory Inventory +} + +func NewCartService(inv Inventory) *CartService { + return &CartService{inventory: inv} +} + +var ( + ErrProductNotFound = errors.New("product not found") + ErrInsufficientStock = errors.New("insufficient stock") +) + +func (s *CartService) AddToCart(productID string, qty int) error { + if qty <= 0 { + return 
fmt.Errorf("quantity must be positive, got %d", qty) + } + + product, err := s.inventory.Get(productID) + if err != nil { + return fmt.Errorf("add to cart: get product %q: %w", productID, err) + } + + if product.Stock < qty { + return fmt.Errorf("add to cart: product %q has %d in stock, requested %d: %w", + productID, product.Stock, qty, ErrInsufficientStock) + } + + if err := s.inventory.Decrement(productID, qty); err != nil { + return fmt.Errorf("add to cart: decrement stock for %q: %w", productID, err) + } + return nil +} diff --git a/priv/combined_metrics/samples/error_handling/error_type_includes_context/bad/client.go b/priv/combined_metrics/samples/error_handling/error_type_includes_context/bad/client.go new file mode 100644 index 0000000..a4c354d --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_type_includes_context/bad/client.go @@ -0,0 +1,47 @@ +package client + +import ( + "errors" + "fmt" + "net/http" +) + +// APIError carries no useful context about what failed or where. 
+type APIError struct { + Message string +} + +func (e *APIError) Error() string { + return e.Message +} + +type PaymentClient struct { + base string + client *http.Client +} + +func NewPaymentClient(base string) *PaymentClient { + return &PaymentClient{base: base, client: &http.Client{}} +} + +func (c *PaymentClient) Charge(orderID string, amountCents int) error { + url := fmt.Sprintf("%s/orders/%s/charge", c.base, orderID) + req, err := http.NewRequest(http.MethodPost, url, nil) + if err != nil { + // wraps nothing; caller cannot recover the original error or know the URL + return &APIError{Message: "request failed"} + } + + resp, err := c.client.Do(req) + if err != nil { + // loses the original transport error entirely + return errors.New("request failed") + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + // no operation, resource, or status code included + return &APIError{Message: "unexpected response"} + } + return nil +} diff --git a/priv/combined_metrics/samples/error_handling/error_type_includes_context/good/client.go b/priv/combined_metrics/samples/error_handling/error_type_includes_context/good/client.go new file mode 100644 index 0000000..4309e53 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/error_type_includes_context/good/client.go @@ -0,0 +1,56 @@ +package client + +import ( + "fmt" + "net/http" +) + +// APIError captures the HTTP operation, the target resource, the HTTP status +// code, and the underlying transport error when one occurs. 
+type APIError struct { + Method string + Resource string + Status int + Err error +} + +func (e *APIError) Error() string { + if e.Err != nil { + return fmt.Sprintf("%s %s: status %d: %v", e.Method, e.Resource, e.Status, e.Err) + } + return fmt.Sprintf("%s %s: status %d", e.Method, e.Resource, e.Status) +} + +func (e *APIError) Unwrap() error { return e.Err } + +type PaymentClient struct { + base string + client *http.Client +} + +func NewPaymentClient(base string) *PaymentClient { + return &PaymentClient{base: base, client: &http.Client{}} +} + +func (c *PaymentClient) Charge(orderID string, amountCents int) error { + url := fmt.Sprintf("%s/orders/%s/charge", c.base, orderID) + req, err := http.NewRequest(http.MethodPost, url, nil) + if err != nil { + return &APIError{Method: "POST", Resource: url, Err: err} + } + + resp, err := c.client.Do(req) + if err != nil { + return &APIError{Method: "POST", Resource: url, Err: err} + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return &APIError{ + Method: "POST", + Resource: url, + Status: resp.StatusCode, + } + } + return nil +} diff --git a/priv/combined_metrics/samples/error_handling/no_bare_except/bad/api_client.py b/priv/combined_metrics/samples/error_handling/no_bare_except/bad/api_client.py new file mode 100644 index 0000000..5f4386c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_bare_except/bad/api_client.py @@ -0,0 +1,60 @@ +"""HTTP API client with retry logic and structured error handling.""" +from __future__ import annotations + +import json +import time +from dataclasses import dataclass +from typing import Any, Optional +from urllib.request import urlopen + + +@dataclass +class ApiResponse: + status_code: int + body: dict[str, Any] + latency_ms: float + + +def get(url: str, timeout: float = 5.0) -> Optional[ApiResponse]: + """Perform a GET request — bare except swallows KeyboardInterrupt and SystemExit.""" + start = time.monotonic() + try: + with urlopen(url, 
timeout=timeout) as resp: + raw = resp.read().decode("utf-8") + elapsed = (time.monotonic() - start) * 1000 + return ApiResponse( + status_code=resp.status, + body=json.loads(raw), + latency_ms=round(elapsed, 2), + ) + except: # bare except — catches EVERYTHING including Ctrl+C + return None + + +def get_with_retry( + url: str, + retries: int = 3, + backoff: float = 1.0, + timeout: float = 5.0, +) -> Optional[ApiResponse]: + """Retry a GET — bare except in retry loop makes Ctrl+C impossible to act on.""" + for attempt in range(1, retries + 1): + try: + response = get(url, timeout=timeout) + if response is not None: + return response + except: # bare except — user cannot interrupt a long retry loop + pass + time.sleep(backoff * attempt) + return None + + +def batch_fetch(urls: list[str]) -> list[Optional[ApiResponse]]: + """Fetch multiple URLs — each bare except silently discards all error context.""" + results = [] + for url in urls: + try: + results.append(get(url)) + except: # can't distinguish network vs programming error + results.append(None) + return results diff --git a/priv/combined_metrics/samples/error_handling/no_bare_except/good/api_client.py b/priv/combined_metrics/samples/error_handling/no_bare_except/good/api_client.py new file mode 100644 index 0000000..87e96ad --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_bare_except/good/api_client.py @@ -0,0 +1,71 @@ +"""HTTP API client with retry logic and structured error handling.""" +from __future__ import annotations + +import time +from dataclasses import dataclass +from typing import Any, Optional +from urllib.error import HTTPError, URLError +from urllib.request import urlopen +import json + + +@dataclass +class ApiResponse: + status_code: int + body: dict[str, Any] + latency_ms: float + + +class ApiClientError(Exception): + """Base error for all API client failures.""" + + +class NetworkError(ApiClientError): + """Raised when the network is unreachable.""" + + +class 
HttpError(ApiClientError): + """Raised when the server returns a 4xx or 5xx response.""" + + def __init__(self, status_code: int, message: str) -> None: + super().__init__(message) + self.status_code = status_code + + +def get(url: str, timeout: float = 5.0) -> ApiResponse: + """Perform a GET request and return a structured response.""" + start = time.monotonic() + try: + with urlopen(url, timeout=timeout) as resp: + raw = resp.read().decode("utf-8") + elapsed = (time.monotonic() - start) * 1000 + return ApiResponse( + status_code=resp.status, + body=json.loads(raw), + latency_ms=round(elapsed, 2), + ) + except HTTPError as exc: + raise HttpError(exc.code, f"Server returned {exc.code} for {url}") from exc + except URLError as exc: + raise NetworkError(f"Could not reach {url}: {exc.reason}") from exc + except json.JSONDecodeError as exc: + raise ApiClientError(f"Invalid JSON from {url}") from exc + + +def get_with_retry( + url: str, + retries: int = 3, + backoff: float = 1.0, + timeout: float = 5.0, +) -> Optional[ApiResponse]: + """Retry a GET request on network errors only; re-raise HTTP errors immediately.""" + for attempt in range(1, retries + 1): + try: + return get(url, timeout=timeout) + except NetworkError: + if attempt == retries: + raise + time.sleep(backoff * attempt) + except HttpError: + raise # do not retry server-side errors + return None diff --git a/priv/combined_metrics/samples/error_handling/no_blind_rescue/bad/cart.rb b/priv/combined_metrics/samples/error_handling/no_blind_rescue/bad/cart.rb new file mode 100644 index 0000000..4304d84 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_blind_rescue/bad/cart.rb @@ -0,0 +1,45 @@ +class CartCheckoutService + def initialize(inventory, payment_processor, logger) + @inventory = inventory + @payment_processor = payment_processor + @logger = logger + end + + def checkout(cart, payment_details) + reserve_items(cart) + process_payment(cart, payment_details) + end + + private + + def 
reserve_items(cart) + cart.line_items.each do |item| + begin + @inventory.reserve(item.sku, item.quantity) + rescue => e + # Bare rescue catches all StandardError — masks the specific cause + raise "Reservation failed: #{e.message}" + end + end + end + + def process_payment(cart, payment_details) + begin + result = @payment_processor.charge(cart.total_cents, payment_details) + { success: true, order_id: result.order_id } + rescue + # Bare rescue with no class — swallows everything silently + { success: false, error: :unknown } + end + end + + def release_reserved_items(cart) + cart.line_items.each do |item| + begin + @inventory.release(item.sku, item.quantity) + rescue + # Silent swallow — no logging, no re-raise + end + end + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_blind_rescue/good/cart.rb b/priv/combined_metrics/samples/error_handling/no_blind_rescue/good/cart.rb new file mode 100644 index 0000000..bcccc52 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_blind_rescue/good/cart.rb @@ -0,0 +1,53 @@ +class CartCheckoutService + def initialize(inventory, payment_processor, logger) + @inventory = inventory + @payment_processor = payment_processor + @logger = logger + end + + def checkout(cart, payment_details) + reserve_items(cart) + process_payment(cart, payment_details) + end + + private + + def reserve_items(cart) + cart.line_items.each do |item| + begin + @inventory.reserve(item.sku, item.quantity) + rescue Inventory::OutOfStockError => e + raise CheckoutError.new(:out_of_stock, "#{item.name} is no longer available: #{e.message}") + rescue Inventory::ConnectionError => e + @logger.error("Inventory service unreachable: #{e.message}") + raise CheckoutError.new(:service_unavailable, "Unable to confirm stock at this time") + end + end + end + + def process_payment(cart, payment_details) + begin + result = @payment_processor.charge(cart.total_cents, payment_details) + { success: true, order_id: result.order_id, 
receipt_url: result.receipt_url } + rescue PaymentProcessor::DeclinedError => e + @logger.info("Payment declined for cart #{cart.id}: #{e.decline_code}") + { success: false, error: :payment_declined, decline_code: e.decline_code } + rescue PaymentProcessor::TimeoutError => e + @logger.error("Payment timeout for cart #{cart.id}: #{e.message}") + release_reserved_items(cart) + { success: false, error: :payment_timeout } + rescue PaymentProcessor::Error => e + @logger.error("Payment error for cart #{cart.id}: #{e.message}") + release_reserved_items(cart) + raise + end + end + + def release_reserved_items(cart) + cart.line_items.each do |item| + @inventory.release(item.sku, item.quantity) + rescue Inventory::Error => e + @logger.warn("Failed to release reservation for #{item.sku}: #{e.message}") + end + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/bad/payment.rb b/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/bad/payment.rb new file mode 100644 index 0000000..9f168e8 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/bad/payment.rb @@ -0,0 +1,42 @@ +class PaymentProcessor + def initialize(gateway, logger) + @gateway = gateway + @logger = logger + end + + def charge(order, card_token) + amount_cents = (order.total * 100).to_i + + begin + response = @gateway.charge(amount_cents, card_token, order_id: order.id) + record_transaction(order, response.transaction_id) + { success: true, transaction_id: response.transaction_id } + rescue PaymentGateway::CardDeclinedError + # TODO: handle this + rescue PaymentGateway::NetworkError + rescue PaymentGateway::InvalidAmountError + end + end + + def refund(transaction_id, amount_cents) + begin + response = @gateway.refund(transaction_id, amount_cents) + { success: true, refund_id: response.refund_id } + rescue PaymentGateway::TransactionNotFoundError + rescue PaymentGateway::RefundError + end + end + + private + + def 
record_transaction(order, transaction_id) + begin + order.update!( + payment_status: :paid, + transaction_id: transaction_id, + paid_at: Time.current + ) + rescue + end + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/good/payment.rb b/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/good/payment.rb new file mode 100644 index 0000000..6845c92 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_empty_rescue_block/good/payment.rb @@ -0,0 +1,49 @@ +class PaymentProcessor + def initialize(gateway, logger) + @gateway = gateway + @logger = logger + end + + def charge(order, card_token) + amount_cents = (order.total * 100).to_i + + begin + response = @gateway.charge(amount_cents, card_token, order_id: order.id) + record_transaction(order, response.transaction_id) + { success: true, transaction_id: response.transaction_id } + rescue PaymentGateway::CardDeclinedError => e + @logger.warn("Card declined for order #{order.id}: #{e.message}") + { success: false, error: :card_declined, message: e.message } + rescue PaymentGateway::NetworkError => e + @logger.error("Gateway network error for order #{order.id}: #{e.message}") + { success: false, error: :network_error, message: "Payment service unavailable" } + rescue PaymentGateway::InvalidAmountError => e + @logger.error("Invalid amount #{amount_cents} for order #{order.id}: #{e.message}") + raise ArgumentError, "Order total is invalid: #{order.total}" + end + end + + def refund(transaction_id, amount_cents) + begin + response = @gateway.refund(transaction_id, amount_cents) + @logger.info("Refund issued: #{response.refund_id} for transaction #{transaction_id}") + { success: true, refund_id: response.refund_id } + rescue PaymentGateway::TransactionNotFoundError => e + @logger.error("Refund failed — transaction not found: #{transaction_id} — #{e.message}") + { success: false, error: :transaction_not_found } + rescue PaymentGateway::RefundError => e + 
@logger.error("Refund failed for transaction #{transaction_id}: #{e.message}") + { success: false, error: :refund_failed, message: e.message } + end + end + + private + + def record_transaction(order, transaction_id) + order.update!( + payment_status: :paid, + transaction_id: transaction_id, + paid_at: Time.current + ) + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/bad/ProductCatalog.php b/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/bad/ProductCatalog.php new file mode 100644 index 0000000..da091c1 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/bad/ProductCatalog.php @@ -0,0 +1,62 @@ +storageBasePath}/{$sku}.jpg"; + + // @ suppresses mkdir warnings — no way to know if it actually succeeded + @mkdir(dirname($destination), 0755, true); + + // @ suppresses copy errors — destination may not exist but no error is thrown + @copy($sourcePath, $destination); + + return $destination; + } + + public function deleteImage(string $sku): bool + { + $path = "{$this->storageBasePath}/{$sku}.jpg"; + + // @ suppresses warnings if file doesn't exist — silent success/failure + return @unlink($path); + } + + public function readConfig(string $iniPath): array + { + // @ hides parse errors in the ini file + $config = @parse_ini_file($iniPath, true); + + return $config ?: []; + } +} diff --git a/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/good/ProductCatalog.php b/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/good/ProductCatalog.php new file mode 100644 index 0000000..95e941d --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_error_suppression_operator/good/ProductCatalog.php @@ -0,0 +1,77 @@ +parseRows($handle); + } finally { + fclose($handle); + } + } + + public function saveProductImage(string $sku, string $sourcePath): string + { + $destination = 
"{$this->storageBasePath}/{$sku}.jpg"; + + if (!file_exists($sourcePath)) { + throw new ImageProcessingException("Source image not found: {$sourcePath}"); + } + + $targetDir = dirname($destination); + if (!is_dir($targetDir) && !mkdir($targetDir, 0755, true)) { + throw new ImageProcessingException("Failed to create directory: {$targetDir}"); + } + + if (!copy($sourcePath, $destination)) { + throw new ImageProcessingException( + "Failed to copy image from {$sourcePath} to {$destination}" + ); + } + + return $destination; + } + + private function parseRows($handle): array + { + $products = []; + $headers = fgetcsv($handle); + + if ($headers === false) { + throw new CatalogImportException("CSV file is empty or unreadable"); + } + + while (($row = fgetcsv($handle)) !== false) { + $products[] = array_combine($headers, $row); + } + + return $products; + } +} diff --git a/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/bad/template_engine.js b/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/bad/template_engine.js new file mode 100644 index 0000000..e6ff440 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/bad/template_engine.js @@ -0,0 +1,48 @@ +function renderTemplate(template, context) { + const keys = Object.keys(context); + const values = Object.values(context); + + // Using Function constructor to evaluate template expressions + const fn = new Function(...keys, `return \`${template}\``); + return fn(...values); +} + +function applyFilter(value, filterExpression) { + // Evaluate arbitrary filter code supplied by the user + return eval(`(function(v) { return ${filterExpression}; })(${JSON.stringify(value)})`); +} + +function buildSortComparator(sortConfig) { + // Build a comparator from a user-supplied config string + const comparatorCode = `(a, b) => { return ${sortConfig}; }`; + return eval(comparatorCode); +} + +function compileValidator(rules) { + // 
Compile validation rules into executable code + const body = rules.map((rule) => `if (!(${rule.expression})) return false;`).join("\n"); + return new Function("value", `${body}\nreturn true;`); +} + +function executePluginHook(pluginCode, eventName, payload) { + // Execute plugin hook code loaded from external source + const runner = new Function("event", "payload", pluginCode); + return runner(eventName, payload); +} + +function renderDynamicField(fieldConfig, record) { + // Evaluate field display expression + const displayValue = eval( + `(function(record) { return ${fieldConfig.expression}; })(record)` + ); + return displayValue; +} + +export { + renderTemplate, + applyFilter, + buildSortComparator, + compileValidator, + executePluginHook, + renderDynamicField, +}; diff --git a/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/good/template_engine.js b/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/good/template_engine.js new file mode 100644 index 0000000..88ea2db --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_eval_or_dynamic_code_execution/good/template_engine.js @@ -0,0 +1,58 @@ +const ALLOWED_FILTERS = { + uppercase: (value) => String(value).toUpperCase(), + lowercase: (value) => String(value).toLowerCase(), + trim: (value) => String(value).trim(), + truncate: (value, length = 80) => String(value).slice(0, Number(length)), +}; + +function renderTemplate(template, context) { + return template.replace(/\{\{\s*([\w.]+)(?:\s*\|\s*([\w]+)(?::([^}]*))?)?\s*\}\}/g, (_, path, filter, arg) => { + const value = resolvePath(context, path); + + if (value === undefined || value === null) { + return ""; + } + + if (filter) { + const fn = ALLOWED_FILTERS[filter]; + if (!fn) { + throw new Error(`Unknown filter: '${filter}'. 
Allowed filters: ${Object.keys(ALLOWED_FILTERS).join(", ")}`); + } + return fn(value, arg); + } + + return String(value); + }); +} + +function resolvePath(obj, path) { + return path.split(".").reduce((current, key) => { + if (current == null) return undefined; + return current[key]; + }, obj); +} + +function buildSortComparator(field, direction) { + const multiplier = direction === "desc" ? -1 : 1; + + return (a, b) => { + const av = resolvePath(a, field); + const bv = resolvePath(b, field); + + if (av == null && bv == null) return 0; + if (av == null) return 1 * multiplier; + if (bv == null) return -1 * multiplier; + + return av < bv ? -1 * multiplier : av > bv ? 1 * multiplier : 0; + }; +} + +function applyTransforms(value, transforms) { + return transforms.reduce((acc, { name, args }) => { + const fn = ALLOWED_FILTERS[name]; + if (!fn) throw new Error(`Unknown transform: '${name}'`); + return fn(acc, ...args); + }, value); +} + +export { renderTemplate, buildSortComparator, applyTransforms }; diff --git a/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/bad/billing.ex b/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/bad/billing.ex new file mode 100644 index 0000000..f3e75ea --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/bad/billing.ex @@ -0,0 +1,68 @@ +defmodule MyApp.Billing do + @moduledoc """ + Billing operations. 
+ """ + + alias MyApp.Billing.{Invoice, PaymentMethod} + alias MyApp.Repo + + # Bad: using try/rescue for expected, recoverable failures (subscription not found) + @spec create_invoice(integer()) :: {:ok, Invoice.t()} | {:error, atom()} + def create_invoice(subscription_id) do + try do + subscription = Repo.get!(MyApp.Subscriptions.Subscription, subscription_id) + + unless subscription.billing_enabled do + raise "billing disabled" + end + + line_items = MyApp.Billing.LineItemCalculator.compute(subscription) + + invoice = + %Invoice{} + |> Invoice.changeset(%{ + subscription_id: subscription.id, + customer_id: subscription.customer_id, + line_items: line_items, + status: :draft + }) + |> Repo.insert!() + + {:ok, invoice} + rescue + Ecto.NoResultsError -> {:error, :subscription_not_found} + RuntimeError -> {:error, :billing_disabled} + Ecto.InvalidChangesetError -> {:error, :invalid_data} + end + end + + # Bad: using try/rescue as a null-check replacement + @spec charge_invoice(Invoice.t()) :: {:ok, Invoice.t()} | {:error, atom()} + def charge_invoice(%Invoice{} = invoice) do + try do + if invoice.status == :paid, do: raise("already paid") + if invoice.status == :void, do: raise("invoice void") + + payment_method = Repo.get_by!(PaymentMethod, customer_id: invoice.customer_id, default: true) + + unless payment_method.active do + raise "payment method inactive" + end + + case MyApp.PaymentGateway.charge(payment_method.token, invoice.total) do + {:ok, _transaction} -> + invoice + |> Invoice.changeset(%{status: :paid, paid_at: DateTime.utc_now()}) + |> Repo.update() + + {:error, reason} -> + raise "payment failed: #{inspect(reason)}" + end + rescue + RuntimeError, message: "already paid" -> {:error, :already_paid} + RuntimeError, message: "invoice void" -> {:error, :invoice_void} + Ecto.NoResultsError -> {:error, :no_payment_method} + RuntimeError -> {:error, :payment_failed} + end + end +end diff --git 
a/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/good/billing.ex b/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/good/billing.ex new file mode 100644 index 0000000..04fd18b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_exceptions_for_control_flow/good/billing.ex @@ -0,0 +1,84 @@ +defmodule MyApp.Billing do + @moduledoc """ + Billing operations. Uses `{:ok, value}` / `{:error, reason}` tuples + for all expected failure paths — no exceptions for control flow. + """ + + alias MyApp.Billing.{Invoice, PaymentMethod} + alias MyApp.Repo + + @doc """ + Creates an invoice for the given subscription ID. + Returns `{:ok, invoice}` or `{:error, reason}`. + """ + @spec create_invoice(integer()) :: {:ok, Invoice.t()} | {:error, atom()} + def create_invoice(subscription_id) do + with {:ok, subscription} <- fetch_subscription(subscription_id), + :ok <- validate_billing_enabled(subscription), + {:ok, line_items} <- compute_line_items(subscription), + {:ok, invoice} <- insert_invoice(subscription, line_items) do + {:ok, invoice} + end + end + + @doc """ + Charges the default payment method for an invoice. + Returns `{:ok, invoice}` or `{:error, reason}`. 
+ """ + @spec charge_invoice(Invoice.t()) :: {:ok, Invoice.t()} | {:error, atom()} + def charge_invoice(%Invoice{status: :paid}), do: {:error, :already_paid} + def charge_invoice(%Invoice{status: :void}), do: {:error, :invoice_void} + + def charge_invoice(%Invoice{} = invoice) do + case fetch_payment_method(invoice.customer_id) do + {:ok, %PaymentMethod{active: true} = pm} -> process_payment(invoice, pm) + {:ok, %PaymentMethod{active: false}} -> {:error, :payment_method_inactive} + {:error, :not_found} -> {:error, :no_payment_method} + end + end + + defp fetch_subscription(id) do + case Repo.get(MyApp.Subscriptions.Subscription, id) do + nil -> {:error, :subscription_not_found} + sub -> {:ok, sub} + end + end + + defp validate_billing_enabled(%{billing_enabled: true}), do: :ok + defp validate_billing_enabled(_), do: {:error, :billing_disabled} + + defp compute_line_items(subscription) do + items = MyApp.Billing.LineItemCalculator.compute(subscription) + {:ok, items} + end + + defp insert_invoice(subscription, line_items) do + %Invoice{} + |> Invoice.changeset(%{ + subscription_id: subscription.id, + customer_id: subscription.customer_id, + line_items: line_items, + status: :draft + }) + |> Repo.insert() + end + + defp fetch_payment_method(customer_id) do + case Repo.get_by(PaymentMethod, customer_id: customer_id, default: true) do + nil -> {:error, :not_found} + pm -> {:ok, pm} + end + end + + defp process_payment(invoice, payment_method) do + case MyApp.PaymentGateway.charge(payment_method.token, invoice.total) do + {:ok, _transaction} -> + invoice + |> Invoice.changeset(%{status: :paid, paid_at: DateTime.utc_now()}) + |> Repo.update() + + {:error, reason} -> + {:error, reason} + end + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/order_service.js b/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/order_service.js new file mode 100644 index 0000000..21c1b60 --- /dev/null +++ 
b/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/order_service.js @@ -0,0 +1,55 @@ +import logger from "./logger.js"; + +async function processOrder(orderId) { + const order = await fetchOrder(orderId); + + validateInventory(order.items); + + const payment = await chargePayment(order.total, order.paymentMethod); + + updateOrderStatus(orderId, "confirmed"); + sendConfirmationEmail(order.customerEmail, order); + recordAnalyticsEvent("order_confirmed", { orderId, total: order.total }); + + return { orderId, paymentId: payment.id, status: "confirmed" }; +} + +async function cancelOrder(orderId, reason) { + const order = await fetchOrder(orderId); + + if (order.status === "shipped") { + throw new Error("Cannot cancel an order that has already shipped"); + } + + await updateOrderStatus(orderId, "cancelled"); + + refundPayment(order.paymentId, order.total); + sendCancellationEmail(order.customerEmail, { orderId, reason }); + + return { orderId, status: "cancelled" }; +} + +function scheduleOrderReminder(orderId, delayMs) { + new Promise((resolve) => setTimeout(resolve, delayMs)) + .then(() => sendReminderEmail(orderId)); +} + +async function bulkFulfillOrders(orderIds) { + let fulfilled = 0; + + for (const id of orderIds) { + processOrder(id).then(() => { + fulfilled++; + }); + } + + return { fulfilled }; +} + +function onOrderCreated(order) { + sendConfirmationEmail(order.customerEmail, order); + recordAnalyticsEvent("order_created", { orderId: order.id }); + updateInventoryReservation(order.items); +} + +export { processOrder, cancelOrder, scheduleOrderReminder, bulkFulfillOrders, onOrderCreated }; diff --git a/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/user_service.ts b/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/user_service.ts new file mode 100644 index 0000000..c803dcf --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_floating_promises/bad/user_service.ts @@ -0,0 +1,68 @@ 
+interface User { + id: string; + email: string; + displayName: string; +} + +interface AuditEntry { + action: string; + userId: string; + timestamp: number; +} + +async function fetchUser(userId: string): Promise { + const response = await fetch(`/api/users/${userId}`); + if (!response.ok) throw new Error(`User not found: ${userId}`); + return response.json() as Promise; +} + +async function writeAuditLog(entry: AuditEntry): Promise { + await fetch("/api/audit", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(entry), + }); +} + +async function deleteUser(userId: string): Promise { + const user = await fetchUser(userId); + + const response = await fetch(`/api/users/${userId}`, { method: "DELETE" }); + if (!response.ok) throw new Error(`Failed to delete user: ${response.status}`); + + // Floating promise — not awaited + writeAuditLog({ + action: "user_deleted", + userId: user.id, + timestamp: Date.now(), + }); +} + +async function updateEmail(userId: string, newEmail: string): Promise { + const response = await fetch(`/api/users/${userId}`, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ email: newEmail }), + }); + + if (!response.ok) throw new Error(`Failed to update email: ${response.status}`); + + const updated: User = await response.json(); + + // Floating promise — not awaited + writeAuditLog({ action: "email_updated", userId, timestamp: Date.now() }); + + return updated; +} + +function onUserCreated(user: User): void { + // Floating promises in void function + writeAuditLog({ action: "user_created", userId: user.id, timestamp: Date.now() }); + fetch("/api/notifications/welcome", { + method: "POST", + body: JSON.stringify({ userId: user.id }), + }); +} + +export { fetchUser, deleteUser, updateEmail, onUserCreated }; +export type { User }; diff --git a/priv/combined_metrics/samples/error_handling/no_floating_promises/good/order_service.js 
b/priv/combined_metrics/samples/error_handling/no_floating_promises/good/order_service.js new file mode 100644 index 0000000..9a4237f --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_floating_promises/good/order_service.js @@ -0,0 +1,59 @@ +import logger from "./logger.js"; + +async function processOrder(orderId) { + const order = await fetchOrder(orderId); + + await validateInventory(order.items); + + const payment = await chargePayment(order.total, order.paymentMethod); + + await Promise.all([ + updateOrderStatus(orderId, "confirmed"), + sendConfirmationEmail(order.customerEmail, order), + recordAnalyticsEvent("order_confirmed", { orderId, total: order.total }), + ]); + + return { orderId, paymentId: payment.id, status: "confirmed" }; +} + +async function cancelOrder(orderId, reason) { + const order = await fetchOrder(orderId); + + if (order.status === "shipped") { + throw new Error("Cannot cancel an order that has already shipped"); + } + + await updateOrderStatus(orderId, "cancelled"); + + const refundPromise = refundPayment(order.paymentId, order.total); + const emailPromise = sendCancellationEmail(order.customerEmail, { orderId, reason }); + + const [refund] = await Promise.all([refundPromise, emailPromise]); + + return { orderId, refundId: refund.id, status: "cancelled" }; +} + +function scheduleOrderReminder(orderId, delayMs) { + const reminderPromise = new Promise((resolve) => setTimeout(resolve, delayMs)) + .then(() => sendReminderEmail(orderId)) + .catch((err) => logger.error("Reminder email failed", { orderId, err })); + + return reminderPromise; +} + +async function bulkFulfillOrders(orderIds) { + const results = await Promise.allSettled( + orderIds.map((id) => processOrder(id)) + ); + + const fulfilled = results.filter((r) => r.status === "fulfilled").length; + const failed = results.filter((r) => r.status === "rejected"); + + for (const result of failed) { + logger.error("Order fulfillment failed", result.reason); + } + + return { 
fulfilled, failedCount: failed.length }; +} + +export { processOrder, cancelOrder, scheduleOrderReminder, bulkFulfillOrders }; diff --git a/priv/combined_metrics/samples/error_handling/no_floating_promises/good/user_service.ts b/priv/combined_metrics/samples/error_handling/no_floating_promises/good/user_service.ts new file mode 100644 index 0000000..a0a83e4 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_floating_promises/good/user_service.ts @@ -0,0 +1,86 @@ +import logger from "./logger.js"; + +interface User { + id: string; + email: string; + displayName: string; +} + +interface AuditEntry { + action: string; + userId: string; + timestamp: number; +} + +async function fetchUser(userId: string): Promise { + const response = await fetch(`/api/users/${userId}`); + if (!response.ok) throw new Error(`User not found: ${userId}`); + return response.json() as Promise; +} + +async function writeAuditLog(entry: AuditEntry): Promise { + await fetch("/api/audit", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(entry), + }); +} + +async function deleteUser(userId: string): Promise { + const user = await fetchUser(userId); + + const response = await fetch(`/api/users/${userId}`, { method: "DELETE" }); + if (!response.ok) throw new Error(`Failed to delete user: ${response.status}`); + + await writeAuditLog({ + action: "user_deleted", + userId: user.id, + timestamp: Date.now(), + }); +} + +async function updateEmail(userId: string, newEmail: string): Promise { + const response = await fetch(`/api/users/${userId}`, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ email: newEmail }), + }); + + if (!response.ok) throw new Error(`Failed to update email: ${response.status}`); + + const updated: User = await response.json(); + + await writeAuditLog({ + action: "email_updated", + userId, + timestamp: Date.now(), + }); + + return updated; +} + +async function 
bulkInviteUsers(emails: string[]): Promise<{ sent: number; failed: number }> { + const results = await Promise.allSettled( + emails.map((email) => + fetch("/api/invitations", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ email }), + }) + ) + ); + + const sent = results.filter((r) => r.status === "fulfilled").length; + const failed = results.filter((r) => r.status === "rejected").length; + + for (const r of results) { + if (r.status === "rejected") { + logger.error("Invitation failed", r.reason); + } + } + + return { sent, failed }; +} + +export { fetchUser, deleteUser, updateEmail, bulkInviteUsers }; +export type { User }; diff --git a/priv/combined_metrics/samples/error_handling/no_misused_promises/bad/product_repository.ts b/priv/combined_metrics/samples/error_handling/no_misused_promises/bad/product_repository.ts new file mode 100644 index 0000000..2f286a5 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_misused_promises/bad/product_repository.ts @@ -0,0 +1,64 @@ +interface Product { + id: string; + name: string; + inStock: boolean; +} + +async function fetchProduct(id: string): Promise { + const response = await fetch(`/api/products/${id}`); + if (!response.ok) throw new Error(`Product not found: ${id}`); + return response.json() as Promise; +} + +async function isProductAvailable(id: string): Promise { + const product = await fetchProduct(id); + return product.inStock; +} + +async function handleAddToCart(productId: string): Promise { + // Misuse: Promise used directly in `if` without await + if (isProductAvailable(productId)) { + console.log(`Adding product ${productId} to cart`); + } else { + console.log(`Product ${productId} is out of stock`); + } +} + +function loadAndFilterProducts(ids: string[]): Product[] { + const products: Product[] = []; + + ids.forEach(async (id) => { + // Misuse: async callback in forEach — errors and results are ignored + const product = await fetchProduct(id); + 
if (product.inStock) { + products.push(product); + } + }); + + return products; +} + +function setupProductEventListeners(productId: string): void { + const button = document.querySelector(`[data-product="${productId}"]`); + if (!button) return; + + // Misuse: async function passed where void callback is expected with no error handling + button.addEventListener("click", async () => { + await handleAddToCart(productId); + }); +} + +async function validateBeforeCheckout(cartItems: string[]): Promise { + const unavailable: string[] = []; + + cartItems.forEach(async (id) => { + // Misuse: async in forEach, result never collected + const available = await isProductAvailable(id); + if (!available) unavailable.push(id); + }); + + return unavailable; +} + +export { fetchProduct, isProductAvailable, handleAddToCart, loadAndFilterProducts, validateBeforeCheckout }; +export type { Product }; diff --git a/priv/combined_metrics/samples/error_handling/no_misused_promises/good/product_repository.ts b/priv/combined_metrics/samples/error_handling/no_misused_promises/good/product_repository.ts new file mode 100644 index 0000000..fba04b3 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_misused_promises/good/product_repository.ts @@ -0,0 +1,55 @@ +interface Product { + id: string; + name: string; + inStock: boolean; +} + +async function fetchProduct(id: string): Promise { + const response = await fetch(`/api/products/${id}`); + if (!response.ok) throw new Error(`Product not found: ${id}`); + return response.json() as Promise; +} + +async function isProductAvailable(id: string): Promise { + const product = await fetchProduct(id); + return product.inStock; +} + +async function handleAddToCart(productId: string): Promise { + const available = await isProductAvailable(productId); + + if (available) { + console.log(`Adding product ${productId} to cart`); + } else { + console.log(`Product ${productId} is out of stock`); + } +} + +async function loadAndFilterProducts(ids: 
string[]): Promise { + const products = await Promise.all(ids.map((id) => fetchProduct(id))); + return products.filter((p) => p.inStock); +} + +function setupProductEventListeners(productId: string): void { + const button = document.querySelector(`[data-product="${productId}"]`); + if (!button) return; + + button.addEventListener("click", () => { + handleAddToCart(productId).catch((err) => { + console.error("Failed to add to cart", err); + }); + }); +} + +async function validateBeforeCheckout(cartItems: string[]): Promise { + const checks = await Promise.all( + cartItems.map(async (id) => { + const available = await isProductAvailable(id); + return available ? null : id; + }) + ); + return checks.filter((id): id is string => id !== null); +} + +export { fetchProduct, isProductAvailable, handleAddToCart, loadAndFilterProducts, validateBeforeCheckout, setupProductEventListeners }; +export type { Product }; diff --git a/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/bad/invoice.rb b/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/bad/invoice.rb new file mode 100644 index 0000000..3cfab14 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/bad/invoice.rb @@ -0,0 +1,65 @@ +class InvoiceService + def initialize(repository, mailer) + @repository = repository + @mailer = mailer + end + + def find_or_create_for_order(order) + begin + @repository.find_by!(order_id: order.id) + rescue ActiveRecord::RecordNotFound + create_invoice(order) + end + end + + def apply_discount(invoice, coupon_code) + begin + coupon = @repository.find_coupon!(coupon_code) + rescue ActiveRecord::RecordNotFound + return { success: false, error: :coupon_not_found } + end + + begin + raise "expired" if coupon.expired? 
+ raise "used" if coupon.already_used_by?(invoice.customer_id) + rescue => e + return { success: false, error: e.message.to_sym } + end + + discount = coupon.calculate_discount(invoice.subtotal) + invoice.update!(discount_amount: discount, coupon_code: coupon_code) + + { success: true, discount_amount: discount } + end + + def mark_paid(invoice_id, paid_at: Time.current) + begin + invoice = @repository.find!(invoice_id) + rescue ActiveRecord::RecordNotFound + return { success: false, error: :not_found } + end + + begin + raise "already paid" if invoice.paid? + rescue + return { success: false, error: :already_paid } + end + + invoice.update!(status: :paid, paid_at: paid_at) + @mailer.send_receipt(invoice) + + { success: true, invoice: invoice } + end + + private + + def create_invoice(order) + @repository.create!( + order_id: order.id, + customer_id: order.customer_id, + subtotal: order.subtotal, + total: order.total, + status: :pending + ) + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/good/invoice.rb b/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/good/invoice.rb new file mode 100644 index 0000000..9b97439 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_rescue_as_flow_control/good/invoice.rb @@ -0,0 +1,59 @@ +class InvoiceService + def initialize(repository, mailer) + @repository = repository + @mailer = mailer + end + + def find_or_create_for_order(order) + existing = @repository.find_by(order_id: order.id) + return existing if existing + + create_invoice(order) + end + + def apply_discount(invoice, coupon_code) + coupon = @repository.find_coupon(coupon_code) + + unless coupon + return { success: false, error: :coupon_not_found } + end + + if coupon.expired? 
+ return { success: false, error: :coupon_expired } + end + + if coupon.already_used_by?(invoice.customer_id) + return { success: false, error: :coupon_already_used } + end + + discount = coupon.calculate_discount(invoice.subtotal) + invoice.update!(discount_amount: discount, coupon_code: coupon_code) + + { success: true, discount_amount: discount } + end + + def mark_paid(invoice_id, paid_at: Time.current) + invoice = @repository.find(invoice_id) + + return { success: false, error: :not_found } unless invoice + return { success: false, error: :already_paid } if invoice.paid? + + invoice.update!(status: :paid, paid_at: paid_at) + @mailer.send_receipt(invoice) + + { success: true, invoice: invoice } + end + + private + + def create_invoice(order) + @repository.create!( + order_id: order.id, + customer_id: order.customer_id, + subtotal: order.subtotal, + tax: order.tax, + total: order.total, + status: :pending + ) + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_return_from_ensure/bad/subscription.rb b/priv/combined_metrics/samples/error_handling/no_return_from_ensure/bad/subscription.rb new file mode 100644 index 0000000..ea096c4 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_return_from_ensure/bad/subscription.rb @@ -0,0 +1,49 @@ +class SubscriptionActivator + def initialize(billing_client, notifier, logger) + @billing_client = billing_client + @notifier = notifier + @logger = logger + end + + def activate(subscription) + begin + @billing_client.authorize(subscription.payment_method_id, subscription.plan.monthly_price) + subscription.update!(status: :active, activated_at: Time.current) + @notifier.send_welcome_email(subscription.user) + return { success: true, subscription: subscription } + rescue BillingClient::AuthorizationError => e + @logger.warn("Authorization failed: #{e.message}") + return { success: false, error: :payment_authorization_failed } + ensure + # This return silently swallows any exception raised above 
+ cleanup_pending_state(subscription) + return { success: false, error: :aborted } + end + end + + def cancel(subscription, reason:) + begin + subscription.update!(status: :cancelled, cancelled_at: Time.current, cancel_reason: reason) + @billing_client.cancel_recurring(subscription.billing_id) + @notifier.send_cancellation_confirmation(subscription.user) + return true + rescue StandardError => e + @logger.error("Cancel failed: #{e.message}") + raise + ensure + release_subscription_seats(subscription) + # Returning from ensure masks the re-raised exception + return false + end + end + + private + + def cleanup_pending_state(subscription) + subscription.update_column(:pending_activation, false) if subscription.pending_activation? + end + + def release_subscription_seats(subscription) + subscription.team_seats.update_all(active: false) + end +end diff --git a/priv/combined_metrics/samples/error_handling/no_return_from_ensure/good/subscription.rb b/priv/combined_metrics/samples/error_handling/no_return_from_ensure/good/subscription.rb new file mode 100644 index 0000000..b5b7ee2 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/no_return_from_ensure/good/subscription.rb @@ -0,0 +1,52 @@ +class SubscriptionActivator + def initialize(billing_client, notifier, logger) + @billing_client = billing_client + @notifier = notifier + @logger = logger + end + + def activate(subscription) + result = nil + + begin + @billing_client.authorize(subscription.payment_method_id, subscription.plan.monthly_price) + subscription.update!(status: :active, activated_at: Time.current) + @notifier.send_welcome_email(subscription.user) + result = { success: true, subscription: subscription } + rescue BillingClient::AuthorizationError => e + @logger.warn("Authorization failed for subscription #{subscription.id}: #{e.message}") + result = { success: false, error: :payment_authorization_failed } + rescue StandardError => e + @logger.error("Unexpected error activating subscription 
#{subscription.id}: #{e.message}") + raise + ensure + @logger.info("Activation attempt completed for subscription #{subscription.id}") + cleanup_pending_state(subscription) + end + + result + end + + def cancel(subscription, reason:) + begin + subscription.update!(status: :cancelled, cancelled_at: Time.current, cancel_reason: reason) + @billing_client.cancel_recurring(subscription.billing_id) + @notifier.send_cancellation_confirmation(subscription.user) + rescue BillingClient::NotFoundError => e + @logger.warn("Billing record not found during cancel #{subscription.id}: #{e.message}") + ensure + release_subscription_seats(subscription) + @logger.info("Cancellation cleanup done for #{subscription.id}") + end + end + + private + + def cleanup_pending_state(subscription) + subscription.update_column(:pending_activation, false) if subscription.pending_activation? + end + + def release_subscription_seats(subscription) + subscription.team_seats.update_all(active: false) + end +end diff --git a/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/bad/account.rb b/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/bad/account.rb new file mode 100644 index 0000000..c7eaa2e --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/bad/account.rb @@ -0,0 +1,55 @@ +class AccountImporter + def initialize(csv_parser, repository, logger) + @csv_parser = csv_parser + @repository = repository + @logger = logger + end + + def import(file_path) + results = { imported: 0, skipped: 0, errors: [] } + + @csv_parser.each_row(file_path) do |row| + import_row(row, results) + end + + results + end + + private + + def import_row(row, results) + account = build_account(row) + + begin + @repository.save!(account) + results[:imported] += 1 + rescue StandardError => e + # Too broad — this catches everything below and the specific rescues are unreachable + @logger.error("Unexpected error for #{row[:email]}: #{e.message}") + 
results[:errors] << { email: row[:email], reason: :unexpected } + rescue ActiveRecord::ActiveRecordError => e + # Dead code — StandardError already matched this + @logger.error("ActiveRecord error for #{row[:email]}: #{e.message}") + results[:errors] << { email: row[:email], reason: :active_record_error } + rescue ActiveRecord::StatementInvalid => e + # Dead code — caught by StandardError above + @logger.error("DB statement error for #{row[:email]}: #{e.message}") + results[:errors] << { email: row[:email], reason: :database_error } + rescue ActiveRecord::RecordNotUnique + # Dead code — caught by StandardError above + results[:skipped] += 1 + rescue ActiveRecord::RecordInvalid + # Dead code — caught by StandardError above + results[:skipped] += 1 + end + end + + def build_account(row) + Account.new( + email: row[:email], + name: row[:name], + plan: row[:plan] || :free, + source: :csv_import + ) + end +end diff --git a/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/good/account.rb b/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/good/account.rb new file mode 100644 index 0000000..85147a7 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/rescue_most_specific_first/good/account.rb @@ -0,0 +1,57 @@ +class AccountImporter + def initialize(csv_parser, repository, logger) + @csv_parser = csv_parser + @repository = repository + @logger = logger + end + + def import(file_path) + results = { imported: 0, skipped: 0, errors: [] } + + @csv_parser.each_row(file_path) do |row| + import_row(row, results) + end + + results + end + + private + + def import_row(row, results) + account = build_account(row) + + begin + @repository.save!(account) + results[:imported] += 1 + rescue ActiveRecord::RecordInvalid => e + # Most specific: validation failures are expected and recoverable + @logger.warn("Validation failed for row #{row[:email]}: #{e.record.errors.full_messages.join(', ')}") + results[:skipped] += 1 + rescue 
ActiveRecord::RecordNotUnique => e + # Subclass of StatementInvalid, so it must be rescued before StatementInvalid + @logger.warn("Duplicate account skipped: #{row[:email]}") + results[:skipped] += 1 + rescue ActiveRecord::StatementInvalid => e + # Less specific DB error + @logger.error("DB statement error for #{row[:email]}: #{e.message}") + results[:errors] << { email: row[:email], reason: :database_error } + rescue ActiveRecord::ActiveRecordError => e + # Broad ActiveRecord error — catches any ActiveRecord error not already matched above + @logger.error("ActiveRecord error for #{row[:email]}: #{e.message}") + results[:errors] << { email: row[:email], reason: :active_record_error } + rescue StandardError => e + # Catch-all for unexpected errors + @logger.error("Unexpected error for #{row[:email]}: #{e.message}") + results[:errors] << { email: row[:email], reason: :unexpected } + end + end + + def build_account(row) + Account.new( + email: row[:email], + name: row[:name], + plan: row[:plan] || :free, + source: :csv_import + ) + end +end diff --git a/priv/combined_metrics/samples/error_handling/returns_typed_error/bad/repository.ex b/priv/combined_metrics/samples/error_handling/returns_typed_error/bad/repository.ex new file mode 100644 index 0000000..cebeb97 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/returns_typed_error/bad/repository.ex @@ -0,0 +1,92 @@ +defmodule Repository do + @moduledoc """ + Data repository layer for persisting and fetching domain records. 
+ """ + + def find_by_id(id) do + case lookup(id) do + nil -> nil + record -> record + end + end + + def find_by_email(email) do + case search_email(email) do + [] -> false + [record | _] -> record + end + end + + def save(record) do + if valid?(record) do + do_insert(record) + else + false + end + end + + def update(id, attrs) do + case lookup(id) do + nil -> nil + record -> + if valid_attrs?(attrs) do + do_update(record, attrs) + else + :invalid + end + end + end + + def delete(id) do + case lookup(id) do + nil -> false + record -> + case do_delete(record) do + :ok -> true + _ -> false + end + end + end + + def list_all(filters) do + try do + do_list(filters) + rescue + _ -> [] + end + end + + def count(filters) do + case do_count(filters) do + nil -> 0 + n -> n + end + end + + def exists?(id) do + case lookup(id) do + nil -> false + _ -> true + end + end + + def find_or_create(attrs) do + case search_attrs(attrs) do + nil -> + if valid_attrs?(attrs), do: do_insert(attrs), else: nil + record -> + record + end + end + + defp lookup(_id), do: nil + defp search_email(_email), do: [] + defp valid?(_record), do: true + defp do_insert(record), do: record + defp valid_attrs?(_attrs), do: true + defp do_update(record, _attrs), do: record + defp do_delete(_record), do: :ok + defp do_list(_filters), do: [] + defp do_count(_filters), do: 0 + defp search_attrs(_attrs), do: nil +end diff --git a/priv/combined_metrics/samples/error_handling/returns_typed_error/config.yml b/priv/combined_metrics/samples/error_handling/returns_typed_error/config.yml new file mode 100644 index 0000000..85e423f --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/returns_typed_error/config.yml @@ -0,0 +1 @@ +doc: "Functions should signal failure via a typed return (e.g. `{:error, reason}`) rather than returning `nil` or `false`." 
diff --git a/priv/combined_metrics/samples/error_handling/returns_typed_error/good/repository.ex b/priv/combined_metrics/samples/error_handling/returns_typed_error/good/repository.ex new file mode 100644 index 0000000..755ed07 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/returns_typed_error/good/repository.ex @@ -0,0 +1,96 @@ +defmodule Repository do + @moduledoc """ + Data repository layer for persisting and fetching domain records. + """ + + @spec find_by_id(term()) :: {:ok, map()} | {:error, :not_found} + def find_by_id(id) do + case lookup(id) do + nil -> {:error, :not_found} + record -> {:ok, record} + end + end + + @spec find_by_email(String.t()) :: {:ok, map()} | {:error, :not_found} + def find_by_email(email) do + case search_email(email) do + [] -> {:error, :not_found} + [record | _] -> {:ok, record} + end + end + + @spec save(map()) :: {:ok, map()} | {:error, :validation_failed} + def save(record) do + if valid?(record) do + {:ok, do_insert(record)} + else + {:error, :validation_failed} + end + end + + @spec update(term(), map()) :: {:ok, map()} | {:error, :not_found | :validation_failed} + def update(id, attrs) do + with {:ok, record} <- find_by_id(id), + true <- valid_attrs?(attrs) do + {:ok, do_update(record, attrs)} + else + {:error, :not_found} -> {:error, :not_found} + false -> {:error, :validation_failed} + end + end + + @spec delete(term()) :: {:ok, map()} | {:error, :not_found | {:delete_failed, term()}} + def delete(id) do + case find_by_id(id) do + {:error, :not_found} -> + {:error, :not_found} + {:ok, record} -> + case do_delete(record) do + :ok -> {:ok, record} + {:error, reason} -> {:error, {:delete_failed, reason}} + end + end + end + + @spec list_all(map()) :: {:ok, list()} | {:error, {:query_failed, String.t()}} + def list_all(filters) do + try do + {:ok, do_list(filters)} + rescue + e -> {:error, {:query_failed, Exception.message(e)}} + end + end + + @spec count(map()) :: {:ok, non_neg_integer()} | {:error, :query_failed} + def 
count(filters) do + case do_count(filters) do + nil -> {:error, :query_failed} + n -> {:ok, n} + end + end + + @spec find_or_create(map()) :: {:ok, map()} | {:error, :validation_failed} + def find_or_create(attrs) do + case search_attrs(attrs) do + nil -> + if valid_attrs?(attrs) do + {:ok, do_insert(attrs)} + else + {:error, :validation_failed} + end + record -> + {:ok, record} + end + end + + defp lookup(_id), do: nil + defp search_email(_email), do: [] + defp valid?(_record), do: true + defp do_insert(record), do: record + defp valid_attrs?(_attrs), do: true + defp do_update(record, _attrs), do: record + defp do_delete(_record), do: :ok + defp do_list(_filters), do: [] + defp do_count(_filters), do: 0 + defp search_attrs(_attrs), do: nil +end diff --git a/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/bad/api_client.js b/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/bad/api_client.js new file mode 100644 index 0000000..5dc49bc --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/bad/api_client.js @@ -0,0 +1,64 @@ +async function fetchUser(userId) { + if (!userId || typeof userId !== "string") { + throw "userId must be a non-empty string"; + } + + let response; + try { + response = await fetch(`/api/users/${userId}`); + } catch (err) { + throw "Failed to reach the API server"; + } + + if (response.status === 404) { + throw 404; + } + + if (response.status === 403) { + throw { code: 403, message: "You do not have permission to view this user" }; + } + + if (!response.ok) { + throw `Unexpected response status: ${response.status}`; + } + + return response.json(); +} + +async function updateUserEmail(userId, newEmail) { + if (!newEmail.includes("@")) { + throw `'${newEmail}' is not a valid email address`; + } + + const response = await fetch(`/api/users/${userId}`, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: 
JSON.stringify({ email: newEmail }), + }); + + if (!response.ok) { + const body = await response.json().catch(() => ({})); + throw { + code: response.status, + message: body.message ?? `Failed to update user: ${response.status}`, + }; + } + + return response.json(); +} + +async function deleteUser(userId) { + if (!userId) { + throw null; + } + + const response = await fetch(`/api/users/${userId}`, { method: "DELETE" }); + + if (!response.ok) { + throw response.status; + } + + return true; +} + +export { fetchUser, updateUserEmail, deleteUser }; diff --git a/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/good/api_client.js b/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/good/api_client.js new file mode 100644 index 0000000..41331e9 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/throws_error_objects_not_primitives/good/api_client.js @@ -0,0 +1,69 @@ +class ApiError extends Error { + constructor(message, statusCode) { + super(message); + this.name = "ApiError"; + this.statusCode = statusCode; + } +} + +class NetworkError extends Error { + constructor(message, cause) { + super(message); + this.name = "NetworkError"; + this.cause = cause; + } +} + +async function fetchUser(userId) { + if (!userId || typeof userId !== "string") { + throw new TypeError("userId must be a non-empty string"); + } + + let response; + try { + response = await fetch(`/api/users/${userId}`); + } catch (err) { + throw new NetworkError("Failed to reach the API server", err); + } + + if (response.status === 404) { + throw new ApiError(`User with id '${userId}' not found`, 404); + } + + if (response.status === 403) { + throw new ApiError("You do not have permission to view this user", 403); + } + + if (!response.ok) { + throw new ApiError( + `Unexpected response status: ${response.status}`, + response.status + ); + } + + return response.json(); +} + +async function updateUserEmail(userId, newEmail) { + if 
(!newEmail.includes("@")) { + throw new RangeError(`'${newEmail}' is not a valid email address`); + } + + const response = await fetch(`/api/users/${userId}`, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ email: newEmail }), + }); + + if (!response.ok) { + const body = await response.json().catch(() => ({})); + throw new ApiError( + body.message ?? `Failed to update user: ${response.status}`, + response.status + ); + } + + return response.json(); +} + +export { fetchUser, updateUserEmail, ApiError, NetworkError }; diff --git a/priv/combined_metrics/samples/error_handling/try_block_is_minimal/bad/payment_processor.py b/priv/combined_metrics/samples/error_handling/try_block_is_minimal/bad/payment_processor.py new file mode 100644 index 0000000..84740c2 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/try_block_is_minimal/bad/payment_processor.py @@ -0,0 +1,73 @@ +"""Payment processor that charges customers and records transactions.""" +from __future__ import annotations + +from dataclasses import dataclass +from decimal import Decimal +from typing import Optional +import uuid + + +class PaymentGatewayError(Exception): + """Raised when the external gateway rejects a charge.""" + + +@dataclass +class PaymentIntent: + amount: Decimal + currency: str + customer_id: str + description: str + + +@dataclass +class ChargeResult: + charge_id: str + amount: Decimal + currency: str + customer_id: str + + +def _call_gateway(intent: PaymentIntent) -> str: + if intent.amount <= 0: + raise PaymentGatewayError("Amount must be positive") + return f"ch_{uuid.uuid4().hex[:16]}" + + +def _record_transaction(charge_id: str, intent: PaymentIntent) -> None: + print(f"[DB] recorded charge {charge_id} for customer {intent.customer_id}") + + +def charge(intent: PaymentIntent) -> Optional[ChargeResult]: + """Charge a customer — oversized try block hides bugs in safe code.""" + try: + # gateway call AND all subsequent safe 
operations crammed into one try block + charge_id = _call_gateway(intent) + + # if _record_transaction raises (e.g. DB error), it's caught as PaymentGatewayError + _record_transaction(charge_id, intent) + + # building the result struct is also in try — bugs here are misattributed + result = ChargeResult( + charge_id=charge_id, + amount=intent.amount, + currency=intent.currency, + customer_id=intent.customer_id, + ) + return result + except PaymentGatewayError as exc: + print(f"charge failed: {exc}") + return None + + +def refund(charge_id: str, amount: Optional[Decimal] = None) -> bool: + """Issue a refund — the try block swallows errors from multiple unrelated steps.""" + try: + # all three steps are wrapped together; an error in any one blames the gateway + is_valid = charge_id.startswith("ch_") + if not is_valid: + raise PaymentGatewayError(f"Refund rejected for charge {charge_id}") + print(f"[DB] recorded refund for {charge_id} amount={amount}") + return True + except PaymentGatewayError as exc: + print(f"refund failed: {exc}") + return False diff --git a/priv/combined_metrics/samples/error_handling/try_block_is_minimal/good/payment_processor.py b/priv/combined_metrics/samples/error_handling/try_block_is_minimal/good/payment_processor.py new file mode 100644 index 0000000..e6e7c52 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/try_block_is_minimal/good/payment_processor.py @@ -0,0 +1,78 @@ +"""Payment processor that charges customers and records transactions.""" +from __future__ import annotations + +from dataclasses import dataclass +from decimal import Decimal +from typing import Optional +import uuid + + +class PaymentGatewayError(Exception): + """Raised when the external gateway rejects a charge.""" + + +@dataclass +class PaymentIntent: + amount: Decimal + currency: str + customer_id: str + description: str + + +@dataclass +class ChargeResult: + charge_id: str + amount: Decimal + currency: str + customer_id: str + + +def 
_call_gateway(intent: PaymentIntent) -> str: + """Simulate an external gateway call; returns a charge ID.""" + if intent.amount <= 0: + raise PaymentGatewayError("Amount must be positive") + return f"ch_{uuid.uuid4().hex[:16]}" + + +def _record_transaction(charge_id: str, intent: PaymentIntent) -> None: + """Persist the transaction record (simulated).""" + print(f"[DB] recorded charge {charge_id} for customer {intent.customer_id}") + + +def charge(intent: PaymentIntent) -> ChargeResult: + """Charge a customer, keeping the try block as small as possible. + + Only the gateway call is inside try; recording and building the result + happen outside so any errors there surface with a clean traceback. + """ + try: + charge_id = _call_gateway(intent) # only the risky call is in try + except PaymentGatewayError as exc: + raise PaymentGatewayError( + f"Gateway rejected charge for customer {intent.customer_id}: {exc}" + ) from exc + + # safe operations live outside the try block + _record_transaction(charge_id, intent) + + return ChargeResult( + charge_id=charge_id, + amount=intent.amount, + currency=intent.currency, + customer_id=intent.customer_id, + ) + + +def refund(charge_id: str, amount: Optional[Decimal] = None) -> bool: + """Issue a refund — try wraps only the gateway call.""" + try: + # only the I/O-bound, failure-prone call belongs inside try + success = charge_id.startswith("ch_") # simulated gateway call + except AttributeError as exc: + raise ValueError(f"Invalid charge_id: {charge_id!r}") from exc + + if not success: + raise PaymentGatewayError(f"Refund rejected for charge {charge_id}") + + print(f"[DB] recorded refund for {charge_id} amount={amount}") + return True diff --git a/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/bad/invoice.rs b/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/bad/invoice.rs new file mode 100644 index 0000000..fdbb4c4 --- /dev/null +++ 
b/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/bad/invoice.rs @@ -0,0 +1,41 @@ +pub struct LineItem { + pub description: String, + pub unit_price_cents: u64, + pub quantity: u32, +} + +impl LineItem { + pub fn subtotal(&self) -> u64 { + // Silently wraps on overflow in release builds — wrong amount charged + self.unit_price_cents * self.quantity as u64 + } +} + +pub struct Invoice { + pub items: Vec, + /// Discount in basis points (100 = 1%) + pub discount_bps: u32, +} + +impl Invoice { + pub fn total_cents(&self) -> u64 { + let subtotal: u64 = self.items.iter().map(|i| i.subtotal()).sum(); + + // If discount_bps > 10_000 this underflows to a huge positive number + let after_discount = subtotal * (10_000 - self.discount_bps as u64) / 10_000; + + // Final accumulation also has no overflow check + after_discount + } + + pub fn tax_amount(&self, rate_bps: u32) -> u64 { + let total = self.total_cents(); + // Multiplication can overflow for large totals + total * rate_bps as u64 / 10_000 + } + + pub fn grand_total(&self, tax_rate_bps: u32) -> u64 { + // Adding two potentially wrapped values — wrong result silently returned + self.total_cents() + self.tax_amount(tax_rate_bps) + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/good/invoice.rs b/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/good/invoice.rs new file mode 100644 index 0000000..4d160c1 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_checked_arithmetic/good/invoice.rs @@ -0,0 +1,69 @@ +use std::fmt; + +#[derive(Debug)] +pub enum InvoiceError { + LineItemOverflow { item: String }, + TotalOverflow, + DiscountOutOfRange(u32), +} + +impl fmt::Display for InvoiceError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + InvoiceError::LineItemOverflow { item } => { + write!(f, "line item subtotal overflowed for: {item}") + } + InvoiceError::TotalOverflow => write!(f, "invoice total 
overflowed u64"), + InvoiceError::DiscountOutOfRange(d) => { + write!(f, "discount {d} basis points exceeds 100%") + } + } + } +} + +#[derive(Debug)] +pub struct LineItem { + pub description: String, + pub unit_price_cents: u64, + pub quantity: u32, +} + +impl LineItem { + pub fn subtotal(&self) -> Result { + let qty = self.quantity as u64; + self.unit_price_cents + .checked_mul(qty) + .ok_or_else(|| InvoiceError::LineItemOverflow { + item: self.description.clone(), + }) + } +} + +pub struct Invoice { + pub items: Vec, + /// Discount in basis points (100 = 1%) + pub discount_bps: u32, +} + +impl Invoice { + pub fn total_cents(&self) -> Result { + if self.discount_bps > 10_000 { + return Err(InvoiceError::DiscountOutOfRange(self.discount_bps)); + } + + let subtotal = self + .items + .iter() + .try_fold(0u64, |acc, item| { + item.subtotal()? + .checked_add(acc) + .ok_or(InvoiceError::TotalOverflow) + })?; + + let discount_factor = 10_000 - self.discount_bps as u64; + subtotal + .checked_mul(discount_factor) + .and_then(|n| n.checked_div(10_000)) + .ok_or(InvoiceError::TotalOverflow) + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/bad/store.rs b/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/bad/store.rs new file mode 100644 index 0000000..e1d6bb3 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/bad/store.rs @@ -0,0 +1,51 @@ +use std::collections::HashMap; + +pub struct BoundedStore { + data: HashMap>, + max_entries: usize, + max_bytes: u64, + used_bytes: u64, +} + +impl BoundedStore { + pub fn new(max_entries: usize, max_bytes: u64) -> Self { + Self { + data: HashMap::new(), + max_entries, + max_bytes, + used_bytes: 0, + } + } + + // Returning raw &str errors: no structure, callers can't match on variants + pub fn insert(&mut self, key: String, value: Vec) -> Result<(), &'static str> { + if self.data.len() >= self.max_entries && 
!self.data.contains_key(&key) { + // Cannot include actual limit in static string + return Err("capacity exceeded"); + } + let new_bytes = self.used_bytes + value.len() as u64; + if new_bytes > self.max_bytes { + // Cannot communicate how full the store is + return Err("storage full"); + } + self.used_bytes = new_bytes; + self.data.insert(key, value); + Ok(()) + } + + // Returning String errors: slightly better, but callers can't pattern match + pub fn get(&self, key: &str) -> Result<&[u8], String> { + self.data + .get(key) + .map(Vec::as_slice) + // key is in the message, but only as a substring — fragile to parse + .ok_or_else(|| format!("not found: {key}")) + } + + pub fn remove(&mut self, key: &str) -> Result, String> { + self.data + .remove(key) + .ok_or_else(|| "key does not exist".to_string()) + // No key context — caller cannot tell which key was missing + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/good/store.rs b/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/good/store.rs new file mode 100644 index 0000000..86b8f5e --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_custom_error_type_with_context/good/store.rs @@ -0,0 +1,70 @@ +use std::collections::HashMap; +use std::fmt; + +#[derive(Debug)] +pub enum StoreError { + NotFound { key: String }, + CapacityExceeded { limit: usize }, + SerializationFailed { key: String, reason: String }, + StorageFull { used_bytes: u64, max_bytes: u64 }, +} + +impl fmt::Display for StoreError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StoreError::NotFound { key } => write!(f, "key not found: '{key}'"), + StoreError::CapacityExceeded { limit } => { + write!(f, "store capacity of {limit} entries exceeded") + } + StoreError::SerializationFailed { key, reason } => { + write!(f, "failed to serialize value for key '{key}': {reason}") + } + StoreError::StorageFull { used_bytes, max_bytes } 
=> { + write!(f, "storage full: {used_bytes}/{max_bytes} bytes used") + } + } + } +} + +impl std::error::Error for StoreError {} + +pub struct BoundedStore { + data: HashMap>, + max_entries: usize, + max_bytes: u64, + used_bytes: u64, +} + +impl BoundedStore { + pub fn new(max_entries: usize, max_bytes: u64) -> Self { + Self { + data: HashMap::new(), + max_entries, + max_bytes, + used_bytes: 0, + } + } + + pub fn insert(&mut self, key: String, value: Vec) -> Result<(), StoreError> { + if self.data.len() >= self.max_entries && !self.data.contains_key(&key) { + return Err(StoreError::CapacityExceeded { limit: self.max_entries }); + } + let new_bytes = self.used_bytes + value.len() as u64; + if new_bytes > self.max_bytes { + return Err(StoreError::StorageFull { + used_bytes: self.used_bytes, + max_bytes: self.max_bytes, + }); + } + self.used_bytes = new_bytes; + self.data.insert(key, value); + Ok(()) + } + + pub fn get(&self, key: &str) -> Result<&[u8], StoreError> { + self.data + .get(key) + .map(Vec::as_slice) + .ok_or_else(|| StoreError::NotFound { key: key.to_string() }) + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/bad/router.go b/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/bad/router.go new file mode 100644 index 0000000..c153d1b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/bad/router.go @@ -0,0 +1,44 @@ +package router + +import ( + "net/http" +) + +// ValidationError represents a field-level validation failure. +type ValidationError struct { + Field string + Message string +} + +func (e *ValidationError) Error() string { + return e.Field + ": " + e.Message +} + +type ProductHandler struct { + service ProductService +} + +type ProductService interface { + Create(name string, price float64) error +} + +// CreateProduct handles product creation and maps ValidationError to 400. 
+// It uses a direct type assertion, which fails silently when the error is wrapped. +func (h *ProductHandler) CreateProduct(w http.ResponseWriter, r *http.Request) { + name := r.FormValue("name") + price := 0.0 + + err := h.service.Create(name, price) + if err == nil { + w.WriteHeader(http.StatusCreated) + return + } + + // Direct type assertion fails when the error is wrapped with fmt.Errorf("%w", ve). + if ve, ok := err.(*ValidationError); ok { + http.Error(w, "validation error: "+ve.Field+": "+ve.Message, http.StatusBadRequest) + return + } + + http.Error(w, "internal server error", http.StatusInternalServerError) +} diff --git a/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/good/router.go b/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/good/router.go new file mode 100644 index 0000000..e92ca34 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_errors_as_for_type_assertion/good/router.go @@ -0,0 +1,46 @@ +package router + +import ( + "errors" + "net/http" +) + +// ValidationError represents a field-level validation failure. +type ValidationError struct { + Field string + Message string +} + +func (e *ValidationError) Error() string { + return e.Field + ": " + e.Message +} + +type ProductHandler struct { + service ProductService +} + +type ProductService interface { + Create(name string, price float64) error +} + +// CreateProduct handles product creation and maps ValidationError to 400. +// It uses errors.As to correctly unwrap errors in a chain. +func (h *ProductHandler) CreateProduct(w http.ResponseWriter, r *http.Request) { + name := r.FormValue("name") + price := 0.0 + + err := h.service.Create(name, price) + if err == nil { + w.WriteHeader(http.StatusCreated) + return + } + + // errors.As traverses the error chain to find *ValidationError even when wrapped. 
+ var ve *ValidationError + if errors.As(err, &ve) { + http.Error(w, "validation error: "+ve.Field+": "+ve.Message, http.StatusBadRequest) + return + } + + http.Error(w, "internal server error", http.StatusInternalServerError) +} diff --git a/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/bad/middleware.go b/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/bad/middleware.go new file mode 100644 index 0000000..bec84b2 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/bad/middleware.go @@ -0,0 +1,42 @@ +package middleware + +import ( + "errors" + "net/http" +) + +var ( + ErrUnauthorized = errors.New("unauthorized") + ErrForbidden = errors.New("forbidden") +) + +type AuthService interface { + Validate(token string) error +} + +// RequireAuth returns a middleware that validates the Bearer token. +// It uses == for error comparison, which breaks when errors are wrapped. +func RequireAuth(auth AuthService) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + token := r.Header.Get("Authorization") + err := auth.Validate(token) + if err == nil { + next.ServeHTTP(w, r) + return + } + + // Direct == comparison fails when err is wrapped with fmt.Errorf("%w"). 
+ if err == ErrUnauthorized { + http.Error(w, "authentication required", http.StatusUnauthorized) + return + } + if err == ErrForbidden { + http.Error(w, "access denied", http.StatusForbidden) + return + } + + http.Error(w, "internal server error", http.StatusInternalServerError) + }) + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/good/middleware.go b/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/good/middleware.go new file mode 100644 index 0000000..01e3a4e --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_errors_is_for_sentinel_comparison/good/middleware.go @@ -0,0 +1,42 @@ +package middleware + +import ( + "errors" + "net/http" +) + +var ( + ErrUnauthorized = errors.New("unauthorized") + ErrForbidden = errors.New("forbidden") +) + +type AuthService interface { + Validate(token string) error +} + +// RequireAuth returns a middleware that validates the Bearer token. +// It uses errors.Is to correctly match sentinel errors through any wrapping. +func RequireAuth(auth AuthService) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + token := r.Header.Get("Authorization") + err := auth.Validate(token) + if err == nil { + next.ServeHTTP(w, r) + return + } + + // errors.Is traverses the error chain, so wrapped sentinels are matched correctly. 
+ if errors.Is(err, ErrUnauthorized) { + http.Error(w, "authentication required", http.StatusUnauthorized) + return + } + if errors.Is(err, ErrForbidden) { + http.Error(w, "access denied", http.StatusForbidden) + return + } + + http.Error(w, "internal server error", http.StatusInternalServerError) + }) + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/bad/OrderService.php b/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/bad/OrderService.php new file mode 100644 index 0000000..cc7773c --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/bad/OrderService.php @@ -0,0 +1,66 @@ +inventory->getAvailableQuantity($item['product_id']); + if ($available < $item['quantity']) { + return -1; // Caller must know magic codes + } + } + + $order = new \stdClass(); + $order->customerId = $customerId; + $order->status = 'pending'; + $order->items = $items; + $saved = $this->orders->save($order); + + if (!$saved) { + return false; // Is false a different error than -1? + } + + return $order; + } + + public function confirm($orderId, $paymentToken) + { + $order = $this->orders->findById($orderId); + if ($order === null) { + return null; // Caller must null-check + } + + if ($order->status !== 'pending') { + return -2; // Magic number for wrong state + } + + $charged = $this->payments->charge($order->total, $paymentToken); + if (!$charged) { + return -3; // Magic number for payment failed + } + + $order->status = 'confirmed'; + $this->orders->save($order); + + return $order; + } + + public function getOrFail($orderId) + { + $order = $this->orders->findById($orderId); + + // Returns false — caller must remember to check === false + return $order ?? 
false; + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/good/OrderService.php b/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/good/OrderService.php new file mode 100644 index 0000000..2c3d70b --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_exceptions_not_error_codes/good/OrderService.php @@ -0,0 +1,72 @@ +inventory->getAvailableQuantity($item['product_id']); + if ($available < $item['quantity']) { + throw new InsufficientStockException( + "Product {$item['product_id']} has only {$available} units available" + ); + } + } + + $order = new Order(customerId: $customerId, status: 'pending'); + foreach ($items as $item) { + $order->addItem($item['product_id'], $item['quantity'], $item['unit_price']); + } + $this->orders->save($order); + + return $order; + } + + public function confirm(int $orderId, string $paymentToken): Order + { + $order = $this->orders->findById($orderId); + if ($order === null) { + throw new OrderNotFoundException("Order {$orderId} not found"); + } + + if ($order->getStatus() !== 'pending') { + throw new InvalidOrderStateException( + "Cannot confirm order in state '{$order->getStatus()}'" + ); + } + + $this->payments->charge($order->getTotal(), $paymentToken); + + $order->setStatus('confirmed'); + $this->orders->save($order); + + return $order; + } + + public function getOrFail(int $orderId): Order + { + $order = $this->orders->findById($orderId); + if ($order === null) { + throw new OrderNotFoundException("Order {$orderId} not found"); + } + + return $order; + } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/bad/loader.rs b/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/bad/loader.rs new file mode 100644 index 0000000..ea5190d --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/bad/loader.rs @@ -0,0 +1,58 @@ +use std::fs; 
+use std::io; +use std::path::Path; + +pub struct FileRecord { + pub path: String, + pub lines: Vec, + pub byte_count: usize, +} + +pub fn load_record(path: &Path) -> Result { + // Manual match instead of ? operator — noisy and error-prone + let raw = match fs::read(path) { + Ok(bytes) => bytes, + Err(e) => return Err(format!("IO error: {e}")), + }; + + let content = match String::from_utf8(raw.clone()) { + Ok(s) => s, + Err(e) => return Err(format!("encoding error: {e}")), + }; + + if content.trim().is_empty() { + return Err(format!("file is empty: {}", path.display())); + } + + let lines: Vec = content.lines().map(str::to_string).collect(); + + Ok(FileRecord { + path: path.display().to_string(), + lines, + byte_count: raw.len(), + }) +} + +pub fn load_all(dir: &Path) -> Result, String> { + let read_dir = match fs::read_dir(dir) { + Ok(rd) => rd, + Err(e) => return Err(format!("cannot read dir: {e}")), + }; + + let mut records = Vec::new(); + for entry_result in read_dir { + let entry = match entry_result { + Ok(e) => e, + Err(e) => return Err(format!("dir entry error: {e}")), + }; + let path = entry.path(); + if path.extension().and_then(|e| e.to_str()) == Some("txt") { + let record = match load_record(&path) { + Ok(r) => r, + Err(e) => return Err(e), + }; + records.push(record); + } + } + Ok(records) +} diff --git a/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/good/loader.rs b/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/good/loader.rs new file mode 100644 index 0000000..d209aae --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_question_mark_for_propagation/good/loader.rs @@ -0,0 +1,63 @@ +use std::fs; +use std::io; +use std::path::Path; + +#[derive(Debug)] +pub enum LoadError { + Io(io::Error), + InvalidEncoding(String), + EmptyFile(String), +} + +impl From for LoadError { + fn from(e: io::Error) -> Self { + LoadError::Io(e) + } +} + +impl std::fmt::Display for LoadError { 
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LoadError::Io(e) => write!(f, "IO error: {e}"), + LoadError::InvalidEncoding(msg) => write!(f, "encoding error: {msg}"), + LoadError::EmptyFile(path) => write!(f, "file is empty: {path}"), + } + } +} + +pub struct FileRecord { + pub path: String, + pub lines: Vec, + pub byte_count: usize, +} + +pub fn load_record(path: &Path) -> Result { + let raw = fs::read(path)?; + let content = String::from_utf8(raw.clone()).map_err(|e| { + LoadError::InvalidEncoding(format!("{}: {e}", path.display())) + })?; + + if content.trim().is_empty() { + return Err(LoadError::EmptyFile(path.display().to_string())); + } + + let lines: Vec = content.lines().map(str::to_string).collect(); + + Ok(FileRecord { + path: path.display().to_string(), + lines, + byte_count: raw.len(), + }) +} + +pub fn load_all(dir: &Path) -> Result, LoadError> { + let mut records = Vec::new(); + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|e| e.to_str()) == Some("txt") { + records.push(load_record(&path)?); + } + } + Ok(records) +} diff --git a/priv/combined_metrics/samples/error_handling/uses_raise_from/bad/config_loader.py b/priv/combined_metrics/samples/error_handling/uses_raise_from/bad/config_loader.py new file mode 100644 index 0000000..7a97e95 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_raise_from/bad/config_loader.py @@ -0,0 +1,68 @@ +"""Configuration loader that reads YAML/JSON config files and validates them.""" +from __future__ import annotations + +import json +import os +from typing import Any + + +class ConfigError(Exception): + """Raised when configuration cannot be loaded or is invalid.""" + + +class MissingKeyError(ConfigError): + """Raised when a required configuration key is absent.""" + + +def _read_file(path: str) -> str: + try: + with open(path, encoding="utf-8") as fh: + return fh.read() + except 
FileNotFoundError: + # original FileNotFoundError context is lost — no 'from exc' + raise ConfigError(f"Configuration file not found: {path}") + except PermissionError: + raise ConfigError(f"Cannot read configuration file: {path}") + + +def _parse_json(raw: str, path: str) -> dict[str, Any]: + try: + return json.loads(raw) + except json.JSONDecodeError: + # the precise parse error (line, column) vanishes from the traceback + raise ConfigError(f"Invalid JSON in configuration file {path}") + + +def _require_key(config: dict[str, Any], key: str) -> Any: + try: + return config[key] + except KeyError: + # KeyError is silently replaced — no chain, harder to debug + raise MissingKeyError(f"Required configuration key {key!r} is missing") + + +def load(path: str) -> dict[str, Any]: + """Load and validate a JSON configuration file.""" + raw = _read_file(path) + config = _parse_json(raw, path) + + database_url = _require_key(config, "database_url") + secret_key = _require_key(config, "secret_key") + debug = config.get("debug", False) + + return { + "database_url": database_url, + "secret_key": secret_key, + "debug": debug, + "raw": config, + } + + +def load_from_env_or_file(env_var: str, fallback_path: str) -> dict[str, Any]: + """Load config from an env var path or fall back to a default file.""" + path = os.environ.get(env_var, fallback_path) + try: + return load(path) + except ConfigError: + # wraps again without from — traceback chain is broken at every level + raise ConfigError(f"Failed to load config (env {env_var}={path!r})") diff --git a/priv/combined_metrics/samples/error_handling/uses_raise_from/good/config_loader.py b/priv/combined_metrics/samples/error_handling/uses_raise_from/good/config_loader.py new file mode 100644 index 0000000..99ae990 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_raise_from/good/config_loader.py @@ -0,0 +1,72 @@ +"""Configuration loader that reads YAML/JSON config files and validates them.""" +from __future__ import 
annotations + +import json +import os +from typing import Any + + +class ConfigError(Exception): + """Raised when configuration cannot be loaded or is invalid.""" + + +class MissingKeyError(ConfigError): + """Raised when a required configuration key is absent.""" + + +def _read_file(path: str) -> str: + try: + with open(path, encoding="utf-8") as fh: + return fh.read() + except FileNotFoundError as exc: + raise ConfigError(f"Configuration file not found: {path}") from exc + except PermissionError as exc: + raise ConfigError(f"Cannot read configuration file: {path}") from exc + + +def _parse_json(raw: str, path: str) -> dict[str, Any]: + try: + return json.loads(raw) + except json.JSONDecodeError as exc: + # raise from preserves the JSON parse error in the exception chain + raise ConfigError( + f"Invalid JSON in configuration file {path}: {exc.msg} " + f"(line {exc.lineno}, col {exc.colno})" + ) from exc + + +def _require_key(config: dict[str, Any], key: str) -> Any: + try: + return config[key] + except KeyError as exc: + raise MissingKeyError( + f"Required configuration key {key!r} is missing" + ) from exc + + +def load(path: str) -> dict[str, Any]: + """Load and validate a JSON configuration file.""" + raw = _read_file(path) + config = _parse_json(raw, path) + + database_url = _require_key(config, "database_url") + secret_key = _require_key(config, "secret_key") + debug = config.get("debug", False) + + return { + "database_url": database_url, + "secret_key": secret_key, + "debug": debug, + "raw": config, + } + + +def load_from_env_or_file(env_var: str, fallback_path: str) -> dict[str, Any]: + """Load config from an env var path or fall back to a default file.""" + path = os.environ.get(env_var, fallback_path) + try: + return load(path) + except ConfigError as exc: + raise ConfigError( + f"Failed to load config (env {env_var}={path!r})" + ) from exc diff --git a/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/bad/parser.rs 
b/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/bad/parser.rs new file mode 100644 index 0000000..d1668dc --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/bad/parser.rs @@ -0,0 +1,50 @@ +pub struct Config { + pub host: String, + pub port: u16, + pub max_connections: usize, +} + +pub fn parse_config(raw: &str) -> Config { + let mut host = String::new(); + let mut port: u16 = 0; + let mut max_connections: usize = 10; + + for line in raw.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + + // Panics if line has no '=' separator — caller cannot recover + let parts: Vec<&str> = line.splitn(2, '=').collect(); + if parts.len() != 2 { + panic!("malformed config line: {line}"); + } + + let key = parts[0].trim(); + let value = parts[1].trim(); + + match key { + "host" => host = value.to_string(), + "port" => { + // Panics on invalid port — even a typo in the config file crashes + port = value.parse().unwrap(); + } + "max_connections" => { + max_connections = value.parse().unwrap(); + } + _ => {} + } + } + + if host.is_empty() { + // Missing config key is recoverable but we panic anyway + panic!("config missing required field: host"); + } + + if port == 0 { + panic!("config missing required field: port"); + } + + Config { host, port, max_connections } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/good/parser.rs b/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/good/parser.rs new file mode 100644 index 0000000..4f99ea2 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_result_for_recoverable_errors/good/parser.rs @@ -0,0 +1,74 @@ +use std::num::ParseIntError; +use std::fmt; + +#[derive(Debug)] +pub enum ConfigError { + MissingField(String), + InvalidValue { field: String, reason: String }, + ParseError(ParseIntError), +} + +impl fmt::Display 
for ConfigError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ConfigError::MissingField(field) => write!(f, "missing required field: {field}"), + ConfigError::InvalidValue { field, reason } => { + write!(f, "invalid value for '{field}': {reason}") + } + ConfigError::ParseError(e) => write!(f, "parse error: {e}"), + } + } +} + +impl From for ConfigError { + fn from(e: ParseIntError) -> Self { + ConfigError::ParseError(e) + } +} + +pub struct Config { + pub host: String, + pub port: u16, + pub max_connections: usize, +} + +pub fn parse_config(raw: &str) -> Result { + let mut host = None; + let mut port = None; + let mut max_connections = None; + + for line in raw.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + let (key, value) = line.split_once('=').ok_or_else(|| { + ConfigError::InvalidValue { + field: line.to_string(), + reason: "expected key=value format".to_string(), + } + })?; + + match key.trim() { + "host" => host = Some(value.trim().to_string()), + "port" => { + let p: u16 = value.trim().parse().map_err(|_| ConfigError::InvalidValue { + field: "port".to_string(), + reason: "must be a number between 1 and 65535".to_string(), + })?; + port = Some(p); + } + "max_connections" => { + let n: usize = value.trim().parse()?; + max_connections = Some(n); + } + _ => {} + } + } + + Ok(Config { + host: host.ok_or_else(|| ConfigError::MissingField("host".to_string()))?, + port: port.ok_or_else(|| ConfigError::MissingField("port".to_string()))?, + max_connections: max_connections.unwrap_or(10), + }) +} diff --git a/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/bad/AuthService.swift b/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/bad/AuthService.swift new file mode 100644 index 0000000..2ce1e80 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/bad/AuthService.swift @@ -0,0 
+1,65 @@ +import Foundation + +struct AuthToken { + let value: String + let expiresAt: Date + let userID: String +} + +struct Credentials { + let username: String + let password: String +} + +class AuthService { + private var failedAttempts: [String: Int] = [:] + private let maxAttempts = 5 + + // Returns nil for all failure modes — caller cannot distinguish between + // invalid credentials, account locked, network error, etc. + func login(with credentials: Credentials) -> AuthToken? { + guard !credentials.username.isEmpty, !credentials.password.isEmpty else { + return nil + } + + let attempts = failedAttempts[credentials.username, default: 0] + if attempts >= maxAttempts { + return nil + } + + guard isReachable() else { + return nil + } + + guard validateCredentials(credentials) else { + failedAttempts[credentials.username, default: 0] += 1 + return nil + } + + failedAttempts.removeValue(forKey: credentials.username) + return AuthToken( + value: generateToken(), + expiresAt: Date().addingTimeInterval(3600), + userID: credentials.username + ) + } + + // Returns nil with no way to tell if expired or invalid + func validateToken(_ token: AuthToken) -> Bool? 
{ + guard token.expiresAt > Date() else { + return nil + } + return true + } + + // Returns false for both "no permission" and "token invalid" + func requirePermission(_ permission: String, for token: AuthToken) -> Bool { + guard token.expiresAt > Date() else { return false } + return hasPermission(permission, userID: token.userID) + } + + private func validateCredentials(_ credentials: Credentials) -> Bool { true } + private func generateToken() -> String { UUID().uuidString } + private func isReachable() -> Bool { true } + private func hasPermission(_ permission: String, userID: String) -> Bool { true } +} diff --git a/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/good/AuthService.swift b/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/good/AuthService.swift new file mode 100644 index 0000000..b0862b7 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/uses_throws_for_recoverable_errors/good/AuthService.swift @@ -0,0 +1,70 @@ +import Foundation + +enum AuthError: Error { + case invalidCredentials + case accountLocked(until: Date) + case networkUnavailable + case tokenExpired + case insufficientPermissions(required: String) +} + +struct AuthToken { + let value: String + let expiresAt: Date + let userID: String +} + +struct Credentials { + let username: String + let password: String +} + +class AuthService { + private var failedAttempts: [String: Int] = [:] + private let maxAttempts = 5 + + func login(with credentials: Credentials) throws -> AuthToken { + guard !credentials.username.isEmpty, !credentials.password.isEmpty else { + throw AuthError.invalidCredentials + } + + let attempts = failedAttempts[credentials.username, default: 0] + if attempts >= maxAttempts { + let lockoutEnd = Date().addingTimeInterval(15 * 60) + throw AuthError.accountLocked(until: lockoutEnd) + } + + guard isReachable() else { + throw AuthError.networkUnavailable + } + + guard validateCredentials(credentials) 
else { + failedAttempts[credentials.username, default: 0] += 1 + throw AuthError.invalidCredentials + } + + failedAttempts.removeValue(forKey: credentials.username) + return AuthToken( + value: generateToken(), + expiresAt: Date().addingTimeInterval(3600), + userID: credentials.username + ) + } + + func validateToken(_ token: AuthToken) throws { + guard token.expiresAt > Date() else { + throw AuthError.tokenExpired + } + } + + func requirePermission(_ permission: String, for token: AuthToken) throws { + guard hasPermission(permission, userID: token.userID) else { + throw AuthError.insufficientPermissions(required: permission) + } + } + + private func validateCredentials(_ credentials: Credentials) -> Bool { true } + private func generateToken() -> String { UUID().uuidString } + private func isReachable() -> Bool { true } + private func hasPermission(_ permission: String, userID: String) -> Bool { true } +} diff --git a/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/bad/repository.go b/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/bad/repository.go new file mode 100644 index 0000000..18a4ad3 --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/bad/repository.go @@ -0,0 +1,50 @@ +package repository + +import ( + "context" + "database/sql" + "errors" + "fmt" +) + +var ErrNotFound = errors.New("not found") + +type Invoice struct { + ID int64 + CustomerID int64 + Amount float64 +} + +type InvoiceRepository struct { + db *sql.DB +} + +func New(db *sql.DB) *InvoiceRepository { + return &InvoiceRepository{db: db} +} + +func (r *InvoiceRepository) FindByID(ctx context.Context, id int64) (*Invoice, error) { + row := r.db.QueryRowContext(ctx, + `SELECT id, customer_id, amount FROM invoices WHERE id = $1`, id) + + var inv Invoice + if err := row.Scan(&inv.ID, &inv.CustomerID, &inv.Amount); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + // Returns a new 
error that discards the original — callers lose all context. + return nil, fmt.Errorf("database error") + } + return &inv, nil +} + +func (r *InvoiceRepository) Save(ctx context.Context, inv *Invoice) error { + _, err := r.db.ExecContext(ctx, + `INSERT INTO invoices (customer_id, amount) VALUES ($1, $2)`, + inv.CustomerID, inv.Amount) + if err != nil { + // No wrapping, no context — callers see only a bare message. + return errors.New("save failed") + } + return nil +} diff --git a/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/good/repository.go b/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/good/repository.go new file mode 100644 index 0000000..f31e17a --- /dev/null +++ b/priv/combined_metrics/samples/error_handling/wraps_errors_with_context/good/repository.go @@ -0,0 +1,49 @@ +package repository + +import ( + "context" + "database/sql" + "errors" + "fmt" +) + +var ErrNotFound = errors.New("not found") + +type Invoice struct { + ID int64 + CustomerID int64 + Amount float64 +} + +type InvoiceRepository struct { + db *sql.DB +} + +func New(db *sql.DB) *InvoiceRepository { + return &InvoiceRepository{db: db} +} + +func (r *InvoiceRepository) FindByID(ctx context.Context, id int64) (*Invoice, error) { + row := r.db.QueryRowContext(ctx, + `SELECT id, customer_id, amount FROM invoices WHERE id = $1`, id) + + var inv Invoice + if err := row.Scan(&inv.ID, &inv.CustomerID, &inv.Amount); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, fmt.Errorf("find invoice %d: %w", id, ErrNotFound) + } + // Wraps the database error so callers can inspect it with errors.Is/As. 
+ return nil, fmt.Errorf("find invoice %d: %w", id, err) + } + return &inv, nil +} + +func (r *InvoiceRepository) Save(ctx context.Context, inv *Invoice) error { + _, err := r.db.ExecContext(ctx, + `INSERT INTO invoices (customer_id, amount) VALUES ($1, $2)`, + inv.CustomerID, inv.Amount) + if err != nil { + return fmt.Errorf("save invoice for customer %d: %w", inv.CustomerID, err) + } + return nil +} diff --git a/priv/combined_metrics/samples/file_structure/has_consistent_indentation/bad/worker.ex b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/bad/worker.ex new file mode 100644 index 0000000..f11b5cf --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/bad/worker.ex @@ -0,0 +1,73 @@ +defmodule Worker do + @moduledoc """ + Background worker for processing queued jobs. + """ + + def start(queue) do + jobs = fetch_jobs(queue) + Enum.each(jobs, fn job -> + process(job) + end) + end + + def process(job) do + case job.type do + :email -> + send_email(job) + :report -> + generate_report(job) + _ -> + {:error, :unknown_type} + end + end + + def retry(job, attempts) do + if attempts > 0 do + case process(job) do + :ok -> :ok + {:error, _} -> + retry(job, attempts - 1) + end + else + {:error, :max_retries_exceeded} + end + end + + def schedule(job, delay_ms) do + Process.send_after(self(), {:run, job}, delay_ms) + :ok + end + + def cancel(job_id) do + case find_job(job_id) do + nil -> + {:error, :not_found} + job -> + do_cancel(job) + end + end + + def status(job_id) do + case find_job(job_id) do + nil -> {:error, :not_found} + job -> + {:ok, job.status} + end + end + + def drain(queue) do + jobs = fetch_jobs(queue) + Enum.reduce(jobs, {[], []}, fn job, {ok, err} -> + case process(job) do + :ok -> {[job | ok], err} + {:error, _} -> {ok, [job | err]} + end + end) + end + + defp fetch_jobs(_queue), do: [] + defp send_email(_job), do: :ok + defp generate_report(_job), do: :ok + defp find_job(_id), do: nil + defp 
do_cancel(_job), do: :ok +end diff --git a/priv/combined_metrics/samples/file_structure/has_consistent_indentation/config.yml b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/config.yml new file mode 100644 index 0000000..f3fd8c4 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/config.yml @@ -0,0 +1 @@ +doc: "Files should use a single, consistent indentation style with no mixed tabs and spaces." diff --git a/priv/combined_metrics/samples/file_structure/has_consistent_indentation/good/worker.ex b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/good/worker.ex new file mode 100644 index 0000000..7edae97 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/has_consistent_indentation/good/worker.ex @@ -0,0 +1,67 @@ +defmodule Worker do + @moduledoc """ + Background worker for processing queued jobs. + """ + + def start(queue) do + jobs = fetch_jobs(queue) + Enum.each(jobs, fn job -> + process(job) + end) + end + + def process(job) do + case job.type do + :email -> send_email(job) + :report -> generate_report(job) + _ -> {:error, :unknown_type} + end + end + + def retry(job, attempts) do + if attempts > 0 do + case process(job) do + :ok -> :ok + {:error, _} -> retry(job, attempts - 1) + end + else + {:error, :max_retries_exceeded} + end + end + + def schedule(job, delay_ms) do + Process.send_after(self(), {:run, job}, delay_ms) + :ok + end + + def cancel(job_id) do + case find_job(job_id) do + nil -> {:error, :not_found} + job -> do_cancel(job) + end + end + + def status(job_id) do + case find_job(job_id) do + nil -> {:error, :not_found} + job -> {:ok, job.status} + end + end + + def drain(queue) do + jobs = fetch_jobs(queue) + + Enum.reduce(jobs, {[], []}, fn job, {ok, err} -> + case process(job) do + :ok -> {[job | ok], err} + {:error, _} -> {ok, [job | err]} + end + end) + end + + defp fetch_jobs(_queue), do: [] + defp send_email(_job), do: :ok + defp 
generate_report(_job), do: :ok + defp find_job(_id), do: nil + defp do_cancel(_job), do: :ok +end diff --git a/priv/combined_metrics/samples/file_structure/headers_have_include_guards/bad/Parser.cpp b/priv/combined_metrics/samples/file_structure/headers_have_include_guards/bad/Parser.cpp new file mode 100644 index 0000000..bee9773 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/headers_have_include_guards/bad/Parser.cpp @@ -0,0 +1,121 @@ +// This file demonstrates a translation unit that includes headers WITHOUT +// include guards. If these headers are included more than once — directly +// or transitively — the compiler sees duplicate declarations and definitions, +// causing errors or subtle ODR (One Definition Rule) violations. + +// token.h (inline for demonstration — NO include guard) +// ----------------------------------------------- +// // No #pragma once and no #ifndef guard +// +// enum class TokenKind { Identifier, Number, Operator, EndOfStream }; +// +// struct Token { +// TokenKind kind; +// std::string value; +// int line; +// }; +// ----------------------------------------------- + +// parse_error.h (inline for demonstration — NO include guard) +// ----------------------------------------------- +// // No #pragma once and no #ifndef guard +// +// #include +// #include +// +// // If parse_error.h is included by both Parser.cpp and another header that +// // Parser.cpp also includes, ParseError is defined twice → compile error. +// class ParseError : public std::runtime_error { +// public: +// explicit ParseError(const std::string& msg, int line) +// : std::runtime_error(msg), line_(line) {} +// int line() const noexcept { return line_; } +// private: +// int line_; +// }; +// ----------------------------------------------- + +#include +#include +#include + +// Simulated second include of the same unguarded header content: +// In a real project this happens via transitive includes. 
+// Without guards, the declarations below would appear twice — compile error. + +enum class TokenKind { Identifier, Number, Operator, EndOfStream }; + +struct Token { // first definition + TokenKind kind; + std::string value; + int line; +}; + +// struct Token { ← if the header were included again, this would be a redefinition +// TokenKind kind; +// std::string value; +// int line; +// }; + +class ParseError : public std::runtime_error { +public: + explicit ParseError(const std::string& msg, int line) + : std::runtime_error(msg), line_(line) {} + int line() const noexcept { return line_; } +private: + int line_; +}; + +class Parser { +public: + explicit Parser(std::string source) + : source_(std::move(source)), pos_(0), currentLine_(1) {} + + std::vector tokenize() { + std::vector tokens; + while (pos_ < source_.size()) { + skipWhitespace(); + if (pos_ >= source_.size()) break; + + char c = source_[pos_]; + if (std::isalpha(static_cast(c))) + tokens.push_back(readIdentifier()); + else if (std::isdigit(static_cast(c))) + tokens.push_back(readNumber()); + else + tokens.push_back(readOperator()); + } + tokens.push_back({TokenKind::EndOfStream, "", currentLine_}); + return tokens; + } + +private: + std::string source_; + std::size_t pos_; + int currentLine_; + + void skipWhitespace() { + while (pos_ < source_.size() && std::isspace(static_cast(source_[pos_]))) { + if (source_[pos_] == '\n') ++currentLine_; + ++pos_; + } + } + + Token readIdentifier() { + std::string value; + while (pos_ < source_.size() && std::isalnum(static_cast(source_[pos_]))) + value += source_[pos_++]; + return {TokenKind::Identifier, std::move(value), currentLine_}; + } + + Token readNumber() { + std::string value; + while (pos_ < source_.size() && std::isdigit(static_cast(source_[pos_]))) + value += source_[pos_++]; + return {TokenKind::Number, std::move(value), currentLine_}; + } + + Token readOperator() { + return {TokenKind::Operator, std::string(1, source_[pos_++]), currentLine_}; + } +}; 
diff --git a/priv/combined_metrics/samples/file_structure/headers_have_include_guards/good/Parser.cpp b/priv/combined_metrics/samples/file_structure/headers_have_include_guards/good/Parser.cpp new file mode 100644 index 0000000..09df204 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/headers_have_include_guards/good/Parser.cpp @@ -0,0 +1,116 @@ +// This file demonstrates a translation unit that includes multiple headers, +// each of which is protected by an include guard (or #pragma once). +// The guards ensure that even if the same header is transitively included +// multiple times, its contents are only processed once by the compiler. + +// token.h (inline for demonstration) +// ----------------------------------------------- +// #pragma once ← include guard: #pragma once form +// +// enum class TokenKind { Identifier, Number, Operator, EndOfStream }; +// +// struct Token { +// TokenKind kind; +// std::string value; +// int line; +// }; +// ----------------------------------------------- + +// parse_error.h (inline for demonstration) +// ----------------------------------------------- +// #ifndef MYAPP_PARSE_ERROR_H ← include guard: #define guard form +// #define MYAPP_PARSE_ERROR_H +// +// #include +// #include +// +// class ParseError : public std::runtime_error { +// public: +// explicit ParseError(const std::string& msg, int line) +// : std::runtime_error(msg), line_(line) {} +// int line() const noexcept { return line_; } +// private: +// int line_; +// }; +// +// #endif // MYAPP_PARSE_ERROR_H +// ----------------------------------------------- + +#include +#include +#include +#include + +// Both headers above are guarded; including them multiple times (e.g., via +// transitive includes) is safe and idiomatic. 
+ +enum class TokenKind { Identifier, Number, Operator, EndOfStream }; + +struct Token { + TokenKind kind; + std::string value; + int line; +}; + +class ParseError : public std::runtime_error { +public: + explicit ParseError(const std::string& msg, int line) + : std::runtime_error(msg), line_(line) {} + int line() const noexcept { return line_; } +private: + int line_; +}; + +class Parser { +public: + explicit Parser(std::string source) + : source_(std::move(source)), pos_(0), currentLine_(1) {} + + std::vector tokenize() { + std::vector tokens; + while (pos_ < source_.size()) { + skipWhitespace(); + if (pos_ >= source_.size()) break; + + char c = source_[pos_]; + if (std::isalpha(static_cast(c))) + tokens.push_back(readIdentifier()); + else if (std::isdigit(static_cast(c))) + tokens.push_back(readNumber()); + else + tokens.push_back(readOperator()); + } + tokens.push_back({TokenKind::EndOfStream, "", currentLine_}); + return tokens; + } + +private: + std::string source_; + std::size_t pos_; + int currentLine_; + + void skipWhitespace() { + while (pos_ < source_.size() && std::isspace(static_cast(source_[pos_]))) { + if (source_[pos_] == '\n') ++currentLine_; + ++pos_; + } + } + + Token readIdentifier() { + std::string value; + while (pos_ < source_.size() && std::isalnum(static_cast(source_[pos_]))) + value += source_[pos_++]; + return {TokenKind::Identifier, std::move(value), currentLine_}; + } + + Token readNumber() { + std::string value; + while (pos_ < source_.size() && std::isdigit(static_cast(source_[pos_]))) + value += source_[pos_++]; + return {TokenKind::Number, std::move(value), currentLine_}; + } + + Token readOperator() { + return {TokenKind::Operator, std::string(1, source_[pos_++]), currentLine_}; + } +}; diff --git a/priv/combined_metrics/samples/file_structure/line_count_under_300/bad/mega_service.ex b/priv/combined_metrics/samples/file_structure/line_count_under_300/bad/mega_service.ex new file mode 100644 index 0000000..f14f2de --- /dev/null +++ 
b/priv/combined_metrics/samples/file_structure/line_count_under_300/bad/mega_service.ex @@ -0,0 +1,130 @@ +defmodule MegaService do + @moduledoc "Handles accounts, payments, shipping, and email all in one module." + def create_account(email, password) do + if String.length(password) < 8, do: {:error, :weak_password}, else: {:ok, %{email: email, password_hash: hash(password), id: generate_id()}} + end + def update_account(id, attrs) do + case find_account(id) do + nil -> {:error, :not_found} + account -> {:ok, Map.merge(account, attrs)} + end + end + def delete_account(id) do + case find_account(id) do + nil -> {:error, :not_found} + _account -> :ok + end + end + def authenticate(email, password) do + case find_by_email(email) do + nil -> {:error, :not_found} + account -> if verify_password(password, account.password_hash), do: {:ok, account}, else: {:error, :invalid_password} + end + end + def change_password(id, old_password, new_password) do + with {:ok, account} <- {:ok, find_account(id)}, true <- verify_password(old_password, account.password_hash), true <- String.length(new_password) >= 8 do + {:ok, Map.put(account, :password_hash, hash(new_password))} + else + _ -> {:error, :password_change_failed} + end + end + def charge_card(account_id, amount_cents, card_token) do + if amount_cents <= 0, do: {:error, :invalid_amount}, else: call_payment_gateway(card_token, amount_cents, account_id) + end + def refund_charge(charge_id, amount_cents) do + case find_charge(charge_id) do + nil -> {:error, :not_found} + charge -> if amount_cents > charge.amount, do: {:error, :exceeds_original}, else: process_refund(charge, amount_cents) + end + end + def create_subscription(account_id, plan) do + valid_plans = [:basic, :pro, :enterprise] + if plan in valid_plans do + {:ok, %{account_id: account_id, plan: plan, started_at: DateTime.utc_now(), billing_cycle: :monthly}} + else + {:error, :invalid_plan} + end + end + def cancel_subscription(account_id) do + case 
find_subscription(account_id) do + nil -> {:error, :no_subscription} + sub -> {:ok, Map.put(sub, :cancelled_at, DateTime.utc_now())} + end + end + def apply_coupon(account_id, code) do + case lookup_coupon(code) do + nil -> {:error, :invalid_coupon} + coupon -> if coupon.expired, do: {:error, :expired_coupon}, else: attach_coupon(account_id, coupon) + end + end + def create_shipment(order_id, address) do + case find_order(order_id) do + nil -> {:error, :order_not_found} + order -> {:ok, %{order_id: order.id, address: address, tracking: generate_tracking(), status: :pending}} + end + end + def update_shipment_status(shipment_id, status) do + valid_statuses = [:pending, :in_transit, :delivered, :returned] + if status in valid_statuses do + case find_shipment(shipment_id) do + nil -> {:error, :not_found} + shipment -> {:ok, Map.put(shipment, :status, status)} + end + else + {:error, :invalid_status} + end + end + def estimate_delivery(shipment_id) do + case find_shipment(shipment_id) do + nil -> {:error, :not_found} + %{status: :delivered} -> {:error, :already_delivered} + shipment -> {:ok, calculate_eta(shipment)} + end + end + def cancel_shipment(shipment_id) do + case find_shipment(shipment_id) do + nil -> {:error, :not_found} + %{status: :delivered} -> {:error, :cannot_cancel_delivered} + shipment -> {:ok, Map.put(shipment, :status, :cancelled)} + end + end + def send_welcome_email(account_id) do + case find_account(account_id) do + nil -> {:error, :not_found} + account -> dispatch_email(account.email, "Welcome!", welcome_body(account)) + end + end + def send_receipt_email(account_id, charge_id) do + with account when not is_nil(account) <- find_account(account_id), charge when not is_nil(charge) <- find_charge(charge_id) do + dispatch_email(account.email, "Your receipt", receipt_body(charge)) + else + nil -> {:error, :not_found} + end + end + def send_shipment_notification(account_id, shipment_id) do + with account when not is_nil(account) <- 
find_account(account_id), shipment when not is_nil(shipment) <- find_shipment(shipment_id) do + dispatch_email(account.email, "Your order shipped!", shipment_body(shipment)) + else + nil -> {:error, :not_found} + end + end + defp hash(password), do: :crypto.hash(:sha256, password) + defp generate_id, do: :rand.uniform(1_000_000) + defp generate_tracking, do: "TRACK-#{:rand.uniform(999_999)}" + defp find_account(_id), do: nil + defp find_by_email(_email), do: nil + defp verify_password(_pw, _hash), do: true + defp call_payment_gateway(_token, _amount, _id), do: {:ok, %{id: generate_id()}} + defp find_charge(_id), do: nil + defp process_refund(charge, _amount), do: {:ok, charge} + defp find_subscription(_id), do: nil + defp lookup_coupon(_code), do: nil + defp attach_coupon(_id, coupon), do: {:ok, coupon} + defp find_order(_id), do: nil + defp find_shipment(_id), do: nil + defp calculate_eta(_shipment), do: DateTime.add(DateTime.utc_now(), 3 * 24 * 3600) + defp dispatch_email(_to, _subject, _body), do: :ok + defp welcome_body(account), do: "Welcome #{account.email}" + defp receipt_body(charge), do: "Amount: #{charge}" + defp shipment_body(shipment), do: "Tracking: #{shipment}" +end diff --git a/priv/combined_metrics/samples/file_structure/line_count_under_300/config.yml b/priv/combined_metrics/samples/file_structure/line_count_under_300/config.yml new file mode 100644 index 0000000..2c19563 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/line_count_under_300/config.yml @@ -0,0 +1 @@ +doc: "Files should be under 300 lines; longer files typically violate single responsibility." 
diff --git a/priv/combined_metrics/samples/file_structure/line_count_under_300/good/order_service.ex b/priv/combined_metrics/samples/file_structure/line_count_under_300/good/order_service.ex new file mode 100644 index 0000000..761517a --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/line_count_under_300/good/order_service.ex @@ -0,0 +1,68 @@ +defmodule OrderService do + @moduledoc """ + Manages order lifecycle: creation, updates, and cancellation. + """ + + alias OrderService.{Order, Repo} + + @spec create_order(map()) :: {:ok, Order.t()} | {:error, String.t()} + def create_order(attrs) do + with {:ok, validated} <- validate_order_attrs(attrs), + {:ok, order} <- Repo.insert(Order, validated) do + {:ok, order} + end + end + + @spec get_order(String.t()) :: {:ok, Order.t()} | {:error, :not_found} + def get_order(order_id) do + case Repo.find(Order, order_id) do + nil -> {:error, :not_found} + order -> {:ok, order} + end + end + + @spec list_orders_for_user(String.t()) :: {:ok, list(Order.t())} + def list_orders_for_user(user_id) do + orders = Repo.all(Order, user_id: user_id) + {:ok, orders} + end + + @spec update_order(String.t(), map()) :: {:ok, Order.t()} | {:error, :not_found | String.t()} + def update_order(order_id, attrs) do + with {:ok, order} <- get_order(order_id), + {:ok, validated} <- validate_order_attrs(attrs), + {:ok, updated} <- Repo.update(order, validated) do + {:ok, updated} + end + end + + @spec cancel_order(String.t()) :: {:ok, Order.t()} | {:error, :not_found | :already_cancelled} + def cancel_order(order_id) do + case get_order(order_id) do + {:error, :not_found} -> + {:error, :not_found} + {:ok, %Order{status: :cancelled}} -> + {:error, :already_cancelled} + {:ok, order} -> + Repo.update(order, %{status: :cancelled}) + end + end + + @spec complete_order(String.t()) :: {:ok, Order.t()} | {:error, :not_found | :not_fulfillable} + def complete_order(order_id) do + with {:ok, order} <- get_order(order_id), + :ok <- 
ensure_fulfillable(order), + {:ok, completed} <- Repo.update(order, %{status: :completed}) do + {:ok, completed} + end + end + + defp validate_order_attrs(%{items: items}) when is_list(items) and length(items) > 0 do + {:ok, items} + end + + defp validate_order_attrs(_), do: {:error, "Order must contain at least one item"} + + defp ensure_fulfillable(%Order{status: :pending}), do: :ok + defp ensure_fulfillable(_), do: {:error, :not_fulfillable} +end diff --git a/priv/combined_metrics/samples/file_structure/line_length_under_120/bad/query_builder.ex b/priv/combined_metrics/samples/file_structure/line_length_under_120/bad/query_builder.ex new file mode 100644 index 0000000..5e55bb9 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/line_length_under_120/bad/query_builder.ex @@ -0,0 +1,46 @@ +defmodule QueryBuilder do + @moduledoc """ + Builds Ecto queries for filtering and sorting records. + """ + + import Ecto.Query + + def build_user_query(filters) do + from(u in "users", where: u.active == true and u.role in ^Map.get(filters, :roles, ["admin", "member", "viewer", "guest"]) and u.inserted_at >= ^Map.get(filters, :since, ~D[2020-01-01])) + end + + def build_order_query(user_id, status, date_from, date_to, include_archived) do + from(o in "orders", where: o.user_id == ^user_id and o.status == ^status and o.inserted_at >= ^date_from and o.inserted_at <= ^date_to and (^include_archived or o.archived == false), order_by: [desc: o.inserted_at]) + end + + def build_product_query(filters) do + from(p in "products", where: p.price >= ^Map.get(filters, :min_price, 0) and p.price <= ^Map.get(filters, :max_price, 999_999) and p.category in ^Map.get(filters, :categories, []) and p.in_stock == ^Map.get(filters, :in_stock, true), select: %{id: p.id, name: p.name, price: p.price, category: p.category, description: p.description}) + end + + def paginate(query, page, per_page) do + offset = (page - 1) * per_page + from(q in query, limit: ^per_page, offset: ^offset) + 
end + + def apply_sort(query, "name_asc"), do: from(q in query, order_by: [asc: q.name]) + def apply_sort(query, "name_desc"), do: from(q in query, order_by: [desc: q.name]) + def apply_sort(query, "created_asc"), do: from(q in query, order_by: [asc: q.inserted_at]) + def apply_sort(query, "created_desc"), do: from(q in query, order_by: [desc: q.inserted_at]) + def apply_sort(query, _), do: query + + def with_preloads(query, preloads) when is_list(preloads) do + Enum.reduce(preloads, query, fn preload, acc -> from(q in acc, preload: ^[preload]) end) + end + + def build_report_query(tenant_id, report_type, date_range_start, date_range_end, group_by_field, aggregate_function, having_threshold) do + from(r in "report_entries", where: r.tenant_id == ^tenant_id and r.type == ^report_type and r.date >= ^date_range_start and r.date <= ^date_range_end, group_by: ^[group_by_field], having: fragment("? > ?", ^aggregate_function, ^having_threshold)) + end + + def build_search_query(search_term, fields, opts) do + pattern = "%#{String.replace(search_term, "%", "\\%")}%" + conditions = Enum.map(fields, fn field -> dynamic([q], ilike(field(q, ^field), ^pattern)) end) + combined_condition = Enum.reduce(conditions, fn cond, acc -> dynamic(^acc or ^cond) end) + base = from(q in Map.get(opts, :schema, "records"), where: ^combined_condition, limit: ^Map.get(opts, :limit, 50), offset: ^Map.get(opts, :offset, 0)) + base + end +end diff --git a/priv/combined_metrics/samples/file_structure/line_length_under_120/config.yml b/priv/combined_metrics/samples/file_structure/line_length_under_120/config.yml new file mode 100644 index 0000000..42f9df1 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/line_length_under_120/config.yml @@ -0,0 +1 @@ +doc: "Lines should be under 120 characters to avoid horizontal scrolling." 
diff --git a/priv/combined_metrics/samples/file_structure/line_length_under_120/good/query_builder.ex b/priv/combined_metrics/samples/file_structure/line_length_under_120/good/query_builder.ex new file mode 100644 index 0000000..994e696 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/line_length_under_120/good/query_builder.ex @@ -0,0 +1,89 @@ +defmodule QueryBuilder do + @moduledoc """ + Builds Ecto queries for filtering and sorting records. + """ + + import Ecto.Query + + def build_user_query(filters) do + roles = Map.get(filters, :roles, ["admin", "member", "viewer"]) + since = Map.get(filters, :since, ~D[2020-01-01]) + + from u in "users", + where: u.active == true, + where: u.role in ^roles, + where: u.inserted_at >= ^since + end + + def build_order_query(user_id, status, date_from, date_to, include_archived) do + from o in "orders", + where: o.user_id == ^user_id, + where: o.status == ^status, + where: o.inserted_at >= ^date_from, + where: o.inserted_at <= ^date_to, + where: ^include_archived or o.archived == false, + order_by: [desc: o.inserted_at] + end + + def build_product_query(filters) do + min_price = Map.get(filters, :min_price, 0) + max_price = Map.get(filters, :max_price, 999_999) + categories = Map.get(filters, :categories, []) + in_stock = Map.get(filters, :in_stock, true) + + from p in "products", + where: p.price >= ^min_price, + where: p.price <= ^max_price, + where: p.category in ^categories, + where: p.in_stock == ^in_stock, + select: %{id: p.id, name: p.name, price: p.price, category: p.category} + end + + def paginate(query, page, per_page) do + offset = (page - 1) * per_page + + from q in query, + limit: ^per_page, + offset: ^offset + end + + def apply_sort(query, "name_asc"), do: from(q in query, order_by: [asc: q.name]) + def apply_sort(query, "name_desc"), do: from(q in query, order_by: [desc: q.name]) + def apply_sort(query, "created_asc"), do: from(q in query, order_by: [asc: q.inserted_at]) + def apply_sort(query, 
"created_desc"), do: from(q in query, order_by: [desc: q.inserted_at]) + def apply_sort(query, _), do: query + + def with_preloads(query, preloads) when is_list(preloads) do + Enum.reduce(preloads, query, fn preload, acc -> + from q in acc, preload: ^[preload] + end) + end + + def build_report_query(tenant_id, report_type, date_start, date_end) do + from r in "report_entries", + where: r.tenant_id == ^tenant_id, + where: r.type == ^report_type, + where: r.date >= ^date_start, + where: r.date <= ^date_end + end + + def build_search_query(search_term, fields, opts) do + pattern = "%#{String.replace(search_term, "%", "\\%")}%" + schema = Map.get(opts, :schema, "records") + limit = Map.get(opts, :limit, 50) + offset = Map.get(opts, :offset, 0) + + conditions = Enum.map(fields, fn field -> + dynamic([q], ilike(field(q, ^field), ^pattern)) + end) + + combined = Enum.reduce(conditions, fn cond, acc -> + dynamic(^acc or ^cond) + end) + + from q in schema, + where: ^combined, + limit: ^limit, + offset: ^offset + end +end diff --git a/priv/combined_metrics/samples/file_structure/no_magic_numbers/bad/rate_limiter.ex b/priv/combined_metrics/samples/file_structure/no_magic_numbers/bad/rate_limiter.ex new file mode 100644 index 0000000..2a1b738 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/no_magic_numbers/bad/rate_limiter.ex @@ -0,0 +1,91 @@ +defmodule RateLimiter do + @moduledoc """ + Rate limiting logic for API endpoints. 
+ """ + + def check_rate(user_id, action) do + key = "#{user_id}:#{action}" + count = get_count(key) + + cond do + action == :api_call and count >= 100 -> + {:error, :rate_limited} + action == :login and count >= 5 -> + {:error, :rate_limited} + action == :export and count >= 10 -> + {:error, :rate_limited} + true -> + increment_count(key) + :ok + end + end + + def session_valid?(created_at) do + age_seconds = DateTime.diff(DateTime.utc_now(), created_at) + age_seconds < 3600 + end + + def token_expired?(issued_at) do + age_seconds = DateTime.diff(DateTime.utc_now(), issued_at) + age_seconds > 86400 + end + + def compute_backoff(attempt) do + min(1000 * :math.pow(2, attempt), 30_000) + end + + def charge_credits(user_id, action) do + cost = + case action do + :api_call -> 1 + :export -> 10 + :bulk_import -> 50 + :report -> 25 + end + + balance = get_balance(user_id) + + if balance >= cost do + deduct_credits(user_id, cost) + :ok + else + {:error, :insufficient_credits} + end + end + + def apply_rate_penalty(user_id, violation_count) do + penalty_seconds = + cond do + violation_count >= 10 -> 86400 + violation_count >= 5 -> 3600 + violation_count >= 3 -> 300 + true -> 60 + end + + lock_until = DateTime.add(DateTime.utc_now(), penalty_seconds) + set_lock(user_id, lock_until) + end + + def calculate_overage_fee(requests_made, limit) do + overage = max(0, requests_made - limit) + overage * 0.15 + end + + def burst_allowed?(user_id) do + recent = count_recent_requests(user_id, 60) + recent < 200 + end + + def cleanup_old_entries do + cutoff = DateTime.add(DateTime.utc_now(), -604800) + delete_entries_before(cutoff) + end + + defp get_count(_key), do: 0 + defp increment_count(_key), do: :ok + defp get_balance(_user_id), do: 100 + defp deduct_credits(_user_id, _amount), do: :ok + defp set_lock(_user_id, _until), do: :ok + defp count_recent_requests(_user_id, _seconds), do: 0 + defp delete_entries_before(_cutoff), do: :ok +end diff --git 
a/priv/combined_metrics/samples/file_structure/no_magic_numbers/config.yml b/priv/combined_metrics/samples/file_structure/no_magic_numbers/config.yml new file mode 100644 index 0000000..dddb8b2 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/no_magic_numbers/config.yml @@ -0,0 +1 @@ +doc: "Numeric literals should be extracted to named constants rather than used inline." diff --git a/priv/combined_metrics/samples/file_structure/no_magic_numbers/good/rate_limiter.ex b/priv/combined_metrics/samples/file_structure/no_magic_numbers/good/rate_limiter.ex new file mode 100644 index 0000000..ecde981 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/no_magic_numbers/good/rate_limiter.ex @@ -0,0 +1,114 @@ +defmodule RateLimiter do + @moduledoc """ + Rate limiting logic for API endpoints. + """ + + @api_call_limit 100 + @login_limit 5 + @export_limit 10 + + @session_ttl_seconds 3_600 + @token_ttl_seconds 86_400 + @week_in_seconds 604_800 + + @max_backoff_ms 30_000 + @base_backoff_ms 1_000 + + @credit_cost_api_call 1 + @credit_cost_export 10 + @credit_cost_bulk_import 50 + @credit_cost_report 25 + + @penalty_minor_seconds 60 + @penalty_low_seconds 300 + @penalty_medium_seconds 3_600 + @penalty_high_seconds 86_400 + + @overage_fee_per_request 0.15 + @burst_window_seconds 60 + @burst_limit 200 + + def check_rate(user_id, action) do + key = "#{user_id}:#{action}" + count = get_count(key) + + cond do + action == :api_call and count >= @api_call_limit -> + {:error, :rate_limited} + action == :login and count >= @login_limit -> + {:error, :rate_limited} + action == :export and count >= @export_limit -> + {:error, :rate_limited} + true -> + increment_count(key) + :ok + end + end + + def session_valid?(created_at) do + age_seconds = DateTime.diff(DateTime.utc_now(), created_at) + age_seconds < @session_ttl_seconds + end + + def token_expired?(issued_at) do + age_seconds = DateTime.diff(DateTime.utc_now(), issued_at) + age_seconds > @token_ttl_seconds + 
end + + def compute_backoff(attempt) do + min(@base_backoff_ms * :math.pow(2, attempt), @max_backoff_ms) + end + + def charge_credits(user_id, action) do + cost = credit_cost(action) + balance = get_balance(user_id) + + if balance >= cost do + deduct_credits(user_id, cost) + :ok + else + {:error, :insufficient_credits} + end + end + + def apply_rate_penalty(user_id, violation_count) do + penalty_seconds = + cond do + violation_count >= 10 -> @penalty_high_seconds + violation_count >= 5 -> @penalty_medium_seconds + violation_count >= 3 -> @penalty_low_seconds + true -> @penalty_minor_seconds + end + + lock_until = DateTime.add(DateTime.utc_now(), penalty_seconds) + set_lock(user_id, lock_until) + end + + def calculate_overage_fee(requests_made, limit) do + overage = max(0, requests_made - limit) + overage * @overage_fee_per_request + end + + def burst_allowed?(user_id) do + recent = count_recent_requests(user_id, @burst_window_seconds) + recent < @burst_limit + end + + def cleanup_old_entries do + cutoff = DateTime.add(DateTime.utc_now(), -@week_in_seconds) + delete_entries_before(cutoff) + end + + defp credit_cost(:api_call), do: @credit_cost_api_call + defp credit_cost(:export), do: @credit_cost_export + defp credit_cost(:bulk_import), do: @credit_cost_bulk_import + defp credit_cost(:report), do: @credit_cost_report + + defp get_count(_key), do: 0 + defp increment_count(_key), do: :ok + defp get_balance(_user_id), do: 100 + defp deduct_credits(_user_id, _amount), do: :ok + defp set_lock(_user_id, _until), do: :ok + defp count_recent_requests(_user_id, _seconds), do: 0 + defp delete_entries_before(_cutoff), do: :ok +end diff --git a/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/bad/InvoiceService.java b/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/bad/InvoiceService.java new file mode 100644 index 0000000..49c3ca7 --- /dev/null +++ 
b/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/bad/InvoiceService.java @@ -0,0 +1,55 @@ +package com.example.billing; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.List; + +// Multiple top-level class declarations in a single file + +public class InvoiceService { + + private final InvoiceRepository repository; + private final TaxCalculator taxCalculator; + + public InvoiceService(InvoiceRepository repository, TaxCalculator taxCalculator) { + this.repository = repository; + this.taxCalculator = taxCalculator; + } + + public Invoice createInvoice(Order order) { + List lineItems = order.getItems().stream() + .map(item -> new InvoiceLineItem(item.getDescription(), item.getUnitPrice(), item.getQuantity())) + .toList(); + return new Invoice(order.getId(), lineItems, LocalDate.now().plusDays(30)); + } +} + +// Second top-level class in the same file — violates the one-class-per-file rule +class InvoiceLineItem { + private final String description; + private final BigDecimal unitPrice; + private final int quantity; + + public InvoiceLineItem(String description, BigDecimal unitPrice, int quantity) { + this.description = description; + this.unitPrice = unitPrice; + this.quantity = quantity; + } + + public BigDecimal total() { + return unitPrice.multiply(BigDecimal.valueOf(quantity)); + } + + public String getDescription() { return description; } + public BigDecimal getUnitPrice() { return unitPrice; } + public int getQuantity() { return quantity; } +} + +// Third top-level class in the same file +class InvoiceValidator { + public boolean isValid(Invoice invoice) { + return invoice != null + && invoice.getId() != null + && !invoice.getLineItems().isEmpty(); + } +} diff --git a/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/good/InvoiceService.java b/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/good/InvoiceService.java new file mode 100644 index 
0000000..5117698 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/one_top_level_class_per_file/good/InvoiceService.java @@ -0,0 +1,63 @@ +package com.example.billing; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.List; + +/** + * Service responsible for creating and managing invoices. + * Supporting types (InvoiceLineItem, InvoiceStatus) live in their own files. + */ +public class InvoiceService { + + private final InvoiceRepository repository; + private final TaxCalculator taxCalculator; + private final NotificationService notifications; + + public InvoiceService( + InvoiceRepository repository, + TaxCalculator taxCalculator, + NotificationService notifications + ) { + this.repository = repository; + this.taxCalculator = taxCalculator; + this.notifications = notifications; + } + + public Invoice createInvoice(Order order) { + List lineItems = order.getItems().stream() + .map(item -> new InvoiceLineItem(item.getDescription(), item.getUnitPrice(), item.getQuantity())) + .toList(); + + BigDecimal subtotal = lineItems.stream() + .map(InvoiceLineItem::total) + .reduce(BigDecimal.ZERO, BigDecimal::add); + + BigDecimal tax = taxCalculator.calculate(subtotal, order.getRegion()); + + Invoice invoice = new Invoice( + order.getId(), + lineItems, + subtotal, + tax, + LocalDate.now().plusDays(30) + ); + + repository.save(invoice); + notifications.sendInvoiceCreated(order.getCustomerEmail(), invoice); + return invoice; + } + + public void markPaid(String invoiceId) { + Invoice invoice = repository.findByIdOrThrow(invoiceId); + invoice.markPaid(); + repository.update(invoice); + notifications.sendPaymentConfirmation(invoice); + } + + public List findOverdue() { + return repository.findByDueDateBefore(LocalDate.now()).stream() + .filter(inv -> inv.getStatus() == InvoiceStatus.PENDING) + .toList(); + } +} diff --git a/priv/combined_metrics/samples/file_structure/single_responsibility/bad/user_handler.ex 
b/priv/combined_metrics/samples/file_structure/single_responsibility/bad/user_handler.ex new file mode 100644 index 0000000..8178879 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/single_responsibility/bad/user_handler.ex @@ -0,0 +1,110 @@ +defmodule UserHandler do + @moduledoc """ + Handles everything user-related: registration, email, payments, and audit. + """ + + require Logger + + def register_user(attrs) do + with {:ok, _} <- validate_registration(attrs), + {:ok, user} <- insert_user(attrs), + :ok <- send_welcome_email(user), + :ok <- create_free_trial_subscription(user), + :ok <- log_audit_event(:user_registered, user) do + {:ok, user} + end + end + + def update_user(id, attrs) do + case find_user(id) do + nil -> {:error, :not_found} + user -> + updated = Map.merge(user, attrs) + save_user(updated) + send_profile_updated_email(updated) + log_audit_event(:user_updated, updated) + {:ok, updated} + end + end + + def delete_user(id) do + case find_user(id) do + nil -> {:error, :not_found} + user -> + cancel_subscription(user) + send_goodbye_email(user) + remove_user(user) + log_audit_event(:user_deleted, user) + :ok + end + end + + def send_welcome_email(user) do + body = "Hi #{user.name}, welcome to our platform!" + dispatch_email(user.email, "Welcome!", body) + end + + def send_profile_updated_email(user) do + body = "Hi #{user.name}, your profile has been updated." + dispatch_email(user.email, "Profile Updated", body) + end + + def send_goodbye_email(user) do + body = "Goodbye #{user.name}, your account has been deleted." 
+ dispatch_email(user.email, "Account Deleted", body) + end + + def create_free_trial_subscription(user) do + sub = %{user_id: user.id, plan: :free_trial, expires_at: trial_expiry()} + save_subscription(sub) + charge_initial_setup_fee(user, 0) + :ok + end + + def cancel_subscription(user) do + case find_subscription(user.id) do + nil -> :ok + sub -> + update_subscription(sub, %{status: :cancelled}) + process_prorated_refund(user, sub) + :ok + end + end + + def charge_initial_setup_fee(user, amount) do + if amount > 0 do + call_payment_gateway(user.payment_method, amount) + else + :ok + end + end + + def process_prorated_refund(_user, _sub) do + :ok + end + + def log_audit_event(event, user) do + Logger.info("AUDIT: #{event} for user #{user.id} at #{DateTime.utc_now()}") + write_audit_log(%{event: event, user_id: user.id, timestamp: DateTime.utc_now()}) + end + + defp validate_registration(attrs) do + if Map.has_key?(attrs, :email) and Map.has_key?(attrs, :password) do + {:ok, attrs} + else + {:error, :missing_fields} + end + end + + defp find_user(_id), do: nil + defp insert_user(attrs), do: {:ok, Map.put(attrs, :id, :rand.uniform(1000))} + defp save_user(_user), do: :ok + defp remove_user(_user), do: :ok + defp dispatch_email(_to, _subject, _body), do: :ok + defp save_subscription(_sub), do: :ok + defp find_subscription(_user_id), do: nil + defp update_subscription(_sub, _attrs), do: :ok + defp call_payment_gateway(_method, _amount), do: :ok + defp write_audit_log(_entry), do: :ok + defp trial_expiry, do: DateTime.add(DateTime.utc_now(), 30 * 86_400) +end diff --git a/priv/combined_metrics/samples/file_structure/single_responsibility/config.yml b/priv/combined_metrics/samples/file_structure/single_responsibility/config.yml new file mode 100644 index 0000000..2497edc --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/single_responsibility/config.yml @@ -0,0 +1 @@ +doc: "Each file should have one primary concern — low complexity spread across few, 
focused functions." diff --git a/priv/combined_metrics/samples/file_structure/single_responsibility/good/user_registration.ex b/priv/combined_metrics/samples/file_structure/single_responsibility/good/user_registration.ex new file mode 100644 index 0000000..a530341 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/single_responsibility/good/user_registration.ex @@ -0,0 +1,53 @@ +defmodule UserRegistration do + @moduledoc """ + Handles new user registration: validation and account creation only. + + Side effects (email, billing, audit) are delegated to their respective + context modules and triggered via events after successful registration. + """ + + alias UserRegistration.{Repo, User} + + @spec register(map()) :: {:ok, User.t()} | {:error, :missing_fields | :email_taken | String.t()} + def register(attrs) do + with {:ok, validated} <- validate(attrs), + :ok <- ensure_email_available(validated.email), + {:ok, user} <- Repo.insert(User, validated) do + {:ok, user} + end + end + + @spec validate(map()) :: {:ok, map()} | {:error, :missing_fields} + def validate(attrs) do + required = [:email, :password, :name] + missing = Enum.reject(required, &Map.has_key?(attrs, &1)) + + if missing == [] do + {:ok, attrs} + else + {:error, :missing_fields} + end + end + + @spec ensure_email_available(String.t()) :: :ok | {:error, :email_taken} + def ensure_email_available(email) do + case Repo.find_by(User, email: email) do + nil -> :ok + _existing -> {:error, :email_taken} + end + end + + @spec valid_password?(String.t()) :: boolean() + def valid_password?(password) do + String.length(password) >= 8 and + String.match?(password, ~r/[A-Z]/) and + String.match?(password, ~r/[0-9]/) + end + + @spec normalize_email(String.t()) :: String.t() + def normalize_email(email) do + email + |> String.trim() + |> String.downcase() + end +end diff --git a/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/bad/mailer.ex 
b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/bad/mailer.ex new file mode 100644 index 0000000..6a23fc5 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/bad/mailer.ex @@ -0,0 +1,86 @@ +defmodule Mailer do + @moduledoc """ + Sends transactional emails. + """ + + @from_address "noreply@example.com" + + def send_welcome(user) do + body = build_welcome_body(user) + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Welcome to the platform", + body: body + }) + end + + def send_password_reset(user, token) do + link = reset_link(token) + body = build_reset_body(user, link) + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Reset your password", + body: body + }) + end + + def send_invoice(user, invoice) do + case format_invoice(invoice) do + {:ok, formatted} -> + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Your invoice ##{invoice.id}", + body: formatted + }) + {:error, reason} -> + {:error, reason} + end + end + + def send_notification(user, message) do + if String.length(message) > 0 do + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Notification", + body: message + }) + else + {:error, :empty_message} + end + end + + def send_bulk(users, subject, body) do + Enum.map(users, fn user -> + dispatch(%{ + to: user.email, + from: @from_address, + subject: subject, + body: body + }) + end) + end + + defp build_welcome_body(user) do + "Hi #{user.name}, welcome aboard!" 
+ end + + defp build_reset_body(user, link) do + "Hi #{user.name}, reset your password here: #{link}" + end + + defp reset_link(token) do + "https://example.com/reset?token=#{token}" + end + + defp format_invoice(invoice) do + {:ok, "Invoice ##{invoice.id}: $#{invoice.total}"} + end + + defp dispatch(email) do + {:ok, email} + end +end diff --git a/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/config.yml b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/config.yml new file mode 100644 index 0000000..6a6b96b --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/config.yml @@ -0,0 +1 @@ +doc: "Indentation should use consistent multiples of 2 or 4 spaces throughout the file." diff --git a/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/good/mailer.ex b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/good/mailer.ex new file mode 100644 index 0000000..2538478 --- /dev/null +++ b/priv/combined_metrics/samples/file_structure/uses_standard_indentation_width/good/mailer.ex @@ -0,0 +1,88 @@ +defmodule Mailer do + @moduledoc """ + Sends transactional emails. 
+ """ + + @from_address "noreply@example.com" + + def send_welcome(user) do + body = build_welcome_body(user) + + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Welcome to the platform", + body: body + }) + end + + def send_password_reset(user, token) do + link = reset_link(token) + body = build_reset_body(user, link) + + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Reset your password", + body: body + }) + end + + def send_invoice(user, invoice) do + case format_invoice(invoice) do + {:ok, formatted} -> + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Your invoice ##{invoice.id}", + body: formatted + }) + {:error, reason} -> + {:error, reason} + end + end + + def send_notification(user, message) do + if String.length(message) > 0 do + dispatch(%{ + to: user.email, + from: @from_address, + subject: "Notification", + body: message + }) + else + {:error, :empty_message} + end + end + + def send_bulk(users, subject, body) do + Enum.map(users, fn user -> + dispatch(%{ + to: user.email, + from: @from_address, + subject: subject, + body: body + }) + end) + end + + defp build_welcome_body(user) do + "Hi #{user.name}, welcome aboard!" 
+ end + + defp build_reset_body(user, link) do + "Hi #{user.name}, reset your password here: #{link}" + end + + defp reset_link(token) do + "https://example.com/reset?token=#{token}" + end + + defp format_invoice(invoice) do + {:ok, "Invoice ##{invoice.id}: $#{invoice.total}"} + end + + defp dispatch(email) do + {:ok, email} + end +end diff --git a/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/bad/user_service.ts b/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/bad/user_service.ts new file mode 100644 index 0000000..b835058 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/bad/user_service.ts @@ -0,0 +1,68 @@ +interface User { + id: string; + email: string; + displayName: string; + role: "admin" | "member"; + score: number; +} + +class UserService { + private readonly baseUrl: string; + private readonly defaultPageSize: number; + + constructor(baseUrl: string, defaultPageSize = 20) { + this.baseUrl = baseUrl; + this.defaultPageSize = defaultPageSize; + } + + async fetchUsers(): Promise { + const response = await fetch(`${this.baseUrl}/users`); + return response.json() as Promise; + } + + async getAdmins(): Promise { + const users = await this.fetchUsers(); + // Using function keyword instead of arrow — `this` is unbound inside + return users.filter(function (user) { + return user.role === "admin"; + }); + } + + async getSortedByScore(): Promise { + const users = await this.fetchUsers(); + return [...users].sort(function (a, b) { + return b.score - a.score; + }); + } + + async getPage(page: number): Promise { + const users = await this.fetchUsers(); + const offset = (page - 1) * this.defaultPageSize; + return users.slice(offset, offset + this.defaultPageSize); + } + + async getDisplayNames(): Promise { + const users = await this.fetchUsers(); + return users.map(function (user) { + return user.displayName; + }); + } + + async searchByEmail(query: string): Promise { + 
const users = await this.fetchUsers(); + return users.filter(function (user) { + return user.email.toLowerCase().includes(query.toLowerCase()); + }); + } + + async transformToMap(): Promise> { + const users = await this.fetchUsers(); + return users.reduce(function (map, user) { + map.set(user.id, user); + return map; + }, new Map()); + } +} + +export { UserService }; +export type { User }; diff --git a/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/good/user_service.ts b/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/good/user_service.ts new file mode 100644 index 0000000..cc00697 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/arrow_functions_as_callbacks/good/user_service.ts @@ -0,0 +1,61 @@ +interface User { + id: string; + email: string; + displayName: string; + role: "admin" | "member"; + score: number; +} + +class UserService { + private readonly baseUrl: string; + private readonly defaultPageSize: number; + + constructor(baseUrl: string, defaultPageSize = 20) { + this.baseUrl = baseUrl; + this.defaultPageSize = defaultPageSize; + } + + async fetchUsers(): Promise { + const response = await fetch(`${this.baseUrl}/users`); + return response.json() as Promise; + } + + async getAdmins(): Promise { + const users = await this.fetchUsers(); + return users.filter((user) => user.role === "admin"); + } + + async getSortedByScore(): Promise { + const users = await this.fetchUsers(); + return [...users].sort((a, b) => b.score - a.score); + } + + async getPage(page: number): Promise { + const users = await this.fetchUsers(); + const offset = (page - 1) * this.defaultPageSize; + return users.slice(offset, offset + this.defaultPageSize); + } + + async getDisplayNames(): Promise { + const users = await this.fetchUsers(); + return users.map((user) => user.displayName); + } + + async searchByEmail(query: string): Promise { + const users = await this.fetchUsers(); + return users.filter((user) => + 
user.email.toLowerCase().includes(query.toLowerCase()) + ); + } + + async transformToMap(): Promise> { + const users = await this.fetchUsers(); + return users.reduce((map, user) => { + map.set(user.id, user); + return map; + }, new Map()); + } +} + +export { UserService }; +export type { User }; diff --git a/priv/combined_metrics/samples/function_design/async_functions_contain_await/bad/payment_gateway.ts b/priv/combined_metrics/samples/function_design/async_functions_contain_await/bad/payment_gateway.ts new file mode 100644 index 0000000..657baaf --- /dev/null +++ b/priv/combined_metrics/samples/function_design/async_functions_contain_await/bad/payment_gateway.ts @@ -0,0 +1,69 @@ +interface ChargeRequest { + amount: number; + currency: string; + paymentMethodId: string; + description: string; +} + +interface ChargeResult { + chargeId: string; + status: "succeeded" | "pending" | "failed"; + amount: number; + currency: string; +} + +// async but no await — just wraps a synchronous value +async function buildChargeRequest( + paymentMethodId: string, + amount: number, + currency: string +): Promise { + return { + amount, + currency, + paymentMethodId, + description: `Charge of ${amount} ${currency}`, + }; +} + +// async but no await — validation is synchronous +async function validateChargeRequest(request: ChargeRequest): Promise { + if (request.amount <= 0) return false; + if (!request.paymentMethodId) return false; + if (!request.currency) return false; + return true; +} + +// async but no await — just rethrows synchronously +async function assertPositiveAmount(amount: number): Promise { + if (amount <= 0) { + throw new Error(`Amount must be positive, got ${amount}`); + } +} + +async function createCharge(request: ChargeRequest): Promise { + const response = await fetch("/api/charges", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + + if (!response.ok) { + throw new Error(`Charge failed with status 
${response.status}`); + } + + return response.json() as Promise; +} + +// async but only returns a promise chain without await +async function fetchAndLogCharge(chargeId: string): Promise { + return fetch(`/api/charges/${chargeId}`) + .then((r) => r.json() as Promise) + .then((charge) => { + console.log("Fetched charge", charge.chargeId); + return charge; + }); +} + +export { buildChargeRequest, validateChargeRequest, assertPositiveAmount, createCharge, fetchAndLogCharge }; +export type { ChargeRequest, ChargeResult }; diff --git a/priv/combined_metrics/samples/function_design/async_functions_contain_await/good/payment_gateway.ts b/priv/combined_metrics/samples/function_design/async_functions_contain_await/good/payment_gateway.ts new file mode 100644 index 0000000..b07cdbd --- /dev/null +++ b/priv/combined_metrics/samples/function_design/async_functions_contain_await/good/payment_gateway.ts @@ -0,0 +1,75 @@ +interface ChargeRequest { + amount: number; + currency: string; + paymentMethodId: string; + description: string; +} + +interface ChargeResult { + chargeId: string; + status: "succeeded" | "pending" | "failed"; + amount: number; + currency: string; +} + +async function createCharge(request: ChargeRequest): Promise { + const response = await fetch("/api/charges", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + }); + + if (!response.ok) { + throw new Error(`Charge failed with status ${response.status}`); + } + + return response.json() as Promise; +} + +async function fetchCharge(chargeId: string): Promise { + const response = await fetch(`/api/charges/${chargeId}`); + + if (!response.ok) { + throw new Error(`Charge not found: ${chargeId}`); + } + + return response.json() as Promise; +} + +async function waitForChargeSettlement( + chargeId: string, + maxAttempts = 10 +): Promise { + for (let attempt = 0; attempt < maxAttempts; attempt++) { + const charge = await fetchCharge(chargeId); + + if (charge.status 
=== "succeeded" || charge.status === "failed") { + return charge; + } + + await new Promise((resolve) => setTimeout(resolve, 2000 * (attempt + 1))); + } + + throw new Error(`Charge ${chargeId} did not settle after ${maxAttempts} attempts`); +} + +async function processPaymentWithRetry( + request: ChargeRequest, + maxRetries = 3 +): Promise { + let lastError: Error | null = null; + + for (let i = 0; i < maxRetries; i++) { + try { + const charge = await createCharge(request); + return await waitForChargeSettlement(charge.chargeId); + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + } + } + + throw lastError ?? new Error("Payment failed after retries"); +} + +export { createCharge, fetchCharge, waitForChargeSettlement, processPaymentWithRetry }; +export type { ChargeRequest, ChargeResult }; diff --git a/priv/combined_metrics/samples/function_design/async_method_has_await/bad/EmailDispatcher.cs b/priv/combined_metrics/samples/function_design/async_method_has_await/bad/EmailDispatcher.cs new file mode 100644 index 0000000..d82401d --- /dev/null +++ b/priv/combined_metrics/samples/function_design/async_method_has_await/bad/EmailDispatcher.cs @@ -0,0 +1,59 @@ +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Email +{ + public class EmailDispatcher + { + private readonly HttpClient _httpClient; + private readonly string _apiEndpoint; + + public EmailDispatcher(HttpClient httpClient, string apiEndpoint) + { + _httpClient = httpClient; + _apiEndpoint = apiEndpoint; + } + + // async keyword with no await — compiles with a warning; runs synchronously + public async Task SendAsync(EmailMessage message) + { + var payload = JsonSerializer.Serialize(message); + var content = new StringContent(payload, System.Text.Encoding.UTF8, "application/json"); + + // Missing await — blocks synchronously, defeats the purpose of async + var response = 
_httpClient.PostAsync(_apiEndpoint, content).Result; + var body = response.Content.ReadAsStringAsync().Result; + + return response.IsSuccessStatusCode + ? SendResult.Success() + : SendResult.Failure(body); + } + + // async but delegates all work to non-awaited helpers — no suspension point + public async Task PingAsync() + { + return CheckPing(); // synchronous; async here adds overhead with no benefit + } + + // async method that just wraps a completed task — should not be async + public async Task GetApiEndpointAsync() + { + return _apiEndpoint; // no await, just returns a value + } + + // async that does no I/O at all — the async machinery is pure overhead + public async Task LogMetricsAsync(int sent, int failed) + { + var summary = $"Sent: {sent}, Failed: {failed}"; + System.Console.WriteLine(summary); + } + + private bool CheckPing() + { + var response = _httpClient.GetAsync(_apiEndpoint + "/ping").Result; + return response.IsSuccessStatusCode; + } + } +} diff --git a/priv/combined_metrics/samples/function_design/async_method_has_await/good/EmailDispatcher.cs b/priv/combined_metrics/samples/function_design/async_method_has_await/good/EmailDispatcher.cs new file mode 100644 index 0000000..6480e69 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/async_method_has_await/good/EmailDispatcher.cs @@ -0,0 +1,59 @@ +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Email +{ + public class EmailDispatcher + { + private readonly HttpClient _httpClient; + private readonly string _apiEndpoint; + + public EmailDispatcher(HttpClient httpClient, string apiEndpoint) + { + _httpClient = httpClient; + _apiEndpoint = apiEndpoint; + } + + public async Task SendAsync(EmailMessage message) + { + var payload = JsonSerializer.Serialize(message); + var content = new StringContent(payload, System.Text.Encoding.UTF8, "application/json"); + + // Contains genuine await — truly async I/O operation 
+ var response = await _httpClient.PostAsync(_apiEndpoint, content); + var body = await response.Content.ReadAsStringAsync(); + + return response.IsSuccessStatusCode + ? SendResult.Success() + : SendResult.Failure(body); + } + + public async Task> SendBatchAsync(IEnumerable messages) + { + var tasks = new List>(); + foreach (var message in messages) + tasks.Add(SendAsync(message)); + + // Awaits all concurrent I/O operations + var results = await Task.WhenAll(tasks); + return results; + } + + public async Task PingAsync() + { + // Contains await — not just wrapping sync work + var response = await _httpClient.GetAsync(_apiEndpoint + "/ping"); + return response.IsSuccessStatusCode; + } + + public async Task GetRemainingQuotaAsync() + { + var response = await _httpClient.GetAsync(_apiEndpoint + "/quota"); + response.EnsureSuccessStatusCode(); + var json = await response.Content.ReadAsStringAsync(); + return JsonSerializer.Deserialize(json)!; + } + } +} diff --git a/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/bad/guards.ex b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/bad/guards.ex new file mode 100644 index 0000000..eb9fb04 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/bad/guards.ex @@ -0,0 +1,41 @@ +defmodule Guards do + def valid(value) when is_binary(value) do + String.length(value) > 0 + end + + def active(user) do + user.status == :active && !user.banned + end + + def empty(list) when is_list(list) do + length(list) == 0 + end + + def expired(token) do + DateTime.compare(token.expires_at, DateTime.utc_now()) == :lt + end + + def admin(user) do + user.role == :admin + end + + def verified(user) do + user.email_verified && user.phone_verified + end + + def authorized(user, resource) do + user.role == :admin || resource.owner_id == user.id + end + + def pending(order) do + order.status == :pending + end + + def within_limit(count, limit) 
do + count < limit + end + + def matching(a, b) do + a == b + end +end diff --git a/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/config.yml b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/config.yml new file mode 100644 index 0000000..87e5ea3 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/config.yml @@ -0,0 +1 @@ +doc: "Functions returning a boolean should end with `?` (Elixir/Ruby) or start with `is_`/`has_` (JS/Python)." diff --git a/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/good/guards.ex b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/good/guards.ex new file mode 100644 index 0000000..bb5c168 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/boolean_function_has_question_mark/good/guards.ex @@ -0,0 +1,41 @@ +defmodule Guards do + def valid?(value) when is_binary(value) do + String.length(value) > 0 + end + + def active?(user) do + user.status == :active && !user.banned + end + + def empty?(list) when is_list(list) do + length(list) == 0 + end + + def expired?(token) do + DateTime.compare(token.expires_at, DateTime.utc_now()) == :lt + end + + def admin?(user) do + user.role == :admin + end + + def verified?(user) do + user.email_verified && user.phone_verified + end + + def authorized?(user, resource) do + user.role == :admin || resource.owner_id == user.id + end + + def pending?(order) do + order.status == :pending + end + + def within_limit?(count, limit) do + count < limit + end + + def matching?(a, b) do + a == b + end +end diff --git a/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/bad/EventHandler.swift b/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/bad/EventHandler.swift new file mode 100644 index 0000000..e6eca29 --- /dev/null +++ 
b/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/bad/EventHandler.swift @@ -0,0 +1,56 @@ +import Foundation + +enum EventPriority: Int, Comparable { + case low = 0, normal = 1, high = 2, critical = 3 + + static func < (lhs: EventPriority, rhs: EventPriority) -> Bool { + return lhs.rawValue < rhs.rawValue + } +} + +struct AppEvent { + let name: String + let payload: [String: Any] + let priority: EventPriority + let occurredAt: Date +} + +typealias EventCallback = (AppEvent) -> Void + +class EventBus { + private var handlers: [String: [EventCallback]] = [:] + private var filters: [String: (AppEvent) -> Bool] = [:] + + // Reads awkwardly: eventBus.handlerRegistration(callback, eventName: "purchase") + func handlerRegistration(_ callback: @escaping EventCallback, eventName: String) { + handlers[eventName, default: []].append(callback) + } + + // Reads awkwardly: eventBus.handlerRemoval(eventName: "purchase") + func handlerRemoval(eventName: String) { + handlers.removeValue(forKey: eventName) + } + + // Reads awkwardly: eventBus.eventPublishing(event) + func eventPublishing(_ event: AppEvent) { + guard let eventHandlers = handlers[event.name] else { return } + let passes = filters[event.name].map { $0(event) } ?? 
true + guard passes else { return } + eventHandlers.forEach { $0(event) } + } + + // Reads awkwardly: eventBus.filterAddition(predicate, eventName: "purchase") + func filterAddition(_ predicate: @escaping (AppEvent) -> Bool, eventName: String) { + filters[eventName] = predicate + } + + // Reads awkwardly: eventBus.subscriptionCheck(eventName: "purchase") + func subscriptionCheck(eventName: String) -> Bool { + return handlers[eventName]?.isEmpty == false + } + + // Reads awkwardly: eventBus.priorityFiltering(priority: .critical, log: events) + func priorityFiltering(priority: EventPriority, log: [AppEvent]) -> [AppEvent] { + return log.filter { $0.priority >= priority } + } +} diff --git a/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/good/EventHandler.swift b/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/good/EventHandler.swift new file mode 100644 index 0000000..9a7a356 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/call_site_forms_grammatical_phrase/good/EventHandler.swift @@ -0,0 +1,56 @@ +import Foundation + +enum EventPriority: Int, Comparable { + case low = 0, normal = 1, high = 2, critical = 3 + + static func < (lhs: EventPriority, rhs: EventPriority) -> Bool { + return lhs.rawValue < rhs.rawValue + } +} + +struct AppEvent { + let name: String + let payload: [String: Any] + let priority: EventPriority + let occurredAt: Date +} + +typealias EventHandler = (AppEvent) -> Void + +class EventBus { + private var handlers: [String: [EventHandler]] = [:] + private var filters: [String: (AppEvent) -> Bool] = [:] + + // Reads naturally: eventBus.register(handler, for: "purchase") + func register(_ handler: @escaping EventHandler, for eventName: String) { + handlers[eventName, default: []].append(handler) + } + + // Reads naturally: eventBus.removeHandlers(for: "purchase") + func removeHandlers(for eventName: String) { + handlers.removeValue(forKey: eventName) + } + + // Reads 
naturally: eventBus.publish(event) + func publish(_ event: AppEvent) { + guard let eventHandlers = handlers[event.name] else { return } + let passesFilter = filters[event.name].map { $0(event) } ?? true + guard passesFilter else { return } + eventHandlers.forEach { $0(event) } + } + + // Reads naturally: eventBus.addFilter(predicate, for: "purchase") + func addFilter(_ predicate: @escaping (AppEvent) -> Bool, for eventName: String) { + filters[eventName] = predicate + } + + // Reads naturally: eventBus.isSubscribed(to: "purchase") + func isSubscribed(to eventName: String) -> Bool { + return handlers[eventName]?.isEmpty == false + } + + // Reads naturally: eventBus.events(with: .critical, from: log) + func events(with priority: EventPriority, from log: [AppEvent]) -> [AppEvent] { + return log.filter { $0.priority >= priority } + } +} diff --git a/priv/combined_metrics/samples/function_design/context_is_first_parameter/bad/mailer.go b/priv/combined_metrics/samples/function_design/context_is_first_parameter/bad/mailer.go new file mode 100644 index 0000000..5b99bd2 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/context_is_first_parameter/bad/mailer.go @@ -0,0 +1,60 @@ +package mailer + +import ( + "context" + "fmt" + "net/smtp" +) + +// Message is an outbound email. +type Message struct { + To string + Subject string + Body string +} + +// SMTPMailer sends email over SMTP. +type SMTPMailer struct { + host string + port int + from string + auth smtp.Auth +} + +// New constructs an SMTPMailer. +func New(host string, port int, from string, auth smtp.Auth) *SMTPMailer { + return &SMTPMailer{host: host, port: port, from: from, auth: auth} +} + +// Send delivers a message. ctx is passed last and named "context" — both +// violate Go conventions: context must be first and named ctx. 
+func (m *SMTPMailer) Send(msg Message, context context.Context) error { + if context.Err() != nil { + return fmt.Errorf("send email: context already done: %w", context.Err()) + } + + addr := fmt.Sprintf("%s:%d", m.host, m.port) + body := fmt.Sprintf("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s", + m.from, msg.To, msg.Subject, msg.Body) + + if err := smtp.SendMail(addr, m.auth, m.from, []string{msg.To}, []byte(body)); err != nil { + return fmt.Errorf("send email to %q: %w", msg.To, err) + } + return nil +} + +// SendBulk delivers multiple messages. ctx is in the middle — inconsistent +// with the convention that context is always the first parameter. +func (m *SMTPMailer) SendBulk(msgs []Message, ctx context.Context, stopOnError bool) error { + for _, msg := range msgs { + if ctx.Err() != nil { + return fmt.Errorf("send bulk: %w", ctx.Err()) + } + if err := m.Send(msg, ctx); err != nil { + if stopOnError { + return err + } + } + } + return nil +} diff --git a/priv/combined_metrics/samples/function_design/context_is_first_parameter/good/mailer.go b/priv/combined_metrics/samples/function_design/context_is_first_parameter/good/mailer.go new file mode 100644 index 0000000..585c524 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/context_is_first_parameter/good/mailer.go @@ -0,0 +1,57 @@ +package mailer + +import ( + "context" + "fmt" + "net/smtp" +) + +// Message is an outbound email. +type Message struct { + To string + Subject string + Body string +} + +// SMTPMailer sends email over SMTP. +type SMTPMailer struct { + host string + port int + from string + auth smtp.Auth +} + +// New constructs an SMTPMailer. +func New(host string, port int, from string, auth smtp.Auth) *SMTPMailer { + return &SMTPMailer{host: host, port: port, from: from, auth: auth} +} + +// Send delivers a message. ctx is the first parameter, named ctx — idiomatic Go. 
+func (m *SMTPMailer) Send(ctx context.Context, msg Message) error { + if ctx.Err() != nil { + return fmt.Errorf("send email: context already done: %w", ctx.Err()) + } + + addr := fmt.Sprintf("%s:%d", m.host, m.port) + body := fmt.Sprintf("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s", + m.from, msg.To, msg.Subject, msg.Body) + + if err := smtp.SendMail(addr, m.auth, m.from, []string{msg.To}, []byte(body)); err != nil { + return fmt.Errorf("send email to %q: %w", msg.To, err) + } + return nil +} + +// SendBulk delivers multiple messages, stopping if the context is cancelled. +// ctx is the first parameter on every method that does I/O. +func (m *SMTPMailer) SendBulk(ctx context.Context, msgs []Message) error { + for _, msg := range msgs { + if ctx.Err() != nil { + return fmt.Errorf("send bulk: %w", ctx.Err()) + } + if err := m.Send(ctx, msg); err != nil { + return err + } + } + return nil +} diff --git a/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/bad/order_processor.ex b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/bad/order_processor.ex new file mode 100644 index 0000000..b73b804 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/bad/order_processor.ex @@ -0,0 +1,52 @@ +defmodule OrderProcessor do + def process(order) do + cond do + order.status == :new && order.payment_method == :card && order.total > 0 -> + if order.user.verified do + if order.items != [] do + case charge_card(order) do + {:ok, charge} -> + if order.total > 1000 do + notify_fraud_team(order) + end + {:ok, %{order | status: :paid, charge_id: charge.id}} + {:error, :declined} -> + {:error, :payment_declined} + {:error, _} -> + {:error, :payment_failed} + end + else + {:error, :empty_order} + end + else + {:error, :unverified_user} + end + + order.status == :new && order.payment_method == :invoice -> + if order.user.credit_approved do + {:ok, %{order | status: :invoiced}} + else + {:error, 
:credit_not_approved} + end + + order.status == :paid -> + if order.shipment_address != nil do + {:ok, %{order | status: :shipped}} + else + {:error, :no_address} + end + + order.status == :shipped -> + {:ok, %{order | status: :delivered}} + + order.status == :cancelled -> + {:error, :already_cancelled} + + true -> + {:error, :invalid_transition} + end + end + + defp charge_card(order), do: {:ok, %{id: "ch_#{order.id}"}} + defp notify_fraud_team(order), do: IO.puts("Fraud check: #{order.id}") +end diff --git a/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/config.yml b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/config.yml new file mode 100644 index 0000000..bdf1f44 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/config.yml @@ -0,0 +1 @@ +doc: "Functions should have a cyclomatic complexity under 10." diff --git a/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/good/order_processor.ex b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/good/order_processor.ex new file mode 100644 index 0000000..2ceae99 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/cyclomatic_complexity_under_10/good/order_processor.ex @@ -0,0 +1,50 @@ +defmodule OrderProcessor do + def process(%{status: :new, payment_method: :card} = order) do + with :ok <- verify_user(order), + :ok <- require_items(order), + {:ok, charge} <- charge_card(order) do + maybe_alert_fraud(order) + {:ok, %{order | status: :paid, charge_id: charge.id}} + end + end + + def process(%{status: :new, payment_method: :invoice} = order) do + if order.user.credit_approved do + {:ok, %{order | status: :invoiced}} + else + {:error, :credit_not_approved} + end + end + + def process(%{status: :paid} = order) do + case order.shipment_address do + nil -> {:error, :no_address} + _ -> {:ok, %{order | status: :shipped}} + end + end + + def 
process(%{status: :shipped} = order) do + {:ok, %{order | status: :delivered}} + end + + def process(%{status: :cancelled}) do + {:error, :already_cancelled} + end + + def process(_order), do: {:error, :invalid_transition} + + defp verify_user(%{user: %{verified: true}}), do: :ok + defp verify_user(_), do: {:error, :unverified_user} + + defp require_items(%{items: []}), do: {:error, :empty_order} + defp require_items(_), do: :ok + + defp maybe_alert_fraud(%{total: total} = order) when total > 1000 do + notify_fraud_team(order) + end + + defp maybe_alert_fraud(_order), do: :ok + + defp charge_card(order), do: {:ok, %{id: "ch_#{order.id}"}} + defp notify_fraud_team(order), do: IO.puts("Fraud check: #{order.id}") +end diff --git a/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/bad/EmailComposer.swift b/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/bad/EmailComposer.swift new file mode 100644 index 0000000..04475b5 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/bad/EmailComposer.swift @@ -0,0 +1,64 @@ +import Foundation + +struct EmailMessage { + let to: [String] + let subject: String + let body: String + let cc: [String] + let bcc: [String] + let isHTML: Bool + let attachments: [URL] +} + +class EmailComposer { + + // Default parameters intermixed with required ones, making call sites confusing + func compose( + cc: [String] = [], // default before required params + to recipients: [String], // required + isHTML: Bool = false, // default before more required params + subject: String, // required + attachments: [URL] = [], // default + body: String, // required — buried after defaults + bcc: [String] = [] + ) -> EmailMessage { + return EmailMessage( + to: recipients, + subject: subject, + body: body, + cc: cc, + bcc: bcc, + isHTML: isHTML, + attachments: attachments + ) + } + + // Default parameter (includeGettingStartedGuide) appears before required 
"name" + func sendWelcome( + to recipient: String, + includeGettingStartedGuide: Bool = true, // default before required + name: String, // required after default + replyTo: String? = nil + ) -> EmailMessage { + let body = includeGettingStartedGuide + ? "Welcome, \(name)! Check out our getting started guide." + : "Welcome, \(name)!" + + var bcc: [String] = [] + if let replyAddress = replyTo { + bcc.append(replyAddress) + } + + return EmailMessage(to: [recipient], subject: "Welcome!", body: body, cc: [], bcc: bcc, isHTML: false, attachments: []) + } + + // Default "retryCount" before required "date" + func scheduleDelivery( + for message: EmailMessage, + retryCount: Int = 3, // default before required + at date: Date, // required after default + retryDelay: TimeInterval = 60 + ) { + _ = (message, date, retryCount, retryDelay) + } +} diff --git a/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/good/EmailComposer.swift b/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/good/EmailComposer.swift new file mode 100644 index 0000000..167e085 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/default_parameters_placed_at_end/good/EmailComposer.swift @@ -0,0 +1,64 @@ +import Foundation + +struct EmailMessage { + let to: [String] + let subject: String + let body: String + let cc: [String] + let bcc: [String] + let isHTML: Bool + let attachments: [URL] +} + +class EmailComposer { + + // Required parameters first, defaults at the end + func compose( + to recipients: [String], + subject: String, + body: String, + cc: [String] = [], + bcc: [String] = [], + isHTML: Bool = false, + attachments: [URL] = [] + ) -> EmailMessage { + return EmailMessage( + to: recipients, + subject: subject, + body: body, + cc: cc, + bcc: bcc, + isHTML: isHTML, + attachments: attachments + ) + } + + // Required parameter first, optional config at end + func sendWelcome( + to recipient: String, + name: String, + 
includeGettingStartedGuide: Bool = true, + replyTo: String? = nil + ) -> EmailMessage { + let body = includeGettingStartedGuide + ? "Welcome, \(name)! Check out our getting started guide." + : "Welcome, \(name)!" + + var bcc: [String] = [] + if let replyAddress = replyTo { + bcc.append(replyAddress) + } + + return EmailMessage(to: [recipient], subject: "Welcome!", body: body, cc: [], bcc: bcc, isHTML: false, attachments: []) + } + + func scheduleDelivery( + for message: EmailMessage, + at date: Date, + retryCount: Int = 3, + retryDelay: TimeInterval = 60 + ) { + // Schedule logic here + _ = (message, date, retryCount, retryDelay) + } +} diff --git a/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/bad/reports.ex b/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/bad/reports.ex new file mode 100644 index 0000000..803e326 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/bad/reports.ex @@ -0,0 +1,64 @@ +defmodule MyApp.Reports do + @moduledoc """ + Report generation. + """ + + alias MyApp.Analytics + + # Bad: `format` option changes the return type from struct -> binary -> map. + # Callers cannot know the return type without inspecting the options. 
+ @spec build_revenue_report(Date.t(), Date.t(), keyword()) :: + map() | binary() | MyApp.Reports.RevenueReport.t() + def build_revenue_report(%Date{} = from, %Date{} = to, opts \\ []) do + rows = Analytics.revenue_by_day(from, to) + total = Enum.sum(Enum.map(rows, & &1.amount)) + + report = %{from: from, to: to, rows: rows, total: total} + + case Keyword.get(opts, :format) do + :csv -> + # Returns a binary when :csv + header = "date,amount\n" + body = Enum.map_join(rows, "\n", &"#{&1.date},#{&1.amount}") + header <> body + + :json -> + # Returns a map when :json + %{ + from: Date.to_iso8601(from), + to: Date.to_iso8601(to), + total: total, + rows: Enum.map(rows, &%{date: Date.to_iso8601(&1.date), amount: &1.amount}) + } + + nil -> + # Returns raw map with no format + report + end + end + + # Bad: `raw` option changes return from list of maps to list of tuples + @spec fetch_revenue_rows(Date.t(), Date.t(), keyword()) :: [map()] | [{Date.t(), integer()}] + def fetch_revenue_rows(from, to, opts \\ []) do + rows = Analytics.revenue_by_day(from, to) + + if Keyword.get(opts, :raw) do + Enum.map(rows, &{&1.date, &1.amount}) + else + rows + end + end + + # Bad: `verbose` option changes return from integer to map + @spec total_revenue(Date.t(), Date.t(), keyword()) :: integer() | map() + def total_revenue(from, to, opts \\ []) do + rows = Analytics.revenue_by_day(from, to) + total = Enum.sum(Enum.map(rows, & &1.amount)) + + if Keyword.get(opts, :verbose) do + %{total: total, from: from, to: to, row_count: length(rows)} + else + total + end + end +end diff --git a/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/good/reports.ex b/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/good/reports.ex new file mode 100644 index 0000000..01aafb2 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/function_does_not_change_return_type_via_options/good/reports.ex @@ -0,0 +1,69 @@ 
+defmodule MyApp.Reports do + @moduledoc """ + Report generation. Separate functions are used for distinct output + formats rather than changing the return type via options. + """ + + alias MyApp.Reports.{RevenueReport, SummaryReport} + alias MyApp.Analytics + + @doc """ + Builds a revenue report struct for the given date range. + Always returns a `RevenueReport` struct. + """ + @spec build_revenue_report(Date.t(), Date.t()) :: RevenueReport.t() + def build_revenue_report(%Date{} = from, %Date{} = to) do + rows = Analytics.revenue_by_day(from, to) + total = Enum.sum(Enum.map(rows, & &1.amount)) + + %RevenueReport{ + from: from, + to: to, + rows: rows, + total: total, + generated_at: DateTime.utc_now() + } + end + + @doc """ + Renders a revenue report as a CSV binary. + Always returns a binary. + """ + @spec render_revenue_csv(RevenueReport.t()) :: binary() + def render_revenue_csv(%RevenueReport{rows: rows}) do + header = "date,amount\n" + body = Enum.map_join(rows, "\n", &"#{&1.date},#{&1.amount}") + header <> body + end + + @doc """ + Renders a revenue report as a JSON-encodable map. + Always returns a map. + """ + @spec render_revenue_json(RevenueReport.t()) :: map() + def render_revenue_json(%RevenueReport{} = report) do + %{ + from: Date.to_iso8601(report.from), + to: Date.to_iso8601(report.to), + total: report.total, + rows: Enum.map(report.rows, &%{date: Date.to_iso8601(&1.date), amount: &1.amount}) + } + end + + @doc """ + Builds a summary report for a single month. + Always returns a `SummaryReport` struct. 
+ """ + @spec build_summary(integer(), integer()) :: SummaryReport.t() + def build_summary(year, month) do + data = Analytics.monthly_summary(year, month) + + %SummaryReport{ + year: year, + month: month, + total_orders: data.order_count, + total_revenue: data.revenue, + avg_order_value: data.revenue / max(data.order_count, 1) + } + end +end diff --git a/priv/combined_metrics/samples/function_design/has_verb_in_name/bad/api.ex b/priv/combined_metrics/samples/function_design/has_verb_in_name/bad/api.ex new file mode 100644 index 0000000..8ac1843 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/has_verb_in_name/bad/api.ex @@ -0,0 +1,54 @@ +defmodule Api do + def user_data(user_id) do + case http_get("/users/#{user_id}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def order_status(order_id) do + case http_get("/orders/#{order_id}/status") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def payment_result(payment_id) do + case http_get("/payments/#{payment_id}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def product_inventory(sku) do + case http_get("/inventory/#{sku}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def shipment_tracking(tracking_number) do + case http_get("/shipments/#{tracking_number}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def customer_profile(customer_id) do + case http_get("/customers/#{customer_id}/profile") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def webhook_registration(url, events) do + payload = Jason.encode!(%{url: url, events: events}) + case http_post("/webhooks", payload) do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + defp 
http_get(path), do: {:ok, ~s({"path":"#{path}"})} + defp http_post(path, _body), do: {:ok, ~s({"path":"#{path}","created":true})} +end diff --git a/priv/combined_metrics/samples/function_design/has_verb_in_name/config.yml b/priv/combined_metrics/samples/function_design/has_verb_in_name/config.yml new file mode 100644 index 0000000..cd84977 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/has_verb_in_name/config.yml @@ -0,0 +1 @@ +doc: "Function names should contain a verb describing the action performed." diff --git a/priv/combined_metrics/samples/function_design/has_verb_in_name/good/api.ex b/priv/combined_metrics/samples/function_design/has_verb_in_name/good/api.ex new file mode 100644 index 0000000..c60f64d --- /dev/null +++ b/priv/combined_metrics/samples/function_design/has_verb_in_name/good/api.ex @@ -0,0 +1,54 @@ +defmodule Api do + def fetch_user_data(user_id) do + case http_get("/users/#{user_id}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def get_order_status(order_id) do + case http_get("/orders/#{order_id}/status") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def retrieve_payment_result(payment_id) do + case http_get("/payments/#{payment_id}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def check_product_inventory(sku) do + case http_get("/inventory/#{sku}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def track_shipment(tracking_number) do + case http_get("/shipments/#{tracking_number}") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def load_customer_profile(customer_id) do + case http_get("/customers/#{customer_id}/profile") do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + def register_webhook(url, 
events) do + payload = Jason.encode!(%{url: url, events: events}) + case http_post("/webhooks", payload) do + {:ok, body} -> {:ok, Jason.decode!(body)} + {:error, reason} -> {:error, reason} + end + end + + defp http_get(path), do: {:ok, ~s({"path":"#{path}"})} + defp http_post(path, _body), do: {:ok, ~s({"path":"#{path}","created":true})} +end diff --git a/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/bad/Matrix.cpp b/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/bad/Matrix.cpp new file mode 100644 index 0000000..2d26944 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/bad/Matrix.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +// Output parameters appear before inputs — confusing parameter order + +// Output result comes first — counterintuitive +void multiplyScalar(std::vector& result, // output first — confusing + const std::vector& input, // input second + double scalar) +{ + result.resize(input.size()); + for (std::size_t i = 0; i < input.size(); ++i) + result[i] = input[i] * scalar; +} + +// Output before input dimensions — reader must study the body to understand the order +void transpose(double* output, // output first + const double* input, // input second + std::size_t rows, std::size_t cols) +{ + for (std::size_t r = 0; r < rows; ++r) + for (std::size_t c = 0; c < cols; ++c) + output[c * rows + r] = input[r * cols + c]; +} + +// Output interleaved with inputs — no clear convention +void addVectors(double* result, // output first + const double* a, // input + std::size_t size, // input dimension + const double* b) // second input — split from first by size +{ + for (std::size_t i = 0; i < size; ++i) + result[i] = a[i] + b[i]; +} + +// Output buried in the middle of the parameter list +void multiplyMatrices(const double* lhs, + std::size_t lhsRows, + double* result, // output in the middle + const 
double* rhs, + std::size_t sharedDim, + std::size_t rhsCols) +{ + for (std::size_t i = 0; i < lhsRows; ++i) + for (std::size_t k = 0; k < sharedDim; ++k) + for (std::size_t j = 0; j < rhsCols; ++j) + result[i * rhsCols + j] += lhs[i * sharedDim + k] * rhs[k * rhsCols + j]; +} + +// Output first, then all inputs +void formatRow(std::string& output, // output first + int rowIndex, + const std::vector<double>& values, + char separator) +{ + output.clear(); + output += std::to_string(rowIndex) + separator; + for (std::size_t i = 0; i < values.size(); ++i) { + if (i > 0) output += separator; + output += std::to_string(values[i]); + } +} diff --git a/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/good/Matrix.cpp b/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/good/Matrix.cpp new file mode 100644 index 0000000..51e9333 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/input_parameters_before_output_parameters/good/Matrix.cpp @@ -0,0 +1,53 @@ +#include <cstddef> +#include <string> +#include <vector> + +// Inputs come first, outputs last — consistent with standard library conventions (e.g., std::copy) + +// Pure inputs first, result returned by value +std::vector<double> multiplyScalar(const std::vector<double>& input, double scalar) { + std::vector<double> result(input.size()); + for (std::size_t i = 0; i < input.size(); ++i) + result[i] = input[i] * scalar; + return result; +} + +// Input rows/cols before the output matrix +void transpose(const double* input, std::size_t rows, std::size_t cols, + double* output) // output parameter last +{ + for (std::size_t r = 0; r < rows; ++r) + for (std::size_t c = 0; c < cols; ++c) + output[c * rows + r] = input[r * cols + c]; +} + +// Read-only inputs (a, b, size) before write output (result) +void addVectors(const double* a, const double* b, std::size_t size, + double* result) // output last +{ + for (std::size_t i = 0; i < size; ++i) + result[i] = a[i] + b[i]; +} + +// Inputs: lhs, rhs 
matrices and their dimensions; output: result matrix last +void multiplyMatrices(const double* lhs, const double* rhs, + std::size_t lhsRows, std::size_t sharedDim, std::size_t rhsCols, + double* result) // output last +{ + for (std::size_t i = 0; i < lhsRows; ++i) + for (std::size_t k = 0; k < sharedDim; ++k) + for (std::size_t j = 0; j < rhsCols; ++j) + result[i * rhsCols + j] += lhs[i * sharedDim + k] * rhs[k * rhsCols + j]; +} + +// Input configuration first, output buffer last +void formatRow(int rowIndex, const std::vector<double>& values, char separator, + std::string& output) // output last +{ + output.clear(); + output += std::to_string(rowIndex) + separator; + for (std::size_t i = 0; i < values.size(); ++i) { + if (i > 0) output += separator; + output += std::to_string(values[i]); + } +} diff --git a/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/bad/storage.go b/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/bad/storage.go new file mode 100644 index 0000000..4a93408 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/bad/storage.go @@ -0,0 +1,43 @@ +package storage + +import "context" + +// Document is a stored item. +type Document struct { + ID string + Content []byte +} + +// Store is a fat interface with many methods. Implementors must provide all of +// them even when a caller only needs Read. This makes mocking in tests verbose +// and tightly couples callers to the full Store surface. +type Store interface { + Read(ctx context.Context, id string) (*Document, error) + Write(ctx context.Context, doc Document) error + Delete(ctx context.Context, id string) error + List(ctx context.Context) ([]Document, error) + Count(ctx context.Context) (int, error) + Exists(ctx context.Context, id string) (bool, error) + Ping(ctx context.Context) error +} + +// DocumentService depends on the entire Store interface even though it only +// uses Read and Write. 
+type DocumentService struct { + store Store +} + +// New constructs a DocumentService. +func New(store Store) *DocumentService { + return &DocumentService{store: store} +} + +// Get fetches a document by ID. +func (s *DocumentService) Get(ctx context.Context, id string) (*Document, error) { + return s.store.Read(ctx, id) +} + +// Save persists a document. +func (s *DocumentService) Save(ctx context.Context, doc Document) error { + return s.store.Write(ctx, doc) +} diff --git a/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/good/storage.go b/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/good/storage.go new file mode 100644 index 0000000..be1d6ba --- /dev/null +++ b/priv/combined_metrics/samples/function_design/interface_has_one_or_two_methods/good/storage.go @@ -0,0 +1,53 @@ +package storage + +import "context" + +// Document is a stored item. +type Document struct { + ID string + Content []byte +} + +// Reader is a single-method interface for fetching a document. +// Small interfaces are easy to implement, test, and compose. +type Reader interface { + Read(ctx context.Context, id string) (*Document, error) +} + +// Writer is a single-method interface for persisting a document. +type Writer interface { + Write(ctx context.Context, doc Document) error +} + +// Deleter is a single-method interface for removing a document. +type Deleter interface { + Delete(ctx context.Context, id string) error +} + +// ReadWriter composes Reader and Writer for callers that need both. +// Composed from small interfaces rather than a large monolith. +type ReadWriter interface { + Reader + Writer +} + +// DocumentService uses only the capabilities it requires. +type DocumentService struct { + rw ReadWriter + deleter Deleter +} + +// New constructs a DocumentService. +func New(rw ReadWriter, deleter Deleter) *DocumentService { + return &DocumentService{rw: rw, deleter: deleter} +} + +// Get fetches a document by ID. 
+func (s *DocumentService) Get(ctx context.Context, id string) (*Document, error) { + return s.rw.Read(ctx, id) +} + +// Save persists a document. +func (s *DocumentService) Save(ctx context.Context, doc Document) error { + return s.rw.Write(ctx, doc) +} diff --git a/priv/combined_metrics/samples/function_design/is_less_than_20_lines/bad/report_generator.ex b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/bad/report_generator.ex new file mode 100644 index 0000000..90ba048 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/bad/report_generator.ex @@ -0,0 +1,53 @@ +defmodule ReportGenerator do + def generate_report(orders, user, opts) do + start_date = Keyword.get(opts, :start_date) + end_date = Keyword.get(opts, :end_date) + format = Keyword.get(opts, :format, :pdf) + + filtered = Enum.filter(orders, fn order -> + order.user_id == user.id && + (is_nil(start_date) || Date.compare(order.date, start_date) != :lt) && + (is_nil(end_date) || Date.compare(order.date, end_date) != :gt) + end) + + total = Enum.reduce(filtered, 0, fn order, acc -> + line_total = Enum.reduce(order.items, 0, fn item, item_acc -> + item_acc + item.price * item.quantity + end) + acc + line_total + end) + + discount = if user.vip do + total * 0.1 + else + 0 + end + + net_total = total - discount + tax = net_total * 0.2 + grand_total = net_total + tax + + summary_lines = Enum.map(filtered, fn order -> + items_text = Enum.map_join(order.items, ", ", fn item -> + "#{item.name} x#{item.quantity} @ #{item.price}" + end) + "Order #{order.id} (#{order.date}): #{items_text}" + end) + + header = "Report for #{user.name} | #{start_date} - #{end_date}" + body = Enum.join(summary_lines, "\n") + footer = "Subtotal: #{total} | Discount: #{discount} | Tax: #{tax} | Total: #{grand_total}" + + content = "#{header}\n\n#{body}\n\n#{footer}" + + case format do + :pdf -> {:ok, render_pdf(content)} + :csv -> {:ok, render_csv(filtered, grand_total)} + :html -> 
{:ok, "
    #{content}
    "} + _ -> {:error, :unsupported_format} + end + end + + defp render_pdf(content), do: %{type: :pdf, data: content} + defp render_csv(orders, total), do: %{type: :csv, orders: orders, total: total} +end diff --git a/priv/combined_metrics/samples/function_design/is_less_than_20_lines/config.yml b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/config.yml new file mode 100644 index 0000000..10d2d75 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/config.yml @@ -0,0 +1 @@ +doc: "Functions should be 20 lines or fewer." diff --git a/priv/combined_metrics/samples/function_design/is_less_than_20_lines/good/report_generator.ex b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/good/report_generator.ex new file mode 100644 index 0000000..c777d77 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/is_less_than_20_lines/good/report_generator.ex @@ -0,0 +1,55 @@ +defmodule ReportGenerator do + def generate_report(orders, user, opts) do + filtered = filter_orders(orders, user, opts) + totals = calculate_totals(filtered, user) + content = build_content(filtered, user, totals, opts) + render(content, filtered, totals, Keyword.get(opts, :format, :pdf)) + end + + defp filter_orders(orders, user, opts) do + start_date = Keyword.get(opts, :start_date) + end_date = Keyword.get(opts, :end_date) + + Enum.filter(orders, fn order -> + order.user_id == user.id && + within_date_range?(order.date, start_date, end_date) + end) + end + + defp within_date_range?(date, start_date, end_date) do + (is_nil(start_date) || Date.compare(date, start_date) != :lt) && + (is_nil(end_date) || Date.compare(date, end_date) != :gt) + end + + defp calculate_totals(orders, user) do + subtotal = Enum.sum(Enum.map(orders, &order_subtotal/1)) + discount = if user.vip, do: subtotal * 0.1, else: 0 + net = subtotal - discount + %{subtotal: subtotal, discount: discount, net: net, tax: net * 0.2, grand: net + net * 0.2} + end 
+ + defp order_subtotal(order) do + Enum.sum(Enum.map(order.items, fn item -> item.price * item.quantity end)) + end + + defp build_content(orders, user, totals, opts) do + start_date = Keyword.get(opts, :start_date) + end_date = Keyword.get(opts, :end_date) + header = "Report for #{user.name} | #{start_date} - #{end_date}" + body = Enum.map_join(orders, "\n", &format_order_line/1) + footer = "Subtotal: #{totals.subtotal} | Discount: #{totals.discount} | Tax: #{totals.tax} | Total: #{totals.grand}" + "#{header}\n\n#{body}\n\n#{footer}" + end + + defp format_order_line(order) do + items_text = Enum.map_join(order.items, ", ", fn item -> + "#{item.name} x#{item.quantity} @ #{item.price}" + end) + "Order #{order.id} (#{order.date}): #{items_text}" + end + + defp render(content, _orders, _totals, :pdf), do: {:ok, %{type: :pdf, data: content}} + defp render(_content, orders, totals, :csv), do: {:ok, %{type: :csv, orders: orders, total: totals.grand}} + defp render(content, _orders, _totals, :html), do: {:ok, "
    #{content}
    "} + defp render(_content, _orders, _totals, _), do: {:error, :unsupported_format} +end diff --git a/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/bad/Buffer.cpp b/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/bad/Buffer.cpp new file mode 100644 index 0000000..7ced490 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/bad/Buffer.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +class Buffer { +public: + explicit Buffer(std::size_t capacity) + : data_(std::make_unique(capacity)) + , capacity_(capacity) + , size_(0) + {} + + // Move constructor without noexcept: + // std::vector and other containers will use the copy constructor instead of move + // during reallocation, causing unnecessary heap allocations and memcpy calls + Buffer(Buffer&& other) // missing noexcept + : data_(std::move(other.data_)) + , capacity_(other.capacity_) + , size_(other.size_) + { + other.capacity_ = 0; + other.size_ = 0; + } + + // Move assignment also missing noexcept + Buffer& operator=(Buffer&& other) // missing noexcept + { + if (this != &other) { + data_ = std::move(other.data_); + capacity_ = other.capacity_; + size_ = other.size_; + other.capacity_ = 0; + other.size_ = 0; + } + return *this; + } + + Buffer(const Buffer& other) + : data_(std::make_unique(other.capacity_)) + , capacity_(other.capacity_) + , size_(other.size_) + { + std::memcpy(data_.get(), other.data_.get(), other.size_); + } + + Buffer& operator=(const Buffer& other) { + if (this != &other) { + auto newData = std::make_unique(other.capacity_); + std::memcpy(newData.get(), other.data_.get(), other.size_); + data_ = std::move(newData); + capacity_ = other.capacity_; + size_ = other.size_; + } + return *this; + } + + void append(const uint8_t* src, std::size_t length) { + if (size_ + length > capacity_) + throw std::overflow_error("Buffer capacity exceeded"); + std::memcpy(data_.get() + 
size_, src, length); + size_ += length; + } + + std::size_t size() const noexcept { return size_; } + std::size_t capacity() const noexcept { return capacity_; } + +private: + std::unique_ptr data_; + std::size_t capacity_; + std::size_t size_; +}; + +// Because move ctor is not noexcept, std::vector will copy (not move) Buffer +// objects during reallocation — expensive for large buffers +void demonstrateVectorRealloc() { + std::vector buffers; + buffers.reserve(4); + for (int i = 0; i < 8; ++i) + buffers.emplace_back(1024); // triggers copy, not move, on reallocation +} diff --git a/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/good/Buffer.cpp b/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/good/Buffer.cpp new file mode 100644 index 0000000..3fce1c2 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/move_constructors_are_noexcept/good/Buffer.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include + +class Buffer { +public: + explicit Buffer(std::size_t capacity) + : data_(std::make_unique(capacity)) + , capacity_(capacity) + , size_(0) + {} + + // Move constructor is noexcept: only reassigns pointers and integers — cannot throw + Buffer(Buffer&& other) noexcept + : data_(std::move(other.data_)) + , capacity_(other.capacity_) + , size_(other.size_) + { + other.capacity_ = 0; + other.size_ = 0; + } + + // Move assignment is noexcept for the same reason + Buffer& operator=(Buffer&& other) noexcept { + if (this != &other) { + data_ = std::move(other.data_); + capacity_ = other.capacity_; + size_ = other.size_; + other.capacity_ = 0; + other.size_ = 0; + } + return *this; + } + + Buffer(const Buffer& other) + : data_(std::make_unique(other.capacity_)) + , capacity_(other.capacity_) + , size_(other.size_) + { + std::memcpy(data_.get(), other.data_.get(), other.size_); + } + + Buffer& operator=(const Buffer& other) { + if (this != &other) { + auto newData = 
std::make_unique(other.capacity_); + std::memcpy(newData.get(), other.data_.get(), other.size_); + data_ = std::move(newData); + capacity_ = other.capacity_; + size_ = other.size_; + } + return *this; + } + + void append(const uint8_t* src, std::size_t length) { + if (size_ + length > capacity_) + throw std::overflow_error("Buffer capacity exceeded"); + std::memcpy(data_.get() + size_, src, length); + size_ += length; + } + + std::size_t size() const noexcept { return size_; } + std::size_t capacity() const noexcept { return capacity_; } + +private: + std::unique_ptr data_; + std::size_t capacity_; + std::size_t size_; +}; + +// noexcept move allows std::vector to use move during reallocation +// rather than copying — important for performance +void demonstrateVectorRealloc() { + std::vector buffers; + buffers.reserve(4); + for (int i = 0; i < 8; ++i) + buffers.emplace_back(1024); // triggers reallocation; uses move ctor (noexcept) +} diff --git a/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/bad/parser.go b/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/bad/parser.go new file mode 100644 index 0000000..c57a550 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/bad/parser.go @@ -0,0 +1,56 @@ +package parser + +import ( + "bufio" + "fmt" + "strconv" + "strings" +) + +// ParseCSVRow splits a CSV line into its fields. +// Without named returns the two []string values are indistinguishable from +// the signature alone — callers must read the body to know which is which. 
+func ParseCSVRow(line string) ([]string, []string) { + var headers, values []string + parts := strings.Split(line, ",") + for i, p := range parts { + p = strings.TrimSpace(p) + if i == 0 { + headers = append(headers, p) + } else { + values = append(values, p) + } + } + return headers, values +} + +// ParseBounds extracts the start and end line numbers from a range string "N-M". +// Three return values of which two are int — callers cannot tell from the +// signature which int is start and which is end without reading the body. +func ParseBounds(rangeStr string) (int, int, error) { + parts := strings.SplitN(rangeStr, "-", 2) + if len(parts) != 2 { + return 0, 0, fmt.Errorf("invalid range %q: expected format N-M", rangeStr) + } + start, err := strconv.Atoi(strings.TrimSpace(parts[0])) + if err != nil { + return 0, 0, fmt.Errorf("invalid start in range %q: %w", rangeStr, err) + } + end, err := strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + return 0, 0, fmt.Errorf("invalid end in range %q: %w", rangeStr, err) + } + return start, end, nil +} + +// CountWords scans a multi-line string and returns two counts. +// The two ints are ambiguous — is it (words, lines) or (lines, words)? +func CountWords(text string) (int, int) { + var words, lines int + scanner := bufio.NewScanner(strings.NewReader(text)) + for scanner.Scan() { + lines++ + words += len(strings.Fields(scanner.Text())) + } + return words, lines +} diff --git a/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/good/parser.go b/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/good/parser.go new file mode 100644 index 0000000..0abe9cf --- /dev/null +++ b/priv/combined_metrics/samples/function_design/named_return_values_used_for_documentation/good/parser.go @@ -0,0 +1,58 @@ +package parser + +import ( + "bufio" + "fmt" + "strconv" + "strings" +) + +// ParseCSVRow splits a CSV line into its fields. 
+// Named returns make the two string slices unambiguous at the call site. +func ParseCSVRow(line string) (headers []string, values []string) { + parts := strings.Split(line, ",") + for i, p := range parts { + p = strings.TrimSpace(p) + if i == 0 { + headers = append(headers, p) + } else { + values = append(values, p) + } + } + return +} + +// ParseBounds extracts the start and end line numbers from a range string "N-M". +// Named returns clarify which int is start and which is end. +func ParseBounds(rangeStr string) (start, end int, err error) { + parts := strings.SplitN(rangeStr, "-", 2) + if len(parts) != 2 { + err = fmt.Errorf("invalid range %q: expected format N-M", rangeStr) + return + } + start, err = strconv.Atoi(strings.TrimSpace(parts[0])) + if err != nil { + err = fmt.Errorf("invalid start in range %q: %w", rangeStr, err) + return + } + end, err = strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + err = fmt.Errorf("invalid end in range %q: %w", rangeStr, err) + return + } + if end < start { + err = fmt.Errorf("end %d is before start %d in range %q", end, start, rangeStr) + } + return +} + +// CountWords scans a multi-line string and returns word and line counts. +// Named returns document what each int represents. 
+func CountWords(text string) (words, lines int) { + scanner := bufio.NewScanner(strings.NewReader(text)) + for scanner.Scan() { + lines++ + words += len(strings.Fields(scanner.Text())) + } + return +} diff --git a/priv/combined_metrics/samples/function_design/nesting_depth_under_4/bad/validator.ex b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/bad/validator.ex new file mode 100644 index 0000000..b53ed02 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/bad/validator.ex @@ -0,0 +1,59 @@ +defmodule Validator do + def validate_request(request) do + if request != nil do + if Map.has_key?(request, :user) do + if request.user != nil do + if Map.has_key?(request.user, :role) do + if request.user.role in [:admin, :editor, :viewer] do + if Map.has_key?(request, :payload) do + {:ok, request} + else + {:error, "missing payload"} + end + else + {:error, "invalid role"} + end + else + {:error, "missing role"} + end + else + {:error, "user is nil"} + end + else + {:error, "missing user"} + end + else + {:error, "request is nil"} + end + end + + def validate_order(order) do + case order do + nil -> {:error, "order is nil"} + _ -> + case order.status do + :pending -> + case order.items do + [] -> {:error, "no items"} + items -> + case Enum.all?(items, &valid_item?/1) do + true -> + case order.payment do + nil -> {:error, "no payment"} + payment -> + case payment.method do + :card -> {:ok, order} + :cash -> {:ok, order} + _ -> {:error, "invalid payment method"} + end + end + false -> {:error, "invalid item"} + end + end + _ -> {:error, "order not pending"} + end + end + end + + defp valid_item?(item), do: item.quantity > 0 && item.price > 0 +end diff --git a/priv/combined_metrics/samples/function_design/nesting_depth_under_4/config.yml b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/config.yml new file mode 100644 index 0000000..df376c9 --- /dev/null +++ 
b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/config.yml @@ -0,0 +1 @@ +doc: "Code should not nest deeper than 4 levels." diff --git a/priv/combined_metrics/samples/function_design/nesting_depth_under_4/good/validator.ex b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/good/validator.ex new file mode 100644 index 0000000..5ced262 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/nesting_depth_under_4/good/validator.ex @@ -0,0 +1,49 @@ +defmodule Validator do + def validate_request(nil), do: {:error, "request is nil"} + + def validate_request(request) do + with {:ok, user} <- fetch_user(request), + :ok <- validate_role(user), + :ok <- require_payload(request) do + {:ok, request} + end + end + + def validate_order(nil), do: {:error, "order is nil"} + + def validate_order(order) do + with :ok <- require_pending(order), + :ok <- require_items(order.items), + :ok <- validate_items(order.items), + :ok <- validate_payment(order.payment) do + {:ok, order} + end + end + + defp fetch_user(%{user: nil}), do: {:error, "user is nil"} + defp fetch_user(%{user: user}), do: {:ok, user} + defp fetch_user(_), do: {:error, "missing user"} + + defp validate_role(%{role: role}) when role in [:admin, :editor, :viewer], do: :ok + defp validate_role(%{role: _}), do: {:error, "invalid role"} + defp validate_role(_), do: {:error, "missing role"} + + defp require_payload(%{payload: _}), do: :ok + defp require_payload(_), do: {:error, "missing payload"} + + defp require_pending(%{status: :pending}), do: :ok + defp require_pending(_), do: {:error, "order not pending"} + + defp require_items([]), do: {:error, "no items"} + defp require_items(_), do: :ok + + defp validate_items(items) do + if Enum.all?(items, &valid_item?/1), do: :ok, else: {:error, "invalid item"} + end + + defp validate_payment(nil), do: {:error, "no payment"} + defp validate_payment(%{method: method}) when method in [:card, :cash], do: :ok + defp 
validate_payment(_), do: {:error, "invalid payment method"} + + defp valid_item?(item), do: item.quantity > 0 && item.price > 0 +end diff --git a/priv/combined_metrics/samples/function_design/no_arguments_object/bad/query_builder.js b/priv/combined_metrics/samples/function_design/no_arguments_object/bad/query_builder.js new file mode 100644 index 0000000..38ff091 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_arguments_object/bad/query_builder.js @@ -0,0 +1,57 @@ +function buildSelectClause() { + const table = arguments[0]; + if (arguments.length <= 1) { + return `SELECT * FROM ${table}`; + } + const columns = []; + for (let i = 1; i < arguments.length; i++) { + columns.push(`"${arguments[i]}"`); + } + return `SELECT ${columns.join(", ")} FROM "${table}"`; +} + +function buildWhereClause() { + if (arguments.length === 0) return ""; + const conditions = []; + for (let i = 0; i < arguments.length; i++) { + conditions.push(arguments[i]); + } + return "WHERE " + conditions.join(" AND "); +} + +function mergeQueryOptions() { + const result = {}; + for (let i = 0; i < arguments.length; i++) { + Object.assign(result, arguments[i]); + } + return result; +} + +function buildOrderClause() { + if (arguments.length === 0) return ""; + const parts = []; + for (let i = 0; i < arguments.length; i++) { + const field = arguments[i]; + parts.push(`"${field.column}" ${field.direction || "ASC"}`); + } + return "ORDER BY " + parts.join(", "); +} + +function buildQuery(table, options) { + const columns = options.columns || []; + const conditions = options.conditions || []; + const orderBy = options.orderBy || []; + + const selectPart = buildSelectClause.apply(null, [table].concat(columns)); + const wherePart = buildWhereClause.apply(null, conditions); + const orderPart = buildOrderClause.apply(null, orderBy); + + const parts = [selectPart, wherePart, orderPart].filter(Boolean); + + if (options.limit != null) parts.push(`LIMIT ${Number(options.limit)}`); + if 
(options.offset != null) parts.push(`OFFSET ${Number(options.offset)}`); + + return parts.join(" "); +} + +export { buildQuery, buildSelectClause, buildWhereClause, buildOrderClause, mergeQueryOptions }; diff --git a/priv/combined_metrics/samples/function_design/no_arguments_object/good/query_builder.js b/priv/combined_metrics/samples/function_design/no_arguments_object/good/query_builder.js new file mode 100644 index 0000000..19c8fc2 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_arguments_object/good/query_builder.js @@ -0,0 +1,49 @@ +function buildSelectClause(table, ...columns) { + if (columns.length === 0) { + return `SELECT * FROM ${table}`; + } + const escaped = columns.map((c) => `"${c}"`).join(", "); + return `SELECT ${escaped} FROM "${table}"`; +} + +function buildWhereClause(...conditions) { + if (conditions.length === 0) return ""; + return "WHERE " + conditions.join(" AND "); +} + +function mergeQueryOptions(...optionSets) { + return Object.assign({}, ...optionSets); +} + +function buildOrderClause(...fields) { + if (fields.length === 0) return ""; + const parts = fields.map(({ column, direction = "ASC" }) => `"${column}" ${direction}`); + return "ORDER BY " + parts.join(", "); +} + +function buildQuery(table, options = {}, ...extraConditions) { + const { columns = [], conditions = [], orderBy = [], limit, offset } = options; + + const allConditions = [...conditions, ...extraConditions]; + + const parts = [ + buildSelectClause(table, ...columns), + buildWhereClause(...allConditions), + buildOrderClause(...orderBy), + ].filter(Boolean); + + if (limit != null) parts.push(`LIMIT ${Number(limit)}`); + if (offset != null) parts.push(`OFFSET ${Number(offset)}`); + + return parts.join(" "); +} + +function paginatedQuery(table, page, pageSize, ...baseConditions) { + return buildQuery(table, { + conditions: baseConditions, + limit: pageSize, + offset: (page - 1) * pageSize, + }); +} + +export { buildQuery, buildSelectClause, 
buildWhereClause, buildOrderClause, paginatedQuery, mergeQueryOptions }; diff --git a/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/bad/BackgroundSync.cs b/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/bad/BackgroundSync.cs new file mode 100644 index 0000000..935bbe0 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/bad/BackgroundSync.cs @@ -0,0 +1,61 @@ +using System; +using System.Threading.Tasks; + +namespace Sync +{ + public class BackgroundSync + { + private readonly ISyncRepository _repository; + private readonly ILogger _logger; + + public BackgroundSync(ISyncRepository repository, ILogger logger) + { + _repository = repository; + _logger = logger; + } + + // async void: callers cannot await this; exceptions crash the process unhandled + public async void Synchronize() + { + var pending = await _repository.GetPendingItemsAsync(); + foreach (var item in pending) + { + await _repository.PushItemAsync(item); + await _repository.MarkSyncedAsync(item.Id); + } + } + + // async void: caller cannot observe exceptions or know when it finishes + public async void SynchronizeWithLogging() + { + try + { + var pending = await _repository.GetPendingItemsAsync(); + foreach (var item in pending) + { + await _repository.PushItemAsync(item); + await _repository.MarkSyncedAsync(item.Id); + } + } + catch (Exception ex) + { + // Exception is swallowed here; no way for callers to know about it + _logger.Error("Sync failed", ex); + } + } + + // async void: cannot be unit tested properly; cannot be awaited in service startup + public async void RetryFailed() + { + var failed = await _repository.GetFailedItemsAsync(); + foreach (var item in failed) + await _repository.PushItemAsync(item); + } + + public void TriggerSync() + { + // Fire-and-forget via async void — exceptions are silently lost + Synchronize(); + } + } +} diff --git 
a/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/good/BackgroundSync.cs b/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/good/BackgroundSync.cs new file mode 100644 index 0000000..5d14172 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_async_void_outside_event_handlers/good/BackgroundSync.cs @@ -0,0 +1,74 @@ +using System; +using System.Threading.Tasks; +using System.Windows.Forms; + +namespace Sync +{ + public class BackgroundSync + { + private readonly ISyncRepository _repository; + private readonly ILogger _logger; + + public BackgroundSync(ISyncRepository repository, ILogger logger) + { + _repository = repository; + _logger = logger; + } + + // Returns Task so callers can await, observe exceptions, and compose + public async Task SynchronizeAsync() + { + var pending = await _repository.GetPendingItemsAsync(); + foreach (var item in pending) + { + await _repository.PushItemAsync(item); + await _repository.MarkSyncedAsync(item.Id); + } + } + + public async Task SynchronizeWithResultAsync() + { + int synced = 0; + int failed = 0; + + var pending = await _repository.GetPendingItemsAsync(); + foreach (var item in pending) + { + try + { + await _repository.PushItemAsync(item); + await _repository.MarkSyncedAsync(item.Id); + synced++; + } + catch (SyncException ex) + { + _logger.Warning("Failed to sync item {id}", ex); + failed++; + } + } + + return new SyncResult(synced, failed); + } + + // async void is acceptable ONLY for event handlers — exceptions cannot be caught otherwise + private async void OnSyncButtonClicked(object sender, EventArgs e) + { + try + { + await SynchronizeAsync(); + } + catch (Exception ex) + { + _logger.Error("Sync failed from UI button", ex); + MessageBox.Show("Sync failed. 
Please try again."); + } + } + + public async Task RetryFailedAsync() + { + var failed = await _repository.GetFailedItemsAsync(); + foreach (var item in failed) + await _repository.PushItemAsync(item); + } + } +} diff --git a/priv/combined_metrics/samples/function_design/no_boolean_parameter/bad/notifications.ex b/priv/combined_metrics/samples/function_design/no_boolean_parameter/bad/notifications.ex new file mode 100644 index 0000000..e6f0bcd --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_boolean_parameter/bad/notifications.ex @@ -0,0 +1,49 @@ +defmodule Notifications do + def send_email(user, is_welcome) do + if is_welcome do + deliver(user.email, "Welcome!", "Hello #{user.name}, welcome aboard!") + else + deliver(user.email, "See you soon", "Goodbye #{user.name}, we hope to see you again.") + end + end + + def notify_order(user, order, is_shipped) do + if is_shipped do + deliver(user.email, "Your order shipped!", "Order #{order.id} is on its way.") + else + deliver(user.email, "Order confirmed", "We received your order #{order.id}.") + end + end + + def send_payment_notification(user, amount, succeeded) do + if succeeded do + deliver(user.email, "Payment received", "We received your payment of #{amount}.") + else + deliver(user.email, "Payment failed", "Your payment of #{amount} could not be processed.") + end + end + + def schedule_reminder(user, event, is_urgent) do + subject = if is_urgent, do: "[URGENT] Reminder", else: "Reminder" + body = "Don't forget: #{event.title} at #{event.time}" + deliver_with_priority(user.email, subject, body, is_urgent) + end + + def send_admin_alert(admin, message, include_details) do + body = + if include_details do + "#{message}\n\nDetails: #{inspect(message)}" + else + message + end + deliver(admin.email, "Admin Alert", body) + end + + defp deliver(to, subject, body) do + {:ok, %{to: to, subject: subject, body: body}} + end + + defp deliver_with_priority(to, subject, body, urgent) do + {:ok, %{to: to, 
subject: subject, body: body, priority: if(urgent, do: :high, else: :normal)}} + end +end diff --git a/priv/combined_metrics/samples/function_design/no_boolean_parameter/config.yml b/priv/combined_metrics/samples/function_design/no_boolean_parameter/config.yml new file mode 100644 index 0000000..fd6e0d6 --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_boolean_parameter/config.yml @@ -0,0 +1 @@ +doc: "Functions should not take boolean parameters — a flag usually means the function does two things." diff --git a/priv/combined_metrics/samples/function_design/no_boolean_parameter/good/notifications.ex b/priv/combined_metrics/samples/function_design/no_boolean_parameter/good/notifications.ex new file mode 100644 index 0000000..31d02cd --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_boolean_parameter/good/notifications.ex @@ -0,0 +1,52 @@ +defmodule Notifications do + def send_welcome_email(user) do + deliver(user.email, "Welcome!", "Hello #{user.name}, welcome aboard!") + end + + def send_farewell_email(user) do + deliver(user.email, "See you soon", "Goodbye #{user.name}, we hope to see you again.") + end + + def notify_order_shipped(user, order) do + deliver(user.email, "Your order shipped!", "Order #{order.id} is on its way.") + end + + def notify_order_confirmed(user, order) do + deliver(user.email, "Order confirmed", "We received your order #{order.id}.") + end + + def notify_payment_received(user, amount) do + deliver(user.email, "Payment received", "We received your payment of #{amount}.") + end + + def notify_payment_failed(user, amount) do + deliver(user.email, "Payment failed", "Your payment of #{amount} could not be processed.") + end + + def send_urgent_reminder(user, event) do + body = "Don't forget: #{event.title} at #{event.time}" + deliver_with_priority(user.email, "[URGENT] Reminder", body, :high) + end + + def send_reminder(user, event) do + body = "Don't forget: #{event.title} at #{event.time}" + 
deliver_with_priority(user.email, "Reminder", body, :normal) + end + + def send_detailed_admin_alert(admin, message) do + body = "#{message}\n\nDetails: #{inspect(message)}" + deliver(admin.email, "Admin Alert", body) + end + + def send_admin_alert(admin, message) do + deliver(admin.email, "Admin Alert", message) + end + + defp deliver(to, subject, body) do + {:ok, %{to: to, subject: subject, body: body}} + end + + defp deliver_with_priority(to, subject, body, priority) do + {:ok, %{to: to, subject: subject, body: body, priority: priority}} + end +end diff --git a/priv/combined_metrics/samples/function_design/no_default_arguments_on_virtual_functions/bad/Widget.cpp b/priv/combined_metrics/samples/function_design/no_default_arguments_on_virtual_functions/bad/Widget.cpp new file mode 100644 index 0000000..55ab0ad --- /dev/null +++ b/priv/combined_metrics/samples/function_design/no_default_arguments_on_virtual_functions/bad/Widget.cpp @@ -0,0 +1,63 @@ +#include +#include + +// Virtual functions with default argument values — the default is resolved statically +// at the call site based on the static type, NOT the dynamic type. +// This means the base class defaults are used even when a derived class override is called. 
+ +class Widget { +public: + virtual ~Widget() = default; + + // Default argument on virtual function — resolved at compile time using static type + virtual void render(const RenderOptions& options = RenderOptions::defaults()) = 0; + + // Default on virtual — if Button overrides this with a different default, the + // base default is used when called via a Widget pointer or reference + virtual void resize(int width, int height = 100) = 0; + + virtual void highlight(const Color& color = Color::yellow()) = 0; + + virtual std::string describe() const = 0; +}; + +class Button : public Widget { +public: + explicit Button(std::string label) : label_(std::move(label)) {} + + // Override with a DIFFERENT default — this default is NEVER used when called + // through a Widget pointer/reference; the base class default applies instead + void render(const RenderOptions& options = RenderOptions::minimal()) override { + (void)options; + } + + void resize(int width, int height = 50) override { // different default — silently ignored via base ptr + width_ = width; + height_ = height; + } + + void highlight(const Color& color = Color::blue()) override { // different default — same problem + highlightColor_ = color; + } + + std::string describe() const override { + return "Button(" + label_ + ")"; + } + +private: + std::string label_; + int width_ = 0; + int height_ = 0; + Color highlightColor_; +}; + +void demonstrate() { + std::unique_ptr w = std::make_unique + + + + + +
    + + +
    +
    + + {selectedIndex + 1} / {behaviorKeys.length} + + {behavior?.split('.').slice(1).join('.')} +
    + + +
    +
    + + {behavior && data[behavior] && ( + setScalar(behavior, metric, val)} + /> + )} +
    +
    + + {yamlOpen && ( + setYamlOpen(false)} /> + )} + + ) +} diff --git a/tools/scalar_tuner/src/assets/hero.png b/tools/scalar_tuner/src/assets/hero.png new file mode 100644 index 0000000..cc51a3d Binary files /dev/null and b/tools/scalar_tuner/src/assets/hero.png differ diff --git a/tools/scalar_tuner/src/assets/react.svg b/tools/scalar_tuner/src/assets/react.svg new file mode 100644 index 0000000..6c87de9 --- /dev/null +++ b/tools/scalar_tuner/src/assets/react.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tools/scalar_tuner/src/assets/vite.svg b/tools/scalar_tuner/src/assets/vite.svg new file mode 100644 index 0000000..5101b67 --- /dev/null +++ b/tools/scalar_tuner/src/assets/vite.svg @@ -0,0 +1 @@ +Vite diff --git a/tools/scalar_tuner/src/components/BehaviorCard.jsx b/tools/scalar_tuner/src/components/BehaviorCard.jsx new file mode 100644 index 0000000..0219a09 --- /dev/null +++ b/tools/scalar_tuner/src/components/BehaviorCard.jsx @@ -0,0 +1,160 @@ +import { useState, useMemo } from 'react' +import MetricRow from './MetricRow' +import ScoreDisplay from './ScoreDisplay' +import { isIntegerMultiple } from '../App' + +function colValue(col, metricKey, vals, effectiveScalar) { + switch (col) { + case 'metric': return metricKey + case 'bad': return vals.bad + case 'good': return vals.good + case 'ratio': return vals.ratio + case 'logdiff': return vals.log_good - vals.log_bad + case 'contrib': return Math.exp(effectiveScalar * (vals.log_good - vals.log_bad)) + case 'scalar': return effectiveScalar + } +} + +function sortIcon(col, sortCol, sortDir) { + if (sortCol !== col) return + if (sortDir === 'asc') return + if (sortDir === 'desc') return +} + +function isInDeadzone(effectiveScalar, logDiff, deadzone) { + if (effectiveScalar === 0) return true + const contrib = Math.exp(effectiveScalar * logDiff) + return contrib >= deadzone && contrib <= 2 - deadzone +} + +export default function BehaviorCard({ behavior, metrics, scalars, score, onScalarChange }) 
{ + const [showAll, setShowAll] = useState(true) + const [sortCol, setSortCol] = useState(null) + const [sortDir, setSortDir] = useState(null) + const [scale, setScale] = useState(1.0) + const [deadzone, setDeadzone] = useState(1.0) + + // Default order: by abs(suggested_scalar), stable, never depends on scalars + const defaultEntries = useMemo( + () => [...Object.entries(metrics)].sort(([, a], [, b]) => + Math.abs(b.suggested_scalar ?? 0) - Math.abs(a.suggested_scalar ?? 0) + ), + [metrics] + ) + + // Virtual score: use scale + deadzone filter, never modifies actual scalars + const virtualScore = useMemo(() => { + const compute = side => Object.entries(scalars).reduce((acc, [k, scalar]) => { + const eff = scalar * scale + if (eff === 0) return acc + const bv = Math.max(metrics[k]?.bad ?? 1, 1e-300) + const gv = Math.max(metrics[k]?.good ?? 1, 1e-300) + const logDiff = Math.log(gv) - Math.log(bv) + if (isInDeadzone(eff, logDiff, deadzone)) return acc + return acc * Math.pow(side === 'bad' ? bv : gv, eff) + }, 1.0) + const bad = compute('bad') + const good = compute('good') + return { bad, good, ratio: bad > 0 ? good / bad : 0 } + }, [scalars, scale, deadzone, metrics]) + + function handleColClick(col) { + if (sortCol !== col) { setSortCol(col); setSortDir('asc') } + else if (sortDir === 'asc') setSortDir('desc') + else { setSortCol(null); setSortDir(null) } + } + + const sortedEntries = useMemo(() => { + const base = showAll ? defaultEntries : defaultEntries.filter(([k]) => (scalars[k] ?? 0) !== 0) + if (!sortCol || !sortDir) return base + return [...base].sort(([ka, va], [kb, vb]) => { + const a = colValue(sortCol, ka, va, (scalars[ka] ?? 0) * scale) + const b = colValue(sortCol, kb, vb, (scalars[kb] ?? 0) * scale) + if (typeof a === 'string') return sortDir === 'asc' ? a.localeCompare(b) : b.localeCompare(a) + return sortDir === 'asc' ? 
a - b : b - a + }) + }, [defaultEntries, sortCol, sortDir, showAll, scalars, scale]) + + const nonZeroCount = defaultEntries.filter(([k]) => (scalars[k] ?? 0) !== 0).length + const deadzoneCount = useMemo(() => + defaultEntries.filter(([k, v]) => { + const eff = (scalars[k] ?? 0) * scale + const logDiff = v.log_good - v.log_bad + return isInDeadzone(eff, logDiff, deadzone) + }).length, + [defaultEntries, scalars, scale, deadzone] + ) + + function th(col, label) { + return ( + handleColClick(col)}> + {label}{sortIcon(col, sortCol, sortDir)} + + ) + } + + return ( +
    +
    + + +
    + + + +
    +
    + + + + + {th('metric', 'metric')} + {th('bad', 'bad')} + {th('good', 'good')} + {th('ratio', 'ratio')} + {th('logdiff', 'log diff')} + {th('contrib', 'contrib')} + {th('scalar', 'scalar')} + + + + {sortedEntries.map(([metricKey, vals]) => { + const actualScalar = scalars[metricKey] ?? 0 + const effectiveScalar = actualScalar * scale + const logDiff = vals.log_good - vals.log_bad + return ( + onScalarChange(metricKey, v)} + /> + ) + })} + +
    + +
    + ) +} diff --git a/tools/scalar_tuner/src/components/Knob.jsx b/tools/scalar_tuner/src/components/Knob.jsx new file mode 100644 index 0000000..874ca2f --- /dev/null +++ b/tools/scalar_tuner/src/components/Knob.jsx @@ -0,0 +1,68 @@ +import { useEffect, useRef, useState } from 'react' + +const MIN_ANGLE = -135 +const MAX_ANGLE = 135 + +function angleToFactor(angle) { + // center (0°) = ×1.0, full CW (+135°) = ×3.0, full CCW (-135°) = ×0.0 + if (angle >= 0) return 1 + (angle / MAX_ANGLE) * 2 + else return 1 + (angle / Math.abs(MIN_ANGLE)) * 1 +} + +export default function Knob({ onFactor }) { + const [angle, setAngle] = useState(0) + const [dragging, setDragging] = useState(false) + const startRef = useRef(null) + const angleRef = useRef(0) + + function onMouseDown(e) { + e.preventDefault() + startRef.current = { y: e.clientY, startAngle: angleRef.current } + setDragging(true) + } + + useEffect(() => { + if (!dragging) return + + function onMouseMove(e) { + const dy = startRef.current.y - e.clientY + const newAngle = Math.max(MIN_ANGLE, Math.min(MAX_ANGLE, startRef.current.startAngle + dy)) + angleRef.current = newAngle + setAngle(newAngle) + onFactor(angleToFactor(newAngle), false) + } + + function onMouseUp() { + onFactor(angleToFactor(angleRef.current), true) + angleRef.current = 0 + setAngle(0) + setDragging(false) + } + + window.addEventListener('mousemove', onMouseMove) + window.addEventListener('mouseup', onMouseUp) + return () => { + window.removeEventListener('mousemove', onMouseMove) + window.removeEventListener('mouseup', onMouseUp) + } + }, [dragging]) + + const factor = angleToFactor(angle) + + return ( +
    +
    + 0 +
    +
    +
    + +
    + {dragging ? `×${factor.toFixed(2)}` : 'scale all'} +
    + ) +} diff --git a/tools/scalar_tuner/src/components/MetricRow.jsx b/tools/scalar_tuner/src/components/MetricRow.jsx new file mode 100644 index 0000000..c3339e3 --- /dev/null +++ b/tools/scalar_tuner/src/components/MetricRow.jsx @@ -0,0 +1,59 @@ +function fmtContrib(v) { + if (v === 1) return '1.000' + if (v > 999) return '>999' + if (v < 0.001) return v.toExponential(1) + return v.toFixed(3) +} + +export default function MetricRow({ metricKey, vals, scalar, effectiveScalar, isDeadzoned, isDamped, onChange }) { + const logDiff = (vals.log_good - vals.log_bad).toFixed(3) + const ratio = vals.ratio.toFixed(3) + + const contribution = effectiveScalar !== 0 + ? Math.exp(effectiveScalar * (vals.log_good - vals.log_bad)) + : 1 + const contribPositive = contribution > 1 + const contribNeutral = isDeadzoned || Math.abs(contribution - 1) < 0.001 + + return ( + + + {metricKey} + {isDamped && ( + ~int + )} + + {vals.bad.toFixed(4)} + {vals.good.toFixed(4)} + {ratio}x + 0 ? 'pos' : parseFloat(logDiff) < 0 ? 'neg' : ''}`}> + {logDiff} + + + {isDeadzoned ? 
'—' : `${fmtContrib(contribution)}x`} + + + onChange(parseFloat(e.target.value))} + /> + onChange(parseFloat(e.target.value) || 0)} + className="scalar-input" + /> + + + ) +} diff --git a/tools/scalar_tuner/src/components/ScoreDisplay.jsx b/tools/scalar_tuner/src/components/ScoreDisplay.jsx new file mode 100644 index 0000000..8d372ec --- /dev/null +++ b/tools/scalar_tuner/src/components/ScoreDisplay.jsx @@ -0,0 +1,52 @@ +function fmt(v) { + if (v === 0) return '0' + if (v >= 999999999) return '999999999' + if (v < 0.0001) return v.toExponential(2) + return v.toFixed(4) +} + +function fmtLog(v) { + if (v <= 0) return '—' + return Math.log(v).toFixed(3) +} + +function fmtRatio(v) { + if (v >= 999999999) return '999999999' + return v.toFixed(2) +} + +function fmtLog10Ratio(v) { + if (v <= 0) return '—' + return Math.log10(v).toFixed(2) +} + +export default function ScoreDisplay({ score }) { + if (!score) return null + const { bad, good, ratio } = score + + const cls = ratio >= 2.0 ? 'ratio-good' : ratio >= 1.0 ? 'ratio-weak' : 'ratio-bad' + const icon = ratio >= 2.0 ? '✓' : ratio >= 1.0 ? '~' : '✗' + const isUnderflow = bad < 1e-10 && good < 1e-10 + + return ( + + + + + + + + + + + + + + + + + + +
    bad{fmt(bad)}ln={fmtLog(bad)}
    good{fmt(good)}ln={fmtLog(good)}
    ratio{fmtRatio(ratio)}x {icon}log₁₀={fmtLog10Ratio(ratio)}{isUnderflow ? ' ⚠' : ''}
    + ) +} diff --git a/tools/scalar_tuner/src/components/YamlModal.jsx b/tools/scalar_tuner/src/components/YamlModal.jsx new file mode 100644 index 0000000..875d651 --- /dev/null +++ b/tools/scalar_tuner/src/components/YamlModal.jsx @@ -0,0 +1,64 @@ +import { useMemo } from 'react' + +function toYaml(scalars) { + const categories = {} + + for (const [behaviorKey, metrics] of Object.entries(scalars)) { + const dotIdx = behaviorKey.indexOf('.') + const category = behaviorKey.slice(0, dotIdx) + const behavior = behaviorKey.slice(dotIdx + 1) + + if (!categories[category]) categories[category] = {} + + const nonZero = Object.entries(metrics).filter(([, v]) => v !== 0) + if (nonZero.length === 0) continue + + const grouped = {} + for (const [metricKey, scalar] of nonZero) { + const [group, key] = metricKey.split('.') + if (!grouped[group]) grouped[group] = {} + grouped[group][key] = scalar + } + + categories[category][behavior] = grouped + } + + let out = '' + for (const [category, behaviors] of Object.entries(categories)) { + out += `# ${category}\n` + for (const [behavior, groups] of Object.entries(behaviors)) { + out += `${behavior}:\n` + for (const [group, keys] of Object.entries(groups)) { + out += ` ${group}:\n` + for (const [key, scalar] of Object.entries(keys)) { + out += ` ${key}: ${scalar.toFixed(4)}\n` + } + } + out += '\n' + } + } + return out.trim() +} + +export default function YamlModal({ scalars, onClose }) { + const yaml = useMemo(() => toYaml(scalars), [scalars]) + + function copy() { + navigator.clipboard.writeText(yaml) + } + + return ( +
    +
    e.stopPropagation()}> +
    +

    YAML Export

    +
    + + +
    +
    +
    {yaml}
    +
    +
    + ) +} diff --git a/tools/scalar_tuner/src/index.css b/tools/scalar_tuner/src/index.css new file mode 100644 index 0000000..4300cd0 --- /dev/null +++ b/tools/scalar_tuner/src/index.css @@ -0,0 +1,106 @@ +:root { + --text: #6b6375; + --text-h: #08060d; + --bg: #fff; + --border: #e5e4e7; + --code-bg: #f4f3ec; + --accent: #aa3bff; + --accent-bg: rgba(170, 59, 255, 0.1); + --accent-border: rgba(170, 59, 255, 0.5); + --social-bg: rgba(244, 243, 236, 0.5); + --shadow: + rgba(0, 0, 0, 0.1) 0 10px 15px -3px, rgba(0, 0, 0, 0.05) 0 4px 6px -2px; + + --sans: system-ui, 'Segoe UI', Roboto, sans-serif; + --heading: system-ui, 'Segoe UI', Roboto, sans-serif; + --mono: ui-monospace, Consolas, monospace; + + font: 18px/145% var(--sans); + letter-spacing: 0.18px; + color-scheme: light dark; + color: var(--text); + background: var(--bg); + font-synthesis: none; + text-rendering: optimizeLegibility; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + + @media (max-width: 1024px) { + font-size: 16px; + } +} + +@media (prefers-color-scheme: dark) { + :root { + --text: #9ca3af; + --text-h: #f3f4f6; + --bg: #16171d; + --border: #2e303a; + --code-bg: #1f2028; + --accent: #c084fc; + --accent-bg: rgba(192, 132, 252, 0.15); + --accent-border: rgba(192, 132, 252, 0.5); + --social-bg: rgba(47, 48, 58, 0.5); + --shadow: + rgba(0, 0, 0, 0.4) 0 10px 15px -3px, rgba(0, 0, 0, 0.25) 0 4px 6px -2px; + } + + #social .button-icon { + filter: invert(1) brightness(2); + } +} + +body { + margin: 0; +} + +#root { + width: 100%; + height: 100vh; + display: flex; + flex-direction: column; +} + +h1, +h2 { + font-family: var(--heading); + font-weight: 500; + color: var(--text-h); +} + +h1 { + font-size: 56px; + letter-spacing: -1.68px; + margin: 32px 0; + @media (max-width: 1024px) { + font-size: 36px; + margin: 20px 0; + } +} +h2 { + font-size: 24px; + line-height: 118%; + letter-spacing: -0.24px; + margin: 0 0 8px; + @media (max-width: 1024px) { + font-size: 20px; + } +} +p { + 
margin: 0; +} + +code, +.counter { + font-family: var(--mono); + display: inline-flex; + border-radius: 4px; + color: var(--text-h); +} + +code { + font-size: 15px; + line-height: 135%; + padding: 4px 8px; + background: var(--code-bg); +} diff --git a/tools/scalar_tuner/src/main.jsx b/tools/scalar_tuner/src/main.jsx new file mode 100644 index 0000000..b9a1a6d --- /dev/null +++ b/tools/scalar_tuner/src/main.jsx @@ -0,0 +1,10 @@ +import { StrictMode } from 'react' +import { createRoot } from 'react-dom/client' +import './index.css' +import App from './App.jsx' + +createRoot(document.getElementById('root')).render( + + + , +) diff --git a/tools/scalar_tuner/vite.config.js b/tools/scalar_tuner/vite.config.js new file mode 100644 index 0000000..8b0f57b --- /dev/null +++ b/tools/scalar_tuner/vite.config.js @@ -0,0 +1,7 @@ +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' + +// https://vite.dev/config/ +export default defineConfig({ + plugins: [react()], +})