From 200508e29330431541ea3af31c489778a8ab956b Mon Sep 17 00:00:00 2001
From: Abhishek Nath <anath1@linkedin.com>
Date: Fri, 22 May 2026 15:33:00 -0700
Subject: [PATCH] Optimizer: Batched orphan file deletion using bin packing

---
 .../BatchedOrphanFilesDeletionSparkApp.java   | 421 ++++++++++++++++++
 .../optimizer/OperationUpdateRequest.java     |  26 ++
 .../optimizer/OptimizerServiceClient.java     |  81 ++++
 .../openhouse/jobs/util/binpack/Bin.java      |  49 ++
 .../openhouse/jobs/util/binpack/BinItem.java  |  29 ++
 .../binpack/FirstFitDecreasingBinPacker.java  |  70 +++
 ...edOrphanFilesDeletionSparkAppArgsTest.java |  74 +++
 .../FirstFitDecreasingBinPackerTest.java      | 150 +++++++
 8 files changed, 900 insertions(+)
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java
 create mode 100644 apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java
 create mode 100644 apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java
 create mode 100644 apps/spark/src/test/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPackerTest.java
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
new file mode 100644
index 000000000..998dc9c2a
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
@@ -0,0 +1,421 @@
+package com.linkedin.openhouse.jobs.spark;
+
+import com.linkedin.openhouse.common.metrics.DefaultOtelConfig;
+import com.linkedin.openhouse.common.metrics.OtelEmitter;
+import com.linkedin.openhouse.jobs.exception.TableValidationException;
+import com.linkedin.openhouse.jobs.spark.optimizer.OperationUpdateRequest;
+import com.linkedin.openhouse.jobs.spark.optimizer.OptimizerServiceClient;
+import com.linkedin.openhouse.jobs.spark.state.StateManager;
+import com.linkedin.openhouse.jobs.util.AppConstants;
+import com.linkedin.openhouse.jobs.util.AppsOtelEmitter;
+import com.linkedin.openhouse.jobs.util.TableStateValidator;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.common.Attributes;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.lang3.math.NumberUtils;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.actions.DeleteOrphanFiles;
+
+/**
+ * Batched orphan-files-deletion Spark app. One Spark job processes a list of {@code (table,
+ * operationId)} pairs that the optimizer scheduler bin-packed into a single batch. Each table is
+ * handled by a worker thread; per-table failures are caught and reported back independently — the
+ * job continues for the remaining tables and exits 0 if at least one table succeeds.
+ *
+ * <p>This is the multi-table counterpart of {@link OrphanFilesDeletionSparkApp}. The single-table
+ * app remains the deployment unit when bin size is 1, and stays the canonical reference for the
+ * actual deletion logic.
+ *
+ * <p>Example invocation:
+ *
+ * <pre>{@code
+ * com.linkedin.openhouse.jobs.spark.BatchedOrphanFilesDeletionSparkApp \
+ *   --tableNames db.t1,db.t2,db.t3 \
+ *   --operationIds op-uuid-1,op-uuid-2,op-uuid-3 \
+ *   --tableUuids tab-uuid-1,tab-uuid-2,tab-uuid-3 \
+ *   --resultsEndpoint http://optimizer.svc:8080 \
+ *   --driverParallelism 4
+ * }</pre>
+ */
+@Slf4j
+public class BatchedOrphanFilesDeletionSparkApp extends BaseSparkApp {
+
+  private static final String OPERATION_TYPE = "ORPHAN_FILES_DELETION";
+  private static final String STATUS_SUCCESS = "SUCCESS";
+  private static final String STATUS_FAILED = "FAILED";
+  private static final int DEFAULT_MAX_ORPHAN_FILE_SAMPLE_SIZE = 20000;
+  private static final int DEFAULT_MIN_OFD_TTL_IN_DAYS = 3;
+
+  private final List<BatchEntry> entries;
+  private final String resultsEndpoint;
+  private final int driverParallelism;
+  private final long ttlSeconds;
+  private final String backupDir;
+  private final int concurrentDeletes;
+  private final boolean streamResults;
+  private final int maxOrphanFileSampleSize;
+
+  public BatchedOrphanFilesDeletionSparkApp(
+      String jobId,
+      StateManager stateManager,
+      OtelEmitter otelEmitter,
+      List<BatchEntry> entries,
+      String resultsEndpoint,
+      int driverParallelism,
+      long ttlSeconds,
+      String backupDir,
+      int concurrentDeletes,
+      boolean streamResults,
+      int maxOrphanFileSampleSize) {
+    super(jobId, stateManager, otelEmitter);
+    this.entries = entries;
+    this.resultsEndpoint = resultsEndpoint;
+    this.driverParallelism = Math.max(1, driverParallelism);
+    this.ttlSeconds = ttlSeconds;
+    this.backupDir = backupDir;
+    this.concurrentDeletes = concurrentDeletes;
+    this.streamResults = streamResults;
+    this.maxOrphanFileSampleSize = maxOrphanFileSampleSize;
+  }
+
+  @Override
+  protected void runInner(Operations ops) {
+    log.info(
+        "Batched OFD start: entries={} driverParallelism={} resultsEndpoint={}",
+        entries.size(),
+        driverParallelism,
+        resultsEndpoint);
+
+    if (entries.isEmpty()) {
+      log.warn("Batched OFD invoked with no entries; nothing to do");
+      return;
+    }
+
+    int successCount;
+    try (OptimizerServiceClient client = newOptimizerClient()) {
+      successCount = runBatch(ops, client);
+    }
+
+    int failureCount = entries.size() - successCount;
+    log.info(
+        "Batched OFD finished: total={} success={} failed={}",
+        entries.size(),
+        successCount,
+        failureCount);
+
+    if (successCount == 0) {
+      throw new RuntimeException(
+          String.format("All %d operations in batch failed", entries.size()));
+    }
+  }
+
+  private int runBatch(Operations ops, OptimizerServiceClient client) {
+    ExecutorService pool = Executors.newFixedThreadPool(driverParallelism);
+    try {
+      List<Future<Boolean>> futures = new ArrayList<>(entries.size());
+      for (BatchEntry entry : entries) {
+        futures.add(pool.submit(new TableWorker(ops, entry, client)));
+      }
+      int successes = 0;
+      for (int i = 0; i < futures.size(); i++) {
+        try {
+          if (Boolean.TRUE.equals(futures.get(i).get())) {
+            successes++;
+          }
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          log.error("Interrupted while waiting on table {}", entries.get(i).getFqtn(), e);
+        } catch (ExecutionException e) {
+          // Per-table workers swallow their own failures and report FAILED upstream; an
+          // ExecutionException here means the worker itself threw, which we treat as a failed
+          // operation but otherwise let the batch continue.
+          log.error("Worker threw for table {}", entries.get(i).getFqtn(), e.getCause());
+        }
+      }
+      return successes;
+    } finally {
+      pool.shutdown();
+      try {
+        if (!pool.awaitTermination(30, TimeUnit.SECONDS)) {
+          pool.shutdownNow();
+        }
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+        pool.shutdownNow();
+      }
+    }
+  }
+
+  protected OptimizerServiceClient newOptimizerClient() {
+    return new OptimizerServiceClient(resultsEndpoint);
+  }
+
+  /** One unit of work in a batched OFD job. */
+  private final class TableWorker implements Callable<Boolean> {
+    private final Operations ops;
+    private final BatchEntry entry;
+    private final OptimizerServiceClient client;
+
+    TableWorker(Operations ops, BatchEntry entry, OptimizerServiceClient client) {
+      this.ops = ops;
+      this.entry = entry;
+      this.client = client;
+    }
+
+    @Override
+    public Boolean call() {
+      String fqtn = entry.getFqtn();
+      boolean success = false;
+      try {
+        log.info("OFD start: fqtn={} operationId={}", fqtn, entry.getOperationId());
+        Table table = ops.getTable(fqtn);
+        long effectiveTtlSeconds = resolveTtlSeconds(table);
+        long olderThanTimestampMillis =
+            System.currentTimeMillis() - TimeUnit.SECONDS.toMillis(effectiveTtlSeconds);
+        boolean backupEnabled =
+            Boolean.parseBoolean(
+                table.properties().getOrDefault(AppConstants.BACKUP_ENABLED_KEY, "false"));
+        DeleteOrphanFiles.Result result =
+            ops.deleteOrphanFiles(
+                table,
+                olderThanTimestampMillis,
+                backupEnabled,
+                backupDir,
+                concurrentDeletes,
+                streamResults,
+                maxOrphanFileSampleSize);
+        int orphanCount = countOrphans(result);
+        otelEmitter.count(
+            METRICS_SCOPE,
+            AppConstants.ORPHAN_FILE_COUNT,
+            orphanCount,
+            Attributes.of(AttributeKey.stringKey(AppConstants.TABLE_NAME), fqtn));
+        validate(fqtn);
+        success = true;
+        log.info("OFD success: fqtn={} orphansDetected={}", fqtn, orphanCount);
+      } catch (TableValidationException e) {
+        log.error("Post-job validation failed for fqtn={}", fqtn, e);
+      } catch (Throwable t) {
+        log.error("OFD failed: fqtn={} operationId={}", fqtn, entry.getOperationId(), t);
+      } finally {
+        reportResult(success);
+      }
+      return success;
+    }
+
+    private void validate(String fqtn) {
+      try {
+        TableStateValidator.run(ops.spark(), fqtn);
+      } catch (TableValidationException e) {
+        otelEmitter.count(
+            METRICS_SCOPE,
+            "post_run_validation_error",
+            1,
+            Attributes.of(
+                AttributeKey.stringKey(AppConstants.TABLE_NAME),
+                fqtn,
+                AttributeKey.stringKey(AppConstants.JOB_NAME),
+                BatchedOrphanFilesDeletionSparkApp.class.getSimpleName()));
+        throw e;
+      }
+    }
+
+    /** Reports the outcome; called from a finally block, so failures are logged, never thrown. */
+    private void reportResult(boolean success) {
+      try {
+        client.updateOperation(
+            OperationUpdateRequest.builder()
+                .operationId(entry.getOperationId())
+                .status(success ? STATUS_SUCCESS : STATUS_FAILED)
+                .tableUuid(entry.getTableUuid())
+                .databaseName(entry.getDatabaseName())
+                .tableName(entry.getTableName())
+                .operationType(OPERATION_TYPE)
+                .build());
+      } catch (IOException | RuntimeException e) {
+        log.error(
+            "Failed to report operation result; row will stay SCHEDULED until stale-timeout: operationId={} fqtn={}",
+            entry.getOperationId(),
+            entry.getFqtn(),
+            e);
+      }
+    }
+
+    /**
+     * Resolves the orphan-file TTL, in seconds, for one table: the job-level TTL, replaced by one
+     * day when the table opts in via its one-day-TTL property. For replica tables the result is
+     * raised to the minimum when it spans fewer whole days than that minimum.
+     */
+    private long resolveTtlSeconds(Table table) {
+      String oneDayFlag =
+          table.properties().getOrDefault(AppConstants.OFD_ONE_DAY_TTL_ENABLED_KEY, "false");
+      long ttl = Boolean.parseBoolean(oneDayFlag) ? TimeUnit.DAYS.toSeconds(1) : ttlSeconds;
+      String type =
+          table
+              .properties()
+              .getOrDefault(AppConstants.OPENHOUSE_TABLE_TYPE_KEY, AppConstants.TABLE_TYPE_PRIMARY);
+      if (!AppConstants.TABLE_TYPE_REPLICA.equals(type)) {
+        return ttl;
+      }
+      // Duration#toDays truncates, so a TTL of e.g. 2.9 days is still below a 3-day minimum.
+      boolean belowMinimum = Duration.ofSeconds(ttl).toDays() < DEFAULT_MIN_OFD_TTL_IN_DAYS;
+      return belowMinimum ? TimeUnit.DAYS.toSeconds(DEFAULT_MIN_OFD_TTL_IN_DAYS) : ttl;
+    }
+
+    private int countOrphans(DeleteOrphanFiles.Result result) {
+      int count = 0;
+      for (String unused : result.orphanFileLocations()) {
+        count++;
+      }
+      return count;
+    }
+  }
+
+  /** Per-table inputs for one operation row inside a bin. */
+  @lombok.AllArgsConstructor
+  @lombok.Builder
+  @lombok.Getter
+  @lombok.ToString
+  public static class BatchEntry {
+    private final String fqtn;
+    private final String operationId;
+    private final String tableUuid;
+    private final String databaseName;
+    private final String tableName;
+  }
+
+  public static void main(String[] args) {
+    OtelEmitter otelEmitter =
+        new AppsOtelEmitter(Collections.singletonList(DefaultOtelConfig.getOpenTelemetry()));
+    createApp(args, otelEmitter).run();
+  }
+
+  public static BatchedOrphanFilesDeletionSparkApp createApp(
+      String[] args, OtelEmitter otelEmitter) {
+    List<Option> extraOptions = new ArrayList<>();
+    extraOptions.add(
+        new Option(
+            null, "tableNames", true, "Comma-separated list of fully-qualified table names"));
+    extraOptions.add(
+        new Option(
+            null, "operationIds", true, "Comma-separated operation UUIDs, parallel to tableNames"));
+    extraOptions.add(
+        new Option(
+            null, "tableUuids", true, "Comma-separated table UUIDs, parallel to tableNames"));
+    extraOptions.add(
+        new Option(null, "resultsEndpoint", true, "Base URL of the Optimizer Service"));
+    extraOptions.add(
+        new Option(null, "driverParallelism", true, "Worker threads in this batch (default 1)"));
+    extraOptions.add(
+        new Option("tr", "trashDir", true, "Orphan files staging dir before deletion"));
+    extraOptions.add(
+        new Option(
+            "r",
+            "ttl",
+            true,
+            "How old files should be to be considered orphaned in seconds, minimum 1d is enforced"));
+    extraOptions.add(new Option("b", "backupDir", true, "Backup directory for deleted data"));
+    extraOptions.add(
+        new Option("c", "concurrentDeletes", true, "Number of concurrent deletes per table"));
+    extraOptions.add(
+        new Option(
+            null, "streamResults", false, "Stream orphan file deletions instead of collecting"));
+    extraOptions.add(
+        new Option(null, "maxOrphanFileSampleSize", true, "Max orphan file sample paths returned"));
+
+    CommandLine cmdLine = createCommandLine(args, extraOptions);
+
+    List<BatchEntry> entries =
+        buildEntries(
+            cmdLine.getOptionValue("tableNames"),
+            cmdLine.getOptionValue("operationIds"),
+            cmdLine.getOptionValue("tableUuids"));
+
+    return new BatchedOrphanFilesDeletionSparkApp(
+        getJobId(cmdLine),
+        createStateManager(cmdLine, otelEmitter),
+        otelEmitter,
+        entries,
+        requireOption(cmdLine, "resultsEndpoint"),
+        Integer.parseInt(cmdLine.getOptionValue("driverParallelism", "1")),
+        Math.max(
+            NumberUtils.toLong(cmdLine.getOptionValue("ttl"), TimeUnit.DAYS.toSeconds(7)),
+            TimeUnit.DAYS.toSeconds(1)),
+        cmdLine.getOptionValue("backupDir", ".backup"),
+        Integer.parseInt(cmdLine.getOptionValue("concurrentDeletes", "10")),
+        cmdLine.hasOption("streamResults"),
+        Integer.parseInt(
+            cmdLine.getOptionValue(
+                "maxOrphanFileSampleSize", String.valueOf(DEFAULT_MAX_ORPHAN_FILE_SAMPLE_SIZE))));
+  }
+
+  static List<BatchEntry> buildEntries(String tableNames, String operationIds, String tableUuids) {
+    if (tableNames == null
+        || operationIds == null
+        || tableUuids == null
+        || tableNames.isEmpty()
+        || operationIds.isEmpty()
+        || tableUuids.isEmpty()) {
+      throw new IllegalArgumentException(
+          "--tableNames, --operationIds, and --tableUuids are all required and must be non-empty");
+    }
+    String[] tables = tableNames.split(",");
+    String[] ops = operationIds.split(",");
+    String[] uuids = tableUuids.split(",");
+    if (tables.length != ops.length || tables.length != uuids.length) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Parallel-list length mismatch: tableNames=%d operationIds=%d tableUuids=%d",
+              tables.length, ops.length, uuids.length));
+    }
+    List<BatchEntry> entries = new ArrayList<>(tables.length);
+    for (int i = 0; i < tables.length; i++) {
+      String fqtn = tables[i].trim();
+      String[] dbAndTable = fqtn.split("\\.", 2);
+      if (dbAndTable.length != 2 || dbAndTable[0].isEmpty() || dbAndTable[1].isEmpty()) {
+        throw new IllegalArgumentException(
+            "tableNames entries must be fully-qualified (db.table): " + fqtn);
+      }
+      entries.add(
+          BatchEntry.builder()
+              .fqtn(fqtn)
+              .operationId(ops[i].trim())
+              .tableUuid(uuids[i].trim())
+              .databaseName(dbAndTable[0])
+              .tableName(dbAndTable[1])
+              .build());
+    }
+    return entries;
+  }
+
+  private static String requireOption(CommandLine cmdLine, String name) {
+    String value = cmdLine.getOptionValue(name);
+    if (value == null || value.isEmpty()) {
+      throw new IllegalArgumentException("--" + name + " is required");
+    }
+    return value;
+  }
+
+  /** Visible for tests. */
+  List<BatchEntry> getEntries() {
+    return Collections.unmodifiableList(entries);
+  }
+
+  /** Visible for tests. */
+  int getDriverParallelism() {
+    return driverParallelism;
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java
new file mode 100644
index 000000000..715873aaa
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java
@@ -0,0 +1,26 @@
+package com.linkedin.openhouse.jobs.spark.optimizer;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Wire-compatible body for {@code POST /v1/optimizer/operations/update} on the Optimizer Service.
+ *
+ * <p>Mirrors {@code com.linkedin.openhouse.optimizer.api.spec.UpdateOperationRequest} from the
+ * optimizer service module so this app can be built before that module merges. Keep the two in
+ * sync.
+ */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class OperationUpdateRequest {
+  private String operationId;
+  private String status;
+  private String tableUuid;
+  private String databaseName;
+  private String tableName;
+  private String operationType;
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java
new file mode 100644
index 000000000..ec220337f
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java
@@ -0,0 +1,81 @@
+package com.linkedin.openhouse.jobs.spark.optimizer;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
+import okhttp3.Response;
+
+/**
+ * Thin OkHttp client for the Optimizer Service. The batched Spark app calls {@link
+ * #updateOperation(OperationUpdateRequest)} once per finished operation to record SUCCESS or
+ * FAILED.
+ *
+ * <p>I/O errors and non-2xx replies surface as {@link IOException} (a malformed base URL as {@link
+ * IllegalArgumentException}); the caller decides whether to retry. Per the design, a missed update
+ * is recoverable — the row stays SCHEDULED and the Analyzer's stale-timeout re-queues it.
+ */
+@Slf4j
+public class OptimizerServiceClient implements AutoCloseable {
+
+  private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");
+  private static final String UPDATE_PATH = "/v1/optimizer/operations/update";
+
+  private final String baseUrl;
+  private final OkHttpClient httpClient;
+  private final ObjectMapper objectMapper;
+
+  public OptimizerServiceClient(String baseUrl) {
+    this(baseUrl, defaultClient(), new ObjectMapper());
+  }
+
+  OptimizerServiceClient(String baseUrl, OkHttpClient httpClient, ObjectMapper objectMapper) {
+    this.baseUrl = stripTrailingSlash(baseUrl);
+    this.httpClient = httpClient;
+    this.objectMapper = objectMapper;
+  }
+
+  public void updateOperation(OperationUpdateRequest body) throws IOException {
+    String url = baseUrl + UPDATE_PATH;
+    String json = objectMapper.writeValueAsString(body);
+    Request request = new Request.Builder().url(url).post(RequestBody.create(json, JSON)).build();
+    try (Response response = httpClient.newCall(request).execute()) {
+      if (!response.isSuccessful()) {
+        throw new IOException(
+            String.format(
+                "Optimizer Service update failed: url=%s status=%d operationId=%s",
+                url, response.code(), body.getOperationId()));
+      }
+      log.info(
+          "Reported operation update: operationId={} status={} httpStatus={}",
+          body.getOperationId(),
+          body.getStatus(),
+          response.code());
+    }
+  }
+
+  @Override
+  public void close() {
+    httpClient.dispatcher().executorService().shutdown();
+    httpClient.connectionPool().evictAll();
+  }
+
+  private static OkHttpClient defaultClient() {
+    return new OkHttpClient.Builder()
+        .connectTimeout(10, TimeUnit.SECONDS)
+        .readTimeout(30, TimeUnit.SECONDS)
+        .writeTimeout(30, TimeUnit.SECONDS)
+        .build();
+  }
+
+  private static String stripTrailingSlash(String url) {
+    if (url == null || url.isEmpty()) {
+      throw new IllegalArgumentException("Optimizer Service base URL must be non-empty");
+    }
+    return url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java
new file mode 100644
index 000000000..0b40b4958
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java
@@ -0,0 +1,49 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import lombok.Getter;
+import lombok.ToString;
+
+/**
+ * Mutable accumulator used by {@link FirstFitDecreasingBinPacker}. After packing completes the
+ * caller treats the returned bins as immutable; {@link #items()} returns an unmodifiable view.
+ */
+@ToString
+public class Bin {
+  private final List<BinItem> items = new ArrayList<>();
+  @Getter private long totalWeight;
+  @Getter private long totalSizeBytes;
+
+  /**
+   * Returns true iff adding {@code item} would keep this bin at or below all three caps. A cap of
+   * {@code <= 0} disables that dimension. Each check compares the item with the remaining headroom
+   * instead of a running sum, so non-negative totals cannot overflow {@code long} and wrap.
+   */
+  boolean fits(BinItem item, long maxWeight, long maxSizeBytes, int maxItems) {
+    if (maxItems > 0 && items.size() >= maxItems) {
+      return false;
+    }
+    if (maxWeight > 0 && item.getWeight() > maxWeight - totalWeight) {
+      return false;
+    }
+    return maxSizeBytes <= 0 || item.getSizeBytes() <= maxSizeBytes - totalSizeBytes;
+  }
+
+  /** Appends {@code item} and folds its weight and size into the running totals; no cap check. */
+  void add(BinItem item) {
+    items.add(item);
+    totalWeight += item.getWeight();
+    totalSizeBytes += item.getSizeBytes();
+  }
+
+  /** Returns an unmodifiable view of the items, in the order they were added. */
+  public List<BinItem> items() {
+    return Collections.unmodifiableList(items);
+  }
+
+  public int size() {
+    return items.size();
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java
new file mode 100644
index 000000000..68bcb16e2
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java
@@ -0,0 +1,29 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NonNull;
+import lombok.ToString;
+
+/**
+ * A single packable unit for {@link FirstFitDecreasingBinPacker}. Carries everything the batched
+ * Spark app needs both to do the work ({@link #fqtn}) and to report the result back to the
+ * Optimizer Service ({@link #operationId}, {@link #tableUuid}, {@link #databaseName}, {@link
+ * #tableName}).
+ *
+ * <p>{@link #weight} is the bin-packing dimension (for OFD: number of current files in the table).
+ * {@link #sizeBytes} is a secondary capacity dimension that lets the packer cap the total on-disk
+ * footprint of a bin independently of file count.
+ */
+@Getter
+@Builder
+@ToString
+public class BinItem {
+  @NonNull private final String fqtn;
+  @NonNull private final String operationId;
+  @NonNull private final String tableUuid;
+  @NonNull private final String databaseName;
+  @NonNull private final String tableName;
+  private final long weight;
+  private final long sizeBytes;
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java
new file mode 100644
index 000000000..71009d3ff
--- /dev/null
+++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java
@@ -0,0 +1,70 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.stream.Collectors;
+import lombok.Builder;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * First-fit-decreasing bin packer used by the optimizer scheduler to group table operations into
+ * batches before launching a single Spark job per batch.
+ *
+ * <p>Each bin has three independent caps:
+ *
+ * <ul>
+ *   <li>{@code maxWeightPerBin} — total {@link BinItem#getWeight()} (for OFD: number of files)
+ *   <li>{@code maxSizeBytesPerBin} — total on-disk size of all tables in the bin
+ *   <li>{@code maxItemsPerBin} — number of tables per bin
+ * </ul>
+ *
+ * <p>An item that exceeds any single cap on its own is placed into a bin by itself rather than
+ * dropped — we never silently skip maintenance work for an oversized table.
+ *
+ * <p>Pass {@code 0} or a negative value for any cap to disable that dimension.
+ */
+@Slf4j
+@Builder
+public class FirstFitDecreasingBinPacker {
+
+  @Builder.Default private final long maxWeightPerBin = 1_000_000L;
+  @Builder.Default private final long maxSizeBytesPerBin = 5L * 1024L * 1024L * 1024L * 1024L;
+  @Builder.Default private final int maxItemsPerBin = 50;
+
+  /**
+   * Groups {@code items} into bins first-fit after sorting them by descending weight; items of
+   * equal weight keep their input order. A {@code null} or empty input yields an empty list.
+   */
+  public List<Bin> pack(List<BinItem> items) {
+    List<Bin> bins = new ArrayList<>();
+    if (items == null || items.isEmpty()) {
+      return bins;
+    }
+    List<BinItem> byWeightDesc =
+        items.stream()
+            .sorted(Comparator.comparingLong(BinItem::getWeight).reversed())
+            .collect(Collectors.toList());
+    for (BinItem candidate : byWeightDesc) {
+      Bin home =
+          bins.stream()
+              .filter(b -> b.fits(candidate, maxWeightPerBin, maxSizeBytesPerBin, maxItemsPerBin))
+              .findFirst()
+              .orElse(null);
+      if (home == null) {
+        home = new Bin();
+        bins.add(home);
+        if (!home.fits(candidate, maxWeightPerBin, maxSizeBytesPerBin, maxItemsPerBin)) {
+          log.warn(
+              "Item exceeds per-bin caps on its own; placing in dedicated bin: fqtn={} weight={} sizeBytes={}",
+              candidate.getFqtn(),
+              candidate.getWeight(),
+              candidate.getSizeBytes());
+        }
+      }
+      home.add(candidate);
+    }
+    log.info("Packed {} items into {} bins", items.size(), bins.size());
+    return bins;
+  }
+}
diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java
new file mode 100644
index 000000000..7a32e503f
--- /dev/null
+++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java
@@ -0,0 +1,74 @@
+package com.linkedin.openhouse.jobs.spark;
+
+import java.util.List;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Pure-Java unit tests for {@link BatchedOrphanFilesDeletionSparkApp#buildEntries}. No Spark
+ * session, no HTTP — exercises the CLI-parsing edges that decide whether the app can even start.
+ */
+public class BatchedOrphanFilesDeletionSparkAppArgsTest {
+
+  @Test
+  public void buildEntriesParsesParallelLists() {
+    List<BatchedOrphanFilesDeletionSparkApp.BatchEntry> entries =
+        BatchedOrphanFilesDeletionSparkApp.buildEntries(
+            "db1.t1,db2.t2", "op-1,op-2", "uuid-1,uuid-2");
+
+    Assertions.assertEquals(2, entries.size());
+    Assertions.assertEquals("db1.t1", entries.get(0).getFqtn());
+    Assertions.assertEquals("db1", entries.get(0).getDatabaseName());
+    Assertions.assertEquals("t1", entries.get(0).getTableName());
+    Assertions.assertEquals("op-1", entries.get(0).getOperationId());
+    Assertions.assertEquals("uuid-1", entries.get(0).getTableUuid());
+    Assertions.assertEquals("db2.t2", entries.get(1).getFqtn());
+    Assertions.assertEquals("op-2", entries.get(1).getOperationId());
+  }
+
+  @Test
+  public void buildEntriesTrimsWhitespaceInEachEntry() {
+    List<BatchedOrphanFilesDeletionSparkApp.BatchEntry> entries =
+        BatchedOrphanFilesDeletionSparkApp.buildEntries(
+            " db1.t1 , db2.t2 ", " op-1 , op-2 ", " uuid-1 , uuid-2 ");
+
+    Assertions.assertEquals("db1.t1", entries.get(0).getFqtn());
+    Assertions.assertEquals("op-1", entries.get(0).getOperationId());
+    Assertions.assertEquals("uuid-1", entries.get(0).getTableUuid());
+  }
+
+  @Test
+  public void buildEntriesRejectsMismatchedLengths() {
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () ->
+            BatchedOrphanFilesDeletionSparkApp.buildEntries("db.a,db.b", "op-1", "uuid-1,uuid-2"));
+  }
+
+  @Test
+  public void buildEntriesRejectsNullArguments() {
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries(null, "op-1", "uuid-1"));
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries("db.a", null, "uuid-1"));
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries("db.a", "op-1", null));
+  }
+
+  @Test
+  public void buildEntriesRejectsEmptyStrings() {
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries("", "op-1", "uuid-1"));
+  }
+
+  @Test
+  public void buildEntriesRejectsNonFqtn() {
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries("just_a_table", "op-1", "uuid-1"));
+  }
+}
diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPackerTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPackerTest.java
new file mode 100644
index 000000000..d77944772
--- /dev/null
+++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPackerTest.java
@@ -0,0 +1,150 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/** Unit tests for {@link FirstFitDecreasingBinPacker}. */
+public class FirstFitDecreasingBinPackerTest {
+
+  @Test
+  public void emptyInputProducesEmptyOutput() {
+    Assertions.assertTrue(packer(100, 0, 50).pack(Collections.emptyList()).isEmpty());
+  }
+
+  @Test
+  public void nullInputProducesEmptyOutput() {
+    // pack(null) is expected to hand back an empty list rather than throw.
+    Assertions.assertTrue(packer(100, 0, 50).pack(null).isEmpty());
+  }
+
+  @Test
+  public void itemsSortDescendingByWeightBeforePacking() {
+    // Input is deliberately out of weight order: small, big, mid.
+    List<BinItem> unsorted =
+        Arrays.asList(item("db.t_small", 10), item("db.t_big", 100), item("db.t_mid", 50));
+
+    List<Bin> packed = packer(1000, 0, 50).pack(unsorted);
+
+    // The weight cap of 1000 admits all three, so one bin is expected, heaviest item first.
+    Assertions.assertEquals(1, packed.size());
+    Bin single = packed.get(0);
+    Assertions.assertEquals(3, single.size());
+    String[] heaviestFirst = {"db.t_big", "db.t_mid", "db.t_small"};
+    for (int i = 0; i < heaviestFirst.length; i++) {
+      Assertions.assertEquals(heaviestFirst[i], single.items().get(i).getFqtn());
+    }
+    Assertions.assertEquals(160, single.getTotalWeight());
+  }
+
+  @Test
+  public void weightCapForcesMultipleBins() {
+    // Four items totalling 180 against a weight cap of 100.
+    List<BinItem> fourItems =
+        Arrays.asList(item("db.a", 60), item("db.b", 50), item("db.c", 40), item("db.d", 30));
+
+    List<Bin> packed = packer(100, 0, 50).pack(fourItems);
+
+    // Expected first-fit walk over weights [60, 50, 40, 30] with a weight cap of 100:
+    //   60 -> opens bin0 (total 60)
+    //   50 -> 60 + 50 exceeds the cap, opens bin1 (total 50)
+    //   40 -> fits bin0 (total 100)
+    //   30 -> bin0 is full, fits bin1 (total 80)
+    Assertions.assertEquals(2, packed.size());
+    Assertions.assertEquals(100, packed.get(0).getTotalWeight());
+    Assertions.assertEquals(80, packed.get(1).getTotalWeight());
+  }
+
+  @Test
+  public void maxItemsPerBinCapHonored() {
+    // Five unit-weight items against a cap of two items per bin.
+    List<BinItem> fiveItems =
+        IntStream.range(0, 5).mapToObj(i -> item("db.t" + i, 1)).collect(Collectors.toList());
+
+    List<Bin> packed = packer(1000, 0, 2).pack(fiveItems);
+
+    Assertions.assertEquals(3, packed.size());
+    Assertions.assertEquals(2, packed.get(0).size());
+    Assertions.assertEquals(2, packed.get(1).size());
+    Assertions.assertEquals(1, packed.get(2).size());
+  }
+
+  @Test
+  public void maxSizeBytesCapHonored() {
+    // Weights (1 each) are far below the weight cap; only the 1000-byte cap can split these.
+    List<BinItem> bigTables = Arrays.asList(item("db.a", 1, 800L), item("db.b", 1, 800L));
+
+    List<Bin> packed = packer(1000, 1000L, 50).pack(bigTables);
+
+    Assertions.assertEquals(2, packed.size());
+    Assertions.assertEquals(800L, packed.get(0).getTotalSizeBytes());
+    Assertions.assertEquals(800L, packed.get(1).getTotalSizeBytes());
+  }
+
+  @Test
+  public void oversizedItemGetsItsOwnBinRatherThanBeingDropped() {
+    List<BinItem> items =
+        Arrays.asList(item("db.tiny1", 10), item("db.giant", 500), item("db.tiny2", 10));
+
+    List<Bin> packed = packer(100, 0, 50).pack(items);
+
+    // db.giant alone (500) is over the weight cap of 100, yet no weight may go missing.
+    long packedWeight = 0;
+    for (Bin bin : packed) {
+      packedWeight += bin.getTotalWeight();
+    }
+    Assertions.assertEquals(520, packedWeight);
+    // NOTE(review): presence only; that db.giant ends up alone in a bin is not asserted.
+    boolean giantPacked =
+        packed.stream()
+            .flatMap(bin -> bin.items().stream())
+            .anyMatch(packedItem -> packedItem.getFqtn().equals("db.giant"));
+    Assertions.assertTrue(giantPacked, "oversized item must not be dropped");
+  }
+
+  @Test
+  public void disabledCapsLetEverythingShareOneBin() {
+    // All three caps are passed as 0; the 20 items are then expected to share a single bin.
+    List<BinItem> twentyItems =
+        IntStream.range(0, 20).mapToObj(i -> item("db.t" + i, 100)).collect(Collectors.toList());
+
+    List<Bin> packed = packer(0, 0, 0).pack(twentyItems);
+
+    Assertions.assertEquals(1, packed.size());
+    Assertions.assertEquals(20, packed.get(0).size());
+  }
+
+  /** Builds a packer with the given per-bin weight, byte-size and item-count caps. */
+  private static FirstFitDecreasingBinPacker packer(long maxWeight, long maxSize, int maxItems) {
+    return FirstFitDecreasingBinPacker.builder()
+        .maxWeightPerBin(maxWeight)
+        .maxSizeBytesPerBin(maxSize)
+        .maxItemsPerBin(maxItems)
+        .build();
+  }
+
+  /** Same as the three-argument overload with {@code sizeBytes} fixed at 0. */
+  private static BinItem item(String fqtn, long weight) {
+    return item(fqtn, weight, 0L);
+  }
+
+  /** Builds an item whose operation id and uuid derive from the table part of {@code fqtn}. */
+  private static BinItem item(String fqtn, long weight, long sizeBytes) {
+    // Split on the first dot only: index 0 is the database, index 1 the table.
+    String[] dbAndTable = fqtn.split("\\.", 2);
+    String table = dbAndTable[1];
+    return BinItem.builder()
+        .fqtn(fqtn)
+        .operationId("op-" + table)
+        .tableUuid("uuid-" + table)
+        .databaseName(dbAndTable[0])
+        .tableName(table)
+        .weight(weight)
+        .sizeBytes(sizeBytes)
+        .build();
+  }
+}