linkedin · abhisheknath2011 · May 22, 2026 · mkuchenbecker · May 22, 2026 · mkuchenbecker
diff --git a/...k/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java b/...k/src/main/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkApp.java
diff --git a/...ark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java b/...ark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OperationUpdateRequest.java
@@ -0,0 +1,26 @@
+package com.linkedin.openhouse.jobs.spark.optimizer;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Wire-compatible body for {@code POST /v1/optimizer/operations/update} on the Optimizer Service.
+ *
+ * <p>Mirrors {@code com.linkedin.openhouse.optimizer.api.spec.UpdateOperationRequest} from the
+ * optimizer service module so this app can be built before that module merges; keep both in sync.
+ * Every field is a plain, unvalidated string; anything left unset stays {@code null}.
+ */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class OperationUpdateRequest {
+  private String operationId; // echoed in OptimizerServiceClient log and error messages
+  private String status; // client docs mention SUCCESS / FAILED; value set is not enforced here
+  private String tableUuid;
+  private String databaseName;
+  private String tableName;
+  private String operationType; // NOTE(review): allowed values are not visible here — confirm
+}
diff --git a/...ark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java b/...ark/src/main/java/com/linkedin/openhouse/jobs/spark/optimizer/OptimizerServiceClient.java
@@ -0,0 +1,125 @@
+package com.linkedin.openhouse.jobs.spark.optimizer;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.IOException;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
+import okhttp3.Response;
+
+/**
+ * Thin OkHttp client for the Optimizer Service. The batched Spark app calls {@link
+ * #updateOperation(OperationUpdateRequest)} once per finished operation to record SUCCESS or
+ * FAILED.
+ *
+ * <p>Errors are surfaced as {@link IOException}; the caller decides whether to retry. Per the
+ * design, a missed update is recoverable — the operation row stays SCHEDULED and the Analyzer's
+ * stale-timeout will re-queue it.
+ *
+ * <p>{@link #close()} shuts down the wrapped {@link OkHttpClient}'s dispatcher executor and evicts
+ * its connection pool, even when that client was supplied by the caller.
+ */
+@Slf4j
+public class OptimizerServiceClient implements AutoCloseable {
+
+  private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");
+  private static final String UPDATE_PATH = "/v1/optimizer/operations/update";
+
+  private final String baseUrl;
+  private final OkHttpClient httpClient;
+  private final ObjectMapper objectMapper;
+
+  /**
+   * Creates a client with the default timeouts (10s connect, 30s read, 30s write).
+   *
+   * @param baseUrl root URL of the Optimizer Service; surrounding whitespace and trailing slashes
+   *     are ignored
+   * @throws IllegalArgumentException if {@code baseUrl} is null or has no content once normalized
+   */
+  public OptimizerServiceClient(String baseUrl) {
+    // Arguments evaluate left to right, so a bad URL is rejected before an OkHttpClient is built.
+    this(normalizeBaseUrl(baseUrl), defaultClient(), new ObjectMapper());
+  }
+
+  /**
+   * Creates a client around caller-supplied collaborators.
+   *
+   * @param baseUrl root URL of the Optimizer Service, normalized as in the public constructor
+   * @param httpClient OkHttp client used for every call; shut down by {@link #close()}
+   * @param objectMapper Jackson mapper used to serialize request bodies
+   * @throws IllegalArgumentException if {@code baseUrl} is null or has no content once normalized
+   * @throws NullPointerException if {@code httpClient} or {@code objectMapper} is null
+   */
+  OptimizerServiceClient(String baseUrl, OkHttpClient httpClient, ObjectMapper objectMapper) {
+    this.baseUrl = normalizeBaseUrl(baseUrl);
+    this.httpClient = Objects.requireNonNull(httpClient, "httpClient");
+    this.objectMapper = Objects.requireNonNull(objectMapper, "objectMapper");
+  }
+
+  /**
+   * POSTs {@code body} as JSON to the operation-update endpoint.
+   *
+   * @param body update to report; must be non-null
+   * @throws NullPointerException if {@code body} is null (raised before any network I/O)
+   * @throws IOException if serialization or the HTTP exchange fails, or the response status is
+   *     not 2xx
+   */
+  public void updateOperation(OperationUpdateRequest body) throws IOException {
+    // Jackson serializes null as the literal "null"; without this guard that payload would be
+    // sent and the failure would only surface afterwards, when the operation id is read.
+    Objects.requireNonNull(body, "body");
+    String url = baseUrl + UPDATE_PATH;
+    String json = objectMapper.writeValueAsString(body);
+    Request request = new Request.Builder().url(url).post(RequestBody.create(json, JSON)).build();
+    try (Response response = httpClient.newCall(request).execute()) {
+      if (!response.isSuccessful()) {
+        throw new IOException(
+            String.format(
+                "Optimizer Service update failed: url=%s status=%d operationId=%s",
+                url, response.code(), body.getOperationId()));
+      }
+      log.info(
+          "Reported operation update: operationId={} status={} httpStatus={}",
+          body.getOperationId(),
+          body.getStatus(),
+          response.code());
+    }
+  }
+
+  /** Stops the HTTP client's dispatcher executor and drops its pooled connections. */
+  @Override
+  public void close() {
+    httpClient.dispatcher().executorService().shutdown();
+    httpClient.connectionPool().evictAll();
+  }
+
+  private static OkHttpClient defaultClient() {
+    return new OkHttpClient.Builder()
+        .connectTimeout(10, TimeUnit.SECONDS)
+        .readTimeout(30, TimeUnit.SECONDS)
+        .writeTimeout(30, TimeUnit.SECONDS)
+        .build();
+  }
+
+  /**
+   * Trims {@code url} and removes every trailing slash so it can be joined with {@link
+   * #UPDATE_PATH}, which starts with a slash.
+   *
+   * @throws IllegalArgumentException if nothing is left after normalization
+   */
+  private static String normalizeBaseUrl(String url) {
+    String trimmed = url == null ? "" : url.trim();
+    int end = trimmed.length();
+    while (end > 0 && trimmed.charAt(end - 1) == '/') {
+      end--;
+    }
+    if (end == 0) {
+      throw new IllegalArgumentException("Optimizer Service base URL must be non-empty");
+    }
+    return trimmed.substring(0, end);
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/Bin.java
@@ -0,0 +1,80 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import lombok.Getter;
+import lombok.ToString;
+
+/**
+ * Mutable accumulator used by {@link FirstFitDecreasingBinPacker}. After packing completes the
+ * caller treats the returned bins as immutable; {@link #items()} returns an unmodifiable view.
+ *
+ * <p>The running totals saturate at the {@code long} limits instead of wrapping around, so a bin
+ * whose contents overflow can never look as if it had spare capacity.
+ */
+@ToString
+public class Bin {
+  private final List<BinItem> items = new ArrayList<>();
+  @Getter private long totalWeight;
+  @Getter private long totalSizeBytes;
+
+  /**
+   * Returns true iff adding {@code item} would keep this bin at or below all three caps. A cap of
+   * {@code <= 0} disables that dimension.
+   *
+   * @param item candidate to test; must be non-null
+   * @param maxWeight cap on the summed item weights
+   * @param maxSizeBytes cap on the summed item sizes, in bytes
+   * @param maxItems cap on the number of items
+   * @return whether {@code item} can be added without breaking an enabled cap
+   */
+  boolean fits(BinItem item, long maxWeight, long maxSizeBytes, int maxItems) {
+    if (maxItems > 0 && items.size() >= maxItems) {
+      return false;
+    }
+    // Saturating sums: a plain `total + x` can wrap negative and wrongly pass the cap check.
+    if (maxWeight > 0 && saturatingAdd(totalWeight, item.getWeight()) > maxWeight) {
+      return false;
+    }
+    if (maxSizeBytes > 0 && saturatingAdd(totalSizeBytes, item.getSizeBytes()) > maxSizeBytes) {
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Appends {@code item} and folds its weight and size into the running totals. No cap is
+   * enforced here; the packer also calls this for items that exceed a cap on their own.
+   *
+   * @param item item to add; must be non-null
+   */
+  void add(BinItem item) {
+    // Read both dimensions before mutating so a null item cannot leave a half-updated bin.
+    long weight = item.getWeight();
+    long sizeBytes = item.getSizeBytes();
+    items.add(item);
+    totalWeight = saturatingAdd(totalWeight, weight);
+    totalSizeBytes = saturatingAdd(totalSizeBytes, sizeBytes);
+  }
+
+  /** Returns a read-only view of the items added so far, in insertion order. */
+  public List<BinItem> items() {
+    return Collections.unmodifiableList(items);
+  }
+
+  /** Returns the number of items in this bin. */
+  public int size() {
+    return items.size();
+  }
+
+  /** Adds two longs, clamping to {@link Long#MAX_VALUE} / {@link Long#MIN_VALUE} on overflow. */
+  private static long saturatingAdd(long a, long b) {
+    long sum = a + b;
+    // Overflow happened iff both operands have the same sign and the sum's sign differs.
+    if (((a ^ sum) & (b ^ sum)) < 0) {
+      return a < 0 ? Long.MIN_VALUE : Long.MAX_VALUE;
+    }
+    return sum;
+  }
+}
diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/util/binpack/BinItem.java
@@ -0,0 +1,29 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NonNull;
+import lombok.ToString;
+
+/**
+ * A single packable unit for {@link FirstFitDecreasingBinPacker}. Carries everything the batched
+ * Spark app needs both to do the work ({@link #fqtn}) and to report the result back to the
+ * Optimizer Service ({@link #operationId}, {@link #tableUuid}, {@link #databaseName}, {@link
+ * #tableName}).
+ *
+ * <p>{@link #weight} is the bin-packing dimension (for OFD: number of current files in the table).
+ * {@link #sizeBytes} is a secondary capacity dimension that lets the packer cap the total on-disk
+ * footprint of a bin independently of file count.
+ */
+@Getter
+@Builder
+@ToString
+public class BinItem {
+  @NonNull private final String fqtn; // fully-qualified table name; the builder rejects null
+  @NonNull private final String operationId;
+  @NonNull private final String tableUuid;
+  @NonNull private final String databaseName;
+  @NonNull private final String tableName;
+  private final long weight; // the packer's sort key (descending); not range-checked here
+  private final long sizeBytes; // checked only against the per-bin size cap; 0 if never set
+}
diff --git a/...k/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java b/...k/src/main/java/com/linkedin/openhouse/jobs/util/binpack/FirstFitDecreasingBinPacker.java
@@ -0,0 +1,82 @@
+package com.linkedin.openhouse.jobs.util.binpack;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.stream.Collectors;
+import lombok.Builder;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Groups table operations into batches with the first-fit-decreasing heuristic so that the
+ * optimizer scheduler can launch a single Spark job per batch.
+ *
+ * <p>Every bin is limited by three independent caps:
+ *
+ * <ul>
+ *   <li>{@code maxWeightPerBin} — sum of {@link BinItem#getWeight()} (for OFD: number of files)
+ *   <li>{@code maxSizeBytesPerBin} — combined on-disk size of the tables in the bin
+ *   <li>{@code maxItemsPerBin} — how many tables one bin may hold
+ * </ul>
+ *
+ * <p>A cap of {@code 0} or any negative value switches that dimension off.
+ *
+ * <p>Oversized work is never silently skipped: an item that breaks a cap all on its own is still
+ * packed, into a bin of its own.
+ */
+@Slf4j
+@Builder
+public class FirstFitDecreasingBinPacker {
+
+  @Builder.Default private final long maxWeightPerBin = 1_000_000L;
+  @Builder.Default private final long maxSizeBytesPerBin = 5L * 1024L * 1024L * 1024L * 1024L;
+  @Builder.Default private final int maxItemsPerBin = 50;
+
+  /**
+   * Packs {@code items} into bins, heaviest item first, each into the earliest bin with room.
+   *
+   * @param items items to pack; {@code null} or empty yields an empty result
+   * @return the bins in creation order, as a new mutable list
+   */
+  public List<Bin> pack(List<BinItem> items) {
+    List<Bin> packed = new ArrayList<>();
+    if (items == null || items.isEmpty()) {
+      return packed;
+    }
+
+    // Stable sort on weight only: items of equal weight keep their input order.
+    Comparator<BinItem> heaviestFirst = Comparator.comparingLong(BinItem::getWeight).reversed();
+    List<BinItem> ordered = items.stream().sorted(heaviestFirst).collect(Collectors.toList());
+
+    for (BinItem candidate : ordered) {
+      Bin home = firstBinWithRoom(packed, candidate);
+      if (home == null) {
+        home = new Bin();
+        packed.add(home);
+        // An item rejected even by an empty bin breaks a cap by itself; keep it, but say so.
+        boolean oversized =
+            !home.fits(candidate, maxWeightPerBin, maxSizeBytesPerBin, maxItemsPerBin);
+        if (oversized) {
+          log.warn(
+              "Item exceeds per-bin caps on its own; placing in dedicated bin: fqtn={} weight={} sizeBytes={}",
+              candidate.getFqtn(),
+              candidate.getWeight(),
+              candidate.getSizeBytes());
+        }
+      }
+      home.add(candidate);
+    }
+    log.info("Packed {} items into {} bins", items.size(), packed.size());
+    return packed;
+  }
+
+  /** Returns the first bin, in creation order, that can take {@code item}, or null if none. */
+  private Bin firstBinWithRoom(List<Bin> bins, BinItem item) {
+    for (Bin bin : bins) {
+      if (bin.fits(item, maxWeightPerBin, maxSizeBytesPerBin, maxItemsPerBin)) {
+        return bin;
+      }
+    }
+    return null;
+  }
+}
diff --git a/...st/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java b/...st/java/com/linkedin/openhouse/jobs/spark/BatchedOrphanFilesDeletionSparkAppArgsTest.java
@@ -0,0 +1,72 @@
+package com.linkedin.openhouse.jobs.spark;
+
+import java.util.List;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Spark-free, HTTP-free unit tests for {@link BatchedOrphanFilesDeletionSparkApp#buildEntries},
+ * covering the CLI-parsing edge cases that decide whether the app is able to start at all.
+ */
+public class BatchedOrphanFilesDeletionSparkAppArgsTest {
+
+  @Test
+  public void buildEntriesParsesParallelLists() {
+    List<BatchedOrphanFilesDeletionSparkApp.BatchEntry> parsed =
+        BatchedOrphanFilesDeletionSparkApp.buildEntries(
+            "db1.t1,db2.t2", "op-1,op-2", "uuid-1,uuid-2");
+    Assertions.assertEquals(2, parsed.size());
+
+    BatchedOrphanFilesDeletionSparkApp.BatchEntry first = parsed.get(0);
+    Assertions.assertEquals("db1.t1", first.getFqtn());
+    Assertions.assertEquals("db1", first.getDatabaseName());
+    Assertions.assertEquals("t1", first.getTableName());
+    Assertions.assertEquals("op-1", first.getOperationId());
+    Assertions.assertEquals("uuid-1", first.getTableUuid());
+
+    BatchedOrphanFilesDeletionSparkApp.BatchEntry second = parsed.get(1);
+    Assertions.assertEquals("db2.t2", second.getFqtn());
+    Assertions.assertEquals("op-2", second.getOperationId());
+  }
+
+  @Test
+  public void buildEntriesTrimsWhitespaceInEachEntry() {
+    List<BatchedOrphanFilesDeletionSparkApp.BatchEntry> parsed =
+        BatchedOrphanFilesDeletionSparkApp.buildEntries(
+            " db1.t1 , db2.t2 ", " op-1 , op-2 ", " uuid-1 , uuid-2 ");
+    BatchedOrphanFilesDeletionSparkApp.BatchEntry first = parsed.get(0);
+
+    Assertions.assertEquals("db1.t1", first.getFqtn());
+    Assertions.assertEquals("op-1", first.getOperationId());
+    Assertions.assertEquals("uuid-1", first.getTableUuid());
+  }
+
+  @Test
+  public void buildEntriesRejectsMismatchedLengths() {
+    assertRejected("db.a,db.b", "op-1", "uuid-1,uuid-2");
+  }
+
+  @Test
+  public void buildEntriesRejectsNullArguments() {
+    assertRejected(null, "op-1", "uuid-1");
+    assertRejected("db.a", null, "uuid-1");
+    assertRejected("db.a", "op-1", null);
+  }
+
+  @Test
+  public void buildEntriesRejectsEmptyStrings() {
+    assertRejected("", "op-1", "uuid-1");
+  }
+
+  @Test
+  public void buildEntriesRejectsNonFqtn() {
+    assertRejected("just_a_table", "op-1", "uuid-1");
+  }
+
+  /** Asserts that {@code buildEntries} throws {@link IllegalArgumentException} for the inputs. */
+  private static void assertRejected(String fqtns, String operationIds, String tableUuids) {
+    Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> BatchedOrphanFilesDeletionSparkApp.buildEntries(fqtns, operationIds, tableUuids));
+  }
+}