diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md
new file mode 100644
index 000000000..3620076c0
--- /dev/null
+++ b/.changeset/image-and-video-inputs.md
@@ -0,0 +1,22 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-openai': minor
+'@tanstack/ai-gemini': minor
+'@tanstack/ai-fal': minor
+'@tanstack/ai-grok': patch
+'@tanstack/ai-openrouter': patch
+'@tanstack/ai-event-client': patch
+---
+
+Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field.
+
+Provider behavior in this release:
+
+- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error.
+- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws.
+- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only).
+- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`.
+- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work).
+- **Anthropic** — Unchanged (no image generation API).
+
+Closes #618.
diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md
index 6c3c6e115..5ba27fe94 100644
--- a/docs/media/image-generation.md
+++ b/docs/media/image-generation.md
@@ -82,6 +82,9 @@ All image adapters support these common options:
 | `prompt` | `string` | Text description of the image to generate (required) |
 | `numberOfImages` | `number` | Number of images to generate |
 | `size` | `string` | Size of the generated image in WIDTHxHEIGHT format |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. |
 | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
 
 ### Size Options
@@ -132,6 +135,114 @@ const result = await generateImage({
 })
 ```
 
+## Image-Conditioned Generation
+
+`generateImage()` accepts an optional `imageInputs` field for image-to-image,
+reference-guided, multi-reference, and edit / inpaint flows. The field reuses
+the same `ImagePart` shape used elsewhere for multimodal content:
+
+```typescript
+import { generateImage, type ImagePart } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+const reference: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Turn this into a cinematic product photo',
+  imageInputs: [reference],
+})
+```
+
+### Source format
+
+`ImagePart.source` is a discriminated union supporting both URLs and inline
+base64 data — pass whichever you have:
+
+```typescript
+// URL source
+{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } }
+
+// Inline base64 data (mimeType required)
+{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } }
+```
+
+OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources
+and converts base64 to a `File` automatically.
+
+### Role hints via `metadata.role`
+
+When a generation has multiple inputs with different roles (mask vs reference
+vs start/end frame), set `metadata.role` on each part. Adapters route by role
+to the provider-specific field; parts without a role fall back to positional
+mapping.
+
+| Role            | Maps to                                                                                |
+| --------------- | -------------------------------------------------------------------------------------- |
+| `'reference'`   | fal `reference_image_urls`; Gemini multimodal part; positional fallback                |
+| `'character'`   | Same as `'reference'`; Veo `referenceImages` slot                                      |
+| `'mask'`        | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url`                                  |
+| `'control'`     | fal `control_image_url` (ControlNet / depth / pose conditioning)                       |
+| `'start_frame'` | fal `start_image_url`; Veo `image` (used by `generateVideo`)                           |
+| `'end_frame'`   | fal `end_image_url`; Veo `lastFrame` (used by `generateVideo`)                         |
+
+#### Inpaint / edit with a mask
+
+```typescript
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Replace the masked region with a tree',
+  imageInputs: [
+    {
+      type: 'image',
+      source: { type: 'url', value: photoUrl },
+    },
+    {
+      type: 'image',
+      source: { type: 'url', value: maskUrl },
+      metadata: { role: 'mask' },
+    },
+  ],
+})
+```
+
+#### Multi-reference composition
+
+```typescript
+const product: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/product.png' },
+}
+
+const style: ImagePart = {
+  type: 'image',
+  source: { type: 'url', value: 'https://example.com/style.png' },
+}
+
+await generateImage({
+  adapter: geminiImage('gemini-3.1-flash-image-preview'),
+  prompt: 'Generate a new image of the product using the style of the second reference',
+  imageInputs: [product, style],
+})
+```
+
+### Provider support
+
+| Provider     | Behavior                                                                                                  |
+| ------------ | --------------------------------------------------------------------------------------------------------- |
+| **OpenAI**   | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). |
+| **Gemini**   | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.<br>Imagen models → throws (text-to-image only). |
+| **fal.ai**   | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. |
+| **Grok**     | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up.                                                                                                          |
+| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up.                                                                                                              |
+| **Anthropic** | n/a — no image generation API.                                                                                                                                                                          |
+
+Adapters that don't support image-conditioned generation throw a clear
+runtime error so calls fail fast rather than silently dropping the inputs.
+
 ## Model Options
 
 ### OpenAI Model Options
diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md
index b42e88b6b..1872a39be 100644
--- a/docs/media/video-generation.md
+++ b/docs/media/video-generation.md
@@ -372,8 +372,76 @@ And returns:
 | `prompt` | `string` | Text description of the video to generate (required) |
 | `size` | `string` | Video resolution in WIDTHxHEIGHT format |
 | `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) |
+| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. |
+| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. |
+| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. |
 | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
 
+## Image-to-Video
+
+`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame,
+and reference-image conditioned video generation:
+
+```typescript
+import { generateVideo, type ImagePart } from '@tanstack/ai'
+import { openaiVideo } from '@tanstack/ai-openai'
+
+const startingFrame: ImagePart = {
+  type: 'image',
+  source: {
+    type: 'data',
+    value: base64Image,
+    mimeType: 'image/png',
+  },
+}
+
+const { jobId } = await generateVideo({
+  adapter: openaiVideo('sora-2'),
+  prompt: 'Animate this still into a slow cinematic push-in with subtle motion',
+  imageInputs: [startingFrame],
+})
+```
+
+### Role hints
+
+Each `ImagePart` can carry an optional `metadata.role` hint that the
+adapter uses to route the input to the provider-specific field:
+
+| Role            | Maps to                                                       |
+| --------------- | ------------------------------------------------------------- |
+| `'start_frame'` | fal `start_image_url` (inputs without a role go to `image_url` instead) |
+| `'end_frame'`   | fal `end_image_url` (Veo `lastFrame` when available)           |
+| `'reference'`   | fal `reference_image_urls` (Veo `referenceImages`)             |
+| `'character'`   | Same as `'reference'` — character consistency images           |
+
+```typescript
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+  adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+  prompt: 'Slow cinematic push-in then a hard cut',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+    {
+      type: 'image',
+      source: { type: 'url', value: lastFrameUrl },
+      metadata: { role: 'end_frame' },
+    },
+  ],
+})
+```
+
+### Provider support
+
+| Provider     | Image-to-Video Behavior                                                                                  |
+| ------------ | -------------------------------------------------------------------------------------------------------- |
+| **OpenAI**   | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. |
+| **fal.ai**   | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. |
+| **Gemini**   | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands.                         |
+
+Adapters whose underlying API can't accept image inputs throw a clear
+runtime error so calls fail fast.
+
 ### Supported Sizes
 
 Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create):
diff --git a/packages/typescript/ai-event-client/src/index.ts b/packages/typescript/ai-event-client/src/index.ts
index 17fa1d6e0..acfc953c4 100644
--- a/packages/typescript/ai-event-client/src/index.ts
+++ b/packages/typescript/ai-event-client/src/index.ts
@@ -428,6 +428,12 @@ export interface ImageRequestStartedEvent extends BaseEventContext {
   prompt: string
   numberOfImages?: number
   size?: string
+  /** Count of image conditioning inputs (image-to-image, mask, reference). */
+  imageInputCount?: number
+  /** Count of video conditioning inputs (video-to-video). */
+  videoInputCount?: number
+  /** Count of audio conditioning inputs (lipsync, voice reference). */
+  audioInputCount?: number
 }
 
 /** Emitted when an image request completes. */
diff --git a/packages/typescript/ai-fal/src/adapters/image.ts b/packages/typescript/ai-fal/src/adapters/image.ts
index 11dcbaeb2..7fbb14a08 100644
--- a/packages/typescript/ai-fal/src/adapters/image.ts
+++ b/packages/typescript/ai-fal/src/adapters/image.ts
@@ -2,6 +2,7 @@ import { fal } from '@fal-ai/client'
 import { BaseImageAdapter } from '@tanstack/ai/adapters'
 import { configureFalClient, generateId as utilGenerateId } from '../utils'
 import { mapSizeToFalFormat } from '../image/image-provider-options'
+import { mapImageInputsToFalFields } from '../image/image-inputs'
 import type { OutputType, Result } from '@fal-ai/client'
 import type { FalClientConfig } from '../utils'
 import type {
@@ -63,6 +64,17 @@ export class FalImageAdapter<TModel extends FalModel> extends BaseImageAdapter<
       model: this.model,
     })
 
+    if (options.videoInputs?.length) {
+      throw new Error(
+        `fal.generateImages does not support videoInputs on model ${this.model}.`,
+      )
+    }
+    if (options.audioInputs?.length) {
+      throw new Error(
+        `fal.generateImages does not support audioInputs on model ${this.model}.`,
+      )
+    }
+
     try {
       const input = this.buildInput(options)
       const result = await fal.subscribe(this.model, { input })
@@ -83,9 +95,14 @@ export class FalImageAdapter<TModel extends FalModel> extends BaseImageAdapter<
     >,
   ): FalModelInput<TModel> {
     const sizeParams = mapSizeToFalFormat(options.size)
+    // Order matters: derived image-input fields are spread first so that
+    // modelOptions (spread next) wins for mask_url / control_image_url /
+    // reference_image_urls; size, then prompt / num_images, override both.
+    const inputFields = mapImageInputsToFalFields(options.imageInputs)
     const input = {
+      ...inputFields,
       ...options.modelOptions,
       ...sizeParams,
       prompt: options.prompt,
       num_images: options.numberOfImages,
     } as FalModelInput<TModel>
diff --git a/packages/typescript/ai-fal/src/adapters/video.ts b/packages/typescript/ai-fal/src/adapters/video.ts
index 662b4f60f..6f3dbb1ef 100644
--- a/packages/typescript/ai-fal/src/adapters/video.ts
+++ b/packages/typescript/ai-fal/src/adapters/video.ts
@@ -2,6 +2,8 @@ import { fal } from '@fal-ai/client'
 import { BaseVideoAdapter } from '@tanstack/ai/adapters'
 import { configureFalClient, generateId as utilGenerateId } from '../utils'
 import { mapVideoSizeToFalFormat } from '../video/video-provider-options'
+import { mapImageInputsToFalVideoFields } from '../image/image-inputs'
+import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai'
 import type {
   VideoGenerationOptions,
   VideoJobResult,
@@ -16,6 +18,63 @@ import type {
 } from '../model-meta'
 import type { FalClientConfig } from '../utils'
 
+/**
+ * Translate `videoInputs` into fal request fields.
+ *
+ * Parts whose `metadata.role` is `'reference'` or `'character'` are collected
+ * into `reference_video_urls`. Every other part is a source clip: exactly one
+ * becomes `video_url`, two or more become `video_urls`.
+ *
+ * @param videoInputs - Video parts (URL or inline base64); may be omitted.
+ * @returns Fields to spread into the fal request; `{}` when there are none.
+ */
+function mapVideoInputsToFalFields(
+  videoInputs?: ReadonlyArray<VideoPart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  if (!videoInputs?.length) return {}
+  const referenceUrls: Array<string> = []
+  const sourceUrls: Array<string> = []
+  for (const input of videoInputs) {
+    const role = input.metadata?.role
+    const isReference = role === 'reference' || role === 'character'
+    const bucket = isReference ? referenceUrls : sourceUrls
+    bucket.push(videoPartToUrl(input))
+  }
+
+  const fields: Record<string, unknown> = {}
+  if (referenceUrls.length > 0) fields.reference_video_urls = referenceUrls
+  if (sourceUrls.length === 1) {
+    fields.video_url = sourceUrls[0]
+  } else if (sourceUrls.length > 1) {
+    fields.video_urls = sourceUrls
+  }
+  return fields
+}
+
+/** Map the single allowed audio input onto fal's `audio_url` field. */
+function mapAudioInputsToFalFields(
+  audioInputs?: ReadonlyArray<AudioPart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  if (!audioInputs?.length) return {}
+  if (audioInputs.length > 1) {
+    throw new Error(
+      `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`,
+    )
+  }
+  // URL sources pass through untouched; inline data becomes a base64 data URI.
+  const { source } = audioInputs[0]!
+  if (source.type === 'url') {
+    return { audio_url: source.value }
+  }
+  return { audio_url: `data:${source.mimeType};base64,${source.value}` }
+}
+
+/** URL sources pass through; inline data becomes a `data:` base64 URI. */
+function videoPartToUrl({ source }: VideoPart<MediaInputMetadata>): string {
+  if (source.type === 'url') return source.value
+  return `data:${source.mimeType};base64,${source.value}`
+}
+
 type FalQueueStatus = 'IN_QUEUE' | 'IN_PROGRESS' | 'COMPLETED'
 
 interface FalStatusResponse {
@@ -80,7 +139,16 @@ export class FalVideoAdapter<TModel extends FalModel> extends BaseVideoAdapter<
       FalModelVideoSize<TModel>
     >,
   ): Promise<VideoJobResult> {
-    const { prompt, size, duration, modelOptions, logger } = options
+    const {
+      prompt,
+      size,
+      duration,
+      modelOptions,
+      logger,
+      imageInputs,
+      videoInputs,
+      audioInputs,
+    } = options
 
     logger.request(`activity=generateVideo provider=fal model=${this.model}`, {
       provider: 'fal',
@@ -89,10 +157,16 @@ export class FalVideoAdapter<TModel extends FalModel> extends BaseVideoAdapter<
 
     try {
       const sizeParams = mapVideoSizeToFalFormat(size)
+      const inputImageFields = mapImageInputsToFalVideoFields(imageInputs)
+      const videoFields = mapVideoInputsToFalFields(videoInputs)
+      const audioFields = mapAudioInputsToFalFields(audioInputs)
 
       const input = {
+        ...inputImageFields, // derived fields first: modelOptions overrides
+        ...videoFields,
+        ...audioFields,
         ...modelOptions,
         ...sizeParams,
         prompt,
         ...(duration ? { duration } : {}),
       } as FalModelInput<TModel>
diff --git a/packages/typescript/ai-fal/src/image/image-inputs.ts b/packages/typescript/ai-fal/src/image/image-inputs.ts
new file mode 100644
index 000000000..0a5a06ca3
--- /dev/null
+++ b/packages/typescript/ai-fal/src/image/image-inputs.ts
@@ -0,0 +1,165 @@
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+/**
+ * Map TanStack `imageInputs` onto fal.ai image-endpoint request fields.
+ *
+ * fal endpoints disagree on field names for image-conditioned generation
+ * (~80% take a single `image_url`; the rest use `image_urls`,
+ * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). Lacking
+ * per-endpoint metadata, parts are routed by `metadata.role`:
+ *
+ * - `'mask'`                      → `mask_url`             (at most one)
+ * - `'control'`                   → `control_image_url`    (at most one)
+ * - `'reference'` / `'character'` → `reference_image_urls` (array)
+ * - anything else, including no role and the video-only frame roles:
+ *     - exactly 1 part            → `image_url`
+ *     - >1 parts                  → `image_urls`
+ *
+ * The calling adapter merges the result with `modelOptions` in one request
+ * object, so fields this heuristic cannot produce for an obscure endpoint
+ * can always be supplied through `modelOptions` directly.
+ *
+ * This mapping is interim and will be replaced by a per-endpoint mapping
+ * sourced from the `@fal-ai/schemas` library once it lands.
+ *
+ * @example
+ * mapImageInputsToFalFields([photo, maskWithRoleMask])
+ * // → { mask_url: <mask>, image_url: <photo> }
+ *
+ * @param imageInputs - Image parts (URL or inline base64); may be omitted.
+ * @returns Fields to spread into the fal request; `{}` when there are none.
+ * @throws Error when more than one `'mask'` or `'control'` part is given.
+ */
+export function mapImageInputsToFalFields(
+  imageInputs?: ReadonlyArray<ImagePart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  if (!imageInputs?.length) return {}
+
+  const maskUrls: Array<string> = []
+  const controlUrls: Array<string> = []
+  const referenceUrls: Array<string> = []
+  const sourceUrls: Array<string> = []
+
+  for (const input of imageInputs) {
+    const role = input.metadata?.role
+    let bucket = sourceUrls
+    if (role === 'mask') {
+      bucket = maskUrls
+    } else if (role === 'control') {
+      bucket = controlUrls
+    } else if (role === 'reference' || role === 'character') {
+      bucket = referenceUrls
+    }
+    // 'start_frame' / 'end_frame' carry no meaning for image generation, so
+    // they stay in the source bucket together with role-less parts.
+    bucket.push(imagePartToUrl(input))
+  }
+
+  // mask_url and control_image_url are single-valued, so reject duplicates.
+  if (maskUrls.length > 1) {
+    throw new Error(
+      `fal: only one input with metadata.role === 'mask' is supported per request (received ${maskUrls.length}).`,
+    )
+  }
+
+  if (controlUrls.length > 1) {
+    throw new Error(
+      `fal: only one input with metadata.role === 'control' is supported per request (received ${controlUrls.length}).`,
+    )
+  }
+
+  // Keys are added only when the matching bucket yielded a value.
+  const [maskUrl] = maskUrls
+  const [controlUrl] = controlUrls
+  const result: Record<string, unknown> = {}
+  if (maskUrl) result.mask_url = maskUrl
+  if (controlUrl) result.control_image_url = controlUrl
+  if (referenceUrls.length > 0) result.reference_image_urls = referenceUrls
+  if (sourceUrls.length === 1) {
+    result.image_url = sourceUrls[0]
+  } else if (sourceUrls.length > 1) {
+    result.image_urls = sourceUrls
+  }
+
+  return result
+}
+
+/**
+ * Map TanStack `imageInputs` onto fal.ai video-endpoint request fields.
+ *
+ * Video endpoints often expose a start frame as `image_url` (76% of i2v
+ * models) plus an optional `end_image_url`. Multi-reference video models
+ * (Kling O3, Seedance reference-to-video) use `reference_image_urls` or
+ * `image_urls`. Parts are routed by `metadata.role`:
+ *
+ * - `'start_frame'`               → `start_image_url`      (at most one)
+ * - `'end_frame'`                 → `end_image_url`        (at most one)
+ * - `'reference'` / `'character'` → `reference_image_urls` (array)
+ * - anything else, including no role, `'mask'` and `'control'`:
+ *     - exactly 1 part            → `image_url`
+ *     - >1 parts                  → `image_urls`
+ *
+ * Role-less images are never promoted to `start_image_url`.
+ *
+ * @param imageInputs - Image parts (URL or inline base64); may be omitted.
+ * @returns Fields to spread into the fal request; `{}` when there are none.
+ * @throws Error when `'start_frame'` or `'end_frame'` appears more than once.
+ */
+export function mapImageInputsToFalVideoFields(
+  imageInputs?: ReadonlyArray<ImagePart<MediaInputMetadata>>,
+): Record<string, unknown> {
+  if (!imageInputs?.length) return {}
+
+  const startUrls: Array<string> = []
+  const endUrls: Array<string> = []
+  const referenceUrls: Array<string> = []
+  const sourceUrls: Array<string> = []
+
+  for (const input of imageInputs) {
+    const role = input.metadata?.role
+    let bucket = sourceUrls
+    if (role === 'start_frame') {
+      bucket = startUrls
+    } else if (role === 'end_frame') {
+      bucket = endUrls
+    } else if (role === 'reference' || role === 'character') {
+      bucket = referenceUrls
+    }
+    bucket.push(imagePartToUrl(input))
+  }
+
+  // Only one start frame and one end frame can be sent per request.
+  if (startUrls.length > 1) {
+    throw new Error(
+      `fal: only one input with metadata.role === 'start_frame' is supported (received ${startUrls.length}).`,
+    )
+  }
+  if (endUrls.length > 1) {
+    throw new Error(
+      `fal: only one input with metadata.role === 'end_frame' is supported (received ${endUrls.length}).`,
+    )
+  }
+
+  const [startUrl] = startUrls
+  const [endUrl] = endUrls
+  const result: Record<string, unknown> = {}
+  if (startUrl) result.start_image_url = startUrl
+  if (endUrl) result.end_image_url = endUrl
+  if (referenceUrls.length > 0) result.reference_image_urls = referenceUrls
+  if (sourceUrls.length === 1) {
+    result.image_url = sourceUrls[0]
+  } else if (sourceUrls.length > 1) {
+    result.image_urls = sourceUrls
+  }
+  return result
+}
+
+/**
+ * Render an `ImagePart` as a string for fal's URL-typed input fields: URL
+ * sources verbatim, inline data as a `data:<mime>;base64,<value>` URI.
+ */
+function imagePartToUrl({ source }: ImagePart<MediaInputMetadata>): string {
+  return source.type === 'url'
+    ? source.value
+    : `data:${source.mimeType};base64,${source.value}`
+}
diff --git a/packages/typescript/ai-fal/tests/image-inputs.test.ts b/packages/typescript/ai-fal/tests/image-inputs.test.ts
new file mode 100644
index 000000000..0ed534080
--- /dev/null
+++ b/packages/typescript/ai-fal/tests/image-inputs.test.ts
@@ -0,0 +1,140 @@
+import { describe, expect, it } from 'vitest'
+import {
+  mapImageInputsToFalFields,
+  mapImageInputsToFalVideoFields,
+} from '../src/image/image-inputs'
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+/** Test helper: URL-sourced image part; `metadata` key only when provided. */
+function urlPart(
+  value: string,
+  metadata?: MediaInputMetadata,
+): ImagePart<MediaInputMetadata> {
+  const source = { type: 'url', value } as const
+  return metadata
+    ? { type: 'image', source, metadata }
+    : { type: 'image', source }
+}
+
+describe('mapImageInputsToFalFields', () => {
+  it('returns an empty object when imageInputs is missing or empty', () => {
+    expect(mapImageInputsToFalFields(undefined)).toEqual({})
+    expect(mapImageInputsToFalFields([])).toEqual({})
+  })
+
+  it('routes a single source to image_url', () => {
+    expect(
+      mapImageInputsToFalFields([urlPart('https://example.com/a.png')]),
+    ).toEqual({ image_url: 'https://example.com/a.png' })
+  })
+
+  it('routes multiple sources to image_urls', () => {
+    expect(
+      mapImageInputsToFalFields([
+        urlPart('https://example.com/a.png'),
+        urlPart('https://example.com/b.png'),
+      ]),
+    ).toEqual({
+      image_urls: ['https://example.com/a.png', 'https://example.com/b.png'],
+    })
+  })
+
+  it('routes role=mask to mask_url alongside the source image_url', () => {
+    expect(
+      mapImageInputsToFalFields([
+        urlPart('https://example.com/img.png'),
+        urlPart('https://example.com/mask.png', { role: 'mask' }),
+      ]),
+    ).toEqual({
+      image_url: 'https://example.com/img.png',
+      mask_url: 'https://example.com/mask.png',
+    })
+  })
+
+  it('routes role=reference to reference_image_urls', () => {
+    expect(
+      mapImageInputsToFalFields([
+        urlPart('https://example.com/product.png'),
+        urlPart('https://example.com/style.png', { role: 'reference' }),
+        urlPart('https://example.com/character.png', { role: 'character' }),
+      ]),
+    ).toEqual({
+      image_url: 'https://example.com/product.png',
+      reference_image_urls: [
+        'https://example.com/style.png',
+        'https://example.com/character.png',
+      ],
+    })
+  })
+
+  it('routes role=control to control_image_url', () => {
+    expect(
+      mapImageInputsToFalFields([
+        urlPart('https://example.com/img.png'),
+        urlPart('https://example.com/depth.png', { role: 'control' }),
+      ]),
+    ).toEqual({
+      image_url: 'https://example.com/img.png',
+      control_image_url: 'https://example.com/depth.png',
+    })
+  })
+
+  it('encodes data sources as data URIs', () => {
+    expect(
+      mapImageInputsToFalFields([
+        {
+          type: 'image',
+          source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' },
+        },
+      ]),
+    ).toEqual({ image_url: 'data:image/png;base64,aGVsbG8=' })
+  })
+
+  it('throws when more than one mask is provided', () => {
+    expect(() =>
+      mapImageInputsToFalFields([
+        urlPart('https://example.com/m1.png', { role: 'mask' }),
+        urlPart('https://example.com/m2.png', { role: 'mask' }),
+      ]),
+    ).toThrow(/only one input with metadata.role === 'mask'/)
+  })
+})
+
+describe('mapImageInputsToFalVideoFields', () => {
+  it('returns empty for missing/empty inputs', () => {
+    expect(mapImageInputsToFalVideoFields(undefined)).toEqual({})
+    expect(mapImageInputsToFalVideoFields([])).toEqual({})
+  })
+
+  it('routes a single positional source to image_url (start frame)', () => {
+    expect(
+      mapImageInputsToFalVideoFields([
+        urlPart('https://example.com/start.png'),
+      ]),
+    ).toEqual({ image_url: 'https://example.com/start.png' })
+  })
+
+  it('routes role=start_frame to start_image_url and role=end_frame to end_image_url', () => {
+    expect(
+      mapImageInputsToFalVideoFields([
+        urlPart('https://example.com/a.png', { role: 'start_frame' }),
+        urlPart('https://example.com/z.png', { role: 'end_frame' }),
+      ]),
+    ).toEqual({
+      start_image_url: 'https://example.com/a.png',
+      end_image_url: 'https://example.com/z.png',
+    })
+  })
+
+  it('routes role=reference to reference_image_urls', () => {
+    expect(
+      mapImageInputsToFalVideoFields([
+        urlPart('https://example.com/start.png'),
+        urlPart('https://example.com/character.png', { role: 'reference' }),
+      ]),
+    ).toEqual({
+      image_url: 'https://example.com/start.png',
+      reference_image_urls: ['https://example.com/character.png'],
+    })
+  })
+})
diff --git a/packages/typescript/ai-gemini/src/adapters/image.ts b/packages/typescript/ai-gemini/src/adapters/image.ts
index 612385e16..90063cfe3 100644
--- a/packages/typescript/ai-gemini/src/adapters/image.ts
+++ b/packages/typescript/ai-gemini/src/adapters/image.ts
@@ -1,4 +1,5 @@
 import { BaseImageAdapter } from '@tanstack/ai/adapters'
+import { arrayBufferToBase64 } from '@tanstack/ai-utils'
 import {
   createGeminiClient,
   generateId,
@@ -21,13 +22,17 @@ import type {
   GeneratedImage,
   ImageGenerationOptions,
   ImageGenerationResult,
+  ImagePart,
+  MediaInputMetadata,
 } from '@tanstack/ai'
 import type {
+  Content,
   GenerateContentConfig,
   GenerateContentResponse,
   GenerateImagesConfig,
   GenerateImagesResponse,
   GoogleGenAI,
+  Part,
 } from '@google/genai'
 import type { GeminiClientConfig } from '../utils'
 
@@ -94,10 +99,29 @@ export class GeminiImageAdapter<
     try {
       validatePrompt({ prompt, model })
 
+      if (options.videoInputs?.length) {
+        throw new Error(
+          `${this.name}.generateImages does not support videoInputs (model: ${model}).`,
+        )
+      }
+      if (options.audioInputs?.length) {
+        throw new Error(
+          `${this.name}.generateImages does not support audioInputs (model: ${model}).`,
+        )
+      }
+
       if (this.isGeminiImageModel(model)) {
         return await this.generateWithGeminiApi(options)
       }
 
+      // Imagen does not accept image inputs — it's strictly text-to-image.
+      if (options.imageInputs?.length) {
+        throw new Error(
+          `${this.name}: model "${model}" (Imagen) does not support imageInputs. ` +
+            `Use a Gemini-native image model (e.g. gemini-2.5-flash-image, "nano-banana") for image-conditioned generation.`,
+        )
+      }
+
       // Imagen models path (generateImages API)
       validateImageSize(model, options.size)
       validateNumberOfImages(model, options.numberOfImages)
@@ -127,7 +151,8 @@ export class GeminiImageAdapter<
   private async generateWithGeminiApi(
     options: ImageGenerationOptions<GeminiImageProviderOptions>,
   ): Promise<ImageGenerationResult> {
-    const { model, prompt, size, numberOfImages, modelOptions } = options
+    const { model, prompt, size, numberOfImages, modelOptions, imageInputs } =
+      options
 
     const parsedSize = size ? parseNativeImageSize(size) : undefined
 
@@ -169,15 +194,81 @@ export class GeminiImageAdapter<
       }),
     }
 
+    const contents = await this.buildContents(augmentedPrompt, imageInputs)
+
     const response = await this.client.models.generateContent({
       model,
-      contents: augmentedPrompt,
+      contents,
       config,
     })
 
     return this.transformGeminiResponse(model, response)
   }
 
+  /**
+   * Build the `contents` payload for `generateContent`. With no `imageInputs`
+   * the prompt string is returned unchanged; otherwise the result is a single
+   * `user` `Content` whose `parts` hold every image (inline or file data, in
+   * input order) followed by the text prompt — Gemini conventionally treats
+   * the trailing text as the instruction.
+   */
+  private async buildContents(
+    prompt: string,
+    imageInputs?: ReadonlyArray<ImagePart<MediaInputMetadata>>,
+  ): Promise<string | Array<Content>> {
+    if (!imageInputs || imageInputs.length === 0) {
+      return prompt
+    }
+    const imageParts: Array<Part> = await Promise.all(
+      imageInputs.map((part) => this.imagePartToGeminiPart(part)),
+    )
+    const parts: Array<Part> = [...imageParts, { text: prompt }]
+    return [{ role: 'user', parts }]
+  }
+
+  private async imagePartToGeminiPart(
+    part: ImagePart<MediaInputMetadata>,
+  ): Promise<Part> {
+    if (part.source.type === 'data') {
+      return {
+        inlineData: {
+          mimeType: part.source.mimeType || 'image/png',
+          data: part.source.value,
+        },
+      }
+    }
+    // URL sources: `gs://` URIs and `generativelanguage.googleapis.com` URLs
+    // pass through as `fileData`; any other URL is fetched and inlined as base64.
+    if (
+      part.source.value.startsWith('gs://') ||
+      /^https?:\/\/generativelanguage\.googleapis\.com\//.test(
+        part.source.value,
+      )
+    ) {
+      return {
+        fileData: {
+          fileUri: part.source.value,
+          ...(part.source.mimeType && { mimeType: part.source.mimeType }),
+        },
+      }
+    }
+    const response = await fetch(part.source.value)
+    if (!response.ok) {
+      throw new Error(
+        `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`,
+      )
+    }
+    const blob = await response.blob()
+    const buffer = await blob.arrayBuffer()
+    const base64 = arrayBufferToBase64(buffer)
+    return {
+      inlineData: {
+        mimeType: part.source.mimeType || blob.type || 'image/png',
+        data: base64,
+      },
+    }
+  }
+
   private transformGeminiResponse(
     model: string,
     response: GenerateContentResponse,
diff --git a/packages/typescript/ai-grok/src/adapters/image.ts b/packages/typescript/ai-grok/src/adapters/image.ts
index 35f8cd224..081ed610c 100644
--- a/packages/typescript/ai-grok/src/adapters/image.ts
+++ b/packages/typescript/ai-grok/src/adapters/image.ts
@@ -61,6 +61,18 @@ export class GrokImageAdapter<
   ): Promise<ImageGenerationResult> {
     const { model, prompt, numberOfImages, size, modelOptions } = options
 
+    if (
+      options.imageInputs?.length ||
+      options.videoInputs?.length ||
+      options.audioInputs?.length
+    ) {
+      throw new Error(
+        `grok.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` +
+          `Image-conditioned generation requires the xAI Imagine API, which the current adapter ` +
+          `does not target (it uses the OpenAI-compat endpoint). Track progress at https://github.com/TanStack/ai/issues/618.`,
+      )
+    }
+
     validatePrompt({ prompt, model })
     validateImageSize(model, size)
     validateNumberOfImages(model, numberOfImages)
diff --git a/packages/typescript/ai-openai/src/adapters/image.ts b/packages/typescript/ai-openai/src/adapters/image.ts
index 1380c7a26..334001832 100644
--- a/packages/typescript/ai-openai/src/adapters/image.ts
+++ b/packages/typescript/ai-openai/src/adapters/image.ts
@@ -3,6 +3,7 @@ import { BaseImageAdapter } from '@tanstack/ai/adapters'
 import { toRunErrorPayload } from '@tanstack/ai/adapter-internals'
 import { generateId } from '@tanstack/ai-utils'
 import { getOpenAIApiKeyFromEnv } from '../utils/client'
+import { imagePartToFile } from '../image/image-input-to-file'
 import {
   validateImageSize,
   validateNumberOfImages,
@@ -12,6 +13,8 @@ import type {
   GeneratedImage,
   ImageGenerationOptions,
   ImageGenerationResult,
+  ImagePart,
+  MediaInputMetadata,
 } from '@tanstack/ai'
 import type OpenAI_SDK from 'openai'
 import type { OpenAIImageModel } from '../model-meta'
@@ -22,6 +25,15 @@ import type {
 } from '../image/image-provider-options'
 import type { OpenAIClientConfig } from '../utils/client'
 
+// Max source images per `images.edit()` call (per OpenAI docs): dall-e-2 takes
+// 1; gpt-image-1 / gpt-image-1-mini up to 16; 0 = no edit support (dall-e-3).
+const EDIT_MAX_IMAGES: Record<OpenAIImageModel, number> = {
+  'dall-e-2': 1,
+  'gpt-image-1': 16,
+  'gpt-image-1-mini': 16,
+  'dall-e-3': 0,
+}
+
 /**
  * Configuration for OpenAI image adapter
  */
@@ -59,12 +71,44 @@ export class OpenAIImageAdapter<
   async generateImages(
     options: ImageGenerationOptions<OpenAIImageProviderOptions>,
   ): Promise<ImageGenerationResult> {
-    const { model, prompt, numberOfImages, size, modelOptions } = options
+    const {
+      model,
+      prompt,
+      numberOfImages,
+      size,
+      modelOptions,
+      imageInputs,
+      videoInputs,
+      audioInputs,
+    } = options
 
     validatePrompt({ prompt, model })
     validateImageSize(model, size)
     validateNumberOfImages(model, numberOfImages)
 
+    if (videoInputs?.length) {
+      throw new Error(
+        `${this.name}.generateImages does not support videoInputs (model: ${model}).`,
+      )
+    }
+    if (audioInputs?.length) {
+      throw new Error(
+        `${this.name}.generateImages does not support audioInputs (model: ${model}).`,
+      )
+    }
+
+    if (imageInputs && imageInputs.length > 0) {
+      return this.editImages({
+        model: model as OpenAIImageModel,
+        prompt,
+        numberOfImages,
+        size,
+        modelOptions,
+        imageInputs,
+        logger: options.logger,
+      })
+    }
+
     // With exactOptionalPropertyTypes, vendor SDK request shapes reject
     // `T | undefined` in optional fields. Build the request incrementally and
     // only set `size` when it's actually defined.
@@ -138,6 +182,129 @@ export class OpenAIImageAdapter<
       throw error
     }
   }
+
+  /**
+   * Image-conditioned generation via OpenAI's `images.edit()` endpoint. Source
+   * image limits come from `EDIT_MAX_IMAGES` (0 = unsupported). At most one
+   * part may have `metadata.role === 'mask'`; it is sent as the SDK's `mask`
+   * (PNG with alpha channel). Violations throw before any image is converted.
+   */
+  private async editImages(args: {
+    model: OpenAIImageModel
+    prompt: string
+    numberOfImages?: number
+    size?: string
+    modelOptions?: OpenAIImageProviderOptions
+    imageInputs: ReadonlyArray<ImagePart<MediaInputMetadata>>
+    logger: ImageGenerationOptions<OpenAIImageProviderOptions>['logger']
+  }): Promise<ImageGenerationResult> {
+    const { model, prompt, numberOfImages, size, modelOptions, logger } = args
+    const maxImages = EDIT_MAX_IMAGES[model]
+    if (maxImages === 0) {
+      throw new Error(
+        `${this.name}: model "${model}" does not support imageInputs. ` +
+          `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`,
+      )
+    }
+
+    const maskParts = args.imageInputs.filter(
+      (part) => part.metadata?.role === 'mask',
+    )
+    const sourceParts = args.imageInputs.filter(
+      (part) => part.metadata?.role !== 'mask',
+    )
+
+    if (maskParts.length > 1) {
+      throw new Error(
+        `${this.name}: only one input with metadata.role === 'mask' is supported per request.`,
+      )
+    }
+    if (sourceParts.length === 0) {
+      throw new Error(
+        `${this.name}: imageInputs contained only mask parts; at least one source image is required.`,
+      )
+    }
+    if (sourceParts.length > maxImages) {
+      throw new Error(
+        `${this.name}: model "${model}" accepts at most ${maxImages} source image(s); received ${sourceParts.length}.`,
+      )
+    }
+
+    const sourceFiles = await Promise.all(
+      sourceParts.map((part, i) => imagePartToFile(part, `source-${i}`)),
+    )
+    const maskFile = maskParts[0]
+      ? await imagePartToFile(maskParts[0], 'mask')
+      : undefined
+
+    // `modelOptions` is typed across all four image models (including dall-e-3's
+    // `quality: 'hd' | 'standard'` which isn't valid for edit). dall-e-3 has
+    // already been rejected above, so any remaining quality value is valid for
+    // the edit endpoint — cast the spread to clear the union mismatch.
+    const request: OpenAI_SDK.Images.ImageEditParamsNonStreaming = {
+      model,
+      prompt,
+      image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles,
+      n: numberOfImages ?? 1,
+      stream: false,
+      ...((modelOptions ??
+        {}) as Partial<OpenAI_SDK.Images.ImageEditParamsNonStreaming>),
+    }
+    if (size !== undefined) {
+      request.size = size as Exclude<
+        OpenAI_SDK.Images.ImageEditParamsNonStreaming['size'],
+        undefined
+      >
+    }
+    if (maskFile) {
+      request.mask = maskFile
+    }
+
+    try {
+      logger.request(
+        `activity=imageEdit provider=${this.name} model=${model} n=${request.n ?? 1} size=${request.size ?? 'default'} sources=${sourceFiles.length}${maskFile ? ' mask' : ''}`,
+        { provider: this.name, model },
+      )
+      const response = await this.client.images.edit(request)
+
+      const images: Array<GeneratedImage> = (response.data ?? []).flatMap(
+        (item): Array<GeneratedImage> => {
+          const revisedPromptField =
+            item.revised_prompt !== undefined
+              ? { revisedPrompt: item.revised_prompt }
+              : {}
+          if (item.b64_json) {
+            return [{ b64Json: item.b64_json, ...revisedPromptField }]
+          }
+          if (item.url) {
+            return [{ url: item.url, ...revisedPromptField }]
+          }
+          return []
+        },
+      )
+
+      return {
+        id: generateId(this.name),
+        model,
+        images,
+        ...(response.usage
+          ? {
+              usage: {
+                inputTokens: response.usage.input_tokens,
+                outputTokens: response.usage.output_tokens,
+                totalTokens: response.usage.total_tokens,
+              },
+            }
+          : {}),
+      }
+    } catch (error: unknown) {
+      logger.errors(`${this.name}.editImages fatal`, {
+        error: toRunErrorPayload(error, `${this.name}.editImages failed`),
+        source: `${this.name}.editImages`,
+      })
+      throw error
+    }
+  }
 }
 
 /**
diff --git a/packages/typescript/ai-openai/src/adapters/video.ts b/packages/typescript/ai-openai/src/adapters/video.ts
index 2bb9df046..8a1bc9c1b 100644
--- a/packages/typescript/ai-openai/src/adapters/video.ts
+++ b/packages/typescript/ai-openai/src/adapters/video.ts
@@ -3,6 +3,7 @@ import { BaseVideoAdapter } from '@tanstack/ai/adapters'
 import { toRunErrorPayload } from '@tanstack/ai/adapter-internals'
 import { arrayBufferToBase64 } from '@tanstack/ai-utils'
 import { getOpenAIApiKeyFromEnv } from '../utils/client'
+import { imagePartToFile } from '../image/image-input-to-file'
 import {
   toApiSeconds,
   validateVideoSeconds,
@@ -88,15 +89,38 @@ export class OpenAIVideoAdapter<
     options: VideoGenerationOptions<OpenAIVideoProviderOptions>,
   ): Promise<VideoJobResult> {
     const { model, size, duration, modelOptions } = options
+    const { imageInputs, videoInputs, audioInputs } = options
 
     validateVideoSize(model, size)
     const seconds = duration ?? modelOptions?.seconds
     validateVideoSeconds(model, seconds)
 
+    if (videoInputs?.length) {
+      throw new Error(
+        `${this.name}.createVideoJob does not support videoInputs (model: ${model}).`,
+      )
+    }
+    if (audioInputs?.length) {
+      throw new Error(
+        `${this.name}.createVideoJob does not support audioInputs (model: ${model}).`,
+      )
+    }
+    if (imageInputs && imageInputs.length > 1) {
+      throw new Error(
+        `${this.name}: Sora accepts at most one input_reference image; received ${imageInputs.length}.`,
+      )
+    }
+
     const request: OpenAI_SDK.Videos.VideoCreateParams = {
       model: model as VideoModel,
       prompt: options.prompt,
     }
+    if (imageInputs && imageInputs[0]) {
+      // Sora's `input_reference` is a single Uploadable; convert TanStack
+      // ImagePart (URL or base64) → File before handing it to the SDK.
+      const file = await imagePartToFile(imageInputs[0], 'input-reference')
+      ;(request as { input_reference?: unknown }).input_reference = file
+    }
     // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we
     // narrow before assignment instead of casting from a `T | undefined` source.
     if (size) {
diff --git a/packages/typescript/ai-openai/src/image/image-input-to-file.ts b/packages/typescript/ai-openai/src/image/image-input-to-file.ts
new file mode 100644
index 000000000..2074496fd
--- /dev/null
+++ b/packages/typescript/ai-openai/src/image/image-input-to-file.ts
@@ -0,0 +1,105 @@
+import { base64ToArrayBuffer } from '@tanstack/ai-utils'
+import type { ImagePart, MediaInputMetadata } from '@tanstack/ai'
+
+const DEFAULT_MIME = 'image/png'
+const MIME_TO_EXT: Record<string, string> = {
+  'image/png': 'png',
+  'image/jpeg': 'jpg',
+  'image/jpg': 'jpg',
+  'image/webp': 'webp',
+  'image/gif': 'gif',
+}
+
+/**
+ * Reduce a MIME type to its lowercase essence (`type/subtype`), dropping any
+ * parameters such as `; charset=binary` that a fetched response's
+ * `Content-Type` (and therefore `Blob.type`) may carry.
+ */
+function mimeEssence(mimeType: string): string {
+  return (mimeType.split(';')[0] ?? '').trim().toLowerCase()
+}
+
+/**
+ * Pick a filename extension for a MIME type. Known image types come from
+ * `MIME_TO_EXT`; anything else uses its subtype with a structured-syntax
+ * suffix (`+xml`) removed, and `png` when no usable subtype remains.
+ */
+function extForMime(mimeType: string): string {
+  const essence = mimeEssence(mimeType)
+  const known = MIME_TO_EXT[essence]
+  if (known) return known
+  const subtype = essence.split('/')[1]?.split('+')[0]
+  return subtype || 'png'
+}
+
+/** Throw a descriptive error when the global `File` constructor is missing. */
+function ensureFileSupport(): void {
+  if (typeof File === 'undefined') {
+    throw new Error(
+      '`File` is not available in this environment. ' +
+        'Image-conditioned generation requires Node 20+ or a browser context.',
+    )
+  }
+}
+
+/**
+ * Convert a TanStack `ImagePart` into an OpenAI-compatible `File`.
+ *
+ * - `source.type === 'data'`: decode base64 → ArrayBuffer → File.
+ * - `source.type === 'url'`: fetch the URL (or data: URI) → Blob → File.
+ *
+ * MIME type precedence: `source.mimeType`, then the fetched blob's type when
+ * it is an `image/*` type, then the URL's file extension, then `image/png`.
+ * A generic blob type such as `application/octet-stream` (common on object
+ * storage) is ignored so it cannot mask a usable URL extension.
+ *
+ * @param part - Image input to convert.
+ * @param fallbackName - Base filename (without extension) for the `File`.
+ * @returns A `File` named `<fallbackName>.<ext>` carrying the resolved type.
+ * @throws If `File` is unavailable or the URL fetch returns a non-OK status.
+ */
+export async function imagePartToFile(
+  part: ImagePart<MediaInputMetadata>,
+  fallbackName: string,
+): Promise<File> {
+  ensureFileSupport()
+
+  if (part.source.type === 'data') {
+    const mimeType = part.source.mimeType || DEFAULT_MIME
+    const bytes = base64ToArrayBuffer(part.source.value)
+    return new File([bytes], `${fallbackName}.${extForMime(mimeType)}`, {
+      type: mimeType,
+    })
+  }
+
+  // URL source — also handles data: URIs uniformly via fetch().
+  const response = await fetch(part.source.value)
+  if (!response.ok) {
+    throw new Error(
+      `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`,
+    )
+  }
+  const blob = await response.blob()
+  // Only trust the response type when it actually names an image format.
+  const fetchedType = mimeEssence(blob.type)
+  const mimeType =
+    part.source.mimeType ||
+    (fetchedType.startsWith('image/')
+      ? fetchedType
+      : inferMimeFromUrl(part.source.value))
+  return new File([blob], `${fallbackName}.${extForMime(mimeType)}`, {
+    type: mimeType,
+  })
+}
+
+/**
+ * Guess an image MIME type from a URL's file extension (before any query
+ * string or fragment); returns `image/png` when none is recognized.
+ */
+function inferMimeFromUrl(url: string): string {
+  const match = url.match(/\.(png|jpe?g|webp|gif)(?:\?|#|$)/i)
+  if (!match || !match[1]) return DEFAULT_MIME
+  const ext = match[1].toLowerCase()
+  if (ext === 'jpg' || ext === 'jpeg') return 'image/jpeg'
+  return `image/${ext}`
+}
diff --git a/packages/typescript/ai-openai/tests/image-adapter.test.ts b/packages/typescript/ai-openai/tests/image-adapter.test.ts
index 34cb5fda7..6c1da8143 100644
--- a/packages/typescript/ai-openai/tests/image-adapter.test.ts
+++ b/packages/typescript/ai-openai/tests/image-adapter.test.ts
@@ -25,6 +25,9 @@ class TestOpenAIImageAdapter<
   spyOnImagesGenerate() {
     return vi.spyOn(this.client.images, 'generate')
   }
+  spyOnImagesEdit() {
+    return vi.spyOn(this.client.images, 'edit')
+  }
 }
 
 describe('OpenAI Image Adapter', () => {
@@ -232,4 +235,162 @@ describe('OpenAI Image Adapter', () => {
       expect(result2.id).toMatch(/^openai-/)
     })
   })
+
+  describe('imageInputs (image-conditioned generation)', () => {
+    const imagesEditResponse: OpenAI.Images.ImagesResponse = {
+      created: 0,
+      data: [{ b64_json: 'edited-base64' }],
+    }
+
+    it('routes to images.edit() for gpt-image-1 when imageInputs is present', async () => {
+      const adapter = new TestOpenAIImageAdapter(
+        { apiKey: 'test-api-key' },
+        'gpt-image-1',
+      )
+      const editSpy = adapter
+        .spyOnImagesEdit()
+        .mockResolvedValueOnce(imagesEditResponse)
+      const generateSpy = adapter.spyOnImagesGenerate()
+
+      const result = await adapter.generateImages({
+        model: 'gpt-image-1',
+        prompt: 'Make it cinematic',
+        imageInputs: [
+          {
+            type: 'image',
+            source: {
+              type: 'data',
+              value: 'aGVsbG8=',
+              mimeType: 'image/png',
+            },
+          },
+        ],
+        logger: testLogger,
+      })
+
+      expect(generateSpy).not.toHaveBeenCalled()
+      expect(editSpy).toHaveBeenCalledTimes(1)
+      const editArgs = editSpy.mock.calls[0]![0]
+      expect(editArgs.model).toBe('gpt-image-1')
+      expect(editArgs.prompt).toBe('Make it cinematic')
+      expect(editArgs.image).toBeInstanceOf(File)
+      expect(result.images[0]!.b64Json).toBe('edited-base64')
+    })
+
+    it('rejects dall-e-3 with a clear error when imageInputs is present', async () => {
+      const adapter = new TestOpenAIImageAdapter(
+        { apiKey: 'test-api-key' },
+        'dall-e-3',
+      )
+
+      await expect(
+        adapter.generateImages({
+          model: 'dall-e-3',
+          prompt: 'edit',
+          imageInputs: [
+            {
+              type: 'image',
+              source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+            },
+          ],
+          logger: testLogger,
+        }),
+      ).rejects.toThrow(/does not support imageInputs/)
+    })
+
+    it('rejects dall-e-2 when more than one source image is provided', async () => {
+      const adapter = new TestOpenAIImageAdapter(
+        { apiKey: 'test-api-key' },
+        'dall-e-2',
+      )
+
+      await expect(
+        adapter.generateImages({
+          model: 'dall-e-2',
+          prompt: 'edit',
+          imageInputs: [
+            {
+              type: 'image',
+              source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+            },
+            {
+              type: 'image',
+              source: {
+                type: 'data',
+                value: 'YnllCg==',
+                mimeType: 'image/png',
+              },
+            },
+          ],
+          logger: testLogger,
+        }),
+      ).rejects.toThrow(/at most 1 source image/)
+    })
+
+    it('routes metadata.role==="mask" to the mask param', async () => {
+      const adapter = new TestOpenAIImageAdapter(
+        { apiKey: 'test-api-key' },
+        'gpt-image-1',
+      )
+      const editSpy = adapter
+        .spyOnImagesEdit()
+        .mockResolvedValueOnce(imagesEditResponse)
+
+      await adapter.generateImages({
+        model: 'gpt-image-1',
+        prompt: 'replace masked region',
+        imageInputs: [
+          {
+            type: 'image',
+            source: { type: 'data', value: 'aGk=', mimeType: 'image/png' },
+          },
+          {
+            type: 'image',
+            source: { type: 'data', value: 'bWFzaw==', mimeType: 'image/png' },
+            metadata: { role: 'mask' },
+          },
+        ],
+        logger: testLogger,
+      })
+
+      const editArgs = editSpy.mock.calls[0]![0]
+      expect(editArgs.mask).toBeInstanceOf(File)
+      expect(editArgs.image).toBeInstanceOf(File)
+    })
+
+    it('rejects videoInputs or audioInputs', async () => {
+      const adapter = new TestOpenAIImageAdapter(
+        { apiKey: 'test-api-key' },
+        'gpt-image-1',
+      )
+
+      await expect(
+        adapter.generateImages({
+          model: 'gpt-image-1',
+          prompt: 'x',
+          videoInputs: [
+            {
+              type: 'video',
+              source: { type: 'url', value: 'https://example.com/v.mp4' },
+            },
+          ],
+          logger: testLogger,
+        }),
+      ).rejects.toThrow(/videoInputs/)
+
+      await expect(
+        adapter.generateImages({
+          model: 'gpt-image-1',
+          prompt: 'x',
+          audioInputs: [
+            {
+              type: 'audio',
+              source: { type: 'url', value: 'https://example.com/a.mp3' },
+            },
+          ],
+          logger: testLogger,
+        }),
+      ).rejects.toThrow(/audioInputs/)
+    })
+  })
 })
diff --git a/packages/typescript/ai-openrouter/src/adapters/image.ts b/packages/typescript/ai-openrouter/src/adapters/image.ts
index ac746ab3b..4d4e75772 100644
--- a/packages/typescript/ai-openrouter/src/adapters/image.ts
+++ b/packages/typescript/ai-openrouter/src/adapters/image.ts
@@ -64,6 +64,18 @@ export class OpenRouterImageAdapter<
   async generateImages(
     options: ImageGenerationOptions<OpenRouterImageProviderOptions>,
   ): Promise<ImageGenerationResult> {
+    if (
+      options.imageInputs?.length ||
+      options.videoInputs?.length ||
+      options.audioInputs?.length
+    ) {
+      throw new Error(
+        `openrouter.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` +
+          `Image-conditioned generation via OpenRouter requires injecting parts into the multimodal ` +
+          `chat-completions messages array; this is tracked at https://github.com/TanStack/ai/issues/618.`,
+      )
+    }
+
     const { model, prompt, numberOfImages, size, modelOptions, logger } =
       options
     // Use provided aspect_ratio or derive from size
diff --git a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
index 0cd507a56..bf7ccaf0f 100644
--- a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
+++ b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md
@@ -189,6 +189,95 @@ Result shape: `ImageGenerationResult` with `images` array where each entry
 has `b64Json?`, `url?`, and `revisedPrompt?`. OpenAI image URLs expire
 after 1 hour -- download or display immediately.
 
+#### Image-conditioned generation: `imageInputs` / `videoInputs` / `audioInputs`
+
+Both `generateImage()` and `generateVideo()` accept multimodal conditioning
+inputs that reuse the existing `ImagePart` / `VideoPart` / `AudioPart`
+shape used elsewhere in TanStack AI. Each input may carry an optional
+`metadata.role` hint that adapters use to route the part to the
+provider-specific field.
+
+```typescript
+import { generateImage, type ImagePart } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+// Image-to-image (OpenAI gpt-image-1, dall-e-2)
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Turn this into a cinematic product photo',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: 'https://…/product.png' } },
+  ],
+})
+
+// Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native)
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Apply the second image as style to the first',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: 'https://…/product.png' } },
+    { type: 'image', source: { type: 'url', value: 'https://…/style.png' } },
+  ],
+})
+
+// Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url)
+await generateImage({
+  adapter: openaiImage('gpt-image-1'),
+  prompt: 'Replace the masked region with a tree',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: photoUrl } },
+    {
+      type: 'image',
+      source: { type: 'url', value: maskUrl },
+      metadata: { role: 'mask' },
+    },
+  ],
+})
+
+// Image-to-video (OpenAI Sora: single input_reference; fal: image_url + optional end_image_url)
+import { generateVideo } from '@tanstack/ai'
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+  adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+  prompt: 'Slow cinematic push-in',
+  imageInputs: [
+    { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+    {
+      type: 'image',
+      source: { type: 'url', value: lastFrameUrl },
+      metadata: { role: 'end_frame' },
+    },
+  ],
+})
+```
+
+**Role hints** (`metadata.role`):
+
+| Role            | Maps to                                                                  |
+| --------------- | ------------------------------------------------------------------------ |
+| `'reference'`   | fal `reference_image_urls`; Gemini multimodal part; positional otherwise |
+| `'character'`   | Same as `'reference'`; Veo `referenceImages` slot (no adapter yet)       |
+| `'mask'`        | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url`                    |
+| `'control'`     | fal `control_image_url` (ControlNet / depth / pose)                      |
+| `'start_frame'` | fal `start_image_url`; Veo `image` (no adapter yet)                      |
+| `'end_frame'`   | fal `end_image_url`; Veo `lastFrame` (no adapter yet)                    |
+
+**Provider support matrix:**
+
+| Provider   | `generateImage` `imageInputs`                                                                                | `generateVideo` `imageInputs`                                                                                               |
+| ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- |
+| OpenAI     | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws.                      | Sora-2 / -pro → `input_reference` (single). Throws if >1.                                                                   |
+| Gemini     | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws.                        | No native Veo adapter yet — deferred to a follow-up.                                                                        |
+| fal        | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. |
+| Grok       | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending.                            | n/a                                                                                                                         |
+| OpenRouter | Throws — multimodal injection pending.                                                                       | n/a                                                                                                                         |
+| Anthropic  | n/a (no image generation API).                                                                               | n/a                                                                                                                         |
+
+`videoInputs` and `audioInputs` follow the same `metadata.role` convention
+for video-to-video and lipsync flows on fal; other providers throw when
+they're passed.
+
 ### 2. Audio Generation (Music, Sound Effects)
 
 Distinct from TTS — `generateAudio()` produces non-speech audio content.
@@ -579,7 +668,45 @@ generateSpeech({
 
 > Source: Gemini TTS adapter validation; CodeRabbit review of PR #463.
 
-### h. LOW: Writing a logging middleware to see media chunks flow through
+### h. HIGH: Passing `imageInputs` to a model that doesn't support image-conditioned generation
+
+Not every model accepts image-conditioned inputs. Adapters throw a clear
+runtime error when the caller passes `imageInputs` to a model that
+can't honor it (dall-e-3, Imagen, Grok, OpenRouter), so users learn at
+call time rather than getting silently wrong output.
+
+```typescript
+// WRONG — dall-e-3 has no edit/inputs API
+generateImage({
+  adapter: openaiImage('dall-e-3'),
+  prompt: 'Edit this',
+  imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+}) // throws: model "dall-e-3" does not support imageInputs.
+
+// WRONG — Imagen is text-to-image only
+generateImage({
+  adapter: geminiImage('imagen-4.0-generate-001'),
+  prompt: 'Edit this',
+  imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+}) // throws: Imagen does not support imageInputs.
+
+// CORRECT — use a model that supports edits/inputs
+generateImage({
+  adapter: openaiImage('gpt-image-1'), // edits up to 16 images
+  prompt: 'Edit this',
+  imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+})
+
+generateImage({
+  adapter: geminiImage('gemini-3.1-flash-image-preview'), // native multimodal
+  prompt: 'Edit this',
+  imageInputs: [{ type: 'image', source: { type: 'url', value: url } }],
+})
+```
+
+> Source: docs/media/image-generation.md, docs/media/video-generation.md.
+
+### i. LOW: Writing a logging middleware to see media chunks flow through
 
 Every media activity — `generateAudio`, `generateSpeech`,
 `generateTranscription`, `generateImage`, `generateVideo` — accepts the
diff --git a/packages/typescript/ai/src/activities/generateImage/index.ts b/packages/typescript/ai/src/activities/generateImage/index.ts
index b8d173b09..285fb6a56 100644
--- a/packages/typescript/ai/src/activities/generateImage/index.ts
+++ b/packages/typescript/ai/src/activities/generateImage/index.ts
@@ -11,7 +11,14 @@ import { resolveDebugOption } from '../../logger/resolve'
 import type { InternalLogger } from '../../logger/internal-logger'
 import type { DebugOption } from '../../logger/types'
 import type { ImageAdapter } from './adapter'
-import type { ImageGenerationResult, StreamChunk } from '../../types'
+import type {
+  AudioPart,
+  ImageGenerationResult,
+  ImagePart,
+  MediaInputMetadata,
+  StreamChunk,
+  VideoPart,
+} from '../../types'
 
 // ===========================
 // Activity Kind
@@ -78,6 +85,17 @@ export type ImageActivityOptions<
   numberOfImages?: number
   /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */
   size?: ImageSizeForModel<TAdapter, TAdapter['model']>
+  /**
+   * Image conditioning inputs for image-to-image, reference-guided, edit, or
+   * multi-reference generation. Each part may carry `metadata.role`
+   * (`'reference' | 'mask' | 'control' | 'character'`) to disambiguate intent.
+   * Adapters that don't support image-conditioned generation throw clearly.
+   */
+  imageInputs?: Array<ImagePart<MediaInputMetadata>>
+  /** Video conditioning inputs. Provider support varies; unsupported adapters throw. */
+  videoInputs?: Array<VideoPart<MediaInputMetadata>>
+  /** Audio conditioning inputs. Provider support varies; unsupported adapters throw. */
+  audioInputs?: Array<AudioPart<MediaInputMetadata>>
   /**
    * Whether to stream the image generation result.
    * When true, returns an AsyncIterable<StreamChunk> for streaming transport.
@@ -210,6 +228,9 @@ async function runGenerateImage<
     prompt: rest.prompt,
     numberOfImages: rest.numberOfImages,
     size: rest.size,
+    imageInputCount: rest.imageInputs?.length,
+    videoInputCount: rest.videoInputs?.length,
+    audioInputCount: rest.audioInputs?.length,
     modelOptions: rest.modelOptions,
     timestamp: startTime,
   })
diff --git a/packages/typescript/ai/src/activities/generateVideo/index.ts b/packages/typescript/ai/src/activities/generateVideo/index.ts
index cee2339f7..ee9b2f9b0 100644
--- a/packages/typescript/ai/src/activities/generateVideo/index.ts
+++ b/packages/typescript/ai/src/activities/generateVideo/index.ts
@@ -14,8 +14,12 @@ import type { InternalLogger } from '../../logger/internal-logger'
 import type { DebugOption } from '../../logger/types'
 import type { VideoAdapter } from './adapter'
 import type {
+  AudioPart,
+  ImagePart,
+  MediaInputMetadata,
   StreamChunk,
   VideoJobResult,
+  VideoPart,
   VideoStatusResult,
   VideoUrlResult,
 } from '../../types'
@@ -89,6 +93,16 @@ export type VideoCreateOptions<
   size?: VideoSizeForAdapter<TAdapter>
   /** Video duration in seconds */
   duration?: number
+  /**
+   * Image conditioning inputs (start frame, end frame, reference / character
+   * images). Use `metadata.role` (`'start_frame' | 'end_frame' | 'reference' |
+   * 'character'`) to disambiguate intent; positional fallback otherwise.
+   */
+  imageInputs?: Array<ImagePart<MediaInputMetadata>>
+  /** Video conditioning inputs (video-to-video, source clip). */
+  videoInputs?: Array<VideoPart<MediaInputMetadata>>
+  /** Audio conditioning inputs (lipsync source, voice reference). */
+  audioInputs?: Array<AudioPart<MediaInputMetadata>>
   /**
    * Whether to stream the video generation lifecycle.
    * When true, returns an AsyncIterable<StreamChunk> that handles the full
@@ -249,7 +263,16 @@ export function generateVideo<
 async function runCreateVideoJob<
   TAdapter extends VideoAdapter<string, any, any, any>,
 >(options: VideoCreateOptions<TAdapter, boolean>): Promise<VideoJobResult> {
-  const { adapter, prompt, size, duration, modelOptions } = options
+  const {
+    adapter,
+    prompt,
+    size,
+    duration,
+    modelOptions,
+    imageInputs,
+    videoInputs,
+    audioInputs,
+  } = options
   const model = adapter.model
   const logger: InternalLogger = resolveDebugOption(options.debug)
   const providerName =
@@ -269,6 +292,9 @@ async function runCreateVideoJob<
       size,
       duration,
       modelOptions,
+      imageInputs,
+      videoInputs,
+      audioInputs,
       logger,
     })
     logger.output(`activity=generateVideo jobId=${result.jobId}`, {
@@ -296,7 +322,16 @@ function sleep(ms: number): Promise<void> {
 async function* runStreamingVideoGeneration<
   TAdapter extends VideoAdapter<string, any, any, any>,
 >(options: VideoCreateOptions<TAdapter, true>): AsyncIterable<StreamChunk> {
-  const { adapter, prompt, size, duration, modelOptions } = options
+  const {
+    adapter,
+    prompt,
+    size,
+    duration,
+    modelOptions,
+    imageInputs,
+    videoInputs,
+    audioInputs,
+  } = options
   const model = adapter.model
   const runId = options.runId ?? createId('run')
   const pollingInterval = options.pollingInterval ?? 2000
@@ -332,6 +367,9 @@ async function* runStreamingVideoGeneration<
       size,
       duration,
       modelOptions,
+      imageInputs,
+      videoInputs,
+      audioInputs,
       logger,
     })
 
diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts
index a12964981..5528ca9f9 100644
--- a/packages/typescript/ai/src/types.ts
+++ b/packages/typescript/ai/src/types.ts
@@ -1409,6 +1409,31 @@ export interface SummarizationResult {
 // Image Generation Types
 // ============================================================================
 
+/**
+ * Optional role hint on a media input part (image / video / audio). Adapters
+ * read `metadata.role` to route the part to the provider-specific request
+ * field — e.g. `'mask'` → OpenAI `mask` / fal `mask_url`, `'end_frame'` → fal
+ * `end_image_url`, `'reference'` → fal `reference_image_urls`. When omitted
+ * the adapter falls back to positional routing.
+ */
+export type MediaInputRole =
+  | 'reference'
+  | 'mask'
+  | 'control'
+  | 'start_frame'
+  | 'end_frame'
+  | 'character'
+
+/**
+ * Metadata convention for image / video / audio inputs to media generation.
+ * Carried on `ImagePart.metadata` / `VideoPart.metadata` / `AudioPart.metadata`
+ * when used as conditioning inputs to `generateImage()` or `generateVideo()`.
+ */
+export interface MediaInputMetadata {
+  /** Optional role hint disambiguating the part's intent for the adapter */
+  role?: MediaInputRole
+}
+
 /**
  * Options for image generation.
  * These are the common options supported across providers.
@@ -1425,6 +1450,25 @@ export interface ImageGenerationOptions<
   numberOfImages?: number
   /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */
   size?: TSize
+  /**
+   * Image conditioning inputs (reference / mask / control / character).
+   * Reuses the multimodal `ImagePart` shape. Adapters map these
+   * onto the provider-native request — e.g. OpenAI `images.edit()`, Gemini
+   * multimodal `contents`, fal `image_url` / `image_urls` / `mask_url`.
+   * Adapters that do not support image-conditioned generation throw a clear
+   * runtime error when this field is non-empty.
+   */
+  imageInputs?: Array<ImagePart<MediaInputMetadata>>
+  /**
+   * Video conditioning inputs (video-to-video, edit, lipsync source).
+   * Not all providers support this; adapters throw when unsupported.
+   */
+  videoInputs?: Array<VideoPart<MediaInputMetadata>>
+  /**
+   * Audio conditioning inputs (audio reference, voice cloning, lipsync).
+   * Not all providers support this; adapters throw when unsupported.
+   */
+  audioInputs?: Array<AudioPart<MediaInputMetadata>>
   /** Model-specific options for image generation */
   modelOptions?: TProviderOptions
   /**
@@ -1555,6 +1599,24 @@ export interface VideoGenerationOptions<
   size?: TSize
   /** Video duration in seconds */
   duration?: number
+  /**
+   * Image conditioning inputs (start frame, end frame, character / reference
+   * images). Reuses the multimodal `ImagePart` shape; adapters route by
+   * `metadata.role` and array position (e.g. OpenAI Sora `input_reference`,
+   * fal `image_url` / `end_image_url`, Veo `image` / `lastFrame` /
+   * `referenceImages`). Adapters throw at runtime if unsupported.
+   */
+  imageInputs?: Array<ImagePart<MediaInputMetadata>>
+  /**
+   * Video conditioning inputs (video-to-video edit, source clip).
+   * Not all providers support this; adapters throw when unsupported.
+   */
+  videoInputs?: Array<VideoPart<MediaInputMetadata>>
+  /**
+   * Audio conditioning inputs (lipsync source, voice reference).
+   * Not all providers support this; adapters throw when unsupported.
+   */
+  audioInputs?: Array<AudioPart<MediaInputMetadata>>
   /** Model-specific options for video generation */
   modelOptions?: TProviderOptions
   /**
diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts
index 3b464be5b..89c632208 100644
--- a/testing/e2e/src/lib/feature-support.ts
+++ b/testing/e2e/src/lib/feature-support.ts
@@ -142,9 +142,19 @@ export const matrix: Record<Feature, Set<Provider>> = {
   ]),
   // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format
   'image-gen': new Set(['openai', 'grok']),
+  // image-to-image (imageInputs on generateImage) routes adapters to wire
+  // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini
+  // multimodal `generateContent`, fal endpoint-specific input fields).
+  // Adapter-level mapping is covered by unit tests. Populate this set when
+  // aimock gains support for those endpoints.
+  'image-to-image': new Set([]),
   tts: new Set(['openai', 'grok']),
   transcription: new Set(['openai', 'grok']),
   'video-gen': new Set(['openai']),
+  // image-to-video (imageInputs on generateVideo) similarly depends on
+  // aimock mocking Sora's `input_reference` upload field. Populate when
+  // aimock support lands.
+  'image-to-video': new Set([]),
 }
 
 export function isSupported(provider: Provider, feature: Feature): boolean {
diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts
index be405c74d..dc839fcc4 100644
--- a/testing/e2e/src/lib/types.ts
+++ b/testing/e2e/src/lib/types.ts
@@ -27,9 +27,11 @@ export type Feature =
   | 'summarize'
   | 'summarize-stream'
   | 'image-gen'
+  | 'image-to-image'
   | 'tts'
   | 'transcription'
   | 'video-gen'
+  | 'image-to-video'
 
 export const ALL_PROVIDERS: Provider[] = [
   'openai',
@@ -59,7 +61,9 @@ export const ALL_FEATURES: Feature[] = [
   'summarize',
   'summarize-stream',
   'image-gen',
+  'image-to-image',
   'tts',
   'transcription',
   'video-gen',
+  'image-to-video',
 ]