diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md new file mode 100644 index 000000000..3620076c0 --- /dev/null +++ b/.changeset/image-and-video-inputs.md @@ -0,0 +1,22 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-openai': minor +'@tanstack/ai-gemini': minor +'@tanstack/ai-fal': minor +'@tanstack/ai-grok': patch +'@tanstack/ai-openrouter': patch +'@tanstack/ai-event-client': patch +--- + +Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field. + +Provider behavior in this release: + +- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error. +- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws. +- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only). +- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`. +- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work). +- **Anthropic** — Unchanged (no image generation API). + +Closes #618. 
diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index 6c3c6e115..5ba27fe94 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -82,6 +82,9 @@ All image adapters support these common options: | `prompt` | `string` | Text description of the image to generate (required) | | `numberOfImages` | `number` | Number of images to generate | | `size` | `string` | Size of the generated image in WIDTHxHEIGHT format | +| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. | +| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. | +| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. | | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | ### Size Options @@ -132,6 +135,114 @@ const result = await generateImage({ }) ``` +## Image-Conditioned Generation + +`generateImage()` accepts an optional `imageInputs` field for image-to-image, +reference-guided, multi-reference, and edit / inpaint flows. 
The field reuses +the same `ImagePart` shape used elsewhere for multimodal content: + +```typescript +import { generateImage, type ImagePart } from '@tanstack/ai' +import { openaiImage } from '@tanstack/ai-openai' + +const reference: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, +} + +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Turn this into a cinematic product photo', + imageInputs: [reference], +}) +``` + +### Source format + +`ImagePart.source` is a discriminated union supporting both URLs and inline +base64 data — pass whichever you have: + +```typescript +// URL source +{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } } + +// Inline base64 data (mimeType required) +{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } } +``` + +OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources +and converts base64 to a `File` automatically. + +### Role hints via `metadata.role` + +When a generation has multiple inputs with different roles (mask vs reference +vs start/end frame), set `metadata.role` on each part. Adapters route by role +to the provider-specific field; parts without a role fall back to positional +mapping. 
+ +| Role | Maps to | +| --------------- | -------------------------------------------------------------------------------------- | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional fallback | +| `'character'` | Same as `'reference'`; Veo `referenceImages` slot | +| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose conditioning) | +| `'start_frame'` | fal `start_image_url`; Veo `image` (used by `generateVideo`) | +| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` (used by `generateVideo`) | + +#### Inpaint / edit with a mask + +```typescript +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Replace the masked region with a tree', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: photoUrl }, + }, + { + type: 'image', + source: { type: 'url', value: maskUrl }, + metadata: { role: 'mask' }, + }, + ], +}) +``` + +#### Multi-reference composition + +```typescript +const product: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, +} + +const style: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/style.png' }, +} + +await generateImage({ + adapter: geminiImage('gemini-3.1-flash-image-preview'), + prompt: 'Generate a new image of the product using the style of the second reference', + imageInputs: [product, style], +}) +``` + +### Provider support + +| Provider | Behavior | +| ------------ | --------------------------------------------------------------------------------------------------------- | +| **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.
`dall-e-2` → `images.edit()` with 1 source image only.
`dall-e-3` → throws (no edit support). | +| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.
Imagen models → throws (text-to-image only). | +| **fal.ai** | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | +| **Grok** | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up. | +| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up. | +| **Anthropic** | n/a — no image generation API. | + +Adapters that don't support image-conditioned generation throw a clear +runtime error so calls fail fast rather than silently dropping the inputs. + ## Model Options ### OpenAI Model Options diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index b42e88b6b..1872a39be 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -372,8 +372,76 @@ And returns: | `prompt` | `string` | Text description of the video to generate (required) | | `size` | `string` | Video resolution in WIDTHxHEIGHT format | | `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) | +| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. | +| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. | +| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. 
| | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | +## Image-to-Video + +`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame, +and reference-image conditioned video generation: + +```typescript +import { generateVideo, type ImagePart } from '@tanstack/ai' +import { openaiVideo } from '@tanstack/ai-openai' + +const startingFrame: ImagePart = { + type: 'image', + source: { + type: 'data', + value: base64Image, + mimeType: 'image/png', + }, +} + +const { jobId } = await generateVideo({ + adapter: openaiVideo('sora-2'), + prompt: 'Animate this still into a slow cinematic push-in with subtle motion', + imageInputs: [startingFrame], +}) +``` + +### Role hints + +Each `ImagePart` can carry an optional `metadata.role` hint that the +adapter uses to route the input to the provider-specific field: + +| Role | Maps to | +| --------------- | ------------------------------------------------------------- | +| `'start_frame'` | fal `start_image_url` (inputs without a role map to `image_url` / `image_urls` instead) | +| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` when available) | +| `'reference'` | fal `reference_image_urls` (Veo `referenceImages`) | +| `'character'` | Same as `'reference'` — character consistency images | + +```typescript +import { falVideo } from '@tanstack/ai-fal' + +await generateVideo({ + adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), + prompt: 'Slow cinematic push-in then a hard cut', + imageInputs: [ + { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { + type: 'image', + source: { type: 'url', value: lastFrameUrl }, + metadata: { role: 'end_frame' }, + }, + ], +}) +``` + +### Provider support + +| Provider | Image-to-Video Behavior | +| ------------ | -------------------------------------------------------------------------------------------------------- | +| **OpenAI** | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. 
| +| **fal.ai** | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | +| **Gemini** | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands. | + +Adapters whose underlying API can't accept image inputs throw a clear +runtime error so calls fail fast. + ### Supported Sizes Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create): diff --git a/packages/typescript/ai-event-client/src/index.ts b/packages/typescript/ai-event-client/src/index.ts index 17fa1d6e0..acfc953c4 100644 --- a/packages/typescript/ai-event-client/src/index.ts +++ b/packages/typescript/ai-event-client/src/index.ts @@ -428,6 +428,12 @@ export interface ImageRequestStartedEvent extends BaseEventContext { prompt: string numberOfImages?: number size?: string + /** Count of image conditioning inputs (image-to-image, mask, reference). */ + imageInputCount?: number + /** Count of video conditioning inputs (video-to-video). */ + videoInputCount?: number + /** Count of audio conditioning inputs (lipsync, voice reference). */ + audioInputCount?: number } /** Emitted when an image request completes. 
*/ diff --git a/packages/typescript/ai-fal/src/adapters/image.ts b/packages/typescript/ai-fal/src/adapters/image.ts index 11dcbaeb2..7fbb14a08 100644 --- a/packages/typescript/ai-fal/src/adapters/image.ts +++ b/packages/typescript/ai-fal/src/adapters/image.ts @@ -2,6 +2,7 @@ import { fal } from '@fal-ai/client' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { configureFalClient, generateId as utilGenerateId } from '../utils' import { mapSizeToFalFormat } from '../image/image-provider-options' +import { mapImageInputsToFalFields } from '../image/image-inputs' import type { OutputType, Result } from '@fal-ai/client' import type { FalClientConfig } from '../utils' import type { @@ -63,6 +64,17 @@ export class FalImageAdapter extends BaseImageAdapter< model: this.model, }) + if (options.videoInputs?.length) { + throw new Error( + `fal.generateImages does not support videoInputs on model ${this.model}.`, + ) + } + if (options.audioInputs?.length) { + throw new Error( + `fal.generateImages does not support audioInputs on model ${this.model}.`, + ) + } + try { const input = this.buildInput(options) const result = await fal.subscribe(this.model, { input }) @@ -83,9 +95,14 @@ export class FalImageAdapter extends BaseImageAdapter< >, ): FalModelInput { const sizeParams = mapSizeToFalFormat(options.size) + // Order matters: modelOptions first (so user overrides win for + // mask_url / control_image_url / reference_image_urls), then size, + // then derived image-input fields, then prompt / num_images. 
+ const inputFields = mapImageInputsToFalFields(options.imageInputs) const input = { ...options.modelOptions, ...sizeParams, + ...inputFields, prompt: options.prompt, num_images: options.numberOfImages, } as FalModelInput diff --git a/packages/typescript/ai-fal/src/adapters/video.ts b/packages/typescript/ai-fal/src/adapters/video.ts index 662b4f60f..6f3dbb1ef 100644 --- a/packages/typescript/ai-fal/src/adapters/video.ts +++ b/packages/typescript/ai-fal/src/adapters/video.ts @@ -2,6 +2,8 @@ import { fal } from '@fal-ai/client' import { BaseVideoAdapter } from '@tanstack/ai/adapters' import { configureFalClient, generateId as utilGenerateId } from '../utils' import { mapVideoSizeToFalFormat } from '../video/video-provider-options' +import { mapImageInputsToFalVideoFields } from '../image/image-inputs' +import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai' import type { VideoGenerationOptions, VideoJobResult, @@ -16,6 +18,63 @@ import type { } from '../model-meta' import type { FalClientConfig } from '../utils' +/** + * Map video conditioning inputs onto fal field names. + * Video-to-video endpoints on fal almost universally use `video_url`; the + * occasional model takes `video_urls` (rare). Mirror the image-input logic + * positionally with a `reference` role escape hatch via `reference_video_urls`. 
+ */ +function mapVideoInputsToFalFields( + videoInputs?: ReadonlyArray>, +): Record { + if (!videoInputs || videoInputs.length === 0) return {} + const references: Array = [] + const sources: Array = [] + for (const part of videoInputs) { + const url = videoPartToUrl(part) + if ( + part.metadata?.role === 'reference' || + part.metadata?.role === 'character' + ) { + references.push(url) + } else { + sources.push(url) + } + } + const out: Record = {} + if (references.length > 0) out.reference_video_urls = references + if (sources.length === 1) { + out.video_url = sources[0] + } else if (sources.length > 1) { + out.video_urls = sources + } + return out +} + +function mapAudioInputsToFalFields( + audioInputs?: ReadonlyArray>, +): Record { + if (!audioInputs || audioInputs.length === 0) return {} + if (audioInputs.length > 1) { + throw new Error( + `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`, + ) + } + const part = audioInputs[0]! + return { + audio_url: + part.source.type === 'url' + ? part.source.value + : `data:${part.source.mimeType};base64,${part.source.value}`, + } +} + +function videoPartToUrl(part: VideoPart): string { + return part.source.type === 'url' + ? 
part.source.value + : `data:${part.source.mimeType};base64,${part.source.value}` +} + type FalQueueStatus = 'IN_QUEUE' | 'IN_PROGRESS' | 'COMPLETED' interface FalStatusResponse { @@ -80,7 +139,16 @@ export class FalVideoAdapter extends BaseVideoAdapter< FalModelVideoSize >, ): Promise { - const { prompt, size, duration, modelOptions, logger } = options + const { + prompt, + size, + duration, + modelOptions, + logger, + imageInputs, + videoInputs, + audioInputs, + } = options logger.request(`activity=generateVideo provider=fal model=${this.model}`, { provider: 'fal', @@ -89,10 +157,16 @@ export class FalVideoAdapter extends BaseVideoAdapter< try { const sizeParams = mapVideoSizeToFalFormat(size) + const inputImageFields = mapImageInputsToFalVideoFields(imageInputs) + const videoFields = mapVideoInputsToFalFields(videoInputs) + const audioFields = mapAudioInputsToFalFields(audioInputs) const input = { ...modelOptions, ...sizeParams, + ...inputImageFields, + ...videoFields, + ...audioFields, prompt, ...(duration ? { duration } : {}), } as FalModelInput diff --git a/packages/typescript/ai-fal/src/image/image-inputs.ts b/packages/typescript/ai-fal/src/image/image-inputs.ts new file mode 100644 index 000000000..0a5a06ca3 --- /dev/null +++ b/packages/typescript/ai-fal/src/image/image-inputs.ts @@ -0,0 +1,165 @@ +import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' + +/** + * Map TanStack `imageInputs` onto fal.ai endpoint fields. + * + * fal endpoints use different field names for image-conditioned generation + * (~80% use `image_url` for single; the rest use `image_urls`, + * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). 
Without + * per-endpoint metadata we apply this heuristic: + * + * - parts with `metadata.role === 'mask'` → `mask_url` (single) + * - parts with `metadata.role === 'control'` → `control_image_url` (single) + * - parts with `metadata.role === 'reference'` → `reference_image_urls` (array) + * - parts with `metadata.role === 'character'` → `reference_image_urls` (array) + * - remaining parts (no role, or unknown role): + * - exactly 1 part → `image_url` + * - >1 parts → `image_urls` + * + * Users can always override the resulting field shape via `modelOptions` + * (spread before these fields), or pass everything through `modelOptions` + * directly when the heuristic doesn't match an obscure endpoint. + * + * This mapping is interim and will be replaced by a per-endpoint mapping + * sourced from the `@fal-ai/schemas` library once it lands. + */ +export function mapImageInputsToFalFields( + imageInputs?: ReadonlyArray>, +): Record { + if (!imageInputs || imageInputs.length === 0) return {} + + const fields: Record = {} + + const masks: Array = [] + const controls: Array = [] + const references: Array = [] + const sources: Array = [] + + for (const part of imageInputs) { + const url = imagePartToUrl(part) + const role = part.metadata?.role + switch (role) { + case 'mask': + masks.push(url) + break + case 'control': + controls.push(url) + break + case 'reference': + case 'character': + references.push(url) + break + case 'start_frame': + case 'end_frame': + // Frame roles aren't meaningful for image generation; treat as the + // primary source. Video adapter handles start/end framing. 
+ sources.push(url) + break + default: + sources.push(url) + } + } + + if (masks.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'mask' is supported per request (received ${masks.length}).`, + ) + } + if (controls.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'control' is supported per request (received ${controls.length}).`, + ) + } + + if (masks[0]) fields.mask_url = masks[0] + if (controls[0]) fields.control_image_url = controls[0] + if (references.length > 0) fields.reference_image_urls = references + + if (sources.length === 1) { + fields.image_url = sources[0] + } else if (sources.length > 1) { + fields.image_urls = sources + } + + return fields +} + +/** + * Map TanStack `imageInputs` onto fal.ai video-endpoint fields. + * + * Video endpoints often expose a start frame as `image_url` (76% of i2v + * models) plus an optional `end_image_url`. Multi-reference video models + * (Kling O3, Seedance reference-to-video) use `reference_image_urls` or + * `image_urls`. 
Mapping: + * + * - `metadata.role === 'start_frame'` → `start_image_url` + * - `metadata.role === 'end_frame'` → `end_image_url` + * - `metadata.role === 'reference' | 'character'` → `reference_image_urls` + * - remaining parts (no role or unknown role): + * - exactly 1 part → `image_url` + * - >1 parts → `image_urls` + */ +export function mapImageInputsToFalVideoFields( + imageInputs?: ReadonlyArray>, +): Record { + if (!imageInputs || imageInputs.length === 0) return {} + + const fields: Record = {} + + const startFrames: Array = [] + const endFrames: Array = [] + const references: Array = [] + const sources: Array = [] + + for (const part of imageInputs) { + const url = imagePartToUrl(part) + const role = part.metadata?.role + switch (role) { + case 'start_frame': + startFrames.push(url) + break + case 'end_frame': + endFrames.push(url) + break + case 'reference': + case 'character': + references.push(url) + break + default: + sources.push(url) + } + } + + if (startFrames.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'start_frame' is supported (received ${startFrames.length}).`, + ) + } + if (endFrames.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'end_frame' is supported (received ${endFrames.length}).`, + ) + } + + if (startFrames[0]) fields.start_image_url = startFrames[0] + if (endFrames[0]) fields.end_image_url = endFrames[0] + if (references.length > 0) fields.reference_image_urls = references + + if (sources.length === 1) { + fields.image_url = sources[0] + } else if (sources.length > 1) { + fields.image_urls = sources + } + + return fields +} + +/** + * Convert a TanStack ImagePart into a string suitable for fal's URL-based + * input fields. URL sources pass through; data sources are emitted as a + * `data:;base64,` URI which fal endpoints accept on the wire. 
+ */ +function imagePartToUrl(part: ImagePart): string { + if (part.source.type === 'url') return part.source.value + return `data:${part.source.mimeType};base64,${part.source.value}` +} diff --git a/packages/typescript/ai-fal/tests/image-inputs.test.ts b/packages/typescript/ai-fal/tests/image-inputs.test.ts new file mode 100644 index 000000000..0ed534080 --- /dev/null +++ b/packages/typescript/ai-fal/tests/image-inputs.test.ts @@ -0,0 +1,140 @@ +import { describe, expect, it } from 'vitest' +import { + mapImageInputsToFalFields, + mapImageInputsToFalVideoFields, +} from '../src/image/image-inputs' +import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' + +function urlPart( + value: string, + metadata?: MediaInputMetadata, +): ImagePart { + return { + type: 'image', + source: { type: 'url', value }, + ...(metadata && { metadata }), + } +} + +describe('mapImageInputsToFalFields', () => { + it('returns an empty object when imageInputs is missing or empty', () => { + expect(mapImageInputsToFalFields(undefined)).toEqual({}) + expect(mapImageInputsToFalFields([])).toEqual({}) + }) + + it('routes a single source to image_url', () => { + expect( + mapImageInputsToFalFields([urlPart('https://example.com/a.png')]), + ).toEqual({ image_url: 'https://example.com/a.png' }) + }) + + it('routes multiple sources to image_urls', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/a.png'), + urlPart('https://example.com/b.png'), + ]), + ).toEqual({ + image_urls: ['https://example.com/a.png', 'https://example.com/b.png'], + }) + }) + + it('routes role=mask to mask_url alongside the source image_url', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/img.png'), + urlPart('https://example.com/mask.png', { role: 'mask' }), + ]), + ).toEqual({ + image_url: 'https://example.com/img.png', + mask_url: 'https://example.com/mask.png', + }) + }) + + it('routes role=reference to reference_image_urls', () => { + expect( + 
mapImageInputsToFalFields([ + urlPart('https://example.com/product.png'), + urlPart('https://example.com/style.png', { role: 'reference' }), + urlPart('https://example.com/character.png', { role: 'character' }), + ]), + ).toEqual({ + image_url: 'https://example.com/product.png', + reference_image_urls: [ + 'https://example.com/style.png', + 'https://example.com/character.png', + ], + }) + }) + + it('routes role=control to control_image_url', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/img.png'), + urlPart('https://example.com/depth.png', { role: 'control' }), + ]), + ).toEqual({ + image_url: 'https://example.com/img.png', + control_image_url: 'https://example.com/depth.png', + }) + }) + + it('encodes data sources as data URIs', () => { + expect( + mapImageInputsToFalFields([ + { + type: 'image', + source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' }, + }, + ]), + ).toEqual({ image_url: 'data:image/png;base64,aGVsbG8=' }) + }) + + it('throws when more than one mask is provided', () => { + expect(() => + mapImageInputsToFalFields([ + urlPart('https://example.com/m1.png', { role: 'mask' }), + urlPart('https://example.com/m2.png', { role: 'mask' }), + ]), + ).toThrow(/only one input with metadata.role === 'mask'/) + }) +}) + +describe('mapImageInputsToFalVideoFields', () => { + it('returns empty for missing/empty inputs', () => { + expect(mapImageInputsToFalVideoFields(undefined)).toEqual({}) + expect(mapImageInputsToFalVideoFields([])).toEqual({}) + }) + + it('routes a single positional source to image_url (start frame)', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/start.png'), + ]), + ).toEqual({ image_url: 'https://example.com/start.png' }) + }) + + it('routes role=start_frame to start_image_url and role=end_frame to end_image_url', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/a.png', { role: 'start_frame' }), + 
urlPart('https://example.com/z.png', { role: 'end_frame' }), + ]), + ).toEqual({ + start_image_url: 'https://example.com/a.png', + end_image_url: 'https://example.com/z.png', + }) + }) + + it('routes role=reference to reference_image_urls', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/start.png'), + urlPart('https://example.com/character.png', { role: 'reference' }), + ]), + ).toEqual({ + image_url: 'https://example.com/start.png', + reference_image_urls: ['https://example.com/character.png'], + }) + }) +}) diff --git a/packages/typescript/ai-gemini/src/adapters/image.ts b/packages/typescript/ai-gemini/src/adapters/image.ts index 612385e16..90063cfe3 100644 --- a/packages/typescript/ai-gemini/src/adapters/image.ts +++ b/packages/typescript/ai-gemini/src/adapters/image.ts @@ -1,4 +1,5 @@ import { BaseImageAdapter } from '@tanstack/ai/adapters' +import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { createGeminiClient, generateId, @@ -21,13 +22,17 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type { + Content, GenerateContentConfig, GenerateContentResponse, GenerateImagesConfig, GenerateImagesResponse, GoogleGenAI, + Part, } from '@google/genai' import type { GeminiClientConfig } from '../utils' @@ -94,10 +99,29 @@ export class GeminiImageAdapter< try { validatePrompt({ prompt, model }) + if (options.videoInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + ) + } + if (options.audioInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + ) + } + if (this.isGeminiImageModel(model)) { return await this.generateWithGeminiApi(options) } + // Imagen does not accept image inputs — it's strictly text-to-image. 
+ if (options.imageInputs?.length) { + throw new Error( + `${this.name}: model "${model}" (Imagen) does not support imageInputs. ` + + `Use a Gemini-native image model (e.g. gemini-2.5-flash-image, "nano-banana") for image-conditioned generation.`, + ) + } + // Imagen models path (generateImages API) validateImageSize(model, options.size) validateNumberOfImages(model, options.numberOfImages) @@ -127,7 +151,8 @@ export class GeminiImageAdapter< private async generateWithGeminiApi( options: ImageGenerationOptions, ): Promise { - const { model, prompt, size, numberOfImages, modelOptions } = options + const { model, prompt, size, numberOfImages, modelOptions, imageInputs } = + options const parsedSize = size ? parseNativeImageSize(size) : undefined @@ -169,15 +194,81 @@ export class GeminiImageAdapter< }), } + const contents = await this.buildContents(augmentedPrompt, imageInputs) + const response = await this.client.models.generateContent({ model, - contents: augmentedPrompt, + contents, config, }) return this.transformGeminiResponse(model, response) } + /** + * Build the multimodal `contents` payload. When `imageInputs` is empty the + * SDK accepts a plain prompt string; with inputs we hand it a single user + * `Content` whose `parts` interleave the inline/file image data with the + * text prompt last (Gemini conventionally treats the trailing text as the + * instruction). 
+ */ + private async buildContents( + prompt: string, + imageInputs?: ReadonlyArray>, + ): Promise> { + if (!imageInputs || imageInputs.length === 0) { + return prompt + } + const imageParts: Array = await Promise.all( + imageInputs.map((part) => this.imagePartToGeminiPart(part)), + ) + const parts: Array = [...imageParts, { text: prompt }] + return [{ role: 'user', parts }] + } + + private async imagePartToGeminiPart( + part: ImagePart, + ): Promise { + if (part.source.type === 'data') { + return { + inlineData: { + mimeType: part.source.mimeType || 'image/png', + data: part.source.value, + }, + } + } + // For URL sources, prefer passing the URL through as `fileData` when it + // looks like a Google Files API URI; otherwise fetch and inline as base64. + if ( + part.source.value.startsWith('gs://') || + /^https?:\/\/generativelanguage\.googleapis\.com\//.test( + part.source.value, + ) + ) { + return { + fileData: { + fileUri: part.source.value, + ...(part.source.mimeType && { mimeType: part.source.mimeType }), + }, + } + } + const response = await fetch(part.source.value) + if (!response.ok) { + throw new Error( + `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`, + ) + } + const blob = await response.blob() + const buffer = await blob.arrayBuffer() + const base64 = arrayBufferToBase64(buffer) + return { + inlineData: { + mimeType: part.source.mimeType || blob.type || 'image/png', + data: base64, + }, + } + } + private transformGeminiResponse( model: string, response: GenerateContentResponse, diff --git a/packages/typescript/ai-grok/src/adapters/image.ts b/packages/typescript/ai-grok/src/adapters/image.ts index 35f8cd224..081ed610c 100644 --- a/packages/typescript/ai-grok/src/adapters/image.ts +++ b/packages/typescript/ai-grok/src/adapters/image.ts @@ -61,6 +61,18 @@ export class GrokImageAdapter< ): Promise { const { model, prompt, numberOfImages, size, modelOptions } = options + if ( + options.imageInputs?.length || + 
options.videoInputs?.length || + options.audioInputs?.length + ) { + throw new Error( + `grok.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` + + `Image-conditioned generation requires the xAI Imagine API, which the current adapter ` + + `does not target (it uses the OpenAI-compat endpoint). Track progress at https://github.com/TanStack/ai/issues/618.`, + ) + } + validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) diff --git a/packages/typescript/ai-openai/src/adapters/image.ts b/packages/typescript/ai-openai/src/adapters/image.ts index 1380c7a26..334001832 100644 --- a/packages/typescript/ai-openai/src/adapters/image.ts +++ b/packages/typescript/ai-openai/src/adapters/image.ts @@ -3,6 +3,7 @@ import { BaseImageAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { generateId } from '@tanstack/ai-utils' import { getOpenAIApiKeyFromEnv } from '../utils/client' +import { imagePartToFile } from '../image/image-input-to-file' import { validateImageSize, validateNumberOfImages, @@ -12,6 +13,8 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type OpenAI_SDK from 'openai' import type { OpenAIImageModel } from '../model-meta' @@ -22,6 +25,15 @@ import type { } from '../image/image-provider-options' import type { OpenAIClientConfig } from '../utils/client' +// Per OpenAI docs: dall-e-2 accepts 1 image to `images.edit()`; gpt-image-1 +// and gpt-image-1-mini accept up to 16; dall-e-3 does not support edit at all. 
+const EDIT_MAX_IMAGES: Record = { + 'dall-e-2': 1, + 'gpt-image-1': 16, + 'gpt-image-1-mini': 16, + 'dall-e-3': 0, +} + /** * Configuration for OpenAI image adapter */ @@ -59,12 +71,44 @@ export class OpenAIImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - const { model, prompt, numberOfImages, size, modelOptions } = options + const { + model, + prompt, + numberOfImages, + size, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) + if (videoInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + ) + } + if (audioInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + ) + } + + if (imageInputs && imageInputs.length > 0) { + return this.editImages({ + model: model as OpenAIImageModel, + prompt, + numberOfImages, + size, + modelOptions, + imageInputs, + logger: options.logger, + }) + } + // With exactOptionalPropertyTypes, vendor SDK request shapes reject // `T | undefined` in optional fields. Build the request incrementally and // only set `size` when it's actually defined. @@ -138,6 +182,129 @@ export class OpenAIImageAdapter< throw error } } + + /** + * Image-conditioned generation via OpenAI's `images.edit()` endpoint. + * dall-e-2 accepts 1 input image; gpt-image-1 / gpt-image-1-mini accept up + * to 16; dall-e-3 rejects entirely. A part with `metadata.role === 'mask'` + * is routed to the SDK's `mask` field (PNG with alpha channel). 
+ */ + private async editImages(args: { + model: OpenAIImageModel + prompt: string + numberOfImages?: number + size?: string + modelOptions?: OpenAIImageProviderOptions + imageInputs: ReadonlyArray> + logger: ImageGenerationOptions['logger'] + }): Promise { + const { model, prompt, numberOfImages, size, modelOptions, logger } = args + const maxImages = EDIT_MAX_IMAGES[model] + if (maxImages === 0) { + throw new Error( + `${this.name}: model "${model}" does not support imageInputs. ` + + `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`, + ) + } + + const maskParts = args.imageInputs.filter( + (part) => part.metadata?.role === 'mask', + ) + const sourceParts = args.imageInputs.filter( + (part) => part.metadata?.role !== 'mask', + ) + + if (maskParts.length > 1) { + throw new Error( + `${this.name}: only one input with metadata.role === 'mask' is supported per request.`, + ) + } + if (sourceParts.length === 0) { + throw new Error( + `${this.name}: imageInputs contained only mask parts; at least one source image is required.`, + ) + } + if (sourceParts.length > maxImages) { + throw new Error( + `${this.name}: model "${model}" accepts at most ${maxImages} source image(s); received ${sourceParts.length}.`, + ) + } + + const sourceFiles = await Promise.all( + sourceParts.map((part, i) => imagePartToFile(part, `source-${i}`)), + ) + const maskFile = maskParts[0] + ? await imagePartToFile(maskParts[0], 'mask') + : undefined + + // `modelOptions` is typed across all four image models (including dall-e-3's + // `quality: 'hd' | 'standard'` which isn't valid for edit). dall-e-3 has + // already been rejected above, so any remaining quality value is valid for + // the edit endpoint — cast the spread to clear the union mismatch. + const request: OpenAI_SDK.Images.ImageEditParamsNonStreaming = { + model, + prompt, + image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles, + n: numberOfImages ?? 1, + stream: false, + ...((modelOptions ?? 
+ {}) as Partial), + } + if (size !== undefined) { + request.size = size as Exclude< + OpenAI_SDK.Images.ImageEditParamsNonStreaming['size'], + undefined + > + } + if (maskFile) { + request.mask = maskFile + } + + try { + logger.request( + `activity=imageEdit provider=${this.name} model=${model} n=${request.n ?? 1} size=${request.size ?? 'default'} sources=${sourceFiles.length}${maskFile ? ' mask' : ''}`, + { provider: this.name, model }, + ) + const response = await this.client.images.edit(request) + + const images: Array = (response.data ?? []).flatMap( + (item): Array => { + const revisedPromptField = + item.revised_prompt !== undefined + ? { revisedPrompt: item.revised_prompt } + : {} + if (item.b64_json) { + return [{ b64Json: item.b64_json, ...revisedPromptField }] + } + if (item.url) { + return [{ url: item.url, ...revisedPromptField }] + } + return [] + }, + ) + + return { + id: generateId(this.name), + model, + images, + ...(response.usage + ? { + usage: { + inputTokens: response.usage.input_tokens, + outputTokens: response.usage.output_tokens, + totalTokens: response.usage.total_tokens, + }, + } + : {}), + } + } catch (error: unknown) { + logger.errors(`${this.name}.editImages fatal`, { + error: toRunErrorPayload(error, `${this.name}.editImages failed`), + source: `${this.name}.editImages`, + }) + throw error + } + } } /** diff --git a/packages/typescript/ai-openai/src/adapters/video.ts b/packages/typescript/ai-openai/src/adapters/video.ts index 2bb9df046..8a1bc9c1b 100644 --- a/packages/typescript/ai-openai/src/adapters/video.ts +++ b/packages/typescript/ai-openai/src/adapters/video.ts @@ -3,6 +3,7 @@ import { BaseVideoAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { getOpenAIApiKeyFromEnv } from '../utils/client' +import { imagePartToFile } from '../image/image-input-to-file' import { toApiSeconds, validateVideoSeconds, @@ -88,15 
+89,38 @@ export class OpenAIVideoAdapter< options: VideoGenerationOptions, ): Promise { const { model, size, duration, modelOptions } = options + const { imageInputs, videoInputs, audioInputs } = options validateVideoSize(model, size) const seconds = duration ?? modelOptions?.seconds validateVideoSeconds(model, seconds) + if (videoInputs?.length) { + throw new Error( + `${this.name}.createVideoJob does not support videoInputs (model: ${model}).`, + ) + } + if (audioInputs?.length) { + throw new Error( + `${this.name}.createVideoJob does not support audioInputs (model: ${model}).`, + ) + } + if (imageInputs && imageInputs.length > 1) { + throw new Error( + `${this.name}: Sora accepts at most one input_reference image; received ${imageInputs.length}.`, + ) + } + const request: OpenAI_SDK.Videos.VideoCreateParams = { model: model as VideoModel, prompt: options.prompt, } + if (imageInputs && imageInputs[0]) { + // Sora's `input_reference` is a single Uploadable; convert TanStack + // ImagePart (URL or base64) → File before handing it to the SDK. + const file = await imagePartToFile(imageInputs[0], 'input-reference') + ;(request as { input_reference?: unknown }).input_reference = file + } // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we // narrow before assignment instead of casting from a `T | undefined` source. 
/** MIME type assumed when neither the part, the response, nor the URL says otherwise. */
const DEFAULT_MIME = 'image/png'

/** Known image MIME types mapped to the extension used in the upload filename. */
const MIME_TO_EXT: Record<string, string> = {
  'image/png': 'png',
  'image/jpeg': 'jpg',
  'image/jpg': 'jpg',
  'image/webp': 'webp',
  'image/gif': 'gif',
}

/**
 * Content types servers send for "some bytes, type unknown". They say nothing
 * about the image format, so the URL extension is a better signal.
 */
const GENERIC_BINARY_MIMES: ReadonlySet<string> = new Set([
  'application/octet-stream',
  'binary/octet-stream',
])

/**
 * Reduce a MIME string to its lowercase `type/subtype` essence, dropping any
 * parameters (`image/png; charset=binary` → `image/png`).
 */
function mimeEssence(mimeType: string): string {
  return (mimeType.split(';')[0] ?? '').trim().toLowerCase()
}

/**
 * Pick a filename extension for a MIME type.
 *
 * Known types come from `MIME_TO_EXT`. Unknown types fall back to the subtype
 * with any structured-syntax suffix removed (`image/svg+xml` → `svg`). An
 * empty or missing subtype yields `png`, so the filename never ends in a
 * bare dot.
 */
function extForMime(mimeType: string): string {
  const essence = mimeEssence(mimeType)
  const known = MIME_TO_EXT[essence]
  if (known) return known
  const subtype = essence.split('/')[1]?.split('+')[0]
  // `||` rather than `??`: an empty subtype ("image/") must also fall back.
  return subtype || 'png'
}

/**
 * Guard for runtimes without a global `File` constructor.
 *
 * @throws Error when `File` is not defined.
 */
function ensureFileSupport(): void {
  if (typeof File === 'undefined') {
    throw new Error(
      '`File` is not available in this environment. ' +
        'Image-conditioned generation requires Node 20+ or a browser context.',
    )
  }
}

/**
 * Convert a TanStack `ImagePart` into a `File` suitable for OpenAI uploads.
 *
 * - `source.type === 'data'`: the base64 value is decoded with
 *   `base64ToArrayBuffer` and wrapped in a `File`.
 * - otherwise the value is handed to `fetch()` and the response body becomes
 *   the file contents.
 *
 * MIME type precedence for fetched sources: the part's own `mimeType`, then
 * the response's content type (ignored when it is a generic binary type),
 * then the URL's file extension, then `image/png`.
 *
 * @param part - The image input to convert.
 * @param fallbackName - Filename stem; the extension is derived from the MIME type.
 * @returns A `File` named `<fallbackName>.<ext>` with the resolved MIME type.
 * @throws Error when `File` is unavailable or the fetch returns a non-OK status.
 */
export async function imagePartToFile(
  part: ImagePart<MediaInputMetadata>,
  fallbackName: string,
): Promise<File> {
  ensureFileSupport()

  if (part.source.type === 'data') {
    const mimeType = part.source.mimeType || DEFAULT_MIME
    const bytes = base64ToArrayBuffer(part.source.value)
    return new File([bytes], `${fallbackName}.${extForMime(mimeType)}`, {
      type: mimeType,
    })
  }

  const response = await fetch(part.source.value)
  if (!response.ok) {
    throw new Error(
      `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`,
    )
  }
  const blob = await response.blob()
  // Object stores often label images `application/octet-stream`; passing that
  // through would produce an `.octet-stream` upload with a non-image type.
  const responseMime = GENERIC_BINARY_MIMES.has(mimeEssence(blob.type))
    ? ''
    : blob.type
  const mimeType =
    part.source.mimeType || responseMime || inferMimeFromUrl(part.source.value)
  return new File([blob], `${fallbackName}.${extForMime(mimeType)}`, {
    type: mimeType,
  })
}

/**
 * Guess an image MIME type from a URL's file extension.
 *
 * The extension must be followed by `?`, `#`, or the end of the string.
 * Returns `image/png` when no known extension is found.
 */
function inferMimeFromUrl(url: string): string {
  const match = url.match(/\.(png|jpe?g|webp|gif)(?:\?|#|$)/i)
  if (!match || !match[1]) return DEFAULT_MIME
  const ext = match[1].toLowerCase()
  if (ext === 'jpg' || ext === 'jpeg') return 'image/jpeg'
  return `image/${ext}`
}
type: 'image', + source: { + type: 'data', + value: 'aGVsbG8=', + mimeType: 'image/png', + }, + }, + ], + logger: testLogger, + }) + + expect(generateSpy).not.toHaveBeenCalled() + expect(editSpy).toHaveBeenCalledTimes(1) + const editArgs = editSpy.mock.calls[0]![0] + expect(editArgs.model).toBe('gpt-image-1') + expect(editArgs.prompt).toBe('Make it cinematic') + expect(editArgs.image).toBeInstanceOf(File) + expect(result.images[0]!.b64Json).toBe('edited-base64') + }) + + it('rejects dall-e-3 with a clear error when imageInputs is present', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'dall-e-3', + ) + + await expect( + adapter.generateImages({ + model: 'dall-e-3', + prompt: 'edit', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/does not support imageInputs/) + }) + + it('rejects dall-e-2 when more than one source image is provided', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'dall-e-2', + ) + + await expect( + adapter.generateImages({ + model: 'dall-e-2', + prompt: 'edit', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + { + type: 'image', + source: { + type: 'data', + value: 'YnllCg==', + mimeType: 'image/png', + }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/at most 1 source image/) + }) + + it('routes metadata.role==="mask" to the mask param', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + const editSpy = adapter + .spyOnImagesEdit() + .mockResolvedValueOnce(imagesEditResponse) + + await adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'replace masked region', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + { + type: 'image', + source: { 
type: 'data', value: 'bWFzaw==', mimeType: 'image/png' }, + metadata: { role: 'mask' }, + }, + ], + logger: testLogger, + }) + + const editArgs = editSpy.mock.calls[0]![0] + expect(editArgs.mask).toBeInstanceOf(File) + expect(editArgs.image).toBeInstanceOf(File) + }) + + it('rejects videoInputs or audioInputs', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'x', + videoInputs: [ + { + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/videoInputs/) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'x', + audioInputs: [ + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/a.mp3' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/audioInputs/) + }) + }) }) diff --git a/packages/typescript/ai-openrouter/src/adapters/image.ts b/packages/typescript/ai-openrouter/src/adapters/image.ts index ac746ab3b..4d4e75772 100644 --- a/packages/typescript/ai-openrouter/src/adapters/image.ts +++ b/packages/typescript/ai-openrouter/src/adapters/image.ts @@ -64,6 +64,18 @@ export class OpenRouterImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { + if ( + options.imageInputs?.length || + options.videoInputs?.length || + options.audioInputs?.length + ) { + throw new Error( + `openrouter.generateImages does not yet support imageInputs / videoInputs / audioInputs. 
` + + `Image-conditioned generation via OpenRouter requires injecting parts into the multimodal ` + + `chat-completions messages array; this is tracked at https://github.com/TanStack/ai/issues/618.`, + ) + } + const { model, prompt, numberOfImages, size, modelOptions, logger } = options // Use provided aspect_ratio or derive from size diff --git a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md index 0cd507a56..bf7ccaf0f 100644 --- a/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/typescript/ai/skills/ai-core/media-generation/SKILL.md @@ -189,6 +189,95 @@ Result shape: `ImageGenerationResult` with `images` array where each entry has `b64Json?`, `url?`, and `revisedPrompt?`. OpenAI image URLs expire after 1 hour -- download or display immediately. +#### Image-conditioned generation: `imageInputs` / `videoInputs` / `audioInputs` + +Both `generateImage()` and `generateVideo()` accept multimodal conditioning +inputs that reuse the existing `ImagePart` / `VideoPart` / `AudioPart` +shape used elsewhere in TanStack AI. Each input may carry an optional +`metadata.role` hint that adapters use to route the part to the +provider-specific field. 
+ +```typescript +import { generateImage, type ImagePart } from '@tanstack/ai' +import { openaiImage } from '@tanstack/ai-openai' + +// Image-to-image (OpenAI gpt-image-1, dall-e-2) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Turn this into a cinematic product photo', + imageInputs: [ + { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, + ], +}) + +// Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Apply the second image as style to the first', + imageInputs: [ + { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, + { type: 'image', source: { type: 'url', value: 'https://…/style.png' } }, + ], +}) + +// Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Replace the masked region with a tree', + imageInputs: [ + { type: 'image', source: { type: 'url', value: photoUrl } }, + { + type: 'image', + source: { type: 'url', value: maskUrl }, + metadata: { role: 'mask' }, + }, + ], +}) + +// Image-to-video (OpenAI Sora: single input_reference; fal: image_url + optional end_image_url) +import { generateVideo } from '@tanstack/ai' +import { falVideo } from '@tanstack/ai-fal' + +await generateVideo({ + adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), + prompt: 'Slow cinematic push-in', + imageInputs: [ + { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { + type: 'image', + source: { type: 'url', value: lastFrameUrl }, + metadata: { role: 'end_frame' }, + }, + ], +}) +``` + +**Role hints** (`metadata.role`): + +| Role | Maps to | +| --------------- | ------------------------------------------------------------------------ | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | +| `'character'` | Same as `'reference'`; Veo 
`referenceImages` slot | +| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | +| `'start_frame'` | fal `start_image_url`; Veo `image` | +| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` | + +**Provider support matrix:** + +| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | +| ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | +| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | +| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | +| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | +| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | +| OpenRouter | Throws — multimodal injection pending. | n/a | +| Anthropic | n/a (no image generation API). | n/a | + +`videoInputs` and `audioInputs` follow the same `metadata.role` convention +for video-to-video and lipsync flows on fal; other providers throw when +they're passed. + ### 2. Audio Generation (Music, Sound Effects) Distinct from TTS — `generateAudio()` produces non-speech audio content. @@ -579,7 +668,45 @@ generateSpeech({ > Source: Gemini TTS adapter validation; CodeRabbit review of PR #463. -### h. LOW: Writing a logging middleware to see media chunks flow through +### h. 
HIGH: Passing `imageInputs` to a model that doesn't support image-conditioned generation + +Not every model accepts image-conditioned inputs. Adapters throw a clear +runtime error when the caller passes `imageInputs` to a model that +can't honor it (dall-e-3, Imagen, Grok, OpenRouter), so users learn at +call time rather than getting silently wrong output. + +```typescript +// WRONG — dall-e-3 has no edit/inputs API +generateImage({ + adapter: openaiImage('dall-e-3'), + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) // throws: model "dall-e-3" does not support imageInputs. + +// WRONG — Imagen is text-to-image only +generateImage({ + adapter: geminiImage('imagen-4.0-generate-001'), + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) // throws: Imagen does not support imageInputs. + +// CORRECT — use a model that supports edits/inputs +generateImage({ + adapter: openaiImage('gpt-image-1'), // edits up to 16 images + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) + +generateImage({ + adapter: geminiImage('gemini-3.1-flash-image-preview'), // native multimodal + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) +``` + +> Source: docs/media/image-generation.md, docs/media/video-generation.md. + +### i. 
LOW: Writing a logging middleware to see media chunks flow through Every media activity — `generateAudio`, `generateSpeech`, `generateTranscription`, `generateImage`, `generateVideo` — accepts the diff --git a/packages/typescript/ai/src/activities/generateImage/index.ts b/packages/typescript/ai/src/activities/generateImage/index.ts index b8d173b09..285fb6a56 100644 --- a/packages/typescript/ai/src/activities/generateImage/index.ts +++ b/packages/typescript/ai/src/activities/generateImage/index.ts @@ -11,7 +11,14 @@ import { resolveDebugOption } from '../../logger/resolve' import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { ImageAdapter } from './adapter' -import type { ImageGenerationResult, StreamChunk } from '../../types' +import type { + AudioPart, + ImageGenerationResult, + ImagePart, + MediaInputMetadata, + StreamChunk, + VideoPart, +} from '../../types' // =========================== // Activity Kind @@ -78,6 +85,17 @@ export type ImageActivityOptions< numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: ImageSizeForModel + /** + * Image conditioning inputs for image-to-image, reference-guided, edit, or + * multi-reference generation. Each part may carry `metadata.role` + * (`'reference' | 'mask' | 'control' | 'character'`) to disambiguate intent. + * Adapters that don't support image-conditioned generation throw clearly. + */ + imageInputs?: Array> + /** Video conditioning inputs. Provider support varies; unsupported adapters throw. */ + videoInputs?: Array> + /** Audio conditioning inputs. Provider support varies; unsupported adapters throw. */ + audioInputs?: Array> /** * Whether to stream the image generation result. * When true, returns an AsyncIterable for streaming transport. 
@@ -210,6 +228,9 @@ async function runGenerateImage< prompt: rest.prompt, numberOfImages: rest.numberOfImages, size: rest.size, + imageInputCount: rest.imageInputs?.length, + videoInputCount: rest.videoInputs?.length, + audioInputCount: rest.audioInputs?.length, modelOptions: rest.modelOptions, timestamp: startTime, }) diff --git a/packages/typescript/ai/src/activities/generateVideo/index.ts b/packages/typescript/ai/src/activities/generateVideo/index.ts index cee2339f7..ee9b2f9b0 100644 --- a/packages/typescript/ai/src/activities/generateVideo/index.ts +++ b/packages/typescript/ai/src/activities/generateVideo/index.ts @@ -14,8 +14,12 @@ import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { VideoAdapter } from './adapter' import type { + AudioPart, + ImagePart, + MediaInputMetadata, StreamChunk, VideoJobResult, + VideoPart, VideoStatusResult, VideoUrlResult, } from '../../types' @@ -89,6 +93,16 @@ export type VideoCreateOptions< size?: VideoSizeForAdapter /** Video duration in seconds */ duration?: number + /** + * Image conditioning inputs (start frame, end frame, reference / character + * images). Use `metadata.role` (`'start_frame' | 'end_frame' | 'reference' | + * 'character'`) to disambiguate intent; positional fallback otherwise. + */ + imageInputs?: Array> + /** Video conditioning inputs (video-to-video, source clip). */ + videoInputs?: Array> + /** Audio conditioning inputs (lipsync source, voice reference). */ + audioInputs?: Array> /** * Whether to stream the video generation lifecycle. 
* When true, returns an AsyncIterable that handles the full @@ -249,7 +263,16 @@ export function generateVideo< async function runCreateVideoJob< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): Promise { - const { adapter, prompt, size, duration, modelOptions } = options + const { + adapter, + prompt, + size, + duration, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options const model = adapter.model const logger: InternalLogger = resolveDebugOption(options.debug) const providerName = @@ -269,6 +292,9 @@ async function runCreateVideoJob< size, duration, modelOptions, + imageInputs, + videoInputs, + audioInputs, logger, }) logger.output(`activity=generateVideo jobId=${result.jobId}`, { @@ -296,7 +322,16 @@ function sleep(ms: number): Promise { async function* runStreamingVideoGeneration< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): AsyncIterable { - const { adapter, prompt, size, duration, modelOptions } = options + const { + adapter, + prompt, + size, + duration, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options const model = adapter.model const runId = options.runId ?? createId('run') const pollingInterval = options.pollingInterval ?? 2000 @@ -332,6 +367,9 @@ async function* runStreamingVideoGeneration< size, duration, modelOptions, + imageInputs, + videoInputs, + audioInputs, logger, }) diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts index a12964981..5528ca9f9 100644 --- a/packages/typescript/ai/src/types.ts +++ b/packages/typescript/ai/src/types.ts @@ -1409,6 +1409,31 @@ export interface SummarizationResult { // Image Generation Types // ============================================================================ +/** + * Optional role hint on a media input part (image / video / audio). Adapters + * read `metadata.role` to route the part to the provider-specific request + * field — e.g. 
`'mask'` → OpenAI `mask` / fal `mask_url`, `'end_frame'` → fal + * `end_image_url`, `'reference'` → fal `reference_image_urls`. When omitted + * the adapter falls back to positional routing. + */ +export type MediaInputRole = + | 'reference' + | 'mask' + | 'control' + | 'start_frame' + | 'end_frame' + | 'character' + +/** + * Metadata convention for image / video / audio inputs to media generation. + * Carried on `ImagePart.metadata` / `VideoPart.metadata` / `AudioPart.metadata` + * when used as conditioning inputs to `generateImage()` or `generateVideo()`. + */ +export interface MediaInputMetadata { + /** Optional role hint disambiguating the part's intent for the adapter */ + role?: MediaInputRole +} + /** * Options for image generation. * These are the common options supported across providers. @@ -1425,6 +1450,25 @@ export interface ImageGenerationOptions< numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: TSize + /** + * Image conditioning inputs (reference / mask / control / start frame / + * character). Reuses the multimodal `ImagePart` shape. Adapters map these + * onto the provider-native request — e.g. OpenAI `images.edit()`, Gemini + * multimodal `contents`, fal `image_url` / `image_urls` / `mask_url`. + * Adapters that do not support image-conditioned generation throw a clear + * runtime error when this field is non-empty. + */ + imageInputs?: Array> + /** + * Video conditioning inputs (video-to-video, edit, lipsync source). + * Not all providers support this; adapters throw when unsupported. + */ + videoInputs?: Array> + /** + * Audio conditioning inputs (audio reference, voice cloning, lipsync). + * Not all providers support this; adapters throw when unsupported. 
+ */ + audioInputs?: Array> /** Model-specific options for image generation */ modelOptions?: TProviderOptions /** @@ -1555,6 +1599,24 @@ export interface VideoGenerationOptions< size?: TSize /** Video duration in seconds */ duration?: number + /** + * Image conditioning inputs (start frame, end frame, character / reference + * images). Reuses the multimodal `ImagePart` shape; adapters route by + * `metadata.role` and array position (e.g. OpenAI Sora `input_reference`, + * fal `image_url` / `end_image_url`, Veo `image` / `lastFrame` / + * `referenceImages`). Adapters throw at runtime if unsupported. + */ + imageInputs?: Array> + /** + * Video conditioning inputs (video-to-video edit, source clip). + * Not all providers support this; adapters throw when unsupported. + */ + videoInputs?: Array> + /** + * Audio conditioning inputs (lipsync source, voice reference). + * Not all providers support this; adapters throw when unsupported. + */ + audioInputs?: Array> /** Model-specific options for video generation */ modelOptions?: TProviderOptions /** diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index 3b464be5b..89c632208 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -142,9 +142,19 @@ export const matrix: Record> = { ]), // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format 'image-gen': new Set(['openai', 'grok']), + // image-to-image (imageInputs on generateImage) routes adapters to wire + // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini + // multimodal `generateContent`, fal endpoint-specific input fields). + // Adapter-level mapping is covered by unit tests. Populate this set when + // aimock gains support for those endpoints. 
+ 'image-to-image': new Set([]), tts: new Set(['openai', 'grok']), transcription: new Set(['openai', 'grok']), 'video-gen': new Set(['openai']), + // image-to-video (imageInputs on generateVideo) similarly depends on + // aimock mocking Sora's `input_reference` upload field. Populate when + // aimock support lands. + 'image-to-video': new Set([]), } export function isSupported(provider: Provider, feature: Feature): boolean { diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts index be405c74d..dc839fcc4 100644 --- a/testing/e2e/src/lib/types.ts +++ b/testing/e2e/src/lib/types.ts @@ -27,9 +27,11 @@ export type Feature = | 'summarize' | 'summarize-stream' | 'image-gen' + | 'image-to-image' | 'tts' | 'transcription' | 'video-gen' + | 'image-to-video' export const ALL_PROVIDERS: Provider[] = [ 'openai', @@ -59,7 +61,9 @@ export const ALL_FEATURES: Feature[] = [ 'summarize', 'summarize-stream', 'image-gen', + 'image-to-image', 'tts', 'transcription', 'video-gen', + 'image-to-video', ]