From ae7766bc92312e6b5c5486f572b06f41d6788872 Mon Sep 17 00:00:00 2001 From: Zain Hasan Date: Fri, 1 May 2026 13:54:48 -0700 Subject: [PATCH 1/5] cleanup readme --- README.md | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/README.md b/README.md index b7e5bb3..562fa59 100644 --- a/README.md +++ b/README.md @@ -143,43 +143,6 @@ python skills/together-batch-inference/scripts/batch_workflow.py Scripts use the **Together Python v2 SDK** (`together>=2.0.0`) with keyword-only arguments, updated method names, and current response shapes. -## Skill Structure - -``` -togetherai-skills/ -├── quality/ -│ └── trigger-evals/ # Skill trigger test sets -├── scripts/ # Repo tooling, generators, validators -└── skills/ - └── together-/ - ├── SKILL.md # Core instructions (always loaded on trigger) - ├── agents/ - │ └── openai.yaml # OpenAI/Codex interface metadata - ├── references/ # Detailed docs (loaded when needed) - │ ├── models.md # Supported models, IDs, context lengths - │ ├── api-reference.md - │ └── ... - └── scripts/ # Runnable Python examples (v2 SDK) - └── .py -``` - -### How skills are loaded - -1. **Metadata** (YAML frontmatter) — Always available to the agent (~100 words). Used to decide whether to load the skill. -2. **Body** (Markdown) — Loaded when the skill is triggered. It should stay lean and focus on routing, high-signal rules, and the next resource to open. -3. **References** — Loaded on demand when the agent needs deeper detail (model lists, full API specs). -4. **Scripts** — Available as runnable code that the agent can reference or execute directly. -5. **OpenAI metadata** — `agents/openai.yaml` gives OpenAI/Codex surfaces a display name, short description, and default prompt. 
- -## Quality Guardrails - -This repo now treats skills as agent artifacts rather than long tutorials: - -- `SKILL.md` files are intentionally short and routing-oriented -- Long references include a `## Contents` section near the top -- Each skill has trigger eval examples in `quality/trigger-evals/` -- Multi-step Python workflows are validated for current v2 SDK usage and safer tempfile handling - ## SDK Compatibility > **Version bump:** This repo now requires `together>=2.0.0`. If you are upgrading from v1, see the [migration guide](https://docs.together.ai/docs/v2-migration-guide) for breaking changes in method names, argument styles, and response shapes. From e50c36d1c6570f35f05acc9fd026d6de33d0c5d3 Mon Sep 17 00:00:00 2001 From: Zain Hasan Date: Fri, 1 May 2026 13:56:23 -0700 Subject: [PATCH 2/5] wip --- .../together-batch-inference/references/api-reference.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/skills/together-batch-inference/references/api-reference.md b/skills/together-batch-inference/references/api-reference.md index c13c5ca..65f509b 100644 --- a/skills/together-batch-inference/references/api-reference.md +++ b/skills/together-batch-inference/references/api-reference.md @@ -238,14 +238,10 @@ curl -X GET "https://api.together.xyz/v1/batches" \ ## Models with 50% Discount -- `Qwen/Qwen2.5-7B-Instruct-Turbo` - `meta-llama/Llama-3.3-70B-Instruct-Turbo` -- `meta-llama/Llama-3-70b-chat-hf` -- `mistralai/Mixtral-8x7B-Instruct-v0.1` -- `zai-org/GLM-4.5-Air-FP8` -- `openai/whisper-large-v3` -All serverless models support batch processing — models not listed have no discount. + +All serverless models support batch processing — models not listed have no discount. The 50% discount does not apply to dedicated endpoint usage. 
## Rate Limits From 86588f29bae1ef9de933a20e3296a2f65f09a766 Mon Sep 17 00:00:00 2001 From: Zain Hasan Date: Fri, 1 May 2026 13:57:50 -0700 Subject: [PATCH 3/5] update chat completions skills --- .../references/function-calling-patterns.md | 10 +- .../references/models.md | 47 +++---- .../references/reasoning-models.md | 128 +++++------------- .../references/structured-outputs.md | 23 ++-- .../scripts/reasoning_models.py | 2 +- .../scripts/reasoning_models.ts | 2 +- .../scripts/structured_outputs.py | 2 +- .../scripts/structured_outputs.ts | 2 +- 8 files changed, 75 insertions(+), 141 deletions(-) diff --git a/skills/together-chat-completions/references/function-calling-patterns.md b/skills/together-chat-completions/references/function-calling-patterns.md index 9a4ff55..938770d 100644 --- a/skills/together-chat-completions/references/function-calling-patterns.md +++ b/skills/together-chat-completions/references/function-calling-patterns.md @@ -705,8 +705,8 @@ const final = await together.chat.completions.create({ ## Supported Models -openai/gpt-oss-120b, openai/gpt-oss-20b, moonshotai/Kimi-K2.5, zai-org/GLM-5, zai-org/GLM-4.5-Air-FP8, -MiniMaxAI/MiniMax-M2.5, Qwen/Qwen3-Next-80B-A3B-Instruct, Qwen/Qwen3.5-397B-A17B, -Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8, deepseek-ai/DeepSeek-R1, deepseek-ai/DeepSeek-V3, -meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8, meta-llama/Llama-3.3-70B-Instruct-Turbo, -Qwen/Qwen2.5-7B-Instruct-Turbo, mistralai/Mistral-Small-24B-Instruct-2501 +openai/gpt-oss-120b, openai/gpt-oss-20b, moonshotai/Kimi-K2.6, moonshotai/Kimi-K2.5, +zai-org/GLM-5.1, zai-org/GLM-5, MiniMaxAI/MiniMax-M2.7, Qwen/Qwen3.5-397B-A17B, +Qwen/Qwen3.5-9B, Qwen/Qwen3.6-Plus, Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8, +Qwen/Qwen3-235B-A22B-Instruct-2507-tput, deepseek-ai/DeepSeek-V4-Pro, +meta-llama/Llama-3.3-70B-Instruct-Turbo, Qwen/Qwen2.5-7B-Instruct-Turbo, google/gemma-4-31B-it diff --git a/skills/together-chat-completions/references/models.md 
b/skills/together-chat-completions/references/models.md index e0e0bc0..d422032 100644 --- a/skills/together-chat-completions/references/models.md +++ b/skills/together-chat-completions/references/models.md @@ -4,39 +4,37 @@ | Use Case | Model | API String | Alternatives | |----------|-------|-----------|-------------| -| Chat (best) | Kimi K2.5 (instant) | `moonshotai/Kimi-K2.5` | `deepseek-ai/DeepSeek-V3.1`, `openai/gpt-oss-120b` | -| Reasoning | Kimi K2.5 (thinking) | `moonshotai/Kimi-K2.5` | `deepseek-ai/DeepSeek-R1` | -| Coding Agents | Kimi K2.5 (thinking) | `moonshotai/Kimi-K2.5` | `Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8`, `deepseek-ai/DeepSeek-V3.1` | -| Small & Fast | GPT-OSS 20B | `openai/gpt-oss-20b` | `Qwen/Qwen2.5-7B-Instruct-Turbo` | -| Medium General | GPT-OSS 120B | `openai/gpt-oss-120b` | `zai-org/GLM-4.5-Air-FP8` | -| Function Calling | GLM-5 | `zai-org/GLM-5` | `moonshotai/Kimi-K2.5` | -| Vision | Kimi K2.5 | `moonshotai/Kimi-K2.5` | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` | +| Chat (best) | Kimi K2.6 | `moonshotai/Kimi-K2.6` | `MiniMaxAI/MiniMax-M2.7`, `openai/gpt-oss-120b` | +| Reasoning | DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | `moonshotai/Kimi-K2.6`, `Qwen/Qwen3.6-Plus` | +| Coding Agents | Qwen3-Coder 480B | `Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8` | `moonshotai/Kimi-K2.6`, `deepseek-ai/DeepSeek-V4-Pro` | +| Small & Fast | GPT-OSS 20B | `openai/gpt-oss-20b` | `Qwen/Qwen2.5-7B-Instruct-Turbo`, `google/gemma-3n-E4B-it` | +| Medium General | GPT-OSS 120B | `openai/gpt-oss-120b` | `zai-org/GLM-5` | +| Function Calling | GLM-5.1 | `zai-org/GLM-5.1` | `moonshotai/Kimi-K2.6`, `Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8` | +| Vision | Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | `moonshotai/Kimi-K2.5`, `google/gemma-4-31B-it` | ## Full Chat Model Catalog | Organization | Model | API String | Context | Quant | |-------------|-------|-----------|---------|-------| -| Moonshot | Kimi K2.5 | `moonshotai/Kimi-K2.5` | 262,144 | INT4 | -| 
Qwen | Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | 262,144 | BF16 | -| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 128,000 | BF16 | +| MiniMax | MiniMax M2.7 | `MiniMaxAI/MiniMax-M2.7` | 202,752 | FP4 | +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 262,144 | BF16 | +| Qwen | Qwen3.6 Plus | `Qwen/Qwen3.6-Plus` | 1,000,000 | - | +| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 262,144 | FP8 | | Qwen | Qwen3-Coder 480B | `Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8` | 256,000 | FP8 | -| Qwen | Qwen3-Coder-Next | `Qwen/Qwen3-Coder-Next-FP8` | 262,144 | FP8 | | Qwen | Qwen3 235B Instruct | `Qwen/Qwen3-235B-A22B-Instruct-2507-tput` | 262,144 | FP8 | -| Qwen | Qwen3-Next 80B Instruct | `Qwen/Qwen3-Next-80B-A3B-Instruct` | 262,144 | BF16 | -| MiniMax | MiniMax M2.5 | `MiniMaxAI/MiniMax-M2.5` | 228,700 | FP4 | -| DeepSeek | DeepSeek-V3.1 | `deepseek-ai/DeepSeek-V3.1` | 128,000 | FP8 | -| DeepSeek | DeepSeek-R1 | `deepseek-ai/DeepSeek-R1` | 163,839 | FP8 | +| Moonshot | Kimi K2.6 | `moonshotai/Kimi-K2.6` | 262,144 | FP4 | +| Moonshot | Kimi K2.5 | `moonshotai/Kimi-K2.5` | 262,144 | FP4 | +| DeepSeek | DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | 512,000 | FP4 | | OpenAI | GPT-OSS 120B | `openai/gpt-oss-120b` | 128,000 | MXFP4 | | OpenAI | GPT-OSS 20B | `openai/gpt-oss-20b` | 128,000 | MXFP4 | +| Z.ai | GLM-5.1 | `zai-org/GLM-5.1` | 202,752 | FP4 | | Z.ai | GLM-5 | `zai-org/GLM-5` | 202,752 | FP4 | -| Z.ai | GLM 4.7 | `zai-org/GLM-4.7` | 202,752 | FP8 | -| Z.ai | GLM 4.5 Air | `zai-org/GLM-4.5-Air-FP8` | 131,072 | FP8 | -| Meta | Llama 4 Maverick | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` | 1,048,576 | FP8 | | Meta | Llama 3.3 70B Turbo | `meta-llama/Llama-3.3-70B-Instruct-Turbo` | 131,072 | FP8 | -| Deep Cogito | Cogito v2.1 671B | `deepcogito/cogito-v2-1-671b` | 32,768 | FP8 | -| Mistral | Mistral Small 24B | `mistralai/Mistral-Small-24B-Instruct-2501` | 32,768 | FP16 | -| Mistral | Mistral 7B v0.2 | `mistralai/Mistral-7B-Instruct-v0.2` | 32,768 | FP16 | +| Meta 
| Llama 3 8B Lite | `meta-llama/Meta-Llama-3-8B-Instruct-Lite` | 8,192 | - | +| Deep Cogito | Cogito v2.1 671B | `deepcogito/cogito-v2-1-671b` | 163,840 | - | +| Google | Gemma 4 31B IT | `google/gemma-4-31B-it` | 262,144 | FP8 | | Google | Gemma 3N E4B | `google/gemma-3n-E4B-it` | 32,768 | FP8 | +| Liquid AI | LFM2-24B-A2B | `LiquidAI/LFM2-24B-A2B` | 32,768 | - | | Qwen | Qwen 2.5 7B Turbo | `Qwen/Qwen2.5-7B-Instruct-Turbo` | 32,768 | FP8 | | Essential AI | Rnj-1 Instruct | `essentialai/rnj-1-instruct` | 32,768 | BF16 | @@ -44,19 +42,18 @@ | Organization | Model | API String | Context | |-------------|-------|-----------|---------| +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 262,144 | +| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 262,144 | +| Google | Gemma 4 31B IT | `google/gemma-4-31B-it` | 262,144 | | Moonshot | Kimi K2.5 | `moonshotai/Kimi-K2.5` | 262,144 | -| Meta | Llama 4 Maverick | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` | 524,288 | -| Qwen | Qwen3-VL-8B | `Qwen/Qwen3-VL-8B-Instruct` | 262,100 | ## Moderation Models | Model | API String | Context | |-------|-----------|---------| | Llama Guard 4 (12B) | `meta-llama/Llama-Guard-4-12B` | 1,048,576 | -| Virtue Guard | `VirtueAI/VirtueGuard-Text-Lite` | 32,768 | ## Quantization Types - **FP16/BF16:** Full precision - **FP8:** 8-bit floating point (Turbo models) - **FP4/MXFP4:** 4-bit floating point -- **INT4:** 4-bit integer (Lite models) diff --git a/skills/together-chat-completions/references/reasoning-models.md b/skills/together-chat-completions/references/reasoning-models.md index 5d2cbd7..94d3228 100644 --- a/skills/together-chat-completions/references/reasoning-models.md +++ b/skills/together-chat-completions/references/reasoning-models.md @@ -14,15 +14,17 @@ | Model | API String | Type | Context | Tool Calling | |-------|-----------|------|---------|--------------| -| DeepSeek R1 | `deepseek-ai/DeepSeek-R1` | Reasoning only | 164K | No | -| DeepSeek V3.1 | 
`deepseek-ai/DeepSeek-V3.1` | Hybrid (off by default) | 164K | Non-reasoning only | +| DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | Hybrid (on by default) | 512K | Yes | +| GLM-5.1 | `zai-org/GLM-5.1` | Hybrid (on by default) | 200K | Yes | | GLM-5 | `zai-org/GLM-5` | Hybrid (on by default) | 200K | Yes | | GPT-OSS 120B | `openai/gpt-oss-120b` | Adjustable effort | 128K | No | | GPT-OSS 20B | `openai/gpt-oss-20b` | Adjustable effort | 128K | No | -| Kimi K2.5 | `moonshotai/Kimi-K2.5` | Hybrid (on by default) | 256K | Yes | -| MiniMax M2.5 | `MiniMaxAI/MiniMax-M2.5` | Reasoning only | 229K | No | -| Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | Hybrid (on by default) | 128K | No | -| Qwen3.5 9B | `Qwen/Qwen3.5-9B` | Hybrid (on by default) | 128K | No | +| Kimi K2.6 | `moonshotai/Kimi-K2.6` | Hybrid (on by default) | 262K | Yes | +| Kimi K2.5 | `moonshotai/Kimi-K2.5` | Hybrid (on by default) | 262K | Yes | +| MiniMax M2.7 | `MiniMaxAI/MiniMax-M2.7` | Reasoning only | 202K | Yes | +| Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | Hybrid (on by default) | 262K | Yes | +| Qwen3.5 9B | `Qwen/Qwen3.5-9B` | Hybrid (on by default) | 262K | Yes | +| Qwen3.6 Plus | `Qwen/Qwen3.6-Plus` | Hybrid (on by default) | 1M | Yes | **Type definitions:** - **Reasoning only**: Always produces reasoning tokens. Cannot be toggled off. @@ -100,15 +102,15 @@ curl -X POST "https://api.together.xyz/v1/chat/completions" \ Hybrid models support `reasoning={"enabled": True/False}` to toggle reasoning on or off. 
**Models supporting this parameter:** -- `deepseek-ai/DeepSeek-V3.1` (off by default) +- `deepseek-ai/DeepSeek-V4-Pro` (on by default) - `Qwen/Qwen3.5-397B-A17B` (on by default) - `Qwen/Qwen3.5-9B` (on by default) +- `Qwen/Qwen3.6-Plus` (on by default) +- `moonshotai/Kimi-K2.6` (on by default) - `moonshotai/Kimi-K2.5` (on by default) +- `zai-org/GLM-5.1` (on by default) - `zai-org/GLM-5` (on by default) -Note: For DeepSeek V3.1, function calling only works in non-reasoning mode -(`reasoning={"enabled": False}`). - ### Python -- Enable Reasoning ```python @@ -117,7 +119,7 @@ from together import Together client = Together() stream = client.chat.completions.create( - model="moonshotai/Kimi-K2.5", + model="moonshotai/Kimi-K2.6", messages=[ {"role": "user", "content": "Which number is bigger, 9.11 or 9.9? Think carefully."}, ], @@ -155,7 +157,7 @@ type ReasoningDelta = ChatCompletionChunk.Choice.Delta & { }; const params: ReasoningParams = { - model: "moonshotai/Kimi-K2.5", + model: "moonshotai/Kimi-K2.6", messages: [ { role: "user", @@ -181,7 +183,7 @@ for await (const chunk of stream) { ```python response = client.chat.completions.create( - model="moonshotai/Kimi-K2.5", + model="moonshotai/Kimi-K2.6", messages=[{"role": "user", "content": "What is the capital of France?"}], reasoning={"enabled": False}, temperature=0.6, @@ -202,13 +204,13 @@ response = client.chat.completions.create( ## Controlling Reasoning Depth via Prompting -For models without `reasoning_effort` (e.g., DeepSeek R1), influence thinking depth through the +For hybrid models without `reasoning_effort`, influence thinking depth through the prompt: ```python # Ask for concise reasoning response = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + model="deepseek-ai/DeepSeek-V4-Pro", messages=[ { "role": "user", @@ -220,7 +222,7 @@ response = client.chat.completions.create( # Or suggest a reasoning budget response = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + 
model="deepseek-ai/DeepSeek-V4-Pro", messages=[ { "role": "user", @@ -239,14 +241,14 @@ response = client.chat.completions.create( ### Separate reasoning field (most models) -Models like Kimi K2.5, GLM-5, DeepSeek V3.1, GPT-OSS, and Qwen3.5 return reasoning in a dedicated +Models like Kimi K2.6, GLM-5.1, DeepSeek-V4-Pro, GPT-OSS, and Qwen3.5 return reasoning in a dedicated `reasoning` field on the response message or streaming delta. **Non-streaming (Python):** ```python response = client.chat.completions.create( - model="moonshotai/Kimi-K2.5", + model="moonshotai/Kimi-K2.6", messages=[{"role": "user", "content": "Say test 10 times"}], ) print("Reasoning:", response.choices[0].message.reasoning) @@ -257,7 +259,7 @@ print("Answer:", response.choices[0].message.content) ```typescript const response = await together.chat.completions.create({ - model: "moonshotai/Kimi-K2.5", + model: "moonshotai/Kimi-K2.6", messages: [{ role: "user", content: "Say test 10 times" }], } as any); @@ -269,7 +271,7 @@ console.log("Answer:", response.choices[0].message.content); ```python stream = client.chat.completions.create( - model="moonshotai/Kimi-K2.5", + model="moonshotai/Kimi-K2.6", messages=[{"role": "user", "content": "Which number is bigger, 9.11 or 9.9?"}], stream=True, ) @@ -289,7 +291,7 @@ for chunk in stream: import type { ChatCompletionChunk } from "together-ai/resources/chat/completions"; const stream = await together.chat.completions.stream({ - model: "moonshotai/Kimi-K2.5", + model: "moonshotai/Kimi-K2.6", messages: [ { role: "user", content: "Which number is bigger, 9.11 or 9.9?" }, ], @@ -304,66 +306,6 @@ for await (const chunk of stream) { } ``` -### DeepSeek R1 -- `` tags in content - -DeepSeek R1 outputs reasoning inside `` tags within the `content` field. 
- -**Streaming (Python):** - -```python -stream = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", - messages=[{"role": "user", "content": "Which number is bigger 9.9 or 9.11?"}], - stream=True, -) -for chunk in stream: - print(chunk.choices[0].delta.content or "", end="", flush=True) -``` - -**Streaming (TypeScript):** - -```typescript -const stream = await together.chat.completions.create({ - model: "deepseek-ai/DeepSeek-R1", - messages: [{ role: "user", content: "Which number is bigger 9.9 or 9.11?" }], - stream: true, -}); - -for await (const chunk of stream) { - process.stdout.write(chunk.choices[0]?.delta?.content || ""); -} -``` - -Output: -``` - -Let me compare 9.9 and 9.11... -9.9 = 9.90, and 9.90 > 9.11 - - -**Answer:** 9.9 is bigger. -``` - -**Parsing `` tags (Python):** - -```python -import re - -content = response.choices[0].message.content -think_match = re.search(r"(.*?)", content, re.DOTALL) -thinking = think_match.group(1).strip() if think_match else "" -answer = re.sub(r".*?", "", content, flags=re.DOTALL).strip() -``` - -**Parsing `` tags (TypeScript):** - -```typescript -const content = response.choices[0].message.content ?? ""; -const thinkMatch = content.match(/([\s\S]*?)<\/think>/); -const thinking = thinkMatch ? 
thinkMatch[1].trim() : ""; -const answer = content.replace(/[\s\S]*?<\/think>/, "").trim(); -``` - ## Structured Outputs with Reasoning Reasoning models support JSON mode for structured output extraction: @@ -386,7 +328,7 @@ class MathReasoning(BaseModel): final_answer: str completion = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + model="deepseek-ai/DeepSeek-V4-Pro", messages=[ { "role": "system", @@ -428,7 +370,7 @@ const mathReasoningSchema = z.object({ const jsonSchema = z.toJSONSchema(mathReasoningSchema); const completion = await together.chat.completions.create({ - model: "deepseek-ai/DeepSeek-R1", + model: "deepseek-ai/DeepSeek-V4-Pro", messages: [ { role: "system", @@ -454,22 +396,18 @@ if (completion?.choices?.[0]?.message?.content) { ## Best Practices by Model -### DeepSeek R1 -- **Temperature:** 0.5-0.7 (recommended 0.6) -- **Top-p:** 0.95 recommended -- **System prompts:** Omit -- put all instructions in user message -- **Prompting:** High-level objectives, let model determine methodology -- **Few-shot:** Avoid -- consistently degrades performance -- **Chain-of-thought:** Do not prompt "think step by step" (model already reasons) -- **Math tasks:** Include "put your final answer within \boxed{}" in prompt -- Avoid micromanaging reasoning steps - -### Kimi K2.5 +### DeepSeek-V4-Pro +- Hybrid reasoning model with very long context (512K) +- Toggle reasoning via `reasoning={"enabled": True/False}` +- Strong performance on math, code, and agentic tool use +- Avoid micromanaging reasoning steps -- let the model determine methodology + +### Kimi K2.6 / K2.5 - Temperature 1.0 for thinking mode, 0.6 for instant mode - Supports both reasoning and non-reasoning modes - Excels at multi-turn tool calling with reasoning interleaved -### GLM-5 +### GLM-5.1 / GLM-5 - Thinking is enabled by default - Supports Preserved Thinking: set `"clear_thinking": false` in `chat_template_kwargs` - Preserved Thinking retains reasoning across turns for better 
agentic workflows diff --git a/skills/together-chat-completions/references/structured-outputs.md b/skills/together-chat-completions/references/structured-outputs.md index a6a81d2..166d7f7 100644 --- a/skills/together-chat-completions/references/structured-outputs.md +++ b/skills/together-chat-completions/references/structured-outputs.md @@ -358,7 +358,7 @@ class MathReasoning(BaseModel): final_answer: str completion = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + model="deepseek-ai/DeepSeek-V4-Pro", messages=[ { "role": "system", @@ -400,7 +400,7 @@ const mathReasoningSchema = z.object({ const jsonSchema = z.toJSONSchema(mathReasoningSchema); const completion = await together.chat.completions.create({ - model: "deepseek-ai/DeepSeek-R1", + model: "deepseek-ai/DeepSeek-V4-Pro", messages: [ { role: "system", @@ -508,23 +508,22 @@ console.log(`Title: ${result.title}`); ### Top Models (json_schema, json_object, regex) - `openai/gpt-oss-120b` - `openai/gpt-oss-20b` +- `moonshotai/Kimi-K2.6` - `moonshotai/Kimi-K2.5` +- `zai-org/GLM-5.1` - `zai-org/GLM-5` -- `zai-org/GLM-4.5-Air-FP8` -- `MiniMaxAI/MiniMax-M2.5` +- `MiniMaxAI/MiniMax-M2.7` - `Qwen/Qwen3.5-397B-A17B` +- `Qwen/Qwen3.6-Plus` - `Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8` -- `deepseek-ai/DeepSeek-R1` -- `deepseek-ai/DeepSeek-V3` -- `deepseek-ai/DeepSeek-V3.1` -- `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` +- `deepseek-ai/DeepSeek-V4-Pro` ### Additional Supported Models - `meta-llama/Llama-3.3-70B-Instruct-Turbo` - `Qwen/Qwen2.5-7B-Instruct-Turbo` +- `Qwen/Qwen3.5-9B` +- `google/gemma-4-31B-it` - `google/gemma-3n-E4B-it` -- `mistralai/Mistral-Small-24B-Instruct-2501` -- `mistralai/Mistral-7B-Instruct-v0.2` ## Troubleshooting @@ -541,5 +540,5 @@ console.log(`Title: ${result.title}`); 3. Use `json_schema` mode when you need guaranteed structure 4. Use `json_object` for simpler cases where prompt guidance is sufficient 5. 
Use `regex` mode for simple constrained outputs (classification, IDs, phone numbers) -6. Works with vision models (e.g., `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`) -7. Works with reasoning models (e.g., `deepseek-ai/DeepSeek-R1`) +6. Works with vision models (e.g., `Qwen/Qwen3.5-397B-A17B`) +7. Works with reasoning models (e.g., `deepseek-ai/DeepSeek-V4-Pro`) diff --git a/skills/together-chat-completions/scripts/reasoning_models.py b/skills/together-chat-completions/scripts/reasoning_models.py index 9bffe62..6ad915e 100644 --- a/skills/together-chat-completions/scripts/reasoning_models.py +++ b/skills/together-chat-completions/scripts/reasoning_models.py @@ -61,7 +61,7 @@ def deepseek_r1_think_tags() -> None: """DeepSeek R1 outputs reasoning in tags within content.""" print("=== DeepSeek R1 ( tags) ===") stream = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + model="deepseek-ai/DeepSeek-V4-Pro", messages=[ {"role": "user", "content": "Which number is bigger 9.9 or 9.11?"}, ], diff --git a/skills/together-chat-completions/scripts/reasoning_models.ts b/skills/together-chat-completions/scripts/reasoning_models.ts index efd81e4..00d9621 100644 --- a/skills/together-chat-completions/scripts/reasoning_models.ts +++ b/skills/together-chat-completions/scripts/reasoning_models.ts @@ -62,7 +62,7 @@ async function deepseekR1ThinkTags(): Promise { console.log("=== DeepSeek R1 ( tags) ==="); const stream = await client.chat.completions.create({ - model: "deepseek-ai/DeepSeek-R1", + model: "deepseek-ai/DeepSeek-V4-Pro", messages: [ { role: "user", content: "Which number is bigger 9.9 or 9.11?" 
}, ], diff --git a/skills/together-chat-completions/scripts/structured_outputs.py b/skills/together-chat-completions/scripts/structured_outputs.py index 3b3ef4e..f73bddd 100644 --- a/skills/together-chat-completions/scripts/structured_outputs.py +++ b/skills/together-chat-completions/scripts/structured_outputs.py @@ -114,7 +114,7 @@ def reasoning_json_example() -> None: """Extract structured JSON from a reasoning model.""" print("=== json_schema + reasoning model ===") completion = client.chat.completions.create( - model="deepseek-ai/DeepSeek-R1", + model="deepseek-ai/DeepSeek-V4-Pro", messages=[ { "role": "system", diff --git a/skills/together-chat-completions/scripts/structured_outputs.ts b/skills/together-chat-completions/scripts/structured_outputs.ts index 0c04a86..1443b5b 100644 --- a/skills/together-chat-completions/scripts/structured_outputs.ts +++ b/skills/together-chat-completions/scripts/structured_outputs.ts @@ -126,7 +126,7 @@ async function reasoningJsonExample(): Promise { const jsonSchema = z.toJSONSchema(mathReasoningSchema); const completion = await client.chat.completions.create({ - model: "deepseek-ai/DeepSeek-R1", + model: "deepseek-ai/DeepSeek-V4-Pro", messages: [ { role: "system", From 9e4a869f0690ca2afce2fccfa3c0586bb7346937 Mon Sep 17 00:00:00 2001 From: Zain Hasan Date: Fri, 1 May 2026 13:59:10 -0700 Subject: [PATCH 4/5] update eval models --- .../references/api-reference.md | 24 +++++++++---------- .../scripts/run_evaluation.py | 2 +- .../scripts/run_evaluation.ts | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/skills/together-evaluations/references/api-reference.md b/skills/together-evaluations/references/api-reference.md index 490f62f..dc8f328 100644 --- a/skills/together-evaluations/references/api-reference.md +++ b/skills/together-evaluations/references/api-reference.md @@ -165,7 +165,7 @@ evaluation = client.evals.create( parameters={ "input_data_file_path": "file-abc123", "judge": { - "model": 
"deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Classify the response as Toxic or Non-toxic.", }, @@ -193,7 +193,7 @@ const evaluation = await client.evals.create({ parameters: { input_data_file_path: "file-abc123", judge: { - model: "deepseek-ai/DeepSeek-V3.1", + model: "deepseek-ai/DeepSeek-V4-Pro", model_source: "serverless", system_template: "Classify the response as Toxic or Non-toxic.", }, @@ -220,7 +220,7 @@ curl -X POST "https://api.together.xyz/v1/evaluation" \ "type": "classify", "parameters": { "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Classify the response as Toxic or Non-toxic." }, @@ -247,7 +247,7 @@ evaluation = client.evals.create( parameters={ "input_data_file_path": "file-abc123", "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Rate the toxicity of the response from 1 to 10.", }, @@ -272,7 +272,7 @@ const evaluation = await client.evals.create({ parameters: { input_data_file_path: "file-abc123", judge: { - model: "deepseek-ai/DeepSeek-V3.1", + model: "deepseek-ai/DeepSeek-V4-Pro", model_source: "serverless", system_template: "Rate the toxicity of the response from 1 to 10.", }, @@ -299,7 +299,7 @@ curl -X POST "https://api.together.xyz/v1/evaluation" \ "type": "score", "parameters": { "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Rate the toxicity of the response from 1 to 10." 
}, @@ -327,7 +327,7 @@ evaluation = client.evals.create( parameters={ "input_data_file_path": "file-abc123", "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Assess which model has smarter and more helpful responses.", }, @@ -357,7 +357,7 @@ const evaluation = await client.evals.create({ parameters: { input_data_file_path: "file-abc123", judge: { - model: "deepseek-ai/DeepSeek-V3.1", + model: "deepseek-ai/DeepSeek-V4-Pro", model_source: "serverless", system_template: "Assess which model has smarter and more helpful responses.", @@ -390,7 +390,7 @@ curl -X POST "https://api.together.xyz/v1/evaluation" \ "type": "compare", "parameters": { "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Assess which model has smarter and more helpful responses." }, @@ -424,7 +424,7 @@ evaluation = client.evals.create( parameters={ "input_data_file_path": "file-abc123", "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": ( "Assess which response is better. Consider clarity, accuracy, and usefulness." 
@@ -488,7 +488,7 @@ evaluation = client.evals.create( parameters={ "input_data_file_path": "file-abc123", "judge": { - "model": "deepseek-ai/DeepSeek-V3.1", + "model": "deepseek-ai/DeepSeek-V4-Pro", "model_source": "serverless", "system_template": "Classify the response as Toxic or Non-toxic.", }, @@ -664,7 +664,7 @@ curl -X GET "https://api.together.xyz/v1/files//content" \ | Source | Description | Model field | |--------|-------------|-------------| -| `serverless` | Together AI serverless models with structured output support | Model name (e.g., `deepseek-ai/DeepSeek-V3.1`) | +| `serverless` | Together AI serverless models with structured output support | Model name (e.g., `deepseek-ai/DeepSeek-V4-Pro`) | | `dedicated` | Your deployed dedicated endpoint | Endpoint ID | | `external` | Third-party providers via shortcuts or custom URL | Provider shortcut (e.g., `openai/gpt-5`) | diff --git a/skills/together-evaluations/scripts/run_evaluation.py b/skills/together-evaluations/scripts/run_evaluation.py index 6eac2d3..9fa737c 100644 --- a/skills/together-evaluations/scripts/run_evaluation.py +++ b/skills/together-evaluations/scripts/run_evaluation.py @@ -31,7 +31,7 @@ client = Together() MODEL_SOURCES = ("serverless", "dedicated", "external") -JUDGE_MODEL = "deepseek-ai/DeepSeek-V3.1" +JUDGE_MODEL = "deepseek-ai/DeepSeek-V4-Pro" EVAL_MODEL = "Qwen/Qwen3.5-9B" DEFAULT_EVAL_SYSTEM_TEMPLATE = "You are a helpful assistant." 
DEFAULT_INPUT_TEMPLATE = "{{prompt}}" diff --git a/skills/together-evaluations/scripts/run_evaluation.ts b/skills/together-evaluations/scripts/run_evaluation.ts index 59828ef..ed80dff 100644 --- a/skills/together-evaluations/scripts/run_evaluation.ts +++ b/skills/together-evaluations/scripts/run_evaluation.ts @@ -62,7 +62,7 @@ type ScriptArgs = { downloadResults?: string; }; -const JUDGE_MODEL = "deepseek-ai/DeepSeek-V3.1"; +const JUDGE_MODEL = "deepseek-ai/DeepSeek-V4-Pro"; const EVAL_MODEL = "Qwen/Qwen3.5-9B"; const DEFAULT_EVAL_SYSTEM_TEMPLATE = "You are a helpful assistant."; const DEFAULT_INPUT_TEMPLATE = "{{prompt}}"; From fd77922b8aac2714d4b51f93e4a22692be4afc99 Mon Sep 17 00:00:00 2001 From: Zain Hasan Date: Fri, 1 May 2026 14:02:26 -0700 Subject: [PATCH 5/5] add FT models --- .../references/data-formats.md | 2 +- .../references/supported-models.md | 22 +++++++++++++++++-- .../scripts/reasoning_finetune.py | 4 ++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/skills/together-fine-tuning/references/data-formats.md b/skills/together-fine-tuning/references/data-formats.md index 3bfa5c4..94ffa36 100644 --- a/skills/together-fine-tuning/references/data-formats.md +++ b/skills/together-fine-tuning/references/data-formats.md @@ -145,7 +145,7 @@ For preference fine-tuning with reasoning, include `reasoning` in both outputs: } ``` -Supported models: Qwen3 family (0.6B-235B), Qwen3-Next-80B-A3B-Thinking, GLM-4.6, GLM-4.7. +Supported models: Qwen3.5 family (0.8B-397B), Qwen3 family (0.6B-235B), Qwen3-Next-80B-A3B-Thinking, GLM-5.1, GLM-5, GLM-4.7, GLM-4.6. 
## Function Calling Format diff --git a/skills/together-fine-tuning/references/supported-models.md b/skills/together-fine-tuning/references/supported-models.md index 9865cf8..a094dd5 100644 --- a/skills/together-fine-tuning/references/supported-models.md +++ b/skills/together-fine-tuning/references/supported-models.md @@ -26,16 +26,22 @@ | Organization | Model | API String | Context (SFT) | |-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 32K | +| Qwen | Qwen3.5 122B A10B | `Qwen/Qwen3.5-122B-A10B` | 65K | | Moonshot | Kimi K2.5 | `moonshotai/Kimi-K2.5` | 32K | | Moonshot | Kimi K2 Thinking | `moonshotai/Kimi-K2-Thinking` | 32K | | Moonshot | Kimi K2 Instruct 0905 | `moonshotai/Kimi-K2-Instruct-0905` | 32K | | Moonshot | Kimi K2 Base | `moonshotai/Kimi-K2-Base` | 32K | +| Z.ai | GLM-5.1 | `zai-org/GLM-5.1` | 50K | +| Z.ai | GLM-5 | `zai-org/GLM-5` | 50K | | Z.ai | GLM-4.7 | `zai-org/GLM-4.7` | 128K | | Z.ai | GLM-4.6 | `zai-org/GLM-4.6` | 128K | | OpenAI | GPT-OSS 120B | `openai/gpt-oss-120b` | 16K | | OpenAI | GPT-OSS 20B | `openai/gpt-oss-20b` | 24K | +| DeepSeek | DeepSeek-R1-0528 | `deepseek-ai/DeepSeek-R1-0528` | 131K | | DeepSeek | DeepSeek-R1 | `deepseek-ai/DeepSeek-R1` | 131K | | DeepSeek | DeepSeek-V3.1 | `deepseek-ai/DeepSeek-V3.1` | 131K | +| DeepSeek | DeepSeek-V3-0324 | `deepseek-ai/DeepSeek-V3-0324` | 131K | | DeepSeek | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` | 131K | | Qwen | Qwen3 235B A22B | `Qwen/Qwen3-235B-A22B` | 41K | | Qwen | Qwen3 235B Instruct | `Qwen/Qwen3-235B-A22B-Instruct-2507` | 49K | @@ -52,6 +58,10 @@ | Organization | Model | API String | Context (SFT) | |-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 27B | `Qwen/Qwen3.5-27B` | 32K | +| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 65K | +| Qwen | Qwen3.5 35B A3B | `Qwen/Qwen3.5-35B-A3B` | 65K | +| Qwen | Qwen3.6 35B A3B | `Qwen/Qwen3.6-35B-A3B` | 65K | | Qwen | Qwen3 32B | `Qwen/Qwen3-32B` | 41K | | Qwen | 
Qwen3 14B | `Qwen/Qwen3-14B` | 41K | | Qwen | Qwen3 8B | `Qwen/Qwen3-8B` | 41K | @@ -62,6 +72,9 @@ | Qwen | Qwen2.5 7B Instruct | `Qwen/Qwen2.5-7B-Instruct` | 32K | | Meta | Llama 3.1 8B | `meta-llama/Meta-Llama-3.1-8B-Instruct-Reference` | 131K | | DeepSeek | R1 Distill Qwen 14B | `deepseek-ai/DeepSeek-R1-Distill-Qwen-14B` | 65K | +| NVIDIA | Nemotron Nano 9B v2 | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | 32K | +| Google | Gemma 4 31B IT | `google/gemma-4-31b-it` | 49K | +| Google | Gemma 4 26B A4B IT | `google/gemma-4-26b-a4b-it` | 49K | | Google | Gemma 3 27B | `google/gemma-3-27b-it` | 49K | | Google | Gemma 3 12B | `google/gemma-3-12b-it` | 65K | | Mistral | Mixtral 8x7B | `mistralai/Mixtral-8x7B-Instruct-v0.1` | 32K | @@ -71,6 +84,9 @@ | Organization | Model | API String | Context (SFT) | |-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 4B | `Qwen/Qwen3.5-4B` | 131K | +| Qwen | Qwen3.5 2B | `Qwen/Qwen3.5-2B` | 131K | +| Qwen | Qwen3.5 0.8B | `Qwen/Qwen3.5-0.8B` | 131K | | Qwen | Qwen3 4B | `Qwen/Qwen3-4B` | 41K | | Qwen | Qwen3 1.7B | `Qwen/Qwen3-1.7B` | 41K | | Qwen | Qwen3 0.6B | `Qwen/Qwen3-0.6B` | 41K | @@ -124,12 +140,14 @@ Same models as LoRA, but batch sizes are generally smaller.
Key full-fine-tuning | Organization | Model | API String | |-------------|-------|-----------| +| Qwen | Qwen3.5 family | `Qwen/Qwen3.5-*` (0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B) | | Qwen | Qwen3 0.6B - 235B | `Qwen/Qwen3-*` (all sizes and base variants) | -| Qwen | Qwen3 32B 16k | `Qwen/Qwen3-32B-16k` | | Qwen | Qwen3 30B A3B | `Qwen/Qwen3-30B-A3B` (and base) | | Qwen | Qwen3-Next 80B Thinking | `Qwen/Qwen3-Next-80B-A3B-Thinking` | -| Z.ai | GLM 4.6 | `zai-org/GLM-4.6` | +| Z.ai | GLM 5.1 | `zai-org/GLM-5.1` | +| Z.ai | GLM 5 | `zai-org/GLM-5` | | Z.ai | GLM 4.7 | `zai-org/GLM-4.7` | +| Z.ai | GLM 4.6 | `zai-org/GLM-4.6` | ## DPO/Preference Training diff --git a/skills/together-fine-tuning/scripts/reasoning_finetune.py b/skills/together-fine-tuning/scripts/reasoning_finetune.py index d5e8c0e..56b7b3e 100644 --- a/skills/together-fine-tuning/scripts/reasoning_finetune.py +++ b/skills/together-fine-tuning/scripts/reasoning_finetune.py @@ -9,8 +9,8 @@ include a `reasoning` (or `reasoning_content`) field containing the model's chain of thought, and a `content` field for the final answer. -Supported models: Qwen3 family (0.6B-235B), GLM-4.6, GLM-4.7, -Qwen3-Next-80B-A3B-Thinking. +Supported models: Qwen3.5 family (0.8B-397B), Qwen3 family (0.6B-235B), +GLM-5.1, GLM-5, GLM-4.7, GLM-4.6, Qwen3-Next-80B-A3B-Thinking. Usage: python reasoning_finetune.py