Skip to content

Commit 7098f0e

Browse files
authored
Evaluation: TTS (#619)
1 parent cc1fb74 commit 7098f0e

54 files changed

Lines changed: 4997 additions & 496 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/issue-formatter.yml

Lines changed: 0 additions & 126 deletions
This file was deleted.
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""add tts evaluation tables
2+
3+
Revision ID: 049
4+
Revises: 048
5+
Create Date: 2026-02-14 12:00:00.000000
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
from sqlalchemy.dialects import postgresql
12+
13+
# revision identifiers, used by Alembic.
14+
# ID of this migration; matches "Revision ID" in the module docstring.
revision = "049"
15+
# ID of the migration this one follows; matches "Revises" in the docstring.
down_revision = "048"
16+
# No branch labels are declared for this revision.
branch_labels = None
17+
# No additional revision dependencies are declared.
depends_on = None
18+
19+
20+
def upgrade():
    """Create the ``tts_result`` table and its indexes.

    The table holds one row per synthesized sample of a TTS evaluation run:
    the input text, where the generated audio is stored, processing status,
    evaluation metrics, and optional human feedback. Each row references an
    evaluation run, an organization and a project; all three foreign keys
    use ``ON DELETE CASCADE``.

    Three non-unique indexes are created afterwards, each led by
    ``evaluation_run_id``.
    """
    table_name = "tts_result"

    # Column order here is the column order of the created table.
    columns = [
        sa.Column(
            "id",
            sa.Integer(),
            nullable=False,
            comment="Unique identifier for the TTS result",
        ),
        sa.Column(
            "sample_text",
            sa.Text(),
            nullable=False,
            comment="Input text that will be synthesized to speech",
        ),
        sa.Column(
            "object_store_url",
            sa.String(),
            nullable=True,
            comment="S3 URL of the generated WAV audio file",
        ),
        sa.Column(
            "metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Audio metadata: {duration_seconds, size_bytes}",
        ),
        sa.Column(
            "provider",
            sa.String(length=100),
            nullable=False,
            comment="TTS provider used (e.g., gemini-2.5-pro-preview-tts)",
        ),
        sa.Column(
            "status",
            sa.String(length=20),
            nullable=False,
            server_default="PENDING",
            comment="Result status: PENDING, SUCCESS, FAILED",
        ),
        sa.Column(
            "score",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
            comment="Extensible evaluation metrics",
        ),
        sa.Column(
            "is_correct",
            sa.Boolean(),
            nullable=True,
            comment="Human feedback flag on audio quality correctness",
        ),
        sa.Column(
            "comment",
            sa.Text(),
            nullable=True,
            comment="Human feedback comment on audio quality",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error message if synthesis failed",
        ),
        sa.Column(
            "evaluation_run_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the evaluation run",
        ),
        sa.Column(
            "organization_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the organization",
        ),
        sa.Column(
            "project_id",
            sa.Integer(),
            nullable=False,
            comment="Reference to the project",
        ),
        sa.Column(
            "inserted_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the result was created",
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(),
            nullable=False,
            comment="Timestamp when the result was last updated",
        ),
    ]

    # Only the evaluation-run foreign key carries an explicit constraint name.
    constraints = [
        sa.ForeignKeyConstraint(
            ["evaluation_run_id"],
            ["evaluation_run.id"],
            name="fk_tts_result_run_id",
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"],
            ["organization.id"],
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["project_id"],
            ["project.id"],
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id"),
    ]

    op.create_table(table_name, *columns, *constraints)

    # (index name, indexed columns), created in this order.
    index_specs = (
        ("ix_tts_result_run_id", ["evaluation_run_id"]),
        ("idx_tts_result_feedback", ["evaluation_run_id", "is_correct"]),
        ("idx_tts_result_status", ["evaluation_run_id", "status"]),
    )
    for index_name, indexed_columns in index_specs:
        op.create_index(
            index_name,
            table_name,
            indexed_columns,
            unique=False,
        )
151+
152+
153+
def downgrade():
    """Drop the ``tts_result`` indexes, then the table itself.

    Indexes are removed in the reverse of the order in which ``upgrade``
    creates them.
    """
    table_name = "tts_result"

    index_names = (
        "idx_tts_result_status",
        "idx_tts_result_feedback",
        "ix_tts_result_run_id",
    )
    for index_name in index_names:
        op.drop_index(index_name, table_name=table_name)

    op.drop_table(table_name)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Create a new TTS evaluation dataset with text samples.
2+
3+
Required fields:
4+
- **name**: Dataset name
5+
- **samples**: List of text samples, each with a **text** field
6+
7+
Optional fields:
8+
- **description**: Dataset description
9+
- **language_id**: ID of a language from the global languages table
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Get a TTS evaluation dataset by ID.
2+
3+
Returns the dataset, including its sample count.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Get a single TTS synthesis result by ID.
2+
3+
Returns the result including audio URL, metadata, and human feedback status.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Get a TTS evaluation run by ID, optionally including its synthesis results.
2+
3+
Query parameters:
4+
- `include_results`: Include synthesis results (default: true)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
List all TTS evaluation datasets for the current project.
2+
3+
Supports pagination with `limit` and `offset` parameters.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
List TTS evaluation runs for the current project.
2+
3+
Supports filtering by `dataset_id` and `status`, with pagination via `limit` and `offset`.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Start a TTS evaluation run on a dataset.
2+
3+
Required fields:
4+
- **run_name**: Name for this evaluation run
5+
- **dataset_id**: ID of the TTS dataset to evaluate
6+
7+
Optional fields:
8+
- **models**: List of TTS models to use (default: `["gemini-2.5-pro-preview-tts"]`)
9+
10+
The evaluation will:
11+
1. Process each text sample through the specified TTS models
12+
2. Generate speech audio using the Gemini Batch API
13+
3. Store WAV audio files in S3 for human review
14+
15+
**Supported models:** `gemini-2.5-pro-preview-tts`
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Update human feedback on a TTS synthesis result.
2+
3+
Fields:
4+
- **is_correct**: Whether the synthesized audio quality is acceptable (null to clear)
5+
- **comment**: Optional feedback comment

0 commit comments

Comments
 (0)