Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<PropertyGroup Condition="'$(RuntimeIdentifier)'==''">
<RuntimeIdentifier>$(NETCoreSdkRuntimeIdentifier)</RuntimeIdentifier>
</PropertyGroup>

<!-- Include the main program -->
<ItemGroup>
<Compile Include="../../src/LiveAudioTranscriptionExample/*.cs" />
<Compile Include="../../src/Shared/*.cs" />
</ItemGroup>

<!-- Packages -->
<ItemGroup>
<PackageReference Include="Microsoft.AI.Foundry.Local" />
<PackageReference Include="NAudio" Version="2.2.1" />
</ItemGroup>

<!-- ONNX Runtime GPU and CUDA provider (required for Linux)-->
<ItemGroup Condition="'$(RuntimeIdentifier)' == 'linux-x64'">
<PackageReference Include="Microsoft.ML.OnnxRuntime.Gpu" />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" />
</ItemGroup>

</Project>
105 changes: 105 additions & 0 deletions samples/cs/GettingStarted/src/LiveAudioTranscriptionExample/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Live Audio Transcription — Foundry Local SDK Example
//
// Demonstrates real-time microphone-to-text using:
// SDK (FoundryLocalManager) → Core (NativeAOT DLL) → onnxruntime-genai (StreamingProcessor)

using Microsoft.AI.Foundry.Local;
using NAudio.Wave;

Console.WriteLine("===========================================================");
Console.WriteLine(" Foundry Local -- Live Audio Transcription Demo");
Console.WriteLine("===========================================================");
Console.WriteLine();

// Initialize the Foundry Local manager (singleton) with sample-app settings.
var config = new Configuration
{
    AppName = "foundry_local_samples",
    LogLevel = Microsoft.AI.Foundry.Local.LogLevel.Information
};

await FoundryLocalManager.CreateAsync(config, Utils.GetAppLogger());
var mgr = FoundryLocalManager.Instance;

await Utils.RunWithSpinner("Registering execution providers", mgr.EnsureEpsDownloadedAsync());

var catalog = await mgr.GetCatalogAsync();

// Fail fast with a specific exception type (was a bare `Exception`).
var model = await catalog.GetModelAsync("nemotron")
    ?? throw new InvalidOperationException("Model \"nemotron\" not found in catalog");

await model.DownloadAsync(progress =>
{
    // `\r` keeps the progress on one console line until completion.
    Console.Write($"\rDownloading model: {progress:F2}%");
    if (progress >= 100f)
    {
        Console.WriteLine();
    }
});

Console.Write($"Loading model {model.Id}...");
await model.LoadAsync();
Console.WriteLine("done.");

// Configure the live-transcription session. Settings must be applied before
// StartAsync (they are frozen after that call).
var audioClient = await model.GetAudioClientAsync();
var session = audioClient.CreateLiveTranscriptionSession();
session.Settings.SampleRate = 16000;
session.Settings.Channels = 1;
session.Settings.Language = "en";

await session.StartAsync();
Console.WriteLine(" Session started");

// Consume transcription results on a background task: final results get their
// own line, interim hypotheses print inline in cyan.
var readTask = Task.Run(async () =>
{
    try
    {
        await foreach (var result in session.GetTranscriptionStream())
        {
            if (result.IsFinal)
            {
                Console.WriteLine();
                Console.WriteLine($" [FINAL] {result.Text}");
                Console.Out.Flush();
            }
            else if (!string.IsNullOrEmpty(result.Text))
            {
                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.Write(result.Text);
                Console.ResetColor();
                Console.Out.Flush();
            }
        }
    }
    catch (OperationCanceledException) { } // expected when the session stops
});

// 16 kHz / 16-bit / mono PCM — must match session.Settings above.
using var waveIn = new WaveInEvent
{
    WaveFormat = new WaveFormat(rate: 16000, bits: 16, channels: 1),
    BufferMilliseconds = 100
};

waveIn.DataAvailable += (sender, e) =>
{
    if (e.BytesRecorded > 0)
    {
        // BUG FIX: NAudio reuses e.Buffer across callbacks, and the append is
        // fire-and-forget — wrapping the shared buffer directly let the driver
        // overwrite audio before AppendAsync consumed it. Copy the bytes first.
        byte[] chunk = e.Buffer.AsSpan(0, e.BytesRecorded).ToArray();
        _ = session.AppendAsync(chunk);
    }
};

Console.WriteLine();
Console.WriteLine("===========================================================");
Console.WriteLine(" LIVE TRANSCRIPTION ACTIVE");
Console.WriteLine(" Speak into your microphone.");
Console.WriteLine(" Transcription appears in real-time (cyan text).");
Console.WriteLine(" Press ENTER to stop recording.");
Console.WriteLine("===========================================================");
Console.WriteLine();

waveIn.StartRecording();
Console.ReadLine();
waveIn.StopRecording();

// Signal end-of-audio, drain the remaining results, then release the model.
await session.StopAsync();
await readTask;

await model.UnloadAsync();
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- For Windows use the following -->
<TargetFramework>net9.0-windows10.0.26100</TargetFramework>
<WindowsAppSDKSelfContained>false</WindowsAppSDKSelfContained>
<Platforms>ARM64;x64</Platforms>
<WindowsPackageType>None</WindowsPackageType>
<EnableCoreMrtTooling>false</EnableCoreMrtTooling>
</PropertyGroup>

<PropertyGroup Condition="'$(RuntimeIdentifier)'==''">
<RuntimeIdentifier>$(NETCoreSdkRuntimeIdentifier)</RuntimeIdentifier>
</PropertyGroup>

<ItemGroup>
<Compile Include="../../src/LiveAudioTranscriptionExample/*.cs" />
<Compile Include="../../src/Shared/*.cs" />
</ItemGroup>

<!-- Use WinML package for local Foundry SDK on Windows -->
<ItemGroup>
<PackageReference Include="Microsoft.AI.Foundry.Local.WinML" />
<PackageReference Include="NAudio" Version="2.2.1" />
</ItemGroup>

</Project>
60 changes: 60 additions & 0 deletions sdk/cs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,64 @@ audioClient.Settings.Language = "en";
audioClient.Settings.Temperature = 0.0f;
```

### Live Audio Transcription (Real-Time Streaming)

For real-time microphone-to-text transcription, use `CreateLiveTranscriptionSession()`. Audio is pushed as raw PCM chunks and transcription results stream back as an `IAsyncEnumerable`.

The streaming result type (`LiveAudioTranscriptionResponse`) extends `AudioCreateTranscriptionResponse` from the Betalgo OpenAI SDK, so it's compatible with the file-based transcription output format while adding streaming-specific fields.

```csharp
var audioClient = await model.GetAudioClientAsync();
var session = audioClient.CreateLiveTranscriptionSession();

// Configure audio format (must be set before StartAsync)
session.Settings.SampleRate = 16000;
session.Settings.Channels = 1;
session.Settings.Language = "en";

await session.StartAsync();

// Push audio from a microphone callback (thread-safe).
// Copy the bytes first — NAudio reuses e.Buffer between callbacks and the
// append below is not awaited, so wrapping the shared buffer directly can
// hand the session overwritten audio.
waveIn.DataAvailable += (sender, e) =>
{
    _ = session.AppendAsync(e.Buffer.AsSpan(0, e.BytesRecorded).ToArray());
};

// Read transcription results as they arrive
await foreach (var result in session.GetTranscriptionStream())
{
// result inherits from AudioCreateTranscriptionResponse
// - result.Text — incremental transcribed text (per chunk, not accumulated)
// - result.IsFinal — true for final results, false for interim hypotheses
// - result.Segments — segment-level timing data (Start/End in seconds)
// - result.Language — language code
Console.Write(result.Text);
}

await session.StopAsync();
```

#### Output Type

| Field | Type | Description |
|-------|------|-------------|
| `Text` | `string` | Transcribed text from this audio chunk (inherited from `AudioCreateTranscriptionResponse`) |
| `IsFinal` | `bool` | Whether this is a final or interim result. Nemotron always returns `true`. |
| `Language` | `string` | Language code (inherited) |
| `Duration` | `float` | Audio duration in seconds (inherited) |
| `Segments` | `List<Segment>` | Segment timing with `Start`/`End` offsets (inherited) |
| `Words` | `List<WordSegment>` | Word-level timing (inherited, when available) |

#### Session Lifecycle

| Method | Description |
|--------|-------------|
| `StartAsync()` | Initialize the streaming session. Settings are frozen after this call. |
| `AppendAsync(pcmData)` | Push a chunk of raw PCM audio. Thread-safe (bounded internal queue). |
| `GetTranscriptionStream()` | Async enumerable of transcription results. |
| `StopAsync()` | Signal end-of-audio, flush remaining audio, and clean up. |
| `DisposeAsync()` | Calls `StopAsync` if needed. Use `await using` for automatic cleanup. |

### Web Service

Start an OpenAI-compatible REST endpoint for use by external tools or processes:
Expand Down Expand Up @@ -297,6 +355,8 @@ Key types:
| [`ModelVariant`](./docs/api/microsoft.ai.foundry.local.modelvariant.md) | Specific model variant (hardware/quantization) |
| [`OpenAIChatClient`](./docs/api/microsoft.ai.foundry.local.openaichatclient.md) | Chat completions (sync + streaming) |
| [`OpenAIAudioClient`](./docs/api/microsoft.ai.foundry.local.openaiaudioclient.md) | Audio transcription (sync + streaming) |
| [`LiveAudioTranscriptionSession`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionsession.md) | Real-time audio streaming session |
| [`LiveAudioTranscriptionResponse`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionresponse.md) | Streaming transcription result (extends `AudioCreateTranscriptionResponse`) |
| [`ModelInfo`](./docs/api/microsoft.ai.foundry.local.modelinfo.md) | Full model metadata record |

## Tests
Expand Down
115 changes: 115 additions & 0 deletions sdk/cs/src/Detail/CoreInterop.cs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,31 @@ private static unsafe partial void CoreExecuteCommandWithCallback(RequestBuffer*
nint callbackPtr, // NativeCallbackFn pointer
nint userData);

[LibraryImport(LibraryName, EntryPoint = "execute_command_with_binary")]
[UnmanagedCallConv(CallConvs = new[] { typeof(System.Runtime.CompilerServices.CallConvCdecl) })]
private static unsafe partial void CoreExecuteCommandWithBinary(StreamingRequestBuffer* nativeRequest,
ResponseBuffer* nativeResponse);

// --- Audio streaming P/Invoke imports (kept for future dedicated entry points) ---
// NOTE(review): the managed StartAudioStream/PushAudioData/StopAudioStream below
// currently route through execute_command / execute_command_with_binary instead
// of these imports — these three are unreferenced until the Core exposes them.

[LibraryImport(LibraryName, EntryPoint = "audio_stream_start")]
[UnmanagedCallConv(CallConvs = new[] { typeof(System.Runtime.CompilerServices.CallConvCdecl) })]
private static unsafe partial void CoreAudioStreamStart(
RequestBuffer* request,
ResponseBuffer* response);

[LibraryImport(LibraryName, EntryPoint = "audio_stream_push")]
[UnmanagedCallConv(CallConvs = new[] { typeof(System.Runtime.CompilerServices.CallConvCdecl) })]
private static unsafe partial void CoreAudioStreamPush(
StreamingRequestBuffer* request,
ResponseBuffer* response);

[LibraryImport(LibraryName, EntryPoint = "audio_stream_stop")]
[UnmanagedCallConv(CallConvs = new[] { typeof(System.Runtime.CompilerServices.CallConvCdecl) })]
private static unsafe partial void CoreAudioStreamStop(
RequestBuffer* request,
ResponseBuffer* response);

// helper to capture exceptions in callbacks
internal class CallbackHelper
{
Expand Down Expand Up @@ -331,4 +356,94 @@ public Task<Response> ExecuteCommandWithCallbackAsync(string commandName, CoreIn
return Task.Run(() => ExecuteCommandWithCallback(commandName, commandInput, callback), ct);
}

/// <summary>
/// Marshal a ResponseBuffer from unmanaged memory into a managed Response and free the unmanaged memory.
/// </summary>
/// <param name="response">Native buffer whose Data/Error pointers this method takes ownership of.</param>
/// <returns>Managed response with UTF-8 decoded Data and Error strings (when present).</returns>
private Response MarshalResponse(ResponseBuffer response)
{
    Response result = new();

    // Decode UTF-8 straight out of unmanaged memory. The original copied the
    // data into an intermediate byte[] before Encoding.UTF8.GetString; using
    // Marshal.PtrToStringUTF8 avoids that allocation and matches how the
    // error branch already decodes its buffer.
    if (response.Data != IntPtr.Zero && response.DataLength > 0)
    {
        result.Data = Marshal.PtrToStringUTF8(response.Data, response.DataLength);
    }

    if (response.Error != IntPtr.Zero && response.ErrorLength > 0)
    {
        result.Error = Marshal.PtrToStringUTF8(response.Error, response.ErrorLength)!;
    }

    // FreeHGlobal is a no-op on IntPtr.Zero, so unconditional frees are safe.
    // NOTE(review): assumes the native side allocated these buffers with the
    // HGlobal allocator — confirm against the Core's allocation contract.
    Marshal.FreeHGlobal(response.Data);
    Marshal.FreeHGlobal(response.Error);

    return result;
}

// --- Audio streaming managed implementations ---
// Route through the existing execute_command / execute_command_with_binary entry points.
// The Core handles audio_stream_start / audio_stream_stop as command cases in ExecuteCommandManaged,
// and audio_stream_push as a command case in ExecuteCommandWithBinaryManaged.

/// <summary>
/// Begin a native audio streaming session by issuing the "audio_stream_start" command.
/// </summary>
public Response StartAudioStream(CoreInteropRequest request)
{
return ExecuteCommand("audio_stream_start", request);
}

/// <summary>
/// Push one chunk of raw PCM audio to the native streaming session via the
/// binary-capable "audio_stream_push" entry point.
/// </summary>
/// <param name="request">JSON parameters for the push command.</param>
/// <param name="audioData">Raw PCM bytes; pinned for the duration of the native call.</param>
/// <returns>The marshalled native response.</returns>
/// <exception cref="FoundryLocalException">Wraps any failure during the push.</exception>
public Response PushAudioData(CoreInteropRequest request, ReadOnlyMemory<byte> audioData)
{
    try
    {
        // BUG FIX: the unmanaged buffers are now freed in a finally that spans
        // both allocations — previously, if the second AllocHGlobal or Pin()
        // threw, commandPtr leaked because the inner finally only wrapped the
        // native call.
        IntPtr commandPtr = IntPtr.Zero;
        IntPtr inputPtr = IntPtr.Zero;
        try
        {
            var commandInputJson = request.ToJson();
            byte[] commandBytes = System.Text.Encoding.UTF8.GetBytes("audio_stream_push");
            byte[] inputBytes = System.Text.Encoding.UTF8.GetBytes(commandInputJson);

            commandPtr = Marshal.AllocHGlobal(commandBytes.Length);
            Marshal.Copy(commandBytes, 0, commandPtr, commandBytes.Length);

            inputPtr = Marshal.AllocHGlobal(inputBytes.Length);
            Marshal.Copy(inputBytes, 0, inputPtr, inputBytes.Length);

            // Pin the managed audio data so GC won't move it during the native call
            using var audioHandle = audioData.Pin();

            unsafe
            {
                var reqBuf = new StreamingRequestBuffer
                {
                    Command = commandPtr,
                    CommandLength = commandBytes.Length,
                    Data = inputPtr,
                    DataLength = inputBytes.Length,
                    BinaryData = (nint)audioHandle.Pointer,
                    BinaryDataLength = audioData.Length
                };

                ResponseBuffer response = default;
                CoreExecuteCommandWithBinary(&reqBuf, &response);

                // MarshalResponse also frees the response's unmanaged buffers.
                return MarshalResponse(response);
            }
        }
        finally
        {
            // FreeHGlobal(IntPtr.Zero) is a no-op, so partially-initialized
            // state is handled uniformly here.
            Marshal.FreeHGlobal(inputPtr);
            Marshal.FreeHGlobal(commandPtr);
        }
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        throw new FoundryLocalException("Error executing audio_stream_push", ex, _logger);
    }
}

/// <summary>
/// End the native audio streaming session by issuing the "audio_stream_stop" command.
/// </summary>
public Response StopAudioStream(CoreInteropRequest request)
{
return ExecuteCommand("audio_stream_stop", request);
}

}
17 changes: 17 additions & 0 deletions sdk/cs/src/Detail/ICoreInterop.cs
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,21 @@ Task<Response> ExecuteCommandAsync(string commandName, CoreInteropRequest? comma
Task<Response> ExecuteCommandWithCallbackAsync(string commandName, CoreInteropRequest? commandInput,
CallbackFn callback,
CancellationToken? ct = null);

// --- Audio streaming session support ---

/// <summary>
/// Sequential-layout request buffer for commands that carry a binary payload
/// (raw PCM audio for audio_stream_push) alongside the usual command-name and
/// JSON-parameter buffers. Must stay field-for-field in sync with the native
/// Core's struct definition.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
protected unsafe struct StreamingRequestBuffer
{
public nint Command; // pointer to UTF-8 command name (unmanaged)
public int CommandLength;
public nint Data; // JSON params
public int DataLength;
public nint BinaryData; // raw PCM audio bytes
public int BinaryDataLength;
}

/// <summary>Begin a streaming audio session (implemented via the "audio_stream_start" command).</summary>
Response StartAudioStream(CoreInteropRequest request);
/// <summary>Push one chunk of raw PCM audio plus JSON params (implemented via "audio_stream_push").</summary>
Response PushAudioData(CoreInteropRequest request, ReadOnlyMemory<byte> audioData);
/// <summary>End the streaming audio session (implemented via the "audio_stream_stop" command).</summary>
Response StopAudioStream(CoreInteropRequest request);
}
Loading
Loading