Add Azure OpenAI support for embeddings (#52)

Daniel-Vaz · web-flow · commit 1b7376ec78a6 · 2026-02-13T11:16:01.000+01:00
* (feat) support azure openai for embedding

Signed-off-by: dvaz-external <dvaz.external@epo.org>

* fix doc2vec test

Signed-off-by: dvaz-external <dvaz.external@epo.org>

---------

Signed-off-by: dvaz-external <dvaz.external@epo.org>
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen
 *   **HTML to Markdown:** Converts extracted HTML to clean Markdown using `turndown`, preserving code blocks and basic formatting.
     *   **Clean Heading Text:** Automatically removes anchor links (like `[](#section-id)`) from heading text for cleaner hierarchy display.
 *   **Intelligent Chunking:** Splits Markdown content into manageable chunks based on headings and token limits, preserving context.
-*   **Vector Embeddings:** Generates embeddings for each chunk using OpenAI's `text-embedding-3-large` model.
+*   **Vector Embeddings:** Generates embeddings for each chunk using OpenAI or Azure OpenAI (configurable).
 *   **Vector Storage:** Supports storing chunks, metadata, and embeddings in:
     *   **SQLite:** Using `better-sqlite3` and the `sqlite-vec` extension for efficient vector search.
     *   **Qdrant:** A dedicated vector database, using the `@qdrant/js-client-rest`.
@@ -98,7 +98,7 @@ This ensures that searches for parent topics (like "Installation") will also mat
 *   **Node.js:** Version 18 or higher recommended (check `.nvmrc` if available).
 *   **npm:** Node Package Manager (usually comes with Node.js).
 *   **TypeScript:** As the project is written in TypeScript (`ts-node` is used for execution via `npm start`).
-*   **OpenAI API Key:** You need an API key from OpenAI to generate embeddings.
+*   **OpenAI API Key or Azure OpenAI Credentials:** You need either an OpenAI API key or Azure OpenAI credentials to generate embeddings.
 *   **GitHub Personal Access Token:** Required for accessing GitHub issues (set as `GITHUB_PERSONAL_ACCESS_TOKEN` in your environment).
 *   **Zendesk API Token:** Required for accessing Zendesk tickets and articles (set as `ZENDESK_API_TOKEN` in your environment).
 *   **(Optional) Qdrant Instance:** If using the `qdrant` database type, you need a running Qdrant instance accessible from where you run the script.
@@ -129,8 +129,20 @@ Configuration is managed through two files:
     ```dotenv
     # .env
 
-    # Required: Your OpenAI API Key
+    # Embedding Provider Configuration
+    # Optional: Specify which provider to use (defaults to 'openai' if not set)
+    # Can also be configured in config.yaml
+    EMBEDDING_PROVIDER="azure"  # or "openai"
+
+    # Required: Your OpenAI API Key (if using OpenAI provider)
     OPENAI_API_KEY="sk-..."
+    OPENAI_MODEL="text-embedding-3-large"  # Optional, defaults to text-embedding-3-large
+
+    # Required: Your Azure OpenAI credentials (if using Azure provider)
+    AZURE_OPENAI_KEY="your-azure-key"
+    AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com"
+    AZURE_OPENAI_DEPLOYMENT_NAME="text-embedding-3-large"
+    AZURE_OPENAI_API_VERSION="2024-10-21"
 
     # Required for GitHub sources
     GITHUB_PERSONAL_ACCESS_TOKEN="ghp_..."
@@ -206,6 +218,21 @@ Configuration is managed through two files:
 
     **Example (`config.yaml`):**
     ```yaml
+    # Optional: Configure embedding provider
+    # Can also be set via EMBEDDING_PROVIDER environment variable
+    # Defaults to OpenAI if not specified
+    embedding:
+      provider: 'openai'  # or 'azure'
+      openai:
+        api_key: '${OPENAI_API_KEY}'  # Optional, uses env var by default
+        model: 'text-embedding-3-large'  # Optional, defaults to text-embedding-3-large
+      # For Azure OpenAI, use this instead:
+      # azure:
+      #   api_key: '${AZURE_OPENAI_KEY}'
+      #   endpoint: '${AZURE_OPENAI_ENDPOINT}'
+      #   deployment_name: 'text-embedding-3-large'
+      #   api_version: '2024-10-21'  # Optional
+
     sources:
       # Website source example
       - type: 'website'
diff --git a/doc2vec.ts b/doc2vec.ts
@@ -9,7 +9,7 @@ import * as os from 'os';
 import { exec } from 'child_process';
 import { promisify } from 'util';
 import { Buffer } from 'buffer';
-import { OpenAI } from "openai";
+import { OpenAI, AzureOpenAI } from "openai";
 import * as dotenv from "dotenv";
 import { Logger, LogLevel } from './logger';
 import { Utils } from './utils';
@@ -35,7 +35,8 @@ dotenv.config();
 
 export class Doc2Vec {
     private config: Config;
-    private openai: OpenAI;
+    private openai: OpenAI | AzureOpenAI;
+    private embeddingModel: string;
     private contentProcessor: ContentProcessor;
     private logger: Logger;
     private configDir: string;
@@ -52,7 +53,45 @@ export class Doc2Vec {
         this.logger.info('Initializing Doc2Vec');
         this.config = this.loadConfig(configPath);
         this.configDir = path.dirname(path.resolve(configPath));
-        this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
+        
+        // Initialize OpenAI or Azure OpenAI based on configuration
+        // Check environment variable if not specified in config
+        const embeddingProvider = this.config.embedding?.provider || (process.env.EMBEDDING_PROVIDER as 'openai' | 'azure') || 'openai';
+        const embeddingConfig = this.config.embedding || { provider: embeddingProvider };
+        
+        if (embeddingProvider === 'azure') {
+            const azureApiKey = embeddingConfig.azure?.api_key || process.env.AZURE_OPENAI_KEY;
+            const azureEndpoint = embeddingConfig.azure?.endpoint || process.env.AZURE_OPENAI_ENDPOINT;
+            const azureDeploymentName = embeddingConfig.azure?.deployment_name || process.env.AZURE_OPENAI_DEPLOYMENT_NAME || 'text-embedding-3-large';
+            const azureApiVersion = embeddingConfig.azure?.api_version || process.env.AZURE_OPENAI_API_VERSION || '2024-10-21';
+            
+            if (!azureApiKey || !azureEndpoint) {
+                this.logger.error('Azure OpenAI requires api_key and endpoint to be configured');
+                process.exit(1);
+            }
+            
+            this.openai = new AzureOpenAI({
+                apiKey: azureApiKey,
+                endpoint: azureEndpoint,
+                deployment: azureDeploymentName,
+                apiVersion: azureApiVersion,
+            });
+            this.embeddingModel = azureDeploymentName;
+            this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName}`);
+        } else {
+            const openaiApiKey = embeddingConfig.openai?.api_key || process.env.OPENAI_API_KEY;
+            const openaiModel = embeddingConfig.openai?.model || process.env.OPENAI_MODEL || 'text-embedding-3-large';
+            
+            if (!openaiApiKey) {
+                this.logger.error('OpenAI requires api_key to be configured');
+                process.exit(1);
+            }
+            
+            this.openai = new OpenAI({ apiKey: openaiApiKey });
+            this.embeddingModel = openaiModel;
+            this.logger.info(`Using OpenAI with model: ${openaiModel}`);
+        }
+        
         this.contentProcessor = new ContentProcessor(this.logger);
     }
 
@@ -1483,7 +1522,7 @@ export class Doc2Vec {
         try {
             logger.debug(`Creating embeddings for ${texts.length} texts`);
             const response = await this.openai.embeddings.create({
-                model: "text-embedding-3-large",
+                model: this.embeddingModel,
                 input: texts,
             });
             logger.debug(`Successfully created ${response.data.length} embeddings`);
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
     "name": "doc2vec",
-    "version": "2.1.0",
+    "version": "2.2.0",
     "type": "commonjs",
     "description": "",
     "main": "dist/doc2vec.js",
diff --git a/tests/doc2vec.test.ts b/tests/doc2vec.test.ts
@@ -194,6 +194,9 @@ describe('Doc2Vec class', () => {
         }) as any;
         process.exit = mockProcessExit as any;
 
+        // Provide a dummy API key so the constructor validation doesn't call process.exit
+        process.env.OPENAI_API_KEY = 'test-key-for-tests';
+
         // Ensure test config directory exists
         if (!fs.existsSync(testConfigDir)) {
             fs.mkdirSync(testConfigDir, { recursive: true });
diff --git a/types.ts b/types.ts
@@ -79,8 +79,23 @@ export interface QdrantDatabaseParams {
     collection_name?: string;
 }
 
+export interface EmbeddingConfig {
+    provider: 'openai' | 'azure';
+    openai?: {
+        api_key?: string;  // Can also use OPENAI_API_KEY env var
+        model?: string;    // Default: text-embedding-3-large
+    };
+    azure?: {
+        api_key?: string;        // Can also use AZURE_OPENAI_KEY env var
+        endpoint?: string;       // Can also use AZURE_OPENAI_ENDPOINT env var
+        deployment_name?: string; // Can also use AZURE_OPENAI_DEPLOYMENT_NAME env var
+        api_version?: string;    // Default: 2024-10-21
+    };
+}
+
 export interface Config {
     sources: SourceConfig[];
+    embedding?: EmbeddingConfig;  // Optional, defaults to OpenAI
 }
 
 export interface DocumentChunk {

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "doc2vec",`
`3`		`- "version": "2.1.0",`
	`3`	`+ "version": "2.2.0",`
`4`	`4`	`"type": "commonjs",`
`5`	`5`	`"description": "",`
`6`	`6`	`"main": "dist/doc2vec.js",`