-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03_format_dataset.py
More file actions
119 lines (89 loc) · 3.49 KB
/
03_format_dataset.py
File metadata and controls
119 lines (89 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Step 03: Format pairs into training JSONL for fine-tuning.
Reads pairs from data/pairs/, generates task descriptions using GPT-5-Nano,
formats as chat conversations, and splits into train/val JSONL files.
Requires OPENAI_API_KEY environment variable.
Usage:
python 03_format_dataset.py
"""
import asyncio
import json
import os
import random
import yaml
from openai import AsyncOpenAI
from tqdm import tqdm
SYSTEM_PROMPT = "You are a C++ developer that writes strictly MISRA C:2012 compliant code. All code you produce must pass cppcheck MISRA analysis with zero violations."
DESCRIBE_PROMPT = """Look at this C++ program and describe what it does in ONE sentence.
Focus on the algorithm/task, not the implementation details.
Start with a verb like "Reads", "Computes", "Finds", "Sorts", etc.
Return ONLY the one-sentence description, nothing else.
```cpp
{code}
```"""
async def describe_code(client, code: str, semaphore) -> str:
    """Return a one-sentence task description for *code* via the OpenAI API.

    Concurrency is bounded by *semaphore*. Any API or response-shape error
    falls back to a generic description rather than aborting the batch.
    """
    fallback = "Write a C++ program that reads input, processes it, and outputs the result"
    async with semaphore:
        try:
            completion = await client.chat.completions.create(
                model="gpt-5-mini",
                max_completion_tokens=1000,
                messages=[{"role": "user", "content": DESCRIBE_PROMPT.format(code=code)}],
            )
            raw = completion.choices[0].message.content or ""
        except Exception:
            # Best-effort: a generic description is better than losing the pair.
            return fallback
    return raw.strip().rstrip(".")
async def main():
    """Build train/val JSONL fine-tuning datasets from MISRA code pairs.

    Reads config.yaml for paths and the split ratio, generates one task
    description per pair (concurrently, via the OpenAI API), formats each
    pair as a chat conversation, then writes shuffled train/val splits.
    """
    with open("config.yaml") as f:
        config = yaml.safe_load(f)
    fmt = config["formatting"]
    pairs_dir = config["paths"]["code_pairs"]
    output_dir = fmt["output_dir"]
    train_split = fmt["train_split"]
    os.makedirs(output_dir, exist_ok=True)

    pair_files = sorted(
        f for f in os.listdir(pairs_dir)
        if f.endswith(".json") and not f.startswith(".")
    )
    if not pair_files:
        print("No pairs found.")
        return

    print(f"Generating task descriptions for {len(pair_files)} pairs...\n")
    # Client is only needed once we know there is work to do.
    descriptions = await _describe_all(AsyncOpenAI(), pairs_dir, pair_files)

    examples = _build_examples(descriptions)
    random.seed(42)  # deterministic shuffle so reruns produce the same split
    random.shuffle(examples)
    split_idx = int(len(examples) * train_split)
    train, val = examples[:split_idx], examples[split_idx:]

    train_path = os.path.join(output_dir, "train.jsonl")
    val_path = os.path.join(output_dir, "val.jsonl")
    _write_jsonl(train_path, train)
    _write_jsonl(val_path, val)

    print("\nDone.")
    print(f"  Total examples: {len(examples)}")
    print(f"  Train: {len(train)} -> {train_path}")
    print(f"  Val:   {len(val)} -> {val_path}")


async def _describe_all(client, pairs_dir, pair_files):
    """Load each pair file and describe it concurrently.

    Returns {filename: (pair_data, description)}. The progress bar is closed
    even if a task raises (e.g. a corrupt JSON file).
    """
    semaphore = asyncio.Semaphore(30)  # cap concurrent API requests
    descriptions = {}
    progress = tqdm(total=len(pair_files), desc="Describing")

    async def describe_one(filename):
        with open(os.path.join(pairs_dir, filename)) as fh:
            data = json.load(fh)
        desc = await describe_code(client, data["original"], semaphore)
        descriptions[filename] = (data, desc)
        progress.update(1)

    try:
        await asyncio.gather(*(describe_one(f) for f in pair_files))
    finally:
        progress.close()
    return descriptions


def _build_examples(descriptions):
    """Format each (data, desc) pair as a chat conversation for fine-tuning."""
    return [
        {
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": desc},
                {"role": "assistant", "content": data["fixed"]},
            ]
        }
        for data, desc in descriptions.values()
    ]


def _write_jsonl(path, examples):
    """Write one JSON object per line to *path*."""
    with open(path, "w") as fh:
        for ex in examples:
            fh.write(json.dumps(ex) + "\n")
if __name__ == "__main__":
asyncio.run(main())