-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03_format_dataset.py
More file actions
119 lines (89 loc) · 3.49 KB
/
03_format_dataset.py
File metadata and controls
119 lines (89 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Step 03: Format pairs into training JSONL for fine-tuning.
Reads pairs from data/pairs/, generates task descriptions using GPT-5-Nano,
formats as chat conversations, and splits into train/val JSONL files.
Requires OPENAI_API_KEY environment variable.
Usage:
python 03_format_dataset.py
"""
import asyncio
import json
import os
import random
import yaml
from openai import AsyncOpenAI
from tqdm import tqdm
SYSTEM_PROMPT = "You are a C++ developer that writes strictly MISRA C:2012 compliant code. All code you produce must pass cppcheck MISRA analysis with zero violations."
DESCRIBE_PROMPT = """Look at this C++ program and describe what it does in ONE sentence.
Focus on the algorithm/task, not the implementation details.
Start with a verb like "Reads", "Computes", "Finds", "Sorts", etc.
Return ONLY the one-sentence description, nothing else.
```cpp
{code}
```"""
async def describe_code(client, code: str, semaphore) -> str:
    """Return a one-sentence task description for *code* via the OpenAI API.

    Concurrency is bounded by *semaphore*. Any API or response-shape error
    falls back to a generic description rather than aborting the batch.
    """
    fallback = "Write a C++ program that reads input, processes it, and outputs the result"
    async with semaphore:
        try:
            completion = await client.chat.completions.create(
                model="gpt-5-mini",
                max_completion_tokens=1000,
                messages=[{"role": "user", "content": DESCRIBE_PROMPT.format(code=code)}],
            )
            raw = completion.choices[0].message.content or ""
        except Exception:
            # Best-effort: a generic description is better than losing the pair.
            return fallback
    return raw.strip().rstrip(".")
async def main():
    """Build train/val JSONL fine-tuning datasets from MISRA code pairs.

    Reads config.yaml for paths and the split ratio, generates one task
    description per pair (concurrently, via the OpenAI API), formats each
    pair as a chat conversation, then writes shuffled train/val splits.
    """
    with open("config.yaml") as f:
        config = yaml.safe_load(f)
    fmt = config["formatting"]
    pairs_dir = config["paths"]["code_pairs"]
    output_dir = fmt["output_dir"]
    train_split = fmt["train_split"]
    os.makedirs(output_dir, exist_ok=True)

    pair_files = sorted(
        f for f in os.listdir(pairs_dir)
        if f.endswith(".json") and not f.startswith(".")
    )
    if not pair_files:
        print("No pairs found.")
        return

    print(f"Generating task descriptions for {len(pair_files)} pairs...\n")
    # Client is only needed once we know there is work to do.
    descriptions = await _describe_all(AsyncOpenAI(), pairs_dir, pair_files)

    examples = _build_examples(descriptions)
    random.seed(42)  # deterministic shuffle so reruns produce the same split
    random.shuffle(examples)
    split_idx = int(len(examples) * train_split)
    train, val = examples[:split_idx], examples[split_idx:]

    train_path = os.path.join(output_dir, "train.jsonl")
    val_path = os.path.join(output_dir, "val.jsonl")
    _write_jsonl(train_path, train)
    _write_jsonl(val_path, val)

    print("\nDone.")
    print(f"  Total examples: {len(examples)}")
    print(f"  Train: {len(train)} -> {train_path}")
    print(f"  Val:   {len(val)} -> {val_path}")


async def _describe_all(client, pairs_dir, pair_files):
    """Load each pair file and describe it concurrently.

    Returns {filename: (pair_data, description)}. The progress bar is closed
    even if a task raises (e.g. a corrupt JSON file).
    """
    semaphore = asyncio.Semaphore(30)  # cap concurrent API requests
    descriptions = {}
    progress = tqdm(total=len(pair_files), desc="Describing")

    async def describe_one(filename):
        with open(os.path.join(pairs_dir, filename)) as fh:
            data = json.load(fh)
        desc = await describe_code(client, data["original"], semaphore)
        descriptions[filename] = (data, desc)
        progress.update(1)

    try:
        await asyncio.gather(*(describe_one(f) for f in pair_files))
    finally:
        progress.close()
    return descriptions


def _build_examples(descriptions):
    """Format each (data, desc) pair as a chat conversation for fine-tuning."""
    return [
        {
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": desc},
                {"role": "assistant", "content": data["fixed"]},
            ]
        }
        for data, desc in descriptions.values()
    ]


def _write_jsonl(path, examples):
    """Write one JSON object per line to *path*."""
    with open(path, "w") as fh:
        for ex in examples:
            fh.write(json.dumps(ex) + "\n")
if __name__ == "__main__":
asyncio.run(main())