# SQL-LLM-Distillation-GRPO/main.py (branch: main) · prodesk98/SQL-LLM-Distillation-GRPO · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import argparse

from logging_ import setup_logger
setup_logger()
from loguru import logger


# Root command-line parser. `prog` is set explicitly so the usage/help text
# shows "uv run main.py" instead of the script name argparse would derive
# from sys.argv[0].
parser = argparse.ArgumentParser(
    prog="uv run main.py",
    epilog="Developed by Protons · GitHub: https://github.com/prodesk98",
    # The three sentences are joined with single spaces into one paragraph.
    description=" ".join(
        [
            "A modular framework for SQL data distillation with reasoning using Large Language Models (LLMs).",
            "Supports multiple backends (OpenAI, vLLM, Groq, Hugging Face) and integrates Chain of Thought prompting",
            "with online reinforcement learning (GRPO) for high-quality SQL generation.",
        ]
    ),
)

# Container for the subcommands. The selected subcommand name is stored on the
# parsed namespace as `command`; required=True makes argparse reject an
# invocation that names no subcommand.
subparsers = parser.add_subparsers(
    help="Subcommands like 'train', 'distill'",
    required=True,
    dest="command",
)

# Train subparser: options accepted by the `train` subcommand.
parser_train = subparsers.add_parser(
    "train",
    help="Train a model using the specified parameters.",
)
parser_train.add_argument(
    "--model",
    type=str,
    default="unsloth/gemma-3-1b-it",
    help="Model name for training. Default is 'unsloth/gemma-3-1b-it'.",
)
parser_train.add_argument(
    "--dataset-repo-id",
    type=str,
    default="gretelai/synthetic_text_to_sql",
    help="Path to the training dataset. Default is 'gretelai/synthetic_text_to_sql'.",
)
# The `train` subcommand declares no --publish flag, so the help text must not
# describe this option as conditional on one. The value is handed to the
# trainer as `publish_repo_id`.
parser_train.add_argument(
    "--publish-repo-id",
    type=str,
    default=None,
    help="Repository ID used when publishing the trained model. Default is None.",
)
#

# Distill subparser: options accepted by the `distill` subcommand.
parser_distill = subparsers.add_parser(
    "distill",
    help="Distill a model using the specified parameters.",
)
# No default model: the dispatch code rejects a missing --model unless the
# provider is 'HuggingFace', so the help text states that rule instead of
# advertising a default of None.
parser_distill.add_argument(
    "--model",
    type=str,
    default=None,
    help="Model name for distillation. Required unless --provider is 'HuggingFace'.",
)
parser_distill.add_argument(
    "--publish",
    action="store_true",
    help="Publish the distilled model to Hugging Face Hub. Default is False.",
)
parser_distill.add_argument(
    "--publish-repo-id",
    type=str,
    help="Path to the published repository. This is a required argument if --publish is set.",
)
parser_distill.add_argument(
    "--private-repo",
    action="store_true",
    help="Create a private repository on Hugging Face Hub. Default is False.",
)
parser_distill.add_argument(
    "--dataset-repo-id",
    type=str,
    default="gretelai/synthetic_text_to_sql",
    help="Path to the distillation dataset.",
)
parser_distill.add_argument(
    "--limit",
    type=int,
    default=100,
    help="Limit the number of samples to distill. Default is 100.",
)
# `choices` makes argparse reject any other spelling (matching is
# case-sensitive).
parser_distill.add_argument(
    "--provider",
    type=str,
    default="OpenAI",
    choices=["OpenAI", "vLLM", "Groq", "HuggingFace"],
    help="Provider for the distillation process. Default is 'OpenAI'.",
)
parser_distill.add_argument(
    "--validate",
    action="store_true",
    help="Validate the SQL query. Default is False.",
)
# NOTE(review): parsed here, but the dispatch code in this file does not pass
# `remove_no_valid` on to DistillControl — confirm whether it should.
parser_distill.add_argument(
    "--remove-no-valid",
    action="store_true",
    help="Remove invalid SQL queries from the dataset. Default is False.",
)
parser_distill.add_argument(
    "--batch-size",
    type=int,
    default=8,
    help="Batch size for the distillation process. Default is 8.",
)
# NOTE(review): parsed here, but the dispatch code in this file does not pass
# `retries` on to DistillControl — confirm whether it should.
parser_distill.add_argument(
    "--retries",
    type=int,
    default=3,
    help="Number of retries for the distillation process. Default is 3.",
)
parser_distill.add_argument(
    "--use-ray",
    action="store_true",
    help="Use Ray for distributed processing. Default is False.",
)
#

# Parse the command line. argparse prints a usage message and exits on invalid
# input, including a missing subcommand.
args = parser.parse_args()

if args.command == "train":
    # The train subparser defines no --provider option, so only `args.model`
    # can be checked here; reading `args.provider` in this branch would raise
    # AttributeError instead of the intended ValueError.
    if args.model is None:
        raise ValueError("model must be provided")
    if args.dataset_repo_id is None:
        raise ValueError("dataset_repo_id must be provided")

    logger.info(
        f"Training model: {args.model} with dataset: {args.dataset_repo_id}"
    )

    # Imported inside the branch so the trainer module is only loaded when the
    # `train` subcommand is actually selected.
    from control.trainer_control import TrainerControl
    trainer = TrainerControl(
        args.model,
        dataset_repo_id=args.dataset_repo_id,
        use_vllm=True,
        load_in_4bit=True,
        publish_repo_id=args.publish_repo_id,
    )
    trainer.train()
    # NOTE(review): publish() runs unconditionally, even when
    # --publish-repo-id was omitted (None) — confirm TrainerControl.publish()
    # handles that case.
    trainer.publish()
elif args.command == "distill":
    # Validate option combinations before importing or constructing anything.
    if args.publish and args.publish_repo_id is None:
        raise ValueError("publish_repo_id must be provided when publish is True")
    if args.model is None and args.provider != "HuggingFace":
        raise ValueError("model must be provided when provider is not HuggingFace")

    logger.info(
        f"Distilling model: [{args.provider}] {args.model} with dataset: {args.dataset_repo_id} with {args.limit} samples."
    )

    # Imported inside the branch so the distillation module is only loaded
    # when the `distill` subcommand is actually selected.
    from control.distill_control import DistillControl
    # NOTE(review): args.retries and args.remove_no_valid are parsed but not
    # forwarded here — confirm whether DistillControl accepts them.
    distill = DistillControl(
        model=args.model,
        dataset_repo_id=args.dataset_repo_id,
        publish_repo_id=args.publish_repo_id,
        publish=args.publish,
        limit=args.limit,
        provider=args.provider,
        validate=args.validate,
        private_repo=args.private_repo,
        use_ray=args.use_ray,
        batch_size=args.batch_size,
    )
    distill.run()
else:
    # Safeguard only: argparse already rejects unknown subcommand names, so
    # this branch is reached only if a new subcommand is registered without a
    # matching dispatch branch above.
    parser.print_help()
    raise ValueError("Invalid command. Use 'train' or 'distill'.")