-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
309 lines (253 loc) · 10.5 KB
/
Makefile
File metadata and controls
309 lines (253 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# Makefile for BGG predictive models
# `help` is the first rule in this file, so a bare `make` prints the usage text.

# Default settings
# NOTE(review): RAW_DIR is not referenced by any rule in this file — confirm
# whether it is still needed.
RAW_DIR := data/raw

# NOTE(review): `clean` and `all` are declared phony here, but no rule with
# either name is defined in this file.
.PHONY: help clean all
# Print the list of supported targets.
# Every name printed here corresponds to a rule that exists in this Makefile.
# Entries for targets that have no rule (years, predictions, embeddings_umap,
# streamlit-test) were dropped, and misspelled names were corrected to the
# rule names that actually exist. Keep this list in sync when targets change.
help: ## Show this help message
	@echo 'Usage:'
	@echo '  make help                      Show this help message'
	@echo '  make requirements              Install/update Python dependencies'
	@echo '  make format                    Format code using ruff'
	@echo '  make lint                      Lint code using ruff'
	@echo '  make fix                       Fix linting issues using ruff'
	@echo '  make test                      Run tests using pytest'
	@echo '  make data                      Fetch training data from BigQuery'
	@echo '  make models                    Train all model candidates'
	@echo '  make finalize                  Run the finalize pipeline step'
	@echo '  make register                  Register all models to scoring service'
	@echo '  make register-dry-run          Run model registration with --dry-run'
	@echo '  make register_embeddings       Register embeddings model to embeddings service'
	@echo '  make clean-experiments         Remove all experiment subfolders'
	@echo '  make clean-data                Remove data/predictions subfolders'
	@echo '  make evaluate                  Evaluate models over time using config.yaml'
	@echo '  make evaluate-dry-run          Show what evaluation would do without running'
	@echo '  make embeddings                Train all embedding models (pca, svd, autoencoder)'
	@echo '  make embeddings_pca            Train PCA embeddings'
	@echo '  make embeddings_svd            Train SVD embeddings'
	@echo '  make embeddings_autoencoder    Train Autoencoder embeddings (requires torch)'
	@echo '  make text_embeddings           Train text embeddings from descriptions (PMI+SVD)'
	@echo '  make register_text_embeddings  Register text embeddings model to GCS'
	@echo '  make dashboard                 Launch the main Streamlit app'
	@echo '  make experiments               Launch experiments dashboard'
	@echo '  make predictions_dashboard     Launch geek rating dashboard'
	@echo '  make unsupervised_dashboard    Launch unsupervised learning dashboard'
	@echo '  make upload-experiments        Upload experiments to Google Cloud Storage'
	@echo '  make download-experiments      Download experiments from Google Cloud Storage'
	@echo '  make setup-hooks               Point git at the .githooks directory'
	@echo '  make docker-training           Build and run training Docker image locally'
	@echo '  make docker-scoring            Build scoring Docker image'
	@echo '  make start-scoring             Start scoring service with credentials'
	@echo '  make stop-scoring              Stop scoring service'
	@echo '  make scoring-service           Run scoring against the local scoring service'
	@echo '  make scoring-service-upload    Run scoring against the local service and upload to BigQuery'
	@echo '  make streamlit-build           Build Streamlit Docker image'
	@echo '  make streamlit-run             Run Streamlit Docker container'
	@echo '  make streamlit-stop            Stop Streamlit container'
# requirements / developer tooling
# All five targets are commands, not files. `fix` and `test` were missing from
# the phony list, so a file or directory named `test` or `fix` in the working
# directory would have made make report "up to date" and skip the recipe.
.PHONY: requirements format lint fix test

# Install/update Python dependencies.
requirements:
	uv sync

# Format code using ruff.
format:
	uv run ruff format .

# Lint code using ruff (report only).
lint:
	uv run ruff check .

# Lint and apply ruff's automatic fixes.
fix:
	uv run ruff check . --fix

# Run the pytest suite under tests/.
test:
	uv run -m pytest tests/
## fetch training data from BigQuery
# Declared phony so the recipe always runs, even if something named `data`
# exists in the working directory.
.PHONY: data

# Module invoked once per model; scoped to this target only.
data: data_module := src.pipeline.data
data:
	uv run -m $(data_module) --model hurdle
	uv run -m $(data_module) --model complexity
# model types
# `?=` assigns only when the variable is not already set, so each value can be
# overridden from the environment or the command line (e.g. `make RIDGE=...`).
# NOTE(review): none of these variables is referenced by a rule in this file —
# confirm whether they are still needed.
LINEAR ?= linear
RIDGE ?= ridge
LOGISTIC ?= logistic
CATBOOST ?= catboost
LIGHTGBM ?= lightgbm
LIGHTGBM_LINEAR ?= lightgbm_linear
# set defaults
# NOTE(review): no default assignments follow this heading in the file.
## train all model candidates and predict geek rating
# `models` and the per-model aliases are commands, not files. Previously only
# `models` was phony, so a stray file named e.g. `rating` would have silently
# disabled that alias.
# NOTE(review): prerequisites are only guaranteed to run left to right in a
# serial build; if later models depend on earlier ones, avoid `make -j models`.
.PHONY: models hurdle complexity rating users_rated geek_rating
models: hurdle complexity rating users_rated geek_rating

# Per-model entry points: each alias chains the train_*/score_* steps it needs.
hurdle: train_hurdle
complexity: train_complexity score_complexity
rating: train_rating
users_rated: train_users_rated
geek_rating: train_geek_rating
## train individual models
# One train_<model> target per model. The part after `train_` is passed
# through unchanged as the --model argument.
train_targets := train_hurdle train_complexity train_rating train_users_rated train_geek_rating

# These are commands, not files; declare them phony so a stray file with the
# same name cannot make a target look up to date.
.PHONY: $(train_targets) score_hurdle score_complexity

# Static pattern rule: it is an explicit rule, so it still applies to phony
# targets. $* expands to the model name (hurdle, complexity, rating,
# users_rated, geek_rating).
$(train_targets): train_%:
	uv run -m src.pipeline.train --model $*

# hurdle model scoring
score_hurdle:
	uv run -m src.pipeline.score --model hurdle

# complexity scoring, run with --all-years
score_complexity:
	uv run -m src.pipeline.score --model complexity --all-years
## finalize
# Runs the src.pipeline.finalize module. Declared phony because the target is
# a command, not a file that the recipe produces.
.PHONY: finalize
finalize:
	uv run -m src.pipeline.finalize
# Evaluation over time; settings are taken from config.yaml.
.PHONY: evaluate
evaluate:
	uv run -m src.pipeline.evaluate

# Same entry point, invoked with --dry-run.
.PHONY: evaluate-dry-run
evaluate-dry-run: ## Show what evaluation would do without running
	uv run -m src.pipeline.evaluate --dry-run
## embeddings models (settings from config.yaml, data from BigQuery)
embedding_targets := embeddings_pca embeddings_svd embeddings_autoencoder
.PHONY: embeddings $(embedding_targets)

# Aggregate target: trains every algorithm listed above.
embeddings: $(embedding_targets)

# One rule for all algorithms; $* is the suffix after `embeddings_`
# (pca, svd or autoencoder) and is passed as --algorithm.
$(embedding_targets): embeddings_%:
	uv run -m src.models.embeddings.train --algorithm $*
## text embeddings (word embeddings from descriptions)
# `text_embeddings` is an alias for the single algorithm-specific target.
.PHONY: text_embeddings
text_embeddings: text_embeddings_pmi

# Train text embeddings with the pmi algorithm.
.PHONY: text_embeddings_pmi
text_embeddings_pmi:
	uv run -m src.models.text_embeddings.train --algorithm pmi
## text embeddings registration and scoring
# Experiment to register; override on the command line, e.g.
# `make register_text_embeddings TEXT_EMBEDDINGS_CANDIDATE=my-experiment`.
TEXT_EMBEDDINGS_CANDIDATE ?= text-embeddings

# Register the text embeddings experiment under a year-versioned name.
# CURRENT_YEAR is not assigned anywhere in this Makefile, so it must come from
# the command line or the environment. The first recipe line stops make with a
# clear message when it is empty, instead of registering "text-embeddings-v".
.PHONY: register_text_embeddings
register_text_embeddings:
	$(if $(strip $(CURRENT_YEAR)),,$(error CURRENT_YEAR is not set; run: make $@ CURRENT_YEAR=<year>))
	uv run -m text_embeddings_service.register_model \
		--experiment $(TEXT_EMBEDDINGS_CANDIDATE) \
		--name text-embeddings-v$(CURRENT_YEAR) \
		--description "Production (v$(CURRENT_YEAR)) text embeddings for game descriptions"
### register models (reads from config.yaml)
# Run the registration script.
.PHONY: register
register:
	uv run python register.py

# Same script, invoked with --dry-run.
.PHONY: register-dry-run
register-dry-run:
	uv run python register.py --dry-run
# Experiment to register; override on the command line, e.g.
# `make register_embeddings EMBEDDINGS_CANDIDATE=my-experiment`.
EMBEDDINGS_CANDIDATE ?= svd-embeddings

# Register the embeddings experiment under a year-versioned name.
# CURRENT_YEAR is not assigned anywhere in this Makefile, so it must come from
# the command line or the environment. The first recipe line stops make with a
# clear message when it is empty, instead of registering "embeddings-v".
.PHONY: register_embeddings
register_embeddings:
	$(if $(strip $(CURRENT_YEAR)),,$(error CURRENT_YEAR is not set; run: make $@ CURRENT_YEAR=<year>))
	uv run -m embeddings_service.register_model \
		--experiment $(EMBEDDINGS_CANDIDATE) \
		--name embeddings-v$(CURRENT_YEAR) \
		--description "Production (v$(CURRENT_YEAR)) SVD embeddings for game similarity"
## dashboards
# Every target here launches a Streamlit app and produces no file named after
# the target, so all of them are phony. Previously only `streamlit` and
# `dashboard` were declared.
.PHONY: streamlit dashboard experiments experiment_dashboard predictions_dashboard unsupervised_dashboard

# Main Streamlit app; `streamlit` and `dashboard` are two names for one recipe.
streamlit dashboard:
	uv run streamlit run src/streamlit/Home.py

## view experiments
# `experiment_dashboard` is an alias matching the script name.
experiments experiment_dashboard:
	uv run streamlit run src/monitor/experiment_dashboard.py

# dashboard to look at predicted geek rating
predictions_dashboard:
	uv run streamlit run src/monitor/predictions_dashboard.py

## view unsupervised learning results
unsupervised_dashboard:
	uv run streamlit run src/monitor/unsupervised_dashboard.py
# remove local experiments
# Interactive: lists how many subfolders of models/experiments/ would be
# removed and deletes them only when the answer is exactly `y`.
# Declared phony (it was not before) so the recipe always runs. The underscore
# spelling is accepted as an alias for the hyphenated name.
.PHONY: clean-experiments clean_experiments
clean-experiments clean_experiments:
	@uv run python -c "import shutil; from pathlib import Path; \
	p = Path('models/experiments'); \
	dirs = [d for d in p.iterdir() if d.is_dir()] if p.exists() else []; \
	print(f'This will delete {len(dirs)} subfolders in models/experiments/'); \
	confirm = input('Are you sure? (y/n) ') if dirs else 'n'; \
	[shutil.rmtree(d) for d in dirs] if confirm == 'y' else None; \
	print('Subfolders deleted.' if confirm == 'y' and dirs else 'Aborted.' if dirs else 'No subfolders found.')"
# remove local predictions
# Interactive: lists how many subfolders of data/predictions/ would be removed
# and deletes them only when the answer is exactly `y`.
# The phony declaration used to name `clean_predictions` while the rule was
# called `clean-data`, so the real target was never phony. Both names are now
# declared and both run the same recipe.
.PHONY: clean-data clean_predictions
clean-data clean_predictions:
	@uv run python -c "import shutil; from pathlib import Path; \
	p = Path('data/predictions'); \
	dirs = [d for d in p.iterdir() if d.is_dir()] if p.exists() else []; \
	print(f'This will delete {len(dirs)} subfolders in data/predictions/'); \
	confirm = input('Are you sure? (y/n) ') if dirs else 'n'; \
	[shutil.rmtree(d) for d in dirs] if confirm == 'y' else None; \
	print('Subfolders deleted.' if confirm == 'y' and dirs else 'Aborted.' if dirs else 'No subfolders found.')"
# upload experiments to Google Cloud Storage
# Use ENVIRONMENT=prod or ENVIRONMENT=dev to specify, or ENVIRONMENT=auto to detect from git branch
# When ENVIRONMENT is empty, no --environment flag is passed at all.
# The phony declaration used the underscore spelling while the rule used the
# hyphenated one, so the real target was never phony. Both spellings are now
# declared and both run the same recipe.
.PHONY: upload-experiments upload_experiments
upload-experiments upload_experiments:
	uv run -m src.utils.sync_experiments --create-bucket $(if $(ENVIRONMENT),--environment $(ENVIRONMENT),)
# download experiments from Google Cloud Storage
# When ENVIRONMENT is empty, no --environment flag is passed at all.
# The phony declaration used the underscore spelling while the rule used the
# hyphenated one, so the real target was never phony. Both spellings are now
# declared and both run the same recipe.
.PHONY: download-experiments download_experiments
download-experiments download_experiments:
	uv run -m src.utils.sync_experiments --download $(if $(ENVIRONMENT),--environment $(ENVIRONMENT),)
# Setup git hooks for automatic experiment syncing
# Sets core.hooksPath to .githooks for this repository, then prints a
# confirmation line.
.PHONY: setup-hooks
setup-hooks:
	git config core.hooksPath .githooks
	@printf '%s\n' "Git hooks configured to use .githooks directory"
# dockerfile training locally
# Builds the training image, then (only if the build succeeded) runs a short
# Python snippet in it that prints GCP_PROJECT_ID and GCS_BUCKET_NAME as seen
# inside the container, with variables loaded from .env.
.PHONY: docker-training docker-scoring scoring-service
docker-training:
	docker build -f docker/training.Dockerfile -t bgg-training:test . \
		&& docker run -it \
			--env-file .env \
			bgg-training:test python -c "import os; print('Environment Variables:'); print(f'GCP_PROJECT_ID: {os.getenv(\"GCP_PROJECT_ID\")}'); print(f'GCS_BUCKET_NAME: {os.getenv(\"GCS_BUCKET_NAME\")}')"
# Both targets are commands, not files; `start-scoring` was not declared phony.
.PHONY: docker-scoring start-scoring

# build the scoring service image
docker-scoring:
	docker build -f docker/scoring.Dockerfile -t bgg-scoring-service .

# run scoring service with credentials mounted
# Rebuilds the image first, removes any previous `bgg-scoring` container, then
# starts a detached one publishing container port 8080 on host port 8087.
# $(CURDIR) is make's own working directory. Unlike the inherited $(PWD), it
# stays correct under `make -C <dir>`. The mount argument is quoted so a
# checkout path containing spaces still works.
start-scoring: docker-scoring
	@docker rm -f bgg-scoring 2>/dev/null || true
	docker run -d \
		--name bgg-scoring \
		-p 8087:8080 \
		-v "$(CURDIR)/credentials:/app/credentials" \
		-e GOOGLE_APPLICATION_CREDENTIALS=/app/credentials/service-account-key.json \
		--env-file .env \
		bgg-scoring-service
# stop the scoring service container
# If a running container matches the name filter, stop and remove it.
# Otherwise report that none is running and try to remove a stopped container
# of the same name, ignoring the error if there is none.
# Declared phony (it was not before) so the recipe always runs.
.PHONY: stop-scoring
stop-scoring:
	@if docker ps -q --filter name=bgg-scoring | grep -q .; then \
		echo "Stopping scoring service container"; \
		docker stop bgg-scoring && docker rm bgg-scoring; \
	else \
		echo "No running scoring service container found"; \
		docker rm bgg-scoring 2>/dev/null || true; \
	fi
# run scoring service locally
# Both targets call scoring_service.score against http://localhost:8087.
# SCORE_START_YEAR and SCORE_END_YEAR are not assigned anywhere in this
# Makefile, so they must come from the command line or the environment.
.PHONY: scoring-service scoring-service-upload

# Expands to nothing when both year variables are set. Otherwise it stops make
# with a clear message instead of passing empty --start-year/--end-year values.
require_score_years = $(if $(and $(strip $(SCORE_START_YEAR)),$(strip $(SCORE_END_YEAR))),,$(error SCORE_START_YEAR and SCORE_END_YEAR must be set: make $@ SCORE_START_YEAR=<year> SCORE_END_YEAR=<year>))

# Score the requested year range with --download.
scoring-service:
	$(require_score_years)
	uv run -m scoring_service.score \
		--service-url http://localhost:8087 \
		--start-year $(SCORE_START_YEAR) \
		--end-year $(SCORE_END_YEAR) \
		--download

# Same as scoring-service, additionally passing --upload-to-bigquery.
scoring-service-upload:
	$(require_score_years)
	uv run -m scoring_service.score \
		--service-url http://localhost:8087 \
		--start-year $(SCORE_START_YEAR) \
		--end-year $(SCORE_END_YEAR) \
		--upload-to-bigquery \
		--download
# Streamlit targets
# Docker lifecycle for the Streamlit image: build, run detached, stop.
.PHONY: streamlit-build streamlit-run streamlit-stop

streamlit-build: ## Build Streamlit Docker image
	docker build -f docker/streamlit.Dockerfile -t bgg-streamlit:test .

# Starts a detached container named bgg-streamlit-container on port 8080.
streamlit-run: ## Run Streamlit Docker container
	docker run -d \
		-p 8080:8080 \
		--env-file .env \
		--name bgg-streamlit-container \
		bgg-streamlit:test
	@echo "Streamlit available at: http://localhost:8080"

# Looks up the running container by name; stops and removes it when found.
streamlit-stop: ## Stop Streamlit container
	@cid=$$(docker ps -q --filter name=bgg-streamlit-container); \
	if [ -z "$$cid" ]; then \
		echo "No running Streamlit container found"; \
	else \
		echo "Stopping Streamlit container: $$cid"; \
		docker stop $$cid && docker rm $$cid; \
	fi