-
Notifications
You must be signed in to change notification settings - Fork 4
cmd/seed: synthetic dataset for local development #70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ZukwiZ
wants to merge
1
commit into
master
Choose a base branch
from
feat/dev-seed-synthetic
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+258
−0
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,3 +12,6 @@ static/*.db-wal | |
|
|
||
| .secrets | ||
| .env | ||
|
|
||
| internal-docs/ | ||
| internal/devseed/fixtures/ | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| // Command seed loads a deterministic synthetic dataset into the local | ||
| // Reverse Watch Postgres databases for local dashboard development. | ||
| // It is NEVER intended to run in production and refuses to run unless | ||
| // Config.Environment is "development". | ||
| // | ||
| // go run ./cmd/seed | ||
| // | ||
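| // The insert uses ON CONFLICT DO NOTHING, so re-running never fails on | |
| // rows that already exist. Generated IDs embed timestamps derived from | |
| // the current time, so a later run may still insert additional rows. | |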
| package main | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "time" | ||
|
|
||
| "reverse-watch/config" | ||
| "reverse-watch/domain/models" | ||
| "reverse-watch/domain/models/constants" | ||
| "reverse-watch/internal/devseed" | ||
| "reverse-watch/logging" | ||
| "reverse-watch/repository/factory" | ||
| "reverse-watch/secret" | ||
| ) | ||
|
|
||
| func main() { | ||
| logging.Initialize() | ||
| cfg := config.Load() | ||
|
|
||
| if cfg.Environment != constants.EnvironmentDevelopment { | ||
| fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment) | ||
| os.Exit(1) | ||
| } | ||
|
|
||
| // Required by factory bootstrap (e.g. admin API key seeding). The | ||
| // synthetic generator pre-populates its own IDs, so the snowflake | ||
| // generator does not actually run for them. | ||
| models.InitSnowflakeGenerator(0, 0) | ||
|
|
||
| keygen := secret.NewKeyGenerator(cfg.Environment) | ||
| f, err := factory.NewFactory(cfg, keygen) | ||
| if err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err) | ||
| os.Exit(1) | ||
| } | ||
| defer func() { | ||
| if err := f.Close(); err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err) | ||
| } | ||
| }() | ||
|
|
||
| reversals := devseed.GenerateSynthetic(time.Now().UTC()) | ||
| fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals)) | ||
|
|
||
| inserted, err := devseed.InsertReversals(f.PublicDB(), reversals) | ||
| if err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err) | ||
| os.Exit(1) | ||
| } | ||
| skipped := int64(len(reversals)) - inserted | ||
| fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,179 @@ | ||
| // Package devseed loads dev-only fixture data into the local Postgres | ||
| // instance. It is intentionally not wired into the main binary — call it | ||
| // from cmd/seed (or a test) when you need realistic data locally. | ||
| package devseed | ||
|
|
||
| import ( | ||
| "math" | ||
| "math/rand" | ||
| "time" | ||
|
|
||
| "reverse-watch/domain/models" | ||
|
|
||
| "gorm.io/gorm" | ||
| "gorm.io/gorm/clause" | ||
| ) | ||
|
|
||
// Parameters that shape the dataset produced by GenerateSynthetic.
const (
	// syntheticRNGSeed is the fixed seed handed to the random source.
	syntheticRNGSeed int64 = 42

	// syntheticDays is the number of consecutive days covered, ending on
	// the day of the reference time passed to GenerateSynthetic.
	syntheticDays = 180

	// syntheticTargetTotal is the row count the per-day counts are rescaled
	// towards; rounding means the final total is only approximately this.
	syntheticTargetTotal = 9800

	// syntheticBaseSteamID is the value synthetic Steam IDs are offset from.
	syntheticBaseSteamID uint64 = 76_561_198_000_000_000

	// syntheticBaseReporter is the value synthetic reporter IDs are offset
	// from.
	syntheticBaseReporter uint = 2_900_000
)
|
|
||
// syntheticMarketplaces lists the marketplace slugs assigned to generated
// rows together with the relative probability of each one being picked.
// The weights are consumed cumulatively, in slice order, by pickMarketplace.
var syntheticMarketplaces = []struct {
	slug   string
	weight float64
}{
	{slug: "csfloat", weight: 0.80},
	{slug: "tradeit", weight: 0.10},
	{slug: "skinport", weight: 0.05},
	{slug: "swap.gg", weight: 0.05},
}
|
|
||
| // GenerateSynthetic returns a deterministic ~6-month dataset (~9,800 rows, | ||
| // at least one per day, gentle sinusoid with occasional spikes / quiet | ||
| // days). Snowflake IDs are unique within the slice and won't collide with | ||
| // real CSV-seeded IDs, so callers can pipe the result straight into | ||
| // InsertReversals. | ||
| func GenerateSynthetic(now time.Time) []*models.Reversal { | ||
| rng := rand.New(rand.NewSource(syntheticRNGSeed)) | ||
| nowMs := uint64(now.UnixMilli()) | ||
| today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) | ||
|
|
||
| counts := make([]int, syntheticDays) | ||
| for d := 0; d < syntheticDays; d++ { | ||
| base := 40.0 + 10.0*math.Sin(float64(d)/30.0) | ||
| var mult float64 | ||
| switch r := rng.Float64(); { | ||
| case r < 0.05: | ||
| mult = 2.5 + rng.Float64()*2.5 | ||
| case r < 0.15: | ||
| mult = 0.2 + rng.Float64()*0.3 | ||
| default: | ||
| mult = 0.7 + rng.Float64()*0.6 | ||
| } | ||
| counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5)) | ||
| } | ||
|
|
||
| total := 0 | ||
| for _, c := range counts { | ||
| total += c | ||
| } | ||
| if total > 0 { | ||
| scale := float64(syntheticTargetTotal) / float64(total) | ||
| for d := range counts { | ||
| counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale))) | ||
| } | ||
| } | ||
|
|
||
| rows := make([]*models.Reversal, 0, syntheticTargetTotal+200) | ||
| var steamOffset uint64 = 1 | ||
| var seq uint16 | ||
|
|
||
| for d := 0; d < syntheticDays; d++ { | ||
| dayStart := today.AddDate(0, 0, -(syntheticDays-1-d)) | ||
| for i := 0; i < counts[d]; i++ { | ||
| reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour))) | ||
| if uint64(reversedAt.UnixMilli()) > nowMs { | ||
| reversedAt = now.Add(-1 * time.Minute) | ||
| } | ||
| reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute | ||
| createdAt := reversedAt.Add(reportDelay) | ||
| if uint64(createdAt.UnixMilli()) > nowMs { | ||
| createdAt = now | ||
| } | ||
|
|
||
| srcRoll := rng.Float64() | ||
| var src models.Source | ||
| var related *models.SteamID | ||
| switch { | ||
| case srcRoll < 0.90: | ||
| src = models.SourceDirect | ||
| case srcRoll < 0.95: | ||
| src = models.SourceRelatedUser | ||
| relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97) | ||
| related = &relID | ||
| default: | ||
| src = models.SourceUserReport | ||
| } | ||
|
|
||
| var expunged *uint64 | ||
| if rng.Float64() < 0.015 { | ||
| eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute) | ||
| if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) { | ||
| ems := uint64(eAt.UnixMilli()) | ||
| expunged = &ems | ||
| } | ||
| } | ||
|
|
||
| steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50))) | ||
| steamOffset++ | ||
|
|
||
| // Snowflake encodes created_at + a 12-bit per-ms sequence; | ||
| // mirrors domain/models/snowflake.go so generated IDs sort | ||
| // chronologically alongside production rows. | ||
| seq = (seq + 1) & 0x0FFF | ||
| sfTs := uint64(createdAt.UnixMilli()) - models.Epoch | ||
| sf := models.Snowflake((sfTs << 22) | uint64(seq)) | ||
|
|
||
| reporter := syntheticBaseReporter + uint(steamOffset) | ||
| mp := pickMarketplace(rng) | ||
|
|
||
| rows = append(rows, &models.Reversal{ | ||
| Model: models.Model{ | ||
| ID: sf, | ||
| CreatedAt: uint64(createdAt.UnixMilli()), | ||
| UpdatedAt: uint64(createdAt.UnixMilli()), | ||
| }, | ||
| SteamID: steamID, | ||
| MarketplaceSlug: mp, | ||
| Source: &src, | ||
| RelatedSteamID: related, | ||
| ReversedAt: uint64(reversedAt.UnixMilli()), | ||
| ReporterInternalID: &reporter, | ||
| ExpungedAt: expunged, | ||
| }) | ||
| } | ||
| } | ||
| return rows | ||
| } | ||
|
|
||
| func pickMarketplace(rng *rand.Rand) string { | ||
| r := rng.Float64() | ||
| cum := 0.0 | ||
| for _, mp := range syntheticMarketplaces { | ||
| cum += mp.weight | ||
| if r < cum { | ||
| return mp.slug | ||
| } | ||
| } | ||
| return syntheticMarketplaces[0].slug | ||
| } | ||
|
|
||
| // insertChunkSize keeps each bulk insert under Postgres's 65,535 | ||
| // parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows | ||
| // uses ~11k parameters. | ||
| const insertChunkSize = 1000 | ||
|
|
||
| // InsertReversals bulk-inserts reversals with ON CONFLICT (id) DO NOTHING, | ||
| // so the seed is idempotent. Returns the number of rows actually inserted. | ||
| func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { | ||
| if len(reversals) == 0 { | ||
| return 0, nil | ||
| } | ||
| var inserted int64 | ||
| for i := 0; i < len(reversals); i += insertChunkSize { | ||
| end := i + insertChunkSize | ||
| if end > len(reversals) { | ||
| end = len(reversals) | ||
| } | ||
| res := db.Clauses(clause.OnConflict{DoNothing: true}).Create(reversals[i:end]) | ||
| if res.Error != nil { | ||
| return inserted, res.Error | ||
| } | ||
| inserted += res.RowsAffected | ||
| } | ||
| return inserted, nil | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seed not idempotent due to time-dependent snowflake IDs
Medium Severity
GenerateSynthetic receives time.Now().UTC(), and all generated snowflake IDs embed createdAt timestamps derived from that value. Since today shifts daily and the nowMs cap changes every millisecond, re-running the seed at a different time produces entirely different snowflake IDs. Because ON CONFLICT (id) DO NOTHING keys on these IDs, a second run on a different day inserts ~9.8k additional rows instead of being a no-op, contradicting the documented idempotency guarantee. Anchoring to a fixed reference time instead of time.Now() would make the output truly deterministic.
Additional Locations (2)
internal/devseed/synthetic.go#L42-L43
internal/devseed/synthetic.go#L117-L119
Reviewed by Cursor Bugbot for commit b067b6b. Configure here.