Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ static/*.db-wal

.secrets
.env

internal-docs/
internal/devseed/fixtures/
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ If you're looking to participate by contributing reversal reports (i.e. marketpl

The server starts on port `80` by default (configurable via `HTTP_PORT`).

### Seeding local data

Production ingests live data from contributing marketplaces. For local development, a deterministic synthetic dataset (~6 months, ~9.8k rows with realistic daily variance) can be loaded with:

```bash
go run ./cmd/seed
```

The seed:

- Refuses to run unless `Environment` is `development`.
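- Uses `INSERT … ON CONFLICT (id) DO NOTHING`, so rows whose IDs already exist are skipped instead of duplicated. Generated IDs embed timestamps relative to the run time, so a later re-run can still add new rows.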
- Generates a deterministic 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y).
- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake.

## Configuration

Configuration is loaded from environment variables or a `config.json` file.
Expand Down
61 changes: 61 additions & 0 deletions cmd/seed/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Command seed loads a deterministic synthetic dataset into the local
// Reverse Watch Postgres databases for local dashboard development.
// It is NEVER intended to run in production and refuses to run unless
// Config.Environment is "development".
//
// go run ./cmd/seed
//
// The insert uses ON CONFLICT (id) DO NOTHING, so re-running is safe.
package main

import (
"fmt"
"os"
"time"

"reverse-watch/config"
"reverse-watch/domain/models"
"reverse-watch/domain/models/constants"
"reverse-watch/internal/devseed"
"reverse-watch/logging"
"reverse-watch/repository/factory"
"reverse-watch/secret"
)

func main() {
logging.Initialize()
cfg := config.Load()

if cfg.Environment != constants.EnvironmentDevelopment {
fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment)
os.Exit(1)
}

// Required by factory bootstrap (e.g. admin API key seeding). The
// synthetic generator pre-populates its own IDs, so the snowflake
// generator does not actually run for them.
models.InitSnowflakeGenerator(0, 0)

keygen := secret.NewKeyGenerator(cfg.Environment)
f, err := factory.NewFactory(cfg, keygen)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err)
os.Exit(1)
}
defer func() {
if err := f.Close(); err != nil {
fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err)
}
}()

reversals := devseed.GenerateSynthetic(time.Now().UTC())
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seed not idempotent due to time-dependent snowflake IDs

Medium Severity

GenerateSynthetic receives time.Now().UTC(), and all generated snowflake IDs embed createdAt timestamps derived from that value. Since today shifts daily and the nowMs cap changes every millisecond, re-running the seed at a different time produces entirely different snowflake IDs. Because ON CONFLICT (id) DO NOTHING keys on these IDs, a second run on a different day inserts ~9.8k additional rows instead of being a no-op, contradicting the documented idempotency guarantee. Anchoring to a fixed reference time instead of time.Now() would make the output truly deterministic.

Additional Locations (2)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit b067b6b. Configure here.

fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals))

inserted, err := devseed.InsertReversals(f.PublicDB(), reversals)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err)
os.Exit(1)
}
skipped := int64(len(reversals)) - inserted
fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped)
}
179 changes: 179 additions & 0 deletions internal/devseed/synthetic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Package devseed loads dev-only fixture data into the local Postgres
// instance. It is intentionally not wired into the main binary — call it
// from cmd/seed (or a test) when you need realistic data locally.
package devseed

import (
"math"
"math/rand"
"time"

"reverse-watch/domain/models"

"gorm.io/gorm"
"gorm.io/gorm/clause"
)

// Tunables for the synthetic dataset produced by GenerateSynthetic.
const (
	// syntheticRNGSeed is the fixed seed for the generator's math/rand source.
	syntheticRNGSeed int64 = 42
	// syntheticDays is the number of consecutive days generated, ending on
	// the day of the time passed to GenerateSynthetic.
	syntheticDays = 180
	// syntheticTargetTotal is the row count the raw per-day counts are
	// rescaled towards.
	syntheticTargetTotal = 9800
	// syntheticBaseSteamID is the base value generated Steam IDs are offset
	// from.
	syntheticBaseSteamID uint64 = 76561198000000000
	// syntheticBaseReporter is the base value generated reporter internal
	// IDs are offset from.
	syntheticBaseReporter uint = 2_900_000
)

// syntheticMarketplaces lists the marketplace slugs assigned to generated
// rows together with a weight for each. pickMarketplace accumulates the
// weights in order and compares a uniform [0, 1) draw against the running
// sum, so the weights act as probabilities and are expected to sum to 1.
var syntheticMarketplaces = []struct {
	// slug is the value stored in Reversal.MarketplaceSlug.
	slug string
	// weight is the share of rows that should receive this slug.
	weight float64
}{
	{"csfloat", 0.80},
	{"tradeit", 0.10},
	{"skinport", 0.05},
	{"swap.gg", 0.05},
}

// GenerateSynthetic returns a ~6-month synthetic dataset (~9,800 rows, at
// least one per day, gentle sinusoid with occasional spikes / quiet days)
// covering the syntheticDays days that end on the UTC day of now.
//
// The output is a pure function of now: the RNG is seeded with a constant,
// but every timestamp is placed relative to now's UTC day and clamped so
// that nothing lies after now. Two calls with the same now return identical
// rows (including IDs); calls with different now values generally do not,
// because the snowflake IDs embed created_at.
//
// Each ID is (created_at ms - models.Epoch) << 22 combined with a rolling
// 12-bit sequence number.
// NOTE(review): two rows would share an ID only if they had the same
// created_at millisecond and were a multiple of 4096 rows apart; whether the
// IDs can collide with rows from other sources is not checked here.
func GenerateSynthetic(now time.Time) []*models.Reversal {
	// Normalize first: the day boundary below is built from Year/Month/Day,
	// which are reported in now's own location. Without this, a non-UTC now
	// could put "today" a day ahead of or behind the actual UTC date.
	now = now.UTC()

	// NOTE: the order of rng draws below is part of the output. Reordering
	// or adding draws changes every generated row.
	rng := rand.New(rand.NewSource(syntheticRNGSeed))
	nowMs := uint64(now.UnixMilli())
	today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC)

	// Raw per-day counts: a slow sinusoid around 40, multiplied by a spike
	// (5% of days), quiet (10%) or normal factor, plus Gaussian noise, with
	// a floor of one row per day.
	counts := make([]int, syntheticDays)
	for d := 0; d < syntheticDays; d++ {
		base := 40.0 + 10.0*math.Sin(float64(d)/30.0)
		var mult float64
		switch r := rng.Float64(); {
		case r < 0.05:
			mult = 2.5 + rng.Float64()*2.5
		case r < 0.15:
			mult = 0.2 + rng.Float64()*0.3
		default:
			mult = 0.7 + rng.Float64()*0.6
		}
		counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5))
	}

	// Rescale so the overall total lands near syntheticTargetTotal while
	// keeping at least one row on every day.
	total := 0
	for _, c := range counts {
		total += c
	}
	if total > 0 {
		scale := float64(syntheticTargetTotal) / float64(total)
		for d := range counts {
			counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale)))
		}
	}

	rows := make([]*models.Reversal, 0, syntheticTargetTotal+200)
	var steamOffset uint64 = 1
	var seq uint16

	for d := 0; d < syntheticDays; d++ {
		// d == syntheticDays-1 is today; earlier indices walk back in time.
		dayStart := today.AddDate(0, 0, -(syntheticDays - 1 - d))
		for i := 0; i < counts[d]; i++ {
			// Uniform time within the day, clamped so it is never after now.
			reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour)))
			if uint64(reversedAt.UnixMilli()) > nowMs {
				reversedAt = now.Add(-1 * time.Minute)
			}
			// The report arrives 1 minute to 8 hours after the reversal, but
			// never after now.
			reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute
			createdAt := reversedAt.Add(reportDelay)
			if uint64(createdAt.UnixMilli()) > nowMs {
				createdAt = now
			}
			createdMs := uint64(createdAt.UnixMilli())

			// 90% direct, 5% related-user (with a related Steam ID), 5% user
			// report.
			srcRoll := rng.Float64()
			var src models.Source
			var related *models.SteamID
			switch {
			case srcRoll < 0.90:
				src = models.SourceDirect
			case srcRoll < 0.95:
				src = models.SourceRelatedUser
				relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97)
				related = &relID
			default:
				src = models.SourceUserReport
			}

			// About 1.5% of rows are expunged 1 minute to 24 hours after
			// creation, provided that moment is strictly before now. The
			// delay is at least one minute, so the expunge time is always
			// after createdAt.
			var expunged *uint64
			if rng.Float64() < 0.015 {
				eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute)
				if ems := uint64(eAt.UnixMilli()); ems < nowMs {
					expunged = &ems
				}
			}

			steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50)))
			steamOffset++

			// Snowflake encodes created_at + a 12-bit per-ms sequence;
			// mirrors domain/models/snowflake.go so generated IDs sort
			// chronologically alongside production rows.
			seq = (seq + 1) & 0x0FFF
			sfTs := createdMs - models.Epoch
			sf := models.Snowflake((sfTs << 22) | uint64(seq))

			// steamOffset has already been advanced, so the reporter ID uses
			// the incremented value.
			reporter := syntheticBaseReporter + uint(steamOffset)
			mp := pickMarketplace(rng)

			rows = append(rows, &models.Reversal{
				Model: models.Model{
					ID:        sf,
					CreatedAt: createdMs,
					UpdatedAt: createdMs,
				},
				SteamID:            steamID,
				MarketplaceSlug:    mp,
				Source:             &src,
				RelatedSteamID:     related,
				ReversedAt:         uint64(reversedAt.UnixMilli()),
				ReporterInternalID: &reporter,
				ExpungedAt:         expunged,
			})
		}
	}
	return rows
}

// pickMarketplace draws one marketplace slug at random, honoring the weights
// in syntheticMarketplaces. It consumes exactly one Float64 from rng and
// returns the first entry whose cumulative weight exceeds that draw; if no
// entry qualifies, the first marketplace is returned.
func pickMarketplace(rng *rand.Rand) string {
	roll := rng.Float64()
	var threshold float64
	for i := range syntheticMarketplaces {
		threshold += syntheticMarketplaces[i].weight
		if roll < threshold {
			return syntheticMarketplaces[i].slug
		}
	}
	return syntheticMarketplaces[0].slug
}

// insertChunkSize is the maximum number of rows InsertReversals sends in a
// single Create call. It keeps each bulk insert under Postgres's 65,535
// parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows
// uses ~11k parameters.
const insertChunkSize = 1000

// InsertReversals bulk-inserts reversals in chunks of insertChunkSize using
// ON CONFLICT (id) DO NOTHING, so rows whose ID already exists are skipped
// rather than duplicated.
//
// The conflict target is named explicitly: only a clash on the id column is
// ignored. A violation of any other unique constraint is returned as an
// error instead of being silently counted as an already-present row.
//
// It returns the number of rows actually inserted (the sum of RowsAffected
// across chunks). On error it returns the count inserted by the chunks that
// completed before the failure, together with that error. An empty or nil
// slice is a no-op.
func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) {
	if len(reversals) == 0 {
		return 0, nil
	}

	onConflict := clause.OnConflict{
		Columns:   []clause.Column{{Name: "id"}},
		DoNothing: true,
	}

	var inserted int64
	for start := 0; start < len(reversals); start += insertChunkSize {
		end := start + insertChunkSize
		if end > len(reversals) {
			end = len(reversals)
		}
		res := db.Clauses(onConflict).Create(reversals[start:end])
		if res.Error != nil {
			return inserted, res.Error
		}
		inserted += res.RowsAffected
	}
	return inserted, nil
}
Loading