-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_1m_minimal_variant.sh
More file actions
171 lines (145 loc) · 4.88 KB
/
load_1m_minimal_variant.sh
File metadata and controls
171 lines (145 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/bin/bash
#
# Load 1M Records into Minimal Variant Schema
# ===========================================
# This script loads 1M Bluesky records into ClickHouse using the Variant(JSON)
# type. This demonstrates ClickHouse's new Variant type for JSON analytics.
#
# Fix: the header was a Python-style """docstring""", which bash does not
# understand — it was parsed as a single quoted word and executed as a command,
# printing "command not found" on every run. A '#' comment block is the shell
# equivalent.

# -e: abort on unhandled command failure; -u: abort on use of unset variables.
# pipefail is intentionally NOT enabled: head(1) closing its input early would
# otherwise fail an upstream stage of the load pipeline with SIGPIPE.
set -eu

# Configuration
DATABASE="bluesky_minimal_1m"
TABLE="bluesky_data"
DATA_FILE="bluesky_1m_baseline.jsonl"
RECORDS_TO_LOAD=1000000

echo "========================================"
echo "LOADING 1M RECORDS - MINIMAL VARIANT"
echo "========================================"
echo "Database: $DATABASE"
echo "Table: $TABLE"
echo "Data file: $DATA_FILE"
echo "Records to load: $RECORDS_TO_LOAD"
echo ""
# Check if data file exists before doing anything expensive.
if [ ! -f "$DATA_FILE" ]; then
    # Diagnostics go to stderr so stdout stays clean for logging/pipelines.
    echo "❌ Error: Data file $DATA_FILE not found!" >&2
    echo "Please ensure the 1M baseline data file is available." >&2
    exit 1
fi
# Quote the path so a filename containing spaces or glob characters survives
# word-splitting (the original passed $DATA_FILE to du unquoted).
echo "✓ Data file found: $(du -h "$DATA_FILE" | cut -f1)"
# Step 1: Create database and table schema
echo ""
echo "1. Creating database and table schema..."
# Run the DDL inside the `if` condition: under `set -e` a bare failing command
# aborts the script before any `$?` check runs, so the old
# `if [ $? -eq 0 ]` error branch was unreachable.
if clickhouse client --query "
CREATE DATABASE IF NOT EXISTS $DATABASE;
DROP TABLE IF EXISTS $DATABASE.$TABLE;
CREATE TABLE $DATABASE.$TABLE (
data Variant(JSON)
) ENGINE = MergeTree
ORDER BY tuple()
SETTINGS
allow_experimental_variant_type = 1,
use_variant_as_common_type = 1
COMMENT 'Minimal Variant - using ClickHouse Variant(JSON) type for flexible JSON storage';
"; then
    echo "✓ Schema created successfully"
else
    echo "❌ Schema creation failed" >&2
    exit 1
fi
# Step 2: Load data with proper JSON wrapping
echo ""
echo "2. Loading $RECORDS_TO_LOAD records into minimal variant..."
echo " This will take a few minutes..."
start_time=$(date +%s)
# Wrap each JSON record in {"data": ...} format for JSONEachRow.
# Improvements over the original:
#   - head reads the file directly (no useless cat) using the portable -n form;
#   - both substitutions run in a single sed process instead of two;
#   - the pipeline is tested in the `if` condition, so the failure branch is
#     reachable (under `set -e` the old `load_result=$?` capture was dead code).
if head -n "$RECORDS_TO_LOAD" "$DATA_FILE" \
    | sed -e 's/^/{"data":/' -e 's/$/}/' \
    | clickhouse client --query "INSERT INTO $DATABASE.$TABLE FORMAT JSONEachRow"
then
    end_time=$(date +%s)
    load_duration=$((end_time - start_time))
    echo "✓ Data loaded successfully in ${load_duration}s"
else
    echo "❌ Data loading failed" >&2
    exit 1
fi
# Step 3: Verify data integrity
echo ""
echo "3. Verifying data integrity..."

# Check record count against what we asked to load.
record_count=$(clickhouse client --query "SELECT count() FROM $DATABASE.$TABLE")
echo " Records loaded: $record_count"
if [ "$record_count" -ne "$RECORDS_TO_LOAD" ]; then
    echo "❌ Warning: Expected $RECORDS_TO_LOAD records, got $record_count" >&2
fi

# Check data content using both query methods
echo " Checking data content..."

# Method 1: Direct JSON field access (fastest).
# The query runs inside the `if` condition: under `set -e` the original
# pattern (assign, then test $?) aborted on failure before the check ran.
echo " Testing direct JSON field access..."
if sample_data_direct=$(clickhouse client --query "
SELECT
toString(data.JSON.did) as did,
toString(data.JSON.kind) as kind,
toString(data.JSON.time_us) as time_us
FROM $DATABASE.$TABLE
LIMIT 3
") && [ -n "$sample_data_direct" ]; then
    echo "✓ Direct access verified:"
    echo "$sample_data_direct" | head -n 3
else
    echo "❌ Direct access verification failed" >&2
    exit 1
fi

# Method 2: JSONExtract method (slower but compatible).
# NOTE: this branch intentionally does not exit on failure — it is a
# best-effort compatibility check (original behavior preserved).
echo " Testing JSONExtract method..."
if sample_data_extract=$(clickhouse client --query "
SELECT
JSONExtractString(toString(data.JSON), 'did') as did,
JSONExtractString(toString(data.JSON), 'kind') as kind,
JSONExtractString(toString(data.JSON), 'time_us') as time_us
FROM $DATABASE.$TABLE
LIMIT 3
SETTINGS max_threads = 1, max_memory_usage = 4000000000
") && [ -n "$sample_data_extract" ]; then
    echo "✓ JSONExtract method verified:"
    echo "$sample_data_extract" | head -n 3
else
    echo "❌ JSONExtract method verification failed" >&2
fi
# Step 4: Show table statistics
echo ""
echo "4. Table statistics:"

# Get table size: sum of on-disk bytes across the table's active parts,
# rendered human-readable. The query is held in a named variable for clarity.
size_query="
SELECT formatReadableSize(sum(bytes_on_disk))
FROM system.parts
WHERE database = '$DATABASE' AND table = '$TABLE' AND active = 1
"
table_size=$(clickhouse client --query "$size_query")
echo " Table size: $table_size"

# Check variant type distribution across all stored rows.
echo " Variant type distribution:"
distribution_query="
SELECT variantType(data) as variant_type, count()
FROM $DATABASE.$TABLE
GROUP BY variant_type
"
clickhouse client --query "$distribution_query"
# Final summary banner — identical output to the original echo sequence,
# emitted as a single here-document (variables expand as with echo).
cat <<EOF

========================================
✅ MINIMAL VARIANT LOADING COMPLETED
========================================
Database: $DATABASE
Table: $TABLE
Records: $record_count
Size: $table_size
Load time: ${load_duration}s

Query Methods Available:
1. Direct JSON field access (FASTEST):
 SELECT toString(data.JSON.kind), count() FROM $DATABASE.$TABLE GROUP BY toString(data.JSON.kind)

2. JSONExtract method (SLOWER but compatible):
 SELECT JSONExtractString(toString(data.JSON), 'kind'), count() FROM $DATABASE.$TABLE GROUP BY JSONExtractString(toString(data.JSON), 'kind') SETTINGS max_threads=1, max_memory_usage=4000000000

EOF