@@ -137,6 +137,7 @@ pip install pydeequ[duckdb]
137137``` python
138138import duckdb
139139import pydeequ
140+ from pydeequ.v2.verification import AnalysisRunner, VerificationSuite
140141from pydeequ.v2.analyzers import Size, Completeness, Mean
141142from pydeequ.v2.checks import Check, CheckLevel
142143from pydeequ.v2.predicates import eq, gte
@@ -152,18 +153,19 @@ con.execute("""
152153""")
153154
154155# Create an engine from the connection
155- engine = pydeequ.connect(con, table="users")
156+ engine = pydeequ.connect(con)
156157
157158# Run analyzers
158- metrics = engine.compute_metrics([
159-     Size(),
160-     Completeness("id"),
161-     Completeness("age"),
162-     Mean("age"),
163- ])
159+ result = (AnalysisRunner(engine)
160+     .onData(table="users")
161+     .addAnalyzer(Size())
162+     .addAnalyzer(Completeness("id"))
163+     .addAnalyzer(Completeness("age"))
164+     .addAnalyzer(Mean("age"))
165+     .run())
166+
164167print("Metrics:")
165- for m in metrics:
166-     print(f"{m.name} ({m.instance}): {m.value}")
168+ print(result.to_string(index=False))
167169
168170# Run constraint checks
169171check = (Check(CheckLevel.Error, "Data quality checks")
@@ -172,16 +174,13 @@ check = (Check(CheckLevel.Error, "Data quality checks")
172174     .isComplete("name")
173175     .hasCompleteness("age", gte(0.5)))
174176
175- results = engine.run_checks([check])
176- print("\nConstraint Results:")
177- for r in results:
178-     print(f"{r.constraint}: {r.constraint_status}")
177+ result = (VerificationSuite(engine)
178+     .onData(table="users")
179+     .addCheck(check)
180+     .run())
179181
180- # Profile columns
181- profiles = engine.profile_columns()
182- print("\nColumn Profiles:")
183- for p in profiles:
184-     print(f"{p.column}: completeness={p.completeness}, distinct={p.approx_distinct_values}")
182+ print("\nConstraint Results:")
183+ print(result.to_string(index=False))
185184
186185con.close()
187186```
@@ -269,12 +268,14 @@ pip install pydeequ[spark]
269268
270269``` python
271270from pyspark.sql import SparkSession, Row
271+ import pydeequ
272272from pydeequ.v2.checks import Check, CheckLevel
273273from pydeequ.v2.verification import VerificationSuite
274274from pydeequ.v2.predicates import eq, gte
275275
276276# Connect to Spark Connect server
277277spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
278+ engine = pydeequ.connect(spark)
278279
279280# Create sample data
280281df = spark.createDataFrame([
@@ -292,12 +293,12 @@ check = (Check(CheckLevel.Error, "Data quality checks")
292293 .isUnique(" id" ))
293294
294295# Run verification
295- result = (VerificationSuite(spark)
296-     .onData(df)
296+ result = (VerificationSuite(engine)
297+     .onData(dataframe=df)
297298     .addCheck(check)
298299     .run())
299300
300- result.show(truncate=False)
301+ print(result.to_string(index=False))
301302spark.stop()
302303```
303304
@@ -344,14 +345,21 @@ from pydeequ.v2.analyzers import (
344345 Uniqueness, Entropy, Correlation
345346)
346347
347- result = (AnalysisRunner(spark)
348-     .onData(df)
348+ # DuckDB
349+ result = (AnalysisRunner(engine)
350+     .onData(table="users")
349351     .addAnalyzer(Size())
350352     .addAnalyzer(Completeness("name"))
351353     .addAnalyzer(Mean("age"))
352354     .run())
353355
354- result.show()
356+ # Spark
357+ result = (AnalysisRunner(engine)
358+     .onData(dataframe=df)
359+     .addAnalyzer(Size())
360+     .addAnalyzer(Completeness("name"))
361+     .addAnalyzer(Mean("age"))
362+     .run())
355363```
356364
357365### Constraint Methods
@@ -386,26 +394,21 @@ result.show()
386394Profile column distributions and statistics across your dataset:
387395
388396``` python
389- from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters
397+ from pydeequ.v2.profiles import ColumnProfilerRunner
390398
391399# Basic profiling
392- profiles = (ColumnProfilerRunner(spark)
393-     .onData(df)
400+ profiles = (ColumnProfilerRunner(engine)
401+     .onData(table="users")    # DuckDB: use table=
402+     # .onData(dataframe=df)   # Spark: use dataframe=
394403     .run())
395404
396- profiles.show()
397-
398- # Advanced profiling with options
399- profiles = (ColumnProfilerRunner(spark)
400-     .onData(df)
401-     .restrictToColumns(["id", "name", "age"])  # Profile specific columns
402-     .withLowCardinalityHistogramThreshold(100)  # Generate histograms for low-cardinality columns
403-     .withKLLProfiling()  # Enable KLL sketch for approximate quantiles
404-     .setKLLParameters(KLLParameters(
405-         sketch_size=2048,
406-         shrinking_factor=0.64,
407-         num_buckets=64
408-     ))
405+ print(profiles)
406+
407+ # With options
408+ profiles = (ColumnProfilerRunner(engine)
409+     .onData(table="users")
410+     .restrictToColumns(["id", "name", "age"])
411+     .withLowCardinalityHistogramThreshold(100)
409412     .run())
410413```
411414
@@ -436,21 +439,13 @@ Auto-generate data quality constraints based on your data:
436439from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
437440
438441# Basic suggestion generation
439- suggestions = (ConstraintSuggestionRunner(spark)
440-     .onData(df)
442+ suggestions = (ConstraintSuggestionRunner(engine)
443+     .onData(table="users")    # DuckDB: use table=
444+     # .onData(dataframe=df)   # Spark: use dataframe=
441445     .addConstraintRules(Rules.DEFAULT)
442446     .run())
443447
444- suggestions.show(truncate=False)
445-
446- # Advanced usage with train/test evaluation
447- suggestions = (ConstraintSuggestionRunner(spark)
448-     .onData(df)
449-     .addConstraintRules(Rules.DEFAULT)
450-     .addConstraintRules(Rules.EXTENDED)
451-     .restrictToColumns(["id", "status", "score"])
452-     .useTrainTestSplitWithTestsetRatio(0.2, seed=42)  # Evaluate suggestions on test set
453-     .run())
448+ print(suggestions)
454449```
455450
456451** Available Rule Sets:**
@@ -509,10 +504,12 @@ result = ColumnProfilerRunner(spark).onData(df).run()
509504for col, profile in result.profiles.items():
510505 print (profile)
511506
512- # After (2.0) - returns DataFrame
507+ # After (2.0) - unified engine API
508+ import pydeequ
513509from pydeequ.v2.profiles import ColumnProfilerRunner
514- result = ColumnProfilerRunner(spark).onData(df).run()
515- result.show()
510+ engine = pydeequ.connect(spark)
511+ result = ColumnProfilerRunner(engine).onData(dataframe=df).run()
512+ print(result)
516513```
517514
518515** Suggestions changes:**
@@ -522,10 +519,12 @@ from pydeequ.suggestions import ConstraintSuggestionRunner, DEFAULT
522519result = ConstraintSuggestionRunner(spark).onData(df).addConstraintRule(DEFAULT()).run()
523520print (result)
524521
525- # After (2.0) - returns DataFrame
522+ # After (2.0) - unified engine API
523+ import pydeequ
526524from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
527- result = ConstraintSuggestionRunner(spark).onData(df).addConstraintRules(Rules.DEFAULT).run()
528- result.show()
525+ engine = pydeequ.connect(spark)
526+ result = ConstraintSuggestionRunner(engine).onData(dataframe=df).addConstraintRules(Rules.DEFAULT).run()
527+ print(result)
529528```
530529
531530---
0 commit comments