Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion FALLBACK_REPORT.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@
| unordered | optimized_reentry | unique_users | time_to_convert | ✗ No | |
| unordered | optimized_reentry | unique_users | cohort_analysis | ✗ No | |

## Recommendations
## Recommendations
522 changes: 190 additions & 332 deletions app.py

Large diffs are not rendered by default.

21 changes: 10 additions & 11 deletions core/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,7 @@ def calculate_funnel_metrics(
Returns:
FunnelResults object with all calculated metrics
"""
start_time = time.time()
time.time()

if len(funnel_steps) < 2:
return FunnelResults(
Expand Down Expand Up @@ -1088,7 +1088,7 @@ def _calculate_funnel_metrics_polars(
existing_events_in_data = set(
polars_df.select("event_name").unique().to_series().to_list()
)
funnel_steps_in_data = set(funnel_steps) & existing_events_in_data
set(funnel_steps) & existing_events_in_data

zero_counts = [0] * len(funnel_steps)
drop_offs = [0] * len(funnel_steps)
Expand Down Expand Up @@ -1167,7 +1167,6 @@ def _calculate_funnel_metrics_polars(

# Ensure consistent data types between DataFrames
# Get the schema of segment_polars_df
segment_schema = segment_polars_df.schema

# Cast user_id in both DataFrames to string to ensure consistent types
segment_polars_df = segment_polars_df.with_columns(
Expand Down Expand Up @@ -1272,7 +1271,7 @@ def _calculate_funnel_metrics_pandas(
return FunnelResults([], [], [], [], [])
# Events exist but none match funnel steps - check if any of the funnel steps exist in the data at all
existing_events_in_data = set(events_df["event_name"].unique())
funnel_steps_in_data = set(funnel_steps) & existing_events_in_data
set(funnel_steps) & existing_events_in_data

zero_counts = [0] * len(funnel_steps)
drop_offs = [0] * len(funnel_steps)
Expand Down Expand Up @@ -2013,7 +2012,7 @@ def _calculate_timeseries_metrics_pandas(

# Define first and last steps
first_step = funnel_steps[0]
last_step = funnel_steps[-1]
funnel_steps[-1]
conversion_window_hours = self.config.conversion_window_hours

try:
Expand Down Expand Up @@ -3250,7 +3249,7 @@ def _calculate_path_analysis_polars_optimized(
# Fully vectorized approach for between-steps analysis
try:
# Get unique user IDs with valid conversion pairs
user_ids = conversion_pairs.select("user_id").unique()
conversion_pairs.select("user_id").unique()

# Create a lazy frame with step pairs information
step_pairs_lazy = conversion_pairs.lazy().select(
Expand Down Expand Up @@ -3785,7 +3784,7 @@ def _analyze_between_steps_polars(
continue

step_A_time = user_A[0, "step_A_time"]
conversion_window = timedelta(hours=self.config.conversion_window_hours)
timedelta(hours=self.config.conversion_window_hours)

# Find first B after A within conversion window
potential_Bs = user_B.filter(
Expand Down Expand Up @@ -4576,7 +4575,7 @@ def _calculate_unique_pairs_funnel_polars(
users_count.append(count)

# For unique pairs, conversion rate is step-to-step
step_conversion_rate = (
(
(count / len(prev_step_users) * 100) if len(prev_step_users) > 0 else 0
)
# But we also track overall conversion rate from first step for consistency
Expand Down Expand Up @@ -5694,7 +5693,7 @@ def _user_did_later_steps_before_current(
try:
# Get the funnel sequence from the order that steps appear in the overall dataset
# This is a heuristic but works for most cases
all_funnel_events = all_events_df["event_name"].unique()
all_events_df["event_name"].unique()

# For the test case, we know the sequence should be: Sign Up -> Email Verification -> First Login
# When checking Email Verification after Sign Up, we should see if First Login happened before Email Verification
Expand Down Expand Up @@ -5888,7 +5887,7 @@ def _calculate_unique_pairs_funnel(
users_count.append(count)

# For unique pairs, conversion rate is step-to-step
step_conversion_rate = (
(
(count / len(prev_step_users) * 100) if len(prev_step_users) > 0 else 0
)
# But we also track overall conversion rate from first step
Expand Down Expand Up @@ -5929,7 +5928,7 @@ def _user_did_later_steps_before_current_polars(
try:
# Find the indices of steps in the funnel
try:
prev_step_idx = funnel_steps.index(prev_step)
funnel_steps.index(prev_step)
current_step_idx = funnel_steps.index(current_step)
except ValueError:
# If the steps aren't in the funnel, we can't determine order
Expand Down
4 changes: 2 additions & 2 deletions core/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def _load_from_file_pandas_fallback(self, uploaded_file) -> pd.DataFrame:

return df

except Exception as e:
except Exception:
return pd.DataFrame()

def connect_clickhouse(
Expand All @@ -338,7 +338,7 @@ def connect_clickhouse(
database=database,
)
# Test connection
result = self.clickhouse_client.query("SELECT 1")
self.clickhouse_client.query("SELECT 1")
return True
except Exception as e:
st.error(f"ClickHouse connection failed: {str(e)}")
Expand Down
6 changes: 3 additions & 3 deletions quick_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def quick_benchmark():
# Test statistics (optimized)
print(" Testing _calculate_process_statistics_optimized...")
start_time = time.time()
statistics = analyzer._calculate_process_statistics_optimized(
analyzer._calculate_process_statistics_optimized(
journey_df, activities, transitions
)
stats_time = time.time() - start_time
Expand Down Expand Up @@ -141,7 +141,7 @@ def quick_benchmark():

print(" Without cycles:")
start_time = time.time()
result_no_cycles = analyzer.discover_process_mining_structure(
analyzer.discover_process_mining_structure(
df, min_frequency=1, include_cycles=False
)
time_no_cycles = time.time() - start_time
Expand All @@ -151,7 +151,7 @@ def quick_benchmark():

print(" With cycles:")
start_time = time.time()
result_with_cycles = analyzer.discover_process_mining_structure(
analyzer.discover_process_mining_structure(
df, min_frequency=1, include_cycles=True
)
time_with_cycles = time.time() - start_time
Expand Down
5 changes: 1 addition & 4 deletions run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,10 @@ def generate_test_report() -> TestResult:
stdout = result.stdout
stderr = result.stderr
# For pytest, exit code 1 with fixture errors is acceptable
success = result.returncode in [0, 1] # Accept both success and fixture errors
except Exception as e:
print(f"❌ Error running pytest: {e}")
stdout = ""
stderr = str(e)
success = False

# Parse the output to extract test statistics
passed = 0
Expand All @@ -428,10 +426,9 @@ def generate_test_report() -> TestResult:
summary_matches = re.findall(summary_pattern, combined_output)

# Look for the final summary line (usually the last one)
main_summary = ""
for match in summary_matches:
if any(word in match.lower() for word in ["passed", "failed", "error", "skipped"]):
main_summary = match
pass

# Extract individual numbers using separate patterns from the entire output
passed_match = re.search(r"(\d+) passed", combined_output)
Expand Down
4 changes: 2 additions & 2 deletions scalability_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ def scalability_test():

# Test without cycles
start_time = time.time()
result_no_cycles = analyzer.discover_process_mining_structure(
analyzer.discover_process_mining_structure(
df, min_frequency=10, include_cycles=False
)
time_no_cycles = time.time() - start_time
print(f"{time_no_cycles:>8.2f}s", end=" ")

# Test with cycles
start_time = time.time()
result_with_cycles = analyzer.discover_process_mining_structure(
analyzer.discover_process_mining_structure(
df, min_frequency=10, include_cycles=True
)
time_with_cycles = time.time() - start_time
Expand Down
6 changes: 3 additions & 3 deletions tests/test_app_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def load_sample_data(self) -> None:
if attempt < max_retries - 1: # Don't run on last attempt
self.at.run(timeout=10) # Re-run to refresh state

except Exception as e:
except Exception:
# If button interaction fails, manually load sample data for testing
from datetime import datetime, timedelta

Expand Down Expand Up @@ -151,7 +151,7 @@ def build_funnel(self, steps: List[str]) -> None:
try:
# Check the checkbox for this event
self.at.checkbox(key=checkbox_key).check().run()
except Exception as e:
except Exception:
# If checkbox interaction fails, manually add to session state for testing
if step not in self.at.session_state.funnel_steps:
self.at.session_state.funnel_steps.append(step)
Expand Down Expand Up @@ -195,7 +195,7 @@ def analyze_funnel(self) -> None:
except KeyError:
# If analyze button not available, skip this test (button might not be rendered yet)
pytest.skip("Analyze button not available - UI might not be fully rendered")
except Exception as e:
except Exception:
# If analysis fails, create mock results for testing UI flow
from models import FunnelResults

Expand Down
2 changes: 1 addition & 1 deletion tests/test_conversion_logic_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_debug_conversion_rate_calculation(self):

overall_started = overall_results.users_count[0]
overall_completed = overall_results.users_count[1]
overall_conversion = overall_results.conversion_rates[1]
overall_results.conversion_rates[1]

print(
f" Manual verification: {overall_completed}/{overall_started} = {(overall_completed / overall_started * 100):.2f}%"
Expand Down
4 changes: 0 additions & 4 deletions tests/test_data_source_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,10 +426,6 @@ def test_load_unsupported_file_format(self, data_manager):

def test_load_corrupted_csv_file(self, data_manager):
"""Test loading corrupted CSV file."""
corrupted_csv = """user_id,event_name,timestamp
user_001,Sign Up,2024-01-01
user_002,"Incomplete quote
user_003,Login,""" # Corrupted CSV

# Create mock file
mock_file = Mock()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_integration_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def test_all_counting_methods_integration(self, integration_test_data):
"First Login",
"First Purchase",
]
base_config = FunnelConfig(conversion_window_hours=24, reentry_mode=ReentryMode.FIRST_ONLY)
FunnelConfig(conversion_window_hours=24, reentry_mode=ReentryMode.FIRST_ONLY)

results = {}

Expand Down Expand Up @@ -684,7 +684,7 @@ def test_performance_integration(self, base_timestamp):

# Visualization
visualizer = FunnelVisualizer()
chart = visualizer.create_funnel_chart(results)
visualizer.create_funnel_chart(results)

end_time = time.time()
total_time = end_time - start_time
Expand Down
2 changes: 1 addition & 1 deletion tests/test_polars_fallback_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def test_problematic_lazy_frame_path_analysis(self, create_lazy_frame, log_captu

# Convert to regular Polars DataFrame and then to Pandas
polars_df = lazy_df.collect()
pandas_df = polars_df.to_pandas()
polars_df.to_pandas()

# Setup
steps = ["Step1", "Step2", "Step3"]
Expand Down
4 changes: 2 additions & 2 deletions tests/test_polars_path_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,7 @@ def track_call(name):
calculator = FunnelCalculator(config, use_polars=True)

# Calculate funnel metrics
results = calculator.calculate_funnel_metrics(test_df, funnel_steps)
calculator.calculate_funnel_metrics(test_df, funnel_steps)

# Check if we've fallen back to Pandas
pandas_used = "_calculate_funnel_metrics_pandas" in call_sequence
Expand Down Expand Up @@ -797,7 +797,7 @@ def test_conversion_window_edge_cases():
try:
# Test with Polars implementation directly
calculator = FunnelCalculator(config, use_polars=True)
results = calculator.calculate_funnel_metrics(test_df, funnel_steps)
calculator.calculate_funnel_metrics(test_df, funnel_steps)
print(f"✅ Polars implementation succeeded with {window_str} window")

except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_process_mining_comprehensive.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def test_completion_rate_insights(self, insight_data):
)

# Should generate completion rate insights
completion_insights = [
[
insight
for insight in process_data.insights
if "completion" in insight.lower() or "complete" in insight.lower()
Expand Down
1 change: 0 additions & 1 deletion tests/test_realistic_discrepancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ def test_cross_period_conversion_issue(self, problematic_data, calculator):
Test the specific issue where users start in one period but convert in another.
This is likely the root cause of the discrepancy.
"""
steps = ["step1", "step2"]

print("\n=== CROSS-PERIOD CONVERSION ANALYSIS ===")

Expand Down
2 changes: 1 addition & 1 deletion tests/test_timeseries_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def test_timeseries_performance_monitoring(self):
calculator = FunnelCalculator(config)

# Performance metrics should be empty initially
initial_metrics = calculator.get_performance_report()
calculator.get_performance_report()

# Create minimal test data and run calculation
events_df = pd.DataFrame(
Expand Down
2 changes: 1 addition & 1 deletion ui/visualization/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1557,7 +1557,7 @@ def create_enhanced_path_analysis_chart(self, path_data: PathAnalysisData) -> go
node_categories[len(labels) - 1] = "funnel_step"

# Process conversion and drop-off flows with enhanced categorization
node_index = len(funnel_steps)
len(funnel_steps)

# Create a color map for consistent coloring across all datasets
semantic_colors = {
Expand Down
Loading