-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
129 lines (101 loc) · 4.01 KB
/
data_preprocessing.py
File metadata and controls
129 lines (101 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def load_and_display_data(file_path):
    """
    Load the dataset and display its info, first few rows and summary statistics.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The data as parsed from the CSV file.
    """
    df = pd.read_csv(file_path)
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line to the output.
    df.info()
    print("\nFirst few rows:")
    print(df.head())
    print("\nSummary statistics:")
    print(df.describe())
    return df
def handle_missing_values(df):
    """
    Handle missing values in the dataset.

    The 'timestamp' column is parsed to datetime (entries that cannot be
    parsed become NaT) and gaps are then forward-filled from the previous
    row. A missing timestamp in the very first row has no predecessor and
    therefore stays NaT. Missing values in float64/int64 columns are replaced
    with the mean of their column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'timestamp' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    # Convert the 'timestamp' column to datetime format; bad values become NaT
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    # Forward fill for timestamps. Series.ffill() is used because the
    # fillna(method='ffill') spelling is deprecated in recent pandas releases.
    df['timestamp'] = df['timestamp'].ffill()
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    # Mean imputation: each numeric column's NaNs take that column's mean
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
    return df
def engineer_features(df):
    """
    Add time-based, lag, interaction and binned features to the dataset.

    Reads the 'timestamp', 'energy_consumption', 'temperature' and 'humidity'
    columns. New columns are written onto ``df`` itself and the same frame is
    returned. The lag columns hold NaN in the first one / two rows, because
    there is no earlier row to shift from.
    """
    bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    preview_columns = [
        'hour',
        'day_of_week',
        'month',
        'is_weekend',
        'temp_humidity_interaction',
        'temperature_bins',
    ]

    # Ensure the timestamp column is datetime-typed before using the .dt accessor
    timestamps = pd.to_datetime(df['timestamp'])
    df['timestamp'] = timestamps

    # Calendar features derived from the timestamp
    df['hour'] = timestamps.dt.hour
    df['day_of_week'] = timestamps.dt.dayofweek
    df['month'] = timestamps.dt.month
    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Lag features: the previous one and two rows of energy consumption
    consumption = df['energy_consumption']
    for lag in (1, 2):
        df[f'energy_consumption_lag{lag}'] = consumption.shift(lag)

    # Interaction feature and five equal-width temperature bins
    df['temp_humidity_interaction'] = df['temperature'] * df['humidity']
    df['temperature_bins'] = pd.cut(df['temperature'], bins=5, labels=bin_labels)

    print("New features added:")
    print(df[preview_columns].head())
    return df
def visualize_data(df):
    """
    Draw the exploratory plots for the dataset.

    Shown in order: a correlation heatmap of the numeric columns, a pairplot
    of the key features, a histogram (with KDE) of energy consumption, a
    count plot of the temperature bins, and box plots of energy consumption
    by day of week and by weekday/weekend. Each figure is displayed with
    ``plt.show()``; nothing is returned.

    Reads the columns 'temperature', 'humidity', 'energy_consumption',
    'temp_humidity_interaction', 'temperature_bins', 'day_of_week' and
    'is_weekend'.
    """
    def _label_and_show(title, xlabel=None, ylabel=None):
        # Title and (optionally) label the current axes, then display the figure.
        plt.title(title)
        if xlabel is not None:
            plt.xlabel(xlabel)
        if ylabel is not None:
            plt.ylabel(ylabel)
        plt.show()

    bin_order = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    key_features = ['temperature', 'humidity', 'energy_consumption', 'temp_humidity_interaction']

    # Correlation heatmap, restricted to numeric columns
    correlations = df.select_dtypes(include=[np.number]).corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    _label_and_show('Correlation Heatmap of Energy Consumption Features')

    # Pairplot of key features; the title is nudged above the grid (y=1.02)
    sns.pairplot(df[key_features])
    plt.suptitle('Pairplot of Key Features vs Energy Consumption', y=1.02)
    plt.show()

    # Distribution of energy consumption
    plt.figure(figsize=(10, 6))
    sns.histplot(df['energy_consumption'], kde=True)
    _label_and_show('Distribution of Energy Consumption', xlabel='Energy Consumption')

    # Distribution of temperature bins, in ascending bin order
    plt.figure(figsize=(10, 6))
    sns.countplot(x='temperature_bins', data=df, order=bin_order)
    _label_and_show(
        'Distribution of Temperature Bins',
        xlabel='Temperature Bins',
        ylabel='Count',
    )

    # Energy consumption by day of the week
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='day_of_week', y='energy_consumption', data=df)
    _label_and_show(
        'Energy Consumption by Day of the Week',
        xlabel='Day of Week (0=Monday, 6=Sunday)',
        ylabel='Energy Consumption',
    )

    # Energy consumption: weekday vs weekend
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='is_weekend', y='energy_consumption', data=df)
    _label_and_show(
        'Energy Consumption: Weekday vs Weekend',
        xlabel='Is Weekend (0=Weekday, 1=Weekend)',
        ylabel='Energy Consumption',
    )
def preprocess_and_explore_data(file_path):
    """
    Run the full pipeline on the CSV file at ``file_path``.

    Steps, in order: load and display the data, handle missing values,
    engineer features, then visualize. Returns the processed DataFrame.
    """
    raw = load_and_display_data(file_path)
    # Imputation runs before feature engineering, so the derived columns are
    # computed from the filled values.
    processed = engineer_features(handle_missing_values(raw))
    visualize_data(processed)
    return processed
# Script entry point: run the pipeline on the bundled CSV and preview the result.
if __name__ == "__main__":
    processed_df = preprocess_and_explore_data('./energy_consumption_data.csv')
    # One call, newline-separated: same output as printing header and preview separately.
    print("\nProcessed DataFrame:", processed_df.head(), sep="\n")