-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweather.py
More file actions
106 lines (78 loc) · 2.88 KB
/
weather.py
File metadata and controls
106 lines (78 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import math
import pandas as pd
from collections import Counter
# Shannon entropy of a sequence of class labels
def calculate_entropy(data):
    """Return the Shannon entropy (in bits) of the label sequence `data`.

    An empty sequence or a single-class sequence yields 0.0.
    """
    n = len(data)
    # One term per distinct label: p * log2(p), summed then negated.
    terms = [count / n * math.log2(count / n) for count in Counter(data).values()]
    return -sum(terms) if terms else 0.0
# Calculate information gain for a feature
def calculate_information_gain(data, feature, target):
    """Return the information gain of splitting `data` on column `feature`.

    Information gain = entropy of the `target` column minus the weighted
    average entropy of `target` within each subset produced by grouping
    rows on the values of `feature`.

    Args:
        data: pandas DataFrame containing both columns.
        feature: name of the candidate split column.
        target: name of the label column.

    Returns:
        float: entropy reduction achieved by the split (>= 0 in theory).
    """
    # Entropy of the whole dataset before any split.
    total_entropy = calculate_entropy(data[target])
    weighted_entropy = 0.0
    # Weight each branch's entropy by the fraction of rows it receives.
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        weight = len(subset) / len(data)
        weighted_entropy += weight * calculate_entropy(subset[target])
    # Information Gain = Original Entropy - Weighted Feature Entropy
    return total_entropy - weighted_entropy
# ID3 Algorithm: Build the decision tree
def id3(data, features, target):
    """Recursively build an ID3 decision tree as nested dicts.

    Internal nodes are `{feature: {value: subtree}}`; leaves are labels.
    """
    labels = data[target]
    # Guard: pure node — every remaining row carries the same label.
    if len(set(labels)) == 1:
        return labels.iloc[0]
    # Guard: no attributes left — fall back to the majority label.
    if not features:
        return labels.mode()[0]
    # Score every candidate feature; pick the highest gain (ties keep
    # the earliest feature, matching the order of `features`).
    gains = {f: calculate_information_gain(data, f, target) for f in features}
    best = max(gains, key=gains.get)
    remaining = [f for f in features if f != best]
    branches = {}
    for value in data[best].unique():
        branches[value] = id3(data[data[best] == value], remaining, target)
    return {best: branches}
# Function to predict using the decision tree
def predict(tree, sample):
    """Classify `sample` (a feature -> value mapping) by walking `tree`.

    Descends through nested `{feature: {value: subtree}}` nodes until a
    non-dict leaf (the predicted label) is reached.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))
        node = node[feature][sample[feature]]
    return node
# Toy training set: two categorical features and a binary label.
data = dict(
    Outlook=['Sunny', 'Sunny', 'Rain', 'Rain'],
    Humidity=['High', 'Low', 'High', 'Low'],
    PlayTennis=['No', 'Yes', 'Yes', 'Yes'],
)
df = pd.DataFrame(data)

# Attribute columns to split on, and the label column to predict.
features = ['Outlook', 'Humidity']
target = 'PlayTennis'

# Fit the tree on the full dataset and display its nested-dict form.
tree = id3(df, features, target)
print("Decision Tree:", tree)

# Classify a single unseen observation.
sample = {'Outlook': 'Sunny', 'Humidity': 'Low'}
print("Prediction:", predict(tree, sample))