-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweather.py
More file actions
106 lines (78 loc) · 2.88 KB
/
weather.py
File metadata and controls
106 lines (78 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import math
import pandas as pd
from collections import Counter
# Shannon entropy of a sequence of class labels
def calculate_entropy(data):
    """Return the Shannon entropy (in bits) of the label sequence `data`.

    An empty sequence or a single-class sequence yields 0.0.
    """
    n = len(data)
    # One term per distinct label: p * log2(p), summed then negated.
    terms = [count / n * math.log2(count / n) for count in Counter(data).values()]
    return -sum(terms) if terms else 0.0
# Calculate information gain for a feature
def calculate_information_gain(data, feature, target):
    """Return the information gain of splitting `data` on column `feature`.

    Information gain = entropy of the `target` column minus the weighted
    average entropy of `target` within each subset produced by grouping
    rows on the values of `feature`.

    Args:
        data: pandas DataFrame containing both columns.
        feature: name of the candidate split column.
        target: name of the label column.

    Returns:
        float: entropy reduction achieved by the split (>= 0 in theory).
    """
    # Entropy of the whole dataset before any split.
    total_entropy = calculate_entropy(data[target])
    weighted_entropy = 0.0
    # Weight each branch's entropy by the fraction of rows it receives.
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        weight = len(subset) / len(data)
        weighted_entropy += weight * calculate_entropy(subset[target])
    # Information Gain = Original Entropy - Weighted Feature Entropy
    return total_entropy - weighted_entropy
# ID3 Algorithm: Build the decision tree
def id3(data, features, target):
    """Recursively build an ID3 decision tree as nested dicts.

    Internal nodes are `{feature: {value: subtree}}`; leaves are labels.
    """
    labels = data[target]
    # Guard: pure node — every remaining row carries the same label.
    if len(set(labels)) == 1:
        return labels.iloc[0]
    # Guard: no attributes left — fall back to the majority label.
    if not features:
        return labels.mode()[0]
    # Score every candidate feature; pick the highest gain (ties keep
    # the earliest feature, matching the order of `features`).
    gains = {f: calculate_information_gain(data, f, target) for f in features}
    best = max(gains, key=gains.get)
    remaining = [f for f in features if f != best]
    branches = {}
    for value in data[best].unique():
        branches[value] = id3(data[data[best] == value], remaining, target)
    return {best: branches}
# Function to predict using the decision tree
def predict(tree, sample):
    """Classify `sample` (a feature -> value mapping) by walking `tree`.

    Descends through nested `{feature: {value: subtree}}` nodes until a
    non-dict leaf (the predicted label) is reached.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))
        node = node[feature][sample[feature]]
    return node
# Toy training set: two categorical features and a binary label.
data = dict(
    Outlook=['Sunny', 'Sunny', 'Rain', 'Rain'],
    Humidity=['High', 'Low', 'High', 'Low'],
    PlayTennis=['No', 'Yes', 'Yes', 'Yes'],
)
df = pd.DataFrame(data)

# Attribute columns to split on, and the label column to predict.
features = ['Outlook', 'Humidity']
target = 'PlayTennis'

# Fit the tree on the full dataset and display its nested-dict form.
tree = id3(df, features, target)
print("Decision Tree:", tree)

# Classify a single unseen observation.
sample = {'Outlook': 'Sunny', 'Humidity': 'Low'}
print("Prediction:", predict(tree, sample))