Simple_DataProfiling/Csv_cleaner.py at main · AceSCav/Simple_DataProfiling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import sys
import os

file = sys.argv[1]

def df_loader(file):
    """
    Reads a CSV file and returns a pandas DataFrame.
    """
    df = pd.read_csv(file)
    return df

def basic_info(df):
    """
    Displays basic information about a pandas DataFrame, including dimensions,
    summary statistics (excluding 'ID' column if present), data types, and
    missing values per column.
    """
    print('Dataframe dimension: ', df.shape)
    print('\n')
    print('Basics Statistics')
    if 'ID' in df.columns:
        print('id column removed')
        print(df.drop(columns=['ID']).describe())

    else:
        print(df.describe())
    print('\n')
    print('Data Types')
    print(df.dtypes)
    print('\n')
    print('Missing or Null Values')
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    print(df.isnull().sum())
    print('\n')

def first_nLines(df):
    """
    Asks the user to input a number and prints the first n rows of a DataFrame.
    """
    n = int(input('Enter the number of lines you want to see: '))
    print(df.head(n))

def null_cleaner(df):
    """
    Prompts the user to select a column and removes all rows with null values
    in that column. Displays how many rows were removed.
    """
    print("Available columns:")
    print(df.columns.tolist())
    colunm = input('Enter the name of the column you want to remove null values: ').strip()

    if colunm not in df.columns:
        print("Column not found. Operation canceled.")
        return

    before = df.shape[0]  # Number of rows before the removal
    df.dropna(subset=[colunm], inplace=True)
    after = df.shape[0]  # Number of rows after the removal

    removed = before - after
    print(f"Null values removed! {removed} rows were deleted.") if removed > 0 else print("No null values found.")

def mean_completer(df):
    """
    Prompts the user to select a numeric column and fills all null values in that
    column with the column's mean value.
    """
    print("Available columns:")
    print(df.columns.tolist())
    colunm = input('Enter the name of the column you want to fill null values with the mean: ').strip()

    if colunm not in df.columns:
        print("Column not found. Operation canceled.")
        return

    if not np.issubdtype(df[colunm].dtype, np.number):
        print("Selected column is not numeric. Operation canceled.")
        return

    mean_value = df[colunm].mean()
    df[colunm].fillna(mean_value, inplace=True)
    print(f"Null values in column '{colunm}' filled with mean value: {mean_value}")


def duplicate_cleaner(df):
    """
    Prompts the user to select one or more columns and removes duplicate rows
    based on those columns. Displays how many rows were removed.
    """
    colunms = []
    print("Available columns:")
    print(df.columns.tolist())
    while True:
        colunm = input('Enter the name of the column you want to remove duplicates (or type "done" to finish): ').strip()
        if colunm.lower() == "done":
            break
        colunms.append(colunm)

    if not colunms:  # Verifica se a lista está vazia
        print("No columns selected. Operation canceled.")
        return

    before = df.shape[0]
    df.drop_duplicates(subset=colunms, keep='first', inplace=True)
    after = df.shape[0]

    print(f"Duplicates removed! {before - after} rows were deleted.")


def outlier_checker(df):
    """
    Prompts the user to select a numeric column and detects outliers based on
    the Interquartile Range (IQR) method. Displays the number of outliers found
    and optionally removes them from the DataFrame.
    """
    print("Available columns:")
    print(df.columns.tolist())
    colunm = input("Enter the name of the column you want to check for outliers: ").strip()
    if colunm not in df.columns:
        print("Column not found. Operation canceled.")
        return
    if not np.issubdtype(df[colunm].dtype, np.number):
        print("Selected column is not numeric. Operation canceled.")
        return
    Q1 = df[colunm].quantile(0.25)
    Q3 = df[colunm].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[colunm] < lower_bound) | (df[colunm] > upper_bound)]
    num_outliers = outliers.shape[0]
    print(f"Lower Bound: {lower_bound}")
    print(f"Upper Bound: {upper_bound}")
    print(f"Number of outliers detected: {num_outliers}")
    if num_outliers > 0:
        choice = input("Do you want to remove outliers? (y/n): ").strip().lower()
        if choice == 'y':
            before = df.shape[0]
            df.drop(outliers.index, inplace=True)
            after = df.shape[0]
            print(f"Outliers removed successfully! {before - after} rows were deleted.")
        else:
            print("Operation canceled.")
    else:
        print("No outliers detected.")

def standardize_column(df):
    """
    Prompts the user to select a numeric column and standardizes it by
    transforming its values to have zero mean and unit variance.
    """
    print("Available columns:")
    print(df.columns.tolist())
    colunm = input("Enter the name of the column you want to standardize: ").strip()
    if colunm not in df.columns:
        print("Column not found. Operation canceled.")
        return
    if not np.issubdtype(df[colunm].dtype, np.number):
        print("Selected column is not numeric. Operation canceled.")
        return
    scaler = StandardScaler()
    df[colunm] = scaler.fit_transform(df[[colunm]])
    print(f"Column '{colunm}' standardized successfully!")


def correlation_matrix(df):
    """
    Calculates and plots the correlation matrix for numeric columns in a DataFrame
    using a heatmap.
    """
    df_numeric = df.select_dtypes(include=['number'])
    corr = df_numeric.corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()


def histogram(df):
    """
    Prompts the user to select a column and displays a histogram with KDE
    (Kernel Density Estimate) for that column.
    """
    print("Available columns:")
    print(df.columns.tolist())
    colunm = input("Enter the name of the column you want to visualize the histogram for: ")
    plt.figure(figsize=(10, 6))
    sns.histplot(df[colunm], bins=30, kde=True, color='blue')
    plt.title(f'Histogram of {colunm}')
    plt.show()

def csv_exporter(df):
    """
    Prompts the user to enter a file name and exports the DataFrame to a CSV file
    in the current working directory.
    """
    output_filename = input('Enter the file name to save (e.g., file.csv): ')
    current_directory = os.getcwd()
    output_path = os.path.join(current_directory, output_filename)
    df.to_csv(output_path, index=False)
    print('File exported successfully!')

def menu():
    """
    Displays a menu with options for various DataFrame operations and prompts
    the user to select one.
    """
    print('1. Basic Info')
    print('2. First n lines')
    print('3. Null Cleaner')
    print('4. Duplicate Cleaner')
    print('5. Mean Completer')
    print('6. Outlier Checker')
    print('7. Standardize Column')
    print('8. Correlation Matrix')
    print('9. Histogram')
    print('10. CSV Exporter')
    print('11. Exit')
    choice = int(input('Enter your choice: '))
    return choice

def main():
    """
    Main loop that loads a DataFrame and executes user-selected DataFrame
    operations in a menu-driven interface until the user chooses to exit.
    """
    df = df_loader(file)
    while True:
        choice = menu()
        if choice == 1:
            basic_info(df)
        elif choice == 2:
            first_nLines(df)
        elif choice == 3:
            null_cleaner(df)
        elif choice == 4:
            duplicate_cleaner(df)
        elif choice == 5:
            mean_completer(df)
        elif choice == 6:
            outlier_checker(df)
        elif choice == 7:
            standardize_column(df)
        elif choice == 8:
            correlation_matrix(df)
        elif choice == 9:
            histogram(df)
        elif choice == 10:
            csv_exporter(df)
        elif choice == 11:
            sys.exit()
        else:
            print('Invalid choice!')

if __name__ == '__main__':
    main()