basicr/basicr.txt at main · JkaivaRevilla/basicr · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# 1 Load and explore the dataset ----
df <- read.csv(file = "GermanCredit.csv")

# Basic exploration: column structure first, then per-column summaries
str(df)
summary(df)

# Count of missing values (NA) in each column
vapply(df, function(columna) sum(is.na(columna)), numeric(1))

# Automatic exploratory report produced by DataExplorer
library(DataExplorer)
create_report(data = df)
# 2 Imputation of missing values ----
library(mice)

# Build 5 imputed datasets with the "pmm" method; the seed argument is fixed
# so repeated runs use the same random stream.
df_imputado <- mice(data = df, m = 5, method = "pmm", seed = 500)

# Overview of the imputation object
summary(df_imputado)

# Extract the first completed (imputed) dataset
df_completo <- mice::complete(df_imputado, action = 1)

# 3 Variable transformation: convert categorical variables to factors ----

# Outcome variable: factor restricted to the levels 0 and 1
df_completo[["credit_risk"]] <- factor(
  df_completo[["credit_risk"]],
  levels = c(0, 1)
)

# Other categorical columns, converted to factors in a single pass
columnas_categoricas <- c(
  "employment_duration",
  "personal_status_sex",
  "property"
)
df_completo[columnas_categoricas] <- lapply(
  df_completo[columnas_categoricas],
  as.factor
)

# 4 Split the data into training and test sets ----
# createDataPartition() is provided by caret, so the package must be attached
# before this call; without it the script stops with
# "could not find function".
library(caret)

# Fixed seed so the random partition is reproducible
set.seed(123)

# Row indices for the training set: 70% of the rows, sampled on the outcome
# column; list = FALSE returns the indices as a matrix rather than a list.
train_index <- createDataPartition(
  df_completo$credit_risk,
  p = 0.7,
  list = FALSE
)

# Rows selected by the partition form the training set, the rest the test set
train <- df_completo[train_index, ]
test <- df_completo[-train_index, ]

# 5 Decision tree ----
library(rpart)

# Fit a classification tree for credit_risk using every other column
modelo_arbol <- rpart(
  formula = credit_risk ~ .,
  data = train,
  method = "class"
)

# Draw the fitted tree
library(rpart.plot)
rpart.plot(x = modelo_arbol)

# Predicted class labels for the test set
pred_arbol <- predict(object = modelo_arbol, newdata = test, type = "class")

# 6 Model evaluation (confusion matrix) ----
library(caret)

# Re-encode the tree predictions as a factor with the levels 0 and 1
pred_arbol <- factor(pred_arbol, levels = c(0, 1))

# Confusion matrix: tree predictions against the observed test labels
confusionMatrix(data = pred_arbol, reference = test[["credit_risk"]])
# Logistic regression ----
# NOTE: glmnet is attached here, but the model below is fitted with glm(),
# which none of the visible code takes from glmnet.
library(glmnet)

# Fit a binomial GLM of credit_risk on every other column
modelo_logit <- glm(
  formula = credit_risk ~ .,
  family = "binomial",
  data = train
)

# Predictions on the response scale for the test set
pred_logit_prob <- predict(modelo_logit, newdata = test, type = "response")

# Label an observation 1 when its predicted value exceeds 0.5, otherwise 0,
# and store the labels as a factor with the levels 0 and 1
pred_logit <- factor(
  ifelse(pred_logit_prob > 0.5, 1, 0),
  levels = c(0, 1)
)

# Model evaluation: predicted labels against the observed test labels
confusionMatrix(data = pred_logit, reference = test[["credit_risk"]])

# 7 Model comparison with ROC curves ----
library(pROC)

# A ROC curve needs a continuous score per observation. Hard 0/1 class labels
# provide only a single threshold, so the predicted probability of class "1"
# is used as the score for both models.
prob_arbol <- predict(modelo_arbol, newdata = test, type = "prob")[, "1"]
prob_logit <- as.numeric(pred_logit_prob)

# Class order (first level = control, second = case) and direction are set
# explicitly so pROC does not auto-detect them separately for each model.
niveles_riesgo <- levels(test$credit_risk)

# ROC for the decision tree
roc_arbol <- roc(
  response = test$credit_risk,
  predictor = prob_arbol,
  levels = niveles_riesgo,
  direction = "<"
)
auc_arbol <- auc(roc_arbol)

# ROC for the logistic regression
roc_logit <- roc(
  response = test$credit_risk,
  predictor = prob_logit,
  levels = niveles_riesgo,
  direction = "<"
)
auc_logit <- auc(roc_logit)

# Plot both ROC curves on the same axes
plot.roc(roc_arbol, col = "blue", main = "Curvas ROC - Modelos", lwd = 2)
lines.roc(roc_logit, col = "red", lwd = 2)
# Legend so the two coloured curves can be told apart
legend(
  "bottomright",
  legend = c("Árbol de Decisión", "Regresión Logística"),
  col = c("blue", "red"),
  lwd = 2
)

# Show the AUC values
auc_arbol
auc_logit


# Univariate and bivariate analysis ----
library(ggplot2)

# Numeric variable: histogram of age with bins of width 5
ggplot(data = df, mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "skyblue") +
  theme_minimal()

# Categorical variable: bar chart of the counts per housing value
ggplot(data = df, mapping = aes(x = housing)) +
  geom_bar(fill = "orange") +
  theme_minimal()
# Boxplot of age by credit risk.
# credit_risk is wrapped in factor() so ggplot2 treats it as a discrete
# grouping variable: with a numeric x, geom_boxplot() draws one box for all
# rows instead of one box per class.
ggplot(
  df,
  aes(x = factor(credit_risk), y = age, fill = factor(credit_risk))
) +
  geom_boxplot() +
  # Keep the original column name on the axis and legend titles
  labs(x = "credit_risk", fill = "credit_risk") +
  theme_minimal()

# Frequency table of credit risk by housing, with labelled dimensions
table(credit_risk = df$credit_risk, housing = df$housing)