-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_transform.R
More file actions
88 lines (85 loc) · 3.49 KB
/
data_transform.R
File metadata and controls
88 lines (85 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
check_sse <- function(X, centering = FALSE, random = FALSE, seed = NULL) {
#' Estimation of the number of strongly spiked eigenvalues of the covariance matrix
#'
#' @param X matrix, shape (d, n) - Data matrix (features x samples)
#' @param centering logical - If TRUE, center the data before splitting
#' @param random logical - If TRUE, randomly split samples; if FALSE, split deterministically
#' @param seed integer or NULL - Random seed for reproducibility (used if random=TRUE)
#' @return khat integer - Estimated number of spiked eigenvalues
if (centering) {
X <- sweep(X, 1, rowMeans(X), "-")
}
d <- nrow(X)
n <- ncol(X)
n1 <- as.integer(ceiling(n / 2))
n2 <- n - n1
q <- min(n2 - 1, d)
if (isTRUE(random)) {
if (!is.null(seed)) set.seed(seed)
idx <- sample.int(n, n, replace = FALSE)
X1 <- X[, idx[1:n1], drop = FALSE]
X2 <- X[, idx[(n1 + 1):n], drop = FALSE]
} else {
X1 <- X[, 1:n1, drop = FALSE]
X2 <- X[, (n1 + 1):n, drop = FALSE]
}
X1 <- sweep(X1, 1, rowMeans(X1), "-")
X2 <- sweep(X2, 1, rowMeans(X2), "-")
Sd <- t(X1) %*% X2 %*% t(X2) %*% X1 / ((n1 - 1) * (n2 - 1))
cdmval <- eigen(Sd, symmetric = TRUE, only.values = TRUE)$values[1:q]
kappa <- sqrt(log(n) / n)
khat <- q
psi <- sum(diag(Sd))
tau_hat <- 1 - cdmval[1] / psi
condition <- tau_hat * (1 + kappa)
if (q > 1) {
for (l in 1:(q - 1)) {
if (condition > 1) {
khat <- l - 1
break
}
psi <- psi - cdmval[l]
tau_hat <- 1 - cdmval[l + 1] / psi
condition <- tau_hat * (1 + (l + 1) * kappa)
}
}
khat <- min(khat, n2 - 2)
return(as.integer(khat))
}
NRM_trans <- function(X, centering = FALSE, sse_point = NULL, random = FALSE, seed = NULL) {
#' Data transformation based on the noise-reduction methodology
#'
#' @param X matrix, shape (d, n) - Data matrix (features x samples)
#' @param centering logical - If TRUE, center the data before further processing
#' @param sse_point integer or NULL - Number of spiked eigenvalues; if NULL, estimated by check_sse
#' @param random logical - If TRUE, randomly split samples for sse estimation
#' @param seed integer or NULL - Random seed for reproducibility
#' @return list with X_trans (sse_point x n) and nrmvec (d x sse_point)
if (centering) {
X <- sweep(X, 1, rowMeans(X), "-")
}
d <- nrow(X)
n <- ncol(X)
sse_point <- if (is.null(sse_point)) check_sse(X, centering = centering, random = random, seed = seed) else as.integer(sse_point)
# Calculate sample eigenvalues and dual-eigenvectors
X0 <- if (centering) X else sweep(X, 1, rowMeans(X), "-")
eg <- eigen(t(X0) %*% X0 / (n - 1), symmetric = TRUE)
eigvals <- eg$values[1:sse_point]
dualvecs <- eg$vectors[, 1:sse_point, drop = FALSE]
#Noise-reduced eigenvalues and eigenvectors
trSd <- sum(X0^2) / (n - 1)
cs <- cumsum(eigvals)
denom <- n - (seq_len(sse_point) + 1)
nrmval <- eigvals - (trSd - cs) / denom
nrmvec <- (X0 %*% dualvecs) / (sqrt(n - 1) * matrix(sqrt(nrmval), nrow = d, ncol = sse_point, byrow = TRUE))
#NRM transformation
c <- sqrt(n - 1) / (n - 2)
nr_projection_term <- sqrt(n - 1) * crossprod(nrmvec, X)
#Calculate bias correction term
dualvec_scaled <- sweep(dualvecs, 2, sqrt(nrmval), "/")
x0_dot_x <- colSums(X0 * X)
bias_correction_term <- (n / (n - 1)) * (t(dualvec_scaled) * matrix(x0_dot_x, nrow = sse_point, ncol = n, byrow = TRUE))
#Final transformed data
X_trans <- c * (nr_projection_term - bias_correction_term)
return(list(X_trans = X_trans, nrmvec = nrmvec))
}