diff --git a/R/estimate_fpc_scores.R b/R/estimate_fpc_scores.R index 88a8d59..ec4b518 100644 --- a/R/estimate_fpc_scores.R +++ b/R/estimate_fpc_scores.R @@ -6,36 +6,37 @@ #' #' @return #' -estimate_fpc_scores = function(tfb_fpc_obj, Ypred, method = "blup") { - - if (method != "blup") { stop("Only estimation using BLUPs are currently implemented.") } +estimate_fpc_scores <- function(tfb_fpc_obj, Ypred, method = "blup") { + if (method != "blup") { + stop("Only estimation using BLUPs are currently implemented.") + } - efunctions = tfb_fpc_obj[["efunctions"]] - evalues = tfb_fpc_obj[["evalues"]] - mu = tfb_fpc_obj[["mu"]] - npc = tfb_fpc_obj[["npc"]] - error_var = tfb_fpc_obj[["error_var"]] + efunctions <- tfb_fpc_obj[["efunctions"]] + evalues <- tfb_fpc_obj[["evalues"]] + mu <- tfb_fpc_obj[["mu"]] + npc <- tfb_fpc_obj[["npc"]] + error_var <- tfb_fpc_obj[["error_var"]] - D = dim(efunctions) - I.pred = dim(Ypred)[1] + D <- dim(efunctions) + I.pred <- dim(Ypred)[1] - D.inv = diag(1/evalues, nrow = npc, ncol = npc) - Z = efunctions - data.tilde = Ypred - matrix(mu, I.pred, D, byrow = TRUE) - Yhat = matrix(0, nrow = I.pred, ncol = D) - rownames(Yhat) = rownames(Ypred) - colnames(Yhat) = colnames(Ypred) - scores = matrix(NA, nrow = I.pred, ncol = npc) + D.inv <- diag(1 / evalues, nrow = npc, ncol = npc) + Z <- efunctions + data.tilde <- Ypred - matrix(mu, I.pred, D, byrow = TRUE) + Yhat <- matrix(0, nrow = I.pred, ncol = D) + rownames(Yhat) <- rownames(Ypred) + colnames(Yhat) <- colnames(Ypred) + scores <- matrix(NA, nrow = I.pred, ncol = npc) for (i.subj in 1:I.pred) { - obs.points = which(!is.na(Ypred[i.subj, ])) - if (error_var == 0 & length(obs.points) < npc) + obs.points <- which(!is.na(Ypred[i.subj, ])) + if (error_var == 0 && length(obs.points) < npc) { stop("Measurement error estimated to be zero and there are fewer observed points than PCs; scores cannot be estimated.") - Zcur = matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2]) - ZtZ_sD.inv = solve(crossprod(Zcur) + error_var * D.inv) - scores[i.subj, ] = ZtZ_sD.inv %*% t(Zcur) %*% (data.tilde[i.subj, obs.points]) - Yhat[i.subj, ] = t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions) + } + Zcur <- matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2]) + ZtZ_sD.inv <- solve(crossprod(Zcur) + error_var * D.inv) + scores[i.subj, ] <- ZtZ_sD.inv %*% t(Zcur) %*% (data.tilde[i.subj, obs.points]) + Yhat[i.subj, ] <- t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions) } scores - } diff --git a/R/fpca-methods.R b/R/fpca-methods.R index d979a31..c16ba3f 100644 --- a/R/fpca-methods.R +++ b/R/fpca-methods.R @@ -13,7 +13,7 @@ ##' @method residuals rfr_fpca ##' @aliases fitted.rfr_fpca ##' @export -residuals.rfr_fpca <- function(object, ...){ +residuals.rfr_fpca <- function(object, ...) { object$Y - tfd(object$Yhat_tfb, arg = tf_arg(object$Y)) } @@ -21,7 +21,7 @@ residuals.rfr_fpca <- function(object, ...){ ##' @method fitted rfr_fpca ##' @export ##' @rdname residuals.rfr_fpca -fitted.rfr_fpca <- function(object, ...){ +fitted.rfr_fpca <- function(object, ...) { tfd(object$Yhat_tfb, arg = tf_arg(object$Y)) } @@ -38,10 +38,10 @@ fitted.rfr_fpca <- function(object, ...){ ##' @importFrom stats fitted fitted.values ##' @method predict rfr_fpca ##' @export -predict.rfr_fpca <- function(object, newdata, ...){ +predict.rfr_fpca <- function(object, newdata, ...) { ## need different behavior for regular vs. irregular objects? - # nah, just need to make sure it works for irregular objects. - # default behavior (with no new data) is to return fitted values + # nah, just need to make sure it works for irregular objects. + # default behavior (with no new data) is to return fitted values if (missing(newdata) || is.null(newdata)) { return(fitted(object)) } @@ -49,16 +49,16 @@ predict.rfr_fpca <- function(object, newdata, ...){ ## include some data checks -- args for new data and fpc expansion, etc ## also that `model_var` is included in `newdata` - model_var = object$model_var - new_tf = newdata[[model_var]] + model_var <- object$model_var + new_tf <- newdata[[model_var]] - Ypred = as.matrix(spread(tf_unnest(new_tf), key = .data$arg, value = .data$value)[,-1]) + Ypred <- as.matrix(spread(tf_unnest(new_tf), key = .data$arg, value = .data$value)[, -1]) - new_scores = estimate_fpc_scores(object, Ypred) + new_scores <- estimate_fpc_scores(object, Ypred) coef_list <- split(cbind(1, new_scores), row(cbind(1, new_scores))) - tfb_ob = object$Yhat_tfb + tfb_ob <- object$Yhat_tfb structure( coef_list, @@ -72,5 +72,4 @@ predict.rfr_fpca <- function(object, newdata, ...){ error_variance = attr(tfb_ob, "error_variance"), class = c("tfb_fpc", "tfb", "tf", "vctrs_vctr") ) - } diff --git a/R/fpca-utils.R b/R/fpca-utils.R index 7a2e052..b7a6bdd 100644 --- a/R/fpca-utils.R +++ b/R/fpca-utils.R @@ -1,17 +1,17 @@ ##' Convert tfb_fpc object to a list ##' @param tfb_fpc_obj object turned by `tfb_fpc` ##' @importFrom stats coefficients -extract_fpca <- function(tfb_fpc_obj){ +extract_fpca <- function(tfb_fpc_obj) { # may change this to not return the basis object - N = length(tfb_fpc_obj) - efunctions = attr(tfb_fpc_obj, "basis_matrix") - evalues = attr(tfb_fpc_obj, "score_variance") - npc = length(evalues) - error_var = attr(tfb_fpc_obj, "error_variance") + N <- length(tfb_fpc_obj) + efunctions <- attr(tfb_fpc_obj, "basis_matrix") + evalues <- attr(tfb_fpc_obj, "score_variance") + npc <- length(evalues) + error_var <- attr(tfb_fpc_obj, "error_variance") - coef_list = coefficients(tfb_fpc_obj) - score_list = lapply(coef_list, "[", -1) #drop intercepts - scores = do.call("rbind", score_list) + coef_list <- coefficients(tfb_fpc_obj) + score_list <- lapply(coef_list, "[", -1) # drop intercepts + scores <- do.call("rbind", score_list) fpca_obj <- list( Yhat_tfb = tfb_fpc_obj, @@ -48,12 +48,10 @@ extract_fpca <- function(tfb_fpc_obj){ #' refundr:::extract_fpca() %>% #' refundr:::extract_fpc_scores() #' -extract_fpc_scores = function(rfr_fpca_obj){ +extract_fpc_scores <- function(rfr_fpca_obj) { + score_mat <- rfr_fpca_obj$scores - score_mat = rfr_fpca_obj$scores - - colnames(score_mat) = str_c(rfr_fpca_obj$model_var, 1:dim(score_mat)[2], sep = "_score_") + colnames(score_mat) <- str_c(rfr_fpca_obj$model_var, 1:dim(score_mat)[2], sep = "_score_") as_tibble(score_mat) - } diff --git a/R/fpca_face.R b/R/fpca_face.R index 5a6725f..349c3dd 100644 --- a/R/fpca_face.R +++ b/R/fpca_face.R @@ -73,12 +73,11 @@ #' @importFrom tidyr spread #' @importFrom rlang .data #' @export -fpca_face <-function(data=NULL,Y.pred = NULL,argvals=NULL,pve = 0.99, npc = NULL, - center=TRUE,knots=35,p=3,m=2,lambda=NULL,alpha = 1, - search.grid=TRUE,search.length=100, - method="L-BFGS-B", lower=-20,upper=20, control=NULL){ - - data <- as.matrix(spread(as.data.frame(data), key = .data$arg, value = .data$value)[,-1]) +fpca_face <- function(data = NULL, Y.pred = NULL, argvals = NULL, pve = 0.99, npc = NULL, + center = TRUE, knots = 35, p = 3, m = 2, lambda = NULL, alpha = 1, + search.grid = TRUE, search.length = 100, + method = "L-BFGS-B", lower = -20, upper = 20, control = NULL) { + data <- as.matrix(spread(as.data.frame(data), key = .data$arg, value = .data$value)[, -1]) ## data: data, I by J data matrix, functions on rows ## argvals: vector of J @@ -89,8 +88,8 @@ fpca_face <-function(data=NULL,Y.pred = NULL,argvals=NULL,pve = 0.99, npc = NUL ## lambda: user-selected smoothing parameter ## method: see R function "optim" ## lower, upper, control: see R function "optim" - #require(Matrix) - #source("pspline.setting.R") + # require(Matrix) + # source("pspline.setting.R") ##### redo this so that it still accepts a dataframe (see ydata argument from original code) ##### but have this as an S3 class where it can accept either. @@ -100,44 +99,46 @@ fpca_face <-function(data=NULL,Y.pred = NULL,argvals=NULL,pve = 0.99, npc = NUL I <- data_dim[1] ## number of subjects J <- data_dim[2] ## number of obs per function - if(is.null(argvals)) argvals <- (1:J)/J-1/2/J ## if NULL, assume equally spaced + if (is.null(argvals)) argvals <- (1:J) / J - 1 / 2 / J ## if NULL, assume equally spaced - meanX <- rep(0,J) - if(center) {##center the functions - meanX <- colMeans(data, na.rm=TRUE) - meanX <- smooth.spline(argvals,meanX,all.knots =TRUE)$y - data <- t(t(data)- meanX) + meanX <- rep(0, J) + if (center) { ## center the functions + meanX <- colMeans(data, na.rm = TRUE) + meanX <- smooth.spline(argvals, meanX, all.knots = TRUE)$y + data <- t(t(data) - meanX) } ## specify the B-spline basis: knots p.p <- p m.p <- m - if(length(knots)==1){ - if(knots+p.p>=J) cat("Too many knots!\n") - stopifnot(knots+p.p= J) cat("Too many knots!\n") + stopifnot(knots + p.p < J) K.p <- knots - knots <- seq(-p.p,K.p+p.p,length=K.p+1+2*p.p)/K.p - knots <- knots*(max(argvals)-min(argvals)) + min(argvals) + knots <- seq(-p.p, K.p + p.p, length = K.p + 1 + 2 * p.p) / K.p + knots <- knots * (max(argvals) - min(argvals)) + min(argvals) } - if(length(knots)>1) K.p <- length(knots)-2*p.p-1 - if(K.p>=J) cat("Too many knots!\n") - stopifnot(K.p 1) K.p <- length(knots) - 2 * p.p - 1 + if (K.p >= J) cat("Too many knots!\n") + stopifnot(K.p < J) c.p <- K.p + p.p ######### precalculation for smoothing ############# - List <- pspline.setting(argvals,knots,p.p,m.p) + List <- pspline.setting(argvals, knots, p.p, m.p) B <- List$B Bt <- Matrix(t(as.matrix(B))) s <- List$s Sigi.sqrt <- List$Sigi.sqrt U <- List$U - A0 <- Sigi.sqrt%*%U - MM <- function(A,s,option=1){ - if(option==2) - return(A*(s%*%t(rep(1,dim(A)[2])))) - if(option==1) - return(A*(rep(1,dim(A)[1])%*%t(s))) + A0 <- Sigi.sqrt %*% U + MM <- function(A, s, option = 1) { + if (option == 2) { + return(A * (s %*% t(rep(1, dim(A)[2])))) + } + if (option == 1) { + return(A * (rep(1, dim(A)[1]) %*% t(s))) + } } ######## precalculation for missing data ######## @@ -145,37 +146,37 @@ fpca_face <-function(data=NULL,Y.pred = NULL,argvals=NULL,pve = 0.99, npc = NUL Niter.miss <- 1 Index.miss <- is.na(data) - if(sum(Index.miss)>0){ + if (sum(Index.miss) > 0) { num.miss <- rowSums(is.na(data)) - for(i in 1:I){ - if(num.miss[i]>0){ - y <- data[i,] + for (i in 1:I) { + if (num.miss[i] > 0) { + y <- data[i, ] seq <- (1:J)[!is.na(y)] - seq2 <-(1:J)[is.na(y)] + seq2 <- (1:J)[is.na(y)] t1 <- argvals[seq] t2 <- argvals[seq2] - fit <- smooth.spline(t1,y[seq]) - temp <- predict(fit,t2,all.knots=TRUE)$y - if(max(t2)>max(t1)) temp[t2>max(t1)] <- mean(y[seq])#y[seq[length(seq)]] - if(min(t2) max(t1)) temp[t2 > max(t1)] <- mean(y[seq]) # y[seq[length(seq)]] + if (min(t2) < min(t1)) temp[t2 < min(t1)] <- mean(y[seq]) # y[seq[1]] + data[i, seq2] <- temp } } - Y0 <- matrix(NA,c.p,I) + Y0 <- matrix(NA, c.p, I) imputation <- TRUE Niter.miss <- 100 } - convergence.vector <- rep(0,Niter.miss); + convergence.vector <- rep(0, Niter.miss) iter.miss <- 1 totalmiss <- mean(Index.miss) - while(iter.miss <= Niter.miss&&convergence.vector[iter.miss]==0) { + while (iter.miss <= Niter.miss && convergence.vector[iter.miss] == 0) { ################################################### ######## Transform the Data ############# ################################################### - Ytilde <- as.matrix(t(A0)%*%as.matrix(Bt%*%t(data))) + Ytilde <- as.matrix(t(A0) %*% as.matrix(Bt %*% t(data))) C_diag <- rowSums(Ytilde^2) - if(iter.miss==1) Y0 = Ytilde + if (iter.miss == 1) Y0 <- Ytilde ################################################### ######## Select Smoothing Parameters ############# ################################################### @@ -183,119 +184,126 @@ fpca_face <-function(data=NULL,Y.pred = NULL,argvals=NULL,pve = 0.99, npc = NUL Ytilde_square <- sum(Ytilde^2) face_gcv <- function(x) { lambda <- exp(x) - lambda_s <- (lambda*s)^2/(1 + lambda*s)^2 - gcv <- sum(C_diag*lambda_s) - Ytilde_square + Y_square - trace <- sum(1/(1+lambda*s)) - gcv <- gcv/(1-alpha*trace/J/(1-totalmiss))^2 + lambda_s <- (lambda * s)^2 / (1 + lambda * s)^2 + gcv <- sum(C_diag * lambda_s) - Ytilde_square + Y_square + trace <- sum(1 / (1 + lambda * s)) + gcv <- gcv / (1 - alpha * trace / J / (1 - totalmiss))^2 return(gcv) } - if(is.null(lambda)) { - if(!search.grid){ - fit <- optim(0,face_gcv,method=method,lower=lower,upper=upper,control=control) - if(fit$convergence>0) { - expression <- paste("Smoothing failed! The code is:",fit$convergence) + if (is.null(lambda)) { + if (!search.grid) { + fit <- optim(0, face_gcv, method = method, lower = lower, upper = upper, control = control) + if (fit$convergence > 0) { + expression <- paste("Smoothing failed! The code is:", fit$convergence) print(expression) } lambda <- exp(fit$par) } else { - Lambda <- seq(lower,upper,length=search.length) + Lambda <- seq(lower, upper, length = search.length) Length <- length(Lambda) - Gcv <- rep(0,Length) - for(i in 1:Length) + Gcv <- rep(0, Length) + for (i in 1:Length) { Gcv[i] <- face_gcv(Lambda[i]) + } i0 <- which.min(Gcv) lambda <- exp(Lambda[i0]) } } - YS <- MM(Ytilde,1/(1+lambda*s),2) + YS <- MM(Ytilde, 1 / (1 + lambda * s), 2) ################################################### #### Eigendecomposition of Smoothed Data ######### ################################################### - if(c.p <= I){ - temp <- YS%*%t(YS)/I - Eigen <- eigen(temp,symmetric=TRUE) + if (c.p <= I) { + temp <- YS %*% t(YS) / I + Eigen <- eigen(temp, symmetric = TRUE) A <- Eigen$vectors - Sigma <- Eigen$values/J + Sigma <- Eigen$values / J } else { - temp <- t(YS)%*%YS/I - Eigen <- eigen(temp,symmetric=TRUE) - Sigma <- Eigen$values/J - #N <- sum(Sigma>0.0000001) - A <- YS%*%(Eigen$vectors%*%diag(1/sqrt(Eigen$values)))/sqrt(I) + temp <- t(YS) %*% YS / I + Eigen <- eigen(temp, symmetric = TRUE) + Sigma <- Eigen$values / J + # N <- sum(Sigma>0.0000001) + A <- YS %*% (Eigen$vectors %*% diag(1 / sqrt(Eigen$values))) / sqrt(I) } - if(iter.miss>1&&iter.miss< Niter.miss) { - diff <- norm(YS-YS.temp,"F")/norm(YS,"F") - if(diff <= 0.02) - convergence.vector[iter.miss+1] <- 1 + if (iter.miss > 1 && iter.miss < Niter.miss) { + diff <- norm(YS - YS.temp, "F") / norm(YS, "F") + if (diff <= 0.02) { + convergence.vector[iter.miss + 1] <- 1 + } } YS.temp <- YS iter.miss <- iter.miss + 1 - N <- min(I,c.p) + N <- min(I, c.p) d <- Sigma[1:N] - d <- d[d>0] - per <- cumsum(d)/sum(d) + d <- d[d > 0] + per <- cumsum(d) / sum(d) - N <- ifelse (is.null(npc), min(which(per>pve)), min(npc, length(d))) + N <- ifelse(is.null(npc), min(which(per > pve)), min(npc, length(d))) - #print(c(iter.miss,convergence.vector[iter.miss+1],lambda,diff)) + # print(c(iter.miss,convergence.vector[iter.miss+1],lambda,diff)) ######################################### ####### Principal Scores ######### ######## data imputation ######### ######################################### - if(imputation) { - A.N <- A[,1:N] + if (imputation) { + A.N <- A[, 1:N] d <- Sigma[1:N] - sigmahat2 <- max(mean(data[!Index.miss]^2) -sum(Sigma),0) - Xi <- t(A.N)%*%Ytilde - Xi <- t(as.matrix(B%*%(A0%*%((A.N%*%diag(d/(d+sigmahat2/J)))%*%Xi)))) - data <- data*(1-Index.miss) + Xi*Index.miss - if(sum(is.na(data))>0) + sigmahat2 <- max(mean(data[!Index.miss]^2) - sum(Sigma), 0) + Xi <- t(A.N) %*% Ytilde + Xi <- t(as.matrix(B %*% (A0 %*% ((A.N %*% diag(d / (d + sigmahat2 / J))) %*% Xi)))) + data <- data * (1 - Index.miss) + Xi * Index.miss + if (sum(is.na(data)) > 0) { print("error") + } } - #if(iter.miss%%10==0) print(iter.miss) + # if(iter.miss%%10==0) print(iter.miss) } ## end of while loop ### now calculate scores - if(is.null(Y.pred)) Y.pred = data - else {Y.pred = t(t(as.matrix(Y.pred))-meanX)} + if (is.null(Y.pred)) { + Y.pred <- data + } else { + Y.pred <- t(t(as.matrix(Y.pred)) - meanX) + } - N <- ifelse (is.null(npc), min(which(per>pve)), npc) - if (N>ncol(A)) { - warning(paste0("The requested npc of ", npc, - " is greater than the maximum allowable value of ", - ncol(A), ". Using ", ncol(A), ".")) + N <- ifelse(is.null(npc), min(which(per > pve)), npc) + if (N > ncol(A)) { + warning(paste0( + "The requested npc of ", npc, + " is greater than the maximum allowable value of ", + ncol(A), ". Using ", ncol(A), "." + )) N <- ncol(A) } npc <- N - Ytilde <- as.matrix(t(A0)%*%(Bt%*%t(Y.pred))) - sigmahat2 <- max(mean(data[!Index.miss]^2) -sum(Sigma),0) - Xi <- t(Ytilde)%*%(A[,1:N]/sqrt(J)) - Xi <- MM(Xi,Sigma[1:N]/(Sigma[1:N] + sigmahat2/J)) + Ytilde <- as.matrix(t(A0) %*% (Bt %*% t(Y.pred))) + sigmahat2 <- max(mean(data[!Index.miss]^2) - sum(Sigma), 0) + Xi <- t(Ytilde) %*% (A[, 1:N] / sqrt(J)) + Xi <- MM(Xi, Sigma[1:N] / (Sigma[1:N] + sigmahat2 / J)) - eigenvectors = as.matrix(B%*%(A0%*%A[,1:N])) - eigenvalues = Sigma[1:N] #- sigmahat2/J + eigenvectors <- as.matrix(B %*% (A0 %*% A[, 1:N])) + eigenvalues <- Sigma[1:N] #- sigmahat2/J - Yhat <- t(A[,1:N])%*%Ytilde - Yhat <- as.matrix(B%*%(A0%*%A[,1:N]%*%diag(eigenvalues/(eigenvalues+sigmahat2/J))%*%Yhat)) + Yhat <- t(A[, 1:N]) %*% Ytilde + Yhat <- as.matrix(B %*% (A0 %*% A[, 1:N] %*% diag(eigenvalues / (eigenvalues + sigmahat2 / J)) %*% Yhat)) Yhat <- t(Yhat + meanX) - scores <- sqrt(J)*Xi[,1:N] + scores <- sqrt(J) * Xi[, 1:N] mu <- meanX - efunctions <- eigenvectors[,1:N] - evalues <- J*eigenvalues[1:N] + efunctions <- eigenvectors[, 1:N] + evalues <- J * eigenvalues[1:N] error_var <- sigmahat2 ret.objects <- c("scores", "mu", "efunctions", "evalues", "npc", "error_var") - ret = lapply(1:length(ret.objects), function(u) get(ret.objects[u])) - names(ret) = ret.objects - class(ret) = "fpca" - return(ret) - + ret <- lapply(seq_along(ret.objects), function(u) get(ret.objects[u])) + names(ret) <- ret.objects + class(ret) <- "fpca" + ret } diff --git a/R/fpca_sc.R b/R/fpca_sc.R index d92d914..0f47bc7 100644 --- a/R/fpca_sc.R +++ b/R/fpca_sc.R @@ -81,57 +81,66 @@ ##' @importFrom mgcv gam predict.gam ##' @importFrom gamm4 gamm4 ##' @export -fpca_sc <- function(data, Y.pred = NULL, argvals = NULL, random.int = FALSE, - nbasis = 10, pve = 0.99, npc = NULL, - useSymm = FALSE, makePD = FALSE, center = TRUE, cov.est.method = 2, integration = "trapezoidal") { - - #data <- tidyfun:::df_2_mat(data) ## calls complete.cases on the data, only use this once fixed regular function - data <- as.matrix(spread(as.data.frame(data), key = .data$arg, value = .data$value)[,-1]) - if (is.null(Y.pred)) - Y.pred = data - D = NCOL(data) - I = NROW(data) - I.pred = NROW(Y.pred) +fpca_sc <- function( + data, Y.pred = NULL, argvals = NULL, random.int = FALSE, + nbasis = 10, pve = 0.99, npc = NULL, + useSymm = FALSE, makePD = FALSE, center = TRUE, cov.est.method = 2, integration = "trapezoidal") { + # data <- tidyfun:::df_2_mat(data) ## calls complete.cases on the data, only use this once fixed regular function + data <- as.matrix(spread(as.data.frame(data), key = .data$arg, value = .data$value)[, -1]) + if (is.null(Y.pred)) { + Y.pred <- data + } + D <- NCOL(data) + I <- NROW(data) + I.pred <- NROW(Y.pred) - if (is.null(argvals)) - argvals = seq(0, 1, length = D) + if (is.null(argvals)) { + argvals <- seq(0, 1, length = D) + } - d.vec = rep(argvals, each = I) - id = rep(1:I, rep(D, I)) + d.vec <- rep(argvals, each = I) + id <- rep(1:I, rep(D, I)) if (center) { if (random.int) { ri_data <- data.frame(y = as.vector(data), d.vec = d.vec, id = factor(id)) - gam0 = gamm4(y ~ s(d.vec, k = nbasis), random = ~(1 | id), data = ri_data)$gam + gam0 <- gamm4(y ~ s(d.vec, k = nbasis), random = ~ (1 | id), data = ri_data)$gam rm(ri_data) - } else gam0 = gam(as.vector(data) ~ s(d.vec, k = nbasis)) - mu = predict(gam0, newdata = data.frame(d.vec = argvals)) - data.tilde = data - matrix(mu, I, D, byrow = TRUE) + } else { + gam0 <- gam(as.vector(data) ~ s(d.vec, k = nbasis)) + } + mu <- predict(gam0, newdata = data.frame(d.vec = argvals)) + data.tilde <- data - matrix(mu, I, D, byrow = TRUE) } else { - data.tilde = data - mu = rep(0, D) + data.tilde <- data + mu <- rep(0, D) } if (cov.est.method == 2) { # smooth raw covariance estimate - cov.sum = cov.count = cov.mean = matrix(0, D, D) + cov.sum <- cov.count <- cov.mean <- matrix(0, D, D) for (i in 1:I) { - obs.points = which(!is.na(data[i, ])) - cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + + obs.points <- which(!is.na(data[i, ])) + cov.count[obs.points, obs.points] <- cov.count[obs.points, obs.points] + 1 - cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(data.tilde[i, - obs.points]) + cov.sum[obs.points, obs.points] <- cov.sum[obs.points, obs.points] + tcrossprod(data.tilde[ + i, + obs.points + ]) } - G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count) - diag.G0 = diag(G.0) - diag(G.0) = NA + G.0 <- ifelse(cov.count == 0, NA, cov.sum / cov.count) + diag.G0 <- diag(G.0) + diag(G.0) <- NA if (!useSymm) { - row.vec = rep(argvals, each = D) - col.vec = rep(argvals, D) - npc.0 = matrix(predict(gam(as.vector(G.0) ~ te(row.vec, col.vec, k = nbasis), - weights = as.vector(cov.count)), newdata = data.frame(row.vec = row.vec, - col.vec = col.vec)), D, D) - npc.0 = (npc.0 + t(npc.0))/2 + row.vec <- rep(argvals, each = D) + col.vec <- rep(argvals, D) + npc.0 <- matrix(predict(gam(as.vector(G.0) ~ te(row.vec, col.vec, k = nbasis), + weights = as.vector(cov.count) + ), newdata = data.frame( + row.vec = row.vec, + col.vec = col.vec + )), D, D) + npc.0 <- (npc.0 + t(npc.0)) / 2 } else { use <- upper.tri(G.0, diag = TRUE) use[2, 1] <- use[ncol(G.0), ncol(G.0) - 1] <- TRUE @@ -152,34 +161,40 @@ fpca_sc <- function(data, Y.pred = NULL, argvals = NULL, random.int = FALSE, } } else if (cov.est.method == 1) { # smooth y(s1)y(s2) values to obtain covariance estimate - row.vec = col.vec = G.0.vec = c() - cov.sum = cov.count = cov.mean = matrix(0, D, D) + row.vec <- col.vec <- G.0.vec <- c() + cov.sum <- cov.count <- cov.mean <- matrix(0, D, D) for (i in 1:I) { - obs.points = which(!is.na(data[i, ])) - temp = tcrossprod(data.tilde[i, obs.points]) - diag(temp) = NA - row.vec = c(row.vec, rep(argvals[obs.points], each = length(obs.points))) - col.vec = c(col.vec, rep(argvals[obs.points], length(obs.points))) - G.0.vec = c(G.0.vec, as.vector(temp)) + obs.points <- which(!is.na(data[i, ])) + temp <- tcrossprod(data.tilde[i, obs.points]) + diag(temp) <- NA + row.vec <- c(row.vec, rep(argvals[obs.points], each = length(obs.points))) + col.vec <- c(col.vec, rep(argvals[obs.points], length(obs.points))) + G.0.vec <- c(G.0.vec, as.vector(temp)) # still need G.O raw to calculate to get the raw to get the diagonal - cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + + cov.count[obs.points, obs.points] <- cov.count[obs.points, obs.points] + 1 - cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(data.tilde[i, - obs.points]) + cov.sum[obs.points, obs.points] <- cov.sum[obs.points, obs.points] + tcrossprod(data.tilde[ + i, + obs.points + ]) } - row.vec.pred = rep(argvals, each = D) - col.vec.pred = rep(argvals, D) - npc.0 = matrix(predict(gam(G.0.vec ~ te(row.vec, col.vec, k = nbasis)), newdata = data.frame(row.vec = row.vec.pred, - col.vec = col.vec.pred)), D, D) - npc.0 = (npc.0 + t(npc.0))/2 - G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count) - diag.G0 = diag(G.0) + row.vec.pred <- rep(argvals, each = D) + col.vec.pred <- rep(argvals, D) + npc.0 <- matrix(predict(gam(G.0.vec ~ te(row.vec, col.vec, k = nbasis)), newdata = data.frame( + row.vec = row.vec.pred, + col.vec = col.vec.pred + )), D, D) + npc.0 <- (npc.0 + t(npc.0)) / 2 + G.0 <- ifelse(cov.count == 0, NA, cov.sum / cov.count) + diag.G0 <- diag(G.0) } if (makePD) { npc.0 <- { - tmp <- Matrix::nearPD(npc.0, corr = FALSE, keepDiag = FALSE, do2eigen = TRUE, - trace = TRUE) + tmp <- Matrix::nearPD(npc.0, + corr = FALSE, keepDiag = FALSE, do2eigen = TRUE, + trace = TRUE + ) as.matrix(tmp$mat) } } @@ -187,47 +202,49 @@ fpca_sc <- function(data, Y.pred = NULL, argvals = NULL, random.int = FALSE, ### Chapter 8) w <- quadWeights(argvals, method = integration) Wsqrt <- diag(sqrt(w)) - Winvsqrt <- diag(1/(sqrt(w))) + Winvsqrt <- diag(1 / (sqrt(w))) V <- Wsqrt %*% npc.0 %*% Wsqrt - evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values + evalues <- eigen(V, symmetric = TRUE, only.values = TRUE)$values ### - evalues = replace(evalues, which(evalues <= 0), 0) - npc = ifelse(is.null(npc), min(which(cumsum(evalues)/sum(evalues) > pve)), npc) - efunctions = matrix(Winvsqrt %*% eigen(V, symmetric = TRUE)$vectors[, seq(len = npc)], - nrow = D, ncol = npc) - evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values[1:npc] # use correct matrix for eigenvalue problem - cov.hat = efunctions %*% tcrossprod(diag(evalues, nrow = npc, ncol = npc), efunctions) + evalues <- replace(evalues, which(evalues <= 0), 0) + npc <- ifelse(is.null(npc), min(which(cumsum(evalues) / sum(evalues) > pve)), npc) + efunctions <- matrix(Winvsqrt %*% eigen(V, symmetric = TRUE)$vectors[, seq(len = npc)], + nrow = D, ncol = npc + ) + evalues <- eigen(V, symmetric = TRUE, only.values = TRUE)$values[1:npc] # use correct matrix for eigenvalue problem + cov.hat <- efunctions %*% tcrossprod(diag(evalues, nrow = npc, ncol = npc), efunctions) ### numerical integration for estimation of sigma2 - T.len <- argvals[D] - argvals[1] # total interval length - T1.min <- min(which(argvals >= argvals[1] + 0.25 * T.len)) # left bound of narrower interval T1 - T1.max <- max(which(argvals <= argvals[D] - 0.25 * T.len)) # right bound of narrower interval T1 - DIAG = (diag.G0 - diag(cov.hat))[T1.min:T1.max] # function values + T.len <- argvals[D] - argvals[1] # total interval length + T1.min <- min(which(argvals >= argvals[1] + 0.25 * T.len)) # left bound of narrower interval T1 + T1.max <- max(which(argvals <= argvals[D] - 0.25 * T.len)) # right bound of narrower interval T1 + DIAG <- (diag.G0 - diag(cov.hat))[T1.min:T1.max] # function values w2 <- quadWeights(argvals[T1.min:T1.max], method = integration) sigma2 <- max(weighted.mean(DIAG, w = w2, na.rm = TRUE), 0) error_var <- sigma2 #### - D.inv = diag(1/evalues, nrow = npc, ncol = npc) - Z = efunctions - data.tilde = Y.pred - matrix(mu, I.pred, D, byrow = TRUE) - Yhat = matrix(0, nrow = I.pred, ncol = D) - rownames(Yhat) = rownames(Y.pred) - colnames(Yhat) = colnames(Y.pred) - scores = matrix(NA, nrow = I.pred, ncol = npc) + D.inv <- diag(1 / evalues, nrow = npc, ncol = npc) + Z <- efunctions + data.tilde <- Y.pred - matrix(mu, I.pred, D, byrow = TRUE) + Yhat <- matrix(0, nrow = I.pred, ncol = D) + rownames(Yhat) <- rownames(Y.pred) + colnames(Yhat) <- colnames(Y.pred) + scores <- matrix(NA, nrow = I.pred, ncol = npc) for (i.subj in 1:I.pred) { - obs.points = which(!is.na(Y.pred[i.subj, ])) - if (sigma2 == 0 & length(obs.points) < npc) + obs.points <- which(!is.na(Y.pred[i.subj, ])) + if (sigma2 == 0 && length(obs.points) < npc) { stop("Measurement error estimated to be zero and there are fewer observed points than PCs; scores cannot be estimated.") - Zcur = matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2]) - ZtZ_sD.inv = solve(crossprod(Zcur) + sigma2 * D.inv) - scores[i.subj, ] = ZtZ_sD.inv %*% t(Zcur) %*% (data.tilde[i.subj, obs.points]) - Yhat[i.subj, ] = t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions) + } + Zcur <- matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2]) + ZtZ_sD.inv <- solve(crossprod(Zcur) + sigma2 * D.inv) + scores[i.subj, ] <- ZtZ_sD.inv %*% t(Zcur) %*% (data.tilde[i.subj, obs.points]) + Yhat[i.subj, ] <- t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions) } - ret.objects = c("mu", "efunctions", "scores", "npc", "evalues", "error_var") + ret.objects <- c("mu", "efunctions", "scores", "npc", "evalues", "error_var") - ret = lapply(1:length(ret.objects), function(u) get(ret.objects[u])) - names(ret) = ret.objects - class(ret) = "fpca" - return(ret) + ret <- lapply(seq_along(ret.objects), function(u) get(ret.objects[u])) + names(ret) <- ret.objects + class(ret) <- "fpca" + ret } diff --git a/R/fpca_ssvd.R b/R/fpca_ssvd.R index be8b437..b62f0e2 100644 --- a/R/fpca_ssvd.R +++ b/R/fpca_ssvd.R @@ -1,39 +1,39 @@ -#'Smoothed FPCA via iterative penalized rank one SVDs. +#' Smoothed FPCA via iterative penalized rank one SVDs. #' -#'Implements the algorithm of Huang, Shen, Buja (2008) for finding smooth right -#'singular vectors of a matrix `X` containing (contaminated) evaluations of -#'functional random variables on a regular, equidistant grid. If the number of -#'smooth SVs to extract is not specified, the function hazards a guess for the -#'appropriate number based on the asymptotically optimal truncation threshold -#'under the assumption of a low rank matrix contaminated with i.i.d. Gaussian -#'noise with unknown variance derived in Donoho, Gavish (2013). Please note that -#'Donoho, Gavish (2013) should be regarded as experimental for functional PCA, -#'and will typically not work well if you have more observations than grid -#'points. +#' Implements the algorithm of Huang, Shen, Buja (2008) for finding smooth right +#' singular vectors of a matrix `X` containing (contaminated) evaluations of +#' functional random variables on a regular, equidistant grid. If the number of +#' smooth SVs to extract is not specified, the function hazards a guess for the +#' appropriate number based on the asymptotically optimal truncation threshold +#' under the assumption of a low rank matrix contaminated with i.i.d. Gaussian +#' noise with unknown variance derived in Donoho, Gavish (2013). Please note that +#' Donoho, Gavish (2013) should be regarded as experimental for functional PCA, +#' and will typically not work well if you have more observations than grid +#' points. #' -#'@param Y data matrix (rows: observations; columns: grid of eval. points) -#'@param argvals the argument values of the function evaluations in `Y`, +#' @param Y data matrix (rows: observations; columns: grid of eval. points) +#' @param argvals the argument values of the function evaluations in `Y`, #' defaults to a equidistant grid from 0 to 1. See Details. -#'@param npc how many smooth SVs to try to extract, if `NA` (the default) +#' @param npc how many smooth SVs to try to extract, if `NA` (the default) #' the hard thresholding rule of Donoho, Gavish (2013) is used (see Details, #' References). -#'@param center center `Y` so that its column-means are 0? Defaults to +#' @param center center `Y` so that its column-means are 0? Defaults to #' `TRUE` -#'@param maxiter how many iterations of the power algorithm to perform at most +#' @param maxiter how many iterations of the power algorithm to perform at most #' (defaults to 15) -#'@param tol convergence tolerance for power algorithm (defaults to 1e-4) -#'@param diffpen difference penalty order controlling the desired smoothness of +#' @param tol convergence tolerance for power algorithm (defaults to 1e-4) +#' @param diffpen difference penalty order controlling the desired smoothness of #' the right singular vectors, defaults to 3 (i.e., deviations from local #' quadratic polynomials). -#'@param gridsearch use [stats::optimize()] or a grid search to find +#' @param gridsearch use [stats::optimize()] or a grid search to find #' GCV-optimal smoothing parameters? defaults to `TRUE`. -#'@param alphagrid grid of smoothing parameter values for grid search -#'@param lower.alpha lower limit for for smoothing parameter if +#' @param alphagrid grid of smoothing parameter values for grid search +#' @param lower.alpha lower limit for for smoothing parameter if #' `!gridsearch` -#'@param upper.alpha upper limit for smoothing parameter if `!gridsearch` -#'@param integration ignored, see Details. -#'@author Fabian Scheipl -#'@references Huang, J. Z., Shen, H., and Buja, A. (2008). Functional principal +#' @param upper.alpha upper limit for smoothing parameter if `!gridsearch` +#' @param integration ignored, see Details. +#' @author Fabian Scheipl +#' @references Huang, J. Z., Shen, H., and Buja, A. (2008). Functional principal #' components analysis via penalized rank one approximation. *Electronic #' Journal of Statistics*, 2, 678-695 #' @@ -41,26 +41,30 @@ #' Values is 4/sqrt(3). eprint arXiv:1305.5870. Available from #' . #' @importFrom stats median optimize -fpca_ssvd <- function(Y=NULL, argvals = NULL, npc = NA, center = TRUE, maxiter = 15, - tol = 1e-4, diffpen = 3, gridsearch = TRUE, alphagrid = 1.5^(-20:40), - lower.alpha = 1e-5, upper.alpha = 1e7, integration = "trapezoidal"){ - +fpca_ssvd <- function( + Y = NULL, argvals = NULL, npc = NA, center = TRUE, maxiter = 15, + tol = 1e-4, diffpen = 3, gridsearch = TRUE, alphagrid = 1.5^(-20:40), + lower.alpha = 1e-5, upper.alpha = 1e7, integration = "trapezoidal") { stopifnot(!is.null(Y)) - if(any(is.na(Y))) stop("No missing values in allowed.") + if (any(is.na(Y))) stop("No missing values in allowed.") m <- ncol(Y) n <- nrow(Y) irregular <- FALSE - if(!is.null(argvals)) { - stopifnot(is.numeric(argvals), + if (!is.null(argvals)) { + stopifnot( + is.numeric(argvals), length(argvals) == m, - all(!is.na(argvals))) - if(any(diff(argvals)/mean(diff(argvals)) > 1.05 | - diff(argvals)/mean(diff(argvals)) < 0.95)) { - warning(paste("non-equidistant -grid detected:", + all(!is.na(argvals)) + ) + if (any(diff(argvals) / mean(diff(argvals)) > 1.05 | + diff(argvals) / mean(diff(argvals)) < 0.95)) { + warning(paste( + "non-equidistant -grid detected:", "fpca.ssvd will return orthonormal eigenvectors of the function evaluations", "not evaluations of the orthonormal eigenvectors.", - "Use fpca.sc() for the latter instead.")) + "Use fpca.sc() for the latter instead." + )) irregular <- TRUE } } else { @@ -68,45 +72,45 @@ fpca_ssvd <- function(Y=NULL, argvals = NULL, npc = NA, center = TRUE, maxiter = } - #GCV criterion from eq. (10), App. C: - gcv <- function(alpha, w, m, lambda){ - sqrt(sum( (w * (alpha*lambda)/(1 + alpha*lambda))^2 ))/ - (1- 1/m * sum( 1/(1 + alpha*lambda) )) + # GCV criterion from eq. (10), App. C: + gcv <- function(alpha, w, m, lambda) { + sqrt(sum((w * (alpha * lambda) / (1 + alpha * lambda))^2)) / + (1 - 1 / m * sum(1 / (1 + alpha * lambda))) } # Recursion for difference operator matrix - makeDiffOp <- function(degree, dim){ - if(degree==0){ + makeDiffOp <- function(degree, dim) { + if (degree == 0) { return(diag(dim)) } else { - return(diff(makeDiffOp(degree-1, dim))) + return(diff(makeDiffOp(degree - 1, dim))) } } - if(is.na(npc)){ + if (is.na(npc)) { npc <- getNPC.DonohoGavish(Y) } - if(!is.numeric(npc)) stop("Invalid .") - if(npc<1 | npc>min(m,n)) stop("Invalid .") + if (!is.numeric(npc)) stop("Invalid .") + if (npc < 1 || npc > min(m, n)) stop("Invalid .") - if(!is.numeric(alphagrid)) stop("Invalid .") - if(any(is.na(alphagrid)) | any(alphagrid<.Machine$double.eps)) stop("Invalid .") + if (!is.numeric(alphagrid)) stop("Invalid .") + if (any(is.na(alphagrid)) | any(alphagrid < .Machine$double.eps)) stop("Invalid .") uhoh <- numeric(0) - Omega <- crossprod(makeDiffOp(degree=diffpen, dim=m)) - eOmega <- eigen(Omega, symmetric=TRUE) + Omega <- crossprod(makeDiffOp(degree = diffpen, dim = m)) + eOmega <- eigen(Omega, symmetric = TRUE) lambda <- eOmega$values - Gamma <- eOmega$vectors + Gamma <- eOmega$vectors - U <- matrix(NA, nrow=n, ncol=npc) - V <- matrix(NA, nrow=m, ncol=npc) + U <- matrix(NA, nrow = n, ncol = npc) + V <- matrix(NA, nrow = m, ncol = npc) d <- rep(NA, npc) Yorig <- Y - if(center){ - meanY <- predict(smooth.spline(x=1:m, y=colMeans(Y)), x=1:m)$y + if (center) { + meanY <- predict(smooth.spline(x = 1:m, y = colMeans(Y)), x = 1:m)$y Y <- t(t(Y) - meanY) } else { meanY <- rep(0, ncol(Y)) @@ -114,83 +118,92 @@ fpca_ssvd <- function(Y=NULL, argvals = NULL, npc = NA, center = TRUE, maxiter = Ynow <- Y - for(k in 1:npc){ + for (k in 1:npc) { ## 'Power algorithm', Section 3 / Appendix C : - YnowGamma <- Ynow%*%Gamma - vold <- svd(Ynow, nu=0, nv=1)$v[,1] + YnowGamma <- Ynow %*% Gamma + vold <- svd(Ynow, nu = 0, nv = 1)$v[, 1] u <- Ynow %*% vold - iter <- 1; reldiff <- tol+1 - while(all(c(reldiff > tol, iter<=maxiter))){ - w <- t(YnowGamma)%*%u - if(gridsearch){ - gridmin <- which.min(sapply(alphagrid, gcv, w=w, m=m, lambda=lambda)) + iter <- 1 + reldiff <- tol + 1 + while (all(c(reldiff > tol, iter <= maxiter))) { + w <- t(YnowGamma) %*% u + if (gridsearch) { + gridmin <- which.min(sapply(alphagrid, gcv, w = w, m = m, lambda = lambda)) minalpha <- alphagrid[gridmin] } else { - minalpha <- optimize(f=gcv, interval=c(lower.alpha, upper.alpha), - w=w, m=m, lambda=lambda)$minimum + minalpha <- optimize( + f = gcv, interval = c(lower.alpha, upper.alpha), + w = w, m = m, lambda = lambda + )$minimum } # v = S(alpha)^-1 Y u = Gamma%*%(I + minalpha*Lambda)^-1 Gamma' Y u) - vnew <- Gamma%*%(w/(1+minalpha*lambda)) + vnew <- Gamma %*% (w / (1 + minalpha * lambda)) - vnew <- vnew/sqrt(sum(vnew^2)) - reldiff <- sum((vold-vnew)^2)/sum(vold^2) - iter <- iter +1 + vnew <- vnew / sqrt(sum(vnew^2)) + reldiff <- sum((vold - vnew)^2) / sum(vold^2) + iter <- iter + 1 vold <- vnew u <- Ynow %*% vold } # end while(reldiff) - if(reldiff > tol){ - warning("Not converged for SV ", k, - "; relative difference was ", format(reldiff),".") + if (reldiff > tol) { + warning( + "Not converged for SV ", k, + "; relative difference was ", format(reldiff), "." + ) } - U[,k] <- u/sqrt(sum(u^2)) - V[,k] <- vnew - d[k] <- sqrt(sum(u^2)) + U[, k] <- u / sqrt(sum(u^2)) + V[, k] <- vnew + d[k] <- sqrt(sum(u^2)) - Ynow <- Ynow - U[, k, drop=FALSE] %*% (t(V[, k, drop=FALSE]) * d[k]) - noisesv <- svd(Ynow, nu=0, nv=0)$d[1] - if(noisesv > 1.1 * d[k]){ + Ynow <- Ynow - U[, k, drop = FALSE] %*% (t(V[, k, drop = FALSE]) * d[k]) + noisesv <- svd(Ynow, nu = 0, nv = 0)$d[1] + if (noisesv > 1.1 * d[k]) { uhoh <- c(uhoh, k) } - }# end for(k) - if(length(uhoh)) warning("First SV for remaining un-smooth signal larger than ", - "SV found for smooth signal for component(s) ", paste(uhoh, collapse=",")) - if(!irregular){ + } # end for(k) + if (length(uhoh)) { + warning( + "First SV for remaining un-smooth signal larger than ", + "SV found for smooth signal for component(s) ", paste(uhoh, collapse = ",") + ) + } + if (!irregular) { # scale smooth eigenvectors so they're scaled as realizations of orthonormal # eigenfunctions i.e. so that colSums(diff(argvals) * V^2) == 1 instead of # crossprod(V) == diag(npc) scale <- sqrt(mean(diff(argvals))) V <- V / scale - d <- d * scale + d <- d * scale } else { scale <- 1 } - scores <- U%*%(d * diag(length(d))) + scores <- U %*% (d * diag(length(d))) - ret = list( - #Yhat = t(meanY + t(scores%*%t(V))), + ret <- list( + # Yhat = t(meanY + t(scores%*%t(V))), scores = scores, mu = meanY, efunctions = V, - #FIXME: this should be d^2 but the scaling is way off.... + # FIXME: this should be d^2 but the scaling is way off.... evalues = diag(var(scores)), - npc = npc) - class(ret) = "fpca" - return(ret) - + npc = npc + ) + class(ret) <- "fpca" + ret } -getNPC.DonohoGavish <- function(X){ +getNPC.DonohoGavish <- function(X) { # use Gavish and Donoho (2014) for estimating suitable number of sv's to extract: n <- nrow(X) m <- ncol(X) - beta <- n/m + beta <- n / m - if(beta > 1 | beta < 1e-3){ + if (beta > 1 || beta < 1e-3) { warning("Approximation for \\beta(\\omega) may be invalid.") } # ## approx for omega.beta below eq. (25): @@ -208,11 +221,11 @@ getNPC.DonohoGavish <- function(X){ # lambda.beta <- sqrt(2*(beta+1) + (8*beta)/(beta + 1 + sqrt(beta^2 + 14*beta +1))) # omega.beta <- lambda.beta/sqrt(mu.beta) - omega.beta <- .56*beta^3 - 0.95*beta^2 + 1.82*beta + 1.43 - y <- svd(X, nu=0, nv=0)$d - rankY <- min(which(cumsum(y[y>0])/sum(y[y>0]) > .995)) + omega.beta <- .56 * beta^3 - 0.95 * beta^2 + 1.82 * beta + 1.43 + y <- svd(X, nu = 0, nv = 0)$d + rankY <- min(which(cumsum(y[y > 0]) / sum(y[y > 0]) > .995)) y.med <- median(y) - npc <- min(max(1, sum(y > omega.beta * y.med)), rankY) - return(npc) + npc <- min(max(1, sum(y > omega.beta * y.med)), rankY) + npc } diff --git a/R/pspline.setting.R b/R/pspline.setting.R index 62a9f5f..df01b05 100644 --- a/R/pspline.setting.R +++ b/R/pspline.setting.R @@ -1,94 +1,91 @@ #' @importFrom splines spline.des -pspline.setting <- function(x,knots=select_knots(x,35),p=3,m=2,periodicity=FALSE,weight=NULL){ - -# x: the marginal data points -# knots: the list of interior knots or the numbers of interior knots -# p: degrees for B-splines, with defaults values 3 -# m: orders of difference penalty, with default values 2 -#require(splines) -#require(Matrix) - -### design matrix -K = length(knots)-2*p-1 -B = splines::spline.des(knots=knots, x=x, ord = p+1,outer.ok = TRUE)$design -if(periodicity){ - Bint = B[,-c(1:p,K+1:p)] - Bleft = B[,1:p] - Bright = B[,K+1:p] - B = cbind(Bint,Bleft+Bright) -} - - -difference.penalty <-function(m,p,K,periodicity=FALSE){ - - # parameter m: difference order - # parameter p: degree of B-splines - # parameter K: number of interior knots - c = rep(0,m+1) - - for(i in 0:m) - c[i+1] = (-1)^(i+1)*factorial(m)/(factorial(i)*factorial(m-i)) - - if(!periodicity){ - - M = matrix(0,nrow=K+p-m,ncol=K+p) - for(i in 1:(K+p-m)) M[i,i:(i+m)] = c - } - if(periodicity){ - - M = matrix(0,nrow=K,ncol=K) - for(i in 1:(K-m)) M[i,i:(i+m)] = c - for(i in (K-m+1):K) M[i,c(i:K,1:(m-K+i))] = c +pspline.setting <- function(x, knots = select_knots(x, 35), p = 3, m = 2, periodicity = FALSE, weight = NULL) { + # x: the marginal data points + # knots: the list of interior knots or the numbers of interior knots + # p: degrees for B-splines, with defaults values 3 + # m: orders of difference penalty, with default values 2 + # require(splines) + # require(Matrix) + + ### design matrix + K <- length(knots) - 2 * p - 1 + B <- splines::spline.des(knots = knots, x = x, ord = p + 1, outer.ok = TRUE)$design + if (periodicity) { + Bint <- B[, -c(1:p, K + 1:p)] + Bleft <- B[, 1:p] + Bright <- B[, K + 1:p] + B <- cbind(Bint, Bleft + Bright) } - return(M) -} - - -P = difference.penalty(m,p,K,periodicity) -P1 = Matrix(P) -P2 = Matrix(t(P)) -P = P2%*%P1 - -MM <- function(A,s,option=1){ - if(option==2) - return(A*(s%*%t(rep(1,dim(A)[2])))) - if(option==1) - return(A*(rep(1,dim(A)[1])%*%t(s))) -} -if(is.null(weight)) weight <- rep(1,length(x)) + difference.penalty <- function(m, p, K, periodicity = FALSE) { + # parameter m: difference order + # parameter p: degree of B-splines + # parameter K: number of interior knots + c <- rep(0, m + 1) + + for (i in 0:m) { + c[i + 1] <- (-1)^(i + 1) * factorial(m) / (factorial(i) * factorial(m - i)) + } + + if (!periodicity) { + M <- matrix(0, nrow = K + p - m, ncol = K + p) + for (i in 1:(K + p - m)) M[i, i:(i + m)] <- c + } else { + M <- matrix(0, nrow = K, ncol = K) + for (i in 1:(K - m)) M[i, i:(i + m)] <- c + for (i in (K - m + 1):K) M[i, c(i:K, 1:(m - K + i))] <- c + } + M + } -B1 = Matrix(MM(t(B),weight)) -B = Matrix(B) -Sig = B1%*%B -eSig = eigen(Sig) -V = eSig$vectors -E = eSig$values -if(min(E)<=0.0000001) {#cat("Warning! t(B)%*%B is singular!\n"); - #cat("A small identity matrix is added!\n"); - E <- E + 0.000001; + P <- difference.penalty(m, p, K, periodicity) + P1 <- Matrix(P) + P2 <- Matrix(t(P)) + P <- P2 %*% P1 -} -Sigi_sqrt = MM(V,1/sqrt(E))%*%t(V) + MM <- function(A, s, option = 1) { + if (option == 2) { + return(A * (s %*% t(rep(1, dim(A)[2])))) + } + if (option == 1) { + return(A * (rep(1, dim(A)[1]) %*% t(s))) + } + } -#Sigi = V%*%diag(1/E)%*%t(V) -tUPU = Sigi_sqrt%*%(P%*%Sigi_sqrt) -Esig = eigen(tUPU,symmetric=TRUE) -U = Esig$vectors -s = Esig$values -if(!periodicity) s[(K+p-m+1):(K+p)]=0 -if(periodicity) s[K] = 0 -A = B%*%(Sigi_sqrt%*%U) + if (is.null(weight)) weight <- rep(1, length(x)) -List = list( - "A" = A, - "B" = B, - "s" = s, - "Sigi.sqrt"=Sigi_sqrt, - "U" = U, - "P" = P) -return(List) + B1 <- Matrix(MM(t(B), weight)) + B <- Matrix(B) + Sig <- B1 %*% B + eSig <- eigen(Sig) + V <- eSig$vectors + E <- eSig$values + if (min(E) <= 0.0000001) { # cat("Warning! t(B)%*%B is singular!\n"); + # cat("A small identity matrix is added!\n"); + E <- E + 0.000001 + } + Sigi_sqrt <- MM(V, 1 / sqrt(E)) %*% t(V) + + # Sigi = V%*%diag(1/E)%*%t(V) + tUPU <- Sigi_sqrt %*% (P %*% Sigi_sqrt) + Esig <- eigen(tUPU, symmetric = TRUE) + U <- Esig$vectors + s <- Esig$values + if (!periodicity) s[(K + p - m + 1):(K + p)] <- 0 + if (periodicity) s[K] <- 0 + A <- B %*% (Sigi_sqrt %*% U) + + List <- list( + "A" = A, + "B" = B, + "s" = s, + "Sigi.sqrt" = Sigi_sqrt, + "U" = U, + "P" = P + ) + + List } diff --git a/R/quadWeights.R b/R/quadWeights.R index 93d5423..d84374b 100644 --- a/R/quadWeights.R +++ b/R/quadWeights.R @@ -4,19 +4,20 @@ #' @param argvals function arguments. #' @param method quadrature method. Can be either `trapedoidal` or `midpoint`. #' @return a vector of quadrature weights for the points supplied in `argvals`. -#' @author Clara Happ, with modifications by Philip Reiss +#' @author Clara Happ, with modifications by Philip Reiss # TODO: check about 'midpoint' # TODO: have this function called by lf, af, etc. # TODO: harmonize with quadrature implemented elsewhere: 'simpson' and 'riemann' options? -quadWeights<- function(argvals, method = "trapezoidal") -{ +quadWeights <- function(argvals, method = "trapezoidal") { ret <- switch(method, - trapezoidal = {D <- length(argvals) - 1/2*c(argvals[2] - argvals[1], argvals[3:D] -argvals[1:(D-2)], argvals[D] - argvals[D-1])}, - midpoint = c(0,diff(argvals)), # why is this called 'midpoint'??? - stop("function quadWeights: choose either trapezoidal or midpoint quadrature rule")) - - return(ret) + trapezoidal = { + D <- length(argvals) + 1 / 2 * c(argvals[2] - argvals[1], argvals[3:D] - argvals[1:(D - 2)], argvals[D] - argvals[D - 1]) + }, + midpoint = c(0, diff(argvals)), # why is this called 'midpoint'??? + stop("function quadWeights: choose either trapezoidal or midpoint quadrature rule") + ) + ret } diff --git a/R/refundr-package.R b/R/refundr-package.R index 1ec39c5..3a42e20 100644 --- a/R/refundr-package.R +++ b/R/refundr-package.R @@ -1,7 +1,7 @@ #' refundr #' #' -#'@description `refundr` makes functional data analyses +#' @description `refundr` makes functional data analyses #' in `R` more consistent and user friendly\cr\cr #' `refundr` provides: #' - tidy interface for common fpca methods and funcional regression diff --git a/R/rfr_fpca.R b/R/rfr_fpca.R index 6281d97..55f2c23 100644 --- a/R/rfr_fpca.R +++ b/R/rfr_fpca.R @@ -37,7 +37,7 @@ ##' @import tidyfun ##' @importFrom dplyr pull ##' @importFrom rlang enquo `!!` -rfr_fpca <- function(Y, data, pve = 0.99, npc = NULL, method = NULL, ...){ +rfr_fpca <- function(Y, data, pve = 0.99, npc = NULL, method = NULL, ...) { UseMethod("rfr_fpca", pull(data, !!enquo(Y))) } @@ -46,7 +46,7 @@ rfr_fpca <- function(Y, data, pve = 0.99, npc = NULL, method = NULL, ...){ #' @importFrom dplyr mutate #' @importFrom rlang `:=` #' @export -rfr_fpca.tfb <- function(Y, data, pve = 0.99, npc = NULL, ...){ - data = mutate(data, !!enquo(Y) := tfd(!!enquo(Y))) +rfr_fpca.tfb <- function(Y, data, pve = 0.99, npc = NULL, ...) { + data <- mutate(data, !!enquo(Y) := tfd(!!enquo(Y))) rfr_fpca(Y = !!enquo(Y), data = data, pve = .99, npc = npc, ...) } diff --git a/R/rfr_fpca.tfd_irreg.R b/R/rfr_fpca.tfd_irreg.R index 44425e1..19d3bd1 100644 --- a/R/rfr_fpca.tfd_irreg.R +++ b/R/rfr_fpca.tfd_irreg.R @@ -1,10 +1,9 @@ ##' @describeIn rfr_fpca FPCA for data on irregular grids defaults to [refund::fpca.sc()] ##' @importFrom rlang as_name enquo `!!` ##' @export -rfr_fpca.tfd_irreg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_sc, ...){ - - method_name = as_name(enquo(method)) - model_var_name = as_name(enquo(Y)) +rfr_fpca.tfd_irreg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_sc, ...) { + method_name <- as_name(enquo(method)) + model_var_name <- as_name(enquo(Y)) tf_vec <- pull(data, !!enquo(Y)) results <- tfb_fpc(tf_vec, method = method, pve = pve, npc = npc, ...) @@ -12,7 +11,7 @@ rfr_fpca.tfd_irreg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_sc results_ls <- extract_fpca(results) results_ls$Y <- tf_vec results_ls$model_var <- model_var_name - results_ls$method = method_name + results_ls$method <- method_name - return(results_ls) + results_ls } diff --git a/R/rfr_fpca.tfd_reg.R b/R/rfr_fpca.tfd_reg.R index 5c457cc..bd3b926 100644 --- a/R/rfr_fpca.tfd_reg.R +++ b/R/rfr_fpca.tfd_reg.R @@ -1,10 +1,9 @@ ##' @describeIn rfr_fpca FPCA for data on a regular grid defaults to [refund::fpca.face()] ##' @importFrom rlang as_name enquo `!!` ##' @export -rfr_fpca.tfd_reg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_face, ...){ - - method_name = as_name(enquo(method)) - model_var_name = as_name(enquo(Y)) +rfr_fpca.tfd_reg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_face, ...) { + method_name <- as_name(enquo(method)) + model_var_name <- as_name(enquo(Y)) tf_vec <- pull(data, !!enquo(Y)) results <- tfb_fpc(tf_vec, method = method, pve = pve, npc = npc, ...) @@ -12,7 +11,7 @@ rfr_fpca.tfd_reg <- function(Y, data, pve = 0.99, npc = NULL, method = fpca_face results_ls <- extract_fpca(results) results_ls$Y <- tf_vec results_ls$model_var <- model_var_name - results_ls$method = method_name + results_ls$method <- method_name - return(results_ls) + results_ls } diff --git a/data-raw/chf_df.R b/data-raw/chf_df.R index 5dfb4a7..14c802f 100644 --- a/data-raw/chf_df.R +++ b/data-raw/chf_df.R @@ -17,11 +17,14 @@ covar <- read_csv(here::here("data-raw", "covariate.csv")) activity <- read_csv(here::here("data-raw", "activity.csv")) chf_df <- inner_join(covar, filter(activity, week == 1), by = "id") %>% - tf_gather(activity.1:activity.1440, key = activity) %>% - select(-week) %>% - mutate(day = ordered(day, - levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", - "Saturday", "Sunday"), - labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))) + tf_gather(activity.1:activity.1440, key = activity) %>% + select(-week) %>% + mutate(day = ordered(day, + levels = c( + "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", + "Saturday", "Sunday" + ), + labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") + )) usethis::use_data(chf_df, overwrite = TRUE) diff --git a/data-raw/dti_df.R b/data-raw/dti_df.R index f5080aa..a5e0320 100644 --- a/data-raw/dti_df.R +++ b/data-raw/dti_df.R @@ -1,6 +1,6 @@ ########################################################### # The DTI dataset is a perennial example in refund, and is -# used often in tidyfun examples as well. We therefore +# used often in tidyfun examples as well. We therefore # include this dataset as an example; this code produces a # tidy version of the DTI data. The code also appears in a # vignette showing data conversions. @@ -9,13 +9,14 @@ library(tidyfun) library(tibble) -dti_df = tibble( - id = refund::DTI$ID, +dti_df <- tibble( + id = refund::DTI$ID, visit = refund::DTI$visit, - sex = refund::DTI$sex, - case = factor(ifelse(refund::DTI$case, "MS", "control"))) + sex = refund::DTI$sex, + case = factor(ifelse(refund::DTI$case, "MS", "control")) +) -dti_df$cca = tfd(refund::DTI$cca, arg = seq(0,1, l = 93)) -dti_df$rcst = tfd(refund::DTI$rcst, arg = seq(0, 1, l = 55)) +dti_df$cca <- tfd(refund::DTI$cca, arg = seq(0, 1, l = 93)) +dti_df$rcst <- tfd(refund::DTI$rcst, arg = seq(0, 1, l = 55)) usethis::use_data(dti_df, overwrite = TRUE) diff --git a/tests/testthat/test-rfr_fpca.R b/tests/testthat/test-rfr_fpca.R index 7508ce8..7e52811 100644 --- a/tests/testthat/test-rfr_fpca.R +++ b/tests/testthat/test-rfr_fpca.R @@ -11,14 +11,14 @@ argvalues <- seq(0, 1, l = 100) eigenvalues <- exp(-seq(0, 1, l = npc)) eigenfunctions <- poly(argvalues, npc) scores <- { - #generate orthogonal score vectors with desired variance + # generate orthogonal score vectors with desired variance raw <- svd(replicate(n, rnorm(npc))) t(scale(raw$v)) * sqrt(eigenvalues) } -data_reg <- 1 + t(eigenfunctions %*% scores) %>% tfd +data_reg <- 1 + t(eigenfunctions %*% scores) %>% tfd() data_irreg <- data_reg %>% tf_sparsify(dropout = .05) -data_tfb <- data_reg %>% tfb +data_tfb <- data_reg %>% tfb() df_reg <- tibble( data_reg = data_reg @@ -37,20 +37,24 @@ test_that("rfr_fpca defaults run on regular data", { expect_is(rfr_fpca(data_reg, df_reg), "rfr_fpca") reg_fpca <- rfr_fpca(data_reg, df_reg) - expect_equivalent(mean(data_reg) %>% tf_evaluations %>% unlist, - reg_fpca$mu, - tolerance = 0.01 * max(reg_fpca$mu)) - expect_equivalent(reg_fpca$evalues/eigenvalues, - rep(1, npc), - tolerance = 0.05) + expect_equivalent(mean(data_reg) %>% tf_evaluations() %>% unlist(), + reg_fpca$mu, + tolerance = 0.01 * max(reg_fpca$mu) + ) + expect_equivalent(reg_fpca$evalues / eigenvalues, + rep(1, npc), + tolerance = 0.05 + ) # abs to remove sign flips expect_true( mean(abs(abs(reg_fpca$efunctions) - abs(unclass(eigenfunctions)))) < - mean(abs(eigenfunctions))/10) + mean(abs(eigenfunctions)) / 10 + ) expect_equivalent( - abs(reg_fpca$scores)/abs(t(scores)), + abs(reg_fpca$scores) / abs(t(scores)), matrix(1, n, npc), - tolerance = 0.05) + tolerance = 0.05 + ) }) @@ -58,20 +62,24 @@ test_that("rfr_fpca defaults run on irregular data", { expect_is(rfr_fpca(data_irreg, df_irreg), "rfr_fpca") irreg_fpca <- rfr_fpca(data_irreg, df_irreg) - expect_equivalent(mean(data_irreg, na.rm = TRUE) %>% tf_evaluations %>% unlist, - irreg_fpca$mu, - tolerance = 0.01 * max(irreg_fpca$mu)) - expect_equivalent(irreg_fpca$evalues/eigenvalues, - rep(1, npc), - tolerance = 0.1) + expect_equivalent(mean(data_irreg, na.rm = TRUE) %>% tf_evaluations() %>% unlist(), + irreg_fpca$mu, + tolerance = 0.01 * max(irreg_fpca$mu) + ) + expect_equivalent(irreg_fpca$evalues / eigenvalues, + rep(1, npc), + tolerance = 0.1 + ) # abs to remove sign flips expect_true( mean(abs(abs(irreg_fpca$efunctions) - abs(unclass(eigenfunctions)))) < - mean(abs(eigenfunctions))/10) + mean(abs(eigenfunctions)) / 10 + ) # more lenient sanity checks for irregular data. thresholds rather arbitrary. expect_true( - mean(abs(irreg_fpca$scores/t(scores)) < .85) < .15 & - mean(abs(irreg_fpca$scores/t(scores)) > 1.15) < .15) + mean(abs(irreg_fpca$scores / t(scores)) < .85) < .15 & + mean(abs(irreg_fpca$scores / t(scores)) > 1.15) < .15 + ) }) @@ -93,7 +101,6 @@ test_that("residuals and fitted method for rfr_fpca don't error", { expect_equivalent(residuals(reg_fpca), data_reg - fitted(reg_fpca)) expect_equivalent(residuals(irreg_fpca), data_irreg - fitted(irreg_fpca)) - }) @@ -103,24 +110,26 @@ test_that("predict functions work for fpca", { # check that predictions and fitted values are the same expect_equivalent(predict(reg_fpca), fitted(reg_fpca)) - expect_equivalent(predict(reg_fpca, newdata = df_reg[1:10,]), reg_fpca$Yhat_tfb[1:10]) - expect_equivalent(predict(irreg_fpca, newdata = df_irreg[1:10,]), irreg_fpca$Yhat_tfb[1:10]) + expect_equivalent(predict(reg_fpca, newdata = df_reg[1:10, ]), reg_fpca$Yhat_tfb[1:10]) + expect_equivalent(predict(irreg_fpca, newdata = df_irreg[1:10, ]), irreg_fpca$Yhat_tfb[1:10]) # check that you can make predictions for irregular data using FPCs from regular data expect_equivalent( - df_reg[1:10,] %>% + df_reg[1:10, ] %>% mutate(data_reg = tf_sparsify(data_reg, dropout = .05)) %>% predict(reg_fpca, newdata = .), - reg_fpca$Yhat_tfb[1:10]) + reg_fpca$Yhat_tfb[1:10] + ) expect_equivalent( - tfd(predict(reg_fpca, newdata = df_irreg[1:2,] %>% rename(data_reg = data_irreg))), - tfd(predict(irreg_fpca, newdata = df_irreg[1:2,])), - tolerance = .01) + tfd(predict(reg_fpca, newdata = df_irreg[1:2, ] %>% rename(data_reg = data_irreg))), + tfd(predict(irreg_fpca, newdata = df_irreg[1:2, ])), + tolerance = .01 + ) # predict breaks when you supply a df without the right column name expect_error( - predict(reg_fpca, newdata = df_irreg[1:2,]) + predict(reg_fpca, newdata = df_irreg[1:2, ]) ) #