diff --git a/DESCRIPTION b/DESCRIPTION index 66988e22e36810cdf501c0184b07be8c3005a96c..0e98e0c348bb471501baf92934c2853a342e7ba7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,8 @@ Depends: GenSA, parallel, stats, - graphics + graphics, + r2r Imports: Rcpp LinkingTo: diff --git a/NAMESPACE b/NAMESPACE index fca965d6eb13b3c85a80fc8e8eb1f0a64bd4e5ca..11e794c600a48c5845df02a896cde438e49faf18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,6 +85,8 @@ importFrom(parallel,makeCluster) importFrom(parallel,mclapply) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) +importFrom(r2r,has_key) +importFrom(r2r,hashmap) importFrom(stats,acf) importFrom(stats,binomial) importFrom(stats,cor) diff --git a/R/arguments.R b/R/arguments.R index ce67bf7019a74efccd48682fa8c8d0d3ca2688d0..bc00f5e9305b9f92aa889a340d5c29a80860d10f 100644 --- a/R/arguments.R +++ b/R/arguments.R @@ -124,6 +124,60 @@ gen.probs.gmjmcmc <- function (transforms) { #' @param data The dataset that will be used in the algorithm #' #' @return A list of parameters to use when running the mjmcmc function. +#' +#' The list contains the following elements: +#' +#' \describe{ +#' \item{\code{burn_in}}{The burn-in period for the MJMCMC algorithm, which is set to 100 iterations by default.} +#' +#' \item{\code{mh}}{A list containing parameters for the regular Metropolis-Hastings (MH) kernel: +#' \describe{ +#' \item{\code{neigh.size}}{The size of the neighborhood for MH proposals with fixed proposal size, default set to 1.} +#' \item{\code{neigh.min}}{The minimum neighborhood size for random proposal size, default set to 1.} +#' \item{\code{neigh.max}}{The maximum neighborhood size for random proposal size, default set to 2.} +#' } +#' } +#' +#' \item{\code{large}}{A list containing parameters for the large jump kernel: +#' \describe{ +#' \item{\code{neigh.size}}{The size of the neighborhood for large jump proposals with fixed neighborhood size, default set to the smaller of 0.35 \eqn{\times p} and 35, where \eqn{p} is the number of covariates.} +#' \item{\code{neigh.min}}{The minimum neighborhood size for large jumps with random size of the neighborhood, default set to the smaller of 0.25 \eqn{\times p} and 25.} +#' \item{\code{neigh.max}}{The maximum neighborhood size for large jumps with random size of the neighborhood, default set to the smaller of 0.45 \eqn{\times p} and 45.} +#' } +#' } +#' +#' \item{\code{random}}{A list containing a parameter for the randomization kernel: +#' \describe{ +#' \item{\code{prob}}{The small probability of changing the component around the mode, default set to 0.01.} +#' } +#' } +#' +#' \item{\code{sa}}{A list containing parameters for the simulated annealing kernel: +#' \describe{ +#' \item{\code{probs}}{A numeric vector of length 6 specifying the probabilities for different types of proposals in the simulated annealing algorithm.} +#' \item{\code{neigh.size}}{The size of the neighborhood for the simulated annealing proposals, default set to 1.} +#' \item{\code{neigh.min}}{The minimum neighborhood size, default set to 1.} +#' \item{\code{neigh.max}}{The maximum neighborhood size, default set to 2.} +#' \item{\code{t.init}}{The initial temperature for simulated annealing, default set to 10.} +#' \item{\code{t.min}}{The minimum temperature for simulated annealing, default set to 0.0001.} +#' \item{\code{dt}}{The temperature decrement factor, default set to 3.} +#' \item{\code{M}}{The number of iterations in the simulated annealing process, default set to 12.} +#' } +#' } +#' +#' 
\item{\code{greedy}}{A list containing parameters for the greedy algorithm: +#' \describe{ +#' \item{\code{probs}}{A numeric vector of length 6 specifying the probabilities for different types of proposals in the greedy algorithm.} +#' \item{\code{neigh.size}}{The size of the neighborhood for greedy algorithm proposals, set to 1.} +#' \item{\code{neigh.min}}{The minimum neighborhood size for greedy proposals, set to 1.} +#' \item{\code{neigh.max}}{The maximum neighborhood size for greedy proposals, set to 2.} +#' \item{\code{steps}}{The number of steps for the greedy algorithm, set to 20.} +#' \item{\code{tries}}{The number of tries for the greedy algorithm, set to 3.} +#' } +#' } +#' +#' \item{\code{loglik}}{A list of additional parameters passed on to the log likelihood function, empty by default.} +#' } #' #' Note that the `$loglik` item is an empty list, which is passed to the log likelihood function of the model, #' intended to store parameters that the estimator function should use. diff --git a/R/gmjmcmc.R b/R/gmjmcmc.R index 9a14a55d009c3c8ca8b0f8a835c1f77a1ae06539..d3e3917a5a79c3143cfb5c1c7623376d3e8514c0 100644 --- a/R/gmjmcmc.R +++ b/R/gmjmcmc.R @@ -106,7 +106,7 @@ gmjmcmc <- function ( # Initialize first model of population model.cur <- as.logical(rbinom(n = length(S[[p]]), size = 1, prob = 0.5)) - model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data.t, params$loglik) + model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data.t, params$loglik, visited.models = NULL, sub = FALSE) model.cur <- list(prob = 0, model = model.cur, coefs = model.cur.res$coefs, crit = model.cur.res$crit, alpha = 0) best.crit <- model.cur$crit # Reset first best criteria value diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index 4ef31cb27f9fa8f294ee7b65059737c1d317700f..db52330fdf26b61e2b24d3b397d58dd09db08c8c 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -109,7 +109,15 @@ precalc.features <- function (data, features) { # TODO: Compare to previous mliks here instead, also add a flag to do that in full likelihood estimation scenarios.
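The rewritten `loglik.pre` below consults an `r2r` hashmap keyed directly by the logical model vector, replacing the old matrix-plus-`vec_in_mat` bookkeeping. A minimal, hedged sketch of the cache semantics, with toy `crit`/`coefs` values that are illustrative only:

```r
library(r2r)

# A model is identified by its logical inclusion vector
visited.models <- hashmap()
model <- c(TRUE, FALSE, TRUE)

# First visit: store the criterion and coefficients (toy values)
visited.models[[model]] <- list(crit = -123.4, coefs = c(0.1, 0.7))

# Revisit: has_key() lets us skip re-estimating the likelihood
if (has_key(visited.models, model)) {
  cached <- visited.models[[model]]
  print(cached$crit)  # -123.4
}
```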
# Function to call the model function -loglik.pre <- function (loglik.pi, model, complex, data, params = NULL) { +loglik.pre <- function (loglik.pi, model, complex, data, params = NULL, visited.models = NULL, sub = FALSE) { + if (!is.null(visited.models) && has_key(visited.models, model)) { + if (!sub) { + return(visited.models[[model]]) + } else { + params$coefs <- visited.models[[model]]$coefs + params$crit <- visited.models[[model]]$crit + } + } # Get the complexity measures for just this model complex <- list(width = complex$width[model], oc = complex$oc[model], depth = complex$depth[model]) # Call the model estimator with the data and the model, note that we add the intercept to every model @@ -118,7 +126,7 @@ loglik.pre <- function (loglik.pi, model, complex, data, params = NULL) { if (!is.numeric(model.res$crit) || is.nan(model.res$crit)) model.res$crit <- -.Machine$double.xmax # Alpha cannot be calculated if the current and proposed models have crit which are -Inf or Inf if (is.infinite(model.res$crit)) { - if (model.res$crit > 0) model.res$crit <- .Machine$double.xmax + if (model.res$crit > 0) model.res$crit <- .Machine$double.xmax else model.res$crit <- -.Machine$double.xmax } return(model.res) diff --git a/R/imports.R b/R/imports.R index d5cd734103db50bcb8092a085298218e00c60bf8..03779eadb35df3ed8ee9765cd4e878d378bdccfe 100644 --- a/R/imports.R +++ b/R/imports.R @@ -4,4 +4,5 @@ #' @importFrom stats rnorm runif lm pnorm lm.fit var rbinom acf binomial cor gaussian median qnorm sd model.matrix model.response predict #' @importFrom graphics barplot text lines #' @importFrom utils sessionInfo +#' @importFrom r2r hashmap has_key NULL diff --git a/R/local_optim.R b/R/local_optim.R index eb6767b9135d1bf0bd5bcbd754fe8b3150357bd0..28696f0ce504f856fbdf3a57842ea18ce02583ef 100644 --- a/R/local_optim.R +++ b/R/local_optim.R @@ -3,7 +3,7 @@ # Created by: jonlachmann # Created on: 2021-02-11 -simulated.annealing <- function (model, data, loglik.pi, indices, complex, params, loglikparams, kernel=NULL) { +simulated.annealing <- function (model, data, loglik.pi, indices, complex, params, loglikparams, kernel = NULL, visited.models = NULL, sub = FALSE) { # Initialize a list to keep models that we visit in models <- vector("list", 0) @@ -13,7 +13,7 @@ simulated.annealing <- function (model, data, loglik.pi, indices, complex, param temp <- params$t.init # Initial temperature # Calculate current likelihood - model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams) + model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams, visited.models, sub) model.lik <- model.res$crit models[[length(models) + 1]] <- list(prob=NA, model=model, coefs=model.res$coefs, crit=model.lik, alpha=NA) # print(paste("SA Start:", model.lik)) @@ -22,7 +22,7 @@ simulated.annealing <- function (model, data, loglik.pi, indices, complex, param for (m in 1:params$M) { # Get a modified model as proposal and calculate its likelihood proposal <- xor(model, gen.proposal(model, params$kern, kernel, indices)$swap) - model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams, visited.models = visited.models, sub = sub) proposal.lik <- model.proposal$crit # Store the model that we have calculated models[[length(models) + 1]] <- list(prob=NA, model=proposal, coefs=model.proposal$coefs, crit=proposal.lik, alpha=NA) @@ -42,7 +42,7 @@
return(list(model=model, kern=kernel, models=models)) } -greedy.optim <- function (model, data, loglik.pi, indices, complex, params, loglikparams, kernel=NULL) { +greedy.optim <- function (model, data, loglik.pi, indices, complex, params, loglikparams, kernel = NULL, visited.models = NULL, sub = FALSE) { # Initialize a list to keep models that we visit in models <- vector("list", 0) @@ -50,7 +50,7 @@ greedy.optim <- function (model, data, loglik.pi, indices, complex, params, logl if (is.null(kernel)) kernel <- sample.int(n = 6, size = 1, prob = params$kern$probs) # Calculate current likelihood - model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams) + model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams, visited.models, sub) model.lik <- model.res$crit models[[length(models)+1]] <- list(prob=NA, model=model, coefs=model.res$coefs, crit=model.lik, alpha=NA) @@ -62,7 +62,7 @@ greedy.optim <- function (model, data, loglik.pi, indices, complex, params, logl for (j in 1:params$tries) { # Get a modified model as proposal and calculate its likelihood proposal <- xor(model, gen.proposal(model, params$kern, kernel, indices)$swap) - model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams, visited.models = visited.models, sub = sub) proposal.lik <- model.proposal$crit # Store the model that we have calculated models[[length(models)+1]] <- list(prob=NA, model=proposal, coefs=model.proposal$coefs, crit=proposal.lik, alpha=NA) @@ -80,12 +80,12 @@ greedy.optim <- function (model, data, loglik.pi, indices, complex, params, logl return(list(model=model, kern=kernel, models=models)) } -local.optim <- function (model, data, loglik.pi, indices, complex, type, params, kernel=NULL) { +local.optim <- function (model, data, loglik.pi, indices, complex, type, params, kernel = NULL, visited.models = NULL, sub = FALSE) { if (type == 1) { - return(simulated.annealing(model, data, loglik.pi, indices, complex, params$sa, params$loglik, kernel)) + return(simulated.annealing(model, data, loglik.pi, indices, complex, params$sa, params$loglik, kernel, visited.models = visited.models, sub = sub)) } if (type == 2) { - return(greedy.optim(model, data, loglik.pi, indices, complex, params$greedy, params$loglik, kernel)) + return(greedy.optim(model, data, loglik.pi, indices, complex, params$greedy, params$loglik, kernel, visited.models = visited.models, sub = sub)) } if (type == 3) { return("not implemented") diff --git a/R/mjmcmc.R b/R/mjmcmc.R index 4e7117aa819882519b9e71e042bb6ef57f096cbf..e3ad703982653468ffcaa8ddd8426467809dc98d 100644 --- a/R/mjmcmc.R +++ b/R/mjmcmc.R @@ -48,7 +48,7 @@ mjmcmc <- function (data, loglik.pi = gaussian.loglik, N = 100, probs = NULL, pa # Initialize first model model.cur <- as.logical(rbinom(n = length(S), size = 1, prob = 0.5)) - model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data, params$loglik) + model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data, params$loglik, visited.models = NULL, sub = sub) model.cur <- list(prob=0, model=model.cur, coefs=model.cur.res$coefs, crit=model.cur.res$crit, alpha=0) if (verbose) cat("\nMJMCMC begin.\n") @@ -96,7 +96,8 @@ mjmcmc.loop <- function (data, complex, loglik.pi, model.cur, N, probs, params, # Initialize a vector to contain local opt visited models lo.models <- vector("list", 0) # Initialize list for keeping track of unique visited models - visited.models <- list(models = matrix(model.cur$model, 1, covar_count), crit = model.cur$crit, count = 1) + visited.models <- hashmap() + visited.models[[model.cur$model]] <- list(crit = model.cur$crit, coefs = model.cur$coefs) best.crit <- model.cur$crit # Set first best criteria value progress <- 0 @@ -107,7 +108,7 @@ mjmcmc.loop <- function (data, complex, loglik.pi, model.cur, N, probs, params, if (i > params$burn_in) pip_estimate <- mcmc_total / i else pip_estimate <- rep(1 / covar_count, covar_count) - proposal <- mjmcmc.prop(data, loglik.pi, model.cur, complex, pip_estimate, probs, params, visited.models) + proposal <- mjmcmc.prop(data, loglik.pi, model.cur, complex, pip_estimate, probs, params, visited.models, sub = sub) if (proposal$crit > best.crit) { best.crit <- proposal$crit if (verbose) cat(paste("\rNew best population crit:", best.crit, "\n")) } # If we did a large jump and visited models to save if (!is.null(proposal$models)) { lo.models <- c(lo.models, proposal$models) - # If we are doing subsampling and want to update best mliks - if (sub) { - for (mod in seq_along(proposal$models)) { - # Check if we have seen this model before - mod.idx <- vec_in_mat(visited.models$models[seq_len(visited.models$count), , drop = FALSE], proposal$models[[mod]]$model) - if (mod.idx == 0) { - # If we have not seen the model before, add it - visited.models$count <- visited.models$count + 1 - visited.models$crit <- c(visited.models$crit, proposal$models[[mod]]$crit) - visited.models$models <- rbind(visited.models$models, proposal$models[[mod]]$model) - } # This is a model seen before, set the best of the values available - else visited.models$crit[mod.idx] <- max(proposal$models[[mod]]$crit, visited.models$crit[mod.idx]) - } + for (mod in seq_along(proposal$models)) { + visited.models[[proposal$models[[mod]]$model]] <- list(crit = proposal$models[[mod]]$crit, coefs = proposal$models[[mod]]$coefs) } proposal$models <- NULL } - if (sub) { - # Check if we have seen this model before - mod.idx <- vec_in_mat(visited.models$models[seq_len(visited.models$count), , drop = FALSE], proposal$model) - if (mod.idx == 0) { - # If we have not seen the model before, add it - visited.models$count <- visited.models$count + 1 - visited.models$crit <- c(visited.models$crit, proposal$crit) - visited.models$models <- rbind(visited.models$models, proposal$model) - } # This is a model seen before, set the best of the values available - else visited.models$crit[mod.idx] <- max (proposal$crit, visited.models$crit[mod.idx]) - } + visited.models[[proposal$model]] <- list(crit = proposal$crit, coefs = proposal$coefs) if (log(runif(1)) <= proposal$alpha) { model.cur <- proposal @@ -177,11 +157,11 @@ mjmcmc.loop <- function (data, complex, loglik.pi, model.cur, N, probs, params, #' @param probs A list of the various probability vectors to use #' @param params A list of the various parameters for all the parts of the algorithm #' @param visited.models A list of the previously visited models to use when subsampling and avoiding recalculation -#' +#' @param sub An indicator of whether the likelihood is inexact and should be improved at each model visit (EXPERIMENTAL!)
#' #' @noRd #' -mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, probs, params, visited.models=NULL) { +mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, probs, params, visited.models = NULL, sub = FALSE) { l <- runif(1) if (l < probs$large) { ### Large jump @@ -196,18 +176,18 @@ mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, prob chi.0.star <- xor(model.cur$model, large.jump$swap) # Swap large jump indices # Optimize to find a mode - localopt <- local.optim(chi.0.star, data, loglik.pi, !large.jump$swap, complex, q.o, params) # Do local optimization + localopt <- local.optim(chi.0.star, data, loglik.pi, !large.jump$swap, complex, q.o, params, visited.models = visited.models, sub = sub) # Do local optimization chi.k.star <- localopt$model # Randomize around the mode - proposal <- gen.proposal(chi.k.star, list(neigh.size = length(pip_estimate), neigh.min = 1, neigh.max = length(pip_estimate)), q.r, NULL, (pip_estimate*0 + 1 - params$random$prob), prob=TRUE) + proposal <- gen.proposal(chi.k.star, list(neigh.size = length(pip_estimate), neigh.min = 1, neigh.max = length(pip_estimate)), q.r, NULL, (pip_estimate * 0 + 1 - params$random$prob), prob=TRUE) proposal$model <- xor(chi.k.star, proposal$swap) # Do a backwards large jump and add in the kernel used in local optim to use the same for backwards local optim. chi.0 <- xor(proposal$model, large.jump$swap) # Do a backwards local optimization - localopt2 <- local.optim(chi.0, data, loglik.pi, !large.jump$swap, complex, q.o, params, kernel = localopt$kern) + localopt2 <- local.optim(chi.0, data, loglik.pi, !large.jump$swap, complex, q.o, params, kernel = localopt$kern, visited.models = visited.models, sub = sub) chi.k <- localopt2$model ### Calculate acceptance probability @@ -231,18 +211,9 @@ mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, prob model.cur$prob <- prob.proposal(proposal$model, model.cur$model, q.g, params$mh, pip_estimate) } # Calculate log likelihoods for the proposed model - proposal.res <- loglik.pre(loglik.pi, proposal$model, complex, data, params$loglik) + proposal.res <- loglik.pre(loglik.pi, proposal$model, complex, data, params$loglik, visited.models = visited.models, sub = sub) proposal$crit <- proposal.res$crit - # TODO: Compare to a list of best mliks for all visited models, - # TODO: update that list if our estimate is better, otherwise update our estimate. - # TODO: Save all models visited by local optim, and update the best mliks if we see one during local optim.
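With `sub = TRUE`, `loglik.pre` does not return the cached value outright; it copies the best `crit` and `coefs` seen so far into `params`, so an inexact (e.g. subsampled) estimator can warm-start from them and avoid reporting a worse criterion than a previous visit. A hedged sketch of an estimator honoring that contract, simplified to plain `glm.fit` (the package's own subsampling estimators use `irls.sgd`), with the model prior omitted:

```r
# Sketch of an estimator cooperating with sub = TRUE: params$crit/params$coefs
# carry the best cached values injected by loglik.pre (model prior omitted).
logistic.loglik.sub <- function (y, x, model, complex, params) {
  mod <- glm.fit(as.matrix(x[, model]), y, family = binomial())
  crit <- -mod$deviance / 2 - 0.5 * log(length(y)) * (length(mod$coefficients) - 1)
  # Never fall below the best criterion recorded for this model so far
  if (!is.null(params$crit) && params$crit > crit) {
    return(list(crit = params$crit, coefs = params$coefs))
  }
  list(crit = crit, coefs = mod$coefficients)
}
```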
- # If we are running with subsampling, check the list for a better mlik - if (!is.null(visited.models)) { - mod.idx <- vec_in_mat(visited.models$models[1:visited.models$count,,drop=FALSE], proposal$model) - if (mod.idx != 0) proposal$crit <- max(proposal$crit, visited.models$crit[mod.idx]) - } - # Calculate acceptance probability for proposed model proposal$alpha <- min(0, (proposal$crit + model.cur$prob) - (model.cur$crit + proposal$prob)) diff --git a/R/predict.R b/R/predict.R index 9ea1189b933a7125b0938764ce33eb15f99b9cb8..88f01337f5571c1cab64c91f8526268a1cee7b13 100644 --- a/R/predict.R +++ b/R/predict.R @@ -39,7 +39,7 @@ predict.gmjmcmc <- function (object, x, link = function(x) x, quantiles = c(0.02 rm(na.matr) } else x <- as.matrix(x) - merged <- merge_results(list(object),data = cbind(1,x),populations = pop,tol = tol) + merged <- merge_results(list(object),data = cbind(1, x), populations = pop, tol = tol) set.transforms(transforms.bak) return(predict.gmjmcmc_merged(merged, x, link, quantiles)) } @@ -104,9 +104,6 @@ predict.gmjmcmc.2 <- function (object, x, link = function(x) x, quantiles = c(0. #' #' @export predict.gmjmcmc_merged <- function (object, x, link = function(x) x, quantiles = c(0.025, 0.5, 0.975), pop = NULL,tol = 0.0000001, ...) { - - - if(!is.null(attr(object,which = "imputed"))) { df <- data.frame(x) diff --git a/man/gen.params.mjmcmc.Rd b/man/gen.params.mjmcmc.Rd index 65c79c82d6451fbcbb3736abf8857021635517f1..0fe0b17f344a047c1161a6a6a4de4d8ce6f658fb 100644 --- a/man/gen.params.mjmcmc.Rd +++ b/man/gen.params.mjmcmc.Rd @@ -12,6 +12,60 @@ gen.params.mjmcmc(data) \value{ A list of parameters to use when running the mjmcmc function. +The list contains the following elements: + +\describe{ +\item{\code{burn_in}}{The burn-in period for the MJMCMC algorithm, which is set to 100 iterations by default.} + +\item{\code{mh}}{A list containing parameters for the regular Metropolis-Hastings (MH) kernel: +\describe{ +\item{\code{neigh.size}}{The size of the neighborhood for MH proposals with fixed proposal size, default set to 1.} +\item{\code{neigh.min}}{The minimum neighborhood size for random proposal size, default set to 1.} +\item{\code{neigh.max}}{The maximum neighborhood size for random proposal size, default set to 2.} +} +} + +\item{\code{large}}{A list containing parameters for the large jump kernel: +\describe{ +\item{\code{neigh.size}}{The size of the neighborhood for large jump proposals with fixed neighborhood size, default set to the smaller of 0.35 \eqn{\times p} and 35, where \eqn{p} is the number of covariates.} +\item{\code{neigh.min}}{The minimum neighborhood size for large jumps with random size of the neighborhood, default set to the smaller of 0.25 \eqn{\times p} and 25.} +\item{\code{neigh.max}}{The maximum neighborhood size for large jumps with random size of the neighborhood, default set to the smaller of 0.45 \eqn{\times p} and 45.} +} +} + +\item{\code{random}}{A list containing a parameter for the randomization kernel: +\describe{ +\item{\code{prob}}{The small probability of changing the component around the mode, default set to 0.01.} +} +} + +\item{\code{sa}}{A list containing parameters for the simulated annealing kernel: +\describe{ +\item{\code{probs}}{A numeric vector of length 6 specifying the probabilities for different types of proposals in the simulated annealing algorithm.} +\item{\code{neigh.size}}{The size of the neighborhood for the simulated annealing proposals, default set to 1.} +\item{\code{neigh.min}}{The minimum neighborhood size, 
default set to 1.} +\item{\code{neigh.max}}{The maximum neighborhood size, default set to 2.} +\item{\code{t.init}}{The initial temperature for simulated annealing, default set to 10.} +\item{\code{t.min}}{The minimum temperature for simulated annealing, default set to 0.0001.} +\item{\code{dt}}{The temperature decrement factor, default set to 3.} +\item{\code{M}}{The number of iterations in the simulated annealing process, default set to 12.} +} +} + +\item{\code{greedy}}{A list containing parameters for the greedy algorithm: +\describe{ +\item{\code{probs}}{A numeric vector of length 6 specifying the probabilities for different types of proposals in the greedy algorithm.} +\item{\code{neigh.size}}{The size of the neighborhood for greedy algorithm proposals, set to 1.} +\item{\code{neigh.min}}{The minimum neighborhood size for greedy proposals, set to 1.} +\item{\code{neigh.max}}{The maximum neighborhood size for greedy proposals, set to 2.} +\item{\code{steps}}{The number of steps for the greedy algorithm, set to 20.} +\item{\code{tries}}{The number of tries for the greedy algorithm, set to 3.} +} +} + +\item{\code{loglik}}{A list of additional parameters passed on to the log likelihood function, empty by default.} +} + Note that the \verb{$loglik} item is an empty list, which is passed to the log likelihood function of the model, intended to store parameters that the estimator function should use. } diff --git a/tests_current/Ex10_Sec5_4.R b/tests_current/Ex10_Sec5_4.R deleted file mode 100644 index 5de0324ff75186d367d33a887191c17c0d5e40ac..0000000000000000000000000000000000000000 --- a/tests_current/Ex10_Sec5_4.R +++ /dev/null @@ -1,132 +0,0 @@ -####################################################### -# -# Example 10 (Section 5.4): Epil data set from the INLA package -# -# Mixed Effect Poisson Model with Fractional Polynomials -# -# This is the valid version for the JSS Paper -# -####################################################### - - - -#install.packages("INLA",repos=c(getOption("repos"),INLA="https://inla.r-inla-download.org/R/stable"), dep=TRUE) -#options(repos=c( inlabruorg = "https://inlabru-org.r-universe.dev", INLA = "https://inla.r-inla-download.org/R/testing", CRAN = "https://cran.rstudio.com") ) -#install.packages("fmesher") - - -library(FBMS) -library(INLA) -library(tictoc) -use.fbms = FALSE - -data = INLA::Epil -data = data[,-c(5,6)] - -df = data[1:5] -df$V2 = rep(c(0,1,0,0),59) -df$V3 = rep(c(0,0,1,0),59) -df$V4 = rep(c(0,0,0,1),59) - - -#df$Trt.Base = df$Trt * df$Base -#df$Trt.Age = df$Trt * df$Age - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,1,0,1) # Only modifications!
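Since the returned parameter object documented above is an ordinary nested R list, individual entries can be overridden before a run. A brief usage sketch (toy data frame, settings chosen purely for illustration; `mjmcmc` expects the response in the first column):

```r
library(FBMS)

# Toy data: response in the first column, as expected by mjmcmc
df <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100))

params <- gen.params.mjmcmc(df)
params$sa$t.init <- 20           # hotter simulated-annealing start
params$greedy$steps <- 40        # longer greedy search
params$loglik$r <- 1 / nrow(df)  # passed through to the estimator

result <- mjmcmc(df, loglik.pi = gaussian.loglik, N = 100, params = params)
```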
- -params <- gen.params.gmjmcmc(df) -params$feat$D <- 2 # Set depth of features to 2 (allow for interactions) -params$loglik$r = 1/dim(df)[1] - -#specify indices for a random effect -params$loglik$PID = data$Ind # patient ids for repeated measurements -params$loglik$INLA.num.threads = 10 # Number of threads used by INLA - -params$feat$keep.min = 0.2 - -params$greedy$steps = 2 -params$greedy$tries = 1 -params$sa$t.min = 0.1 -params$sa$dt = 10 - - - -#estimator function - -poisson.loglik.inla <- function (y, x, model, complex, params) -{ - - if(sum(model)>1) - { - data1 = data.frame(y, as.matrix(x[,model]), params$PID) - formula1 = as.formula(paste0(names(data1)[1],"~",paste0(names(data1)[3:(dim(data1)[2]-1)],collapse = "+"),"+ f(params.PID,model = \"iid\")")) - } else - { - data1 = data.frame(y, params$PID) - formula1 = as.formula(paste0(names(data1)[1],"~","1 + f(params.PID,model = \"iid\")")) - } - - #to make sure inla is not stuck - inla.setOption(inla.timeout=30) - inla.setOption(num.threads=params$INLA.num.threads) - - mod<-NULL - - #importance with error handling for unstable libraries that one does not trust 100% - tryCatch({ - mod <- inla(family = "poisson",silent = 1L,safe = F, data = data1,formula = formula1) - }, error = function(e) { - # Handle the error by setting result to NULL - mod <- NULL - # You can also print a message or log the error if needed - cat("An error occurred:", conditionMessage(e), "\n") - }) - - # logarithm of model prior - if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log.prior(params, complex) - - if(length(mod)<3||length(mod$mlik[1])==0) { - return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) - } else { - mloglik <- mod$mlik[1] - return(list(crit = mloglik + lp, coefs = mod$summary.fixed$mode)) - } -} - -set.seed(03052024) - -if (use.fbms) { - result <- fbms(data = df, family = "custom", loglik.pi = poisson.loglik.inla, method = "gmjmcmc", - transforms = transforms, probs = probs, params = params, P=3) -} else { - result <- gmjmcmc(data = df, loglik.pi = poisson.loglik.inla, transforms = transforms, - probs = probs, params = params, P = 3) -} - -plot(result) -summary(result) - - - -set.seed(23052024) - -tic() -params$loglik$INLA.num.threads = 1 # Number of threads used by INLA set to 1 -if (use.fbms) { - result2 <- fbms(data = df, family = "custom", loglik.pi = poisson.loglik.inla, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - transforms = transforms, probs = probs, params = params, P=25) -} else { - result2 <- gmjmcmc.parallel(runs = 40, cores = 40, data = df, loglik.pi = poisson.loglik.inla, - transforms = transforms, probs = probs, params = params, P = 25) -} -time.inla = toc() - -plot(result2) -summary(result2, labels = names(df)[-1], tol = 0.01) - - - diff --git a/tests_current/Ex10_Sec6_2.R b/tests_current/Ex10_Sec6_2.R index f69df64752eb1741530150795cf399598c3a3137..3182366022aafc8f77dfd1b0b503d34dfcc49359 100644 --- a/tests_current/Ex10_Sec6_2.R +++ b/tests_current/Ex10_Sec6_2.R @@ -79,7 +79,7 @@ mixed.model.loglik.lme4 <- function (y, x, model, complex, params) if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r lp <- log.prior(params, complex) - mloglik <- as.numeric(logLik(mm)) - log(length(y)) * (dim(data)[2] - 2) #Laplace approximation for beta prior + mloglik <- as.numeric(logLik(mm)) - 0.5*log(length(y)) * (dim(data)[2] - 2) #Laplace approximation for beta prior return(list(crit = mloglik + lp, coefs = fixef(mm))) } @@ -179,7 +179,7 @@ 
mixed.model.loglik.rtmb <- function (y, x, model, complex, params) # if(length(beta)==0) { # return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) # } else { - mloglik <- -2*opt$objective + mloglik <- -opt$objective - 0.5*log(dim(x)[1])*msize return(list(crit = mloglik + lp, coefs = opt$par[-(1:2)])) # } } @@ -209,8 +209,8 @@ if (use.fbms) { } time.lme4 = toc() -plot(result1b) -summary(result1b) +plot(result1a) +summary(result1a, labels = names(df)[-1]) tic() @@ -236,7 +236,8 @@ if (use.fbms) { probs = probs, params = params, P = 3) } time.rtmb = toc() - +plot(result1c) +summary(result1c, labels = names(df)[-1]) c(time.lme4$callback_msg, time.inla$callback_msg, time.rtmb$callback_msg) @@ -248,7 +249,7 @@ c(time.lme4$callback_msg, time.inla$callback_msg, time.rtmb$callback_msg) set.seed(20062024) params$feat$pop.max = 10 -result2a <- gmjmcmc.parallel(runs = 40, cores = 40, data = df, loglik.pi = mixed.model.loglik.lme4, transforms = transforms, N.init=100, probs = probs, params = params, P = 25) +result2a <- gmjmcmc.parallel(runs = 40, cores = 10, data = df, loglik.pi = mixed.model.loglik.lme4, transforms = transforms, N.init=100, probs = probs, params = params, P = 25) summary(result2a,tol = 0.05,labels=names(df)[-1]) diff --git a/tests_current/Ex11_Sec5_5.R b/tests_current/Ex11_Sec5_5.R deleted file mode 100644 index 9336307b343122c6b394ced9bbd812eeabab9a44..0000000000000000000000000000000000000000 --- a/tests_current/Ex11_Sec5_5.R +++ /dev/null @@ -1,166 +0,0 @@ -####################################################### -# -# Example 11 (Section 5.5): -# -# Subsampling -# -# Heart Disease Health Indicators Dataset” -# -# This is the valid version for the JSS Paper -# -####################################################### - -#install.packages("tictoc") -library(tictoc) - -#install.packages("FBMS") -library(FBMS) -#library(devtools) -#devtools::install_github("jonlachmann/irls.sgd", force=T, build_vignettes=F) -library(irls.sgd) - - - -setwd("/home/florian/FBMS/") - -df = read.csv2(file = "heart_disease_health_indicators_BRFSS2015.csv",sep = ",",dec = ".") - -summary(df) -dim(df) - -#number of observations in the data - -n = dim(df)[1] - -#number of covariates - -p = dim(df)[2] - 1 - -params <- gen.params.gmjmcmc(data = df) -params$loglik$r = 0.5 -params$loglik$subs = 0.01 - - -transforms <- c("sigmoid","pm1","p0","p05","p2","p3") -probs <- gen.probs.gmjmcmc(transforms) - -logistic.posterior.bic.irlssgd <- function (y, x, model, complex, params) -{ - mod <- irls.sgd(as.matrix(x[,model]), y, binomial(), - irls.control=list(subs=params$subs, maxit=20, tol=1e-7, cooling = c(1,0.9,0.75), expl = c(3,1.5,1)), - sgd.control=list(subs=params$subs, maxit=250, alpha=0.001, decay=0.99, histfreq=10)) - - # logarithm of marginal likelihood - mloglik <- -mod$deviance /2 - log(length(y)) * (mod$rank-1) - - # logarithm of model prior - if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log.prior(params, complex) - - return(list(crit = mloglik + lp, coefs = mod$coefficients)) - -} - - - -############################ -# -# Testing runtime -# -############################ - -set.seed(100001) -tic() -# subsampling analysis -tmp1 <- gmjmcmc(df, logistic.posterior.bic.irlssgd, transforms = transforms, - params = params, P = 2, sub = T) -time1 = toc() - -set.seed(100002) -tic() -# regular analysis -tmp2 <- gmjmcmc(df, logistic.loglik, transforms = transforms, - params = params, P = 2) -time2 = toc() - -c(time1, time2) - -############################ -# -# More 
serious analysis -# -############################ - - -# with subsampling - -set.seed(100003) - -tic() -result <- gmjmcmc.parallel(runs = 10,cores = 10, data = df, - loglik.pi = logistic.posterior.bic.irlssgd, - transforms = transforms, params = params, P = 3, sub = T) -time3 = toc() - -summary(result) - -# without subsampling - - -set.seed(100004) - -tic() -result1a <- gmjmcmc.parallel(runs = 10,cores = 10, data = df, - loglik.pi = logistic.loglik, - transforms = transforms, params = params, P = 3) -time4 = toc() - -summary(result1a) - - - -############################ -# -# Probably too ambitious analysis -# -############################ - -# with subsampling - -set.seed(100005) -tic() -result2 <- gmjmcmc.parallel(runs = 40,cores = 40, data = df, - loglik.pi = logistic.posterior.bic.irlssgd, - transforms = transforms, params = params, P = 10, sub = T) -time5 = toc() -summary(result2) - - - -# regular analysis - - -set.seed(100006) - -tic() -result2a <- gmjmcmc.parallel(runs = 40,cores = 40, data = df, - loglik.pi = logistic.loglik, - transforms = transforms, params = params, P = 10) -time6 = toc() - - -summary(result2a) - - - -############################################################################ - -C = cor(df, use = "everything", - method = "spearman") - -corrplot::corrplot(C) - -apply((abs(C - diag(diag(C)))), 2, max) - -save.image("Ex11_Results.RData") - diff --git a/tests_current/Ex12_Sec5_6.R b/tests_current/Ex12_Sec5_6.R deleted file mode 100644 index 91c94ce6546ea2b1c9f6915993a91b09b6c48db3..0000000000000000000000000000000000000000 --- a/tests_current/Ex12_Sec5_6.R +++ /dev/null @@ -1,172 +0,0 @@ -####################################################### -# -# Example 12 (Section 5.6): -# -# Cox Regression -# -# This is the valid version for the JSS Paper -# -####################################################### - - -#install.packages("FBMS") -library(FBMS) -library(pec) #for the computation of cindex - -#install.packages("survival") -library(survival) - -setwd("/home/florian/FBMS/") - -download.file('https://www.uniklinik-freiburg.de/fileadmin/mediapool/08_institute/biometrie-statistik/Dateien/Studium_und_Lehre/Lehrbuecher/Multivariable_Model-building/gbsg_br_ca.zip', - 'gbsg_br_ca.zip') -df1 <- read.csv(unz('gbsg_br_ca.zip', - 'gbsg_br_ca/gbsg_br_ca.csv'), - header = TRUE) -#system('rm whitehall1.zip') - - -df <- df1[, c(13, 14, 2:4, 6:8, 10:12)] -names(df) = c("time","cens",names(df)[3:ncol(df)]) - - -set.seed(123) -train <- c(sample((1:nrow(df))[df$cens == 1], sum(df$cens)*2/3), # split separately events - sample((1:nrow(df))[df$cens == 0], sum(!df$cens)*2/3)) # and censored observations - - -df.train <- df[train,] -df.test <- df[-train,] - -time <- df.train$time - - -params <- gen.params.gmjmcmc(data = df.train[,-1]) -params$loglik$r = 0.5 -params$loglik$time = time #the time variable goes into the params structure - -params$feat$keep.min = 0.2 - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2") -probs <- gen.probs.gmjmcmc(transforms) -#probs$gen <- c(1,1,0,1) - - -#specify the estimator function for cox -surv.pseudo.loglik = function(y, x, model, complex, params){ - - if(length(params$r)==0) - params$r = 0.5 - data <- data.frame(time = params$time, cens = y, as.matrix(x[,model]))[,-3] # Removing intercept - if(dim(data)[2]==2) - { - return(list(crit=-10000, coefs=rep(0,1))) - } else { - formula1 = as.formula(paste0("Surv(time,cens)","~ 1 + .")) - - out = coxph(formula1, data = data) - - # logarithm of marginal likelihood - mloglik <- (out$loglik[2] - 
out$loglik[1])/2 - log(length(y)) * (dim(data)[2] - 2) - - # logarithm of model prior - if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log.prior(params, complex) - - return(list(crit = mloglik + lp, coefs = c(0,out$coefficients))) - - } - -} - -#Single chain analysis (just to illustrate that it works) -set.seed(5) -result <- gmjmcmc(data = df.train[,-1], loglik.pi = surv.pseudo.loglik, transforms = transforms, params = params, P = 5) - -summary(result,tol = 0.01,labels = names(df.train)[-c(1:2)],effects = c(0.025,0.5,0.975)) -summary(result) -summary(result,tol = 0.01,labels = names(df.train)[-c(1:2)]) - -linpreds.train <- predict(result,df.train[,-(1:2)], link = function(x) x) -linpreds <- predict(result,df.test[,-(1:2)], link = function(x) x) - -#plot(linpreds$aggr$mean) - -#Parallel version -set.seed(15) -probs$gen <- c(1,1,1,1) -result2 <- gmjmcmc.parallel(runs = 80, cores = 40, data = df.train[,-1], - loglik.pi = surv.pseudo.loglik, transforms = transforms, - probs = probs, params = params, P = 25) -summary(result2,tol = 0.01,labels = names(df.train)[-1],effects = c(0.025,0.5,0.975)) - -linpreds2.train <- predict(result2,df.train[,-(1:2)], link = function(x) x) -linpreds2 <- predict(result2,df.test[,-(1:2)], link = function(x) x) -#plot(linpreds2$aggr$mean) - - - -############################################# -#Parallel version only linear terms -set.seed(25) -probs$gen <- c(0,0,0,1) -result3 <- gmjmcmc.parallel(runs = 80, cores = 40, data = df.train[,-1], - loglik.pi = surv.pseudo.loglik, transforms = transforms, - probs = probs, params = params, P = 25) - -summary(result3,tol = 0.01,labels = names(df.train)[-(1:2)],effects = c(0.025,0.5,0.975)) - -linpreds3.train <- predict(result3,df.train[,-(1:2)], link = function(x) x) -linpreds3 <- predict(result3,df.test[,-(1:2)], link = function(x) x) -#plot(linpreds2$aggr$mean) - -############################################# -#Parallel version only fractional polynomials -set.seed(35) -probs$gen <- c(0,1,0,1) -result4 <- gmjmcmc.parallel(runs = 80, cores = 40, data = df.train[,-1], - loglik.pi = surv.pseudo.loglik, transforms = transforms, - probs = probs, params = params, P = 25) - -summary(result4,tol = 0.01,labels = names(df.train)[-(1:2)],effects = c(0.025,0.5,0.975)) - -linpreds4.train <- predict(result4,df.train[,-(1:2)], link = function(x) x) -linpreds4 <- predict(result4,df.test[,-(1:2)], link = function(x) x) -#plot(linpreds2$aggr$mean) - -# Compute cindex using package pec - -df.train$average.lin.pred1 <- linpreds.train$aggr$mean -df.train$average.lin.pred2 <- linpreds2.train$aggr$mean -df.train$average.lin.pred3 <- linpreds3.train$aggr$mean -df.train$average.lin.pred4 <- linpreds4.train$aggr$mean - -df.test$average.lin.pred1 <- linpreds$aggr$mean -df.test$average.lin.pred2 <- linpreds2$aggr$mean -df.test$average.lin.pred3 <- linpreds3$aggr$mean -df.test$average.lin.pred4 <- linpreds4$aggr$mean - -mod1 <- coxph(Surv(time, cens) ~ average.lin.pred1, data = as.data.frame(df.train), x = TRUE) -cindex1 <- cindex(mod1, mod1$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod2 <- coxph(Surv(time, cens) ~ average.lin.pred2, data = as.data.frame(df.train), x = TRUE) -cindex2 <- cindex(mod2, mod2$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod3 <- coxph(Surv(time, cens) ~ average.lin.pred3, data = as.data.frame(df.train), x = TRUE) -cindex3 <- cindex(mod3, mod3$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod4 <- 
coxph(Surv(time, cens) ~ average.lin.pred4, data = as.data.frame(df.train), x = TRUE) -cindex4 <- cindex(mod4, mod4$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - - -#Full model -mod5 <- coxph(Surv(time, cens) ~ 1+., data = as.data.frame(df.train[,1:11]),x = T) -cindex5 <- cindex(mod5, mod5$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -#Model without predictors -mod6 <- coxph(Surv(time, cens) ~ 1, data = as.data.frame(df.train[,1:11]),x = T) -cindex6 <- cindex(mod6, mod6$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -round(unlist(c(cindex1, cindex2, cindex3, cindex4, cindex5, cindex6)),3) - - diff --git a/tests_current/Ex12_Sec6_4.R b/tests_current/Ex12_Sec6_4.R index 2bf0acc3a4340a549041683e93638f532ad0456d..18f131808cf37ef8c92fc47c5784fb9f05512ebe 100644 --- a/tests_current/Ex12_Sec6_4.R +++ b/tests_current/Ex12_Sec6_4.R @@ -48,19 +48,29 @@ probs <- gen.probs.gmjmcmc(transforms) logistic.posterior.bic.irlssgd <- function (y, x, model, complex, params) { - mod <- irls.sgd(as.matrix(x[,model]), y, binomial(), + if (!is.null(params$crit)) { + mod <- glm.sgd(x[,model], y, binomial(), sgd.ctrl = list(start=params$coefs, subs=params$subs, maxit=10, alpha=0.00008, decay=0.99, histfreq=10)) + mod$deviance <- get_deviance(mod$coefficients, x[,model], y, binomial()) + mod$rank <- length(mod$coefficients) + } else { + mod <- irls.sgd(as.matrix(x[,model]), y, binomial(), irls.control=list(subs=params$subs, maxit=20, tol=1e-7, cooling = c(1,0.9,0.75), expl = c(3,1.5,1)), sgd.control=list(subs=params$subs, maxit=250, alpha=0.001, decay=0.99, histfreq=10)) + } # logarithm of marginal likelihood - mloglik <- -mod$deviance /2 - log(length(y)) * (mod$rank-1) + mloglik <- -mod$deviance / 2 - 0.5 * log(length(y)) * (mod$rank - 1) # logarithm of model prior if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r lp <- log.prior(params, complex) + crit <- mloglik + lp + + if (!is.null(params$crit) && params$crit > crit) { + return(list(crit = params$crit, coefs = params$coefs)) + } - return(list(crit = mloglik + lp, coefs = mod$coefficients)) - + return(list(crit = crit, coefs = mod$coefficients)) } diff --git a/tests_current/Ex13_Sec5_7.R b/tests_current/Ex13_Sec5_7.R deleted file mode 100644 index ad841519f3c4fe93d2edf53cc249e63095d58628..0000000000000000000000000000000000000000 --- a/tests_current/Ex13_Sec5_7.R +++ /dev/null @@ -1,275 +0,0 @@ -####################################################### -# -# Example 13 (Section 5.7): -# -# Prediction using non-linear Projections with different model prior -# -# DATA - abalone data set -# -# Data set is available at https://www.kaggle.com/datasets/rodolfomendes/abalone-dataset -# -# For convenience we provide the file abalone.csv which contains already the names of the variables -# -# -# This is the valid version for the JSS Paper -# -####################################################### - - -library(FBMS) -use.fbms = FALSE - -setwd("/home/florian/FBMS/") - - -df = read.csv2(file = "abalone.csv",sep = ",",dec = ".")[,c(9,1:8)] -df$Sex_F_vs_I = as.numeric(df$Sex == "F") -df$Sex_M_vs_I = as.numeric(df$Sex == "M") -df$Sex = as.factor(df$Sex) -df$Rings = as.numeric(df$Rings) - -summary(df) - - -#number of observations in the data - -n = dim(df)[1] - -# Create Training and Test dataset - -# Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD -# thesis, Computer Science Department, University of Tasmania. 
- -#-- Test set performance (final 1044 examples, first 3133 used for training): -# 24.86% Cascade-Correlation (no hidden nodes) -# 26.25% Cascade-Correlation (5 hidden nodes) -# 21.5% C4.5 -# 0.0% Linear Discriminate Analysis -# 3.57% k=5 Nearest Neighbour -# (Problem encoded as a classification task) - - -# remove variable sex because gmjmcmc cannot handle factor variables -df.training = df[1:3133,-2] -df.test = df[3134:n,-2] - - -summary(df.training) - - -pred.RMSE = rep(0,5) # Collect the results of prediction RMSE from the five different methods - - - -transforms <- c("sigmoid") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(0,0,1,1) #Only projections! - -params <- gen.params.gmjmcmc(df.training) -params$loglik$r = 0.2 - -############################################## -# -# New function for model prior -# -# - -gaussian.loglik.gap.open.prior = function(y, x, model, complex, params){ - { - if (length(params) == 0) - params <- list(r = 1/dim(x)[1]) #list(r = 0.5) - suppressWarnings({ - mod <- fastglm(as.matrix(x[, model]), y, family = gaussian()) - }) - - - # ret <- (-(mod$deviance + 2 * log(length(y)) * (mod$rank - - # 1) - 2 * log(params$r) * (sum(complex$oc))))/2 - - # logarithm of marginal likelihood - mloglik <- -mod$deviance/2 + log(length(y)) * (mod$rank - 1) - - # logarithm of model prior - #lp <- log(params$r) * (sum(complex$depth) + 0.2 * sum(complex$oc)) - lp <- log(params$r) * (sum(complex$depth * complex$oc)) - # lp <- log(params$r) * (sum(complex$oc)) - return(list(crit = mloglik + lp, coefs = mod$coefficients)) - } - -} - - - - -############################################################################# -# -# Using method 0 for alpha (simply set to 1, default) -# -############################################################################# - -set.seed(5001) - - -#result <- gmjmcmc(df.training, logistic.loglik, logistic.loglik.alpha, transforms -# = transforms, probs = probs, params = params) - -if (use.fbms) { - result <- fbms(data = df.training, family = "custom", loglik.pi = gaussian.loglik.gap.open.prior, - method = "gmjmcmc", transforms = transforms, - probs = probs, params = params) -} else { -# result <- gmjmcmc(df.training, transforms = transforms, probs = probs) - - result <- gmjmcmc(df.training, loglik.pi = gaussian.loglik.gap.open.prior, - transforms = transforms, probs = probs) - - } -summary(result) - -pred = predict(result, x = df.test[,-1], link = function(x)(x)) - -pred.RMSE[1] = sqrt(mean((pred$aggr$mean - df.test$Rings)^2)) - -plot(pred$aggr$mean, df.test$Rings) - - - - - -############################################################################# -# -# Parallel version -# -############################################################################# - - -set.seed(5002) - -if (use.fbms) { - result_parallel <- fbms(data = df.training, family = "custom", loglik.pi = gaussian.loglik.gap.open.prior, - loglik.alpha = gaussian.loglik.alpha, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - transforms = transforms, probs = probs, params = params, P=25) -} else { - result_parallel = gmjmcmc.parallel(runs = 40, cores = 40, data = df.training, - loglik.pi = gaussian.loglik.gap.open.prior, - loglik.alpha = gaussian.loglik.alpha, - transforms = transforms, probs = probs, params = params, P=25) -} -summary(result_parallel) - - - -pred_parallel = predict(result_parallel, x = df.test[,-1], link = function(x)(x)) - -pred.RMSE[2] = sqrt(mean((pred_parallel$aggr$mean - df.test$Rings)^2)) - -plot(pred_parallel$aggr$mean, df.test$Rings) -abline(0,1) - - - 
-############################################################################# -# -# Using method 3 to estimate alpha -# -############################################################################# - -params$feat$alpha = 3 - - -set.seed(5003) - - -if (use.fbms) { - result.a3 <- fbms(data = df.training, family = "custom", loglik.pi = gaussian.loglik.gap.open.prior, - method = "gmjmcmc", transforms = transforms, - probs = probs, params = params) -} else { - result.a3 <- gmjmcmc(df.training, transforms = transforms, probs = probs, params = params) -} -summary(result.a3) - - - -pred.a3 = predict(result.a3, x = df.test[,-1], link = function(x)(x)) - -pred.RMSE[3] = sqrt(mean((pred.a3$aggr$mean - df.test$Rings)^2)) - -plot(pred.a3$aggr$mean, df.test$Rings) - - - - - -############################################################################# -# -# Parallel version params$feat$alpha = 3 -# -############################################################################# - -params$feat$alpha = 3 - -set.seed(5004) - -if (use.fbms) { - result_parallel.a3 <- fbms(data = df.training, family = "custom", loglik.pi = gaussian.loglik.gap.open.prior, - loglik.alpha = gaussian.loglik.alpha, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - transforms = transforms, probs = probs, params = params, P=25) -} else { - result_parallel.a3 = gmjmcmc.parallel(runs = 40, cores = 40, data = df.training, - loglik.pi = gaussian.loglik.gap.open.prior, - loglik.alpha = gaussian.loglik.alpha, - transforms = transforms, probs = probs, params = params, P=25) -} -summary(result_parallel.a3) - - - -pred_parallel.a3 = predict(result_parallel.a3, x = df.test[,-1], link = function(x)(x)) - -pred.RMSE[4] = sqrt(mean((pred_parallel.a3$aggr$mean - df.test$Rings)^2)) - -plot(pred_parallel.a3$aggr$mean, df.test$Rings) -abline(0,1) - - - - - - - -############################################################################# -# -# Parallel version with fractional polynomials -# -############################################################################# - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(0,1,0,1) #Only modifications! 
- -set.seed(50005) - -if (use.fbms) { - result.fp <- fbms(data = df.training, method = "gmjmcmc.parallel", runs = 40, cores = 40, - transforms = transforms, probs = probs, params = params, P=25) -} else { - result.fp <- gmjmcmc.parallel(runs = 40, cores = 40, data = df.training, - loglik.pi =gaussian.loglik,loglik.alpha = gaussian.loglik.alpha, - transforms = transforms, probs = probs, params = params, P=25) -} -summary(result.fp) - - - -pred_fp = predict(result.fp, x = df.test[,-1], link = function(x)(x)) - -pred.RMSE[5] = sqrt(mean((pred_fp$aggr$mean - df.test$Rings)^2)) - -plot(pred_fp$aggr$mean, df.test$Rings) - - - -pred.RMSE diff --git a/tests_current/Ex13_Sec6_5.R b/tests_current/Ex13_Sec6_5.R index 453287b5917247a4bd4c1057d17dc6e6e5fa597b..4f09cb53adc5ba6f48eba27a35a9547fe6ce1a66 100644 --- a/tests_current/Ex13_Sec6_5.R +++ b/tests_current/Ex13_Sec6_5.R @@ -69,7 +69,7 @@ surv.pseudo.loglik = function(y, x, model, complex, params){ out = coxph(formula1, data = data) # logarithm of marginal likelihood - mloglik <- (out$loglik[2] - out$loglik[1])/2 - log(length(y)) * (dim(data)[2] - 2) + mloglik <- (out$loglik[2] - out$loglik[1]) - log(length(y)) * (dim(data)[2] - 2)/2 # logarithm of model prior if (length(params$r) == 0) params$r <- 1/dim(x)[1] # default value or parameter r diff --git a/tests_current/Ex1_Sec3.R b/tests_current/Ex1_Sec3.R index b173632c7fcfad8e152f9c97295dc30888068cbc..62ecfa8eecd18cc9cdf73720002fe949d54ed215 100644 --- a/tests_current/Ex1_Sec3.R +++ b/tests_current/Ex1_Sec3.R @@ -34,14 +34,26 @@ use.fbms = FALSE params <- gen.params.gmjmcmc(df) -set.seed(123) + + +#setting residual variance fixed to the one from the Linear model +#lmm <- lm(formula = MajorAxis ~ .,df) +#params$loglik$var <- var(lmm$residuals) +#set.seed(123) +#to set variance to unknown use below +#params$loglik$var <- "unknown" +#to set to 1 +#params$loglik$var <- 1 if (use.fbms) { - result.default <- fbms(formula = MajorAxis ~ 1 + . , data = df, method = "gmjmcmc", transforms = transforms) + result.default <- fbms(formula = MajorAxis ~ 1 + . 
, data = df, method = "gmjmcmc", transforms = transforms, params = params) } else { - result.default <- gmjmcmc(df, transforms = transforms) + result.default <- gmjmcmc(df, transforms = transforms, params = params) } +summary(result.default, labels = names(df)[-1]) +preds <- predict(result.default, df[,-1]) +sqrt(mean((preds$aggr$mean - df$MajorAxis)^2)) #################################################### # @@ -54,11 +66,12 @@ set.seed(123) if (use.fbms) { result.P50 <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - P=50, N.init=1000, N.final=5000) + P=150, N.init=1000, N.final=1000, params = params) } else { result.P50 <- gmjmcmc(df, transforms = transforms, - P=50, N.init=1000, N.final=5000) + P=150, N.init=1000, N.final=1000, params = params) } +summary(result.P50, labels = names(df)[-1]) #################################################### # @@ -67,18 +80,14 @@ if (use.fbms) { #################################################### set.seed(124) - -# Actual parallel analysis works currently only under Linux or Mac -# result_mm = gmjmcmc.parallel(runs = 4, cores = 4,df, gaussian.loglik, gaussian.loglik.alpha, transforms) - if (use.fbms) { result_parallel <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, runs = 40, cores = 10, P=25,params = params) } else { - result_parallel <- gmjmcmc.parallel(runs = 40, cores = 10,data = df, loglik.pi = gaussian.loglik, + result_parallel <- gmjmcmc.parallel(runs = 40, cores = 10, data = df, loglik.pi = gaussian.loglik, transforms = transforms, P=25,params = params) } - +summary(result_parallel, tol = 0.01,labels = names(df)[-1]) #################################################### # # Inspection of Results (Section 3.4) @@ -128,7 +137,7 @@ plot(result_parallel, 12) # Prediction #preds <- predict(result, df[,-1], link = function(x) x) -preds <- predict(result.default, df[,-1]) +preds <- predict(result.default, df[,-1]) pdf("prediction.pdf") plot(preds$aggr$mean, df$MajorAxis) @@ -156,7 +165,7 @@ rmse.P50 <- sqrt(mean((preds.P50$aggr$mean - df$MajorAxis)^2)) ############################### -preds.multi <- predict(result_parallel , df[,-1], link = function(x) x) +preds.multi <- predict(result_parallel , df[,-1], link = function(x) x) pdf("pred_parallel.pdf") plot(preds.multi$aggr$mean, df$MajorAxis) @@ -164,6 +173,4 @@ dev.off() rmse.parallel <- sqrt(mean((preds.multi$aggr$mean - df$MajorAxis)^2)) - - c(rmse.default, rmse.P50, rmse.parallel) \ No newline at end of file diff --git a/tests_current/Ex2_Sec4_1.R b/tests_current/Ex2_Sec4_1.R index ed826f6d7969c55d66cbfa5d5d1a3ff80f43de45..de32a6585f61aad2fc4c2922a7de9166edc2a6bb 100644 --- a/tests_current/Ex2_Sec4_1.R +++ b/tests_current/Ex2_Sec4_1.R @@ -87,6 +87,8 @@ probs.lin <- gen.probs.mjmcmc() params.lin <- gen.params.mjmcmc(df) params.lin$loglik$r <- 1/dim(df)[1] +#to set variance to unknown uncomment below +#params.lin$loglik$var <- "unknown" if (use.fbms) { result.lin <- fbms(data = df, N = 5000) diff --git a/tests_current/Ex5_Sec4_3.R b/tests_current/Ex5_Sec4_3.R index 9ac8bdf78af9bfd8512c8d4ff336c7f006651558..a2f0095d41b12a2834bd80757ad98f9855b1de5f 100644 --- a/tests_current/Ex5_Sec4_3.R +++ b/tests_current/Ex5_Sec4_3.R @@ -42,7 +42,9 @@ probs <- gen.probs.gmjmcmc(transforms) probs$gen <- c(0,1,0,1) # Only modifications! 
params <- gen.params.gmjmcmc(df) params$feat$D <- 1 # Set depth of features to 1 -params$loglik$var <- "unknown" + +#to set variance to unknown uncomment below +#params$loglik$var <- "unknown" #################################################### # diff --git a/tests_current/Ex6_Sec4_4.R b/tests_current/Ex6_Sec4_4.R index 2e9124478212d1528e602536108e1d2fe221099e..5fd491a768e87c4a138b9fb89e59fa36517c09cf 100644 --- a/tests_current/Ex6_Sec4_4.R +++ b/tests_current/Ex6_Sec4_4.R @@ -67,7 +67,7 @@ probs$gen <- c(0,0,1,1) #Only projections! params <- gen.params.gmjmcmc(df.training) params$loglik$r = 0.9 - +params$loglik$var = "unknown" ############################################################################# @@ -132,8 +132,8 @@ abline(0,1) # Using method 3 to estimate alpha # ############################################################################# - params$feat$alpha = "deep" +#params$feat$alpha = "random" set.seed(5003) @@ -165,7 +165,7 @@ plot(pred.a3$aggr$mean, df.test$Rings) # ############################################################################# -params$feat$alpha = "deep" +params$feat$alpha = "random" set.seed(5004)
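A recurring fix in the test estimators above changes the model-size penalty from `log(n) * k` to `0.5 * log(n) * k`, matching the BIC-based Laplace approximation `log m(y) ~ logLik - (k/2) * log(n)`. A self-contained toy check of the corrected formula (illustrative data only):

```r
# Toy illustration of the corrected Laplace/BIC-style approximation:
# log m(y) ~ logLik - (k/2) * log(n), with k free slope coefficients.
set.seed(1)
toy <- data.frame(y = rnorm(50), x1 = rnorm(50), x2 = rnorm(50))
fit <- lm(y ~ x1 + x2, data = toy)

n <- nrow(toy)
k <- length(coef(fit)) - 1  # intercept excluded, as in the estimators above
mloglik <- as.numeric(logLik(fit)) - 0.5 * log(n) * k
mloglik  # the pre-fix code used log(n) * k, i.e. twice the intended penalty
```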