From 5188675cfcd62e7ceb7ae618cc891ce09cce0c82 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sun, 27 Nov 2022 12:37:49 +0100 Subject: [PATCH 01/13] Store coefficients for each model. Only gaussian loglik supports it as of now. Start writing a predict function. --- R/gmjmcmc.R | 3 ++- R/gmjmcmc_support.R | 17 +++++++++-------- R/likelihoods.R | 2 +- R/local_optim.R | 22 +++++++++++++--------- R/mjmcmc.R | 9 ++++++--- R/predict.R | 20 ++++++++++++++++++++ R/results.R | 8 ++++---- 7 files changed, 55 insertions(+), 26 deletions(-) create mode 100644 R/predict.R diff --git a/R/gmjmcmc.R b/R/gmjmcmc.R index 8e037b9..96936d0 100644 --- a/R/gmjmcmc.R +++ b/R/gmjmcmc.R @@ -58,7 +58,8 @@ gmjmcmc <- function (data, loglik.pi, loglik.alpha, transforms, T, N.init, N.fin else data.t <- data # Initialize first model of population model.cur <- as.logical(rbinom(n = length(S[[t]]), size = 1, prob = 0.5)) - model.cur <- list(prob=0, model=model.cur, crit=loglik.pre(loglik.pi, model.cur, complex, data.t, params$loglik), alpha=0) + model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data.t, params$loglik) + model.cur <- list(prob=0, model=model.cur, coefs=model.cur.res$coefs, crit=model.cur.res$crit, alpha=0) best.crit <- model.cur$crit # Reset first best criteria value # Run MJMCMC over the population diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index c21262a..60de7b2 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -42,13 +42,14 @@ marginal.probs <- function (models) { # Function for calculating feature importance through renormalized model estimates marginal.probs.renorm <- function (models) { + models <- lapply(models, function (x) x[c("model", "crit")]) model.size <- length(models[[1]]$model) - models.matrix <- matrix(unlist(models), ncol=model.size+3, byrow=T) - models.matrix <- models.matrix[(!duplicated(models.matrix[,2:(model.size+1)], dim=1, fromLast=T)),] - max_mlik <- max(models.matrix[,(model.size+2)]) - crit.sum <- sum(exp(models.matrix[,(model.size+2)]-max_mlik)) + models.matrix <- matrix(unlist(models), ncol=model.size+1, byrow=T) + models.matrix <- models.matrix[(!duplicated(models.matrix[,1:(model.size)], dim=1, fromLast=T)),] + max_mlik <- max(models.matrix[,(model.size+1)]) + crit.sum <- sum(exp(models.matrix[,(model.size+1)]-max_mlik)) probs <- matrix(NA,1,model.size) - for (i in 2:(model.size+1)) probs[i-1] <- sum(exp(models.matrix[as.logical(models.matrix[,i]),(model.size+2)]-max_mlik))/crit.sum + for (i in 1:(model.size)) probs[i] <- sum(exp(models.matrix[as.logical(models.matrix[,i]),(model.size+1)]-max_mlik))/crit.sum return(probs) } @@ -71,10 +72,10 @@ loglik.pre <- function (loglik.pi, model, complex, data, params) { # Get the complexity measures for just this model complex <- list(width=complex$width[model], oc=complex$oc[model], depth=complex$depth[model]) # Call the model estimator with the data and the model, note that we add the intercept to every model - crit <- loglik.pi(data[,1], data[,-1], c(T,model), complex, params) + model.res <- loglik.pi(data[,1], data[,-1], c(T,model), complex, params) # Check that the critical value is acceptable - if (!is.numeric(crit) || is.nan(crit)) crit <- -.Machine$double.xmax - return(crit) + if (!is.numeric(model.res$crit) || is.nan(model.res$crit)) model.res$crit <- -.Machine$double.xmax + return(model.res) } #' Summarize results from GMJMCMC diff --git a/R/likelihoods.R b/R/likelihoods.R index 73d8f25..c284558 100644 --- a/R/likelihoods.R +++ b/R/likelihoods.R @@ -47,7 +47,7 @@ logistic.loglik.alpha <- function (a, data, mu_func) { gaussian.loglik <- function (y, x, model, complex, params) { suppressWarnings({mod <- fastglm(as.matrix(x[,model]), y, family=gaussian())}) ret <- (-(mod$deviance + 2*log(length(y))*(mod$rank-1) - 2*log(params$r)*(sum(complex$oc))))/2 - return(ret) + return(list(crit=ret, coefs=mod$coefficients)) } #' Log likelihood function for gaussian regression for alpha calculation diff --git a/R/local_optim.R b/R/local_optim.R index c730708..a27b192 100644 --- a/R/local_optim.R +++ b/R/local_optim.R @@ -13,20 +13,22 @@ simulated.annealing <- function (model, data, loglik.pi, indices, complex, param temp <- params$t.init # Initial temperature # Calculate current likelihood - model.lik <- loglik.pre(loglik.pi, model, complex, data, loglikparams) - models[[length(models)+1]] <- list(prob=NA, model=model, crit=model.lik, alpha=NA) + model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams) + model.lik <- model.res$crit + models[[length(models) + 1]] <- list(prob=NA, model=model, coefs=model.res$coefs, crit=model.lik, alpha=NA) # print(paste("SA Start:", model.lik)) while (temp > params$t.min) { # Make M tries at current temperature for (m in 1:params$M) { # Get a modified model as proposal and calculate its likelihood proposal <- xor(model, gen.proposal(model, params$kern, kernel, indices)$swap) - proposal.lik <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + proposal.lik <- model.proposal$crit # Store the model that we have calculated - models[[length(models)+1]] <- list(prob=NA, model=proposal, crit=proposal.lik, alpha=NA) + models[[length(models) + 1]] <- list(prob=NA, model=proposal, coefs=model.proposal$coefs, crit=proposal.lik, alpha=NA) # Calculate move probability for negative steps (Bolzmann distribution, see Blum and Roli p. 274) if (proposal.lik > model.lik) alpha <- 1 - else alpha <- min(1, exp((proposal.lik - model.lik)/temp)) + else alpha <- min(1, exp((proposal.lik - model.lik) / temp)) # Accept move with probability alpha if (runif(1) < alpha) { model <- proposal @@ -48,8 +50,9 @@ greedy.optim <- function (model, data, loglik.pi, indices, complex, params, logl if (is.null(kernel)) kernel <- sample.int(n = 6, size = 1, prob = params$kern$probs) # Calculate current likelihood - model.lik <- loglik.pre(loglik.pi, model, complex, data, loglikparams) - models[[length(models)+1]] <- list(prob=NA, model=model, crit=model.lik, alpha=NA) + model.res <- loglik.pre(loglik.pi, model, complex, data, loglikparams) + model.lik <- model.res$crit + models[[length(models)+1]] <- list(prob=NA, model=model, coefs=model.res$coefs, crit=model.lik, alpha=NA) # Run the algorithm for the number of steps specified for (i in 1:params$steps) { @@ -59,9 +62,10 @@ greedy.optim <- function (model, data, loglik.pi, indices, complex, params, logl for (j in 1:params$tries) { # Get a modified model as proposal and calculate its likelihood proposal <- xor(model, gen.proposal(model, params$kern, kernel, indices)$swap) - proposal.lik <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + model.proposal <- loglik.pre(loglik.pi, proposal, complex, data, loglikparams) + proposal.lik <- model.proposal$crit # Store the model that we have calculated - models[[length(models)+1]] <- list(prob=NA, model=proposal, crit=proposal.lik, alpha=NA) + models[[length(models)+1]] <- list(prob=NA, model=proposal, coefs=model.proposal$coefs, crit=proposal.lik, alpha=NA) if (proposal.lik > proposal.lik.best) { proposal.best <- proposal proposal.lik.best <- proposal.lik diff --git a/R/mjmcmc.R b/R/mjmcmc.R index 9e0238d..80bb0b2 100644 --- a/R/mjmcmc.R +++ b/R/mjmcmc.R @@ -27,7 +27,8 @@ mjmcmc <- function (data, loglik.pi, N, probs, params, sub=F) { # Initialize first model model.cur <- as.logical(rbinom(n = length(S), size = 1, prob = 0.5)) - model.cur <- list(prob=0, model=model.cur, crit=loglik.pre(loglik.pi, model.cur, complex, data, params$loglik), alpha=0) + model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data, params$loglik) + model.cur <- list(prob=0, model=model.cur, coefs=model.cur.res$coefs, crit=model.cur.res$crit, alpha=0) best.crit <- model.cur$crit # Set first best criteria value cat("\nMJMCMC begin.\n") @@ -185,7 +186,8 @@ mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, prob model.cur$prob <- prob.proposal(proposal$model, model.cur$model, q.g, params$mh, pip_estimate) } # Calculate log likelihoods for the proposed model - proposal$crit <- loglik.pre(loglik.pi, proposal$model, complex, data, params$loglik) + proposal.res <- loglik.pre(loglik.pi, proposal$model, complex, data, params$loglik) + proposal$crit <- proposal.res$crit # TODO: Compare to a list of best mliks for all visited models, # TODO: update that list if our estimate is better, otherwise update our estimate. @@ -193,7 +195,7 @@ mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, prob # If we are running with subsampling, check the list for a better mlik if (!is.null(visited.models)) { mod.idx <- vec_in_mat(visited.models$models[1:visited.models$count,,drop=F], proposal$model) - if (mod.idx != 0) proposal$crit <- max (proposal$crit, visited.models$crit[mod.idx]) + if (mod.idx != 0) proposal$crit <- max(proposal$crit, visited.models$crit[mod.idx]) } # Calculate acceptance probability for proposed model @@ -201,5 +203,6 @@ mjmcmc.prop <- function (data, loglik.pi, model.cur, complex, pip_estimate, prob ### Format results and return them proposal$swap <- NULL; proposal$S <- NULL + proposal$coefs <- proposal.res$coefs return(proposal) } \ No newline at end of file diff --git a/R/predict.R b/R/predict.R new file mode 100644 index 0000000..b90637a --- /dev/null +++ b/R/predict.R @@ -0,0 +1,20 @@ +#' Predict using a BGNLM model. +#' @param model The model to use. +#' @param x The new data to use for the prediction, a matrix where each row is an observation. +#' @param link The link function to use +#' @param populations The populations to use +predict.bgnlm <- function (model, x, link=function(x) x, populations="last") { + if (populations=="last") pops.use <- length(model$populations) + else if (populations=="all") pops.use <- seq_len(length(model$populations)) + else if (populations=="best") pops.use <- which.max(unlist(model$best.margs)) + + # TODO: Generalize for multiple populations + models <- model$models[[pops.use]] + features <- model$populations[[pops.use]] + + # Precalculate the features for the new data (c(0,1...) is because precalc features thinks there is an intercept and y col). + x.precalc <- precalc.features(cbind(0, 1, x), features)[, -1] + +} + +x <- kmdata[1:10, -1] diff --git a/R/results.R b/R/results.R index 9e13daf..fc1657e 100644 --- a/R/results.R +++ b/R/results.R @@ -63,7 +63,7 @@ merge.results <- function (results, populations="last", complex.measure=1, tol=0 # row 4 is the total weighted density of those features feats.map <- matrix(1:feat.count, 4, feat.count, byrow=T) for (i in seq_len(nrow(cors))) { - equiv.feats <- which(cors[i,] >= (1-tol)) + equiv.feats <- which(cors[i, ] >= (1 - tol)) # Compare equivalent features complexity to find most simple equiv.complex <- list(width=complex$width[equiv.feats], oc=complex$oc[equiv.feats], depth=complex$depth[equiv.feats]) equiv.simplest <- lapply(equiv.complex, which.min) @@ -71,11 +71,11 @@ merge.results <- function (results, populations="last", complex.measure=1, tol=0 feats.map[4,equiv.feats] <- sum(renorms[equiv.feats]) } # Select the simplest features based on the specified complexity measure and sort them - feats.simplest.ids <- feats.map[complex.measure,unique(feats.map[complex.measure,])] - feats.simplest.ids <- feats.simplest.ids[order(feats.map[4,feats.simplest.ids])] + feats.simplest.ids <- feats.map[complex.measure,unique(feats.map[complex.measure, ])] + feats.simplest.ids <- feats.simplest.ids[order(feats.map[4, feats.simplest.ids])] counts <- sapply(feats.simplest.ids, function(x) sum(feats.map[1,] == x)) feats.simplest <- features[feats.simplest.ids] - importance <- feats.map[4,feats.simplest.ids] + importance <- feats.map[4, feats.simplest.ids] merged <- list(features=feats.simplest, marg.probs=importance, counts=counts) attr(merged, "class") <- "gmjmcmcresult" return(merged) -- GitLab From 788c85ae7dd10d8bdd9e7f4359700639f3ab2e42 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sun, 27 Nov 2022 15:18:33 +0100 Subject: [PATCH 02/13] Predict function works but needs further testing and fixes for quantiles. --- R/gmjmcmc.R | 2 +- R/gmjmcmc_support.R | 23 ++++++++++++++++------- R/likelihoods.R | 19 ++++++++++++++++++- R/nonlinear_functions.R | 8 ++++++++ R/predict.R | 34 +++++++++++++++++++++++++++++++--- R/results.R | 2 +- 6 files changed, 75 insertions(+), 13 deletions(-) diff --git a/R/gmjmcmc.R b/R/gmjmcmc.R index 96936d0..51f6568 100644 --- a/R/gmjmcmc.R +++ b/R/gmjmcmc.R @@ -70,7 +70,7 @@ gmjmcmc <- function (data, loglik.pi, loglik.alpha, transforms, T, N.init, N.fin # Add the models visited in the current population to the model list models[[t]] <- mjmcmc_res$models # Calculate marginal likelihoods for current features - marg.probs[[t]] <- marginal.probs.renorm(c(mjmcmc_res$models, mjmcmc_res$lo.models)) + marg.probs[[t]] <- marginal.probs.renorm(c(mjmcmc_res$models, mjmcmc_res$lo.models))$probs # Store best marginal model probability for current population best.margs[[t]] <- mjmcmc_res$best.crit # Print the marginal posterior distribution of the features after MJMCMC diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index 60de7b2..b64fae3 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -40,19 +40,28 @@ marginal.probs <- function (models) { return(probs) } -# Function for calculating feature importance through renormalized model estimates -marginal.probs.renorm <- function (models) { +#' Function for calculating feature importance through renormalized model estimates +#' @param models The models to use. +#' @oaram type Select which probabilities are of interest, features or models +marginal.probs.renorm <- function (models, type="features") { models <- lapply(models, function (x) x[c("model", "crit")]) model.size <- length(models[[1]]$model) models.matrix <- matrix(unlist(models), ncol=model.size+1, byrow=T) - models.matrix <- models.matrix[(!duplicated(models.matrix[,1:(model.size)], dim=1, fromLast=T)),] + duplicates <- duplicated(models.matrix[,1:(model.size)], dim=1, fromLast=T) + models.matrix <- models.matrix[!duplicates, ] max_mlik <- max(models.matrix[,(model.size+1)]) crit.sum <- sum(exp(models.matrix[,(model.size+1)]-max_mlik)) - probs <- matrix(NA,1,model.size) - for (i in 1:(model.size)) probs[i] <- sum(exp(models.matrix[as.logical(models.matrix[,i]),(model.size+1)]-max_mlik))/crit.sum - return(probs) + if (type == "features") { + probs <- matrix(NA,1,model.size) + for (i in 1:(model.size)) probs[i] <- sum(exp(models.matrix[as.logical(models.matrix[,i]),(model.size+1)]-max_mlik))/crit.sum + } else if (type =="models") { + probs <- matrix(NA,1, nrow(models.matrix)) + for (i in seq_len(nrow(models.matrix))) probs[i] <- sum(exp(models.matrix[i, ncol(models.matrix)]-max_mlik))/crit.sum + } + return(list(idx=which(!duplicates), probs=probs)) } + # Function for precalculating features for a new feature population precalc.features <- function (data, features) { precalc <- matrix(NA, nrow(data), length(features)+2) @@ -91,7 +100,7 @@ summary.gmjresult <- function (results, population="last") { for (i in seq_along(feature_strings)) { feature_strings[[i]] <- print.feature(results$populations[[pops]][[i]]) } - feature_importance <- marginal.probs.renorm(results$models[[pops]]) + feature_importance <- marginal.probs.renorm(results$models[[pops]])$probs return(list(features=feature_strings, importance=feature_importance)) } diff --git a/R/likelihoods.R b/R/likelihoods.R index c284558..18cb11a 100644 --- a/R/likelihoods.R +++ b/R/likelihoods.R @@ -18,7 +18,7 @@ logistic.loglik <- function (y, x, model, complex, params) { r <- 20/223 suppressWarnings({mod <- fastglm(as.matrix(x[,model]), y, family=binomial())}) ret <- (-(mod$deviance -2*log(r)*sum(complex$oc)))/2 - return(ret) + return(list(crit=ret, coefs=mod$coefficients)) } #' Log likelihood function for logistic regression for alpha calculation @@ -50,6 +50,23 @@ gaussian.loglik <- function (y, x, model, complex, params) { return(list(crit=ret, coefs=mod$coefficients)) } +#' Log likelihood function for gaussian regression with a prior p(m)=r*sum(total_width), using subsampling. +#' +#' @param y A vector containing the dependent variable +#' @param x The matrix containing the precalculated features +#' @param model The model to estimate as a logical vector +#' @param complex A list of complexity measures for the features +#' @param params A list of parameters for the log likelihood, supplied by the user +#' +#' @export gaussian.loglik.bic.irlssgd +gaussian.loglik.bic.irlssgd <- function (y, x, model, complex, params) { + mod <- irls.sgd(as.matrix(x[,model]), y, gaussian(), + irls.control=list(subs=params$subs, maxit=20, tol=1e-7, cooling = c(1,0.9,0.75), expl = c(3,1.5,1)), + sgd.control=list(subs=params$subs, maxit=250, alpha=0.001, decay=0.99, histfreq=10)) + ret <- (-(mod$deviance + 2*log(length(y))*(mod$rank-1) - 2*log(params$r)*(sum(complex$oc))))/2 + return(list(crit=ret, coefs=mod$coefficients)) +} + #' Log likelihood function for gaussian regression for alpha calculation #' This function is just the bare likelihood function #' Note that it only gives a proportional value and is equivalent to least squares diff --git a/R/nonlinear_functions.R b/R/nonlinear_functions.R index 62c4790..236ef8b 100644 --- a/R/nonlinear_functions.R +++ b/R/nonlinear_functions.R @@ -91,6 +91,14 @@ gauss <- function(x) exp(-x*x) #' @export to25 to25 <- function(x)abs(x)^(2.5) +#' To 3 power +#' +#' @param x The vector of values +#' @return x^(3) +#' +#' @export to35 +to3 <- function(x)abs(x)^(3) + #' To 3.5 power #' #' @param x The vector of values diff --git a/R/predict.R b/R/predict.R index b90637a..1d86147 100644 --- a/R/predict.R +++ b/R/predict.R @@ -3,18 +3,46 @@ #' @param x The new data to use for the prediction, a matrix where each row is an observation. #' @param link The link function to use #' @param populations The populations to use -predict.bgnlm <- function (model, x, link=function(x) x, populations="last") { +#' @param quantiles The quantiles to calculate credible intervals for the posterior moddes (in model space). +predict.bgnlm <- function (model, x, link=function(x) x, populations="last", quantiles=c(0.025, 0.5, 0.975)) { if (populations=="last") pops.use <- length(model$populations) else if (populations=="all") pops.use <- seq_len(length(model$populations)) else if (populations=="best") pops.use <- which.max(unlist(model$best.margs)) - # TODO: Generalize for multiple populations + # TODO: Generalize for multiple populations models <- model$models[[pops.use]] features <- model$populations[[pops.use]] # Precalculate the features for the new data (c(0,1...) is because precalc features thinks there is an intercept and y col). x.precalc <- precalc.features(cbind(0, 1, x), features)[, -1] + # Get the renormalized marginal probabilities and indices of non-duplicate models + model.probs <- marginal.probs.renorm(models, "models") + models <- models[model.probs$idx] + + yhat <- matrix(NA, nrow=nrow(x), ncol=length(models)) + for (i in seq_along(models)) { + yhat[, i] <- link(x.precalc[, c(TRUE, models[[i]]$model), drop=FALSE] %*% models[[i]]$coefs) + } + + mean.pred <- rowSums(yhat * as.numeric(model.probs$probs)) + quantiles <- apply(yhat, 1, weighted.quantiles, weights=model.probs$probs, prob=quantiles) + + return(list(mean=mean.pred, quantiles=quantiles)) } -x <- kmdata[1:10, -1] +#' Calculate weighted quantiles +#' @param values The values to use +#' @param weights The weights of the values +#' @param prob The probabilities of the quantiles to use +#' @return +weighted.quantiles <- function (values, weights, prob=c(0.025, 0.975)) { + ordered <- order(values) + P <- cumsum(weights[ordered]) + + iv <- integer(length(prob)) + for (i in seq_along(iv)) { + iv[i] <- which.max(P >= prob[i]) + } + {values[ordered]}[iv] +} diff --git a/R/results.R b/R/results.R index fc1657e..1f4f48f 100644 --- a/R/results.R +++ b/R/results.R @@ -117,7 +117,7 @@ summary.gmjmcmcresult <- function (results, pop="last") { # Get features as strings for printing feats.strings <- sapply(results$populations[[pop]], print.feature) # Get marginal posterior of features - marg.probs <- marginal.probs.renorm(results$models[[pop]]) + marg.probs <- marginal.probs.renorm(results$models[[pop]])$probs # Print the final distribution cat(" Importance | Feature\n") print.dist(marg.probs, feats.strings, -1) -- GitLab From d505c4bd516baff814c48c78a8e3f46a3fbe2ad4 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sun, 27 Nov 2022 15:32:35 +0100 Subject: [PATCH 03/13] Removed redundant sum. --- R/gmjmcmc_support.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index b64fae3..54feff9 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -56,7 +56,7 @@ marginal.probs.renorm <- function (models, type="features") { for (i in 1:(model.size)) probs[i] <- sum(exp(models.matrix[as.logical(models.matrix[,i]),(model.size+1)]-max_mlik))/crit.sum } else if (type =="models") { probs <- matrix(NA,1, nrow(models.matrix)) - for (i in seq_len(nrow(models.matrix))) probs[i] <- sum(exp(models.matrix[i, ncol(models.matrix)]-max_mlik))/crit.sum + for (i in seq_len(nrow(models.matrix))) probs[i] <- exp(models.matrix[i, ncol(models.matrix)]-max_mlik)/crit.sum } return(list(idx=which(!duplicates), probs=probs)) } -- GitLab From e2c4f301ea20e432a246a9bf1dc8e8e835f0f5d7 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sun, 27 Nov 2022 22:57:15 +0100 Subject: [PATCH 04/13] Added a comment in predict. --- R/predict.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/predict.R b/R/predict.R index 1d86147..7f2aa93 100644 --- a/R/predict.R +++ b/R/predict.R @@ -9,7 +9,7 @@ predict.bgnlm <- function (model, x, link=function(x) x, populations="last", qua else if (populations=="all") pops.use <- seq_len(length(model$populations)) else if (populations=="best") pops.use <- which.max(unlist(model$best.margs)) - # TODO: Generalize for multiple populations + # TODO: Generalize for multiple populations - Use population weights multiplied by model weights. models <- model$models[[pops.use]] features <- model$populations[[pops.use]] -- GitLab From 71083f54d6dd67d5ecf76388d15bb5ccdf8edf51 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Wed, 7 Dec 2022 20:41:45 +0100 Subject: [PATCH 05/13] Save pruned results in merge.results. Predict for multiple populations implemented. --- R/parallel.R | 7 +++---- R/predict.R | 56 +++++++++++++++++++++++++++++++--------------------- R/results.R | 20 ++++++++++++++++--- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/R/parallel.R b/R/parallel.R index 9f23e0e..7fc8ef8 100644 --- a/R/parallel.R +++ b/R/parallel.R @@ -8,12 +8,11 @@ mjmcmc.parallel <- function (runs, cores=getOption("mc.cores", 2L), ...) { } -#' Run multiple gmjmcmc runs in parallel, merging the results before returning. +#' Run multiple gmjmcmc runs in parallel returning a list of all results. #' @param runs The number of runs to run #' @param cores The number of cores to run on #' @param ... Parameters to pass to gmjmcmc -#' @return Merged results from multiple gmjmcmc runs +#' @return Results from multiple gmjmcmc runs gmjmcmc.parallel <- function (runs, cores=getOption("mc.cores", 2L), ...) { results <- mclapply(seq_len(runs), function (x) { gmjmcmc(...) }, mc.cores=cores) - merged <- merge.results(results) -} \ No newline at end of file +} diff --git a/R/predict.R b/R/predict.R index 7f2aa93..5075602 100644 --- a/R/predict.R +++ b/R/predict.R @@ -2,33 +2,43 @@ #' @param model The model to use. #' @param x The new data to use for the prediction, a matrix where each row is an observation. #' @param link The link function to use -#' @param populations The populations to use #' @param quantiles The quantiles to calculate credible intervals for the posterior moddes (in model space). -predict.bgnlm <- function (model, x, link=function(x) x, populations="last", quantiles=c(0.025, 0.5, 0.975)) { - if (populations=="last") pops.use <- length(model$populations) - else if (populations=="all") pops.use <- seq_len(length(model$populations)) - else if (populations=="best") pops.use <- which.max(unlist(model$best.margs)) - - # TODO: Generalize for multiple populations - Use population weights multiplied by model weights. - models <- model$models[[pops.use]] - features <- model$populations[[pops.use]] - - # Precalculate the features for the new data (c(0,1...) is because precalc features thinks there is an intercept and y col). - x.precalc <- precalc.features(cbind(0, 1, x), features)[, -1] - - # Get the renormalized marginal probabilities and indices of non-duplicate models - model.probs <- marginal.probs.renorm(models, "models") - models <- models[model.probs$idx] - - yhat <- matrix(NA, nrow=nrow(x), ncol=length(models)) - for (i in seq_along(models)) { - yhat[, i] <- link(x.precalc[, c(TRUE, models[[i]]$model), drop=FALSE] %*% models[[i]]$coefs) +predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, 0.975)) { + preds <- list() + for (i in seq_along(model$results)) { + preds[[i]] <- list() + for (j in seq_along(model$results[[i]]$populations)) { + # Select the models and features to predict from at this iteration + models <- model$results[[i]]$models[[j]] + features <- model$results[[i]]$populations[[j]] + model.probs <- model$results[[i]]$model.probs[[j]] + + # Precalculate the features for the new data (c(0,1...) is because precalc features thinks there is an intercept and y col). + x.precalc <- precalc.features(cbind(0, 1, x), features)[, -1] + + yhat <- matrix(NA, nrow=nrow(x), ncol=length(models)) + for (k in seq_along(models)) { + yhat[, k] <- link(x.precalc[, c(TRUE, models[[k]]$model), drop=FALSE] %*% models[[k]]$coefs) + } + + mean.pred <- rowSums(yhat * as.numeric(model.probs)) + pred.quant <- apply(yhat, 1, weighted.quantiles, weights=model.probs, prob=quantiles) + + preds[[i]][[j]] <- list(mean=mean.pred, quantiles=pred.quant) + } } - mean.pred <- rowSums(yhat * as.numeric(model.probs$probs)) - quantiles <- apply(yhat, 1, weighted.quantiles, weights=model.probs$probs, prob=quantiles) + aggr <- list() + aggr$mean <- 0 * preds[[1]][[1]]$mean + aggr$quantiles <- 0 * preds[[1]][[1]]$quantiles + for (i in seq_along(preds)) { + for (j in seq_along(preds[[i]])) { + aggr$mean <- aggr$mean + preds[[i]][[j]]$mean * model$results[[i]]$pop.weights[j] + aggr$quantiles <- aggr$quantiles + preds[[i]][[j]]$quantiles * model$results[[i]]$pop.weights[j] + } + } - return(list(mean=mean.pred, quantiles=quantiles)) + return(list(aggr=aggr, preds=preds)) } #' Calculate weighted quantiles diff --git a/R/results.R b/R/results.R index 1f4f48f..287cdd2 100644 --- a/R/results.R +++ b/R/results.R @@ -31,11 +31,25 @@ merge.results <- function (results, populations="last", complex.measure=1, tol=0 # Collect all features and their renormalized weighted values features <- vector("list") renorms <- vector("list") + weight_idx <- 1 for (i in 1:res.count) { + results[[i]]$pop.weights <- rep(NA, length(results[[i]]$populations)) + results[[i]]$model.probs <- list() for (pop in pops.use[[i]]) { features <- append(features, results[[i]]$populations[[pop]]) - renorms <- append(renorms, pop.weights[i]*results[[i]]$marg.probs[[pop]]) + renorms <- append(renorms, pop.weights[weight_idx] * results[[i]]$marg.probs[[pop]]) + results[[i]]$pop.weights[pop] <- pop.weights[weight_idx] + weight_idx <- weight_idx + 1 + + model.probs <- marginal.probs.renorm(results[[i]]$models[[pop]], "models") + results[[i]]$model.probs[[pop]] <- model.probs$probs + results[[i]]$models[[pop]] <- results[[i]]$models[[pop]][model.probs$idx] } + accept.tot <- results[[i]]$accept.tot + best <- results[[i]]$best + results[[i]] <- lapply(results[[i]], function (x) x[pops.use[[i]]]) + results[[i]]$accept.tot <- accept.tot + results[[i]]$best <- best } renorms <- unlist(renorms) na.feats <- which(is.na(renorms)) @@ -76,8 +90,8 @@ merge.results <- function (results, populations="last", complex.measure=1, tol=0 counts <- sapply(feats.simplest.ids, function(x) sum(feats.map[1,] == x)) feats.simplest <- features[feats.simplest.ids] importance <- feats.map[4, feats.simplest.ids] - merged <- list(features=feats.simplest, marg.probs=importance, counts=counts) - attr(merged, "class") <- "gmjmcmcresult" + merged <- list(features=feats.simplest, marg.probs=importance, counts=counts, results=results) + attr(merged, "class") <- "bgnlm" return(merged) } -- GitLab From 039ef87b4d68deaaffe91868417f5264cdffce4a Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Wed, 7 Dec 2022 21:16:33 +0100 Subject: [PATCH 06/13] Add population weights to population-wise predictions in predict fun. --- R/predict.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/predict.R b/R/predict.R index 5075602..c6fc29e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -24,7 +24,7 @@ predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, mean.pred <- rowSums(yhat * as.numeric(model.probs)) pred.quant <- apply(yhat, 1, weighted.quantiles, weights=model.probs, prob=quantiles) - preds[[i]][[j]] <- list(mean=mean.pred, quantiles=pred.quant) + preds[[i]][[j]] <- list(mean=mean.pred, quantiles=pred.quant, weight=model$results[[i]]$pop.weights[j]) } } -- GitLab From fbea64d71124728b5bf31f4d8bae19cb51794033 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Wed, 7 Dec 2022 21:22:32 +0100 Subject: [PATCH 07/13] Update docs. --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/gmjmcmc_support.R | 2 +- R/predict.R | 7 ++++++- man/gaussian.loglik.bic.irlssgd.Rd | 22 ++++++++++++++++++++++ man/gmjmcmc.parallel.Rd | 21 +++++++++++++++++++++ man/marginal.probs.renorm.Rd | 16 ++++++++++++++++ man/merge.results.Rd | 5 ++++- man/mjmcmc.Rd | 2 ++ man/mjmcmc.parallel.Rd | 21 +++++++++++++++++++++ man/mjmcmc.prop.Rd | 5 +++++ man/predict.bgnlm.Rd | 20 ++++++++++++++++++++ man/to3.Rd | 17 +++++++++++++++++ man/var.loglik.Rd | 22 ++++++++++++++++++++++ man/weighted.quantiles.Rd | 21 +++++++++++++++++++++ 15 files changed, 181 insertions(+), 4 deletions(-) create mode 100644 man/gaussian.loglik.bic.irlssgd.Rd create mode 100644 man/gmjmcmc.parallel.Rd create mode 100644 man/marginal.probs.renorm.Rd create mode 100644 man/mjmcmc.parallel.Rd create mode 100644 man/predict.bgnlm.Rd create mode 100644 man/to3.Rd create mode 100644 man/var.loglik.Rd create mode 100644 man/weighted.quantiles.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 4b33ea6..1762512 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,5 +21,5 @@ Suggests: knitr, rmarkdown, markdown -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.2 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index af7dd4b..236691c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(expi) export(gauss) export(gaussian.loglik) export(gaussian.loglik.alpha) +export(gaussian.loglik.bic.irlssgd) export(gen.params.list) export(gen.probs.list) export(gmjmcmc) @@ -21,6 +22,7 @@ export(logistic.loglik.alpha) export(merge.results) export(mjmcmc) export(model.string) +export(predict.bgnlm) export(set.transforms) export(sigmoid) export(sini) diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index 54feff9..799aae6 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -42,7 +42,7 @@ marginal.probs <- function (models) { #' Function for calculating feature importance through renormalized model estimates #' @param models The models to use. -#' @oaram type Select which probabilities are of interest, features or models +#' @param type Select which probabilities are of interest, features or models marginal.probs.renorm <- function (models, type="features") { models <- lapply(models, function (x) x[c("model", "crit")]) model.size <- length(models[[1]]$model) diff --git a/R/predict.R b/R/predict.R index c6fc29e..abcaaea 100644 --- a/R/predict.R +++ b/R/predict.R @@ -1,8 +1,11 @@ #' Predict using a BGNLM model. +#' #' @param model The model to use. #' @param x The new data to use for the prediction, a matrix where each row is an observation. #' @param link The link function to use #' @param quantiles The quantiles to calculate credible intervals for the posterior moddes (in model space). +#' +#' @export predict.bgnlm predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, 0.975)) { preds <- list() for (i in seq_along(model$results)) { @@ -42,10 +45,12 @@ predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, } #' Calculate weighted quantiles +#' #' @param values The values to use #' @param weights The weights of the values #' @param prob The probabilities of the quantiles to use -#' @return +#' +#' @return Weighted quantiles weighted.quantiles <- function (values, weights, prob=c(0.025, 0.975)) { ordered <- order(values) P <- cumsum(weights[ordered]) diff --git a/man/gaussian.loglik.bic.irlssgd.Rd b/man/gaussian.loglik.bic.irlssgd.Rd new file mode 100644 index 0000000..7686eb3 --- /dev/null +++ b/man/gaussian.loglik.bic.irlssgd.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/likelihoods.R +\name{gaussian.loglik.bic.irlssgd} +\alias{gaussian.loglik.bic.irlssgd} +\title{Log likelihood function for gaussian regression with a prior p(m)=r*sum(total_width), using subsampling.} +\usage{ +gaussian.loglik.bic.irlssgd(y, x, model, complex, params) +} +\arguments{ +\item{y}{A vector containing the dependent variable} + +\item{x}{The matrix containing the precalculated features} + +\item{model}{The model to estimate as a logical vector} + +\item{complex}{A list of complexity measures for the features} + +\item{params}{A list of parameters for the log likelihood, supplied by the user} +} +\description{ +Log likelihood function for gaussian regression with a prior p(m)=r*sum(total_width), using subsampling. +} diff --git a/man/gmjmcmc.parallel.Rd b/man/gmjmcmc.parallel.Rd new file mode 100644 index 0000000..350fbda --- /dev/null +++ b/man/gmjmcmc.parallel.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parallel.R +\name{gmjmcmc.parallel} +\alias{gmjmcmc.parallel} +\title{Run multiple gmjmcmc runs in parallel returning a list of all results.} +\usage{ +gmjmcmc.parallel(runs, cores = getOption("mc.cores", 2L), ...) +} +\arguments{ +\item{runs}{The number of runs to run} + +\item{cores}{The number of cores to run on} + +\item{...}{Parameters to pass to gmjmcmc} +} +\value{ +Results from multiple gmjmcmc runs +} +\description{ +Run multiple gmjmcmc runs in parallel returning a list of all results. +} diff --git a/man/marginal.probs.renorm.Rd b/man/marginal.probs.renorm.Rd new file mode 100644 index 0000000..be55d84 --- /dev/null +++ b/man/marginal.probs.renorm.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gmjmcmc_support.R +\name{marginal.probs.renorm} +\alias{marginal.probs.renorm} +\title{Function for calculating feature importance through renormalized model estimates} +\usage{ +marginal.probs.renorm(models, type = "features") +} +\arguments{ +\item{models}{The models to use.} + +\item{type}{Select which probabilities are of interest, features or models} +} +\description{ +Function for calculating feature importance through renormalized model estimates +} diff --git a/man/merge.results.Rd b/man/merge.results.Rd index 92b0201..66db51b 100644 --- a/man/merge.results.Rd +++ b/man/merge.results.Rd @@ -6,7 +6,7 @@ This function will weight the features based on the best mlik in that population and merge the results together, simplifying by merging equivalent features (having high correlation).} \usage{ -\method{merge}{results}(results, populations = "last", complex.measure = 1, tol = 0) +\method{merge}{results}(results, populations = "last", complex.measure = 1, tol = 0, data = NULL) } \arguments{ \item{results}{A list containing multiple results from GMJMCMC.} @@ -15,6 +15,9 @@ and merge the results together, simplifying by merging equivalent features (havi 1=total width, 2=operation count and 3=depth.} \item{tol}{The tolerance to use for the correlation when finding equivalent features, default is 0.} + +\item{data}{Data to use when comparing features, default is NULL meaning that mock data will be generated, +if data is supplied it should be of the same form as is required by gmjmcmc, i.e. with both x, y and an intercept.} } \description{ Merge a list of multiple results from many runs diff --git a/man/mjmcmc.Rd b/man/mjmcmc.Rd index 72bf490..086bc72 100644 --- a/man/mjmcmc.Rd +++ b/man/mjmcmc.Rd @@ -18,6 +18,8 @@ and the rest of the columns should be the independent variables.} \item{probs}{A list of the various probability vectors to use} \item{params}{A list of the various parameters for all the parts of the algorithm} + +\item{sub}{An indicator that if the likelihood is inexact and should be improved each model visit (EXPERIMENTAL!)} } \description{ Main algorithm for MJMCMC diff --git a/man/mjmcmc.parallel.Rd b/man/mjmcmc.parallel.Rd new file mode 100644 index 0000000..83fecd7 --- /dev/null +++ b/man/mjmcmc.parallel.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parallel.R +\name{mjmcmc.parallel} +\alias{mjmcmc.parallel} +\title{Run multiple mjmcmc runs in parallel, merging the results before returning.} +\usage{ +mjmcmc.parallel(runs, cores = getOption("mc.cores", 2L), ...) +} +\arguments{ +\item{runs}{The number of runs to run} + +\item{cores}{The number of cores to run on} + +\item{...}{Parameters to pass to mjmcmc} +} +\value{ +Merged results from multiple mjmcmc runs +} +\description{ +Run multiple mjmcmc runs in parallel, merging the results before returning. +} diff --git a/man/mjmcmc.prop.Rd b/man/mjmcmc.prop.Rd index ce36faa..70b4559 100644 --- a/man/mjmcmc.prop.Rd +++ b/man/mjmcmc.prop.Rd @@ -9,6 +9,7 @@ mjmcmc.prop( loglik.pi, model.cur, complex, + pip_estimate, probs, params, visited.models = NULL @@ -21,6 +22,10 @@ mjmcmc.prop( \item{model.cur}{The current model to make the proposal respective to} +\item{complex}{The complexity measures used when evaluating the marginal likelihood} + +\item{pip_estimate}{The current posterior inclusion probability estimate, used for proposals} + \item{probs}{A list of the various probability vectors to use} \item{params}{A list of the various parameters for all the parts of the algorithm} diff --git a/man/predict.bgnlm.Rd b/man/predict.bgnlm.Rd new file mode 100644 index 0000000..cd788f9 --- /dev/null +++ b/man/predict.bgnlm.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{predict.bgnlm} +\alias{predict.bgnlm} +\title{Predict using a BGNLM model.} +\usage{ +\method{predict}{bgnlm}(model, x, link = function(x) x, quantiles = c(0.025, 0.5, 0.975)) +} +\arguments{ +\item{model}{The model to use.} + +\item{x}{The new data to use for the prediction, a matrix where each row is an observation.} + +\item{link}{The link function to use} + +\item{quantiles}{The quantiles to calculate credible intervals for the posterior moddes (in model space).} +} +\description{ +Predict using a BGNLM model. +} diff --git a/man/to3.Rd b/man/to3.Rd new file mode 100644 index 0000000..a7a5f86 --- /dev/null +++ b/man/to3.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nonlinear_functions.R +\name{to3} +\alias{to3} +\title{To 3 power} +\usage{ +to3(x) +} +\arguments{ +\item{x}{The vector of values} +} +\value{ +x^(3) +} +\description{ +To 3 power +} diff --git a/man/var.loglik.Rd b/man/var.loglik.Rd new file mode 100644 index 0000000..a751fb0 --- /dev/null +++ b/man/var.loglik.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/likelihoods.R +\name{var.loglik} +\alias{var.loglik} +\title{Log likelihood function for a VAR model} +\usage{ +var.loglik(y, x, model, complex, params) +} +\arguments{ +\item{y}{A matrix containing the dependent variables} + +\item{x}{The matrix containing the precalculated features} + +\item{model}{The model to estimate as a logical vector} + +\item{complex}{A list of complexity measures for the features} + +\item{params}{A list of parameters for the log likelihood, supplied by the user} +} +\description{ +Log likelihood function for a VAR model +} diff --git a/man/weighted.quantiles.Rd b/man/weighted.quantiles.Rd new file mode 100644 index 0000000..1901bb1 --- /dev/null +++ b/man/weighted.quantiles.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{weighted.quantiles} +\alias{weighted.quantiles} +\title{Calculate weighted quantiles} +\usage{ +weighted.quantiles(values, weights, prob = c(0.025, 0.975)) +} +\arguments{ +\item{values}{The values to use} + +\item{weights}{The weights of the values} + +\item{prob}{The probabilities of the quantiles to use} +} +\value{ +Weighted quantiles +} +\description{ +Calculate weighted quantiles +} -- GitLab From b1ac47bb33db41cdc8b1858180a8197fc24720ad Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 23 Jan 2023 21:26:41 +0100 Subject: [PATCH 08/13] Fix mean calculation in predict function. --- R/predict.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/predict.R b/R/predict.R index abcaaea..2a457ba 100644 --- a/R/predict.R +++ b/R/predict.R @@ -7,6 +7,7 @@ #' #' @export predict.bgnlm predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, 0.975)) { + x <- as.matrix(x) preds <- list() for (i in seq_along(model$results)) { preds[[i]] <- list() @@ -24,7 +25,7 @@ predict.bgnlm <- function (model, x, link=function(x) x, quantiles=c(0.025, 0.5, yhat[, k] <- link(x.precalc[, c(TRUE, models[[k]]$model), drop=FALSE] %*% models[[k]]$coefs) } - mean.pred <- rowSums(yhat * as.numeric(model.probs)) + mean.pred <- rowSums(yhat %*% diag(as.numeric(model.probs))) pred.quant <- apply(yhat, 1, weighted.quantiles, weights=model.probs, prob=quantiles) preds[[i]][[j]] <- list(mean=mean.pred, quantiles=pred.quant, weight=model$results[[i]]$pop.weights[j]) -- GitLab From e00547d575efba737c64c42480282825c2ec682c Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 23 Jan 2023 22:18:32 +0100 Subject: [PATCH 09/13] Adjust logistic likelihood to take r as a parameter. --- R/likelihoods.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/likelihoods.R b/R/likelihoods.R index 18cb11a..7785303 100644 --- a/R/likelihoods.R +++ b/R/likelihoods.R @@ -15,9 +15,8 @@ #' #' @export logistic.loglik logistic.loglik <- function (y, x, model, complex, params) { - r <- 20/223 suppressWarnings({mod <- fastglm(as.matrix(x[,model]), y, family=binomial())}) - ret <- (-(mod$deviance -2*log(r)*sum(complex$oc)))/2 + ret <- (-(mod$deviance -2*log(params$r)*sum(complex$oc)))/2 return(list(crit=ret, coefs=mod$coefficients)) } -- GitLab From fdbbc45f42dc29e99de13d6019140c44463ce9de Mon Sep 17 00:00:00 2001 From: Aliaksandr Date: Thu, 9 Feb 2023 13:12:51 +0100 Subject: [PATCH 10/13] adding two tuning parameters: 1. for bart models 2. for controlling whether we want to check colinearity --- R/alpha_generation.R | 4 ++-- R/arguments.R | 5 +++-- R/feature_generation.R | 15 +++++++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/R/alpha_generation.R b/R/alpha_generation.R index 593aa68..0abc18c 100644 --- a/R/alpha_generation.R +++ b/R/alpha_generation.R @@ -53,8 +53,8 @@ alpha_3 <- function (feature, data, loglik) { else done <- TRUE } if (sum(sares$par==0) == featfun$count) { - print("All zero feature occured.") - return(NULL) + print("All zero feature occured.") + return(NULL) } # Inject the new alphas into the feature feature <- update.alphas(feature, sares$par) diff --git a/R/arguments.R b/R/arguments.R index 309be23..62c8de5 100644 --- a/R/arguments.R +++ b/R/arguments.R @@ -80,8 +80,9 @@ gen.params.list <- function (data, G=F) { keep.org = F, # Always keep original covariates in every population prel.filter = 0, # Filtration threshold for first population (i.e. filter covariates even if keep.org=T) keep.min = 0.8, # Minimum proportion of features to always keep [0,1] - eps = 0.05) # Inclusion probability limit for feature generation - + eps = 0.05, # Inclusion probability limit for feature generation + col = T, # Whether the colinearity should be checked + bart = F) # Whether bart is used params$feat <- feat_params params$rescale.large <- F diff --git a/R/feature_generation.R b/R/feature_generation.R index 399827c..a3b79c7 100644 --- a/R/feature_generation.R +++ b/R/feature_generation.R @@ -18,8 +18,11 @@ gen.modification <- function (features, marg.probs, trans.probs) { } # Generate a projection feature -gen.projection <- function (features, marg.probs, trans.probs, max.width) { - feat.count <- sample.int(n = (min(max.width, (length(features)))-1), size = 1) + 1 # TODO: Should be a specific distribution? +gen.projection <- function (features, marg.probs, trans.probs, max.width,bart) { + feat.count <- 1 + if(!bart) feat.count <- sample.int(n = (min(max.width, (length(features)))-1), size = 1) + 1 # TODO: Should be a specific distribution? + + feats <- sample.int(n = length(features), size = feat.count, prob = marg.probs) trans <- sample.int(n = length(trans.probs), size = 1, prob = trans.probs) # TODO: Generate alphas properly using various methods @@ -41,7 +44,7 @@ gen.feature <- function (features, marg.probs, data, loglik.alpha, transforms, p feat.type <- sample.int(n = 4, size = 1, prob = probs$gen) if (feat.type == 1) feat <- gen.multiplication(features, marg.probs) if (feat.type == 2) feat <- gen.modification(features, marg.probs, probs$trans) - if (feat.type == 3) feat <- gen.projection(features, marg.probs, probs$trans, params$L) + if (feat.type == 3) feat <- gen.projection(features, marg.probs, probs$trans, params$L,params$bart) if (feat.type == 4) feat <- gen.new(features, F.0.size) # Check that the feature is not too wide or deep if (!(depth.feature(feat) > params$D || width.feature(feat) > params$L)) { @@ -53,7 +56,11 @@ gen.feature <- function (features, marg.probs, data, loglik.alpha, transforms, p # Check for linear dependence of new the feature if (length(features) == F.0.size) feats <- list() else feats <- features[(F.0.size+1):length(features)] - if (!check.collinearity(feat, feats, F.0.size)) feat.ok <- T + if(params$col) + { + if (!check.collinearity(feat, feats, F.0.size)) + feat.ok <- T + }else feat.ok <- T } } tries <- tries + 1 -- GitLab From 03071030ef069ca1bfdae0f70fcefc863d89d8d5 Mon Sep 17 00:00:00 2001 From: Aliaksandr Date: Fri, 10 Feb 2023 17:15:26 +0100 Subject: [PATCH 11/13] returning an object of interest from summary function --- R/results.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/results.R b/R/results.R index 287cdd2..a9ad083 100644 --- a/R/results.R +++ b/R/results.R @@ -137,6 +137,9 @@ summary.gmjmcmcresult <- function (results, pop="last") { print.dist(marg.probs, feats.strings, -1) # Print the best marginal likelihood cat("\nBest marginal likelihood: ", results$best, "\n") + + ord.marg = order(marg.probs[1,],decreasing = T) + return(data.frame(feats.strings = feats.strings[ord.marg], marg.probs = marg.probs[1,ord.marg])) } #' Function to plot the results, works both for results from gmjmcmc and -- GitLab From f0577c64687d5f83ab1a85ee2f11e73c832e2021 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sat, 11 Feb 2023 13:05:53 +0100 Subject: [PATCH 12/13] Rename argument which turns on or off collinearity checks when generating features -> "check.col". Add an argument which limits the size of projections, can be set to null and is defaulted to be equal to L. --- R/alpha_generation.R | 4 ++-- R/arguments.R | 10 +++++----- R/feature_generation.R | 21 ++++++++++----------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/R/alpha_generation.R b/R/alpha_generation.R index 0abc18c..593aa68 100644 --- a/R/alpha_generation.R +++ b/R/alpha_generation.R @@ -53,8 +53,8 @@ alpha_3 <- function (feature, data, loglik) { else done <- TRUE } if (sum(sares$par==0) == featfun$count) { - print("All zero feature occured.") - return(NULL) + print("All zero feature occured.") + return(NULL) } # Inject the new alphas into the feature feature <- update.alphas(feature, sares$par) diff --git a/R/arguments.R b/R/arguments.R index 62c8de5..3d98cc1 100644 --- a/R/arguments.R +++ b/R/arguments.R @@ -74,15 +74,15 @@ gen.params.list <- function (data, G=F) { # Add GMJMCMC specific parameters if (G) { ## GM parameters - feat_params <- list(D=5, L=15, # Hard limits on feature complexity - alpha=0, # alpha strategy (0=None, 1,2,3=strategies as per Hubin et al.) TODO: Fully Bayesian - pop.max=as.integer(ncov*1.5), # Max features population size + feat_params <- list(D = 5, L = 15, # Hard limits on feature complexity + alpha = 0, # alpha strategy (0=None, 1,2,3=strategies as per Hubin et al.) TODO: Fully Bayesian + pop.max = as.integer(ncov * 1.5), # Max features population size keep.org = F, # Always keep original covariates in every population prel.filter = 0, # Filtration threshold for first population (i.e. filter covariates even if keep.org=T) keep.min = 0.8, # Minimum proportion of features to always keep [0,1] eps = 0.05, # Inclusion probability limit for feature generation - col = T, # Whether the colinearity should be checked - bart = F) # Whether bart is used + check.col = T, # Whether the colinearity should be checked + max.proj.size = 15) # Maximum projection size params$feat <- feat_params params$rescale.large <- F diff --git a/R/feature_generation.R b/R/feature_generation.R index a3b79c7..4e21ab2 100644 --- a/R/feature_generation.R +++ b/R/feature_generation.R @@ -18,11 +18,11 @@ gen.modification <- function (features, marg.probs, trans.probs) { } # Generate a projection feature -gen.projection <- function (features, marg.probs, trans.probs, max.width,bart) { - feat.count <- 1 - if(!bart) feat.count <- sample.int(n = (min(max.width, (length(features)))-1), size = 1) + 1 # TODO: Should be a specific distribution? - - +gen.projection <- function (features, marg.probs, trans.probs, max.width, max.size) { + if (!is.null(max.size)) { + max.width <- min(max.width, max.size + 1) + } + feat.count <- sample.int(n = (min(max.width, (length(features)))-1), size = 1) # TODO: Should be a specific distribution? feats <- sample.int(n = length(features), size = feat.count, prob = marg.probs) trans <- sample.int(n = length(trans.probs), size = 1, prob = trans.probs) # TODO: Generate alphas properly using various methods @@ -44,7 +44,7 @@ gen.feature <- function (features, marg.probs, data, loglik.alpha, transforms, p feat.type <- sample.int(n = 4, size = 1, prob = probs$gen) if (feat.type == 1) feat <- gen.multiplication(features, marg.probs) if (feat.type == 2) feat <- gen.modification(features, marg.probs, probs$trans) - if (feat.type == 3) feat <- gen.projection(features, marg.probs, probs$trans, params$L,params$bart) + if (feat.type == 3) feat <- gen.projection(features, marg.probs, probs$trans, params$L, params$max.proj.size) if (feat.type == 4) feat <- gen.new(features, F.0.size) # Check that the feature is not too wide or deep if (!(depth.feature(feat) > params$D || width.feature(feat) > params$L)) { @@ -56,11 +56,10 @@ gen.feature <- function (features, marg.probs, data, loglik.alpha, transforms, p # Check for linear dependence of new the feature if (length(features) == F.0.size) feats <- list() else feats <- features[(F.0.size+1):length(features)] - if(params$col) - { - if (!check.collinearity(feat, feats, F.0.size)) - feat.ok <- T - }else feat.ok <- T + if (params$check.col && !check.collinearity(feat, feats, F.0.size)) + feat.ok <- T + else if (!params$check.col) + feat.ok <- T } } tries <- tries + 1 -- GitLab From a7d23d95afad085259525aad8f42e608ada5fe79 Mon Sep 17 00:00:00 2001 From: jonlachmann Date: Sat, 11 Feb 2023 13:22:52 +0100 Subject: [PATCH 13/13] Initialize simulated annealing with random numbers to avoid all zero features. Add an option to round alphas when printing features and use 2 decimal places everywhere a feature is just printed for display. --- R/alpha_generation.R | 6 +++--- R/feature.R | 10 ++++++++-- R/gmjmcmc.R | 8 ++++---- R/gmjmcmc_support.R | 2 +- R/results.R | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/R/alpha_generation.R b/R/alpha_generation.R index 593aa68..ff657df 100644 --- a/R/alpha_generation.R +++ b/R/alpha_generation.R @@ -40,12 +40,12 @@ alpha_3 <- function (feature, data, loglik) { if(featfun$count == 0) return(feature) # Set initial range for Simulated Annealing - print("Generating alphas") + cat("Generating alphas\n") range <- 10 done <- FALSE while(!done) { # Run simulated annealing on current range - sares <- GenSA(rep(0,featfun$count), loglik, + sares <- GenSA(rnorm(featfun$count), loglik, rep(-range/2,featfun$count), rep(range/2,featfun$count), control=list(max.call=5e3), data, featfun$formula) # Check if any estimate is on the edge of the range, if so, extend the range and run again @@ -53,7 +53,7 @@ alpha_3 <- function (feature, data, loglik) { else done <- TRUE } if (sum(sares$par==0) == featfun$count) { - print("All zero feature occured.") + cat("All zero feature occured.\n") return(NULL) } # Inject the new alphas into the feature diff --git a/R/feature.R b/R/feature.R index ba0c9d1..a707641 100644 --- a/R/feature.R +++ b/R/feature.R @@ -97,9 +97,11 @@ update.alphas <- function (feature, alphas, recurse=FALSE) { #' @param transforms The list of transforms to use #' @param dataset Set the regular covariates as columns in a dataset #' @param alphas Print a "?" instead of actual alphas to prepare the output for alpha estimation +#' @param labels Should the covariates be named, or just referred to as their place in the data.frame. +#' @param round Should numbers be rounded when printing? Default is FALSE, otherwise it can be set to the number of decimal places. #' #' @export -print.feature <- function (feature, dataset=F, alphas=F, labels=F) { +print.feature <- function (feature, dataset=F, alphas=F, labels=F, round=F) { transforms <- getOption("gmjmcmc-transformations") if(is.null(transforms)) stop("Please set the gmjmcmc-transformations option to your non-linear functions (see ?set.transforms).") fString <- "" @@ -115,6 +117,10 @@ print.feature <- function (feature, dataset=F, alphas=F, labels=F) { op <- "*" fString <- paste0(fString, "(") } + # If we are printing rounded features for neat output, round all alphas + if (round) { + feat[,3] <- round(feat[,3], round) + } for (j in seq_len(nrow(feat))) { # No plus or multiplication sign on the last one if (j == nrow(feat)) op <- "" @@ -131,7 +137,7 @@ print.feature <- function (feature, dataset=F, alphas=F, labels=F) { if (alphas) fString <- paste0(fString, "?*") else fString <- paste0(fString, feat[j,3], "*") } - fString <- paste0(fString, print.feature(feature[[feat[j,2]]], dataset, alphas, labels), op) + fString <- paste0(fString, print.feature(feature[[feat[j,2]]], dataset, alphas, labels, round), op) } } fString <- paste0(fString, ")") diff --git a/R/gmjmcmc.R b/R/gmjmcmc.R index 51f6568..289d831 100644 --- a/R/gmjmcmc.R +++ b/R/gmjmcmc.R @@ -76,7 +76,7 @@ gmjmcmc <- function (data, loglik.pi, loglik.alpha, transforms, T, N.init, N.fin # Print the marginal posterior distribution of the features after MJMCMC cat(paste("\rCurrent best crit:", mjmcmc_res$best.crit, "\n")) cat("Feature importance:\n") - print.dist(marg.probs[[t]], sapply(S[[t]], print.feature, labels=labels), probs$filter) + print.dist(marg.probs[[t]], sapply(S[[t]], print.feature, labels = labels, round = 2), probs$filter) if (params$rescale.large) prev.large <- params$large # Generate a new population of features for the next iteration (if this is not the last) if (t != T) { @@ -148,14 +148,14 @@ gmjmcmc.transition <- function (S.t, F.0, data, loglik.alpha, marg.probs.F.0, ma # Perform the replacements for (i in feats.replace) { prev.size <- length(S.t) - prev.feat.string <- print.feature(S.t[[i]], labels=labels) + prev.feat.string <- print.feature(S.t[[i]], labels=labels, round = 2) S.t[[i]] <- gen.feature(c(F.0, S.t), marg.probs.use, data, loglik.alpha, transforms, probs, length(F.0), params) if (prev.size > length(S.t)) { cat("Removed feature", prev.feat.string, "\n") cat("Population shrinking, returning.\n") return(S.t) } - cat("Replaced feature", prev.feat.string, "with", print.feature(S.t[[i]], labels=labels), "\n") + cat("Replaced feature", prev.feat.string, "with", print.feature(S.t[[i]], labels=labels, round = 2), "\n") feats.keep[i] <- T marg.probs.use[i] <- mean(marg.probs.use) } @@ -169,7 +169,7 @@ gmjmcmc.transition <- function (S.t, F.0, data, loglik.alpha, marg.probs.F.0, ma cat("Population not growing, returning.\n") return(S.t) } - cat("Added feature", print.feature(S.t[[i]], labels=labels), "\n") + cat("Added feature", print.feature(S.t[[i]], labels=labels, round = 2), "\n") marg.probs.use <- c(marg.probs.use, params$eps) } } diff --git a/R/gmjmcmc_support.R b/R/gmjmcmc_support.R index 799aae6..ce54548 100644 --- a/R/gmjmcmc_support.R +++ b/R/gmjmcmc_support.R @@ -98,7 +98,7 @@ summary.gmjresult <- function (results, population="last") { else pops <- population feature_strings <- vector("list", length(results$populations[[pops]])) for (i in seq_along(feature_strings)) { - feature_strings[[i]] <- print.feature(results$populations[[pops]][[i]]) + feature_strings[[i]] <- print.feature(results$populations[[pops]][[i]], round = 2) } feature_importance <- marginal.probs.renorm(results$models[[pops]])$probs return(list(features=feature_strings, importance=feature_importance)) diff --git a/R/results.R b/R/results.R index a9ad083..4f16706 100644 --- a/R/results.R +++ b/R/results.R @@ -129,7 +129,7 @@ model.string <- function (model, features, link) { summary.gmjmcmcresult <- function (results, pop="last") { if (pop=="last") pop <- length(results$models) # Get features as strings for printing - feats.strings <- sapply(results$populations[[pop]], print.feature) + feats.strings <- sapply(results$populations[[pop]], print.feature, round = 2) # Get marginal posterior of features marg.probs <- marginal.probs.renorm(results$models[[pop]])$probs # Print the final distribution -- GitLab