diff --git a/DESCRIPTION b/DESCRIPTION index e61597c6..af5aab08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: clusterExperiment Title: Compare Clusterings for Single-Cell Sequencing -Version: 1.3.1 +Version: 1.3.2 Description: Provides functionality for running and comparing many different clusterings of single-cell sequencing data or other large mRNA Expression data sets. Authors@R: c(person("Elizabeth", "Purdom", email = "epurdom@stat.berkeley.edu", @@ -29,15 +29,47 @@ Imports: matrixStats, graphics, parallel, - MAST, - RSpectra + RSpectra, + kernlab Suggests: BiocStyle, knitr, diagram, testthat, - scRNAseq + scRNAseq, + MAST VignetteBuilder: knitr LazyData: true RoxygenNote: 6.0.1 biocViews: Clustering, RNASeq, Sequencing, Software, SingleCell +Collate: + 'AllClasses.R' + 'AllGenerics.R' + 'AllHelper.R' + 'AllHelperClusterFunction.R' + 'JiashinJiCode.R' + 'addClusters.R' + 'internalClusterFunctions.R' + 'internalFunctions.R' + 'builtInClusterFunctions.R' + 'clusterContrasts.R' + 'clusterLabels.R' + 'clusterMany.R' + 'clusterSingle.R' + 'combineMany.R' + 'dataCreation.R' + 'getFeatures.R' + 'mainClustering.R' + 'makeBlankData.R' + 'makeDendrogram.R' + 'mergeClusters.R' + 'plotBarplot.R' + 'plotClusters.R' + 'plotDendrogram.R' + 'plotHeatmap.R' + 'plottingHelpers.R' + 'rsec.R' + 'seqCluster.R' + 'subsampleClustering.R' + 'transformFunction.R' + 'workflowClusters.R' diff --git a/NAMESPACE b/NAMESPACE index 45925c2b..81bb8a5a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,12 @@ # Generated by roxygen2: do not edit by hand export(bigPalette) -export(clusterD) export(clusterExperiment) +export(clusterFunction) +export(internalFunctionCheck) +export(listBuiltInFunctions) +export(listBuiltInType01) +export(listBuiltInTypeK) export(makeBlankData) export(nFeatures) export(nSamples) @@ -15,8 +19,8 @@ export(seqPal5) export(setBreaks) export(showBigPalette) export(showHeatmapPalettes) -export(subsampleClustering) exportClasses(ClusterExperiment) +exportClasses(ClusterFunction) exportMethods("[") exportMethods("clusterLabels<-") exportMethods("clusterLegend<-") @@ -27,8 +31,10 @@ exportMethods("primaryClusterIndex<-") exportMethods("transformation<-") exportMethods(RSEC) exportMethods(addClusters) +exportMethods(algorithmType) exportMethods(clusterContrasts) exportMethods(clusterExperiment) +exportMethods(clusterFunction) exportMethods(clusterInfo) exportMethods(clusterLabels) exportMethods(clusterLegend) @@ -42,6 +48,10 @@ exportMethods(combineMany) exportMethods(convertClusterLegend) exportMethods(dendroClusterIndex) exportMethods(getBestFeatures) +exportMethods(getBuiltInFunction) +exportMethods(getPostProcessingArgs) +exportMethods(inputType) +exportMethods(mainClustering) exportMethods(makeDendrogram) exportMethods(mergeClusters) exportMethods(nClusters) @@ -58,9 +68,11 @@ exportMethods(primaryClusterIndex) exportMethods(primaryClusterNamed) exportMethods(removeClusters) exportMethods(removeUnclustered) +exportMethods(requiredArgs) exportMethods(setToCurrent) exportMethods(setToFinal) exportMethods(show) +exportMethods(subsampleClustering) exportMethods(transform) exportMethods(transformation) exportMethods(workflowClusterDetails) @@ -71,7 +83,6 @@ import(limma) import(methods) importClassesFrom(SummarizedExperiment,SummarizedExperiment) importClassesFrom(phylobase,phylo4) -importFrom(MAST,Hypothesis) importFrom(NMF,aheatmap) importFrom(RColorBrewer,brewer.pal) importFrom(RColorBrewer,brewer.pal.info) @@ -85,6 +96,7 @@ importFrom(dendextend,as.phylo.dendrogram) 
importFrom(graphics,plot) importFrom(howmany,howmany) importFrom(howmany,lowerbound) +importFrom(kernlab,specc) importFrom(limma,makeContrasts) importFrom(locfdr,locfdr) importFrom(matrixStats,rowVars) @@ -98,8 +110,10 @@ importFrom(phylobase,nNodes) importFrom(phylobase,nodeLabels) importFrom(phylobase,rootNode) importFrom(phylobase,subset) +importFrom(stats,cutree) importFrom(stats,dist) importFrom(stats,hclust) +importFrom(stats,kmeans) importFrom(stats,mad) importFrom(stats,prcomp) importFrom(stats,sd) diff --git a/NEWS b/NEWS index baf83270..b6a9fb45 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,28 @@ -Changes in version 1.3.1 ( Release date: 2017-06-14 ) +Changes in version 1.3.2 ( Release date: 2017-07-05 )
==============
Changes:
+* The default for `top.can` in seqCluster has been changed to `top.can=5`.
+* makeDendrogram now has the default argument `ignoreUnassignedVar=TRUE`, as in RSEC.
+* Added the ClusterFunction class and updated all functions to work with it. All built-in cluster functions are now given as ClusterFunction objects, so all built-in clustering functions can now work for either `subsampleClustering` or `mainClustering`. This will also make it easier for a user to define their own ClusterFunction object and have it be used by functions like `clusterSingle`. This is a major change in how some of the underlying functions work, but should not impact common functions like `clusterMany` and `RSEC`. Some of the more notable changes in the arguments for programmers are:
+ - `clusterD` and `clusterDArgs` have been changed to `mainClustering` and `mainClusterArgs`. This change was made to make these arguments clearer as to their role in the clustering workflow (and because `clusterD` referred to clustering a dissimilarity, but it has clustered either x or D for many versions now).
+ - `seqCluster` and `clusterSingle` no longer take the argument `clusterFunction`. `clusterFunction` must be given via `mainClusterArgs` and `subsampleArgs` to be passed to `mainClustering` or `subsampleClustering`, respectively. Now only the upper-level function `clusterMany` takes `clusterFunction` directly as an argument.
+ - `mainClustering` (previously `clusterD`) and `subsampleClustering` no longer take `k` or `alpha` as direct arguments. These arguments, like all arguments used directly by the cluster function, now need to be passed to the clustering function in a list via `clusterArgs`.
+ - The list of available built-in clustering routines provided by the package can now be accessed via `listBuiltInFunctions()`. The underlying ClusterFunction objects for these routines are available to the user via the function `getBuiltInFunction` (see ?listBuiltInFunctions).
+* `hierarchical01` clustering now has a different default, namely to apply `as.dist` to the input `diss` in order to get a `dist` object, rather than `dist(1-diss)`, which was previously the default for historical reasons. This is controlled by the argument `whichHierDist`, and can be set to the previous behavior by passing `whichHierDist="dist"` to the `clusterArgs` argument in either `subsampleArgs` or `mainClusterArgs`, depending on where `hierarchical01` is being used.
+* Spectral clustering is now available (`"spectral"`) via the `specc` function of `kernlab`.
+* `clusterSingle` now only returns the dissimilarity matrix in the `coClustering` slot if `subsample=TRUE` in the call.
Furthermore, for the resulting dissimilarity to replace an existing `coClustering` slot value, the user must request it by setting `replaceCoClustering=TRUE` in the call to `clusterSingle`.
+* Removed the default value for the argument `proportion` in `combineMany`. The previous default was `proportion=1`, but it didn't match most common use cases and was accidentally invoked by upper-level functions like RSEC.
+* If the `clusterFunction` argument is not given to `subsampleArgs` by the user explicitly, and the `clusterFunction` of `mainClusterArgs` is appropriate, it will be used for `subsampleClustering`; if the `clusterFunction` in `mainClusterArgs` is not appropriate (e.g. `subsampleClustering` needs a type `K` because `sequential=TRUE`), then the default for `subsampleClustering` will be `'pam'`. This changes the previous behavior of `subsampleClustering`, where the default was 'pam' in all cases where not explicitly given by the user. This change should have no impact on RSEC: since the `clusterFunction` for the `mainClustering` step is a `'01'` type in RSEC and the `subsampleClustering` has to be type `'K'` when `sequential=TRUE`, it will revert to the default `"pam"` as before.
+
+Bugs:
+* Fixed an error where, if `clusterSingle` was called on an existing `clusterExperiment` object, it would overwrite the information of the existing object.
+* Fixed `RSEC` so that, if rerun on an existing `clusterExperiment` object, it grabs defaults from the matrix version (previously the defaults were those of the underlying function, which were not always the same; e.g. the `combineProportion` default was previously 1).
+* Fixed `clusterMany` so that it now explicitly sets `dimReduce="none"` in its call to `clusterSingle`. Before, it might have been running all of the `dimReduce` defaults (i.e. all of them!).
+* `plotBarplot` now gives an error if `whichClusters` doesn't match anything.
+
+Changes in version 1.3.1 ( Release date: 2017-06-14 )
+==============
+Changes:
* Changed how `plotHeatmap` handles the visualizeData argument, so it is not required to have the same number of genes as the original, only the same number of samples.
* Now if the color vectors given in `clusterLegend` do not have names, `plotHeatmap` will give them names matching the variable so that they will be used by `aheatmap` (previously all colors would have been left white because they did not have matching names).
* Large changes to how dendrograms are plotted by `plotDendrogram` and `mergeClusters`. This includes the ability to see the before and after clusterings alongside the mergeClusters result, as well as a new slot added to the clusterExperiment class (`dendro_outbranch`). The names of several arguments to `mergeClusters` and `plotDendrogram` were changed for clarity: @@ -16,6 +38,7 @@ Changes: * Moved MAST into the 'Suggests' field so that R 3.4 is not needed to run the package. * Changed the calculation of the PCA dimensionality reduction to use `svds` from the `RSpectra` package to improve speed. + Bugs: * Fixed a bug in RSEC where the `combineProportion` argument was being ignored (set to 1). * Fixed a bug in the definition of `transform` so that it extends the existing generic rather than masking it.
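To illustrate the argument reorganization described above, here is a minimal sketch of the new calling convention (it mirrors the updated addClusters example later in this patch, using the package's simData example data; defaults may differ in your version):

library(clusterExperiment)
data(simData)
# pre-1.3.2 style, no longer accepted:
#   clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
#                 sequential=FALSE, clusterDArgs=list(k=3))
# 1.3.2 style: the clustering function and its arguments (here k) travel
# together inside mainClusterArgs
cl <- clusterSingle(simData, subsample=FALSE, sequential=FALSE,
                    mainClusterArgs=list(clusterFunction="pam",
                                         clusterArgs=list(k=3)))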
diff --git a/R/AllClasses.R b/R/AllClasses.R index ff424f70..846d7f2b 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -2,6 +2,7 @@ setOldClass("dendrogram") setClassUnion("dendrogramOrNULL",members=c("dendrogram", "NULL")) setClassUnion("matrixOrNULL",members=c("matrix", "NULL")) setClassUnion("matrixOrMissing",members=c("matrix", "missing")) +setClassUnion("functionOrNULL",members=c("function", "NULL")) #' @title Class ClusterExperiment #' #' @description \code{ClusterExperiment} is a class that extends @@ -339,8 +340,8 @@ setMethod( #'@param dendro_samples dendrogram. Sets the `dendro_samples` slot (see Slots). #'@param dendro_clusters dendrogram. Sets the `dendro_clusters` slot (see #' Slots). -#' @param dendro_outbranch logical. Sets the `dendro_outbranch` slot (see Slots) #'@param dendro_index numeric. Sets the dendro_index slot (see Slots). +#' @param dendro_outbranch logical. Sets the dendro_outbranch slot (see Slots). #'@param coClustering matrix. Sets the `coClustering` slot (see Slots). #'@details The \code{clusterExperiment} constructor function gives clusterLabels #' based on the column names of the input matrix/SummarizedExperiment. If @@ -412,3 +413,264 @@ setMethod( validObject(out) return(out) })
+
+
+################ clusterFunction class
+
+#' @title Class ClusterFunction
+#'
+#' @description \code{ClusterFunction} is a class for holding functions that can
+#' be used for clustering in the clustering algorithms in this package.
+#'
+#' @docType class
+#' @aliases ClusterFunction ClusterFunction-class clusterFunction
+#' @slot clusterFUN a function defining the clustering function. See details for
+#' required arguments.
+#' @slot inputType a character defining what type of input \code{clusterFUN}
+#' takes. Must be one of "diss", "X", or "either".
+#' @slot algorithmType a character defining what type of clustering algorithm
+#' \code{clusterFUN} is. Must be either "01" or "K". \code{clusterFUN}
+#' must take the corresponding required arguments (see details below).
+#' @slot classifyFUN a function that takes as input new data and the output of
+#' \code{clusterFUN} (when \code{cluster.only=FALSE}) and results in cluster
+#' assignments of the new data. Note that the function should assume that the
+#' input 'x' is not the same samples that were input to the clusterFunction
+#' (but can assume that it is the same number of features/columns). Used in
+#' subsampling clustering. If given the value \code{NULL} then subsampling can
+#' only be \code{"InSample"}, see \code{\link{subsampleClustering}}.
+#' @slot inputClassifyType the input type for the classification function (if
+#' not NULL); like \code{inputType}, must be one of "diss", "X", or "either".
+#' @slot outputType the type of output given by \code{clusterFUN}. Must either
+#' be "vector" or "list". If "vector" then the output should be a vector of
+#' length equal to the number of observations, with integer-valued elements
+#' identifying the cluster assignment of each observation; the vector assignments should be in
+#' the same order as the original input of the data. Samples that are not
+#' assigned to any cluster should be given a '-1' value. If "list", then it
+#' must be a list with length equal to the number of clusters, where the
+#' elements of the list contain the indices of the samples in that cluster.
+#' Any indices not in any of the list elements are assumed to be -1. The main
+#' advantage of "list" is that it can preserve the order of the clusters if
+#' the \code{clusterFUN} desires to do so,
in which case the \code{orderBy}
+#' argument of \code{\link{mainClustering}} can preserve this ordering (the default is
+#' to order by size).
+#' @slot requiredArgs Any additional required arguments for \code{clusterFUN}
+#' (beyond those required of all \code{clusterFUN}, described in details).
+#' @slot checkFunctions logical. If TRUE, the validity check of the
+#' \code{ClusterFunction} object will check the \code{clusterFUN} with simple
+#' toy data using the function \code{internalFunctionCheck}.
+#' @details Required arguments for \code{clusterFUN}: \itemize{ \item{"x or
+#' diss"}{either \code{x} and/or \code{diss} depending on \code{inputType}. If
+#' \code{x}, then \code{x} is assumed to be nfeatures x nsamples (like
+#' assay(CEObj) would give)} \item{"checkArgs"}{logical argument. If
+#' \code{checkArgs=TRUE}, the \code{clusterFUN} should check if the arguments
+#' passed in \code{...} are valid and return an error if not; otherwise, no
+#' error will be given, but the check should be done and only valid arguments
+#' in \code{...} passed along. This is necessary for the function to work with
+#' \code{clusterMany}, which passes all arguments to all functions without
+#' checking. } \item{"cluster.only"}{logical argument. If
+#' \code{cluster.only=TRUE}, then \code{clusterFUN} should return only the
+#' vector of cluster assignments (or list if \code{outputType="list"}). If
+#' \code{cluster.only=FALSE} then the \code{clusterFUN} should return a named
+#' list where one of the elements entitled \code{clustering} contains the
+#' vector described above (no list!); anything else needed by the
+#' \code{classifyFUN} to classify new data should be contained in the output
+#' list as well. \code{cluster.only} is set internally depending on whether
+#' \code{classifyFUN} will be used by subsampling or only for clustering the
+#' final product.} \item{"..."}{Any additional arguments specific to the
+#' algorithm used by \code{clusterFUN} should be passed via \code{...} and NOT
+#' hard-coded as named arguments to \code{clusterFUN}} \item{"Other required
+#' arguments"}{\code{clusterFUN} must also accept arguments required for its
+#' \code{algorithmType} (see Details below).} }
+#'
+#'
+#' @details \code{algorithmType}: Type "01" is for clustering functions that
+#' expect as an input a dissimilarity matrix that takes on 0-1 values (e.g.
+#' from subsampling) with 1 indicating more dissimilarity between samples.
+#' "01" algorithm types must also have \code{inputType} equal to
+#' \code{"diss"}. It is also generally expected that "01" algorithms use the
+#' 0-1 nature of the input to set criteria as to where to find clusters. "01"
+#' functions must take as an argument \code{alpha} between 0 and 1 to
+#' determine the clusters, where larger values of \code{alpha} require less
+#' similarity between samples in the same cluster. "K" is for clustering
+#' functions that require an argument \code{k} (the number of clusters), but
+#' arbitrary \code{inputType}. Unlike "01" algorithms, "K" algorithms are assumed
+#' to need a predetermined 'k' and are also assumed to assign all samples to
+#' a cluster. If not, the post-processing steps in \code{\link{mainClustering}} such
+#' as \code{findBestK} and \code{removeSil} may not operate correctly since
+#' they rely on silhouette distances.
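+#' For illustration (hypothetical signatures, not objects in the package):
+#' following the requirements above, a minimal "01"-type \code{clusterFUN}
+#' would have a signature like \code{function(diss, alpha, checkArgs, cluster.only, ...)},
+#' while a minimal "K"-type would have a signature like
+#' \code{function(x, k, checkArgs, cluster.only, ...)}.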
+#' @name ClusterFunction-class
+#' @aliases ClusterFunction
+#' @rdname ClusterFunction-class
+#' @export
+#'
+setClass(
+  Class = "ClusterFunction",
+  slots = list(
+    clusterFUN="function",
+    inputType = "character",
+    algorithmType = "character",
+    inputClassifyType = "character",
+    classifyFUN="functionOrNULL",
+    outputType = "character",
+    requiredArgs= "character",
+    checkFunctions="logical"
+  )
+)
+.inputTypes<-c("X","diss","either")
+.algTypes<-c("01","K")
+.required01Args<-c("alpha")
+.requiredKArgs<-c("k")
+.outputTypes<-c("vector","list")
+
+#' @rdname ClusterFunction-class
+#' @export
+#' @aliases internalFunctionCheck
+#' @examples
+#' #Use internalFunctionCheck to check a candidate function
+#' goodFUN<-function(x,diss,k,checkArgs,cluster.only,...){
+#' cluster::pam(x=t(x),k=k,cluster.only=cluster.only)
+#' }
+#' #passes the internal check
+#' internalFunctionCheck(goodFUN,inputType="X",algorithmType="K",outputType="vector")
+#' #Note it doesn't pass if inputType="either" because it does not handle x=NULL
+#' internalFunctionCheck(goodFUN, inputType="either",algorithmType="K",outputType="vector")
+#' myCF<-clusterFunction(clusterFUN=goodFUN, inputType="X",algorithmType="K", outputType="vector")
+#' badFUN<-function(x,diss,k,checkArgs,cluster.only,...){cluster::pam(x=x,k=k)}
+#' internalFunctionCheck(badFUN,inputType="X",algorithmType="K",outputType="vector")
+#' @details \code{internalFunctionCheck} is the function that is called by the
+#' validity check of the \code{clusterFunction} constructor (if
+#' \code{checkFunctions=TRUE}). It is exported as a standalone function so that
+#' users can test and debug their functions, which is difficult to do
+#' with an S4 validity function.
+internalFunctionCheck<-function(clusterFUN,inputType,algorithmType,outputType){
+  #--- Make small data
+  N<-20
+  set.seed(2851)
+  x<-matrix(rnorm(N*3),ncol=N,nrow=3)
+  set.seed(2851)
+  diss<-matrix(runif(N^2,min=0,max=0.5),ncol=N,nrow=N)
+  diss<-diss + t(diss)
+  diag(diss)<-0
+  #--- Set parameters
+  if(algorithmType=="01") argList<-list(alpha=.5)
+  if(algorithmType=="K") argList<-list(k=2)
+  argList<-c(argList,list(cluster.only=TRUE,checkArgs=FALSE))
+  #--- Run function on small data
+  if(inputType %in% c("X")){
+    test<-try(do.call(clusterFUN,c(list(x=x),argList)),silent=TRUE)
+    if(inherits(test,"try-error")) return(paste("function test fails with input X.
",test[1])) + } + if(inputType %in% c("diss")){ + test<-try(do.call(clusterFUN,c(list(diss=diss),argList)),silent=TRUE) + if(inherits(test,"try-error")) return(paste("function test fails with input diss.",test[1])) + } + if(inputType %in% c("either")){ + test1<-try(do.call(clusterFUN,c(list(x=x,diss=NULL),argList)),silent=TRUE) + if(inherits(test1,"try-error")) return(paste("function test fails with input x and NULL diss.",test1[1])) + test2<-try(do.call(clusterFUN,c(list(x=NULL,diss=diss),argList)),silent=TRUE) + if(inherits(test2,"try-error")){ + return(paste("function test fails with input diss and NULL x.",test2[1])) + } + test3<-try(do.call(clusterFUN,c(list(x=x,diss=diss),argList)),silent=TRUE) + if(inherits(test3,"try-error")) return(paste("function test fails both diss and x input.",test3[1])) + if(outputType=="vector" & length(test1)!=N || length(test2)!=N || length(test3)!=N) return("clusterFUN does not return a vector equal to the number of observations.") + } + else{ + if(outputType=="vector"){ + if(length(test)!=N) return("clusterFUN does not return a vector equal to the number of observations") + } + } + return(TRUE) +} + + +.checkHasArgs<-function(FUN,requiredArgs){ + funArgs<-names(as.list(args(FUN))) + all(requiredArgs %in% funArgs) +} +setValidity("ClusterFunction", function(object) { + if(is.na(object@outputType)) { + return("Must define outputType.") + } + if(!object@outputType%in%.outputTypes) return(paste("outputType must be one of",paste(.outputTypes,collapse=","))) + #---- + # inputType + #---- + if(is.na(object@inputType)) { + return("Must define inputType.") + } + if(!object@inputType%in%.inputTypes) return(paste("inputType must be one of",paste(.inputTypes,collapse=","))) + if(is.null(object@classifyFUN)& !is.na(object@inputClassifyType)) return("should not define inputClassifyType if classifyFUN is not defined") + if(!is.null(object@classifyFUN) & is.na(object@inputClassifyType)) { + return("Must define inputClassifyType if define classifyFUN.") + } + if(!is.null(object@classifyFUN) & !object@inputClassifyType%in%.inputTypes) return(paste("inputClassifyType must be one of",paste(.inputTypes,collapse=","))) + #---- + # algorithmType + #---- + if(is.na(object@algorithmType)) return("Must define algorithmType") + if(!object@algorithmType%in%.algTypes) return(paste("algorithmType must be one of",paste(.algTypes,collapse=","))) + ### Add additional checks that 'k' and '01' work as expected... in particular that take particular arguments, etc. that 'k' and '01' are expected to take. + + + #---- + # function arguments are as needed + #---- + if(object@inputType%in%c("X","either") & !.checkHasArgs(FUN=object@clusterFUN,requiredArgs="x")) return("inputType is either 'X' or 'either' but arguments to clusterFunction doesn't contain 'x'") + if(object@inputType%in%c("diss","either") & !.checkHasArgs(FUN=object@clusterFUN,requiredArgs="diss")) return("inputType is either 'diss' or 'either' but arguments to clusterFunction doesn't contain 'diss'") + if(object@algorithmType=="K" & !.checkHasArgs(FUN=object@clusterFUN,requiredArgs=.requiredKArgs)) return("algorithmType is 'K' but arguments to clusterFunction doesn't contain",paste(.requiredKArgs,collapse=",")) + if(object@algorithmType=="01" & !.checkHasArgs(FUN=object@clusterFUN, requiredArgs=.required01Args)) return("algorithmType is '01' but arguments to clusterFunction doesn't contain", paste(.required01Args,collapse=",")) + + + if(object@checkFunctions){ #user can skip the check. 
+    out<-internalFunctionCheck(object@clusterFUN,object@inputType,object@algorithmType,object@outputType)
+    if(!is.logical(out) || !out) return(out)
+
+  }
+  return(TRUE)
+  })
+
+#'@description The constructor \code{clusterFunction} creates an object of the
+#' class \code{ClusterFunction}.
+#'
+#'@param clusterFUN function passed to slot \code{clusterFUN}.
+#'@param inputType character for slot \code{inputType}
+#'@param algorithmType character for slot \code{algorithmType}
+#'@param classifyFUN function for slot \code{classifyFUN}
+#'@param outputType character for slot \code{outputType}
+#'@param inputClassifyType character for slot \code{inputClassifyType}
+#'@param requiredArgs character for slot \code{requiredArgs}
+#'@param checkFunctions logical for whether to check the input functions with
+#' \code{internalFunctionCheck}
+#'@param ... arguments passed to different methods of \code{clusterFunction}
+#'@return A \code{ClusterFunction} object.
+#'
+#' @aliases clusterFunction
+#' @rdname ClusterFunction-class
+#' @export
+setGeneric(
+  name = "clusterFunction",
+  def = function(clusterFUN,...) {
+    standardGeneric("clusterFunction")
+  }
+)
+#' @rdname ClusterFunction-class
+#' @export
+setMethod(
+  f = "clusterFunction",
+  signature = signature("function"),
+  definition = function(clusterFUN, inputType,outputType,algorithmType,inputClassifyType=NA_character_,requiredArgs=NA_character_,classifyFUN=NULL,checkFunctions=TRUE){
+    out <- new("ClusterFunction",
+      clusterFUN=clusterFUN,
+      inputType=inputType,
+      algorithmType = algorithmType,
+      inputClassifyType=inputClassifyType,
+      classifyFUN=classifyFUN,
+      outputType=outputType,
+      requiredArgs=requiredArgs,
+      checkFunctions=checkFunctions
+    )
+    validObject(out)
+    return(out)
+  }
) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 4c6535bf..48aaf9b4 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -4,10 +4,27 @@ setGeneric( standardGeneric("RSEC") } ) - +setGeneric( + name = "subsampleClustering", + def = function(clusterFunction, ...) { + standardGeneric("subsampleClustering") + } +) +setGeneric( + name = "mainClustering", + def = function(clusterFunction, ...) { + standardGeneric("mainClustering") + } +) +setGeneric( + name = "seqCluster", + def = function(clusterFunction, ...) { + standardGeneric("seqCluster") + } +) setGeneric( name = "clusterSingle", - def = function(x, diss, ...) { + def = function(x, diss, ...) { standardGeneric("clusterSingle") } ) @@ -331,3 +348,35 @@ setGeneric( standardGeneric("mergeClusters") } ) +setGeneric( + name = "getBuiltInFunction", + def = function(object, ...) { + standardGeneric("getBuiltInFunction") + } +) + + +setGeneric( + name = "requiredArgs", + def = function(object, ...) { + standardGeneric("requiredArgs") + } +) +setGeneric( + name = "algorithmType", + def = function(object, ...) { + standardGeneric("algorithmType") + } +) +setGeneric( + name = "inputType", + def = function(object, ...) { + standardGeneric("inputType") + } +) +setGeneric( + name = "getPostProcessingArgs", + def = function(clusterFunction, ...) { + standardGeneric("getPostProcessingArgs") + } +) diff --git a/R/AllHelperClusterFunction.R b/R/AllHelperClusterFunction.R new file mode 100644 index 00000000..aa5ce82c --- /dev/null +++ b/R/AllHelperClusterFunction.R @@ -0,0 +1,130 @@ +#' Helper methods for the ClusterFunction class +#' +#' This is a collection of helper methods for the ClusterFunction class.
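+#' For example, \code{algorithmType("pam")} returns \code{"K"} and
+#' \code{inputType("pam")} returns \code{"either"}.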
+#' @name ClusterFunction-methods
+#' @aliases ClusterFunction-methods requiredArgs
+#' @param object input to the method, usually either a \code{ClusterFunction} class or a character describing a built-in \code{ClusterFunction} object.
+#' @details Methods given a character value (or a factor) look up the built-in \code{ClusterFunction} object(s) of that name; see \code{\link{listBuiltInFunctions}}.
+#' @export
+setMethod(
+  f = "requiredArgs",
+  signature = c("character"),
+  definition = function(object) {
+    clObjects<-getBuiltInFunction(object)
+    if(length(clObjects)>1) return(lapply(clObjects,requiredArgs))
+    else return(requiredArgs(clObjects))
+  }
+)
+
+#' @rdname ClusterFunction-methods
+#' @param genericOnly logical If TRUE, return only the generic required arguments (i.e. those required by the algorithm type) and not the arguments specific to that clustering found in the slot \code{requiredArgs}. If FALSE both sets of arguments are returned.
+#' @export
+setMethod(
+  f = "requiredArgs",
+  signature = c("ClusterFunction"),
+  definition = function(object,genericOnly=FALSE) {
+    algType<-algorithmType(object)
+    if(!genericOnly){
+      if(!is.na(object@requiredArgs)) reqArgs<-object@requiredArgs
+      else reqArgs<-NULL
+      if(algType=="01") return(unique(sort(c(reqArgs,.required01Args))))
+      if(algType=="K") return(unique(sort(c(reqArgs,.requiredKArgs))))
+    }
+    else{
+      if(algType=="01") return(unique(sort(.required01Args)))
+      if(algType=="K") return(unique(sort(.requiredKArgs)))
+    }
+  }
+)
+#' @rdname ClusterFunction-methods
+#' @export
+setMethod(
+  f = "requiredArgs",
+  signature = c("factor"),
+  definition = function(object) {
+    requiredArgs(as.character(object))
+  }
+)
+
+#' @rdname ClusterFunction-methods
+#' @aliases algorithmType
+#' @export
+setMethod(
+  f = "algorithmType",
+  signature = c("ClusterFunction"),
+  definition = function(object) {
+    object@algorithmType
+  }
+)
+#' @rdname ClusterFunction-methods
+#' @export
+setMethod(
+  f = "algorithmType",
+  signature = c("character"),
+  definition = function(object) {
+    clObjects<-getBuiltInFunction(object)
+    if(length(clObjects)>1) return(sapply(clObjects,algorithmType))
+    else return(algorithmType(clObjects))
+  }
+)
+#' @rdname ClusterFunction-methods
+#' @export
+setMethod(
+  f = "algorithmType",
+  signature = c("factor"),
+  definition = function(object) {
+    algorithmType(as.character(object))
+  }
+)
+
+
+#' @rdname ClusterFunction-methods
+#' @aliases inputType
+#' @export
+setMethod(
+  f = "inputType",
+  signature = c("ClusterFunction"),
+  definition = function(object) {
+    object@inputType
+  }
+)
+#' @rdname ClusterFunction-methods
+#' @export
+setMethod(
+  f = "inputType",
+  signature = c("character"),
+  definition = function(object) {
+    clObjects<-getBuiltInFunction(object)
+    if(length(clObjects)>1) return(sapply(clObjects,inputType))
+    else return(inputType(clObjects))
+  }
+)
+#' @rdname ClusterFunction-methods
+#' @export
+setMethod(
+  f = "inputType",
+  signature = c("factor"),
+  definition = function(object) {
+    inputType(as.character(object))
+  }
+) diff --git a/R/addClusters.R b/R/addClusters.R index dd9efa8a..19cca222 100644 --- a/R/addClusters.R +++ b/R/addClusters.R @@ 
-21,11 +21,11 @@ #' @examples #' data(simData) #' -#' cl1 <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=3)) -#' -#' cl2 <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=5)) +#' cl1 <- clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterArgs=list(k=3), clusterFunction="pam")) +#' +#' cl2 <- clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterArgs=list(k=5), clusterFunction="pam")) #' #' addClusters(cl1, cl2) setMethod( diff --git a/R/builtInClusterFunctions.R b/R/builtInClusterFunctions.R new file mode 100644 index 00000000..c181385a --- /dev/null +++ b/R/builtInClusterFunctions.R @@ -0,0 +1,359 @@
+#' @include internalFunctions.R internalClusterFunctions.R
+
+### input to clustering:
+# pam : x or diss
+# hier : diss
+# kmeans : x
+
+################
+##Internal wrapper functions for kmeans and pam
+################
+.genericClassify<-function(x,centers){
+  innerProd<-tcrossprod(t(x),centers) #an n x k matrix of inner-products between them
+  distMat<-as.matrix(dist(rbind(t(x),centers)))
+  distMat<-distMat[1:ncol(x),(ncol(x)+1):ncol(distMat)]
+  apply(distMat,1,which.min)
+}
+.getPassedArgs<-function(FUN,passedArgs,checkArgs){
+  funArgs<-names(as.list(args(FUN)))
+  funName<-tail(as.character(substitute(FUN)),1)
+  if(any(wh<-!names(passedArgs) %in% funArgs)){
+    passedArgs<-passedArgs[-which(wh)]
+    if(checkArgs) warning(.wrongArgsWarning(funName))
+  }
+  return(passedArgs)
+}
+.wrongArgsWarning<-function(funName){paste("arguments passed via clusterArgs to the clustering function",funName,"are not all applicable (clusterArgs should only contain arguments to",funName,"). Extra arguments will be ignored")}
+
+##---------
+##Spectral
+##---------
+
+# spectral options (SamSPECTRAL for flow cytometry (function SamSPECTRAL); kernlab for standard ('specc'); kknn for similarity based on knn rather than kmeans): kernlab takes either x or a kernel function
+#' @importFrom kernlab specc
+.speccCluster<-function(x,k,checkArgs,cluster.only,...){
+  passedArgs<-.getPassedArgs(FUN=kernlab::specc,passedArgs=list(...) ,checkArgs=checkArgs)
+  out<-try(do.call(kernlab::specc,c(list(x=t(x),centers=k),passedArgs)))
+  if(inherits(out,"try-error")) stop("Spectral clustering failed, probably because k (",k,") was too large relative to the number of samples (",ncol(x),"). k must be less than the number of samples, but how much less is not straightforward.")
+  if(cluster.only) return(out@.Data)
+  else return(out)
+}
+.speccCF<-clusterFunction(clusterFUN=.speccCluster, classifyFUN=NULL, inputType="X", algorithmType="K",outputType="vector")
+
+
+##---------
+##Kmeans
+##---------
+#' @importFrom stats kmeans
+.kmeansCluster <- function(x,k, checkArgs,cluster.only,...) {
+  passedArgs<-.getPassedArgs(FUN=stats::kmeans,passedArgs=list(...)
,checkArgs=checkArgs)
+  out<-do.call(stats::kmeans,c(list(x=t(x),centers=k),passedArgs))
+  if(cluster.only) return(out$cluster)
+  else return(.kmeansPartitionObject(x,out))
+}
+.kmeansClassify <- function(x, clusterResult) {
+  centers <- clusterResult$medoids
+  suppressWarnings(stats::kmeans(t(x), centers, iter.max = 1, algorithm = "Lloyd")$cluster) #a single Lloyd iteration from the fixed centers, which assigns each point to its nearest center
+}
+#make partition object same form as pam output
+#' @importFrom cluster daisy silhouette
+.kmeansPartitionObject<-function(x,kmeansObj){
+  dissE<-(cluster::daisy(t(x)))^2
+  silObj<-try(cluster::silhouette(x=kmeansObj$cluster,dist=dissE))
+  silinfo<-list(widths=silObj, clus.avg.widths=summary(silObj)$clus.avg.widths, ave.width=summary(silObj)$avg.width)
+  return(list(medoids=kmeansObj$centers, clustering=kmeansObj$cluster, call=NA,silinfo=silinfo, objective=NA, diss=dissE, data=x))
+}
+.kmeansCF<-clusterFunction(clusterFUN=.kmeansCluster, classifyFUN=.kmeansClassify, inputType="X", inputClassifyType="X", algorithmType="K",outputType="vector")
+#internalFunctionCheck(.kmeansCluster,inputType="X",algType="K",outputType="vector")
+
+##---------
+##PAM
+##---------
+
+#' @importFrom cluster pam
+.pamCluster<-function(x,diss,k,checkArgs,cluster.only,...){
+  passedArgs<-.getPassedArgs(FUN=cluster::pam,passedArgs=list(...) ,checkArgs=checkArgs)
+  input<-.checkXDissInput(x,diss,checkDiss=FALSE,algType="K")
+  if(input=="X") return(do.call(cluster::pam, c(list(x=t(x),k=k, cluster.only=cluster.only), passedArgs)))
+  if(input=="diss" | input=="both") return(do.call(cluster::pam, c(list(x=diss,k=k, diss=TRUE, cluster.only=cluster.only), passedArgs)))
+ }
+.pamClassify <- function(x, clusterResult) { #x p x n matrix
+  .genericClassify(x,clusterResult$medoids)
+}
+.pamCF<-clusterFunction(clusterFUN=.pamCluster, classifyFUN=.pamClassify, inputType="either", inputClassifyType="X", algorithmType="K",outputType="vector")
+
+#internalFunctionCheck(.pamCluster,inputType="either",algType="K",outputType="vector")
+
+##---------
+##clara
+##---------
+
+#' @importFrom cluster clara
+.claraCluster<-function(x,k,checkArgs,cluster.only,samples=50,keep.data=FALSE,rngR=TRUE,pamLike=TRUE,correct.d=TRUE,medoids.x=FALSE,...){
+  passedArgs<-.getPassedArgs(FUN=cluster::clara,passedArgs=list(...) ,checkArgs=checkArgs)
+  passedArgs<-c(passedArgs, list(samples=samples, keep.data=keep.data, rngR=rngR, pamLike=pamLike, correct.d=correct.d, medoids.x=medoids.x))
+  out<-(do.call(cluster::clara, c(list(x=t(x),k=k), passedArgs)))
+  if(cluster.only) return(out$clustering) else return(out)
+
+ }
+
+.claraCF<-clusterFunction(clusterFUN=.claraCluster, classifyFUN=.pamClassify, inputType="X", inputClassifyType="X", algorithmType="K",outputType="vector")
+
+
+
+##---------
+##Hierarchical01
+##---------
+
+#' @importFrom stats hclust
+#' @importFrom phylobase rootNode getNode descendants
+.hier01Cluster<-function(diss,alpha,evalClusterMethod=c("maximum","average"),whichHierDist=c("as.dist","dist"),checkArgs,cluster.only,...)
+{
+  whichHierDist<-match.arg(whichHierDist)
+  evalClusterMethod<-match.arg(evalClusterMethod)
+  if(is.null(rownames(diss))) rownames(diss)<-colnames(diss)<-as.character(1:nrow(diss))
+  passedArgs<-.getPassedArgs(FUN=stats::hclust,passedArgs=list(...) ,checkArgs=checkArgs)
+  S<-round(1-diss,10)
+  d<-switch(whichHierDist,"dist"=dist(S),"as.dist"=as.dist(diss))
+  hDmat<-do.call(stats::hclust,c(list(d=d),passedArgs))
+
+  ##Could this be done with just cutree at the corresponding cophenetic value? Should make it an option to do that.
Probably a lot faster...
+  method<-evalClusterMethod
+  phylo4Obj<-.makePhylobaseTree(hDmat,"hclust")
+  allTips<-phylobase::getNode(phylo4Obj, type=c("tip"))
+  #for each internal node (including the root), calculate whether it passes the alpha criterion or not
+  nodesToCheck<-phylobase::rootNode(phylo4Obj)
+  clusterList<-list()
+
+  while(length(nodesToCheck)>0){
+    currNode<-nodesToCheck[1]
+    nodesToCheck<-nodesToCheck[-1]
+    if(currNode%in%allTips){ #block of size 1!
+      currTips<-names(currNode)
+      check<-TRUE
+    }
+    else{
+      currTips<-names(phylobase::descendants(phylo4Obj,currNode,"tip"))
+      if(method=="maximum") check<-all(S[currTips,currTips,drop=FALSE]>=(1-alpha))
+      if(method=="average") check<-all(rowMeans(S[currTips,currTips,drop=FALSE])>=(1-alpha))
+
+    }
+    if(check){ #found a block that satisfies the criterion
+      clusterList<-c(clusterList,list(currTips))
+    }
+    else{ #doesn't satisfy it; descend into the children
+      childNodes<-phylobase::descendants(phylo4Obj,currNode,"children")
+      nodesToCheck<-c(nodesToCheck,childNodes)
+    }
+  }
+  clusterListIndices<-lapply(clusterList,function(tipNames){
+    match(tipNames,rownames(diss))
+  })
+  clusterListIndices<-.orderByAlpha(clusterListIndices,S)
+  ##Need to update this code so it converts a vector result into lists of indices ...
+  return(clusterListIndices)
+}
+.hier01CF<-clusterFunction(clusterFUN=.hier01Cluster, inputType="diss", algorithmType="01",outputType="list")
+
+##---------
+##hierarchicalK
+##---------
+#' @importFrom stats hclust cutree
+.hierKCluster<-function(diss,k,checkArgs,cluster.only,...){
+  passedArgs<-.getPassedArgs(FUN=stats::hclust,passedArgs=list(...) ,checkArgs=checkArgs)
+  hclustOut<-do.call(stats::hclust,c(list(d=as.dist(diss)),passedArgs))
+  stats::cutree(hclustOut,k)
+}
+.hierKCF<-clusterFunction(clusterFUN=.hierKCluster, inputType="diss", algorithmType="K",outputType="vector")
+
+#internalFunctionCheck(.hierKCluster,inputType="diss",algType="K",outputType="vector")
+
+
+##---------
+##Tight
+##---------
+.tightCluster <- function(diss, alpha, minSize.core=2,checkArgs,cluster.only,...)
+{
+  #previously, diss was a similarity matrix. To make it match the rest of the code, we need diss=1-similarity, so now convert it back
+  S<-1-diss #now is similarity matrix...
+  if(length(list(...))>0 & checkArgs) warning(.wrongArgsWarning("tight"))
+  find.candidates.one <- function(x) {
+    tmp <- apply(x >= 1, 1, sum) #how many in each row are ==1
+    #what if none of them are ==1? will this never happen because have sample of size 1? Depends what the diagonal is.
+    if(all(tmp == 1)) return(NULL) #i.e. each sample is fully co-clustered only with itself
+    whMax <- which(tmp == max(tmp))
+    return(which(x[whMax[1], ] >= 1)) # assumes x is symmetric. Takes largest in size, but arbitrarily picks between them.
+  }
+  extend.candidate <- function(S, can, alpha ) {
+    can.ex <- which(apply(as.matrix(S[, can] >= 1 - alpha), 1, all)) #find those that are close to the core that is always together
+    S.temp <- S[can.ex, can.ex]
+    if (!is.matrix(S.temp)) {
+      S.temp <- as.matrix(S.temp)
+      colnames(S.temp) <- names(can.ex)
+    }
+    S.bad <- apply(as.matrix(S.temp < 1 - alpha), 1,sum)
+    while (sum(S.bad) > 0) {
+      index <- which(S.bad == max(S.bad))[1]
+      S.temp <- S.temp[-index, -index]
+      S.bad <- apply(as.matrix(S.temp < 1 - alpha),
+        1, sum)
+    }
+    return(can.ex[colnames(S.temp)])
+  }
+  if(is.null(dim(S)) || dim(S)[1]!=dim(S)[2] || any(t(S)!=S)) stop("S must be a symmetric matrix")
+  N<-nrow(S)
+  colnames(S) <- 1:N
+  rownames(S) <- 1:N
+  i = 1
+  S.temp <- S
+  res <- list()
+  while (!is.null(dim(S.temp)) && nrow(S.temp) > 0 & any(S.temp[lower.tri(S.temp)]>1-alpha) & any(S.temp[lower.tri(S.temp)]==1)) {
+    #first find those that are always together (resampling =1); pick the largest such group (and if more than 1 of same size will arbitrarily pick one)
+    candidate.one <- find.candidates.one(S.temp)
+    if(is.null(candidate.one)){#no more candidates with a core that is always together
+      #for now just stop if no core group
+      break
+    }
+    #add more to the group if they resample with the core members more than an alpha proportion of the time
+    candidate <- extend.candidate(S.temp, candidate.one, alpha = alpha)
+    S.temp <- S.temp[-candidate, -candidate]
+    res[[i]] <- names(candidate)
+    mode(res[[i]]) <- "numeric"
+    i = i + 1
+  }
+  res<-.orderByAlpha(res,S)
+  ##Need to update this code so it converts a vector result into lists of indices ...
+  return(res)
+
+}
+.tightCF<-clusterFunction(clusterFUN=.tightCluster, inputType="diss", algorithmType="01",outputType="list")
+
+
+#########
+## Put them together so user/code can access easily
+#########
+.builtInClusterObjects<-list("pam"=.pamCF,"clara"=.claraCF,"kmeans"=.kmeansCF,"hierarchical01"=.hier01CF,"hierarchicalK"=.hierKCF,"tight"=.tightCF,"spectral"=.speccCF)
+.builtInClusterNames<-names(.builtInClusterObjects)
+
+#' @title Built-in ClusterFunction options
+#' @param object name of the built-in function.
+#' @description Documents the built-in clustering options that are available in
+#' the clusterExperiment package.
+#' @rdname builtInClusteringFunctions
+#' @details \code{listBuiltInFunctions} will return the character names of
+#' the built-in clustering functions available.
+#' @details \code{listBuiltInTypeK} returns the names of the built-in functions
+#' that have type 'K'
+#' @details \code{listBuiltInType01} returns the names of the built-in functions
+#' that have type '01'
+#' @details \code{getBuiltInFunction} will return the
+#' \code{ClusterFunction} object of a character value that corresponds to a
+#' built-in function.
+#' @details \code{\link{algorithmType}} and \code{\link{inputType}} will
+#' return the \code{algorithmType} and \code{inputType} of the
+#' built-in clusterFunction corresponding to the character value.
+#' @details \strong{Built-in clustering methods:} The built-in clustering methods, the
+#' names of which can be accessed by \code{listBuiltInFunctions()}, are the
+#' following:
+#' \itemize{
+#' \item{"pam"}{Based on \code{\link[cluster]{pam}} in the
+#' \code{cluster} package. Arguments to that function can be passed via
+#' \code{clusterArgs}.
+#' Input is \code{"either"} (\code{x} or \code{diss}); algorithm type is "K"}
+#' \item{"clara"}{Based on \code{\link[cluster]{clara}} in the
+#' \code{cluster} package.
Arguments to that function can be passed via
+#' \code{clusterArgs}. Note that we have changed the default arguments of
+#' that function to match the recommendations in the documentation of
+#' \code{\link[cluster]{clara}} (numerous arguments default to less-than-optimal
+#' settings for backwards compatibility). Specifically, the following defaults
+#' are implemented: \code{samples=50}, \code{keep.data=FALSE},
+#' \code{medoids.x=FALSE}, \code{rngR=TRUE},
+#' \code{pamLike=TRUE}, \code{correct.d=TRUE}.
+#' Input is \code{"X"}; algorithm type is "K".}
+#' \item{"kmeans"}{Based on \code{\link[stats]{kmeans}} in the
+#' \code{stats} package. Arguments to that function can be passed via
+#' \code{clusterArgs}, except for \code{centers}, which is re-encoded here to be
+#' the argument 'k'.
+#' Input is \code{"X"}; algorithm type is "K"}
+#' \item{"hierarchical01"}{\code{\link[stats]{hclust}} in the
+#' \code{stats} package is used to build a hierarchical clustering. Arguments to
+#' that function can be passed via \code{clusterArgs}. The
+#' \code{hierarchical01} method cuts the hierarchical tree based on the parameter
+#' \code{alpha}. It does not use the \code{cutree} function, but instead
+#' traverses down the tree until finding a block of
+#' samples for which a summary of the similarity values (1-D)[samples,samples]
+#' is greater than or equal to 1-alpha. The argument
+#' 'evalClusterMethod' determines how to summarize the samples' values
+#' for comparison to 1-alpha: "maximum" (default) takes
+#' the minimum of (1-D)[samples,samples] and requires it to be greater than or equal
+#' to 1-alpha; "average" requires that each row mean of (1-D)[samples,samples] be
+#' greater than or equal to 1-alpha. Additional arguments of hclust can also be passed via
+#' clusterArgs to control the hierarchical clustering of D.
+#' Input is \code{"diss"}; algorithm type is "01"}
+#' \item{"hierarchicalK"}{\code{\link[stats]{hclust}} in the \code{stats} package is used
+#' to build a hierarchical clustering and \code{\link{cutree}} is used to cut the
+#' tree into \code{k} clusters.
+#' Input is \code{"diss"}; algorithm type is "K"}
+#' \item{"tight"}{Based on the algorithm in
+#' Tseng and Wong, specifically their method of picking clusters from a
+#' co-occurrence matrix after subsampling. The clustering encoded here is not
+#' the entire tight clustering algorithm, only the single piece that
+#' identifies clusters from the co-occurrence matrix.
+#' An argument for the tight method is
+#' 'minSize.core' (default=2), which sets the minimum number of samples that
+#' form a core cluster.
+#' Input is \code{"diss"}; algorithm type is "01"}
+#' \item{"spectral"}{\code{\link[kernlab]{specc}} in the \code{kernlab} package
+#' is used to perform spectral clustering. Note that spectral clustering can
+#' produce errors if the number of clusters (K) is not sufficiently smaller than
+#' the number of samples (N); K < N is not always sufficient.
+#' Input is \code{"X"}; algorithm type is "K".} +#' } +#' @seealso \code{\link{ClusterFunction}}, \code{\link{algorithmType}}, \code{\link{inputType}} +#' @examples +#' listBuiltInFunctions() +#' algorithmType(c("kmeans","pam","hierarchical01")) +#' inputType(c("kmeans","pam","hierarchical01")) +#' listBuiltInTypeK() +#' listBuiltInType01() +#' @rdname builtInClusteringFunctions +#' @aliases listBuiltInFunctions +#' @export +listBuiltInFunctions<-function() { + .builtInClusterNames + + } +#' @rdname builtInClusteringFunctions +#' @aliases getBuiltInFunction +#' @export +setMethod( + f = "getBuiltInFunction", + signature = c("character"), + definition = function(object) { + if(!all(object%in%.builtInClusterNames)) stop("if give character value for a clusterFunction object must be one of",paste(.builtInClusterNames,collapse=",")) + m<-match(object,names(.builtInClusterObjects)) + if(length(m)>1) .builtInClusterObjects[m] + else .builtInClusterObjects[[m]] + + + } +) + +#' @rdname builtInClusteringFunctions +#' @aliases listBuiltInTypeK +#' @export +listBuiltInTypeK<-function() { + allBuiltInTypes<-algorithmType(.builtInClusterNames) + return(names(allBuiltInTypes)[allBuiltInTypes=="K"]) + } + +#' @rdname builtInClusteringFunctions +#' @aliases listBuiltInType01 +#' @export +listBuiltInType01<-function() { + allBuiltInTypes<-algorithmType(.builtInClusterNames) + return(names(allBuiltInTypes)[allBuiltInTypes=="01"]) + } + diff --git a/R/builtInClusterSubsampleFunctions.R b/R/builtInClusterSubsampleFunctions.R deleted file mode 100644 index f33e3862..00000000 --- a/R/builtInClusterSubsampleFunctions.R +++ /dev/null @@ -1,44 +0,0 @@ -################ -##Internal wrapper functions for kmeans and pam -################ - -###Kmeans -.kmeansCluster <- function(x,k, ...) { - out<-stats::kmeans(t(x),centers=k,...) - out<-.kmeansPartitionObject(x,out) #make it a partition object like pam. - #out$clustering<-out$cluster #stupid difference in naming... - return(out) -} -.kmeansClassify <- function(x, clusterResult) { - centers <- clusterResult$mediods - suppressWarnings(stats::kmeans(t(x), centers, iter.max = 1, algorithm = "Lloyd")$cluster) #probably uses this so always classifies points to centers -} -#make partition object same form as pam output -.kmeansPartitionObject<-function(x,kmeansObj){ - dissE<-(cluster::daisy(t(x)))^2 - silObj<-cluster::silhouette(kmeansObj$cl,dissE^2) - silinfo<-list(widths=silObj, clus.avg.widths=summary(silObj)$clus.avg.widths, ave.width=summary(silObj)$avg.width) - return(list(mediods=kmeansObj$centers,clustering=kmeansObj$cluster,call=NA,silinfo=silinfo,objective=NA,diss=dissE,data=x)) -} - -###Pam -.pamCluster <- function(x,k, ...) { cluster::pam(x=t(x),k=k,...) } #x p x n matrix -.pamClassify <- function(x, clusterResult) { #x p x n matrix - center<-clusterResult$medoids - innerProd<-tcrossprod(t(x),center) #a n x k matrix of inner-products between them - distMat<-as.matrix(dist(rbind(t(x),center))) - distMat<-distMat[1:ncol(x),(ncol(x)+1):ncol(distMat)] - apply(distMat,1,which.min) -} - -# .hierCluster<-function(x,k,...){ -# argList<-list(...) -# hout<-do.call("hclust",c(list(dist(x)),argList)) -# stats::cutree(tree, k = k) -# -# } -# .hierClassify<-function(x,clusterResult){ -# -# } - - diff --git a/R/clusterContrasts.R b/R/clusterContrasts.R index 7abf0eaf..e37c268e 100644 --- a/R/clusterContrasts.R +++ b/R/clusterContrasts.R @@ -21,8 +21,8 @@ #' from the design matrix. 
Appropriate to pick TRUE (default) if design will
#' be input into a linear model on samples that excludes -1.
#' @param outputType character string. Gives the format for the resulting contrast
-#' matrix. Currently the only option is the format appropriate for
-#' \code{\link{limma}} package, but we anticipate adding more.
+#' matrix. Currently the two options are the formats appropriate for the
+#' \code{\link[limma]{limma}} and \code{\link[MAST]{MAST}} packages.
#' @param ... arguments that are passed from the \code{ClusterExperiment}
#' version to the most basic numeric version.
@@ -40,6 +40,8 @@ #' \item{\code{contrastNames}}{A vector of names for each of the contrasts. NULL if no such additional names.} #' } #' @author Elizabeth Purdom
+#' @references Ritchie, ME, Phipson, B, Wu, D, Hu, Y, Law, CW, Shi, W, and Smyth, GK (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43, e47. http://nar.oxfordjournals.org/content/43/7/e47
+#' @references Finak et al. MAST: a flexible statistical framework for assessing transcriptional changes and characterizing heterogeneity in single-cell RNA sequencing data. Genome Biology (2015).
#' @examples #' data(simData) #' cl <- clusterMany(simData,nPCADims=c(5,10,50), dimReduce="PCA", @@ -65,12 +67,13 @@ setMethod(f = "clusterContrasts", #' @rdname clusterContrasts #' @export #' @importFrom limma makeContrasts
-#' @importFrom MAST Hypothesis
setMethod(f = "clusterContrasts", signature = "vector", definition = function(cluster,contrastType=c("Dendro", "Pairs", "OneAgainstAll"), dendro=NULL, pairMat=NULL,outputType=c("limma","MAST"),removeNegative=TRUE){
- cluster<-.convertToNum(cluster)
+ outputType<-match.arg(outputType)
+ if(outputType=="MAST" & !requireNamespace("MAST", quietly = TRUE)) stop("for outputType 'MAST', you must have package 'MAST' from Bioconductor installed.")
+
+ cluster<-.convertToNum(cluster)
if(removeNegative) cl<-cluster[cluster>0] else cl<-cluster cl<-factor(cl) contrastType<-match.arg(contrastType) diff --git a/R/clusterD.R b/R/clusterD.R deleted file mode 100644 index 5b6ec594..00000000 --- a/R/clusterD.R +++ /dev/null @@ -1,501 +0,0 @@ -#' @title Cluster distance matrix from subsampling -#' -#' @description Given a \code{n x n} matrix of distances, these functions will -#' try to find the clusters based on the given clustering function. cluster01 -#' and clusterK are internal functions and clusterD is a wrapper around these -#' two functions for easier user interface. cluster01 and clusterK are not -#' expected to be called directly by the user, except for ease in debugging -#' user-defined clustering functions. -#' -#' @aliases cluster01 -#' @aliases clusterK -#' -#' @param x \code{p x n} data matrix on which to run the clustering (samples in -#' columns). -#' @param diss \code{n x n} data matrix of dissimilarities between the samples -#' on which to run the clustering -##' @param clusterFunction clusterFunction a function that clusters a nxn matrix -#' of dissimilarities/distances. Can also be given character values to -#' indicate use of internal wrapper functions for default methods. See Details -#' for the format of what the function must take as arguments and what format -#' the function must return. -#' @param typeAlg character value of either '01' or 'K' determining whether the -#' function given in clusterFunction should be called by clusterK or -#' cluster01. Only used if clusterFunction is a user-defined function.
-#' Otherwise, for methods provided by the package (i.e. by user setting -#' clusterFunction to a character value) clusterD will determine the -#' appropriate input for 'typeAlg' and will ignore user input. -#' @param distFunction a distance function to be applied to \code{D}. Only relevant if -#' input \code{D} is a matrix of data, rather than a distance. See details. -#' @param minSize the minimum number of samples in a cluster. Clusters found -#' below this size will be discarded and samples in the cluster will be given -#' a cluster assignment of "-1" to indicate that they were not clustered. -#' @param orderBy how to order the cluster (either by size or by maximum alpha -#' value). -#' @param format whether to return a list of indices in a cluster or a vector of -#' clustering assignments. List is mainly for compatibility with sequential -#' part. -#' @param clusterArgs arguments to be passed directly to the clusterFunction, -#' beyond the required input. -#' @param alpha a cutoff value of how much similarity needed for drawing blocks -#' (lower values more strict). -#' @param findBestK logical, whether should find best K based on average -#' silhouette width (only used if clusterFunction of type "K"). -#' @param k single value to be used to determine how many clusters to find, if -#' findBestK=FALSE (only used if clusterFunction of type "K"). -#' @param kRange vector of integers. If findBestK=TRUE, this gives the range of -#' k's to look over. Default is k-2 to k+20, subject to those values being -#' greater than 2. Note that default values depend on the input k, so running -#' for different choices of k and findBestK=TRUE can give different answers -#' unless kRange is set to be the same. -#' @param silCutoff Requirement on minimum silhouette width to be included in -#' cluster (only if removeSil=TRUE). -#' @param removeSil logical as to whether remove when silhouette < silCutoff -#' (only used if clusterFunction of type "K") -#' @param checkArgs logical as to whether should give warning if arguments given -#' that don't match clustering choices given. Otherwise, inapplicable -#' arguments will be ignored without warning. -#' @param returnD logical as to whether to return the D matrix in output. -#' @param ... arguments given to clusterD to be passed to cluster01 or clusterK -#' (depending on the value of typeAlg). Examples include 'k' for clusterK or -#' 'alpha' for cluster01. These should not be the arguments needed by -#' clusterFunction (which should be passed via the argument 'clusterArgs') but -#' the actual arguments of cluster01 or clusterK. -#' @details To provide a distance matrix via the argument \code{distFunction}, -#' the function must be defined to take the distance of the rows of a matrix -#' (internally, the function will call \code{distFunction(t(x))}. This is to -#' be compatible with the input for the \code{dist} function. -#' \code{as.matrix} will be performed on the output of \code{distFunction}, -#' so if the object returned has a \code{as.matrix} method that will convert -#' the output into a symmetric matrix of distances, this is fine (for -#' example the class \code{dist} for objects returned by \code{dist} have -#' such a method). If \code{distFunction=NA}, then a default distance will -#' be calculated based on the type of clustering algorithm of -#' \code{clusterFunction}. For type "K" the default is to take \code{dist} -#' as the distance function. For type "01", the default is to take the -#' (1-cor(x))/2. 
-#' -#' @details Types of algorithms: cluster01 is for clustering functions that -#' expect as an input D that takes on 0-1 values (e.g. from subclustering). -#' clusterK is for clustering functions that require an input k, the number of -#' clusters, but arbitrary distance/dissimilarity matrix. cluster01 and -#' clusterK are given as separate functions in order to allow the user to -#' provide different clustering functions that expect different types of input -#' and for us to provide different shared processing of the results that is -#' different for these different types of clustering methods (for example, -#' removing low silhouette values is appropriate for clusterK clustering -#' functions rather than cluster01 functions). It is also generally expected -#' that cluster01 algorithms use the 0-1 nature of the input to set criteria -#' as to where to find clusters and therefore do not need a pre-determined -#' 'k'. On the other hand, clusterK functions are assumed to need a -#' predetermined 'k' and are also assumed to cluster all samples to a cluster, -#' and therefore clusterK gives options to exclude poorly clustered samples -#' via silhouette distances. -#' -#' @details cluster01 required format for input and output for clusterFunction: -#' clusterFunction should be a function that takes (as a minimum) an argument -#' "D" and "alpha". 0-1 clustering algorithms are expected to use the fact -#' that the D input is 0-1 range to find the clusters, rather than a user -#' defined number of clusters; "alpha" is the parameter that tunes the finding -#' of such clusters. For example, a candidate block of samples might be -#' considered a cluster if all values of D are greater than or equal to -#' 1-alpha. The output is a list with each element corresponding to a cluster -#' and the elements of the list corresponding to the indices of the samples -#' that are in the cluster. The list is expected to be in order of 'best -#' clusters' (as defined by the clusterFunction), with first being the best -#' and last being worst. -#' -#' @details cluster01 methods: "tight" method refers to the method of finding -#' clusters from a subsampling matrix given internally in the tight -#' algorithm code of Tsang and Wong. Arguments for the tight method are -#' 'minSize.core' (default=2), which sets the minimimum number of samples -#' that form a core cluster. "hierarchical01" refers to running the hclust -#' algorithm on D and transversing down the tree until getting a block of -#' samples with whose summary of the values is greater than or equal to -#' 1-alpha. Arguments that can be passed to 'hierarchical' are -#' 'evalClusterMethod' which determines how to summarize the samples' values -#' of D[samples,samples] for comparison to 1-alpha: "maximum" (default) -#' takes the minimum of D[samples,samples] and requires it to be less than -#' or equal to 1-alpha; "average" requires that each row mean of -#' D[samples,samples] be less than or equal to 1-alpha. Arguments of -#' hclust can also be passed via clusterArgs to control the hierarchical -#' clustering of D. -#' -#' @details clusterK required format for input and output for clusterFunction: -#' clusterFunction should be a function that takes as a minimum an argument -#' 'D' and 'k'. The output must be a clustering, specified by integer values. -#' The function \code{\link{silhouette}} will be used on the clustering to -#' calculate silhouette scores for each observation. 
-#' -#' @details clusterK methods: "pam" performs pam clustering on the input -#' \code{D} matrix using \code{\link{pam}} in the cluster package. Arguments -#' to \code{\link{pam}} can be passed via 'clusterArgs', except for the -#' arguments 'x' and 'k' which are given by D and k directly. "hierarchicalK" -#' performs hierarchical clustering on the input via the \code{\link{hclust}} -#' and then applies \code{\link{cutree}} with the specified k to obtain -#' clusters. Arguments to \code{\link{hclust}} can be passed via -#' \code{clusterArgs}. -#' -#' @return clusterD returns a vector of cluster assignments (if format="vector") -#' or a list of indices for each cluster (if format="list"). Clusters less -#' than minSize are removed. If orderBy="size" the clusters are reordered by -#' the size of the cluster, instead of by the internal ordering of the -#' clusterFunction. -#' -#' @return cluster01 and clusterK return a list of indices of the clusters found, -#' which each element of the list corresponding to a cluster and the elements -#' of that list a vector of indices giving the indices of the samples assigned -#' to that cluster. Indices not included in any list are assumed to have not -#' been clustered. The list is assumed to be ordered in terms of the `best' -#' cluster (as defined by the clusterFunction for cluster01 or by average -#' silhoute for clusterK), for example in terms of most internal similarity of -#' the elements, or average silhouette width. -#' -#' @examples -#' data(simData) -#' cl1<-clusterD(simData,clusterFunction="pam",k=3) -#' cl2<-clusterD(simData,clusterFunction="hierarchical01") -#' cl3<-clusterD(simData,clusterFunction="tight") -#' #change distance to manhattan distance -#' cl4<-clusterD(simData,clusterFunction="pam",k=3, -#' distFunction=function(x){dist(x,method="manhattan")}) -#' -#' #run hierarchical method for finding blocks, with method of evaluating -#' #coherence of block set to evalClusterMethod="average", and the hierarchical -#' #clustering using single linkage: -#' clustSubHier <- clusterD(simData, clusterFunction="hierarchical01", alpha=0.1, -#' minSize=5, clusterArgs=list(evalClusterMethod="average", method="single")) -#' -#' #do tight -#' clustSubTight <- clusterD(simData, clusterFunction="tight", alpha=0.1, -#' minSize=5) -#' -#' #two twists to pam -#' clustSubPamK <- clusterD(simData, clusterFunction="pam", silCutoff=0, minSize=5, -#' removeSil=TRUE, k=3) -#' clustSubPamBestK <- clusterD(simData, clusterFunction="pam", silCutoff=0, -#' minSize=5, removeSil=TRUE, findBestK=TRUE, kRange=2:10) -#' -#' # note that passing the wrong arguments for an algorithm results in warnings -#' # (which can be turned off with checkArgs=FALSE) -#' clustSubTight_test <- clusterD(simData, clusterFunction="tight", alpha=0.1, -#' minSize=5, removeSil=TRUE) -#' clustSubTight_test2 <- clusterD(simData, clusterFunction="tight", alpha=0.1, -#' clusterArgs=list(evalClusterMethod="average")) -#' @export -#' @importFrom cluster daisy silhouette pam -clusterD<-function(x=NULL, diss=NULL,clusterFunction=c("hierarchical01","tight","pam","hierarchicalK"), - typeAlg=c("01","K"),distFunction=NA,minSize=1, orderBy=c("size","best"), - format=c("vector","list"),clusterArgs=NULL,checkArgs=TRUE,returnD=FALSE,...){ - input<-.checkXDissInput(x,diss) - passedArgs<-list(...) 
- orderBy<-match.arg(orderBy) - format<-match.arg(format) - clusterFunction<-match.arg(clusterFunction) - if(!is.function(clusterFunction)) typeAlg<-.checkAlgType(clusterFunction) - if(length(passedArgs)>0){ - #get rid of wrong args passed because of user confusion between the two - whRightArgs<-which(names(passedArgs) %in% switch(typeAlg,"01"=.args01,"K"=.argsK)) - if(length(whRightArgs)!=length(passedArgs) & checkArgs) warning("Some arguments passed via '...' do not match the choice of typeAlg") - if(length(whRightArgs)>0) passedArgs<-passedArgs[whRightArgs] - else passedArgs<-NULL - } - ####################### - ### Create distance if needed, and check it. - ####################### - #browser() - #browser() - if(input=="X"){ - if(!is.function(distFunction) && is.na(distFunction)){ - distFunction<-switch(typeAlg,"01"=function(x){(1-cor(t(x)))/2},"K"=function(x){dist(x)}) - } - D<-try(as.matrix(distFunction(t(x)))) #distances assumed to be of observations on rows - if(inherits(D,"try-error")) stop("input distance gives error when applied to x") - if(!all(dim(D) == c(ncol(x),ncol(x)))) stop("distance function must result in a ",ncol(x),"by",ncol(x),"matrix of distances") - if(!all(D==t(D))) stop("distance function must result in a symmetric matrix") - - } - else D<-diss - .checkDistFunction(D) - ####################### - ####Run clustering: - ####################### - if(typeAlg=="01") { - if(any(D>1)) stop("distance function must give values between 0 and 1 for clusterFunction", clusterFunction) - res<-do.call("cluster01",c(list(diss=D,clusterFunction=clusterFunction,clusterArgs=clusterArgs,checkArgs=checkArgs),passedArgs)) - } - if(typeAlg=="K") { - res<-do.call("clusterK",c(list(diss=D,clusterFunction=clusterFunction,clusterArgs=clusterArgs,checkArgs=checkArgs),passedArgs)) - } - - ####################### - #Now format into desired output - ####################### - N<-nrow(D) - clusterSize<-sapply(res, length) - if(length(res)>0) res <- res[clusterSize>=minSize] - if(length(res)==0){ #No clusters pass -# if(format=="list") return(res) -# else return(rep(-1,nrow(D))) - if(format=="vector") res<-rep(-1,nrow(D)) - } - else{ - #now reorders final groups by size - if(orderBy=="size"){ - clusterSize<-sapply(res, length) #redo because dropped! - res <- res[order(clusterSize,decreasing=TRUE)] - } - names(res)<-as.character(1:length(res)) - - #if(format=="list") return(res) - if(format=="vector"){ - - valMat<-do.call("rbind",mapply(res,names(res),FUN=function(ind,val){cbind(ind,rep(as.numeric(val),length=length(ind)))},SIMPLIFY=FALSE)) - clusterVec<-rep("-1",length=N) - clusterVec[valMat[,1]]<-valMat[,2] - clusterVec<-as.numeric(clusterVec) - names(clusterVec)<-rownames(D) - res<-clusterVec #return(clusterVec) - } - } - if(!returnD) return(res) - else return(list(result=res,D=D)) -} - -.args01<-c("alpha") -#' @rdname clusterD -cluster01<-function(diss, clusterFunction=c("hierarchical01","tight"), alpha=0.1, clusterArgs=NULL,checkArgs) -{ - D<-diss - if(!is.function(clusterFunction)){ - method<-match.arg(clusterFunction) - ##These return lists of indices of clusters satisifying alpha criteria - if(method=="tight") clusterFunction<-.tightClusterDMat - if(method=="hierarchical01") clusterFunction<-.hier01ClusterDMat - } - res<-do.call(clusterFunction,c(list(D=D,alpha=alpha,checkArgs=checkArgs),clusterArgs)) - return(res) -} -.hier01ClusterDMat<-function(D,alpha,evalClusterMethod=c("maximum","average"),whichHierDist=c("dist","D"),checkArgs,...) 
-{
-    whichHierDist<-match.arg(whichHierDist)
-    evalClusterMethod<-match.arg(evalClusterMethod)
-    if(is.null(rownames(D))) rownames(D)<-colnames(D)<-as.character(1:nrow(D))
-    passedArgs<-list(...)
-    hclustArgs<-names(as.list(args(stats::hclust)))
-    if(any(!names(passedArgs) %in% hclustArgs)){
-        wh<-which(!names(passedArgs) %in% hclustArgs)
-        passedArgs<-passedArgs[-wh]
-        if(checkArgs) warning("arguments passed via clusterArgs to hierarchical clustering method not all applicable (should only be arguments to hclust). Will be ignored")
-    }
-    #use to be (when D was similarity matrix):
-    #hDmat<-do.call(stats::hclust,c(list(d=dist(D)),passedArgs))
-#    browser()
-    S<-round(1-D,10)
-    d<-switch(whichHierDist,"dist"=dist(S),"D"=as.dist(D))
-    hDmat<-do.call(stats::hclust,c(list(d=d),passedArgs))
-
-    method<-evalClusterMethod
-    phylo4Obj<-.makePhylobaseTree(hDmat,"hclust")
-    allTips<-phylobase::getNode(phylo4Obj, type=c("tip"))
-    #each internal node (including root) calculate whether passes value of alpha or not
-    nodesToCheck<-phylobase::rootNode(phylo4Obj)
-    clusterList<-list()
-
-    #was slower with this code:
-    # allInternal<-phylobase::getNode(phylo4Obj, type=c("internal"))
-    # allTipsByInternal<-lapply(allInternal,function(currNode){names(phylobase::descendants(phylo4Obj,currNode,"tip"))})
-    # allChecks<-sapply(allTipsByInternal,function(currTips){
-    #     if(method=="maximum") check<-all(D[currTips,currTips,drop=FALSE]>=(1-alpha))
-    #     if(method=="average") check<-all(rowMeans(D[currTips,currTips,drop=FALSE])>=(1-alpha))
-    #     return(check)
-    # })
-#    names(allChecks)<-names(allInternal)
-    while(length(nodesToCheck)>0){
-        currNode<-nodesToCheck[1]
-        nodesToCheck<-nodesToCheck[-1]
-        if(currNode%in%allTips){ #block of size 1!
-            currTips<-names(currNode)
-            check<-TRUE
-        }
-        else{
-            # wh<-match(currNode,allInternal)
-            # currTips<-allTipsByInternal[[wh]]
-            # check<-allChecks[[wh]]
-            currTips<-names(phylobase::descendants(phylo4Obj,currNode,"tip"))
-            if(method=="maximum") check<-all(S[currTips,currTips,drop=FALSE]>=(1-alpha))
-            if(method=="average") check<-all(rowMeans(S[currTips,currTips,drop=FALSE])>=(1-alpha))
-
-        }
-        if(check){ #found a block that satisfies
-            clusterList<-c(clusterList,list(currTips))
-        }
-        else{ #not satisfy
-            childNodes<-phylobase::descendants(phylo4Obj,currNode,"children")
-            nodesToCheck<-c(nodesToCheck,childNodes)
-        }
-    }
-#    browser()
-    clusterListIndices<-lapply(clusterList,function(tipNames){
-        match(tipNames,rownames(D))
-    })
-    clusterListIndices<-.orderByAlpha(clusterListIndices,S)
-    return(clusterListIndices)
-}
-.orderByAlpha<-function(res,S)
-{
-    if(length(res)>0){
-        alphaMax<-unlist(lapply(res, function(x){
-            vals<-lower.tri(S[x,x]) #don't grab diag
-            1-min(vals) #max(alpha)=1-min(S)
-        }))
-        res <- res[order(alphaMax, decreasing=TRUE)]
-
-    }
-    else return(res)
-}
-.tightClusterDMat <- function(D, alpha, minSize.core=2,checkArgs,...)
-{
-    #previously, D was similarity matrix. To make it match in clusterD, I need it to be D=1-similarity
-    #so now convert it back
-    S<-1-D #now is similarity matrix...
-    if(length(list(...))>0 & checkArgs) warning("some arguments passed via clusterArgs to tight clustering method are not applicable")
-    find.candidates.one <- function(x) {
-        tmp <- apply(x >= 1, 1, sum) #how many in each row ==1
-        #what if none of them are ==1? will this never happen because have sample of size 1? Depends what diagonal is.
-        if(all(tmp<=1)) return(NULL)
-        whMax<-which(tmp==max(tmp))
-        return(which(x[whMax[1], ] >= 1)) # assumes x is symmetric. Takes largest in size, but arbitrarily picks between them.
- } - extend.candidate <- function(S, can, alpha ) { - can.ex <- which(apply(as.matrix(S[, can] >= 1 - alpha), 1, all)) #find those that are close to those core with 1 - S.temp <- S[can.ex, can.ex] - if (!is.matrix(S.temp)) { - S.temp <- as.matrix(S.temp) - colnames(S.temp) <- names(can.ex) - } - S.bad <- apply(as.matrix(S.temp < 1 - alpha), 1,sum) - while (sum(S.bad) > 0) { - index <- which(S.bad == max(S.bad))[1] - S.temp <- S.temp[-index, -index] - S.bad <- apply(as.matrix(S.temp < 1 - alpha), - 1, sum) - } - return(can.ex[colnames(S.temp)]) - } - if(is.null(dim(S)) || dim(S)[1]!=dim(S)[2] || any(t(S)!=S)) stop("S must be a symmetric matrix") - N<-nrow(S) - colnames(S) <- 1:N - rownames(S) <- 1:N - i = 1 - S.temp <- S - res <- list() - while (!is.null(dim(S.temp)) && !is.null(dim(S.temp)) && nrow(S.temp) > 0 & any(S.temp[lower.tri(S.temp)]>1-alpha) & any(S.temp[lower.tri(S.temp)]==1)) { - #first find those that are always together (resampling =1); pick the largest such group (and if more than 1 of same size will arbitrarily pick one) - candidate.one <- find.candidates.one(S.temp) - if(is.null(candidate.one)){#no more candidates with core always together - #for now just stop if no core group - break - } - #add more on the group if always resamples with the core members >alpha proportion of the time - candidate <- extend.candidate(S.temp, candidate.one, alpha = alpha) - S.temp <- S.temp[-candidate, -candidate] - res[[i]] <- names(candidate) - mode(res[[i]]) <- "numeric" - i = i + 1 - } - res<-.orderByAlpha(res,S) - return(res) - -} - - - - -.argsK<-c("findBestK","k","kRange","removeSil","silCutoff") -#' @rdname clusterD -clusterK<-function(diss, clusterFunction=c("pam","hierarchicalK"),findBestK=FALSE, k, kRange,removeSil=FALSE,silCutoff=0,clusterArgs=NULL,checkArgs) -{ - D<-diss - if(!findBestK && missing(k)) stop("If findBestK=FALSE, must provide k") - if(findBestK){ - if(missing(kRange)){ - if(!missing(k)) kRange<-(k-2):(k+20) - else kRange<-2:20 - } - if(any(kRange<2)){ - kRange<-kRange[kRange>=2] - if(length(kRange)==0) stop("Undefined values for kRange; must be greater than or equal to 2") - } - } - ##These return lists of indices of clusters satisifying alpha criteria - if(!is.function(clusterFunction)){ - method<-match.arg(clusterFunction) - if(method =="pam") clusterFunction<-function(D,k,checkArgs,...){ - passedArgs<-list(...) - pamArgs<-names(as.list(args(cluster::pam))) - if(any(wh<-!names(passedArgs) %in% pamArgs)){ - passedArgs<-passedArgs[-which(wh)] - if(checkArgs) warning("arguments passed via clusterArgs to pam not all applicable (should only be arguments to pam). Will be ignored") - } - do.call(cluster::pam,c(list(x=D,k=k,diss=TRUE,cluster.only=TRUE),passedArgs)) - - } - if(method =="hierarchicalK") clusterFunction<-function(D,k,checkArgs,...){ - passedArgs<-list(...) - hierArgs<-names(as.list(args(stats::hclust))) - if(any(wh<-!names(passedArgs) %in% hierArgs)){ - passedArgs<-passedArgs[-which(wh)] - if(checkArgs) warning("arguments passed via clusterArgs to pam not all applicable (should only be arguments to pam). 
Will be ignored") - } -# browser() - hclustOut<-do.call(stats::hclust,c(list(d=as.dist(D)),passedArgs)) - cutree(hclustOut,k) - } - } - - - if(findBestK) ks<-kRange else ks<-k - if(any(ks>= nrow(D))) ks<-ks[ks1){ - whichBest<-which.max(sapply(silClusters, mean)) - finalCluster<-clusters[[whichBest]] - sil<-silClusters[[whichBest]][,"sil_width"] - } - else{ - finalCluster<-clusters[[1]] - sil<-silClusters[[1]][,"sil_width"] - } - if(removeSil){ - cl<-as.numeric(sil>silCutoff) - cl[cl==0]<- -1 - cl[cl>0]<-finalCluster[cl>0] - sil[cl == -1] <- -Inf #make the -1 cluster the last one in order - } - else{ - cl<-finalCluster - } - - #make list of indices and put in order of silhouette width (of positive) - clList<-tapply(1:length(cl),cl,function(x){x},simplify=FALSE) - clAveWidth<-tapply(sil,cl,mean,na.rm=TRUE) - clList[order(clAveWidth,decreasing=TRUE)] - - #remove -1 group - if(removeSil){ - whNotAssign<-which(sapply(clList,function(x){all(cl[x]== -1)})) - if(length(whNotAssign)>1) stop("Coding error in removing unclustered samples") - if(length(whNotAssign)>0) clList<-clList[-whNotAssign] - } - return(clList) - -} - - diff --git a/R/clusterMany.R b/R/clusterMany.R index 5709d64c..a521f763 100644 --- a/R/clusterMany.R +++ b/R/clusterMany.R @@ -1,119 +1,133 @@ #' Create a matrix of clustering across values of parameters -#' -#' Given a range of parameters, this funciton will return a matrix with the -#' clustering of the samples across the range, which can be passed to +#' +#' Given a range of parameters, this function will return a matrix with the +#' clustering of the samples across the range, which can be passed to #' \code{plotClusters} for visualization. -#' +#' #' @aliases clusterMany -#' -#' @param x the data on which to run the clustering. Can be: matrix (with genes -#' in rows), a list of datasets overwhich the clusterings should be run, a -#' \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object. -#' @param ks the range of k values (see details for meaning for different -#' choices). -#' @param alphas values of alpha to be tried. Only used for clusterFunctions of -#' type '01' (either 'tight' or 'hierarchical01'). Determines tightness -#' required in creating clusters from the dissimilarity matrix. Takes on -#' values in [0,1]. See \code{\link{clusterD}}. +#' +#' @param x the data matrix on which to run the clustering. Can be: matrix (with +#' genes in rows), a list of datasets overwhich the clusterings should be run, +#' a \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object. +#' @param ks the range of k values (see details for the meaning of \code{k} for +#' different choices of other parameters). +#' @param alphas values of alpha to be tried. Only used for clusterFunctions of +#' type '01'. Determines tightness required in creating clusters from the +#' dissimilarity matrix. Takes on values in [0,1]. See documentation of +#' \code{\link{ClusterFunction}}. #' @param betas values of \code{beta} to be tried in sequential steps. Only used -#' for \code{sequential=TRUE}. Determines the similarity between two clusters +#' for \code{sequential=TRUE}. Determines the similarity between two clusters #' required in order to deem the cluster stable. Takes on values in [0,1]. See -#' \code{\link{seqCluster}}. -#' @param clusterFunction function used for the clustering. Note that unlike in +#' documentation of \code{\link{seqCluster}}. +#' @param clusterFunction function used for the clustering. 
 #' \code{\link{clusterSingle}}, this must be a character vector of pre-defined
-#' clustering techniques provided by \code{\link{clusterSingle}}, and can not
-#' be a user-defined function. Current functions are "tight",
-#' "hierarchical01","hierarchicalK", and "pam"
-#' @param minSizes the minimimum size required for a cluster (in
-#' \code{clusterD}). Clusters smaller than this are not kept and samples are
-#' left unassigned.
-#' @param distFunction a vector of character strings that are the names of
-#' distance functions found in the global environment. See the help pages of
-#' \code{\link{clusterD}} for details about the required format of distance
-#' functions. Currently, this distance function must be applicable for all
-#' clusterFunction types tried. Therefore, it is not possible to intermix type "K"
-#' and type "01" algorithms if you also give distances to evaluate via
-#' \code{distFunction} unless all distances give 0-1 values for the distance
-#' (and hence are possible for both type "01" and "K" algorithms).
-#' @param nVarDims vector of the number of the most variable features to keep
-#' (when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is
+#' clustering techniques, and cannot be a user-defined function. Current
+#' functions can be found by typing \code{listBuiltInFunctions()} into the
+#' command-line.
+#' @param minSizes the minimum size required for a cluster (in the
+#' \code{mainClustering} step). Clusters smaller than this are not kept and samples
+#' are left unassigned.
+#' @param distFunction a vector of character strings that are the names of
+#' distance functions found in the global environment. See the help pages of
+#' \code{\link{clusterSingle}} for details about the required format of
+#' distance functions. Currently, this distance function must be applicable
+#' for all clusterFunction types tried. Therefore, it is not possible in
+#' \code{clusterMany} to intermix type "K" and type "01" algorithms if you
+#' also give distances to evaluate via \code{distFunction} unless all
+#' distances give 0-1 values for the distance (and hence are possible for both
+#' type "01" and "K" algorithms).
+#' @param nVarDims vector of the number of the most variable features to keep
+#' (when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is
 #' included, then the full dataset will also be included.
-#' @param nPCADims vector of the number of PCs to use (when 'PCA' is identified
+#' @param nPCADims vector of the number of PCs to use (when 'PCA' is identified
 #' in \code{dimReduce}). If NA is included, then the full dataset will also be
 #' included.
-#' @param eraseOld logical. Only relevant if input \code{x} is of class
-#' \code{ClusterExperiment}. If TRUE, will erase existing workflow results
-#' (clusterMany as well as mergeClusters and combineMany). If FALSE, existing
-#' workflow results will have "\code{_i}" added to the clusterTypes value,
-#' where \code{i} is one more than the largest such existing workflow
+#' @param eraseOld logical. Only relevant if input \code{x} is of class
+#' \code{ClusterExperiment}. If TRUE, will erase existing workflow results
+#' (clusterMany as well as mergeClusters and combineMany). If FALSE, existing
+#' workflow results will have "\code{_i}" added to the clusterTypes value,
+#' where \code{i} is one more than the largest such existing workflow
 #' clusterTypes.
+#' @param findBestK logical, whether to find the best K based on average
+#' silhouette width (only used when clusterFunction of type "K").
+#' @param silCutoff Requirement on minimum silhouette width to be included in
+#' cluster (only for combinations where removeSil=TRUE).
+#' @param removeSil logical as to whether to remove samples when silhouette <
+#' silCutoff (only used if clusterFunction of type "K")
 #' @inheritParams clusterSingle
-#' @inheritParams clusterD
+#' @inheritParams mainClustering
 #' @param ncores the number of threads
-#' @param random.seed a value to set seed before each run of clusterSingle (so
-#' that all of the runs are run on the same subsample of the data). Note, if
-#' 'random.seed' is set, argument 'ncores' should NOT be passed via
-#' subsampleArgs; instead set the argument 'ncores' of
-#' clusterMany directly (which is preferred for improving speed anyway).
+#' @param random.seed a value to set seed before each run of clusterSingle (so
+#' that all of the runs are run on the same subsample of the data). Note, if
+#' 'random.seed' is set, argument 'ncores' should NOT be passed via
+#' subsampleArgs; instead set the argument 'ncores' of clusterMany directly
+#' (which is preferred for improving speed anyway).
 #' @param run logical. If FALSE, doesn't run clustering, but just returns matrix
 #' of parameters that will be run, for the purpose of inspection by user (with
-#' rownames equal to the names of the resulting column names of clMat object
-#' that would be returned if \code{run=TRUE}). Even if \code{run=FALSE},
+#' rownames equal to the names of the resulting column names of clMat object
+#' that would be returned if \code{run=TRUE}). Even if \code{run=FALSE},
 #' however, the function will create the dimensionality reductions of the data
 #' indicated by the user input.
-#' @param ... For signature \code{list}, arguments to be passed on to mclapply
-#' (if ncores>1). For all the other signatures, arguments to be passed to the
+#' @param ... For signature \code{list}, arguments to be passed on to mclapply
+#' (if ncores>1). For all the other signatures, arguments to be passed to the
 #' method for signature \code{list}.
 #' @param verbose logical. If TRUE it will print informative messages.
-#' @details While the function allows for multiple values of clusterFunction,
-#' the code does not reuse the same subsampling matrix and try different
-#' clusterFunctions on it. If sequential=TRUE, different
-#' subsampleclusterFunctions will create different sets of data to subsample
-#' so it is not possible; if sequential=FALSE, we have not implemented
-#' functionality for this reuse. Setting the \code{random.seed} value,
-#' however, should mean that the subsampled matrix is the same for each, but
-#' there is no gain in computational complexity (i.e. each subsampled
+#' @details Some combinations of these parameters are not feasible. See the
+#' documentation of \code{\link{clusterSingle}} for important information on
+#' how these parameter choices interact.
+#' @details While the function allows for multiple values of clusterFunction,
+#' the code does not reuse the same subsampling matrix and try different
+#' clusterFunctions on it. This is because if sequential=TRUE, different
+#' subsample clusterFunctions will create different sets of data to subsample
+#' so it is not possible; if sequential=FALSE, we have not implemented
+#' functionality for this reuse. Setting the \code{random.seed} value,
+#' however, should mean that the subsampled matrix is the same for each, but
+#' there is no gain in computational complexity (i.e. each subsampled
 #' co-occurence matrix is recalculated for each set of parameters).
-#'
-#' @details The argument 'ks' is interpreted differently for different choices
-#' of the other parameters. When/if sequential=TRUE, ks defines the argument
-#' k0 of \code{\link{seqCluster}}. Otherwise, 'ks' values are set in both
-#' subsampleArgs[["k"]] and clusterDArgs[["k"]] that are passed to
-#' \code{\link{clusterD}} and \code{\link{subsampleClustering}}. This passing
-#' of these arguments via \code{subsampleArgs[["k"]]} will only have an effect
-#' if `subsample=TRUE`. Similarly, the passing of \code{clusterDArgs[["k"]]}
-#' will only have an effect when the clusterFunction argument includes a
-#' clustering algorithm of type "K". When/if "findBestK=TRUE", ks also defines
-#' the kRange argument of \code{\link{clusterD}} unless kRange is specified by
-#' the user via the clusterDArgs; note this means that the default option of
-#' setting kRange that depends on the input k (see \code{\link{clusterD}}) is
-#' not available in clusterMany.
-#' @details If the input is a \code{ClusterExperiment} object, currently
-#' existing \code{orderSamples},\code{coClustering} or dendrogram slots will
-#' be retained.
-#' @return If \code{run=TRUE} and the input is either a matrix, a
-#' \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object,
+#'
+#' @details The argument \code{ks} is interpreted differently for different
+#' choices of the other parameters. When/if sequential=TRUE, \code{ks} defines
+#' the argument \code{k0} of \code{\link{seqCluster}}. Otherwise, \code{ks}
+#' values are the \code{k} values for \strong{both} the mainClustering and
+#' subsampling step (i.e. assigned to the \code{subsampleArgs} and
+#' \code{mainClusterArgs} that are passed to \code{\link{mainClustering}} and
+#' \code{\link{subsampleClustering}}), unless \code{k} is set appropriately in
+#' \code{subsampleArgs}. The passing of these arguments via
+#' \code{subsampleArgs} will only have an effect if `subsample=TRUE`.
+#' Similarly, the passing of \code{mainClusterArgs[["k"]]} will only have an
+#' effect when the clusterFunction argument includes a clustering algorithm of
+#' type "K". When/if "findBestK=TRUE", \code{ks} also defines the
+#' \code{kRange} argument of \code{\link{mainClustering}} unless \code{kRange} is
+#' specified by the user via the \code{mainClusterArgs}; note this means that the
+#' default option of setting \code{kRange} that depends on the input \code{k}
+#' (see \code{\link{mainClustering}}) is not available in \code{clusterMany}, only
+#' in \code{\link{clusterSingle}}.
+#' @details If the input is a \code{ClusterExperiment} object, the current
+#' implementation is that existing \code{orderSamples},\code{coClustering} or
+#' the many dendrogram slots will be retained.
+#' @return If \code{run=TRUE} and the input is either a matrix, a
+#' \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object,
Arbitrarily the first clustering is set as the +#' as clusterings with clusterTypes \code{clusterMany}. Depending on +#' \code{eraseOld} argument above, this will either delete existing such +#' objects, or change the clusterTypes of existing objects. See argument +#' \code{eraseOld} above. Arbitrarily the first clustering is set as the #' primaryClusteringIndex. -#' -#' @return If \code{run=TRUE} and the input is a list of data sets, a list with -#' the following objects: \itemize{ \item{\code{clMat}}{ a matrix with each -#' column corresponding to a clustering and each row to a sample.} -#' \item{\code{clusterInfo}}{ a list with information regarding clustering +#' +#' @return If \code{run=TRUE} and the input is a list of data sets, a list with +#' the following objects: \itemize{ \item{\code{clMat}}{ a matrix with each +#' column corresponding to a clustering and each row to a sample.} +#' \item{\code{clusterInfo}}{ a list with information regarding clustering #' results (only relevant entries for those clusterings with sequential=TRUE)} -#' \item{\code{paramMatrix}}{ a matrix giving the parameters of each -#' clustering, where each column is a possible parameter set by the user and -#' passed to \code{\link{clusterSingle}} and each row of paramMatrix -#' corresponds to a clustering in \code{clMat}} \item{\code{clusterDArgs}}{ a -#' list of (possibly modified) arguments to clusterDArgs} -#' \item{\code{seqArgs=seqArgs}}{a list of (possibly modified) arguments to -#' seqArgs} \item{\code{subsampleArgs}}{a list of (possibly modified) +#' \item{\code{paramMatrix}}{ a matrix giving the parameters of each +#' clustering, where each column is a possible parameter set by the user and +#' passed to \code{\link{clusterSingle}} and each row of paramMatrix +#' corresponds to a clustering in \code{clMat}} \item{\code{mainClusterArgs}}{ a +#' list of (possibly modified) arguments to mainClusterArgs} +#' \item{\code{seqArgs=seqArgs}}{a list of (possibly modified) arguments to +#' seqArgs} \item{\code{subsampleArgs}}{a list of (possibly modified) #' arguments to subsampleArgs} } -#' @return If \code{run=FALSE} a list similar to that described above, but +#' @return If \code{run=FALSE} a list similar to that described above, but #' without the clustering results. #' #' @examples @@ -149,7 +163,7 @@ #' system.time(clusterTrack <- clusterMany(simData, ks=2:15, #' alphas=c(0.1,0.2,0.3), findBestK=c(TRUE,FALSE), sequential=c(FALSE), #' subsample=c(FALSE), removeSil=c(TRUE), clusterFunction="pam", -#' clusterDArgs=list(minSize=5, kRange=2:15), ncores=1, random.seed=48120)) +#' mainClusterArgs=list(minSize=5, kRange=2:15), ncores=1, random.seed=48120)) #' } #' #' @rdname clusterMany @@ -169,8 +183,8 @@ # # clSmaller <- clusterMany(simData, nPCADims=c(5,10,50), dimReduce="PCA", # paramMatrix=checkParamsMat, subsampleArgs=checkParams$subsampleArgs, -# seqArgs=checkParams$seqArgs, clusterDArgs=checkParams$clusterDArgs) -#' @export +# seqArgs=checkParams$seqArgs, mainClusterArgs=checkParams$mainClusterArgs) +#' @export setMethod( f = "clusterMany", signature = signature(x = "matrix"), @@ -223,14 +237,14 @@ setMethod( silCutoff=0, distFunction=NA, betas=0.9, minSizes=1, verbose=FALSE, - clusterDArgs=NULL, + mainClusterArgs=NULL, subsampleArgs=NULL, seqArgs=NULL, ncores=1, random.seed=NULL, run=TRUE, ... 
) {
-  paramMatrix<-NULL
+  paramMatrix<-NULL
   data <- x
   if(!is.null(random.seed)){
     if(!is.null(subsampleArgs) && "ncores" %in% names(subsampleArgs)){
@@ -240,7 +254,7 @@ setMethod(
     if(!all(sapply(data, function(y){is.matrix(y) || is.data.frame(y)}))) {
       stop("if data is a list, it must be a list with each element of the list a data.frame or matrix")
     }
-    #check all same number of observations:
+    #check all same number of observations: Why do we have this check?? Why does it matter??
     if(!length(unique(sapply(data,NCOL)))==1) {
       stop("All data sets must have the same number of observations")
     }
@@ -261,39 +275,41 @@ setMethod(
    #code sets to single value and then will do unique
    #also deals with just in case the user gave duplicated values of something by mistake.
    ###########
-    typeK <- which(param[,"clusterFunction"] %in% c("pam","hierarchicalK"))
+    paramAlgTypes<-algorithmType(param[,"clusterFunction"])
+    if(length(paramAlgTypes)!=nrow(param)) stop("Internal coding error in clusterMany: not getting right number of type of algorithms from param")
+    typeK <- which( paramAlgTypes=="K")
    if(length(typeK)>0){
-      param[typeK,"alpha"] <- NA #just a nothing value, because doesn't mean anything here
-      #param[typeK,"beta"] <- NA #just a nothing value, because doesn't mean anything here
-
-      #if findBestK make sure other arguments make sense:
+      param[typeK,"alpha"] <- NA #just a nothing value, because doesn't mean anything here
+      #--------
+      #if findBestK make sure other arguments make sense:
+      #--------
      whFindBestK <- which(param[,"findBestK"])
      if(length(whFindBestK)>0){
-        #by default make kRange in clusterD equal to the ks. Note this will be true of ALL
-        if(!"kRange" %in% names(clusterDArgs)) {
-          clusterDArgs[["kRange"]]<-ks
+        #by default make kRange in mainClustering equal to the ks. Note this will be true of ALL
+        if(!"kRange" %in% names(mainClusterArgs)) {
+          mainClusterArgs[["kRange"]]<-ks
        }
-        #if findBestK=TRUE, and sequential=FALSE, then need to set 'k'=NA
        whNoSeq <- which(!param[,"sequential"])
        if(length(intersect(whFindBestK,whNoSeq))>0){
          param[intersect(whFindBestK,whNoSeq),"k"] <- NA
        }
-
-        #and if subsample=TRUE, then user needs to set k via subsampleArgs
+        #and if subsample=TRUE, then user needs to set k via subsampleArgs
+        ##Might handle this better by a call to .checkSubsampleClusterDArgs
        whNoSeqSub <- which(!param[,"sequential"] & param[,"subsample"])
        if(length(intersect(whFindBestK,whNoSeqSub))>0 &
-         is.null(subsampleArgs[["k"]])) {
-          stop("must provide k in subsampleArgs because there are combinations of findBestK=TRUE, sequential=FALSE and subsample=TRUE. (Note this will set 'k' for all that subsample, even for other parameter combinations)")
+         is.null(subsampleArgs[["clusterArgs"]]) && is.null(subsampleArgs[["clusterArgs"]][["k"]])){
+          stop("must provide k in 'clusterArgs' element of 'subsampleArgs' because there are combinations of findBestK=TRUE, sequential=FALSE and subsample=TRUE. (Note this will set 'k' for all combinations that subsample, not just this parameter combination)")
        }
      }
    }
-    type01 <- which(param[,"clusterFunction"] %in% c("hierarchical01","tight"))
+    type01 <- which( paramAlgTypes=="01")
    if(length(type01)>0){
      param[type01,"findBestK"] <- FALSE
      param[type01,"removeSil"] <- FALSE
      param[type01,"silCutoff"] <- 0
    }
+    ##Turn off distFunction for those that subsample, because will use that of co-occurrence
    whSubsample<-which(param[,"subsample"])
    if(length(whSubsample)>0){
      param[whSubsample,"distFunction"]<-NA
@@ -314,7 +330,9 @@ setMethod(

    #####
    #deal with those that are invalid combinations:
-    #####
+    # Might handle this better by a call to .checkSubsampleClusterDArgs for each parameter combination
+    # Also, if ever reinstate param option, then should apply these checks to that param
+    ######
    whInvalid <- which(!param[,"subsample"] & param[,"sequential"] &
                         param[,"findBestK"])
    if(length(whInvalid)>0) {
@@ -332,8 +350,8 @@ setMethod(
      param<-param[-whInvalid,]
    }

-    #if type K and not findBestK, need to give the k value.
-    whInvalid <- which(is.na(param[,"k"]) & !param[,"findBestK"] & param[,"clusterFunction"] %in% c("pam","hierarchicalK") )
+    #if type K and not findBestK, need to give the k value.
+    whInvalid <- which(is.na(param[,"k"]) & !param[,"findBestK"] & algorithmType(param[,"clusterFunction"])=="K" )
    if(length(whInvalid)>0){
      param<-param[-whInvalid,]
    }
@@ -353,13 +371,11 @@ setMethod(
    } else {
      stop("set of parameters imply only 1 combination. If you wish to run a single clustering, use 'clusterSingle'")
    }
-
    cnames <- gsub("dataset=","",cnames)
    cnames <- gsub("= ","=",cnames)
-    cnames[param[,"sequential"]] <- gsub("k=", "k0=",
-                                         cnames[param[,"sequential"]])
+    cnames[param[,"sequential"]] <- gsub("k=", "k0=", cnames[param[,"sequential"]])
    rownames(param) <- cnames
-  } else{
+  } else{ #if paramMatrix!=NULL, have killed off this code for now, because doesn't work.
    if(!run) {
      stop("If paramMatrix is given, run should be TRUE. Otherwise there is no effect.")
    }
@@ -371,12 +387,13 @@ setMethod(
      stop("input paramMatrix must have row names")
    }
    cnames<-rownames(paramMatrix)
-  }
+  }
  if(verbose) {
    cat(nrow(param),"parameter combinations,",sum(param[,"sequential"]),"use sequential method.\n")
  }
-
+  if(is.null(mainClusterArgs)) mainClusterArgs<-list(clusterArgs=list())
+  if(is.null(subsampleArgs)) subsampleArgs<-list(clusterArgs=list())
  paramFun <- function(i){
    par <- param[i,]
    #make them logical values... otherwise adds a space before the TRUE and doesn't recognize.
@@ -392,34 +409,34 @@ setMethod(
      seqArgs[["k0"]] <- par[["k"]]
    } else{
      #to be safe, set both in case user set one.
-      subsampleArgs[["k"]] <- par[["k"]]
-      clusterDArgs[["k"]] <- par[["k"]]
+      subsampleArgs[["clusterArgs"]][["k"]] <- par[["k"]]
+      mainClusterArgs[["clusterArgs"]][["k"]] <- par[["k"]]
      }
    }
-    #browser()
-    clusterDArgs[["alpha"]] <- par[["alpha"]]
+    mainClusterArgs[["clusterArgs"]][["alpha"]] <- par[["alpha"]]
    seqArgs[["beta"]] <- par[["beta"]]
-    clusterDArgs[["minSize"]] <- par[["minSize"]]
-    clusterDArgs[["findBestK"]] <- findBestK
-    clusterDArgs[["removeSil"]] <- removeSil
-    clusterDArgs[["silCutoff"]] <- par[["silCutoff"]]
-    clusterDArgs[["checkArgs"]] <- FALSE #turn off printing of warnings that arguments off
+    mainClusterArgs[["minSize"]] <- par[["minSize"]]
+    mainClusterArgs[["findBestK"]] <- findBestK
+    mainClusterArgs[["removeSil"]] <- removeSil
+    mainClusterArgs[["silCutoff"]] <- par[["silCutoff"]]
+    mainClusterArgs[["checkArgs"]] <- FALSE #turn off printing of warnings when arguments don't apply
+    mainClusterArgs[["clusterFunction"]]<-clusterFunction
    seqArgs[["verbose"]]<-FALSE
    if(!is.null(random.seed)) {
      set.seed(random.seed)
    }
+    ##Note that currently, checkDiss=FALSE also turns off warnings about arguments
    if(!is.null(distFunction)){
      diss<- allDist[[paste(as.character(par[["dataset"]]),distFunction,sep="--")]]
-      clusterSingle(x=dataList[[as.character(par[["dataset"]])]], diss=diss,subsample=subsample,
-                    clusterFunction=clusterFunction, clusterDArgs=clusterDArgs,
+      clusterSingle(x=dataList[[as.character(par[["dataset"]])]], diss=diss,subsample=subsample, dimReduce="none",
+                    mainClusterArgs=mainClusterArgs,
                    subsampleArgs=subsampleArgs, seqArgs=seqArgs,
-                    sequential=sequential, transFun=function(x){x}) #dimReduce=dimReduce,ndims=ndims,
-    }
+                    sequential=sequential, transFun=function(x){x},checkDiss=FALSE)
    }
    else clusterSingle(x=dataList[[as.character(par[["dataset"]])]], subsample=subsample,
-                    clusterFunction=clusterFunction, clusterDArgs=clusterDArgs,
+                    mainClusterArgs=mainClusterArgs, dimReduce="none",
                    subsampleArgs=subsampleArgs, seqArgs=seqArgs,
-                    sequential=sequential, transFun=function(x){x}) #dimReduce=dimReduce,ndims=ndims,
-  }
+                    sequential=sequential, transFun=function(x){x},checkDiss=FALSE)
+  }
  if(run){
    ##Calculate distances necessary only once
    if(any(!is.na(param[,"distFunction"]))){
@@ -429,9 +446,11 @@ setMethod(
      allDist<-lapply(1:nrow(distParam),function(ii){
        distFun<-as.character(distParam[ii,"distFunction"])
        dataName<-as.character(distParam[ii,"dataset"])
-        fun<-get(distFun,envir=globalenv())
-        distMat<-as.matrix(fun(t(dataList[[dataName]])))
-        .checkDistFunction(distMat) #check it here!
+        algCheckType<-if(any(paramAlgTypes=="01")) "01" else "K" #be conservative and check for the 01 type if any of clusterFunctions are 01.
+        distMat<-.makeDiss(dataList[[dataName]],distFunction=distFun,checkDiss=TRUE,algType=algCheckType)
+        # fun<-get(distFun,envir=globalenv())
+        # distMat<-as.matrix(fun(t(dataList[[dataName]])))
+        # .checkDistFunction(distMat) #check it here!
        return(distMat)
      })
      names(allDist)<-paste(distParam[,"dataset"],distParam[,"distFunction"],sep="--")
@@ -466,13 +485,13 @@ setMethod(
    }, SIMPLIFY=FALSE)
    return(list(clMat=clMat, clusterInfo=clInfo, paramMatrix=param,
-                clusterDArgs=clusterDArgs, seqArgs=seqArgs,
+                mainClusterArgs=mainClusterArgs, seqArgs=seqArgs,
                subsampleArgs=subsampleArgs))
  } else{
    if(verbose) {
      cat("Returning Parameter Combinations without running them (to run them choose run=TRUE)\n")
    }
-    return(list(paramMatrix=param, clusterDArgs=clusterDArgs, seqArgs=seqArgs,
+    return(list(paramMatrix=param, mainClusterArgs=mainClusterArgs, seqArgs=seqArgs,
                subsampleArgs=subsampleArgs))
  }
}
diff --git a/R/clusterSingle.R b/R/clusterSingle.R
index 2ffbbaf9..f8654511 100644
--- a/R/clusterSingle.R
+++ b/R/clusterSingle.R
@@ -1,54 +1,146 @@
 #' General wrapper method to cluster the data
-#'
-#' Given a data matrix, \code{\link{SummarizedExperiment}}, or
-#' \code{\link{ClusterExperiment}} object, this function will find clusters,
+#'
+#' Given input data, \code{\link{SummarizedExperiment}}, or
+#' \code{\link{ClusterExperiment}} object, this function will find clusters,
 #' based on a single specification of parameters.
-#'
-#' @param x the data on which to run the clustering (features in rows).
-#' @param diss \code{n x n} data matrix of dissimilarities between the samples
-#' on which to run the clustering (only if \code{subsample=FALSE})
+#'
+#' @param x the data on which to run the clustering (features in rows), or a
+#' \code{\link{SummarizedExperiment}}, or \code{\link{ClusterExperiment}}
+#' object.
+#' @param diss \code{n x n} data matrix of dissimilarities between the samples
+#' on which to run the clustering.
 #' @param subsample logical as to whether to subsample via
-#' \code{\link{subsampleClustering}} to get the distance matrix at each
-#' iteration; otherwise the distance function will be determined by argument
-#' \code{distFunction} passed in \code{clusterDArgs} (if input a data matrix).
-#' @param sequential logical whether to use the sequential strategy (see
-#' details of \code{\link{seqCluster}}).
-#' @param clusterFunction passed to \code{\link{clusterD}} option
-#' 'clusterFunction' to indicate method of clustering, see
-#' \code{\link{clusterD}}.
-#' @param clusterDArgs list of additional arguments to be passed to
-#' \code{\link{clusterD}}.
-#' @param subsampleArgs list of arguments to be passed to
+#' \code{\link{subsampleClustering}}. If TRUE, clustering in mainClustering step is
+#' done on the co-occurrence between clusterings in the subsampled clustering
+#' results. If FALSE, the mainClustering step will be run directly on
+#' \code{x}/\code{diss}.
+#' @param sequential logical whether to use the sequential strategy (see details
+#' of \code{\link{seqCluster}}). Can be used in combination with
+#' \code{subsample=TRUE} or \code{FALSE}.
+#' @param mainClusterArgs list of arguments to be passed to the mainClustering step, see
+#' help pages of \code{\link{mainClustering}}.
+#' @param subsampleArgs list of arguments to be passed to the subsampling step
+#' (if \code{subsample=TRUE}), see help pages of
 #' \code{\link{subsampleClustering}}.
-#' @param seqArgs list of additional arguments to be passed to
-#' \code{\link{seqCluster}}.
-#' @param isCount logical. Whether the data are in counts, in which case the
-#' default \code{transFun} argument is set as log2(x+1).
This is simply a -#' convenience to the user, and can be overridden by giving an explicit +#' @param seqArgs list of arguments to be passed to \code{\link{seqCluster}}. +#' @param isCount logical. Whether the data are in counts, in which case the +#' default \code{transFun} argument is set as log2(x+1). This is simply a +#' convenience to the user, and can be overridden by giving an explicit #' function to \code{transFun}. #' @param transFun function A function to use to transform the input data matrix #' before clustering. #' @param dimReduce character A character identifying what type of #' dimensionality reduction to perform before clustering. Options are -#' "none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more +#' "none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more #' details. -#' @param ndims integer An integer identifying how many dimensions to reduce to +#' @param ndims integer An integer identifying how many dimensions to reduce to #' in the reduction specified by \code{dimReduce} -#' @param clusterLabel a string used to describe the clustering. By -#' default it is equal to "clusterSingle", to indicate that this clustering is -#' the result of a call to \code{clusterSingle}. - -#' @param ... arguments to be passed on to the method for signature +#' @param clusterLabel a string used to describe the clustering. By default it +#' is equal to "clusterSingle", to indicate that this clustering is the result +#' of a call to \code{clusterSingle}. +#' @param checkDiss logical. Whether to check whether the input \code{diss} is +#' valid. +#' @param ... arguments to be passed on to the method for signature #' \code{matrix}. -#' -#' @details If sequential=TRUE, the sequential clustering controls the 'k' -#' argument of the underlying clustering so setting 'k=' in the list given to -#' clusterDArgs or subsampleArgs will not do anything and will produce a -#' warning to that effect. -#' -#' @return A \code{\link{ClusterExperiment}} object. -#' -#' @seealso \code{\link{clusterMany}} to compare multiple choices of parameters. +#' @details \code{clusterSingle} is an 'expert-oriented' function, intended to +#' be used when a user wants to run a single clustering and/or have a great +#' deal of control over the clustering parameters. Most users will find +#' \code{\link{clusterMany}} more relevant. However, \code{\link{clusterMany}} +#' makes certain assumptions about the intention of certain combinations of +#' parameters that might not match the user's intent; similarly +#' \code{\link{clusterMany}} does not directly take a dissimilarity matrix but +#' only a matrix of values \code{x} (though a user can define a distance +#' function to be applied to \code{x} in \code{\link{clusterMany}}). +#' @details Unlike \code{\link{clusterMany}}, most of the relevant arguments for +#' the actual clustering algorithms in \code{clusterSingle} are passed to the +#' relevant steps via the arguments \code{mainClusterArgs}, \code{subsampleArgs}, +#' and \code{seqArgs}. These arguments should be \emph{named} lists with +#' parameters that match the corresponding functions: +#' \code{\link{mainClustering}},\code{\link{subsampleClustering}}, and +#' \code{\link{seqCluster}}. These functions are not meant to be called by the +#' user, but rather accessed via calls to \code{clusterSingle}. But the user +#' can look at the help files of those functions for more information +#' regarding the parameters that they take. 
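Concretely, under this structure every clustering parameter is routed through one of these named lists; a sketch of a sequential, subsampled call (mirroring the example in the examples section below, with arbitrary parameter values and assuming the resamp.num/samp.p arguments of subsampleClustering) would be:

## arguments for the clustering function itself go in the nested 'clusterArgs'
## element of the list for the step that uses them
data(simData)
clustSeq <- clusterSingle(simData, subsample = TRUE, sequential = TRUE,
    subsampleArgs = list(resamp.num = 100, samp.p = 0.7,
        clusterFunction = "kmeans", clusterArgs = list(nstart = 10)),
    seqArgs = list(k0 = 5, beta = 0.8),
    mainClusterArgs = list(clusterFunction = "hierarchical01",
        clusterArgs = list(alpha = 0.1), minSize = 5))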
+#' @details Only certain combinations of parameters are possible for certain
+#' choices of \code{sequential} and \code{subsample}. These restrictions are
+#' documented below. \itemize{ \item{\code{clusterFunction} for
+#' \code{mainClusterArgs}: }{The choice of \code{subsample=TRUE} also controls
+#' what algorithm type of clustering functions can be used in the mainClustering
+#' step. When \code{subsample=TRUE}, then the resulting co-clustering matrix from
+#' subsampling is converted to a dissimilarity (specifically 1-coclustering
+#' values) and is passed to \code{diss} of \code{\link{mainClustering}}. For this
+#' reason, the \code{ClusterFunction} object given to \code{\link{mainClustering}}
+#' via the argument \code{mainClusterArgs} must take input of the form of a
+#' dissimilarity. When \code{subsample=FALSE} and \code{sequential=TRUE}, the
+#' \code{clusterFunction} passed in the \code{clusterArgs} element of
+#' \code{mainClusterArgs} must define a \code{ClusterFunction} object with
+#' \code{algorithmType} 'K'. When \code{subsample=FALSE} and
+#' \code{sequential=FALSE}, then there are no restrictions on the
+#' \code{ClusterFunction} and that clustering is applied directly to the input
+#' data. } \item{\code{clusterFunction} for \code{subsampleArgs}: }{If the
+#' \code{ClusterFunction} object given to the \code{clusterArgs} of
+#' \code{subsampleArgs} is missing, the algorithm will use the default for
+#' \code{\link{subsampleClustering}} (currently "pam"). If
+#' \code{sequential=TRUE}, this \code{ClusterFunction} object must be of type
+#' 'K'. } \item{Setting \code{k} for subsampling: }{If \code{subsample=TRUE}
+#' and \code{sequential=TRUE}, the current K of the sequential iteration
+#' determines the 'k' argument passed to \code{\link{subsampleClustering}} so
+#' setting 'k=' in the list given to the subsampleArgs will not do anything
+#' and will produce a warning to that effect (see documentation of
+#' \code{\link{seqCluster}}).} \item{Setting \code{k} for mainClustering step: }{If
+#' \code{sequential=TRUE} then the user should not set \code{k} in the
+#' \code{clusterArgs} argument of \code{mainClusterArgs} because it must be set
+#' by the sequential code, which has an iterative resetting of the parameters.
+#' Specifically if \code{subsample=FALSE}, then the sequential method iterates
+#' over choices of \code{k} to cluster the input data. And if
+#' \code{subsample=TRUE}, then the \code{k} in the clustering of mainClustering step
+#' (assuming the clustering function is of type 'K') will use the \code{k}
+#' used in the subsampling step to make sure that the \code{k} used in the
+#' mainClustering step is reasonable. } \item{Setting \code{findBestK} in
+#' \code{mainClusterArgs}: }{If \code{sequential=TRUE} and
+#' \code{subsample=FALSE}, the user should not set 'findBestK=TRUE' in
+#' \code{mainClusterArgs}. This is because in this case the sequential method
+#' changes \code{k}; an error message will be given if this combination of
+#' options is set. However, if \code{sequential=TRUE} and
+#' \code{subsample=TRUE}, then passing either 'findBestK=TRUE' or
+#' 'findBestK=FALSE' via \code{mainClusterArgs} will function as expected
+#' (assuming the \code{clusterFunction} argument passed to \code{mainClusterArgs}
+#' is of type 'K'). In particular, the sequential step will set the number of
+#' clusters \code{k} for clustering of each subsample. If findBestK=FALSE,
+#' that same \code{k} will be used for the mainClustering step that clusters the
+#' resulting co-occurrence matrix after subsampling. If findBestK=TRUE, then
+#' \code{\link{mainClustering}} will search for the best k. Note that the default
+#' 'kRange' over which \code{\link{mainClustering}} searches when findBestK=TRUE
+#' depends on the input value of \code{k} (which is set by the sequential
+#' method if \code{sequential=TRUE}), see above. The user can change
+#' \code{kRange} to not depend on \code{k} and to be fixed across all of the
+#' sequential steps by setting \code{kRange} explicitly in the
+#' \code{mainClusterArgs} list.} }
+#' @return A \code{\link{ClusterExperiment}} object if the input \code{x} was a
+#' matrix (or \code{assay} of a \code{ClusterExperiment} or
+#' \code{SummarizedExperiment} object).
+#' @return If input was \code{diss}, then the result is a list with values
+#' \itemize{ \item{clustering: }{The vector of clustering results}
+#' \item{clusterInfo: }{A list with information about the parameters run in
+#' the clustering} \item{diss: }{The dissimilarity matrix used in the
+#' clustering} }
+#' @details To provide a distance matrix via the argument \code{distFunction},
+#' the function must be defined to take the distance of the rows of a matrix
+#' (internally, the function will call \code{distFunction(t(x))}). This is to
+#' be compatible with the input for the \code{dist} function. \code{as.matrix}
+#' will be performed on the output of \code{distFunction}, so if the object
+#' returned has a \code{as.matrix} method that will convert the output into a
+#' symmetric matrix of distances, this is fine (for example the class
+#' \code{dist} for objects returned by \code{dist} has such a method). If
+#' \code{distFunction=NA}, then a default distance will be calculated based on
+#' the type of clustering algorithm of \code{clusterFunction}. For type "K"
+#' the default is to take \code{dist} as the distance function. For type "01",
+#' the default is to take the (1-cor(x))/2.
+#'
+#' @seealso \code{\link{clusterMany}} to compare multiple choices of parameters,
+#' and \code{\link{mainClustering}},\code{\link{subsampleClustering}}, and
+#' \code{\link{seqCluster}} for the underlying functions called by
+#' \code{clusterSingle}.
 #'
 #' @name clusterSingle
 #'
@@ -59,33 +151,111 @@
 #' #following code takes some time.
 #' #use clusterSingle to do sequential clustering
 #' #(same as example in seqCluster only using clusterSingle ...)
-#' set.seed(44261) -#' clustSeqHier_v2 <- clusterSingle(simData, clusterFunction="hierarchical01", -#' sequential=TRUE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7, -#' clusterFunction="kmeans", clusterArgs=list(nstart=10)), -#' seqArgs=list(beta=0.8, k0=5), clusterDArgs=list(minSize=5)) +# ' set.seed(44261) +# ' clustSeqHier_v2 <- clusterSingle(simData, +# ' sequential=TRUE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7, +# ' clusterFunction="kmeans", clusterArgs=list(nstart=10)), +# ' seqArgs=list(beta=0.8, k0=5), mainClusterArgs=list(minSize=5, +#' clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1))) #' } #' #' #use clusterSingle to do just clustering k=3 with no subsampling -#' clustNothing <- clusterSingle(simData, clusterFunction="pam", -#' subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=3)) +#' clustNothing <- clusterSingle(simData, +#' subsample=FALSE, sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", +#' clusterArgs=list(k=3))) +#' #compare to standard pam +#' cluster::pam(t(simData),k=3,cluster.only=TRUE) +#' @aliases clusterSingle,missing,matrixOrNULL-method +#' @rdname clusterSingle +#' @export +setMethod( + f = "clusterSingle", + signature = signature(x = "missing",diss="matrixOrNULL"), + definition = function(x, diss,...) { + clusterSingle(x=NULL,diss=diss,...) + +}) + +#' @rdname clusterSingle +#' @export +setMethod( + f = "clusterSingle", + signature = signature(x = "matrixOrNULL",diss="missing"), + definition = function(x, diss,...) { + clusterSingle(x=x,diss=NULL,...) + +}) + +#' @rdname clusterSingle +#' @export +setMethod( + f = "clusterSingle", + signature = signature(x = "SummarizedExperiment", diss="missing"), + definition = function(x, ...) { + outval <- clusterSingle(assay(x), ...) + retval <- .addBackSEInfo(newObj=outval,oldObj=x) + return(retval) + } +) + + +#' @rdname clusterSingle +#' @param replaceCoClustering logical. Applicable if \code{x} is a +#' \code{ClusterExperiment} object. If TRUE, the co-clustering resulting from +#' subsampling is returned in the coClustering object and replaces any +#' existing coClustering object in the slot \code{coClustering}. #' @export -#' @aliases clusterSingle clusterSingle-methods clusterSingle,matrix-method -#' clusterSingle,ClusterExperiment-method clusterSingle,matrix,missing-method -#' clusterSingle,matrixOrMissing,matrixOrMissing-method +setMethod( + f = "clusterSingle", + signature = signature(x = "ClusterExperiment", diss="missing"), + definition = function(x, replaceCoClustering=FALSE,...) { + + outval <- clusterSingle(assay(x),transFun=transformation(x),...) 
+ retval<-addClusters(x,outval) + #make most recent clustering the primary cluster + primaryClusterIndex(retval)<-nClusters(retval) + if(replaceCoClustering | is.null(outval@coClustering)) retval@coClustering<-outval@coClustering + return(retval) + } +) #' @rdname clusterSingle +#' @export setMethod( f = "clusterSingle", - signature = signature(x = "matrixOrMissing",diss="matrixOrMissing"), - definition = function(x, diss,subsample=TRUE, sequential=FALSE, - clusterFunction=c("tight", "hierarchical01", "pam","hierarchicalK"), - clusterDArgs=NULL, subsampleArgs=NULL, seqArgs=NULL, + signature = signature(x = "matrixOrNULL",diss="matrixOrNULL"), + definition = function(x, diss, subsample=TRUE, sequential=FALSE, + mainClusterArgs=NULL, subsampleArgs=NULL, seqArgs=NULL, isCount=FALSE,transFun=NULL, dimReduce=c("none","PCA","var","cv","mad"), - ndims=NA,clusterLabel="clusterSingle") { - if(missing(x)) x<-NULL - if(missing(diss)) diss<-NULL - input<-.checkXDissInput(x,diss) - if(input %in% c("X","both")){ + ndims=NA,clusterLabel="clusterSingle",checkDiss=TRUE) { + ########## + ##Check arguments and set defaults as needed + ##Note, some checks are duplicative of internal, but better here, because don't want to find error after already done extensive calculation... + ########## + checkOut<-.checkSubsampleClusterDArgs(x=x, diss=diss, subsample=subsample, sequential=sequential, mainClusterArgs=mainClusterArgs, subsampleArgs=subsampleArgs, checkDiss=checkDiss) + if(is.character(checkOut)) stop(checkOut) + else { + mainClusterArgs<-checkOut$mainClusterArgs + subsampleArgs<-checkOut$subsampleArgs + input<-checkOut$inputClusterD + } + if(sequential){ + if(is.null(seqArgs)) { + ##To DO: Question: if missing seqArgs, should we grab k0 from subsampleArgs? + stop("if sequential=TRUE, must give seqArgs so as to identify k0 and beta") + } + if(!"k0"%in%names(seqArgs)) { + stop("seqArgs must contain element 'k0'") + } + if(!"beta"%in%names(seqArgs)) { + stop("seqArgs must contain element 'beta'") + } + } + ########## + ## Handle dimensionality reduction: + ########## + ###Don't do this until do the checks, because takes some time. + if(input %in% c("X")){ + N <- dim(x)[2] origX <- x #ngenes x nsamples ########## ##transformation to data x that will be input to clustering @@ -103,94 +273,40 @@ setMethod( dimReduce=dimReduce, transFun=transFun, isCount=isCount) x <- transObj$x - #browser() if(is.null(dim(x)) || NCOL(x)!=NCOL(origX)) { stop("Error in the internal transformation of x") } transFun <- transObj$transFun #need it later to create clusterExperimentObject - N <- dim(x)[2] } else{ - mess<-"input to clusterSingle includes the original data matrix x" - if(subsample) stop("subsampling can only be done if",mess) - if(dimReduce!="none") stop("dimReduce only applies when",mess) + if(any(dimReduce!="none")) stop("dimReduce only applies when diss not given or clusterFunction object doesn't accept the given diss as input") + N<-nrow(diss) + if(!is.null(x)) origX<-x } - if(input %in% c("both","diss") && !is.null(clusterDArgs) && "distFunction" %in% names(clusterDArgs)){ - if(!is.na(clusterDArgs[["distFunction"]])) stop("if give diss as input to clusterSingle, cannot specify 'distFunction' in clusterDArgs") - } - ########## - ##Checks that arguments make sense: - ########## - if(!is.function(clusterFunction)){ - clusterFunction <- match.arg(clusterFunction) - typeAlg <- .checkAlgType(clusterFunction) - } - else{ - if(is.null(clusterDArgs) || (! 
"typeAlg" %in% names(clusterDArgs))) - stop("if you provide your own clustering algorithm to be passed to clusterD, then you must specify 'typeAlg' in clusterDArgs") - else typeAlg <- clusterDArgs[["typeAlg"]] - } - if(typeAlg == "K"){ - if("findBestK" %in% names(clusterDArgs) & !subsample & sequential){ - if(clusterDArgs[["findBestK"]]) - stop("Cannot do sequential clustering where subsample=FALSE and 'findBestK=TRUE' is passed via clusterDArgs. See help documentation.") - } - } - if(subsample){ - if(!is.null(clusterDArgs) && "distFunction" %in% names(clusterDArgs) && !is.na(clusterDArgs[["distFunction"]])){ - warning("if 'subsample=TRUE', 'distFunction' argument in clusterDArgs is ignored.") - clusterDArgs[["distFunction"]]<-NA - } + if(input %in% c("both","diss") && !is.null(mainClusterArgs) && "distFunction" %in% names(mainClusterArgs)){ + if(!is.na(mainClusterArgs[["distFunction"]])) stop("if give diss as input to clusterSingle, cannot specify 'distFunction' in mainClusterArgs") } + + + ########## + ## Start running clustering + ########## if(sequential){ - if(is.null(seqArgs)) { - stop("must give seqArgs so as to identify k0") - } - if(!"k0"%in%names(seqArgs)) { - stop("seqArgs must contain element 'k0'") - } - outlist <- do.call("seqCluster", + outlist <- do.call("seqCluster", c(list(x=x, diss=diss,subsample=subsample, subsampleArgs=subsampleArgs, - clusterDArgs=clusterDArgs, - clusterFunction=clusterFunction), seqArgs)) + mainClusterArgs=mainClusterArgs), seqArgs)) } else{ - if(subsample){ - if(is.null(subsampleArgs) || !("k" %in% names(subsampleArgs))){ - if(!is.null(clusterDArgs) && ("k" %in% names(clusterDArgs))){ - #give by default the clusterDArgs to subsampling. - warning("did not give 'k' in 'subsampleArgs'. - Set to 'k' argument in 'clusterDArgs'") - if(is.null(subsampleArgs)) - subsampleArgs <- list("k"=clusterDArgs[["k"]]) - else - subsampleArgs[["k"]] <- clusterDArgs[["k"]] - } - else - stop("if not sequential and do subsampling, - must pass 'k' in subsampleArgs") - } - } - else if(typeAlg=="K" && !is.null(clusterDArgs) && !"k" %in% names(clusterDArgs)){ - #if don't specify k, then must have findBestK=TRUE in clusterDArgs; - #is by default, so only need to check that if specified it, - #set it to TRUE - if("findBestK" %in% names(clusterDArgs) && !clusterDArgs[["findBestK"]]) - stop("if not sequential and clusterFunction is of type 'K' (e.g. pam) - and findBestK=FALSE in clusterDArgs, must pass 'k' via - clusterDArgs list") - } ########## - ##Actually run the clustering. .clusterWrapper just deciphers choices and makes clustering. + ##.clusterWrapper just deciphers choices and makes clustering. 
########## - finalClusterList <- .clusterWrapper(x=x, diss=diss, clusterFunction=clusterFunction, + finalClusterList <- .clusterWrapper(x=x, diss=diss, subsample=subsample, subsampleArgs=subsampleArgs, - clusterDArgs=clusterDArgs, - typeAlg=typeAlg) - outlist <- list("clustering"=.convertClusterListToVector(finalClusterList$results, N)) + mainClusterArgs=mainClusterArgs) + outlist <- list("clustering"=.convertClusterListToVector(finalClusterList$results, N)) } clInfo<-list(list(clusterInfo = outlist$clusterInfo, @@ -198,7 +314,7 @@ setMethod( subsample = subsample, sequential = sequential, clusterFunction = clusterFunction, - clusterDArgs = clusterDArgs, + mainClusterArgs = mainClusterArgs, subsampleArgs = subsampleArgs, seqArgs = seqArgs, dimReduce=dimReduce, @@ -207,77 +323,46 @@ ########## ## Convert to clusterExperiment Object ########## - if(input %in% c("X","both")){ + if(!is.null(x)){ #if give diss and x, will use diss but still have x to make CE object with retval <- clusterExperiment(origX, outlist$clustering, transformation=transFun, clusterInfo=clInfo, clusterTypes="clusterSingle") clusterLabels(retval)<-clusterLabel - if(!sequential) { - retval@coClustering<-finalClusterList$D + if(!sequential & subsample) { + retval@coClustering<-1-finalClusterList$diss } validObject(retval) return(retval) } else{ - out<-list(clustering=outlist$clustering,clusterInfo=clInfo) + out<-list(clustering=outlist$clustering,clusterInfo=clInfo,diss=outlist$diss) } } ) - -#' @rdname clusterSingle -#' @export -setMethod( - f = "clusterSingle", - signature = signature(x = "SummarizedExperiment", diss="missing"), - definition = function(x, ...) { - outval <- clusterSingle(assay(x), ...) - retval <- .addBackSEInfo(newObj=outval,oldObj=x) - return(retval) - } -) - - -#' @rdname clusterSingle -#' @export -setMethod( - f = "clusterSingle", - signature = signature(x = "ClusterExperiment", diss="missing"), - definition = function(x, ...) { - - outval <- clusterSingle(assay(x),...) - - ## eap: I think we should add it, so I changed it here. You might try a couple of versions. - retval<-addClusters(outval, x) #should keep primary cluster as most recent, so outval first - return(retval) - } -) - - -#wrapper that calls the clusterSampling and clusterD routines in reasonable order. -.clusterWrapper <- function(x, diss, subsample, clusterFunction,clusterDArgs=NULL, - subsampleArgs=NULL,typeAlg) +#wrapper that calls the subsampleClustering and mainClustering routines in reasonable order. +#called by both seqCluster and clusterSingle +#clusterFunction assumed to be in mainClusterArgs and subsampleArgs +.clusterWrapper <- function(x, diss, subsample, mainClusterArgs=NULL, subsampleArgs=NULL) { if(subsample){ - if(is.null(subsampleArgs) || !"k" %in% names(subsampleArgs)) stop("must provide k in 'subsampleArgs' (or if sequential should have been set by sequential strategy)") Dbar<-do.call("subsampleClustering",c(list(x=x),subsampleArgs)) diss<-1-Dbar #make it a distance. x<-NULL - if(typeAlg=="K"){ - if(is.null(clusterDArgs)) clusterDArgs<-list(k=subsampleArgs[["k"]]) - else if(!"k" %in% names(clusterDArgs)) clusterDArgs[["k"]]<-subsampleArgs[["k"]] #either sequential sets this value, or get error in subsampleClustering, so always defined.
- } - } - if(typeAlg=="K"){ - findBestK<-FALSE - if(!is.null(clusterDArgs) && "findBestK" %in% names(clusterDArgs)){ - findBestK<-clusterDArgs[["findBestK"]] - } - if(is.null(clusterDArgs) || (!"k" %in% names(clusterDArgs) && !findBestK)) stop("if not type 'K' algorithm, must give k in 'clusterDArgs' (or if sequential should have been set by sequential strategy)") + + ##This was to make it automatic so if subsample and didn't give 'k' to mainClustering, would do the same for mainClustering. Now have added this directly to sequential, and then by default if missing from subsampling should pull from mainClustering (i.e. should happen the other way). + # if(typeAlg=="K"){ + # if(is.null(mainClusterArgs)) mainClusterArgs<-list(k=subsampleArgs[["k"]]) + # else if(!"k" %in% names(mainClusterArgs)) mainClusterArgs[["k"]]<-subsampleArgs[["k"]] #either sequential sets this value, or get error in subsampleClustering, so always defined. + # } } - resList<-do.call("clusterD",c(list(x=x,diss=diss,format="list", clusterFunction=clusterFunction,returnD=TRUE),clusterDArgs)) - return(list(results=resList$result,D=resList$D)) + resList<-do.call("mainClustering",c(list(x=x,diss=diss,format="list", returnData=TRUE),mainClusterArgs)) + return(resList) } + + + + diff --git a/R/combineMany.R b/R/combineMany.R index c36d1cc1..9f22857c 100644 --- a/R/combineMany.R +++ b/R/combineMany.R @@ -9,11 +9,11 @@ #' @param whichClusters a numeric or character vector that specifies which #' clusters to compare (missing if x is a matrix) #' @param clusterFunction the clustering to use (passed to -#' \code{\link{clusterD}}); currently must be of type '01'. +#' \code{\link{mainClustering}}); currently must be of type '01'. #' @param minSize minimum size required for a set of samples to be considered in -#' a cluster because of shared clustering, passed to \code{\link{clusterD}} +#' a cluster because of shared clustering, passed to \code{\link{mainClustering}} #' @param proportion The proportion of times that two sets of samples should be -#' together in order to be grouped into a cluster (if <1, passed to clusterD +#' together in order to be grouped into a cluster (if <1, passed to mainClustering #' via alpha = 1 - proportion) #' @param propUnassigned samples with greater than this proportion of #' assignments equal to '-1' are assigned a '-1' cluster value as a last step @@ -24,17 +24,17 @@ #' @details The function tries to find a consensus cluster across many different #' clusterings of the same samples. It does so by creating a \code{nSamples} x #' \code{nSamples} matrix of the percentage of co-occurance of each sample and -#' then calling clusterD to cluster the co-occurance matrix. The function +#' then calling mainClustering to cluster the co-occurance matrix. The function #' assumes that '-1' labels indicate clusters that are not assigned to a #' cluster. Co-occurance with the unassigned cluster is treated differently #' than other clusters. The percent co-occurance is taken only with respect to #' those clusterings where both samples were assigned. Then samples with more #' than \code{propUnassigned} values that are '-1' across all of the #' clusterings are assigned a '-1' regardless of their cluster assignment. 
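To make the co-occurrence computation described above concrete, here is a minimal sketch; the helper name and the brute-force loop are illustrative only, not the package's internal implementation, and samples are assumed to be in the rows of the clustering matrix:

# proportion of clusterings in which samples i and j share a cluster,
# counted only over clusterings where both samples are assigned (not -1)
coOccurrence <- function(clusterMat) {
  clusterMat[clusterMat == -1] <- NA
  n <- nrow(clusterMat)
  shared <- matrix(0, n, n)
  for (i in seq_len(n)) for (j in seq_len(n)) {
    both <- !is.na(clusterMat[i, ]) & !is.na(clusterMat[j, ])
    shared[i, j] <- if (any(both)) mean(clusterMat[i, both] == clusterMat[j, both]) else 0
  }
  shared
}
# combineMany then runs mainClustering on 1 - shared with a '01'-type
# clusterFunction, using alpha = 1 - proportion as the cutoff.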
-#'@details The method calls \code{\link{clusterD}} on the proportion matrix with +#'@details The method calls \code{\link{mainClustering}} on the proportion matrix with #' \code{clusterFunction} as the 01 clustering algorithm, \code{alpha=1-proportion}, #' \code{minSize=minSize}, and \code{evalClusterMethod=c("average")}. See help of -#' \code{\link{clusterD}} for more details. +#' \code{\link{mainClustering}} for more details. #' @return If x is a matrix, a list with values #' \itemize{ #' \item{\code{clustering}}{ vector of cluster assignments, with "-1" implying @@ -45,7 +45,7 @@ #' out of those not '-1'} \item{\code{noUnassignedCorrection}{ a vector of #' cluster assignments before samples were converted to '-1' because had #' >\code{propUnassigned} '-1' values (i.e. the direct output of the -#' \code{clusterD} output.)}} +#' \code{mainClustering} output.)}} #' } #' #' @return If x is a \code{\link{ClusterExperiment}}, a @@ -88,7 +88,7 @@ setMethod( f = "combineMany", signature = signature(x = "matrix", whichClusters = "missing"), - definition = function(x, whichClusters, proportion=1, + definition = function(x, whichClusters, proportion, clusterFunction="hierarchical01", propUnassigned=.5, minSize=5) { @@ -104,12 +104,12 @@ setMethod( cl[is.na(cl)] <- -1 sharedPerct<-NULL } else{ - if(is.character(clusterFunction)){ - typeAlg <- .checkAlgType(clusterFunction) + if(is.character(clusterFunction)) typeAlg <- algorithmType(clusterFunction) + else if(class(clusterFunction)=="ClusterFunction") typeAlg<-algorithmType(clusterFunction) else stop("clusterFunction must be either built in clusterFunction name or a ClusterFunction object") if(typeAlg!="01") { - stop("combineMany is only implemented for '01' type clustering functions (see help of clusterD)") + stop("combineMany is only implemented for '01' type clustering functions (see ?ClusterFunction)") } - } + ##Make clusterMat character, just in case clusterMat <- apply(clusterMat, 2, as.character) clusterMat[clusterMat == "-1"] <- NA @@ -118,12 +118,12 @@ setMethod( #fix those pairs that have no clusterings for which they are both not '-1' diag(sharedPerct)[is.na(diag(sharedPerct)) | is.nan(diag(sharedPerct))]<-1 #only happens if -1 in all samples... sharedPerct[is.na(sharedPerct) | is.nan(sharedPerct)] <- 0 - cl <- clusterD(diss=1-sharedPerct, clusterFunction=clusterFunction, - alpha=1-proportion, minSize=minSize, format="vector", - clusterArgs=list(evalClusterMethod=c("average"))) + cl <- mainClustering(diss=1-sharedPerct, clusterFunction=clusterFunction, + minSize=minSize, format="vector", + clusterArgs=list(alpha=1-proportion, evalClusterMethod=c("average"))) if(is.character(cl)) { - stop("coding error -- clusterD should return numeric vector") + stop("coding error -- mainClustering should return numeric vector") } } ##Now define as unassigned any samples with >= propUnassigned '-1' values in clusterMat diff --git a/R/getFeatures.R b/R/getFeatures.R index d71adac1..db1d1714 100644 --- a/R/getFeatures.R +++ b/R/getFeatures.R @@ -18,9 +18,9 @@ #' @param isCount logical as to whether input data is count data, in which #' case to perform voom correction to data. See details. #' @param ... options to pass to \code{\link{topTable}} or -#' \code{\link{topTableF}} (see \code{\link{limma}} package) -#' @param normalize.method character value, passed to \code{\link{voom}} in -#' \code{\link{limma}} package. Only used if \code{countData=TRUE}. 
+#' \code{\link[limma]{topTableF}} (see \code{\link[limma]{limma}} package) +#' @param normalize.method character value, passed to \code{\link[limma]{voom}} in +#' \code{\link[limma]{limma}} package. Only used if \code{countData=TRUE}. #' Note that the default value is set to "none", which is not the #' default value of \code{\link{voom}}. #' @inheritParams clusterContrasts,ClusterExperiment-method @@ -68,8 +68,8 @@ #' should be the default for RNA-Seq data. If the input data is a #' `ClusterExperiment` object, setting `isCount=TRUE` will cause the program #' to ignore the internally stored transformation function and instead use -#' voom with log2(x+0.5). Alternatively, `isCount=FALSE` for a -#' `ClusterExperiment` object will cause the DE to be performed with `limma` +#' voom with log2(x+0.5). Alternatively, \code{isCount=FALSE} for a +#' \code{ClusterExperiment} object will cause the DE to be performed with \code{limma} #' after transforming the data with the stored transformation. Although some #' writing about "voom" seem to suggest that it would be appropriate for #' arbitrary transformations, the authors have cautioned against using it for @@ -96,12 +96,15 @@ #' the dendrogram.} #' } #' +#' @references Ritchie, ME, Phipson, B, Wu, D, Hu, Y, Law, CW, Shi, W, and Smyth, GK (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43, e47. http://nar.oxfordjournals.org/content/43/7/e47 +#' @references Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom: precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biology 15, R29. http://genomebiology.com/2014/15/2/R29 +#' @references Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray experiments. Statistical Applications in Genetics and Molecular Biology, Volume 3, Article 3. http://www.statsci.org/smyth/pubs/ebayes.pdf #' @examples #' data(simData) #' #' #create a clustering, for 8 clusters (truth was 4) -#' cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=8)) +#' cl <- clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #' #' #basic F test, return all, even if not significant: #' testF <- getBestFeatures(cl, contrastType="F", number=nrow(simData), diff --git a/R/internalClusterFunctions.R b/R/internalClusterFunctions.R new file mode 100644 index 00000000..4d9275cc --- /dev/null +++ b/R/internalClusterFunctions.R @@ -0,0 +1,276 @@ +####### +#Internal algorithms for clustering +####### +#check what type +.checkAlgType<-function(clusterFunction){ + ##These return lists of indices of clusters satisfying alpha criteria + if(clusterFunction=="tight") type<-"01" + if(clusterFunction=="hierarchical01") type<-"01" + if(clusterFunction=="hierarchicalK") type<-"K" + if(clusterFunction=="pam") type<-"K" + return(type) +} +#convert list output into cluster vector.
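# e.g. (illustrative): clusterList=list(c(1,3), c(2,5)) with N=6 gives c(1, 2, 1, -1, 2, -1)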
+.convertClusterListToVector<-function(clusterList,N) +{ + clust.id <- rep(-1, N) + nfound<-length(clusterList) + if(nfound>0){ + #make cluster ids in order of when found + for (i in 1:length(clusterList)) clust.id[clusterList[[i]]] <- i + } + return(clust.id) +} +#Note: only returns 'both' if inputType is not given; otherwise picks one of 'X' or 'diss'. +.checkXDissInput<-function(x,diss,inputType=NA,algType,checkDiss=TRUE){ + if(is.null(x) & is.null(diss)) stop("must give either x or diss argument") + # if(!is.null(x) & !is.null(diss)) stop("cannot give both x and diss argument") + if(!is.null(x) & is.null(diss)) input<-"X" + if(!is.null(x) & !is.null(diss)) input<-"both" + if(is.null(x) & !is.null(diss)) input<-"diss" + if(input %in% c("diss","both") & checkDiss) .checkDissFunction(diss,algType=algType) + if(input == "both" && ncol(x)!=ncol(diss)) stop("ncol(x)!=ncol(diss): if both x and diss given then must have compatible dimensions.") + if(!is.na(inputType)){ + if(input=="both"){ + if(inputType=="diss") input<-"diss" + if(inputType=="X") input<-"X" + if(inputType=="either") input<-"diss" #if both given and both acceptable, use diss. + } + if(input == "diss" & inputType=="X") stop("given clusterFunction/classifyFunction only takes an X matrix") + #commented this out, because actually want the ability to use distFunction to make default diss if missing one. + # if(input == "X" & inputType=="diss") stop("given clusterFunction/classifyFunction only takes dissimilarity matrix") + + } + + return(input) +} +.makeDiss<-function(x,distFunction,algType,checkDiss){ + if(!is.function(distFunction)){ + if(length(distFunction)>1) stop("if distFunction is not a function, it must be of length 1") + if(is.character(distFunction)){ + distFunction<-get(distFunction,envir=globalenv()) + }else if(is.na(distFunction)){ + distFunction<-switch(algType, "01"=function(x){(1-cor(t(x)))/2}, "K"=function(x){dist(x)}) + }else stop("if distFunction is not a function, it must be either NA or a character") + } + D<-try(as.matrix(distFunction(t(x)))) #distances assumed to be of observations on rows + if(inherits(D,"try-error")) stop("input distance function gives error when applied to x") + if(!all(dim(D) == c(ncol(x),ncol(x)))) stop("input distance function must result in a ",ncol(x),"by",ncol(x),"matrix of distances") + if(checkDiss) .checkDissFunction(D,algType=algType) + return(D) + + +} +.checkDissFunction<-function(D,algType=NA){ + if(any(is.na(as.vector(D)))) stop("NA values found in dissimilarity matrix (could be from too small of subsampling if classifyMethod!='All', see documentation of subsampleClustering)") + if(any(is.na(D) | is.nan(D) | is.infinite(D))) stop("Dissimilarity matrix contains either NAs, NANs or Infinite values.") + if(any(D<0)) stop("Dissimilarity matrix must have non-negative values") + if(any(diag(D)!=0)) stop("Dissimilarity matrix must have zero values on the diagonal") + if(!all(D==t(D))) stop("Dissimilarity matrix must result in a symmetric matrix") + if(algType=="01" & any(D>1)) stop("distance function must give values between 0 and 1 when algorithm type of the ClusterFunction object is '01'") +} + + + + +.clusterVectorToList<-function(vec){ + clList<-tapply(1:length(vec),vec,function(x){x},simplify=FALSE) + whNotAssign<-which(sapply(clList,function(x){all(vec[x]== -1)})) + if(length(whNotAssign)>1) stop("Internal coding error in removing unclustered samples") + if(length(whNotAssign)>0) clList<-clList[-whNotAssign] + return(clList) +} +.clusterListToVector<-function(ll,N){ + if(length(ll)==0) return(rep(-1,N)) + else{
+ names(ll)<-as.character(1:length(ll)) + clId<-lapply(1:length(ll),function(ii){rep(ii,length=length(ll[[ii]]))}) + clVal<-unlist(clId) + clInd<-unlist(ll) + clusterVec<-rep(-1,length=N) + clusterVec[clInd]<-clVal + return(clusterVec) + + } + +} +.orderByAlpha<-function(res,S) +{ + if(length(res)>0){ + alphaMax<-unlist(lapply(res, function(x){ + vals<-S[x,x][lower.tri(S[x,x])] #don't grab diag + 1-min(vals) #max(alpha)=1-min(S) + })) + res <- res[order(alphaMax, decreasing=TRUE)] + + } + else return(res) +} + +.makeDataArgs<-function(dataInput,funInput,xData,dissData){ + if(dataInput=="X"){ + if(funInput=="diss") stop("Internal coding error: should have caught that wrong data input ('X') for this clusterFunction") + argsClusterList<-switch(funInput,"X"=list(x=xData), "either"=list(diss=NULL,x=xData)) + } + if(dataInput=="diss"){ + if(funInput=="X") stop("Internal coding error: should have caught that wrong data input ('diss') for this clusterFunction") + argsClusterList<-switch(funInput,"diss"=list(diss=dissData), "either"=list(diss=dissData,x=NULL) ) + } + return(argsClusterList) +} + +###This function checks the mainClusterArgs and subsampleArgs to make sure they make sense with the combination of sequential, subsample, x, and diss given by the user. If there is an error, returns a character string describing it; otherwise returns a list with the necessary information. +.checkSubsampleClusterDArgs<-function(x,diss,subsample,sequential,mainClusterArgs,subsampleArgs,checkDiss,warn=checkDiss){ + + ######## + #checks for mainClustering stuff + ######## + if("clusterFunction" %in% names(mainClusterArgs)){ + #get clusterFunction for cluster D + clusterFunction<-mainClusterArgs[["clusterFunction"]] + if(is.character(clusterFunction)) clusterFunction<-getBuiltInFunction(clusterFunction) + + #Following input commands will return only X or Diss because gave the inputType argument... + input<-.checkXDissInput(x, diss, inputType=inputType(clusterFunction), algType=algorithmType(clusterFunction), checkDiss=checkDiss) + algType<-algorithmType(clusterFunction) + } + else{ + return("Must provide 'clusterFunction' for the mainClustering step to be able to run (give 'clusterFunction' argument via 'mainClusterArgs')") + } + #this check is done in mainClustering, but want to do it before running subsampling, and it gives a clearer message that refers to mainClustering. + reqArgs<-requiredArgs(clusterFunction) + if(sequential & "k" %in% reqArgs) reqArgs<- reqArgs[-which(reqArgs=="k")] + #remove required args not needed if certain postProcessArgs are given: + # don't need to define 'k' if 'findBestK=TRUE' is chosen + if(length(reqArgs)>0 & algorithmType(clusterFunction)=="K" & "findBestK" %in% names(mainClusterArgs)){ + if(mainClusterArgs[["findBestK"]]) reqArgs<-reqArgs[-which(reqArgs=="k")] + } + if(length(reqArgs)>0){ + if(("clusterArgs"%in% names(mainClusterArgs) & !all(reqArgs %in% names(mainClusterArgs[["clusterArgs"]]))) || !("clusterArgs"%in% names(mainClusterArgs))) return(paste("For the clusterFunction algorithm type ('",algorithmType(clusterFunction),"') given in 'mainClusterArgs', must supply arguments:",reqArgs,"These must be supplied as elements of the list of 'clusterArgs' given in 'mainClusterArgs'")) + } + if(sequential){ + #Reason, if subsample=FALSE, then need to change k of the mainClustering step for sequential. If subsample=TRUE, similarly set the k of mainClustering step to match that used in subsample.
Either way, it can't be predefined by the user. + if("clusterArgs" %in% names(mainClusterArgs)){ + if("k" %in% names(mainClusterArgs[["clusterArgs"]]) ){ + #remove predefined versions of k from both. + whK<-which(names(mainClusterArgs[["clusterArgs"]])=="k") + if(warn) warning("Setting 'k' in mainClusterArgs when sequential clustering is requested will have no effect.") + mainClusterArgs[["clusterArgs"]]<-mainClusterArgs[["clusterArgs"]][-whK] + } + }else{ + mainClusterArgs[["clusterArgs"]]<-list() #make it exist (assigning NULL would drop the element) + } + } + ######### + # Checks related to subsample=TRUE + ######### + if(subsample){ + #Reason: if subsampling, the D from subsampling is sent to the clusterFunction. + if(inputType(clusterFunction)=="X") return("If choosing subsample=TRUE, the clusterFunction used in the mainClustering step must take input that is dissimilarity.") + if("clusterFunction" %in% names(subsampleArgs)){ + subsampleCF<-subsampleArgs[["clusterFunction"]] + if(is.character(subsampleCF)) subsampleCF<-getBuiltInFunction(subsampleCF) + subsampleAlgType<-algorithmType(subsampleCF) + #Reason: seqCluster requires subsampling cluster function to be of type "K" + if(sequential & algorithmType(subsampleCF)!="K"){ + if(warn) warning("If subsample=TRUE, sequential clustering can only be implemented with a clusterFunction for subsampling that has algorithmType 'K'. See documentation of seqCluster. Will ignore this argument of subsampleArgs and set to default of 'pam'") + subsampleArgs[["clusterFunction"]]<-"pam" + } + inputSubsample<-.checkXDissInput(x,diss, inputType=inputType(subsampleCF), algType=algorithmType(subsampleCF), checkDiss=checkDiss) #if algorithm on one is 01 and other isn't, need to check diss again. + diffSubsampleCF<-TRUE + } +# else stop("must provide clusterFunction to subsampleArgs if subsample=TRUE") + #this makes default to be same as mainClustering + else{ + if(!sequential || algorithmType(clusterFunction)=="K"){ + if(warn) warning("a clusterFunction was not set for subsampleClustering -- set to be the same as the mainClustering step.") + subsampleArgs[["clusterFunction"]]<-clusterFunction + subsampleCF<-clusterFunction + inputSubsample<-input + diffSubsampleCF<-FALSE + } + else{ + if(warn) warning("a clusterFunction was not set for subsampleClustering and sequential=TRUE means that it must be of type 'K' so cannot be set to that of mainClustering step. The clusterFunction was set to the default of 'pam'") + subsampleArgs[["clusterFunction"]]<-"pam" + subsampleCF<-getBuiltInFunction("pam") + inputSubsample<-.checkXDissInput(x,diss, inputType=inputType(subsampleCF), algType=algorithmType(subsampleCF), checkDiss=checkDiss) #if algorithm on one is 01 and other isn't, need to check diss again. + diffSubsampleCF<-TRUE + } + } + if(is.null(subsampleCF@classifyFUN)){ + if("classifyMethod" %in% names(subsampleArgs) && subsampleArgs[["classifyMethod"]]!="InSample") stop("Cannot set 'classifyMethod' to anything but 'InSample' if do not specify a clusterFunction in subsampleArgs that has a non-null classifyFUN slot") + subsampleArgs[["classifyMethod"]]<-"InSample" + } + #Reason: check that subsampleArgs has the required arguments for the function; repeated from subsampleClustering, but want it here before doing calculations... if missing, see if they can be borrowed from mainClusterArgs + ##------ + ##Check have required args for subsample. If missing, 'borrow' those args from mainClusterArgs.
+ ##------ + reqSubArgs<-requiredArgs(subsampleCF) + + #Reason: sequential sets k for the subsampling via k0 + if(sequential & "k" %in% reqSubArgs) reqSubArgs<- reqSubArgs[-which(reqSubArgs=="k")] + if(length(reqSubArgs)>0){ + #check if can borrow... + if("clusterArgs" %in% names(mainClusterArgs)){ + mainReqArgs<-requiredArgs(clusterFunction) + mainReqArgs<-mainReqArgs[mainReqArgs%in%names(mainClusterArgs[["clusterArgs"]])] + if(!is.null(subsampleArgs) && "clusterArgs" %in% names(subsampleArgs)){ + #check if existing clusterArgs has required names already + #if not, give them those of mainClustering if exist. + if(!all(reqSubArgs %in% names(subsampleArgs[["clusterArgs"]]))) { + missingArgs<-reqSubArgs[!reqSubArgs%in%names(subsampleArgs[["clusterArgs"]])] + missingArgs<-missingArgs[missingArgs%in%mainReqArgs] + + } + else missingArgs<-c() + } + else{ + missingArgs<-reqSubArgs[reqSubArgs%in%mainReqArgs] + } + if(length(missingArgs)>0){ + subsampleArgs[["clusterArgs"]][missingArgs]<-mainClusterArgs[["clusterArgs"]][missingArgs] + if(warn) warning("missing arguments ",missingArgs," filled in from those given in 'mainClusterArgs'") + } + } + #now check if got everything needed... + if(("clusterArgs" %in% names(subsampleArgs) & !all(reqSubArgs %in% names(subsampleArgs[["clusterArgs"]]))) || !("clusterArgs"%in% names(subsampleArgs))) return(paste("For the clusterFunction algorithm type ('",algorithmType(subsampleCF),"') given in 'subsampleArgs', must supply arguments:",reqSubArgs,". These must be supplied as elements of the list of 'clusterArgs' given in 'subsampleArgs'")) + } + #Reason: if subsample=TRUE, the user can't set the distance function because the diss from subsampling is used. + if(!is.null(mainClusterArgs) && "distFunction" %in% names(mainClusterArgs) && !is.na(mainClusterArgs[["distFunction"]])){ + if(warn) warning("if 'subsample=TRUE', 'distFunction' argument in mainClusterArgs is ignored.") + mainClusterArgs[["distFunction"]]<-NA + } + if(sequential){ + #Reason: if subsampling, sequential goes over different k values, so user can't set k + if("clusterArgs" %in% names(subsampleArgs) && "k" %in% names(subsampleArgs[["clusterArgs"]])){ + #remove predefined versions of k from both. + whK<-which(names(subsampleArgs[["clusterArgs"]])=="k") + if(warn) warning("Setting 'k' in subsampleArgs when sequential=TRUE is called will have no effect.") + subsampleArgs[["clusterArgs"]]<-subsampleArgs[["clusterArgs"]][-whK] + } + if(!"clusterArgs" %in% names(subsampleArgs) ){ + subsampleArgs[["clusterArgs"]]<-list() #make it if doesn't exist + } + } + } + else{ #not subsample + if(sequential){ + #Reason: if subsample=FALSE and sequential, then need to adjust K in the mainClustering step, so need an algorithm of type K + if(algorithmType(clusterFunction) != "K"){ + return("if subsample=FALSE, sequential clustering can only be implemented with a clusterFunction with algorithmType 'K'. See documentation of seqCluster.") + } + #Reason: with subsample=FALSE, sequential clustering can't use findBestK=TRUE, because clusters are removed based on testing many k and finding stable ones; if not doing this over subsampling, it must be done over the actual clustering + if(algorithmType(clusterFunction) == "K"){ + if("findBestK" %in% names(mainClusterArgs)){ + if(mainClusterArgs[["findBestK"]]) return("Cannot do sequential clustering where subsample=FALSE and 'findBestK=TRUE' is passed to the mainClustering step via mainClusterArgs.
See help documentation of seqCluster.") + } + } + } + + } + + return(list(inputClusterD=input,mainClusterArgs=mainClusterArgs,subsampleArgs=subsampleArgs)) + +} + + diff --git a/R/internalFunctions.R b/R/internalFunctions.R index 107ea3a6..56354d76 100644 --- a/R/internalFunctions.R +++ b/R/internalFunctions.R @@ -1,19 +1,3 @@ -.checkXDissInput<-function(x,diss){ - if(is.null(x) & is.null(diss)) stop("must give either x or diss argument") - # if(!is.null(x) & !is.null(diss)) stop("cannot give both x and diss argument") - if(!is.null(x) & is.null(diss)) input<-"X" - if(!is.null(x) & !is.null(diss)) input<-"both" - if(is.null(x) & !is.null(diss)) input<-"diss" - if(input %in% c("diss","both")) .checkDistFunction(diss) - if(input == "both" && ncol(x)!=ncol(diss)) stop("ncol(x)!=ncol(diss): if both x and diss then must have compatible dimensions.") - return(input) -} -.checkDistFunction<-function(D){ - if(any(is.na(as.vector(D)))) stop("NA values found in D (could be from too small of subsampling if classifyMethod!='All', see documentation of subsampleClustering)") - if(any(is.na(D) | is.nan(D) | is.infinite(D))) stop("D matrix contains either NAs, NANs or Infinite values.") - if(any(D<0)) stop("distance function must give strictly positive values") - if(any(diag(D)!=0)) stop("distance function must have zero values on the diagonal of the distance matrix") -} .addPrefixToClusterNames<-function(ceObj,prefix,whCluster){ ceLegend<-clusterLegend(ceObj)[[whCluster]] @@ -244,32 +228,6 @@ return(wh) } -####### -#Internal algorithms for clustering -####### -#check what type -.checkAlgType<-function(clusterFunction){ - ##These return lists of indices of clusters satisifying alpha criteria - if(clusterFunction=="tight") type<-"01" - if(clusterFunction=="hierarchical01") type<-"01" - if(clusterFunction=="hierarchicalK") type<-"K" - if(clusterFunction=="pam") type<-"K" - return(type) -} - - - -#convert list output into cluster vector. -.convertClusterListToVector<-function(clusterList,N) -{ - clust.id <- rep(-1, N) - nfound<-length(clusterList) - if(nfound>0){ - #make cluster ids in order of when found - for (i in 1:length(clusterList)) clust.id[clusterList[[i]]] <- i - } - return(clust.id) -} #### diff --git a/R/mainClustering.R b/R/mainClustering.R new file mode 100644 index 00000000..68cccb72 --- /dev/null +++ b/R/mainClustering.R @@ -0,0 +1,281 @@ +#' @title Cluster distance matrix from subsampling +#' +#' @description Given input data, this function will try to find the clusters +#' based on the given ClusterFunction object. +#' @name mainClustering +#' +#' @param x \code{p x n} data matrix on which to run the clustering (samples in +#' columns). +#' @param diss \code{n x n} data matrix of dissimilarities between the samples +#' on which to run the clustering +#' @param distFunction a distance function to be applied to \code{D}. Only +#' relevant if input is only \code{x} (a matrix of data), and +#' \code{diss=NULL}. See details of \code{\link{clusterSingle}} for the +#' required format of the distance function. +#' @param minSize the minimum number of samples in a cluster. Clusters found +#' below this size will be discarded and samples in the cluster will be given +#' a cluster assignment of "-1" to indicate that they were not clustered. +#' @param orderBy how to order the cluster (either by size or by maximum alpha +#' value). 
If orderBy="size" the numbering of the clusters are reordered by +#' the size of the cluster, instead of by the internal ordering of the +#' \code{clusterFUN} defined in the \code{ClusterFunction} object (an internal +#' ordering is only possible if slot \code{outputType} of the +#' \code{ClusterFunction} is \code{"list"}). +#' @param format whether to return a list of indices in a cluster or a vector of +#' clustering assignments. List is mainly for compatibility with sequential +#' part. +#' @param clusterArgs arguments to be passed directly to the \code{clusterFUN} +#' slot of the \code{ClusterFunction} object +#' @param checkArgs logical as to whether should give warning if arguments given +#' that don't match clustering choices given. Otherwise, inapplicable +#' arguments will be ignored without warning. +#' @param returnData logical as to whether to return the \code{diss} or \code{x} +#' matrix in the output. If \code{FALSE} only the clustering vector is +#' returned. +#' @param ... arguments passed to the post-processing steps of the clustering. +#' The available post-processing arguments for a \code{ClusterFunction} object +#' depend on it's algorithm type and can be found by calling +#' \code{getPostProcessingArgs}. See details below for documentation. +#' @inheritParams subsampleClustering +#' @inheritParams clusterSingle +#' @details \code{mainClustering} is not meant to be called by the user. It is only an +#' exported function so as to be able to clearly document the arguments for +#' \code{mainClustering} which can be passed via the argument \code{mainClusterArgs} in +#' functions like \code{\link{clusterSingle}} and \code{\link{clusterMany}}. +#' +#' @return mainClustering returns a vector of cluster assignments (if format="vector") +#' or a list of indices for each cluster (if format="list"). Clusters less +#' than minSize are removed. 
+#' +#' @examples +#' data(simData) +#' cl1<-mainClustering(x=simData,clusterFunction="pam",clusterArgs=list(k=3)) +#' cl2<-mainClustering(simData,clusterFunction="hierarchical01",clusterArgs=list(alpha=.1)) +#' cl3<-mainClustering(simData,clusterFunction="tight",clusterArgs=list(alpha=.1)) +#' #change distance to manhattan distance +#' cl4<-mainClustering(simData,clusterFunction="pam",clusterArgs=list(k=3), +#' distFunction=function(x){dist(x,method="manhattan")}) +#' +#' #run hierarchical method for finding blocks, with method of evaluating +#' #coherence of block set to evalClusterMethod="average", and the hierarchical +#' #clustering using single linkage: +#' clustSubHier <- mainClustering(simData, clusterFunction="hierarchical01", +#' minSize=5, clusterArgs=list(alpha=0.1,evalClusterMethod="average", method="single")) +#' +#' #do tight +#' clustSubTight <- mainClustering(simData, clusterFunction="tight", clusterArgs=list(alpha=0.1), +#' minSize=5) +#' +#' #two twists to pam +#' clustSubPamK <- mainClustering(simData, clusterFunction="pam", silCutoff=0, minSize=5, +#' removeSil=TRUE, clusterArgs=list(k=3)) +#' clustSubPamBestK <- mainClustering(simData, clusterFunction="pam", silCutoff=0, +#' minSize=5, removeSil=TRUE, findBestK=TRUE, kRange=2:10) +#' +#' # note that passing the wrong arguments for an algorithm results in warnings +#' # (which can be turned off with checkArgs=FALSE) +#' clustSubTight_test <- mainClustering(simData, clusterFunction="tight", +#' clusterArgs=list(alpha=0.1), minSize=5, removeSil=TRUE) +#' clustSubTight_test2 <- mainClustering(simData, clusterFunction="tight", +#' clusterArgs=list(alpha=0.1,evalClusterMethod="average")) +#' @rdname mainClustering +#' @aliases mainClustering,character-method +#' @export +setMethod( + f = "mainClustering", + signature = signature(clusterFunction = "character"), + definition = function(clusterFunction,...){ + mainClustering(getBuiltInFunction(clusterFunction),...) + + } + ) +#' @rdname mainClustering +#' @export +setMethod( + f = "mainClustering", + signature = signature(clusterFunction = "ClusterFunction"), +definition=function(clusterFunction,x=NULL, diss=NULL, + distFunction=NA,clusterArgs=NULL,minSize=1, orderBy=c("size","best"), + format=c("vector","list"),checkArgs=TRUE,checkDiss=TRUE,returnData=FALSE,...){ + orderBy<-match.arg(orderBy) + format<-match.arg(format) + postProcessArgs<-list(...) + if(length(postProcessArgs)>0){ + #get rid of wrong args passed because of user confusion between the two + whRightArgs<-which(names(postProcessArgs) %in% getPostProcessingArgs(clusterFunction)) + if(length(whRightArgs)!=length(postProcessArgs) & checkArgs) warning("Some arguments passed via '...' in mainClustering do not match the algorithmType of the given ClusterFunction object") + if(length(whRightArgs)>0) postProcessArgs<-postProcessArgs[whRightArgs] + else postProcessArgs<-NULL + } + ####################### + ### Check input and Create distance if needed, and check it. + ####################### + input<-.checkXDissInput(x,diss,inputType=clusterFunction@inputType,algType=clusterFunction@algorithmType,checkDiss=checkDiss) + #K-post processing requires diss for the silhouette. 
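# (Illustrative aside, not package code: cluster::silhouette scores an
# assignment against a dissimilarity, e.g.
#   sw <- cluster::silhouette(stats::kmeans(t(x), 3)$cluster, dmatrix=as.matrix(dist(t(x))))
#   mean(sw[, "sil_width"])
# which is why a diss is built below via .makeDiss when findBestK or
# removeSil is requested but only x was supplied.)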
+ doKPostProcess<-FALSE + if(clusterFunction@algorithmType=="K"){ + if("findBestK" %in% names(postProcessArgs) && postProcessArgs[["findBestK"]]) doKPostProcess<-TRUE + if("removeSil" %in% names(postProcessArgs) && postProcessArgs[["removeSil"]]) doKPostProcess<-TRUE + } + if(input=="X" & (clusterFunction@inputType=="diss" || doKPostProcess)){ + diss<-.makeDiss(x,distFunction=distFunction,checkDiss=checkDiss,algType=clusterFunction@algorithmType) + if(clusterFunction@inputType=="diss") input<-"diss" + } + + + #----- + # Other Checks + #----- + reqArgs<-requiredArgs(clusterFunction) + #remove required args not needed if certain postProcessArgs are given: + # don't need define 'k' if choose 'findBestK=TRUE' + if(algorithmType(clusterFunction)=="K" & "findBestK" %in% names(postProcessArgs)){ + if(postProcessArgs[["findBestK"]]) reqArgs<-reqArgs[-which(reqArgs=="k")] + } + if(length(reqArgs)>0 & !all(reqArgs %in% names(clusterArgs))) stop(paste("For this clusterFunction algorithm type ('",algorithmType(clusterFunction),"') must supply arguments",reqArgs,"as elements of the list of 'clusterArgs'")) + + + ####################### + ####Run clustering: + ####################### + if(input %in% c("X","both")) N <- dim(x)[2] else N<-dim(diss)[2] + + argsClusterList<-.makeDataArgs(dataInput=input,funInput=clusterFunction@inputType, xData=x, dissData=diss) + argsClusterList<-c(argsClusterList, clusterArgs, list("checkArgs"=checkArgs, "cluster.only"=TRUE)) + if(algorithmType(clusterFunction)=="01") { + res<-do.call(clusterFunction@clusterFUN,argsClusterList) + } + if(algorithmType(clusterFunction)=="K") { + res<-do.call(".postProcessClusterK",c(list(clusterFunction=clusterFunction,clusterArgs=argsClusterList,N=N,orderBy=orderBy,diss=diss),postProcessArgs)) + ###Note to self: .postProcessClusterK returns clusters in list form. + } + + ####################### + #Now format into desired output, order + ####################### + #this is perhaps not efficient. For now will do this, then consider going back and only converting when, where needed. + if(clusterFunction@outputType=="vector" & algorithmType(clusterFunction)!="K"){ + res<-.clusterVectorToList(res) + } + clusterSize<-sapply(res, length) + if(length(res)>0) res <- res[clusterSize>=minSize] + if(length(res)!=0 & orderBy=="size"){ #i.e. there exist clusters found that passed minSize + clusterSize<-sapply(res, length) #redo because dropped small clusters earlier + res <- res[order(clusterSize,decreasing=TRUE)] + } + if(format=="vector"){ + res<-.clusterListToVector(res,N) + names(res)<-if(input=="X") colnames(x) else rownames(diss) + } + if(!returnData) return(res) + else return(list(results=res,diss=diss,x=x)) +} +) + + + +#' @rdname mainClustering +#' @aliases getPostProcessingArgs +#' @details Post-processing Arguments: For post-processing the clustering, +#' currently only type 'K' algorithms have a defined post-processing. +#' Specifically +#' \itemize{ +#' \item{"findBestK"}{logical, whether should find best K based on average +#' silhouette width (only used if clusterFunction of type "K").} +#' \item{"kRange"}{vector of integers to try for k values if findBestK=TRUE. If +#' \code{k} is given in \code{clusterArgs}, then default is k-2 to k+20, +#' subject to those values being greater than 2; if not the default is +#' \code{2:20}. 
Note that default values depend on the input k, so running for +#' different choices of k and findBestK=TRUE can give different answers unless +#' kRange is set to be the same.} +#' \item{"removeSil"}{logical as to whether to remove the assignment of a sample +#' to a cluster when the sample's silhouette value is less than +#' \code{silCutoff}} +#' \item{"silCutoff"}{Cutoff on the minimum silhouette width to be included in +#' a cluster (only used if removeSil=TRUE).} +#' } +#' @export +setMethod( + f = "getPostProcessingArgs", + signature = c("ClusterFunction"), + definition = function(clusterFunction) { + switch(algorithmType(clusterFunction),"01"=.argsPostCluster01,"K"=.argsPostClusterK) +} +) + +.argsPostCluster01<-c("") +.argsPostClusterK<-c("findBestK","kRange","removeSil","silCutoff") + +#' @importFrom cluster silhouette +.postProcessClusterK<-function(clusterFunction,findBestK=FALSE, kRange,removeSil=FALSE,silCutoff=0,clusterArgs,N,orderBy,diss=NULL) +{ + doPostProcess<-(findBestK | removeSil ) & !is.null(diss) #whether will calculate silhouette or not; if not, speeds up the function... + k<-clusterArgs[["k"]] + if(!findBestK && is.null(k)) stop("If findBestK=FALSE, must provide k") + if(!is.null(k)) clusterArgs<-clusterArgs[-which(names(clusterArgs)=="k")] + if(findBestK){ + if(missing(kRange)){ + if(!is.null(k)) kRange<-(k-2):(k+20) + else kRange<-2:20 + } + if(any(kRange<2)){ + kRange<-kRange[kRange>=2] + if(length(kRange)==0) stop("Undefined values for kRange; must be greater than or equal to 2") + } + ks<-kRange + } + else ks<-k + if(any(ks>= N)) ks<-ks[ks<N] + clusters<-lapply(ks,FUN=function(currk){ + do.call(clusterFunction@clusterFUN,c(list(k=currk),clusterArgs)) + }) + if(doPostProcess){ + silClusters<-lapply(clusters,function(cl){ + cluster::silhouette(cl,dmatrix=diss) + }) + if(length(ks)>1){ + whichBest<-which.max(sapply(silClusters, mean)) + finalCluster<-clusters[[whichBest]] + sil<-silClusters[[whichBest]][,"sil_width"] + } + else{ + finalCluster<-clusters[[1]] + sil<-silClusters[[1]][,"sil_width"] + } + if(removeSil){ + cl<-as.numeric(sil>silCutoff) + cl[cl==0]<- -1 + cl[cl>0]<-finalCluster[cl>0] + sil[cl == -1] <- -Inf #make the -1 cluster the last one in order + } + else{ + cl<-finalCluster + } + } + else{ + cl<-clusters[[1]] + } + + + #make list of indices and put in order of silhouette width (of positive) + clList<-tapply(1:length(cl),cl,function(x){x},simplify=FALSE) + if(doPostProcess){ + if(orderBy=="best"){ + clAveWidth<-tapply(sil,cl,mean,na.rm=TRUE) + clList<-clList[order(clAveWidth,decreasing=TRUE)] + } + #remove -1 group + if(removeSil){ + whNotAssign<-which(sapply(clList,function(x){all(cl[x]== -1)})) + if(length(whNotAssign)>1) stop("Coding error in removing unclustered samples") + if(length(whNotAssign)>0) clList<-clList[-whNotAssign] + } + } + + return(clList) + +} + + diff --git a/R/makeDendrogram.R b/R/makeDendrogram.R index 38ded715..ad86a44c 100644 --- a/R/makeDendrogram.R +++ b/R/makeDendrogram.R @@ -49,8 +49,8 @@ #' data(simData) #' #' #create a clustering, for 8 clusters (truth was 3) -#' cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=8)) +#' cl <- clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #' #' #create dendrogram of clusters: #' hcl <- makeDendrogram(cl) @@ -63,7 +63,7 @@ setMethod( f = "makeDendrogram", signature = "ClusterExperiment", definition = function(x, whichCluster="primaryCluster",dimReduce=c("none", "PCA", "var","cv","mad"), - ndims=NA,ignoreUnassignedVar=FALSE,unassignedSamples=c("outgroup", "cluster"),...) + ndims=NA,ignoreUnassignedVar=TRUE,unassignedSamples=c("outgroup", "cluster"),...)
{ unassignedSamples<-match.arg(unassignedSamples) if(is.character(whichCluster)) whCl<-.TypeIntoIndices(x,whClusters=whichCluster) else whCl<-whichCluster diff --git a/R/mergeClusters.R b/R/mergeClusters.R index 48df445a..9d048c1c 100644 --- a/R/mergeClusters.R +++ b/R/mergeClusters.R @@ -1,4 +1,4 @@ -.availMergeMethods<-c("adjP", "locfdr", "MB", "JC") +.availMergeMethods<-c("adjP", "locfdr", "MB", "JC","PC","Storey") #' @title Merge clusters based on dendrogram #' #' @description Takes an input of hierarchical clusterings of clusters and @@ -51,7 +51,9 @@ #' transformation stored in the object. If FALSE, then transform(x) will be #' given to the input and will be used for both \code{makeDendrogram} and #' \code{getBestFeatures}, with no voom correction. -#' @details "JC" refers to the method of Ji and Cai (2007), and implementation +#' @details "Storey" refers to the method of Storey (2002). "PC" refers to the +#' method of Pounds and Cheng (2004). "JC" refers to the method of +#' Ji and Cai (2007), and implementation #' of "JC" method is copied from code available on Jiashin Ji's website, #' December 16, 2015 #' (http://www.stat.cmu.edu/~jiashun/Research/software/NullandProp/). "locfdr" @@ -84,13 +86,23 @@ #' @return If `x` is a \code{\link{ClusterExperiment}}, it returns a new #' \code{ClusterExperiment} object with an additional clustering based on the #' merging. This becomes the new primary clustering. +#' @references Ji and Cai (2007), "Estimating the Null and the Proportion +#' of Nonnull Effects in Large-Scale Multiple Comparisons", JASA 102: 495-506. +#' @references Efron (2004) "Large-scale simultaneous hypothesis testing: +#' the choice of a null hypothesis," JASA, 99: 96-104. +#' @references Meinshausen and Buhlmann (2005) "Lower bounds for the +#' number of false null hypotheses for multiple testing of associations", +#' Biometrika 92(4): 893-907. +#' @references Storey (2002) "A direct approach to false discovery rates", J. R. Statist. Soc. B 64(3): 479-498. +#' @references Pounds and Cheng (2004). "Improving false discovery rate estimation." Bioinformatics 20(11): 1737-1745. + #' @seealso makeDendrogram, plotDendrogram, getBestFeatures #' @examples #' data(simData) #' #' #create a clustering, for 8 clusters (truth was 3) -#' cl<-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=8)) +#' cl<-clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #' #' #give more interesting names to clusters: #' newNames<- paste("Cluster",clusterLegend(cl)[[1]][,"name"],sep="") @@ -119,8 +131,8 @@ setMethod(f = "mergeClusters", signature = signature(x = "matrix"), definition = function(x, cl, dendro=NULL, - mergeMethod=c("none", "adjP", "locfdr", "MB", "JC"), - plotInfo=c("none", "all", "mergeMethod","adjP", "locfdr", "MB", "JC"), + mergeMethod=c("none", "Storey","PC","adjP", "locfdr", "MB", "JC"), + plotInfo=c("none", "all", "Storey","PC","adjP", "locfdr", "MB", "JC","mergeMethod"), cutoff=0.1, plot=TRUE, isCount=TRUE, ...) { dendroSamples<-NULL #currently option is not implemented for matrix version...
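Since "Storey" and "PC" are new here, a quick sanity check of the proportion-non-null estimators they rely on (defined further below in this diff as .m1_Storey and .m1_PC); the simulated p-values and numbers are illustrative only:

set.seed(1)
pvals <- c(runif(800), rbeta(200, 0.5, 10))  # roughly 80% null, 20% non-null
1 - length(which(pvals > 0.5))/(1 - 0.5)/length(pvals)  # Storey, lambda=0.5
1 - 2*mean(pvals)                                       # Pounds & Cheng
# both estimates should land near the true non-null proportion of 0.2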
@@ -152,11 +164,13 @@ setMethod(f = "mergeClusters", if(plotInfo=="all") whMethodCalculate<-.availMergeMethods if(plotInfo%in% .availMergeMethods) whMethodCalculate<-unique(c(whMethodCalculate,plotInfo)) sigByNode <- by(sigTable, sigTable$ContrastName, function(x) { + storey<-if("Storey" %in% whMethodCalculate) .myTryFunc(pvalues=x$P.Value, FUN=.m1_Storey) else NA + pc <-if("PC" %in% whMethodCalculate) .myTryFunc(pvalues=x$P.Value, FUN=.m1_PC) else NA mb <-if("MB" %in% whMethodCalculate) .myTryFunc(pvalues=x$P.Value, FUN=.m1_MB) else NA locfdr <-if("locfdr" %in% whMethodCalculate) .myTryFunc(tstats=x$t, FUN=.m1_locfdr) else NA jc <-if("JC" %in% whMethodCalculate) .myTryFunc(tstats=x$t, FUN=.m1_JC) else NA adjP<-if("adjP" %in% whMethodCalculate) .m1_adjP(x$adj) else NA - return(c("adjP"=adjP, "locfdr"=locfdr, "MB"=mb,"JC"=jc)) + return(c("Storey"=storey,"PC"=pc,"adjP"=adjP, "locfdr"=locfdr, "MB"=mb,"JC"=jc)) }) newcl <- cl phylo4Obj <- .makePhylobaseTree(dendro, "dendro") @@ -345,7 +359,20 @@ This makes sense only for counts.") else return(NA) } -#functions for estimating m1/m, the proportion of non-null +#functions for estimating m1/m, the proportion of **non-null** +.m1_Storey<-function(pvalues,lambda=0.5){ + m<-length(pvalues) + num<-length(which(pvalues>lambda)) + pi0<-num/(1-lambda)/m + return(1-pi0) + +} +.m1_PC<-function(pvalues){ + pi0<-2*mean(pvalues) + return(1-pi0) + +} + .m1_MB<-function(pvalues){ nCorrect<-max(howmany::lowerbound(howmany::howmany(pvalues))) #the most you can call correctly return(nCorrect/length(pvalues)) diff --git a/R/plotBarplot.R b/R/plotBarplot.R index 0ac35b83..90031b79 100644 --- a/R/plotBarplot.R +++ b/R/plotBarplot.R @@ -58,7 +58,9 @@ setMethod( signature = signature(clusters = "ClusterExperiment",whichClusters="character"), definition = function(clusters, whichClusters,...) { - wh<-head(.TypeIntoIndices(clusters,whClusters=whichClusters),2) + wh<-.TypeIntoIndices(clusters,whClusters=whichClusters) + if(length(wh)==0) stop("invalid choice of 'whichClusters'") + wh<-head(wh,2) #limit it to 2 return(plotBarplot(clusters,whichClusters=wh,...)) }) diff --git a/R/plotClusters.R b/R/plotClusters.R index 18aabb41..01c556c7 100644 --- a/R/plotClusters.R +++ b/R/plotClusters.R @@ -115,8 +115,8 @@ #' \link[ConsensusClusterPlus]{ConsensusClusterPlus} by Matt Wilkerson and Peter #' Waltman). #' -#' @seealso The \link[ConsensusClusterPlus]{ConsensusClusterPlus} package. -#' +#' @seealso The \code{\link[ConsensusClusterPlus]{ConsensusClusterPlus}} package. +#' @references Wilkerson, M. D. and Hayes, D. N. (2010). "ConsensusClusterPlus: a class discovery tool with confidence assessments and item tracking." Bioinformatics, 26(12), pp. 1572-1573.
#' @export #' #' @examples diff --git a/R/plotDendrogram.R b/R/plotDendrogram.R index 5ea061d2..d9b6c79c 100644 --- a/R/plotDendrogram.R +++ b/R/plotDendrogram.R @@ -45,8 +45,8 @@ #' data(simData) #' #' #create a clustering, for 8 clusters (truth was 3) -#' cl <-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=8)) +#' cl <-clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #' #' #create dendrogram of clusters and then #' # merge clusters based ondendrogram: @@ -138,7 +138,7 @@ setMethod( ### For plotting of dendrogram for the merging ### Add information about the merging as node labels and change edge type ############### - if(!is.null(mergePlotType) && mergePlotType %in% c("all","adjP", "locfdr", "MB", "JC","mergeMethod")){ + if(!is.null(mergePlotType) && mergePlotType %in% c("all",.availMergeMethods,"mergeMethod")){ ##### #convert names of internal nodes for plotting ##### diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index d9dd62bd..101bd1c3 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -70,7 +70,7 @@ #' @param missingColor color assigned to cluster values of '-2' ("missing"). #' @param ... for signature \code{matrix}, arguments passed to \code{aheatmap}. #' For the other signatures, passed to the method for signature \code{matrix}. -#' Not all arguments can be passed to aheatmap effectively, see details. +#' Not all arguments can be passed to \code{aheatmap} effectively, see details. #' @param nFeatures integer indicating how many features should be used (if #' \code{clusterFeaturesData} is 'var' or 'PCA'). #' @param isSymmetric logical. if TRUE indicates that the input matrix is @@ -78,8 +78,8 @@ #' sample matrices (e.g., correlation). #' @param overRideClusterLimit logical. Whether to override the internal limit #' that only allows 10 clusterings/annotations. If overridden, may result in -#' incomprehensible errors from aheatmap. Only override this if you have a -#' very large plotting device and want to see if aheatmap can render it. +#' incomprehensible errors from \code{aheatmap}. Only override this if you have a +#' very large plotting device and want to see if \code{aheatmap} can render it. #' @inheritParams clusterSingle #' #' @details The plotHeatmap function calls \code{\link[NMF]{aheatmap}} to draw @@ -161,7 +161,7 @@ #' \code{sampleData}, and in fact runs out of colors and the remaining levels #' get the color white. Thus if you have many factors or many levels in those #' factors, you should set their colors via \code{clusterLegend}. -#' @details Many arguments can be passed on to aheatmap, however, some are set +#' @details Many arguments can be passed on to \code{aheatmap}, however, some are set #' internally by \code{plotHeatmap.} In particular, setting the values of #' \code{Rowv} or \code{Colv} will cause errors. 
\code{color} in #' \code{aheatmap} is replaced by \code{colorScale} in \code{plotHeatmap.} The @@ -184,7 +184,7 @@ #' for quantile.} #' } #' @author Elizabeth Purdom -#' +#' @seealso \code{\link[NMF]{aheatmap}} #' @export #' #' @examples diff --git a/R/plottingHelpers.R b/R/plottingHelpers.R index 7ff81dae..e79e33a8 100644 --- a/R/plottingHelpers.R +++ b/R/plottingHelpers.R @@ -245,8 +245,8 @@ bigPalette<-.thisPal #' showHeatmapPalettes() #' #' #compare the palettes on heatmap -#' cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -#' sequential=FALSE, clusterDArgs=list(k=8)) +#' cl <- clusterSingle(simData, subsample=FALSE, +#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #' #' \dontrun{ #' par(mfrow=c(2,3)) diff --git a/R/rsec.R b/R/rsec.R index 0838641b..d7bfc7be 100644 --- a/R/rsec.R +++ b/R/rsec.R @@ -1,6 +1,6 @@ -#' Resampling-based Sequential Ensemble Clustering +#' @title Resampling-based Sequential Ensemble Clustering #' -#' Implementation of the RSEC algorithm (Resampling-based Sequential Ensemble +#' @description Implementation of the RSEC algorithm (Resampling-based Sequential Ensemble #' Clustering) for single cell sequencing data. This is a wrapper function #' around the existing clusterExperiment workflow that results in the output of #' RSEC. @@ -27,12 +27,12 @@ setMethod( definition = function(x, isCount=FALSE,transFun=NULL, dimReduce="PCA",nVarDims=NA, nPCADims=c(50), k0s=4:15, - clusterFunction=c("tight","hierarchical01"), + clusterFunction=listBuiltInType01(), alphas=c(0.1,0.2,0.3),betas=0.9, minSizes=1, combineProportion=0.7, combineMinSize=5, dendroReduce="mad",dendroNDims=1000, mergeMethod="adjP",mergeCutoff=0.05,verbose=FALSE, - clusterDArgs=NULL, + mainClusterArgs=NULL, subsampleArgs=NULL, seqArgs=NULL, ncores=1, random.seed=NULL, run=TRUE @@ -47,7 +47,7 @@ setMethod( sequential=TRUE,removeSil=FALSE,subsample=TRUE,silCutoff=0,distFunction=NA, isCount=isCount,transFun=transFun, dimReduce=dimReduce,nVarDims=nVarDims,nPCADims=nPCADims, - clusterDArgs=clusterDArgs,subsampleArgs=subsampleArgs, + mainClusterArgs=mainClusterArgs,subsampleArgs=subsampleArgs, seqArgs=seqArgs,ncores=ncores,random.seed=random.seed,run=run) #browser() if(run){ @@ -55,10 +55,27 @@ setMethod( } return(ce) }) +.methodFormals <- function(f, signature = character()) { + #to find defaults of RSEC + #from this conversation: + #http://r.789695.n4.nabble.com/Q-Get-formal-arguments-of-my-implemented-S4-method-td4702420.html + fdef <- getGeneric(f) + method <- selectMethod(fdef, signature) + genFormals <- base::formals(fdef) + b <- body(method) + if(is(b, "{") && is(b[[2]], "<-") && identical(b[[2]][[2]], as.name(".local"))) { + local <- eval(b[[2]][[3]]) + if(is.function(local)) + return(formals(local)) + warning("Expected a .local assignment to be a function. Corrupted method?") + } + genFormals +} .postClusterMany<-function(ce,...){ -# function(ce,combineProportion,combineMinSize,dendroReduce,dendroNDims,mergeMethod,mergeCutoff,isCount) + defaultArgs<-.methodFormals("RSEC",signature="matrix") passedArgs<-list(...) 
- + whNotShared<-which(!names(defaultArgs)%in%names(passedArgs) ) + if(length(whNotShared)>0) passedArgs<-c(passedArgs,defaultArgs[whNotShared]) ###CombineMany args1<-list() if("combineProportion" %in% names(passedArgs)) args1<-c(args1,"proportion"=passedArgs$combineProportion) @@ -79,11 +96,11 @@ setMethod( ce<-dendroTry args1<-list() if("mergeCutoff" %in% names(passedArgs)) args1<-c(args1,"cutoff"=passedArgs$mergeCutoff) - if("mergeMethod" %in% names(passedArgs)){ + if("mergeMethod" %in% names(passedArgs) && passedArgs$mergeMethod!="none"){ args1<-c(args1,"mergeMethod"=passedArgs$mergeMethod) - ce <- do.call( mergeClusters,c(list(x=ce,plotType="none"), args1, passedArgs[c("isCount")])) + ce <- do.call( mergeClusters,c(list(x=ce,plot=FALSE,plotInfo="none"), args1, passedArgs[c("isCount")])) } - else note("clusters will not be merged unless argument 'mergeMethod' is given") + else note("clusters will not be merged because argument 'mergeMethod' was not given (or was equal to 'none')") } else note("makeDendrogram encountered following error and therefore clusters were not merged:\n", dendroTry) return(ce) diff --git a/R/seqCluster.R b/R/seqCluster.R index 14f755ae..a3b82f15 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -10,19 +10,17 @@ #' on which to run the clustering #' @param k0 the value of K at the first iteration of sequential algorithm, see #' details below or vignette. -#' @param clusterFunction passed to clusterDMat option 'clusterFunction' to -#' indicate method of clustering, see \code{\link{clusterD}}. #' @param subsample logical as to whether to subsample via #' \code{\link{subsampleClustering}} to get the distance matrix at each #' iteration; otherwise the distance matrix is set by arguments to -#' \code{\link{clusterD}}. +#' \code{\link{mainClustering}}. #' @param beta value between 0 and 1 to decide how stable clustership membership #' has to be before 'finding' and removing the cluster. -#' @param top.can only the top.can clusters from \code{\link{clusterD}} (ranked -#' by 'orderBy' argument given to \code{\link{clusterD}}) will be compared +#' @param top.can only the top.can clusters from \code{\link{mainClustering}} (ranked +#' by 'orderBy' argument given to \code{\link{mainClustering}}) will be compared #' pairwise for stability. Making this very big will effectively remove this #' parameter and all pairwise comparisons of all clusters found will be -#' considered. This might result in smaller clusters being found. Current +#' considered. This might result in smaller clusters being found. The current #' default is fairly large, so probably will have little effect. #' @param remain.n when only this number of samples are left (i.e. not yet #' clustered) then algorithm will stop. @@ -34,85 +32,78 @@ #' progress. #' @param subsampleArgs list of arguments to be passed to #' \code{\link{subsampleClustering}}. -#' @param clusterDArgs list of arguments to be passed to -#' \code{\link{clusterD}}(which can include arguments to be passed to -#' \code{\link{cluster01}} or \code{\link{clusterK}}). -#' -#' @details This code is adapted from the code of the tightClust -#' package of Tseng and Wong -#' @details Each iteration of the algorithm will cluster the current set of -#' samples. Depending on the method, the number of clusters resulting from -#' \code{\link{clusterD}} may not be equal to the K used in the clustering of -#' the (subsampled) data. 
The resulting clusters will then be compared to -#' clusters found in the previous iteration that set the subsampling -#' clustering to K-1. For computational (and other?) convenience, only the -#' first top.can clusters of each iteration will be compared to the first -#' top.can clusters of previous iteration for similarity (where top.can -#' currently refers to ordering by size, so first top.can largest clusters). -#' -#' @details If there is a cluster in the current iteration that has overlap -#' similarity > beta to a cluster in the previous iteration, then the cluster -#' with the largest such similarity will be identified as a 'final' cluster -#' and the samples in it will be removed for future iterations. The algorithm -#' will then continue to the next iteration, but without these samples. -#' Furthermore, in this case K for the next iteration will NOT be set to K+1, -#' but will be reset to kinit-1, where kinit was the first K used after the -#' previous 'final' cluster was removed. If kinit-1 beta to any in the previous iteration, then -#' the algorithm will move to the next iteration (i.e. redo after increasing K -#' to K+1). -#' -#' @details If there are less than remain.n samples left after finding a cluster -#' and removing its samples, the algorithm will stop, as subsampling is deamed -#' to no longer be appropriate. If the K has to be increased to beyond k.max -#' without finding any pair of clusters with overlap > beta, then the -#' algorithm will stop. Any samples not found as part of a 'final' cluster -#' after the algorithm stops, will be classified as unclustered (given a value -#' of -1) -#' -#' @details 'subsample' controls what is the D (distance) matrix used for -#' clustering at each iteration. If subsample=TRUE, D is given via -#' \code{\link{subsampleClustering}} function with k=K (with additional -#' arguments passed via subsampleArgs). If subsample=FALSE, D is dist(x), for -#' the samples currently considered in the iteration and clusterFunction must -#' be of the 'K' type (e.g. "pam", see \code{\link{clusterD}}) or an error -#' will be produced. The nsample x nsample matrix D is then clustered via -#' \code{\link{clusterD}} to find clusters. The option 'clusterFunction' is -#' passed to the argument 'clusterFunction' of \code{\link{clusterD}} to -#' control what method is used to cluster D. -#' -#' @details If clusterFunction is of type 'K' (e.g. "pam", see -#' \code{\link{clusterD}}) the 'k' argument of \code{\link{clusterK}} called -#' by \code{\link{clusterD}} is set to the current iteration of K by the -#' sequential iteration, so setting 'k=' in the list given to clusterDArgs -#' will not do anything and will produce a warning to that effect. -#' -#' @details Similarly, the current K of the iteration also determines the 'k' -#' argument passed to \code{\link{subsampleClustering}} so setting 'k=' in -#' the list given to the subsampleArgs will not do anything and will produce a -#' warning to that effect. -#' -#' @details If subsample=FALSE and 'findBestK=FALSE' is passed to clusterDArgs, -#' then each iteration will run the clustering given by clusterFunction on -#' dist(x) iterating over k. However, if subsample=FALSE, you should not set -#' 'findBestK=TRUE' (otherwise clustering dist(x) will be essentially the same -#' for iterating over different k and there is no method implemented to change -#' the choice of how to remove a cluster other than similarity as you change -#' k); an error message will be given if this combination of options are set. 
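The combination ruled out in the removed paragraph above remains disallowed in the rewritten interface (see the note on disallowed combinations in the new details below); a sketch of what it looks like under the updated argument layout (a hypothetical call; exact error text unverified):

try(clusterSingle(simData, subsample=FALSE, sequential=TRUE,
                  seqArgs=list(k0=5, beta=0.9),
                  mainClusterArgs=list(clusterFunction="pam", findBestK=TRUE)))
# expected to stop: with subsample=FALSE the sequential loop iterates over k
# itself, so searching for the best k within each fit is contradictory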
-#' -#' @details However, if clusterFunction="pam" (or is of type 'K') and -#' subsample=TRUE passing either 'findBestK=TRUE' or 'findBestK=FALSE' will -#' function as expected. In particular, the iteration over K will set the -#' number of clusters for clustering of each subsample. If findBestK=FALSE, -#' that same K will be used for clustering of DMat. If findBestK=TRUE, then -#' \code{\link{clusterD}} will search for best k; note that the default -#' 'kRange' over which \code{\link{clusterD}} searches when findBestK=TRUE -#' depends on the input value of 'k' (you can change this to a fixed set of -#' values by setting 'kRange' explicitly in the clusterDArgs list). -#'
+#' @param mainClusterArgs list of arguments to be passed to
+#' \code{\link{mainClustering}}.
+#' @inheritParams clusterSingle
+#' @details \code{seqCluster} is not meant to be called by the user. It is only
+#' an exported function so as to be able to clearly document the arguments for
+#' \code{seqCluster} which can be passed via the argument \code{seqArgs} in
+#' functions like \code{\link{clusterSingle}} and \code{\link{clusterMany}}.
+#' @details This code is adapted from the sequential portion of the code of the
+#' tightClust package of Tseng and Wong. At each iteration the algorithm
+#' finds a set of samples that constitute a homogeneous cluster, removes
+#' them, and iterates again to find the next set of samples that form a
+#' cluster.
+#' @details In each iteration, to determine the next homogeneous set of
+#' samples, the algorithm will iteratively cluster the current set of samples
+#' for a series of increasing values of the parameter $K$, starting at a value
+#' \code{kinit} and increasing by 1 at each iteration, until a sufficiently
+#' homogeneous set of clusters is found. For the first set of homogeneous
+#' samples, \code{kinit} is set to the argument $k0$; for subsequent sets,
+#' \code{kinit} is set internally (see below).
+#' @details How the value of $K$ is used differs depending on the value of
+#' \code{subsample}. If \code{subsample=TRUE}, $K$ is the \code{k} sent to the
+#' cluster function \code{clusterFunction} sent to
+#' \code{\link{subsampleClustering}} via \code{subsampleArgs}; then
+#' \code{\link{mainClustering}} is run on the co-occurance matrix resulting from
+#' \code{\link{subsampleClustering}} with the \code{ClusterFunction} object
+#' defined in the argument \code{clusterFunction} set via \code{mainClusterArgs}.
+#' The number of clusters actually resulting from this run of
+#' \code{\link{mainClustering}} may not be equal to the $K$ sent to the clustering
+#' done in \code{\link{subsampleClustering}}. If \code{subsample=FALSE},
+#' \code{\link{mainClustering}} is called directly on the data to determine the
+#' clusters, and the $K$ set by \code{seqCluster} for this iteration determines the
+#' parameter of the clustering done by \code{\link{mainClustering}}. Specifically,
+#' the argument \code{clusterFunction} defines the clustering of the
+#' \code{\link{mainClustering}} step and \code{k} is sent to that
+#' \code{ClusterFunction} object. This means that if \code{subsample=FALSE},
+#' the \code{clusterFunction} must be of \code{algorithmType} "K".
+#' @details In either setting of \code{subsample}, the resulting clusters from
+#' \code{\link{mainClustering}} for a particular $K$ will be compared to clusters
+#' found in the previous iteration of $K-1$. For computational
+#' convenience, only the first \code{top.can} clusters of each iteration will
+#' be compared to the first \code{top.can} clusters of the previous iteration for
+#' similarity (where \code{top.can} currently refers to ordering by size, so
+#' the first \code{top.can} largest clusters).
+#' @details If none of the first \code{top.can} clusters in the current
+#' iteration $K$ has overlap similarity > \code{beta} to any in the
+#' previous iteration, then the algorithm will move to the next iteration,
+#' increasing to $K+1$.
+#'
+#' @details If, however, one of these clusters in the current
+#' iteration $K$ has overlap similarity > \code{beta} to a cluster in the
+#' previous iteration $K-1$, then the cluster with the largest such similarity
+#' will be identified as a homogeneous set of samples and the samples in it
+#' will be removed and designated as such. The algorithm will then start again
+#' to determine the next set of homogeneous samples, but without these samples.
+#' Furthermore, in this case (i.e. a cluster was found and removed), the value
+#' of \code{kinit} will be reset to \code{kinit-1}; i.e. the range of
+#' increasing $K$ that will be iterated over to find a set of homogeneous
+#' samples will start off one value less than was the case for the previous
+#' set of homogeneous samples. If \code{kinit-1}<\code{k.min}, then
+#' \code{kinit} will be set to \code{k.min}.
+#'
+#'
+#' @details If there are fewer than \code{remain.n} samples left after finding a
+#' cluster and removing its samples, the algorithm will stop, as subsampling
+#' is deemed to no longer be appropriate. If $K$ has to be increased
+#' beyond \code{k.max} without finding any pair of clusters with overlap >
+#' \code{beta}, then the algorithm will stop. Any samples not found as part of a
+#' homogeneous set of clusters at that point will be classified as unclustered
+#' (given a value of -1).
+#' @details Certain combinations of inputs to \code{mainClusterArgs} and
+#' \code{subsampleArgs} are not allowed. See \code{\link{clusterSingle}} for
+#' these explanations.
#' @return A list with values
#' \itemize{
#'
@@ -133,39 +124,38 @@
#' Approach for Identifying Stable and Tight Patterns in Data", Biometrics,
#' 61:10-16.
#'
-#' @seealso tight.clust
+#' @seealso tight.clust,
+#' \code{\link{clusterSingle}},\code{\link{mainClustering}},\code{\link{subsampleClustering}}
+#'
#' @examples
#' \dontrun{
#' data(simData)
#'
#' set.seed(12908)
-#'
-#' clustSeqHier <- seqCluster(t(simData), k0=5, subsample=TRUE,
-#' clusterFunction="hierarchical01", beta=0.8, subsampleArgs=list(resamp.n=100,
+#' clustSeqHier <- seqCluster(simData, k0=5, subsample=TRUE,
+#' beta=0.8, subsampleArgs=list(resamp.num=100,
#' samp.p=0.7, clusterFunction="kmeans", clusterArgs=list(nstart=10)),
-#' clusterDArgs=list(minSize=5))
+#' mainClusterArgs=list(minSize=5,clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1)))
#' }
#' @export
-seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarchical01","pam","hierarchicalK"),
- subsample=TRUE,beta = 0.7, top.can = 15, remain.n = 30, k.min = 3,
- k.max=k0+10,verbose=TRUE, subsampleArgs=NULL,clusterDArgs=NULL)
+#' @rdname seqCluster
+#' @export
+seqCluster<-function(x=NULL, diss=NULL, k0,
+ subsample=TRUE, beta, top.can = 5, remain.n = 30, k.min = 3,
+ k.max=k0+10,verbose=TRUE, subsampleArgs=NULL,mainClusterArgs=NULL,checkDiss=TRUE)
{
- input<-.checkXDissInput(x,diss)
- #for now, if use pam for subsampleClusterMethod, just use given k.
- if(!is.function(clusterFunction)){ - clusterFunction<-match.arg(clusterFunction) - if(!is.function(clusterFunction)) typeAlg<-.checkAlgType(clusterFunction) - } - else{ - if(! "typeAlg" %in% clusterDArgs) stop("if you provide your own clustering algorithm to be passed to clusterD, then you must specify 'typeAlg' in clusterDArgs") - else typeAlg<-clusterDArgs[["typeAlg"]] - } - if(typeAlg == "K"){ - if("findBestK" %in% names(clusterDArgs) & !subsample){ - if(clusterDArgs[["findBestK"]]) stop("Cannot do sequential clustering where subsample=FALSE and 'findBestK=TRUE' is passed via clusterDArgs. See help documentation.") - } - - } + ######## + ####Checks + ######## + + checkOut<-.checkSubsampleClusterDArgs(x=x,diss=diss,subsample=subsample,sequential=TRUE,mainClusterArgs=mainClusterArgs,subsampleArgs=subsampleArgs,checkDiss=checkDiss) + if(is.character(checkOut)) stop(checkOut) +else { + mainClusterArgs<-checkOut$mainClusterArgs + subsampleArgs<-checkOut$subsampleArgs + input<-checkOut$inputClusterD +} + ################ ################ ###The following is legacy of tight.clust. They originally had programmed ability to look across more than 2 at each step to determing the stability of a cluster. This was not what they described in paper, and function is hard-coded at 2, but I have left code here in case we ever wanted to reconsider this issue. @@ -180,23 +170,14 @@ seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarch whReturn<-switch(kReturn,"last"=seq.num,"first"=1) #way to index which one gets returned. ################ ################ - if(input %in% c("X","both")) N <- dim(x)[2] + if(input %in% c("X")) N <- dim(x)[2] if(input=="diss") N<-dim(diss)[2] if(verbose){ - if(input %in% c("X","both")) cat(paste("Number of points:", N, "\tDimension:", dim(x)[1], "\n")) + if(input %in% c("X")) cat(paste("Number of points:", N, "\tDimension:", dim(x)[1], "\n")) else cat(paste("Number of points:", N,"\n")) } -# if(input %in% c("X","both")){ -# original.data <- x -# colnames(x) <- as.character(1:N) -# id <- colnames(x) -# } -# else{ -# original.data <- diss -# id<-colnames(diss) -# } - if(input %in% c("X","both")) colnames(x) <- as.character(1:N) - if(input %in% c("diss","both")) colnames(diss)<-rownames(diss)<-as.character(1:N) + if(input %in% c("X")) colnames(x) <- as.character(1:N) + if(input %in% c("diss")) colnames(diss)<-rownames(diss)<-as.character(1:N) #iterative setup remain <- N #keep track of how many samples not yet clustered (stop when less than remain.n) @@ -210,56 +191,45 @@ seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarch kstart<-c() #the starting k for the cluster kend<-c() #the ending k for the cluster whyStop<-NULL - if("k" %in% names(subsampleArgs)){ - #remove predefined versions of k from both. - whK<-which(names(subsampleArgs)=="k") - warning("Setting 'k' in subsampleArgs when the seqCluster is called will have no effect.") - subsampleArgs<-subsampleArgs[-whK] - } - if("k" %in% names(clusterDArgs)){ - whK<-which(names(clusterDArgs)=="k") - warning("Setting 'k' in clusterDArgs when the seqCluster is called will have no effect.") - clusterDArgs<-clusterDArgs[-whK] + + updateClustering<-function(newk){ + if(verbose) cat(paste("k =", newk,"\n")) + if(subsample){ + tempArgs<-subsampleArgs + tempArgs[["clusterArgs"]]<-c(list(k=newk), subsampleArgs[["clusterArgs"]]) #set k + #also set the k for the mainClustering to be the same as in subsampling. 
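+ # (e.g. with newk=5, each subsample is clustered into 5 clusters and the
+ # main clustering of the resulting co-occurance matrix is also given k=5
+ # via its clusterArgs)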
+ tempClusterDArgs<-mainClusterArgs + tempClusterDArgs[["clusterArgs"]] <- c(list(k=newk), mainClusterArgs[["clusterArgs"]]) + + res <- .clusterWrapper(x=x, subsample=subsample, subsampleArgs=tempArgs, mainClusterArgs=tempClusterDArgs)$results + } + else{ + tempArgs<-mainClusterArgs + tempArgs[["clusterArgs"]]<-c(list(k=newk), mainClusterArgs[["clusterArgs"]]) #set k + res <- .clusterWrapper(x=x, diss=diss, subsample=subsample, subsampleArgs=subsampleArgs, mainClusterArgs=tempArgs)$results + + } + return(res) } while (remain >= remain.n && (found || k <= k.max)) { if (found) { #i.e. start finding new cluster if(verbose) cat(paste("Looking for cluster", nfound + 1, "...\n")) k <- k.start currentStart<-k.start #will add this to kstart if successful in finding cluster - #find clusters for K,K+1 for (i in 1:seq.num) { - if(verbose) cat(paste("k =", k + i - 1,"\n")) - if(subsample){ - tempArgs<-c(list(k=k + i - 1),subsampleArgs) #set k - res <- .clusterWrapper(x=x, subsample=subsample, clusterFunction=clusterFunction, subsampleArgs=tempArgs, clusterDArgs=clusterDArgs,typeAlg=typeAlg)$results - } - else{ - tempArgs<-c(list(k=k + i - 1),clusterDArgs) #set k - res <- .clusterWrapper(x=x, diss=diss, subsample=subsample, clusterFunction=clusterFunction, subsampleArgs=subsampleArgs, clusterDArgs=tempArgs,typeAlg=typeAlg)$results - - } - # if(length(res)==0) { - # cat(paste("Found",paste(nClusterPerK,collapse=","),"clusters for k=",paste(k+1:seq.num-1,collapse=","),". Stopping because zero-length cluster.\n")) - # whyStop<-paste("Stopped in midst of searching for cluster",nfound+1," because no clusters meeting criteria found for iteration k=",k+i-1,"and previous clusters not similar enough.") - # } + newk<-k + i - 1 + res<-updateClustering(newk) if(length(res)>0) res <- res[1:min(top.can,length(res))] candidates[[i]]<-res } } else { #need to go increase to K+2,K+3, etc. candidates <- candidates[-1] #remove old k - if(verbose) cat(paste("k =", k + seq.num - 1, "\n")) + newk<-k + seq.num - 1 + if(verbose) cat(paste("k =", newk, "\n")) #add new k (because always list o) - if(subsample){ - tempArgs<-c(list(k=k + seq.num - 1),subsampleArgs) #set k - res <- .clusterWrapper(x=x, diss=diss, subsample=subsample, clusterFunction=clusterFunction, subsampleArgs=tempArgs, clusterDArgs=clusterDArgs,typeAlg=typeAlg)$results - } - else{ - tempArgs<-c(list(k=k + seq.num - 1),clusterDArgs) #set k - res <- .clusterWrapper(x=x, diss=diss, subsample=subsample, clusterFunction=clusterFunction, subsampleArgs=subsampleArgs, clusterDArgs=tempArgs,typeAlg=typeAlg)$results - - } + res<-updateClustering(newk) if(length(res)>0) res <- res[1:min(top.can,length(res))] candidates[[seq.num]] <- res } @@ -273,7 +243,6 @@ seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarch #all invalid -- probably means that for some k there were no candidates found. So should stop. if(verbose) cat(paste("Found ",paste(nClusterPerK,collapse=","),"clusters for k=",paste(k+1:seq.num-1,collapse=","),", respectively. 
Stopping iterating because zero-length cluster.\n"))
 whyStop<-paste("Stopped in midst of searching for cluster",nfound+1," because no clusters meeting criteria found for iteration k=",k+i-1,"and previous clusters not similar enough.")
- #browser()
 break
 }
 if(length(whInvalid)>0){
@@ -314,11 +283,11 @@ seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarch
 found.temp <- candidates[[whReturn]][[tempIndex[which.max(beta.temp)[1], whReturn]]]
 kend[[nfound]]<-k+seq.num-1 #just assuming returning last here!
 kstart[[nfound]]<-currentStart
- if(input %in% c("X","both")) tclust[[nfound]] <- colnames(x)[found.temp] #need to do rownames, because remove rows from x
+ if(input %in% c("X")) tclust[[nfound]] <- colnames(x)[found.temp] #need to do rownames, because remove rows from x
 else tclust[[nfound]] <- colnames(diss)[found.temp] #need to do rownames, because remove rows from x
 mode(tclust[[nfound]]) <- "numeric"
- if(input %in% c("X","both")) x <- x[,-found.temp]
- if(input %in% c("diss","both")) diss<-diss[-found.temp,-found.temp]
+ if(input %in% c("X")) x <- x[,-found.temp]
+ if(input %in% c("diss")) diss<-diss[-found.temp,-found.temp]
 remain <- remain - length(tclust[[nfound]])
 if(verbose) cat(paste("Cluster size:", length(tclust[[nfound]]),
 "\tRemaining number of points:", remain, "\n"),
@@ -333,7 +302,6 @@ seqCluster<-function (x=NULL, diss=NULL, k0, clusterFunction=c("tight","hierarch
 if(remain< remain.n) whyStop<-"Ran out of samples"
 if(!found & k>k.max) whyStop<-paste("Went past k.max=",k.max,"in looking for cluster with similarity to previous.")
 }
- #browser()
 clusterVector<-.convertClusterListToVector(tclust,N)
 if(all(clusterVector==-1) & length(tclust)>0) stop("coding error")
 if(nfound>0){
diff --git a/R/subsampleClustering.R b/R/subsampleClustering.R
index df22acc2..225da89c 100644
--- a/R/subsampleClustering.R
+++ b/R/subsampleClustering.R
@@ -1,131 +1,201 @@
 #' Cluster subsamples of the data
-#'
-#' Given a data matrix, this function will subsample the rows
-#' (samples), cluster the subsamples, and return a \code{n x n} matrix with the
-#' probability of co-occurance.
-#'
+#'
+#' Given input data, this function will subsample the samples, cluster the
+#' subsamples, and return a \code{n x n} matrix with the probability of
+#' co-occurance.
+#' @name subsampleClustering
 #' @param x the data on which to run the clustering (samples in columns).
-#' @param k number of clusters to find for each clustering of a subsample
-#' (passed to clusterFunction).
-#' @param clusterFunction a function that clusters a \code{p x n} matrix of
-#' data. Can also be given character values 'pam' or 'kmeans' to indicate use
-#' of internal wrapper functions. Must accept arguments 'x' and 'k' (whether
-#' uses them or not). See Details for format of what must return.
-#' @param clusterArgs a list of parameter arguments to be passed to
-#' clusterFunction.
+#' @param diss a dissimilarity matrix on which to run the clustering.
+#' @param clusterFunction a \code{\link{ClusterFunction}} object that defines
+#' the clustering routine. See \code{\link{ClusterFunction}} for required
+#' format of user-defined clustering routines. User can also give a character
+#' value to the argument \code{clusterFunction} to indicate the use of
+#' clustering routines provided in the package. Type
+#' \code{\link{listBuiltInFunctions}} at the command prompt to see the built-in
+#' clustering routines. If \code{clusterFunction} is missing, the default is
+#' set to "pam".
+#' @param clusterArgs a list of parameter arguments to be passed to the function
+#' defined in the \code{clusterFUN} slot of the \code{ClusterFunction}
+#' object. For any given \code{\link{ClusterFunction}} object, use the function
+#' \code{\link{requiredArgs}} to get a list of required arguments for the
+#' object.
+#' @param resamp.num the number of subsamples to draw.
+#' @param samp.p the proportion of samples to sample for each subsample.
-#' @param classifyMethod method for determining which samples should be used in
-#' the co-occurance matrix. "All"= all samples, "OutOfSample"= those not
-#' subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample"
-#' require that you provide classifyFunction to define how to classify those
-#' samples not in the subsample into a cluster. If "All" is chosen, all
-#' samples will be classified into clusters via the classifyFunctions, not
-#' just those that are out-of-sample. Note if not choose 'All' possible to get
-#' NAs in resulting D matrix (particularly if not enough subsamples taken).
-#' @param classifyFunction a function which, given the output of clusterFunction
-#' and new data points, will classify the new data points into a cluster.
-#' @param ncores integer giving the number of cores. If ncores>1, mclapply will
+#' @param classifyMethod method for determining which samples should be used in
+#' calculating the co-occurance matrix. "All"= all samples, "OutOfSample"=
+#' those not subsampled, and "InSample"=those in the subsample. See details
+#' for explanation.
+#' @param ncores integer giving the number of cores. If ncores>1, mclapply will
 #' be called.
 #' @param ... arguments passed to mclapply (if ncores>1).
-#'
-#' @details The \code{clusterFunction} must be a function that takes as an
-#' argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It
-#' minimally must return a list with element named 'clustering' giving the
-#' vector of cluster ids. To be incorporated with the larger hierarchy, it
-#' should be list with elements of a partition object, just as is returned by
-#' \code{\link[cluster]{pam}}. Generally, the user will need to write a
-#' wrapper function to do this. In the case of pam or kmeans, the user can
-#' identify clusterFunction as "pam" or "kmeans", and the package functions
-#' will use internally written wrappers for the clusterFunction and
-#' classifyFunction arguments. Additional arguments should be supplied via
-#' clusterArgs.
-#'
-#' @details The classifyFunction should take as an object a data matrix 'x' with
-#' samples on the columns, and the output of the clusterFunction. Note that the
-#' function should assume that the input 'x' is not the same samples that were
-#' input to the clusterFunction (but can assume that it is the same number of
-#' features/columns).
-#'
-#' @return A \code{n x n} matrix of co-occurances.
+#' @inheritParams mainClustering
+#' @inheritParams clusterSingle
+#'
+#' @details \code{subsampleClustering} is not usually called directly by the
+#' user. It is only an exported function so as to be able to clearly document
+#' the arguments for \code{subsampleClustering} which can be passed via the
+#' argument \code{subsampleArgs} in functions like \code{\link{clusterSingle}}
+#' and \code{\link{clusterMany}}.
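A sketch of the usual route for these arguments (illustrative parameter values; not taken from the patch):

# rather than calling subsampleClustering directly, pass its arguments
# via subsampleArgs in the workflow functions:
cl <- clusterSingle(simData, subsample=TRUE, sequential=FALSE,
                    subsampleArgs=list(clusterFunction="kmeans", clusterArgs=list(k=3)),
                    mainClusterArgs=list(clusterFunction="hierarchical01",
                                         clusterArgs=list(alpha=0.1)))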
+#' @details \code{classifyMethod}: The choice of "All" or "OutOfSample" for
+#' \code{classifyMethod} requires the classification of arbitrary samples not
+#' originally in the clustering to clusters; this is done via the \code{classifyFUN}
+#' provided in the \code{\link{ClusterFunction}} object. If the
+#' \code{\link{ClusterFunction}} object does not have such a function to
+#' define how to classify into a cluster samples not in the subsample that
+#' created the clustering, then \code{classifyMethod} must be
+#' \code{"InSample"}. Note that if "All" is chosen, all samples will be
+#' classified into clusters via the \code{classifyFUN}, not just those that are
+#' out-of-sample; this could result in different assignments to clusters for
+#' the in-sample samples than their original assignment by the clustering,
+#' depending on the classification function. If you do not choose 'All', it is
+#' possible to get NAs in the resulting S matrix (particularly when not enough
+#' subsamples are taken), which can cause errors if you then pass the resulting
+#' D=1-S matrix to \code{\link{mainClustering}}. For this reason the default is
+#' "All".
+#' @return A \code{n x n} matrix of co-occurances, i.e. a symmetric matrix with
+#' [i,j] entries equal to the percentage of subsamples where the ith and jth
+#' sample were clustered into the same cluster. The percentage is only out of
+#' those subsamples where the ith and jth samples were both assigned to a
+#' clustering. If \code{classifyMethod=="All"}, this is all subsamples for all
+#' i,j pairs. But if \code{classifyMethod=="InSample"} or
+#' \code{classifyMethod=="OutOfSample"}, then the percentage is only taken on
+#' those subsamples where the ith and jth sample were both in or out of
+#' sample, respectively, relative to the subsample.
#'
#' @examples
#' data(simData)
+#' coOccur <- subsampleClustering(clusterFunction="kmeans", x=simData,
+#' clusterArgs=list(k=3,nstart=10), resamp.num=100, samp.p=0.7)
#'
-#' subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans",
-#' clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7)
-#'
-#' heatmap(subD)
+#' #visualize the resulting co-occurance matrix
+#' plotHeatmap(coOccur)
+#' @aliases subsampleClustering,character-method
#' @export
-subsampleClustering<-function(x,k,clusterFunction="pam", clusterArgs=NULL,
- classifyMethod=c("All","InSample","OutOfSample"),classifyFunction=NULL,
- resamp.num = 100, samp.p = 0.7,ncores=1,... )
-{
- #input<-.checkXDissInput(x,diss)
- if(!is.function(clusterFunction)){
- if(clusterFunction%in%c("pam","kmeans")){
- if(clusterFunction=="pam"){
- clusterFunction<-.pamCluster
- classifyFunction<-.pamClassify
- }
- else if(clusterFunction=="kmeans"){
- clusterFunction<-.kmeansCluster
- classifyFunction<-.kmeansClassify
- }
- }
- else stop("clusterFunction must define a function")
- }
- classifyMethod<-match.arg(classifyMethod)
- if(classifyMethod %in% c("All","OutOfSample") && missing(classifyFunction)){
- stop("Must provide a classification function for the 'All' or 'OutOfSample' options")
+setMethod(
+ f = "subsampleClustering",
+ signature = signature(clusterFunction = "character"),
+ definition = function(clusterFunction,...){
+ subsampleClustering(getBuiltInFunction(clusterFunction),...)
+ } - #if(input %in% c("X","both")) N <- dim(x)[2] else N<-dim(diss)[2] - N <- dim(x)[2] + ) + +# #' @rdname subsampleClustering +# #' @export +# setMethod( +# f = "subsampleClustering", +# signature = signature(clusterFunction = "missing"), +# definition = function(clusterFunction,...){ +# subsampleClustering(clusterFunction="pam",...) +# } +# ) + +#' @rdname subsampleClustering +#' @export +setMethod( + f = "subsampleClustering", + signature = signature(clusterFunction = "ClusterFunction"), +definition=function(clusterFunction, x=NULL,diss=NULL,distFunction=NA,clusterArgs=NULL, + classifyMethod=c("All","InSample","OutOfSample"), + resamp.num = 100, samp.p = 0.7,ncores=1,checkArgs=TRUE,checkDiss=TRUE,... ) +{ + ####################### + ### Check both types of inputs and create diss if needed, and check it. + ####################### + input<-.checkXDissInput(x,diss,inputType=clusterFunction@inputType,checkDiss=checkDiss) + classifyMethod<-match.arg(classifyMethod) + if(classifyMethod %in% c("All","OutOfSample") && is.null(clusterFunction@classifyFUN)){ + classifyMethod<-"InSample" #silently change it... + } + else{ + inputClassify<-.checkXDissInput(x, diss, inputType=clusterFunction@inputClassifyType, checkDiss=FALSE) #don't need to check it twice! + } + if((input=="X" & clusterFunction@inputType=="diss") || (classifyMethod!="InSample" && inputClassify=="X" && clusterFunction@inputClassifyType=="diss")){ + diss<-.makeDiss(x,distFunction=distFunction,checkDiss=checkDiss,algType=clusterFunction@algorithmType) + if(input=="X") input<-"diss" + if(inputClassify=="X") inputClassify<-"diss" + } + #----- + # Other Checks + #----- + reqArgs<-requiredArgs(clusterFunction) + if(!all(reqArgs %in% names(clusterArgs))) stop(paste("For this clusterFunction algorithm type ('",algorithmType(clusterFunction),"') must supply arguments",reqArgs,"as elements of the list of 'clusterArgs'")) + +#----- +# Basic parameters, subsamples +#----- + if(input %in% c("X","both")) N <- dim(x)[2] else N<-dim(diss)[2] subSize <- round(samp.p * N) idx<-replicate(resamp.num,sample(1:N,size=subSize)) #each column a set of indices for the subsample. 
- perSample<-function(ids){ -# xWithIds<-switch(input,"X"=x[,ids,drop=FALSE],"diss"=x,"both"=x[,ids,drop=FALSE]) -# dissWithIds<-switch(input,"X"=diss,"diss"=diss[ids,ids,drop=FALSE],"both"=diss[ids,ids,drop=FALSE]) - xWithIds<-x[,ids,drop=FALSE] - #result<-do.call(clusterFunction,c(list(x=xWithIds,diss=dissWithIds,k=k),clusterArgs)) - result<-do.call(clusterFunction,c(list(x=xWithIds,k=k),clusterArgs)) - #if(classifyMethod=="All") classX<-classifyFunction(x=x,diss=diss,result) - if(classifyMethod=="All") classX<-classifyFunction(x=x,result) - if(classifyMethod=="OutOfSample"){ - # xWithoutIds<-switch(input,"X"=x[,-ids,drop=FALSE],"diss"=x,"both"=x[,-ids,drop=FALSE]) - # dissWithoutIds<-switch(input,"X"=diss,"diss"=diss[-ids,-ids,drop=FALSE],"both"=diss[-ids,-ids,drop=FALSE]) - xWithoutIds<-x[,-ids,drop=FALSE] - #classElse<-classifyFunction(x=xWithoutIds, diss=dissWithoutIds,result) - classElse<-classifyFunction(x=xWithoutIds, result) - classX<-rep(NA,N) - classX[-ids]<-classElse - } - if(classifyMethod=="InSample"){ - classX<-rep(NA,N) - classX[ids]<-result$clustering - } - D <- outer(classX, classX, function(a, b) a == b) - Dinclude<-matrix(1,N,N) - whNA<-which(is.na(classX)) - if(length(whNA)>0){ - Dinclude[whNA,]<-0 #don't add them to the denominator either - Dinclude[,whNA]<-0 - D[whNA,]<-0 #don't add to sum - D[,whNA]<-0 - } - return(list(D=D,Dinclude=Dinclude)) - } + #----- + # Function that calls the clustering for each subsample + #----- + perSample<-function(ids){ + ##---- + ##Cluster part of subsample + ##---- + argsClusterList<-.makeDataArgs(dataInput=input, funInput=clusterFunction@inputType, xData=x[,ids,drop=FALSE], dissData=diss[ids,ids,drop=FALSE]) + + #if doing InSample, do cluster.only because will be more efficient, e.g. pam and kmeans. 
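+ # (cluster.only=TRUE asks clusterFUN for only the cluster assignments;
+ # the fuller output is needed just when classifyFUN must later classify
+ # samples that were not part of this subsample)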
+ argsClusterList<-c(argsClusterList,list("checkArgs"=checkArgs,"cluster.only"= (classifyMethod=="InSample") ))
+ result<-do.call(clusterFunction@clusterFUN,c(argsClusterList,clusterArgs))
+
+ ##----
+ ##Classify part of subsample
+ ##----
+ if(classifyMethod=="All"){
+ argsClassifyList<-.makeDataArgs(dataInput=inputClassify,funInput=clusterFunction@inputClassifyType, xData=x, dissData=diss)
+ classX<-do.call(clusterFunction@classifyFUN,c(argsClassifyList,list(clusterResult=result)))
+ }
+ if(classifyMethod=="OutOfSample"){
+ argsClassifyList<-.makeDataArgs(dataInput=inputClassify,funInput=clusterFunction@inputClassifyType, xData=x[,-ids,drop=FALSE], dissData=diss[-ids,-ids,drop=FALSE])
+ classElse<-do.call(clusterFunction@classifyFUN,c(argsClassifyList, list(clusterResult=result)))
+ classX<-rep(NA,N)
+ classX[-ids]<-classElse
+ }
+ if(classifyMethod=="InSample"){
+ classX<-rep(NA,N)
+ #handle methods whose result comes back as a list rather than a vector
+ if(is.list(result)){
+ #the next shouldn't happen any more because should be cluster.only=TRUE
+ # if("clustering" %in% names(result)){
+ # classX[ids]<-result$clustering
+ # }
+ # else{
+ if(clusterFunction@outputType=="list"){
+ resultVec <-.convertClusterListToVector(result,N=length(ids))
+ classX[ids]<-resultVec
+ }
+ else stop("The clusterFunction given to subsampleClustering returned a list even though cluster.only=TRUE, but does not have outputType='list'")
+# }
+ }
+ else{
+ classX[ids]<-result
+
+ }
+ }
+ D <- outer(classX, classX, function(a, b) a == b)
+ Dinclude<-matrix(1,N,N)
+ whNA<-which(is.na(classX))
+ if(length(whNA)>0){
+ Dinclude[whNA,]<-0 #don't add them to the denominator either
+ Dinclude[,whNA]<-0
+ D[whNA,]<-0 #don't add to sum
+ D[,whNA]<-0
+ }
+ return(list(D=D,Dinclude=Dinclude))
+ }
 if(ncores==1){
 DList<-apply(idx,2,perSample)
 }
 else{
- DList<-parallel::mclapply(1:ncol(idx),function(nc){perSample(idx[,nc])},mc.cores=ncores,...)
+ DList<-parallel::mclapply(1:ncol(idx), function(nc){ perSample(idx[,nc]) }, mc.cores=ncores,...)
 }
 DDenom<-Reduce("+",lapply(DList,function(y){y$Dinclude}))
 DNum<-Reduce("+",lapply(DList,function(y){y$D}))
- Dbar = DNum/DDenom
-# if(input %in% c("X","both")) rownames(Dbar)<-colnames(Dbar)<-colnames(x)
-# else rownames(Dbar)<-colnames(Dbar)<-colnames(diss)
+ Dbar <- DNum/DDenom
+ if(input %in% c("X")) rownames(Dbar)<-colnames(Dbar)<-colnames(x)
+ else rownames(Dbar)<-colnames(Dbar)<-colnames(diss)
- rownames(Dbar)<-colnames(Dbar)<-colnames(x)
- return(Dbar)
-}
+ return(Dbar)
+})
diff --git a/man/ClusterExperiment-class.Rd b/man/ClusterExperiment-class.Rd
index d225310d..3ba490f2 100644
--- a/man/ClusterExperiment-class.Rd
+++ b/man/ClusterExperiment-class.Rd
@@ -63,7 +63,7 @@ Slots).}
 \item{dendro_clusters}{dendrogram. Sets the `dendro_clusters` slot (see
Slots).}
-\item{dendro_outbranch}{logical. Sets the `dendro_outbranch` slot (see Slots)}
+\item{dendro_outbranch}{logical. Sets the dendro_outbranch slot (see Slots).}
 \item{coClustering}{matrix.
Sets the `coClustering` slot (see Slots).}
}
diff --git a/man/ClusterFunction-class.Rd b/man/ClusterFunction-class.Rd
new file mode 100644
index 00000000..e627e458
--- /dev/null
+++ b/man/ClusterFunction-class.Rd
@@ -0,0 +1,156 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/AllClasses.R
+\docType{class}
+\name{ClusterFunction-class}
+\alias{ClusterFunction-class}
+\alias{ClusterFunction}
+\alias{clusterFunction}
+\alias{internalFunctionCheck}
+\alias{clusterFunction}
+\alias{clusterFunction,function-method}
+\title{Class ClusterFunction}
+\usage{
+internalFunctionCheck(clusterFUN, inputType, algorithmType, outputType)
+
+clusterFunction(clusterFUN, ...)
+
+\S4method{clusterFunction}{`function`}(clusterFUN, inputType, outputType,
+ algorithmType, inputClassifyType = NA_character_,
+ requiredArgs = NA_character_, classifyFUN = NULL, checkFunctions = TRUE)
+}
+\arguments{
+\item{clusterFUN}{function passed to slot \code{clusterFUN}.}
+
+\item{inputType}{character for slot \code{inputType}}
+
+\item{algorithmType}{character for slot \code{algorithmType}}
+
+\item{outputType}{character for slot \code{outputType}}
+
+\item{...}{arguments passed to different methods of \code{clusterFunction}}
+
+\item{inputClassifyType}{character for slot \code{inputClassifyType}}
+
+\item{requiredArgs}{character for slot \code{requiredArgs}}
+
+\item{classifyFUN}{function for slot \code{classifyFUN}}
+
+\item{checkFunctions}{logical for whether to check the input functions with
+\code{internalFunctionCheck}}
+}
+\value{
+A \code{ClusterFunction} object.
+}
+\description{
+\code{ClusterFunction} is a class for holding functions that can
+ be used for clustering in the clustering algorithms in this package.
+
+The constructor \code{clusterFunction} creates an object of the
+ class \code{ClusterFunction}.
+}
+\details{
+Required arguments for \code{clusterFUN}: \itemize{ \item{"x or
+ diss"}{either \code{x} and/or \code{diss} depending on \code{inputType}. If
+ \code{x}, then \code{x} is assumed to be nfeatures x nsamples (like
+ assay(CEObj) would give)} \item{"checkArgs"}{logical argument. If
+ \code{checkArgs=TRUE}, the \code{clusterFUN} should check if the arguments
+ passed in \code{...} are valid and return an error if not; otherwise, no
+ error will be given, but the check should be done and only valid arguments
+ in \code{...} passed along. This is necessary for the function to work with
+ \code{clusterMany} which passes all arguments to all functions without
+ checking. } \item{"cluster.only"}{logical argument. If
+ \code{cluster.only=TRUE}, then \code{clusterFUN} should return only the
+ vector of cluster assignments (or list if \code{outputType="list"}). If
+ \code{cluster.only=FALSE} then the \code{clusterFUN} should return a named
+ list where one of the elements entitled \code{clustering} contains the
+ vector described above (no list!); anything else needed by the
+ \code{classifyFUN} to classify new data should be contained in the output
+ list as well.
\code{cluster.only} is set internally depending on whether
+ \code{classifyFUN} will be used by subsampling or only for clustering the
+ final product.} \item{"..."}{Any additional arguments specific to the
+ algorithm used by \code{clusterFUN} should be passed via \code{...} and NOT
+ passed via arguments to \code{clusterFUN}} \item{"Other required
+ arguments"}{\code{clusterFUN} must also accept arguments required for its
+ \code{algorithmType} (see Details below).} }
+
+\code{algorithmType}: Type "01" is for clustering functions that
+ expect as an input a dissimilarity matrix that takes on 0-1 values (e.g.
+ from subsampling) with 1 indicating more dissimilarity between samples.
+ "01" algorithm types must also have \code{inputType} equal to
+ \code{"diss"}. It is also generally expected that "01" algorithms use the
+ 0-1 nature of the input to set criteria as to where to find clusters. "01"
+ functions must take as an argument \code{alpha} between 0 and 1 to
+ determine the clusters, where larger values of \code{alpha} require less
+ similarity between samples in the same cluster. "K" is for clustering
+ functions that require an argument \code{k} (the number of clusters) but
+ can have arbitrary \code{inputType}. "K" algorithms are assumed
+ to need a predetermined \code{k} and are also assumed to assign all samples to
+ a cluster. If not, the post-processing steps in \code{\link{mainClustering}} such
+ as \code{findBestK} and \code{removeSil} may not operate correctly since
+ they rely on silhouette distances.
+
+\code{internalFunctionCheck} is the function that is called by the
+ validity check of the \code{clusterFunction} constructor (if
+ \code{checkFunctions=TRUE}). It is exported as a stand-alone function for the user
+ to be able to test their functions and debug them, which is difficult to do
+ with an S4 validity function.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{clusterFUN}}{a function defining the clustering function. See details for
+required arguments.}
+
+\item{\code{inputType}}{a character defining what type of input \code{clusterFUN}
+takes. Must be one of either "diss","X", or "either"}
+
+\item{\code{algorithmType}}{a character defining what type of clustering algorithm
+\code{clusterFUN} is. Must be one of either "01" or "K". \code{clusterFUN}
+must take the corresponding required arguments (see details below).}
+
+\item{\code{classifyFUN}}{a function that takes as input new data and the output of
+\code{clusterFUN} (when \code{cluster.only=FALSE}) and results in cluster
+assignments of the new data. Note that the function should assume that the
+input 'x' is not the same samples that were input to the clusterFunction
+(but can assume that it is the same number of features/columns). Used in
+subsampling clustering. If given value \code{NULL} then subsampling can
+only be \code{"InSample"}, see \code{\link{subsampleClustering}}.}
+
+\item{\code{inputClassifyType}}{the input type for the classification function (if
+not NULL); like \code{inputType}, must be one of "diss","X", or "either"}
+
+\item{\code{outputType}}{the type of output given by \code{clusterFUN}. Must either
+be "vector" or "list". If "vector" then the output should be a vector of
+length equal to the number of observations with integer-valued elements
+identifying them to different clusters; the vector assignments should be in
+the same order as the original input of the data. Samples that are not
+assigned to any cluster should be given a '-1' value.
If "list", then it +must be a list equal to the length of the number of clusters, and the +elements of the list contain the indices of the samples in that cluster. +Any indices not in any of the list elements are assumed to be -1. The main +advantage of "list" is that it can preserve the order of the clusters if +the \code{clusterFUN} desires to do so. In which case the \code{orderBy} +argument of \code{\link{mainClustering}} can preserve this ordering (default is +to order by size).} + +\item{\code{requiredArgs}}{Any additional required arguments for \code{clusterFUN} +(beyond those required of all \code{clusterFUN}, described in details).} + +\item{\code{checkFunctions}}{logical. If TRUE, the validity check of the +\code{ClusterFunction} object will check the \code{clusterFUN} with simple +toy data using the function \code{internalFunctionCheck}.} +}} + +\examples{ +#Use internalFunctionCheck to check possible function +goodFUN<-function(x,diss,k,checkArgs,cluster.only,...){ +cluster::pam(x=t(x),k=k,cluster.only=cluster.only) +} +#passes internal check +internalFunctionCheck(goodFUN,inputType="X",algorithmType="K",outputType="vector") +#Note it doesn't pass if inputType="either" because no catches for x=NULL +internalFunctionCheck(goodFUN, inputType="either",algorithmType="K",outputType="vector") +myCF<-clusterFunction(clusterFUN=goodFUN, inputType="X",algorithmType="K", outputType="vector") +badFUN<-function(x,diss,k,checkArgs,cluster.only,...){cluster::pam(x=x,k=k)} +internalFunctionCheck(badFUN,inputType="X",algorithmType="K",outputType="vector") +} diff --git a/man/ClusterFunction-methods.Rd b/man/ClusterFunction-methods.Rd new file mode 100644 index 00000000..b88bd0cf --- /dev/null +++ b/man/ClusterFunction-methods.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllHelperClusterFunction.R +\docType{methods} +\name{ClusterFunction-methods} +\alias{ClusterFunction-methods} +\alias{requiredArgs,ClusterFunction-method} +\alias{requiredArgs,character-method} +\alias{requiredArgs} +\alias{requiredArgs,character-method} +\alias{requiredArgs,factor-method} +\alias{algorithmType,ClusterFunction-method} +\alias{algorithmType} +\alias{algorithmType,character-method} +\alias{algorithmType,factor-method} +\alias{inputType,ClusterFunction-method} +\alias{inputType} +\alias{inputType,character-method} +\alias{inputType,factor-method} +\title{Helper methods for the ClusterFunction class} +\usage{ +\S4method{requiredArgs}{character}(object) + +\S4method{requiredArgs}{ClusterFunction}(object, genericOnly = FALSE) + +\S4method{requiredArgs}{character}(object) + +\S4method{requiredArgs}{character}(object) + +\S4method{requiredArgs}{factor}(object) + +\S4method{algorithmType}{ClusterFunction}(object) + +\S4method{algorithmType}{character}(object) + +\S4method{algorithmType}{factor}(object) + +\S4method{inputType}{ClusterFunction}(object) + +\S4method{inputType}{character}(object) + +\S4method{inputType}{factor}(object) +} +\arguments{ +\item{object}{input to the method, usually either a \code{ClusterFunction} class or a character describing a built-in \code{ClusterFunction} object.} + +\item{genericOnly}{logical If TRUE, return only the generic required arguments (i.e. those required by the algorithm type) and not the arguments specific to that clustering found in the slot \code{requiredArgs}. If FALSE both sets of arguments are returned.} +} +\description{ +This is a collection of helper methods for the ClusterExperiment class. 
+}
+\details{
+For the \code{character} and \code{factor} methods, the function is applied to
+the built-in \code{ClusterFunction} object corresponding to that character
+value; see \code{\link{listBuiltInFunctions}}.
+}
diff --git a/man/RSEC.Rd b/man/RSEC.Rd
index af52cbd5..74898baa 100644
--- a/man/RSEC.Rd
+++ b/man/RSEC.Rd
@@ -13,12 +13,12 @@
\usage{
\S4method{RSEC}{matrix}(x, isCount = FALSE, transFun = NULL,
 dimReduce = "PCA", nVarDims = NA, nPCADims = c(50), k0s = 4:15,
- clusterFunction = c("tight", "hierarchical01"), alphas = c(0.1, 0.2, 0.3),
+ clusterFunction = listBuiltInType01(), alphas = c(0.1, 0.2, 0.3),
 betas = 0.9, minSizes = 1, combineProportion = 0.7,
 combineMinSize = 5, dendroReduce = "mad", dendroNDims = 1000,
 mergeMethod = "adjP", mergeCutoff = 0.05, verbose = FALSE,
- clusterDArgs = NULL, subsampleArgs = NULL, seqArgs = NULL, ncores = 1,
- random.seed = NULL, run = TRUE)
+ mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL,
+ ncores = 1, random.seed = NULL, run = TRUE)

\S4method{RSEC}{SummarizedExperiment}(x, ...)

@@ -28,13 +28,13 @@
 rerunClusterMany = FALSE, ...)
}
\arguments{
-\item{x}{the data on which to run the clustering. Can be: matrix (with genes
-in rows), a list of datasets overwhich the clusterings should be run, a
-\code{SummarizedExperiment} object, or a \code{ClusterExperiment} object.}
+\item{x}{the data matrix on which to run the clustering. Can be: matrix (with
+genes in rows), a list of datasets over which the clusterings should be run,
+a \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object.}

-\item{isCount}{logical. Whether the data are in counts, in which case the
-default \code{transFun} argument is set as log2(x+1). This is simply a
-convenience to the user, and can be overridden by giving an explicit
+\item{isCount}{logical. Whether the data are in counts, in which case the
+default \code{transFun} argument is set as log2(x+1). This is simply a
+convenience to the user, and can be overridden by giving an explicit
Current +functions can be found by typing \code{listBuiltInFunctions()} into the +command-line.} -\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of -type '01' (either 'tight' or 'hierarchical01'). Determines tightness -required in creating clusters from the dissimilarity matrix. Takes on -values in [0,1]. See \code{\link{clusterD}}.} +\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of +type '01'. Determines tightness required in creating clusters from the +dissimilarity matrix. Takes on values in [0,1]. See documentation of +\code{\link{ClusterFunction}}.} \item{betas}{values of \code{beta} to be tried in sequential steps. Only used -for \code{sequential=TRUE}. Determines the similarity between two clusters +for \code{sequential=TRUE}. Determines the similarity between two clusters required in order to deem the cluster stable. Takes on values in [0,1]. See -\code{\link{seqCluster}}.} +documentation of \code{\link{seqCluster}}.} -\item{minSizes}{the minimimum size required for a cluster (in -\code{clusterD}). Clusters smaller than this are not kept and samples are -left unassigned.} +\item{minSizes}{the minimimum size required for a cluster (in the +\code{mainClustering} step). Clusters smaller than this are not kept and samples +are left unassigned.} \item{combineProportion}{passed to \code{proportion} in \code{\link{combineMany}}} @@ -89,39 +89,39 @@ left unassigned.} \item{verbose}{logical. If TRUE it will print informative messages.} -\item{clusterDArgs}{list of additional arguments to be passed to -\code{\link{clusterD}}.} +\item{mainClusterArgs}{list of arguments to be passed for the mainClustering step, see +help pages of \code{\link{mainClustering}}.} -\item{subsampleArgs}{list of arguments to be passed to +\item{subsampleArgs}{list of arguments to be passed to the subsampling step +(if \code{subsample=TRUE}), see help pages of \code{\link{subsampleClustering}}.} -\item{seqArgs}{list of additional arguments to be passed to -\code{\link{seqCluster}}.} +\item{seqArgs}{list of arguments to be passed to \code{\link{seqCluster}}.} \item{ncores}{the number of threads} -\item{random.seed}{a value to set seed before each run of clusterSingle (so -that all of the runs are run on the same subsample of the data). Note, if -'random.seed' is set, argument 'ncores' should NOT be passed via -subsampleArgs; instead set the argument 'ncores' of -clusterMany directly (which is preferred for improving speed anyway).} +\item{random.seed}{a value to set seed before each run of clusterSingle (so +that all of the runs are run on the same subsample of the data). Note, if +'random.seed' is set, argument 'ncores' should NOT be passed via +subsampleArgs; instead set the argument 'ncores' of clusterMany directly +(which is preferred for improving speed anyway).} \item{run}{logical. If FALSE, doesn't run clustering, but just returns matrix of parameters that will be run, for the purpose of inspection by user (with -rownames equal to the names of the resulting column names of clMat object -that would be returned if \code{run=TRUE}). Even if \code{run=FALSE}, +rownames equal to the names of the resulting column names of clMat object +that would be returned if \code{run=TRUE}). Even if \code{run=FALSE}, however, the function will create the dimensionality reductions of the data indicated by the user input.} -\item{...}{For signature \code{list}, arguments to be passed on to mclapply -(if ncores>1). 
For all the other signatures, arguments to be passed to the
+\item{...}{For signature \code{list}, arguments to be passed on to mclapply
+(if ncores>1). For all the other signatures, arguments to be passed to the
 method for signature \code{list}.}

-\item{eraseOld}{logical. Only relevant if input \code{x} is of class
-\code{ClusterExperiment}. If TRUE, will erase existing workflow results
-(clusterMany as well as mergeClusters and combineMany). If FALSE, existing
-workflow results will have "\code{_i}" added to the clusterTypes value,
-where \code{i} is one more than the largest such existing workflow
+\item{eraseOld}{logical. Only relevant if input \code{x} is of class
+\code{ClusterExperiment}. If TRUE, will erase existing workflow results
+(clusterMany as well as mergeClusters and combineMany). If FALSE, existing
+workflow results will have "\code{_i}" added to the clusterTypes value,
+where \code{i} is one more than the largest such existing workflow
 clusterTypes.}

\item{rerunClusterMany}{logical. If the object is a clusterExperiment object,
diff --git a/man/addClusters.Rd b/man/addClusters.Rd
index 4ab8d040..1939ad6d 100644
--- a/man/addClusters.Rd
+++ b/man/addClusters.Rd
@@ -76,11 +76,10 @@ addClusters adds y to x, and is thus not symmetric in the two
\examples{
data(simData)

-cl1 <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=3))
-
-cl2 <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=5))
+cl1 <- clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterArgs=list(k=3), clusterFunction="pam"))
+cl2 <- clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterArgs=list(k=5), clusterFunction="pam"))

addClusters(cl1, cl2)
}
diff --git a/man/builtInClusteringFunctions.Rd b/man/builtInClusteringFunctions.Rd
new file mode 100644
index 00000000..b282288d
--- /dev/null
+++ b/man/builtInClusteringFunctions.Rd
@@ -0,0 +1,112 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/builtInClusterFunctions.R
+\docType{methods}
+\name{listBuiltInFunctions}
+\alias{listBuiltInFunctions}
+\alias{getBuiltInFunction,character-method}
+\alias{getBuiltInFunction}
+\alias{listBuiltInTypeK}
+\alias{listBuiltInType01}
+\title{Built in ClusterFunction options}
+\usage{
+listBuiltInFunctions()
+
+\S4method{getBuiltInFunction}{character}(object)
+
+listBuiltInTypeK()
+
+listBuiltInType01()
+}
+\arguments{
+\item{object}{name of built in function.}
+}
+\description{
+Documents the built-in clustering options that are available in
+ the clusterExperiment package.
+}
+\details{
+\code{listBuiltInFunctions} will return the character names of
+ the built-in clustering functions available.
+
+\code{listBuiltInTypeK} returns the names of the built-in functions
+ that have type 'K'
+
+\code{listBuiltInType01} returns the names of the built-in functions
+ that have type '01'
+
+\code{getBuiltInFunction} will return the
+ \code{ClusterFunction} object of a character value that corresponds to a
+ built-in function.
+
+\code{\link{algorithmType}} and \code{\link{inputType}} will
+return the \code{algorithmType} and \code{inputType} of the
+ built-in clusterFunction corresponding to the character value.
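As a sketch of the character-method behaviour just described (the returned values indicated in the comments follow from the algorithm types documented below and are not verified here):

cf <- getBuiltInFunction("pam")   # the ClusterFunction object behind "pam"
requiredArgs("pam")               # a type "K" function, so 'k' must appear in clusterArgs
requiredArgs("hierarchical01")    # a type "01" function, so 'alpha' is required
inputType("kmeans")               # "X": takes the data matrix, not a dissimilarity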
+ +\strong{Built-in clustering methods:} The built-in clustering methods, the + names of which can be accessed by \code{listBuiltInFunctions()}, are the + following: +\itemize{ +\item{"pam"}{Based on \code{\link[cluster]{pam}} in + \code{cluster} package. Arguments to that function can be passed via + \code{clusterArgs}. +Input is \code{"either"} (\code{x} or \code{diss}); algorithm type is "K"} +\item{"clara"}{Based on \code{\link[cluster]{clara}} in + \code{cluster} package. Arguments to that function can be passed via + \code{clusterArgs}. Note that we have changed the default arguments of + that function to match the recommendations in the documentation of + \code{\link[cluster]{clara}} (numerous arguments are set to less than optimal + settings for back-compatibility). Specifically, the following defaults + are implemented \code{samples=50}, \code{keep.data=FALSE}, +\code{medoids.x=FALSE},\code{rngR=TRUE}, + \code{pamLike=TRUE}, \code{correct.d=TRUE}. +Input is \code{"X"}; algorithm type is "K".} +\item{"kmeans"}{Based on \code{\link[stats]{kmeans}} in + \code{stats} package. Arguments to that function can be passed via + \code{clusterArgs} except for \code{centers}, which is reencoded here to be + the argument 'k'. +Input is \code{"X"}; algorithm type is "K"} +\item{"hierarchical01"}{\code{\link[stats]{hclust}} in + \code{stats} package is used to build hierarchical clustering. Arguments to + that function can be passed via \code{clusterArgs}. The + \code{hierarchical01} method cuts the hierarchical tree based on the parameter + \code{alpha}. It does not use the \code{cutree} function, but instead + traverses down the tree until it finds a block of + samples whose summary of the values is greater than or equal to + 1-alpha. Arguments that can be passed to 'hierarchical01' are + 'evalClusterMethod' which determines how to summarize the samples' values + of D[samples,samples] for comparison to 1-alpha: "maximum" (default) takes + the minimum of D[samples,samples] and requires it to be less than or equal + to 1-alpha; "average" requires that each row mean of D[samples,samples] be + less than or equal to 1-alpha. Additional arguments of hclust can also be passed via + clusterArgs to control the hierarchical clustering of D. +Input is \code{"diss"}; algorithm type is "01"} +\item{"hierarchicalK"}{\code{\link[stats]{hclust}} in \code{stats} package is used + to build hierarchical clustering and \code{\link{cutree}} is used to cut the + tree into \code{k} clusters. +Input is \code{"diss"}; algorithm type is "K"} +\item{"tight"}{Based on the algorithm in + Tseng and Wong, specifically their method of picking clusters from a + co-occurrence matrix after subsampling. The clustering encoded here is not + the entire tight clustering algorithm, only that single piece that + identifies clusters from the co-occurrence matrix. +Arguments for the tight method are + 'minSize.core' (default=2), which sets the minimum number of samples that + form a core cluster. +Input is \code{"diss"}; algorithm type is "01"} +\item{"spectral"}{\code{\link[kernlab]{specc}} in \code{kernlab} package +is used to perform spectral clustering. Note that spectral clustering can +produce errors if the number of clusters (K) is not sufficiently smaller than +the number of samples (N). K < N is not always sufficient.
+Input is \code{"X"}; algorithm type is "K".} +} +} +\examples{ +listBuiltInFunctions() +algorithmType(c("kmeans","pam","hierarchical01")) +inputType(c("kmeans","pam","hierarchical01")) +listBuiltInTypeK() +listBuiltInType01() +} +\seealso{ +\code{\link{ClusterFunction}}, \code{\link{algorithmType}}, \code{\link{inputType}} +} diff --git a/man/clusterContrasts.Rd b/man/clusterContrasts.Rd index 70235c46..ed0e9485 100644 --- a/man/clusterContrasts.Rd +++ b/man/clusterContrasts.Rd @@ -36,8 +36,8 @@ to be compared and must match the names of the clusters in the vector \code{cluster}.} \item{outputType}{character string. Gives format for the resulting contrast -matrix. Currently the only option is the format appropriate for -\code{\link{limma}} package, but we anticipate adding more.} +matrix. Currently the two options are the formats appropriate for the +\code{\link[limma]{limma}} and \code{\link[MAST]{MAST}} packages.} \item{removeNegative}{logical, whether to remove negative valued clusters from the design matrix. Appropriate to pick TRUE (default) if design will @@ -76,6 +76,11 @@ clusterContrasts(cl,contrastType="Pairs") cl<-makeDendrogram(cl) clusterContrasts(cl,contrastType="Pairs") } +\references{ +Ritchie, ME, Phipson, B, Wu, D, Hu, Y, Law, CW, Shi, W, and Smyth, GK (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43, e47. http://nar.oxfordjournals.org/content/43/7/e47 + +Finak, et al. MAST: a flexible statistical framework for assessing transcriptional changes and characterizing heterogeneity in single-cell RNA sequencing data. Genome Biology (2015). +} \author{ Elizabeth Purdom } diff --git a/man/clusterD.Rd b/man/clusterD.Rd deleted file mode 100644 index e7f71b3f..00000000 --- a/man/clusterD.Rd +++ /dev/null @@ -1,224 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clusterD.R -\name{clusterD} -\alias{clusterD} -\alias{cluster01} -\alias{cluster01} -\alias{clusterK} -\title{Cluster distance matrix from subsampling} -\usage{ -clusterD(x = NULL, diss = NULL, clusterFunction = c("hierarchical01", - "tight", "pam", "hierarchicalK"), typeAlg = c("01", "K"), - distFunction = NA, minSize = 1, orderBy = c("size", "best"), - format = c("vector", "list"), clusterArgs = NULL, checkArgs = TRUE, - returnD = FALSE, ...) - -cluster01(diss, clusterFunction = c("hierarchical01", "tight"), alpha = 0.1, - clusterArgs = NULL, checkArgs) - -clusterK(diss, clusterFunction = c("pam", "hierarchicalK"), - findBestK = FALSE, k, kRange, removeSil = FALSE, silCutoff = 0, - clusterArgs = NULL, checkArgs) -} -\arguments{ -\item{x}{\code{p x n} data matrix on which to run the clustering (samples in -columns).} - -\item{diss}{\code{n x n} data matrix of dissimilarities between the samples -on which to run the clustering} - -\item{clusterFunction}{clusterFunction a function that clusters a nxn matrix -of dissimilarities/distances. Can also be given character values to -indicate use of internal wrapper functions for default methods. See Details -for the format of what the function must take as arguments and what format -the function must return.} - -\item{typeAlg}{character value of either '01' or 'K' determining whether the -function given in clusterFunction should be called by clusterK or -cluster01. Only used if clusterFunction is a user-defined function. -Otherwise, for methods provided by the package (i.e.
by user setting -clusterFunction to a character value) clusterD will determine the -appropriate input for 'typeAlg' and will ignore user input.} - -\item{distFunction}{a distance function to be applied to \code{D}. Only relevant if -input \code{D} is a matrix of data, rather than a distance. See details.} - -\item{minSize}{the minimum number of samples in a cluster. Clusters found -below this size will be discarded and samples in the cluster will be given -a cluster assignment of "-1" to indicate that they were not clustered.} - -\item{orderBy}{how to order the cluster (either by size or by maximum alpha -value).} - -\item{format}{whether to return a list of indices in a cluster or a vector of -clustering assignments. List is mainly for compatibility with sequential -part.} - -\item{clusterArgs}{arguments to be passed directly to the clusterFunction, -beyond the required input.} - -\item{checkArgs}{logical as to whether should give warning if arguments given -that don't match clustering choices given. Otherwise, inapplicable -arguments will be ignored without warning.} - -\item{returnD}{logical as to whether to return the D matrix in output.} - -\item{...}{arguments given to clusterD to be passed to cluster01 or clusterK -(depending on the value of typeAlg). Examples include 'k' for clusterK or -'alpha' for cluster01. These should not be the arguments needed by -clusterFunction (which should be passed via the argument 'clusterArgs') but -the actual arguments of cluster01 or clusterK.} - -\item{alpha}{a cutoff value of how much similarity needed for drawing blocks -(lower values more strict).} - -\item{findBestK}{logical, whether should find best K based on average -silhouette width (only used if clusterFunction of type "K").} - -\item{k}{single value to be used to determine how many clusters to find, if -findBestK=FALSE (only used if clusterFunction of type "K").} - -\item{kRange}{vector of integers. If findBestK=TRUE, this gives the range of -k's to look over. Default is k-2 to k+20, subject to those values being -greater than 2. Note that default values depend on the input k, so running -for different choices of k and findBestK=TRUE can give different answers -unless kRange is set to be the same.} - -\item{removeSil}{logical as to whether remove when silhouette < silCutoff -(only used if clusterFunction of type "K")} - -\item{silCutoff}{Requirement on minimum silhouette width to be included in -cluster (only if removeSil=TRUE).} -} -\value{ -clusterD returns a vector of cluster assignments (if format="vector") - or a list of indices for each cluster (if format="list"). Clusters less - than minSize are removed. If orderBy="size" the clusters are reordered by - the size of the cluster, instead of by the internal ordering of the - clusterFunction. - -cluster01 and clusterK return a list of indices of the clusters found, - which each element of the list corresponding to a cluster and the elements - of that list a vector of indices giving the indices of the samples assigned - to that cluster. Indices not included in any list are assumed to have not - been clustered. The list is assumed to be ordered in terms of the `best' - cluster (as defined by the clusterFunction for cluster01 or by average - silhoute for clusterK), for example in terms of most internal similarity of - the elements, or average silhouette width. -} -\description{ -Given a \code{n x n} matrix of distances, these functions will - try to find the clusters based on the given clustering function. 
cluster01 - and clusterK are internal functions and clusterD is a wrapper around these - two functions for easier user interface. cluster01 and clusterK are not - expected to be called directly by the user, except for ease in debugging - user-defined clustering functions. -} -\details{ -To provide a distance matrix via the argument \code{distFunction}, - the function must be defined to take the distance of the rows of a matrix - (internally, the function will call \code{distFunction(t(x))}. This is to - be compatible with the input for the \code{dist} function. - \code{as.matrix} will be performed on the output of \code{distFunction}, - so if the object returned has a \code{as.matrix} method that will convert - the output into a symmetric matrix of distances, this is fine (for - example the class \code{dist} for objects returned by \code{dist} have - such a method). If \code{distFunction=NA}, then a default distance will - be calculated based on the type of clustering algorithm of - \code{clusterFunction}. For type "K" the default is to take \code{dist} - as the distance function. For type "01", the default is to take the - (1-cor(x))/2. - -Types of algorithms: cluster01 is for clustering functions that - expect as an input D that takes on 0-1 values (e.g. from subclustering). - clusterK is for clustering functions that require an input k, the number of - clusters, but arbitrary distance/dissimilarity matrix. cluster01 and - clusterK are given as separate functions in order to allow the user to - provide different clustering functions that expect different types of input - and for us to provide different shared processing of the results that is - different for these different types of clustering methods (for example, - removing low silhouette values is appropriate for clusterK clustering - functions rather than cluster01 functions). It is also generally expected - that cluster01 algorithms use the 0-1 nature of the input to set criteria - as to where to find clusters and therefore do not need a pre-determined - 'k'. On the other hand, clusterK functions are assumed to need a - predetermined 'k' and are also assumed to cluster all samples to a cluster, - and therefore clusterK gives options to exclude poorly clustered samples - via silhouette distances. - -cluster01 required format for input and output for clusterFunction: - clusterFunction should be a function that takes (as a minimum) an argument - "D" and "alpha". 0-1 clustering algorithms are expected to use the fact - that the D input is 0-1 range to find the clusters, rather than a user - defined number of clusters; "alpha" is the parameter that tunes the finding - of such clusters. For example, a candidate block of samples might be - considered a cluster if all values of D are greater than or equal to - 1-alpha. The output is a list with each element corresponding to a cluster - and the elements of the list corresponding to the indices of the samples - that are in the cluster. The list is expected to be in order of 'best - clusters' (as defined by the clusterFunction), with first being the best - and last being worst. - -cluster01 methods: "tight" method refers to the method of finding - clusters from a subsampling matrix given internally in the tight - algorithm code of Tsang and Wong. Arguments for the tight method are - 'minSize.core' (default=2), which sets the minimimum number of samples - that form a core cluster. 
"hierarchical01" refers to running the hclust - algorithm on D and transversing down the tree until getting a block of - samples with whose summary of the values is greater than or equal to - 1-alpha. Arguments that can be passed to 'hierarchical' are - 'evalClusterMethod' which determines how to summarize the samples' values - of D[samples,samples] for comparison to 1-alpha: "maximum" (default) - takes the minimum of D[samples,samples] and requires it to be less than - or equal to 1-alpha; "average" requires that each row mean of - D[samples,samples] be less than or equal to 1-alpha. Arguments of - hclust can also be passed via clusterArgs to control the hierarchical - clustering of D. - -clusterK required format for input and output for clusterFunction: - clusterFunction should be a function that takes as a minimum an argument - 'D' and 'k'. The output must be a clustering, specified by integer values. - The function \code{\link{silhouette}} will be used on the clustering to - calculate silhouette scores for each observation. - -clusterK methods: "pam" performs pam clustering on the input - \code{D} matrix using \code{\link{pam}} in the cluster package. Arguments - to \code{\link{pam}} can be passed via 'clusterArgs', except for the - arguments 'x' and 'k' which are given by D and k directly. "hierarchicalK" - performs hierarchical clustering on the input via the \code{\link{hclust}} - and then applies \code{\link{cutree}} with the specified k to obtain - clusters. Arguments to \code{\link{hclust}} can be passed via - \code{clusterArgs}. -} -\examples{ -data(simData) -cl1<-clusterD(simData,clusterFunction="pam",k=3) -cl2<-clusterD(simData,clusterFunction="hierarchical01") -cl3<-clusterD(simData,clusterFunction="tight") -#change distance to manhattan distance -cl4<-clusterD(simData,clusterFunction="pam",k=3, - distFunction=function(x){dist(x,method="manhattan")}) - -#run hierarchical method for finding blocks, with method of evaluating -#coherence of block set to evalClusterMethod="average", and the hierarchical -#clustering using single linkage: -clustSubHier <- clusterD(simData, clusterFunction="hierarchical01", alpha=0.1, -minSize=5, clusterArgs=list(evalClusterMethod="average", method="single")) - -#do tight -clustSubTight <- clusterD(simData, clusterFunction="tight", alpha=0.1, -minSize=5) - -#two twists to pam -clustSubPamK <- clusterD(simData, clusterFunction="pam", silCutoff=0, minSize=5, -removeSil=TRUE, k=3) -clustSubPamBestK <- clusterD(simData, clusterFunction="pam", silCutoff=0, -minSize=5, removeSil=TRUE, findBestK=TRUE, kRange=2:10) - -# note that passing the wrong arguments for an algorithm results in warnings -# (which can be turned off with checkArgs=FALSE) -clustSubTight_test <- clusterD(simData, clusterFunction="tight", alpha=0.1, -minSize=5, removeSil=TRUE) -clustSubTight_test2 <- clusterD(simData, clusterFunction="tight", alpha=0.1, -clusterArgs=list(evalClusterMethod="average")) -} diff --git a/man/clusterMany.Rd b/man/clusterMany.Rd index a93f53b0..b57c4a4f 100644 --- a/man/clusterMany.Rd +++ b/man/clusterMany.Rd @@ -16,7 +16,7 @@ \S4method{clusterMany}{list}(x, ks = NA, clusterFunction, alphas = 0.1, findBestK = FALSE, sequential = FALSE, removeSil = FALSE, subsample = FALSE, silCutoff = 0, distFunction = NA, betas = 0.9, - minSizes = 1, verbose = FALSE, clusterDArgs = NULL, + minSizes = 1, verbose = FALSE, mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL, ncores = 1, random.seed = NULL, run = TRUE, ...) 
@@ -29,177 +29,188 @@ \S4method{clusterMany}{data.frame}(x, ...) } \arguments{ -\item{x}{the data on which to run the clustering. Can be: matrix (with genes -in rows), a list of datasets overwhich the clusterings should be run, a -\code{SummarizedExperiment} object, or a \code{ClusterExperiment} object.} +\item{x}{the data matrix on which to run the clustering. Can be: matrix (with +genes in rows), a list of datasets over which the clusterings should be run, +a \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object.} \item{dimReduce}{character A character identifying what type of dimensionality reduction to perform before clustering. Options are -"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more +"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more details.} -\item{nVarDims}{vector of the number of the most variable features to keep -(when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is +\item{nVarDims}{vector of the number of the most variable features to keep +(when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is included, then the full dataset will also be included.} -\item{nPCADims}{vector of the number of PCs to use (when 'PCA' is identified +\item{nPCADims}{vector of the number of PCs to use (when 'PCA' is identified in \code{dimReduce}). If NA is included, then the full dataset will also be included.} \item{transFun}{function A function to use to transform the input data matrix before clustering.} -\item{isCount}{logical. Whether the data are in counts, in which case the -default \code{transFun} argument is set as log2(x+1). This is simply a -convenience to the user, and can be overridden by giving an explicit +\item{isCount}{logical. Whether the data are in counts, in which case the +default \code{transFun} argument is set as log2(x+1). This is simply a +convenience to the user, and can be overridden by giving an explicit function to \code{transFun}.} -\item{...}{For signature \code{list}, arguments to be passed on to mclapply -(if ncores>1). For all the other signatures, arguments to be passed to the +\item{...}{For signature \code{list}, arguments to be passed on to mclapply +(if ncores>1). For all the other signatures, arguments to be passed to the method for signature \code{list}.} -\item{ks}{the range of k values (see details for meaning for different -choices).} +\item{ks}{the range of k values (see details for the meaning of \code{k} for +different choices of other parameters).} -\item{clusterFunction}{function used for the clustering. Note that unlike in +\item{clusterFunction}{function used for the clustering. Note that unlike in \code{\link{clusterSingle}}, this must be a character vector of pre-defined -clustering techniques provided by \code{\link{clusterSingle}}, and can not -be a user-defined function. Current functions are "tight", -"hierarchical01","hierarchicalK", and "pam"} +clustering techniques, and cannot be a user-defined function. Current +functions can be found by typing \code{listBuiltInFunctions()} into the +command line.} -\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of -type '01' (either 'tight' or 'hierarchical01'). Determines tightness -required in creating clusters from the dissimilarity matrix. Takes on -values in [0,1]. See \code{\link{clusterD}}.} +\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of +type '01'. Determines tightness required in creating clusters from the +dissimilarity matrix.
Takes on values in [0,1]. See documentation of +\code{\link{ClusterFunction}}.} \item{findBestK}{logical, whether should find best K based on average -silhouette width (only used if clusterFunction of type "K").} +silhouette width (only used when clusterFunction of type "K").} -\item{sequential}{logical whether to use the sequential strategy (see -details of \code{\link{seqCluster}}).} +\item{sequential}{logical whether to use the sequential strategy (see details +of \code{\link{seqCluster}}). Can be used in combination with +\code{subsample=TRUE} or \code{FALSE}.} \item{removeSil}{logical as to whether remove when silhouette < silCutoff (only used if clusterFunction of type "K")} \item{subsample}{logical as to whether to subsample via -\code{\link{subsampleClustering}} to get the distance matrix at each -iteration; otherwise the distance function will be determined by argument -\code{distFunction} passed in \code{clusterDArgs} (if input a data matrix).} +\code{\link{subsampleClustering}}. If TRUE, clustering in the mainClustering step is +done on the co-occurrence between clusterings in the subsampled clustering +results. If FALSE, the mainClustering step will be run directly on +\code{x}/\code{diss}.} \item{silCutoff}{Requirement on minimum silhouette width to be included in -cluster (only if removeSil=TRUE).} - -\item{distFunction}{a vector of character strings that are the names of -distance functions found in the global environment. See the help pages of -\code{\link{clusterD}} for details about the required format of distance -functions. Currently, this distance function must be applicable for all -clusterFunction types tried. Therefore, it is not possible to intermix type "K" -and type "01" algorithms if you also give distances to evaluate via -\code{distFunction} unless all distances give 0-1 values for the distance -(and hence are possible for both type "01" and "K" algorithms).} +cluster (only for combinations where removeSil=TRUE).} + +\item{distFunction}{a vector of character strings that are the names of +distance functions found in the global environment. See the help pages of +\code{\link{clusterSingle}} for details about the required format of +distance functions. Currently, this distance function must be applicable +for all clusterFunction types tried. Therefore, it is not possible in +\code{clusterMany} to intermix type "K" and type "01" algorithms if you +also give distances to evaluate via \code{distFunction} unless all +distances give 0-1 values for the distance (and hence are possible for both +type "01" and "K" algorithms).} \item{betas}{values of \code{beta} to be tried in sequential steps. Only used -for \code{sequential=TRUE}. Determines the similarity between two clusters +for \code{sequential=TRUE}. Determines the similarity between two clusters required in order to deem the cluster stable. Takes on values in [0,1]. See -\code{\link{seqCluster}}.} +documentation of \code{\link{seqCluster}}.} -\item{minSizes}{the minimimum size required for a cluster (in -\code{clusterD}). Clusters smaller than this are not kept and samples are -left unassigned.} +\item{minSizes}{the minimum size required for a cluster (in the +\code{mainClustering} step). Clusters smaller than this are not kept and samples +are left unassigned.} \item{verbose}{logical.
If TRUE it will print informative messages.} -\item{clusterDArgs}{list of additional arguments to be passed to -\code{\link{clusterD}}.} +\item{mainClusterArgs}{list of arguments to be passed to the mainClustering step; see +help pages of \code{\link{mainClustering}}.} -\item{subsampleArgs}{list of arguments to be passed to +\item{subsampleArgs}{list of arguments to be passed to the subsampling step +(if \code{subsample=TRUE}), see help pages of \code{\link{subsampleClustering}}.} -\item{seqArgs}{list of additional arguments to be passed to -\code{\link{seqCluster}}.} +\item{seqArgs}{list of arguments to be passed to \code{\link{seqCluster}}.} \item{ncores}{the number of threads} -\item{random.seed}{a value to set seed before each run of clusterSingle (so -that all of the runs are run on the same subsample of the data). Note, if -'random.seed' is set, argument 'ncores' should NOT be passed via -subsampleArgs; instead set the argument 'ncores' of -clusterMany directly (which is preferred for improving speed anyway).} +\item{random.seed}{a value to set seed before each run of clusterSingle (so +that all of the runs are run on the same subsample of the data). Note, if +'random.seed' is set, argument 'ncores' should NOT be passed via +subsampleArgs; instead set the argument 'ncores' of clusterMany directly +(which is preferred for improving speed anyway).} \item{run}{logical. If FALSE, doesn't run clustering, but just returns matrix of parameters that will be run, for the purpose of inspection by user (with -rownames equal to the names of the resulting column names of clMat object -that would be returned if \code{run=TRUE}). Even if \code{run=FALSE}, +rownames equal to the column names of the \code{clMat} object +that would be returned if \code{run=TRUE}). Even if \code{run=FALSE}, however, the function will create the dimensionality reductions of the data indicated by the user input.} -\item{eraseOld}{logical. Only relevant if input \code{x} is of class -\code{ClusterExperiment}. If TRUE, will erase existing workflow results -(clusterMany as well as mergeClusters and combineMany). If FALSE, existing -workflow results will have "\code{_i}" added to the clusterTypes value, -where \code{i} is one more than the largest such existing workflow +\item{eraseOld}{logical. Only relevant if input \code{x} is of class +\code{ClusterExperiment}. If TRUE, will erase existing workflow results +(clusterMany as well as mergeClusters and combineMany). If FALSE, existing +workflow results will have "\code{_i}" added to the clusterTypes value, +where \code{i} is one more than the largest such existing workflow clusterTypes.} } \value{ -If \code{run=TRUE} and the input is either a matrix, a - \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object, +If \code{run=TRUE} and the input is either a matrix, a + \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object, will return a \code{ClusterExperiment} object, where the results are stored - as clusterings with clusterTypes \code{clusterMany}. Depending on - \code{eraseOld} argument above, this will either delete existing such - objects, or change the clusterTypes of existing objects. See argument - \code{eraseOld} above. Arbitrarily the first clustering is set as the + as clusterings with clusterTypes \code{clusterMany}. Depending on + \code{eraseOld} argument above, this will either delete existing such + objects, or change the clusterTypes of existing objects. See argument + \code{eraseOld} above.
The first clustering is arbitrarily set as the primaryClusteringIndex. -If \code{run=TRUE} and the input is a list of data sets, a list with - the following objects: \itemize{ \item{\code{clMat}}{ a matrix with each - column corresponding to a clustering and each row to a sample.} - \item{\code{clusterInfo}}{ a list with information regarding clustering results (only relevant entries for those clusterings with sequential=TRUE)} - \item{\code{paramMatrix}}{ a matrix giving the parameters of each - clustering, where each column is a possible parameter set by the user and - passed to \code{\link{clusterSingle}} and each row of paramMatrix - corresponds to a clustering in \code{clMat}} \item{\code{clusterDArgs}}{ a - list of (possibly modified) arguments to clusterDArgs} - \item{\code{seqArgs=seqArgs}}{a list of (possibly modified) arguments to - seqArgs} \item{\code{subsampleArgs}}{a list of (possibly modified) arguments to subsampleArgs} } +If \code{run=TRUE} and the input is a list of data sets, a list with + the following objects: \itemize{ \item{\code{clMat}}{ a matrix with each + column corresponding to a clustering and each row to a sample.} + \item{\code{clusterInfo}}{ a list with information regarding clustering results (only relevant entries for those clusterings with sequential=TRUE)} + \item{\code{paramMatrix}}{ a matrix giving the parameters of each + clustering, where each column is a possible parameter set by the user and + passed to \code{\link{clusterSingle}} and each row of paramMatrix + corresponds to a clustering in \code{clMat}} \item{\code{mainClusterArgs}}{ a + list of (possibly modified) arguments passed as mainClusterArgs} + \item{\code{seqArgs=seqArgs}}{a list of (possibly modified) arguments to + seqArgs} \item{\code{subsampleArgs}}{a list of (possibly modified) arguments to subsampleArgs} } -If \code{run=FALSE} a list similar to that described above, but without the clustering results. +If \code{run=FALSE} a list similar to that described above, but without the clustering results. } \description{ -Given a range of parameters, this funciton will return a matrix with the -clustering of the samples across the range, which can be passed to +Given a range of parameters, this function will return a matrix with the +clustering of the samples across the range, which can be passed to \code{plotClusters} for visualization. } \details{ -While the function allows for multiple values of clusterFunction, - the code does not reuse the same subsampling matrix and try different - clusterFunctions on it. If sequential=TRUE, different - subsampleclusterFunctions will create different sets of data to subsample - so it is not possible; if sequential=FALSE, we have not implemented - functionality for this reuse. Setting the \code{random.seed} value, - however, should mean that the subsampled matrix is the same for each, but - there is no gain in computational complexity (i.e. each subsampled +Some combinations of these parameters are not feasible. See the + documentation of \code{\link{clusterSingle}} for important information on + how these parameter choices interact. + +While the function allows for multiple values of clusterFunction, + the code does not reuse the same subsampling matrix and try different + clusterFunctions on it. This is because if sequential=TRUE, different + subsample clusterFunctions will create different sets of data to subsample + so reuse is not possible; if sequential=FALSE, we have not implemented + functionality for this reuse. Setting the \code{random.seed} value, + however, should mean that the subsampled matrix is the same for each, but + there is no gain in computational complexity (i.e.
each subsampled co-occurrence matrix is recalculated for each set of parameters). -The argument 'ks' is interpreted differently for different choices - of the other parameters. When/if sequential=TRUE, ks defines the argument - k0 of \code{\link{seqCluster}}. Otherwise, 'ks' values are set in both - subsampleArgs[["k"]] and clusterDArgs[["k"]] that are passed to - \code{\link{clusterD}} and \code{\link{subsampleClustering}}. This passing - of these arguments via \code{subsampleArgs[["k"]]} will only have an effect - if `subsample=TRUE`. Similarly, the passing of \code{clusterDArgs[["k"]]} - will only have an effect when the clusterFunction argument includes a - clustering algorithm of type "K". When/if "findBestK=TRUE", ks also defines - the kRange argument of \code{\link{clusterD}} unless kRange is specified by - the user via the clusterDArgs; note this means that the default option of - setting kRange that depends on the input k (see \code{\link{clusterD}}) is - not available in clusterMany. - -If the input is a \code{ClusterExperiment} object, currently - existing \code{orderSamples},\code{coClustering} or dendrogram slots will - be retained. +The argument \code{ks} is interpreted differently for different + choices of the other parameters. When/if sequential=TRUE, \code{ks} defines + the argument \code{k0} of \code{\link{seqCluster}}. Otherwise, \code{ks} + values are the \code{k} values for \strong{both} the mainClustering and + subsampling step (i.e. assigned to the \code{subsampleArgs} and + \code{mainClusterArgs} that are passed to \code{\link{mainClustering}} and + \code{\link{subsampleClustering}}) unless \code{k} is set appropriately in + \code{subsampleArgs}. The passing of these arguments via + \code{subsampleArgs} will only have an effect if `subsample=TRUE`. + Similarly, the passing of \code{mainClusterArgs[["k"]]} will only have an + effect when the clusterFunction argument includes a clustering algorithm of + type "K". When/if "findBestK=TRUE", \code{ks} also defines the + \code{kRange} argument of \code{\link{mainClustering}} unless \code{kRange} is + specified by the user via the \code{mainClusterArgs}; note this means that the + default option of setting \code{kRange} that depends on the input \code{k} + (see \code{\link{mainClustering}}) is not available in \code{clusterMany}, only + in \code{\link{clusterSingle}}. + +If the input is a \code{ClusterExperiment} object, the current + implementation is that existing \code{orderSamples}, \code{coClustering}, or + dendrogram slots will be retained.
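+ +As a small sketch of the two interpretations of \code{ks} (assuming the package's \code{simData} example data and the built-in "pam" function): +\preformatted{data(simData) +# sequential=FALSE: each ks value is used as 'k' in the mainClustering step +cl <- clusterMany(simData, ks=2:4, clusterFunction="pam", sequential=FALSE) +# sequential=TRUE: the same ks values would instead be passed as 'k0' +# to seqCluster +}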
} \examples{ data(simData) @@ -234,7 +245,7 @@ plotClusters(clMat, axisLine=-2) system.time(clusterTrack <- clusterMany(simData, ks=2:15, alphas=c(0.1,0.2,0.3), findBestK=c(TRUE,FALSE), sequential=c(FALSE), subsample=c(FALSE), removeSil=c(TRUE), clusterFunction="pam", -clusterDArgs=list(minSize=5, kRange=2:15), ncores=1, random.seed=48120)) +mainClusterArgs=list(minSize=5, kRange=2:15), ncores=1, random.seed=48120)) } } diff --git a/man/clusterSingle.Rd b/man/clusterSingle.Rd index e89fd543..b1b5ed04 100644 --- a/man/clusterSingle.Rd +++ b/man/clusterSingle.Rd @@ -3,56 +3,66 @@ \docType{methods} \name{clusterSingle} \alias{clusterSingle} -\alias{clusterSingle-methods} -\alias{clusterSingle,matrix-method} -\alias{clusterSingle,ClusterExperiment-method} -\alias{clusterSingle,matrix,missing-method} -\alias{clusterSingle,matrixOrMissing,matrixOrMissing-method} +\alias{clusterSingle,missing,matrixOrNULL-method} +\alias{clusterSingle,matrixOrNULL,missing-method} \alias{clusterSingle,SummarizedExperiment,missing-method} \alias{clusterSingle,ClusterExperiment,missing-method} +\alias{clusterSingle,matrixOrNULL,matrixOrNULL-method} \title{General wrapper method to cluster the data} \usage{ -\S4method{clusterSingle}{matrixOrMissing,matrixOrMissing}(x, diss, - subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", - "hierarchical01", "pam", "hierarchicalK"), clusterDArgs = NULL, - subsampleArgs = NULL, seqArgs = NULL, isCount = FALSE, - transFun = NULL, dimReduce = c("none", "PCA", "var", "cv", "mad"), - ndims = NA, clusterLabel = "clusterSingle") +\S4method{clusterSingle}{missing,matrixOrNULL}(x, diss, ...) + +\S4method{clusterSingle}{matrixOrNULL,missing}(x, diss, ...) \S4method{clusterSingle}{SummarizedExperiment,missing}(x, diss, ...) -\S4method{clusterSingle}{ClusterExperiment,missing}(x, diss, ...) +\S4method{clusterSingle}{ClusterExperiment,missing}(x, + replaceCoClustering = FALSE, ...) + +\S4method{clusterSingle}{matrixOrNULL,matrixOrNULL}(x, diss, subsample = TRUE, + sequential = FALSE, mainClusterArgs = NULL, subsampleArgs = NULL, + seqArgs = NULL, isCount = FALSE, transFun = NULL, + dimReduce = c("none", "PCA", "var", "cv", "mad"), ndims = NA, + clusterLabel = "clusterSingle", checkDiss = TRUE) } \arguments{ -\item{x}{the data on which to run the clustering (features in rows).} +\item{x}{the data on which to run the clustering (features in rows), or a +\code{\link{SummarizedExperiment}}, or \code{\link{ClusterExperiment}} +object.} -\item{diss}{\code{n x n} data matrix of dissimilarities between the samples -on which to run the clustering (only if \code{subsample=FALSE})} +\item{diss}{\code{n x n} data matrix of dissimilarities between the samples +on which to run the clustering.} -\item{subsample}{logical as to whether to subsample via -\code{\link{subsampleClustering}} to get the distance matrix at each -iteration; otherwise the distance function will be determined by argument -\code{distFunction} passed in \code{clusterDArgs} (if input a data matrix).} +\item{...}{arguments to be passed on to the method for signature +\code{matrix}.} -\item{sequential}{logical whether to use the sequential strategy (see -details of \code{\link{seqCluster}}).} +\item{replaceCoClustering}{logical. Applicable if \code{x} is a +\code{ClusterExperiment} object. 
If TRUE, the co-clustering resulting from +subsampling is returned in the \code{coClustering} slot, replacing any +existing \code{coClustering} object in that slot.} -\item{subsample}{logical as to whether to subsample via -\code{\link{subsampleClustering}} to get the distance matrix at each -iteration; otherwise the distance function will be determined by argument -\code{distFunction} passed in \code{clusterDArgs} (if input a data matrix).} +\item{subsample}{logical as to whether to subsample via +\code{\link{subsampleClustering}}. If TRUE, clustering in the mainClustering step is +done on the co-occurrence between clusterings in the subsampled clustering +results. If FALSE, the mainClustering step will be run directly on +\code{x}/\code{diss}.} + +\item{sequential}{logical whether to use the sequential strategy (see details +of \code{\link{seqCluster}}). Can be used in combination with +\code{subsample=TRUE} or \code{FALSE}.} -\item{clusterFunction}{passed to \code{\link{clusterD}} option -'clusterFunction' to indicate method of clustering, see -\code{\link{clusterD}}.} +\item{mainClusterArgs}{list of arguments to be passed to the mainClustering step; see +help pages of \code{\link{mainClustering}}.} -\item{subsampleArgs}{list of arguments to be passed to +\item{subsampleArgs}{list of arguments to be passed to the subsampling step +(if \code{subsample=TRUE}), see help pages of \code{\link{subsampleClustering}}.} -\item{seqArgs}{list of additional arguments to be passed to -\code{\link{seqCluster}}.} +\item{seqArgs}{list of arguments to be passed to \code{\link{seqCluster}}.} -\item{isCount}{logical. Whether the data are in counts, in which case the -default \code{transFun} argument is set as log2(x+1). This is simply a -convenience to the user, and can be overridden by giving an explicit +\item{isCount}{logical. Whether the data are in counts, in which case the +default \code{transFun} argument is set as log2(x+1). This is simply a +convenience to the user, and can be overridden by giving an explicit function to \code{transFun}.} \item{transFun}{function A function to use to transform the input data matrix @@ -60,32 +70,125 @@ before clustering.} \item{dimReduce}{character A character identifying what type of dimensionality reduction to perform before clustering. Options are -"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more +"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more details.} -\item{ndims}{integer An integer identifying how many dimensions to reduce to +\item{ndims}{integer An integer identifying how many dimensions to reduce to in the reduction specified by \code{dimReduce}} -\item{clusterLabel}{a string used to describe the clustering. By -default it is equal to "clusterSingle", to indicate that this clustering is -the result of a call to \code{clusterSingle}.} +\item{clusterLabel}{a string used to describe the clustering. By default it +is equal to "clusterSingle", to indicate that this clustering is the result +of a call to \code{clusterSingle}.} -\item{...}{arguments to be passed on to the method for signature -\code{matrix}.} +\item{checkDiss}{logical. Whether to check whether the input \code{diss} is +valid.} } \value{ -A \code{\link{ClusterExperiment}} object. +A \code{\link{ClusterExperiment}} object if the input was \code{x}, a + matrix (or \code{assay} of a \code{ClusterExperiment} or + \code{SummarizedExperiment} object).
+ +If input was \code{diss}, then the result is a list with values + \itemize{ \item{clustering: }{The vector of clustering results} + \item{clusterInfo: }{A list with information about the parameters run in + the clustering} \item{diss: }{The dissimilarity matrix used in the + clustering} } } \description{ -Given a data matrix, \code{\link{SummarizedExperiment}}, or -\code{\link{ClusterExperiment}} object, this function will find clusters, +Given input data, \code{\link{SummarizedExperiment}}, or +\code{\link{ClusterExperiment}} object, this function will find clusters, based on a single specification of parameters. } \details{ -If sequential=TRUE, the sequential clustering controls the 'k' - argument of the underlying clustering so setting 'k=' in the list given to - clusterDArgs or subsampleArgs will not do anything and will produce a - warning to that effect. +\code{clusterSingle} is an 'expert-oriented' function, intended to + be used when a user wants to run a single clustering and/or have a great + deal of control over the clustering parameters. Most users will find + \code{\link{clusterMany}} more relevant. However, \code{\link{clusterMany}} + makes certain assumptions about the intention of certain combinations of + parameters that might not match the user's intent; similarly + \code{\link{clusterMany}} does not directly take a dissimilarity matrix but + only a matrix of values \code{x} (though a user can define a distance + function to be applied to \code{x} in \code{\link{clusterMany}}). + +Unlike \code{\link{clusterMany}}, most of the relevant arguments for + the actual clustering algorithms in \code{clusterSingle} are passed to the + relevant steps via the arguments \code{mainClusterArgs}, \code{subsampleArgs}, + and \code{seqArgs}. These arguments should be \emph{named} lists with + parameters that match the corresponding functions: + \code{\link{mainClustering}},\code{\link{subsampleClustering}}, and + \code{\link{seqCluster}}. These functions are not meant to be called by the + user, but rather accessed via calls to \code{clusterSingle}. But the user + can look at the help files of those functions for more information + regarding the parameters that they take. + +Only certain combinations of parameters are possible for certain + choices of \code{sequential} and \code{subsample}. These restrictions are + documented below. \itemize{ \item{\code{clusterFunction} for + \code{mainClusterArgs}: }{The choice of \code{subsample=TRUE} also controls + what algorithm type of clustering functions can be used in the mainClustering + step. When \code{subsample=TRUE}, then the resulting co-clustering matrix from + subsampling is converted to a dissimilarity (specifically 1-coclustering + values) and is passed to \code{diss} of \code{\link{mainClustering}}. For this + reason, the \code{ClusterFunction} object given to \code{\link{mainClustering}} + via the argument \code{mainClusterArgs} must take input of the form of a + dissimilarity. When \code{subsample=FALSE} and \code{sequential=TRUE}, the + \code{clusterFunction} passed in the \code{clusterArgs} element of + \code{mainClusterArgs} must define a \code{ClusterFunction} object with + \code{algorithmType} 'K'. When \code{subsample=FALSE} and + \code{sequential=FALSE}, then there are no restrictions on the + \code{ClusterFunction} and that clustering is applied directly to the input + data.
} \item{\code{clusterFunction} for \code{subsampleArgs}: }{If the + \code{ClusterFunction} object given to the \code{clusterArgs} of + \code{subsampleArgs} is missing, the algorithm will use the default for + \code{\link{subsampleClustering}} (currently "pam"). If + \code{sequential=TRUE}, this \code{ClusterFunction} object must be of type + 'K'. } \item{Setting \code{k} for subsampling: }{If \code{subsample=TRUE} + and \code{sequential=TRUE}, the current K of the sequential iteration + determines the 'k' argument passed to \code{\link{subsampleClustering}}, so + setting 'k=' in the list given to the subsampleArgs will not do anything + and will produce a warning to that effect (see documentation of + \code{\link{seqCluster}}).} \item{Setting \code{k} for the mainClustering step: }{If + \code{sequential=TRUE} then the user should not set \code{k} in the + \code{clusterArgs} argument of \code{mainClusterArgs} because it must be set + by the sequential code, which has an iterative resetting of the parameters. + Specifically, if \code{subsample=FALSE}, then the sequential method iterates + over choices of \code{k} to cluster the input data. And if + \code{subsample=TRUE}, then the \code{k} in the clustering of the mainClustering step + (assuming the clustering function is of type 'K') will use the \code{k} + used in the subsampling step to make sure that the \code{k} used in the + mainClustering step is reasonable. } \item{Setting \code{findBestK} in + \code{mainClusterArgs}: }{If \code{sequential=TRUE} and + \code{subsample=FALSE}, the user should not set 'findBestK=TRUE' in + \code{mainClusterArgs}. This is because in this case the sequential method + changes \code{k}; an error message will be given if this combination of + options is set. However, if \code{sequential=TRUE} and + \code{subsample=TRUE}, then passing either 'findBestK=TRUE' or + 'findBestK=FALSE' via \code{mainClusterArgs} will function as expected + (assuming the \code{clusterFunction} argument passed to \code{mainClusterArgs} + is of type 'K'). In particular, the sequential step will set the number of + clusters \code{k} for clustering of each subsample. If findBestK=FALSE, + that same \code{k} will be used for the mainClustering step that clusters the + resulting co-occurrence matrix after subsampling. If findBestK=TRUE, then + \code{\link{mainClustering}} will search for the best k. Note that the default + 'kRange' over which \code{\link{mainClustering}} searches when findBestK=TRUE + depends on the input value of \code{k} (which is set by the sequential + method if \code{sequential=TRUE}), see above. The user can change + \code{kRange} to not depend on \code{k} and to be fixed across all of the + sequential steps by setting \code{kRange} explicitly in the + \code{mainClusterArgs} list.} } + +To provide a distance matrix via the argument \code{distFunction}, + the function must be defined to take the distance of the rows of a matrix + (internally, the function will call \code{distFunction(t(x))}). This is to + be compatible with the input for the \code{dist} function. \code{as.matrix} + will be performed on the output of \code{distFunction}, so if the object + returned has an \code{as.matrix} method that will convert the output into a + symmetric matrix of distances, this is fine (for example the class + \code{dist} for objects returned by \code{dist} have such a method). If + \code{distFunction=NA}, then a default distance will be calculated based on + the type of clustering algorithm of \code{clusterFunction}. For type "K"
+ the default is to take \code{dist} as the distance function. For type "01", + the default is to take (1-cor(x))/2. } \examples{ data(simData) @@ -94,17 +197,19 @@ data(simData) #following code takes some time. #use clusterSingle to do sequential clustering #(same as example in seqCluster only using clusterSingle ...) -set.seed(44261) -clustSeqHier_v2 <- clusterSingle(simData, clusterFunction="hierarchical01", -sequential=TRUE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7, -clusterFunction="kmeans", clusterArgs=list(nstart=10)), -seqArgs=list(beta=0.8, k0=5), clusterDArgs=list(minSize=5)) +set.seed(44261) +clustSeqHier_v2 <- clusterSingle(simData, +sequential=TRUE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7, +clusterFunction="kmeans", clusterArgs=list(nstart=10)), +seqArgs=list(beta=0.8, k0=5), mainClusterArgs=list(minSize=5, + clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1))) } #use clusterSingle to do just clustering k=3 with no subsampling -clustNothing <- clusterSingle(simData, clusterFunction="pam", -subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=3)) +clustNothing <- clusterSingle(simData, + subsample=FALSE, sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", + clusterArgs=list(k=3))) +#compare to standard pam +cluster::pam(t(simData),k=3,cluster.only=TRUE) } \seealso{ -\code{\link{clusterMany}} to compare multiple choices of parameters. +\code{\link{clusterMany}} to compare multiple choices of parameters, + and \code{\link{mainClustering}},\code{\link{subsampleClustering}}, and + \code{\link{seqCluster}} for the underlying functions called by + \code{clusterSingle}. } diff --git a/man/combineMany.Rd b/man/combineMany.Rd index 6d7a6f81..616bbf30 100644 --- a/man/combineMany.Rd +++ b/man/combineMany.Rd @@ -9,7 +9,7 @@ \alias{combineMany,ClusterExperiment,missing-method} \title{Find sets of samples that stay together across clusterings} \usage{ -\S4method{combineMany}{matrix,missing}(x, whichClusters, proportion = 1, +\S4method{combineMany}{matrix,missing}(x, whichClusters, proportion, clusterFunction = "hierarchical01", propUnassigned = 0.5, minSize = 5) \S4method{combineMany}{ClusterExperiment,numeric}(x, whichClusters, @@ -26,24 +26,24 @@ clusters to compare (missing if x is a matrix)} \item{proportion}{The proportion of times that two sets of samples should be -together in order to be grouped into a cluster (if <1, passed to clusterD +together in order to be grouped into a cluster (if <1, passed to mainClustering via alpha = 1 - proportion)} \item{clusterFunction}{the clustering to use (passed to -\code{\link{clusterD}}); currently must be of type '01'.} +\code{\link{mainClustering}}); currently must be of type '01'.} \item{propUnassigned}{samples with greater than this proportion of assignments equal to '-1' are assigned a '-1' cluster value as a last step (only if proportion < 1)} \item{minSize}{minimum size required for a set of samples to be considered in -a cluster because of shared clustering, passed to \code{\link{clusterD}}} +a cluster because of shared clustering, passed to \code{\link{mainClustering}}} -\item{eraseOld}{logical. Only relevant if input \code{x} is of class -\code{ClusterExperiment}. If TRUE, will erase existing workflow results -(clusterMany as well as mergeClusters and combineMany).
If FALSE, existing +workflow results will have "\code{_i}" added to the clusterTypes value, +where \code{i} is one more than the largest such existing workflow clusterTypes.} \item{clusterLabel}{a string used to describe the type of clustering. By @@ -65,7 +65,7 @@ If x is a matrix, a list with values out of those not '-1'} \item{\code{noUnassignedCorrection}{ a vector of cluster assignments before samples were converted to '-1' because had >\code{propUnassigned} '-1' values (i.e. the direct output of the - \code{clusterD} output.)}} + \code{mainClustering} output.)}} } If x is a \code{\link{ClusterExperiment}}, a @@ -81,7 +81,7 @@ a new clustering vector. The function tries to find a consensus cluster across many different clusterings of the same samples. It does so by creating a \code{nSamples} x \code{nSamples} matrix of the percentage of co-occurance of each sample and - then calling clusterD to cluster the co-occurance matrix. The function + then calling mainClustering to cluster the co-occurance matrix. The function assumes that '-1' labels indicate clusters that are not assigned to a cluster. Co-occurance with the unassigned cluster is treated differently than other clusters. The percent co-occurance is taken only with respect to @@ -89,10 +89,10 @@ The function tries to find a consensus cluster across many different than \code{propUnassigned} values that are '-1' across all of the clusterings are assigned a '-1' regardless of their cluster assignment. -The method calls \code{\link{clusterD}} on the proportion matrix with +The method calls \code{\link{mainClustering}} on the proportion matrix with \code{clusterFunction} as the 01 clustering algorithm, \code{alpha=1-proportion}, \code{minSize=minSize}, and \code{evalClusterMethod=c("average")}. See help of -\code{\link{clusterD}} for more details. +\code{\link{mainClustering}} for more details. } \examples{ data(simData) diff --git a/man/getBestFeatures.Rd b/man/getBestFeatures.Rd index 54f9aac3..5b09537e 100644 --- a/man/getBestFeatures.Rd +++ b/man/getBestFeatures.Rd @@ -44,13 +44,13 @@ to be compared and must match the names of the clusters in the vector \item{isCount}{logical as to whether input data is count data, in which case to perform voom correction to data. See details.} -\item{normalize.method}{character value, passed to \code{\link{voom}} in -\code{\link{limma}} package. Only used if \code{countData=TRUE}. +\item{normalize.method}{character value, passed to \code{\link[limma]{voom}} in +\code{\link[limma]{limma}} package. Only used if \code{countData=TRUE}. Note that the default value is set to "none", which is not the default value of \code{\link{voom}}.} \item{...}{options to pass to \code{\link{topTable}} or -\code{\link{topTableF}} (see \code{\link{limma}} package)} +\code{\link[limma]{topTableF}} (see \code{\link[limma]{limma}} package)} } \value{ A \code{data.frame} in the same format as @@ -123,8 +123,8 @@ Note that the default option for \code{\link[limma]{topTable}} is should be the default for RNA-Seq data. If the input data is a `ClusterExperiment` object, setting `isCount=TRUE` will cause the program to ignore the internally stored transformation function and instead use - voom with log2(x+0.5). Alternatively, `isCount=FALSE` for a - `ClusterExperiment` object will cause the DE to be performed with `limma` + voom with log2(x+0.5). 
Alternatively, \code{isCount=FALSE} for a + \code{ClusterExperiment} object will cause the DE to be performed with \code{limma} after transforming the data with the stored transformation. Although some writing about "voom" seem to suggest that it would be appropriate for arbitrary transformations, the authors have cautioned against using it for @@ -136,8 +136,8 @@ Note that the default option for \code{\link[limma]{topTable}} is data(simData) #create a clustering, for 8 clusters (truth was 4) -cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE, -sequential=FALSE, clusterDArgs=list(k=8)) +cl <- clusterSingle(simData, subsample=FALSE, +sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8))) #basic F test, return all, even if not significant: testF <- getBestFeatures(cl, contrastType="F", number=nrow(simData), @@ -178,3 +178,10 @@ plot(testF$P.Value[order(testF$Index)], testFVoom$P.Value[order(testFVoom$Index)],log="xy") } +\references{ +Ritchie, ME, Phipson, B, Wu, D, Hu, Y, Law, CW, Shi, W, and Smyth, GK (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43, e47. http://nar.oxfordjournals.org/content/43/7/e47 + +Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom: precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biology 15, R29. http://genomebiology.com/2014/15/2/R29 + +Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray experiments. Statistical Applications in Genetics and Molecular Biology, Volume 3, Article 3. http://www.statsci.org/smyth/pubs/ebayes.pdf +} diff --git a/man/mainClustering.Rd b/man/mainClustering.Rd new file mode 100644 index 00000000..d08919da --- /dev/null +++ b/man/mainClustering.Rd @@ -0,0 +1,141 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mainClustering.R +\docType{methods} +\name{mainClustering} +\alias{mainClustering} +\alias{mainClustering,character-method} +\alias{mainClustering,ClusterFunction-method} +\alias{getPostProcessingArgs,ClusterFunction-method} +\alias{getPostProcessingArgs} +\title{Cluster distance matrix from subsampling} +\usage{ +\S4method{mainClustering}{character}(clusterFunction, ...) + +\S4method{mainClustering}{ClusterFunction}(clusterFunction, x = NULL, + diss = NULL, distFunction = NA, clusterArgs = NULL, minSize = 1, + orderBy = c("size", "best"), format = c("vector", "list"), + checkArgs = TRUE, checkDiss = TRUE, returnData = FALSE, ...) + +\S4method{getPostProcessingArgs}{ClusterFunction}(clusterFunction) +} +\arguments{ +\item{clusterFunction}{a \code{\link{ClusterFunction}} object that defines +the clustering routine. See \code{\link{ClusterFunction}} for required +format of user-defined clustering routines. User can also give a character +value to the argument \code{clusterFunction} to indicate the use of +clustering routines provided in package. Type +\code{\link{listBuiltInFunctions}} at command prompt to see the built-in +clustering routines. If \code{clusterFunction} is missing, the default is +set to "pam".} + +\item{...}{arguments passed to the post-processing steps of the clustering. +The available post-processing arguments for a \code{ClusterFunction} object +depend on it's algorithm type and can be found by calling +\code{getPostProcessingArgs}. 
+See details below for documentation.}
+
+\item{x}{\code{p x n} data matrix on which to run the clustering (samples in
+columns).}
+
+\item{diss}{\code{n x n} data matrix of dissimilarities between the samples
+on which to run the clustering}
+
+\item{distFunction}{a distance function to be applied to \code{x}. Only
+relevant if input is only \code{x} (a matrix of data), and
+\code{diss=NULL}. See details of \code{\link{clusterSingle}} for the
+required format of the distance function.}
+
+\item{clusterArgs}{arguments to be passed directly to the \code{clusterFUN}
+slot of the \code{ClusterFunction} object}
+
+\item{minSize}{the minimum number of samples in a cluster. Clusters found
+below this size will be discarded and samples in the cluster will be given
+a cluster assignment of "-1" to indicate that they were not clustered.}
+
+\item{orderBy}{how to order the clusters (either by size or by maximum alpha
+value). If orderBy="size", the numbering of the clusters is reordered by
+the size of the cluster, instead of by the internal ordering of the
+\code{clusterFUN} defined in the \code{ClusterFunction} object (an internal
+ordering is only possible if slot \code{outputType} of the
+\code{ClusterFunction} is \code{"list"}).}
+
+\item{format}{whether to return a list of indices in a cluster or a vector of
+clustering assignments. List is mainly for compatibility with the sequential
+part.}
+
+\item{checkArgs}{logical as to whether to give a warning if arguments are
+given that don't match the clustering choices. Otherwise, inapplicable
+arguments will be ignored without warning.}
+
+\item{checkDiss}{logical. Whether to check whether the input \code{diss} is
+valid.}
+
+\item{returnData}{logical as to whether to return the \code{diss} or \code{x}
+matrix in the output. If \code{FALSE} only the clustering vector is
+returned.}
+}
+\value{
+mainClustering returns a vector of cluster assignments (if format="vector")
+  or a list of indices for each cluster (if format="list"). Clusters smaller
+  than minSize are removed.
+}
+\description{
+Given input data, this function will try to find the clusters
+  based on the given ClusterFunction object.
+}
+\details{
+\code{mainClustering} is not meant to be called by the user. It is only an
+  exported function so as to be able to clearly document the arguments for
+  \code{mainClustering} which can be passed via the argument \code{mainClusterArgs} in
+  functions like \code{\link{clusterSingle}} and \code{\link{clusterMany}}.
+
+Post-processing Arguments: For post-processing the clustering,
+  currently only type 'K' algorithms have a defined post-processing.
+  Specifically
+\itemize{
+  \item{"findBestK"}{logical, whether to find the best K based on average
+  silhouette width (only used if clusterFunction of type "K").}
+  \item{"kRange"}{vector of integers to try for k values if findBestK=TRUE. If
+  \code{k} is given in \code{clusterArgs}, then default is k-2 to k+20,
+  subject to those values being greater than 2; if not the default is
+  \code{2:20}.
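+  For example, if \code{k=5} is given in \code{clusterArgs}, the default
+  \code{kRange} is \code{3:25}.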
+  Note that default values depend on the input k, so running for
+  different choices of k and findBestK=TRUE can give different answers unless
+  kRange is set to be the same.}
+  \item{"removeSil"}{logical as to whether to remove the assignment of a sample
+  to a cluster when the sample's silhouette value is less than
+  \code{silCutoff}.}
+  \item{"silCutoff"}{Cutoff on the minimum silhouette width to be included in
+  a cluster (only used if removeSil=TRUE).}
+}
+}
+\examples{
+data(simData)
+cl1<-mainClustering(x=simData,clusterFunction="pam",clusterArgs=list(k=3))
+cl2<-mainClustering(simData,clusterFunction="hierarchical01",clusterArgs=list(alpha=.1))
+cl3<-mainClustering(simData,clusterFunction="tight",clusterArgs=list(alpha=.1))
+#change distance to manhattan distance
+cl4<-mainClustering(simData,clusterFunction="pam",clusterArgs=list(k=3),
+     distFunction=function(x){dist(x,method="manhattan")})
+
+#run hierarchical method for finding blocks, with method of evaluating
+#coherence of block set to evalClusterMethod="average", and the hierarchical
+#clustering using single linkage:
+clustSubHier <- mainClustering(simData, clusterFunction="hierarchical01",
+minSize=5, clusterArgs=list(alpha=0.1,evalClusterMethod="average", method="single"))
+
+#do tight
+clustSubTight <- mainClustering(simData, clusterFunction="tight", clusterArgs=list(alpha=0.1),
+minSize=5)
+
+#two twists to pam
+clustSubPamK <- mainClustering(simData, clusterFunction="pam", silCutoff=0, minSize=5,
+removeSil=TRUE, clusterArgs=list(k=3))
+clustSubPamBestK <- mainClustering(simData, clusterFunction="pam", silCutoff=0,
+minSize=5, removeSil=TRUE, findBestK=TRUE, kRange=2:10)
+
+# note that passing the wrong arguments for an algorithm results in warnings
+# (which can be turned off with checkArgs=FALSE)
+clustSubTight_test <- mainClustering(simData, clusterFunction="tight",
+clusterArgs=list(alpha=0.1), minSize=5, removeSil=TRUE)
+clustSubTight_test2 <- mainClustering(simData, clusterFunction="tight",
+clusterArgs=list(alpha=0.1,evalClusterMethod="average"))
+}
diff --git a/man/makeDendrogram.Rd b/man/makeDendrogram.Rd
index 55a86aec..a5d3f8c8 100644
--- a/man/makeDendrogram.Rd
+++ b/man/makeDendrogram.Rd
@@ -9,7 +9,7 @@
 \usage{
 \S4method{makeDendrogram}{ClusterExperiment}(x,
   whichCluster = "primaryCluster", dimReduce = c("none", "PCA", "var", "cv",
-  "mad"), ndims = NA, ignoreUnassignedVar = FALSE,
+  "mad"), ndims = NA, ignoreUnassignedVar = TRUE,
   unassignedSamples = c("outgroup", "cluster"), ...)

 \S4method{makeDendrogram}{matrix}(x, cluster,
@@ -25,10 +25,10 @@ primaryCluster.}

 \item{dimReduce}{character A character identifying what type of
 dimensionality reduction to perform before clustering. Options are
-"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more
+"none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more
 details.}

-\item{ndims}{integer An integer identifying how many dimensions to reduce to
+\item{ndims}{integer An integer identifying how many dimensions to reduce to
 in the reduction specified by \code{dimReduce}}

 \item{ignoreUnassignedVar}{logical indicating whether dimensionality reduction
@@ -80,8 +80,8 @@ permitted for the \code{@dendro_samples} slot.
 data(simData)
 #create a clustering, for 8 clusters (truth was 3)
-cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=8))
+cl <- clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8)))

 #create dendrogram of clusters:
 hcl <- makeDendrogram(cl)
diff --git a/man/mergeClusters.Rd b/man/mergeClusters.Rd
index d50317cd..2acb2853 100644
--- a/man/mergeClusters.Rd
+++ b/man/mergeClusters.Rd
@@ -8,9 +8,9 @@
 \title{Merge clusters based on dendrogram}
 \usage{
 \S4method{mergeClusters}{matrix}(x, cl, dendro = NULL,
-  mergeMethod = c("none", "adjP", "locfdr", "MB", "JC"),
-  plotInfo = c("none", "all", "mergeMethod", "adjP", "locfdr", "MB", "JC"),
-  cutoff = 0.1, plot = TRUE, isCount = TRUE, ...)
+  mergeMethod = c("none", "Storey", "PC", "adjP", "locfdr", "MB", "JC"),
+  plotInfo = c("none", "all", "Storey", "PC", "adjP", "locfdr", "MB", "JC",
+  "mergeMethod"), cutoff = 0.1, plot = TRUE, isCount = TRUE, ...)

 \S4method{mergeClusters}{ClusterExperiment}(x, eraseOld = FALSE,
   isCount = FALSE, mergeMethod = "none", plotInfo = "all",
@@ -60,11 +60,11 @@ details.}
 For signature \code{ClusterExperiment} arguments passed to the method for
 signature \code{matrix} and then onto \code{\link{plot.phylo}}.}

-\item{eraseOld}{logical. Only relevant if input \code{x} is of class
-\code{ClusterExperiment}. If TRUE, will erase existing workflow results
-(clusterMany as well as mergeClusters and combineMany). If FALSE, existing
-workflow results will have "\code{_i}" added to the clusterTypes value,
-where \code{i} is one more than the largest such existing workflow
+\item{eraseOld}{logical. Only relevant if input \code{x} is of class
+\code{ClusterExperiment}. If TRUE, will erase existing workflow results
+(clusterMany as well as mergeClusters and combineMany). If FALSE, existing
+workflow results will have "\code{_i}" added to the clusterTypes value,
+where \code{i} is one more than the largest such existing workflow
 clusterTypes.}

 \item{clusterLabel}{a string used to describe the type of clustering. By
@@ -111,7 +111,9 @@ If \code{isCount=TRUE}, and the input is a matrix, \code{log2(count
 given to the input and will be used for both \code{makeDendrogram} and
 \code{getBestFeatures}, with no voom correction.

-"JC" refers to the method of Ji and Cai (2007), and implementation
+"Storey" refers to the method of Storey (2002). "PC" refers to the
+method of Pounds and Cheng (2004). "JC" refers to the method of
+Ji and Cai (2007), and the implementation
 of the "JC" method is copied from code available on Jiashin Ji's website,
 December 16, 2015
 (http://www.stat.cmu.edu/~jiashun/Research/software/NullandProp/). "locfdr"
@@ -139,8 +141,8 @@ If the dendrogram was made with option
 data(simData)
 #create a clustering, for 8 clusters (truth was 3)
-cl<-clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=8))
+cl<-clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8)))

 #give more interesting names to clusters:
 newNames<- paste("Cluster",clusterLegend(cl)[[1]][,"name"],sep="")
@@ -162,6 +164,21 @@ leafType="clusters",label="name")

 #compare merged to original
 table(primaryCluster(cl), primaryCluster(merged))
+}
+\references{
+Ji and Cai (2007), "Estimating the Null and the Proportion
+of Nonnull Effects in Large-Scale Multiple Comparisons", JASA 102: 495-506.
+
+Efron (2004) “Large-scale simultaneous hypothesis testing:
+the choice of a null hypothesis,” JASA, 99: 96–104.
+
+Meinshausen and Buhlmann (2005) "Lower bounds for the
+number of false null hypotheses for multiple testing of associations",
+Biometrika 92(4): 893-907.
+
+Storey (2002) "A direct approach to false discovery rates",
+J. R. Statist. Soc. B 64(3): 479–498.
+
+Pounds and Cheng (2004). "Improving false discovery rate estimation."
+Bioinformatics 20(11): 1737-1745.
 }
 \seealso{
 makeDendrogram, plotDendrogram, getBestFeatures
diff --git a/man/plotClusters.Rd b/man/plotClusters.Rd
index 585a2c64..28de7add 100644
--- a/man/plotClusters.Rd
+++ b/man/plotClusters.Rd
@@ -249,8 +249,11 @@ cl2 <- clusterExperiment(assay(cl), clMatNew,
 transformation=transformation(cl))
 plotClusters(cl2)
 }
+\references{
+Wilkerson, M. D. and Hayes, D. N. (2010). “ConsensusClusterPlus: a class discovery tool with confidence assessments and item tracking.” Bioinformatics, 26(12), pp. 1572-1573.
+}
 \seealso{
-The \link[ConsensusClusterPlus]{ConsensusClusterPlus} package.
+The \code{\link[ConsensusClusterPlus]{ConsensusClusterPlus}} package.
 }
 \author{
 Elizabeth Purdom and Marla Johnson (based on the tracking plot in
diff --git a/man/plotDendrogram.Rd b/man/plotDendrogram.Rd
index edab8b0e..6367a883 100644
--- a/man/plotDendrogram.Rd
+++ b/man/plotDendrogram.Rd
@@ -64,8 +64,8 @@ If \code{leafType="clusters"}, the plotting function will work best
 data(simData)
 #create a clustering, for 8 clusters (truth was 3)
-cl <-clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=8))
+cl <-clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8)))

 #create dendrogram of clusters and then
 # merge clusters based on dendrogram:
diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd
index 13178b33..d22b7428 100644
--- a/man/plotHeatmap.Rd
+++ b/man/plotHeatmap.Rd
@@ -44,9 +44,9 @@
 \code{\link[SummarizedExperiment]{SummarizedExperiment}} object. The
 interpretation of parameters depends on the type of the input to \code{data}.}

-\item{isCount}{logical. Whether the data are in counts, in which case the
-default \code{transFun} argument is set as log2(x+1). This is simply a
-convenience to the user, and can be overridden by giving an explicit
+\item{isCount}{logical. Whether the data are in counts, in which case the
+default \code{transFun} argument is set as log2(x+1). This is simply a
+convenience to the user, and can be overridden by giving an explicit
 function to \code{transFun}.}

 \item{transFun}{function A function to use to transform the input data matrix
@@ -54,7 +54,7 @@ before clustering.}

 \item{...}{for signature \code{matrix}, arguments passed to \code{aheatmap}.
 For the other signatures, passed to the method for signature \code{matrix}.
-Not all arguments can be passed to aheatmap effectively, see details.}
+Not all arguments can be passed to \code{aheatmap} effectively, see details.}

 \item{clusterSamplesData}{If \code{data} is a matrix, either a matrix that
 will be used in \code{hclust} to define the hierarchical clustering of
@@ -139,8 +139,8 @@ sample matrices (e.g., correlation).}

 \item{overRideClusterLimit}{logical. Whether to override the internal limit
 that only allows 10 clusterings/annotations. If overridden, may result in
-incomprehensible errors from aheatmap. Only override this if you have a
-very large plotting device and want to see if aheatmap can render it.}
+incomprehensible errors from \code{aheatmap}. Only override this if you have a
+very large plotting device and want to see if \code{aheatmap} can render it.}

 \item{invert}{logical determining whether the coClustering matrix should be
 inverted to be 1-coClustering for plotting. By default, if the diagonal
@@ -256,7 +256,7 @@ If you have a factor with many levels, it is important to note that
 get the color white. Thus if you have many factors or many levels in those
 factors, you should set their colors via \code{clusterLegend}.

-Many arguments can be passed on to aheatmap, however, some are set
+Many arguments can be passed on to \code{aheatmap}, however, some are set
 internally by \code{plotHeatmap}. In particular, setting the values of
 \code{Rowv} or \code{Colv} will cause errors. \code{color} in
 \code{aheatmap} is replaced by \code{colorScale} in \code{plotHeatmap}. The
@@ -321,6 +321,9 @@ plotHeatmap(simData, colorScale=seqPal1, breaks=.90,
 main="0.90 Quantile Upper Limit")
 }
+}
+\seealso{
+\code{\link[NMF]{aheatmap}}
 }
 \author{
 Elizabeth Purdom
diff --git a/man/plottingFunctions.Rd b/man/plottingFunctions.Rd
index 6bb32147..79b18cf7 100644
--- a/man/plottingFunctions.Rd
+++ b/man/plottingFunctions.Rd
@@ -106,8 +106,8 @@ setBreaks(data=simData,breaks=.9)
 showHeatmapPalettes()

 #compare the palettes on heatmap
-cl <- clusterSingle(simData, clusterFunction="pam", subsample=FALSE,
-sequential=FALSE, clusterDArgs=list(k=8))
+cl <- clusterSingle(simData, subsample=FALSE,
+sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8)))

 \dontrun{
 par(mfrow=c(2,3))
diff --git a/man/seqCluster.Rd b/man/seqCluster.Rd
index 3b642395..a3eed9e4 100644
--- a/man/seqCluster.Rd
+++ b/man/seqCluster.Rd
@@ -4,10 +4,10 @@
 \alias{seqCluster}
 \title{Program for sequentially clustering, removing the cluster, and starting again.}
 \usage{
-seqCluster(x = NULL, diss = NULL, k0, clusterFunction = c("tight",
-  "hierarchical01", "pam", "hierarchicalK"), subsample = TRUE, beta = 0.7,
-  top.can = 15, remain.n = 30, k.min = 3, k.max = k0 + 10,
-  verbose = TRUE, subsampleArgs = NULL, clusterDArgs = NULL)
+seqCluster(x = NULL, diss = NULL, k0, subsample = TRUE, beta,
+  top.can = 5, remain.n = 30, k.min = 3, k.max = k0 + 10,
+  verbose = TRUE, subsampleArgs = NULL, mainClusterArgs = NULL,
+  checkDiss = TRUE)
 }
 \arguments{
 \item{x}{\code{p x n} data matrix on which to run the clustering (samples in
@@ -19,22 +19,19 @@ on which to run the clustering}
 \item{k0}{the value of K at the first iteration of sequential algorithm, see
 details below or vignette.}

-\item{clusterFunction}{passed to clusterDMat option 'clusterFunction' to
-indicate method of clustering, see \code{\link{clusterD}}.}
-
 \item{subsample}{logical as to whether to subsample via
 \code{\link{subsampleClustering}} to get the distance matrix at each
 iteration; otherwise the distance matrix is set by arguments to
-\code{\link{clusterD}}.}
+\code{\link{mainClustering}}.}

 \item{beta}{value between 0 and 1 to decide how stable cluster
 membership has to be before 'finding' and removing the cluster.}

-\item{top.can}{only the top.can clusters from \code{\link{clusterD}} (ranked
-by 'orderBy' argument given to \code{\link{clusterD}}) will be compared
+\item{top.can}{only the top.can clusters from \code{\link{mainClustering}} (ranked
+by 'orderBy' argument given to \code{\link{mainClustering}}) will be compared
 pairwise for stability. Making this very big will effectively remove this
 parameter and all pairwise comparisons of all clusters found will be
-considered. This might result in smaller clusters being found. Current
+considered. This might result in smaller clusters being found. The current
 default is fairly large, so probably will have little effect.}

 \item{remain.n}{when only this number of samples are left (i.e. not yet
@@ -52,9 +49,11 @@ progress.}
 \item{subsampleArgs}{list of arguments to be passed to
 \code{\link{subsampleClustering}}.}

-\item{clusterDArgs}{list of arguments to be passed to
-\code{\link{clusterD}}(which can include arguments to be passed to
-\code{\link{cluster01}} or \code{\link{clusterK}}).}
+\item{mainClusterArgs}{list of arguments to be passed to
+\code{\link{mainClustering}}.}
+
+\item{checkDiss}{logical. Whether to check whether the input \code{diss} is
+valid.}
 }
 \value{
 A list with values
@@ -80,92 +79,90 @@ routines, and sequentially remove best clusters, and iterate to find
 clusters.
 }
 \details{
-This code is adapted from the code of the tightClust
-  package of Tseng and Wong
-
-Each iteration of the algorithm will cluster the current set of
-  samples. Depending on the method, the number of clusters resulting from
-  \code{\link{clusterD}} may not be equal to the K used in the clustering of
-  the (subsampled) data. The resulting clusters will then be compared to
-  clusters found in the previous iteration that set the subsampling
-  clustering to K-1. For computational (and other?) convenience, only the
-  first top.can clusters of each iteration will be compared to the first
-  top.can clusters of previous iteration for similarity (where top.can
-  currently refers to ordering by size, so first top.can largest clusters).
-
-If there is a cluster in the current iteration that has overlap
-  similarity > beta to a cluster in the previous iteration, then the cluster
-  with the largest such similarity will be identified as a 'final' cluster
-  and the samples in it will be removed for future iterations. The algorithm
-  will then continue to the next iteration, but without these samples.
-  Furthermore, in this case K for the next iteration will NOT be set to K+1,
-  but will be reset to kinit-1, where kinit was the first K used after the
-  previous 'final' cluster was removed. If kinit-1 beta to any in the previous iteration, then
-  the algorithm will move to the next iteration (i.e. redo after increasing K
-  to K+1).
-
-If there are less than remain.n samples left after finding a cluster
-  and removing its samples, the algorithm will stop, as subsampling is deamed
-  to no longer be appropriate. If the K has to be increased to beyond k.max
-  without finding any pair of clusters with overlap > beta, then the
-  algorithm will stop. Any samples not found as part of a 'final' cluster
-  after the algorithm stops, will be classified as unclustered (given a value
-  of -1)
-
-'subsample' controls what is the D (distance) matrix used for
-  clustering at each iteration. If subsample=TRUE, D is given via
-  \code{\link{subsampleClustering}} function with k=K (with additional
-  arguments passed via subsampleArgs). If subsample=FALSE, D is dist(x), for
-  the samples currently considered in the iteration and clusterFunction must
-  be of the 'K' type (e.g. "pam", see \code{\link{clusterD}}) or an error
-  will be produced. The nsample x nsample matrix D is then clustered via
-  \code{\link{clusterD}} to find clusters. The option 'clusterFunction' is
-  passed to the argument 'clusterFunction' of \code{\link{clusterD}} to
-  control what method is used to cluster D.
-
-If clusterFunction is of type 'K' (e.g. "pam", see
-  \code{\link{clusterD}}) the 'k' argument of \code{\link{clusterK}} called
-  by \code{\link{clusterD}} is set to the current iteration of K by the
-  sequential iteration, so setting 'k=' in the list given to clusterDArgs
-  will not do anything and will produce a warning to that effect.
-
-Similarly, the current K of the iteration also determines the 'k'
-  argument passed to \code{\link{subsampleClustering}} so setting 'k=' in
-  the list given to the subsampleArgs will not do anything and will produce a
-  warning to that effect.
-
-If subsample=FALSE and 'findBestK=FALSE' is passed to clusterDArgs,
-  then each iteration will run the clustering given by clusterFunction on
-  dist(x) iterating over k. However, if subsample=FALSE, you should not set
-  'findBestK=TRUE' (otherwise clustering dist(x) will be essentially the same
-  for iterating over different k and there is no method implemented to change
-  the choice of how to remove a cluster other than similarity as you change
-  k); an error message will be given if this combination of options are set.
-
-However, if clusterFunction="pam" (or is of type 'K') and
-  subsample=TRUE passing either 'findBestK=TRUE' or 'findBestK=FALSE' will
-  function as expected. In particular, the iteration over K will set the
-  number of clusters for clustering of each subsample. If findBestK=FALSE,
-  that same K will be used for clustering of DMat. If findBestK=TRUE, then
-  \code{\link{clusterD}} will search for best k; note that the default
-  'kRange' over which \code{\link{clusterD}} searches when findBestK=TRUE
-  depends on the input value of 'k' (you can change this to a fixed set of
-  values by setting 'kRange' explicitly in the clusterDArgs list).
+\code{seqCluster} is not meant to be called by the user. It is only
+  an exported function so as to be able to clearly document the arguments for
+  \code{seqCluster} which can be passed via the argument \code{seqArgs} in
+  functions like \code{\link{clusterSingle}} and \code{\link{clusterMany}}.
+
+This code is adapted from the sequential portion of the code of the
+  tightClust package of Tseng and Wong. At each iteration of the algorithm it
+  finds a set of samples that constitutes a homogeneous cluster, removes
+  them, and iterates again to find the next set of samples that form a
+  cluster.
+
+In each iteration, to determine the next homogeneous set of
+  samples, the algorithm will iteratively cluster the current set of samples
+  for a series of increasing values of the parameter $K$, starting at a value
+  \code{kinit} and increasing by 1 at each iteration, until a sufficiently
+  homogeneous set of clusters is found. For the first set of homogeneous
+  samples, \code{kinit} is set to the argument $k0$; for later sets,
+  \code{kinit} is reset internally (see below).
+
+Depending on the value of \code{subsample}, how the value of $K$ is
+  used differs. If \code{subsample=TRUE}, $K$ is the \code{k} sent to the
+  cluster function \code{clusterFunction} given to
+  \code{\link{subsampleClustering}} via \code{subsampleArgs}; then
+  \code{\link{mainClustering}} is run on the co-occurrence matrix resulting from
+  \code{\link{subsampleClustering}} with the \code{ClusterFunction} object
+  defined in the argument \code{clusterFunction} set via \code{mainClusterArgs}.
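+  As a rough, illustrative sketch only (made-up argument values; this is not
+  how users invoke \code{seqCluster}), one such iteration corresponds to:
+  \preformatted{
+  ## subsample and cluster, asking for K clusters per subsample
+  S <- subsampleClustering(clusterFunction="kmeans", x=x,
+         clusterArgs=list(k=K, nstart=10))
+  ## cluster the resulting co-occurrence matrix (D=1-S)
+  mainClustering(diss=1-S, clusterFunction="hierarchical01",
+         clusterArgs=list(alpha=0.1))
+  }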
+  The number of clusters actually resulting from this run of
+  \code{\link{mainClustering}} may not be equal to the $K$ sent to the clustering
+  done in \code{\link{subsampleClustering}}. If \code{subsample=FALSE},
+  \code{\link{mainClustering}} is called directly on the data to determine the
+  clusters, and the $K$ set by \code{seqCluster} for this iteration determines the
+  parameter of the clustering done by \code{\link{mainClustering}}. Specifically,
+  the argument \code{clusterFunction} defines the clustering of the
+  \code{\link{mainClustering}} step and \code{k} is sent to that
+  \code{ClusterFunction} object. This means that if \code{subsample=FALSE},
+  the \code{clusterFunction} must be of \code{algorithmType} "K".
+
+In either setting of \code{subsample}, the resulting clusters from
+  \code{\link{mainClustering}} for a particular $K$ will be compared to clusters
+  found in the previous iteration of $K-1$. For computational (and other?)
+  convenience, only the first \code{top.can} clusters of each iteration will
+  be compared to the first \code{top.can} clusters of the previous iteration for
+  similarity (where \code{top.can} currently refers to ordering by size, so
+  the first \code{top.can} largest clusters).
+
+If there is no cluster of the first \code{top.can} in the current
+  iteration $K$ that has overlap similarity > \code{beta} to any in the
+  previous iteration, then the algorithm will move to the next iteration,
+  increasing to $K+1$.
+
+If, however, of these clusters there is a cluster in the current
+  iteration $K$ that has overlap similarity > \code{beta} to a cluster in the
+  previous iteration $K-1$, then the cluster with the largest such similarity
+  will be identified as a homogeneous set of samples and the samples in it
+  will be removed and designated as such. The algorithm will then start again
+  to determine the next set of homogeneous samples, but without these samples.
+  Furthermore, in this case (i.e. a cluster was found and removed), the value
+  of \code{kinit} will be reset to \code{kinit-1}; i.e. the range of
+  increasing $K$ that will be iterated over to find a set of homogeneous
+  samples will start off one value less than was the case for the previous
+  set of homogeneous samples. If \code{kinit-1}<\code{k.min}, then
+  \code{kinit} will be set to \code{k.min}.
+
+If there are fewer than \code{remain.n} samples left after finding a
+  cluster and removing its samples, the algorithm will stop, as subsampling
+  is deemed to no longer be appropriate. If $K$ has to be increased
+  beyond \code{k.max} without finding any pair of clusters with overlap >
+  \code{beta}, then the algorithm will stop. Any samples not found as part of a
+  homogeneous set of clusters at that point will be classified as unclustered
+  (given a value of -1).
+
+Certain combinations of inputs to \code{mainClusterArgs} and
+  \code{subsampleArgs} are not allowed. See \code{\link{clusterSingle}} for
+  these explanations.
 }
 \examples{
 \dontrun{
 data(simData)
 set.seed(12908)
-clustSeqHier <- seqCluster(t(simData), k0=5, subsample=TRUE,
-clusterFunction="hierarchical01", beta=0.8, subsampleArgs=list(resamp.n=100,
+clustSeqHier <- seqCluster(simData, k0=5, subsample=TRUE,
+beta=0.8, subsampleArgs=list(resamp.num=100,
 samp.p=0.7, clusterFunction="kmeans", clusterArgs=list(nstart=10)),
-clusterDArgs=list(minSize=5))
+mainClusterArgs=list(minSize=5,clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1)))
 }
 }
 \references{
@@ -174,5 +171,6 @@ Tseng and Wong (2005), "Tight Clustering: A Resampling-Based
 61:10-16.
} \seealso{ -tight.clust +tight.clust, + \code{\link{clusterSingle}},\code{\link{mainClustering}},\code{\link{subsampleClustering}} } diff --git a/man/subsampleClustering.Rd b/man/subsampleClustering.Rd index 14ec3a8a..fb2a45ad 100644 --- a/man/subsampleClustering.Rd +++ b/man/subsampleClustering.Rd @@ -1,81 +1,110 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/subsampleClustering.R +\docType{methods} \name{subsampleClustering} \alias{subsampleClustering} +\alias{subsampleClustering,character-method} +\alias{subsampleClustering,ClusterFunction-method} \title{Cluster subsamples of the data} \usage{ -subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, - classifyMethod = c("All", "InSample", "OutOfSample"), - classifyFunction = NULL, resamp.num = 100, samp.p = 0.7, ncores = 1, - ...) +\S4method{subsampleClustering}{character}(clusterFunction, ...) + +\S4method{subsampleClustering}{ClusterFunction}(clusterFunction, x = NULL, + diss = NULL, distFunction = NA, clusterArgs = NULL, + classifyMethod = c("All", "InSample", "OutOfSample"), resamp.num = 100, + samp.p = 0.7, ncores = 1, checkArgs = TRUE, checkDiss = TRUE, ...) } \arguments{ -\item{x}{the data on which to run the clustering (samples in columns).} +\item{clusterFunction}{a \code{\link{ClusterFunction}} object that defines +the clustering routine. See \code{\link{ClusterFunction}} for required +format of user-defined clustering routines. User can also give a character +value to the argument \code{clusterFunction} to indicate the use of +clustering routines provided in package. Type +\code{\link{listBuiltInFunctions}} at command prompt to see the built-in +clustering routines. If \code{clusterFunction} is missing, the default is +set to "pam".} + +\item{...}{arguments passed to mclapply (if ncores>1).} -\item{k}{number of clusters to find for each clustering of a subsample -(passed to clusterFunction).} +\item{x}{the data on which to run the clustering (samples in columns).} -\item{clusterFunction}{a function that clusters a \code{p x n} matrix of -data. Can also be given character values 'pam' or 'kmeans' to indicate use -of internal wrapper functions. Must accept arguments 'x' and 'k' (whether -uses them or not). See Details for format of what must return.} +\item{diss}{a dissimilarity matrix on which to run the clustering.} -\item{clusterArgs}{a list of parameter arguments to be passed to -clusterFunction.} +\item{distFunction}{a distance function to be applied to \code{D}. Only +relevant if input is only \code{x} (a matrix of data), and +\code{diss=NULL}. See details of \code{\link{clusterSingle}} for the +required format of the distance function.} -\item{classifyMethod}{method for determining which samples should be used in -the co-occurance matrix. "All"= all samples, "OutOfSample"= those not -subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample" -require that you provide classifyFunction to define how to classify those -samples not in the subsample into a cluster. If "All" is chosen, all -samples will be classified into clusters via the classifyFunctions, not -just those that are out-of-sample. Note if not choose 'All' possible to get -NAs in resulting D matrix (particularly if not enough subsamples taken).} +\item{clusterArgs}{a list of parameter arguments to be passed to the function +defined in the \code{clusterFunction} slot of the \code{ClusterFunction} +object. 
For any given \code{\link{ClusterFunction}} object, use function
+\code{\link{requiredArgs}} to get a list of required arguments for the
+object.}

-\item{classifyFunction}{a function which, given the output of clusterFunction
-and new data points, will classify the new data points into a cluster.}
+\item{classifyMethod}{method for determining which samples should be used in
+calculating the co-occurrence matrix. "All"= all samples, "OutOfSample"=
+those not subsampled, and "InSample"=those in the subsample. See details
+for explanation.}

 \item{resamp.num}{the number of subsamples to draw.}

 \item{samp.p}{the proportion of samples to sample for each subsample.}

-\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will
+\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will
 be called.}

-\item{...}{arguments passed to mclapply (if ncores>1).}
+\item{checkArgs}{logical as to whether to give a warning if arguments are
+given that don't match the clustering choices. Otherwise, inapplicable
+arguments will be ignored without warning.}
+
+\item{checkDiss}{logical. Whether to check whether the input \code{diss} is
+valid.}
 }
 \value{
-A \code{n x n} matrix of co-occurances.
+A \code{n x n} matrix of co-occurrences, i.e. a symmetric matrix with
+  [i,j] entries equal to the percentage of subsamples where the ith and jth
+  sample were clustered into the same cluster. The percentage is only out of
+  those subsamples where the ith and jth samples were both assigned to a
+  cluster. If \code{classifyMethod=="All"}, this is all subsamples for all
+  i,j pairs. But if \code{classifyMethod=="InSample"} or
+  \code{classifyMethod=="OutOfSample"}, then the percentage is only taken on
+  those subsamples where the ith and jth sample were both in or out of
+  sample, respectively, relative to the subsample.
 }
 \description{
-Given a data matrix, this function will subsample the rows
-(samples), cluster the subsamples, and return a \code{n x n} matrix with the
-probability of co-occurance.
+Given input data, this function will subsample the samples, cluster the
+subsamples, and return a \code{n x n} matrix with the probability of
+co-occurrence.
 }
 \details{
-The \code{clusterFunction} must be a function that takes as an
-  argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It
-  minimally must return a list with element named 'clustering' giving the
-  vector of cluster ids. To be incorporated with the larger hierarchy, it
-  should be list with elements of a partition object, just as is returned by
-  \code{\link[cluster]{pam}}. Generally, the user will need to write a
-  wrapper function to do this. In the case of pam or kmeans, the user can
-  identify clusterFunction as "pam" or "kmeans", and the package functions
-  will use internally written wrappers for the clusterFunction and
-  classifyFunction arguments. Additional arguments should be supplied via
-  clusterArgs.
+\code{subsampleClustering} is not usually called directly by the
+  user. It is only an exported function so as to be able to clearly document
+  the arguments for \code{subsampleClustering} which can be passed via the
+  argument \code{subsampleArgs} in functions like \code{\link{clusterSingle}}
+  and \code{\link{clusterMany}}.

-The classifyFunction should take as an object a data matrix 'x' with
-  samples on the columns, and the output of the clusterFunction. Note that the
-  function should assume that the input 'x' is not the same samples that were
-  input to the clusterFunction (but can assume that it is the same number of
-  features/columns).
+\code{classifyMethod:} The choice of "All" or "OutOfSample" for
+  \code{classifyMethod} requires the classification of arbitrary samples not
+  originally in the clustering to clusters; this is done via the classifyFUN
+  provided in the \code{\link{ClusterFunction}} object. If the
+  \code{\link{ClusterFunction}} object does not have such a function to
+  define how to classify into a cluster samples not in the subsample that
+  created the clustering, then \code{classifyMethod} must be
+  \code{"InSample"}. Note that if "All" is chosen, all samples will be
+  classified into clusters via the classifyFUN, not just those that are
+  out-of-sample; this could result in different assignments to clusters for
+  the in-sample samples than their original assignment by the clustering,
+  depending on the classification function. If you do not choose 'All', it is
+  possible to get NAs in the resulting S matrix (particularly when not enough
+  subsamples are taken) which can cause errors if you then pass the resulting
+  D=1-S matrix to \code{\link{mainClustering}}. For this reason the default is
+  "All".
 }
 \examples{
 data(simData)
+coOccur <- subsampleClustering(clusterFunction="kmeans", x=simData,
+clusterArgs=list(k=3,nstart=10), resamp.num=100, samp.p=0.7)

-subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans",
-clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7)
-
-heatmap(subD)
+#visualize the resulting co-occurrence matrix
+plotHeatmap(coOccur)
 }
diff --git a/man/workflowClusters.Rd b/man/workflowClusters.Rd
index 8549e46c..12844c73 100644
--- a/man/workflowClusters.Rd
+++ b/man/workflowClusters.Rd
@@ -31,11 +31,11 @@

 \item{whichCluster}{which cluster to set to current in the workflow}

-\item{eraseOld}{logical. Only relevant if input \code{x} is of class
-\code{ClusterExperiment}. If TRUE, will erase existing workflow results
-(clusterMany as well as mergeClusters and combineMany). If FALSE, existing
+workflow results will have "\code{_i}" added to the clusterTypes value,
+where \code{i} is one more than the largest such existing workflow
 clusterTypes.}

 \item{clusterLabel}{optional string value to give to cluster set to be
 "final"}
diff --git a/tests/checkClusterMany/clusterManyTest.R b/tests/checkClusterMany/clusterManyTest.R
index 87024d4b..99246688 100644
--- a/tests/checkClusterMany/clusterManyTest.R
+++ b/tests/checkClusterMany/clusterManyTest.R
@@ -1,10 +1,13 @@
 #Usage: nohup RScript clusterManyTest.R &
-
+# If you get a corrupted file, it was probably copied from a laptop or elsewhere that only has the tag (the git-lfs pointer).
+# If so, run: git lfs checkout L5_sumExp.rda
 library(devtools)
 load_all()
 #install.packages(pkgs="../../../clusterExperiment",repos=NULL,type="source")
 #library(clusterExperiment)
 load("L5_sumExp.rda")
+outpath<-"resultsDirectory"
+if(!file.exists(outpath)) dir.create(outpath)
 ncores<-5
 args<-commandArgs(TRUE)
 if(length(args)==0) stop("Usage should be 'RScript clusterManyTest.R <tag>' where <tag> will be the name on the saved file of output.")
@@ -15,7 +18,7 @@ x<-sessionInfo()
 version<-x$otherPkgs[["clusterExperiment"]][["Version"]]
 nm<-paste(tag,"_",version,sep="")

-outfile<-paste(nm,".Rout",sep="")
+outfile<-file.path(outpath,paste(nm,".Rout",sep=""))
 cat("Results for test of",version,"\n",file=outfile)
 cat("-------------------\n",file=outfile,append=TRUE)
 cat("Running clusterMany...",file=outfile,append=TRUE)
@@ -27,29 +30,31 @@ cat("Running clusterMany...",file=outfile,append=TRUE)
 #                  clusterFunction="kmeans",
 #                  clusterArgs=list(nstart=1)),
 #                seqArgs=list(beta=0.9,k.min=3,verbose=FALSE),
-#                clusterDArgs=list(minSize=5, verbose=FALSE),
+#                mainClusterArgs=list(minSize=5, verbose=FALSE),
 #                random.seed=21321, run=TRUE)
 cl <-clusterMany(l5, dimReduce = "PCA", nPCADims = 50, isCount=TRUE,
                  ks=4:8, clusterFunction="hierarchical01",
-                 beta=0.9, minSize=5,
+                 beta=0.9, minSize=5, mainClusterArgs=list(clusterArgs=list("whichHierDist"="dist")), #added this to be back-compatible with previous defaults.
+                 seqArgs=list(top.can=15),#added this to be back-compatible with previous defaults.
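+                 #(pinning both of these previous defaults keeps the output comparable to the fixed reference file read in below)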
                 alphas=c(0.2,0.3),
                 subsample=TRUE, sequential=TRUE,
                 ncores=ncores,
                 subsampleArgs=list(resamp.num=20,
                 clusterFunction="kmeans",
                 clusterArgs=list(nstart=1)),
                 random.seed=21321, run=TRUE)
+
 #save(cl, file=paste(tag,"_",version,".rda",sep=""))
 cat("done.",file=outfile,append=TRUE)
 mat<-clusterMatrix(cl)
 row.names(mat)<-colnames(cl)
 matFile<-paste(nm,".txt",sep="")
-write.table(mat,file=matFile,sep=",",col.names = TRUE,row.names = TRUE)
+write.table(mat,file=file.path(outpath,matFile),sep=",",col.names = TRUE,row.names = TRUE)
 cat("Current Version:",version,"\n",file=outfile,append=TRUE)
 cat("User-given tag:",tag,"\n",file=outfile,append=TRUE)
 ##Read both in, just to make sure not catching differences due to write/read differences
 cat("Compare",matFile,"to fixed version (", fixedVersion,")", ":\n",file=outfile,append=TRUE)
 compMat<-read.table(fixedVersion,sep=",",header=TRUE)
-newMat<-read.table(matFile,sep=",",header=TRUE)
+newMat<-read.table(file.path(outpath,matFile),sep=",",header=TRUE)
 compResult<-all.equal(compMat,newMat)
 printResult<-if(isTRUE(compResult)) "Yes" else "No"
 cat("Are all entries the same?\n",printResult,"\n",file=outfile,append=TRUE)
diff --git a/tests/testthat/test_RSEC.R b/tests/testthat/test_RSEC.R
index 35d28150..8978858a 100644
--- a/tests/testthat/test_RSEC.R
+++ b/tests/testthat/test_RSEC.R
@@ -1,23 +1,25 @@
 context("RSEC")
 source("create_objects.R")

 test_that("`RSEC` works with matrix, clusterExperiment, summarizedExperiment",{
-    ##these examples don't do dendrogram/merge because all -1 after combineMany
-    ##only tests clusterMany, combineMany parts.
-    RSEC(x=mat, isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
+    ##these examples don't do dendrogram/merge because all -1 after combineMany
+    ##only tests clusterMany, combineMany parts.
+    ##so can't do expect_silent, because it returns a NOTE about that issue.
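+    ##run RSEC on each supported input type in turn (matrix, ClusterExperiment, SummarizedExperiment):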
+    rsecOut1<-RSEC(x=mat, isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
         subsampleArgs=list(resamp.num=5),random.seed=495
     )
-    rsecOut<-RSEC(x=cc, isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
+    rsecOut2<-RSEC(x=cc, isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
         subsampleArgs=list(resamp.num=5),random.seed=495
     )
-    RSEC(x=ccSE,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
+    rsecOut3<-RSEC(x=ccSE,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
         subsampleArgs=list(resamp.num=5),random.seed=495)
-    RSEC(x=se,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
+    rsecOut4<-RSEC(x=se,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",
         subsampleArgs=list(resamp.num=5),random.seed=495)

     #test rerunClusterMany argument:
-    RSEC(rsecOut,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=TRUE,subsampleArgs=list(resamp.num=5),random.seed=495)
-    RSEC(rsecOut,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=FALSE,subsampleArgs=list(resamp.num=5),random.seed=495)
+    rsecOut5<-RSEC(rsecOut2,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=TRUE,subsampleArgs=list(resamp.num=5),random.seed=495)
+    #makes a dendrogram, so important to have here so that it catches the defaults of RSEC...
+    rsecOut6<-RSEC(rsecOut2,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=FALSE,subsampleArgs=list(resamp.num=5),random.seed=495)
 })
-
+
 test_that("`RSEC` works through whole series of steps",{
 #bigger example where actually goes through all the steps (above skips the merging, in particular, because no dendrogram); takes some time:
 rsecOut<-RSEC(x=assay(seSimCount), isCount=TRUE,dimReduce="none",
@@ -26,23 +28,23 @@ rsecOut<-RSEC(x=assay(seSimCount), isCount=TRUE,dimReduce="none",
     subsampleArgs=list(resamp.num=5),random.seed=495
 )
 ##check same as individual steps
-    ceOut<-clusterMany(x=assay(seSimCount),ks=4:5,clusterFunction="tight",alphas=0.1,betas=0.9,minSizes=1,
+    expect_silent(ceOut<-clusterMany(x=assay(seSimCount),ks=4:5,clusterFunction="tight",alphas=0.1,betas=0.9,minSizes=1,
                    isCount=TRUE, dimReduce="none", transFun = NULL,
                    sequential=TRUE,removeSil=FALSE,subsample=TRUE,silCutoff=0,distFunction=NA,
                    nVarDims=NA,nPCADims=NA,
-                   clusterDArgs=NULL,subsampleArgs=list(resamp.num=5),
+                   mainClusterArgs=NULL,subsampleArgs=list(resamp.num=5),
                    ncores=1,run=TRUE,seqArgs=list(verbose=FALSE),random.seed=495
-    )
+    ))
 expect_equal(clusterMatrix(rsecOut,whichClusters="clusterMany"),clusterMatrix(ceOut))
-
+    #gives 'note', can't use expect_silent
 combOut<-combineMany(ceOut, proportion = 0.7,minSize = 5)
 expect_equal(clusterMatrix(rsecOut,whichClusters="combineMany"),clusterMatrix(combOut,whichClusters="combineMany"))
 expect_equal(coClustering(rsecOut),coClustering(combOut))
-
-    dendOut<-makeDendrogram(combOut,dimReduce="none",ndims=NA)
+
+    expect_silent(dendOut<-makeDendrogram(combOut,dimReduce="none",ndims=NA))
 expect_equal(dendOut@dendro_clusters,rsecOut@dendro_clusters)
 expect_equal(dendOut@dendro_outbranch,rsecOut@dendro_outbranch)
-
+    #now should be the same, check all objects except dendro_samples because very big:
 mergeOut<-mergeClusters(dendOut,mergeMethod = "adjP",
     cutoff = 0.05,isCount=TRUE)
 expect_equal(dendroClusterIndex(mergeOut),dendroClusterIndex(rsecOut))
@@ -53,3 +55,11 @@ rsecOut<-RSEC(x=assay(seSimCount), isCount=TRUE,dimReduce="none",
 expect_equal(clusterTypes(rsecOut),clusterTypes(mergeOut))
 })

+test_that("`RSEC` works with no merging",{
+    #same bigger example as above, but with mergeMethod="none" so that the merging step is skipped; takes some time:
+    rsecOut<-RSEC(x=assay(seSimCount), isCount=TRUE,dimReduce="none",
+        k0s=4:5,clusterFunction="tight", alphas=0.1,
+        betas=0.9,dendroReduce="none",minSizes=1,
+        subsampleArgs=list(resamp.num=5),random.seed=495,
+        mergeMethod="none")
+})
diff --git a/tests/testthat/test_clusterMany.R b/tests/testthat/test_clusterMany.R
index a583fd71..5b718313 100644
--- a/tests/testthat/test_clusterMany.R
+++ b/tests/testthat/test_clusterMany.R
@@ -3,19 +3,20 @@ source("create_objects.R")

 test_that("`clusterMany` works with matrix, list of data, ClusterExperiment objects, and SummarizedExperiments", {
-    clustNothing <- clusterMany(mat, ks=c(3,4),clusterFunction=c("pam","hierarchicalK","hierarchical01","tight"),
+    #check all builtin methods
+    expect_silent(clustNothing <- clusterMany(mat, ks=c(3,4),clusterFunction=listBuiltInFunctions(),
                                 subsample=FALSE, sequential=FALSE,
-                                isCount=FALSE,verbose=FALSE)
-    clustDF <- clusterMany(data.frame(mat), ks=c(3,4),clusterFunction=c("pam","hierarchicalK","hierarchical01","tight"),
+                                isCount=FALSE,verbose=FALSE))
+    expect_silent(clustDF <- clusterMany(data.frame(mat), ks=c(3,4),clusterFunction=listBuiltInFunctions(),
                                 subsample=FALSE, sequential=FALSE,
-                                isCount=FALSE,verbose=FALSE)
+                                isCount=FALSE,verbose=FALSE))
     expect_is(clustNothing, "ClusterExperiment")
     expect_is(clustNothing, "SummarizedExperiment")

-    clustNothing2 <- clusterMany(se, ks=c(3,4),clusterFunction="pam",
+    expect_silent(clustNothing2 <- clusterMany(se, ks=c(3,4),clusterFunction="pam",
                                  subsample=FALSE, sequential=FALSE,
-                                 isCount=FALSE,verbose=FALSE)
+                                 isCount=FALSE,verbose=FALSE))
     expect_equal(colData(clustNothing2),colData(se))
     expect_equal(rownames(clustNothing2),rownames(se))
     expect_equal(colnames(clustNothing2),colnames(se))
@@ -26,9 +27,9 @@ test_that("`clusterMany` works with matrix, list of data, ClusterExperiment obje
     expect_true(all(clusterTypes(clustNothing)=="clusterMany"))

     #test running on clusterExperiment Object -- should add the new clustering
-    clustNothing3 <- clusterMany(ccSE, ks=c(3,4),clusterFunction="pam",
+    expect_silent(clustNothing3 <- clusterMany(ccSE, ks=c(3,4),clusterFunction="pam",
                                  subsample=FALSE, sequential=FALSE,
-                                 isCount=FALSE,verbose=FALSE)
+                                 isCount=FALSE,verbose=FALSE))
     expect_true(nClusters(clustNothing3) == nClusters(ccSE) + 2)
     expect_equal(colData(clustNothing3),colData(ccSE))
     expect_equal(rownames(clustNothing3),rownames(ccSE))
@@ -36,15 +37,13 @@ test_that("`clusterMany` works with matrix, list of data, ClusterExperiment obje
     expect_equal(metadata(clustNothing3),metadata(ccSE))
     expect_equal(rowData(clustNothing3),rowData(ccSE))

-    test <- clusterSingle(se, clusterFunction="pam",
-                          subsample=FALSE, sequential=FALSE,
-                          clusterDArgs=list(k=4),isCount=FALSE)
-    clustNothing3<- clusterMany(test, ks=c(3,4),clusterFunction="pam",
+    expect_silent(test <- clusterSingle(se, subsample=FALSE, sequential=FALSE, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=4)),isCount=FALSE))
+    expect_silent(clustNothing3<- clusterMany(test,
ks=c(3,4),clusterFunction="pam", subsample=FALSE, sequential=FALSE,verbose=FALSE, - isCount=FALSE) - clustNothing4<- clusterMany(clustNothing3, ks=c(3:4),clusterFunction="pam", + isCount=FALSE)) + expect_silent(clustNothing4<- clusterMany(clustNothing3, ks=c(3:4),clusterFunction="pam", subsample=FALSE, sequential=FALSE,verbose=FALSE, - isCount=FALSE,eraseOld=TRUE) + isCount=FALSE,eraseOld=TRUE)) expect_equal(clustNothing3,clustNothing4) clustNothing5<- clusterMany(clustNothing3, ks=c(5:6),clusterFunction="pam", @@ -59,16 +58,17 @@ test_that("`clusterMany` works with matrix, list of data, ClusterExperiment obje }) test_that("`clusterMany` works changing parameters", { #check dim reduce - cc <- clusterMany(mat, ks=c(3,4),nVarDim=c(10,15),nPCADim=c(3,4),dimReduce=c("none","PCA","var","cv","mad"),clusterFunction="pam", + expect_silent(cc <- clusterMany(mat, ks=c(3,4),nVarDim=c(10,15),nPCADim=c(3,4),dimReduce=c("none","PCA","var","cv","mad"),clusterFunction="pam", subsample=FALSE, sequential=FALSE,verbose=FALSE, isCount=FALSE) + ) #check giving paramMatrix - param <- clusterMany(mat, ks=c(3,4),nVarDim=c(10,15),nPCADim=c(3,4),dimReduce=c("none","PCA","var"),clusterFunction="pam", + expect_silent(param <- clusterMany(mat, ks=c(3,4),nVarDim=c(10,15),nPCADim=c(3,4),dimReduce=c("none","PCA","var"),clusterFunction="pam", subsample=FALSE, sequential=FALSE,run=FALSE,verbose=FALSE, - isCount=FALSE) + isCount=FALSE)) # cc2 <- clusterMany(mat, ks=c(3,4),nVarDim=c(10, 15),nPCADim=c(3,4),dimReduce=c("none","PCA","var"),clusterFunction="pam", # subsample=FALSE, sequential=FALSE,verbose=FALSE, - # isCount=FALSE,paramMatrix=param$paramMatrix,clusterDArgs=param$clusterDArgs,seqArgs=param$seqArgs,subsampleArgs=param$subsampleArgs) + # isCount=FALSE,paramMatrix=param$paramMatrix,mainClusterArgs=param$mainClusterArgs,seqArgs=param$seqArgs,subsampleArgs=param$subsampleArgs) # expect_equal(cc,cc2) # #check giving distance -- this still doesn't work. 
@@ -79,18 +79,18 @@ test_that("`clusterMany` works changing parameters", {
 #                              subsample=FALSE, sequential=FALSE,verbose=FALSE,
 #                              isCount=FALSE)

-    #check doesn't spit out warnings because alphas/clusterD args not match
+    #check doesn't spit out warnings when alphas/mainClustering args don't match
     expect_silent(clusterMany(mat, clusterFunction=c("pam","hierarchical01"),ks=c(3,4),
                    alphas=c(0.1,0.2),
                    subsample=FALSE, sequential=FALSE,verbose=FALSE,
-                   clusterDArgs=list(clusterArgs=list(evalClusterMethod="average")),
+                   mainClusterArgs=list(clusterArgs=list(evalClusterMethod="average")),
                    isCount=FALSE))

-    #check doesn't spit out warnings because alphas/clusterD args not match
+    #check doesn't spit out warnings when alphas/mainClustering args don't match
     expect_silent(clusterMany(mat, clusterFunction=c("pam","hierarchical01"),ks=c(3,4),
                    betas=c(.7,.9), minSizes=c(3,5),
                    subsample=FALSE, sequential=FALSE,verbose=FALSE,
-                   clusterDArgs=list(clusterArgs=list(evalClusterMethod="average")),
+                   mainClusterArgs=list(clusterArgs=list(evalClusterMethod="average")),
                    isCount=FALSE))
 })
diff --git a/tests/testthat/test_clusterSingle.R b/tests/testthat/test_clusterSingle.R
index 5fee6e83..2a7f9678 100644
--- a/tests/testthat/test_clusterSingle.R
+++ b/tests/testthat/test_clusterSingle.R
@@ -4,252 +4,302 @@ source("create_objects.R")

 test_that("`clusterSingle` works with matrix, ClusterExperiment objects, and SummarizedExperiments", {
-    clustNothing <- clusterSingle(mat, clusterFunction="pam",
+    expect_silent(clustNothing <- clusterSingle(mat,
                                   subsample=FALSE, sequential=FALSE,
-                                  clusterDArgs=list(k=3),isCount=FALSE)
+                                  mainClusterArgs=list(clusterArgs=list(k=3), clusterFunction="pam"), isCount=FALSE))
     expect_equal(clusterLabels(clustNothing),"clusterSingle")
     expect_is(clustNothing, "ClusterExperiment")
     expect_is(clustNothing, "SummarizedExperiment")

     #test clusterLabel
-    clustNothing2 <- clusterSingle(mat, clusterFunction="pam",
+    expect_silent(clustNothing2 <- clusterSingle(mat, mainClusterArgs=list(clusterArgs=list(k=3),clusterFunction="pam"),
                                    subsample=FALSE, sequential=FALSE,
-                                   clusterDArgs=list(k=3),isCount=FALSE,clusterLabel="myownClustering")
+                                   isCount=FALSE,clusterLabel="myownClustering"))
     expect_equal(clusterLabels(clustNothing2),"myownClustering")

     #test default 01 distance
-    x1 <- clusterSingle(mat, clusterFunction="tight",
+    expect_silent(x1 <- clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(alpha=0.1),clusterFunction="tight"),
                         subsample=FALSE, sequential=FALSE,
-                        isCount=FALSE)
-    expect_error(clusterSingle(mat, clusterFunction="tight",
-                               subsample=FALSE, sequential=FALSE,
-                               clusterDArgs=list(distFunction=function(x){dist(x,method="manhattan")}),isCount=FALSE),"distance function must give values between 0 and 1")
-
-    #test default 01 distance
-    x2<-clusterSingle(mat, clusterFunction="tight",
-                      subsample=FALSE, sequential=FALSE,
-                      isCount=FALSE)
+                        isCount=FALSE))
+    #error because not 01 distance
+    expect_error(clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(alpha=0.1),clusterFunction="tight",distFunction=function(x){dist(x,method="manhattan")}),
+                               subsample=FALSE, sequential=FALSE,isCount=FALSE),"distance function must give values between 0 and 1")
+    #test default K distance
+    expect_silent(x2 <- clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(k=3),clusterFunction="hierarchicalK"),subsample=FALSE, sequential=FALSE, isCount=FALSE))

+    #warn wrong arguments
+    expect_warning(clusterSingle(mat, mainClusterArgs=
 list(clusterArgs=list(k=3,alpha=0.1),clusterFunction="tight"),
                                  subsample=FALSE, sequential=FALSE,
-                                 clusterDArgs=list(k=3),isCount=FALSE),"do not match the choice of typeAlg")
+                                 isCount=FALSE),"arguments passed via clusterArgs to the clustering function tight are not all applicable")
     #turn off warning
-    expect_silent(clusterSingle(mat, clusterFunction="tight",
-                                subsample=FALSE, sequential=FALSE,
-                                clusterDArgs=list(k=3,checkArgs=FALSE),isCount=FALSE))
+    expect_silent(clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(k=3,alpha=0.1),checkArgs=FALSE,clusterFunction="tight"),
+                                subsample=FALSE, sequential=FALSE,
+                                isCount=FALSE))

-    clustNothing2 <- clusterSingle(se, clusterFunction="pam",
-                                   subsample=FALSE, sequential=FALSE,
-                                   clusterDArgs=list(k=3),isCount=FALSE)
+    ###Apply to SE
+    expect_silent(clustNothing2 <- clusterSingle(se, mainClusterArgs=list(clusterArgs=list(k=3),clusterFunction="pam"),
+                                   subsample=FALSE, sequential=FALSE,
+                                   isCount=FALSE))
     expect_equal(clusterMatrix(clustNothing2), clusterMatrix(clustNothing))

     #test running on clusterExperiment Object -- should add the new clustering
-    clustNothing3 <- clusterSingle(clustNothing2, clusterFunction="pam",
-                                   subsample=FALSE, sequential=FALSE,
-                                   clusterDArgs=list(k=4),is=FALSE)
+    expect_silent(clustNothing3 <- clusterSingle(clustNothing2, mainClusterArgs=list(clusterArgs=list(k=4),clusterFunction="pam"),
+                                   subsample=FALSE, sequential=FALSE,
+                                   isCount=FALSE))
     expect_equal(NCOL(clusterMatrix(clustNothing3)),2)
     expect_equal(length(table(primaryCluster(clustNothing3))),4,info="Check reset primary cluster after run clusterSingle")
 })

-test_that("Different options algorithms of `clusterD` ", {
-    #check algorithms
-    clusterSingle(mat, clusterFunction="tight",
-                  subsample=FALSE, sequential=FALSE,
-                  isCount=FALSE)
-    clusterSingle(mat, clusterFunction="hierarchical01",
-                  subsample=FALSE, sequential=FALSE,
-                  isCount=FALSE)
-    clusterSingle(mat, clusterFunction="hierarchicalK", clusterDArgs=list("k"=3),
-                  subsample=FALSE, sequential=FALSE,
-                  isCount=FALSE)
-    #K algorithm options
-    clusterSingle(mat, clusterFunction="hierarchicalK",
-                  subsample=FALSE, sequential=FALSE, clusterDArgs=list(findBestK=TRUE,removeSil=TRUE),
-                  isCount=FALSE)
-    clusterSingle(mat, clusterFunction="pam", clusterDArgs=list(findBestK=TRUE,removeSil=TRUE),
-                  subsample=FALSE, sequential=FALSE,
-                  isCount=FALSE)
-
+
+    # > clustSeqHier_v2 <- clusterSingle(simData,
+    # +  sequential=FALSE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7,
+    # +  clusterFunction="kmeans", clusterArgs=list(nstart=10)),
+    # +  seqArgs=list(beta=0.8, k0=5), mainClusterArgs=list(minSize=5,clusterFunction="hierarchical01"))
+    # Error in .local(x, diss, ...) :
+    # For the clusterFunction algorithm type (' 01 ') given in 'mainClusterArgs', must supply arguments: alpha These must be supplied as elements of the list of 'clusterArgs' given in 'mainClusterArgs'
+    # > set.seed(44261)
+    # > clustSeqHier_v2 <- clusterSingle(simData,
+    # +  sequential=FALSE, subsample=TRUE, subsampleArgs=list(resamp.n=100, samp.p=0.7,
+    # +  clusterFunction="kmeans", clusterArgs=list(nstart=10)),
+    # +  seqArgs=list(beta=0.8, k0=5), mainClusterArgs=list(minSize=5,clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1)))


+test_that("Different options algorithms of `mainClustering` ", {
+    #check builtIn algorithms
+    #bigger matrix so as not to kill spectral
+    set.seed(3325)
+    biggerMat<-matrix(data=rnorm(20*50), ncol=50)
+
+    kMethods<-listBuiltInTypeK()
+    for(cf in kMethods){
+        expect_silent(clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(k=3), clusterFunction=cf),
+                      subsample=FALSE, sequential=FALSE,isCount=FALSE)
+        )
+        #post-processing arguments for type 'K'
+        #Upped
+        expect_silent(clusterSingle(biggerMat, mainClusterArgs= list(clusterArgs=list(k=3), clusterFunction=cf,findBestK=TRUE,removeSil=TRUE), subsample=FALSE, sequential=FALSE,isCount=FALSE))
+
+    }
+    aMethods<-listBuiltInType01()
+    for(cf in aMethods){
+        expect_silent(clusterSingle(mat, mainClusterArgs= list(clusterArgs=list(alpha=0.1),clusterFunction=cf),
+                      subsample=FALSE, sequential=FALSE,isCount=FALSE))
+    }
+
     ########
-    #Check clusterD
+    #Check mainClustering
     ########
     ###Check pam exactly same:
-    x<-clusterD(mat, clusterFunction="pam",k=3,
-                minSize=1, removeSil=FALSE)
+    expect_silent(x<-mainClustering(mat, clusterFunction="pam",clusterArgs=list(k=3),
+                minSize=1, removeSil=FALSE))
     expect_equal(length(x),ncol(mat))
     x2<-cluster::pam(t(mat),k=3,cluster.only=TRUE)
     expect_equal(x,x2)
     ###Check hierarchicalK exactly same:
-    x<-clusterD(mat, clusterFunction="hierarchicalK",k=3,
-                minSize=1, removeSil=FALSE)
+    expect_silent(x<-mainClustering(mat, clusterFunction="hierarchicalK",clusterArgs=list(k=3),
+                minSize=1, removeSil=FALSE))
     expect_equal(length(x),ncol(mat))
     x2<-stats::cutree(stats::hclust(dist(t(mat))),k=3)
     expect_equal(x,x2)

     #check giving wrong parameters gives warning:
-    expect_warning(clusterD(mat, clusterFunction="tight", alpha=0.1,
-                            minSize=5, removeSil=TRUE),"do not match the choice of typeAlg")
-    expect_warning(clusterD(mat, clusterFunction="pam", alpha=0.1,
-                            minSize=5, removeSil=TRUE, findBestK=TRUE),"do not match the choice of typeAlg")
-    expect_warning(clusterD(mat, clusterFunction="tight", alpha=0.1,
-                            clusterArgs=list(evalClusterMethod="average")),"arguments passed via clusterArgs")
-    expect_warning(clusterD(mat, clusterFunction="hierarchical01", alpha=0.1,
-                            clusterArgs=list(minSize.core=4)),"arguments passed via clusterArgs")
+    expect_warning(mainClustering(mat, clusterFunction="tight", clusterArgs=list(alpha=0.1),
+                            minSize=5, removeSil=TRUE),"do not match the algorithmType")
+    expect_error(mainClustering(mat, clusterFunction="tight", clusterArgs=list(k=3),
+                            minSize=5, removeSil=TRUE),"must supply arguments alpha")
+    expect_error(mainClustering(mat, clusterFunction="pam", clusterArgs=list(alpha=0.1),
+                            minSize=5, removeSil=TRUE),"must supply arguments k")
+    expect_warning(mainClustering(mat, clusterFunction="tight", clusterArgs=list(k=3,alpha=0.1),
+                            minSize=5),"arguments passed via clusterArgs to the clustering function tight are not all applicable")
+    expect_warning(mainClustering(mat, clusterFunction="pam", clusterArgs=list(k=3,alpha=0.1),
+                            minSize=5,
removeSil=TRUE),"arguments passed via clusterArgs to the clustering function pam are not all applicable") + + + + expect_warning(mainClustering(mat, clusterFunction="tight", clusterArgs=list(alpha=0.1, evalClusterMethod="average")),"arguments passed via clusterArgs to the clustering function tight are not all applicable") + expect_warning(mainClustering(mat, clusterFunction="hierarchical01", clusterArgs=list(alpha=0.1, minSize.core=4)),"arguments passed via clusterArgs to the clustering function hclust are not all applicable") + + #test default 01 distance + expect_silent(mainClustering(mat, clusterFunction="tight", clusterArgs=list(alpha=0.1))) + #test default K distance + expect_silent(mainClustering(mat, clusterFunction="hierarchicalK", clusterArgs=list(k=3))) + + #check turn off if checkArgs=TRUE - expect_silent(clusterD(mat, clusterFunction="tight", alpha=0.1,checkArgs=FALSE, + expect_silent(mainClustering(mat, clusterFunction="tight", clusterArgs=list(alpha=0.1),checkArgs=FALSE, minSize=5, removeSil=TRUE)) - expect_silent(clusterD(mat, clusterFunction="pam", alpha=0.1,checkArgs=FALSE, + expect_silent(mainClustering(mat, clusterFunction="pam", clusterArgs=list(alpha=0.1),checkArgs=FALSE, minSize=5, removeSil=TRUE, findBestK=TRUE)) - expect_silent(clusterD(mat, clusterFunction="tight", alpha=0.1,checkArgs=FALSE, - clusterArgs=list(evalClusterMethod="average"))) - expect_silent(clusterD(mat, clusterFunction="hierarchical01", alpha=0.1,checkArgs=FALSE, - clusterArgs=list(minSize.core=4))) + expect_silent(mainClustering(mat, clusterFunction="tight", clusterArgs=list(alpha=0.1,evalClusterMethod="average"),checkArgs=FALSE)) + expect_silent(mainClustering(mat, clusterFunction="hierarchical01", checkArgs=FALSE, + clusterArgs=list(alpha=0.1,minSize.core=4))) }) -test_that("Different options of subsampling",{ - #check subsample - clustSubsample <- clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=3, k=3), - clusterDArgs=list(k=3),isCount=FALSE) - expect_equal(NCOL(coClustering(clustSubsample)),NCOL(mat)) - clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=3, k=3,clusterFunction="kmeans"), - clusterDArgs=list(k=3),isCount=FALSE) - set.seed(1045) - clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=20, k=3,classifyMethod="InSample"), - clusterDArgs=list(k=3),isCount=FALSE) - set.seed(1045) - clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=40, k=3,classifyMethod="OutOfSample"), - clusterDArgs=list(k=3),isCount=FALSE) - set.seed(1045) - expect_error(clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=20, k=3,classifyMethod="OutOfSample"), - clusterDArgs=list(k=3),isCount=FALSE),"NA values found in D") - - #errors in missing args in subsample - expect_warning(clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=3), - clusterDArgs=list(k=3), isCount=FALSE), - "did not give 'k' in 'subsampleArgs'.") - expect_error(clusterSingle(mat, clusterFunction="pam", - subsample=TRUE, sequential=FALSE, - subsampleArgs=list(resamp.num=3), isCount=FALSE), - "must pass 'k' in subsampleArgs") - -}) - -test_that("Different options of clusterD",{ +test_that("Different options of mainClustering",{ #check errors and warnings - expect_error(clusterSingle(mat, 
clusterFunction="pam", - subsample=FALSE, sequential=TRUE, - seqArgs=list(verbose=FALSE), - isCount=FALSE,clusterDArgs=list("typeAlg"=="K")), + expect_error(clusterSingle(mat, subsample=FALSE, sequential=TRUE, seqArgs=list(verbose=FALSE), isCount=FALSE,mainClusterArgs=list(clusterFunction="pam")), "seqArgs must contain element 'k0'") - expect_error(clusterSingle(mat, clusterFunction="pam", - subsample=FALSE, sequential=TRUE, - seqArgs=list(verbose=FALSE), - isCount=FALSE, clusterDArgs=list("findBestK"==TRUE)), + expect_error(clusterSingle(mat, subsample=FALSE, sequential=TRUE, seqArgs=list(verbose=FALSE), isCount=FALSE, mainClusterArgs=list(clusterFunction="pam","findBestK"==TRUE)), "seqArgs must contain element 'k0'") - expect_warning(clusterSingle(mat, clusterFunction="tight", - subsample=FALSE, sequential=FALSE, - clusterDArgs=list(k=3), isCount=FALSE), - "do not match the choice of typeAlg") - expect_warning(clusterSingle(mat, clusterFunction="tight", + expect_error(clusterSingle(mat, subsample=FALSE, sequential=FALSE, - clusterDArgs=list(findBestK=TRUE),isCount=FALSE), - "do not match the choice of typeAlg") - expect_error(clusterSingle(mat, clusterFunction="tight", + mainClusterArgs=list(clusterFunction="tight",clusterArgs=list(k=3)), isCount=FALSE), + "must supply arguments: alpha") + expect_warning(clusterSingle(mat, subsample=FALSE, sequential=FALSE, mainClusterArgs=list(clusterFunction="tight",clusterArgs=list(alpha=0.1),findBestK=TRUE),isCount=FALSE), + "Some arguments passed via '...' in mainClustering do not match the algorithmType") + expect_error(clusterSingle(mat, subsample=FALSE, sequential=FALSE, - clusterDArgs=list(distFunction=function(x){abs(cor(t(x)))}),isCount=FALSE), - "distance function must have zero values on the diagonal") + mainClusterArgs=list(clusterFunction="tight",clusterArgs=list(alpha=0.1),distFunction=function(x){abs(cor(t(x)))}),isCount=FALSE), + "Dissimilarity matrix must have zero values on the diagonal") }) -test_that("Different options of seqCluster",{ - #check sequential - clustSeq <- clusterSingle(mat, clusterFunction="pam", - subsample=FALSE, sequential=TRUE, - isCount=FALSE,seqArgs=list(k0=5,verbose=FALSE)) - expect_error(clusterSingle(mat, clusterFunction="pam", - subsample=FALSE, sequential=TRUE, - isCount=FALSE), "must give seqArgs so as to identify k0") +test_that("Different options of subsampling",{ + clustSubsample <- clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=3, clusterArgs=list(k=3)), mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=3)), isCount=FALSE) + expect_equal(NCOL(coClustering(clustSubsample)),NCOL(mat)) + + #check subsample works with all of the builtin functions and opposite type in mainClusterArgs + set.seed(3325) + biggerMat<-matrix(data=rnorm(20*100), ncol=100) + kMethods<-listBuiltInTypeK() + for(cf in kMethods){ + set.seed(1045) + expect_silent(clusterSingle(biggerMat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=20, clusterArgs=list(k=3),clusterFunction=cf,classifyMethod="InSample"), mainClusterArgs=list(clusterFunction="hierarchical01", clusterArgs=list(alpha=0.3)),isCount=FALSE)) + if(!is.null(getBuiltInFunction(cf)@classifyFUN)){ + set.seed(1045) + expect_silent(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=20, clusterArgs=list(k=3),clusterFunction=cf,classifyMethod="All"), mainClusterArgs=list(clusterFunction="hierarchical01", clusterArgs=list(alpha=0.1)),isCount=FALSE)) + set.seed(1045) + 
+      expect_silent(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=40, clusterArgs=list(k=3),clusterFunction=cf,classifyMethod="OutOfSample"), mainClusterArgs=list(clusterFunction="hierarchical01", clusterArgs=list(alpha=0.1)),isCount=FALSE))
+
+    }
+  }
+  aMethods<-listBuiltInType01()
+  for(cf in aMethods){
+
+    set.seed(1045)
+    expect_silent(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=20, clusterArgs=list(alpha=0.1),clusterFunction=cf,classifyMethod="InSample"), mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=3)),isCount=FALSE))
+    if(!is.null(getBuiltInFunction(cf)@classifyFUN)){
+      ##Check outofsample/all
+      set.seed(1045)
+      expect_silent(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=20, clusterArgs=list(alpha=0.1),clusterFunction=cf,classifyMethod="All"), mainClusterArgs=list(clusterFunction="hierarchical01", clusterArgs=list(alpha=0.1)),isCount=FALSE))
+      set.seed(1045)
+      expect_silent(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=40, clusterArgs=list(alpha=0.1),clusterFunction=cf,classifyMethod="OutOfSample"), mainClusterArgs=list(clusterFunction="hierarchical01", clusterArgs=list(alpha=0.1)),isCount=FALSE))
+
+    }
+  }
+
+  ## check that NA values in the dissimilarity give an error
+  set.seed(1045)
+  expect_error(clusterSingle(mat,
+    subsample=TRUE, sequential=FALSE,
+    subsampleArgs=list(resamp.num=20,clusterArgs=list(k=3),classifyMethod="OutOfSample"),
+    mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE),"NA values found in dissimilarity matrix")
-  clustSeq <- clusterSingle(mat, clusterFunction="tight",
-               subsample=FALSE, sequential=TRUE,
-               isCount=FALSE,seqArgs=list(k0=5,verbose=FALSE))
-  clustSeq <- clusterSingle(mat, clusterFunction="hierarchicalK",
-               subsample=FALSE, sequential=TRUE,
-               isCount=FALSE,seqArgs=list(k0=5,verbose=FALSE))
-  clustSeq <- clusterSingle(mat, clusterFunction="hierarchical01",
-               subsample=FALSE, sequential=TRUE,
-               isCount=FALSE,seqArgs=list(k0=5,verbose=FALSE))
+  #warnings in missing args in subsample -- should borrow from mainClusterArgs.
+  expect_warning(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(clusterFunction="pam",resamp.num=3), mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)), isCount=FALSE),
+    "missing arguments k provided from those in 'mainClusterArgs'")
+  #warnings in missing clusterFunction in subsample -- should borrow from mainClusterArgs.
+  expect_warning(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(resamp.num=3,clusterArgs=list(k=3)), mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)), isCount=FALSE),
+    "a clusterFunction was not set for subsampleClustering")
+  #different function types -- should error out.
+  expect_error(clusterSingle(mat, subsample=TRUE, sequential=FALSE, subsampleArgs=list(clusterFunction="pam",resamp.num=3), mainClusterArgs=list(clusterFunction="tight",clusterArgs=list(alpha=0.1)), isCount=FALSE),
+    "must supply arguments: k")
+
+})
+
+
+test_that("Different options of seqCluster",{
+  #check sequential
+  expect_silent(clustSeq <- clusterSingle(mat,subsample=FALSE, sequential=TRUE,mainClusterArgs=list(clusterFunction="pam"),isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)))
+  expect_error(clusterSingle(mat,subsample=FALSE, sequential=TRUE,mainClusterArgs=list(clusterFunction="pam"),isCount=FALSE), "if sequential=TRUE, must give seqArgs so as to identify k0 and beta")
+  expect_error(clusterSingle(mat,subsample=FALSE, sequential=TRUE,mainClusterArgs=list(clusterFunction="pam"),isCount=FALSE,seqArgs=list(k0=5,verbose=FALSE)), "seqArgs must contain element 'beta'")
+  expect_error(clusterSingle(mat,subsample=FALSE, sequential=TRUE,mainClusterArgs=list(clusterFunction="pam"),isCount=FALSE,seqArgs=list(beta=0.9,verbose=FALSE)), "seqArgs must contain element 'k0'")
+
+  #right clusterFunctions
+  expect_error(clusterSingle(mat, mainClusterArgs=list(clusterFunction="kmeans"), subsample=TRUE, sequential=TRUE, subsampleArgs=list(clusterFunction="pam",n.sample=40), isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)),
+    "If choosing subsample=TRUE, the clusterFunction used in the mainClustering step must take input that is dissimilarity")
+  expect_error(clusterSingle(mat, mainClusterArgs=list(clusterFunction="tight"), subsample=FALSE, sequential=TRUE, isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)),
+    "if subsample=FALSE, sequentical clustering can only be implemented with a clusterFunction with algorithmType 'K'")
+  #warning if try to set k
+  expect_warning(clusterSingle(mat, mainClusterArgs=list(clusterFunction="pam"), subsample=TRUE, sequential=TRUE, subsampleArgs=list(clusterFunction="pam",n.sample=40,clusterArgs=list(k=3)), isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)),
+    "Setting 'k' in subsampleArgs when sequential=TRUE is called will have no effect.")
+  expect_warning(clusterSingle(mat, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)), subsample=FALSE, sequential=TRUE, subsampleArgs=list(clusterFunction="pam",n.sample=40), isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)),
+    "Setting 'k' in mainClusterArgs when sequential clustering is requested will have no effect.")
+
+  #check all algorithms
+  kMethods<-listBuiltInTypeK()
+  for(cf in kMethods){
+    #check without subsampling
+    expect_silent(clusterSingle(mat, mainClusterArgs=list(clusterFunction=cf),
+      subsample=FALSE, sequential=TRUE,
+      isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)))
+  }
+  aMethods<-listBuiltInType01()
+  for(cf in aMethods){
+    #check with subsampling
+    expect_silent(clusterSingle(mat, mainClusterArgs=list(clusterFunction=cf),
+      subsample=TRUE, sequential=TRUE,
+      subsampleArgs=list(clusterFunction="pam",n.sample=40),
+      isCount=FALSE,seqArgs=list(k0=5,beta=0.9,verbose=FALSE)))
+  }
+
+})

 test_that("Different options of `clusterSingle` ", {
   #check isCount
-  clustCount <- clusterSingle(smSimCount, clusterFunction="pam",
+  expect_silent(clusterSingle(smSimCount,
               subsample=FALSE, sequential=FALSE,
-               clusterDArgs=list(k=3),isCount=TRUE)
-  expect_error(clusterSingle(smSimData, clusterFunction="pam",
+               mainClusterArgs=list(clusterArgs=list(k=3), clusterFunction="pam"),isCount=TRUE) )
+  expect_error(clusterSingle(smSimData,
               subsample=FALSE, sequential=FALSE,
-
clusterDArgs=list(k=3),isCount=TRUE),info="test error handling for isCount=TRUE when can't take log") + mainClusterArgs=list(clusterArgs=list(k=3), clusterFunction="pam"),isCount=TRUE),"User-supplied `transFun` produces NA values",info="test error handling for isCount=TRUE when can't take log") #check pca reduction - clustndims <- clusterSingle(mat, clusterFunction="pam", + expect_silent(clusterSingle(mat, subsample=FALSE, sequential=FALSE, dimReduce="PCA", - ndims=3, clusterDArgs=list(k=3),isCount=FALSE) - expect_error(clusterSingle(mat, clusterFunction="pam", + ndims=3, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE)) + expect_error(clusterSingle(mat, subsample=FALSE, sequential=FALSE, dimReduce="PCA", ndims=NROW(simData)+1, - clusterDArgs=list(k=3),isCount=FALSE)) + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE),"the number of PCA dimensions must be strictly less than the number of rows of input data matrix") #check var reduction - clustndims <- clusterSingle(mat, clusterFunction="pam", + expect_silent(clusterSingle(mat, subsample=FALSE, sequential=FALSE, dimReduce="var", ndims=3, - clusterDArgs=list(k=3), isCount=FALSE) - expect_error(clusterSingle(mat, clusterFunction="pam", + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)), isCount=FALSE)) + expect_error(clusterSingle(mat, subsample=FALSE, sequential=FALSE, - dimReduce="var", ndims=NROW(mat)+1, - clusterDArgs=list(k=3),isCount=FALSE), + dimReduce="var", ndims=NROW(mat)+1, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE), "the number of most variable features must be strictly less than the number of rows of input data matrix") - expect_warning(clusterSingle(mat, clusterFunction="pam", + expect_warning(clusterSingle(mat, subsample=FALSE, sequential=FALSE, - dimReduce="none",ndims =3, - clusterDArgs=list(k=3),isCount=FALSE), + dimReduce="none",ndims =3, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE), "specifying ndims has no effect if dimReduce==`none`") - clustndims <- clusterSingle(mat, clusterFunction="pam", + expect_silent(clusterSingle(mat, subsample=FALSE, sequential=FALSE, dimReduce="cv", - ndims=3, clusterDArgs=list(k=3),isCount=FALSE) - clustndims <- clusterSingle(mat, clusterFunction="pam", + ndims=3, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE)) + expect_silent(clusterSingle(mat, subsample=FALSE, sequential=FALSE, dimReduce="mad", - ndims=3, clusterDArgs=list(k=3),isCount=FALSE) + ndims=3, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE)) }) test_that("`clusterSingle` preserves the colData and rowData of SE", { - cl <- clusterSingle(se, clusterFunction="pam", + expect_silent(cl<-clusterSingle(se, subsample=FALSE, sequential=FALSE, - clusterDArgs=list(k=3),isCount=FALSE) + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3)),isCount=FALSE)) expect_equal(colData(cl),colData(se)) expect_equal(rownames(cl),rownames(se)) diff --git a/tests/testthat/test_combineMany.R b/tests/testthat/test_combineMany.R index 53d78c7d..26d6c03f 100644 --- a/tests/testthat/test_combineMany.R +++ b/tests/testthat/test_combineMany.R @@ -5,22 +5,24 @@ test_that("`combineMany` works with matrix and ClusterExperiment objects", { clustNothing <- clusterMany(mat, ks=c(3,4),clusterFunction="pam", subsample=FALSE, sequential=FALSE, isCount=FALSE,verbose=FALSE) - x1<-combineMany(clustNothing,whichClusters = "clusterMany") - 
x2<-combineMany(clustNothing) + x1<-combineMany(clustNothing,proportion=1,whichClusters = "clusterMany") + x2<-combineMany(clustNothing,proportion=1) expect_equal(x1,x2) - expect_error(combineMany(clusterSingle(mat, subsample=FALSE, - clusterFunction="pam", - clusterDArgs=list(k=3))), + ceObj<-clusterSingle(mat, subsample=FALSE, + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=3))) + expect_error(combineMany(ceObj,proportion=1), "no clusters specified") + expect_error(combineMany(ceObj,whichCluster="clusterSingle"), + 'argument "proportion" is missing, with no default') - shared1 <- combineMany(clusterMatrix(clustNothing)) - shared2 <- combineMany(clustNothing, "all") + shared1 <- combineMany(clusterMatrix(clustNothing),proportion=1) + shared2 <- combineMany(clustNothing, "all",proportion=1) expect_equal(shared1$clustering, primaryCluster(shared2)) - shared3 <- combineMany(clustNothing, "workflow") + shared3 <- combineMany(clustNothing, "workflow",proportion=1) expect_equal(shared2, shared3) - shared4 <- combineMany(clustNothing, 1:nClusters(clustNothing)) + shared4 <- combineMany(clustNothing, 1:nClusters(clustNothing),proportion=1) expect_equal(shared3, shared4) shared5 <- combineMany(clustNothing, "workflow", @@ -44,8 +46,8 @@ test_that("`combineMany` works when multiple runs of workflow", { subsample=FALSE, sequential=FALSE, isCount=FALSE,verbose=FALSE) - shared1 <- combineMany(clustNothing, "all") - shared2<-combineMany(shared1,"all") + shared1 <- combineMany(clustNothing, "all",proportion=1) + shared2<-combineMany(shared1,"all",proportion=1) expect_true("combineMany.1" %in% clusterTypes(shared2)) clustNothing2 <- clusterMany(shared2, ks=c(5,6), clusterFunction="pam", @@ -55,18 +57,18 @@ test_that("`combineMany` works when multiple runs of workflow", { expect_true("clusterMany.2" %in% clusterTypes(clustNothing2)) expect_true("combineMany.2" %in% clusterTypes(clustNothing2)) - shared3 <- combineMany(clustNothing2, "all") - shared4 <- combineMany(clusterMatrix(clustNothing2)) + shared3 <- combineMany(clustNothing2, "all",proportion=1) + shared4 <- combineMany(clusterMatrix(clustNothing2),proportion=1) expect_equal(shared4$clustering, primaryCluster(shared3)) - shared5 <- combineMany(clustNothing2, "workflow") - shared6 <- combineMany(clusterMatrix(clustNothing2)[,1:2]) + shared5 <- combineMany(clustNothing2, "workflow",proportion=1) + shared6 <- combineMany(clusterMatrix(clustNothing2)[,1:2],proportion=1) expect_equal(shared6$clustering, primaryCluster(shared5)) clustNothing3 <- addClusters(clustNothing2, primaryCluster(shared5)) - shared7 <- combineMany(clustNothing3, "all") - shared8 <- combineMany(clustNothing3, "workflow") + shared7 <- combineMany(clustNothing3, "all",proportion=1) + shared8 <- combineMany(clustNothing3, "workflow",proportion=1) }) test_that("`combineMany` preserves the colData and rowData of SE", { diff --git a/tests/testthat/test_constructor.R b/tests/testthat/test_constructor.R index 1b9899ec..028ce9f0 100644 --- a/tests/testthat/test_constructor.R +++ b/tests/testthat/test_constructor.R @@ -137,9 +137,7 @@ test_that("removing clusters work as promised",{ expect_equal(length(clusterInfo(c7)), nClusters(c4)-2) #When have dendrogram - cl1 <- clusterSingle(smSimData, clusterFunction="pam", - subsample=FALSE, sequential=FALSE, - clusterDArgs=list(k=6),isCount=FALSE) + cl1 <- clusterSingle(smSimData, subsample=FALSE, sequential=FALSE, mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=6)),isCount=FALSE) 
leg<-clusterLegend(cl1)[[primaryClusterIndex(cl1)]] leg[,"name"]<-letters[1:6] clusterLegend(cl1)[[primaryClusterIndex(cl1)]]<-leg diff --git a/tests/testthat/test_constructorClusterFunction.R b/tests/testthat/test_constructorClusterFunction.R new file mode 100644 index 00000000..9888839b --- /dev/null +++ b/tests/testthat/test_constructorClusterFunction.R @@ -0,0 +1,6 @@ +context("Constructor") +source("create_objects.R") + +test_that("`clusterFunction` constructor works", { + + }) \ No newline at end of file diff --git a/tests/testthat/test_mergeClusters.R b/tests/testthat/test_mergeClusters.R index 9abf1acd..b9b24d90 100644 --- a/tests/testthat/test_mergeClusters.R +++ b/tests/testthat/test_mergeClusters.R @@ -2,9 +2,9 @@ context("mergeCLusters") source("create_objects.R") test_that("`mergeClusters` works with matrix and ClusterExperiment objects", { - cl1 <- clusterSingle(smSimData, clusterFunction="pam", + cl1 <- clusterSingle(smSimData, subsample=FALSE, sequential=FALSE, - clusterDArgs=list(k=6),isCount=FALSE) + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=6)),isCount=FALSE) leg<-clusterLegend(cl1)[[primaryClusterIndex(cl1)]] leg[,"name"]<-letters[1:6] clusterLegend(cl1)[[primaryClusterIndex(cl1)]]<-leg @@ -15,16 +15,17 @@ test_that("`mergeClusters` works with matrix and ClusterExperiment objects", { dendro=clustWithDendro@dendro_clusters, mergeMethod="adjP", plotInfo="mergeMethod") - + #check plotting types: clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none",plotInfo="all") clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="adjP") clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="locfdr") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="locfdr", plotInfo="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="MB", plotInfo="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="JC", plotInfo="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod") expect_error(clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="mergeMethod"),"can only plot 'mergeMethod' results if one method is selected") clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="none") + + #check all methods run + for(method in clusterExperiment:::.availMergeMethods){ + clustMerged <- mergeClusters(clustWithDendro, mergeMethod=method, plotInfo="mergeMethod") + } expect_true("mergeClusters" %in% clusterTypes(clustMerged)) expect_true("mergeClusters" %in% colnames(clusterMatrix(clustMerged))) @@ -53,9 +54,9 @@ test_that("`mergeClusters` works with matrix and ClusterExperiment objects", { test_that("`mergeClusters` preserves the colData and rowData of SE", { - cl <- clusterSingle(smSimSE, clusterFunction="pam", + cl <- clusterSingle(smSimSE, subsample=FALSE, sequential=FALSE, - clusterDArgs=list(k=6),isCount=FALSE) + mainClusterArgs=list(clusterFunction="pam",clusterArgs=list(k=6)),isCount=FALSE) cl <- makeDendrogram(cl) cl <- mergeClusters(cl, mergeMethod = "adjP") expect_equal(colData(cl),colData(smSimSE)) diff --git a/vignettes/bibFile.bib b/vignettes/bibFile.bib index bb69ccb1..b4dc3fe1 100644 --- a/vignettes/bibFile.bib +++ b/vignettes/bibFile.bib @@ -47,4 +47,34 @@ @article{Finak:2015id pages = {1--13}, month = dec } +@article{Smyth:2004gh, +author = {Smyth, Gordon K}, +title = {{Linear models and empirical bayes methods for assessing differential expression in microarray experiments.}}, 
+journal = {Statistical Applications in Genetics and Molecular Biology},
+year = {2004},
+volume = {3},
+number = {1},
+pages = {Article3--25}
+}
+
+@article{Law:2014ff,
+author = {Law, Charity W and Chen, Yunshun and Shi, Wei and Smyth, Gordon K},
+title = {{voom: Precision weights unlock linear model analysis tools for RNA-seq read counts.}},
+journal = {Genome Biology},
+year = {2014},
+volume = {15},
+number = {2},
+pages = {1--17},
+month = feb
+}
+@article{Ritchie:2015fa,
+author = {Ritchie, Matthew E and Phipson, Belinda and Wu, Di and Hu, Yifang and Law, Charity W and Shi, Wei and Smyth, Gordon K},
+title = {{limma powers differential expression analyses for RNA-sequencing and microarray studies}},
+journal = {Nucleic Acids Research},
+year = {2015},
+volume = {43},
+number = {7},
+pages = {e47--e47},
+month = apr
+}
diff --git a/vignettes/clusterExperimentTutorial.Rmd b/vignettes/clusterExperimentTutorial.Rmd
index 36b8fdd3..c67b275c 100644
--- a/vignettes/clusterExperimentTutorial.Rmd
+++ b/vignettes/clusterExperimentTutorial.Rmd
@@ -25,17 +25,17 @@ message=FALSE)

 # Introduction {#Intro}

-The goal of this package is to encourage the user to try many different clustering algorithms in one package structure. We give tools for running many different clusterings and choices of parameters. We also provide visualization to compare many different clusterings and algorithm tools to find common shared clustering patterns. We implement common post-processing steps unrelated to the specific clustering algorithm (e.g. subsampling the data for stability, finding cluster-specific markers via differential expression, etc).
+The goal of this package is to encourage the user to try many different clustering algorithms in one package structure, and we provide strategies for creating a unified clustering from these many clustering results. We give tools for running many different clusterings and choices of parameters. We also provide visualization to compare many different clusterings and algorithm tools to find common shared clustering patterns. We implement common post-processing steps unrelated to the specific clustering algorithm (e.g. subsampling the data for stability, finding cluster-specific markers via differential expression, etc).

-The other main goal of this package is to implement strategies that we have developed for finding a single robust clustering based on the many clusterings that the user might create by perturbing various parameters of a clustering algorithm. There are several steps to these strategies that we call our standard clustering workflow. Our RSEC algorithm (Resampling-based Sequential Ensemble Clustering) is our preferred realization of this workflow that depends on subsampling on and other ensembl methods to provide robust clusterings, particularly for single-cell sequencing experiments and other large mRNA-Seq experiments.
+The other main goal of this package is to implement strategies that we have developed in the RSEC algorithm (Resampling-based Sequential Ensemble Clustering) for finding a single robust clustering based on the many clusterings that the user might create by perturbing various parameters of a clustering algorithm. There are several steps to these strategies that we call our standard clustering workflow. The `RSEC` function is our preferred realization of this workflow and depends on subsampling and other ensemble methods to provide robust clusterings, particularly for single-cell sequencing experiments and other large mRNA-Seq experiments.
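+For instance, the entire clustering workflow described in this vignette can in principle be run with a single call to `RSEC`. The sketch below is deliberately minimal and not evaluated: `RSEC` relies on extensive subsampling and is computationally intensive, and all arguments beyond the input data and `isCount` are simply left at their defaults here (see the `RSEC` help page for the many tuning parameters):
+
+```{r RSEC_oneCall, eval=FALSE}
+# Not run: one-call version of the workflow on a SummarizedExperiment
+# of counts; all tuning parameters are left at their default values.
+rsecOut <- RSEC(se, isCount=TRUE)
+```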
-We also provide a class `clusterExperiment` that inherits from `SummarizedExperiment` to store the many clusterings and related information.
+We also provide a class `ClusterExperiment` that inherits from `SummarizedExperiment` to store the many clusterings and related information, and a class `ClusterFunction` that encodes a clustering routine in a standardized way so that it can interact with our clustering workflow algorithms.

-All of our methods also have a barebones version that allows input of matrices and greater control. This comes at the expense of the user having to manage and keep track of the clusters, input data, transformation of the data, etc. We do not discuss these barebone versions in this tutorial. Instead, we focus on using `SummarizedExperiment` object as the input and working with the resulting `ClusterExperiment` object. See the help pages of each method for more on how to allow for matrix input.
+All of our methods also have a barebones version that allows input of matrices and greater control. This comes at the expense of the user having to manage and keep track of the clusters, input data, transformation of the data, etc. We do not discuss these barebones versions in this tutorial. Instead, we focus on using the `SummarizedExperiment` object as the input and working with the resulting `ClusterExperiment` object. See the help pages of each method for more on how to allow for matrix input.

 Although this package was developed with (single-cell) RNA-seq data in mind, its use is not limited to RNA-seq or even to gene expression data. Any dataset characterized by high dimensionality could benefit from the methods implemented here.

-## The clustering workflow
+## The RSEC clustering workflow

 The package encodes many common practices that are shared across clustering algorithms, like subsampling the data, computing silhouette width, sequential clustering procedures, and so forth. It also provides novel strategies that we developed as part of the RSEC algorithm.
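+A quick way to see which clustering routines are built into the package, and hence available throughout this workflow, is via the helper functions that accompany the `ClusterFunction` class. The following sketch is not evaluated here; see `?listBuiltInFunctions` for details:
+
+```{r listBuiltInSketch, eval=FALSE}
+# Not run: list all built-in clustering routines (stored as ClusterFunction objects)...
+listBuiltInFunctions()
+# ...or only those of a given algorithm type: type 'K' routines require a number
+# of clusters 'k', while type '01' routines require a cutoff 'alpha' in (0,1).
+listBuiltInTypeK()
+listBuiltInType01()
+```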
We call this generalization the clustering workflow, as oppose to the specific choices set in RSEC. We will introduce this workflow in its general setting, and only in specific sections devoted to the `RSEC` function will discussed the RSEC algorithm. For this reason, the examples shown here use simple clustering choices in the workflow to save on computational overhead; our preferred choices encoded in `RSEC` uses extensive subsampling techniques and takes much longer to run than is practical for a vignette. Users can also run or add their own clusters to this workflow at different stages. Additional functionality for clustering is available in the `clusterSingle` function, and the user should see the documentation in the help page of that function. However, there is special functionality to ease in quickly visualizing and managing the results of this workflow. ### The RSEC Routine -The RSEC algorithm (Resampling-based Sequential Ensemble Clustering) was developed specifically for finding robust clusters in single cell sequencing data. It follows our above suggested workflow, but makes specific choices along the way that we find to be advantageous in clustering large mRNA expression datasets, like those of single cell sequencing. It is implemented in the function `RSEC`. This vignette serves to show how the pieces of the workflow operate, and use choices that are shown are not those recommended by RSEC. This is both to show the flexibility of the functions and because RSEC is computationally more time-intensive, since it is based on both resampling, and sequential discovery of clusters. +The RSEC algorithm (Resampling-based Sequential Ensemble Clustering) was developed specifically for finding robust clusters in single cell sequencing data. It follows our above suggested workflow, but makes specific choices along the way that we find to be advantageous in clustering large mRNA expression datasets, like those of single cell sequencing. It is implemented in the function `RSEC`. This vignette serves to show how the pieces of the workflow operate, and use choices that are shown are not those recommended by RSEC. We choose to show alternatives to the choices in RSEC both to show the flexibility of the functions and because RSEC is computationally more time-intensive, since it is based on both resampling, and sequential iteration of subsampling routines to discover. ## Finding related features/genes A common practice after determining clusters is to perform differential gene expression analysis in order to find genes that show the greatest differences amongst the clusters. We would stress that this is purely an exploratory technique, and any p-values that result from this analysis are not valid, in the sense that they are likely to be inflated. This is because the same data was used to define the clusters and to perform differential expression analysis. -Since this is a common task, we provide the function `getBestFeatures` to perform various kinds of differential expression analysis between the clusters. A common F-statistic between groups can be chosen. However, we find that it is far more informative to do pairwise comparisons between clusters, or one cluster against all, in order to find genes that are specific to a particular cluster. An option for all of these choices is provided in the `getBestFeatures` function. The `getBestFeatures` function uses the DE analysis provided by the `limma` package. 
+Since this is a common task, we provide the function `getBestFeatures` to perform various kinds of differential expression analysis between the clusters. A common F-statistic between groups can be chosen. However, we find that it is far more informative to do pairwise comparisons between clusters, or one cluster against all, in order to find genes that are specific to a particular cluster. An option for all of these choices is provided in the `getBestFeatures` function. The `getBestFeatures` function uses the DE analysis provided by the `limma` package. In addition, the `getBestFeatures` function provides an option to use the "voom" correction in the `limma` package to account for the mean-variance relationship that is common in count data. The tests performed by `getBestFeatures` are specific contrasts between clustering groups; these contrasts can be retrieved without performing the tests using `clusterContrasts`, including in a format appropriate for the `MAST` algorithm.

 ## Visualization Tools
@@ -69,7 +69,7 @@ We provide a visualization to compare many clusterings of the same data in the f

 # Quickstart {#quickstart}

-We will go quickly through the standard steps of clustering using the `clusterExperiment` package, before turning to more details. The standard workflow we envision is the following:
+We will go quickly through the standard workflow steps of clustering using the `clusterExperiment` package, before turning to more details. The standard workflow we envision is the following:

 * `clusterMany` -- run desired clusterings
 * `combineMany` -- get a unified clustering
@@ -77,6 +77,7 @@ We will go quickly through the standard steps of clustering using the `clusterE
 * `mergeClusters` -- merge together clusters with little DE between their genes.
 * `getBestFeatures` -- Find Features that are differential between the final clustering.

+The first four steps (the clustering steps) are done in one function call by `RSEC`, which is the preferred usage. However, to understand the parameters available in `RSEC`, it is useful to go through the steps first.

 ## Data example {#data}

@@ -125,13 +126,14 @@ assays(se) <- list(normalized_counts=fq)

 ## Step 1: Clustering with `clusterMany` {#step1}

-`clusterMany` lets the user quickly pick between many clustering options and run all of the clusterings in one single command. In the quick start we pick a simple set of clusterings based on varying the dimensionality reduction options. The way to designate which options to vary is to give multiple values to an argument. Due to a bug in R, we need to set `getClass.msg=FALSE` or otherwise a slew of annoying warnings will spit out at every call; this should be fixed in the next patch to R.
+`clusterMany` lets the user quickly pick between many clustering options and run all of the clusterings in one single command. In the quick start we pick a simple set of clusterings based on varying the dimensionality reduction options. The way to designate which options to vary is to give multiple values to an argument.
+
 Here is our call to `clusterMany`:

 ```{r clusterMany}
 library(clusterExperiment)
-ce<-clusterMany(se, clusterFunction="pam",ks=5:10,
+ce<-clusterMany(se, clusterFunction="pam",ks=5:10, minSizes=5,
     isCount=TRUE,dimReduce=c("PCA","var"),nVarDims=c(100,500,1000),
     nPCADims=c(5,15,50),run=TRUE)
 ```

@@ -146,11 +148,11 @@ In this call to `clusterMany` we made the follow choices about what to vary:

 By giving only a single value to the relative argument, we keep the other possible options fixed, for example:

 * we used 'pam' for all clustering (`clusterFunction`)
-* we left the default of `minSize` which requires clusters to be of size at least 5; smaller clusters will be ignored and samples assigned to them given the unassigned value of -1.
+* we set `minSizes=5`. This argument allows the user to set a minimum required cluster size; clusters smaller than this value will be ignored, and their samples given the unassigned value of -1. The default of `1` means that this option is not used.

 We also set `isCount=TRUE` to indicate that our input data are counts. This means that operations for clustering and visualizations will internally transform the data as $log_2(x+1)$ (We could have alternatively explicitly set a transformation by giving a function to the `transFun` argument, for example if we wanted $\sqrt(x)$ or $log(x+\epsilon)$ or just `identity`).

-We can visualize the resulting clusterings using the `plotClusters` command. It is also useful to change the amount of space to allow for the labels of the clusterings, so we will reset the `mar` option in `par` (we also change `axisLine` argument for this reason).
+We can visualize the resulting clusterings using the `plotClusters` command. For this visualization, it is useful to change the amount of space on the left of the plot to allow for the labels of the clusterings, so we will reset the `mar` option in `par`. We also decrease the `axisLine` argument, which controls the amount of space between the axis and the labels (`axisLine` is passed internally to the `line` option in `axis`).

 ```{r plotClusterEx1}
 defaultMar<-par("mar")
@@ -161,15 +163,15 @@ plotClusters(ce,main="Clusters from clusterMany", whichClusters="workflow", samp

 This plot shows the samples in the columns, and different clusterings on the rows. Each sample is color coded based on its clustering for that row, where the colors have been chosen to try to match up clusters across different clusterings that show large overlap. Moreover, the samples have been ordered so that each subsequent clustering (starting at the top and going down) will try to order the samples to keep the clusters together, without rearranging the clustering blocks of the previous clustering/row.

-Notice that we also added the `sampleData` argument in our call, indicating that we also want to visualize some information about the samples saved in the `colData` slot (inherited from our original `fluidigm` object).
We chose the columns "Biological_Condition" and "Cluster2" from `colData`, which correspond to the original biological condition of the experiment, and the clusters reported in the original paper, respectively. The data from `sampleData` are always shown at the bottom of the plot.

 Notice that some samples are white. This indicates that they have the value -1, meaning they were not clustered. This is from our choice to require at least 5 samples to make a cluster. We can see that some clusters are fairly stable across different choices of dimensions while others can vary dramatically.

-We have set `whichClusters="workflow"` to only plot clusters created from the workflow. Right now that's all there are anyway, but as commands get rerun with different options, other clusterings can build up in the object (see discussion in [this section](#rerun) about how multiple calls to workflow are stored). So setting `whichClusters="workflow"` means that we are only going to see our most recent calls to the workflow.
+We have set `whichClusters="workflow"` to only plot clusters created from the workflow. Right now that's all there are anyway, but as commands get rerun with different options, other clusterings can build up in the object (see discussion in [this section](#rerun) about how multiple calls to workflow are stored). So setting `whichClusters="workflow"` means that we are only going to see our most recent calls to the workflow. `whichClusters` can be set to limit to specific clusterings or specific steps in the workflow (so far we only have one step, the `clusterMany` step).

-The labels shown are those given by `clusterMany` but can be a bit much for plotting. We can assign new labels if we prefer, for example to be more succinct, by changing the `clusterLabels` of the object (note we are permanently changing the labels here within the `ClusterExperiment` object). We choose to remove "Features" as being too wordy:
+The labels shown are those given automatically by `clusterMany` but can be a bit much for plotting. We can assign new labels in our `ClusterExperiment` object if we prefer, for example to be more succinct, by changing the `clusterLabels` of the object (note we are permanently changing the labels here within the `ClusterExperiment` object). We choose to remove "Features" as being too wordy:

 ```{r plotCluster_newLabels}
 cl<-clusterLabels(ce)
@@ -177,7 +179,7 @@ cl<-gsub("Features","",cl)
 clusterLabels(ce)<-cl
 ```

-We will also show the clusters in a different order, which corresponds to varying the number of dimensions, rather than k.
+We can also pass to the `whichClusters` argument the indices of clusterings stored in the `ClusterExperiment` object, which allows us to show the clusters in a different order. Here we'll pick an order which corresponds to varying the number of dimensions, rather than k. We do this by parsing the labels of the clusters, which we can get with the `clusterLabels` command.

 ```{r plotCluster_newOrder}
 cl<-clusterLabels(ce)
@@ -189,6 +191,26 @@ plotClusters(ce,main="Clusters from clusterMany", whichClusters=ord, sampleData=

 We see that the order in which the clusters are given to `plotClusters` changes the plot greatly. There are many different options for how to run `plotClusters` discussed in the detailed section on [plotClusters](#plotClusters), but for now, this plot is good enough for a quick visualization.

+We can examine the sizes of a single clustering with the function `plotBarplot`. By default, the clustering picked will be the primary cluster, which for the result of `clusterMany` is rather arbitrarily just the first clustering.
+
+```{r plotBarplot1}
+plotBarplot(ce)
+```
+
+We can also pick a particular clustering.
+
+```{r plotBarplot1.2}
+plotBarplot(ce,whichClusters=c("nPCA=15,k=6"))
+```
+
+
+We can also compare two specific clusterings with a simple barplot using `plotBarplot`. Here we compare the above clustering (PCA with 15 dimensions and k=6) with the result of changing to k=8:
+
+```{r plotBarplot2}
+plotBarplot(ce,whichClusters=c("nPCA=15,k=6","nPCA=15,k=8"))
+```
+
+
 ### The output

 The output of `clusterMany` is a `ClusterExperiment` object. This is a class built for this package and explained in the section on [ClusterExperiment Objects](#ceobjects). In the object `ce` the clusters are stored, names and colors for each cluster within a clustering are assigned, and other information about the clusterings is recorded. Furthermore, all of the information in the original `SummarizedExperiment` is retained.

@@ -201,21 +223,21 @@ For right now we will only mention the most basic such function that retrieves t
 head(clusterMatrix(ce)[,1:3])
 ```

-Because we have changed `clusterLabels` above, these new cluster labels are shown here. Notice that some of the samples are assigned the value of -1. -1 has the significance of encoding samples that are not assigned to any cluster. Why certain samples are not clustered depends on the underlying choices of the clustering routine. In this case, the default in `clusterMany` set the minimum size of a cluster to be 5, which resulted in -1 assignments.
+Because we have changed `clusterLabels` above, these new cluster labels are shown here. Notice that some of the samples are assigned the value of `-1`. `-1` has the significance of encoding samples that are not assigned to any cluster. Why certain samples are not clustered depends on the underlying choices of the clustering routine. In this case, we set `minSizes=5` in `clusterMany`, requiring clusters to contain at least 5 samples, which resulted in -1 assignments.

-Another special value is -2 discussed in the section on [ClusterExperiment objects](#ceobjects)
+Another special value is `-2`, discussed in the section on [ClusterExperiment objects](#ceobjects)

 ## Step 2: Find a consensus with `combineMany` {#step2}

 To find a consensus clustering across the many different clusterings created by `clusterMany` the function `combineMany` can be used next.

 ```{r}
-ce<-combineMany(ce)
+ce<-combineMany(ce,proportion=1)
 ```

-Notice we get a warning that we did not specify any clusters to combine, so it is using the default -- those from the `clusterMany` call.
+The `proportion` argument gives the minimum proportion of clusterings in which samples must be clustered together in order to be assigned to the same cluster in the final consensus clustering. Notice we get a warning that we did not specify any clusters to combine, so it is using the default -- those from the `clusterMany` call.

-If we look at the `clusterMatrix` of the returned `ce` object, we see that the new cluster from `combineMany` has been added to the existing clusterings. This is the basic strategy of the functions in this package. Any clustering that is created is added to existing clusterings, so the user does not need to keep track of past clusterings and can easily compare what has changed.
+If we look at the `clusterMatrix` of the returned `ce` object, we see that the new cluster from `combineMany` has been added to the existing clusterings. This is the basic strategy of all the clustering functions in this package. Any clustering function that is applied to an existing `ClusterExperiment` object adds the new clustering to the set of existing clusterings, so the user does not need to keep track of past clusterings and can easily compare what has changed.

 We can again run `plotClusters`, which will now also show the result of `combineMany`:

@@ -226,13 +248,13 @@ plotClusters(ce,whichClusters="workflow")
 ```

-The default result of `combineMany` is not usually a great choice, and certainly isn't helpful here. The clustering from the default `combineMany` leaves most samples unassigned (white in the above plot). This is because the default way of combining is very conservative -- it requires samples to be in the same cluster in *every clustering* to be assigned a cluster. This is quite stringent. We can vary this by setting the `proportion` argument to indicate the minimum proportion of times they should be together with other samples in the cluster they are assigned to. Explicit details on how `combineMany` makes these clusters are discussed in the section on [combineMany](#combineMany).
+The choice of `proportion=1` in `combineMany` is not usually a good one, and certainly isn't helpful here. The clustering from the default `combineMany` leaves most samples unassigned (white in the above plot). This is because it requires samples to be in the same cluster in *every clustering* in order to be assigned to the same cluster. This is quite stringent. We can vary this by setting the `proportion` argument to be lower. Explicit details on how `combineMany` makes these clusters are discussed in the section on [combineMany](#combineMany).

-So let's label the one we found "combineMany, default" and then create a new one. (Making an informative label will make it easier to keep track of this particular clustering later, particularly if we make multiple calls to the workflow).
+So let's label the one we found as "combineMany,1" and then create a new one. (Giving it an informative label will make it easier to keep track of this particular clustering later, particularly if we make multiple calls to the workflow).

 ```{r combineMany_changeLabel}
 wh<-which(clusterLabels(ce)=="combineMany")
-if(length(wh)!=1) stop() else clusterLabels(ce)[wh]<-"combineMany,default"
+if(length(wh)!=1) stop() else clusterLabels(ce)[wh]<-"combineMany,1"
 ```

 Now we'll rerun `combineMany` with `proportion=0.7`. This time, we will give it an informative label upfront in our call to `combineMany`.

@@ -253,7 +275,7 @@ plotClusters(ce,whichClusters="workflow",main="Min. Size=3")
Size=3") ``` -We can also visualize the proportion of times these clusters were together across these clusterings (this information was made and stored in the ClusterExperiment object when we called `combineMany` as long as `proportion` value is <1): +We can also visualize the proportion of times these clusters were together across these clusterings (this information was made and stored in the ClusterExperiment object when we called `combineMany` provided that `proportion` argument is <1): ```{r plotCoClustering_quickstart} plotCoClustering(ce) @@ -263,13 +285,13 @@ This visualization can help in determining whether to change the value of `propo ## Step 3: Merge clusters together with `makeDendrogram` and `mergeClusters` {#step3} -It is not uncommon in practice to create forty or more clusterings with `clusterMany`, in which case the results of `combineMany` can often still result in too many small clusters. We might wonder if they are necessary or could be logically combined together. We could change the value of `proportion` in our call to `combineMany`. But we have found that it is often after looking at the clusters and how different they look on individual genes that we best make this determination, rather than the proportion of times they are together in different clustering routines. +Once you start varying the parameters, is not uncommon in practice to create forty or more clusterings with `clusterMany`. In which case the results of `combineMany` can often result in too many small clusters. We might wonder if they are necessary or could be logically combined together. We could change the value of `proportion` in our call to `combineMany`. But we have found that it is often after looking at the clusters, for example with a heatmap, and how different they look on individual genes that we best make this determination, rather than the proportion of times they are together in different clustering routines. -For this reason, we often find the need for an additional clustering step that merges clusters together that are not different, based on running tests of differential expression between the clusters found in `combineMany`. We often display and use both sets of clusters side-by-side (that from `combineMany` and that from `mergeClusters`). +For this reason, we often find the need for an additional clustering step that merges clusters together that are not different, based on running tests of differential expression between the clusters found in `combineMany`. This is done by the function `mergeClusters`. We often display and use both sets of clusters side-by-side (that from `combineMany` and that from `mergeClusters`). -`mergeClusters` needs a hierarchical clustering of the clusters; it then goes progressively up that hierarchy, deciding whether two adjacent clusters can be merged. The function `makeDendrogram` makes such a hierarchy between clusters (by applying `hclust` to the medoids of the clusters). Because the results of `mergeClusters` are so dependent on that hierarchy, we require the user to call `makeDendrogram` rather than calling it internally. This is because different options in `makeDendrogram` can affect how the clusters are hierarchically ordered, and we want to encourage the user make these choices. +`mergeClusters` needs a hierarchical clustering of the clusters in order to merge clusters; it then goes progressively up that hierarchy, deciding whether two adjacent clusters can be merged. 
+`mergeClusters` needs a hierarchical clustering of the clusters in order to merge them; it then goes progressively up that hierarchy, deciding whether two adjacent clusters can be merged. The function `makeDendrogram` makes such a hierarchy between clusters (by applying `hclust` to the medoids of the clusters). Because the results of `mergeClusters` are so dependent on that hierarchy, we require the user to call `makeDendrogram` rather than calling it automatically internally. This is because different options in `makeDendrogram` can affect how the clusters are hierarchically ordered, and we want to encourage the user to make these choices.

-As an example, here we use the 500 most variable genes to make the cluster hierarchy.
+As an example, here we use the 500 most variable genes to make the cluster hierarchy (note we can make different choices here than we did in the clustering).

 ```{r makeDendrogram}
 ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
@@ -278,14 +300,21 @@ plotDendrogram(ce)

 We can see that clusters 1 and 3 are most closely related, at least in the top 500 most variable genes.

+We can see the relative size of the clusters by setting some options in `plotDendrogram`:
+
+```{r makeDendrogram2}
+plotDendrogram(ce,labelType="colorblock",leafType="sample")
+```
+
+
+Notice that we don't need to make the dendrogram again, because it is saved in `ce`.

 If we look at the summary of `ce`, it now has 'makeDendrogram' marked as 'Yes'.

 ```{r makeDendrogram_show}
 ce
 ```

-
-Now we are ready to actually merge clusters together. We now run `mergeClusters` that will go up this hierarchy and compare the level of differential expression (DE) in each pair. In other words, if we focus on the left side of the tree, DE tests are run, between 1 and 3, and between 6 and 8. If there is not enough DE between each of these (based on a cutoff that can be set by the user), then clusters 1 and 3 and/or 6 and 8 will be merged. And so on up the tree.
+Now we are ready to actually merge clusters together. We run `mergeClusters`, which will go up this hierarchy and compare the level of differential expression (DE) in each pair. In other words, if we focus on the left side of the tree, DE tests are run between 1 and 3, and between 6 and 8. If there is not enough DE between each of these (based on a cutoff that can be set by the user), then clusters 1 and 3 and/or 6 and 8 will be merged. And so on up the tree.

 It is useful to first run `mergeClusters` without actually creating any merged clusters so as to preview what the final clustering will be (and perhaps to help in setting the cutoff).

 ```{r mergeClustersPlot}
 mergeClusters(ce,mergeMethod="adjP",plotInfo="mergeMethod")
 ```

 Then we can decide on a cutoff and visualize the resulting clustering.
-
It relies on `limma` [@Smyth:2004gh; @Ritchie:2015fa] to run the differential expression analysis, with `voom` [@Law:2014ff] correction if the data are indicated by the user to be counts.

There are several types of tests that can be performed to identify features that are different between the clusters. Here we perform all pairwise tests between the clusters.

@@ -335,7 +363,7 @@ We can visualize only these significantly different pair-wise features with `plo
length(pairsAll$Feature)==length(unique(pairsAll$Feature))

-In this case they are not unique. Hence, we will make sure we take only unique gene values so that they are not plotted multiple times in our heatmap. (This is a good practice even if in a particular case the genes are unique).
+In this case they are not unique, because the same gene can be significant in different pairwise tests. Hence, we will make sure we take only unique gene values so that they are not plotted multiple times in our heatmap. (This is a good practice even if in a particular case the genes are unique).

```{r getBestFeatures_heatmap}
plotHeatmap(ce, clusterSamplesData="dendrogramValue",
@@ -344,7 +372,7 @@ plotHeatmap(ce, clusterSamplesData="dendrogramValue",
breaks=.99)
```

-Notice that the samples clustered into the -1 cluster (i.e. not assigned) are clustered as an outgroup. They can also be mixed into the dendrogram (see [makeDendrogram](#makeDendrogram))
+Notice that the samples clustered into the `-1` cluster (i.e. not assigned) are clustered as an outgroup. This is a choice made when the dendrogram was created with `makeDendrogram`. These samples can also be mixed into the dendrogram (see [makeDendrogram](#makeDendrogram)).

# ClusterExperiment Objects {#ceobjects}

@@ -356,7 +384,7 @@ Typing the name at the control prompt results in a quick summary of the object.
ce

-This summary tells us the total number of clusterings (`r nClusters(ce)`), and gives some indication as to what parts of the standard workflow have been completed and stored in this object. It also gives information regarding the `primaryCluster` of the object. The `primaryCluster` is just one of the clusterings that has been chosen to be the "primary" clustering, meaning that by default various functions will turn to this clustering as the desired clustering to use. The "primaryCluster" can be reset by the user (see `primaryClusterIndex`). `clusterMany` arbitrarily sets the 'primaryCluster' to the first one, and each later step of the workflow sets the primary index to the most recent, but the user can set a specific clustering to be the primaryCluster with `primaryClusterIndex`.
+This summary tells us the total number of clusterings (`r nClusters(ce)`), and gives some indication as to what parts of the standard workflow have been completed and stored in this object. It also gives information regarding the `primaryCluster` of the object. The `primaryCluster` is just one of the clusterings that has been chosen to be the "primary" clustering, meaning that by default various functions will turn to this clustering as the desired clustering to use. The "primaryCluster" can be reset by the user (see `primaryClusterIndex`). `clusterMany` arbitrarily sets the 'primaryCluster' to the first one, and each later step of the workflow sets the primary index to the most recent, but the user can set a specific clustering to be the primaryCluster with `primaryClusterIndex`.
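For example, a sketch (not run; the index 3 is an arbitrary choice for illustration):

```{r primaryIndexSketch,eval=FALSE}
# Sketch: reset which clustering is treated as the primary one
# (3 is a hypothetical index into the stored clusterings)
primaryClusterIndex(ce) <- 3
```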
Often, if a function is not given a specific clustering (usually via an option `whichCluster` or `whichClusters`), the "primary" cluster is taken by default.

There are also additional commands to access the clusterings and their related information (type `help("ClusterExperiment-methods")` for more).

@@ -371,13 +399,15 @@ Remember that we made multiple calls to `combineMany`: only the last such call w
**Negative Valued Cluster Assignments** The different clusters are stored as consecutive integers, with '-1' and '-2' having special meaning. '-1' refers to samples that were not clustered by the clustering algorithm. In our example, we removed clusters that didn't meet specific size criterion, so they were assigned '-1'. '-2' is for samples that were not included in the original input to the clustering. This is useful if, for example, you cluster on a subset of the samples, and then want to store this clustering with the clusterings done on all the data. You can create a vector of clusterings that give '-2' to the samples not originally used and then add these clusterings to the `ce` object manually with `addClusters`.

-`clusterLabels` gives the column names of the `clusterMatrix`; `clusterMany` has given column names based on the parameter choices, and later steps in the workflow also give a name (or allow the user to set them). Clusterings might also have no specific label if the user created them. As we've seen, the user can also change these labels.
+`clusterLabels` gives the column names of the `clusterMatrix`; `clusterMany` has given column names based on the parameter choices, and later steps in the workflow also give a name (or allow the user to set them).

```{r CEHelperCommands2}
head(clusterLabels(ce),10)
```

-`clusterTypes` on the other hand indicates what call made the clustering. Unlike the labels, it is wise to not change the values of `clusterTypes` lightly.
+As we've seen, the user can also change these labels.
+
+`clusterTypes` on the other hand indicates what call made the clustering. Unlike the labels, it is wise not to change the values of `clusterTypes` unless you are sure of what you are doing.

```{r CEHelperCommands3}
head(clusterTypes(ce),10)
@@ -389,14 +419,14 @@ The information that was in the original `fluidigm` object has also been preserv
colData(ce)[,1:5]
```

-Another important slot in the `ClusterExperiment` object is the `clusterLegend` slot. This consists of a list, one element per column or clustering of `clusterMatrix`.
+Another important slot in the `ClusterExperiment` object is the `clusterLegend` slot. This consists of a list, one element per column or clustering of `clusterMatrix`, that gives colors and names to each cluster within a clustering.

```{r CEClusterLengend}
length(clusterLegend(ce))
clusterLegend(ce)[1:2]
```

-We can see that each element of `clusterLegend` consists of a matrix, with number of rows equal to the number of clusters in the clustering. The columns store information about that cluster. `clusterIds` is the internal id used in `clusterMatrix` to identify the cluster, `name` is a name for the cluster, and `color` is a color for that cluster. `color` is used in plotting and visualizing the clusters, and `name` is an arbitrary character string for a cluster. They are automatically given default values when the `ClusterExperiment` object is created, but we will see under the description of visualization methods how the user might want to manipulate these for better plotting results.
+We can see that each element of `clusterLegend` consists of a matrix, with number of rows equal to the number of clusters in the clustering. The columns store information about that cluster. `clusterIds` is the internal id (integer) used in `clusterMatrix` to identify the cluster, `name` is a name for the cluster, and `color` is a color for that cluster. `color` is used in plotting and visualizing the clusters, and `name` is an arbitrary character string for a cluster. They are automatically given default values when the `ClusterExperiment` object is created, but we will see under the description of visualization methods how the user might want to manipulate these for better plotting results.

# Visualizing the data {#visual}

@@ -463,7 +493,7 @@ plotClusters(ce_temp, sampleData=c("Biological_Condition","Cluster2"),
## Heatmap with the clusters {#plotHeatmap}

-There is also a default heatmap command for a ClusterExperiment object that we used in the Quick Start. By default it clusters on the most variable features (after transforming the data) and shows the primaryCluster alongside the data. The primaryCluster now that we've run the workflow will be set as that from the mergeClusters step.
+There is also a default heatmap command for a `ClusterExperiment` object that we used in the Quick Start. By default it clusters on the most variable features (after transforming the data) and shows the `primaryCluster` alongside the data. The `primaryCluster`, now that we've run the workflow, has been set as that from the last `mergeClusters` step.

```{r plotHeatmap_Ex1}
par(mfrow=c(1,1))
```

@@ -476,6 +506,7 @@ The `plotHeatmap` command has numerous options, in addition to those of `aheatma
* Easy inclusion of clustering information or sample information, based on the ClusterExperiment object.
* Additional methods for ordering/clustering the samples that makes use of the clustering information.
* Use of separate input data for clustering and for visualization.
+* Setting the breaks for better visualization.

### Displaying clustering or sample information

@@ -526,52 +557,142 @@ While count data is a common type of data, it is also common that the input data
In this case, it can be convenient to have the *visualization* of the data (i.e. the color scale), be based on a count scale that is interpretable, even while the clustering is done based on the normalized data. This is possible by giving a new matrix of values to the argument `visualizeData`. In this case, the color scale (and clustering of the features) is based on the input `visualizeData` matrix, but all clustering of the samples is done on the internal data in the `ClusterExperiment` object.

+### Setting the breaks
+
+Usually, the breaks that determine the colors of the heatmap are evenly spaced across the range of the data in the entire matrix. When there are a few outlier samples or genes, they can dominate the color and make it impossible to visualize the bulk of the data.
+
+For this reason, the argument `breaks` in `plotHeatmap` allows for a value between 0 and 1, to indicate that the range of colors should be chosen as equally spaced between certain quantiles of the data. For example, if `breaks=0.99`, the range of equally spaced breaks will stop at the top 0.99 quantile of the data and anything above that value gets assigned the single extreme color.
+If there is negative data in the matrix, then it will also use the lower quantile of the data to stop the range of equally spaced breaks (see `?setBreaks`).
+
+Here we demonstrate two different choices of `breaks`:
+
+```{r plotHeatmap_break99}
+plotHeatmap(ce,clusterSamplesData="primaryCluster",
+    whichClusters="primaryCluster", breaks=0.99,
+    main="Heatmap with clusterMany, breaks=0.99",annLegend=FALSE)
+```
+
+```{r plotHeatmap_break95}
+plotHeatmap(ce,clusterSamplesData="primaryCluster",
+    whichClusters="primaryCluster", breaks=0.95,
+    main="Heatmap with clusterMany, breaks=0.95",annLegend=FALSE)
+```
+
+The function `setBreaks`, which is called internally by `plotHeatmap`, is also a stand-alone function that the user can call directly to have greater flexibility in getting breaks for the heatmap. For example, it allows the user to specify that the breaks should be symmetric around 0. We also provide some default color spectra that can be better for different settings or for data symmetric around 0 -- see `?showHeatmapPalettes`.
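As a sketch (not run, and assuming the argument names `breaks` and `makeSymmetric` behave as described in `?setBreaks`), one could compute breaks directly and pass them on:

```{r setBreaksSketch,eval=FALSE}
# Sketch: breaks that stop at the 0.95 quantile of the data
# and are symmetric around 0 (argument names assumed from ?setBreaks)
brks <- setBreaks(assay(ce), breaks=0.95, makeSymmetric=TRUE)
plotHeatmap(ce, breaks=brks)
```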
# The clustering workflow {#workflow}

We will now go into more detail about important options for the main parts of the clustering workflow.

## clusterMany {#clusterMany}
-
-### Overview of the implemented clustering procedures

In the quick start section we picked some simple and familiar clustering options that would run quickly and needed little explanation. However, our workflow generally assumes more complex options and more parameter variations are tried. Before getting into the specific options of `clusterMany`, let us first describe some of these more complicated setups, since many of the arguments of `clusterMany` depend on understanding them.

-**Clustering Algorithms (`clusterD`):** Clustering algorithms generally start with a particular predefined distance or dissimilarity between the samples, usually encoded in a $n x n$ matrix, $D$. In our package, we consider only such clustering algorithms. The input could also be similarities, though we will continue to call such a matrix $D$.
+### Base clustering algorithms and the `ClusterFunction` class
-The simplest scenario is a simple calculation of $D$ and then a clustering of $D$, usually dependent on particular parameters. In this package we try to group together algorithms that cluster $D$ based on common parameters and operations that can be done. Currently there are two "types" of algorithms we consider, which we call type "K" and "01". The help page of `clusterD` documents these choices more fully, but we give an overview here. Many of the parameters that are allowed to vary in `clusterMany` refer to parameters for the clustering of the $D$ matrix, either for "K" or "01" type algorithms.
+This package is meant to be able to use and compare different clustering routines. However, the required input, arguments, etc. of different clustering algorithms vary greatly. We create the `ClusterFunction` class so that we can ensure that the information necessary to fit into our workflow is well defined, while the other details of the algorithm can otherwise be ignored. In general, the user will not need to know the details of this class, since they will use built-in functions provided by the package, which can be accessed by character values. To see the set of character values that correspond to built-in functions:
- The "K" algorithms are so called because their main parameter requirement is that the user specifies the number of clusters ($K$) to be created. They assume the input $D$ are dissimilarities, and depending on the algorithm, may have additional expectations. "pam" and "kmeans" are examples of such types of algorithms.

+```{r builtInFunctions}
+listBuiltInFunctions()
+```
+
+If you are interested in implementing your own `ClusterFunction` object, see the documentation of the `ClusterFunction` class.

-The "01" algorithms are so named because the algorithm assumes that the input $D$ consists of *similarities* between samples and that the similarities encoded in $D$ are on a scale of 0-1. They use this fact to make the primary user-specified parameter be not the number of final clusters, but a measure $\alpha$ of how dissimilar samples in the same cluster can be (on a scale of 0-1). Given $\alpha$, the algorithm then implements a method to then determine the clusters (so $\alpha$ implicitly determines $K$). These methods rely on the assumption that because the 0-1 scale has special significance, the user will be able to make an determination more easily as to the level of dissimilarity allowed in a true cluster, rather than predetermine the number of clusters $K$. The current 01 methods are "tight", "hierarchical01" and "pam".

+There are some important features of any clustering algorithm that are encoded in the `ClusterFunction` object and that are important to understand, because they affect which algorithms can be used when.

+**`inputType`** The type of input the algorithm expects, which can be either a $p \times n$ matrix of features, in which case the argument `x` gives that data, or an $n \times n$ matrix of dissimilarities, in which case the argument `diss` gives that matrix. Some algorithms can accept either type. To determine the `inputType` of one or more algorithms:

+```{r getInputType}
+inputType(c("kmeans","pam","hierarchicalK"))
+```
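Since these accessors accept a vector of function names, one can survey all of the built-in functions at once; a sketch (not run):

```{r surveyBuiltInsSketch,eval=FALSE}
# Sketch: tabulate the input type of every built-in clustering function
sapply(listBuiltInFunctions(), inputType)
```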
+**`algorithmType`** We group together algorithms that cluster based on common strategies that affect how we can use them in our workflow. Currently there are two "types" of algorithms we consider, which we call type "K" and "01". We can determine the type of a built-in function by the following:
+
+```{r getAlgorithmType}
+algorithmType(c("kmeans","hierarchicalK","hierarchical01"))
+```
+
+The "K" algorithms are so called because their main parameter requirement is that the user specifies the number of clusters ($K$) to be created, and they require an input of `k` to the clustering function. Built-in 'K' algorithms are:
+
+```{r builtInKFunctions}
+listBuiltInTypeK()
+```
+
+The "01" algorithms are so named because the algorithm assumes that the input is a matrix of *dissimilarities* between samples and that the dissimilarities encoded in $D$ are on a scale of 0-1. The clustering functions should use this fact to make the primary user-specified parameter be not the number of final clusters, but a measure $\alpha$ of how dissimilar samples in the same cluster can be (on a scale of 0-1). Given $\alpha$, the algorithm then implements a method to determine the clusters (so $\alpha$ implicitly determines $K$). These methods rely on the assumption that because the 0-1 scale has special significance, the user will be able to make a determination more easily as to the level of dissimilarity allowed in a true cluster, rather than predetermine the number of clusters $K$. The current 01 methods are:
+
+```{r builtIn01Functions}
+listBuiltInType01()
+```
+
+**`requiredArgs`** The different algorithm types correspond to different required arguments (`k` versus `alpha`). This is usually sorted out by `clusterMany`, which will only dispatch the appropriate one. Clustering functions can also have additional required arguments. See below for more discussion about how these arguments can be passed along to `clusterMany` or `RSEC`.
+
+To see all of the required arguments of a function:
+
+```{r requiredArgs}
+requiredArgs(c("hierarchical01","hierarchicalK"))
+```
+
+### Internal clustering procedures
+
+`clusterMany` iteratively calls a function `clusterSingle` over the collection of parameters. `clusterSingle` is the clustering workhorse, and may be used directly by a user who wants more fine-grained control; see the documentation of `clusterSingle`.
+
+Within each call of `clusterSingle`, there are three possible steps, depending on the values of `subsample` and `sequential`. If both are FALSE, then just a basic clustering routine is done on the input data (called the "main" clustering). If `subsample=TRUE`, there is first a step that subsamples and clusters the subsamples to calculate a co-occurrence matrix, and that is used as the input for the main clustering step. If `sequential=TRUE`, this process is iterated over and over again to iteratively select the best clusters (see `?seqCluster` for a detailed description). Each of these steps has a function that goes with it, but these should not generally be called by the user. However, the documentation of these functions can be useful.
+
+In particular, arguments to these functions that are not set by `clusterMany` can be passed via *named* lists: `subsampleArgs`, `mainClusterArgs`, and `seqArgs`. Some of the arguments to these steps can be varied in `clusterMany`, but more esoteric ones should be sent to these arguments of `clusterMany` (and they will be fixed across the parameter combinations tried in `clusterMany`).
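To make the role of these named lists concrete, here is a sketch (not run, modeled on the `clusterSingle` call shown later in this section) of a subsample-then-cluster call:

```{r clusterSingleSketch,eval=FALSE}
# Sketch: subsample with pam (k=8), then cluster the resulting
# co-occurrence-based dissimilarity with hierarchical01 (alpha=0.1)
clusterSingle(ce, subsample=TRUE, sequential=FALSE,
    subsampleArgs=list(clusterFunction="pam", clusterArgs=list(k=8)),
    mainClusterArgs=list(clusterFunction="hierarchical01",
        clusterArgs=list(alpha=0.1)))
```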
+**Main Clustering Step** (`mainClustering`) The main clustering step described above is done by the function `mainClustering`. In addition to the basic clustering algorithms on the input data, we also implement many other common cluster processing steps that are relevant to the result of the clustering. We have already seen such an example with dimensionality reduction, where the input $D$ is determined based on different input data. Many of the arguments to `mainClustering` are arguments to `clusterMany` as well, so that `mainClusterArgs` is usually not needed. The main exception would be to send more esoteric arguments to the underlying clustering function called in the main clustering step. The syntax for this would be to give a nested list to the argument `mainClusterArgs`:
+
+```{r mainClusterArgsSyntax,eval=FALSE}
+clusterMany(x,clusterFunction="hierarchicalK", ... , mainClusterArgs=list(clusterArgs=list(method="single") ))
+```
+
+Here we change the argument `method` of the clustering function `hclust`, which is called by the `hierarchicalK` function, to `"single"`.
+
+**Subsampling** (`subsampleClustering`) A more significant processing step that can be coupled with any clustering algorithm is to repeatedly subsample the data and cluster the subsampled data. This creates an $n \times n$ matrix $S$ that is a matrix of co-clustering percentages -- how many times two samples co-clustered together over the subsamples (there are slight variations in how this can be calculated; see the help pages of `subsampleClustering`). This does not itself give a clustering, but the resulting $S$ matrix can then form the basis for clustering the samples. Specifically, the matrix $D=1-S$ is then given as input to the main clustering step described above. The subsampling option is computationally expensive, and when coupled with comparing many parameters, does result in a lengthy evaluation of `clusterMany`. However, we recommend it as one of the most useful methods for getting stable clustering results.
+
+Note that the juxtaposition of these two steps (the subsampling and then feeding the results to the main clustering function) implies there are actually two different possible clustering algorithms (and sets of corresponding parameters) -- one for the clustering on the subsampled data, and one for the clustering of the resulting $D$ based on the percentage of coClustering of samples. This brings up a restriction on the clustering function in the main clustering step -- it needs to be able to handle input that is a dissimilarity (`inputType` is either `diss` or `either`). Furthermore, the user might want to set the clustering function and corresponding parameters separately for the two steps. The way that `clusterMany` handles this is that the main arguments of `clusterMany` focus on varying the parameters related to the main clustering step (the clustering of $D$ after subsampling). For this reason, the argument `clusterFunction` varies the clustering function used by the main clustering step, not the subsampling step. The clustering function of the subsampling step can be specified by the user via `subsampleArgs`, but in this case it is set for *all* calls of `clusterMany` and does not vary.
Alternatively, if the user doesn't specify the `clusterFunction` in `subsampleArgs`, then the default is to use the `clusterFunction` of the main clustering step along with any required arguments given by the user for that function (there are some cases where using the `clusterFunction` of the main step is not possible for the subsampling step, in which case the default is to use "pam").
+
+More generally, since few of the arguments to `subsampleClustering` are allowed to be varied by the direct arguments to `clusterMany`, it is also more common to want to change these arguments via the argument `subsampleArgs`. Examples might be `resamp.num` (the number of subsamples to draw) or `samp.p` (the proportion of samples to draw in each subsample) -- see `?subsampleClustering` for a full documentation of the possible arguments. In addition, there are arguments to be passed to the underlying clustering function; as for `mainClustering`, these arguments would be a nested list to the argument `subsampleArgs`.
+
+An example of the syntax that sets the arguments for `subsampleClustering` would be:
+
+```{r subsampleArgsSyntax,eval=FALSE}
+clusterMany(x,..., subsampleArgs=list(resamp.num=100,samp.p=0.5,clusterFunction="hierarchicalK", clusterArgs=list(method="single") ))
+```
+
+**Sequential Detection of Clusters** Another more complex addition that can be made to the main clustering step is the implementation of sequential clustering. This refers to clustering of the data, then removing the "best" cluster, and then re-clustering the remaining samples, and then continuing this iteration until all samples are clustered (or the algorithm in some other way calls a stop). Such sequential clustering can often be convenient when there is a very dominant cluster, for example, that is far away from the other mass of data. Removing samples in these clusters and resampling can sometimes be more productive and result in a clustering more robust to the choice of samples. A particular implementation of such a sequential method, based upon [@tseng2005], is implemented in the `clusterExperiment` package when the option `sequential=TRUE` is chosen (see `?seqCluster` for documentation of how the iteration is done). Sequential clustering can also be quite computationally expensive, particularly when paired with subsampling to determine $D$ at each step of the iteration.
+
+Because of the iterative nature of the sequential step, there are many possible parameters (see `?seqCluster`). As with the subsampling step, `clusterMany` does not allow variation of very many of these parameters, but they can be set via passing arguments in a named list to `seqArgs`. An example of the syntax that sets the arguments for `seqCluster` would be:
+
+```{r seqArgsSyntax,eval=FALSE}
+clusterMany(x,..., seqArgs=list(remain.n=10))
+```
+
+This code changes the `remain.n` option of the sequential step, which governs when the sequential step stops because there are not enough samples remaining.
-**Sequential Detection of Clusters** Another complicated addition to the clustering that requires additional explanation is the implementation of sequential clustering. This refers to clustering of the data, then removing the "best" cluster, and then re-clustering the remaining samples, and then continuing this iteration until all samples are clustered (or the algorithm in some other way calls a stop). Such sequential clustering can often be convenient when there is very dominant cluster, for example, that is far away from the other mass of data.
Removing samples in these clusters and resampling can sometimes be more productive and result in results more robust to the choice of samples. A particular implementation of such a sequential method, based upon [@tseng2005], is implemented in the `clusterExperiment` package. Because of the iterative nature of this process, there are many possible parameters (see `help(seqCluster)`). `clusterMany` does not allow variation of very many of these parameters, but instead just has the choice of running the sequential clustering or not. Sequential clustering can also be quite computationally expensive, particularly when paired with subsampling to determine $D$ at each step of the iteration.

### Arguments of `clusterMany`

-Now that we've explained the underlying architecture of the clustering provided in the package, we discuss the parameters that can be varied in `clusterMany`. There are additional arguments available for `clusterMany` but right now we focus on only the ones that can be given multiple options. Recall that arguments in `clusterMany` that take on multiple values mean that the combinations of all the multiple valued arguments will be given as input for a clustering routine.
+Now that we've explained the underlying architecture of the clustering provided in the package, and how to set the arguments that can't be varied, we discuss the parameters that *can* be varied in `clusterMany`. (There are a few additional arguments available for `clusterMany` that govern how `clusterMany` works, but right now we focus on only the ones that can be given multiple options.)
+
+Recall that arguments in `clusterMany` that take on multiple values mean that the combinations of all the multiple-valued arguments will be given as input for a clustering routine.

* `sequential` This parameter consists of logical values, TRUE and/or FALSE, indicating whether the sequential strategy should be implemented or not.
* `subsample` This parameter consists of logical values, TRUE and/or FALSE, indicating whether the subsampling strategy for determining $D$ should be implemented or not.
-* `clusterFunction` The clustering functions to be tried. If `subsample=TRUE` is part of the combination, then `clusterFunction` the method that will be used on the co-clustering matrix $D$ created from subsampling the data (where pam clustering is used on the subsampled data). Otherwise, `clusterFunction` is the clustering method that will be used on `dist(t(x))`
-* `ks` The argument 'ks' is interpreted differently for different choices of the other parameters. If `sequential=TRUE` is part of the combination, `ks` defines the argument `k0` of sequential clustering (see `help(seqCluster)`), which is approximately like the initial starting point for the number of clusters in the sequential process. Otherwise, `ks` is passed to set $K$ of both the clustering of subsampled data and the actual clustering of the data (if a $K$ needs to be set, i.e. a type "K" algorithm). When/if `findBestK=TRUE` is part of the combination, `ks` also defines the range of values to search for the best k (see the details in `help(clusterMany)` for more).
-* `dimReduce` These are character strings indicating what choices of dimensionality reduction should be tried. The choices are "PCA", indicating clustering on the top principal components, "var", indicating clustering on the top most variable features, and "none", indicating the whole data set should be used.
If either "PCA" or "var" are chosen, the following parameters indicate the number of such features to be used (and can be a vector of values as we have seen): +* `clusterFunction` The clustering functions to be tried in the *main clustering step*. Recall if `subsample=TRUE` is part of the combination, then `clusterFunction` the method that will be used on the matrix $D$ created from subsampling the data. Otherwise, `clusterFunction` is the clustering method that will be used directly on the data. +* `ks` The argument 'ks' is interpreted differently for different choices of the other parameters *and can differ from between parameter combinations!*. If `sequential=TRUE` is part of the parameter combination, `ks` defines the argument `k0` of sequential clustering (see `?seqCluster`), which is approximately like the initial starting point for the number of clusters in the sequential process. Otherwise, `ks` is passed to set `k` of both the main clustering step (and by default that of the subsampled data), and is only relevant if `clusterFunction` is of type "K". When/if `findBestK=TRUE` is part of the combination, `ks` also defines the range of values to search for the best k (see the details in the documentation of `clusterMany` for more). +* `dimReduce` These are character strings indicating what choices of dimensionality reduction should be tried. The choices are "PCA", indicating clustering on the top principal components, "var", indicating clustering on the top most variable features, and "none", indicating the whole data set should be used. If either "PCA" or "var" are chosen, the following parameters indicate the number of such features to be used (and can be a vector of values to try as we have seen): * `nVarDims` * `nPCADims` -* `distFunction` These are character values giving functions that provide a distance matrix between the samples, when applied to the data. These functions should be accessible in the global environment (`clusterMany` applies `get` to the global environment to access these functions). To make them compatible with the `dist` function, these functions should assume the samples are in the rows, i.e. they should work when applied to t(assay(ce)). We give an example in the next subsection below. -* `minSizes` these are integer values determining the minimum size required for a cluster (passed to the `clusterD` part of clustering). +* `distFunction` These are character values giving functions that provide a distance matrix between the samples, when applied to the data. These functions should be accessible in the global environment (`clusterMany` applies `get` to the global environment to access these functions). To make them compatible with the standard R function `dist`, these functions should assume the samples are in the rows, i.e. they should work when applied to t(assay(ce)). We give an example in the next subsection below. +* `minSizes` these are integer values determining the minimum size required for a cluster (passed to the `mainClustering` part of clustering). * `alphas` These are the $\alpha$ parameters for "01" clustering techniques; these values are only relevant if one of the `clusterFunction` values is a "01" clustering algorithm. The values given to `alphas` should be between 0 and 1, with smaller values indicating greater similarity required between the clusters. 
* `betas` These are the $\beta$ parameters for sequential clustering; these values are only relevant if `sequential=TRUE` and determine the level of stability required between changes in the parameters to determine that a cluster is stable.
* `findBestK` This option is for "K" clustering techniques, and indicates that $K$ should be chosen automatically as the $K$ that gives the largest silhouette distance between clusters.
-* `removeSil` A logical value as to whether samples with small silhouette distance to their assigned cluster are "removed", in the sense that they are not given their original cluster assignment but instead assigned -1. This option is for "K" clustering techniques as a method of removing poorly clustered samples (the "01" techniques used by `clusterMany` generally do this intrinsically as part of the algorithm).
+* `removeSil` A logical value as to whether samples with small silhouette distance to their assigned cluster are "removed", in the sense that they are not given their original cluster assignment but instead assigned -1. This option is for "K" clustering techniques as a method of removing poorly clustered samples.
* `silCutoff` If `removeSil` is TRUE, then `silCutoff` determines the cutoff on silhouette distance for unassigning the sample.

-`clusterMany` tries to have generally simple interface, and for this reason makes choices about what is meant by certain combinations. For example, in combinations where `findBestK=TRUE`, `ks=2:10` is taken to mean that the clustering should find the best $k$ out of the range of 2-10. However, in other combinations `ks` might indicate the specific number of clusters, $k$, that should be found. For parameter combinations that are not what is desired, the user should consider making direct calls to `clusterSingle` where all of these options (and many more) can be explicitly called.
+`clusterMany` tries to have a generally simple interface, and for this reason makes choices about what is meant by certain combinations of parameters. For example, in combinations where `findBestK=TRUE`, `ks=2:10` is taken to mean that the clustering should find the best $k$ out of the range of 2-10. However, in other parameter combinations where `findBestK=FALSE`, the same `ks` might indicate the specific number of clusters, $K$, that should be found. To see the parameter choices that will be run, the user can set `run=FALSE` and the output will be a matrix of the parameter values indicated by the choices of the user. For parameter combinations that are not what is desired, the user should consider making direct calls to `clusterSingle`, where all of these option combinations (and many more) can be explicitly called.
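As an illustration, here is a sketch (not run) of a call varying several of these parameters at once, in the style of the `clusterMany` calls used elsewhere in this vignette:

```{r clusterManyVarySketch,eval=FALSE}
# Sketch: vary the dimensionality reduction and alpha for a "01" function;
# clusterMany runs one clustering per combination of parameter values
clusterMany(ce, clusterFunction="hierarchical01", alphas=c(0.2,0.3),
    dimReduce=c("PCA","var"), nPCADims=c(5,15), nVarDims=c(100,500),
    subsample=FALSE, sequential=FALSE, isCount=TRUE)
```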
-Other parameters for the clustering are kept fixed.
As described above, there are many more possible parameters in play than are considered in `clusterMany`. These parameters can be set via the arguments `mainClusterArgs`, `subsampleArgs` and `seqArgs`. These arguments correspond to the different processes described above (the main clustering step, the creation of $D$ to be clustered via subsampling, and the sequential clustering process, respectively). These arguments take a list of arguments that are sent directly to `clusterSingle`. However, these arguments may be overridden by `clusterMany`'s interpretation of how different combinations interact; again, for complete control, direct calls to `clusterSingle` are necessary.

```{r tableArguments, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'}
tabl <- "
Argument| Dependencies | Passed to | Argument passed to
---------------|-----------------|:-------------:|------:|
ks | sequential=TRUE | seqCluster | k0
-- | sequential=FALSE, findBestK=FALSE, clusterFunction of type 'K' | clusterD | k
+- | sequential=FALSE, findBestK=FALSE, clusterFunction of type 'K' | mainClustering | k
- | sequential=FALSE, findBestK=FALSE, subsample=TRUE | subsampleClustering | k
-- | sequential=FALSE, findBestK=TRUE, clusterFunction of type 'K' | clusterD | kRange
+- | sequential=FALSE, findBestK=TRUE, clusterFunction of type 'K' | mainClustering | kRange
dimReduce | none | transform | dimReduce
nVarDims | dimReduce in 'mad','cv','var' | transform | nVarDims
nPCADims | dimReduce='PCA' | transform | nPCADims
-clusterFunction| none | clusterD | clusterFunction
-minSizes | none | clusterD | minSize
-distFunction | subsample=FALSE | clusterD | distFunction
-alphas | clusterFunction of type '01'| clusterD | alpha
-findBestK | clusterFunction of type 'K' | clusterD | findBestK
-removeSil | clusterFunction of type 'K' | clusterD | removeSil
-silCutoff | clusterFunction of type 'K' | clusterD | silCutoff
+clusterFunction| none | mainClustering | clusterFunction
+minSizes | none | mainClustering | minSize
+distFunction | subsample=FALSE | mainClustering | distFunction
+alphas | clusterFunction of type '01'| mainClustering | alpha
+findBestK | clusterFunction of type 'K' | mainClustering | findBestK
+removeSil | clusterFunction of type 'K' | mainClustering | removeSil
+silCutoff | clusterFunction of type 'K' | mainClustering | silCutoff
betas | sequential=TRUE | seqCluster | beta
"
cat(tabl) # output the table in a format good for HTML/PDF/docx conversion
```

@@ -611,18 +732,20 @@ spearDist<-function(x){(1-cor(t(x),method="spearman"))/2}
These distances are defined so as to give distance of 0 between samples with correlation 1, and distance of 1 for correlation -1.

-We will also compare using different algorithms for clustering. Since we chose distances between 0-1, we can use any algorithm. Currently, `clusterMany` requires that the distances work with all of the `clusterFunction` choices given.
+We will also compare using different algorithms for clustering. Currently, `clusterMany` requires that the distances work with all of the `clusterFunction` choices given. Since some of the `clusterFunction` algorithms require a distance matrix between 0-1, this means we can only compare all of the algorithms when the distance is a 0-1 distance. (Future versions may try to create a workaround so that the algorithm just skips algorithms that don't match the distance.) Since the distances we defined are between 0-1, however, we can use any algorithm that takes dissimilarities as input.
+
+**Note on 0-1 clustering when `subsample=FALSE`** We would note that the default values of $\alpha$ in `clusterMany` and `RSEC` for the 0-1 clustering were set with the distance $D$ being the result of subsampling or another consensus summary in mind. In general, subsampling creates a $D$ matrix with high similarity for many samples who share a cluster (the proportion of times samples are seen together for well clustered samples can easily be in the .8-.95 range, or even exactly 1). For this reason the default $\alpha$ is 0.1, which requires distances between samples in the 0.1 range or less (i.e. a similarity in the range of 0.9 or more).
-**Note on 0-1 clustering when `subsample=FALSE`** We would note that the default values $\alpha$ for the 0-1 clustering were set with the distance $D$ the result of subsampling or other concensus summary in mind. In generally, subsampling creates a $D$ matrix with high similarity for many samples who share a cluster (the proportion of times samples are seen together for well clustered samples can easily be in the .8-.95 range, or even exactly 1). For this reason the default $\alpha$ is 0.1 which requires distances between samples in the 0.1 range or less (i.e. a similarity in the range of 0.9 or more). We show an example of the $D$ matrix from subsampling; we make use of the `clusterSingle` which is the workhorse mentioned above that runs a single clustering command directly, which gives the output $D$ from the sampling in the "coClustering" slot of `ce`. Note that the result is $1-p_{ij}$ where $p_{ij}$ is the proportion of times sample $i$ and $j$ clustered together.
+To illustrate this point, we show an example of the $D$ matrix from subsampling. To do this we make use of `clusterSingle`, which is the workhorse mentioned above that runs a single clustering command directly; it gives the output $D$ from the sampling in the "coClustering" slot of `ce` when we set `replaceCoClustering=TRUE` (and therefore we save it as a separate object, so that it doesn't write over the existing "coClustering" slot in `ce`). Note that the result is $1-p_{ij}$ where $p_{ij}$ is the proportion of times sample $i$ and $j$ clustered together.

```{r visualizeSubsamplingD}
-ceSub<-clusterSingle(ce,dimReduce="mad",ndims=1000,subsample=TRUE,clusterFunction="hierarchical01",subsampleArgs=list(k=8),clusterLabel="subsamplingCluster",clusterDArgs=list(minSize=5))
+ceSub<-clusterSingle(ce,dimReduce="mad",ndims=1000,subsample=TRUE,subsampleArgs=list(clusterFunction="pam",clusterArgs=list(k=8)),clusterLabel="subsamplingCluster",mainClusterArgs=list(clusterFunction="hierarchical01",clusterArgs=list(alpha=0.1),minSize=5), replaceCoClustering=TRUE)
plotCoClustering(ceSub,colorScale=rev(seqPal5))
```

-We see even here, the default of $\alpha=0.1$ was perhaps too conservative since only two clusters came out (with size greater than 5).
+We see that even here the default of $\alpha=0.1$ was perhaps too conservative, since only two clusters came out (at least with size greater than 5).
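If one wants the co-clustering proportions themselves rather than the dissimilarity, they can be recovered from the stored matrix; a sketch (not run, assuming the accessor `coClustering` for that slot):

```{r coClusterPropSketch,eval=FALSE}
# Sketch: D = 1 - p, so invert to recover the proportion of times
# each pair of samples clustered together across the subsamples
pMat <- 1 - as.matrix(coClustering(ceSub))
```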
-The distances based on correlation calculated directly on the data, such as we created above, are often used for clustering expression data. But they are unlikely to have distances as low as seen in subsampling, even for well clustered samples. Here's a visualization of the correlation distance matrix we defined above (using Spearman's correlation) on the top 1000 most variable features:
+However, the distances based on correlation calculated directly on the data, such as those we created above, are also often used for clustering expression data directly (i.e. without the subsampling step). But they are unlikely to have dissimilarities as low as seen in subsampling, even for well-clustered samples. Here's a visualization of the correlation distance matrix we defined above (using Spearman's correlation) on the top 1000 most variable features:

```{r visualizeSpearmanDist}
dSp<-spearDist(t(transform(ce,dimReduce="mad",nVarDims=1000)))
@@ -648,7 +771,7 @@ par(mar=c(1.1,15.1,1.1,1.1))
plotClusters(ceDist,axisLine=-2,sampleData=c("Biological_Condition"))
```

-Notice that using the 01 methods did not give relevant results
+Notice that using the "tight" methods did not give relevant results (no samples were clustered).

### Dealing with large numbers of clusterings

@@ -674,11 +797,11 @@ We can set `ncores` argument to have these clusterings done in parallel. If `nco
We can now run this, either by giving the information in `checkParam$paramMatrix` to clusterMany argument `paramMatrix`, or by recalling the function from scratch. If the user has not changed `paramMatrix`, there's no advantage in giving `paramMatrix` to `clusterMany` rather than just recalling `clusterMany`, but we'll do it here just to show how it is done.

```{r clusterManyCheckParam2,eval=FALSE}
-# ce<-clusterMany(se, paramMatrix=checkParam$paramMatrix, clusterDArgs=checkParam$clusterDArgs, seqArgs=checkParam$seqArgs,subsampleArgs=checkParam$subsampleArgs)
+# ce<-clusterMany(se, paramMatrix=checkParam$paramMatrix, mainClusterArgs=checkParam$mainClusterArgs, seqArgs=checkParam$seqArgs,subsampleArgs=checkParam$subsampleArgs)
ce<-clusterMany(ce, clusterFunction="pam",ks=2:10,findBestK=TRUE,removeSil=c(TRUE), isCount=TRUE,dimReduce=c("PCA","var"),nVarDims=c(100,500,1000),nPCADims=c(5,15,50),run=TRUE)
```

-Note that we also provided in the above call the additional arguments `clusterDArgs`, `seqArgs` and `subsampleArgs` which normally we might neglect with a direct call to `clusterMany`. This is because in creating the `paramMatrix`, `clusterMany` may internally change these default values, and we want to make sure we exactly replicate what we would get from a direct call.
+Note that we also provided in the above call the additional arguments `mainClusterArgs`, `seqArgs` and `subsampleArgs`, which normally we might neglect with a direct call to `clusterMany`. This is because in creating the `paramMatrix`, `clusterMany` may internally change these default values, and we want to make sure we exactly replicate what we would get from a direct call.
-->

## Create a unified cluster from many clusters with `combineMany` {#combineMany}

@@ -750,11 +873,12 @@ This is the clustering from combining only the clusterings from `clusterMany` th
We might prefer to get back to the dendrogram based on our `combineMany` in quick start (the "combineMany, final" clustering). We lost that dendrogram when we called `makeDendrogram` again. However, we can rerun `makeDendrogram` and choose a different clustering from which to make the dendrogram.
+To vary the display, we'll put blocks of color instead of names (`labelType="colorblock"`) and also show the leaves as the individual samples instead of the clusters (`leafType="sample"`); the combination gives us color blocks whose sizes reflect the sizes of the clusters.

```{r remakeMakeDendrogram}
ce<-makeDendrogram(ce,dimReduce="var",ndims=500,
    whichCluster="combineMany,final")
-plotDendrogram(ce)
+plotDendrogram(ce,leafType="sample",labelType="colorblock")
```

Note that the clusterType of this clustering is not "combineMany", but "combineMany.x", where "x" indicates what iteration it was:

@@ -774,7 +898,7 @@ show(ce)
### Merging clusters with little differential expression {#mergeClusters}

We then can use this hierarchy of clusters to merge clusters that show little difference in expression. We do this by testing, for each node of the dendrogram and each feature, whether the mean of the set of clusters on the right split of the node is equal to the mean on the left split. This is done via `getBestFeatures` (see section on [getBestFeatures](#getBestFeatures)), where the `type` argument is set to "Dendro".

-Starting at the bottom of the tree, those clusters that have the percentage of features with differential expression below a certain value (determined by the argument `cutoff`) are merged into a larger cluster. This testing of differences and merging continues until the estimated percentage of non-null DE features is above `cutoff`. This means lower values of `cutoff` result in less merging of clusters. There are multiple methods of estimation of the percentage of non-null features implemented. The option `mergeMethod="adjP"` which we showed earlier is the simplest: the proportion found significant by calculating the proportion of DE genes at a given False Discovery Rate threshold (using the Benjamini-Hochberg procedure). However, other methods are also implemented (see the help of `mergeClusters`).
+Starting at the bottom of the tree, those clusters that have the percentage of features with differential expression below a certain value (determined by the argument `cutoff`) are merged into a larger cluster. This testing of differences and merging continues until the estimated percentage of non-null DE features is above `cutoff`. This means lower values of `cutoff` result in less merging of clusters. There are multiple methods of estimation of the percentage of non-null features implemented. The option `mergeMethod="adjP"` which we showed earlier is the simplest: the proportion found significant by calculating the proportion of DE genes at a given False Discovery Rate threshold (using the Benjamini-Hochberg procedure). However, other more sophisticated methods are also implemented (see the help of `mergeClusters`).

Notice that `mergeClusters` will always run based on the clustering that made the currently existing dendrogram. So it is always good to check that it is what we expect.

@@ -782,10 +906,10 @@
ce
-`mergeClusters` can also be run without merging the cluster, and simply drawing a plot showing the dendrogram along with the estimates of the percentage of non-null features to aid in deciding a cutoff and method. By setting `plotType="all"`, all of the estimates of the different methods are displayed simultaneously, while before in the QuickStart, we only showed the default values.
+`mergeClusters` can also be run without merging the clusters, simply drawing a plot showing the dendrogram along with the estimates of the percentage of non-null features, to aid in deciding a cutoff and method. By setting `plotInfo="all"`, all of the estimates of the different methods are displayed simultaneously, while before, in the QuickStart, we only showed the default values.

```{r mergeClusters_plot,fig.width=12}
-mergeClusters(ce,mergeMethod="none",plotType="all")
+mergeClusters(ce,mergeMethod="none",plotInfo="all")
```

Now we can pick a cutoff. We'll give it a label to keep it separate from the previous run we had made.

@@ -804,6 +928,8 @@ ce<-mergeClusters(ce,cutoff=0.15,mergeMethod="MB",
ce
+Note that we can turn off plotting completely by setting `plot=FALSE`.
+
## Keeping track of and rerunning elements of the workflow {#rerun}

The commands we have shown above show a workflow which continually saves the results over the previous object, so that additional information just gets added to the existing object.

@@ -863,7 +989,7 @@ A cleaner way to do this would have been to first set the desired cluster ("merg
The clustering workflow described so far is a generalization of our RSEC algorithm for single-cell sequencing data. The RSEC algorithm is particularly oriented around using subsampling and sequential discovery of clusters to find very robust signals.

-In particular, `RSEC` is a single function that follows the entire workflow described above, but makes the choices to set `subsample=TRUE` and `sequential=TRUE`. Furthermore, the only clustering functions that are allowed are the "01" types ("hierarchical01" and "tight"). This removes a number of options from clusterMany, making for a slightly reduced set of commands. `RSEC` also implements the `combineMany`, `makeDendrogram` and `mergeClusters` steps, again with not all arguments available to be set. Furthermore, the defaults set in `RSEC` are those we choose for our algorithm, and occassionally vary from stand-alone method. The final output is a `clusterExperiment` object as you would get from following the workflow.
+In particular, `RSEC` is a single function that follows the entire workflow described above, but makes the choices to set `subsample=TRUE` and `sequential=TRUE`. Furthermore, the only clustering functions that are allowed are the "01" types (currently "hierarchical01" and "tight"). This removes a number of options from `clusterMany`, making for a slightly reduced set of arguments. `RSEC` also implements the `combineMany`, `makeDendrogram` and `mergeClusters` steps, again with not all arguments available to be set. Furthermore, the defaults set in `RSEC` are those we choose for our algorithm, and occasionally vary from the stand-alone methods. The final output is a `clusterExperiment` object as you would get from following the workflow.
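As a sketch (not run) of what a direct call might look like, with `k0s` and `alphas` playing the roles shown in the correspondence table below:

```{r rsecSketch,eval=FALSE}
# Sketch: RSEC wraps clusterMany, combineMany, makeDendrogram and
# mergeClusters in a single call
rsecOut <- RSEC(se, isCount=TRUE, k0s=4:10, alphas=c(0.1,0.2,0.3))
```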
We give the following correspondence to help see what arguments of each component are fixed by RSEC, and which are allowed to be set by the user (as well as their correspondence to arguments in the workflow functions).

@@ -881,7 +1007,7 @@ tabl <- "
- | | alphas | |
- | | betas | |
- | | minSizes | |
-- | | clusterDArgs | |
+- | | mainClusterArgs | |
- | | subsampleArgs | |
- | | seqArgs | |
- | | run | |
@@ -896,7 +1022,7 @@
- | ignoreUnassignedVar=TRUE | dendroReduce | dimReduce |
- | unassignedSamples= *(default)* | dendroNDims | ndims |
*mergeClusters* | | | |
-- | plotType='none' | mergeMethod | |
+- | plot=FALSE | mergeMethod | |
- | | mergeCutoff | cutoff |
- | | isCount | | used for both mergeMethod and clusterMany
"
cat(tabl) # output the table in a format good for HTML/PDF/docx conversion

@@ -910,7 +1036,7 @@
The function `getBestFeatures` finds features in the data that are strongly differentiated between the clusters of a given clustering. Finding the best features is generally the last step in the workflow, once a final clustering has been decided upon, though as we have seen it is also called internally in `mergeClusters` to decide between which clusters to merge together.

-The function `getBestFeatures` calls `limma` on input data to determine the gene features most associated with a particular clustering. `getBestFeatures` picks the `primaryCluster` of a `ClusterExperiment` object as the clustering to use to find features. If the standard workflow is followed, this will be the last completed step (usually the result of `mergeClusters` or manually choosing a final cluster via `setToFinal`). The primaryCluster can of course be changed by setting `primaryClusterIndex` to point to a different clustering.
+The function `getBestFeatures` calls `limma` [@Smyth:2004gh; @Ritchie:2015fa] on input data to determine the gene features most associated with a particular clustering. `getBestFeatures` picks the `primaryCluster` of a `ClusterExperiment` object as the clustering to use to find features. If the standard workflow is followed, this will be the last completed step (usually the result of `mergeClusters` or manually choosing a final cluster via `setToFinal`). The primaryCluster can of course be changed by setting `primaryClusterIndex` to point to a different clustering.

Since our most recent clustering (the one determined by our `setToFinal`) only has 2 clusters, we are going to reset the primary clustering to be our result from `combineMany`, with the label "mergeClusters,v3". This will be better for explaining the functionality of the `getBestFeatures` method.

@@ -993,7 +1119,7 @@ plotDendrogram(ce,show.node.label=TRUE)
## Analysis for count and other RNASeq data

-The `getBestFeatures` method for `ClusterExperiment` objects has an argument `isCount`. If this is marked `TRUE` then the data in `assay(x)` is assumed to be counts, and the call to `limma` uses the `voom` correction. This correction deals with the mean-variance relationship that is found with count data. This means that the differential expression analysis is done on $log_2(x+0.5)$. This is *regardless of what transformation is stored in the `ClusterExperiment` object*! The `voom` call within `getBestFeatures` however, is by default set to `normalize.method = "none"` in the call to `voom` (though the user can set `normalize.method` in the call to `getBestFeatures`).
+The `getBestFeatures` method for `ClusterExperiment` objects has an argument `isCount`. If this is marked `TRUE` then the data in `assay(x)` is assumed to be counts, and the call to `limma` uses the `voom` [@Law:2014ff] correction. This correction deals with the mean-variance relationship that is found with count data.
This means that the differential expression analysis is done on $log_2(x+0.5)$. This is *regardless of what transformation is stored in the `ClusterExperiment` object*! The `voom` call within `getBestFeatures` however, is by default set to `normalize.method = "none"` in the call to `voom` (though the user can set `normalize.method` in the call to `getBestFeatures`). If instead `isCount=FALSE`, then `limma` is performed on `transform(x)`, i.e. the data after transformation of the data with the transformation stored in the `ClusterExperiment` object. In this case, there is no `voom` correction. @@ -1001,7 +1127,7 @@ Unlike edgeR or DESeq, the voom correction does not explicitly require a count m ## Piping into other DE routines -Ultimately, for many settings, the user may prefer to use other techniques for differential expression analysis or have more control over certain aspects of it. The function `clusterContrasts` may be called by the user to get the contrasts that are defined within `getBestFeatures` (e.g. dendrogram contrasts or pairwise contrasts). These contrasts, which are in the format needed for `limma` can be piped into programs that allow for contrasts in their linear models like edgeR [@Robinson:2010cw] for mRNA-Seq or MAST [@Finak:2015id] for single-cell sequencing. +Ultimately, for many settings, the user may prefer to use other techniques for differential expression analysis or have more control over certain aspects of it. The function `clusterContrasts` may be called by the user to get the contrasts that are defined within `getBestFeatures` (e.g. dendrogram contrasts or pairwise contrasts). These contrasts, which are in the format needed for `limma` can be piped into programs that allow for contrasts in their linear models like edgeR [@Robinson:2010cw] for mRNA-Seq; they can also be chosen to be returned in the formated needed by MAST [@Finak:2015id] for single-cell sequencing by settting `outputType="MAST"`. Similarly, more complicated normalizations, like RUV [@GagnonBartsch:2011jv], adjust each gene individually for unwanted batch or other variation within the linear model. In this case, a matrix $W$ that describes this variation should be included in the linear model. Again, this can be done in other programs, using the contrasts provided by `clusterContrasts` diff --git a/vignettes/clusterExperimentTutorial.html b/vignettes/clusterExperimentTutorial.html index cf8aa2c6..7529ce90 100644 --- a/vignettes/clusterExperimentTutorial.html +++ b/vignettes/clusterExperimentTutorial.html @@ -10,7 +10,7 @@ - + clusterExperiment Vignette @@ -51,14 +51,14 @@

 1 Introduction
 
-The goal of this package is to encourage the user to try many different clustering algorithms in one package structure. We give tools for running many different clusterings and choices of parameters. We also provide visualization to compare many different clusterings and algorithm tools to find common shared clustering patterns. We implement common post-processing steps unrelated to the specific clustering algorithm (e.g. subsampling the data for stability, finding cluster-specific markers via differential expression, etc).
-
-The other main goal of this package is to implement strategies that we have developed for finding a single robust clustering based on the many clusterings that the user might create by perturbing various parameters of a clustering algorithm. There are several steps to these strategies that we call our standard clustering workflow. Our RSEC algorithm (Resampling-based Sequential Ensemble Clustering) is our preferred realization of this workflow that depends on subsampling on and other ensembl methods to provide robust clusterings, particularly for single-cell sequencing experiments and other large mRNA-Seq experiments.
-
-We also provide a class clusterExperiment that inherits from SummarizedExperiment to store the many clusterings and related information.
-
-All of our methods also have a barebones version that allows input of matrices and greater control. This comes at the expense of the user having to manage and keep track of the clusters, input data, transformation of the data, etc. We do not discuss these barebone versions in this tutorial. Instead, we focus on using SummarizedExperiment object as the input and working with the resulting ClusterExperiment object. See the help pages of each method for more on how to allow for matrix input.
+The goal of this package is to encourage the user to try many different clustering algorithms in one package structure, and we provide strategies for creating a unified clustering from these many clustering results. We give tools for running many different clusterings and choices of parameters. We also provide visualization to compare many different clusterings and algorithm tools to find common shared clustering patterns. We implement common post-processing steps unrelated to the specific clustering algorithm (e.g. subsampling the data for stability, finding cluster-specific markers via differential expression, etc.).
+
+The other main goal of this package is to implement strategies that we have developed in the RSEC algorithm (Resampling-based Sequential Ensemble Clustering) for finding a single robust clustering based on the many clusterings that the user might create by perturbing various parameters of a clustering algorithm. There are several steps to these strategies that we call our standard clustering workflow. The RSEC function is our preferred realization of this workflow; it depends on subsampling and other ensemble methods to provide robust clusterings, particularly for single-cell sequencing experiments and other large mRNA-Seq experiments.
+
+We also provide a class ClusterExperiment that inherits from SummarizedExperiment to store the many clusterings and related information, and a class ClusterFunction that encodes a clustering routine in a standardized way so that it can interact with our clustering workflow algorithms.
+
+All of our methods also have a barebones version that allows input of matrices and greater control. This comes at the expense of the user having to manage and keep track of the clusters, input data, transformation of the data, etc. We do not discuss these barebones versions in this tutorial. Instead, we focus on using the SummarizedExperiment object as the input and working with the resulting ClusterExperiment object. See the help pages of each method for more on how to allow for matrix input.
 
 Although this package was developed with (single-cell) RNA-seq data in mind, its use is not limited to RNA-seq or even to gene expression data. Any dataset characterized by high dimensionality could benefit from the methods implemented here.
 
-1.1 The clustering workflow
+1.1 The RSEC clustering workflow

        The package encodes many common practices that are shared across clustering algorithms, like subsampling the data, computing silhouette width, sequential clustering procedures, and so forth. It also provides novel strategies that we developed as part of the RSEC algorithm.

        As mentioned above, RSEC is a specific algorithm for creating a clustering that follows these basic steps:

@@ -142,17 +144,17 @@ 1.1 The clustering workflow
 
 The basic premise of RSEC is to find small, robust clusters of samples, and then merge them into larger clusters as relevant. We find that many algorithmic methods for choosing the appropriate number of clusters err on the side of too few clusters. However, we find in practice that we tend to prefer to err on finding many clusters and then merging them based on examining the data.
 
-RSEC makes many specific choices in this basic workflow, and many steps of this workflow are useful separately from RSEC. For this reason, the clusterExperiment package generalizes this workflow so that the user can use these tools with their own choices. We call this generalization the clustering workflow, as oppose to the specific choices set in RSEC. We will introduce this workflow in its general setting, and only ing specific sections devoted to the RSEC function will discussed the RSEC algorithm. For this reason, the examples shown here use simple clustering choices in the workflow to save on computational overhead; RSEC uses extensive subsampling techniques and takes much longer to run than is practical for a vignette.
+RSEC makes many specific choices in this basic workflow, while many steps of this workflow are useful for users separately from RSEC. For this reason, the clusterExperiment package generalizes this workflow so that the user can follow this workflow with their own choices. We call this generalization the clustering workflow, as opposed to the specific choices set in RSEC. We will introduce this workflow in its general setting, and only in the specific sections devoted to the RSEC function will we discuss the RSEC algorithm. For this reason, the examples shown here use simple clustering choices in the workflow to save on computational overhead; our preferred choices encoded in RSEC use extensive subsampling techniques and take much longer to run than is practical for a vignette.
 
 Users can also run or add their own clusters to this workflow at different stages. Additional functionality for clustering is available in the clusterSingle function, and the user should see the documentation in the help page of that function. However, there is special functionality to make it easy to quickly visualize and manage the results of this workflow.
 
 1.1.1 The RSEC Routine
 
-The RSEC algorithm (Resampling-based Sequential Ensemble Clustering) was developed specifically for finding robust clusters in single cell sequencing data. It follows our above suggested workflow, but makes specific choices along the way that we find to be advantageous in clustering large mRNA expression datasets, like those of single cell sequencing. It is implemented in the function RSEC. This vignette serves to show how the pieces of the workflow operate, and use choices that are shown are not those recommended by RSEC. This is both to show the flexibility of the functions and because RSEC is computationally more time-intensive, since it is based on both resampling, and sequential discovery of clusters.
+The RSEC algorithm (Resampling-based Sequential Ensemble Clustering) was developed specifically for finding robust clusters in single cell sequencing data. It follows our above suggested workflow, but makes specific choices along the way that we find to be advantageous in clustering large mRNA expression datasets, like those of single cell sequencing. It is implemented in the function RSEC. This vignette serves to show how the pieces of the workflow operate, and the choices that are shown are not those recommended by RSEC. We choose to show alternatives to the choices in RSEC both to demonstrate the flexibility of the functions and because RSEC is computationally more time-intensive, since it is based on both resampling and sequential iteration of the subsampling routines to discover clusters.

1.3 Visualization Tools

@@ -161,7 +163,7 @@ 1.3 Visualization Tools
 
 2 Quickstart
 
-We will go quickly through the standard steps of clustering using the clusterExperiment package, before turning to more details. The standard workflow we envision is the following:
+We will go quickly through the standard workflow steps of clustering using the clusterExperiment package, before turning to more details. The standard workflow we envision is the following:

  • clusterMany – run desired clusterings
  • combineMany – get a unified clustering
  • makeDendrogram – create a hierarchy between the clusters
  • mergeClusters – merge together clusters with little DE between their genes.
  • getBestFeatures – Find Features that are differential between the final clustering.
 
+The first four steps (the clustering steps) are done in one function call by RSEC, which is the preferred usage. However, to understand the parameters available in RSEC it is useful to go through the steps first.

2.1 Data example

We will make use of a single cell RNA sequencing experiment made available in the scRNAseq package.

@@ -224,10 +227,10 @@ 2.2 Step 0: Filtering and normali
 
 2.3 Step 1: Clustering with clusterMany
 
-clusterMany lets the user quickly pick between many clustering options and run all of the clusterings in one single command. In the quick start we pick a simple set of clusterings based on varying the dimensionality reduction options. The way to designate which options to vary is to give multiple values to an argument. Due to a bug in R, we need to set getClass.msg=FALSE or otherwise a slew of annoying warnings will spit out at every call; this should be fixed in the next patch to R.
+clusterMany lets the user quickly pick between many clustering options and run all of the clusterings in one single command. In the quick start we pick a simple set of clusterings based on varying the dimensionality reduction options. The way to designate which options to vary is to give multiple values to an argument.

Here is our call to clusterMany:

library(clusterExperiment)
-ce<-clusterMany(se, clusterFunction="pam",ks=5:10,
+ce<-clusterMany(se, clusterFunction="pam",ks=5:10, minSizes=5,
       isCount=TRUE,dimReduce=c("PCA","var"),nVarDims=c(100,500,1000),
       nPCADims=c(5,15,50),run=TRUE)

In this call to clusterMany we made the following choices about what to vary:

@@ -240,32 +243,41 @@ 2.3 Step 1: Clustering with
 
 By giving only a single value to the relevant argument, we keep the other possible options fixed; for example:
 
  • we used ‘pam’ for all clustering (clusterFunction)
- • we left the default of minSize which requires clusters to be of size at least 5; smaller clusters will be ignored and samples assigned to them given the unassigned value of -1.
+ • we set minSizes=5. This argument allows the user to set a minimum required cluster size; clusters of size less than that value will be ignored, and the samples assigned to them are given the unassigned value of -1. The default of 1 means that this option is not used.

We also set isCount=TRUE to indicate that our input data are counts. This means that operations for clustering and visualizations will internally transform the data as \(log_2(x+1)\). (We could have alternatively set a transformation explicitly by giving a function to the transFun argument, for example if we wanted \(\sqrt{x}\) or \(log(x+\epsilon)\) or just the identity.)
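A minimal sketch of that alternative (ce2 is a hypothetical object; transFun replaces the isCount shortcut and the rest of the call mirrors the one shown above):

# Assumed alternative: supply a custom transformation instead of isCount=TRUE
ce2 <- clusterMany(se, clusterFunction="pam", ks=5:10, minSizes=5,
                   transFun=function(x){ sqrt(x) },
                   dimReduce=c("PCA","var"), nVarDims=c(100,500,1000),
                   nPCADims=c(5,15,50), run=TRUE)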

-We can visualize the resulting clusterings using the plotClusters command. It is also useful to change the amount of space to allow for the labels of the clusterings, so we will reset the mar option in par (we also change axisLine argument for this reason).
+We can visualize the resulting clusterings using the plotClusters command. For this visualization, it is useful to change the amount of space on the left of the plot to allow for the labels of the clusterings, so we will reset the mar option in par. We also decrease the axisLine argument, which determines the amount of space between the axis and the labels (axisLine is passed internally to the line option in axis).

defaultMar<-par("mar")
 plotCMar<-c(1.1,8.1,4.1,1.1)
 par(mar=plotCMar)
 plotClusters(ce,main="Clusters from clusterMany", whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"), axisLine=-1)
This plot shows the samples in the columns, and different clusterings on the rows. Each sample is color coded based on its clustering for that row, where the colors have been chosen to try to match up clusters across different clusterings that show large overlap. Moreover, the samples have been ordered so that each subsequent clustering (starting at the top and going down) will try to order the samples to keep the clusters together, without rearranging the clustering blocks of the previous clustering/row.

-Notice that we also added the sampleData argument in our call, indicating that we also want to visualize some information about the samples saved in the colData slot (inherited from our original fluidigm object). We chose the columns “Biological_Condition” and “Cluster2” from colData, which correspond to the original biological condition of the experiment, and the clusters reported in the original paper, respectively. These are shown at the bottom of the plot.
+Notice that we also added the sampleData argument in our call, indicating that we also want to visualize some information about the samples saved in the colData slot (inherited from our original fluidigm object). We chose the columns “Biological_Condition” and “Cluster2” from colData, which correspond to the original biological condition of the experiment, and the clusters reported in the original paper, respectively. The data from sampleData are always shown at the bottom of the plot.

Notice that some samples are white. This indicates that they have the value -1, meaning they were not clustered. This results from our choice to require at least 5 samples to make a cluster.

We can see that some clusters are fairly stable across different choices of dimensions while others can vary dramatically.

-We have set whichClusters="workflow" to only plot clusters created from the workflow. Right now that’s all there are anyway, but as commands get rerun with different options, other clusterings can build up in the object (see discussion in this section about how multiple calls to workflow are stored). So setting whichClusters="workflow" means that we are only going to see our most recent calls to the workflow.
-
-The labels shown are those given by clusterMany but can be a bit much for plotting. We can assign new labels if we prefer, for example to be more succinct, by changing the clusterLabels of the object (note we are permanently changing the labels here within the ClusterExperiment object). We choose to remove “Features” as being too wordy:
+We have set whichClusters="workflow" to only plot clusters created from the workflow. Right now that’s all there are anyway, but as commands get rerun with different options, other clusterings can build up in the object (see discussion in this section about how multiple calls to workflow are stored). So setting whichClusters="workflow" means that we are only going to see our most recent calls to the workflow. whichClusters can also be set to limit the plot to specific clusterings or to specific steps of the workflow (so far we only have one step, the clusterMany step).
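A quick sketch of those options (both forms are described further in the visualization section; plots not shown):

# Limit to a particular step of the workflow by its clusterType
plotClusters(ce, whichClusters="clusterMany")
# Or give explicit indices into the columns of clusterMatrix
plotClusters(ce, whichClusters=1:4)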

+The labels shown are those given automatically by clusterMany but can be a bit much for plotting. We can assign new labels in our ClusterExperiment object if we prefer, for example to be more succinct, by changing the clusterLabels of the object (note we are permanently changing the labels here within the ClusterExperiment object). We choose to remove “Features” as being too wordy:

cl<-clusterLabels(ce)
 cl<-gsub("Features","",cl)
 clusterLabels(ce)<-cl
-We will also show the clusters in a different order, which corresponds to varying the number of dimensions, rather than k.
+We will also give the whichClusters argument the indices of clusterings stored in the ClusterExperiment object, which allows us to show the clusters in a different order. Here we’ll pick an order corresponding to varying the number of dimensions, rather than k. We do this by parsing the labels of the clusterings, which we can get with the clusterLabels command.

cl<-clusterLabels(ce)
 ndims<-sapply(sapply(strsplit(cl,","),function(x){strsplit(x[1],"=")}),function(x){x[2]})
 ord<-order(as.numeric(ndims))
 par(mar=plotCMar)
 plotClusters(ce,main="Clusters from clusterMany", whichClusters=ord, sampleData=c("Biological_Condition","Cluster2"), axisLine=-1)
We see that the order in which the clusters are given to plotClusters changes the plot greatly. There are many different options for how to run plotClusters discussed in in the detailed section on plotClusters, but for now, this plot is good enough for a quick visualization.

+We can examine the sizes of a single clustering with the function plotBarplot. By default, the clustering picked will be the primaryCluster, which for the result of clusterMany is rather arbitrarily just the first clustering.
+
+plotBarplot(ce)
+
+We can also pick a particular clustering.
+
+plotBarplot(ce,whichClusters=c("nPCA=15,k=6"))
+
+We can also compare two specific clusterings with a simple barplot using plotBarplot. Here we compare the above clustering (PCA with 15 dimensions and k=6) with the result of changing to k=8:
+
+plotBarplot(ce,whichClusters=c("nPCA=15,k=6","nPCA=15,k=8"))

2.3.1 The output

The output of clusterMany is a ClusterExperiment object. This is a class built for this package and explained in the section on ClusterExperiment Objects. In the object ce the clusters are stored, names and colors for each cluster within a clustering are assigned, and other information about the clusterings is recorded. Furthermore, all of the information in the original SummarizedExperiment is retained.

@@ -279,17 +291,17 @@ 2.3.1 The output
 
 ## [4,] 1 1 1
 ## [5,] 4 3 2
 ## [6,] 1 4 2
-Because we have changed clusterLabels above, these new cluster labels are shown here. Notice that some of the samples are assigned the value of -1. -1 has the significance of encoding samples that are not assigned to any cluster. Why certain samples are not clustered depends on the underlying choices of the clustering routine. In this case, the default in clusterMany set the minimum size of a cluster to be 5, which resulted in -1 assignments.
-
-Another special value is -2 discussed in the section on ClusterExperiment objects
+Because we have changed clusterLabels above, these new cluster labels are shown here. Notice that some of the samples are assigned the value of -1. -1 has the significance of encoding samples that are not assigned to any cluster. Why certain samples are not clustered depends on the underlying choices of the clustering routine. In this case, we set minSizes=5 in clusterMany, requiring the minimum size of a cluster to be 5, which resulted in -1 assignments.
+
+Another special value is -2, discussed in the section on ClusterExperiment objects.

2.4 Step 2: Find a consensus with combineMany

To find a consensus clustering across the many different clusterings created by clusterMany, the function combineMany can be used next.

-ce<-combineMany(ce)
+ce<-combineMany(ce,proportion=1)
## Note: no clusters specified to combine, using results from clusterMany
-Notice we get a warning that we did not specify any clusters to combine, so it is using the default – those from the clusterMany call.
-
-If we look at the clusterMatrix of the returned ce object, we see that the new cluster from combineMany has been added to the existing clusterings. This is the basic strategy of the functions in this package. Any clustering that is created is added to existing clusterings, so the user does not need to keep track of past clusterings and can easily compare what has changed.
+The proportion argument indicates the minimum proportion of times samples must be in the same cluster, across the clusterings being combined, in order to be assigned to the same cluster in the final clustering. Notice we get a warning that we did not specify any clusters to combine, so it is using the default – those from the clusterMany call.
+
+If we look at the clusterMatrix of the returned ce object, we see that the new cluster from combineMany has been added to the existing clusterings. This is the basic strategy of all of the functions in this package: any clustering function that is applied to an existing ClusterExperiment object adds the new clustering to the set of existing clusterings, so the user does not need to keep track of past clusterings and can easily compare what has changed.
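A quick sketch of checking that accumulation (clusterMatrix stores one column per clustering, as described later in this vignette):

# Each workflow call adds a column of cluster assignments
ncol(clusterMatrix(ce))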

We can again run plotClusters, which will now also show the result of combineMany:

head(clusterMatrix(ce)[,1:3])
##      combineMany nVAR=100,k=5 nVAR=500,k=5
@@ -301,47 +313,50 @@ 2.4 Step 2: Find a consensus with
 
 ## [6,] -1 1 4

par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow")
-The default result of combineMany is not usually a great choice, and certainly isn’t helpful here. The clustering from the default combineMany leaves most samples unassigned (white in the above plot). This is because the default way of combining is very conservative – it requires samples to be in the same cluster in every clustering to be assigned a cluster. This is quite stringent. We can vary this by setting the proportion argument to indicate the minimum proportion of times they should be together with other samples in the cluster they are assigned to. Explicit details on how combineMany makes these clusters are discussed in the section on combineMany.
-
-So let’s label the one we found “combineMany, default” and then create a new one. (Making an informative label will make it easier to keep track of this particular clustering later, particularly if we make multiple calls to the workflow).
+The choice of proportion=1 in combineMany is not usually a great one, and certainly isn’t helpful here. The resulting clustering leaves most samples unassigned (white in the above plot). This is because proportion=1 requires samples to be in the same cluster in every clustering in order to be assigned to a cluster together, which is quite stringent. We can vary this by setting the proportion argument to be lower. Explicit details on how combineMany makes these clusters are discussed in the section on combineMany.
+
+So let’s label the one we found as “combineMany,1” and then create a new one. (Making or changing the label to an informative one will make it easier to keep track of this particular clustering later, particularly if we make multiple calls to the workflow.)

wh<-which(clusterLabels(ce)=="combineMany")
-if(length(wh)!=1) stop() else clusterLabels(ce)[wh]<-"combineMany,default"
+if(length(wh)!=1) stop() else clusterLabels(ce)[wh]<-"combineMany,1"

Now we’ll rerun combineMany with proportion=0.7. This time, we will give it an informative label upfront in our call to combineMany.

ce<-combineMany(ce,proportion=0.7,clusterLabel="combineMany,0.7")
## Note: no clusters specified to combine, using results from clusterMany
par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow")
We see that more clusters are detected. Those that are still not assigned a cluster from combineMany clearly vary across the clusterings as to whether the samples are clustered together or not. Varying the proportion argument will adjust whether some of the unclustered samples get added to a cluster. There is also a minSize parameter for combineMany, with the default of minSize=5. We could reduce that requirement as well and more of the unclustered samples would be grouped into a cluster. Here, we reduce it to minSize=3 (we’ll call this “combineMany,final”):

ce<-combineMany(ce,proportion=0.7,minSize=3,clusterLabel="combineMany,final")
## Note: no clusters specified to combine, using results from clusterMany
par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow",main="Min. Size=3")
-We can also visualize the proportion of times these clusters were together across these clusterings (this information was made and stored in the ClusterExperiment object when we called combineMany as long as proportion value is <1):
+We can also visualize the proportion of times these clusters were together across these clusterings (this information was calculated and stored in the ClusterExperiment object when we called combineMany, provided that the proportion argument is <1):

plotCoClustering(ce)

This visualization can help in determining whether to change the value of proportion (though see combineMany for how -1 assignments affect combineMany).

2.5 Step 3: Merge clusters together with makeDendrogram and mergeClusters

-It is not uncommon in practice to create forty or more clusterings with clusterMany, in which case the results of combineMany can often still result in too many small clusters. We might wonder if they are necessary or could be logically combined together. We could change the value of proportion in our call to combineMany. But we have found that it is often after looking at the clusters and how different they look on individual genes that we best make this determination, rather than the proportion of times they are together in different clustering routines.
-
-For this reason, we often find the need for an additional clustering step that merges clusters together that are not different, based on running tests of differential expression between the clusters found in combineMany. We often display and use both sets of clusters side-by-side (that from combineMany and that from mergeClusters).
-
-mergeClusters needs a hierarchical clustering of the clusters; it then goes progressively up that hierarchy, deciding whether two adjacent clusters can be merged. The function makeDendrogram makes such a hierarchy between clusters (by applying hclust to the medoids of the clusters). Because the results of mergeClusters are so dependent on that hierarchy, we require the user to call makeDendrogram rather than calling it internally. This is because different options in makeDendrogram can affect how the clusters are hierarchically ordered, and we want to encourage the user make these choices.
-
-As an example, here we use the 500 most variable genes to make the cluster hierarchy.
+Once you start varying the parameters, it is not uncommon in practice to create forty or more clusterings with clusterMany, in which case the results of combineMany can often result in too many small clusters. We might wonder if they are necessary or could be logically combined together. We could change the value of proportion in our call to combineMany. But we have found that this determination is often best made after looking at the clusters, for example with a heatmap, and seeing how different they look on individual genes, rather than from the proportion of times they are together in different clustering routines.
+
+For this reason, we often find the need for an additional clustering step that merges clusters together that are not different, based on running tests of differential expression between the clusters found in combineMany. This is done by the function mergeClusters. We often display and use both sets of clusters side-by-side (that from combineMany and that from mergeClusters).
+
+mergeClusters needs a hierarchical clustering of the clusters in order to merge clusters; it then goes progressively up that hierarchy, deciding whether two adjacent clusters can be merged. The function makeDendrogram makes such a hierarchy between clusters (by applying hclust to the medoids of the clusters). Because the results of mergeClusters are so dependent on that hierarchy, we require the user to call makeDendrogram rather than calling it automatically internally. This is because different options in makeDendrogram can affect how the clusters are hierarchically ordered, and we want to encourage the user to make these choices.
+
+As an example, here we use the 500 most variable genes to make the cluster hierarchy (note we can make different choices here than we did in the clustering).

ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
 plotDendrogram(ce)

We can see that clusters 1 and 3 are most closely related, at least in the top 500 most variable genes.

-If we look at the summary of ce, it now has ‘makeDendrogram’ marked as ‘Yes’.
+We can see the relative sizes of the clusters by setting some options in plotDendrogram:
+
+plotDendrogram(ce,labelType="colorblock",leafType="sample")
+
+Notice we don’t need to make the dendrogram again, because it’s saved in ce. If we look at the summary of ce, it now has ‘makeDendrogram’ marked as ‘Yes’.

ce
## class: ClusterExperiment 
 ## dim: 7069 65 
 ## Primary cluster type: combineMany 
 ## Primary cluster label: combineMany,final 
 ## Table of clusters (of primary clustering):
-## -1 c1 c2 c3 c4 c5 c6 
-## 12  9  7 15 14  4  4 
+## -1 c1 c2 c3 c4 c5 c6 c7 
+##  6  4  8  5  9 15 14  4 
 ## Total number of clusterings: 39 
 ## Dendrogram run on 'combineMany,final' (cluster index: 1)
 ## -----------
@@ -350,36 +365,33 @@ 2.5 Step 3: Merge clusters togeth
 
 ## combineMany run? Yes
 ## makeDendrogram run? Yes
 ## mergeClusters run? No

-Now we are ready to actually merge clusters together. We now run mergeClusters that will go up this hierarchy and compare the level of differential expression (DE) in each pair. In other words, if we focus on the left side of the tree, DE tests are run, between 1 and 3, and between 6 and 8. If there is not enough DE between each of these (based on a cutoff that can be set by the user), then clusters 1 and 3 and/or 6 and 8 will be merged. And so on up the tree.
+Now we are ready to actually merge clusters together. We run mergeClusters, which will go up this hierarchy and compare the level of differential expression (DE) in each pair. In other words, if we focus on the left side of the tree, DE tests are run between 1 and 3, and between 6 and 8. If there is not enough DE between each of these (based on a cutoff that can be set by the user), then clusters 1 and 3 and/or 6 and 8 will be merged. And so on up the tree.

It is useful to first run mergeClusters without actually creating any merged clusters so as to preview what the final clustering will be (and perhaps to help in setting the cutoff).

mergeClusters(ce,mergeMethod="adjP",plotInfo="mergeMethod")
## Note: Merging will be done on ' combineMany,final ', with clustering index 1
 Then we can decide on a cutoff and visualize the resulting clustering.

ce<-mergeClusters(ce,mergeMethod="adjP",cutoff=0.01)
## Note: Merging will be done on ' combineMany,final ', with clustering index 1
-## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 2.9. Rerun with
-## increased df
-## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 1.6. Rerun with
+## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3.3. Rerun with
 ## increased df

par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"))
 plotCoClustering(ce,whichClusters=c("mergeClusters","combineMany"),
                  sampleData=c("Biological_Condition","Cluster2"),annLegend=FALSE)

Notice that mergeClusters combines clusters based on the actual values of the features, while the coClustering plot shows how often the samples clustered together. It is not uncommon that mergeClusters will merge clusters that don’t look “close” on the coClustering plot. This can be due to just the choices of the hierarchical clustering, but can also be because the two merged clusters are not often confused for each other across the clustering algorithms, yet don’t have strong differences on individual genes. This can be the case especially when the clustering is done on reduced PCA space, where an accumulation of small differences might consistently separate the samples (so the two clusters are not “confused” as to the samples), but because the differences are not strong on individual genes, mergeClusters combines them. These are ultimately different criteria.

Finally, we can do a heatmap visualizing this final step of clustering.

plotHeatmap(ce,clusterSamplesData="dendrogramValue",breaks=.99,
             sampleData=c("Biological_Condition", "Cluster1", "Cluster2"))

By choosing “dendrogramValue” for the clustering of the samples, we will be showing the clusters according to the hierarchical ordering of the clusters found by makeDendrogram. The argument breaks=0.99 means that the last color of the heatmap spectrum will be forced to be the top 1% of the data (rather than evenly spaced through the entire range of values). This can be helpful in making sure that rare extreme values in the upper range do not absorb too much space in the color spectrum. There are many more options for plotHeatmap, some of which are discussed in the section on plotHeatmap.

2.6 Step 4: Finding Features related to the clusters

-The last step is to then find features that are different between these clusters, as a start to understanding biological differences in the samples. The function getBestFeatures performs tests for differential expression (i.e. different mean levels) between the clusters for each feature. It relies on limma to run the differential expression analysis, with voom correction if the data are indicated by the user to be counts.
+The last step is to find features that are different between these clusters, as a start to understanding the biological differences in the samples. The function getBestFeatures performs tests for differential expression (i.e. different mean levels) between the clusters for each feature. It relies on limma (Smyth 2004; Ritchie et al. 2015) to run the differential expression analysis, with voom (Law et al. 2014) correction if the data are indicated by the user to be counts.

There are several types of tests that can be performed to identify features that are different between the clusters. Here we perform all pairwise tests between the clusters.

pairsAll<-getBestFeatures(ce,contrastType="Pairs",p.value=0.05,
                           number=nrow(ce),isCount=TRUE)
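Other contrastType values can be sketched similarly. Here "Dendro" (contrasts along the dendrogram, as mentioned under clusterContrasts) and "OneAgainstAll" are assumptions, since this vignette only shows "Pairs"; check the getBestFeatures help page:

# Assumed alternatives: dendrogram-based contrasts, or each cluster vs. all others
dendroAll <- getBestFeatures(ce, contrastType="Dendro", p.value=0.05,
                             number=nrow(ce), isCount=TRUE)
oneAll <- getBestFeatures(ce, contrastType="OneAgainstAll", p.value=0.05,
                          number=nrow(ce), isCount=TRUE)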
@@ -387,30 +399,30 @@ 2.6 Step 4: Finding Features rela
 
 ## with the transformation function in the slot `transformation`.
 ## This makes sense only for counts.
head(pairsAll)
-##   IndexInOriginal Contrast Feature     logFC    AveExpr         t
-## 1             552    X1-X2  BCL11B  9.967559  2.2054809  8.128140
-## 2            6565    X1-X2     VIM -9.010410  5.8344528 -7.711733
-## 3            3309    X1-X2    MIAT  8.400380  6.4698388  6.590747
-## 4            4501    X1-X2 PRTFDC1 -8.249512 -0.1792585 -6.434117
-## 5            1763    X1-X2   EPHA3  8.465910  2.6400851  6.020382
-## 6            2210    X1-X2    GLI3 -7.736866  3.3201396 -5.961636
+##   IndexInOriginal Contrast Feature     logFC   AveExpr         t
+## 1             552    X1-X2  BCL11B -8.681448 1.9224168 -8.648195
+## 2            2210    X1-X2    GLI3  7.620029 3.7082864  7.625723
+## 3            3052    X1-X2    LRP8 -7.459610 1.1835595 -6.901136
+## 4            4501    X1-X2 PRTFDC1  7.366113 0.2544401  6.834179
+## 5            4357    X1-X2    PON2  7.727171 3.2898659  6.632537
+## 6            2455    X1-X2    HES1  6.537574 1.7812745  6.421161
 ##        P.Value    adj.P.Val         B
-## 1 6.996166e-12 4.239077e-09 15.416450
-## 2 4.314551e-11 2.060781e-08 13.179277
-## 3 5.459041e-09 1.254304e-06  9.041400
-## 4 1.061382e-08 2.224182e-06  8.804356
-## 5 6.024265e-08 9.812334e-06  7.361316
-## 6 7.687870e-08 1.197039e-05  6.924331
+## 1 2.070282e-13 8.130457e-11 19.148442
+## 2 2.584449e-11 6.046330e-09 14.512002
+## 3 7.415013e-10 1.136581e-07 11.700657
+## 4 1.007046e-09 1.477953e-07 11.328702
+## 5 2.518849e-09 3.277131e-07 10.528783
+## 6 6.527305e-09 7.626697e-07  9.603245

We can visualize only these significantly different pair-wise features with plotHeatmap by using the column “IndexInOriginal” in the result of getBestFeatures to quickly identify the genes to be used in the heatmap. Notice that the same genes can be replicated across different contrasts, so we will not always have unique genes:

length(pairsAll$Feature)==length(unique(pairsAll$Feature))
## [1] FALSE
-In this case they are not unique. Hence, we will make sure we take only unique gene values so that they are not plotted multiple times in our heatmap. (This is a good practice even if in a particular case the genes are unique).
+In this case they are not unique, because the same gene can be significant in different pairwise tests. Hence, we will make sure we take only unique gene values so that they are not plotted multiple times in our heatmap. (This is a good practice even if in a particular case the genes are unique.)

plotHeatmap(ce, clusterSamplesData="dendrogramValue",
             clusterFeaturesData=unique(pairsAll[,"IndexInOriginal"]),
             main="Heatmap of features w/ significant pairwise differences",
             breaks=.99)
-Notice that the samples clustered into the -1 cluster (i.e. not assigned) are clustered as an outgroup. They can also be mixed into the dendrogram (see makeDendrogram)
+Notice that the samples clustered into the -1 cluster (i.e. not assigned) are clustered as an outgroup. This is a choice that was made when the dendrogram was created with makeDendrogram. These samples can also be mixed into the dendrogram (see makeDendrogram).

@@ -423,8 +435,8 @@ 3 ClusterExperiment Objects
 
 ## Primary cluster type: mergeClusters 
 ## Primary cluster label: mergeClusters 
 ## Table of clusters (of primary clustering):
-## -1 m1 m2 m3 m4 m5 m6 
-## 12  9  7 15 14  4  4 
+## -1 m1 m2 m3 m4 
+##  6 13 17 15 14 
 ## Total number of clusterings: 40 
 ## Dendrogram run on 'combineMany,final' (cluster index: 2)
 ## -----------
@@ -433,17 +445,17 @@ 3 ClusterExperiment Objects
 
 ## combineMany run? Yes
 ## makeDendrogram run? Yes
 ## mergeClusters run? Yes
 
-This summary tells us the total number of clusterings (40), and gives some indication as to what parts of the standard workflow have been completed and stored in this object. It also gives information regarding the primaryCluster of the object. The primaryCluster is just one of the clusterings that has been chosen to be the “primary” clustering, meaning that by default various functions will turn to this clustering as the desired clustering to use. The “primaryCluster” can be reset by the user (see primaryClusterIndex). clusterMany arbitrarily sets the ‘primaryCluster’ to the first one, and each later step of the workflow sets the primary index to the most recent, but the user can set a specific clustering to be the primaryCluster with primaryClusterIndex.
+This summary tells us the total number of clusterings (40), and gives some indication as to what parts of the standard workflow have been completed and stored in this object. It also gives information regarding the primaryCluster of the object. The primaryCluster is just one of the clusterings that has been chosen to be the “primary” clustering, meaning that by default various functions will turn to this clustering as the desired clustering to use. The “primaryCluster” can be reset by the user (see primaryClusterIndex). clusterMany arbitrarily sets the ‘primaryCluster’ to the first one, and each later step of the workflow sets the primary index to the most recent, but the user can set a specific clustering to be the primaryCluster with primaryClusterIndex. Often, if a function is not given a specific clustering (usually via an option whichCluster or whichClusters), the “primary” cluster is used by default.
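A one-line sketch of resetting it (primaryClusterIndex<- takes the numeric index of a column of clusterMatrix; here we assume we want the clustering labeled "combineMany,final"):

# Point the primary cluster at a specific stored clustering
primaryClusterIndex(ce) <- which(clusterLabels(ce) == "combineMany,final")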

There are also additional commands to access the clusterings and their related information (type help("ClusterExperiment-methods") for more).

The cluster assignments are stored in the clusterMatrix slot of ce, with samples on the rows and different clusterings on the columns. We can look at the cluster matrix and the primary cluster with the commands clusterMatrix and primaryCluster.

head(clusterMatrix(ce))[,1:5]
-##      mergeClusters combineMany,final combineMany,0.7 combineMany,default
-## [1,]            -1                -1              -1                  -1
-## [2,]            -1                -1              -1                  -1
-## [3,]            -1                -1              -1                  -1
-## [4,]             1                 1               1                  -1
-## [5,]            -1                -1              -1                  -1
-## [6,]            -1                -1              -1                  -1
+##      mergeClusters combineMany,final combineMany,0.7 combineMany,1
+## [1,]            -1                -1              -1            -1
+## [2,]            -1                -1              -1            -1
+## [3,]             1                 1              -1            -1
+## [4,]             2                 2               1            -1
+## [5,]             1                 1              -1            -1
+## [6,]             2                 3               2            -1
 ##      nVAR=100,k=5
 ## [1,]            1
 ## [2,]            2
@@ -452,18 +464,19 @@ 3 ClusterExperiment Objects
 
 ## [5,]            4
 ## [6,]            1
primaryCluster(ce)
-##  [1] -1 -1 -1  1 -1 -1  2  3  3  3  4  4  1  4 -1  5  2  3  2 -1  4  4  4
-## [24]  3 -1  3  3  3  6  4  4  1  5  2  2  6 -1  4 -1  6  2  3  3  1  3  3
-## [47]  1 -1  2  4  4  4  3  4 -1  1  1  3  3  1  6  5  4  1  5
+##  [1] -1 -1  1  2  1  2  1  3  3  3  4  4  2  4  1  2  1  3  1  1  4  4  4
+## [24]  3 -1  3  3  3  2  4  4  2  2  1  1 -1  1  4 -1  2  1  3  3  2  3  3
+## [47]  2 -1  1  4  4  4  3  4  1  2  2  3  3  2  2  2  4  2  2

Remember that we made multiple calls to combineMany: only the last such call will be shown when we use whichClusters="workflow" in our plotting (see this section for a discussion of how these repeated calls are handled.)

Negative Valued Cluster Assignments: The different clusters are stored as consecutive integers, with ‘-1’ and ‘-2’ having special meaning. ‘-1’ refers to samples that were not clustered by the clustering algorithm. In our example, we removed clusters that didn’t meet specific size criterion, so they were assigned ‘-1’. ‘-2’ is for samples that were not included in the original input to the clustering. This is useful if, for example, you cluster on a subset of the samples, and then want to store this clustering with the clusterings done on all the data. You can create a vector of clusterings that give ‘-2’ to the samples not originally used and then add these clusterings to the ce object manually with addClusters, as sketched below.
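A minimal sketch of that last step (subsetIdx and subsetClusters are hypothetical, and we assume addClusters accepts a vector of cluster assignments; check its help page):

# Samples not in the original clustering get -2; the clustered subset keeps its labels
newCl <- rep(-2, nSamples(ce))
newCl[subsetIdx] <- subsetClusters
ce <- addClusters(ce, newCl)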

-clusterLabels gives the column names of the clusterMatrix; clusterMany has given column names based on the parameter choices, and later steps in the workflow also give a name (or allow the user to set them). Clusterings might also have no specific label if the user created them. As we’ve seen, the user can also change these labels.
+clusterLabels gives the column names of the clusterMatrix; clusterMany has given column names based on the parameter choices, and later steps in the workflow also give a name (or allow the user to set them).

head(clusterLabels(ce),10)
-##  [1] "mergeClusters"       "combineMany,final"   "combineMany,0.7"    
-##  [4] "combineMany,default" "nVAR=100,k=5"        "nVAR=500,k=5"       
-##  [7] "nVAR=1000,k=5"       "nPCA=5,k=5"          "nPCA=15,k=5"        
+##  [1] "mergeClusters"     "combineMany,final" "combineMany,0.7"  
+##  [4] "combineMany,1"     "nVAR=100,k=5"      "nVAR=500,k=5"     
+##  [7] "nVAR=1000,k=5"     "nPCA=5,k=5"        "nPCA=15,k=5"      
 ## [10] "nPCA=50,k=5"
-clusterTypes on the other hand indicates what call made the clustering. Unlike the labels, it is wise to not change the values of clusterTypes lightly.
+As we’ve seen, the user can also change these labels.
+
+clusterTypes, on the other hand, indicates what call made the clustering. Unlike the labels, it is wise not to change the values of clusterTypes unless you are sure of what you are doing.

head(clusterTypes(ce),10)
##  [1] "mergeClusters" "combineMany"   "combineMany.2" "combineMany.1"
 ##  [5] "clusterMany"   "clusterMany"   "clusterMany"   "clusterMany"  
@@ -484,7 +497,7 @@ 3 ClusterExperiment Objects
 
 ## SRR1275285  5300270 4276650 80.6873 41.6394 0.0227383
 ## SRR1275366  7701320 6373600 82.7600 68.9431 0.0266275
 ## SRR1275261 13425000 9554960 71.1727 62.0001 0.0200522
-Another important slot in the ClusterExperiment object is the clusterLegend slot. This consists of a list, one element per column or clustering of clusterMatrix.
+Another important slot in the ClusterExperiment object is the clusterLegend slot. This consists of a list, one element per column or clustering of clusterMatrix, that gives colors and names to each cluster within a clustering.

length(clusterLegend(ce))
## [1] 40
clusterLegend(ce)[1:2]
@@ -495,8 +508,6 @@ 3 ClusterExperiment Objects
 
 ## 2 "2" "#33A02C" "m2"
 ## 3 "3" "#E31A1C" "m3"
 ## 4 "4" "#FF7F00" "m4"
-## 5 "5" "#6A3D9A" "m5"
-## 6 "6" "#B15928" "m6"
 ## 
 ## $`combineMany,final`
 ##    clusterIds color     name
@@ -506,8 +517,9 @@ 3 ClusterExperiment Objects
 
 ## 3 "3" "#E31A1C" "c3"
 ## 4 "4" "#FF7F00" "c4"
 ## 5 "5" "#6A3D9A" "c5"
-## 6 "6" "#B15928" "c6"
+## 6 "6" "#B15928" "c6"
+## 7 "7" "#2ef4ca" "c7"
 
-We can see that each element of clusterLegend consists of a matrix, with number of rows equal to the number of clusters in the clustering. The columns store information about that cluster. clusterIds is the internal id used in clusterMatrix to identify the cluster, name is a name for the cluster, and color is a color for that cluster. color is used in plotting and visualizing the clusters, and name is an arbitrary character string for a cluster. They are automatically given default values when the ClusterExperiment object is created, but we will see under the description of visualization methods how the user might want to manipulate these for better plotting results.
+We can see that each element of clusterLegend consists of a matrix, with the number of rows equal to the number of clusters in the clustering. The columns store information about that cluster. clusterIds is the internal id (integer) used in clusterMatrix to identify the cluster, name is a name for the cluster, and color is a color for that cluster. color is used in plotting and visualizing the clusters, and name is an arbitrary character string for a cluster. They are automatically given default values when the ClusterExperiment object is created, but we will see under the description of visualization methods how the user might want to manipulate these for better plotting results.

4 Visualizing the data

@@ -517,18 +529,18 @@ 4.1 Plotting the clusters
 
 par(mar=plotCMar)
 plotClusters(ce,main="Clusters from clusterMany", whichClusters="workflow", 
              axisLine=-1)

We have seen that we can get very different plots depending on how we order the clusterings, and what clusterings are included. The argument whichClusters allows the user to choose different clusterings or provide an explicit ordering of the clusterings. whichClusters can take either a single character value, or a vector of either characters or indices. If whichClusters matches either “all” or “workflow”, then the clusterings chosen are either all, or only those from the most recent calls to the workflow functions. Choosing “workflow” removes from the visualization both user-defined clusterings and also previous calls to the workflow that have since been rerun. Setting whichClusters="workflow" can be a useful if you have called a method like combineMany several times, as we did, only with different parameters. All of those runs are saved (unless eraseOld=TRUE), but you may not want to plot them.

If whichClusters is a character that is not one of these designated values, the entries should match a clusterType value (like clusterMany) or a clusterLabel value (with exact matching). Alternatively, the user can specify numeric indices corresponding to the columns of clusterMatrix that provide the order of the clusters.

par(mar=plotCMar)
 plotClusters(ce,whichClusters="clusterMany",
                main="Only Clusters from clusterMany",axisLine=-1)

We can also add to our plot (categorical) information on each of our subjects from the colData of our SummarizedExperiment object (which is also retained in our ClusterExperiment object). This can be helpful to see if the clusters correspond to other features of the samples, such as sample batches. Here we add the values from the columns “Biological_Condition” and “Cluster2” that were present in the fluidigm object and given with the published data.

par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"), 
                main="Workflow clusters plus other data",axisLine=-1)

4.1.1 Saving the alignment of plotClusters

plotClusters invisibly returns a ClusterExperiment object. In our earlier calls to plotClusters, this would be the same as the input object and so there is no reason to save it. However, the alignment and color assignments created by plotClusters can be requested to be saved via the resetNames, resetColors and resetOrderSamples arguments. If any of these are set to TRUE, then the object returned will differ from the input object. Specifically, if resetColors=TRUE the clusterLegend of the returned object will be changed so that the colors assigned to each cluster will be as were shown in the plot. Similarly, if resetNames=TRUE the names of the clusters will be changed to be integer values, but now the integers will be aligned to try to be the same across clusters (and therefore not consecutive integers, which is why these are saved as names for the clusters and not clusterIds). If resetOrderSamples=TRUE, then the order of the samples shown in the plot will be similarly saved in the slot orderSamples.

@@ -536,7 +548,7 @@ 4.1.1 Saving the alignment of plo
par(mar=plotCMar)
 ce_temp<-plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"), 
                main="Clusters from clusterMany, different order",axisLine=-1,resetNames=TRUE,resetColors=TRUE,resetOrderSamples=TRUE)

clusterLegend(ce_temp)[c("mergeClusters","combineMany,final")]
## $mergeClusters
 ##    clusterIds color     name
@@ -545,18 +557,17 @@ 4.1.1 Saving the alignment of plo
 
 ## 2  "2"  "#33A02C" "2"
 ## 3  "3"  "#E31A1C" "3"
 ## 4  "4"  "#FF7F00" "4"
-## 5  "5"  "#6A3D9A" "5"
-## 6  "6"  "#B15928" "6"
 ## 
 ## $`combineMany,final`
 ##    clusterIds color     name
 ## -1 "-1"       "white"   "-1"
-## 1  "1"        "#1F78B4" "1"
+## 1  "1"        "#2ef4ca" "7"
 ## 2  "2"        "#33A02C" "2"
-## 3  "3"        "#E31A1C" "3"
-## 4  "4"        "#FF7F00" "4"
-## 5  "5"        "#6A3D9A" "5"
-## 6  "6"        "#B15928" "6"
+## 3  "3"        "#B15928" "6"
+## 4  "4"        "#1F78B4" "1"
+## 5  "5"        "#E31A1C" "3"
+## 6  "6"        "#FF7F00" "4"
+## 7  "7"        "#bd18ea" "8"

Now, the clusterLegend slot of the object no longer has the default color/name assignments, but it has names and colors that match across the clusters. Notice that this means the prefix “m” or “c” that was previously given to distinguish the combineMany result from the mergeClusters result is now gone (the user could manually add them back by changing the clusterLegend values, as sketched below).
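A sketch of such a manual change (assuming the names shown above; we rely on clusterLegend<-, the exported replacement method):

# Re-add an "m" prefix to the cluster names of the mergeClusters clustering
leg <- clusterLegend(ce_temp)[["mergeClusters"]]
wh <- leg[, "name"] != "-1"            # leave the unassigned samples alone
leg[wh, "name"] <- paste0("m", leg[wh, "name"])
clusterLegend(ce_temp)[["mergeClusters"]] <- leg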

We can also force plotClusters to use the existing color definitions, rather than create its own. This makes particular sense if you want to have continuity between plots – i.e. be sure that a particular cluster always has a certain color – but would like to do different variations of plotClusters to get a sense of how similar the clusters are.

For example, we set the colors above based on the cluster order from plotClusters where the clusters were ordered according to the workflow. But now we want to plot only the clusters from clusterMany, yet keep the same colors as before so we can compare them. We do this by setting the argument existingColors="all", meaning use all of the existing colors (currently this is the only option available for how to use the existing colors).

@@ -570,21 +581,22 @@ 4.1.1 Saving the alignment of plo
 
              existingColors="all", whichClusters="clusterMany",
              main="clusterMany Clusters, fix the color of clusters", axisLine=-1)

4.2 Heatmap with the clusters

-There is also a default heatmap command for a ClusterExperiment object that we used in the Quick Start. By default it clusters on the most variable features (after transforming the data) and shows the primaryCluster alongside the data. The primaryCluster now that we’ve run the workflow will be set as that from the mergeClusters step.
+There is also a default heatmap command for a ClusterExperiment object that we used in the Quick Start. By default it clusters on the most variable features (after transforming the data) and shows the primaryCluster alongside the data. The primaryCluster, now that we’ve run the workflow, has been set as that from the last mergeClusters step.

par(mfrow=c(1,1))
 par(mar=defaultMar)
 plotHeatmap(ce,main="Heatmap with clusterMany")

The plotHeatmap command has numerous options, in addition to those of aheatmap. plotHeatmap mainly provides additional functionality in the following areas (a combined example follows this list):

  • Easy inclusion of clustering information or sample information, based on the ClusterExperiment object.
  • Additional methods for ordering/clustering the samples that makes use of the clustering information.
  • Use of separate input data for clustering and for visualization.
  • Setting the breaks for better visualization.
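For instance, a single call can combine several of these options. The following is a sketch that reuses arguments appearing in the examples below (the particular values are illustrative):

## combine clustering annotation, sample information, sample ordering by
## cluster, and quantile-capped breaks in one plotHeatmap call
plotHeatmap(ce, whichClusters="workflow",
            sampleData=c("Biological_Condition","Cluster2"),
            clusterSamplesData="primaryCluster", breaks=0.99,
            main="Heatmap combining several options")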

4.2.1 Displaying clustering or sample information

4.2.2 Additional options for clustering the samples
plotHeatmap(ce,clusterSamplesData="primaryCluster",
             whichClusters="primaryCluster",
             main="Heatmap with clusterMany",annLegend=FALSE)

As an improvement upon this, we can cluster the clusters into a dendrogram so that the most similar clusters will be near each other. We already did this before with our call to makeDendrogram. We haven’t done anything to change that, so the dendrogram from that call is still stored in the object. We can check this in the information shown in our object:

show(ce)
## class: ClusterExperiment 
## Primary cluster type: mergeClusters 
## Primary cluster label: mergeClusters 
## Table of clusters (of primary clustering):
## -1 m1 m2 m3 m4 
##  6 13 17 15 14 
## Total number of clusterings: 40 
## Dendrogram run on 'combineMany,final' (cluster index: 2)
## -----------

plotHeatmap(ce,clusterSamplesData="dendrogramValue",
            whichClusters=c("mergeClusters","combineMany"),
            main="Heatmap with clusterMany",
            sampleData=c("Biological_Condition","Cluster2"),annLegend=FALSE)


If there is no dendrogram stored, plotHeatmap will call makeDendrogram based on the primary cluster (with the default settings of makeDendrogram); calling makeDendrogram on ce beforehand is preferred, so that the user can control the choices in how it is done (which we will discuss below). For visualization purposes, the dendrogram based on the combineMany clustering is preferable to that of the mergeClusters clustering, since “combineMany,final” is just a finer partition of the “mergeClusters” clustering.

4.2.3 Using separate input data for clustering and for visualization

While count data is a common type of input, it is also common that the input data in the SummarizedExperiment object is normalized data from a normalization package such as RUVSeq. In this case, the clustering and all numerical calculations should be done on the normalized data (which may or may not need a log transform). However, these normalized data might not be on an interpretable count scale (for example, in RUVSeq, the normalized data are residuals after subtracting out gene-specific batch effects).

In this case, it can be convenient to have the visualization of the data (i.e. the color scale) be based on a count scale that is interpretable, even while the clustering is done on the normalized data. This is possible by giving a new matrix of values to the argument visualizeData. The color scale (and the clustering of the features) is then based on the input visualizeData matrix, but all clustering of the samples is still done on the internal data in the ClusterExperiment object.
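A sketch of this usage follows; countsForDisplay is a hypothetical matrix of interpretable count-scale values with the same samples as ce (only the visualizeData argument comes from the text above):

## countsForDisplay is hypothetical: a matrix on an interpretable count
## scale with the same samples (columns) as ce. The sample clustering
## still uses the internal data stored in the ClusterExperiment object.
plotHeatmap(ce, visualizeData=countsForDisplay,
            main="Clustered on normalized data, colored on count scale")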


4.2.4 Setting the breaks


Usually, the breaks that determine the colors of the heatmap are evenly spaced across the range of the data in the entire matrix. When there are a few outlier samples or genes, they can dominate the color and make it impossible to visualize the bulk of the data.


For this reason, the argument breaks in plotHeatmap allows for a value between 0 and 1, to indicate that the range of colors should be chosen as equally spaced between certain quantiles of the data. For example, if breaks=0.99, the range of equally spaced breaks will stop at the 0.99 quantile of the data, and anything above that value is assigned the single most extreme color. If there is negative data in the matrix, the corresponding lower quantile of the data is also used to stop the range of equally spaced breaks (see ?setBreaks).


Here we show the resulting heatmaps for two choices of breaks:

plotHeatmap(ce,clusterSamplesData="primaryCluster",
            whichClusters="primaryCluster", breaks=0.99,
            main="Heatmap with clusterMany, breaks=0.99",annLegend=FALSE)

plotHeatmap(ce,clusterSamplesData="primaryCluster",
            whichClusters="primaryCluster", breaks=0.95,
            main="Heatmap with clusterMany, breaks=0.95",annLegend=FALSE)


The function setBreaks, which is called internally by plotHeatmap, is also available as a stand-alone function that the user can call directly for greater flexibility in setting the breaks for the heatmap. For example, it allows the user to specify that the breaks should be symmetric around 0. We also provide some default color spectra that can be better suited to particular settings, such as data symmetric around 0 – see ?showHeatmapPalettes.
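As a minimal sketch of calling setBreaks directly, assuming (per the description above) an option for making the breaks symmetric around 0 (check ?setBreaks for the exact argument names):

## compute breaks symmetric around 0 from the transformed data (the
## makeSymmetric argument name is an assumption; see ?setBreaks), then
## pass the breaks explicitly to plotHeatmap
symBreaks <- setBreaks(transform(ce), makeSymmetric=TRUE)
plotHeatmap(ce, breaks=symBreaks, main="Heatmap with symmetric breaks")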


5 The clustering workflow

We will now go into more detail about important options for the main parts of the clustering workflow.

5.1 clusterMany

In the quick start section we picked some simple and familiar clustering options that would run quickly and needed little explanation. However, our workflow generally assumes more complex options and more parameter variations are tried. Before getting into the specific options of clusterMany, let us first describe some of these more complicated setups, since many of the arguments of clusterMany depend on understanding them.


5.1.1 Base clustering algorithms and the ClusterFunction class


This package is meant to be able to use and compare different clustering routines. However, the required input, arguments, etc. of different clustering algorithms vary greatly. We created the ClusterFunction class to ensure that the information necessary to fit an algorithm into our workflow is well defined, while the remaining details of the algorithm can be ignored. In general, the user will not need to know the details of this class, since the built-in functions provided by the package can be accessed by character values. To see the set of character values that correspond to built-in functions:

listBuiltInFunctions()
## [1] "pam"            "kmeans"         "hierarchical01" "hierarchicalK" 
## [5] "tight"

If you are interested in implementing your own ClusterFunction object, see the documentation of the ClusterFunction class.
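For orientation only, here is a hypothetical sketch of wrapping a simple routine; the constructor argument names used below (clusterFUN, inputType, algorithmType, outputType) are assumptions, and the authoritative interface is in the documentation of ClusterFunction and internalFunctionCheck:

## a hypothetical 'K'-type routine: x is the p x n data matrix, so we
## cluster the samples (columns) with kmeans
myKmeans <- function(x, k, checkArgs, cluster.only, ...) {
  stats::kmeans(t(x), centers=k)$cluster
}
## wrap it as a ClusterFunction object (argument names are assumptions;
## consult ?ClusterFunction for the real interface)
myCF <- clusterFunction(clusterFUN=myKmeans, inputType="X",
                        algorithmType="K", outputType="vector")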


There are some important features of any clustering algorithm that are encoded in the ClusterFunction object and that are important to understand, because they affect which algorithms can be used when.


inputType The type of input the algorithm expects, which can be either a \(p \times n\) matrix of features, in which case the data is given via the argument x, or an \(n \times n\) matrix of dissimilarities, in which case it is given via the argument diss. Some algorithms can accept either type. To determine the inputType of one or more algorithms:

inputType(c("kmeans","pam","hierarchicalK"))
##        kmeans           pam hierarchicalK 
##           "X"      "either"        "diss"

algorithmType We group together algorithms based on common strategies they use, which affect how we can use them in our workflow. Currently there are two “types” of algorithms we consider, which we call type “K” and “01”. We can determine the type of a built-in function as follows:

algorithmType(c("kmeans","hierarchicalK","hierarchical01"))
##         kmeans  hierarchicalK hierarchical01 
##            "K"            "K"           "01"

The “K” algorithms are so called because their main parameter requirement is that the user specifies the number of clusters (\(K\)) to be created, and they require an input of k to the clustering function. The built-in “K” algorithms are:

listBuiltInTypeK()
## [1] "pam"           "kmeans"        "hierarchicalK"

The “01” algorithms are so named because the algorithm assumes that the input is a dissimilarity matrix \(D\) between samples and that the dissimilarities encoded in \(D\) are on a scale of 0-1. The clustering functions use this fact to make the primary user-specified parameter be not the number of final clusters, but a measure \(\alpha\) of how dissimilar samples in the same cluster can be (on a scale of 0-1). Given \(\alpha\), the algorithm then implements a method to determine the clusters (so \(\alpha\) implicitly determines \(K\)). These methods rely on the assumption that, because the 0-1 scale has special significance, the user will more easily be able to determine the level of dissimilarity allowed in a true cluster than to predetermine the number of clusters \(K\). The current 01 methods are:

listBuiltInType01()
## [1] "hierarchical01" "tight"

requiredArgs The different algorithm types require different arguments (k versus alpha). This is usually sorted out by clusterMany, which will only dispatch the appropriate one. Clustering functions can also have additional required arguments. See below for more discussion of how these arguments can be passed along to clusterMany or RSEC.


To see all of the required arguments of a function:

requiredArgs(c("hierarchical01","hierarchicalK"))
## $hierarchical01
## [1] "alpha"
## 
## $hierarchicalK
## [1] "k"

5.1.2 Internal clustering procedures


clusterMany iteratively calls the function clusterSingle over the collection of parameters. clusterSingle is the clustering workhorse and may be used by a user who wants more fine-grained control; see the documentation of clusterSingle.


Within each call of clusterSingle, there are three possible steps, depending on the values of subsample and sequential. If both are FALSE, then just a basic clustering routine is run on the input data (called the “main” clustering). If subsample=TRUE, there is first a step that subsamples the data and clusters the subsamples to calculate a co-occurrence matrix, which is then used as the input for the main clustering step. If sequential=TRUE this process is iterated over and over again to iteratively select the best clusters (see ?seqCluster for a detailed description). Each of these steps has a function that goes with it, but these should not generally be called by the user; their documentation, however, can be useful.


In particular, arguments to these functions that are not set by clusterMany can be passed via the named lists subsampleArgs, mainClusterArgs, and seqArgs. Some of the arguments to these steps can be varied in clusterMany, but more esoteric ones should be sent via these arguments of clusterMany (and they will be held fixed across all parameter combinations tried by clusterMany).


Main Clustering Step (mainClustering) The main clustering step described above is done by the function mainClustering. In addition to running the basic clustering algorithm on the input data, it also implements many common cluster-processing steps that affect the resulting clustering. We have already seen such an example with dimensionality reduction, where the input \(D\) is determined based on different input data. Many of the arguments to mainClustering are also arguments to clusterMany, so that mainClusterArgs is usually not needed. The main exception is to send more esoteric arguments to the underlying clustering function called in the main clustering step. The syntax for this is to give a nested list to the argument mainClusterArgs:

clusterMany(x, clusterFunction="hierarchicalK", ... ,
            mainClusterArgs=list(clusterArgs=list(method="single")))

Here we change the method argument of hclust (the clustering function called by hierarchicalK) to "single".


Subsampling (subsampleClustering) A more significant processing step that can be coupled with any clustering algorithm is to repeatedly subsample the data and cluster each subsample. This creates an \(n \times n\) matrix \(S\) of co-clustering percentages – how often two samples clustered together over the subsamples (there are slight variations in how this can be calculated; see the help pages of subsampleClustering). This does not itself give a clustering, but the resulting \(S\) matrix can then form the basis for clustering the samples. Specifically, the matrix \(D=1-S\) is given as input to the main clustering step described above. The subsampling option is computationally expensive, and when coupled with comparing many parameters results in a lengthy evaluation of clusterMany. However, we recommend it as one of the most useful methods for getting stable clustering results.


Note that the juxtaposition of these two steps (subsampling and then feeding the results to the main clustering function) implies there are actually two different possible clustering algorithms (and sets of corresponding parameters) – one for clustering the subsampled data, and one for clustering the resulting \(D\) based on the percentage of co-clustering of samples. This brings up a restriction on the clustering function in the main clustering step – it needs to be able to handle input that is a dissimilarity (inputType of either “diss” or “either”). Furthermore, the user might want to set the clustering function and corresponding parameters separately for the two steps. clusterMany handles this by having its main arguments focus on varying the parameters related to the main clustering step (the clustering of \(D\) after subsampling). For this reason, the argument clusterFunction varies the clustering function used by the main clustering step, not the subsampling step. The clustering function of the subsampling step can be specified by the user via subsampleArgs, but then it is set for all calls of clusterMany and does not vary. Alternatively, if the user doesn’t specify clusterFunction in subsampleArgs, the default is to use the clusterFunction of the main clustering step, along with any required arguments given by the user for that function (in some cases using the main step’s clusterFunction is not possible for the subsampling step, in which case the default is “pam”).


More generally, since few of the arguments to subsampleClustering can be varied by the direct arguments to clusterMany, it is also more common to want to change these arguments via the argument subsampleArgs. Examples are resamp.num (the number of subsamples to draw) or samp.p (the proportion of samples to draw in each subsample) – see ?subsampleClustering for full documentation of the possible arguments. In addition, there are arguments to be passed to the underlying clustering function; as for mainClustering, these are given as a nested list to the argument subsampleArgs.


An example of a syntax that sets the arguments for subsampleClustering would be:

clusterMany(x, ..., subsampleArgs=list(resamp.num=100, samp.p=0.5,
                                       clusterFunction="hierarchicalK",
                                       clusterArgs=list(method="single")))

Sequential Detection of Clusters Another complicated addition that can be combined with the main clustering step is sequential clustering. This refers to clustering the data, removing the “best” cluster, re-clustering the remaining samples, and continuing this iteration until all samples are clustered (or the algorithm otherwise calls a stop). Such sequential clustering can be convenient when there is a very dominant cluster, for example, that is far away from the other mass of data. Removing the samples in such clusters and re-clustering can be more productive and result in a clustering more robust to the choice of samples. A particular implementation of such a sequential method, based upon (Tseng and Wong 2005), is implemented in the clusterExperiment package when the option sequential=TRUE is chosen (see ?seqCluster for documentation of how the iteration is done). Sequential clustering can also be quite computationally expensive, particularly when paired with subsampling to determine \(D\) at each step of the iteration.


Because of the iterative nature of the sequential step, there are many possible parameters (see ?seqCluster). As with the subsampling step, clusterMany does not allow variation of very many of these parameters, but they can be set by passing a named list to seqArgs. An example of a syntax that sets the arguments for seqCluster would be:

clusterMany(x, ..., seqArgs=list(remain.n=10))

This code changes the remain.n option of the sequential step, which governs when the sequential step stops because there are not enough samples remaining.

5.1.3 Arguments of clusterMany


Now that we’ve explained the underlying architecture of the clustering provided in the package, and how to set the arguments that can’t be varied, we discuss the parameters that can be varied in clusterMany. (There are a few additional arguments of clusterMany that govern how it works, but here we focus only on the ones that can be given multiple options.)


Recall that when arguments of clusterMany take on multiple values, the combinations of all the multi-valued arguments are each given as input to a clustering routine (a sketch of such a call follows the list below).

  • sequential This parameter consists of logical values, TRUE and/or FALSE, indicating whether the sequential strategy should be implemented or not.
  • subsample This parameter consists of logical values, TRUE and/or FALSE, indicating whether the subsampling strategy for determining \(D\) should be implemented or not.
  • clusterFunction The clustering functions to be tried in the main clustering step. Recall that if subsample=TRUE is part of the combination, then clusterFunction is the method that will be used on the matrix \(D\) created from subsampling the data. Otherwise, clusterFunction is the clustering method that will be used directly on the data.
  • ks The argument ks is interpreted differently for different choices of the other parameters, and can differ between parameter combinations! If sequential=TRUE is part of the parameter combination, ks defines the argument k0 of sequential clustering (see ?seqCluster), which is approximately the initial starting point for the number of clusters in the sequential process. Otherwise, ks is passed to set k of the main clustering step (and by default that of the subsampled data), and is only relevant if clusterFunction is of type “K”. When/if findBestK=TRUE is part of the combination, ks also defines the range of values to search for the best k (see the details in the documentation of clusterMany for more).
  • dimReduce These are character strings indicating what choices of dimensionality reduction should be tried. The choices are “PCA”, indicating clustering on the top principal components, “var”, indicating clustering on the top most variable features, and “none”, indicating the whole data set should be used. If either “PCA” or “var” are chosen, the following parameters indicate the number of such features to be used (and can be a vector of values to try, as we have seen):
    • nVarDims
    • nPCADims
  • distFunction These are character values giving functions that provide a distance matrix between the samples when applied to the data. These functions should be accessible in the global environment (clusterMany applies get to the global environment to access them). To make them compatible with the standard R function dist, these functions should assume the samples are in the rows, i.e. they should work when applied to t(assay(ce)). We give an example in the next subsection below.
  • minSizes These are integer values determining the minimum size required for a cluster (passed to the mainClustering part of clustering).
  • alphas These are the \(\alpha\) parameters for “01” clustering techniques; these values are only relevant if one of the clusterFunction values is a “01” clustering algorithm. The values given to alphas should be between 0 and 1, with smaller values indicating greater similarity required between the clusters.
  • betas These are the \(\beta\) parameters for sequential clustering; these values are only relevant if sequential=TRUE and determine the level of stability required between changes in the parameters to determine that a cluster is stable.
  • findBestK This option is for “K” clustering techniques, and indicates that \(K\) should be chosen automatically as the \(K\) that gives the largest silhouette distance between clusters.
  • removeSil A logical value as to whether samples with small silhouette distance to their assigned cluster are “removed”, in the sense that they are not given their original cluster assignment but are instead assigned -1. This option is for “K” clustering techniques as a method of removing poorly clustered samples.
  • silCutoff If removeSil is TRUE, then silCutoff determines the cutoff on silhouette distance for unassigning the sample.
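For example, a sketch of a call varying several of the arguments above (the particular values are illustrative, not recommendations, and we assign to a new object ce2 so as not to overwrite ce):

## every combination of the multi-valued arguments below is run as a
## separate clustering
ce2 <- clusterMany(se, clusterFunction=c("pam","hierarchicalK"), ks=4:8,
                   dimReduce=c("PCA","var"), nPCADims=c(5,15),
                   nVarDims=c(500,1000), removeSil=c(TRUE,FALSE), isCount=TRUE)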

clusterMany tries to have a generally simple interface, and for this reason makes choices about what is meant by certain combinations of parameters. For example, in combinations where findBestK=TRUE, ks=2:10 is taken to mean that the clustering should find the best \(k\) out of the range 2-10. However, in parameter combinations where findBestK=FALSE, the same ks indicates the specific number of clusters, \(K\), that should be found. To see the parameter combinations that will be run, the user can set run=FALSE, and the output will be a matrix of the parameter values implied by the user’s choices. For parameter combinations that are not what is desired, the user should consider making direct calls to clusterSingle, where all of these option combinations (and many more) can be explicitly requested.


Other parameters for the clustering are kept fixed. As described above, there are many more possible parameters in play than are considered in clusterMany. These parameters can be set via the arguments mainClusterArgs, subsampleArgs and seqArgs, corresponding to the different processes described above (the main clustering step, the creation of \(D\) to be clustered via subsampling, and the sequential clustering process, respectively). These arguments take a list of arguments that are sent directly to clusterSingle. However, they may be overridden by clusterMany’s interpretation of how different combinations interact; again, for complete control, direct calls to clusterSingle are necessary.

Argument of clusterMany | Dependencies | Passed to | Argument passed to
ks | sequential=FALSE, findBestK=FALSE, clusterFunction of type ‘K’ | mainClustering | k
ks | sequential=FALSE, findBestK=TRUE, clusterFunction of type ‘K’ | mainClustering | kRange
clusterFunction | none | mainClustering | clusterFunction
minSizes | none | mainClustering | minSize
distFunction | subsample=FALSE | mainClustering | distFunction
alphas | clusterFunction of type ‘01’ | mainClustering | alpha
findBestK | clusterFunction of type ‘K’ | mainClustering | findBestK
removeSil | clusterFunction of type ‘K’ | mainClustering | removeSil
silCutoff | clusterFunction of type ‘K’ | mainClustering | silCutoff
5.1.4 Example changing the distance function and clustering algorithm

Providing different distance functions is slightly more involved than the other parameters, so we give an example here.

First we define distances that we would like to compare. We are going to define two distances that take values between 0-1 based on different choices of correlation.

corDist<-function(x){(1-cor(t(x),method="pearson"))/2}
 spearDist<-function(x){(1-cor(t(x),method="spearman"))/2}

These distances are defined so as to give distance of 0 between samples with correlation 1, and distance of 1 for correlation -1.

We will also compare using different algorithms for clustering. Currently, clusterMany requires that the distances work with all of the clusterFunction choices given. Since some of the clusterFunction algorithms require a distance matrix between 0-1, this means we can only compare all of the algorithms when the distance is a 0-1 distance. (Future versions may provide a work-around so that clusterMany simply skips algorithms that don’t match the distance.) Since the distances we defined are between 0-1, however, we can use any algorithm that takes dissimilarities as input.


Note on 0-1 clustering when subsample=FALSE We would note that the default values of \(\alpha\) in clusterMany and RSEC for the 0-1 clustering were set with in mind a \(D\) resulting from subsampling or another consensus summary. In general, subsampling creates a \(D\) matrix with high similarity for many samples that share a cluster (the proportion of times well-clustered samples are seen together can easily be in the 0.8-0.95 range, or even exactly 1). For this reason the default \(\alpha\) is 0.1, which requires distances between samples in the same cluster of 0.1 or less (i.e. a similarity of 0.9 or more).


To illustrate this point, we show an example of the \(D\) matrix from subsampling. To do this we make use of clusterSingle, the workhorse mentioned above that runs a single clustering command directly; it stores the \(D\) from the subsampling in the “coClustering” slot when we set replaceCoClustering=TRUE (and therefore we save the result as a separate object, so that it doesn’t write over the existing “coClustering” slot of ce). Note that the result is \(1-p_{ij}\), where \(p_{ij}\) is the proportion of times samples \(i\) and \(j\) clustered together.

ceSub<-clusterSingle(ce, dimReduce="mad", ndims=1000, subsample=TRUE,
                     subsampleArgs=list(clusterFunction="pam", clusterArgs=list(k=8)),
                     mainClusterArgs=list(clusterFunction="hierarchical01",
                                          clusterArgs=list(alpha=0.1), minSize=5),
                     clusterLabel="subsamplingCluster", replaceCoClustering=TRUE)
 plotCoClustering(ceSub,colorScale=rev(seqPal5))

We see that even here, the default of \(\alpha=0.1\) was perhaps too conservative, since only two clusters came out (at least with size greater than 5).


However, distances based on correlation calculated directly on the data, such as those we created above, are also often used for clustering expression data directly (i.e. without the subsampling step). They are unlikely to have dissimilarities as low as those seen in subsampling, even for well-clustered samples. Here’s a visualization of the correlation distance matrix we defined above (using Spearman’s correlation) on the top 1000 most variable features:

dSp<-spearDist(t(transform(ce,dimReduce="mad",nVarDims=1000)))
 plotHeatmap(dSp,isSymmetric=TRUE,colorScale=rev(seqPal5))
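The clusterMany call that created ceDist is elided from this excerpt; the following is a sketch of its general shape, passing the distance functions by name via distFunction (the parameter values are illustrative, not the exact elided call):

## compare 0-1 clustering functions across the two correlation distances;
## the distance functions are looked up by name in the global environment
ceDist <- clusterMany(ce, clusterFunction=c("pam","hierarchical01","tight"),
                      ks=7:9, alphas=c(0.35,0.45), findBestK=FALSE,
                      subsample=FALSE, sequential=FALSE, isCount=TRUE,
                      distFunction=c("corDist","spearDist"),
                      dimReduce="var", nVarDims=1000)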

clusterLabels(ceDist)<-gsub("hierarchical","hier",clusterLabels(ceDist))
par(mar=c(1.1,15.1,1.1,1.1))
plotClusters(ceDist,axisLine=-2,sampleData=c("Biological_Condition"))


Notice that using the “tight” method did not give relevant results (no samples were clustered).


5.1.5 Dealing with large numbers of clusterings

A good first check before running clusterMany is to determine how many clusterings you are asking for. clusterMany has some limited internal checks to avoid unnecessary duplicates (e.g. removeSil only works with some clusterFunctions, so clusterMany will detect and drop the invalid combinations), but generally it runs all combinations. This can take a while for the more complicated clustering techniques, so it is a good idea to check what you are getting into. You can do this by running clusterMany with run=FALSE.

In the following we consider expanding our original clustering choices to consider individual choices of \(K\) (rather than just findBestK=TRUE).

checkParam<-clusterMany(se, clusterFunction="pam", ks=2:10, ... , run=FALSE)
# ce<-clusterMany(se, paramMatrix=checkParam$paramMatrix,
#                 mainClusterArgs=checkParam$mainClusterArgs,
#                 seqArgs=checkParam$seqArgs, subsampleArgs=checkParam$subsampleArgs)
ce<-clusterMany(ce, clusterFunction="pam", ks=2:10, findBestK=TRUE, removeSil=c(TRUE),
                isCount=TRUE, dimReduce=c("PCA","var"), nVarDims=c(100,500,1000),
                nPCADims=c(5,15,50), run=TRUE)

Note that we also provided in the above call the additional arguments mainClusterArgs, seqArgs and subsampleArgs, which normally we might neglect in a direct call to clusterMany. This is because in creating the paramMatrix, clusterMany may internally change these default values, and we want to make sure we exactly replicate what we would get from a direct call.

5.2 Create a unified cluster from many clusterings (combineMany)

As mentioned in the Quick Start section, the default option for combineMany is to only define a cluster when all of the samples are in the same clusters across all clusterings. However, this is generally too conservative and just results in most samples not being assigned to a cluster.

Instead combineMany has a parameter proportion that governs in what proportion of clusterings the samples should be together. Internally, combineMany makes a coClustering matrix \(D\). Like the \(D\) created by subsampling in clusterMany, the coClustering matrix takes on values 0-1 for the proportion of times the samples are together in the clustering. This \(D\) matrix is saved in the ce object and can be visualized with plotCoClustering (which is just a call to plotHeatmap). Recall the one we last made in the QuickStart, with our last call to combineMany (proportion=0.7 and minSize=3).

plotCoClustering(ce)

combineMany performs the clustering by running a “01” clustering algorithm on the \(D\) matrix of percentage co-clustering (the default being “hierarchical01”). The alpha argument to the 01 clustering is 1-proportion. Also passed to the clustering algorithm is the parameter minSize which sets the minimum size of a cluster.

We can also manually choose the set of clusterings to use in combineMany with the argument whichClusters. Here we choose only the clusterings that used dimensionality reduction based on the most variable features. We also set minSize lower than the default of 5, to allow for smaller clusters.

wh<-grep("nVAR",clusterLabels(ce))
 ce<-combineMany(ce,whichCluster=wh,proportion=0.7,minSize=3,
                 clusterLabel="combineMany,nVAR")
 plotCoClustering(ce)

We can compare to all of our other versions of combineMany. While they do not all have clusterTypes equal to “combineMany” (only the most recent call has clusterType exactly equal to “combineMany”), they all have “combineMany” as part of their clusterType, even though they have different clusterLabels (and now we’ll see that it was useful to give them different labels!)

wh<-grep("combineMany",clusterTypes(ce))
 par(mar=plotCMar)
 plotClusters(ce,whichClusters=rev(wh),axisLine=-1)

Treatment of Unclustered assignments -1 values are treated separately in the calculation. In particular, they are not considered in the calculation of percentage co-clustering – the percent co-clustering is taken only with respect to those clusterings where both samples were assigned. However, a post-processing is done to the clusters found from running the clustering on the \(D\) matrix. For each sample, the percentage of times that they were marked -1 in the clusterings is calculated. If this percentage is greater than the argument propUnassigned then the sample is marked as -1 in the clustering returned by combineMany.

Good scenarios for running combineMany Varying certain parameters results in clusterings better suited for combineMany than others. In particular, if there are huge discrepancies in the set of clusterings given to combineMany, the result will be a shattering of the samples into many small clusters. Similarly, if the numbers of clusters \(K\) are very different, the end result will likely resemble that of the largest \(K\), and how much value that adds (rather than just picking the clustering with the largest \(K\)) is debatable. However, for “01” clustering algorithms or clusterings using the sequential algorithm, varying the underlying parameters \(\alpha\) or \(k_0\) often results in roughly similar clusterings across the parameters, so that creating a consensus across them is highly informative.


5.3.1 makeDendrogram

Like clustering, the dendrogram can depend on what features are included from the data. The same options for clustering are available for the hierarchical clustering of the clusters, namely choices of dimensionality reduction via dimReduce and the number of dimensions via ndims.

ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
 plotDendrogram(ce)

Notice that the plot of the dendrogram shows the hierarchy of the clusters (and color codes them according to the colors stored in the clusterLegend slot).

Recall that the most recent clustering made is from our call to combineMany, where we experimented with using only some of the clusterings from clusterMany, so that is our current primaryCluster:

show(ce)
## class: ClusterExperiment 
## Primary cluster label: combineMany,nVAR 
## Table of clusters (of primary clustering):
## -1 c1 c2 c3 c4 c5 c6 c7 
## 10  7  3  9 15 13  5  3 
## Total number of clusterings: 41 
## Dendrogram run on 'combineMany,nVAR' (cluster index: 1)
## -----------
## mergeClusters run? No

This is the clustering from combining only the clusterings from clusterMany that use the top most variable genes. Because it is the primaryCluster, it was the clustering that was used by default to make the dendrogram.

We might prefer to get back to the dendrogram based on our combineMany in the quick start (the “combineMany,final” clustering). We lost that dendrogram when we called makeDendrogram again. However, we can rerun makeDendrogram and choose a different clustering from which to make the dendrogram.


To vary the display, we’ll put blocks of color instead of names at the leaves (labelType="colorblock"), and we’ll also show the leaves as the individual samples instead of the clusters (leafType="sample"); the combination gives us color blocks equal to the sizes of the clusters.

ce<-makeDendrogram(ce,dimReduce="var",ndims=500,
                    whichCluster="combineMany,final")
plotDendrogram(ce,leafType="sample",labelType="colorblock")

Note that the clusterType of this clustering is not “combineMany”, but “combineMany.x”, where “x” indicates what iteration it was:

clusterTypes(ce)[which(clusterLabels(ce)=="combineMany,final")]
## [1] "combineMany.3"
## Primary cluster type: combineMany 
## Primary cluster label: combineMany,final 
## Table of clusters (of primary clustering):
## -1 c1 c2 c3 c4 c5 c6 c7 
##  6  4  8  5  9 15 14  4 
## Total number of clusterings: 41 
## Dendrogram run on 'combineMany,final' (cluster index: 3)
## -----------


5.3.2 Merging clusters with little differential expression

We can then use this hierarchy of clusters to merge clusters that show little difference in expression. We do this by testing, for each node of the dendrogram, which features have the same mean in the set of clusters on the right split of the node as in the set on the left split. This is done via getBestFeatures (see the section on getBestFeatures), where the type argument is set to “Dendro”.


Starting at the bottom of the tree, those clusters for which the estimated percentage of differentially expressed features is below a certain value (determined by the argument cutoff) are merged into a larger cluster. This testing of differences and merging continues until the estimated percentage of non-null DE features is above cutoff. This means lower values of cutoff result in less merging of clusters. Multiple methods for estimating the percentage of non-null features are implemented. The option mergeMethod="adjP", which we showed earlier, is the simplest: the proportion of DE genes found significant at a given False Discovery Rate threshold (using the Benjamini-Hochberg procedure). However, other more sophisticated methods are also implemented (see the help of mergeClusters).

Notice that mergeClusters will always run based on the clustering that made the currently existing dendrogram. So it is always good to check that it is what we expect.

ce
## class: ClusterExperiment 
## Primary cluster type: combineMany 
## Primary cluster label: combineMany,final 
## Table of clusters (of primary clustering):
## -1 c1 c2 c3 c4 c5 c6 c7 
##  6  4  8  5  9 15 14  4 
## Total number of clusterings: 41 
## Dendrogram run on 'combineMany,final' (cluster index: 3)
## -----------
## combineMany run? Yes 
## makeDendrogram run? Yes 
## mergeClusters run? No


mergeClusters can also be run without merging the clusters, simply drawing a plot showing the dendrogram along with the estimates of the percentage of non-null features, to aid in deciding on a cutoff and method. By setting plotInfo="all", all of the estimates from the different methods are displayed simultaneously, whereas earlier in the Quick Start we only showed the default values.

mergeClusters(ce,mergeMethod="none",plotInfo="all")
## Note: Merging will be done on ' combineMany,final ', with clustering index 3
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3.3. Rerun with
## increased df

Now we can pick a cutoff. We’ll give it a label to keep it separate from the previous run we had made.

ce<-mergeClusters(ce,cutoff=0.05,mergeMethod="adjP",clusterLabel="mergeClusters,v2")
## Note: Merging will be done on ' combineMany,final ', with clustering index 3
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3.3. Rerun with
## increased df


ce
## class: ClusterExperiment 
 ## dim: 7069 65 
 ## Primary cluster type: mergeClusters 
 ## Primary cluster label: mergeClusters,v2 
 ## Table of clusters (of primary clustering):
## -1 m1 m2 
##  6 28 31 
 ## Total number of clusterings: 42 
 ## Dendrogram run on 'combineMany,final' (cluster index: 4)
 ## -----------

ce<-mergeClusters(ce,cutoff=0.15,mergeMethod="MB",
                   clusterLabel="mergeClusters,v3")
## Note: Merging will be done on ' combineMany,final ', with clustering index 4
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3.3. Rerun with
## increased df

ce
## class: ClusterExperiment 
 ## dim: 7069 65 
 ## Primary cluster type: mergeClusters 
 ## Primary cluster label: mergeClusters,v3 
 ## Table of clusters (of primary clustering):
## -1 m1 m2 m3 m4 m5 
##  6 13 12  5 15 14 
 ## Total number of clusterings: 43 
 ## Dendrogram run on 'combineMany,final' (cluster index: 5)
 ## -----------
## combineMany run? Yes 
## makeDendrogram run? Yes 
## mergeClusters run? Yes


Note that we can turn off plotting completely by setting plot=FALSE.
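For example (a sketch reusing the cutoff from above):

## merge without drawing the dendrogram/estimate plot
mergeClusters(ce, cutoff=0.05, mergeMethod="adjP", plot=FALSE)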

5.4 Keeping track of and rerunning elements of the workflow

## clusterMany 36 0 0 0 0

Explicit details about every workflow cluster and their index in clusterMatrix is given by workflowClusterDetails:

head(workflowClusterDetails(ce),8)
##   index          type iteration             label
## 1     1 mergeClusters         0  mergeClusters,v3
## 2     2 mergeClusters         4  mergeClusters,v2
## 3     3   combineMany         4  combineMany,nVAR
## 4     4 mergeClusters         3   mergeClusters.3
## 5     5   combineMany         0 combineMany,final
## 6     6   combineMany         2   combineMany,0.7
## 7     7   combineMany         1     combineMany,1
## 8     8   clusterMany         0      nVAR=100,k=5

A note on the whichCluster argument Many functions take the whichCluster argument for identifying a clustering or clusterings on which to perform an action. These arguments all act similarly across functions, and allow the user to give character arguments. As described above, these can be shortcuts like “workflow”, or they can match either clusterTypes or clusterLabels of the object. It is important to note that matching is first done to clusterTypes, and then if not successful to clusterLabels. Since neither clusterTypes nor clusterLabels is guaranteed to be unique, the user should be careful in how they make the call. And, of course, whichCluster arguments can also take explicit numeric integers that identify the column(s) of the clusterMatrix that should be used.
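A sketch of the equivalent ways of pointing at a clustering (the labels are those created above; the index value is illustrative):

plotClusters(ce, whichClusters="combineMany,final")  # match by clusterLabel
plotClusters(ce, whichClusters="mergeClusters")      # matched to clusterTypes first
plotClusters(ce, whichClusters=3)                    # explicit column of clusterMatrix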

5.4.1 Designate a Final Clustering

ce<-setToFinal(ce, ... ,
               clusterLabel="Final Clustering")
par(mar=plotCMar)
plotClusters(ce,whichClusters="workflow")

Note that because it is labeled as “final” it shows up automatically in “workflow” clusters in our plotClusters plot. It has also been set as our primaryCluster and has the new clusterLabel we gave it in the call to setToFinal.

This didn’t get rid of the unwanted mergeClusters result, which is still the most recent one and so still shows up as “the” mergeClusters result. We could remove that “mergeClusters” result with removeClusters. Alternatively, we could manually change its clusterTypes to mergeClusters.x so that it doesn’t show up as current. A cleaner way would have been to first set the desired cluster (“mergeClusters.4”) to the most current iteration with setToCurrent, which would have bumped the existing mergeClusters result so that it is no longer current.
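A sketch of that cleaner route, using the “mergeClusters.4” label from the discussion above:

## make the desired clustering the current mergeClusters iteration, bumping
## the existing one so it is no longer current
ce <- setToCurrent(ce, whichCluster="mergeClusters.4")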

@@ -1044,7 +1109,7 @@

5.4.1 Designate a Final Clustering

5.5 RSEC

The clustering workflow described so far is a generalization of our RSEC algorithm for single-cell sequencing data. The RSEC algorithm is particularly oriented around using subsampling and sequential discovery of clusters to find very robust signals.

-

In particular, RSEC is a single function that follows the entire workflow described above, but makes the choices to set subsample=TRUE and sequential=TRUE. Furthermore, the only clustering functions that are allowed are the “01” types (“hierarchical01” and “tight”). This removes a number of options from clusterMany, making for a slightly reduced set of commands. RSEC also implements the combineMany, makeDendrogram and mergeClusters steps, again with not all arguments available to be set. Furthermore, the defaults set in RSEC are those we choose for our algorithm, and occassionally vary from stand-alone method. The final output is a clusterExperiment object as you would get from following the workflow.

+

In particular, RSEC is a single function that follows the entire workflow described above, but makes the choices to set subsample=TRUE and sequential=TRUE. Furthermore, the only clustering functions that are allowed are the “01” types (currently “hierarchical01” and “tight”). This removes a number of options from clusterMany, making for a slightly reduced set of arguments. RSEC also implements the combineMany, makeDendrogram and mergeClusters steps, again with not all arguments available to be set. Furthermore, the defaults set in RSEC are those we choose for our algorithm, and occasionally vary from the stand-alone methods. The final output is a clusterExperiment object as you would get from following the workflow.
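A minimal sketch of a call (the argument values here are illustrative, not the defaults):

rsecOut<-RSEC(ce, isCount=TRUE, k0s=4:15, alphas=c(0.1,0.2,0.3), ncores=1)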

We give the following correspondence to help see which arguments of each component are fixed by RSEC, which are allowed to be set by the user, and how they correspond to arguments in the stand-alone workflow functions.

6 Finding Features related to a Clustering

The function getBestFeatures finds features in the data that are strongly differentiated between the clusters of a given clustering. Finding the best features is generally the last step in the workflow, once a final clustering has been decided upon, though as we have seen it is also called internally by mergeClusters to decide which clusters to merge together.

-

The function getBestFeatures calls limma on input data to determine the gene features most associated with a particular clustering. getBestFeatures picks the primaryCluster of a ClusterExperiment object as the clustering to use to find features. If the standard workflow is followed, this will be the last completed step (usually the result of mergeClusters or manually choosing a final cluster via setToFinal). The primaryCluster can of course be changed by setting primaryClusterIndex to point to a different clustering.

+

The function getBestFeatures calls limma (Smyth 2004; Ritchie et al. 2015) on input data to determine the gene features most associated with a particular clustering. getBestFeatures picks the primaryCluster of a ClusterExperiment object as the clustering to use to find features. If the standard workflow is followed, this will be the last completed step (usually the result of mergeClusters or manually choosing a final cluster via setToFinal). The primaryCluster can of course be changed by setting primaryClusterIndex to point to a different clustering.

Since our most recent clustering (the one designated by our setToFinal call) only has 2 clusters, we are going to reset the primary clustering to be our earlier mergeClusters result with the label “mergeClusters,v3”. This will be better for explaining the functionality of the getBestFeatures method.

wh<-which(clusterLabels(ce)=="mergeClusters,v3")
 if(length(wh)==1) primaryClusterIndex(ce)<-wh else stop()
@@ -1244,22 +1309,22 @@

6.1.1 All Pairwise

The option type="Pairs", which we saw earlier, performs all pair-wise tests between the clusters for each feature, testing for each pair of clusters whether the mean of the feature is different between the two clusters. Here is the example from above using all pairwise comparisons:

pairsAllTop<-getBestFeatures(ce,contrastType="Pairs",p.value=0.05)
 dim(pairsAllTop)
-
## [1] 150   9
+
## [1] 100   9
head(pairsAllTop)
##   IndexInOriginal Contrast Feature     logFC   AveExpr         t
-## 1            6565    X1-X2     VIM -8.608470  7.366498 -7.374382
-## 2            3309    X1-X2    MIAT  7.875230  7.987637  6.479908
-## 3             552    X1-X2  BCL11B  9.186117  4.120686  6.468115
-## 4            4501    X1-X2 PRTFDC1 -7.635334  1.913974 -6.235150
-## 5            2210    X1-X2    GLI3 -7.460028  5.121621 -6.189493
-## 6            1284    X1-X2   CXADR  6.796402 10.516764  5.723578
+## 1             552    X1-X2  BCL11B -8.912981  3.848816 -8.174292
+## 2            2210    X1-X2    GLI3  6.690233  5.473713  6.845450
+## 3            6565    X1-X2     VIM  6.818923  7.749640  6.694650
+## 4            3309    X1-X2    MIAT -7.097724  7.325323 -6.518758
+## 5            5745    X1-X2   STMN2 -6.692038  8.314910 -6.449138
+## 6            1744    X1-X2    ENO1  5.615936 10.554084  6.370684
 ##        P.Value    adj.P.Val         B
-## 1 5.322547e-10 3.865591e-07 10.834597
-## 2 1.815383e-08 7.103104e-06  8.032926
-## 3 1.901249e-08 7.357625e-06  7.995912
-## 4 4.724549e-08 1.522698e-05  7.265297
-## 5 5.643513e-08 1.765221e-05  7.122301
-## 6 3.408406e-07 7.608638e-05  5.669800
+## 1 9.355870e-12 7.780782e-09 15.078592
+## 2 2.492757e-09 8.595756e-07 10.328588
+## 3 4.665952e-09 1.499255e-06  9.790555
+## 4 9.663605e-09 2.689450e-06  9.164671
+## 5 1.287780e-08 3.422298e-06  8.917550
+## 6 1.778408e-08 4.411075e-06  8.639557

Notice that, compared to the Quick Start guide, we didn’t set the parameter number, which is passed to topTable, so we get at most 10 significant features for each contrast/comparison (because the default value of number in topTable is 10). Similarly, if we hadn’t set a value for p.value, topTable would return the top number genes per contrast, regardless of whether they were all significant. These are the defaults of topTable, which we purposefully do not modify, but we urge the user to read the documentation of topTable carefully to understand what is being asked for. In the Quick Start, we set number=NROW(ce) to make sure we got all significant genes, as in the sketch below.
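A sketch of that Quick Start style call, returning every gene significant at 0.05 for each contrast:

pairsAllSig<-getBestFeatures(ce,contrastType="Pairs",p.value=0.05,number=NROW(ce))
dim(pairsAllSig)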

In addition to the columns provided by topTable, the column “Contrast” tells us what pairwise contrast the result is from. “X1-X2” means a comparison of cluster 1 and cluster 2. The column “IndexInOriginal” gives the index of the gene in the original input data matrix, namely assay(ce). The other columns are given by topTable (with the column “Feature” renamed – it is usually “ProbeID” in limma).
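For example, a small sketch of using “IndexInOriginal” to pull the corresponding rows of the input data (topDat is a hypothetical name):

topDat<-assay(ce)[pairsAllTop$IndexInOriginal,]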

@@ -1268,20 +1333,20 @@

6.1.2 One Against All

The choice type="OneAgainsAll" performs a comparison of a cluster against the mean of all of the other clusters.

best1vsAll<-getBestFeatures(ce,contrastType="OneAgainstAll",p.value=0.05)
 head(best1vsAll)
-
##   IndexInOriginal ContrastName              Contrast Feature     logFC
-## 1            6565            1 X1-(X2+X3+X4+X5+X6)/5     VIM -5.569268
-## 2            5753            1 X1-(X2+X3+X4+X5+X6)/5   STRAP -5.808820
-## 3            2960            1 X1-(X2+X3+X4+X5+X6)/5    LDHA -6.447274
-## 4            6090            1 X1-(X2+X3+X4+X5+X6)/5 TMEM258 -4.881551
-## 5             552            1 X1-(X2+X3+X4+X5+X6)/5  BCL11B  5.678299
-## 6            2350            1 X1-(X2+X3+X4+X5+X6)/5   GSTP1 -5.005114
+
##   IndexInOriginal ContrastName           Contrast Feature     logFC
+## 1             538            1 X1-(X2+X3+X4+X5)/4    BCAN  6.760057
+## 2            2210            1 X1-(X2+X3+X4+X5)/4    GLI3  6.400933
+## 3            3849            1 X1-(X2+X3+X4+X5)/4    NRG1  6.536965
+## 4            1284            1 X1-(X2+X3+X4+X5)/4   CXADR -6.342056
+## 5             435            1 X1-(X2+X3+X4+X5)/4  ATP1A2  6.878362
+## 6            2455            1 X1-(X2+X3+X4+X5)/4    HES1  5.493516
 ##    AveExpr         t      P.Value    adj.P.Val        B
-## 1 7.366498 -6.372501 2.764171e-08 6.776853e-06 8.270257
-## 2 7.975761 -6.358567 2.918907e-08 6.877918e-06 8.223424
-## 3 7.200328 -5.436302 1.015365e-06 1.469819e-04 5.159886
-## 4 6.563927 -5.423463 1.065741e-06 1.516858e-04 5.117965
-## 5 4.120686  5.340430 1.456449e-06 1.936990e-04 4.847522
-## 6 8.755384 -5.125553 3.245503e-06 3.781724e-04 4.153399
+## 1 2.302045  8.440550 3.043138e-12 2.195096e-09 16.77940
+## 2 5.473713  8.159071 9.976469e-12 6.079626e-09 15.72254
+## 3 2.401928  8.074085 1.427970e-11 8.011365e-09 15.40276
+## 4 9.885678 -7.637553 9.000285e-11 4.185725e-08 13.75715
+## 5 2.465131  7.590835 1.095771e-10 4.965390e-08 13.58089
+## 6 3.593156  7.341638 3.125841e-10 1.150863e-07 12.64084

Notice that now there are both a “Contrast” and a “ContrastName” column. Like before, “Contrast” gives an explicit definition of the comparison, here in the form “X1-(X2+X3+X4+X5)/4”, meaning the mean of cluster 1 is compared to the average of the means of clusters 2–5. “ContrastName” interprets this into a more usable name, so that this contrast can be easily identified as a test of “cluster 1”.
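A quick sketch of using “ContrastName” to isolate the results for a single cluster:

head(subset(best1vsAll, ContrastName=="1"))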

@@ -1297,8 +1362,8 @@

6.1.3 Dendrogram

## Primary cluster type: mergeClusters
## Primary cluster label: mergeClusters,v3
## Table of clusters (of primary clustering):
-## -1 m1 m2 m3 m4 m5 m6
-## 12  9  7 15 14  4  4
+## -1 m1 m2 m3 m4 m5
+##  6 13 12  5 15 14
## Total number of clusterings: 43
## Dendrogram run on 'combineMany,final' (cluster index: 5)
## -----------

@@ -1311,39 +1376,38 @@

6.1.3 Dendrogram

ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
 bestDendro<-getBestFeatures(ce,contrastType="Dendro",p.value=0.05)
 head(bestDendro)
-
##   IndexInOriginal ContrastName                  Contrast Feature     logFC
-## 1            2210        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2    GLI3 -8.212906
-## 2            5745        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2   STMN2  8.524708
-## 3            4285        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2  PLXNA2  8.197781
-## 4            1465        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2    DLK1 -5.338176
-## 5            3886        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2  NUDCD2 -7.065937
-## 6            5276        Node1 (X4+X5+X1+X6)/4-(X2+X3)/2   SFRP1 -7.849545
+
##   IndexInOriginal ContrastName               Contrast Feature     logFC
+## 1            2210        Node1 (X5+X2+X3)/3-(X1+X4)/2    GLI3 -7.784163
+## 2            5745        Node1 (X5+X2+X3)/3-(X1+X4)/2   STMN2  7.997804
+## 3            4285        Node1 (X5+X2+X3)/3-(X1+X4)/2  PLXNA2  7.827597
+## 4            1465        Node1 (X5+X2+X3)/3-(X1+X4)/2    DLK1 -5.378393
+## 5            4286        Node1 (X5+X2+X3)/3-(X1+X4)/2  PLXNA4  7.333733
+## 6            2291        Node1 (X5+X2+X3)/3-(X1+X4)/2    GPC3 -4.600692
 ##    AveExpr          t      P.Value    adj.P.Val        B
-## 1 5.121621 -11.139064 2.528864e-16 2.234567e-12 25.24467
-## 2 8.742343  11.057577 3.418369e-16 2.416445e-12 24.98301
-## 3 5.572182  10.071251 1.393355e-14 6.156015e-11 21.73829
-## 4 3.227247  -9.162249 4.633852e-13 1.191666e-09 18.63158
-## 5 4.109444  -8.995111 8.893410e-13 2.087345e-09 18.04976
-## 6 5.533309  -8.979601 9.449010e-13 2.087345e-09 17.99562
+## 1 5.473713 -11.618188 6.343480e-18 4.484206e-14 29.03659
+## 2 8.314910  11.242937 2.834187e-17 1.335658e-13 27.69925
+## 3 5.186352  10.797543 1.708818e-16 6.039816e-13 26.08662
+## 4 2.916002 -10.623857 3.462440e-16 1.087822e-12 25.45063
+## 5 4.970650  10.075551 3.279045e-15 7.726524e-12 23.41829
+## 6 2.520560  -9.721577 1.419241e-14 3.086959e-11 22.08789

Again, there are both a “ContrastName” and a “Contrast” column. The “Contrast” column identifies which clusters were on each side of the node (and hence compared), and “ContrastName” gives a name for the node.
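For a quick per-node summary, we can tabulate the significant features by node:

table(bestDendro$ContrastName)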

levels(bestDendro$Contrast)
-
## [1] "(X4+X5+X1+X6)/4-(X2+X3)/2" "X4-(X5+X1+X6)/3"          
-## [3] "X2-X3"                     "X5-(X1+X6)/2"             
-## [5] "X1-X6"
+
## [1] "(X5+X2+X3)/3-(X1+X4)/2" "X5-(X2+X3)/2"          
+## [3] "X1-X4"                  "X2-X3"

We can plot the dendrogram showing the node names to help make sense of which contrasts go with which nodes (plotDendrogram calls plot.phylo from the ape package and can take those arguments).

plotDendrogram(ce,show.node.label=TRUE)
-

+

6.2 Analysis for count and other RNASeq data

-

The getBestFeatures method for ClusterExperiment objects has an argument isCount. If this is marked TRUE then the data in assay(x) is assumed to be counts, and the call to limma uses the voom correction. This correction deals with the mean-variance relationship that is found with count data. This means that the differential expression analysis is done on \(log_2(x+0.5)\). This is regardless of what transformation is stored in the ClusterExperiment object! The voom call within getBestFeatures however, is by default set to normalize.method = "none" in the call to voom (though the user can set normalize.method in the call to getBestFeatures).

+

The getBestFeatures method for ClusterExperiment objects has an argument isCount. If this is marked TRUE then the data in assay(x) is assumed to be counts, and the call to limma uses the voom (Law et al. 2014) correction. This correction deals with the mean-variance relationship that is found with count data. This means that the differential expression analysis is done on \(\log_2(x+0.5)\). This is regardless of what transformation is stored in the ClusterExperiment object! The voom call within getBestFeatures however, is by default set to normalize.method = "none" in the call to voom (though the user can set normalize.method in the call to getBestFeatures).

If instead isCount=FALSE, then limma is run on transform(x), i.e. the data after applying the transformation stored in the ClusterExperiment object. In this case, there is no voom correction.
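A sketch of the two modes side by side (argument values illustrative):

resCount<-getBestFeatures(ce,contrastType="Pairs",p.value=0.05,isCount=TRUE)  # voom on assay(ce)
resTrans<-getBestFeatures(ce,contrastType="Pairs",p.value=0.05,isCount=FALSE) # limma on transform(ce)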

Unlike edgeR or DESeq, the voom correction does not explicitly require a count matrix, and therefore it has been proposed that it can be used on FPKM or TPM entries, or on data normalized via RUV. Setting isCount=TRUE will have this effect even if the data in the assay slot are not counts. However, the authors of the package do not recommend using voom on anything other than counts; see e.g. this discussion.

6.3 Piping into other DE routines

-

Ultimately, for many settings, the user may prefer to use other techniques for differential expression analysis or have more control over certain aspects of it. The function clusterContrasts may be called by the user to get the contrasts that are defined within getBestFeatures (e.g. dendrogram contrasts or pairwise contrasts). These contrasts, which are in the format needed for limma can be piped into programs that allow for contrasts in their linear models like edgeR (Robinson, Mccarthy, and Smyth 2010) for mRNA-Seq or MAST (Finak et al. 2015) for single-cell sequencing.

+

Ultimately, for many settings, the user may prefer to use other techniques for differential expression analysis or have more control over certain aspects of it. The function clusterContrasts may be called by the user to get the contrasts that are defined within getBestFeatures (e.g. dendrogram contrasts or pairwise contrasts). These contrasts, which are in the format needed for limma, can be piped into programs that allow for contrasts in their linear models, like edgeR (Robinson, Mccarthy, and Smyth 2010) for mRNA-Seq; they can also be returned in the format needed by MAST (Finak et al. 2015) for single-cell sequencing by setting outputType="MAST".
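A minimal sketch of the two output formats:

contrLimma<-clusterContrasts(ce,contrastType="Dendro")
contrMAST<-clusterContrasts(ce,contrastType="Dendro",outputType="MAST")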

Similarly, more complicated normalizations, like RUV (Gagnon-Bartsch and Speed 2011), adjust each gene individually for unwanted batch or other variation within the linear model. In this case, a matrix \(W\) that describes this variation should be included in the linear model. Again, this can be done in other programs, using the contrasts provided by clusterContrasts.
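As a hedged sketch (not the package’s own code), here is one way to include such a matrix W, assumed to be a precomputed sample-by-covariate matrix with valid column names, in a limma fit that tests a cluster contrast:

library(limma)
cl<-primaryCluster(ce)
keep<-cl>0                                # drop unassigned (-1) samples
f<-factor(paste0("m",cl[keep]))           # cluster factor, e.g. levels m1, m2, ...
design<-model.matrix(~0+f)
colnames(design)<-levels(f)
design<-cbind(design,W[keep,])            # W is assumed, not computed here
fit<-lmFit(transform(ce)[,keep],design)
contrMat<-makeContrasts(m1-m2,levels=design)  # W columns get weight zero
fit2<-eBayes(contrasts.fit(fit,contrMat))
topTable(fit2)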

@@ -1372,7 +1436,7 @@

7 Session Information

## [8] datasets  base
##
## other attached packages:
-## [1] clusterExperiment_1.3.0-9009 scRNAseq_1.2.0
+## [1] clusterExperiment_1.3.1-9002 scRNAseq_1.2.0
## [3] SummarizedExperiment_1.6.3   DelayedArray_0.2.7
## [5] matrixStats_0.52.2           Biobase_2.36.2
## [7] GenomicRanges_1.28.3         GenomeInfoDb_1.12.2

@@ -1381,8 +1445,8 @@

7 Session Information

##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131       bitops_1.0-6
-## [3] bold_0.4.0         doParallel_1.0.10
-## [5] RColorBrewer_1.1-2 progress_1.1.2
+## [3] bold_0.4.0         progress_1.1.2
+## [5] doParallel_1.0.10  RColorBrewer_1.1-2
## [7] httr_1.2.1         rprojroot_1.2
## [9] prabclus_2.2-6     tools_3.4.0
## [11] backports_1.1.0    R6_2.2.1

@@ -1415,17 +1479,18 @@

7 Session Information

## [65] uuid_0.1-2         taxize_0.8.4
## [67] fpc_2.1-10         rngtools_1.2.4
## [69] reshape2_1.4.2     codetools_0.2-15
-## [71] XML_3.98-1.7       evaluate_0.10
-## [73] RNeXML_2.0.7       data.table_1.10.4
-## [75] foreach_1.4.3      locfdr_1.1-8
-## [77] gtable_0.2.0       tidyr_0.6.3
-## [79] reshape_0.8.6      kernlab_0.9-25
-## [81] assertthat_0.2.0   ggplot2_2.2.1
-## [83] gridBase_0.4-7     phylobase_0.8.4
-## [85] xtable_1.8-2       class_7.3-14
-## [87] viridisLite_0.2.0  tibble_1.3.3
-## [89] iterators_1.0.8    registry_0.3
-## [91] cluster_2.0.6
+## [71] XML_3.98-1.7       glue_1.1.0
+## [73] evaluate_0.10      RNeXML_2.0.7
+## [75] data.table_1.10.4  foreach_1.4.3
+## [77] locfdr_1.1-8       gtable_0.2.0
+## [79] tidyr_0.6.3        reshape_0.8.6
+## [81] kernlab_0.9-25     assertthat_0.2.0
+## [83] ggplot2_2.2.1      gridBase_0.4-7
+## [85] phylobase_0.8.4    xtable_1.8-2
+## [87] RSpectra_0.12-0    class_7.3-14
+## [89] viridisLite_0.2.0  tibble_1.3.3
+## [91] iterators_1.0.8    registry_0.3
+## [93] cluster_2.0.6

References

@@ -1436,9 +1501,18 @@

References

Gagnon-Bartsch, Johann A., and Terence P Speed. 2011. “Using control genes to correct for unwanted variation in microarray data.” Biostatistics (Oxford, England), November.

+
+

Law, Charity W, Yunshun Chen, Wei Shi, and Gordon K Smyth. 2014. “voom: Precision weights unlock linear model analysis tools for RNA-seq read counts.” Genome Biology 15 (2): 1–17.

+
+
+

Ritchie, Matthew E, Belinda Phipson, Di Wu, Yifang Hu, Charity W Law, Wei Shi, and Gordon K Smyth. 2015. “limma powers differential expression analyses for RNA-sequencing and microarray studies.” Nucleic Acids Research 43 (7): e47–e47.

+

Robinson, Mark D, Davis J Mccarthy, and Gordon K Smyth. 2010. “edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.” Bioinformatics (Oxford, England) 26 (1): 139–40.

+
+

Smyth, Gordon K. 2004. “Linear models and empirical Bayes methods for assessing differential expression in microarray experiments.” Statistical Applications in Genetics and Molecular Biology 3 (1): Article3–25.

+

Tseng, George C., and Wing H. Wong. 2005. “Tight Clustering: A Resampling-Based Approach for Identifying Stable and Tight Patterns in Data.” Biometrics 61 (1): 10–16.

diff --git a/vignettes/oldVignette/clusterCellsDocumentation.Rmd b/vignettes/oldVignette/clusterCellsDocumentation.Rmd
index 4e243430..c9496723 100644
--- a/vignettes/oldVignette/clusterCellsDocumentation.Rmd
+++ b/vignettes/oldVignette/clusterCellsDocumentation.Rmd
@@ -67,7 +67,7 @@ There are three main choices that the user needs to make for `clusterSingle`:

The `clusterSingle` calls underlying functions for each of these tasks, each of which can take many arguments. For simplicity in syntax, the user passes arguments to each of these underlying functions as lists of arguments.

-* Clustering is done via `clusterD` and arguments to `clusterD` are passed as a list via the argument `clusterDArgs`. See Section XXX below on `clusterD` function for more details about possible arguments.
+* Clustering is done via `clusterD` and arguments to `clusterD` are passed as a list via the argument `mainClusterArgs`. See Section XXX below on `clusterD` function for more details about possible arguments.
* `subsample=TRUE` : if subsampling, `subsampleClustering` is called and arguments to `subsampleClustering` are passed as a list via the argument `subsampleArgs`. See Section XXX below on `subsampleClustering`.
* `sequential=TRUE` : if sequential method, `seqCluster` is called, and arguments to `seqCluster` are passed as a list via the argument `seqArgs`. See Section XXX below on `seqCluster`.

@@ -86,11 +86,11 @@ There are two parts to the basic clustering sequence:

* `k` the number of clusters to fit in the subsampled data. This must be passed by the user via `subsampleArgs` (unless `sequential=TRUE`, see below, where it is set internally)
* `clusterFunction` the function to use to cluster the subsampled data. The default is `pam` from the `cluster` package but can also be user-defined. The argument `clusterFunction` of `clusterSingle` is *not* passed to this argument -- `clusterSingle` by default uses the `pam` function -- so the user must pass this *via `subsampleArgs`* if they want a different function.
-* Clustering an input $D$ matrix `clusterD`, which can be either the co-occurance matrix from `subsampleClustering` or otherwise is calculated as `dist(x)`. The arguments passed to `clusterD` are via the argument `clusterDArgs` in `clusterSingle`. The most important arguments are:
+* Clustering an input $D$ matrix `clusterD`, which can be either the co-occurance matrix from `subsampleClustering` or otherwise is calculated as `dist(x)`. The arguments passed to `clusterD` are via the argument `mainClusterArgs` in `clusterSingle`. The most important arguments are:
    * `D` the $n\times n$ matrix of dissimilarities
    * `clusterFunction` the type of clustering to do on the dissimilarity matrix.
    * Note that both of the above options are direct arguments of `clusterSingle` (i.e. do not need to be passed with a list of arguments for). Therefore `clusterFunction` argument of `clusterSingle` is passed to `clusterD` (and not to `subsampleClustering`)
-    * Depending on the choice of `clusterFunction` some other arguments may be necessary to pass via a list of arguments to `clusterDArgs` (e.g. if "pam", must also pass an argument `k` or `findBestK=TRUE`)
+    * Depending on the choice of `clusterFunction` some other arguments may be necessary to pass via a list of arguments to `mainClusterArgs` (e.g. if "pam", must also pass an argument `k` or `findBestK=TRUE`)

We will load some simulated data that comes with the package with four underlying clusters.
First we will do just basic clustering of the data for 4 clusters using `pam` on the standard distance matrix.
@@ -98,7 +98,7 @@ We will load some simulated data that comes with the package with four underlyin
library(clusterExperiment)
data(simData)
simpleCluster<-clusterSingle(simData, subsample=FALSE,
-                          sequential=FALSE, clusterFunction="pam",clusterDArgs=list('k'=4))
+                          sequential=FALSE, clusterFunction="pam",mainClusterArgs=list('k'=4))
#compare to direct call of pam
library(cluster)
pamCluster<-pam(dist(simData),4)
@@ -110,7 +110,7 @@ Of course, this is not the best use case, since the pam object from `pam` has mu
```{r basicSubsampling}
#library(clusterPackage)
subsampleCluster<-clusterSingle(simData, subsample=TRUE,
-                             sequential=FALSE, clusterFunction="pam",clusterDArgs=list('k'=4))
+                             sequential=FALSE, clusterFunction="pam",mainClusterArgs=list('k'=4))
table(pamCluster$clustering,subsampleCluster$clustering)
```
@@ -118,7 +118,7 @@ Notice that we got a warning. This is because we *could* have chosen to cluster
```{r basicSubsampling2}
subsampleCluster2<-clusterSingle(simData, subsample=TRUE, sequential=FALSE,
-                              clusterFunction="pam",clusterDArgs=list('k'=4),
+                              clusterFunction="pam",mainClusterArgs=list('k'=4),
                              subsampleArgs=list("k"=2, clusterFunction="kmeans"))
table(subsampleCluster2$clustering,subsampleCluster$clustering)
```
@@ -320,7 +320,7 @@ Users can also change the parameters and try to construct their own combinations

## Limitations of `clusterMany`

-However, there can be sets of parameter choices that are not realizable using `clusterMany`, even with defining your own `paramMatrix`. `clusterMany` makes some choices and interpretations of the parameters that the user cannot override. A current example is varying the $K$ used for clustering the subsampled data when `subsample=TRUE`. Individual calls to `clusterSingle` can set this $K$ via the `subsampleArgs=list("k"=...)` (assuming that `sequential=FALSE`). However, `clusterMany` has for simplicity a single argument `ks` that means different parameters in different contexts. In cases where `sequential=TRUE`, `ks` defines `k0` argument of `seqCluster`, which therefore also sets the $K$ for clustering subsampled data when `subsample=TRUE`. In cases where `findBestK=TRUE` (for the clusterD algorithm) then `ks` also defines `kRanges`, if `kRanges` is not already defined by the user in `clusterDArgs`. For cases where `findBestK=TRUE` and `sequential=FALSE` and `subsample=TRUE`, then $K$ for clustering of subsampled data MUST be passed via the argument `subsampleArgs`. And if `findBestK=FALSE`, then the `ks` argument defines both the $K$ for clustering of subsampled data and the $K$ used for clustering the resulting co-ocurrance matrix $D$ (overriding any user specification of either of those parameters via `clusterDArgs` or `subsampleArgs`).
+However, there can be sets of parameter choices that are not realizable using `clusterMany`, even with defining your own `paramMatrix`. `clusterMany` makes some choices and interpretations of the parameters that the user cannot override. A current example is varying the $K$ used for clustering the subsampled data when `subsample=TRUE`. Individual calls to `clusterSingle` can set this $K$ via the `subsampleArgs=list("k"=...)` (assuming that `sequential=FALSE`). However, `clusterMany` has for simplicity a single argument `ks` that means different parameters in different contexts.
In cases where `sequential=TRUE`, `ks` defines `k0` argument of `seqCluster`, which therefore also sets the $K$ for clustering subsampled data when `subsample=TRUE`. In cases where `findBestK=TRUE` (for the clusterD algorithm) then `ks` also defines `kRanges`, if `kRanges` is not already defined by the user in `mainClusterArgs`. For cases where `findBestK=TRUE` and `sequential=FALSE` and `subsample=TRUE`, then $K$ for clustering of subsampled data MUST be passed via the argument `subsampleArgs`. And if `findBestK=FALSE`, then the `ks` argument defines both the $K$ for clustering of subsampled data and the $K$ used for clustering the resulting co-ocurrance matrix $D$ (overriding any user specification of either of those parameters via `mainClusterArgs` or `subsampleArgs`).

As the above example makes clear, `clusterMany` is a convenience wrapper that chooses simplicity in the input parameters over fine specification by the users, and in doing so makes subtle choices for the user that are deemed reasonable. It is intended to let the user explore parameters painlessly, but for finer control the user needs to write their own wrapper around `clusterSingle`.
diff --git a/vignettes/oldVignette/clusterCellsDocumentation.html b/vignettes/oldVignette/clusterCellsDocumentation.html
index ab271b98..56630cde 100644
--- a/vignettes/oldVignette/clusterCellsDocumentation.html
+++ b/vignettes/oldVignette/clusterCellsDocumentation.html
@@ -86,7 +86,7 @@

2016-03-12

The clusterSingle calls underlying functions for each of these tasks, each of which can take many arguments. For simplicity in syntax, the user passes arguments to each of these underlying functions as lists of arguments.

    -
  • Clustering is done via clusterD and arguments to clusterD are passed as a list via the argument clusterDArgs. See Section XXX below on clusterD function for more details about possible arguments.
  • +
  • Clustering is done via clusterD and arguments to clusterD are passed as a list via the argument mainClusterArgs. See Section XXX below on clusterD function for more details about possible arguments.
  • subsample=TRUE : if subsampling, subsampleClustering is called and arguments to subsampleClustering are passed as a list via the argument subsampleArgs. See Section XXX below on subsampleClustering.
  • sequential=TRUE : if sequential method, seqCluster is called, and arguments to seqCluster are passed as a list via the argument seqArgs. See Section XXX below on seqCluster.
@@ -100,11 +100,11 @@

Basic Clustering Sequence

  • x the \(n\times p\) data matrix (which is handled by the argument x to clusterSingle so the user does not need to set it)
  • k the number of clusters to fit in the subsampled data. This must be passed by the user via subsampleArgs (unless sequential=TRUE, see below, where it is set internally)
  • clusterFunction the function to use to cluster the subsampled data. The default is pam from the cluster package but can also be user-defined. The argument clusterFunction of clusterSingle is not passed to this argument – clusterSingle by default uses the pam function – so the user must pass this via subsampleArgs if they want a different function.

  • -
  • Clustering an input \(D\) matrix clusterD, which can be either the co-occurance matrix from subsampleClustering or otherwise is calculated as dist(x). The arguments passed to clusterD are via the argument clusterDArgs in clusterSingle. The most important arguments are:
  • +
  • Clustering an input \(D\) matrix clusterD, which can be either the co-occurance matrix from subsampleClustering or otherwise is calculated as dist(x). The arguments passed to clusterD are via the argument mainClusterArgs in clusterSingle. The most important arguments are:
  • D the \(n\times n\) matrix of dissimilarities
  • clusterFunction the type of clustering to do on the dissimilarity matrix.
  • Note that both of the above options are direct arguments of clusterSingle (i.e. do not need to be passed with a list of arguments for). Therefore clusterFunction argument of clusterSingle is passed to clusterD (and not to subsampleClustering)
  • -
  • Depending on the choice of clusterFunction some other arguments may be necessary to pass via a list of arguments to clusterDArgs (e.g. if “pam”, must also pass an argument k or findBestK=TRUE)

  • +
  • Depending on the choice of clusterFunction some other arguments may be necessary to pass via a list of arguments to mainClusterArgs (e.g. if “pam”, must also pass an argument k or findBestK=TRUE)

  • We will load some simulated data that comes with the package with four underlying clusters. First we will do just basic clustering of the data for 4 clusters using pam on the standard distance matrix.

    library(clusterExperiment)
    @@ -140,7 +140,7 @@

    Basic Clustering Sequence

    ## 'citation("Biobase")', and for packages 'citation("pkgname")'.
    data(simData)
     simpleCluster<-clusterSingle(simData, subsample=FALSE, 
    -                          sequential=FALSE, clusterFunction="pam",clusterDArgs=list('k'=4))
    +                          sequential=FALSE, clusterFunction="pam",mainClusterArgs=list('k'=4))
     #compare to direct call of pam
     library(cluster)
     pamCluster<-pam(dist(simData),4)
    @@ -154,10 +154,10 @@ 

    Basic Clustering Sequence

    Of course, this is not the best use case, since the pam object from pam has much more information. Let’s instead choose to do subsampling, and then cluster the co-occurance matrix.

    #library(clusterPackage)
     subsampleCluster<-clusterSingle(simData, subsample=TRUE, 
    -                             sequential=FALSE, clusterFunction="pam",clusterDArgs=list('k'=4))
    + sequential=FALSE, clusterFunction="pam",mainClusterArgs=list('k'=4))
    ## Warning in clusterSingle(simData, subsample = TRUE, sequential = FALSE,
     ## clusterFunction = "pam", : did not give 'k' in 'subsampleArgs'. Set to 'k'
    -## argument in 'clusterDArgs'
    +## argument in 'mainClusterArgs'
    table(pamCluster$clustering,subsampleCluster$clustering)
    ##    
     ##      1  2  3  4
    @@ -167,7 +167,7 @@ 

    Basic Clustering Sequence

    ## 4 97 1 1 1

Notice that we got a warning. This is because we could have chosen to cluster with a different number of clusters on the subsampled data than we used to cluster the co-occurance data. Similarly, the ‘clusterFunction’ argument is passed to clusterD – i.e. the clustering that is done on the co-occurance matrix. We could use something different for clustering of the subsampled data. For example, the following clusters the subsampled data with k=2 using kmeans, but then clusters the co-occurance matrix with k=4 using pam.

    subsampleCluster2<-clusterSingle(simData, subsample=TRUE, sequential=FALSE,
    -                              clusterFunction="pam",clusterDArgs=list('k'=4),
    +                              clusterFunction="pam",mainClusterArgs=list('k'=4),
                                   subsampleArgs=list("k"=2, clusterFunction="kmeans"))
     table(subsampleCluster2$clustering,subsampleCluster$clustering)
    ##    
    @@ -259,7 +259,7 @@ 

    User-defined choices of parameters

    Limitations of clusterMany

    -

    However, there can be sets of parameter choices that are not realizable using clusterMany, even with defining your own paramMatrix. clusterMany makes some choices and interpretations of the parameters that the user cannot override. A current example is varying the \(K\) used for clustering the subsampled data when subsample=TRUE. Individual calls to clusterSingle can set this \(K\) via the subsampleArgs=list("k"=...) (assuming that sequential=FALSE). However, clusterMany has for simplicity a single argument ks that means different parameters in different contexts. In cases where sequential=TRUE, ks defines k0 argument of seqCluster, which therefore also sets the \(K\) for clustering subsampled data when subsample=TRUE. In cases where findBestK=TRUE (for the clusterD algorithm) then ks also defines kRanges, if kRanges is not already defined by the user in clusterDArgs. For cases where findBestK=TRUE and sequential=FALSE and subsample=TRUE, then \(K\) for clustering of subsampled data MUST be passed via the argument subsampleArgs. And if findBestK=FALSE, then the ks argument defines both the \(K\) for clustering of subsampled data and the \(K\) used for clustering the resulting co-ocurrance matrix \(D\) (overriding any user specification of either of those parameters via clusterDArgs or subsampleArgs).

    +

    However, there can be sets of parameter choices that are not realizable using clusterMany, even with defining your own paramMatrix. clusterMany makes some choices and interpretations of the parameters that the user cannot override. A current example is varying the \(K\) used for clustering the subsampled data when subsample=TRUE. Individual calls to clusterSingle can set this \(K\) via the subsampleArgs=list("k"=...) (assuming that sequential=FALSE). However, clusterMany has for simplicity a single argument ks that means different parameters in different contexts. In cases where sequential=TRUE, ks defines k0 argument of seqCluster, which therefore also sets the \(K\) for clustering subsampled data when subsample=TRUE. In cases where findBestK=TRUE (for the clusterD algorithm) then ks also defines kRanges, if kRanges is not already defined by the user in mainClusterArgs. For cases where findBestK=TRUE and sequential=FALSE and subsample=TRUE, then \(K\) for clustering of subsampled data MUST be passed via the argument subsampleArgs. And if findBestK=FALSE, then the ks argument defines both the \(K\) for clustering of subsampled data and the \(K\) used for clustering the resulting co-ocurrance matrix \(D\) (overriding any user specification of either of those parameters via mainClusterArgs or subsampleArgs).

    As the above example makes clear, clusterMany is a convenience wrapper that chooses simplicity in the input parameters over fine specification by the users, and in doing so makes subtle choices for the user that are deemed reasonable. It is intended to let the user explore parameters painlessly, but for finer control the user needs to write their own wrapper around clusterSingle.

    diff --git a/vignettes/oldVignette/clusterCellsTutorial_old.Rmd b/vignettes/oldVignette/clusterCellsTutorial_old.Rmd index e7b19a55..98172b89 100644 --- a/vignettes/oldVignette/clusterCellsTutorial_old.Rmd +++ b/vignettes/oldVignette/clusterCellsTutorial_old.Rmd @@ -32,8 +32,8 @@ drawClusterAll <- function(highlight=NULL, Layer2Options=NULL,plotOptions=TRUE,L Layer1Names<-c("clusterFunction","subsample", "sequential") Layer1Options<-list("clusterFunction"=c("tight", "hierarchical01", "pam", "kmeans"), "subsample"=c("subsample.FALSE","subsample.TRUE"), "sequential"=c("sequential.FALSE","sequential.TRUE")) - Layer2Names<-c("clusterDArgs","subsampleArgs","seqArgs") - Layer2NamesSimple<-c("clusterDArgs","subsampleArgs","seqArgs") + Layer2Names<-c("mainClusterArgs","subsampleArgs","seqArgs") + Layer2NamesSimple<-c("mainClusterArgs","subsampleArgs","seqArgs") #x,y,box.size,box.prop,box.type makeDf<-function(x,y,box.size,box.prop,box.type){data.frame(x=x,y=y,box.size=box.size,box.prop=box.prop,box.type=box.type,stringsAsFactors =FALSE)} posMatrix<-makeDf(x=0.5,y=0.95,box.size=0.06,box.prop=0.4,box.type="diamond") @@ -49,7 +49,7 @@ drawClusterAll <- function(highlight=NULL, Layer2Options=NULL,plotOptions=TRUE,L "subsample"=list("subsample.FALSE"=makeDf(0.45, 0.59,0.03,0.4,values),"subsample.TRUE"=makeDf(0.55, 0.59,0.03,0.4,values)), "sequential"=list("sequential.FALSE"=makeDf(0.77, 0.59,0.03,0.4,values),"sequential.TRUE"=makeDf(0.88, 0.59,0.03,0.4,values)) ) - Layer2Pos<-list("clusterDArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), + Layer2Pos<-list("mainClusterArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), "subsampleArgs"=makeDf(0.5,.44,boxRadx(Layer2Names[2]),0.4,options), "seqArgs"=makeDf(0.825, 0.44,boxRadx(Layer2Names[3]),0.4,options)) @@ -207,7 +207,7 @@ The arguments of `clusterSingle` indicate which of these tasks should be perform Here are the argument calls for `clusterSingle`: ```{r clusterSingleArgs, eval=FALSE} -clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical01", "pam", "kmeans"), clusterDArgs = NULL, subsampleArgs = NULL, seqArgs = NULL) +clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical01", "pam", "kmeans"), mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL) ``` The primary arguments `clusterFunction`,`subsample`, and `sequential` govern each of these tasks: @@ -218,7 +218,7 @@ We can visualize arguments and the values they can take on as a diagram: drawClusterAll() ``` -The remaining arguments (`clusterDArgs`, `subsampleArgs`, and `seqArgs`) are for passing additional options to each of these tasks, and we will show examples of how they are used. +The remaining arguments (`mainClusterArgs`, `subsampleArgs`, and `seqArgs`) are for passing additional options to each of these tasks, and we will show examples of how they are used. ## Example: PAM Clustering @@ -237,12 +237,12 @@ simpleCluster = clusterSingle(expressions, subsample=FALSE, sequential=FALSE, cl This is because even to perform simple PAM, the PAM algorithm needs additional parameters to be set, namely 'k', the number of groups. -However, there is no argument in `clusterSingle` for this. Because it is specific to a particular choice of clustering function, we don't want to clutter up `clusterSingle` with all the possible options that might be needed downstream for a particularly task. 
Instead, `clusterSingle` lets you pass such arguments to the function that actually does the task of clustering a distance matrix, `clusterD` via the `clusterDArgs` option. So the user doesn't need to call `clusterD` explicitly, but often will need to pass an argument to `clusterD`. +However, there is no argument in `clusterSingle` for this. Because it is specific to a particular choice of clustering function, we don't want to clutter up `clusterSingle` with all the possible options that might be needed downstream for a particularly task. Instead, `clusterSingle` lets you pass such arguments to the function that actually does the task of clustering a distance matrix, `clusterD` via the `mainClusterArgs` option. So the user doesn't need to call `clusterD` explicitly, but often will need to pass an argument to `clusterD`. In this case we need to pass the argument 'k'. Specifically, we need to create a list with an element of name "k" that takes on the value, say 5, for our k. ```{r visualizePamArgs, echo=FALSE} -drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("clusterDArgs"="k"),plotOptions=TRUE) +drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"="k"),plotOptions=TRUE) ``` ```{r clusterSinglePam, warning=FALSE, message=FALSE} @@ -250,7 +250,7 @@ library(clusterExperiment) library(cluster) #load("data/expressions.Rda") expression = expressions[, 1:3] -simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", clusterDArgs=list('k'=5)) +simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list('k'=5)) table(simpleCluster$clustering) ``` @@ -277,11 +277,11 @@ There are other options we can pick to change the clustering of the distance mat We can pass these arguments to `clusterD`, again as a list: ```{r visualizeArgsClusterD, echo=FALSE} -drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("clusterDArgs"=c("k","findBestK","removeSil","kRange")),plotOptions=TRUE) +drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"=c("k","findBestK","removeSil","kRange")),plotOptions=TRUE) ``` ```{r clusterD} -Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", clusterDArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10)) +Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10)) table(true=expressions$cluster, PAM=Cluster$clustering) ``` @@ -352,7 +352,7 @@ In this demonstration, clustering will first be performed with basic PAM with k= ```{r getBestFeatures} data(simData) -cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=4,removeSil=TRUE)) +cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE)) pairsAll<-getBestFeatures(cl$clustering,simData,type="Pairs") head(pairsAll) @@ -364,7 +364,7 @@ The column `Contrast` in the output signifies the pairwise difference (or contra ```{r getBestFeaturesAllSig} data(simData) -cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=4,removeSil=TRUE)) +cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, 
mainClusterArgs=list(k=4,removeSil=TRUE)) pairsAll2<-getBestFeatures(cl$clustering,simData,type="Pairs",p.value=0.05,number=ncol(simData)) table(pairsAll2$Contrast) ``` diff --git a/vignettes/oldVignette/clusterCellsTutorial_old.html b/vignettes/oldVignette/clusterCellsTutorial_old.html index b502113f..029788d9 100644 --- a/vignettes/oldVignette/clusterCellsTutorial_old.html +++ b/vignettes/oldVignette/clusterCellsTutorial_old.html @@ -88,8 +88,8 @@

    2016-03-28

    Layer1Names<-c("clusterFunction","subsample", "sequential") Layer1Options<-list("clusterFunction"=c("tight", "hierarchical", "pam", "kmeans"), "subsample"=c("subsample.FALSE","subsample.TRUE"), "sequential"=c("sequential.FALSE","sequential.TRUE")) - Layer2Names<-c("clusterDArgs","subsampleArgs","seqArgs") - Layer2NamesSimple<-c("clusterDArgs","subsampleArgs","seqArgs") + Layer2Names<-c("mainClusterArgs","subsampleArgs","seqArgs") + Layer2NamesSimple<-c("mainClusterArgs","subsampleArgs","seqArgs") #x,y,box.size,box.prop,box.type makeDf<-function(x,y,box.size,box.prop,box.type){data.frame(x=x,y=y,box.size=box.size,box.prop=box.prop,box.type=box.type,stringsAsFactors =FALSE)} posMatrix<-makeDf(x=0.5,y=0.95,box.size=0.06,box.prop=0.4,box.type="diamond") @@ -105,7 +105,7 @@

    2016-03-28

    "subsample"=list("subsample.FALSE"=makeDf(0.45, 0.59,0.03,0.4,values),"subsample.TRUE"=makeDf(0.55, 0.59,0.03,0.4,values)), "sequential"=list("sequential.FALSE"=makeDf(0.77, 0.59,0.03,0.4,values),"sequential.TRUE"=makeDf(0.88, 0.59,0.03,0.4,values)) ) - Layer2Pos<-list("clusterDArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), + Layer2Pos<-list("mainClusterArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), "subsampleArgs"=makeDf(0.5,.44,boxRadx(Layer2Names[2]),0.4,options), "seqArgs"=makeDf(0.825, 0.44,boxRadx(Layer2Names[3]),0.4,options)) @@ -206,11 +206,11 @@

    Clustering with clusterSingle

    The arguments of clusterSingle indicate which of these tasks should be performed and allow the user to pass options for each of these tasks (that are passed to the downstream functions). The options are documented in the more detailed documentation vignette, but here we run through some common examples.

    Here are the argument calls for clusterSingle:

    -
    clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical", "pam", "kmeans"), clusterDArgs = NULL, subsampleArgs = NULL, seqArgs = NULL)
    +
    clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical", "pam", "kmeans"), mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL)

    The primary arguments clusterFunction,subsample, and sequential govern each of these tasks:

    We can visualize arguments and the values they can take on as a diagram:

    -

    The remaining arguments (clusterDArgs, subsampleArgs, and seqArgs) are for passing additional options to each of these tasks, and we will show examples of how they are used.

    +

    The remaining arguments (mainClusterArgs, subsampleArgs, and seqArgs) are for passing additional options to each of these tasks, and we will show examples of how they are used.

    Example: PAM Clustering

    Let us use the function clusterSingle to perform simple Partition Around Medoids (PAM) clustering on our expression data set with k=5. In this expression matrix, rows are samples and columns are features.

    @@ -219,16 +219,16 @@

    Example: PAM Clustering

    However, notice that doing this will return an error:

    simpleCluster = clusterSingle(expressions, subsample=FALSE, sequential=FALSE, clusterFunction="pam")
    ## Warning in dist(x): NAs introduced by coercion
    -
    ## Error in .clusterWrapper(x, clusterFunction = clusterFunction, subsample = subsample, : if not subsampling, must give k in 'clusterDArgs' (or if sequential should have been set by sequential strategy)
    +
    ## Error in .clusterWrapper(x, clusterFunction = clusterFunction, subsample = subsample, : if not subsampling, must give k in 'mainClusterArgs' (or if sequential should have been set by sequential strategy)

    This is because even to perform simple PAM, the PAM algorithm needs additional parameters to be set, namely ‘k’, the number of groups.

    -

    However, there is no argument in clusterSingle for this. Because it is specific to a particular choice of clustering function, we don’t want to clutter up clusterSingle with all the possible options that might be needed downstream for a particularly task. Instead, clusterSingle lets you pass such arguments to the function that actually does the task of clustering a distance matrix, clusterD via the clusterDArgs option. So the user doesn’t need to call clusterD explicitly, but often will need to pass an argument to clusterD.

    +

However, there is no argument in clusterSingle for this. Because it is specific to a particular choice of clustering function, we don’t want to clutter up clusterSingle with all the possible options that might be needed downstream for a particular task. Instead, clusterSingle lets you pass such arguments to the function that actually does the task of clustering a distance matrix, clusterD, via the mainClusterArgs option. So the user doesn’t need to call clusterD explicitly, but often will need to pass an argument to clusterD.

In this case we need to pass the argument ‘k’. Specifically, we need to create a list with an element named “k” that takes on our chosen value of k, say 5.

    library(clusterExperiment)
     library(cluster)
     #load("data/expressions.Rda")
     expression = expressions[, 1:3]
    -simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", clusterDArgs=list('k'=5))
    +simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list('k'=5))
     table(simpleCluster$clustering)
    ## 
     ##   1   2   3   4   5 
    @@ -262,7 +262,7 @@ 

    More options for PAM clustering

    We can pass these arguments to clusterD, again as a list:

    -
    Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", clusterDArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10))
    +
    Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10))
     
     table(true=expressions$cluster, PAM=Cluster$clustering)
    ##     PAM
    @@ -330,7 +330,7 @@ 

    Finding Marker Genes

In this demonstration, clustering will first be performed with basic PAM with k=4, removing negative silhouette widths. We use the simulated data, simData, that comes with the package.

getBestFeatures will be called to determine the top features associated with every pairwise comparison between the clusters. By default, topTable in limma only returns the top 10, which in this context means 10 for each of the pairwise comparisons.

    data(simData)
    -cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=4,removeSil=TRUE))
    +cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE))
     pairsAll<-getBestFeatures(cl$clustering,simData,type="Pairs")
     
     head(pairsAll)
    @@ -351,7 +351,7 @@

    Finding Marker Genes

The column Contrast in the output signifies the pairwise difference (or contrast) from which the information in the row comes. IndexInOriginal matches the gene to its index in the original dataset. The other columns are provided by topTable in limma (see documentation therein).

getBestFeatures accepts arguments to limma’s function topTable to decide which genes should be returned (and in what order). In particular, we can set an adjusted p-value cutoff for each contrast, and set number to control the number of genes returned for each contrast. By setting number to the total number of genes and p.value=0.05, we can return, for each contrast, all genes with adjusted p-values less than 0.05.

    data(simData)
    -cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=4,removeSil=TRUE))
    +cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE))
     pairsAll2<-getBestFeatures(cl$clustering,simData,type="Pairs",p.value=0.05,number=ncol(simData))
     table(pairsAll2$Contrast)
    ## 
    