From 372bb4a94a1b404328c4c50ad780a480bf49efa8 Mon Sep 17 00:00:00 2001 From: anqi Date: Thu, 15 Jan 2015 19:44:03 -0800 Subject: [PATCH] Update h2o.createFrame with optional response column. Add a Runit test of this feature. --- R/h2o-package/R/ParseImport.R | 19 +++++-- R/h2o-package/man/h2o.createFrame.Rd | 13 +++-- .../runit_demo_random_data_glm.R | 2 +- .../runit_demo_random_data_pca.R | 2 +- R/tests/testdir_misc/runit_createFrame.R | 32 +++++++++++ src/main/java/hex/CreateFrame.java | 3 ++ src/main/java/water/fvec/FrameCreator.java | 54 ++++++++++--------- 7 files changed, 87 insertions(+), 38 deletions(-) create mode 100644 R/tests/testdir_misc/runit_createFrame.R diff --git a/R/h2o-package/R/ParseImport.R b/R/h2o-package/R/ParseImport.R index 1505f770ab..9bb1b0c2ec 100644 --- a/R/h2o-package/R/ParseImport.R +++ b/R/h2o-package/R/ParseImport.R @@ -105,11 +105,13 @@ h2o.assign <- function(data, key) { .h2o.exec2(expr = data@key, h2o = data@h2o, dest_key = key) } -h2o.createFrame <- function(object, key, rows, cols, seed, randomize, value, real_range, categorical_fraction, factors, integer_fraction, integer_range, binary_fraction=0, binary_ones_fraction=0.5, missing_fraction, response_factors) { +h2o.createFrame <- function(object, key, rows = 10000, cols = 10, seed, randomize = TRUE, value = 0, real_range = 100, categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100, binary_fraction = 0.1, binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2, has_response = FALSE) { + if(class(object) != "H2OClient") stop("object must be of class H2OClient") + if(!is.character(key)) stop("key must be a character string") if(!is.numeric(rows)) stop("rows must be a numeric value") if(!is.numeric(cols)) stop("cols must be a numeric value") - if(!is.numeric(seed)) stop("seed must be a numeric value") - if(!is.logical(randomize)) stop("randomize must be a boolean value") + if(!missing(seed) && !is.numeric(seed)) stop("seed must be a numeric value") + if(!is.logical(randomize)) stop("randomize must be a logical value") if(!is.numeric(value)) stop("value must be a numeric value") if(!is.numeric(real_range)) stop("real_range must be a numeric value") if(!is.numeric(categorical_fraction)) stop("categorical_fraction must be a numeric value") @@ -120,9 +122,16 @@ h2o.createFrame <- function(object, key, rows, cols, seed, randomize, value, rea if(!is.numeric(response_factors)) stop("response_factors must be a numeric value") if(!is.numeric(binary_fraction)) stop("binary_fraction must be a numeric value") if(!is.numeric(binary_ones_fraction)) stop("binary_ones_fraction must be a numeric value") + if(!is.logical(has_response)) stop("has_response must be a logical value") - res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, seed = seed, randomize = as.numeric(randomize), value = value, real_range = real_range, - categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction, binary_ones_fraction=binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors) + if(missing(seed)) + res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, randomize = as.numeric(randomize), value = value, real_range = real_range, + categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction, + binary_ones_fraction = binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors, has_response = as.numeric(has_response)) + else + res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, seed = seed, randomize = as.numeric(randomize), value = value, real_range = real_range, + categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction, + binary_ones_fraction = binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors, has_response = as.numeric(has_response)) .h2o.exec2(expr = key, h2o = object, dest_key = key) } diff --git a/R/h2o-package/man/h2o.createFrame.Rd b/R/h2o-package/man/h2o.createFrame.Rd index b28ee2e8bf..7ec66f87dc 100644 --- a/R/h2o-package/man/h2o.createFrame.Rd +++ b/R/h2o-package/man/h2o.createFrame.Rd @@ -8,16 +8,14 @@ Create an H2O Frame Create an H2O data frame from scratch, with optional randomization. Supports categoricals, integers, reals and missing values. } \usage{ -h2o.createFrame(object, key, rows, cols, seed, randomize, value, real_range, - categorical_fraction, factors, integer_fraction, integer_range, - binary_fraction, binary_ones_fraction, - missing_fraction, response_factors) +h2o.createFrame(object, key = "", rows = 10000, cols = 10, seed, randomize = TRUE, value = 0, real_range = 100, + categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100, binary_fraction = 0.1, + binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2, has_response = FALSE) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{object}{An \code{\linkS4class{H2OClient}} object containing the IP address and port of the server running H2O.} - \item{key}{ - The unique hex key assigned to the created frame.} + \item{key}{The unique hex key assigned to the created frame.} \item{rows}{Number of rows} \item{cols}{Number of columns} \item{seed}{Random number seed} @@ -32,6 +30,7 @@ h2o.createFrame(object, key, rows, cols, seed, randomize, value, real_range, \item{binary_ones_fraction}{Fraction of 1's in binary columns (for randomize=true)} \item{missing_fraction}{Fraction of missing values} \item{response_factors}{Number of factor levels of the first column (1=real, 2=binomial, N=multinomial)} + \item{has_response}{Whether an additional response column should be generated. The final data frame will have cols+1 columns} } \value{ Returns an H2O data frame. @@ -44,7 +43,7 @@ myframe = h2o.createFrame(localH2O, 'myframekey', rows = 1000, cols = 10, categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100, binary_fraction = 0.1, binary_ones_fraction = 0.01, - missing_fraction = 0.1, response_factors = 2) + missing_fraction = 0.1, response_factors = 2, has_response = FALSE) head(myframe) summary(myframe) h2o.shutdown(localH2O) diff --git a/R/tests/testdir_demos/runit_demo_random_data_glm.R b/R/tests/testdir_demos/runit_demo_random_data_glm.R index 2d8b8c3fb4..1304abeb99 100644 --- a/R/tests/testdir_demos/runit_demo_random_data_glm.R +++ b/R/tests/testdir_demos/runit_demo_random_data_glm.R @@ -37,7 +37,7 @@ for(i in 1:length(rows)){ # changing number of rows categorical_fraction = 0.0, factors = 10, integer_fraction = 0.4, integer_range = 100, missing_fraction = 0, response_factors = 1, - binary_fraction = 0, binary_ones_fraction = 0.5) ) + binary_fraction = 0, binary_ones_fraction = 0.5, has_response = TRUE) ) create_frm_time[i,j] = as.numeric(sst[3]) mem = h2o.ls(conn,"myframe") frm_size[i,j] = as.numeric(mem[2]) diff --git a/R/tests/testdir_demos/runit_demo_random_data_pca.R b/R/tests/testdir_demos/runit_demo_random_data_pca.R index e010e8e873..aee65baca7 100644 --- a/R/tests/testdir_demos/runit_demo_random_data_pca.R +++ b/R/tests/testdir_demos/runit_demo_random_data_pca.R @@ -39,7 +39,7 @@ for(i in 1:length(rows)){ # changing number of rows categorical_fraction = 0.0, factors = 10, integer_fraction = 0.4, integer_range = 100, missing_fraction = 0, response_factors = 1, - binary_fraction = 0, binary_ones_fraction = 0.5) ) + binary_fraction = 0, binary_ones_fraction = 0.5, has_response = TRUE) ) create_frm_time[i,j] = as.numeric(sst[3]) mem = h2o.ls(conn,"myframe") diff --git a/R/tests/testdir_misc/runit_createFrame.R b/R/tests/testdir_misc/runit_createFrame.R new file mode 100644 index 0000000000..4f94225658 --- /dev/null +++ b/R/tests/testdir_misc/runit_createFrame.R @@ -0,0 +1,32 @@ +## +# Testing creation of random data frame in H2O +## + +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source('../findNSourceUtils.R') + +test.createFrame <- function(conn) { + Log.info("Create a data frame with rows = 10000, cols = 100") + hex <- h2o.createFrame(conn, "hex", rows = 10000, cols = 100, categorical_fraction = 0.1, factors = 5, integer_fraction = 0.5, integer_range = 1) + expect_equal(dim(hex), c(10000, 100)) + expect_equal(length(colnames(hex)), 100) + + Log.info("Check that 0.1 * 100 = 10 columns are categorical") + fac_col <- sapply(1:100, function(i) is.factor(hex[,i])) + num_fac <- sum(fac_col) + expect_equal(num_fac/100, 0.1) + + Log.info("Create a data frame with rows = 100, cols = 10") + hex2 <- h2o.createFrame(conn, "hex2", rows = 100, cols = 10, randomize = FALSE, value = 5, categorical_fraction = 0, integer_fraction = 0, missing_fraction = 0, has_response = TRUE) + print(summary(hex2)) + expect_equal(dim(hex2), c(100, 11)) + expect_equal(length(colnames(hex2)), 11) + + Log.info("Check that all data entries are equal to 5") + cons_col <- sapply(1:10, function(i) { min(hex2[,i]) == 5 && max(hex2[,i]) == 5 }) + expect_true(all(cons_col)) + + testEnd() +} + +doTest("Create a random data frame in H2O", test.createFrame) \ No newline at end of file diff --git a/src/main/java/hex/CreateFrame.java b/src/main/java/hex/CreateFrame.java index da3fb5ede3..6883861120 100644 --- a/src/main/java/hex/CreateFrame.java +++ b/src/main/java/hex/CreateFrame.java @@ -65,6 +65,9 @@ public class CreateFrame extends Request2 { public boolean positive_response; // only for response_factors=1 + @API(help = "Whether an additional response column should be generated", filter = Default.class, json=true) + public boolean has_response = false; + @Override public Response serve() { try { if (integer_fraction + binary_fraction + categorical_fraction > 1) throw new IllegalArgumentException("Integer, binary and categorical fractions must add up to <= 1."); diff --git a/src/main/java/water/fvec/FrameCreator.java b/src/main/java/water/fvec/FrameCreator.java index c1df1434ca..9d282aaa2b 100644 --- a/src/main/java/water/fvec/FrameCreator.java +++ b/src/main/java/water/fvec/FrameCreator.java @@ -23,7 +23,7 @@ public FrameCreator(CreateFrame createFrame, Key job) { _job=job; _createFrame = createFrame; - int[] idx = Utils.seq(1, _createFrame.cols+1); + int[] idx = _createFrame.has_response ? Utils.seq(1, _createFrame.cols + 1) : Utils.seq(0, _createFrame.cols); int[] shuffled_idx = new int[idx.length]; Utils.shuffleArray(idx, idx.length, shuffled_idx, _createFrame.seed, 0); @@ -44,13 +44,17 @@ public FrameCreator(CreateFrame createFrame, Key job) { // create domains for categorical variables if (_createFrame.randomize) { - assert(_createFrame.response_factors >= 1); - _domain = new String[_createFrame.cols+1][]; - _domain[0] = _createFrame.response_factors == 1 ? null : new String[_createFrame.response_factors]; - if (_domain[0] != null) { - for (int i=0; i <_domain[0].length; ++i) { - _domain[0][i] = "resp." + i; + if(_createFrame.has_response) { + assert(_createFrame.response_factors >= 1); + _domain = new String[_createFrame.cols+1][]; + _domain[0] = _createFrame.response_factors == 1 ? null : new String[_createFrame.response_factors]; + if (_domain[0] != null) { + for (int i = 0; i < _domain[0].length; ++i) { + _domain[0][i] = "resp." + i; + } } + } else { + _domain = new String[_createFrame.cols][]; } for (int c : _cat_cols) { @@ -76,14 +80,19 @@ public FrameCreator(CreateFrame createFrame, Key job) { final private Key _job; @Override public void compute2() { - Vec[] vecs = Vec.makeNewCons(_createFrame.rows, _createFrame.cols+1, _createFrame.value, _domain); + int totcols = _createFrame.has_response ? (_createFrame.cols+1) : _createFrame.cols; + Vec[] vecs = Vec.makeNewCons(_createFrame.rows, totcols, _createFrame.value, _domain); String[] names = new String[vecs.length]; - names[0] = "response"; - for( int i=1; i1) - cs[0].set0(r, (int)(rng.nextDouble() * _createFrame.response_factors)); //classification - else if (_createFrame.positive_response) - cs[0].set0(r, _createFrame.real_range * rng.nextDouble()); //regression with positive response - else - cs[0].set0(r, _createFrame.real_range * (1 - 2 * rng.nextDouble())); //regression + if(_createFrame.has_response) { + for (int r = 0; r < cs[0]._len; r++) { + setSeed(rng, 0, cs[0]._start + r); + if (_createFrame.response_factors > 1) + cs[0].set0(r, (int) (rng.nextDouble() * _createFrame.response_factors)); //classification + else if (_createFrame.positive_response) + cs[0].set0(r, _createFrame.real_range * rng.nextDouble()); //regression with positive response + else + cs[0].set0(r, _createFrame.real_range * (1 - 2 * rng.nextDouble())); //regression + } } for (int c : _cat_cols) { @@ -162,8 +173,6 @@ else if (_createFrame.positive_response) } } - - public static class MissingInserter extends MRTask2 { final long _seed; final double _frac; @@ -192,7 +201,4 @@ public void map (Chunk[]cs){ } } } - - - }