Skip to content

Commit

Permalink
resolve comments
Browse files Browse the repository at this point in the history
  • Loading branch information
wbo4958 committed Feb 17, 2020
1 parent 9a2df0d commit 1966b0f
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
val r = new Random(0)
var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
toDF("feature", "label")
// since 0.82/model has 251 features and xgboost has enabled column check since 1.0.0
// we must ensure column number should be same with model's
// 0.82/model was trained with 251 features. and transform will throw exception
// if feature size of data is not equal to 251
for (x <- 1 to 250) {
df = df.withColumn(s"feature_${x}", lit(1))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,29 +87,36 @@ object MultiClassification extends TrainTestData {
}

object Regression extends TrainTestData {
val train: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.train", 36, zeroBased = true)
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.test", 36, zeroBased = true)
val MACHINE_COL_NUM = 36
val train: Seq[XGBLabeledPoint] = getLabeledPoints(
"/machine.txt.train", MACHINE_COL_NUM, zeroBased = true)
val test: Seq[XGBLabeledPoint] = getLabeledPoints(
"/machine.txt.test", MACHINE_COL_NUM, zeroBased = true)
}

object Ranking extends TrainTestData {
val RANK_COL_NUM = 3
val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv")
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank.test.txt", 3, zeroBased = false)
val test: Seq[XGBLabeledPoint] = getLabeledPoints(
"/rank.test.txt", RANK_COL_NUM, zeroBased = false)

private def getGroups(resource: String): Seq[Int] = {
getResourceLines(resource).map(_.toInt).toList
}
}

object Synthetic extends {
val TRAIN_COL_NUM = 3
val TRAIN_WRONG_COL_NUM = 2
val train: Seq[XGBLabeledPoint] = Seq(
XGBLabeledPoint(1.0f, 3, Array(0, 1), Array(1.0f, 2.0f)),
XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
XGBLabeledPoint(1.0f, 3, Array(0, 1), Array(1.0f, 2.0f))
XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)),
XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f))
)

val trainWithDiffFeatureSize: Seq[XGBLabeledPoint] = Seq(
XGBLabeledPoint(1.0f, 2, Array(0, 1), Array(1.0f, 2.0f)),
XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f))
XGBLabeledPoint(1.0f, TRAIN_WRONG_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)),
XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f))
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,9 @@ package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql._
import org.scalatest.FunSuite
import org.apache.spark.Partitioner
import org.apache.spark.ml.feature.VectorAssembler

import scala.util.Random

class XGBoostClassifierSuite extends FunSuite with PerTest {

Expand Down Expand Up @@ -310,41 +306,4 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
xgb.fit(repartitioned)
}

test("feature size of predication data should be same with model's") {
val modelPath = getClass.getResource("/model/0.82/model").getPath
val model = XGBoostClassificationModel.read.load(modelPath)
val r = new Random(0)
val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
toDF("feature", "label")
val assembler = new VectorAssembler()
.setInputCols(df.columns.filter(!_.contains("label")))
.setOutputCol("features")
intercept[Exception] {
model.transform(assembler.transform(df)).show()
}
}

test("feature size must be same for distributed train") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic",
"num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
import DataUtils._
val sparkSession = SparkSession.builder().getOrCreate()
import sparkSession.implicits._
val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
.map(lp => (lp.label, lp)).partitionBy(
new Partitioner {
override def numPartitions: Int = 2

override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
}
).map(_._2).zipWithIndex().map {
case (lp, id) =>
(id, lp.label, lp.features)
}.toDF("id", "label", "features")
val xgb = new XGBoostClassifier(paramMap)
intercept[Exception] {
xgb.fit(repartitioned)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ package ml.dmlc.xgboost4j.scala.spark

import java.nio.file.Files

import ml.dmlc.xgboost4j.java.XGBoostError

import scala.util.Random
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.TaskContext
import org.apache.spark.{Partitioner, TaskContext}
import org.scalatest.FunSuite
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
Expand Down Expand Up @@ -350,8 +353,8 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
val r = new Random(0)
var df = ss.createDataFrame(Seq.fill(100000)(1).map(i => (i, i))).
toDF("feature", "label").repartition(5)
// since 0.82/model has 251 features and xgboost has enabled column check since 1.0.0
// we must ensure column number should be same with model's
// 0.82/model was trained with 251 features. and transform will throw exception
// if feature size of data is not equal to 251
for (x <- 1 to 250) {
df = df.withColumn(s"feature_${x}", lit(1))
}
Expand All @@ -370,4 +373,46 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
df1.collect()
df2.collect()
}

test("transform throwing exception when feature size of dataset is different with model's") {
val modelPath = getClass.getResource("/model/0.82/model").getPath
val model = XGBoostClassificationModel.read.load(modelPath)
val r = new Random(0)
// 0.82/model was trained with 251 features. and transform will throw exception
// if feature size of data is not equal to 251
val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
toDF("feature", "label")
val assembler = new VectorAssembler()
.setInputCols(df.columns.filter(!_.contains("label")))
.setOutputCol("features")
val thrown = intercept[Exception] {
model.transform(assembler.transform(df)).show()
}
assert(thrown.getMessage.contains(
"Number of columns does not match number of features in booster"))
}

test("train throwing exception when feature size of dataset is different on distributed train") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic",
"num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
import DataUtils._
val sparkSession = SparkSession.builder().getOrCreate()
import sparkSession.implicits._
val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
.map(lp => (lp.label, lp)).partitionBy(
new Partitioner {
override def numPartitions: Int = 2

override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
}
).map(_._2).zipWithIndex().map {
case (lp, id) =>
(id, lp.label, lp.features)
}.toDF("id", "label", "features")
val xgb = new XGBoostClassifier(paramMap)
intercept[XGBoostError] {
xgb.fit(repartitioned)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public void testCreateFromDataIteratorWithDiffFeatureSize() throws XGBoostError
java.util.List<LabeledPoint> blist = new java.util.LinkedList<LabeledPoint>();
int featureSize = 4;
for (int i = 0; i < nrep; ++i) {
// set some rows with wrong feature size
if (i % 10 == 1) {
featureSize = 5;
}
Expand Down

0 comments on commit 1966b0f

Please sign in to comment.