resolve comments

dmlc · Mar 2, 2020 · 74544e5 · 74544e5
1 parent 5b58992
commit 74544e5
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 55 deletions.
diff --git a/...kages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/...kages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
@@ -139,8 +139,8 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
     val r = new Random(0)
     var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
       toDF("feature", "label")
-    // since 0.82/model has 251 features and xgboost has enabled column check since 1.0.0
-    // we must ensure column number should be same with model's
+    // 0.82/model was trained with 251 features. and transform will throw exception
+    // if feature size of data is not equal to 251
     for (x <- 1 to 250) {
       df = df.withColumn(s"feature_${x}", lit(1))
     }

diff --git a/...packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala b/...packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala
@@ -87,29 +87,36 @@ object MultiClassification extends TrainTestData {
 }
 
 object Regression extends TrainTestData {
-  val train: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.train", 36, zeroBased = true)
-  val test: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.test", 36, zeroBased = true)
+  val MACHINE_COL_NUM = 36
+  val train: Seq[XGBLabeledPoint] = getLabeledPoints(
+    "/machine.txt.train", MACHINE_COL_NUM, zeroBased = true)
+  val test: Seq[XGBLabeledPoint] = getLabeledPoints(
+    "/machine.txt.test", MACHINE_COL_NUM, zeroBased = true)
 }
 
 object Ranking extends TrainTestData {
+  val RANK_COL_NUM = 3
   val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv")
-  val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank.test.txt", 3, zeroBased = false)
+  val test: Seq[XGBLabeledPoint] = getLabeledPoints(
+    "/rank.test.txt", RANK_COL_NUM, zeroBased = false)
 
   private def getGroups(resource: String): Seq[Int] = {
     getResourceLines(resource).map(_.toInt).toList
   }
 }
 
 object Synthetic extends {
+  val TRAIN_COL_NUM = 3
+  val TRAIN_WRONG_COL_NUM = 2
   val train: Seq[XGBLabeledPoint] = Seq(
-    XGBLabeledPoint(1.0f, 3, Array(0, 1), Array(1.0f, 2.0f)),
-    XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
-    XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
-    XGBLabeledPoint(1.0f, 3, Array(0, 1), Array(1.0f, 2.0f))
+    XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)),
+    XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
+    XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)),
+    XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f))
   )
 
   val trainWithDiffFeatureSize: Seq[XGBLabeledPoint] = Seq(
-    XGBLabeledPoint(1.0f, 2, Array(0, 1), Array(1.0f, 2.0f)),
-    XGBLabeledPoint(0.0f, 3, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f))
+    XGBLabeledPoint(1.0f, TRAIN_WRONG_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)),
+    XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f))
   )
 }
diff --git a/...xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/...xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -18,13 +18,9 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
 import org.apache.spark.ml.linalg._
-import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.sql._
 import org.scalatest.FunSuite
 import org.apache.spark.Partitioner
-import org.apache.spark.ml.feature.VectorAssembler
-
-import scala.util.Random
 
 class XGBoostClassifierSuite extends FunSuite with PerTest {
 
@@ -310,41 +306,4 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
     xgb.fit(repartitioned)
   }
 
-  test("feature size of predication data should be same with model's") {
-    val modelPath = getClass.getResource("/model/0.82/model").getPath
-    val model = XGBoostClassificationModel.read.load(modelPath)
-    val r = new Random(0)
-    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
-      toDF("feature", "label")
-    val assembler = new VectorAssembler()
-      .setInputCols(df.columns.filter(!_.contains("label")))
-      .setOutputCol("features")
-    intercept[Exception] {
-      model.transform(assembler.transform(df)).show()
-    }
-  }
-
-  test("feature size must be same for distributed train") {
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
-      "objective" -> "binary:logistic",
-      "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
-    import DataUtils._
-    val sparkSession = SparkSession.builder().getOrCreate()
-    import sparkSession.implicits._
-    val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
-      .map(lp => (lp.label, lp)).partitionBy(
-      new Partitioner {
-        override def numPartitions: Int = 2
-
-        override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
-      }
-    ).map(_._2).zipWithIndex().map {
-      case (lp, id) =>
-        (id, lp.label, lp.features)
-    }.toDF("id", "label", "features")
-    val xgb = new XGBoostClassifier(paramMap)
-    intercept[Exception] {
-      xgb.fit(repartitioned)
-    }
-  }
 }
diff --git a/...es/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/...es/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -18,14 +18,17 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import java.nio.file.Files
 
+import ml.dmlc.xgboost4j.java.XGBoostError
+
 import scala.util.Random
 import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import ml.dmlc.xgboost4j.scala.DMatrix
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.spark.TaskContext
+import org.apache.spark.{Partitioner, TaskContext}
 import org.scalatest.FunSuite
 import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.functions.lit
 
 class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
@@ -350,8 +353,8 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
     val r = new Random(0)
     var df = ss.createDataFrame(Seq.fill(100000)(1).map(i => (i, i))).
       toDF("feature", "label").repartition(5)
-    // since 0.82/model has 251 features and xgboost has enabled column check since 1.0.0
-    // we must ensure column number should be same with model's
+    // 0.82/model was trained with 251 features. and transform will throw exception
+    // if feature size of data is not equal to 251
     for (x <- 1 to 250) {
       df = df.withColumn(s"feature_${x}", lit(1))
     }
@@ -370,4 +373,46 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
     df1.collect()
     df2.collect()
   }
+
+  test("transform throwing exception when feature size of dataset is different with model's") {
+    val modelPath = getClass.getResource("/model/0.82/model").getPath
+    val model = XGBoostClassificationModel.read.load(modelPath)
+    val r = new Random(0)
+    // 0.82/model was trained with 251 features. and transform will throw exception
+    // if feature size of data is not equal to 251
+    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
+      toDF("feature", "label")
+    val assembler = new VectorAssembler()
+      .setInputCols(df.columns.filter(!_.contains("label")))
+      .setOutputCol("features")
+    val thrown = intercept[Exception] {
+      model.transform(assembler.transform(df)).show()
+    }
+    assert(thrown.getMessage.contains(
+      "Number of columns does not match number of features in booster"))
+  }
+
+  test("train throwing exception when feature size of dataset is different on distributed train") {
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "binary:logistic",
+      "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
+    import DataUtils._
+    val sparkSession = SparkSession.builder().getOrCreate()
+    import sparkSession.implicits._
+    val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
+      .map(lp => (lp.label, lp)).partitionBy(
+      new Partitioner {
+        override def numPartitions: Int = 2
+
+        override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
+      }
+    ).map(_._2).zipWithIndex().map {
+      case (lp, id) =>
+        (id, lp.label, lp.features)
+    }.toDF("id", "label", "features")
+    val xgb = new XGBoostClassifier(paramMap)
+    intercept[XGBoostError] {
+      xgb.fit(repartitioned)
+    }
+  }
 }
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -66,6 +66,7 @@ public void testCreateFromDataIteratorWithDiffFeatureSize() throws XGBoostError
     java.util.List<LabeledPoint> blist = new java.util.LinkedList<LabeledPoint>();
     int featureSize = 4;
     for (int i = 0; i < nrep; ++i) {
+      // set some rows with wrong feature size
       if (i % 10 == 1) {
         featureSize = 5;
       }