summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLester Solbakken <lesters@oath.com>2020-02-02 17:39:44 +0100
committerLester Solbakken <lesters@oath.com>2020-02-02 17:39:44 +0100
commitf656ff5c15d95905f48d5829278ec241f1941577 (patch)
tree41d1fd4f8bc22df172acac42bfc39abd136036c0
parent99f3a7193090cfcd6b5fdbbe612f53d892f9d86b (diff)
Add support for importing LightGBM models
-rw-r--r--application/src/main/java/com/yahoo/application/Application.java2
-rw-r--r--application/src/test/app-packages/model-evaluation/models/lightgbm/regression.json275
-rw-r--r--application/src/test/java/com/yahoo/application/container/ContainerModelEvaluationTest.java7
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/ExpressionTransforms.java1
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/LightGBMFeatureConverter.java59
-rw-r--r--config-model/src/test/cfg/application/ml_models/models/lightgbm_regression.json275
-rw-r--r--config-model/src/test/cfg/application/ml_models/searchdefinitions/test.sd6
-rw-r--r--config-model/src/test/cfg/application/ml_serving/models/lightgbm_regression.json275
-rw-r--r--config-model/src/test/integration/lightgbm/models/regression.json275
-rw-r--r--config-model/src/test/java/com/yahoo/searchdefinition/processing/RankProfileSearchFixture.java2
-rw-r--r--config-model/src/test/java/com/yahoo/searchdefinition/processing/RankingExpressionWithLightGBMTestCase.java88
-rw-r--r--config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java4
-rw-r--r--config-model/src/test/java/com/yahoo/vespa/model/ml/MlModelsTest.java5
-rw-r--r--config-model/src/test/java/com/yahoo/vespa/model/ml/ModelEvaluationTest.java10
-rw-r--r--model-evaluation/abi-spec.json1
-rw-r--r--model-evaluation/src/main/java/ai/vespa/models/evaluation/FunctionEvaluator.java16
-rw-r--r--model-evaluation/src/main/java/ai/vespa/models/handler/ModelsEvaluationHandler.java10
-rw-r--r--model-evaluation/src/test/java/ai/vespa/models/evaluation/MlModelsImportingTest.java19
-rw-r--r--model-evaluation/src/test/java/ai/vespa/models/handler/ModelsEvaluationHandlerTest.java35
-rw-r--r--model-evaluation/src/test/resources/config/models/rank-profiles.cfg3
-rw-r--r--model-integration/src/main/config/model-integration.xml3
-rw-r--r--model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImporter.java54
-rw-r--r--model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMNode.java67
-rw-r--r--model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMParser.java146
-rw-r--r--model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/package-info.java5
-rw-r--r--model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImportEvaluationTestCase.java49
-rw-r--r--model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMTestBase.java42
-rw-r--r--model-integration/src/test/models/lightgbm/classification.json275
-rw-r--r--model-integration/src/test/models/lightgbm/regression.json275
-rwxr-xr-xmodel-integration/src/test/models/lightgbm/train_lightgbm_classification.py54
-rwxr-xr-xmodel-integration/src/test/models/lightgbm/train_lightgbm_regression.py53
31 files changed, 2379 insertions, 12 deletions
diff --git a/application/src/main/java/com/yahoo/application/Application.java b/application/src/main/java/com/yahoo/application/Application.java
index 5f9b1f51863..d4b1735e4c1 100644
--- a/application/src/main/java/com/yahoo/application/Application.java
+++ b/application/src/main/java/com/yahoo/application/Application.java
@@ -2,6 +2,7 @@
package com.yahoo.application;
import ai.vespa.rankingexpression.importer.configmodelview.MlModelImporter;
+import ai.vespa.rankingexpression.importer.lightgbm.LightGBMImporter;
import ai.vespa.rankingexpression.importer.onnx.OnnxImporter;
import ai.vespa.rankingexpression.importer.tensorflow.TensorFlowImporter;
import ai.vespa.rankingexpression.importer.vespa.VespaImporter;
@@ -117,6 +118,7 @@ public final class Application implements AutoCloseable {
List<MlModelImporter> modelImporters = List.of(new VespaImporter(),
new TensorFlowImporter(),
new OnnxImporter(),
+ new LightGBMImporter(),
new XGBoostImporter());
DeployState deployState = new DeployState.Builder()
.applicationPackage(FilesApplicationPackage.fromFile(path.toFile(), true))
diff --git a/application/src/test/app-packages/model-evaluation/models/lightgbm/regression.json b/application/src/test/app-packages/model-evaluation/models/lightgbm/regression.json
new file mode 100644
index 00000000000..cf0488ecd8b
--- /dev/null
+++ b/application/src/test/app-packages/model-evaluation/models/lightgbm/regression.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "regression",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 68.5353012084961,
+ "threshold": 0.46643291586559305,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 2.1594397038037663,
+ "leaf_weight": 469,
+ "leaf_count": 469
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 41.27640151977539,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.246035,
+ "internal_weight": 531,
+ "internal_count": 531,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 2.235297305276056,
+ "leaf_weight": 302,
+ "leaf_count": 302
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 2.1792953471546546,
+ "leaf_weight": 229,
+ "leaf_count": 229
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 64.22250366210938,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.03070842919354316,
+ "leaf_weight": 399,
+ "leaf_count": 399
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 36.74250030517578,
+ "threshold": 0.5102250691730842,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.204906,
+ "internal_weight": 601,
+ "internal_count": 601,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.04439151147520909,
+ "leaf_weight": 315,
+ "leaf_count": 315
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.005117411709368601,
+ "leaf_weight": 286,
+ "leaf_count": 286
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 57.1327018737793,
+ "threshold": 0.668665477622446,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 40.859100341796875,
+ "threshold": 0.008118820676863816,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.162926,
+ "internal_weight": 681,
+ "internal_count": 681,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.15361238490967524,
+ "leaf_weight": 21,
+ "leaf_count": 21
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.01192330846157292,
+ "leaf_weight": 660,
+ "leaf_count": 660
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.03499044894987518,
+ "leaf_weight": 319,
+ "leaf_count": 319
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 54.77090072631836,
+ "threshold": 0.5201391072644542,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.02141000620783247,
+ "leaf_weight": 543,
+ "leaf_count": 543
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 27.200700759887695,
+ "threshold": "0||1",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.255704,
+ "internal_weight": 457,
+ "internal_count": 457,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.004121485787596721,
+ "leaf_weight": 191,
+ "leaf_count": 191
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.04534090904886873,
+ "leaf_weight": 266,
+ "leaf_count": 266
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 51.84349822998047,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 39.352699279785156,
+ "threshold": 0.27283279016959255,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.188414,
+ "internal_weight": 593,
+ "internal_count": 593,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.01924803254356527,
+ "leaf_weight": 184,
+ "leaf_count": 184
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.03643772842347651,
+ "leaf_weight": 409,
+ "leaf_count": 409
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.02701711918923075,
+ "leaf_weight": 407,
+ "leaf_count": 407
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/application/src/test/java/com/yahoo/application/container/ContainerModelEvaluationTest.java b/application/src/test/java/com/yahoo/application/container/ContainerModelEvaluationTest.java
index 79510375414..3d7eed1e729 100644
--- a/application/src/test/java/com/yahoo/application/container/ContainerModelEvaluationTest.java
+++ b/application/src/test/java/com/yahoo/application/container/ContainerModelEvaluationTest.java
@@ -45,7 +45,7 @@ public class ContainerModelEvaluationTest {
}
private void assertLoadedModels(JDisc jdisc) {
{
- String expected = "{\"xgboost_xgboost_2_2\":\"http://localhost/model-evaluation/v1/xgboost_xgboost_2_2\",\"onnx_mnist_softmax\":\"http://localhost/model-evaluation/v1/onnx_mnist_softmax\",\"tensorflow_mnist_softmax_saved\":\"http://localhost/model-evaluation/v1/tensorflow_mnist_softmax_saved\",\"tensorflow_mnist_saved\":\"http://localhost/model-evaluation/v1/tensorflow_mnist_saved\",\"vespa_example\":\"http://localhost/model-evaluation/v1/vespa_example\"}";
+ String expected = "{\"xgboost_xgboost_2_2\":\"http://localhost/model-evaluation/v1/xgboost_xgboost_2_2\",\"onnx_mnist_softmax\":\"http://localhost/model-evaluation/v1/onnx_mnist_softmax\",\"tensorflow_mnist_softmax_saved\":\"http://localhost/model-evaluation/v1/tensorflow_mnist_softmax_saved\",\"tensorflow_mnist_saved\":\"http://localhost/model-evaluation/v1/tensorflow_mnist_saved\",\"vespa_example\":\"http://localhost/model-evaluation/v1/vespa_example\",\"lightgbm_regression\":\"http://localhost/model-evaluation/v1/lightgbm_regression\"}";
assertResponse("http://localhost/model-evaluation/v1", expected, jdisc);
}
@@ -55,6 +55,11 @@ public class ContainerModelEvaluationTest {
}
{
+ String expected = "{\"cells\":[{\"address\":{},\"value\":1.9130086820218188}]}";
+ assertResponse("http://localhost/model-evaluation/v1/lightgbm_regression/eval", expected, jdisc);
+ }
+
+ {
// Note: The specific response value here has not been verified
String expected = "{\"cells\":[{\"address\":{\"d0\":\"0\",\"d1\":\"0\"},\"value\":-0.5066885003407351},{\"address\":{\"d0\":\"0\",\"d1\":\"1\"},\"value\":0.3912837743150205},{\"address\":{\"d0\":\"0\",\"d1\":\"2\"},\"value\":-0.12401806321703948},{\"address\":{\"d0\":\"0\",\"d1\":\"3\"},\"value\":-0.7019029168606575},{\"address\":{\"d0\":\"0\",\"d1\":\"4\"},\"value\":0.13120114146441697},{\"address\":{\"d0\":\"0\",\"d1\":\"5\"},\"value\":0.6611923203384626},{\"address\":{\"d0\":\"0\",\"d1\":\"6\"},\"value\":-0.22365810810026446},{\"address\":{\"d0\":\"0\",\"d1\":\"7\"},\"value\":-0.0740018307465809},{\"address\":{\"d0\":\"0\",\"d1\":\"8\"},\"value\":0.056492490256153896},{\"address\":{\"d0\":\"0\",\"d1\":\"9\"},\"value\":-0.18422015072393733}]}";
assertResponse("http://localhost/model-evaluation/v1/tensorflow_mnist_saved/serving_default.y/eval?input=" + inputTensor(), expected, jdisc);
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/ExpressionTransforms.java b/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/ExpressionTransforms.java
index 6fdf448a39b..a6707ec7ac0 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/ExpressionTransforms.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/ExpressionTransforms.java
@@ -27,6 +27,7 @@ public class ExpressionTransforms {
ImmutableList.of(new TensorFlowFeatureConverter(),
new OnnxFeatureConverter(),
new XgboostFeatureConverter(),
+ new LightGBMFeatureConverter(),
new ConstantDereferencer(),
new ConstantTensorTransformer(),
new FunctionInliner(),
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/LightGBMFeatureConverter.java b/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/LightGBMFeatureConverter.java
new file mode 100644
index 00000000000..5bde627dc0a
--- /dev/null
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/expressiontransforms/LightGBMFeatureConverter.java
@@ -0,0 +1,59 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.searchdefinition.expressiontransforms;
+
+import com.yahoo.path.Path;
+import com.yahoo.searchlib.rankingexpression.rule.Arguments;
+import com.yahoo.searchlib.rankingexpression.rule.CompositeNode;
+import com.yahoo.searchlib.rankingexpression.rule.ExpressionNode;
+import com.yahoo.searchlib.rankingexpression.rule.ReferenceNode;
+import com.yahoo.searchlib.rankingexpression.transform.ExpressionTransformer;
+import com.yahoo.vespa.model.ml.ConvertedModel;
+import com.yahoo.vespa.model.ml.FeatureArguments;
+
+import java.io.UncheckedIOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Replaces instances of the lightgbm(model-path) pseudofeature with the
+ * native Vespa ranking expression implementing the same computation.
+ *
+ * @author lesters
+ */
+public class LightGBMFeatureConverter extends ExpressionTransformer<RankProfileTransformContext> {
+
+ /** A cache of imported models indexed by model path. This avoids importing the same model multiple times. */
+ private final Map<Path, ConvertedModel> convertedLightGBMModels = new HashMap<>();
+
+ @Override
+ public ExpressionNode transform(ExpressionNode node, RankProfileTransformContext context) {
+ if (node instanceof ReferenceNode)
+ return transformFeature((ReferenceNode) node, context);
+ else if (node instanceof CompositeNode)
+ return super.transformChildren((CompositeNode) node, context);
+ else
+ return node;
+ }
+
+ private ExpressionNode transformFeature(ReferenceNode feature, RankProfileTransformContext context) {
+ if ( ! feature.getName().equals("lightgbm")) return feature;
+
+ try {
+ FeatureArguments arguments = asFeatureArguments(feature.getArguments());
+ ConvertedModel convertedModel =
+ convertedLightGBMModels.computeIfAbsent(arguments.path(),
+ path -> ConvertedModel.fromSourceOrStore(path, true, context));
+ return convertedModel.expression(arguments, context);
+ } catch (IllegalArgumentException | UncheckedIOException e) {
+ throw new IllegalArgumentException("Could not use LightGBM model from " + feature, e);
+ }
+ }
+
+ private FeatureArguments asFeatureArguments(Arguments arguments) {
+ if (arguments.size() != 1)
+ throw new IllegalArgumentException("A lightgbm node must take a single argument pointing to " +
+ "the LightGBM model file under [application]/models");
+ return new FeatureArguments(arguments);
+ }
+
+}
diff --git a/config-model/src/test/cfg/application/ml_models/models/lightgbm_regression.json b/config-model/src/test/cfg/application/ml_models/models/lightgbm_regression.json
new file mode 100644
index 00000000000..cf0488ecd8b
--- /dev/null
+++ b/config-model/src/test/cfg/application/ml_models/models/lightgbm_regression.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "regression",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 68.5353012084961,
+ "threshold": 0.46643291586559305,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 2.1594397038037663,
+ "leaf_weight": 469,
+ "leaf_count": 469
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 41.27640151977539,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.246035,
+ "internal_weight": 531,
+ "internal_count": 531,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 2.235297305276056,
+ "leaf_weight": 302,
+ "leaf_count": 302
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 2.1792953471546546,
+ "leaf_weight": 229,
+ "leaf_count": 229
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 64.22250366210938,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.03070842919354316,
+ "leaf_weight": 399,
+ "leaf_count": 399
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 36.74250030517578,
+ "threshold": 0.5102250691730842,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.204906,
+ "internal_weight": 601,
+ "internal_count": 601,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.04439151147520909,
+ "leaf_weight": 315,
+ "leaf_count": 315
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.005117411709368601,
+ "leaf_weight": 286,
+ "leaf_count": 286
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 57.1327018737793,
+ "threshold": 0.668665477622446,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 40.859100341796875,
+ "threshold": 0.008118820676863816,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.162926,
+ "internal_weight": 681,
+ "internal_count": 681,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.15361238490967524,
+ "leaf_weight": 21,
+ "leaf_count": 21
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.01192330846157292,
+ "leaf_weight": 660,
+ "leaf_count": 660
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.03499044894987518,
+ "leaf_weight": 319,
+ "leaf_count": 319
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 54.77090072631836,
+ "threshold": 0.5201391072644542,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.02141000620783247,
+ "leaf_weight": 543,
+ "leaf_count": 543
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 27.200700759887695,
+ "threshold": "0||1",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.255704,
+ "internal_weight": 457,
+ "internal_count": 457,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.004121485787596721,
+ "leaf_weight": 191,
+ "leaf_count": 191
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.04534090904886873,
+ "leaf_weight": 266,
+ "leaf_count": 266
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 51.84349822998047,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 39.352699279785156,
+ "threshold": 0.27283279016959255,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.188414,
+ "internal_weight": 593,
+ "internal_count": 593,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.01924803254356527,
+ "leaf_weight": 184,
+ "leaf_count": 184
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.03643772842347651,
+ "leaf_weight": 409,
+ "leaf_count": 409
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.02701711918923075,
+ "leaf_weight": 407,
+ "leaf_count": 407
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/config-model/src/test/cfg/application/ml_models/searchdefinitions/test.sd b/config-model/src/test/cfg/application/ml_models/searchdefinitions/test.sd
index ab5e42f983d..247df8a0241 100644
--- a/config-model/src/test/cfg/application/ml_models/searchdefinitions/test.sd
+++ b/config-model/src/test/cfg/application/ml_models/searchdefinitions/test.sd
@@ -33,8 +33,12 @@ search test {
expression: xgboost("xgboost_2_2")
}
+ function my_lightgbm() {
+ expression: lightgbm("lightgbm_regression")
+ }
+
first-phase {
- expression: mnist_tensorflow + mnist_softmax_tensorflow + mnist_softmax_onnx + my_xgboost
+ expression: mnist_tensorflow + mnist_softmax_tensorflow + mnist_softmax_onnx + my_xgboost + my_lightgbm
}
}
diff --git a/config-model/src/test/cfg/application/ml_serving/models/lightgbm_regression.json b/config-model/src/test/cfg/application/ml_serving/models/lightgbm_regression.json
new file mode 100644
index 00000000000..cf0488ecd8b
--- /dev/null
+++ b/config-model/src/test/cfg/application/ml_serving/models/lightgbm_regression.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "regression",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 68.5353012084961,
+ "threshold": 0.46643291586559305,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 2.1594397038037663,
+ "leaf_weight": 469,
+ "leaf_count": 469
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 41.27640151977539,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.246035,
+ "internal_weight": 531,
+ "internal_count": 531,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 2.235297305276056,
+ "leaf_weight": 302,
+ "leaf_count": 302
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 2.1792953471546546,
+ "leaf_weight": 229,
+ "leaf_count": 229
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 64.22250366210938,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.03070842919354316,
+ "leaf_weight": 399,
+ "leaf_count": 399
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 36.74250030517578,
+ "threshold": 0.5102250691730842,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.204906,
+ "internal_weight": 601,
+ "internal_count": 601,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.04439151147520909,
+ "leaf_weight": 315,
+ "leaf_count": 315
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.005117411709368601,
+ "leaf_weight": 286,
+ "leaf_count": 286
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 57.1327018737793,
+ "threshold": 0.668665477622446,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 40.859100341796875,
+ "threshold": 0.008118820676863816,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.162926,
+ "internal_weight": 681,
+ "internal_count": 681,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.15361238490967524,
+ "leaf_weight": 21,
+ "leaf_count": 21
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.01192330846157292,
+ "leaf_weight": 660,
+ "leaf_count": 660
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.03499044894987518,
+ "leaf_weight": 319,
+ "leaf_count": 319
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 54.77090072631836,
+ "threshold": 0.5201391072644542,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.02141000620783247,
+ "leaf_weight": 543,
+ "leaf_count": 543
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 27.200700759887695,
+ "threshold": "0||1",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.255704,
+ "internal_weight": 457,
+ "internal_count": 457,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.004121485787596721,
+ "leaf_weight": 191,
+ "leaf_count": 191
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.04534090904886873,
+ "leaf_weight": 266,
+ "leaf_count": 266
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 51.84349822998047,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 39.352699279785156,
+ "threshold": 0.27283279016959255,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.188414,
+ "internal_weight": 593,
+ "internal_count": 593,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.01924803254356527,
+ "leaf_weight": 184,
+ "leaf_count": 184
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.03643772842347651,
+ "leaf_weight": 409,
+ "leaf_count": 409
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.02701711918923075,
+ "leaf_weight": 407,
+ "leaf_count": 407
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/config-model/src/test/integration/lightgbm/models/regression.json b/config-model/src/test/integration/lightgbm/models/regression.json
new file mode 100644
index 00000000000..cf0488ecd8b
--- /dev/null
+++ b/config-model/src/test/integration/lightgbm/models/regression.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "regression",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 68.5353012084961,
+ "threshold": 0.46643291586559305,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 2.1594397038037663,
+ "leaf_weight": 469,
+ "leaf_count": 469
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 41.27640151977539,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.246035,
+ "internal_weight": 531,
+ "internal_count": 531,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 2.235297305276056,
+ "leaf_weight": 302,
+ "leaf_count": 302
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 2.1792953471546546,
+ "leaf_weight": 229,
+ "leaf_count": 229
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 64.22250366210938,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.03070842919354316,
+ "leaf_weight": 399,
+ "leaf_count": 399
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 36.74250030517578,
+ "threshold": 0.5102250691730842,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.204906,
+ "internal_weight": 601,
+ "internal_count": 601,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.04439151147520909,
+ "leaf_weight": 315,
+ "leaf_count": 315
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.005117411709368601,
+ "leaf_weight": 286,
+ "leaf_count": 286
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 57.1327018737793,
+ "threshold": 0.668665477622446,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 40.859100341796875,
+ "threshold": 0.008118820676863816,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.162926,
+ "internal_weight": 681,
+ "internal_count": 681,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.15361238490967524,
+ "leaf_weight": 21,
+ "leaf_count": 21
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.01192330846157292,
+ "leaf_weight": 660,
+ "leaf_count": 660
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.03499044894987518,
+ "leaf_weight": 319,
+ "leaf_count": 319
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 54.77090072631836,
+ "threshold": 0.5201391072644542,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.02141000620783247,
+ "leaf_weight": 543,
+ "leaf_count": 543
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 27.200700759887695,
+ "threshold": "0||1",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.255704,
+ "internal_weight": 457,
+ "internal_count": 457,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.004121485787596721,
+ "leaf_weight": 191,
+ "leaf_count": 191
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.04534090904886873,
+ "leaf_weight": 266,
+ "leaf_count": 266
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 51.84349822998047,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 39.352699279785156,
+ "threshold": 0.27283279016959255,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.188414,
+ "internal_weight": 593,
+ "internal_count": 593,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.01924803254356527,
+ "leaf_weight": 184,
+ "leaf_count": 184
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.03643772842347651,
+ "leaf_weight": 409,
+ "leaf_count": 409
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.02701711918923075,
+ "leaf_weight": 407,
+ "leaf_count": 407
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankProfileSearchFixture.java b/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankProfileSearchFixture.java
index 08dd5148b29..0cd6674751e 100644
--- a/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankProfileSearchFixture.java
+++ b/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankProfileSearchFixture.java
@@ -15,6 +15,7 @@ import com.yahoo.searchdefinition.parser.ParseException;
import ai.vespa.rankingexpression.importer.configmodelview.ImportedMlModels;
import ai.vespa.rankingexpression.importer.onnx.OnnxImporter;
import ai.vespa.rankingexpression.importer.tensorflow.TensorFlowImporter;
+import ai.vespa.rankingexpression.importer.lightgbm.LightGBMImporter;
import ai.vespa.rankingexpression.importer.xgboost.XGBoostImporter;
import java.util.HashMap;
@@ -33,6 +34,7 @@ class RankProfileSearchFixture {
private final ImmutableList<MlModelImporter> importers = ImmutableList.of(new TensorFlowImporter(),
new OnnxImporter(),
+ new LightGBMImporter(),
new XGBoostImporter());
private RankProfileRegistry rankProfileRegistry = new RankProfileRegistry();
private final QueryProfileRegistry queryProfileRegistry;
diff --git a/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankingExpressionWithLightGBMTestCase.java b/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankingExpressionWithLightGBMTestCase.java
new file mode 100644
index 00000000000..79d19371f1c
--- /dev/null
+++ b/config-model/src/test/java/com/yahoo/searchdefinition/processing/RankingExpressionWithLightGBMTestCase.java
@@ -0,0 +1,88 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.searchdefinition.processing;
+
+import com.yahoo.config.application.api.ApplicationPackage;
+import com.yahoo.io.IOUtils;
+import com.yahoo.path.Path;
+import com.yahoo.searchdefinition.parser.ParseException;
+import org.junit.After;
+import org.junit.Test;
+
+import java.io.IOException;
+
+/**
+ * @author lesters
+ */
+public class RankingExpressionWithLightGBMTestCase {
+
+ private final Path applicationDir = Path.fromString("src/test/integration/lightgbm/");
+
+ private final static String lightGBMExpression =
+ "if (!(numerical_2 >= 0.46643291586559305), 2.1594397038037663, if (categorical_2 in [\"k\", \"l\", \"m\"], 2.235297305276056, 2.1792953471546546)) + if (categorical_1 in [\"d\", \"e\"], 0.03070842919354316, if (!(numerical_1 >= 0.5102250691730842), -0.04439151147520909, 0.005117411709368601)) + if (!(numerical_2 >= 0.668665477622446), if (!(numerical_2 >= 0.008118820676863816), -0.15361238490967524, -0.01192330846157292), 0.03499044894987518) + if (!(numerical_1 >= 0.5201391072644542), -0.02141000620783247, if (categorical_1 in [\"a\", \"b\"], -0.004121485787596721, 0.04534090904886873)) + if (categorical_2 in [\"k\", \"l\", \"m\"], if (!(numerical_2 >= 0.27283279016959255), -0.01924803254356527, 0.03643772842347651), -0.02701711918923075)";
+
+ @After
+ public void removeGeneratedModelFiles() {
+ IOUtils.recursiveDeleteDir(applicationDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
+ }
+
+ @Test
+ public void testLightGBMReference() {
+ RankProfileSearchFixture search = fixtureWith("lightgbm('regression.json')");
+ search.assertFirstPhaseExpression(lightGBMExpression, "my_profile");
+ }
+
+ @Test
+ public void testNestedLightGBMReference() {
+ RankProfileSearchFixture search = fixtureWith("5 + sum(lightgbm('regression.json'))");
+ search.assertFirstPhaseExpression("5 + reduce(" + lightGBMExpression + ", sum)", "my_profile");
+ }
+
+ @Test
+ public void testImportingFromStoredExpressions() throws IOException {
+ RankProfileSearchFixture search = fixtureWith("lightgbm('regression.json')");
+ search.assertFirstPhaseExpression(lightGBMExpression, "my_profile");
+
+ // At this point the expression is stored - copy application to another location which do not have a models dir
+ Path storedApplicationDirectory = applicationDir.getParentPath().append("copy");
+ try {
+ storedApplicationDirectory.toFile().mkdirs();
+ IOUtils.copyDirectory(applicationDir.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile(),
+ storedApplicationDirectory.append(ApplicationPackage.MODELS_GENERATED_DIR).toFile());
+ RankingExpressionWithTensorFlowTestCase.StoringApplicationPackage storedApplication = new RankingExpressionWithTensorFlowTestCase.StoringApplicationPackage(storedApplicationDirectory);
+ RankProfileSearchFixture searchFromStored = fixtureWith("lightgbm('regression.json')");
+ searchFromStored.assertFirstPhaseExpression(lightGBMExpression, "my_profile");
+ }
+ finally {
+ IOUtils.recursiveDeleteDir(storedApplicationDirectory.toFile());
+ }
+ }
+
+ private RankProfileSearchFixture fixtureWith(String firstPhaseExpression) {
+ return fixtureWith(firstPhaseExpression, null, null,
+ new RankingExpressionWithTensorFlowTestCase.StoringApplicationPackage(applicationDir));
+ }
+
+ private RankProfileSearchFixture fixtureWith(String firstPhaseExpression,
+ String constant,
+ String field,
+ RankingExpressionWithTensorFlowTestCase.StoringApplicationPackage application) {
+ try {
+ RankProfileSearchFixture fixture = new RankProfileSearchFixture(
+ application,
+ application.getQueryProfiles(),
+ " rank-profile my_profile {\n" +
+ " first-phase {\n" +
+ " expression: " + firstPhaseExpression +
+ " }\n" +
+ " }",
+ constant,
+ field);
+ fixture.compileRankProfile("my_profile", applicationDir.append("models"));
+ return fixture;
+ } catch (ParseException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+}
+
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java b/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
index ce36ecc4a1c..7a3b76db7f8 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/ml/ImportedModelTester.java
@@ -1,7 +1,6 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.model.ml;
-import ai.vespa.rankingexpression.importer.vespa.VespaImporter;
import com.google.common.collect.ImmutableList;
import com.yahoo.config.model.ApplicationPackageTester;
import ai.vespa.rankingexpression.importer.configmodelview.MlModelImporter;
@@ -10,8 +9,10 @@ import com.yahoo.io.GrowableByteBuffer;
import com.yahoo.io.IOUtils;
import com.yahoo.path.Path;
import com.yahoo.searchdefinition.RankingConstant;
+import ai.vespa.rankingexpression.importer.lightgbm.LightGBMImporter;
import ai.vespa.rankingexpression.importer.onnx.OnnxImporter;
import ai.vespa.rankingexpression.importer.tensorflow.TensorFlowImporter;
+import ai.vespa.rankingexpression.importer.vespa.VespaImporter;
import ai.vespa.rankingexpression.importer.xgboost.XGBoostImporter;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.serialization.TypedBinaryFormat;
@@ -35,6 +36,7 @@ public class ImportedModelTester {
private final ImmutableList<MlModelImporter> importers = ImmutableList.of(new TensorFlowImporter(),
new OnnxImporter(),
+ new LightGBMImporter(),
new XGBoostImporter(),
new VespaImporter());
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/ml/MlModelsTest.java b/config-model/src/test/java/com/yahoo/vespa/model/ml/MlModelsTest.java
index c5c475360a3..ced7243adf5 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/ml/MlModelsTest.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/ml/MlModelsTest.java
@@ -45,7 +45,7 @@ public class MlModelsTest {
private void verify(VespaModel model) {
assertEquals("Global models are created (although not used directly here",
- 4, model.rankProfileList().getRankProfiles().size());
+ 5, model.rankProfileList().getRankProfiles().size());
RankProfilesConfig.Builder builder = new RankProfilesConfig.Builder();
model.getSearchClusters().get(0).getConfig(builder);
@@ -71,8 +71,9 @@ public class MlModelsTest {
"rankingExpression(mnist_softmax_tensorflow).rankingScript: join(reduce(join(rename(rankingExpression(Placeholder), (d0, d1), (d0, d2)), constant(mnist_softmax_saved_layer_Variable_read), f(a,b)(a * b)), sum, d2), constant(mnist_softmax_saved_layer_Variable_1_read), f(a,b)(a + b))\n" +
"rankingExpression(mnist_softmax_onnx).rankingScript: join(reduce(join(rename(rankingExpression(Placeholder), (d0, d1), (d0, d2)), constant(mnist_softmax_Variable), f(a,b)(a * b)), sum, d2), constant(mnist_softmax_Variable_1), f(a,b)(a + b))\n" +
"rankingExpression(my_xgboost).rankingScript: if (f29 < -0.1234567, if (!(f56 >= -0.242398), 1.71218, -1.70044), if (f109 < 0.8723473, -1.94071, 1.85965)) + if (!(f60 >= -0.482947), if (f29 < -4.2387498, 0.784718, -0.96853), -6.23624)\n" +
+ "rankingExpression(my_lightgbm).rankingScript: if (!(numerical_2 >= 0.46643291586559305), 2.1594397038037663, if (categorical_2 in [\"k\", \"l\", \"m\"], 2.235297305276056, 2.1792953471546546)) + if (categorical_1 in [\"d\", \"e\"], 0.03070842919354316, if (!(numerical_1 >= 0.5102250691730842), -0.04439151147520909, 0.005117411709368601)) + if (!(numerical_2 >= 0.668665477622446), if (!(numerical_2 >= 0.008118820676863816), -0.15361238490967524, -0.01192330846157292), 0.03499044894987518) + if (!(numerical_1 >= 0.5201391072644542), -0.02141000620783247, if (categorical_1 in [\"a\", \"b\"], -0.004121485787596721, 0.04534090904886873)) + if (categorical_2 in [\"k\", \"l\", \"m\"], if (!(numerical_2 >= 0.27283279016959255), -0.01924803254356527, 0.03643772842347651), -0.02701711918923075)\n" +
"vespa.rank.firstphase: rankingExpression(firstphase)\n" +
- "rankingExpression(firstphase).rankingScript: rankingExpression(mnist_tensorflow) + rankingExpression(mnist_softmax_tensorflow) + rankingExpression(mnist_softmax_onnx) + rankingExpression(my_xgboost)\n" +
+ "rankingExpression(firstphase).rankingScript: rankingExpression(mnist_tensorflow) + rankingExpression(mnist_softmax_tensorflow) + rankingExpression(mnist_softmax_onnx) + rankingExpression(my_xgboost) + rankingExpression(my_lightgbm)\n" +
"vespa.type.attribute.argument: tensor<float>(d0[],d1[784])\n";
}
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/ml/ModelEvaluationTest.java b/config-model/src/test/java/com/yahoo/vespa/model/ml/ModelEvaluationTest.java
index 3d4ac7f2eeb..2d3ddc33afb 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/ml/ModelEvaluationTest.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/ml/ModelEvaluationTest.java
@@ -96,9 +96,10 @@ public class ModelEvaluationTest {
cluster.getConfig(cb);
RankingConstantsConfig constantsConfig = new RankingConstantsConfig(cb);
- assertEquals(4, config.rankprofile().size());
+ assertEquals(5, config.rankprofile().size());
Set<String> modelNames = config.rankprofile().stream().map(v -> v.name()).collect(Collectors.toSet());
assertTrue(modelNames.contains("xgboost_2_2"));
+ assertTrue(modelNames.contains("lightgbm_regression"));
assertTrue(modelNames.contains("mnist_saved"));
assertTrue(modelNames.contains("mnist_softmax"));
assertTrue(modelNames.contains("mnist_softmax_saved"));
@@ -112,13 +113,18 @@ public class ModelEvaluationTest {
ModelsEvaluator evaluator = new ModelsEvaluator(new ToleratingMissingConstantFilesRankProfilesConfigImporter(MockFileAcquirer.returnFile(null))
.importFrom(config, constantsConfig));
- assertEquals(4, evaluator.models().size());
+ assertEquals(5, evaluator.models().size());
Model xgboost = evaluator.models().get("xgboost_2_2");
assertNotNull(xgboost);
assertNotNull(xgboost.evaluatorOf());
assertNotNull(xgboost.evaluatorOf("xgboost_2_2"));
+ Model lightgbm = evaluator.models().get("lightgbm_regression");
+ assertNotNull(lightgbm);
+ assertNotNull(lightgbm.evaluatorOf());
+ assertNotNull(lightgbm.evaluatorOf("lightgbm_regression"));
+
Model tensorflow_mnist = evaluator.models().get("mnist_saved");
assertNotNull(tensorflow_mnist);
assertEquals(1, tensorflow_mnist.functions().size());
diff --git a/model-evaluation/abi-spec.json b/model-evaluation/abi-spec.json
index 5a75c8b31ea..d465464de7f 100644
--- a/model-evaluation/abi-spec.json
+++ b/model-evaluation/abi-spec.json
@@ -8,6 +8,7 @@
"methods": [
"public ai.vespa.models.evaluation.FunctionEvaluator bind(java.lang.String, com.yahoo.tensor.Tensor)",
"public ai.vespa.models.evaluation.FunctionEvaluator bind(java.lang.String, double)",
+ "public ai.vespa.models.evaluation.FunctionEvaluator bind(java.lang.String, java.lang.String)",
"public ai.vespa.models.evaluation.FunctionEvaluator setMissingValue(com.yahoo.tensor.Tensor)",
"public ai.vespa.models.evaluation.FunctionEvaluator setMissingValue(double)",
"public com.yahoo.tensor.Tensor evaluate()",
diff --git a/model-evaluation/src/main/java/ai/vespa/models/evaluation/FunctionEvaluator.java b/model-evaluation/src/main/java/ai/vespa/models/evaluation/FunctionEvaluator.java
index 994f6dd9b64..e373a54bcd1 100644
--- a/model-evaluation/src/main/java/ai/vespa/models/evaluation/FunctionEvaluator.java
+++ b/model-evaluation/src/main/java/ai/vespa/models/evaluation/FunctionEvaluator.java
@@ -2,6 +2,7 @@
package ai.vespa.models.evaluation;
import com.yahoo.searchlib.rankingexpression.ExpressionFunction;
+import com.yahoo.searchlib.rankingexpression.evaluation.StringValue;
import com.yahoo.searchlib.rankingexpression.evaluation.TensorValue;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorType;
@@ -61,6 +62,21 @@ public class FunctionEvaluator {
}
/**
+ * Binds the given variable referred in this expression to the given value.
+ * String values are not yet supported in tensors.
+ *
+ * @param name the variable to bind
+ * @param value the value this becomes bound to
+ * @return this for chaining
+ */
+ public FunctionEvaluator bind(String name, String value) {
+ if (evaluated)
+ throw new IllegalStateException("Cannot bind a new value in a used evaluator");
+ context.put(name, new StringValue(value));
+ return this;
+ }
+
+ /**
* Sets the default value to use for variables which are not bound
*
* @param value the default value
diff --git a/model-evaluation/src/main/java/ai/vespa/models/handler/ModelsEvaluationHandler.java b/model-evaluation/src/main/java/ai/vespa/models/handler/ModelsEvaluationHandler.java
index 5c353fcdf35..de23a8c6526 100644
--- a/model-evaluation/src/main/java/ai/vespa/models/handler/ModelsEvaluationHandler.java
+++ b/model-evaluation/src/main/java/ai/vespa/models/handler/ModelsEvaluationHandler.java
@@ -77,8 +77,14 @@ public class ModelsEvaluationHandler extends ThreadedHttpRequestHandler {
property(request, missingValueKey).ifPresent(missingValue -> evaluator.setMissingValue(Tensor.from(missingValue)));
for (Map.Entry<String, TensorType> argument : evaluator.function().argumentTypes().entrySet()) {
- property(request, argument.getKey()).ifPresent(value -> evaluator.bind(argument.getKey(),
- Tensor.from(argument.getValue(), value)));
+ Optional<String> value = property(request, argument.getKey());
+ if (value.isPresent()) {
+ try {
+ evaluator.bind(argument.getKey(), Tensor.from(argument.getValue(), value.get()));
+ } catch (IllegalArgumentException e) {
+ evaluator.bind(argument.getKey(), value.get()); // since we don't yet support tensors with string values
+ }
+ }
}
Tensor result = evaluator.evaluate();
return new Response(200, JsonFormat.encode(result));
diff --git a/model-evaluation/src/test/java/ai/vespa/models/evaluation/MlModelsImportingTest.java b/model-evaluation/src/test/java/ai/vespa/models/evaluation/MlModelsImportingTest.java
index 9320ac3fad8..0d13b7d4660 100644
--- a/model-evaluation/src/test/java/ai/vespa/models/evaluation/MlModelsImportingTest.java
+++ b/model-evaluation/src/test/java/ai/vespa/models/evaluation/MlModelsImportingTest.java
@@ -25,7 +25,7 @@ public class MlModelsImportingTest {
public void testImportingModels() {
ModelTester tester = new ModelTester("src/test/resources/config/models/");
- assertEquals(4, tester.models().size());
+ assertEquals(5, tester.models().size());
// TODO: When we get type information in Models, replace the evaluator.context().names() check below by that
{
@@ -47,7 +47,24 @@ public class MlModelsImportingTest {
}
{
+ Model lightgbm = tester.models().get("lightgbm_regression");
+ // Function
+ assertEquals(1, lightgbm.functions().size());
+ ExpressionFunction function = tester.assertFunction("lightgbm_regression",
+ "(optimized sum of condition trees of size 480 bytes)",
+ lightgbm);
+ assertEquals("tensor()", function.returnType().get().toString());
+ assertEquals("categorical_1, categorical_2, numerical_1, numerical_2", commaSeparated(function.arguments()));
+ function.arguments().forEach(arg -> assertEquals(TensorType.empty, function.argumentTypes().get(arg)));
+
+ // Evaluator
+ FunctionEvaluator evaluator = lightgbm.evaluatorOf();
+ assertEquals("categorical_1, categorical_2, numerical_1, numerical_2", evaluator.context().names().stream().sorted().collect(Collectors.joining(", ")));
+ assertEquals(1.91300868202, evaluator.evaluate().sum().asDouble(), delta);
+ }
+
+ {
Model onnxMnistSoftmax = tester.models().get("mnist_softmax");
// Function
diff --git a/model-evaluation/src/test/java/ai/vespa/models/handler/ModelsEvaluationHandlerTest.java b/model-evaluation/src/test/java/ai/vespa/models/handler/ModelsEvaluationHandlerTest.java
index 629c82f410a..c9e49d3be02 100644
--- a/model-evaluation/src/test/java/ai/vespa/models/handler/ModelsEvaluationHandlerTest.java
+++ b/model-evaluation/src/test/java/ai/vespa/models/handler/ModelsEvaluationHandlerTest.java
@@ -56,7 +56,7 @@ public class ModelsEvaluationHandlerTest {
public void testListModels() {
String url = "http://localhost/model-evaluation/v1";
String expected =
- "{\"mnist_softmax\":\"http://localhost/model-evaluation/v1/mnist_softmax\",\"mnist_saved\":\"http://localhost/model-evaluation/v1/mnist_saved\",\"mnist_softmax_saved\":\"http://localhost/model-evaluation/v1/mnist_softmax_saved\",\"xgboost_2_2\":\"http://localhost/model-evaluation/v1/xgboost_2_2\"}";
+ "{\"mnist_softmax\":\"http://localhost/model-evaluation/v1/mnist_softmax\",\"mnist_saved\":\"http://localhost/model-evaluation/v1/mnist_saved\",\"mnist_softmax_saved\":\"http://localhost/model-evaluation/v1/mnist_softmax_saved\",\"xgboost_2_2\":\"http://localhost/model-evaluation/v1/xgboost_2_2\",\"lightgbm_regression\":\"http://localhost/model-evaluation/v1/lightgbm_regression\"}";
assertResponse(url, 200, expected);
}
@@ -94,6 +94,39 @@ public class ModelsEvaluationHandlerTest {
}
@Test
+ public void testLightGBMEvaluationWithoutBindings() {
+ String url = "http://localhost/model-evaluation/v1/lightgbm_regression/eval";
+ String expected = "{\"cells\":[{\"address\":{},\"value\":1.9130086820218188}]}";
+ assertResponse(url, 200, expected);
+ }
+
+ @Test
+ public void testLightGBMEvaluationWithBindings() {
+ Map<String, String> properties = new HashMap<>();
+ properties.put("numerical_1", "0.1");
+ properties.put("numerical_2", "0.2");
+ properties.put("categorical_1", "a");
+ properties.put("categorical_2", "i");
+ properties.put("non-existing-binding", "-1");
+ String url = "http://localhost/model-evaluation/v1/lightgbm_regression/eval";
+ String expected = "{\"cells\":[{\"address\":{},\"value\":2.054697758469921}]}";
+ assertResponse(url, properties, 200, expected);
+ }
+
+ @Test
+ public void testLightGBMEvaluationWithMissingValue() {
+ Map<String, String> properties = new HashMap<>();
+ properties.put("missing-value", "-1.0");
+ properties.put("numerical_2", "0.5");
+ properties.put("categorical_1", "b");
+ properties.put("categorical_2", "j");
+ properties.put("non-existing-binding", "-1");
+ String url = "http://localhost/model-evaluation/v1/lightgbm_regression/eval";
+ String expected = "{\"cells\":[{\"address\":{},\"value\":2.0745534018208094}]}";
+ assertResponse(url, properties, 200, expected);
+ }
+
+ @Test
public void testMnistSoftmaxDetails() {
String url = "http://localhost:8080/model-evaluation/v1/mnist_softmax";
String expected = "{\"model\":\"mnist_softmax\",\"functions\":[{\"function\":\"default.add\",\"info\":\"http://localhost:8080/model-evaluation/v1/mnist_softmax/default.add\",\"eval\":\"http://localhost:8080/model-evaluation/v1/mnist_softmax/default.add/eval\",\"arguments\":[{\"name\":\"Placeholder\",\"type\":\"tensor(d0[],d1[784])\"}]}]}";
diff --git a/model-evaluation/src/test/resources/config/models/rank-profiles.cfg b/model-evaluation/src/test/resources/config/models/rank-profiles.cfg
index c25c5ba555b..385115b7cd4 100644
--- a/model-evaluation/src/test/resources/config/models/rank-profiles.cfg
+++ b/model-evaluation/src/test/resources/config/models/rank-profiles.cfg
@@ -26,3 +26,6 @@ rankprofile[3].fef.property[3].name "rankingExpression(serving_default.y).input.
rankprofile[3].fef.property[3].value "tensor(d0[],d1[784])"
rankprofile[3].fef.property[4].name "rankingExpression(serving_default.y).type"
rankprofile[3].fef.property[4].value "tensor(d1[10])"
+rankprofile[4].name "lightgbm_regression"
+rankprofile[4].fef.property[0].name "rankingExpression(lightgbm_regression).rankingScript"
+rankprofile[4].fef.property[0].value "if (!(numerical_2 >= 0.46643291586559305), 2.1594397038037663, if (categorical_2 in ["k", "l", "m"], 2.235297305276056, 2.1792953471546546)) + if (categorical_1 in ["d", "e"], 0.03070842919354316, if (!(numerical_1 >= 0.5102250691730842), -0.04439151147520909, 0.005117411709368601)) + if (!(numerical_2 >= 0.668665477622446), if (!(numerical_2 >= 0.008118820676863816), -0.15361238490967524, -0.01192330846157292), 0.03499044894987518) + if (!(numerical_1 >= 0.5201391072644542), -0.02141000620783247, if (categorical_1 in ["a", "b"], -0.004121485787596721, 0.04534090904886873)) + if (categorical_2 in ["k", "l", "m"], if (!(numerical_2 >= 0.27283279016959255), -0.01924803254356527, 0.03643772842347651), -0.02701711918923075)"
diff --git a/model-integration/src/main/config/model-integration.xml b/model-integration/src/main/config/model-integration.xml
index 90ec7d7275e..34f5f0ce31a 100644
--- a/model-integration/src/main/config/model-integration.xml
+++ b/model-integration/src/main/config/model-integration.xml
@@ -1,11 +1,12 @@
<!-- Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
<!-- Component which can import some ml model.
This is included into the config server services.xml to enable it to translate
- model pseudofeatures in ranking expressions during config mddel building.
+ model pseudo features in ranking expressions during config model building.
It is provided as separate bundles instead of being included in the config model
because some of these (TensorFlow) includes
JNI code, and so can only exist in one instance in the server. -->
<component id="ai.vespa.rankingexpression.importer.onnx.OnnxImporter" bundle="model-integration" />
<component id="ai.vespa.rankingexpression.importer.tensorflow.TensorFlowImporter" bundle="model-integration" />
<component id="ai.vespa.rankingexpression.importer.xgboost.XGBoostImporter" bundle="model-integration" />
+<component id="ai.vespa.rankingexpression.importer.lightgbm.LightGBMImporter" bundle="model-integration" />
<component id="ai.vespa.rankingexpression.importer.vespa.VespaImporter" bundle="model-integration" />
diff --git a/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImporter.java b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImporter.java
new file mode 100644
index 00000000000..76caa652ad2
--- /dev/null
+++ b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImporter.java
@@ -0,0 +1,54 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+import ai.vespa.rankingexpression.importer.ImportedModel;
+import ai.vespa.rankingexpression.importer.ModelImporter;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.searchlib.rankingexpression.RankingExpression;
+import com.yahoo.searchlib.rankingexpression.parser.ParseException;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Converts a LightGBM model into a ranking expression.
+ *
+ * @author lesters
+ */
+public class LightGBMImporter extends ModelImporter {
+
+ @Override
+ public boolean canImport(String modelPath) {
+ File modelFile = new File(modelPath);
+ if ( ! modelFile.isFile()) return false;
+ return modelFile.toString().endsWith(".json") && probe(modelFile);
+ }
+
+ /**
+ * Returns true if the give file looks like a LightGBM json file.
+ * Currently, we just check if the json has an element called "tree_info"
+ */
+ private boolean probe(File modelFile) {
+ try {
+ return new ObjectMapper().readTree(modelFile).has("tree_info");
+ } catch (IOException e) {
+ throw new IllegalArgumentException("Could not read '" + modelFile + "'", e);
+ }
+ }
+
+ @Override
+ public ImportedModel importModel(String modelName, String modelPath) {
+ try {
+ ImportedModel model = new ImportedModel(modelName, modelPath);
+ LightGBMParser parser = new LightGBMParser(modelPath);
+ RankingExpression expression = new RankingExpression(parser.toRankingExpression());
+ model.expression(modelName, expression);
+ return model;
+ } catch (IOException e) {
+ throw new IllegalArgumentException("Could not import LightGBM model from '" + modelPath + "'", e);
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Could not parse ranking expression resulting from '" + modelPath + "'", e);
+ }
+ }
+
+}
diff --git a/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMNode.java b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMNode.java
new file mode 100644
index 00000000000..dc76ed8cb6f
--- /dev/null
+++ b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMNode.java
@@ -0,0 +1,67 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+/**
+ * @author lesters
+ */
+public class LightGBMNode {
+
+ // split nodes
+ private int split_feature;
+ private String threshold; // double for numerical, string for categorical
+ private String decision_type;
+ private boolean default_left;
+ private String missing_type;
+ private int internal_count;
+ private LightGBMNode left_child;
+ private LightGBMNode right_child;
+
+ // leaf nodes
+ private double leaf_value;
+ private int leaf_count;
+
+ public int getSplit_feature() {
+ return split_feature;
+ }
+
+ public String getThreshold() {
+ return threshold;
+ }
+
+ public String getDecision_type() {
+ return decision_type;
+ }
+
+ public boolean isDefault_left() {
+ return default_left;
+ }
+
+ public String getMissing_type() {
+ return missing_type;
+ }
+
+ public int getInternal_count() {
+ return internal_count;
+ }
+
+ public LightGBMNode getLeft_child() {
+ return left_child;
+ }
+
+ public LightGBMNode getRight_child() {
+ return right_child;
+ }
+
+ public double getLeaf_value() {
+ return leaf_value;
+ }
+
+ public int getLeaf_count() {
+ return leaf_count;
+ }
+
+ public boolean isLeaf() {
+ return left_child == null && right_child == null;
+ }
+
+} \ No newline at end of file
diff --git a/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMParser.java b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMParser.java
new file mode 100644
index 00000000000..996343674ce
--- /dev/null
+++ b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMParser.java
@@ -0,0 +1,146 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+/**
+ * @author lesters
+ */
+class LightGBMParser {
+
+ private final String objective;
+ private final List<LightGBMNode> nodes;
+ private final List<String> featureNames;
+ private final Map<Integer, List<String>> categoryValues; // pr feature index
+
+ LightGBMParser(String filePath) throws JsonProcessingException, IOException {
+ ObjectMapper mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+ JsonNode root = mapper.readTree(new File(filePath));
+
+ objective = root.get("objective").asText("regression");
+ featureNames = parseFeatureNames(root);
+ nodes = parseTrees(mapper, root);
+ categoryValues = parseCategoryValues(root);
+ }
+
+ private List<String> parseFeatureNames(JsonNode root) {
+ List<String> features = new ArrayList<>();
+ for (JsonNode name : root.get("feature_names")) {
+ features.add(name.textValue());
+ }
+ return features;
+ }
+
+ private List<LightGBMNode> parseTrees(ObjectMapper mapper, JsonNode root) throws JsonProcessingException {
+ List<LightGBMNode> nodes = new ArrayList<>();
+ for (JsonNode treeNode : root.get("tree_info")) {
+ nodes.add(mapper.treeToValue(treeNode.get("tree_structure"), LightGBMNode.class));
+ }
+ return nodes;
+ }
+
+ private Map<Integer, List<String>> parseCategoryValues(JsonNode root) {
+ Map<Integer, List<String>> categoryValues = new HashMap<>();
+
+ // Since the JSON format does not explicitly tell which features are
+ // categorical, we traverse the decision tree looking for categorical
+ // decisions and use that to determine which categorical features.
+ Set<Integer> categoricalFeatures = new TreeSet<>();
+ nodes.forEach(node -> findCategoricalFeatures(node, categoricalFeatures));
+
+ // Again, the LightGBM JSON format does not explicitly tell which
+ // categorical values map to each categorical feature. The assumption
+ // here is that the order they appear in the "pandas_categorical"
+ // structure is the same order as the "feature_names".
+ var pandasFeatureIterator = root.get("pandas_categorical").iterator();
+ var categoricalFeatureIterator = categoricalFeatures.iterator();
+ while (pandasFeatureIterator.hasNext() && categoricalFeatureIterator.hasNext()) {
+ List<String> values = new ArrayList<>();
+ pandasFeatureIterator.next().forEach(value -> values.add(value.textValue()));
+ categoryValues.put(categoricalFeatureIterator.next(), values);
+ }
+
+ return categoryValues;
+ }
+
+ private void findCategoricalFeatures(LightGBMNode node, Set<Integer> categoricalFeatures) {
+ if (node == null || node.isLeaf()) {
+ return;
+ }
+ if (node.getDecision_type().equals("==")) {
+ categoricalFeatures.add(node.getSplit_feature());
+ }
+ findCategoricalFeatures(node.getLeft_child(), categoricalFeatures);
+ findCategoricalFeatures(node.getRight_child(), categoricalFeatures);
+ }
+
+ String toRankingExpression() {
+ return applyObjective(nodes.stream().map(this::nodeToRankingExpression).collect(Collectors.joining(" + \n")));
+ }
+
+ // See https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective
+ private String applyObjective(String expression) {
+ if (objective.startsWith("binary") || objective.equals("cross_entropy")) {
+ return "sigmoid(" + expression + ")";
+ }
+ if (objective.equals("poisson") || objective.equals("gamma") || objective.equals("tweedie")) {
+ return "exp(" + expression + ")";
+ }
+ return expression; // else: use expression directly
+ }
+
+ private String nodeToRankingExpression(LightGBMNode node) {
+ if (node.isLeaf()) {
+ return Double.toString(node.getLeaf_value());
+ } else {
+ String condition;
+ String feature = featureNames.get(node.getSplit_feature());
+ if (node.getDecision_type().equals("==")) {
+ String values = transformCategoryIndexesToValues(node);
+ if (node.isDefault_left()) { // means go left (true) when isNan
+ condition = "isNan(" + feature + ") || (" + feature + " in [ " + values + "])";
+ } else {
+ condition = feature + " in [" + values + "]";
+ }
+ } else { // assumption: all other decision types are <=
+ double value = Double.parseDouble(node.getThreshold());
+ if (node.isDefault_left()) {
+ condition = "!(" + feature + " >= " + value + ")";
+ } else {
+ condition = feature + " < " + value;
+ }
+ }
+ String left = nodeToRankingExpression(node.getLeft_child());
+ String right = nodeToRankingExpression(node.getRight_child());
+ return "if (" + condition + ", " + left + ", " + right + ")";
+ }
+ }
+
+ private String transformCategoryIndexesToValues(LightGBMNode node) {
+ return Arrays.stream(node.getThreshold().split("\\|\\|"))
+ .map(index -> "\"" + transformCategoryIndexToValue(node.getSplit_feature(), index) + "\"")
+ .collect(Collectors.joining(","));
+ }
+
+ private String transformCategoryIndexToValue(int featureIndex, String valueIndex) {
+ if ( ! categoryValues.containsKey(featureIndex) ) {
+ return valueIndex; // We don't have a pandas categorical lookup table
+ }
+ return categoryValues.get(featureIndex).get(Integer.parseInt(valueIndex));
+ }
+
+} \ No newline at end of file
diff --git a/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/package-info.java b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/package-info.java
new file mode 100644
index 00000000000..b29145ee21b
--- /dev/null
+++ b/model-integration/src/main/java/ai/vespa/rankingexpression/importer/lightgbm/package-info.java
@@ -0,0 +1,5 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImportEvaluationTestCase.java b/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImportEvaluationTestCase.java
new file mode 100644
index 00000000000..d2ef4b984ff
--- /dev/null
+++ b/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMImportEvaluationTestCase.java
@@ -0,0 +1,49 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+import com.yahoo.searchlib.rankingexpression.RankingExpression;
+import com.yahoo.searchlib.rankingexpression.evaluation.ArrayContext;
+import com.yahoo.searchlib.rankingexpression.evaluation.ContextIndex;
+import com.yahoo.searchlib.rankingexpression.evaluation.DoubleValue;
+import com.yahoo.searchlib.rankingexpression.evaluation.ExpressionOptimizer;
+import com.yahoo.searchlib.rankingexpression.evaluation.gbdtoptimization.GBDTForestNode;
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * @author lesters
+ */
+public class LightGBMImportEvaluationTestCase extends LightGBMTestBase {
+
+ @Test
+ public void testRegression() {
+ RankingExpression expression = importModel("src/test/models/lightgbm/regression.json");
+ ArrayContext context = new ArrayContext(expression, true, DoubleValue.NaN);
+
+ assertEvaluation(1.91300868, expression, features(context));
+ assertEvaluation(2.05469776, expression, features(context).add("numerical_1", 0.1).add("numerical_2", 0.2).add("categorical_1", "a").add("categorical_2", "i"));
+ assertEvaluation(2.0745534, expression, features(context).add("numerical_2", 0.5).add("categorical_1", "b").add("categorical_2", "j"));
+ assertEvaluation(2.3571838, expression, features(context).add("numerical_1", 0.7).add("numerical_2", 0.8).add("categorical_2", "m"));
+
+ ExpressionOptimizer optimizer = new ExpressionOptimizer();
+ optimizer.optimize(expression, (ContextIndex)context);
+ assertTrue(expression.getRoot() instanceof GBDTForestNode);
+
+ assertEvaluation(1.91300868, expression, features(context));
+ assertEvaluation(2.05469776, expression, features(context).add("numerical_1", 0.1).add("numerical_2", 0.2).add("categorical_1", "a").add("categorical_2", "i"));
+ assertEvaluation(2.0745534, expression, features(context).add("numerical_2", 0.5).add("categorical_1", "b").add("categorical_2", "j"));
+ assertEvaluation(2.3571838, expression, features(context).add("numerical_1", 0.7).add("numerical_2", 0.8).add("categorical_2", "m"));
+ }
+
+ @Test
+ public void testClassification() {
+ RankingExpression expression = importModel("src/test/models/lightgbm/classification.json");
+ ArrayContext context = new ArrayContext(expression, DoubleValue.NaN);
+ assertEvaluation(0.37464997, expression, features(context));
+ assertEvaluation(0.37464997, expression, features(context).add("numerical_1", 0.1).add("numerical_2", 0.2).add("categorical_1", "a").add("categorical_2", "i"));
+ assertEvaluation(0.38730827, expression, features(context).add("numerical_2", 0.5).add("categorical_1", "b").add("categorical_2", "j"));
+ assertEvaluation(0.5647872, expression, features(context).add("numerical_1", 0.7).add("numerical_2", 0.8).add("categorical_2", "m"));
+ }
+
+}
diff --git a/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMTestBase.java b/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMTestBase.java
new file mode 100644
index 00000000000..80c2ce68394
--- /dev/null
+++ b/model-integration/src/test/java/ai/vespa/rankingexpression/importer/lightgbm/LightGBMTestBase.java
@@ -0,0 +1,42 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.rankingexpression.importer.lightgbm;
+
+import com.yahoo.searchlib.rankingexpression.RankingExpression;
+import com.yahoo.searchlib.rankingexpression.evaluation.ArrayContext;
+import com.yahoo.searchlib.rankingexpression.evaluation.StringValue;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author lesters
+ */
+class LightGBMTestBase {
+
+ RankingExpression importModel(String path) {
+ return new LightGBMImporter().importModel("lightgbm", path).expressions().get("lightgbm");
+ }
+
+ void assertEvaluation(double expected, RankingExpression expr, TestFeatures features) {
+ assertEquals(expected, expr.evaluate(features.context).asDouble(), 1e-6);
+ }
+
+ TestFeatures features(ArrayContext context) {
+ return new TestFeatures(context.clone());
+ }
+
+ static class TestFeatures {
+ private final ArrayContext context;
+ TestFeatures(ArrayContext context) {
+ this.context = context;
+ }
+ TestFeatures add(String name, double value) {
+ context.put(name, value);
+ return this;
+ }
+ TestFeatures add(String name, String value) {
+ context.put(name, new StringValue(value));
+ return this;
+ }
+ }
+
+}
diff --git a/model-integration/src/test/models/lightgbm/classification.json b/model-integration/src/test/models/lightgbm/classification.json
new file mode 100644
index 00000000000..1087446519d
--- /dev/null
+++ b/model-integration/src/test/models/lightgbm/classification.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "binary sigmoid:1",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 2,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 13080.099609375,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 100000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 8303.599609375,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.598248,
+ "internal_weight": 14841.2,
+ "internal_count": 59371,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.10149588882231209,
+ "leaf_weight": 8812.104370772839,
+ "leaf_count": 35252
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.05076009488472203,
+ "leaf_weight": 6029.137221112847,
+ "leaf_count": 24119
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.1075553310531564,
+ "leaf_weight": 10156.217760130763,
+ "leaf_count": 40629
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 12144.5,
+ "threshold": 0.4932456977560694,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 100000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.07039230856418545,
+ "leaf_weight": 12362.572675153613,
+ "leaf_count": 49561
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 6445.509765625,
+ "threshold": 0.4026061210695467,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.691647,
+ "internal_weight": 12581.6,
+ "internal_count": 50439,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.016713933964828474,
+ "leaf_weight": 5157.183633238077,
+ "leaf_count": 20675
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.12881836794307533,
+ "leaf_weight": 7424.385220557451,
+ "leaf_count": 29764
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 2,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 11470.099609375,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 100000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.0837843210726433,
+ "leaf_weight": 9858.360527098179,
+ "leaf_count": 39612
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 8077.8701171875,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": -0.549408,
+ "internal_weight": 15039.7,
+ "internal_count": 60388,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.035561394754096094,
+ "leaf_weight": 5955.117423638701,
+ "leaf_count": 23896
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.11424082565448186,
+ "leaf_weight": 9084.538012728095,
+ "leaf_count": 36492
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 11022.599609375,
+ "threshold": 0.5135386524711826,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 100000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 5789.919921875,
+ "threshold": 0.6237474076885036,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.641438,
+ "internal_weight": 12881.9,
+ "internal_count": 51907,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.11613056205533928,
+ "leaf_weight": 8044.6355674266815,
+ "leaf_count": 32426
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.022313103333779363,
+ "leaf_weight": 4837.266924858093,
+ "leaf_count": 19481
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.06927713686880098,
+ "leaf_weight": 11923.512641906738,
+ "leaf_count": 48093
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 2,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 9828.9501953125,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 100000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.07771712562582928,
+ "leaf_weight": 9804.427681803703,
+ "leaf_count": 39586
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 6332.2900390625,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": -0.51112,
+ "internal_weight": 14922.7,
+ "internal_count": 60414,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.029062142260340918,
+ "leaf_weight": 5933.120021238923,
+ "leaf_count": 23922
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.10400033924773491,
+ "leaf_weight": 8989.602796778083,
+ "leaf_count": 36492
+ }
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/model-integration/src/test/models/lightgbm/regression.json b/model-integration/src/test/models/lightgbm/regression.json
new file mode 100644
index 00000000000..cf0488ecd8b
--- /dev/null
+++ b/model-integration/src/test/models/lightgbm/regression.json
@@ -0,0 +1,275 @@
+{
+ "name": "tree",
+ "version": "v3",
+ "num_class": 1,
+ "num_tree_per_iteration": 1,
+ "label_index": 0,
+ "max_feature_idx": 3,
+ "average_output": false,
+ "objective": "regression",
+ "feature_names": [
+ "numerical_1",
+ "numerical_2",
+ "categorical_1",
+ "categorical_2"
+ ],
+ "monotone_constraints": [],
+ "tree_info": [
+ {
+ "tree_index": 0,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 68.5353012084961,
+ "threshold": 0.46643291586559305,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 2.1594397038037663,
+ "leaf_weight": 469,
+ "leaf_count": 469
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 3,
+ "split_gain": 41.27640151977539,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.246035,
+ "internal_weight": 531,
+ "internal_count": 531,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": 2.235297305276056,
+ "leaf_weight": 302,
+ "leaf_count": 302
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 2.1792953471546546,
+ "leaf_weight": 229,
+ "leaf_count": 229
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 1,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 2,
+ "split_gain": 64.22250366210938,
+ "threshold": "3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": 0.03070842919354316,
+ "leaf_weight": 399,
+ "leaf_count": 399
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 0,
+ "split_gain": 36.74250030517578,
+ "threshold": 0.5102250691730842,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.204906,
+ "internal_weight": 601,
+ "internal_count": 601,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.04439151147520909,
+ "leaf_weight": 315,
+ "leaf_count": 315
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.005117411709368601,
+ "leaf_weight": 286,
+ "leaf_count": 286
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 2,
+ "num_leaves": 3,
+ "num_cat": 0,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 1,
+ "split_gain": 57.1327018737793,
+ "threshold": 0.668665477622446,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 40.859100341796875,
+ "threshold": 0.008118820676863816,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": -0.162926,
+ "internal_weight": 681,
+ "internal_count": 681,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.15361238490967524,
+ "leaf_weight": 21,
+ "leaf_count": 21
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": -0.01192330846157292,
+ "leaf_weight": 660,
+ "leaf_count": 660
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": 0.03499044894987518,
+ "leaf_weight": 319,
+ "leaf_count": 319
+ }
+ }
+ },
+ {
+ "tree_index": 3,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 0,
+ "split_gain": 54.77090072631836,
+ "threshold": 0.5201391072644542,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.02141000620783247,
+ "leaf_weight": 543,
+ "leaf_count": 543
+ },
+ "right_child": {
+ "split_index": 1,
+ "split_feature": 2,
+ "split_gain": 27.200700759887695,
+ "threshold": "0||1",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0.255704,
+ "internal_weight": 457,
+ "internal_count": 457,
+ "left_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.004121485787596721,
+ "leaf_weight": 191,
+ "leaf_count": 191
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.04534090904886873,
+ "leaf_weight": 266,
+ "leaf_count": 266
+ }
+ }
+ }
+ },
+ {
+ "tree_index": 4,
+ "num_leaves": 3,
+ "num_cat": 1,
+ "shrinkage": 0.1,
+ "tree_structure": {
+ "split_index": 0,
+ "split_feature": 3,
+ "split_gain": 51.84349822998047,
+ "threshold": "2||3||4",
+ "decision_type": "==",
+ "default_left": false,
+ "missing_type": "NaN",
+ "internal_value": 0,
+ "internal_weight": 0,
+ "internal_count": 1000,
+ "left_child": {
+ "split_index": 1,
+ "split_feature": 1,
+ "split_gain": 39.352699279785156,
+ "threshold": 0.27283279016959255,
+ "decision_type": "<=",
+ "default_left": true,
+ "missing_type": "NaN",
+ "internal_value": 0.188414,
+ "internal_weight": 593,
+ "internal_count": 593,
+ "left_child": {
+ "leaf_index": 0,
+ "leaf_value": -0.01924803254356527,
+ "leaf_weight": 184,
+ "leaf_count": 184
+ },
+ "right_child": {
+ "leaf_index": 2,
+ "leaf_value": 0.03643772842347651,
+ "leaf_weight": 409,
+ "leaf_count": 409
+ }
+ },
+ "right_child": {
+ "leaf_index": 1,
+ "leaf_value": -0.02701711918923075,
+ "leaf_weight": 407,
+ "leaf_count": 407
+ }
+ }
+ }
+ ],
+ "pandas_categorical": [
+ [
+ "a",
+ "b",
+ "c",
+ "d",
+ "e"
+ ],
+ [
+ "i",
+ "j",
+ "k",
+ "l",
+ "m"
+ ]
+ ]
+} \ No newline at end of file
diff --git a/model-integration/src/test/models/lightgbm/train_lightgbm_classification.py b/model-integration/src/test/models/lightgbm/train_lightgbm_classification.py
new file mode 100755
index 00000000000..ac00437d192
--- /dev/null
+++ b/model-integration/src/test/models/lightgbm/train_lightgbm_classification.py
@@ -0,0 +1,54 @@
+#! /usr/bin/env python3
+# coding: utf-8
+
+import json
+import random
+
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+
+
+def category_value(arr):
+ values = { np.NaN: 0, "a":1, "b":2, "c":3, "d":4, "e":5, "i":1, "j":2, "k":3, "l":4, "m":5 }
+ return np.array([ 0.21 * values[i] for i in arr ])
+
+# Create training set
+num_examples = 100000
+missing_prob = 0.01
+features = pd.DataFrame({
+ "numerical_1": np.random.random(num_examples),
+ "numerical_2": np.random.random(num_examples),
+ "categorical_1": pd.Series(np.random.permutation(["a", "b", "c", "d", "e"] * int(num_examples/5)), dtype="category"),
+ "categorical_2": pd.Series(np.random.permutation(["i", "j", "k", "l", "m"] * int(num_examples/5)), dtype="category"),
+ })
+
+# randomly insert missing values
+for i in range(int(num_examples * len(features.columns) * missing_prob)):
+ features.loc[random.randint(0, num_examples-1), features.columns[random.randint(0, len(features.columns)-1)]] = None
+
+# create targets (with 0.0 as default for missing values)
+target = features["numerical_1"] + features["numerical_2"] + category_value(features["categorical_1"]) + category_value(features["categorical_2"])
+target = (target > 2.24) * 1.0
+lgb_train = lgb.Dataset(features, target)
+
+# Train model
+params = {
+ 'objective': 'binary',
+ 'metric': 'binary_logloss',
+ 'num_leaves': 3,
+}
+model = lgb.train(params, lgb_train, num_boost_round=5)
+
+# Save model
+with open("classification.json", "w") as f:
+ json.dump(model.dump_model(), f, indent=2)
+
+# Predict (for comparison with Vespa evaluation)
+predict_features = pd.DataFrame({
+ "numerical_1": pd.Series([ None, 0.1, None, 0.7]),
+ "numerical_2": pd.Series([np.NaN, 0.2, 0.5, 0.8]),
+ "categorical_1": pd.Series([ None, "a", "b", None], dtype="category"),
+ "categorical_2": pd.Series([ None, "i", "j", "m"], dtype="category"),
+ })
+print(model.predict(predict_features))
diff --git a/model-integration/src/test/models/lightgbm/train_lightgbm_regression.py b/model-integration/src/test/models/lightgbm/train_lightgbm_regression.py
new file mode 100755
index 00000000000..3e74e38da35
--- /dev/null
+++ b/model-integration/src/test/models/lightgbm/train_lightgbm_regression.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python3
+# coding: utf-8
+
+import json
+import random
+
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+
+
+def category_value(arr):
+ values = { np.NaN: 0, "a":1, "b":2, "c":3, "d":4, "e":5, "i":1, "j":2, "k":3, "l":4, "m":5 }
+ return np.array([ 0.21 * values[i] for i in arr ])
+
+# Create training set
+num_examples = 100000
+missing_prob = 0.01
+features = pd.DataFrame({
+ "numerical_1": np.random.random(num_examples),
+ "numerical_2": np.random.random(num_examples),
+ "categorical_1": pd.Series(np.random.permutation(["a", "b", "c", "d", "e"] * int(num_examples/5)), dtype="category"),
+ "categorical_2": pd.Series(np.random.permutation(["i", "j", "k", "l", "m"] * int(num_examples/5)), dtype="category"),
+ })
+
+# randomly insert missing values
+for i in range(int(num_examples * len(features.columns) * missing_prob)):
+ features.loc[random.randint(0, num_examples-1), features.columns[random.randint(0, len(features.columns)-1)]] = None
+
+# create targets (with 0.0 as default for missing values)
+target = features["numerical_1"] + features["numerical_2"] + category_value(features["categorical_1"]) + category_value(features["categorical_2"])
+lgb_train = lgb.Dataset(features, target)
+
+# Train model
+params = {
+ 'objective': 'mse',
+ 'metric': {'l2', 'l1'},
+ 'num_leaves': 3,
+}
+model = lgb.train(params, lgb_train, num_boost_round=2)
+
+# Save model
+with open("regression.json", "w") as f:
+ json.dump(model.dump_model(), f, indent=2)
+
+# Predict (for comparison with Vespa evaluation)
+predict_features = pd.DataFrame({
+ "numerical_1": pd.Series([ None, 0.1, None, 0.7]),
+ "numerical_2": pd.Series([np.NaN, 0.2, 0.5, 0.8]),
+ "categorical_1": pd.Series([ None, "a", "b", None], dtype="category"),
+ "categorical_2": pd.Series([ None, "i", "j", "m"], dtype="category"),
+ })
+print(model.predict(predict_features))