diff options
Diffstat (limited to 'model-integration/src/test/models/lightgbm')
4 files changed, 657 insertions, 0 deletions
diff --git a/model-integration/src/test/models/lightgbm/classification.json b/model-integration/src/test/models/lightgbm/classification.json new file mode 100644 index 00000000000..1087446519d --- /dev/null +++ b/model-integration/src/test/models/lightgbm/classification.json @@ -0,0 +1,275 @@ +{ + "name": "tree", + "version": "v3", + "num_class": 1, + "num_tree_per_iteration": 1, + "label_index": 0, + "max_feature_idx": 3, + "average_output": false, + "objective": "binary sigmoid:1", + "feature_names": [ + "numerical_1", + "numerical_2", + "categorical_1", + "categorical_2" + ], + "monotone_constraints": [], + "tree_info": [ + { + "tree_index": 0, + "num_leaves": 3, + "num_cat": 2, + "shrinkage": 1, + "tree_structure": { + "split_index": 0, + "split_feature": 3, + "split_gain": 13080.099609375, + "threshold": "2||3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 100000, + "left_child": { + "split_index": 1, + "split_feature": 2, + "split_gain": 8303.599609375, + "threshold": "2||3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0.598248, + "internal_weight": 14841.2, + "internal_count": 59371, + "left_child": { + "leaf_index": 0, + "leaf_value": 0.10149588882231209, + "leaf_weight": 8812.104370772839, + "leaf_count": 35252 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": -0.05076009488472203, + "leaf_weight": 6029.137221112847, + "leaf_count": 24119 + } + }, + "right_child": { + "leaf_index": 1, + "leaf_value": -0.1075553310531564, + "leaf_weight": 10156.217760130763, + "leaf_count": 40629 + } + } + }, + { + "tree_index": 1, + "num_leaves": 3, + "num_cat": 0, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 1, + "split_gain": 12144.5, + "threshold": 0.4932456977560694, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0, + 
"internal_weight": 0, + "internal_count": 100000, + "left_child": { + "leaf_index": 0, + "leaf_value": -0.07039230856418545, + "leaf_weight": 12362.572675153613, + "leaf_count": 49561 + }, + "right_child": { + "split_index": 1, + "split_feature": 0, + "split_gain": 6445.509765625, + "threshold": 0.4026061210695467, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0.691647, + "internal_weight": 12581.6, + "internal_count": 50439, + "left_child": { + "leaf_index": 1, + "leaf_value": -0.016713933964828474, + "leaf_weight": 5157.183633238077, + "leaf_count": 20675 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 0.12881836794307533, + "leaf_weight": 7424.385220557451, + "leaf_count": 29764 + } + } + } + }, + { + "tree_index": 2, + "num_leaves": 3, + "num_cat": 2, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 2, + "split_gain": 11470.099609375, + "threshold": "3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 100000, + "left_child": { + "leaf_index": 0, + "leaf_value": 0.0837843210726433, + "leaf_weight": 9858.360527098179, + "leaf_count": 39612 + }, + "right_child": { + "split_index": 1, + "split_feature": 3, + "split_gain": 8077.8701171875, + "threshold": "3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": -0.549408, + "internal_weight": 15039.7, + "internal_count": 60388, + "left_child": { + "leaf_index": 1, + "leaf_value": 0.035561394754096094, + "leaf_weight": 5955.117423638701, + "leaf_count": 23896 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": -0.11424082565448186, + "leaf_weight": 9084.538012728095, + "leaf_count": 36492 + } + } + } + }, + { + "tree_index": 3, + "num_leaves": 3, + "num_cat": 0, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 0, + "split_gain": 11022.599609375, + 
"threshold": 0.5135386524711826, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 100000, + "left_child": { + "split_index": 1, + "split_feature": 1, + "split_gain": 5789.919921875, + "threshold": 0.6237474076885036, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": -0.641438, + "internal_weight": 12881.9, + "internal_count": 51907, + "left_child": { + "leaf_index": 0, + "leaf_value": -0.11613056205533928, + "leaf_weight": 8044.6355674266815, + "leaf_count": 32426 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 0.022313103333779363, + "leaf_weight": 4837.266924858093, + "leaf_count": 19481 + } + }, + "right_child": { + "leaf_index": 1, + "leaf_value": 0.06927713686880098, + "leaf_weight": 11923.512641906738, + "leaf_count": 48093 + } + } + }, + { + "tree_index": 4, + "num_leaves": 3, + "num_cat": 2, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 3, + "split_gain": 9828.9501953125, + "threshold": "3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 100000, + "left_child": { + "leaf_index": 0, + "leaf_value": 0.07771712562582928, + "leaf_weight": 9804.427681803703, + "leaf_count": 39586 + }, + "right_child": { + "split_index": 1, + "split_feature": 2, + "split_gain": 6332.2900390625, + "threshold": "3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": -0.51112, + "internal_weight": 14922.7, + "internal_count": 60414, + "left_child": { + "leaf_index": 1, + "leaf_value": 0.029062142260340918, + "leaf_weight": 5933.120021238923, + "leaf_count": 23922 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": -0.10400033924773491, + "leaf_weight": 8989.602796778083, + "leaf_count": 36492 + } + } + } + } + ], + "pandas_categorical": [ + [ + "a", + 
"b", + "c", + "d", + "e" + ], + [ + "i", + "j", + "k", + "l", + "m" + ] + ] +}
\ No newline at end of file diff --git a/model-integration/src/test/models/lightgbm/regression.json b/model-integration/src/test/models/lightgbm/regression.json new file mode 100644 index 00000000000..cf0488ecd8b --- /dev/null +++ b/model-integration/src/test/models/lightgbm/regression.json @@ -0,0 +1,275 @@ +{ + "name": "tree", + "version": "v3", + "num_class": 1, + "num_tree_per_iteration": 1, + "label_index": 0, + "max_feature_idx": 3, + "average_output": false, + "objective": "regression", + "feature_names": [ + "numerical_1", + "numerical_2", + "categorical_1", + "categorical_2" + ], + "monotone_constraints": [], + "tree_info": [ + { + "tree_index": 0, + "num_leaves": 3, + "num_cat": 1, + "shrinkage": 1, + "tree_structure": { + "split_index": 0, + "split_feature": 1, + "split_gain": 68.5353012084961, + "threshold": 0.46643291586559305, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 1000, + "left_child": { + "leaf_index": 0, + "leaf_value": 2.1594397038037663, + "leaf_weight": 469, + "leaf_count": 469 + }, + "right_child": { + "split_index": 1, + "split_feature": 3, + "split_gain": 41.27640151977539, + "threshold": "2||3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0.246035, + "internal_weight": 531, + "internal_count": 531, + "left_child": { + "leaf_index": 1, + "leaf_value": 2.235297305276056, + "leaf_weight": 302, + "leaf_count": 302 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 2.1792953471546546, + "leaf_weight": 229, + "leaf_count": 229 + } + } + } + }, + { + "tree_index": 1, + "num_leaves": 3, + "num_cat": 1, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 2, + "split_gain": 64.22250366210938, + "threshold": "3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 1000, + 
"left_child": { + "leaf_index": 0, + "leaf_value": 0.03070842919354316, + "leaf_weight": 399, + "leaf_count": 399 + }, + "right_child": { + "split_index": 1, + "split_feature": 0, + "split_gain": 36.74250030517578, + "threshold": 0.5102250691730842, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": -0.204906, + "internal_weight": 601, + "internal_count": 601, + "left_child": { + "leaf_index": 1, + "leaf_value": -0.04439151147520909, + "leaf_weight": 315, + "leaf_count": 315 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 0.005117411709368601, + "leaf_weight": 286, + "leaf_count": 286 + } + } + } + }, + { + "tree_index": 2, + "num_leaves": 3, + "num_cat": 0, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 1, + "split_gain": 57.1327018737793, + "threshold": 0.668665477622446, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 1000, + "left_child": { + "split_index": 1, + "split_feature": 1, + "split_gain": 40.859100341796875, + "threshold": 0.008118820676863816, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": -0.162926, + "internal_weight": 681, + "internal_count": 681, + "left_child": { + "leaf_index": 0, + "leaf_value": -0.15361238490967524, + "leaf_weight": 21, + "leaf_count": 21 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": -0.01192330846157292, + "leaf_weight": 660, + "leaf_count": 660 + } + }, + "right_child": { + "leaf_index": 1, + "leaf_value": 0.03499044894987518, + "leaf_weight": 319, + "leaf_count": 319 + } + } + }, + { + "tree_index": 3, + "num_leaves": 3, + "num_cat": 1, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 0, + "split_gain": 54.77090072631836, + "threshold": 0.5201391072644542, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0, + 
"internal_weight": 0, + "internal_count": 1000, + "left_child": { + "leaf_index": 0, + "leaf_value": -0.02141000620783247, + "leaf_weight": 543, + "leaf_count": 543 + }, + "right_child": { + "split_index": 1, + "split_feature": 2, + "split_gain": 27.200700759887695, + "threshold": "0||1", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0.255704, + "internal_weight": 457, + "internal_count": 457, + "left_child": { + "leaf_index": 1, + "leaf_value": -0.004121485787596721, + "leaf_weight": 191, + "leaf_count": 191 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 0.04534090904886873, + "leaf_weight": 266, + "leaf_count": 266 + } + } + } + }, + { + "tree_index": 4, + "num_leaves": 3, + "num_cat": 1, + "shrinkage": 0.1, + "tree_structure": { + "split_index": 0, + "split_feature": 3, + "split_gain": 51.84349822998047, + "threshold": "2||3||4", + "decision_type": "==", + "default_left": false, + "missing_type": "NaN", + "internal_value": 0, + "internal_weight": 0, + "internal_count": 1000, + "left_child": { + "split_index": 1, + "split_feature": 1, + "split_gain": 39.352699279785156, + "threshold": 0.27283279016959255, + "decision_type": "<=", + "default_left": true, + "missing_type": "NaN", + "internal_value": 0.188414, + "internal_weight": 593, + "internal_count": 593, + "left_child": { + "leaf_index": 0, + "leaf_value": -0.01924803254356527, + "leaf_weight": 184, + "leaf_count": 184 + }, + "right_child": { + "leaf_index": 2, + "leaf_value": 0.03643772842347651, + "leaf_weight": 409, + "leaf_count": 409 + } + }, + "right_child": { + "leaf_index": 1, + "leaf_value": -0.02701711918923075, + "leaf_weight": 407, + "leaf_count": 407 + } + } + } + ], + "pandas_categorical": [ + [ + "a", + "b", + "c", + "d", + "e" + ], + [ + "i", + "j", + "k", + "l", + "m" + ] + ] +}
# ---------------------------------------------------------------------------
# File: model-integration/src/test/models/lightgbm/train_lightgbm_classification.py
#
# Trains a tiny LightGBM binary classifier on synthetic data, dumps the model
# to classification.json and prints predictions for a few hand-picked rows
# (for comparison with Vespa evaluation).
# ---------------------------------------------------------------------------
#! /usr/bin/env python3
# coding: utf-8

import json
import random

import lightgbm as lgb
import numpy as np
import pandas as pd

# Seed both generators used below so that reruns draw the same synthetic data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


def category_value(arr):
    """Return the numeric contribution of each categorical label in ``arr``.

    Labels "a".."e" and "i".."m" rank 1..5 and contribute ``0.21 * rank``.
    Missing entries (NaN / None) contribute 0.0.

    Raises:
        KeyError: if a non-missing label is not one of the known categories.
    """
    ranks = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5,
             "i": 1, "j": 2, "k": 3, "l": 4, "m": 5}
    # Test for missing values explicitly instead of using NaN as a dict key:
    # NaN != NaN, so a NaN key is only found through object identity.
    return np.array([0.0 if pd.isna(label) else 0.21 * ranks[label] for label in arr])


# Create training set
num_examples = 100000
missing_prob = 0.01
features = pd.DataFrame({
    "numerical_1": np.random.random(num_examples),
    "numerical_2": np.random.random(num_examples),
    "categorical_1": pd.Series(np.random.permutation(["a", "b", "c", "d", "e"] * (num_examples // 5)), dtype="category"),
    "categorical_2": pd.Series(np.random.permutation(["i", "j", "k", "l", "m"] * (num_examples // 5)), dtype="category"),
})

# randomly insert missing values (cells are drawn with replacement)
for _ in range(int(num_examples * len(features.columns) * missing_prob)):
    row = random.randint(0, num_examples - 1)
    column = features.columns[random.randint(0, len(features.columns) - 1)]
    features.loc[row, column] = None

# create targets (with 0.0 as default for missing values)
# The numerical columns need an explicit fillna: NaN + x is NaN, which would
# otherwise force the label of every such row to 0 after the comparison below.
target = (
    features["numerical_1"].fillna(0.0)
    + features["numerical_2"].fillna(0.0)
    + category_value(features["categorical_1"])
    + category_value(features["categorical_2"])
)
target = (target > 2.24) * 1.0
lgb_train = lgb.Dataset(features, target)

# Train model
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 3,
}
model = lgb.train(params, lgb_train, num_boost_round=5)

# Save model
with open("classification.json", "w") as f:
    json.dump(model.dump_model(), f, indent=2)

# Predict (for comparison with Vespa evaluation)
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
predict_features = pd.DataFrame({
    "numerical_1": pd.Series([None, 0.1, None, 0.7]),
    "numerical_2": pd.Series([np.nan, 0.2, 0.5, 0.8]),
    "categorical_1": pd.Series([None, "a", "b", None], dtype="category"),
    "categorical_2": pd.Series([None, "i", "j", "m"], dtype="category"),
})
print(model.predict(predict_features))


# ---------------------------------------------------------------------------
# File: model-integration/src/test/models/lightgbm/train_lightgbm_regression.py
#
# Trains a tiny LightGBM regression model on synthetic data, dumps the model
# to regression.json and prints predictions for a few hand-picked rows
# (for comparison with Vespa evaluation).
# ---------------------------------------------------------------------------
#! /usr/bin/env python3
# coding: utf-8

import json
import random

import lightgbm as lgb
import numpy as np
import pandas as pd

# Seed both generators used below so that reruns draw the same synthetic data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


def category_value(arr):
    """Return the numeric contribution of each categorical label in ``arr``.

    Labels "a".."e" and "i".."m" rank 1..5 and contribute ``0.21 * rank``.
    Missing entries (NaN / None) contribute 0.0.

    Raises:
        KeyError: if a non-missing label is not one of the known categories.
    """
    ranks = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5,
             "i": 1, "j": 2, "k": 3, "l": 4, "m": 5}
    # Test for missing values explicitly instead of using NaN as a dict key:
    # NaN != NaN, so a NaN key is only found through object identity.
    return np.array([0.0 if pd.isna(label) else 0.21 * ranks[label] for label in arr])


# Create training set
# NOTE(review): the regression.json added alongside this script has five trees
# and a root internal_count of 1000, whereas the settings here are 100000
# examples and num_boost_round=2 -- presumably the fixture was generated with
# different values; confirm before regenerating it.
num_examples = 100000
missing_prob = 0.01
features = pd.DataFrame({
    "numerical_1": np.random.random(num_examples),
    "numerical_2": np.random.random(num_examples),
    "categorical_1": pd.Series(np.random.permutation(["a", "b", "c", "d", "e"] * (num_examples // 5)), dtype="category"),
    "categorical_2": pd.Series(np.random.permutation(["i", "j", "k", "l", "m"] * (num_examples // 5)), dtype="category"),
})

# randomly insert missing values (cells are drawn with replacement)
for _ in range(int(num_examples * len(features.columns) * missing_prob)):
    row = random.randint(0, num_examples - 1)
    column = features.columns[random.randint(0, len(features.columns) - 1)]
    features.loc[row, column] = None

# create targets (with 0.0 as default for missing values)
# The numerical columns need an explicit fillna: NaN + x is NaN, which would
# otherwise put NaN regression targets into the training set.
target = (
    features["numerical_1"].fillna(0.0)
    + features["numerical_2"].fillna(0.0)
    + category_value(features["categorical_1"])
    + category_value(features["categorical_2"])
)
lgb_train = lgb.Dataset(features, target)

# Train model
params = {
    'objective': 'mse',
    'metric': {'l2', 'l1'},
    'num_leaves': 3,
}
model = lgb.train(params, lgb_train, num_boost_round=2)

# Save model
with open("regression.json", "w") as f:
    json.dump(model.dump_model(), f, indent=2)

# Predict (for comparison with Vespa evaluation)
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
predict_features = pd.DataFrame({
    "numerical_1": pd.Series([None, 0.1, None, 0.7]),
    "numerical_2": pd.Series([np.nan, 0.2, 0.5, 0.8]),
    "categorical_1": pd.Series([None, "a", "b", None], dtype="category"),
    "categorical_2": pd.Series([None, "i", "j", "m"], dtype="category"),
})
print(model.predict(predict_features))