Diffstat (limited to 'model-integration/src/main/resources/configdefinitions/llm-local-client.def')
 model-integration/src/main/resources/configdefinitions/llm-local-client.def | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
diff --git a/model-integration/src/main/resources/configdefinitions/llm-local-client.def b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
new file mode 100755
index 00000000000..c06c24b33e5
--- /dev/null
+++ b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
@@ -0,0 +1,29 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package=ai.vespa.llm.clients
+
+# The LLM model to use
+model model
+
+# Maximum number of requests to handle in parallel per container node
+parallelRequests int default=10
+
+# Additional number of requests to put in queue for processing before starting to reject new requests
+maxQueueSize int default=10
+
+# Use GPU
+useGpu bool default=false
+
+# Maximum number of model layers to run on GPU
+gpuLayers int default=1000000
+
+# Number of threads to use for CPU processing; -1 means use all available cores
+# Not used for GPU processing
+threads int default=-1
+
+# Context size for the model
+# The context is divided evenly among parallel requests, so with 10 parallel requests each "slot" gets 1/10 of the context
+contextSize int default=512
+
+# Maximum number of tokens to generate in one request - overridden by inference parameters
+maxTokens int default=512
+
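These defaults can be overridden per application in services.xml when wiring up the local LLM client component. The sketch below is a minimal, assumed example: the component class (ai.vespa.llm.clients.LocalLLM), the bundle name, and the model path are assumptions, not taken from this diff; the config name follows the standard Vespa convention of package plus def-file name (ai.vespa.llm.clients.llm-local-client).

    <container version="1.0">
      <!-- Assumed component class and bundle; verify against the Vespa documentation -->
      <component id="llm" class="ai.vespa.llm.clients.LocalLLM" bundle="model-integration">
        <config name="ai.vespa.llm.clients.llm-local-client">
          <!-- hypothetical model file shipped inside the application package -->
          <model path="models/model.gguf" />
          <parallelRequests>4</parallelRequests>
          <maxQueueSize>20</maxQueueSize>
          <useGpu>true</useGpu>
          <contextSize>4096</contextSize>
        </config>
      </component>
    </container>

Note how contextSize interacts with parallelRequests: with contextSize=4096 and parallelRequests=4 as above, each request "slot" gets roughly 1024 tokens of context.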