Diffstat (limited to 'container-search/src/main/resources/configdefinitions/llm-local-client.def')
 container-search/src/main/resources/configdefinitions/llm-local-client.def | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+), 0 deletions(-)
diff --git a/container-search/src/main/resources/configdefinitions/llm-local-client.def b/container-search/src/main/resources/configdefinitions/llm-local-client.def
new file mode 100755
index 00000000000..08eab19f0f8
--- /dev/null
+++ b/container-search/src/main/resources/configdefinitions/llm-local-client.def
@@ -0,0 +1,32 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package=ai.vespa.llm.clients
+
+# URL of the model to use
+modelUrl url default=""
+
+# Local file path to the model to use - takes precedence over modelUrl if set - mostly for testing
+localLlmFile string default=""
+
+# Maximum number of requests to handle in parallel per container node
+parallelRequests int default=10
+
+# Additional number of requests to queue for processing before new requests are rejected
+maxQueueSize int default=10
+
+# Use GPU
+useGpu bool default=false
+
+# Maximum number of model layers to run on GPU
+gpuLayers int default=1000000
+
+# Number of threads to use for CPU processing (-1 means use all available cores)
+# Not used for GPU processing
+threads int default=-1
+
+# Context size for the model
+# The context is divided evenly among parallel requests, so with 10 parallel requests each "slot" gets 1/10 of the context
+contextSize int default=512
+
+# Maximum number of tokens to process in one request - overridden by inference parameters
+maxTokens int default=512
+
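For reference, below is a minimal sketch of how a container component might consume these values. It assumes Vespa's standard config-class generation, where llm-local-client.def with package=ai.vespa.llm.clients yields a generated ai.vespa.llm.clients.LlmLocalClientConfig class; the component itself (LocalLlmClientSketch) and its wiring are illustrative, not the actual client added in this change.

// Hypothetical consumer of the generated LlmLocalClientConfig; a sketch only,
// not the real implementation behind this diff.
package ai.vespa.llm.clients;

import com.yahoo.component.AbstractComponent;

public class LocalLlmClientSketch extends AbstractComponent {

    public LocalLlmClientSketch(LlmLocalClientConfig config) {
        // A local file takes precedence over the model URL when set.
        String modelSource = config.localLlmFile().isEmpty()
                ? String.valueOf(config.modelUrl())  // url fields resolve to a locally downloaded file
                : config.localLlmFile();

        // threads = -1 means use all available cores for CPU inference.
        int threads = config.threads() < 0
                ? Runtime.getRuntime().availableProcessors()
                : config.threads();

        // The context window is split evenly across parallel slots: with the
        // defaults (contextSize=512, parallelRequests=10) each request sees
        // 512 / 10 = 51 tokens of context.
        int contextPerSlot = config.contextSize() / config.parallelRequests();

        // Requests beyond parallelRequests running plus maxQueueSize queued
        // would be rejected (20 total with the defaults).
        int capacity = config.parallelRequests() + config.maxQueueSize();

        // ... pass modelSource, threads, contextPerSlot, config.useGpu(),
        // config.gpuLayers() and config.maxTokens() on to the model loader.
    }
}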