author    Lester Solbakken <lester.solbakken@gmail.com>  2024-04-24 14:37:13 +0200
committer Lester Solbakken <lester.solbakken@gmail.com>  2024-04-24 14:37:13 +0200
commit    7cf79f73c247f02c21a859541b655c5503f8e985 (patch)
tree      62211aee88f47227bdcd64b6e212282c0bff524f /model-integration
parent    6ea9d8cb66f6e86583ac3f02ef1c13d9ff913e8e (diff)
Update defaults for local LLM config
Diffstat (limited to 'model-integration')
-rwxr-xr-x  model-integration/src/main/resources/configdefinitions/llm-local-client.def | 6
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/model-integration/src/main/resources/configdefinitions/llm-local-client.def b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
index c06c24b33e5..4823a53ec46 100755
--- a/model-integration/src/main/resources/configdefinitions/llm-local-client.def
+++ b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
@@ -5,13 +5,13 @@ package=ai.vespa.llm.clients
model model
# Maximum number of requests to handle in parallel per container node
-parallelRequests int default=10
+parallelRequests int default=1
# Additional number of requests to put in queue for processing before starting to reject new requests
maxQueueSize int default=10
# Use GPU
-useGpu bool default=false
+useGpu bool default=true
# Maximum number of model layers to run on GPU
gpuLayers int default=1000000
@@ -22,7 +22,7 @@ threads int default=-1
# Context size for the model
# Context is divided between parallel requests. So for 10 parallel requests, each "slot" gets 1/10 of the context
-contextSize int default=512
+contextSize int default=4096
# Maximum number of tokens to process in one request - overridden by inference parameters
maxTokens int default=512
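
With the new defaults, each container node serves one request at a time, offloads to GPU when available, and gives that single "slot" the full 4096-token context. Below is a minimal sketch of overriding these values in services.xml; the component id, class name, and model URL are assumptions for illustration, while the config name follows from package=ai.vespa.llm.clients and the file name llm-local-client.def:

    <!-- Hypothetical services.xml snippet; component class and model URL are placeholders -->
    <component id="llm" class="ai.vespa.llm.clients.LocalLLM">
      <config name="ai.vespa.llm.clients.llm-local-client">
        <model url="https://example.com/model.gguf"/> <!-- placeholder model location -->
        <parallelRequests>1</parallelRequests>        <!-- new default: one request per node -->
        <useGpu>true</useGpu>                         <!-- new default: use GPU when available -->
        <contextSize>4096</contextSize>               <!-- new default: context shared across slots -->
      </config>
    </component>

Values omitted from such an override fall back to the defaults defined in llm-local-client.def above.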