author | Lester Solbakken <lester.solbakken@gmail.com> | 2024-04-24 14:37:13 +0200
---|---|---
committer | Lester Solbakken <lester.solbakken@gmail.com> | 2024-04-24 14:37:13 +0200
commit | 7cf79f73c247f02c21a859541b655c5503f8e985 (patch) |
tree | 62211aee88f47227bdcd64b6e212282c0bff524f /model-integration |
parent | 6ea9d8cb66f6e86583ac3f02ef1c13d9ff913e8e (diff) |
Update defaults for local LLM config
Diffstat (limited to 'model-integration')
-rwxr-xr-x | model-integration/src/main/resources/configdefinitions/llm-local-client.def | 6
1 file changed, 3 insertions, 3 deletions
```diff
diff --git a/model-integration/src/main/resources/configdefinitions/llm-local-client.def b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
index c06c24b33e5..4823a53ec46 100755
--- a/model-integration/src/main/resources/configdefinitions/llm-local-client.def
+++ b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
@@ -5,13 +5,13 @@ package=ai.vespa.llm.clients
 model model

 # Maximum number of requests to handle in parallel pr container node
-parallelRequests int default=10
+parallelRequests int default=1

 # Additional number of requests to put in queue for processing before starting to reject new requests
 maxQueueSize int default=10

 # Use GPU
-useGpu bool default=false
+useGpu bool default=true

 # Maximum number of model layers to run on GPU
 gpuLayers int default=1000000
@@ -22,7 +22,7 @@ threads int default=-1

 # Context size for the model
 # Context is divided between parallel requests. So for 10 parallel requests, each "slot" gets 1/10 of the context
-contextSize int default=512
+contextSize int default=4096

 # Maximum number of tokens to process in one request - overriden by inference parameters
 maxTokens int default=512
```
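For context, Vespa applications can override the defaults in a config definition file from services.xml, using a config name derived from the definition's package and file name (here ai.vespa.llm.clients.llm-local-client). The sketch below shows what such an override might look like; the component id and the component class name are illustrative assumptions, not taken from this commit, and the values simply mirror the new defaults.

```xml
<!-- Sketch: overriding llm-local-client.def values in services.xml.
     The component id and class name below are assumptions for illustration. -->
<container id="default" version="1.0">
  <component id="local-llm" class="ai.vespa.llm.clients.LocalLLM">
    <config name="ai.vespa.llm.clients.llm-local-client">
      <parallelRequests>1</parallelRequests>   <!-- new default: one request at a time per container node -->
      <useGpu>true</useGpu>                    <!-- new default: run model layers on GPU when available -->
      <contextSize>4096</contextSize>          <!-- new default: context shared between parallel request slots -->
    </config>
  </component>
</container>
```

With the new defaults, an explicit override like this is only needed when an application wants different values, for example more parallel requests on a CPU-only node.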