Diffstat (limited to 'model-integration/src/main/resources/configdefinitions/llm-local-client.def')
-rwxr-xr-x  model-integration/src/main/resources/configdefinitions/llm-local-client.def  29
1 file changed, 29 insertions(+), 0 deletions(-)
diff --git a/model-integration/src/main/resources/configdefinitions/llm-local-client.def b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
new file mode 100755
index 00000000000..c06c24b33e5
--- /dev/null
+++ b/model-integration/src/main/resources/configdefinitions/llm-local-client.def
@@ -0,0 +1,29 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package=ai.vespa.llm.clients
+
+# The LLM model to use
+model model
+
+# Maximum number of requests to handle in parallel per container node
+parallelRequests int default=10
+
+# Number of additional requests to queue for processing before new requests are rejected
+maxQueueSize int default=10
+
+# Use GPU
+useGpu bool default=false
+
+# Maximum number of model layers to run on GPU
+gpuLayers int default=1000000
+
+# Number of threads to use for CPU processing - -1 means use all available cores
+# Not used for GPU processing
+threads int default=-1
+
+# Context size for the model
+# The context is divided between parallel requests, so with 10 parallel requests each "slot" gets 1/10 of the context
+contextSize int default=512
+
+# Maximum number of tokens to process in one request - overridden by inference parameters
+maxTokens int default=512
+
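
For context, here is a minimal sketch of how a container component might consume these settings. Vespa generates a Java config class from a .def file, which under the standard def-to-class naming would be ai.vespa.llm.clients.LlmLocalClientConfig here; the wrapper component below, its name, and its printed summary are illustrative assumptions, not part of this change.

package ai.vespa.llm.clients;

import com.yahoo.component.AbstractComponent;

// Illustrative only: LlmLocalClientConfig is the config class Vespa would
// generate from llm-local-client.def; this component itself is hypothetical.
public class LocalLlmConfigExample extends AbstractComponent {

    public LocalLlmConfigExample(LlmLocalClientConfig config) {
        // Each parallel request ("slot") gets an equal share of the context,
        // e.g. with the defaults: 512 / 10 = 51 tokens per slot.
        int slotContext = config.contextSize() / config.parallelRequests();

        // gpuLayers only takes effect when useGpu is true.
        int layersOnGpu = config.useGpu() ? config.gpuLayers() : 0;

        // threads=-1 means "use all available cores" for CPU inference.
        int threads = config.threads() >= 0
                ? config.threads()
                : Runtime.getRuntime().availableProcessors();

        System.out.printf("slots=%d slotContext=%d gpuLayers=%d threads=%d%n",
                config.parallelRequests(), slotContext, layersOnGpu, threads);
    }
}

Note the slot arithmetic above: because the context window is shared across slots, raising parallelRequests without also raising contextSize shrinks the context available to each request.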