summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- client/go/Makefile | 3
-rw-r--r-- client/go/internal/cli/cmd/config.go | 2
-rw-r--r-- client/go/internal/vespa/document/circuit_breaker.go | 26
-rw-r--r-- client/go/internal/vespa/document/dispatcher.go | 46
-rw-r--r-- client/go/internal/vespa/document/http.go | 14
-rw-r--r-- client/go/internal/vespa/document/http_test.go | 8
-rw-r--r-- config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java | 6
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java | 4
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java | 12
-rw-r--r-- controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/PathGroup.java | 12
-rw-r--r-- controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/Policy.java | 10
-rw-r--r-- linguistics/abi-spec.json | 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 10
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 40
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 11
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 19
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java | 1
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java | 1
-rw-r--r-- opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java | 41
-rw-r--r-- opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 19
-rw-r--r-- searchcore/src/tests/proton/matching/querynodes_test.cpp | 4
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp | 3
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/querynodes.h | 2
-rw-r--r-- searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp | 4
-rw-r--r-- searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp | 41
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp | 1
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/attribute_header.cpp | 9
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/attribute_header.h | 9
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp | 10
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/string_search_helper.h | 11
-rw-r--r-- searchlib/src/vespa/searchlib/fef/itermdata.h | 2
-rw-r--r-- searchlib/src/vespa/searchlib/fef/itermfielddata.h | 2
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_ucs4.cpp | 19
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_ucs4.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/queryeval/blueprint.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/queryeval/field_spec.h | 2
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp | 31
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 4
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h | 18
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h | 5
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp | 5
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp | 2
43 files changed, 320 insertions, 154 deletions
diff --git a/client/go/Makefile b/client/go/Makefile
index 9edfc940151..95da52c2383 100644
--- a/client/go/Makefile
+++ b/client/go/Makefile
@@ -86,7 +86,8 @@ $(DIST_TARGETS): DIST_NAME=vespa-cli_$(VERSION)_$(GOOS)_$(GOARCH)
$(DIST_TARGETS): dist-version ci manpages
$(DIST_TARGETS):
mkdir -p $(DIST)/$(DIST_NAME)/bin
- env CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build -o $(DIST)/$(DIST_NAME)/bin $(GO_FLAGS) ./...
+ env CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build -o $(DIST)/$(DIST_NAME)/bin $(GO_FLAGS) \
+ github.com/vespa-engine/vespa/client/go/internal/cli/cmd/vespa
cp -a $(PROJECT_ROOT)/LICENSE $(DIST)/$(DIST_NAME)
if [ "$(GOOS)" = "windows" ]; then \
cd $(DIST) && zip -r $(DIST)/$(DIST_NAME).zip $(DIST_NAME); \
diff --git a/client/go/internal/cli/cmd/config.go b/client/go/internal/cli/cmd/config.go
index 0e120546c8b..409254c4349 100644
--- a/client/go/internal/cli/cmd/config.go
+++ b/client/go/internal/cli/cmd/config.go
@@ -515,7 +515,7 @@ func (c *Config) readAPIKey(cli *CLI, system vespa.System, tenantName string) ([
if _, err := os.Stat(c.authConfigPath()); err == nil {
return nil, nil // We have auth config, so we should prefer Auth0 over API key
}
- cli.printWarning("Authenticating with API key. This is discouraged in non-CI environments", "Authenticate with 'vespa auth login' instead")
+ cli.printWarning("Authenticating with API key, intended for use in CI environments.", "Authenticate with 'vespa auth login' instead")
}
return os.ReadFile(c.apiKeyPath(tenantName))
}
diff --git a/client/go/internal/vespa/document/circuit_breaker.go b/client/go/internal/vespa/document/circuit_breaker.go
index 17fc595d58f..f7f0f4360df 100644
--- a/client/go/internal/vespa/document/circuit_breaker.go
+++ b/client/go/internal/vespa/document/circuit_breaker.go
@@ -27,38 +27,38 @@ type timeCircuitBreaker struct {
graceDuration time.Duration
doomDuration time.Duration
- failingSinceMillis int64
+ failingSinceMillis atomic.Int64
lastError atomic.Value
- halfOpen atomic.Value
- open atomic.Value
+ halfOpen atomic.Bool
+ open atomic.Bool
now func() time.Time
}
func (b *timeCircuitBreaker) Success() {
- atomic.StoreInt64(&b.failingSinceMillis, math.MaxInt64)
- if !b.open.Load().(bool) {
+ b.failingSinceMillis.Store(math.MaxInt64)
+ if !b.open.Load() {
b.halfOpen.CompareAndSwap(true, false)
}
}
func (b *timeCircuitBreaker) Error(err error) {
- if atomic.CompareAndSwapInt64(&b.failingSinceMillis, math.MaxInt64, b.now().UnixMilli()) {
+ if b.failingSinceMillis.CompareAndSwap(math.MaxInt64, b.now().UnixMilli()) {
b.lastError.Store(err)
}
}
func (b *timeCircuitBreaker) State() CircuitState {
- failingDuration := b.now().Sub(time.UnixMilli(atomic.LoadInt64(&b.failingSinceMillis)))
+ failingDuration := b.now().Sub(time.UnixMilli(b.failingSinceMillis.Load()))
if failingDuration > b.graceDuration {
b.halfOpen.CompareAndSwap(false, true)
}
if b.doomDuration > 0 && failingDuration > b.doomDuration {
b.open.CompareAndSwap(false, true)
}
- if b.open.Load().(bool) {
+ if b.open.Load() {
return CircuitOpen
- } else if b.halfOpen.Load().(bool) {
+ } else if b.halfOpen.Load() {
return CircuitHalfOpen
}
return CircuitClosed
@@ -66,11 +66,11 @@ func (b *timeCircuitBreaker) State() CircuitState {
func NewCircuitBreaker(graceDuration, doomDuration time.Duration) *timeCircuitBreaker {
b := &timeCircuitBreaker{
- graceDuration: graceDuration,
- doomDuration: doomDuration,
- now: time.Now,
- failingSinceMillis: math.MaxInt64,
+ graceDuration: graceDuration,
+ doomDuration: doomDuration,
+ now: time.Now,
}
+ b.failingSinceMillis.Store(math.MaxInt64)
b.open.Store(false)
b.halfOpen.Store(false)
return b
diff --git a/client/go/internal/vespa/document/dispatcher.go b/client/go/internal/vespa/document/dispatcher.go
index 2ad5b841616..7a19d21f278 100644
--- a/client/go/internal/vespa/document/dispatcher.go
+++ b/client/go/internal/vespa/document/dispatcher.go
@@ -20,7 +20,6 @@ type Dispatcher struct {
stats Stats
started bool
- ready chan documentOp
results chan documentOp
msgs chan string
@@ -29,7 +28,6 @@ type Dispatcher struct {
output io.Writer
verbose bool
- queuePool sync.Pool
mu sync.Mutex
statsMu sync.Mutex
wg sync.WaitGroup
@@ -57,7 +55,6 @@ func NewDispatcher(feeder Feeder, throttler Throttler, breaker CircuitBreaker, o
output: output,
verbose: verbose,
}
- d.queuePool.New = func() any { return NewQueue[documentOp]() }
d.start()
return d
}
@@ -110,23 +107,14 @@ func (d *Dispatcher) start() {
if d.started {
return
}
- d.ready = make(chan documentOp, 4096)
d.results = make(chan documentOp, 4096)
d.msgs = make(chan string, 4096)
d.started = true
- d.wg.Add(3)
- go d.dispatchReady()
+ d.wg.Add(2)
go d.processResults()
go d.printMessages()
}
-func (d *Dispatcher) dispatchReady() {
- defer d.wg.Done()
- for op := range d.ready {
- d.dispatch(op)
- }
-}
-
func (d *Dispatcher) dispatch(op documentOp) {
if !d.acceptDocument() {
d.msgs <- fmt.Sprintf("refusing to dispatch document %s: too many errors", op.document.Id.String())
@@ -163,13 +151,19 @@ func (d *Dispatcher) dispatchNext(id Id) {
if !ok {
panic("no queue exists for " + id.String() + ": this should not happen")
}
- if next, ok := q.Poll(); ok {
- // we have more operations with this ID: notify dispatcher about the next one
- d.ready <- next
- } else {
+ hasNext := q != nil
+ if hasNext {
+ next, ok := q.Poll()
+ if ok {
+ // we have more operations with this ID: dispatch the next one
+ d.dispatch(next)
+ } else {
+ hasNext = false
+ }
+ }
+ if !hasNext {
// no more operations with this ID: release slot
delete(d.inflight, k)
- d.queuePool.Put(q)
d.releaseSlot()
}
}
@@ -191,12 +185,15 @@ func (d *Dispatcher) enqueue(op documentOp, isRetry bool) error {
d.mu.Unlock()
return fmt.Errorf("refusing to enqueue document %s: too many errors", op.document.Id.String())
}
- key := op.document.Id.String()
- q, ok := d.inflight[key]
+ k := op.document.Id.String()
+ q, ok := d.inflight[k]
if !ok {
- q = d.queuePool.Get().(*Queue[documentOp])
- d.inflight[key] = q
+ d.inflight[k] = nil // track operation, but defer allocating queue until needed
} else {
+ if q == nil {
+ q = NewQueue[documentOp]()
+ d.inflight[k] = q
+ }
q.Add(op, isRetry)
}
if !isRetry {
@@ -204,9 +201,9 @@ func (d *Dispatcher) enqueue(op documentOp, isRetry bool) error {
}
d.mu.Unlock()
if !ok && !isRetry {
- // first operation with this ID: acquire slot
+ // first operation with this ID: acquire slot and dispatch
d.acquireSlot()
- d.ready <- op
+ d.dispatch(op)
d.throttler.Sent()
}
return nil
@@ -248,7 +245,6 @@ func (d *Dispatcher) Close() error {
d.inflightWg.Wait() // Wait for all inflight operations to complete
d.mu.Lock()
if d.started {
- close(d.ready)
close(d.results)
close(d.msgs)
d.started = false
diff --git a/client/go/internal/vespa/document/http.go b/client/go/internal/vespa/document/http.go
index ce57ac55f03..e083f017c4a 100644
--- a/client/go/internal/vespa/document/http.go
+++ b/client/go/internal/vespa/document/http.go
@@ -46,7 +46,7 @@ type Client struct {
options ClientOptions
httpClients []countingHTTPClient
now func() time.Time
- sendCount int32
+ sendCount atomic.Int32
gzippers sync.Pool
buffers sync.Pool
pending chan *pendingDocument
@@ -65,13 +65,11 @@ type ClientOptions struct {
type countingHTTPClient struct {
client util.HTTPClient
- inflight int64
+ inflight atomic.Int64
}
-func (c *countingHTTPClient) addInflight(n int64) { atomic.AddInt64(&c.inflight, n) }
-
func (c *countingHTTPClient) Do(req *http.Request, timeout time.Duration) (*http.Response, error) {
- defer c.addInflight(-1)
+ defer c.inflight.Add(-1)
return c.client.Do(req, timeout)
}
@@ -186,18 +184,18 @@ func (c *Client) methodAndURL(d Document, sb *bytes.Buffer) (string, string) {
func (c *Client) leastBusyClient() *countingHTTPClient {
leastBusy := c.httpClients[0]
min := int64(math.MaxInt64)
- next := atomic.AddInt32(&c.sendCount, 1)
+ next := c.sendCount.Add(1)
start := int(next) % len(c.httpClients)
for i := range c.httpClients {
j := (i + start) % len(c.httpClients)
client := c.httpClients[j]
- inflight := atomic.LoadInt64(&client.inflight)
+ inflight := client.inflight.Load()
if inflight < min {
leastBusy = client
min = inflight
}
}
- leastBusy.addInflight(1)
+ leastBusy.inflight.Add(1)
return &leastBusy
}
diff --git a/client/go/internal/vespa/document/http_test.go b/client/go/internal/vespa/document/http_test.go
index 6eda5f04fd6..1bc3a6c9f39 100644
--- a/client/go/internal/vespa/document/http_test.go
+++ b/client/go/internal/vespa/document/http_test.go
@@ -36,13 +36,13 @@ func TestLeastBusyClient(t *testing.T) {
httpClients = append(httpClients, &mockHTTPClient{i, &httpClient})
}
client, _ := NewClient(ClientOptions{}, httpClients)
- client.httpClients[0].addInflight(1)
- client.httpClients[1].addInflight(1)
+ client.httpClients[0].inflight.Add(1)
+ client.httpClients[1].inflight.Add(1)
assertLeastBusy(t, 2, client)
assertLeastBusy(t, 2, client)
assertLeastBusy(t, 3, client)
- client.httpClients[3].addInflight(1)
- client.httpClients[1].addInflight(-1)
+ client.httpClients[3].inflight.Add(1)
+ client.httpClients[1].inflight.Add(-1)
assertLeastBusy(t, 1, client)
}
diff --git a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
index c143aa43d53..06ea202b9c3 100644
--- a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
@@ -52,7 +52,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting1() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
@@ -63,7 +63,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting2() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
@@ -74,7 +74,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting3() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
index c1d415b8e27..01bb606e9ee 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
@@ -107,7 +107,9 @@ public final class Tokenizer {
if (i >= source.length()) break;
int c = source.codePointAt(i);
- if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) {
+ if (characterClasses.isSymbol(c)) { // treat each symbol as a separate word
+ addToken(WORD, Character.toString(c), i, i + 1);
+ } else if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) {
i = consumeWordOrNumber(i, currentIndex);
} else if (Character.isWhitespace(c)) {
addToken(SPACE, " ", i, i + 1);
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
index 583e89bacd6..f35ffcee0c6 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
@@ -2580,4 +2580,16 @@ public class ParseTestCase {
void testNoGrammar4() {
tester.assertParsed("WEAKAND(100) foo bar baz one two 37", "foo -(bar baz \"one two\" 37)", Query.Type.TOKENIZE);
}
+
+ @Test
+ void testEmojis() {
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ String emoji2 = "\uD83D\uDE00"; // 😀
+
+ tester.assertParsed(emoji1, emoji1, Query.Type.ANY);
+ tester.assertParsed(emoji2, emoji2, Query.Type.ANY);
+ tester.assertParsed("AND " + emoji1 + " " + emoji2, emoji1 + emoji2, Query.Type.ANY);
+ tester.assertParsed("AND " + emoji1 + " foo " + emoji2, emoji1 + "foo" + emoji2, Query.Type.ANY);
+ }
+
}
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/PathGroup.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/PathGroup.java
index ac895022130..ccf79e7eca3 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/PathGroup.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/PathGroup.java
@@ -21,6 +21,9 @@ enum PathGroup {
/** Paths exclusive to operators (including read), used for system management. */
classifiedOperator("/application/v4/notifications",
+ "/routing/v1/",
+ "/routing/v1/status/environment/{*}",
+ "/routing/v1/inactive/environment/{*}",
"/configserver/v1/{*}",
"/deployment/v1/{*}"),
@@ -34,9 +37,6 @@ enum PathGroup {
"/os/v1/{*}",
"/provision/v2/{*}",
"/zone/v2/{*}",
- "/routing/v1/",
- "/routing/v1/status/environment/{*}",
- "/routing/v1/inactive/environment/{*}",
"/state/v1/{*}",
"/changemanagement/v1/{*}"),
@@ -139,8 +139,10 @@ enum PathGroup {
"/application/v4/tenant/{tenant}/application/{application}/environment/{environment}/region/{region}/instance/{ignored}/suspended",
"/application/v4/tenant/{tenant}/application/{application}/environment/{environment}/region/{region}/instance/{ignored}/service/{*}",
"/application/v4/tenant/{tenant}/application/{application}/environment/{environment}/region/{region}/instance/{ignored}/global-rotation/{*}",
- "/application/v4/tenant/{tenant}/application/{application}/metering",
- "/routing/v1/inactive/tenant/{tenant}/application/{application}/instance/{ignored}/environment/prod/region/{region}"),
+ "/application/v4/tenant/{tenant}/application/{application}/metering"),
+
+ applicationRouting(Matcher.tenant,
+ Matcher.application, "/routing/v1/inactive/tenant/{tenant}/application/{application}/instance/{ignored}/environment/prod/region/{region}"),
// TODO jonmv: remove
/** Path used to restart development nodes. */
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/Policy.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/Policy.java
index 9a28226c921..2f8ea368b21 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/Policy.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/role/Policy.java
@@ -33,10 +33,10 @@ enum Policy {
/** Full access to everything. */
supporter(Privilege.grant(Action.read)
- .on(PathGroup.allExcept(PathGroup.classifiedOperator))
+ .on(PathGroup.allExcept(PathGroup.classifiedOperator, PathGroup.applicationRouting))
.in(SystemName.all()),
Privilege.grant(Action.all())
- .on(PathGroup.classifiedOperator)
+ .on(PathGroup.classifiedOperator, PathGroup.applicationRouting)
.in(SystemName.all())),
/** Full access to user management for a tenant in select systems. */
@@ -87,12 +87,12 @@ enum Policy {
/** Read access to application information and settings. */
applicationRead(Privilege.grant(Action.read)
- .on(PathGroup.application, PathGroup.applicationInfo, PathGroup.reindexing, PathGroup.serviceDump, PathGroup.dropDocuments)
+ .on(PathGroup.application, PathGroup.applicationInfo, PathGroup.applicationRouting, PathGroup.reindexing, PathGroup.serviceDump, PathGroup.dropDocuments)
.in(SystemName.all())),
/** Update access to application information and settings. */
applicationUpdate(Privilege.grant(Action.update)
- .on(PathGroup.application, PathGroup.applicationInfo)
+ .on(PathGroup.application, PathGroup.applicationInfo, PathGroup.applicationRouting)
.in(SystemName.all())),
/** Access to delete a certain application. */
@@ -102,7 +102,7 @@ enum Policy {
/** Full access to application information and settings. */
applicationOperations(Privilege.grant(Action.write())
- .on(PathGroup.applicationInfo, PathGroup.productionRestart, PathGroup.reindexing, PathGroup.serviceDump, PathGroup.dropDocuments)
+ .on(PathGroup.applicationInfo, PathGroup.applicationRouting, PathGroup.productionRestart, PathGroup.reindexing, PathGroup.serviceDump, PathGroup.dropDocuments)
.in(SystemName.all())),
/** Access to create and delete developer and deploy keys under a tenant. */
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json
index f35b9036fd8..a6aa902c688 100644
--- a/linguistics/abi-spec.json
+++ b/linguistics/abi-spec.json
@@ -322,6 +322,7 @@
"methods" : [
"public void <init>()",
"public boolean isLetter(int)",
+ "public boolean isSymbol(int)",
"public boolean isDigit(int)",
"public boolean isLatinDigit(int)",
"public boolean isLatin(int)",
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 413dce0d6c1..f6177262bf9 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -13,9 +13,8 @@ public class CharacterClasses {
* which are useful to view as letters even though not defined as such in unicode.
*/
public boolean isLetter(int c) {
- if (java.lang.Character.isLetter(c)) return true;
+ if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- // if (c == '_') return true;
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
@@ -30,6 +29,13 @@ public class CharacterClasses {
}
/**
+ * Returns true if the character is in the class "other symbol" - emojis etc.
+ */
+ public boolean isSymbol(int c) {
+ return Character.getType(c) == Character.OTHER_SYMBOL;
+ }
+
+ /**
* Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
*/
public boolean isDigit(int c) {
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 83110c0021e..210d7ac94ff 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -88,46 +88,54 @@ public class GramSplitter {
}
private Gram findNext() {
- // Skip to next word character
- while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
+ // Skip to next indexable character
+ while (i < input.length() && !isIndexable(input.codePointAt(i))) {
i = input.next(i);
isFirstAfterSeparator = true;
}
- if (i >= input.length()) return null;
-
- UnicodeString gram = input.substring(i, n);
- int nonWordChar = indexOfNonWordCodepoint(gram);
- if (nonWordChar == 0) throw new RuntimeException("Programming error");
-
- if (nonWordChar > 0)
- gram = new UnicodeString(gram.toString().substring(0, nonWordChar));
+ if (i >= input.length()) return null; // no indexable characters
+ int tokenStart = i;
+ UnicodeString gram = input.substring(tokenStart, n);
+ int tokenEnd = tokenEnd(gram);
+ gram = new UnicodeString(gram.toString().substring(0, tokenEnd));
if (gram.codePointCount() == n) { // normal case: got a full length gram
Gram g = new Gram(i, gram.codePointCount());
i = input.next(i);
isFirstAfterSeparator = false;
return g;
}
- else { // gram is too short due either to a non-word separator or end of string
- if (isFirstAfterSeparator) { // make a gram anyway
+ else { // gram is too short due either to being a symbol, being followed by a non-word separator, or end of string
+ if (isFirstAfterSeparator || ( gram.codePointCount() == 1 && characterClasses.isSymbol(gram.codePointAt(0)))) { // make a gram anyway
Gram g = new Gram(i, gram.codePointCount());
i = input.next(i);
isFirstAfterSeparator = false;
return g;
} else { // skip to next
- i = input.skip(gram.codePointCount() + 1, i);
+ i = input.skip(gram.codePointCount(), i);
isFirstAfterSeparator = true;
return findNext();
}
}
}
- private int indexOfNonWordCodepoint(UnicodeString s) {
- for (int i = 0; i < s.length(); i = s.next(i)) {
+ private boolean isIndexable(int codepoint) {
+ if (characterClasses.isLetterOrDigit(codepoint)) return true;
+ if (characterClasses.isSymbol(codepoint)) return true;
+ return false;
+ }
+
+ /** Given a string s starting with an indexable character, returns the position where that token should end. */
+ private int tokenEnd(UnicodeString s) {
+ if (characterClasses.isSymbol(s.codePointAt(0)))
+ return s.next(0); // symbols have length 1
+
+ int i = 0;
+ for (; i < s.length(); i = s.next(i)) {
if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
return i;
}
- return -1;
+ return i;
}
@Override
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index 6cefcfbf67a..a219efce3cd 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -49,6 +49,17 @@ public class GramSplitterTestCase {
}
@Test
+ public void testEmojis() {
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ assertGramSplit(emoji1, 2, "[" + emoji1+ "]");
+ assertGramSplit(emoji1 + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit(emoji1 + "." + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit("." + emoji1 + "." + emoji2 + ".", 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit("foo" + emoji1 + "bar" + emoji2 + "baz", 2, "[fo, oo, " + emoji1 + ", ba, ar, " + emoji2 + ", ba, az]");
+ }
+
+ @Test
public void testSpaceCornerCases() {
// space corner cases
assertGramSplit("e en e", 1, "[e, e, n, e]");
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..b4f080405bd 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -1,10 +1,18 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
+import com.yahoo.language.Language;
import com.yahoo.language.process.AbstractTokenizerTestCase;
import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
import org.junit.Test;
+import java.util.Iterator;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
/**
* @author Steinar Knutsen
* @author bratseth
@@ -33,4 +41,15 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ tester.assertTokens(emoji1, emoji1);
+ tester.assertTokens(emoji1 + "foo", emoji1, "foo");
+ tester.assertTokens(emoji1 + emoji2, emoji1, emoji2);
+ }
+
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 64efeb85e63..1ff1b5c8ffe 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -619,8 +619,10 @@ public class NodeAgentImpl implements NodeAgent {
} catch (OrchestratorException e) {
// Ensure the ACLs are up to date: The reason we're unable to suspend may be because some other
// node is unable to resume because the ACL rules of SOME Docker container are wrong...
+ // Same can happen with stale WireGuard config, so update that too
try {
aclMaintainer.ifPresent(maintainer -> maintainer.converge(context));
+ wireguardTasks.forEach(task -> getContainer(context).ifPresent(c -> task.converge(context, c.id())));
} catch (RuntimeException suppressed) {
logger.log(Level.WARNING, "Suppressing ACL update failure: " + suppressed);
e.addSuppressed(suppressed);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index fd6b15609d6..1a97f1e02f7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -755,7 +755,6 @@ public class Nodes {
if ( ! host.type().canRun(NodeType.tenant)) return false;
if (host.status().wantToRetire()) return false;
if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
- if (suspended(host)) return false;
if (dynamicProvisioning)
return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 9dcc564190b..57d859df476 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -140,6 +140,7 @@ public class NodePrioritizer {
for (Node host : allNodes) {
if ( ! nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
+ if (nodes.suspended(host)) continue; // Hosts that are suspended may be down for some time, e.g. for OS upgrade
if (host.reservedTo().isPresent() && !host.reservedTo().get().equals(application.tenant())) continue;
if (host.reservedTo().isPresent() && application.instance().isTester()) continue;
if (host.exclusiveToApplicationId().isPresent()) continue; // Never allocate new nodes to exclusive hosts
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 8080dc92729..5452da71775 100644
--- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -25,7 +25,6 @@ import java.util.List;
*/
public class OpenNlpTokenizer implements Tokenizer {
- private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -74,26 +73,26 @@ public class OpenNlpTokenizer implements Tokenizer {
}
private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
- switch (language) {
- case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
- case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
- case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
- case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
- case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
- case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
- case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
- case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
- case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
- case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
- case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
- case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
- case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
- case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
- case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
- default: return null;
- }
+ return switch (language) {
+ case DANISH -> SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH -> SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH -> SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH -> SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN -> SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH -> SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH -> SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH -> SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH;
+ default -> null;
+ };
}
}
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index a5daf7f0531..78412f94fd4 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testIndexability() {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
- for (StemMode stemMode : new StemMode[] { StemMode.NONE,
- StemMode.SHORTEST }) {
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) {
for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
@@ -165,6 +164,22 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+    public void testTokenizeEmojis() {
+        // A single emoji (two UTF-16 code units) must come back as exactly one token.
+        String knife = "\uD83D\uDD2A"; // 🔪
+        Iterator<Token> single = tokenizer.tokenize(knife, Language.ENGLISH, StemMode.ALL, true).iterator();
+        assertTrue(single.hasNext());
+        Token onlyToken = single.next();
+        assertEquals(knife, onlyToken.getTokenString());
+        assertFalse(single.hasNext());
+
+        // Two adjacent emojis must yield one token each, in input order, and nothing more.
+        String grin = "\uD83D\uDE00"; // 😀
+        Iterator<Token> pair = tokenizer.tokenize(knife + grin, Language.ENGLISH, StemMode.ALL, true).iterator();
+        assertTrue(pair.hasNext());
+        Token first = pair.next();
+        assertEquals(knife, first.getTokenString());
+        Token second = pair.next();
+        assertEquals(grin, second.getTokenString());
+        assertFalse(pair.hasNext());
+    }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);
diff --git a/searchcore/src/tests/proton/matching/querynodes_test.cpp b/searchcore/src/tests/proton/matching/querynodes_test.cpp
index 15fcc8a3fd7..3c9220bcdb8 100644
--- a/searchcore/src/tests/proton/matching/querynodes_test.cpp
+++ b/searchcore/src/tests/proton/matching/querynodes_test.cpp
@@ -520,9 +520,9 @@ TEST("requireThatSimpleIntermediatesGetProperBlending") {
TEST("control query nodes size") {
EXPECT_EQUAL(160u, sizeof(search::query::NumberTerm));
- EXPECT_EQUAL(192u, sizeof(ProtonNodeTypes::NumberTerm));
+ EXPECT_EQUAL(280u, sizeof(ProtonNodeTypes::NumberTerm));
EXPECT_EQUAL(160u, sizeof(search::query::StringTerm));
- EXPECT_EQUAL(192u, sizeof(ProtonNodeTypes::StringTerm));
+ EXPECT_EQUAL(280u, sizeof(ProtonNodeTypes::StringTerm));
}
} // namespace
diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
index 03e15830ac5..68845cf7f7f 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
@@ -23,7 +23,7 @@ struct Mixer {
Mixer() : attributes() {}
void addAttribute(Blueprint::UP attr) {
- if (attributes.get() == 0) {
+ if ( ! attributes) {
attributes = std::make_unique<OrBlueprint>();
}
attributes->addChild(std::move(attr));
@@ -62,6 +62,7 @@ private:
void buildChildren(IntermediateBlueprint &parent,
const std::vector<search::query::Node *> &children)
{
+ parent.reserve(children.size());
for (size_t i = 0; i < children.size(); ++i) {
parent.addChild(BlueprintBuilder::build(_requestContext, *children[i], _context));
}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
index 03b274b7233..0e01884d504 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
@@ -44,7 +44,7 @@ public:
};
private:
- std::vector<FieldEntry> _fields;
+ vespalib::SmallVector<FieldEntry, 1u> _fields;
void propagate_document_frequency(uint32_t matching_count_doc, uint32_t total_doc_count);
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
index 5ba90d2b077..6e5971ea81d 100644
--- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
+++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
@@ -389,8 +389,8 @@ testSingleValue(Attribute & svsa, Config &cfg)
TEST("testSingleValue")
{
EXPECT_EQUAL(24u, sizeof(SearchContext));
- EXPECT_EQUAL(32u, sizeof(StringSearchHelper));
- EXPECT_EQUAL(88u, sizeof(attribute::SingleStringEnumSearchContext));
+ EXPECT_EQUAL(40u, sizeof(StringSearchHelper));
+ EXPECT_EQUAL(96u, sizeof(attribute::SingleStringEnumSearchContext));
{
Config cfg(BasicType::STRING, CollectionType::SINGLE);
SingleValueStringAttribute svsa("svsa", cfg);
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index e3c9e05073e..841d7f92b62 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -9,6 +9,7 @@
#include <vespa/searchlib/tensor/doc_vector_access.h>
#include <vespa/searchlib/tensor/distance_functions.h>
#include <vespa/searchlib/tensor/hnsw_index.h>
+#include <vespa/searchlib/tensor/mips_distance_transform.h>
#include <vespa/searchlib/tensor/nearest_neighbor_index.h>
#include <vespa/searchlib/tensor/nearest_neighbor_index_factory.h>
#include <vespa/searchlib/tensor/nearest_neighbor_index_loader.h>
@@ -54,6 +55,7 @@ using search::tensor::DocVectorAccess;
using search::tensor::HnswIndex;
using search::tensor::HnswIndexType;
using search::tensor::HnswTestNode;
+using search::tensor::MipsDistanceFunctionFactoryBase;
using search::tensor::NearestNeighborIndex;
using search::tensor::NearestNeighborIndexFactory;
using search::tensor::NearestNeighborIndexLoader;
@@ -285,13 +287,15 @@ public:
void populate_address_space_usage(AddressSpaceUsage&) const override {}
void get_state(const vespalib::slime::Inserter&) const override {}
void shrink_lid_space(uint32_t) override { }
- std::unique_ptr<NearestNeighborIndexSaver> make_saver() const override {
+ std::unique_ptr<NearestNeighborIndexSaver> make_saver(vespalib::GenericHeader& header) const override {
+ (void) header;
if (_index_value != 0) {
return std::make_unique<MockIndexSaver>(_index_value);
}
return std::unique_ptr<NearestNeighborIndexSaver>();
}
- std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file) override {
+ std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file, const vespalib::GenericHeader& header) override {
+ (void) header;
return std::make_unique<MockIndexLoader>(_index_value, file);
}
std::vector<Neighbor> find_top_k(uint32_t k,
@@ -342,12 +346,15 @@ class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory {
const vespalib::string test_dir = "test_data/";
const vespalib::string attr_name = test_dir + "my_attr";
+const vespalib::string hnsw_max_squared_norm = "hnsw.max_squared_norm";
+
struct FixtureTraits {
bool use_dense_tensor_attribute = false;
bool use_direct_tensor_attribute = false;
bool enable_hnsw_index = false;
bool use_mock_index = false;
bool use_mmap_file_allocator = false;
+ bool use_mips_distance = false;
FixtureTraits dense() && {
use_dense_tensor_attribute = true;
@@ -381,6 +388,14 @@ struct FixtureTraits {
return *this;
}
+    // Fixture preset: dense tensor attribute with a real (non-mock) HNSW index
+    // and the MIPS distance flag switched on.
+    FixtureTraits mips_hnsw() && {
+        use_mips_distance = true;
+        use_dense_tensor_attribute = true;
+        enable_hnsw_index = true;
+        use_mock_index = false;
+        return *this;
+    }
+
FixtureTraits direct() && {
use_dense_tensor_attribute = false;
use_direct_tensor_attribute = true;
@@ -606,8 +621,9 @@ Fixture::Fixture(const vespalib::string &typeSpec, FixtureTraits traits)
_mmap_allocator_base_dir("mmap-file-allocator-factory-dir")
{
if (traits.enable_hnsw_index) {
- _cfg.set_distance_metric(DistanceMetric::Euclidean);
- _cfg.set_hnsw_index_params(HnswIndexParams(4, 20, DistanceMetric::Euclidean));
+ auto dm = traits.use_mips_distance ? DistanceMetric::Dotproduct : DistanceMetric::Euclidean;
+ _cfg.set_distance_metric(dm);
+ _cfg.set_hnsw_index_params(HnswIndexParams(4, 20, dm));
}
vespalib::alloc::MmapFileAllocatorFactory::instance().setup(_mmap_allocator_base_dir);
setup();
@@ -1254,6 +1270,23 @@ TEST_F("Nearest neighbor index type is added to attribute file header", DenseTen
EXPECT_EQUAL("hnsw", header.getTag("nearest_neighbor_index").asString());
}
+// Fixture over vec_2d_spec configured with the mips_hnsw() traits preset.
+class DenseTensorAttributeMipsIndex : public Fixture {
+public:
+    DenseTensorAttributeMipsIndex()
+        : Fixture(vec_2d_spec, FixtureTraits().mips_hnsw())
+    {
+    }
+};
+
+TEST_F("Nearest neighbor index with mips distance metrics stores square of max distance", DenseTensorAttributeMipsIndex)
+{
+    // Save path: the maximum squared norm must appear as a tag in the file header.
+    f.set_example_tensors();
+    f.save();
+    auto saved_header = f.get_file_header();
+    EXPECT_TRUE(saved_header.hasTag(hnsw_max_squared_norm));
+    EXPECT_EQUAL(130.0, saved_header.getTag(hnsw_max_squared_norm).asFloat());
+
+    // Load path: after loading, the factory's norm store must report the same value.
+    f.load();
+    auto& mips_factory = dynamic_cast<MipsDistanceFunctionFactoryBase&>(f.hnsw_index().distance_function_factory());
+    auto& restored_store = mips_factory.get_max_squared_norm_store();
+    EXPECT_EQUAL(130.0, restored_store.get_max());
+}
+
template <typename ParentT>
class NearestNeighborBlueprintFixtureBase : public ParentT {
private:
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index 9d1ec1b37a8..7a622030d98 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -860,6 +860,7 @@ void
CreateBlueprintVisitor::createShallowWeightedSet(WS *bp, MultiTerm &n, const FieldSpec &fs, bool isInteger) {
Blueprint::UP result(bp);
SearchContextParams scParams = createContextParams();
+ bp->reserve(n.getNumTerms());
for (uint32_t i(0); i < n.getNumTerms(); i++) {
FieldSpec childfs = bp->getNextChildField(fs);
auto term = n.getAsString(i);
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
index 122c2c0c55e..cde1686828f 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
@@ -5,6 +5,8 @@
#include <vespa/vespalib/data/databuffer.h>
#include <vespa/vespalib/util/exceptions.h>
+using vespalib::GenericHeader;
+
namespace search::attribute {
namespace {
@@ -57,7 +59,8 @@ AttributeHeader::AttributeHeader(const vespalib::string &fileName)
_uniqueValueCount(0),
_totalValueCount(0),
_createSerialNum(0u),
- _version(0)
+ _version(0),
+ _extra_tags()
{
}
@@ -244,6 +247,10 @@ AttributeHeader::addTags(vespalib::GenericHeader &header) const
header.putTag(Tag(predicateLowerBoundTag, params.lower_bound()));
header.putTag(Tag(predicateUpperBoundTag, params.upper_bound()));
}
+ for (uint32_t i = 0; i < _extra_tags.getNumTags(); ++i) {
+ auto& tag = _extra_tags.getTag(i);
+ header.putTag(tag);
+ }
}
bool
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.h b/searchlib/src/vespa/searchlib/attribute/attribute_header.h
index 7c0b8f3084b..8c5a0edc6a6 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.h
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.h
@@ -2,16 +2,15 @@
#pragma once
-#include <vespa/vespalib/stllike/string.h>
+#include <vespa/eval/eval/value_type.h>
#include <vespa/searchcommon/attribute/basictype.h>
#include <vespa/searchcommon/attribute/collectiontype.h>
#include <vespa/searchcommon/attribute/hnsw_index_params.h>
#include <vespa/searchcommon/attribute/predicate_params.h>
-#include <vespa/eval/eval/value_type.h>
+#include <vespa/vespalib/data/fileheader.h>
+#include <vespa/vespalib/stllike/string.h>
#include <optional>
-namespace vespalib { class GenericHeader; }
-
namespace search::attribute {
/**
@@ -34,6 +33,7 @@ private:
uint64_t _totalValueCount;
uint64_t _createSerialNum;
uint32_t _version;
+ vespalib::GenericHeader _extra_tags;
void internalExtractTags(const vespalib::GenericHeader &header);
public:
@@ -71,6 +71,7 @@ public:
const std::optional<HnswIndexParams>& get_hnsw_index_params() const { return _hnsw_index_params; }
static AttributeHeader extractTags(const vespalib::GenericHeader &header, const vespalib::string &file_name);
void addTags(vespalib::GenericHeader &header) const;
+ vespalib::GenericHeader& get_extra_tags() noexcept { return _extra_tags; }
};
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index 206c2bcbd69..17a0e6256d4 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -29,10 +29,10 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
term.getFuzzyPrefixLength(),
isCased());
} else if (isCased()) {
- _term._char = term.getTerm();
+ _term = term.getTerm();
_termLen = term.getTermLen();
} else {
- term.term(_term._ucs4);
+ _ucs4 = term.asUcs4();
}
}
@@ -49,7 +49,7 @@ StringSearchHelper::isMatch(const char *src) const {
return getFuzzyMatcher().isMatch(src);
}
if (__builtin_expect(isCased(), false)) {
- int res = strncmp(_term._char, src, _termLen);
+ int res = strncmp(_term, src, _termLen);
return (res == 0) && (src[_termLen] == 0 || isPrefix());
}
vespalib::Utf8ReaderForZTS u8reader(src);
@@ -58,11 +58,11 @@ StringSearchHelper::isMatch(const char *src) const {
for (;; ++j) {
val = u8reader.getChar();
val = vespalib::LowerCase::convert(val);
- if (_term._ucs4[j] == 0 || _term._ucs4[j] != val) {
+ if (_ucs4[j] == 0 || _ucs4[j] != val) {
break;
}
}
- return (_term._ucs4[j] == 0 && (val == 0 || isPrefix()));
+ return (_ucs4[j] == 0 && (val == 0 || isPrefix()));
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
index 4d69b61449e..7bfcf0e4292 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h
@@ -16,6 +16,7 @@ namespace search::attribute {
*/
class StringSearchHelper {
public:
+ using FuzzyMatcher = vespalib::FuzzyMatcher;
StringSearchHelper(QueryTermUCS4 & qTerm, bool cased);
StringSearchHelper(StringSearchHelper&&) noexcept;
StringSearchHelper(const StringSearchHelper &) = delete;
@@ -27,14 +28,12 @@ public:
bool isCased() const noexcept { return _isCased; }
bool isFuzzy() const noexcept { return _isFuzzy; }
const vespalib::Regex & getRegex() const noexcept { return _regex; }
- const vespalib::FuzzyMatcher & getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; }
+ const FuzzyMatcher & getFuzzyMatcher() const noexcept { return *_fuzzyMatcher; }
private:
vespalib::Regex _regex;
- std::unique_ptr<vespalib::FuzzyMatcher> _fuzzyMatcher;
- union {
- const ucs4_t *_ucs4;
- const char *_char;
- } _term;
+ std::unique_ptr<FuzzyMatcher> _fuzzyMatcher;
+ std::unique_ptr<ucs4_t[]> _ucs4;
+ const char * _term;
uint32_t _termLen;
bool _isPrefix;
bool _isRegex;
diff --git a/searchlib/src/vespa/searchlib/fef/itermdata.h b/searchlib/src/vespa/searchlib/fef/itermdata.h
index 306c91f7ab2..9a063cf93ee 100644
--- a/searchlib/src/vespa/searchlib/fef/itermdata.h
+++ b/searchlib/src/vespa/searchlib/fef/itermdata.h
@@ -16,7 +16,7 @@ namespace search::fef {
class ITermData
{
protected:
- virtual ~ITermData() {}
+ virtual ~ITermData() = default;
public:
/**
diff --git a/searchlib/src/vespa/searchlib/fef/itermfielddata.h b/searchlib/src/vespa/searchlib/fef/itermfielddata.h
index 057a5794fa9..88fa8c5f781 100644
--- a/searchlib/src/vespa/searchlib/fef/itermfielddata.h
+++ b/searchlib/src/vespa/searchlib/fef/itermfielddata.h
@@ -76,7 +76,7 @@ public:
**/
virtual TermFieldHandle getHandle(MatchDataDetails requested_details) const = 0;
protected:
- virtual ~ITermFieldData() {}
+ virtual ~ITermFieldData() = default;
private:
uint32_t _fieldId;
uint32_t _matching_doc_count;
diff --git a/searchlib/src/vespa/searchlib/query/query_term_ucs4.cpp b/searchlib/src/vespa/searchlib/query/query_term_ucs4.cpp
index e68685bd78c..8c3c2514877 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_ucs4.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_term_ucs4.cpp
@@ -38,17 +38,26 @@ QueryTermUCS4::fillUCS4() {
* This is a 'dirty' optimisation, but this is done to avoid writing a lot of data and blow the cpu caches with something
* you do not really need most of the time. That matters when qps is very high and query is wide, and hits are few.
*/
- std::lock_guard guard(_globalMutex);
- ucs4_t * ucs4 = _termUCS4.load(std::memory_order_relaxed);
- if (ucs4 != nullptr) return ucs4;
- ucs4 = new ucs4_t[_cachedTermLen + 1];
+ std::unique_ptr<ucs4_t[]> ucs4 = asUcs4();
+ ucs4_t * next = ucs4.get();
+ {
+ std::lock_guard guard(_globalMutex);
+ ucs4_t *prev = _termUCS4.load(std::memory_order_relaxed);
+ if (prev != nullptr) return prev;
+ _termUCS4.store(ucs4.release(), std::memory_order_relaxed);
+ }
+ return next;
+}
+
+std::unique_ptr<ucs4_t[]>
+QueryTermUCS4::asUcs4() const {
+ auto ucs4 = std::make_unique<ucs4_t[]>(_cachedTermLen + 1);
vespalib::Utf8Reader r(getTermString());
uint32_t i(0);
while (r.hasMore()) {
ucs4[i++] = r.getChar();
}
ucs4[_cachedTermLen] = 0;
- _termUCS4.store(ucs4);
return ucs4;
}
diff --git a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
index 0639ce8a74c..673927cf685 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
@@ -21,6 +21,7 @@ public:
uint32_t getTermLen() const { return _cachedTermLen; }
uint32_t term(const char * & t) const { t = getTerm(); return _cachedTermLen; }
void visitMembers(vespalib::ObjectVisitor &visitor) const override;
+ std::unique_ptr<ucs4_t[]> asUcs4() const;
uint32_t term(const ucs4_t * & t) {
t = _termUCS4.load(std::memory_order_relaxed);
if (t == nullptr) {
diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.h b/searchlib/src/vespa/searchlib/queryeval/blueprint.h
index 1ea02e41a62..dc7a0992d82 100644
--- a/searchlib/src/vespa/searchlib/queryeval/blueprint.h
+++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.h
@@ -331,6 +331,7 @@ public:
size_t childCnt() const { return _children.size(); }
const Blueprint &getChild(size_t n) const { return *_children[n]; }
Blueprint &getChild(size_t n) { return *_children[n]; }
+ void reserve(size_t sz) { _children.reserve(sz); }
IntermediateBlueprint & insertChild(size_t n, Blueprint::UP child);
IntermediateBlueprint &addChild(Blueprint::UP child);
Blueprint::UP removeChild(size_t n);
diff --git a/searchlib/src/vespa/searchlib/queryeval/field_spec.h b/searchlib/src/vespa/searchlib/queryeval/field_spec.h
index a1050209b41..fd925fdf4ff 100644
--- a/searchlib/src/vespa/searchlib/queryeval/field_spec.h
+++ b/searchlib/src/vespa/searchlib/queryeval/field_spec.h
@@ -86,7 +86,7 @@ public:
class FieldSpecList
{
private:
- std::vector<FieldSpec> _list;
+ vespalib::SmallVector<FieldSpec, 1> _list;
public:
FieldSpecList() = default;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 3fdad3d507b..3e0cd71be8b 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -6,6 +6,7 @@
#include "hash_set_visited_tracker.h"
#include "hnsw_index_loader.hpp"
#include "hnsw_index_saver.h"
+#include "mips_distance_transform.h"
#include "random_level_generator.h"
#include "vector_bundle.h"
#include <vespa/searchlib/attribute/address_space_components.h>
@@ -31,6 +32,7 @@ using search::StateExplorerUtils;
using search::queryeval::GlobalFilter;
using vespalib::datastore::CompactionStrategy;
using vespalib::datastore::EntryRef;
+using vespalib::GenericHeader;
namespace {
@@ -41,6 +43,29 @@ constexpr size_t max_level_array_size = 16;
constexpr size_t max_link_array_size = 193;
constexpr vespalib::duration MAX_COUNT_DURATION(100ms);
+const vespalib::string hnsw_max_squared_norm = "hnsw.max_squared_norm";
+
+/**
+ * Writes the current maximum squared norm to the header under the
+ * "hnsw.max_squared_norm" tag when dff is a MIPS distance function factory.
+ * For any other factory type the header is left untouched.
+ */
+void save_mips_max_distance(GenericHeader& header, DistanceFunctionFactory& dff) {
+    auto* mips_factory = dynamic_cast<MipsDistanceFunctionFactoryBase*>(&dff);
+    if (mips_factory == nullptr) {
+        return; // not a MIPS factory: nothing to persist
+    }
+    header.putTag(GenericHeader::Tag(hnsw_max_squared_norm,
+                                     mips_factory->get_max_squared_norm_store().get_max()));
+}
+
+/**
+ * Feeds the "hnsw.max_squared_norm" header tag into the norm store of dff
+ * when dff is a MIPS distance function factory.
+ * Does nothing if the factory is of another type, the tag is missing, or the
+ * tag is not of float type.
+ */
+void load_mips_max_distance(const GenericHeader& header, DistanceFunctionFactory& dff) {
+    auto* mips_factory = dynamic_cast<MipsDistanceFunctionFactoryBase*>(&dff);
+    if (mips_factory == nullptr || !header.hasTag(hnsw_max_squared_norm)) {
+        return;
+    }
+    const auto& tag = header.getTag(hnsw_max_squared_norm);
+    if (tag.getType() != GenericHeader::Tag::Type::TYPE_FLOAT) {
+        return;
+    }
+    // The return value is deliberately discarded; the call is made for its effect on the store.
+    // NOTE(review): presumably get_max(x) raises the stored maximum to at least x - confirm in MaximumSquaredNormStore.
+    (void) mips_factory->get_max_squared_norm_store().get_max(tag.asFloat());
+}
+
bool has_link_to(vespalib::ConstArrayRef<uint32_t> links, uint32_t id) {
for (uint32_t link : links) {
if (link == id) return true;
@@ -836,16 +861,18 @@ HnswIndex<type>::shrink_lid_space(uint32_t doc_id_limit)
template <HnswIndexType type>
std::unique_ptr<NearestNeighborIndexSaver>
-HnswIndex<type>::make_saver() const
+HnswIndex<type>::make_saver(GenericHeader& header) const
{
+ save_mips_max_distance(header, distance_function_factory());
return std::make_unique<HnswIndexSaver<type>>(_graph);
}
template <HnswIndexType type>
std::unique_ptr<NearestNeighborIndexLoader>
-HnswIndex<type>::make_loader(FastOS_FileInterface& file)
+HnswIndex<type>::make_loader(FastOS_FileInterface& file, const vespalib::GenericHeader& header)
{
assert(get_entry_nodeid() == 0); // cannot load after index has data
+ load_mips_max_distance(header, distance_function_factory());
using ReaderType = FileReader<uint32_t>;
using LoaderType = HnswIndexLoader<ReaderType, type>;
return std::make_unique<LoaderType>(_graph, _id_mapping, std::make_unique<ReaderType>(&file));
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 0809dcf4fe3..7858cb65bf9 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -211,8 +211,8 @@ public:
void get_state(const vespalib::slime::Inserter& inserter) const override;
void shrink_lid_space(uint32_t doc_id_limit) override;
- std::unique_ptr<NearestNeighborIndexSaver> make_saver() const override;
- std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file) override;
+ std::unique_ptr<NearestNeighborIndexSaver> make_saver(vespalib::GenericHeader& header) const override;
+ std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file, const vespalib::GenericHeader& header) override;
std::vector<Neighbor> find_top_k(
uint32_t k,
diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h
index fabd6bfcc57..833fa3e689b 100644
--- a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h
+++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h
@@ -37,6 +37,18 @@ public:
}
};
+/**
+ * Non-templated base for MIPS distance function factories.
+ * Creates the MaximumSquaredNormStore on construction and exposes it, so the
+ * store can be reached without knowing the concrete factory type.
+ */
+class MipsDistanceFunctionFactoryBase : public DistanceFunctionFactory {
+protected:
+    std::shared_ptr<MaximumSquaredNormStore> _sq_norm_store;
+public:
+    MipsDistanceFunctionFactoryBase() : _sq_norm_store(std::make_shared<MaximumSquaredNormStore>()) {}
+    ~MipsDistanceFunctionFactoryBase() = default;
+    MaximumSquaredNormStore& get_max_squared_norm_store() noexcept {
+        return *_sq_norm_store;
+    }
+};
+
/**
* Factory for distance functions which can apply a transformation
* mapping Maximum Inner Product Search to a nearest neighbor
@@ -45,10 +57,10 @@ public:
* to the longest vector inserted so far, or at least length 1.
*/
template<typename FloatType>
-class MipsDistanceFunctionFactory : public DistanceFunctionFactory {
- std::shared_ptr<MaximumSquaredNormStore> _sq_norm_store;
+class MipsDistanceFunctionFactory : public MipsDistanceFunctionFactoryBase {
public:
- MipsDistanceFunctionFactory() : _sq_norm_store(std::make_shared<MaximumSquaredNormStore>()) {}
+ MipsDistanceFunctionFactory() : MipsDistanceFunctionFactoryBase() { }
+ ~MipsDistanceFunctionFactory() = default;
BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
index 4b7b934fee0..9cd1065a356 100644
--- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
@@ -14,6 +14,7 @@
class FastOS_FileInterface;
+namespace vespalib { class GenericHeader; }
namespace vespalib::datastore {
class CompactionSpec;
class CompactionStrategy;
@@ -88,14 +89,14 @@ public:
* This function is always called by the attribute write thread,
* and the caller ensures that an attribute read guard is held during the lifetime of the saver.
*/
- virtual std::unique_ptr<NearestNeighborIndexSaver> make_saver() const = 0;
+ virtual std::unique_ptr<NearestNeighborIndexSaver> make_saver(vespalib::GenericHeader& header) const = 0;
/**
* Creates a loader that is used to load the index from the given file.
*
* This might throw std::runtime_error.
*/
- virtual std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file) = 0;
+ virtual std::unique_ptr<NearestNeighborIndexLoader> make_loader(FastOS_FileInterface& file, const vespalib::GenericHeader& header) = 0;
virtual std::vector<Neighbor> find_top_k(uint32_t k,
const BoundDistanceFunction &df,
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
index 5e554f76779..f499695a584 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
@@ -357,10 +357,11 @@ TensorAttribute::onInitSave(vespalib::stringref fileName)
{
vespalib::GenerationHandler::Guard guard(getGenerationHandler().
takeGuard());
- auto index_saver = (_index ? _index->make_saver() : std::unique_ptr<NearestNeighborIndexSaver>());
+ auto header = this->createAttributeHeader(fileName);
+ auto index_saver = (_index ? _index->make_saver(header.get_extra_tags()) : std::unique_ptr<NearestNeighborIndexSaver>());
return std::make_unique<TensorAttributeSaver>
(std::move(guard),
- this->createAttributeHeader(fileName),
+ std::move(header),
attribute::make_entry_ref_vector_snapshot(_refVector, getCommittedDocIdLimit()),
_tensorStore,
std::move(index_saver));
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
index aada583627b..2ea28fd822d 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
@@ -273,7 +273,7 @@ TensorAttributeLoader::load_index()
{
FileWithHeader index_file(LoadUtils::openFile(_attr, TensorAttributeSaver::index_file_suffix()));
try {
- auto index_loader = _index->make_loader(index_file.file());
+ auto index_loader = _index->make_loader(index_file.file(), index_file.header());
size_t cnt = 0;
while (index_loader->load_next()) {
if ((++cnt % LOAD_COMMIT_INTERVAL) == 0) {