aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/util
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib/src/vespa/searchlib/util')
-rw-r--r--searchlib/src/vespa/searchlib/util/CMakeLists.txt4
-rw-r--r--searchlib/src/vespa/searchlib/util/bufferwriter.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/bufferwriter.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/comprbuffer.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/comprbuffer.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/dirtraverse.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/dirtraverse.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/drainingbufferwriter.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/file_settings.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/file_with_header.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/file_with_header.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/filealign.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/filealign.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/fileheadertk.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/fileheadertk.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/filekit.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/filekit.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/filesizecalculator.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/filesizecalculator.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/fileutil.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/fileutil.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/fileutil.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/foldedstringcompare.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp9
-rw-r--r--searchlib/src/vespa/searchlib/util/linguisticsannotation.h11
-rw-r--r--searchlib/src/vespa/searchlib/util/logutil.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/logutil.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/posting_priority_queue.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/random_normal.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/randomgenerator.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/rawbuf.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/rawbuf.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/searchable_stats.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/state_explorer_utils.h2
-rw-r--r--searchlib/src/vespa/searchlib/util/token_extractor.cpp162
-rw-r--r--searchlib/src/vespa/searchlib/util/token_extractor.h63
-rw-r--r--searchlib/src/vespa/searchlib/util/url.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/util/url.h2
48 files changed, 291 insertions, 44 deletions
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
index 0d8c8ecb2c2..e9661b5e919 100644
--- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(searchlib_util OBJECT
SOURCES
bufferwriter.cpp
@@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT
filesizecalculator.cpp
fileutil.cpp
foldedstringcompare.cpp
+ linguisticsannotation.cpp
logutil.cpp
rawbuf.cpp
slime_output_raw_buf_adapter.cpp
state_explorer_utils.cpp
+ token_extractor.cpp
url.cpp
DEPENDS
)
diff --git a/searchlib/src/vespa/searchlib/util/bufferwriter.cpp b/searchlib/src/vespa/searchlib/util/bufferwriter.cpp
index 4477554c0eb..6b90c8bc547 100644
--- a/searchlib/src/vespa/searchlib/util/bufferwriter.cpp
+++ b/searchlib/src/vespa/searchlib/util/bufferwriter.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bufferwriter.h"
diff --git a/searchlib/src/vespa/searchlib/util/bufferwriter.h b/searchlib/src/vespa/searchlib/util/bufferwriter.h
index 64177f05a30..656164865a2 100644
--- a/searchlib/src/vespa/searchlib/util/bufferwriter.h
+++ b/searchlib/src/vespa/searchlib/util/bufferwriter.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/comprbuffer.cpp b/searchlib/src/vespa/searchlib/util/comprbuffer.cpp
index ab883ee6956..0a058601068 100644
--- a/searchlib/src/vespa/searchlib/util/comprbuffer.cpp
+++ b/searchlib/src/vespa/searchlib/util/comprbuffer.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "comprbuffer.h"
#include <vespa/fastos/file.h>
diff --git a/searchlib/src/vespa/searchlib/util/comprbuffer.h b/searchlib/src/vespa/searchlib/util/comprbuffer.h
index 9c0ccfec228..fdb16254021 100644
--- a/searchlib/src/vespa/searchlib/util/comprbuffer.h
+++ b/searchlib/src/vespa/searchlib/util/comprbuffer.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp
index f284e562b3d..ff74dc7a0e0 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.cpp
+++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "comprfile.h"
#include <vespa/fastos/file.h>
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h
index 9be26f38a1a..8f8cffaffd6 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.h
+++ b/searchlib/src/vespa/searchlib/util/comprfile.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/dirtraverse.cpp b/searchlib/src/vespa/searchlib/util/dirtraverse.cpp
index c1e8b6b7396..079434515a1 100644
--- a/searchlib/src/vespa/searchlib/util/dirtraverse.cpp
+++ b/searchlib/src/vespa/searchlib/util/dirtraverse.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "dirtraverse.h"
#include <vespa/vespalib/util/size_literals.h>
diff --git a/searchlib/src/vespa/searchlib/util/dirtraverse.h b/searchlib/src/vespa/searchlib/util/dirtraverse.h
index c26246e2596..91c8f3a2c50 100644
--- a/searchlib/src/vespa/searchlib/util/dirtraverse.h
+++ b/searchlib/src/vespa/searchlib/util/dirtraverse.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp
index b16632282ff..b9d04fa3680 100644
--- a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp
+++ b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "drainingbufferwriter.h"
#include <cassert>
diff --git a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h
index 0891c298539..59430ec863d 100644
--- a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h
+++ b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/file_settings.h b/searchlib/src/vespa/searchlib/util/file_settings.h
index 1bdd1a56cda..f469f76ae81 100644
--- a/searchlib/src/vespa/searchlib/util/file_settings.h
+++ b/searchlib/src/vespa/searchlib/util/file_settings.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/file_with_header.cpp b/searchlib/src/vespa/searchlib/util/file_with_header.cpp
index d3aa435b0eb..5212db35557 100644
--- a/searchlib/src/vespa/searchlib/util/file_with_header.cpp
+++ b/searchlib/src/vespa/searchlib/util/file_with_header.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "file_with_header.h"
#include "file_settings.h"
diff --git a/searchlib/src/vespa/searchlib/util/file_with_header.h b/searchlib/src/vespa/searchlib/util/file_with_header.h
index 4432b76be67..341d1dff003 100644
--- a/searchlib/src/vespa/searchlib/util/file_with_header.h
+++ b/searchlib/src/vespa/searchlib/util/file_with_header.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/filealign.cpp b/searchlib/src/vespa/searchlib/util/filealign.cpp
index 9ea10d8218e..2a2af4bf08d 100644
--- a/searchlib/src/vespa/searchlib/util/filealign.cpp
+++ b/searchlib/src/vespa/searchlib/util/filealign.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "filealign.h"
#include <vespa/fastos/file.h>
diff --git a/searchlib/src/vespa/searchlib/util/filealign.h b/searchlib/src/vespa/searchlib/util/filealign.h
index 0ac082716e8..73ca03926a0 100644
--- a/searchlib/src/vespa/searchlib/util/filealign.h
+++ b/searchlib/src/vespa/searchlib/util/filealign.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/fileheadertk.cpp b/searchlib/src/vespa/searchlib/util/fileheadertk.cpp
index aa27fdde669..20d93895c34 100644
--- a/searchlib/src/vespa/searchlib/util/fileheadertk.cpp
+++ b/searchlib/src/vespa/searchlib/util/fileheadertk.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "fileheadertk.h"
#include <vespa/vespalib/component/vtag.h>
diff --git a/searchlib/src/vespa/searchlib/util/fileheadertk.h b/searchlib/src/vespa/searchlib/util/fileheadertk.h
index c5c1c7dfdca..60c29f74cf9 100644
--- a/searchlib/src/vespa/searchlib/util/fileheadertk.h
+++ b/searchlib/src/vespa/searchlib/util/fileheadertk.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vespa/vespalib/data/fileheader.h>
diff --git a/searchlib/src/vespa/searchlib/util/filekit.cpp b/searchlib/src/vespa/searchlib/util/filekit.cpp
index 4012ef00dae..04f15635860 100644
--- a/searchlib/src/vespa/searchlib/util/filekit.cpp
+++ b/searchlib/src/vespa/searchlib/util/filekit.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "filekit.h"
#include <vespa/vespalib/util/error.h>
diff --git a/searchlib/src/vespa/searchlib/util/filekit.h b/searchlib/src/vespa/searchlib/util/filekit.h
index dbd6d2e5a2e..dc8a18810e6 100644
--- a/searchlib/src/vespa/searchlib/util/filekit.h
+++ b/searchlib/src/vespa/searchlib/util/filekit.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp b/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp
index c50d402db0e..7b5ef8ec1ba 100644
--- a/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp
+++ b/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "filesizecalculator.h"
#include <vespa/vespalib/data/fileheader.h>
diff --git a/searchlib/src/vespa/searchlib/util/filesizecalculator.h b/searchlib/src/vespa/searchlib/util/filesizecalculator.h
index be795b84c8b..4f91b7ccf26 100644
--- a/searchlib/src/vespa/searchlib/util/filesizecalculator.h
+++ b/searchlib/src/vespa/searchlib/util/filesizecalculator.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/fileutil.cpp b/searchlib/src/vespa/searchlib/util/fileutil.cpp
index f602c66b544..25aa2345cb7 100644
--- a/searchlib/src/vespa/searchlib/util/fileutil.cpp
+++ b/searchlib/src/vespa/searchlib/util/fileutil.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "fileutil.hpp"
#include "filesizecalculator.h"
diff --git a/searchlib/src/vespa/searchlib/util/fileutil.h b/searchlib/src/vespa/searchlib/util/fileutil.h
index 97bec4e1bba..a85193675eb 100644
--- a/searchlib/src/vespa/searchlib/util/fileutil.h
+++ b/searchlib/src/vespa/searchlib/util/fileutil.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vector>
diff --git a/searchlib/src/vespa/searchlib/util/fileutil.hpp b/searchlib/src/vespa/searchlib/util/fileutil.hpp
index 5b5303ef169..1a4ef9c52b6 100644
--- a/searchlib/src/vespa/searchlib/util/fileutil.hpp
+++ b/searchlib/src/vespa/searchlib/util/fileutil.hpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include "fileutil.h"
diff --git a/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp b/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp
index 53b9a2db31d..2a8c7134e15 100644
--- a/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp
+++ b/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "foldedstringcompare.h"
#include <vespa/vespalib/text/utf8.h>
diff --git a/searchlib/src/vespa/searchlib/util/foldedstringcompare.h b/searchlib/src/vespa/searchlib/util/foldedstringcompare.h
index cd7cd325667..ae54e35672b 100644
--- a/searchlib/src/vespa/searchlib/util/foldedstringcompare.h
+++ b/searchlib/src/vespa/searchlib/util/foldedstringcompare.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
new file mode 100644
index 00000000000..c8aef561319
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguisticsannotation.h"
+
+namespace search::linguistics {
+
+const vespalib::string SPANTREE_NAME("linguistics"); // Name of the span tree looked up by TokenExtractor::extract()
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
new file mode 100644
index 00000000000..83a19bed986
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
@@ -0,0 +1,11 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace search::linguistics {
+
+extern const vespalib::string SPANTREE_NAME; // Defined in linguisticsannotation.cpp
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/logutil.cpp b/searchlib/src/vespa/searchlib/util/logutil.cpp
index 2fd3205cdb2..18381cda786 100644
--- a/searchlib/src/vespa/searchlib/util/logutil.cpp
+++ b/searchlib/src/vespa/searchlib/util/logutil.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "logutil.h"
#include "dirtraverse.h"
diff --git a/searchlib/src/vespa/searchlib/util/logutil.h b/searchlib/src/vespa/searchlib/util/logutil.h
index 6afb654a960..f6d525e6167 100644
--- a/searchlib/src/vespa/searchlib/util/logutil.h
+++ b/searchlib/src/vespa/searchlib/util/logutil.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vespa/vespalib/util/jsonwriter.h>
diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue.h b/searchlib/src/vespa/searchlib/util/posting_priority_queue.h
index c1549b32f93..153023829c3 100644
--- a/searchlib/src/vespa/searchlib/util/posting_priority_queue.h
+++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp b/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp
index b7fc9cfab05..f4a7204c668 100644
--- a/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp
+++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h
index 9debcd06ea6..0c9ae80839a 100644
--- a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h
+++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp
index 5676f6326df..ba2b23fb15c 100644
--- a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp
+++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/random_normal.h b/searchlib/src/vespa/searchlib/util/random_normal.h
index 2b9e566303c..18f51284b6e 100644
--- a/searchlib/src/vespa/searchlib/util/random_normal.h
+++ b/searchlib/src/vespa/searchlib/util/random_normal.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/randomgenerator.h b/searchlib/src/vespa/searchlib/util/randomgenerator.h
index 89b65a65e92..66c739deef6 100644
--- a/searchlib/src/vespa/searchlib/util/randomgenerator.h
+++ b/searchlib/src/vespa/searchlib/util/randomgenerator.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/rawbuf.cpp b/searchlib/src/vespa/searchlib/util/rawbuf.cpp
index 3af29d7eed5..6b8efb58b60 100644
--- a/searchlib/src/vespa/searchlib/util/rawbuf.cpp
+++ b/searchlib/src/vespa/searchlib/util/rawbuf.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "rawbuf.h"
#include <cassert>
diff --git a/searchlib/src/vespa/searchlib/util/rawbuf.h b/searchlib/src/vespa/searchlib/util/rawbuf.h
index 9ecfbc23c24..0ede792b177 100644
--- a/searchlib/src/vespa/searchlib/util/rawbuf.h
+++ b/searchlib/src/vespa/searchlib/util/rawbuf.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/searchable_stats.h b/searchlib/src/vespa/searchlib/util/searchable_stats.h
index e785a4c4483..089c4414493 100644
--- a/searchlib/src/vespa/searchlib/util/searchable_stats.h
+++ b/searchlib/src/vespa/searchlib/util/searchable_stats.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <vespa/vespalib/util/memoryusage.h>
diff --git a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp
index 273dd2d37cd..d16c25f1510 100644
--- a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp
+++ b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "slime_output_raw_buf_adapter.h"
diff --git a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h
index d00a0714045..49fc3e50549 100644
--- a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h
+++ b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp b/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp
index d61737d2a5f..d662dea6e26 100644
--- a/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp
+++ b/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "state_explorer_utils.h"
#include <vespa/searchcommon/attribute/status.h>
diff --git a/searchlib/src/vespa/searchlib/util/state_explorer_utils.h b/searchlib/src/vespa/searchlib/util/state_explorer_utils.h
index 9a8d1a7d9db..4bc5e6e8fcf 100644
--- a/searchlib/src/vespa/searchlib/util/state_explorer_utils.h
+++ b/searchlib/src/vespa/searchlib/util/state_explorer_utils.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
new file mode 100644
index 00000000000..a78f30afe21
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -0,0 +1,162 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "token_extractor.h"
+#include "linguisticsannotation.h"
+#include <vespa/document/annotation/alternatespanlist.h>
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/annotation/spanlist.h>
+#include <vespa/document/annotation/spantreevisitor.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/exceptions.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".searchlib.util.token_extractor");
+
+using document::AlternateSpanList;
+using document::Annotation;
+using document::AnnotationType;
+using document::Document;
+using document::FieldValue;
+using document::SimpleSpanList;
+using document::Span;
+using document::SpanList;
+using document::SpanNode;
+using document::SpanTreeVisitor;
+using document::StringFieldValue;
+using vespalib::Utf8Reader;
+
+namespace search::linguistics {
+
+namespace {
+
+/*
+ * Span tree visitor that tracks the smallest range [begin_pos, end_pos)
+ * covering every Span it is handed, directly or through the list nodes.
+ */
+class SpanFinder : public SpanTreeVisitor {
+public:
+    int32_t begin_pos = 0x7fffffff; // lowest start position seen so far
+    int32_t end_pos = -1;           // highest end position seen so far
+
+    SpanFinder() = default;
+
+    // Covering span; its length is negative when no Span has been visited.
+    Span span() { return Span(begin_pos, end_pos - begin_pos); }
+
+    void visit(const Span &node) override {
+        const int32_t node_begin = node.from();
+        const int32_t node_end = node_begin + node.length();
+        if (node_begin < begin_pos) {
+            begin_pos = node_begin;
+        }
+        if (node_end > end_pos) {
+            end_pos = node_end;
+        }
+    }
+    void visit(const SpanList &node) override {
+        // Children are held through pointers, hence the arrow.
+        for (const auto & child : node) {
+            child->accept(*this);
+        }
+    }
+    void visit(const SimpleSpanList &node) override {
+        for (const auto & child : node) {
+            child.accept(*this);
+        }
+    }
+    void visit(const AlternateSpanList &node) override {
+        // Every alternative contributes to the covering range.
+        const size_t subtree_count = node.getNumSubtrees();
+        for (size_t idx = 0; idx != subtree_count; ++idx) {
+            visit(node.getSubtree(idx));
+        }
+    }
+};
+
+// Returns the span covering all Span nodes visited below span_node.
+Span
+getSpan(const SpanNode &span_node)
+{
+    SpanFinder covering;
+    span_node.accept(covering);
+    return covering.span();
+}
+
+// Picks the string for a term: the raw content of the annotation value fv
+// when one is given, otherwise the part of s covered by span.
+vespalib::stringref
+get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv)
+{
+    if (fv == nullptr) {
+        const char* start = s.data() + span.from();
+        return vespalib::stringref(start, static_cast<size_t>(span.length()));
+    }
+    const auto raw_value = fv->getAsRaw();
+    return vespalib::stringref(raw_value.first, raw_value.second);
+}
+
+// Returns the largest prefix length of word, in bytes, that ends on a
+// character boundary as seen by Utf8Reader and does not exceed max_byte_len.
+size_t
+truncated_word_len(vespalib::stringref word, size_t max_byte_len)
+{
+    Utf8Reader reader(word);
+    auto char_start = reader.getPos();
+    while (reader.hasMore()) {
+        (void) reader.getChar();
+        if (reader.getPos() > max_byte_len) {
+            // The character just read crosses the limit; cut before it.
+            return char_start;
+        }
+        char_start = reader.getPos();
+    }
+    return reader.getPos(); // No truncation
+}
+
+constexpr size_t max_fmt_len = 100; // Upper bound, in bytes, on how much of a word is printed in log messages
+
+}
+
+// Sets up an extractor for the named field (the name is used in log
+// messages); words longer than max_word_len are dropped by sanitize_word().
+TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len)
+    : _field_name(field_name), _max_word_len(max_word_len)
+{
+}
+
+TokenExtractor::~TokenExtractor() = default;
+
+/*
+ * Cleans up a candidate word before it is used as a term.
+ *
+ * A word containing a NUL byte is cut at the last character boundary at or
+ * before that byte. A word that is still longer than the configured maximum
+ * is dropped, which is signalled by returning an empty stringref.
+ * Problems are only logged when doc is not nullptr.
+ */
+vespalib::stringref
+TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const
+{
+    const size_t original_size = word.size();
+    const size_t nul_pos = strnlen(word.data(), original_size);
+    if (nul_pos < original_size) {
+        word = word.substr(0, truncated_word_len(word, nul_pos));
+        if (doc != nullptr) {
+            LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s",
+                original_size, word.size(),
+                doc->getId().toString().c_str(), _field_name.c_str(),
+                static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
+        }
+    }
+    if (word.size() <= _max_word_len) {
+        return word;
+    }
+    if (doc != nullptr) {
+        LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s",
+            word.size(), _max_word_len,
+            doc->getId().toString().c_str(), _field_name.c_str(),
+            static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
+    }
+    return {};
+}
+
+/*
+ * Appends a term for span to terms when the span is non-empty, lies inside
+ * text and the sanitized word is not empty. The term is flagged as altered
+ * when the word comes from the annotation value fv instead of the text.
+ */
+void
+TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const
+{
+    if (span.length() <= 0 || span.from() < 0) {
+        return;
+    }
+    const size_t span_end = static_cast<size_t>(span.from()) + static_cast<size_t>(span.length());
+    if (span_end > text.size()) {
+        return; // span reaches past the end of the text
+    }
+    const auto candidate = sanitize_word(get_span_string_or_alternative(text, span, fv), doc);
+    if (candidate.empty()) {
+        return;
+    }
+    terms.emplace_back(span, candidate, fv != nullptr);
+}
+
+/*
+ * Collects the terms of a string field into terms.
+ *
+ * When the span tree named SPANTREE_NAME is present, each valid TERM
+ * annotation with a span node is considered and the result is sorted.
+ * Otherwise the whole text is considered as a single word.
+ */
+void
+TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const
+{
+    const auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
+    if (tree != nullptr) {
+        for (const Annotation & annotation : *tree) {
+            const SpanNode *node = annotation.getSpanNode();
+            if ((node == nullptr) || !annotation.valid()) {
+                continue;
+            }
+            if (!(annotation.getType() == *AnnotationType::TERM)) {
+                continue;
+            }
+            const Span covered = getSpan(*node);
+            consider_word(terms, text, covered, annotation.getFieldValue(), doc);
+        }
+        std::sort(terms.begin(), terms.end());
+    } else {
+        /* field might not be annotated if match type is exact */
+        consider_word(terms, text, Span(0, text.size()), nullptr, doc);
+    }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
new file mode 100644
index 00000000000..4955448b0c2
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -0,0 +1,63 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/stllike/string.h>
+#include <vector>
+
+namespace document {
+
+class Document;
+class Span;
+class StringFieldValue;
+
+}
+
+namespace search::linguistics {
+
+/*
+ * Class used to extract tokens from annotated string field value.
+ *
+ * Words containing a NUL byte are cut short and words longer than the
+ * configured maximum length are dropped, see sanitize_word().
+ */
+class TokenExtractor {
+    // Held by value: a reference member would dangle when the constructor
+    // argument is a temporary or is destroyed before the extractor.
+    vespalib::string _field_name;
+    size_t           _max_word_len;
+
+public:
+    /*
+     * A term found by extract(): the span it covers, the word to use for
+     * it, and whether that word was taken from an annotation value
+     * (altered) instead of from the text itself.
+     */
+    struct SpanTerm {
+        document::Span      span;
+        vespalib::stringref word;
+        bool                altered;
+
+        SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept
+            : span(span_),
+              word(word_),
+              altered(altered_)
+        {
+        }
+        SpanTerm() noexcept
+            : span(),
+              word(),
+              altered(false)
+        {
+        }
+        // Orders by span first and by word when the spans are equal.
+        bool operator<(const SpanTerm& rhs) const noexcept {
+            return (span != rhs.span) ? (span < rhs.span) : (word < rhs.word);
+        }
+    };
+
+private:
+    // Adds a term for span to terms when the span and its word are usable.
+    void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const;
+
+public:
+    // field_name is copied; it is used when logging rejected words.
+    TokenExtractor(const vespalib::string& field_name, size_t max_word_len);
+    ~TokenExtractor();
+    // Fills terms with the terms found in text, using the span trees in trees.
+    // doc may be nullptr, in which case rejected words are not logged.
+    void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const;
+    // Returns the word to use as a term, or an empty stringref when it is dropped.
+    vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/url.cpp b/searchlib/src/vespa/searchlib/util/url.cpp
index 141f54363e9..3af2f31eec6 100644
--- a/searchlib/src/vespa/searchlib/util/url.cpp
+++ b/searchlib/src/vespa/searchlib/util/url.cpp
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "url.h"
#include <algorithm>
diff --git a/searchlib/src/vespa/searchlib/util/url.h b/searchlib/src/vespa/searchlib/util/url.h
index 796640a131e..05511ef8b08 100644
--- a/searchlib/src/vespa/searchlib/util/url.h
+++ b/searchlib/src/vespa/searchlib/util/url.h
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once