diff options
Diffstat (limited to 'searchlib/src/vespa/searchlib/util')
48 files changed, 291 insertions, 44 deletions
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt index 0d8c8ecb2c2..e9661b5e919 100644 --- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_util OBJECT SOURCES bufferwriter.cpp @@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT filesizecalculator.cpp fileutil.cpp foldedstringcompare.cpp + linguisticsannotation.cpp logutil.cpp rawbuf.cpp slime_output_raw_buf_adapter.cpp state_explorer_utils.cpp + token_extractor.cpp url.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/util/bufferwriter.cpp b/searchlib/src/vespa/searchlib/util/bufferwriter.cpp index 4477554c0eb..6b90c8bc547 100644 --- a/searchlib/src/vespa/searchlib/util/bufferwriter.cpp +++ b/searchlib/src/vespa/searchlib/util/bufferwriter.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bufferwriter.h" diff --git a/searchlib/src/vespa/searchlib/util/bufferwriter.h b/searchlib/src/vespa/searchlib/util/bufferwriter.h index 64177f05a30..656164865a2 100644 --- a/searchlib/src/vespa/searchlib/util/bufferwriter.h +++ b/searchlib/src/vespa/searchlib/util/bufferwriter.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/comprbuffer.cpp b/searchlib/src/vespa/searchlib/util/comprbuffer.cpp index ab883ee6956..0a058601068 100644 --- a/searchlib/src/vespa/searchlib/util/comprbuffer.cpp +++ b/searchlib/src/vespa/searchlib/util/comprbuffer.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "comprbuffer.h" #include <vespa/fastos/file.h> diff --git a/searchlib/src/vespa/searchlib/util/comprbuffer.h b/searchlib/src/vespa/searchlib/util/comprbuffer.h index 9c0ccfec228..fdb16254021 100644 --- a/searchlib/src/vespa/searchlib/util/comprbuffer.h +++ b/searchlib/src/vespa/searchlib/util/comprbuffer.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp index f284e562b3d..ff74dc7a0e0 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.cpp +++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "comprfile.h" #include <vespa/fastos/file.h> diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h index 9be26f38a1a..8f8cffaffd6 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.h +++ b/searchlib/src/vespa/searchlib/util/comprfile.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/dirtraverse.cpp b/searchlib/src/vespa/searchlib/util/dirtraverse.cpp index c1e8b6b7396..079434515a1 100644 --- a/searchlib/src/vespa/searchlib/util/dirtraverse.cpp +++ b/searchlib/src/vespa/searchlib/util/dirtraverse.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "dirtraverse.h" #include <vespa/vespalib/util/size_literals.h> diff --git a/searchlib/src/vespa/searchlib/util/dirtraverse.h b/searchlib/src/vespa/searchlib/util/dirtraverse.h index c26246e2596..91c8f3a2c50 100644 --- a/searchlib/src/vespa/searchlib/util/dirtraverse.h +++ b/searchlib/src/vespa/searchlib/util/dirtraverse.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp index b16632282ff..b9d04fa3680 100644 --- a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp +++ b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "drainingbufferwriter.h" #include <cassert> diff --git a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h index 0891c298539..59430ec863d 100644 --- a/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h +++ b/searchlib/src/vespa/searchlib/util/drainingbufferwriter.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/file_settings.h b/searchlib/src/vespa/searchlib/util/file_settings.h index 1bdd1a56cda..f469f76ae81 100644 --- a/searchlib/src/vespa/searchlib/util/file_settings.h +++ b/searchlib/src/vespa/searchlib/util/file_settings.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/file_with_header.cpp b/searchlib/src/vespa/searchlib/util/file_with_header.cpp index d3aa435b0eb..5212db35557 100644 --- a/searchlib/src/vespa/searchlib/util/file_with_header.cpp +++ b/searchlib/src/vespa/searchlib/util/file_with_header.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "file_with_header.h" #include "file_settings.h" diff --git a/searchlib/src/vespa/searchlib/util/file_with_header.h b/searchlib/src/vespa/searchlib/util/file_with_header.h index 4432b76be67..341d1dff003 100644 --- a/searchlib/src/vespa/searchlib/util/file_with_header.h +++ b/searchlib/src/vespa/searchlib/util/file_with_header.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/filealign.cpp b/searchlib/src/vespa/searchlib/util/filealign.cpp index 9ea10d8218e..2a2af4bf08d 100644 --- a/searchlib/src/vespa/searchlib/util/filealign.cpp +++ b/searchlib/src/vespa/searchlib/util/filealign.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "filealign.h" #include <vespa/fastos/file.h> diff --git a/searchlib/src/vespa/searchlib/util/filealign.h b/searchlib/src/vespa/searchlib/util/filealign.h index 0ac082716e8..73ca03926a0 100644 --- a/searchlib/src/vespa/searchlib/util/filealign.h +++ b/searchlib/src/vespa/searchlib/util/filealign.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/fileheadertk.cpp b/searchlib/src/vespa/searchlib/util/fileheadertk.cpp index aa27fdde669..20d93895c34 100644 --- a/searchlib/src/vespa/searchlib/util/fileheadertk.cpp +++ b/searchlib/src/vespa/searchlib/util/fileheadertk.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "fileheadertk.h" #include <vespa/vespalib/component/vtag.h> diff --git a/searchlib/src/vespa/searchlib/util/fileheadertk.h b/searchlib/src/vespa/searchlib/util/fileheadertk.h index c5c1c7dfdca..60c29f74cf9 100644 --- a/searchlib/src/vespa/searchlib/util/fileheadertk.h +++ b/searchlib/src/vespa/searchlib/util/fileheadertk.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include <vespa/vespalib/data/fileheader.h> diff --git a/searchlib/src/vespa/searchlib/util/filekit.cpp b/searchlib/src/vespa/searchlib/util/filekit.cpp index 4012ef00dae..04f15635860 100644 --- a/searchlib/src/vespa/searchlib/util/filekit.cpp +++ b/searchlib/src/vespa/searchlib/util/filekit.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "filekit.h" #include <vespa/vespalib/util/error.h> diff --git a/searchlib/src/vespa/searchlib/util/filekit.h b/searchlib/src/vespa/searchlib/util/filekit.h index dbd6d2e5a2e..dc8a18810e6 100644 --- a/searchlib/src/vespa/searchlib/util/filekit.h +++ b/searchlib/src/vespa/searchlib/util/filekit.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp b/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp index c50d402db0e..7b5ef8ec1ba 100644 --- a/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp +++ b/searchlib/src/vespa/searchlib/util/filesizecalculator.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "filesizecalculator.h" #include <vespa/vespalib/data/fileheader.h> diff --git a/searchlib/src/vespa/searchlib/util/filesizecalculator.h b/searchlib/src/vespa/searchlib/util/filesizecalculator.h index be795b84c8b..4f91b7ccf26 100644 --- a/searchlib/src/vespa/searchlib/util/filesizecalculator.h +++ b/searchlib/src/vespa/searchlib/util/filesizecalculator.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/fileutil.cpp b/searchlib/src/vespa/searchlib/util/fileutil.cpp index f602c66b544..25aa2345cb7 100644 --- a/searchlib/src/vespa/searchlib/util/fileutil.cpp +++ b/searchlib/src/vespa/searchlib/util/fileutil.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "fileutil.hpp" #include "filesizecalculator.h" diff --git a/searchlib/src/vespa/searchlib/util/fileutil.h b/searchlib/src/vespa/searchlib/util/fileutil.h index 97bec4e1bba..a85193675eb 100644 --- a/searchlib/src/vespa/searchlib/util/fileutil.h +++ b/searchlib/src/vespa/searchlib/util/fileutil.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include <vector> diff --git a/searchlib/src/vespa/searchlib/util/fileutil.hpp b/searchlib/src/vespa/searchlib/util/fileutil.hpp index 5b5303ef169..1a4ef9c52b6 100644 --- a/searchlib/src/vespa/searchlib/util/fileutil.hpp +++ b/searchlib/src/vespa/searchlib/util/fileutil.hpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "fileutil.h" diff --git a/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp b/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp index 53b9a2db31d..2a8c7134e15 100644 --- a/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp +++ b/searchlib/src/vespa/searchlib/util/foldedstringcompare.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "foldedstringcompare.h" #include <vespa/vespalib/text/utf8.h> diff --git a/searchlib/src/vespa/searchlib/util/foldedstringcompare.h b/searchlib/src/vespa/searchlib/util/foldedstringcompare.h index cd7cd325667..ae54e35672b 100644 --- a/searchlib/src/vespa/searchlib/util/foldedstringcompare.h +++ b/searchlib/src/vespa/searchlib/util/foldedstringcompare.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp new file mode 100644 index 00000000000..c8aef561319 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp @@ -0,0 +1,9 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguisticsannotation.h" + +namespace search::linguistics { + +const vespalib::string SPANTREE_NAME("linguistics"); + +} diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h new file mode 100644 index 00000000000..83a19bed986 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h @@ -0,0 +1,11 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search::linguistics { + +extern const vespalib::string SPANTREE_NAME; + +} diff --git a/searchlib/src/vespa/searchlib/util/logutil.cpp b/searchlib/src/vespa/searchlib/util/logutil.cpp index 2fd3205cdb2..18381cda786 100644 --- a/searchlib/src/vespa/searchlib/util/logutil.cpp +++ b/searchlib/src/vespa/searchlib/util/logutil.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "logutil.h" #include "dirtraverse.h" diff --git a/searchlib/src/vespa/searchlib/util/logutil.h b/searchlib/src/vespa/searchlib/util/logutil.h index 6afb654a960..f6d525e6167 100644 --- a/searchlib/src/vespa/searchlib/util/logutil.h +++ b/searchlib/src/vespa/searchlib/util/logutil.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include <vespa/vespalib/util/jsonwriter.h> diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue.h b/searchlib/src/vespa/searchlib/util/posting_priority_queue.h index c1549b32f93..153023829c3 100644 --- a/searchlib/src/vespa/searchlib/util/posting_priority_queue.h +++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp b/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp index b7fc9cfab05..f4a7204c668 100644 --- a/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp +++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue.hpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h index 9debcd06ea6..0c9ae80839a 100644 --- a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h +++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp index 5676f6326df..ba2b23fb15c 100644 --- a/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp +++ b/searchlib/src/vespa/searchlib/util/posting_priority_queue_merger.hpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/random_normal.h b/searchlib/src/vespa/searchlib/util/random_normal.h index 2b9e566303c..18f51284b6e 100644 --- a/searchlib/src/vespa/searchlib/util/random_normal.h +++ b/searchlib/src/vespa/searchlib/util/random_normal.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/randomgenerator.h b/searchlib/src/vespa/searchlib/util/randomgenerator.h index 89b65a65e92..66c739deef6 100644 --- a/searchlib/src/vespa/searchlib/util/randomgenerator.h +++ b/searchlib/src/vespa/searchlib/util/randomgenerator.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/rawbuf.cpp b/searchlib/src/vespa/searchlib/util/rawbuf.cpp index 3af29d7eed5..6b8efb58b60 100644 --- a/searchlib/src/vespa/searchlib/util/rawbuf.cpp +++ b/searchlib/src/vespa/searchlib/util/rawbuf.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "rawbuf.h" #include <cassert> diff --git a/searchlib/src/vespa/searchlib/util/rawbuf.h b/searchlib/src/vespa/searchlib/util/rawbuf.h index 9ecfbc23c24..0ede792b177 100644 --- a/searchlib/src/vespa/searchlib/util/rawbuf.h +++ b/searchlib/src/vespa/searchlib/util/rawbuf.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/searchable_stats.h b/searchlib/src/vespa/searchlib/util/searchable_stats.h index e785a4c4483..089c4414493 100644 --- a/searchlib/src/vespa/searchlib/util/searchable_stats.h +++ b/searchlib/src/vespa/searchlib/util/searchable_stats.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include <vespa/vespalib/util/memoryusage.h> diff --git a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp index 273dd2d37cd..d16c25f1510 100644 --- a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp +++ b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "slime_output_raw_buf_adapter.h" diff --git a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h index d00a0714045..49fc3e50549 100644 --- a/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h +++ b/searchlib/src/vespa/searchlib/util/slime_output_raw_buf_adapter.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp b/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp index d61737d2a5f..d662dea6e26 100644 --- a/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp +++ b/searchlib/src/vespa/searchlib/util/state_explorer_utils.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "state_explorer_utils.h" #include <vespa/searchcommon/attribute/status.h> diff --git a/searchlib/src/vespa/searchlib/util/state_explorer_utils.h b/searchlib/src/vespa/searchlib/util/state_explorer_utils.h index 9a8d1a7d9db..4bc5e6e8fcf 100644 --- a/searchlib/src/vespa/searchlib/util/state_explorer_utils.h +++ b/searchlib/src/vespa/searchlib/util/state_explorer_utils.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp new file mode 100644 index 00000000000..a78f30afe21 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -0,0 +1,162 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "token_extractor.h" +#include "linguisticsannotation.h" +#include <vespa/document/annotation/alternatespanlist.h> +#include <vespa/document/annotation/span.h> +#include <vespa/document/annotation/spanlist.h> +#include <vespa/document/annotation/spantreevisitor.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/util/exceptions.h> + +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.util.token_extractor"); + +using document::AlternateSpanList; +using document::Annotation; +using document::AnnotationType; +using document::Document; +using document::FieldValue; +using document::SimpleSpanList; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTreeVisitor; +using document::StringFieldValue; +using vespalib::Utf8Reader; + +namespace search::linguistics { + +namespace { + +class SpanFinder : public SpanTreeVisitor { +public: + int32_t begin_pos; + int32_t end_pos; + + SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} + Span span() { return Span(begin_pos, end_pos - begin_pos); } + + void visit(const Span &node) override { + begin_pos = std::min(begin_pos, node.from()); + end_pos = std::max(end_pos, node.from() + node.length()); + } + void visit(const SpanList &node) override { + for (const auto & span_ : node) { + span_->accept(*this); + } + } + void visit(const SimpleSpanList &node) override { + for (const auto & span_ : node) { + span_.accept(*this); + } + } + void visit(const AlternateSpanList &node) override { + for (size_t i = 0; i < node.getNumSubtrees(); ++i) { + visit(node.getSubtree(i)); + } + } +}; + +Span +getSpan(const SpanNode &span_node) +{ + SpanFinder finder; + span_node.accept(finder); + return finder.span(); +} + +vespalib::stringref +get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv) +{ + if (fv != nullptr) { + auto raw = fv->getAsRaw(); + return {raw.first, raw.second}; + } else { + return {s.data() + span.from(), static_cast<size_t>(span.length())}; + } +} + +size_t +truncated_word_len(vespalib::stringref word, size_t max_byte_len) +{ + Utf8Reader reader(word); + while (reader.hasMore()) { + auto last_pos = reader.getPos(); + (void) reader.getChar(); + if (reader.getPos() > max_byte_len) { + return last_pos; + } + } + return reader.getPos(); // No truncation +} + +constexpr size_t max_fmt_len = 100; // Max length of word in logs + +} + +TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len) + : _field_name(field_name), + _max_word_len(max_word_len) +{ +} + +TokenExtractor::~TokenExtractor() = default; + +vespalib::stringref +TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const +{ + size_t len = strnlen(word.data(), word.size()); + if (len < word.size()) { + size_t old_len = word.size(); + len = truncated_word_len(word, len); + word = word.substr(0, len); + if (doc != nullptr) { + LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s", old_len, word.size(), doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + } + if (word.size() > _max_word_len) { + if (doc != nullptr) { + LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s", word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + return {}; + } + return word; +} + +void +TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const +{ + if (span.length() > 0 && span.from() >= 0 && + static_cast<size_t>(span.from()) + static_cast<size_t>(span.length()) <= text.size()) { + auto word = get_span_string_or_alternative(text, span, fv); + word = sanitize_word(word, doc); + if (!word.empty()) { + terms.emplace_back(span, word, fv != nullptr); + } + } +} + +void +TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const +{ + auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); + if (tree == nullptr) { + /* field might not be annotated if match type is exact */ + consider_word(terms, text, Span(0, text.size()), nullptr, doc); + return; + } + for (const Annotation & annotation : *tree) { + const SpanNode *span = annotation.getSpanNode(); + if ((span != nullptr) && annotation.valid() && + (annotation.getType() == *AnnotationType::TERM)) + { + Span sp = getSpan(*span); + consider_word(terms, text, sp, annotation.getFieldValue(), doc); + } + } + std::sort(terms.begin(), terms.end()); +} + +} diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h new file mode 100644 index 00000000000..4955448b0c2 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.h @@ -0,0 +1,63 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/document/annotation/span.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/vespalib/stllike/string.h> +#include <vector> + +namespace document { + +class Document; +class Span; +class StringFieldValue; + +} + +namespace search::linguistics { + +/* + * Class used to extract tokens from annotated string field value. + */ +class TokenExtractor { + const vespalib::string& _field_name; + size_t _max_word_len; + +public: + struct SpanTerm { + document::Span span; + vespalib::stringref word; + bool altered; + + SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept + : span(span_), + word(word_), + altered(altered_) + { + } + SpanTerm() noexcept + : span(), + word(), + altered(false) + { + } + bool operator<(const SpanTerm& rhs) const noexcept { + if (span != rhs.span) { + return span < rhs.span; + } + return word < rhs.word; + } + }; + +private: + void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const; + +public: + TokenExtractor(const vespalib::string& field_name, size_t max_word_len); + ~TokenExtractor(); + void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const; + vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const; +}; + +} diff --git a/searchlib/src/vespa/searchlib/util/url.cpp b/searchlib/src/vespa/searchlib/util/url.cpp index 141f54363e9..3af2f31eec6 100644 --- a/searchlib/src/vespa/searchlib/util/url.cpp +++ b/searchlib/src/vespa/searchlib/util/url.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "url.h" #include <algorithm> diff --git a/searchlib/src/vespa/searchlib/util/url.h b/searchlib/src/vespa/searchlib/util/url.h index 796640a131e..05511ef8b08 100644 --- a/searchlib/src/vespa/searchlib/util/url.h +++ b/searchlib/src/vespa/searchlib/util/url.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once |