diff options
Diffstat (limited to 'searchlib/src/vespa/searchlib/bitcompression')
14 files changed, 69 insertions, 13 deletions
diff --git a/searchlib/src/vespa/searchlib/bitcompression/CMakeLists.txt b/searchlib/src/vespa/searchlib/bitcompression/CMakeLists.txt index 1572fbced7d..ff25bb2deb7 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/bitcompression/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_bitcompression OBJECT SOURCES compression.cpp diff --git a/searchlib/src/vespa/searchlib/bitcompression/README.md b/searchlib/src/vespa/searchlib/bitcompression/README.md new file mode 100644 index 00000000000..e387ec099fb --- /dev/null +++ b/searchlib/src/vespa/searchlib/bitcompression/README.md @@ -0,0 +1,56 @@ +<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --> + +## About the disk dictionary format + +The designs of the disk index dictionary formats were incremental, due +to changing requirements over the years. + +### 1st generation + +Patricia tree in memory. + +### 2nd generation, 1998-09-04 + +Problem: Dictionary did not fit in memory (machine had 512 MB ram) +when indexing 5 million web documents on a single machine. + +Changed format to variable length records on disk, with a sparse +version of the dictionary in memory (each 256th word) to limit disk +access for binary search. + +### 3rd generation, 2000-03-09 + +Problem: Too many disk read operations and too many bytes read from disk +(limited PCI bandwidth). + +Changed format to a "paged" dictionary where a dictionary lookup would +use 1 disk read, reading 4 kiB of data. Data was not compressed. Could +not memory map whole dictionary. The sparse files were read into +memory and used to determine the page to use for further lookup. +Binary search within the pages read from disk. 
+ +### 4th generation, 2002-08-16 + +Problem: Dictionary used too much disk space. + +Changed format to compressed format. Decompression could not contain +much state, thus delta values were compressed using exp golomb coding. + +Two levels of skip lists within each page, where skip list on a level +contained enough information to skip on all levels below within the +same page. + +Start of word was replaced by a byte telling how many bytes are +omitted due to the prefix being common with previous words (word +before in dictionary and word before in the lookup order). + +### 5th generation, 2010-08-21 + +Payload ("value") changed when skip information was added for large +posting lists. Added overflow handling for long words / huge payloads. +Added another level of pages ("sparse pages") to improve compression. + +### 6th generation, 2015-05-12 + +Started using a separate dictionary for each index field instead of a +shared dictionary across all index fields. Minor changes. diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp index 4752ddfb64f..0f089c60e4b 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "compression.h" #include <vespa/searchlib/fef/termfieldmatchdata.h> diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index 2d6b8083d43..232572a6314 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/bitcompression/countcompression.cpp b/searchlib/src/vespa/searchlib/bitcompression/countcompression.cpp index 7c38931df77..a1f27028874 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/countcompression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/countcompression.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "countcompression.h" #include <vespa/searchlib/index/postinglistcounts.h> diff --git a/searchlib/src/vespa/searchlib/bitcompression/countcompression.h b/searchlib/src/vespa/searchlib/bitcompression/countcompression.h index 2f3b0646bdb..6dd8ec9d350 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/countcompression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/countcompression.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp index 444c935b6f8..335b953e90c 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "pagedict4.h" #include <vespa/searchlib/index/postinglistcounts.h> diff --git a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h index b162bdc3f2b..ba53b415368 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h +++ b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp index 1276ed410d9..e2b2d849f24 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "posocc_field_params.h" #include <vespa/searchcommon/common/schema.h> diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h index 9894bfb112d..ea8e6bcf73c 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp index 9d6258ce26f..92368db8cf6 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "posocc_fields_params.h" #include <vespa/searchlib/index/postinglistparams.h> diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h index 8748557e5a7..df5d80b5a29 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp index 8e1bfd2875c..bfd09337bab 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "posocccompression.h" #include "posocc_fields_params.h" diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h index 2dc747f6265..64ed4b5fd37 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "compression.h" |