aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authortmartins <thigm85@gmail.com>2020-12-07 20:41:04 +0100
committertmartins <thigm85@gmail.com>2020-12-07 20:41:04 +0100
commitd80e73f9162819cff8f5ddd6e0474f6ab481a5c7 (patch)
tree00667786e0030bf0b091130fb6875e31c51edce1
parent663900a1ea319d3e9f72a0cd546ff3e9706eda93 (diff)
remove pyvespa old files as it has its own repo now.
-rw-r--r--.github/workflows/python.yml35
-rw-r--r--python/vespa/.gitignore141
-rw-r--r--python/vespa/MANIFEST.in2
-rw-r--r--python/vespa/Pipfile11
-rw-r--r--python/vespa/README.md118
-rw-r--r--python/vespa/docs/sphinx/Makefile20
-rw-r--r--python/vespa/docs/sphinx/source/application-package.ipynb139
-rw-r--r--python/vespa/docs/sphinx/source/collect-training-data.ipynb1232
-rw-r--r--python/vespa/docs/sphinx/source/conf.py63
-rw-r--r--python/vespa/docs/sphinx/source/connect-to-vespa-instance.ipynb980
-rw-r--r--python/vespa/docs/sphinx/source/create-and-deploy-vespa-cloud.ipynb993
-rw-r--r--python/vespa/docs/sphinx/source/create-and-deploy-vespa-docker.ipynb214
-rw-r--r--python/vespa/docs/sphinx/source/deploy-application.ipynb41
-rw-r--r--python/vespa/docs/sphinx/source/evaluation.ipynb297
-rw-r--r--python/vespa/docs/sphinx/source/howto.rst12
-rw-r--r--python/vespa/docs/sphinx/source/index.rst61
-rw-r--r--python/vespa/docs/sphinx/source/install.rst8
-rw-r--r--python/vespa/docs/sphinx/source/query-model.ipynb41
-rw-r--r--python/vespa/docs/sphinx/source/query.ipynb297
-rw-r--r--python/vespa/docs/sphinx/source/quickstart.rst11
-rw-r--r--python/vespa/docs/sphinx/source/reference-api.rst35
-rw-r--r--python/vespa/docs/sphinx/source/requirements.txt5
-rw-r--r--python/vespa/setup.py39
-rw-r--r--python/vespa/vespa/__init__.py3
-rw-r--r--python/vespa/vespa/_nbdev.py13
-rw-r--r--python/vespa/vespa/application.py301
-rw-r--r--python/vespa/vespa/evaluation.py132
-rw-r--r--python/vespa/vespa/json_serialization.py77
-rw-r--r--python/vespa/vespa/package.py786
-rw-r--r--python/vespa/vespa/query.py229
-rw-r--r--python/vespa/vespa/templates/hosts.xml7
-rw-r--r--python/vespa/vespa/templates/schema.txt28
-rw-r--r--python/vespa/vespa/templates/services.xml16
-rw-r--r--python/vespa/vespa/test_application.py375
-rw-r--r--python/vespa/vespa/test_evaluation.py186
-rw-r--r--python/vespa/vespa/test_package.py243
-rw-r--r--python/vespa/vespa/test_query.py190
37 files changed, 0 insertions, 7381 deletions
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
deleted file mode 100644
index 5839ae90dfd..00000000000
--- a/.github/workflows/python.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: CI
-on:
- pull_request:
- push:
- branches:
- - master
-jobs:
- build:
- runs-on: ubuntu-latest
- defaults:
- run:
- working-directory: python/vespa
- steps:
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
- with:
- python-version: '3.7'
- architecture: 'x64'
- - name: Install the library
- run: |
- pip install pytest
- pip install -e .
- - name: Test with pytest
- run: |
- pytest
- - name: Build and publish
- if: github.event_name == 'push' && github.ref == 'refs/heads/master'
- env:
- TWINE_USERNAME: __token__
- TWINE_PASSWORD: ${{ secrets.test_pypi_password }}
- run: |
- python -m pip install --upgrade pip
- pip install setuptools wheel twine
- python setup.py sdist bdist_wheel
- twine upload --repository testpypi dist/*
diff --git a/python/vespa/.gitignore b/python/vespa/.gitignore
deleted file mode 100644
index 9ca09886342..00000000000
--- a/python/vespa/.gitignore
+++ /dev/null
@@ -1,141 +0,0 @@
-*.bak
-.gitattributes
-.last_checked
-.gitconfig
-*.bak
-*.log
-*~
-~*
-_tmp*
-tmp*
-tags
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# dotenv
-.env
-
-# virtualenv
-.venv
-venv/
-ENV/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-
-.vscode
-*.swp
-
-# osx generated files
-.DS_Store
-.DS_Store?
-.Trashes
-ehthumbs.db
-Thumbs.db
-.idea
-
-# pytest
-.pytest_cache
-
-# tools/trust-doc-nbs
-docs_src/.last_checked
-
-# symlinks to fastai
-docs_src/fastai
-tools/fastai
-
-# link checker
-checklink/cookies.txt
-
-# .gitconfig is now autogenerated
-.gitconfig
-
diff --git a/python/vespa/MANIFEST.in b/python/vespa/MANIFEST.in
deleted file mode 100644
index 8f9127e0494..00000000000
--- a/python/vespa/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-include README.md
-recursive-exclude * __pycache__
diff --git a/python/vespa/Pipfile b/python/vespa/Pipfile
deleted file mode 100644
index b723d0199f8..00000000000
--- a/python/vespa/Pipfile
+++ /dev/null
@@ -1,11 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-
-[requires]
-python_version = "3.7"
diff --git a/python/vespa/README.md b/python/vespa/README.md
deleted file mode 100644
index 00d8cc2e769..00000000000
--- a/python/vespa/README.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Vespa library for data analysis
-> Provide data analysis support for Vespa applications
-
-
-## Install
-
-`pip install pyvespa`
-
-## Connect to a Vespa app
-
-> Connect to a running Vespa application
-
-```
-from vespa.application import Vespa
-
-app = Vespa(url = "https://api.cord19.vespa.ai")
-```
-
-## Define a Query model
-
-> Easily define matching and ranking criteria
-
-```
-from vespa.query import Query, Union, WeakAnd, ANN, RankProfile
-from random import random
-
-match_phase = Union(
- WeakAnd(hits = 10),
- ANN(
- doc_vector="title_embedding",
- query_vector="title_vector",
- embedding_model=lambda x: [random() for x in range(768)],
- hits = 10,
- label="title"
- )
-)
-
-rank_profile = RankProfile(name="bm25", list_features=True)
-
-query_model = Query(match_phase=match_phase, rank_profile=rank_profile)
-```
-
-## Query the vespa app
-
-> Send queries via the query API. See the [query page](/vespa/query) for more examples.
-
-```
-query_result = app.query(
- query="Is remdesivir an effective treatment for COVID-19?",
- query_model=query_model
-)
-```
-
-```
-query_result.number_documents_retrieved
-```
-
-## Labelled data
-
-> How to structure labelled data
-
-```
-labelled_data = [
- {
- "query_id": 0,
- "query": "Intrauterine virus infections and congenital heart disease",
- "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}]
- },
- {
- "query_id": 1,
- "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
- "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}]
- }
-]
-```
-
-Non-relevant documents are assigned `"score": 0` by default. Relevant documents will be assigned `"score": 1` by default if the field is missing from the labelled data. The defaults for both relevant and non-relevant documents can be modified on the appropriate methods.
-
-## Collect training data
-
-> Collect training data to analyse and/or improve ranking functions. See the [collect training data page](/vespa/collect_training_data) for more examples.
-
-```
-training_data_batch = app.collect_training_data(
- labelled_data = labelled_data,
- id_field = "id",
- query_model = query_model,
- number_additional_docs = 2
-)
-training_data_batch
-```
-
-## Evaluating a query model
-
-> Define metrics and evaluate query models. See the [evaluation page](/vespa/evaluation) for more examples.
-
-We will define the following evaluation metrics:
-* % of documents retrieved per query
-* recall @ 10 per query
-* MRR @ 10 per query
-
-```
-from vespa.evaluation import MatchRatio, Recall, ReciprocalRank
-
-eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
-```
-
-Evaluate:
-
-```
-evaluation = app.evaluate(
- labelled_data = labelled_data,
- eval_metrics = eval_metrics,
- query_model = query_model,
- id_field = "id",
-)
-evaluation
-```
diff --git a/python/vespa/docs/sphinx/Makefile b/python/vespa/docs/sphinx/Makefile
deleted file mode 100644
index d0c3cbf1020..00000000000
--- a/python/vespa/docs/sphinx/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = source
-BUILDDIR = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/python/vespa/docs/sphinx/source/application-package.ipynb b/python/vespa/docs/sphinx/source/application-package.ipynb
deleted file mode 100644
index 5042148b040..00000000000
--- a/python/vespa/docs/sphinx/source/application-package.ipynb
+++ /dev/null
@@ -1,139 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create Vespa application packages\n",
- "\n",
- "> Python API to create application packages"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Our goal is to create the following `msmarco` schema using our python API, based on our [text search tutorial](https://docs.vespa.ai/documentation/tutorials/text-search.html)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "```\n",
- "schema msmarco {\n",
- " document msmarco {\n",
- " field id type string {\n",
- " indexing: attribute | summary\n",
- " }\n",
- " field title type string {\n",
- " indexing: index | summary\n",
- " index: enable-bm25\n",
- " }\n",
- " field body type string {\n",
- " indexing: index | summary\n",
- " index: enable-bm25\n",
- " }\n",
- " }\n",
- "\n",
- " fieldset default {\n",
- " fields: title, body\n",
- " }\n",
- "\n",
- " rank-profile default {\n",
- " first-phase {\n",
- " expression: nativeRank(title, body)\n",
- " }\n",
- " }\n",
- "\n",
- " rank-profile bm25 inherits default {\n",
- " first-phase {\n",
- " expression: bm25(title) + bm25(body)\n",
- " }\n",
- " }\n",
- "\n",
- "}\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Schema API"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import Document, Field, Schema, FieldSet, RankProfile, ApplicationPackage\n",
- "\n",
- "document = Document(\n",
- " fields=[\n",
- " Field(name = \"id\", type = \"string\", indexing = [\"attribute\", \"summary\"]),\n",
- " Field(name = \"title\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\"),\n",
- " Field(name = \"body\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\") \n",
- " ]\n",
- ")\n",
- "\n",
- "msmarco_schema = Schema(\n",
- " name = \"msmarco\", \n",
- " document = document, \n",
- " fieldsets = [FieldSet(name = \"default\", fields = [\"title\", \"body\"])],\n",
- " rank_profiles = [RankProfile(name = \"default\", first_phase = \"nativeRank(title, body)\")]\n",
- ")\n",
- "\n",
- "app_package = ApplicationPackage(name = \"msmarco\", schema=msmarco_schema)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Modify the application package"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can add a new rank profile:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "app_package.schema.add_rank_profile(\n",
- " RankProfile(name = \"bm25\", inherits = \"default\", first_phase = \"bm25(title) + bm25(body)\")\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/collect-training-data.ipynb b/python/vespa/docs/sphinx/source/collect-training-data.ipynb
deleted file mode 100644
index 1584b58e6b1..00000000000
--- a/python/vespa/docs/sphinx/source/collect-training-data.ipynb
+++ /dev/null
@@ -1,1232 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Collect training data from Vespa applications\n",
- "\n",
- "> Collect training data to analyse and/or improve ranking functions"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Example setup"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Connect to the application and define a query model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.application import Vespa\n",
- "from vespa.query import Query, RankProfile, OR\n",
- "\n",
- "app = Vespa(url = \"https://api.cord19.vespa.ai\")\n",
- "query_model = Query(\n",
- " match_phase = OR(),\n",
- " rank_profile = RankProfile(name=\"bm25\", list_features=True))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Define some labelled data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "labelled_data = [\n",
- " {\n",
- " \"query_id\": 0, \n",
- " \"query\": \"Intrauterine virus infections and congenital heart disease\",\n",
- " \"relevant_docs\": [{\"id\": 0, \"score\": 1}, {\"id\": 3, \"score\": 1}]\n",
- " },\n",
- " {\n",
- " \"query_id\": 1, \n",
- " \"query\": \"Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus\",\n",
- " \"relevant_docs\": [{\"id\": 1, \"score\": 1}, {\"id\": 5, \"score\": 1}]\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Collect training data in batch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>attributeMatch(authors.first)</th>\n",
- " <th>attributeMatch(authors.first).averageWeight</th>\n",
- " <th>attributeMatch(authors.first).completeness</th>\n",
- " <th>attributeMatch(authors.first).fieldCompleteness</th>\n",
- " <th>attributeMatch(authors.first).importance</th>\n",
- " <th>attributeMatch(authors.first).matches</th>\n",
- " <th>attributeMatch(authors.first).maxWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeightedWeight</th>\n",
- " <th>attributeMatch(authors.first).queryCompleteness</th>\n",
- " <th>...</th>\n",
- " <th>textSimilarity(results).queryCoverage</th>\n",
- " <th>textSimilarity(results).score</th>\n",
- " <th>textSimilarity(title).fieldCoverage</th>\n",
- " <th>textSimilarity(title).order</th>\n",
- " <th>textSimilarity(title).proximity</th>\n",
- " <th>textSimilarity(title).queryCoverage</th>\n",
- " <th>textSimilarity(title).score</th>\n",
- " <th>document_id</th>\n",
- " <th>query_id</th>\n",
- " <th>relevant</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.062500</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.055357</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.437500</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.224554</td>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>6</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.111111</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.047222</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>9</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.041667</td>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>10</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>11</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>12 rows × 984 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " attributeMatch(authors.first) \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).averageWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).completeness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).fieldCompleteness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).importance \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).matches \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).maxWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeightedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).queryCompleteness ... \\\n",
- "0 0.0 ... \n",
- "1 0.0 ... \n",
- "2 0.0 ... \n",
- "3 0.0 ... \n",
- "4 0.0 ... \n",
- "5 0.0 ... \n",
- "6 0.0 ... \n",
- "7 0.0 ... \n",
- "8 0.0 ... \n",
- "9 0.0 ... \n",
- "10 0.0 ... \n",
- "11 0.0 ... \n",
- "\n",
- " textSimilarity(results).queryCoverage textSimilarity(results).score \\\n",
- "0 0.0 0.0 \n",
- "1 0.0 0.0 \n",
- "2 0.0 0.0 \n",
- "3 0.0 0.0 \n",
- "4 0.0 0.0 \n",
- "5 0.0 0.0 \n",
- "6 0.0 0.0 \n",
- "7 0.0 0.0 \n",
- "8 0.0 0.0 \n",
- "9 0.0 0.0 \n",
- "10 0.0 0.0 \n",
- "11 0.0 0.0 \n",
- "\n",
- " textSimilarity(title).fieldCoverage textSimilarity(title).order \\\n",
- "0 0.062500 0.000000 \n",
- "1 1.000000 1.000000 \n",
- "2 0.285714 0.666667 \n",
- "3 0.142857 0.000000 \n",
- "4 1.000000 1.000000 \n",
- "5 0.285714 0.666667 \n",
- "6 0.111111 0.000000 \n",
- "7 1.000000 1.000000 \n",
- "8 0.187500 1.000000 \n",
- "9 0.083333 0.000000 \n",
- "10 1.000000 1.000000 \n",
- "11 0.187500 1.000000 \n",
- "\n",
- " textSimilarity(title).proximity textSimilarity(title).queryCoverage \\\n",
- "0 0.000000 0.142857 \n",
- "1 1.000000 1.000000 \n",
- "2 0.739583 0.571429 \n",
- "3 0.437500 0.142857 \n",
- "4 1.000000 1.000000 \n",
- "5 0.739583 0.571429 \n",
- "6 0.000000 0.083333 \n",
- "7 1.000000 1.000000 \n",
- "8 1.000000 0.250000 \n",
- "9 0.000000 0.083333 \n",
- "10 1.000000 1.000000 \n",
- "11 1.000000 0.250000 \n",
- "\n",
- " textSimilarity(title).score document_id query_id relevant \n",
- "0 0.055357 0 0 1 \n",
- "1 1.000000 213690 0 0 \n",
- "2 0.587426 225739 0 0 \n",
- "3 0.224554 3 0 1 \n",
- "4 1.000000 213690 0 0 \n",
- "5 0.587426 225739 0 0 \n",
- "6 0.047222 1 1 1 \n",
- "7 1.000000 176163 1 0 \n",
- "8 0.612500 13597 1 0 \n",
- "9 0.041667 5 1 1 \n",
- "10 1.000000 176163 1 0 \n",
- "11 0.612500 13597 1 0 \n",
- "\n",
- "[12 rows x 984 columns]"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "training_data_batch = app.collect_training_data(\n",
- " labelled_data = labelled_data,\n",
- " id_field = \"id\",\n",
- " query_model = query_model,\n",
- " number_additional_docs = 2\n",
- ")\n",
- "training_data_batch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Collect training data point\n",
- "\n",
- "> You can have finer control with the `collect_training_data_point` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>attributeMatch(authors.first)</th>\n",
- " <th>attributeMatch(authors.first).averageWeight</th>\n",
- " <th>attributeMatch(authors.first).completeness</th>\n",
- " <th>attributeMatch(authors.first).fieldCompleteness</th>\n",
- " <th>attributeMatch(authors.first).importance</th>\n",
- " <th>attributeMatch(authors.first).matches</th>\n",
- " <th>attributeMatch(authors.first).maxWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeightedWeight</th>\n",
- " <th>attributeMatch(authors.first).queryCompleteness</th>\n",
- " <th>...</th>\n",
- " <th>textSimilarity(results).queryCoverage</th>\n",
- " <th>textSimilarity(results).score</th>\n",
- " <th>textSimilarity(title).fieldCoverage</th>\n",
- " <th>textSimilarity(title).order</th>\n",
- " <th>textSimilarity(title).proximity</th>\n",
- " <th>textSimilarity(title).queryCoverage</th>\n",
- " <th>textSimilarity(title).score</th>\n",
- " <th>document_id</th>\n",
- " <th>query_id</th>\n",
- " <th>relevant</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.062500</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.055357</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.437500</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.224554</td>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>6</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.111111</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.047222</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>9</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.041667</td>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>10</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>11</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>12 rows × 984 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " attributeMatch(authors.first) \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).averageWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).completeness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).fieldCompleteness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).importance \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).matches \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).maxWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeightedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).queryCompleteness ... \\\n",
- "0 0.0 ... \n",
- "1 0.0 ... \n",
- "2 0.0 ... \n",
- "3 0.0 ... \n",
- "4 0.0 ... \n",
- "5 0.0 ... \n",
- "6 0.0 ... \n",
- "7 0.0 ... \n",
- "8 0.0 ... \n",
- "9 0.0 ... \n",
- "10 0.0 ... \n",
- "11 0.0 ... \n",
- "\n",
- " textSimilarity(results).queryCoverage textSimilarity(results).score \\\n",
- "0 0.0 0.0 \n",
- "1 0.0 0.0 \n",
- "2 0.0 0.0 \n",
- "3 0.0 0.0 \n",
- "4 0.0 0.0 \n",
- "5 0.0 0.0 \n",
- "6 0.0 0.0 \n",
- "7 0.0 0.0 \n",
- "8 0.0 0.0 \n",
- "9 0.0 0.0 \n",
- "10 0.0 0.0 \n",
- "11 0.0 0.0 \n",
- "\n",
- " textSimilarity(title).fieldCoverage textSimilarity(title).order \\\n",
- "0 0.062500 0.000000 \n",
- "1 1.000000 1.000000 \n",
- "2 0.285714 0.666667 \n",
- "3 0.142857 0.000000 \n",
- "4 1.000000 1.000000 \n",
- "5 0.285714 0.666667 \n",
- "6 0.111111 0.000000 \n",
- "7 1.000000 1.000000 \n",
- "8 0.187500 1.000000 \n",
- "9 0.083333 0.000000 \n",
- "10 1.000000 1.000000 \n",
- "11 0.187500 1.000000 \n",
- "\n",
- " textSimilarity(title).proximity textSimilarity(title).queryCoverage \\\n",
- "0 0.000000 0.142857 \n",
- "1 1.000000 1.000000 \n",
- "2 0.739583 0.571429 \n",
- "3 0.437500 0.142857 \n",
- "4 1.000000 1.000000 \n",
- "5 0.739583 0.571429 \n",
- "6 0.000000 0.083333 \n",
- "7 1.000000 1.000000 \n",
- "8 1.000000 0.250000 \n",
- "9 0.000000 0.083333 \n",
- "10 1.000000 1.000000 \n",
- "11 1.000000 0.250000 \n",
- "\n",
- " textSimilarity(title).score document_id query_id relevant \n",
- "0 0.055357 0 0 1 \n",
- "1 1.000000 213690 0 0 \n",
- "2 0.587426 225739 0 0 \n",
- "3 0.224554 3 0 1 \n",
- "4 1.000000 213690 0 0 \n",
- "5 0.587426 225739 0 0 \n",
- "6 0.047222 1 1 1 \n",
- "7 1.000000 176163 1 0 \n",
- "8 0.612500 13597 1 0 \n",
- "9 0.041667 5 1 1 \n",
- "10 1.000000 176163 1 0 \n",
- "11 0.612500 13597 1 0 \n",
- "\n",
- "[12 rows x 984 columns]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from pandas import concat, DataFrame\n",
- "\n",
- "\n",
- "training_data = []\n",
- "for query_data in labelled_data:\n",
- " for doc_data in query_data[\"relevant_docs\"]:\n",
- " training_data_point = app.collect_training_data_point(\n",
- " query = query_data[\"query\"],\n",
- " query_id = query_data[\"query_id\"],\n",
- " relevant_id = doc_data[\"id\"],\n",
- " id_field = \"id\",\n",
- " query_model = query_model,\n",
- " number_additional_docs = 2\n",
- " )\n",
- " training_data.extend(training_data_point)\n",
- "training_data = DataFrame.from_records(training_data)\n",
- "training_data"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/conf.py b/python/vespa/docs/sphinx/source/conf.py
deleted file mode 100644
index a3784794c04..00000000000
--- a/python/vespa/docs/sphinx/source/conf.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath("../../.."))
-
-
-# -- Project information -----------------------------------------------------
-
-project = "pyvespa"
-copyright = "Verizon Media 2020 Licensed under Apache License 2.0"
-author = "Vespa team"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = ["sphinx.ext.autodoc", "nbsphinx", "sphinx_rtd_theme"]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-master_doc = "index"
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = "sphinx_rtd_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-html_sidebars = {
- "**": [
- "about.html",
- "navigation.html",
- "relations.html", # needs 'show_related': True theme option to display
- "searchbox.html",
- "donate.html",
- ]
-}
diff --git a/python/vespa/docs/sphinx/source/connect-to-vespa-instance.ipynb b/python/vespa/docs/sphinx/source/connect-to-vespa-instance.ipynb
deleted file mode 100644
index 62c2eb8163b..00000000000
--- a/python/vespa/docs/sphinx/source/connect-to-vespa-instance.ipynb
+++ /dev/null
@@ -1,980 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# How to connect with running Vespa instances\n",
- "\n",
- "> Connect and interact with CORD-19 search app.\n",
- "\n",
- "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vespa-engine/vespa/blob/master/python/vespa/docs/sphinx/source/connect-to-vespa-instance.ipynb)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This self-contained tutorial will show you how to connect to a pre-existing Vespa instance. We will use the https://cord19.vespa.ai/ app as an example. You can run this tutorial yourself in Google Colab by clicking on the badge located at the top of the tutorial."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Install"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The library is available at PyPI and therefore can be installed with `pip`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install pyvespa"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Connect to a running Vespa application"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can connect to a running Vespa application by creating an instance of [Vespa](reference-api.rst#vespa.application.Vespa) with the appropriate url. The resulting `app` will then be used to communicate with the application."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.application import Vespa\n",
- "\n",
- "app = Vespa(url = \"https://api.cord19.vespa.ai\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define a Query model\n",
- "\n",
- "> Easily define matching and ranking criteria"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When building a search application, we usually want to experiment with different query models. A [Query](reference-api.rst#vespa.query.Query) model consists of a match phase and a ranking phase. The matching phase will define how to match documents based on the query sent and the ranking phase will define how to rank the matched documents. Both phases can get quite complex and being able to easily express and experiment with them is very valuable."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the example below we define the match phase to be the [Union](reference-api.rst#vespa.query.Union) of the [WeakAnd](reference-api.rst#vespa.query.WeakAnd) and the [ANN](reference-api.rst#vespa.query.ANN) operators. The `WeakAnd` will match documents based on query terms while the Approximate Nearest Neighbor (`ANN`) operator will match documents based on the distance between the query and document embeddings. This is an illustration of how easy it is to combine term and semantic matching in Vespa. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.query import Union, WeakAnd, ANN\n",
- "from random import random\n",
- "\n",
- "match_phase = Union(\n",
- " WeakAnd(hits = 10), \n",
- " ANN(\n",
- " doc_vector=\"title_embedding\", \n",
- " query_vector=\"title_vector\", \n",
- " embedding_model=lambda x: [random() for x in range(768)],\n",
- " hits = 10,\n",
- " label=\"title\"\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We then define the ranking to be done by the `bm25` rank-profile that is already defined in the application package. We set `list_features=True` to be able to collect ranking-features later in this tutorial. After defining the `match_phase` and the `rank_profile` we can instantiate the `Query` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.query import Query, RankProfile\n",
- "\n",
- "rank_profile = RankProfile(name=\"bm25\", list_features=True)\n",
- "\n",
- "query_model = Query(match_phase=match_phase, rank_profile=rank_profile)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Query the vespa app\n",
- "\n",
- "> Send queries via the query API. See the [query page](query.ipynb) for more examples."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can use the `query_model` that we just defined to issue queries to the application via the `query` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "query_result = app.query(\n",
- " query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
- " query_model=query_model\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can see the number of documents that were retrieved by Vespa:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1046"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "query_result.number_documents_retrieved"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And the number of documents that were returned to us:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "10"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(query_result.hits)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Labelled data\n",
- "\n",
- "> How to structure labelled data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We often need to either evaluate query models or to collect data to improve query models through ML. In both cases we usually need labelled data. Let's create some labelled data to illustrate their expected format and their usage in the library."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Each data point contains a `query_id`, a `query` and `relevant_docs` associated with the query."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "labelled_data = [\n",
- " {\n",
- " \"query_id\": 0, \n",
- " \"query\": \"Intrauterine virus infections and congenital heart disease\",\n",
- " \"relevant_docs\": [{\"id\": 0, \"score\": 1}, {\"id\": 3, \"score\": 1}]\n",
- " },\n",
- " {\n",
- " \"query_id\": 1, \n",
- " \"query\": \"Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus\",\n",
- " \"relevant_docs\": [{\"id\": 1, \"score\": 1}, {\"id\": 5, \"score\": 1}]\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Non-relevant documents are assigned `\"score\": 0` by default. Relevant documents will be assigned `\"score\": 1` by default if the field is missing from the labelled data. The defaults for both relevant and non-relevant documents can be modified on the appropriate methods."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Collect training data\n",
- "\n",
- "> Collect training data to analyse and/or improve ranking functions. See the [collect training data page](collect-training-data.ipynb) for more examples."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can collect training data with the [collect_training_data](reference-api.rst#vespa.application.Vespa.collect_training_data) method according to a specific [Query](reference-api.rst#vespa.query.Query) model. Below we will collect two documents for each query in addition to the relevant ones."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>attributeMatch(authors.first)</th>\n",
- " <th>attributeMatch(authors.first).averageWeight</th>\n",
- " <th>attributeMatch(authors.first).completeness</th>\n",
- " <th>attributeMatch(authors.first).fieldCompleteness</th>\n",
- " <th>attributeMatch(authors.first).importance</th>\n",
- " <th>attributeMatch(authors.first).matches</th>\n",
- " <th>attributeMatch(authors.first).maxWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeight</th>\n",
- " <th>attributeMatch(authors.first).normalizedWeightedWeight</th>\n",
- " <th>attributeMatch(authors.first).queryCompleteness</th>\n",
- " <th>...</th>\n",
- " <th>textSimilarity(results).queryCoverage</th>\n",
- " <th>textSimilarity(results).score</th>\n",
- " <th>textSimilarity(title).fieldCoverage</th>\n",
- " <th>textSimilarity(title).order</th>\n",
- " <th>textSimilarity(title).proximity</th>\n",
- " <th>textSimilarity(title).queryCoverage</th>\n",
- " <th>textSimilarity(title).score</th>\n",
- " <th>document_id</th>\n",
- " <th>query_id</th>\n",
- " <th>relevant</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.062500</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.055357</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.437500</td>\n",
- " <td>0.142857</td>\n",
- " <td>0.224554</td>\n",
- " <td>3</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>213690</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.285714</td>\n",
- " <td>0.666667</td>\n",
- " <td>0.739583</td>\n",
- " <td>0.571429</td>\n",
- " <td>0.587426</td>\n",
- " <td>225739</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>6</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.111111</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.047222</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>9</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.083333</td>\n",
- " <td>0.041667</td>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>10</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>176163</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>11</th>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>...</td>\n",
- " <td>0.0</td>\n",
- " <td>0.0</td>\n",
- " <td>0.187500</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.250000</td>\n",
- " <td>0.612500</td>\n",
- " <td>13597</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>12 rows × 984 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " attributeMatch(authors.first) \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).averageWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).completeness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).fieldCompleteness \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).importance \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).matches \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).maxWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).normalizedWeightedWeight \\\n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "5 0.0 \n",
- "6 0.0 \n",
- "7 0.0 \n",
- "8 0.0 \n",
- "9 0.0 \n",
- "10 0.0 \n",
- "11 0.0 \n",
- "\n",
- " attributeMatch(authors.first).queryCompleteness ... \\\n",
- "0 0.0 ... \n",
- "1 0.0 ... \n",
- "2 0.0 ... \n",
- "3 0.0 ... \n",
- "4 0.0 ... \n",
- "5 0.0 ... \n",
- "6 0.0 ... \n",
- "7 0.0 ... \n",
- "8 0.0 ... \n",
- "9 0.0 ... \n",
- "10 0.0 ... \n",
- "11 0.0 ... \n",
- "\n",
- " textSimilarity(results).queryCoverage textSimilarity(results).score \\\n",
- "0 0.0 0.0 \n",
- "1 0.0 0.0 \n",
- "2 0.0 0.0 \n",
- "3 0.0 0.0 \n",
- "4 0.0 0.0 \n",
- "5 0.0 0.0 \n",
- "6 0.0 0.0 \n",
- "7 0.0 0.0 \n",
- "8 0.0 0.0 \n",
- "9 0.0 0.0 \n",
- "10 0.0 0.0 \n",
- "11 0.0 0.0 \n",
- "\n",
- " textSimilarity(title).fieldCoverage textSimilarity(title).order \\\n",
- "0 0.062500 0.000000 \n",
- "1 1.000000 1.000000 \n",
- "2 0.285714 0.666667 \n",
- "3 0.142857 0.000000 \n",
- "4 1.000000 1.000000 \n",
- "5 0.285714 0.666667 \n",
- "6 0.111111 0.000000 \n",
- "7 1.000000 1.000000 \n",
- "8 0.187500 1.000000 \n",
- "9 0.083333 0.000000 \n",
- "10 1.000000 1.000000 \n",
- "11 0.187500 1.000000 \n",
- "\n",
- " textSimilarity(title).proximity textSimilarity(title).queryCoverage \\\n",
- "0 0.000000 0.142857 \n",
- "1 1.000000 1.000000 \n",
- "2 0.739583 0.571429 \n",
- "3 0.437500 0.142857 \n",
- "4 1.000000 1.000000 \n",
- "5 0.739583 0.571429 \n",
- "6 0.000000 0.083333 \n",
- "7 1.000000 1.000000 \n",
- "8 1.000000 0.250000 \n",
- "9 0.000000 0.083333 \n",
- "10 1.000000 1.000000 \n",
- "11 1.000000 0.250000 \n",
- "\n",
- " textSimilarity(title).score document_id query_id relevant \n",
- "0 0.055357 0 0 1 \n",
- "1 1.000000 213690 0 0 \n",
- "2 0.587426 225739 0 0 \n",
- "3 0.224554 3 0 1 \n",
- "4 1.000000 213690 0 0 \n",
- "5 0.587426 225739 0 0 \n",
- "6 0.047222 1 1 1 \n",
- "7 1.000000 176163 1 0 \n",
- "8 0.612500 13597 1 0 \n",
- "9 0.041667 5 1 1 \n",
- "10 1.000000 176163 1 0 \n",
- "11 0.612500 13597 1 0 \n",
- "\n",
- "[12 rows x 984 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "training_data_batch = app.collect_training_data(\n",
- " labelled_data = labelled_data,\n",
- " id_field = \"id\",\n",
- " query_model = query_model,\n",
- " number_additional_docs = 2\n",
- ")\n",
- "training_data_batch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Evaluating a query model\n",
- "\n",
- "> Define metrics and evaluate query models. See the [evaluation page](evaluation.ipynb) for more examples."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We will define the following evaluation metrics:\n",
- "* % of documents retrieved per query\n",
- "* recall @ 10 per query\n",
- "* MRR @ 10 per query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.evaluation import MatchRatio, Recall, ReciprocalRank\n",
- "\n",
- "eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Evaluate:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>query_id</th>\n",
- " <th>match_ratio_retrieved_docs</th>\n",
- " <th>match_ratio_docs_available</th>\n",
- " <th>match_ratio_value</th>\n",
- " <th>recall_10_value</th>\n",
- " <th>reciprocal_rank_10_value</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0</td>\n",
- " <td>1254</td>\n",
- " <td>233281</td>\n",
- " <td>0.005375</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>1</td>\n",
- " <td>1003</td>\n",
- " <td>233281</td>\n",
- " <td>0.004300</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " query_id match_ratio_retrieved_docs match_ratio_docs_available \\\n",
- "0 0 1254 233281 \n",
- "1 1 1003 233281 \n",
- "\n",
- " match_ratio_value recall_10_value reciprocal_rank_10_value \n",
- "0 0.005375 0.0 0 \n",
- "1 0.004300 0.0 0 "
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "evaluation = app.evaluate(\n",
- " labelled_data = labelled_data,\n",
- " eval_metrics = eval_metrics, \n",
- " query_model = query_model, \n",
- " id_field = \"id\",\n",
- ")\n",
- "evaluation"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/create-and-deploy-vespa-cloud.ipynb b/python/vespa/docs/sphinx/source/create-and-deploy-vespa-cloud.ipynb
deleted file mode 100644
index 4f94ce2d7c4..00000000000
--- a/python/vespa/docs/sphinx/source/create-and-deploy-vespa-cloud.ipynb
+++ /dev/null
@@ -1,993 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Build end-to-end Vespa apps and deploy to Vespa Cloud\n",
- "\n",
- "> Python API to create, modify, deploy and interact with Vespa applications\n",
- "\n",
- "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vespa-engine/vespa/blob/master/python/vespa/docs/sphinx/source/create-and-deploy-vespa-cloud.ipynb)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This self-contained tutorial will create a simplified text search application from scratch based on the MS MARCO dataset, similar to our [text search tutorials](https://docs.vespa.ai/documentation/tutorials/text-search.html). We will then deploy the app to [Vespa Cloud](https://cloud.vespa.ai/) and interact with it by feeding data, querying and evaluating different query models."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Application package API"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We first create a `Document` instance containing the `Field`s that we want to store in the app. In this case we will keep the application simple and only feed a unique `id`, `title` and `body` of the MS MARCO documents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import Document, Field\n",
- "\n",
- "document = Document(\n",
- " fields=[\n",
- " Field(name = \"id\", type = \"string\", indexing = [\"attribute\", \"summary\"]),\n",
- " Field(name = \"title\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\"),\n",
- " Field(name = \"body\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\") \n",
- " ]\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The complete `Schema` of our application will be named `msmarco` and contains the `Document` instance that we defined above. The default `FieldSet` indicates that queries will look for matches by searching both in the titles and bodies of the documents. The default `RankProfile` indicates that all the matched documents will be ranked by the `nativeRank` expression involving the title and the body of the matched documents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import Schema, FieldSet, RankProfile\n",
- "\n",
- "msmarco_schema = Schema(\n",
- " name = \"msmarco\", \n",
- " document = document, \n",
- " fieldsets = [FieldSet(name = \"default\", fields = [\"title\", \"body\"])],\n",
- " rank_profiles = [RankProfile(name = \"default\", first_phase = \"nativeRank(title, body)\")]\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once the `Schema` is defined, all we have to do is to create our msmarco `ApplicationPackage`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import ApplicationPackage\n",
- "\n",
- "app_package = ApplicationPackage(name = \"msmarco\", schema=msmarco_schema)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "At this point, `app_package` contains all the relevant information required to create our MS MARCO text search app. We now need to deploy it."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Deploy to Vespa Cloud"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To be able to deploy to [Vespa Cloud](https://cloud.vespa.ai/), you need to sign-up, register an application name on the Vespa Cloud console and generate your user API key."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We first create a `VespaCloud` instance that will handle the secure communication with Vespa Cloud servers. In order to do that, all we need is your Vespa Cloud tenant name, the application name that you registered, the user key you generated on the Vespa Cloud console and the application package that we created above."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import VespaCloud\n",
- "\n",
- "vespa_cloud = VespaCloud(\n",
- " tenant=\"vespa-team\", \n",
- " application=\"ms-marco\", \n",
- " key_location=\"/Users/username/sample_application/username.vespa-team.pem\", \n",
- " application_package=app_package\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We then deploy the application to a particular instance (named `from-notebook` in this case) and specify a folder location necessary to store required files such as certificates to allow for secure data exchange between the client and the VespaCloud servers."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Note:** It takes around 15 min to call `vespa_cloud.deploy` for the first time, as Vespa Cloud will have to set up the environment. Subsequent calls will be much faster, usually taking less than 10 seconds."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "app = vespa_cloud.deploy(\n",
- " instance='from-notebook', \n",
- " disk_folder=\"/Users/username/sample_application\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `app` variable above will hold a `Vespa` instance that will be used to connect and interact with our text search application throughout this tutorial."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Feed data to the app "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We now have our text search app up and running. We can start to feed data to it. We have pre-processed and sampled some MS MARCO data to use in this tutorial. We can load 996 documents that we want to feed and check the first two documents in this sample."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(996, 3)"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from pandas import read_csv\n",
- "\n",
- "docs = read_csv(\"https://thigm85.github.io/data/msmarco/docs.tsv\", sep = \"\\t\")\n",
- "docs.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>id</th>\n",
- " <th>title</th>\n",
- " <th>body</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>D2185715</td>\n",
- " <td>What Is an Appropriate Gift for a Bris</td>\n",
- " <td>Hub Pages Religion and Philosophy Judaism...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>D2819479</td>\n",
- " <td>lunge</td>\n",
- " <td>1lungenoun ˈlənj Popularity Bottom 40 of...</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " id title \\\n",
- "0 D2185715 What Is an Appropriate Gift for a Bris \n",
- "1 D2819479 lunge \n",
- "\n",
- " body \n",
- "0 Hub Pages Religion and Philosophy Judaism... \n",
- "1 1lungenoun ˈlənj Popularity Bottom 40 of... "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs.head(2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To feed the data we need to specify the `schema` that we are sending data to. We named our schema `msmarco` in a previous section. Each data point needs to have a unique `data_id` associated with it, independent of having an id field or not. The `fields` should be a dict containing all the fields in the schema, which are `id`, `title` and `body` in our case. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "for idx, row in docs.iterrows():\n",
- " response = app.feed_data_point(\n",
- " schema = \"msmarco\", \n",
- " data_id = str(row[\"id\"]), \n",
- " fields = {\n",
- " \"id\": str(row[\"id\"]), \n",
- " \"title\": str(row[\"title\"]), \n",
- " \"body\": str(row[\"body\"])\n",
- " }\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Make a simple query"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once our application is fed we can start sending queries to it. The MS MARCO app expects to receive questions as queries and the goal of the application is to return documents that are relevant to the questions made."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the example below, we will send a question via the `query` parameter. In addition, we need to specify how we want the documents to be matched and ranked. We do this by specifying a `Query` model. The query model below will have the `OR` operator in the match phase, indicating that the application will match all the documents which have at least one query term within the title or the body (due to the default `FieldSet` we defined earlier) of the document. And we will rank all the matched documents by the default `RankProfile` that we defined earlier."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.query import Query, OR, RankProfile as Ranking\n",
- "\n",
- "results = app.query(\n",
- " query=\"Where is my text?\", \n",
- " query_model = Query(\n",
- " match_phase=OR(), \n",
- " rank_profile=Ranking(name=\"default\")\n",
- " ),\n",
- " hits = 2\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In addition to the `query` and `query_model` parameters, we can specify a multitude of relevant Vespa parameters such as the number of `hits` that we want Vespa to return. We chose `hits=2` for simplicity in this tutorial."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(results.hits)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Change the application package and redeploy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can also make specific changes to our application by changing the application package and redeploying. Let's add a new rank profile based on BM25 to our `Schema`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "app_package.schema.add_rank_profile(\n",
- " RankProfile(name = \"bm25\", inherits = \"default\", first_phase = \"bm25(title) + bm25(body)\")\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "After that we can redeploy our application, similar to what we did earlier:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "app = vespa_cloud.deploy('from-notebook', \"/Users/username/sample_application\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can then use the newly created `bm25` rank profile to make queries:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "results = app.query(\n",
- " query=\"Where is my text?\", \n",
- " query_model = Query(\n",
- " match_phase=OR(), \n",
- " rank_profile=Ranking(name=\"bm25\")\n",
- " ),\n",
- " hits = 2\n",
- ")\n",
- "len(results.hits)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Compare query models"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When we are building a search application, we often want to experiment and compare different query models. In this section we want to show how easy it is to compare different query models in Vespa."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's load some labelled data where each data point contains a `query_id`, a `query` and a list of `relevant_docs` associated with the query. In this case, we have only one relevant document for each query."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests, json\n",
- "\n",
- "labelled_data = json.loads(\n",
- " requests.get(\"https://thigm85.github.io/data/msmarco/query-labels.json\").text\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Below we can see two examples of the labelled data:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'query_id': '1',\n",
- " 'query': 'what county is aspen co',\n",
- " 'relevant_docs': [{'id': 'D1098819'}]},\n",
- " {'query_id': '2',\n",
- " 'query': 'where is aeropostale located',\n",
- " 'relevant_docs': [{'id': 'D2268823'}]}]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "labelled_data[0:2]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's define two `Query` models to be compared. We are going to use the same `OR` operator in the match phase and compare the `default` and `bm25` rank profiles."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "default_ranking = Query(\n",
- " match_phase=OR(), \n",
- " rank_profile=Ranking(name=\"default\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "bm25_ranking = Query(\n",
- " match_phase=OR(), \n",
- " rank_profile=Ranking(name=\"bm25\")\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we will choose which evaluation metrics we want to look at. In this case we will choose the `MatchRatio` to check how many documents have been matched by the query, the `Recall` at 10 and the `ReciprocalRank` at 10."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.evaluation import MatchRatio, Recall, ReciprocalRank\n",
- "\n",
- "eval_metrics = [MatchRatio(), Recall(at = 10), ReciprocalRank(at = 10)]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can now run the `evaluate` method for each `Query` model. This will make queries to the application and process the results to compute the `eval_metrics` defined above."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "default_evaluation = app.evaluate(\n",
- " labelled_data=labelled_data, \n",
- " eval_metrics=eval_metrics, \n",
- " query_model=default_ranking, \n",
- " id_field=\"id\",\n",
- " timeout=5,\n",
- " hits=10\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "bm25_evaluation = app.evaluate(\n",
- " labelled_data=labelled_data, \n",
- " eval_metrics=eval_metrics, \n",
- " query_model=bm25_ranking, \n",
- " id_field=\"id\",\n",
- " timeout=5,\n",
- " hits=10\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can then merge the DataFrames returned by the `evaluate` method and start to analyse the results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>query_id</th>\n",
- " <th>match_ratio_retrieved_docs_default</th>\n",
- " <th>match_ratio_docs_available_default</th>\n",
- " <th>match_ratio_value_default</th>\n",
- " <th>recall_10_value_default</th>\n",
- " <th>reciprocal_rank_10_value_default</th>\n",
- " <th>match_ratio_retrieved_docs_bm25</th>\n",
- " <th>match_ratio_docs_available_bm25</th>\n",
- " <th>match_ratio_value_bm25</th>\n",
- " <th>recall_10_value_bm25</th>\n",
- " <th>reciprocal_rank_10_value_bm25</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>1</td>\n",
- " <td>914</td>\n",
- " <td>997</td>\n",
- " <td>0.916750</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000</td>\n",
- " <td>914</td>\n",
- " <td>997</td>\n",
- " <td>0.916750</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>2</td>\n",
- " <td>896</td>\n",
- " <td>997</td>\n",
- " <td>0.898696</td>\n",
- " <td>1.0</td>\n",
- " <td>0.125</td>\n",
- " <td>896</td>\n",
- " <td>997</td>\n",
- " <td>0.898696</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>3</td>\n",
- " <td>970</td>\n",
- " <td>997</td>\n",
- " <td>0.972919</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000</td>\n",
- " <td>970</td>\n",
- " <td>997</td>\n",
- " <td>0.972919</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>4</td>\n",
- " <td>981</td>\n",
- " <td>997</td>\n",
- " <td>0.983952</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000</td>\n",
- " <td>981</td>\n",
- " <td>997</td>\n",
- " <td>0.983952</td>\n",
- " <td>1.0</td>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>5</td>\n",
- " <td>748</td>\n",
- " <td>997</td>\n",
- " <td>0.750251</td>\n",
- " <td>1.0</td>\n",
- " <td>0.500</td>\n",
- " <td>748</td>\n",
- " <td>997</td>\n",
- " <td>0.750251</td>\n",
- " <td>1.0</td>\n",
- " <td>0.333333</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " query_id match_ratio_retrieved_docs_default \\\n",
- "0 1 914 \n",
- "1 2 896 \n",
- "2 3 970 \n",
- "3 4 981 \n",
- "4 5 748 \n",
- "\n",
- " match_ratio_docs_available_default match_ratio_value_default \\\n",
- "0 997 0.916750 \n",
- "1 997 0.898696 \n",
- "2 997 0.972919 \n",
- "3 997 0.983952 \n",
- "4 997 0.750251 \n",
- "\n",
- " recall_10_value_default reciprocal_rank_10_value_default \\\n",
- "0 1.0 1.000 \n",
- "1 1.0 0.125 \n",
- "2 1.0 1.000 \n",
- "3 1.0 1.000 \n",
- "4 1.0 0.500 \n",
- "\n",
- " match_ratio_retrieved_docs_bm25 match_ratio_docs_available_bm25 \\\n",
- "0 914 997 \n",
- "1 896 997 \n",
- "2 970 997 \n",
- "3 981 997 \n",
- "4 748 997 \n",
- "\n",
- " match_ratio_value_bm25 recall_10_value_bm25 reciprocal_rank_10_value_bm25 \n",
- "0 0.916750 1.0 1.000000 \n",
- "1 0.898696 1.0 1.000000 \n",
- "2 0.972919 1.0 1.000000 \n",
- "3 0.983952 1.0 1.000000 \n",
- "4 0.750251 1.0 0.333333 "
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from pandas import merge\n",
- "\n",
- "eval_comparison = merge(\n",
- " left=default_evaluation, \n",
- " right=bm25_evaluation, \n",
- " on=\"query_id\", \n",
- " suffixes=('_default', '_bm25')\n",
- ")\n",
- "eval_comparison.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice that we expect to observe the same match ratio for both query models since they use the same `OR` operator."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>match_ratio_value_default</th>\n",
- " <th>match_ratio_value_bm25</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>0.866650</td>\n",
- " <td>0.866650</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>0.181307</td>\n",
- " <td>0.181307</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " match_ratio_value_default match_ratio_value_bm25\n",
- "mean 0.866650 0.866650\n",
- "std 0.181307 0.181307"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "eval_comparison[[\"match_ratio_value_default\", \"match_ratio_value_bm25\"]].describe().loc[[\"mean\", \"std\"]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `bm25` rank profile obtained a significantly higher recall than the `default`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>recall_10_value_default</th>\n",
- " <th>recall_10_value_bm25</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>0.840000</td>\n",
- " <td>0.960000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>0.368453</td>\n",
- " <td>0.196946</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " recall_10_value_default recall_10_value_bm25\n",
- "mean 0.840000 0.960000\n",
- "std 0.368453 0.196946"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "eval_comparison[[\"recall_10_value_default\", \"recall_10_value_bm25\"]].describe().loc[[\"mean\", \"std\"]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Similarly, `bm25` also gets a significantly higher reciprocal rank value when compared to the `default` rank profile."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>reciprocal_rank_10_value_default</th>\n",
- " <th>reciprocal_rank_10_value_bm25</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>0.724750</td>\n",
- " <td>0.943333</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>0.399118</td>\n",
- " <td>0.216103</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " reciprocal_rank_10_value_default reciprocal_rank_10_value_bm25\n",
- "mean 0.724750 0.943333\n",
- "std 0.399118 0.216103"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "eval_comparison[[\"reciprocal_rank_10_value_default\", \"reciprocal_rank_10_value_bm25\"]].describe().loc[[\"mean\", \"std\"]]"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/create-and-deploy-vespa-docker.ipynb b/python/vespa/docs/sphinx/source/create-and-deploy-vespa-docker.ipynb
deleted file mode 100644
index 13e74aa173d..00000000000
--- a/python/vespa/docs/sphinx/source/create-and-deploy-vespa-docker.ipynb
+++ /dev/null
@@ -1,214 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Deploy Vespa apps to Docker\n",
- "\n",
- "> Python API to deploy Vespa applications to Docker containers."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This tutorial illustrates how to deploy a Vespa application to a Docker container on your local machine. It is required to have Docker installed on the machine you are running this tutorial from. For that reason we cannot run this tutorial in Google Colab as Docker is not available on their standard runtime machines."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Application package API"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We first create a `Document` instance containing the `Field`s that we want to store in the app. In this case we will keep the application simple and only feed a unique `id`, `title` and `body` of the MS MARCO documents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import Document, Field\n",
- "\n",
- "document = Document(\n",
- " fields=[\n",
- " Field(name = \"id\", type = \"string\", indexing = [\"attribute\", \"summary\"]),\n",
- " Field(name = \"title\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\"),\n",
- " Field(name = \"body\", type = \"string\", indexing = [\"index\", \"summary\"], index = \"enable-bm25\") \n",
- " ]\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The complete `Schema` of our application will be named `msmarco` and contains the `Document` instance that we defined above. The default `FieldSet` indicates that queries will look for matches by searching both in the titles and bodies of the documents. The default `RankProfile` indicates that all the matched documents will be ranked by the `nativeRank` expression involving the title and the body of the matched documents."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import Schema, FieldSet, RankProfile\n",
- "\n",
- "msmarco_schema = Schema(\n",
- " name = \"msmarco\", \n",
- " document = document, \n",
- " fieldsets = [FieldSet(name = \"default\", fields = [\"title\", \"body\"])],\n",
- " rank_profiles = [RankProfile(name = \"default\", first_phase = \"nativeRank(title, body)\")]\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once the `Schema` is defined, all we have to do is to create our msmarco `ApplicationPackage`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import ApplicationPackage\n",
- "\n",
- "app_package = ApplicationPackage(name = \"msmarco\", schema=msmarco_schema)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "At this point, `app_package` contains all the relevant information required to create our MS MARCO text search app. We now need to deploy it."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Deploy to a Docker container"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For the following to work you need to run this from a machine with Docker installed. We first create a `VespaDocker` instance based on the application package."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.package import VespaDocker\n",
- "\n",
- "vespa_docker = VespaDocker(application_package=app_package)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We then call the `deploy` method and specify a `disk_folder`. Behind the scenes, `pyvespa` will write the Vespa config files and store them in the `disk_folder`; it will then run a Vespa engine Docker container and deploy those config files in the container."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "app = vespa_docker.deploy(disk_folder=\"/Users/username/sample_application\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The app variable above will hold a `Vespa` instance that will be used to connect and interact with our text search application. We can see the deployment message returned by the Vespa engine:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[\"Uploading application '/app/application' using http://localhost:19071/application/v2/tenant/default/session\",\n",
- " \"Session 18 for tenant 'default' created.\",\n",
- " 'Preparing session 18 using http://localhost:19071/application/v2/tenant/default/session/18/prepared',\n",
- " \"WARNING: Host named 'msmarco' may not receive any config since it is not a canonical hostname. Disregard this warning when testing in a Docker container.\",\n",
- " \"Session 18 for tenant 'default' prepared.\",\n",
- " 'Activating session 18 using http://localhost:19071/application/v2/tenant/default/session/18/active',\n",
- " \"Session 18 for tenant 'default' activated.\",\n",
- " 'Checksum: 09203c16fa5f582b712711bb98932812',\n",
- " 'Timestamp: 1598011224920',\n",
- " 'Generation: 18',\n",
- " '']"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "app.deployment_message"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Interact with the deployed application"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "From this point on you can interact with the deployed application the same way we did in the following tutorials:\n",
- "* [How to connect with running Vespa instances](connect-to-vespa-instance.ipynb)\n",
- "* [Build end-to-end Vespa apps and deploy to Vespa Cloud](create-and-deploy-vespa-cloud.ipynb)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "vespa",
- "language": "python",
- "name": "vespa"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/deploy-application.ipynb b/python/vespa/docs/sphinx/source/deploy-application.ipynb
deleted file mode 100644
index 9bef636f889..00000000000
--- a/python/vespa/docs/sphinx/source/deploy-application.ipynb
+++ /dev/null
@@ -1,41 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Deploy Vespa applications\n",
- "\n",
- "> Python API to deploy application packages to Vespa Cloud or to Docker containers"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "TBD"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/python/vespa/docs/sphinx/source/evaluation.ipynb b/python/vespa/docs/sphinx/source/evaluation.ipynb
deleted file mode 100644
index 54026cd5544..00000000000
--- a/python/vespa/docs/sphinx/source/evaluation.ipynb
+++ /dev/null
@@ -1,297 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Evaluate query models\n",
- "\n",
- "> Define metrics and evaluate query models"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Example setup"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Connect to the application and define a query model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.application import Vespa\n",
- "from vespa.query import Query, RankProfile, OR\n",
- "\n",
- "app = Vespa(url = \"https://api.cord19.vespa.ai\")\n",
- "query_model = Query(\n",
- " match_phase = OR(),\n",
- " rank_profile = RankProfile(name=\"bm25\", list_features=True))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Define some labelled data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "labelled_data = [\n",
- " {\n",
- " \"query_id\": 0, \n",
- " \"query\": \"Intrauterine virus infections and congenital heart disease\",\n",
- " \"relevant_docs\": [{\"id\": 0, \"score\": 1}, {\"id\": 3, \"score\": 1}]\n",
- " },\n",
- " {\n",
- " \"query_id\": 1, \n",
- " \"query\": \"Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus\",\n",
- " \"relevant_docs\": [{\"id\": 1, \"score\": 1}, {\"id\": 5, \"score\": 1}]\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define metrics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.evaluation import MatchRatio, Recall, ReciprocalRank\n",
- "\n",
- "eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Evaluate in batch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>query_id</th>\n",
- " <th>match_ratio_retrieved_docs</th>\n",
- " <th>match_ratio_docs_available</th>\n",
- " <th>match_ratio_value</th>\n",
- " <th>recall_10_value</th>\n",
- " <th>reciprocal_rank_10_value</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0</td>\n",
- " <td>189800</td>\n",
- " <td>233281</td>\n",
- " <td>0.813611</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>1</td>\n",
- " <td>207809</td>\n",
- " <td>233281</td>\n",
- " <td>0.890810</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " query_id match_ratio_retrieved_docs match_ratio_docs_available \\\n",
- "0 0 189800 233281 \n",
- "1 1 207809 233281 \n",
- "\n",
- " match_ratio_value recall_10_value reciprocal_rank_10_value \n",
- "0 0.813611 0.0 0 \n",
- "1 0.890810 0.0 0 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "evaluation = app.evaluate(\n",
- " labelled_data = labelled_data,\n",
- " eval_metrics = eval_metrics, \n",
- " query_model = query_model, \n",
- " id_field = \"id\",\n",
- ")\n",
- "evaluation"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Evaluate specific query\n",
- "\n",
- "> You can have finer control with the `evaluate_query` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>query_id</th>\n",
- " <th>match_ratio_retrieved_docs</th>\n",
- " <th>match_ratio_docs_available</th>\n",
- " <th>match_ratio_value</th>\n",
- " <th>recall_10_value</th>\n",
- " <th>reciprocal_rank_10_value</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0</td>\n",
- " <td>189800</td>\n",
- " <td>233281</td>\n",
- " <td>0.813611</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>1</td>\n",
- " <td>207809</td>\n",
- " <td>233281</td>\n",
- " <td>0.890810</td>\n",
- " <td>0.0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " query_id match_ratio_retrieved_docs match_ratio_docs_available \\\n",
- "0 0 189800 233281 \n",
- "1 1 207809 233281 \n",
- "\n",
- " match_ratio_value recall_10_value reciprocal_rank_10_value \n",
- "0 0.813611 0.0 0 \n",
- "1 0.890810 0.0 0 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from pandas import concat, DataFrame\n",
- "\n",
- "evaluation = []\n",
- "for query_data in labelled_data:\n",
- " query_evaluation = app.evaluate_query(\n",
- " eval_metrics = eval_metrics, \n",
- " query_model = query_model, \n",
- " query_id = query_data[\"query_id\"], \n",
- " query = query_data[\"query\"], \n",
- " id_field = \"id\",\n",
- " relevant_docs = query_data[\"relevant_docs\"],\n",
- " default_score = 0\n",
- " )\n",
- " evaluation.append(query_evaluation)\n",
- "evaluation = DataFrame.from_records(evaluation)\n",
- "evaluation"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/howto.rst b/python/vespa/docs/sphinx/source/howto.rst
deleted file mode 100644
index 680e5f1575c..00000000000
--- a/python/vespa/docs/sphinx/source/howto.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-How-to guides
-=============
-
-.. toctree::
- :hidden:
-
- application-package
- deploy-application
- query-model
- query
- evaluation
- collect-training-data
diff --git a/python/vespa/docs/sphinx/source/index.rst b/python/vespa/docs/sphinx/source/index.rst
deleted file mode 100644
index 03ef20c133e..00000000000
--- a/python/vespa/docs/sphinx/source/index.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-.. pyvespa documentation master file, created by
- sphinx-quickstart on Wed Aug 26 11:11:55 2020.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Vespa python API
-================
-
-.. toctree::
- :hidden:
-
- install
- quickstart
- howto
- reference-api
-
-Vespa_ is the scalable open-sourced serving engine that enables us to store, compute and rank big data at user
-serving time. ``pyvespa`` provides a python API to Vespa. It allows us to create, modify, deploy and interact with
-running Vespa instances. The main goal of the library is to allow for faster prototyping and to facilitate
-Machine Learning experiments for Vespa applications.
-
-.. _Vespa: https://vespa.ai/
-
-
-Install
-+++++++
-
-You can install ``pyvespa`` via ``pip``:
-
-.. code:: bash
-
- pip install pyvespa
-
-Quick-start
-+++++++++++
-
-The best way to get started is by following the tutorials below. You can easily run them yourself on Google Colab
-by clicking on the badge at the top of the tutorial.
-
-
-- :doc:`connect-to-vespa-instance`
-- :doc:`create-and-deploy-vespa-cloud`
-- :doc:`create-and-deploy-vespa-docker`
-
-
-How-to guides
-+++++++++++++
-
-- :doc:`application-package`
-- :doc:`deploy-application`
-- :doc:`query-model`
-- :doc:`query`
-- :doc:`evaluation`
-- :doc:`collect-training-data`
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/python/vespa/docs/sphinx/source/install.rst b/python/vespa/docs/sphinx/source/install.rst
deleted file mode 100644
index 7cf6b16eaa5..00000000000
--- a/python/vespa/docs/sphinx/source/install.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Install pyvespa
-===============
-
-To install ``pyvespa`` type
-
-.. code:: bash
-
- pip install pyvespa
diff --git a/python/vespa/docs/sphinx/source/query-model.ipynb b/python/vespa/docs/sphinx/source/query-model.ipynb
deleted file mode 100644
index bd2e73601dc..00000000000
--- a/python/vespa/docs/sphinx/source/query-model.ipynb
+++ /dev/null
@@ -1,41 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Query models\n",
- "\n",
- "> Python API define query models"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "TBD"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/python/vespa/docs/sphinx/source/query.ipynb b/python/vespa/docs/sphinx/source/query.ipynb
deleted file mode 100644
index ec1d5e3ec01..00000000000
--- a/python/vespa/docs/sphinx/source/query.ipynb
+++ /dev/null
@@ -1,297 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Query Vespa applications\n",
- "\n",
- "> Python API to query Vespa applications"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can connect to the CORD-19 Search app and use it to exemplify the query API"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.application import Vespa\n",
- "\n",
- "app = Vespa(url = \"https://api.cord19.vespa.ai\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Specify the request body\n",
- "\n",
- "> Full flexibility by specifying the entire request body"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "body = {\n",
- " 'yql': 'select title, abstract from sources * where userQuery();',\n",
- " 'hits': 5,\n",
- " 'query': 'Is remdesivir an effective treatment for COVID-19?',\n",
- " 'type': 'any',\n",
- " 'ranking': 'bm25'\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "results = app.query(body=body)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "202768"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "results.number_documents_retrieved"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Specify a query model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Query + term-matching + rank profile"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.query import Query, OR, RankProfile\n",
- "\n",
- "results = app.query(\n",
- " query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
- " query_model = Query(\n",
- " match_phase=OR(), \n",
- " rank_profile=RankProfile(name=\"bm25\")\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "202768"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "results.number_documents_retrieved"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Query + term-matching + ann operator + rank_profile"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "from vespa.query import Query, ANN, WeakAnd, Union, RankProfile\n",
- "from random import random\n",
- "\n",
- "match_phase = Union(\n",
- " WeakAnd(hits = 10), \n",
- " ANN(\n",
- " doc_vector=\"title_embedding\", \n",
- " query_vector=\"title_vector\", \n",
- " embedding_model=lambda x: [random() for x in range(768)],\n",
- " hits = 10,\n",
- " label=\"title\"\n",
- " )\n",
- ")\n",
- "rank_profile = RankProfile(name=\"bm25\", list_features=True)\n",
- "query_model = Query(match_phase=match_phase, rank_profile=rank_profile)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "results = app.query(query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
- " query_model=query_model)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1049"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "results.number_documents_retrieved"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Recall specific documents"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's take a look at the top 3 ids from the last query."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[198698, 120155, 120154]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "top_ids = [hit[\"fields\"][\"id\"] for hit in results.hits[0:3]]\n",
- "top_ids"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Assume that we now want to retrieve the second and third ids above. We can do so with the `recall` argument."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "results_with_recall = app.query(query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
- " query_model=query_model,\n",
- " recall = (\"id\", top_ids[1:3]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It will only retrieve the documents with Vespa field `id` that is defined on the list that is inside the tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[120155, 120154]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "id_recalled = [hit[\"fields\"][\"id\"] for hit in results_with_recall.hits]\n",
- "id_recalled"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/vespa/docs/sphinx/source/quickstart.rst b/python/vespa/docs/sphinx/source/quickstart.rst
deleted file mode 100644
index e740b89b7f9..00000000000
--- a/python/vespa/docs/sphinx/source/quickstart.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Quick-start
-===========
-
-The best way to get started is by following the tutorials below. You can easily run them yourself on Google Colab
-by clicking on the badge at the top of the tutorial.
-
-.. toctree::
-
- connect-to-vespa-instance
- create-and-deploy-vespa-cloud
- create-and-deploy-vespa-docker
diff --git a/python/vespa/docs/sphinx/source/reference-api.rst b/python/vespa/docs/sphinx/source/reference-api.rst
deleted file mode 100644
index a8bd94b9e9f..00000000000
--- a/python/vespa/docs/sphinx/source/reference-api.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-Reference API
-=============
-
-vespa.application module
-------------------------
-
-.. automodule:: vespa.application
- :members:
- :undoc-members:
- :show-inheritance:
- :special-members: __init__
-
-vespa.evaluation module
------------------------
-
-.. automodule:: vespa.evaluation
- :members:
- :undoc-members:
- :show-inheritance:
-
-vespa.package module
---------------------
-
-.. automodule:: vespa.package
- :members:
- :undoc-members:
- :show-inheritance:
-
-vespa.query module
-------------------
-
-.. automodule:: vespa.query
- :members:
- :undoc-members:
- :show-inheritance:
diff --git a/python/vespa/docs/sphinx/source/requirements.txt b/python/vespa/docs/sphinx/source/requirements.txt
deleted file mode 100644
index 261dd62a660..00000000000
--- a/python/vespa/docs/sphinx/source/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-python/vespa
-sphinx>=1.4
-ipykernel
-nbsphinx
-sphinx-rtd-theme \ No newline at end of file
diff --git a/python/vespa/setup.py b/python/vespa/setup.py
deleted file mode 100644
index 8a1bcd0a6b6..00000000000
--- a/python/vespa/setup.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import setuptools
-
-
-def get_target_version():
- build_nr = os.environ.get("GITHUB_RUN_NUMBER", "0+dev")
- version = "0.1"
- return "{}.{}".format(version, build_nr)
-
-
-min_python = "3.6"
-
-setuptools.setup(
- name="pyvespa",
- version=get_target_version(),
- description="Python API for vespa.ai",
- keywords="vespa, search engine, data science",
- author="Thiago G. Martins",
- author_email="tmartins@verizonmedia.com",
- license=(
- "Apache Software License 2.0",
- "OSI Approved :: Apache Software License",
- ),
- packages=setuptools.find_packages(),
- include_package_data=True,
- install_requires=["requests", "pandas", "docker", "jinja2", "cryptography"],
- python_requires=">=3.6",
- zip_safe=False,
- data_files=[
- (
- "templates",
- [
- "vespa/templates/hosts.xml",
- "vespa/templates/services.xml",
- "vespa/templates/schema.txt",
- ],
- )
- ],
-)
diff --git a/python/vespa/vespa/__init__.py b/python/vespa/vespa/__init__.py
deleted file mode 100644
index b506a040722..00000000000
--- a/python/vespa/vespa/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-__version__ = "0.0.1"
diff --git a/python/vespa/vespa/_nbdev.py b/python/vespa/vespa/_nbdev.py
deleted file mode 100644
index b68d7b2f4bc..00000000000
--- a/python/vespa/vespa/_nbdev.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# AUTOGENERATED BY NBDEV! DO NOT EDIT!
-
-__all__ = ["index", "modules", "custom_doc_links", "git_url"]
-
-index = {}
-
-modules = []
-
-doc_url = "https://vespa-engine.github.io/vespa/"
-
-git_url = "https://github.com/vespa-engine/vespa/tree/master/"
-
-def custom_doc_links(name): return None
diff --git a/python/vespa/vespa/application.py b/python/vespa/vespa/application.py
deleted file mode 100644
index c11e56d2125..00000000000
--- a/python/vespa/vespa/application.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-from typing import Optional, Dict, Tuple, List
-from pandas import DataFrame
-from requests import post
-from requests.models import Response
-
-from vespa.query import Query, VespaResult
-from vespa.evaluation import EvalMetric
-
-
-class Vespa(object):
- def __init__(
- self,
- url: str,
- port: Optional[int] = None,
- deployment_message: Optional[List[str]] = None,
- cert: Optional[str] = None,
- ) -> None:
- """
- Establish a connection with a Vespa application.
-
- :param url: Vespa instance URL.
- :param port: Vespa instance port.
- :param deployment_message: Message returned by Vespa engine after deployment. Used internally by deploy methods.
- :param cert: Path to certificate and key file.
-
- >>> Vespa(url = "https://cord19.vespa.ai")
- >>> Vespa(url = "http://localhost", port = 8080)
- >>> Vespa(url = "https://api.vespa-external.aws.oath.cloud", port = 4443, cert = "/path/to/cert-and-key.pem")
-
- """
- self.url = url
- self.port = port
- self.deployment_message = deployment_message
- self.cert = cert
-
- if port is None:
- self.end_point = self.url
- else:
- self.end_point = str(url).rstrip("/") + ":" + str(port)
- self.search_end_point = self.end_point + "/search/"
-
- def __repr__(self):
- if self.port:
- return "Vespa({}, {})".format(self.url, self.port)
- else:
- return "Vespa({})".format(self.url)
-
- def query(
- self,
- body: Optional[Dict] = None,
- query: Optional[str] = None,
- query_model: Optional[Query] = None,
- debug_request: bool = False,
- recall: Optional[Tuple] = None,
- **kwargs
- ) -> VespaResult:
- """
- Send a query request to the Vespa application.
-
- Either send 'body' containing all the request parameters or specify 'query' and 'query_model'.
-
- :param body: Dict containing all the request parameters.
- :param query: Query string
- :param query_model: Query model
- :param debug_request: return request body for debugging instead of sending the request.
- :param recall: Tuple of size 2 where the first element is the name of the field to use to recall and the
- second element is a list of the values to be recalled.
- :param kwargs: Additional parameters to be sent along the request.
- :return: Either the request body if debug_request is True or the result from the Vespa application
- """
-
- if body is None:
- assert query is not None, "No 'query' specified."
- assert query_model is not None, "No 'query_model' specified."
- body = query_model.create_body(query=query)
- if recall is not None:
- body.update(
- {
- "recall": "+("
- + " ".join(
- ["{}:{}".format(recall[0], str(doc)) for doc in recall[1]]
- )
- + ")"
- }
- )
-
- body.update(kwargs)
-
- if debug_request:
- return VespaResult(vespa_result={}, request_body=body)
- else:
- r = post(self.search_end_point, json=body, cert=self.cert)
- return VespaResult(vespa_result=r.json())
-
- def feed_data_point(self, schema: str, data_id: str, fields: Dict) -> Response:
- """
- Feed a data point to a Vespa app.
-
- :param schema: The schema that we are sending data to.
- :param data_id: Unique id associated with this data point.
- :param fields: Dict containing all the fields required by the `schema`.
- :return: Response of the HTTP POST request.
- """
- end_point = "{}/document/v1/{}/{}/docid/{}".format(
- self.end_point, schema, schema, str(data_id)
- )
- vespa_format = {"fields": fields}
- response = post(end_point, json=vespa_format, cert=self.cert)
- return response
-
- def collect_training_data_point(
- self,
- query: str,
- query_id: str,
- relevant_id: str,
- id_field: str,
- query_model: Query,
- number_additional_docs: int,
- relevant_score: int = 1,
- default_score: int = 0,
- **kwargs
- ) -> List[Dict]:
- """
- Collect training data based on a single query
-
- :param query: Query string.
- :param query_id: Query id represented as str.
- :param relevant_id: Relevant id represented as a str.
- :param id_field: The Vespa field representing the document id.
- :param query_model: Query model.
- :param number_additional_docs: Number of additional documents to retrieve for each relevant document.
- :param relevant_score: Score to assign to relevant documents. Default to 1.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :param kwargs: Extra keyword arguments to be included in the Vespa Query.
- :return: List of dicts containing the document id (document_id), query id (query_id), scores (relevant)
- and vespa rank features returned by the Query model RankProfile used.
- """
-
- assert (
- query_model.rank_profile.list_features == "true"
- ), "Enable rank features via RankProfile is necessary."
-
- relevant_id_result = self.query(
- query=query,
- query_model=query_model,
- recall=(id_field, [relevant_id]),
- **kwargs
- )
- hits = relevant_id_result.hits
- features = []
- if len(hits) == 1 and hits[0]["fields"][id_field] == relevant_id:
- random_hits_result = self.query(
- query=query,
- query_model=query_model,
- hits=number_additional_docs,
- **kwargs
- )
- hits.extend(random_hits_result.hits)
-
- features = annotate_data(
- hits=hits,
- query_id=query_id,
- id_field=id_field,
- relevant_id=relevant_id,
- relevant_score=relevant_score,
- default_score=default_score,
- )
- return features
-
- def collect_training_data(
- self,
- labelled_data: List[Dict],
- id_field: str,
- query_model: Query,
- number_additional_docs: int,
- relevant_score: int = 1,
- default_score: int = 0,
- **kwargs
- ) -> DataFrame:
- """
- Collect training data based on a set of labelled data.
-
- :param labelled_data: Labelled data containing query, query_id and relevant ids.
- :param id_field: The Vespa field representing the document id.
- :param query_model: Query model.
- :param number_additional_docs: Number of additional documents to retrieve for each relevant document.
- :param relevant_score: Score to assign to relevant documents. Default to 1.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :param kwargs: Extra keyword arguments to be included in the Vespa Query.
- :return: DataFrame containing document id (document_id), query id (query_id), scores (relevant)
- and vespa rank features returned by the Query model RankProfile used.
- """
-
- training_data = []
- for query_data in labelled_data:
- for doc_data in query_data["relevant_docs"]:
- training_data_point = self.collect_training_data_point(
- query=query_data["query"],
- query_id=query_data["query_id"],
- relevant_id=doc_data["id"],
- id_field=id_field,
- query_model=query_model,
- number_additional_docs=number_additional_docs,
- relevant_score=doc_data.get("score", relevant_score),
- default_score=default_score,
- **kwargs
- )
- training_data.extend(training_data_point)
- training_data = DataFrame.from_records(training_data)
- return training_data
-
- def evaluate_query(
- self,
- eval_metrics: List[EvalMetric],
- query_model: Query,
- query_id: str,
- query: str,
- id_field: str,
- relevant_docs: List[Dict],
- default_score: int = 0,
- **kwargs
- ) -> Dict:
- """
- Evaluate a query according to evaluation metrics
-
- :param eval_metrics: A list of evaluation metrics.
- :param query_model: Query model.
- :param query_id: Query id represented as str.
- :param query: Query string.
- :param id_field: The Vespa field representing the document id.
- :param relevant_docs: A list with dicts where each dict contains a doc id and optionally a doc score.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :param kwargs: Extra keyword arguments to be included in the Vespa Query.
- :return: Dict containing query_id and metrics according to the selected evaluation metrics.
- """
-
- query_results = self.query(query=query, query_model=query_model, **kwargs)
- evaluation = {"query_id": query_id}
- for evaluator in eval_metrics:
- evaluation.update(
- evaluator.evaluate_query(
- query_results, relevant_docs, id_field, default_score
- )
- )
- return evaluation
-
- def evaluate(
- self,
- labelled_data: List[Dict],
- eval_metrics: List[EvalMetric],
- query_model: Query,
- id_field: str,
- default_score: int = 0,
- **kwargs
- ) -> DataFrame:
- """
-
- :param labelled_data: Labelled data containing query, query_id and relevant ids.
- :param eval_metrics: A list of evaluation metrics.
- :param query_model: Query model.
- :param id_field: The Vespa field representing the document id.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :param kwargs: Extra keyword arguments to be included in the Vespa Query.
- :return: DataFrame containing query_id and metrics according to the selected evaluation metrics.
- """
- evaluation = []
- for query_data in labelled_data:
- evaluation_query = self.evaluate_query(
- eval_metrics=eval_metrics,
- query_model=query_model,
- query_id=query_data["query_id"],
- query=query_data["query"],
- id_field=id_field,
- relevant_docs=query_data["relevant_docs"],
- default_score=default_score,
- **kwargs
- )
- evaluation.append(evaluation_query)
- evaluation = DataFrame.from_records(evaluation)
- return evaluation
-
-
-# todo: a better pattern for labelled data would be (query_id, query, doc_id, score) with the possibility of
-# assigning a specific default value for those docs not mentioned
-def annotate_data(hits, query_id, id_field, relevant_id, relevant_score, default_score):
- data = []
- for h in hits:
- rank_features = h["fields"]["rankfeatures"]
- rank_features.update({"document_id": h["fields"][id_field]})
- rank_features.update({"query_id": query_id})
- rank_features.update(
- {
- "relevant": relevant_score
- if h["fields"][id_field] == relevant_id
- else default_score
- }
- )
- data.append(rank_features)
- return data
diff --git a/python/vespa/vespa/evaluation.py b/python/vespa/vespa/evaluation.py
deleted file mode 100644
index 4ca7a1d136b..00000000000
--- a/python/vespa/vespa/evaluation.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-from typing import Dict, List
-from vespa.query import VespaResult
-
-
-class EvalMetric(object):
- def __init__(self) -> None:
- pass
-
- def evaluate_query(
- self, query_results, relevant_docs, id_field, default_score
- ) -> Dict:
- raise NotImplementedError
-
-
-class MatchRatio(EvalMetric):
- def __init__(self) -> None:
- """
- Computes the ratio of documents retrieved by the match phase.
- """
- super().__init__()
- self.name = "match_ratio"
-
- def evaluate_query(
- self,
- query_results: VespaResult,
- relevant_docs: List[Dict],
- id_field: str,
- default_score: int,
- ) -> Dict:
- """
- Evaluate query results.
-
- :param query_results: Raw query results returned by Vespa.
- :param relevant_docs: A list with dicts where each dict contains a doc id a optionally a doc score.
- :param id_field: The Vespa field representing the document id.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :return: Dict containing the number of retrieved docs (_retrieved_docs), the number of docs available in
- the corpus (_docs_available) and the match ratio (_value).
- """
- retrieved_docs = query_results.number_documents_retrieved
- docs_available = query_results.number_documents_indexed
- value = 0
- if docs_available > 0:
- value = retrieved_docs / docs_available
- return {
- str(self.name) + "_retrieved_docs": retrieved_docs,
- str(self.name) + "_docs_available": docs_available,
- str(self.name) + "_value": value,
- }
-
-
-class Recall(EvalMetric):
- def __init__(self, at: int) -> None:
- """
- Compute the recall at position `at`
-
- :param at: Maximum position on the resulting list to look for relevant docs.
- """
- super().__init__()
- self.name = "recall_" + str(at)
- self.at = at
-
- def evaluate_query(
- self,
- query_results: VespaResult,
- relevant_docs: List[Dict],
- id_field: str,
- default_score: int,
- ) -> Dict:
- """
- Evaluate query results.
-
- :param query_results: Raw query results returned by Vespa.
- :param relevant_docs: A list with dicts where each dict contains a doc id a optionally a doc score.
- :param id_field: The Vespa field representing the document id.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :return: Dict containing the recall value (_value).
- """
-
- relevant_ids = {str(doc["id"]) for doc in relevant_docs}
- try:
- retrieved_ids = {
- str(hit["fields"][id_field]) for hit in query_results.hits[: self.at]
- }
- except KeyError:
- retrieved_ids = set()
-
- return {
- str(self.name)
- + "_value": len(relevant_ids & retrieved_ids) / len(relevant_ids)
- }
-
-
-class ReciprocalRank(EvalMetric):
- def __init__(self, at: int):
- """
- Compute the reciprocal rank at position `at`
-
- :param at: Maximum position on the resulting list to look for relevant docs.
- """
- super().__init__()
- self.name = "reciprocal_rank_" + str(at)
- self.at = at
-
- def evaluate_query(
- self,
- query_results: VespaResult,
- relevant_docs: List[Dict],
- id_field: str,
- default_score: int,
- ) -> Dict:
- """
- Evaluate query results.
-
- :param query_results: Raw query results returned by Vespa.
- :param relevant_docs: A list with dicts where each dict contains a doc id a optionally a doc score.
- :param id_field: The Vespa field representing the document id.
- :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
- :return: Dict containing the reciprocal rank value (_value).
- """
-
- relevant_ids = {str(doc["id"]) for doc in relevant_docs}
- rr = 0
- hits = query_results.hits[: self.at]
- for index, hit in enumerate(hits):
- if hit["fields"][id_field] in relevant_ids:
- rr = 1 / (index + 1)
- break
-
- return {str(self.name) + "_value": rr}
diff --git a/python/vespa/vespa/json_serialization.py b/python/vespa/vespa/json_serialization.py
deleted file mode 100644
index d5f059326e7..00000000000
--- a/python/vespa/vespa/json_serialization.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import datetime
-import json
-import typing
-
-T = typing.TypeVar("T")
-
-
-class ToJson(object):
- """
- Utility mix-in class for serializing an object to JSON. It does not really
- do any conversion on its own, but forces serialization into a standardized
- API.
-
- The serialized class is put into an envelope with some data to make it easier
- to understand what has happened.
-
- {
- "version": 1,
- "class": "Field",
- "serialized_at": "2018-10-24T12:55:32+00:00",
- "data": { ... }
- }
-
- * version: This value is hard-coded to 1.
- * class: The name of the class we serialized. For debugging purposes.
- * serialized_at: The time we serialized the instance of the class. For debugging purposes.
- * data: The actual data of the serialized class.
-
- All serialization is based on converting objects to a `dict` which is then converted
- to JSON using the standard Python json library.
- """
-
- @property
- def to_dict(self) -> typing.Mapping:
- raise NotImplementedError
-
- @property
- def to_envelope(self) -> typing.Mapping:
- return {
- "version": 1,
- "class": self.__class__.__name__,
- "serialized_at": datetime.datetime.utcnow().isoformat(),
- "data": self.to_dict,
- }
-
- @property
- def to_json(self) -> str:
- mapping = self.to_envelope
- return json.dumps(mapping)
-
-
-class FromJson(typing.Generic[T]):
- """
- A mix-in class for deserializing from JSON to an object that implements this class.
- All JSON must have the same envelope as ToJson to be able to properly deserialize the
- contents of the mapping.
- """
-
- deserializers: typing.MutableMapping[str, "FromJson"] = {}
-
- def __init_subclass__(cls, **kwargs):
- super().__init_subclass__(**kwargs) # type: ignore
- FromJson.deserializers[cls.__name__] = cls
-
- @staticmethod
- def from_json(json_string: str) -> T:
- mapping = json.loads(json_string)
- return FromJson.map(mapping)
-
- @staticmethod
- def map(mapping: typing.Mapping) -> T:
- mapping_class = FromJson.deserializers[mapping["class"]]
- return mapping_class.from_dict(mapping["data"])
-
- @staticmethod
- def from_dict(mapping: typing.Mapping) -> T:
- raise NotImplementedError
diff --git a/python/vespa/vespa/package.py b/python/vespa/vespa/package.py
deleted file mode 100644
index 301a9df43bd..00000000000
--- a/python/vespa/vespa/package.py
+++ /dev/null
@@ -1,786 +0,0 @@
-import sys
-import http.client
-import json
-import os
-import re
-import zipfile
-from base64 import standard_b64encode
-from datetime import datetime, timedelta
-from io import BytesIO
-from pathlib import Path
-from time import sleep, strftime, gmtime
-from typing import List, Mapping, Optional, IO
-
-import docker
-from cryptography import x509
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives.asymmetric import ec
-from cryptography.hazmat.primitives import hashes
-from cryptography.hazmat.primitives import serialization
-from jinja2 import Environment, PackageLoader, select_autoescape
-
-from vespa.json_serialization import ToJson, FromJson
-from vespa.application import Vespa
-
-
-class Field(ToJson, FromJson["Field"]):
- def __init__(
- self,
- name: str,
- type: str,
- indexing: Optional[List[str]] = None,
- index: Optional[str] = None,
- ) -> None:
- """
- Object representing a Vespa document field.
-
- :param name: Field name.
- :param type: Field data type.
- :param indexing: Configures how to process data of a field during indexing.
- :param index: Sets index parameters. Content in fields with index are normalized and tokenized by default.
- """
- self.name = name
- self.type = type
- self.indexing = indexing
- self.index = index
-
- @property
- def indexing_to_text(self) -> Optional[str]:
- if self.indexing is not None:
- return " | ".join(self.indexing)
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "Field":
- return Field(
- name=mapping["name"],
- type=mapping["type"],
- indexing=mapping.get("indexing", None),
- index=mapping.get("index", None),
- )
-
- @property
- def to_dict(self) -> Mapping:
- map = {"name": self.name, "type": self.type}
- if self.indexing is not None:
- map.update(indexing=self.indexing)
- if self.index is not None:
- map.update(index=self.index)
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return (
- self.name == other.name
- and self.type == other.type
- and self.indexing == other.indexing
- and self.index == other.index
- )
-
- def __repr__(self):
- return "{0}({1}, {2}, {3}, {4})".format(
- self.__class__.__name__,
- repr(self.name),
- repr(self.type),
- repr(self.indexing),
- repr(self.index),
- )
-
-
-class Document(ToJson, FromJson["Document"]):
- def __init__(self, fields: Optional[List[Field]] = None) -> None:
- """
- Object representing a Vespa document.
-
- """
- self.fields = [] if not fields else fields
-
- def add_fields(self, *fields: Field):
- """
- Add Fields to the document.
-
- :param fields: fields to be added
- :return:
- """
- self.fields.extend(fields)
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "Document":
- return Document(fields=[FromJson.map(field) for field in mapping.get("fields")])
-
- @property
- def to_dict(self) -> Mapping:
- map = {"fields": [field.to_envelope for field in self.fields]}
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return self.fields == other.fields
-
- def __repr__(self):
- return "{0}({1})".format(
- self.__class__.__name__, repr(self.fields) if self.fields else None
- )
-
-
-class FieldSet(ToJson, FromJson["FieldSet"]):
- def __init__(self, name: str, fields: List[str]) -> None:
- """
- A fieldset groups fields together for searching.
-
- :param name: Name of the fieldset
- :param fields: Field names to be included in the fieldset.
- """
- self.name = name
- self.fields = fields
-
- @property
- def fields_to_text(self):
- if self.fields is not None:
- return ", ".join(self.fields)
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "FieldSet":
- return FieldSet(name=mapping["name"], fields=mapping["fields"])
-
- @property
- def to_dict(self) -> Mapping:
- map = {"name": self.name, "fields": self.fields}
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return self.name == other.name and self.fields == other.fields
-
- def __repr__(self):
- return "{0}({1}, {2})".format(
- self.__class__.__name__, repr(self.name), repr(self.fields)
- )
-
-
-class RankProfile(ToJson, FromJson["RankProfile"]):
- def __init__(
- self, name: str, first_phase: str, inherits: Optional[str] = None
- ) -> None:
- """
- Define a Vespa rank profile
-
- :param name: Rank profile name.
- :param first_phase: First phase ranking expression.
- """
- self.name = name
- self.first_phase = first_phase
- self.inherits = inherits
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "RankProfile":
- return RankProfile(
- name=mapping["name"],
- first_phase=mapping["first_phase"],
- inherits=mapping.get("inherits", None),
- )
-
- @property
- def to_dict(self) -> Mapping:
- map = {"name": self.name, "first_phase": self.first_phase}
- if self.inherits is not None:
- map.update({"inherits": self.inherits})
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return (
- self.name == other.name
- and self.first_phase == other.first_phase
- and self.inherits == other.inherits
- )
-
- def __repr__(self):
- return "{0}({1}, {2}, {3})".format(
- self.__class__.__name__,
- repr(self.name),
- repr(self.first_phase),
- repr(self.inherits),
- )
-
-
-class Schema(ToJson, FromJson["Schema"]):
- def __init__(
- self,
- name: str,
- document: Document,
- fieldsets: Optional[List[FieldSet]] = None,
- rank_profiles: Optional[List[RankProfile]] = None,
- ) -> None:
- """
- Create a Vespa Schema.
-
- :param name: Schema name.
- :param document: Vespa document associated with the Schema.
- :param fieldsets: A list of `FieldSet` associated with the Schema.
- :param rank_profiles: A list of `RankProfile` associated with the Schema.
- """
- self.name = name
- self.document = document
-
- self.fieldsets = {}
- if fieldsets is not None:
- self.fieldsets = {fieldset.name: fieldset for fieldset in fieldsets}
-
- self.rank_profiles = {}
- if rank_profiles is not None:
- self.rank_profiles = {
- rank_profile.name: rank_profile for rank_profile in rank_profiles
- }
-
- def add_rank_profile(self, rank_profile: RankProfile) -> None:
- """
- Add a `RankProfile` to the `Schema`.
- :param rank_profile: `RankProfile` to be added.
- :return: None.
- """
- self.rank_profiles[rank_profile.name] = rank_profile
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "Schema":
- return Schema(
- name=mapping["name"],
- document=FromJson.map(mapping["document"]),
- fieldsets=[FromJson.map(fieldset) for fieldset in mapping["fieldsets"]],
- rank_profiles=[
- FromJson.map(rank_profile) for rank_profile in mapping["rank_profiles"]
- ],
- )
-
- @property
- def to_dict(self) -> Mapping:
- map = {
- "name": self.name,
- "document": self.document.to_envelope,
- "fieldsets": [
- self.fieldsets[name].to_envelope for name in self.fieldsets.keys()
- ],
- "rank_profiles": [
- self.rank_profiles[name].to_envelope
- for name in self.rank_profiles.keys()
- ],
- }
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return (
- self.name == other.name
- and self.document == other.document
- and self.fieldsets == other.fieldsets
- and self.rank_profiles == other.rank_profiles
- )
-
- def __repr__(self):
- return "{0}({1}, {2}, {3}, {4})".format(
- self.__class__.__name__,
- repr(self.name),
- repr(self.document),
- repr(
- [field for field in self.fieldsets.values()] if self.fieldsets else None
- ),
- repr(
- [rank_profile for rank_profile in self.rank_profiles.values()]
- if self.rank_profiles
- else None
- ),
- )
-
-
-class ApplicationPackage(ToJson, FromJson["ApplicationPackage"]):
- def __init__(self, name: str, schema: Schema) -> None:
- """
- Vespa Application Package.
-
- :param name: Application name.
- :param schema: Schema of the application.
- """
- self.name = name
- self.schema = schema
-
- @property
- def schema_to_text(self):
- env = Environment(
- loader=PackageLoader("vespa", "templates"),
- autoescape=select_autoescape(
- disabled_extensions=("txt",),
- default_for_string=True,
- default=True,
- ),
- )
- env.trim_blocks = True
- env.lstrip_blocks = True
- schema_template = env.get_template("schema.txt")
- return schema_template.render(
- schema_name=self.schema.name,
- document_name=self.schema.name,
- fields=self.schema.document.fields,
- fieldsets=self.schema.fieldsets,
- rank_profiles=self.schema.rank_profiles,
- )
-
- @property
- def hosts_to_text(self):
- env = Environment(
- loader=PackageLoader("vespa", "templates"),
- autoescape=select_autoescape(
- disabled_extensions=("txt",),
- default_for_string=True,
- default=True,
- ),
- )
- env.trim_blocks = True
- env.lstrip_blocks = True
- schema_template = env.get_template("hosts.xml")
- return schema_template.render()
-
- @property
- def services_to_text(self):
- env = Environment(
- loader=PackageLoader("vespa", "templates"),
- autoescape=select_autoescape(
- disabled_extensions=("txt",),
- default_for_string=True,
- default=True,
- ),
- )
- env.trim_blocks = True
- env.lstrip_blocks = True
- schema_template = env.get_template("services.xml")
- return schema_template.render(
- application_name=self.name,
- document_name=self.schema.name,
- )
-
- @staticmethod
- def from_dict(mapping: Mapping) -> "ApplicationPackage":
- schema = mapping.get("schema", None)
- if schema is not None:
- schema = FromJson.map(schema)
- return ApplicationPackage(name=mapping["name"], schema=schema)
-
- @property
- def to_dict(self) -> Mapping:
- map = {"name": self.name}
- if self.schema is not None:
- map.update({"schema": self.schema.to_envelope})
- return map
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- return self.name == other.name and self.schema == other.schema
-
- def __repr__(self):
- return "{0}({1}, {2})".format(
- self.__class__.__name__, repr(self.name), repr(self.schema)
- )
-
-
-class VespaDocker(object):
- def __init__(
- self,
- application_package: ApplicationPackage,
- output_file: IO = sys.stdout,
- ) -> None:
- """
- Deploy application to a Vespa container
-
- :param application_package: ApplicationPackage to be deployed.
- :param output_file: Output file to write output messages.
- """
- self.application_package = application_package
- self.container = None
- self.local_port = 8080
- self.output = output_file
-
- def _run_vespa_engine_container(self, disk_folder: str, container_memory: str):
- """
- Run a vespa container.
-
- :param disk_folder: Folder containing the application files.
- :param container_memory: Memory limit of the container
- :return:
- """
- client = docker.from_env()
- if self.container is None:
- try:
- self.container = client.containers.get(self.application_package.name)
- except docker.errors.NotFound:
- self.container = client.containers.run(
- "vespaengine/vespa",
- detach=True,
- mem_limit=container_memory,
- name=self.application_package.name,
- hostname=self.application_package.name,
- privileged=True,
- volumes={disk_folder: {"bind": "/app", "mode": "rw"}},
- ports={self.local_port: self.local_port, 19112: 19112},
- )
-
- def _check_configuration_server(self) -> bool:
- """
- Check if configuration server is running and ready for deployment
-
- :return: True if configuration server is running.
- """
- return (
- self.container is not None
- and self.container.exec_run(
- "bash -c 'curl -s --head http://localhost:19071/ApplicationStatus'"
- )
- .output.decode("utf-8")
- .split("\r\n")[0]
- == "HTTP/1.1 200 OK"
- )
-
- def _create_application_package_files(self, dir_path):
- Path(os.path.join(dir_path, "application/schemas")).mkdir(
- parents=True, exist_ok=True
- )
- with open(
- os.path.join(
- dir_path,
- "application/schemas/{}.sd".format(
- self.application_package.schema.name
- ),
- ),
- "w",
- ) as f:
- f.write(self.application_package.schema_to_text)
- with open(os.path.join(dir_path, "application/hosts.xml"), "w") as f:
- f.write(self.application_package.hosts_to_text)
- with open(os.path.join(dir_path, "application/services.xml"), "w") as f:
- f.write(self.application_package.services_to_text)
-
- def deploy(self, disk_folder: str, container_memory: str = "4G"):
- """
- Deploy the application into a Vespa container.
-
- :param disk_folder: Disk folder to save the required Vespa config files.
- :param container_memory: Docker container memory available to the application.
-
- :return: a Vespa connection instance.
- """
-
- self._create_application_package_files(dir_path=disk_folder)
-
- self._run_vespa_engine_container(
- disk_folder=disk_folder, container_memory=container_memory
- )
-
- while not self._check_configuration_server():
- print("Waiting for configuration server.", file=self.output)
- sleep(5)
-
- deployment = self.container.exec_run(
- "bash -c '/opt/vespa/bin/vespa-deploy prepare /app/application && /opt/vespa/bin/vespa-deploy activate'"
- )
-
- deployment_message = deployment.output.decode("utf-8").split("\n")
-
- if not any(re.match("Generation: [0-9]+", line) for line in deployment_message):
- raise RuntimeError(deployment_message)
-
- return Vespa(
- url="http://localhost",
- port=self.local_port,
- deployment_message=deployment_message,
- )
-
-
-class VespaCloud(object):
- def __init__(
- self,
- tenant: str,
- application: str,
- key_location: str,
- application_package: ApplicationPackage,
- output_file: IO = sys.stdout,
- ) -> None:
- """
- Deploy application to the Vespa Cloud (cloud.vespa.ai)
-
- :param tenant: Tenant name registered in the Vespa Cloud.
- :param application: Application name registered in the Vespa Cloud.
- :param key_location: Location of the private key used for signing HTTP requests to the Vespa Cloud.
- :param application_package: ApplicationPackage to be deployed.
- :param output_file: Output file to write output messages.
- """
- self.tenant = tenant
- self.application = application
- self.application_package = application_package
- self.api_key = self._read_private_key(key_location)
- self.api_public_key_bytes = standard_b64encode(
- self.api_key.public_key().public_bytes(
- serialization.Encoding.PEM,
- serialization.PublicFormat.SubjectPublicKeyInfo,
- )
- )
- self.data_key, self.data_certificate = self._create_certificate_pair()
- self.private_cert_file_name = "private_cert.txt"
- self.connection = http.client.HTTPSConnection(
- "api.vespa-external.aws.oath.cloud", 4443
- )
- self.output = output_file
-
- @staticmethod
- def _read_private_key(key_location: str) -> ec.EllipticCurvePrivateKey:
- with open(key_location, "rb") as key_data:
- key = serialization.load_pem_private_key(
- key_data.read(), None, default_backend()
- )
- if not isinstance(key, ec.EllipticCurvePrivateKey):
- raise TypeError(
- "Key at " + key_location + " must be an elliptic curve private key"
- )
- return key
-
- def _write_private_key_and_cert(
- self, key: ec.EllipticCurvePrivateKey, cert: x509.Certificate, disk_folder: str
- ) -> None:
- cert_file = os.path.join(disk_folder, self.private_cert_file_name)
- with open(cert_file, "w+") as file:
- file.write(
- key.private_bytes(
- serialization.Encoding.PEM,
- serialization.PrivateFormat.TraditionalOpenSSL,
- serialization.NoEncryption(),
- ).decode("UTF-8")
- )
- file.write(cert.public_bytes(serialization.Encoding.PEM).decode("UTF-8"))
-
- @staticmethod
- def _create_certificate_pair() -> (ec.EllipticCurvePrivateKey, x509.Certificate):
- key = ec.generate_private_key(ec.SECP384R1, default_backend())
- name = x509.Name([x509.NameAttribute(x509.NameOID.COMMON_NAME, u"localhost")])
- certificate = (
- x509.CertificateBuilder()
- .subject_name(name)
- .issuer_name(name)
- .serial_number(x509.random_serial_number())
- .not_valid_before(datetime.utcnow() - timedelta(minutes=1))
- .not_valid_after(datetime.utcnow() + timedelta(days=7))
- .public_key(key.public_key())
- .sign(key, hashes.SHA256(), default_backend())
- )
- return (key, certificate)
-
- def _request(
- self, method: str, path: str, body: BytesIO = BytesIO(), headers={}
- ) -> dict:
- digest = hashes.Hash(hashes.SHA256(), default_backend())
- body.seek(0)
- digest.update(body.read())
- content_hash = standard_b64encode(digest.finalize()).decode("UTF-8")
- timestamp = (
- datetime.utcnow().isoformat() + "Z"
- ) # Java's Instant.parse requires the neutral time zone appended
- url = "https://" + self.connection.host + ":" + str(self.connection.port) + path
-
- canonical_message = method + "\n" + url + "\n" + timestamp + "\n" + content_hash
- signature = self.api_key.sign(
- canonical_message.encode("UTF-8"), ec.ECDSA(hashes.SHA256())
- )
-
- headers = {
- "X-Timestamp": timestamp,
- "X-Content-Hash": content_hash,
- "X-Key-Id": self.tenant + ":" + self.application + ":" + "default",
- "X-Key": self.api_public_key_bytes,
- "X-Authorization": standard_b64encode(signature),
- **headers,
- }
-
- body.seek(0)
- self.connection.request(method, path, body, headers)
- with self.connection.getresponse() as response:
- parsed = json.load(response)
- if response.status != 200:
- raise RuntimeError(
- "Status code "
- + str(response.status)
- + " doing "
- + method
- + " at "
- + url
- + ":\n"
- + parsed["message"]
- )
- return parsed
-
- def _get_dev_region(self) -> str:
- return self._request("GET", "/zone/v1/environment/dev/default")["name"]
-
- def _get_endpoint(self, instance: str, region: str) -> str:
- endpoints = self._request(
- "GET",
- "/application/v4/tenant/{}/application/{}/instance/{}/environment/dev/region/{}".format(
- self.tenant, self.application, instance, region
- ),
- )["endpoints"]
- container_url = [
- endpoint["url"]
- for endpoint in endpoints
- if endpoint["cluster"]
- == "{}_container".format(self.application_package.name)
- ]
- if not container_url:
- raise RuntimeError("No endpoints found for container 'test_app_container'")
- return container_url[0]
-
- def _to_application_zip(self) -> BytesIO:
- buffer = BytesIO()
- with zipfile.ZipFile(buffer, "a") as zip_archive:
- zip_archive.writestr(
- "application/schemas/{}.sd".format(
- self.application_package.schema.name
- ),
- self.application_package.schema_to_text,
- )
- zip_archive.writestr(
- "application/services.xml", self.application_package.services_to_text
- )
- zip_archive.writestr(
- "application/security/clients.pem",
- self.data_certificate.public_bytes(serialization.Encoding.PEM),
- )
-
- return buffer
-
- def _start_deployment(self, instance: str, job: str, disk_folder: str) -> int:
- deploy_path = (
- "/application/v4/tenant/{}/application/{}/instance/{}/deploy/{}".format(
- self.tenant, self.application, instance, job
- )
- )
-
- application_zip_bytes = self._to_application_zip()
-
- self._write_private_key_and_cert(
- self.data_key, self.data_certificate, disk_folder
- )
-
- response = self._request(
- "POST",
- deploy_path,
- application_zip_bytes,
- {"Content-Type": "application/zip"},
- )
- print(response["message"], file=self.output)
- return response["run"]
-
- def _get_deployment_status(
- self, instance: str, job: str, run: int, last: int
- ) -> (str, int):
-
- update = self._request(
- "GET",
- "/application/v4/tenant/{}/application/{}/instance/{}/job/{}/run/{}?after={}".format(
- self.tenant, self.application, instance, job, run, last
- ),
- )
-
- for step, entries in update["log"].items():
- for entry in entries:
- self._print_log_entry(step, entry)
- last = update.get("lastId", last)
-
- fail_status_message = {
- "error": "Unexpected error during deployment; see log for details",
- "aborted": "Deployment was aborted, probably by a newer deployment",
- "outOfCapacity": "No capacity left in zone; please contact the Vespa team",
- "deploymentFailed": "Deployment failed; see log for details",
- "installationFailed": "Installation failed; see Vespa log for details",
- "running": "Deployment not completed",
- "endpointCertificateTimeout": "Endpoint certificate not ready in time; please contact Vespa team",
- "testFailure": "Unexpected status; tests are not run for manual deployments",
- }
-
- if update["active"]:
- return "active", last
- else:
- status = update["status"]
- if status == "success":
- return "success", last
- elif status in fail_status_message.keys():
- raise RuntimeError(fail_status_message[status])
- else:
- raise RuntimeError("Unexpected status: {}".format(status))
-
- def _follow_deployment(self, instance: str, job: str, run: int) -> None:
- last = -1
- while True:
- try:
- status, last = self._get_deployment_status(instance, job, run, last)
- except RuntimeError:
- raise
-
- if status == "active":
- sleep(1)
- elif status == "success":
- return
- else:
- raise RuntimeError("Unexpected status: {}".format(status))
-
- def _print_log_entry(self, step: str, entry: dict):
- timestamp = strftime("%H:%M:%S", gmtime(entry["at"] / 1e3))
- message = entry["message"].replace("\n", "\n" + " " * 23)
- if step != "copyVespaLogs" or entry["type"] == "error":
- print(
- "{:<7} [{}] {}".format(entry["type"].upper(), timestamp, message),
- file=self.output,
- )
-
- def deploy(self, instance: str, disk_folder: str) -> Vespa:
- """
- Deploy the given application package as the given instance in the Vespa Cloud dev environment.
-
- :param instance: Name of this instance of the application, in the Vespa Cloud.
- :param disk_folder: Disk folder to save the required Vespa config files.
-
- :return: a Vespa connection instance.
- """
- region = self._get_dev_region()
- job = "dev-" + region
- run = self._start_deployment(instance, job, disk_folder)
- self._follow_deployment(instance, job, run)
- endpoint_url = self._get_endpoint(instance=instance, region=region)
- return Vespa(
- url=endpoint_url,
- cert=os.path.join(disk_folder, self.private_cert_file_name),
- )
-
- def delete(self, instance: str):
- """
- Delete the specified instance from the dev environment in the Vespa Cloud.
- :param instance: Name of the instance to delete.
- :return:
- """
- print(
- self._request(
- "DELETE",
- "/application/v4/tenant/{}/application/{}/instance/{}/environment/dev/region/{}".format(
- self.tenant, self.application, instance, self._get_dev_region()
- ),
- )["message"],
- file=self.output,
- )
-
- def close(self):
- self.connection.close()
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.close()
diff --git a/python/vespa/vespa/query.py b/python/vespa/vespa/query.py
deleted file mode 100644
index ed67819b821..00000000000
--- a/python/vespa/vespa/query.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-from typing import Callable, List, Optional, Dict
-
-
-#
-# Match phase
-#
-class MatchFilter(object):
- """
- Abstract class for match filters.
- """
-
- def create_match_filter(self, query: str) -> str:
- """
- Create part of the YQL expression related to the filter.
-
- :param query: Query input.
- :return: Part of the YQL expression related to the filter.
- """
- raise NotImplementedError
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict:
- """
- Relevant request properties associated with the filter.
-
- :param query: Query input.
- :return: dict containing the relevant request properties associated with the filter.
- """
- raise NotImplementedError
-
-
-class AND(MatchFilter):
- def __init__(self) -> None:
- """
- Filter that match document containing all the query terms.
- """
- super().__init__()
-
- def create_match_filter(self, query: str) -> str:
- return '(userInput("{}"))'.format(query)
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict:
- return {}
-
-
-class OR(MatchFilter):
- def __init__(self) -> None:
- """
- Filter that match any document containing at least one query term.
- """
- super().__init__()
-
- def create_match_filter(self, query: str) -> str:
- return '([{{"grammar": "any"}}]userInput("{}"))'.format(query)
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict:
- return {}
-
-
-class WeakAnd(MatchFilter):
- def __init__(self, hits: int, field: str = "default") -> None:
- """
- Match documents according to the weakAND algorithm.
-
- Reference: https://docs.vespa.ai/documentation/using-wand-with-vespa.html
-
- :param hits: Lower bound on the number of hits to be retrieved.
- :param field: Which Vespa field to search.
- """
- super().__init__()
- self.hits = hits
- self.field = field
-
- def create_match_filter(self, query: str) -> str:
- query_tokens = query.split(" ")
- terms = ", ".join(
- ['{} contains "{}"'.format(self.field, token) for token in query_tokens]
- )
- return '([{{"targetNumHits": {}}}]weakAnd({}))'.format(self.hits, terms)
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict:
- return {}
-
-
-class ANN(MatchFilter):
- def __init__(
- self,
- doc_vector: str,
- query_vector: str,
- embedding_model: Callable[[str], List[float]],
- hits: int,
- label: str,
- ) -> None:
- """
- Match documents according to the nearest neighbor operator.
-
- Reference: https://docs.vespa.ai/documentation/reference/query-language-reference.html#nearestneighbor
-
- :param doc_vector: Name of the document field to be used in the distance calculation.
- :param query_vector: Name of the query field to be used in the distance calculation.
- :param embedding_model: Model that takes query str as input and return list of floats as output.
- :param hits: Lower bound on the number of hits to return.
- :param label: A label to identify this specific operator instance.
- """
- super().__init__()
- self.doc_vector = doc_vector
- self.query_vector = query_vector
- self.embedding_model = embedding_model
- self.hits = hits
- self.label = label
-
- def create_match_filter(self, query: str) -> str:
- return '([{{"targetNumHits": {}, "label": "{}"}}]nearestNeighbor({}, {}))'.format(
- self.hits, self.label, self.doc_vector, self.query_vector
- )
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict[str, str]:
- embedding_vector = self.embedding_model(query)
- return {
- "ranking.features.query({})".format(self.query_vector): str(
- embedding_vector
- )
- }
-
-
-class Union(MatchFilter):
- def __init__(self, *args: MatchFilter) -> None:
- """
- Match documents that belongs to the union of many match filters.
-
- :param args: Match filters to be taken the union of.
- """
- super().__init__()
- self.operators = args
-
- def create_match_filter(self, query: str) -> str:
- match_filters = []
- for operator in self.operators:
- match_filter = operator.create_match_filter(query=query)
- if match_filter is not None:
- match_filters.append(match_filter)
- return " or ".join(match_filters)
-
- def get_query_properties(self, query: Optional[str] = None) -> Dict[str, str]:
- query_properties = {}
- for operator in self.operators:
- query_properties.update(operator.get_query_properties(query=query))
- return query_properties
-
-
-#
-# Ranking phase
-#
-class RankProfile(object):
- def __init__(self, name: str = "default", list_features: bool = False) -> None:
- """
- Define a rank profile.
-
- :param name: Name of the rank profile as defined in a Vespa search definition.
- :param list_features: Should the ranking features be returned. Either 'true' or 'false'.
- """
- self.name = name
- self.list_features = "false"
- if list_features:
- self.list_features = "true"
-
-
-class Query(object):
- def __init__(
- self,
- match_phase: MatchFilter = AND(),
- rank_profile: RankProfile = RankProfile(),
- ) -> None:
- """
- Define a query model.
-
- :param match_phase: Define the match criteria. One of the MatchFilter options available.
- :param rank_profile: Define the rank criteria.
- """
- self.match_phase = match_phase
- self.rank_profile = rank_profile
-
- def create_body(self, query: str) -> Dict[str, str]:
- """
- Create the appropriate request body to be sent to Vespa.
-
- :param query: Query input.
- :return: dict representing the request body.
- """
-
- match_filter = self.match_phase.create_match_filter(query=query)
- query_properties = self.match_phase.get_query_properties(query=query)
-
- body = {
- "yql": "select * from sources * where {};".format(match_filter),
- "ranking": {
- "profile": self.rank_profile.name,
- "listFeatures": self.rank_profile.list_features,
- },
- }
- body.update(query_properties)
- return body
-
-
-class VespaResult(object):
- def __init__(self, vespa_result, request_body=None):
- self._vespa_result = vespa_result
- self._request_body = request_body
-
- @property
- def request_body(self) -> Optional[Dict]:
- return self._request_body
-
- @property
- def json(self) -> Dict:
- return self._vespa_result
-
- @property
- def hits(self) -> List:
- return self._vespa_result.get("root", {}).get("children", [])
-
- @property
- def number_documents_retrieved(self) -> int:
- return self._vespa_result.get("root", {}).get("fields", {}).get("totalCount", 0)
-
- @property
- def number_documents_indexed(self) -> int:
- return self._vespa_result.get("root", {}).get("coverage", {}).get("documents", 0)
diff --git a/python/vespa/vespa/templates/hosts.xml b/python/vespa/vespa/templates/hosts.xml
deleted file mode 100644
index 5c88f4c1609..00000000000
--- a/python/vespa/vespa/templates/hosts.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="utf-8" ?>
-<!-- Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
-<hosts>
- <host name="localhost">
- <alias>node1</alias>
- </host>
-</hosts> \ No newline at end of file
diff --git a/python/vespa/vespa/templates/schema.txt b/python/vespa/vespa/templates/schema.txt
deleted file mode 100644
index 0849cbbad6f..00000000000
--- a/python/vespa/vespa/templates/schema.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-schema {{ schema_name }} {
- document {{ document_name }} {
- {% for field in fields %}
- field {{ field.name }} type {{ field.type }} {
- {% if field.indexing %}
- indexing: {{ field.indexing_to_text }}
- {% endif %}
- {% if field.index %}
- index: {{ field.index }}
- {% endif %}
- }
- {% endfor %}
- }
-{% for key, value in fieldsets.items() %}
- fieldset {{ key }} {
- fields: {{ value.fields_to_text }}
- }
-{% endfor %}
-{% for key, value in rank_profiles.items() %}
- rank-profile {{ key }}{% if value.inherits %} inherits {{ value.inherits }}{% endif %} {
- {% if value.first_phase %}
- first-phase {
- expression: {{ value.first_phase }}
- }
- {% endif %}
- }
-{% endfor %}
-} \ No newline at end of file
diff --git a/python/vespa/vespa/templates/services.xml b/python/vespa/vespa/templates/services.xml
deleted file mode 100644
index c6bda296be9..00000000000
--- a/python/vespa/vespa/templates/services.xml
+++ /dev/null
@@ -1,16 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<services version="1.0">
- <container id="{{ application_name }}_container" version="1.0">
- <search></search>
- <document-api></document-api>
- </container>
- <content id="{{ application_name }}_content" version="1.0">
- <redundancy reply-after="1">1</redundancy>
- <documents>
- <document type="{{ document_name }}" mode="index"></document>
- </documents>
- <nodes>
- <node distribution-key="0" hostalias="node1"></node>
- </nodes>
- </content>
-</services> \ No newline at end of file
diff --git a/python/vespa/vespa/test_application.py b/python/vespa/vespa/test_application.py
deleted file mode 100644
index 84bd1c0a6ad..00000000000
--- a/python/vespa/vespa/test_application.py
+++ /dev/null
@@ -1,375 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import unittest
-from unittest.mock import Mock, call
-from pandas import DataFrame
-from pandas.testing import assert_frame_equal
-
-from vespa.application import Vespa
-from vespa.query import Query, OR, RankProfile, VespaResult
-
-
-class TestVespa(unittest.TestCase):
- def test_end_point(self):
- self.assertEqual(
- Vespa(url="https://cord19.vespa.ai").end_point, "https://cord19.vespa.ai"
- )
- self.assertEqual(
- Vespa(url="http://localhost", port=8080).end_point, "http://localhost:8080"
- )
- self.assertEqual(
- Vespa(url="http://localhost/", port=8080).end_point, "http://localhost:8080"
- )
-
-
-class TestVespaQuery(unittest.TestCase):
- def test_query(self):
- app = Vespa(url="http://localhost", port=8080)
-
- body = {"yql": "select * from sources * where test"}
- self.assertDictEqual(
- app.query(body=body, debug_request=True).request_body, body
- )
-
- self.assertDictEqual(
- app.query(
- query="this is a test",
- query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
- debug_request=True,
- hits=10,
- ).request_body,
- {
- "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
- "ranking": {"profile": "default", "listFeatures": "false"},
- "hits": 10,
- },
- )
-
- self.assertDictEqual(
- app.query(
- query="this is a test",
- query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
- debug_request=True,
- hits=10,
- recall=("id", [1, 5]),
- ).request_body,
- {
- "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
- "ranking": {"profile": "default", "listFeatures": "false"},
- "hits": 10,
- "recall": "+(id:1 id:5)",
- },
- )
-
-
-class TestVespaCollectData(unittest.TestCase):
- def setUp(self) -> None:
- self.app = Vespa(url="http://localhost", port=8080)
- self.raw_vespa_result_recall = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- "children": [
- {
- "id": "id:covid-19:doc::40215",
- "relevance": 30.368213170494712,
- "source": "content",
- "fields": {
- "vespa_id_field": "abc",
- "sddocname": "doc",
- "body_text": "this is a body",
- "title": "this is a title",
- "rankfeatures": {"a": 1, "b": 2},
- },
- }
- ],
- }
- }
-
- self.raw_vespa_result_additional = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- "children": [
- {
- "id": "id:covid-19:doc::40216",
- "relevance": 10,
- "source": "content",
- "fields": {
- "vespa_id_field": "def",
- "sddocname": "doc",
- "body_text": "this is a body 2",
- "title": "this is a title 2",
- "rankfeatures": {"a": 3, "b": 4},
- },
- },
- {
- "id": "id:covid-19:doc::40217",
- "relevance": 8,
- "source": "content",
- "fields": {
- "vespa_id_field": "ghi",
- "sddocname": "doc",
- "body_text": "this is a body 3",
- "title": "this is a title 3",
- "rankfeatures": {"a": 5, "b": 6},
- },
- },
- ],
- }
- }
-
- def test_disable_rank_features(self):
- with self.assertRaises(AssertionError):
- self.app.collect_training_data_point(
- query="this is a query",
- query_id="123",
- relevant_id="abc",
- id_field="vespa_id_field",
- query_model=Query(),
- number_additional_docs=2,
- )
-
- def test_collect_training_data_point(self):
-
- self.app.query = Mock(
- side_effect=[
- VespaResult(self.raw_vespa_result_recall),
- VespaResult(self.raw_vespa_result_additional),
- ]
- )
- query_model = Query(rank_profile=RankProfile(list_features=True))
- data = self.app.collect_training_data_point(
- query="this is a query",
- query_id="123",
- relevant_id="abc",
- id_field="vespa_id_field",
- query_model=query_model,
- number_additional_docs=2,
- timeout="15s",
- )
-
- self.assertEqual(self.app.query.call_count, 2)
- self.app.query.assert_has_calls(
- [
- call(
- query="this is a query",
- query_model=query_model,
- recall=("vespa_id_field", ["abc"]),
- timeout="15s",
- ),
- call(
- query="this is a query",
- query_model=query_model,
- hits=2,
- timeout="15s",
- ),
- ]
- )
- expected_data = [
- {"document_id": "abc", "query_id": "123", "relevant": 1, "a": 1, "b": 2},
- {"document_id": "def", "query_id": "123", "relevant": 0, "a": 3, "b": 4},
- {"document_id": "ghi", "query_id": "123", "relevant": 0, "a": 5, "b": 6},
- ]
- self.assertEqual(data, expected_data)
-
- def test_collect_training_data_point_0_recall_hits(self):
-
- self.raw_vespa_result_recall = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 0},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- }
- }
- self.app.query = Mock(
- side_effect=[
- VespaResult(self.raw_vespa_result_recall),
- VespaResult(self.raw_vespa_result_additional),
- ]
- )
- query_model = Query(rank_profile=RankProfile(list_features=True))
- data = self.app.collect_training_data_point(
- query="this is a query",
- query_id="123",
- relevant_id="abc",
- id_field="vespa_id_field",
- query_model=query_model,
- number_additional_docs=2,
- timeout="15s",
- )
-
- self.assertEqual(self.app.query.call_count, 1)
- self.app.query.assert_has_calls(
- [
- call(
- query="this is a query",
- query_model=query_model,
- recall=("vespa_id_field", ["abc"]),
- timeout="15s",
- ),
- ]
- )
- expected_data = []
- self.assertEqual(data, expected_data)
-
- def test_collect_training_data(self):
-
- mock_return_value = [
- {"document_id": "abc", "query_id": "123", "relevant": 1, "a": 1, "b": 2,},
- {"document_id": "def", "query_id": "123", "relevant": 0, "a": 3, "b": 4,},
- {"document_id": "ghi", "query_id": "123", "relevant": 0, "a": 5, "b": 6,},
- ]
- self.app.collect_training_data_point = Mock(return_value=mock_return_value)
- labelled_data = [
- {
- "query_id": 123,
- "query": "this is a query",
- "relevant_docs": [{"id": "abc", "score": 1}],
- }
- ]
- query_model = Query(rank_profile=RankProfile(list_features=True))
- data = self.app.collect_training_data(
- labelled_data=labelled_data,
- id_field="vespa_id_field",
- query_model=query_model,
- number_additional_docs=2,
- timeout="15s",
- )
- self.app.collect_training_data_point.assert_has_calls(
- [
- call(
- query="this is a query",
- query_id=123,
- relevant_id="abc",
- id_field="vespa_id_field",
- query_model=query_model,
- number_additional_docs=2,
- relevant_score=1,
- default_score=0,
- timeout="15s",
- )
- ]
- )
- assert_frame_equal(data, DataFrame.from_records(mock_return_value))
-
-
-class TestVespaEvaluate(unittest.TestCase):
- def setUp(self) -> None:
- self.app = Vespa(url="http://localhost", port=8080)
-
- self.labelled_data = [
- {
- "query_id": 0,
- "query": "Intrauterine virus infections and congenital heart disease",
- "relevant_docs": [{"id": "def", "score": 1}, {"id": "abc", "score": 1}],
- },
- ]
-
- self.query_results = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- "children": [
- {
- "id": "id:covid-19:doc::40216",
- "relevance": 10,
- "source": "content",
- "fields": {
- "vespa_id_field": "ghi",
- "sddocname": "doc",
- "body_text": "this is a body 2",
- "title": "this is a title 2",
- "rankfeatures": {"a": 3, "b": 4},
- },
- },
- {
- "id": "id:covid-19:doc::40217",
- "relevance": 8,
- "source": "content",
- "fields": {
- "vespa_id_field": "def",
- "sddocname": "doc",
- "body_text": "this is a body 3",
- "title": "this is a title 3",
- "rankfeatures": {"a": 5, "b": 6},
- },
- },
- ],
- }
- }
-
- def test_evaluate_query(self):
- self.app.query = Mock(return_value={})
- eval_metric = Mock()
- eval_metric.evaluate_query = Mock(return_value={"metric": 1})
- eval_metric2 = Mock()
- eval_metric2.evaluate_query = Mock(return_value={"metric_2": 2})
- query_model = Query()
- evaluation = self.app.evaluate_query(
- eval_metrics=[eval_metric, eval_metric2],
- query_model=query_model,
- query_id="0",
- query="this is a test",
- id_field="vespa_id_field",
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- default_score=0,
- hits=10,
- )
- self.assertEqual(self.app.query.call_count, 1)
- self.app.query.assert_has_calls(
- [call(query="this is a test", query_model=query_model, hits=10),]
- )
- self.assertEqual(eval_metric.evaluate_query.call_count, 1)
- eval_metric.evaluate_query.assert_has_calls(
- [call({}, self.labelled_data[0]["relevant_docs"], "vespa_id_field", 0),]
- )
- self.assertDictEqual(evaluation, {"query_id": "0", "metric": 1, "metric_2": 2})
-
- def test_evaluate(self):
- self.app.evaluate_query = Mock(side_effect=[{"query_id": "0", "metric": 1},])
- evaluation = self.app.evaluate(
- labelled_data=self.labelled_data,
- eval_metrics=[Mock()],
- query_model=Mock(),
- id_field="mock",
- default_score=0,
- )
- assert_frame_equal(
- evaluation, DataFrame.from_records([{"query_id": "0", "metric": 1}])
- )
diff --git a/python/vespa/vespa/test_evaluation.py b/python/vespa/vespa/test_evaluation.py
deleted file mode 100644
index b6941985d94..00000000000
--- a/python/vespa/vespa/test_evaluation.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import unittest
-
-from vespa.query import VespaResult
-from vespa.evaluation import MatchRatio, Recall, ReciprocalRank
-
-
-class TestEvalMetric(unittest.TestCase):
- def setUp(self) -> None:
- self.labelled_data = [
- {
- "query_id": 0,
- "query": "Intrauterine virus infections and congenital heart disease",
- "relevant_docs": [{"id": "def", "score": 1}, {"id": "abc", "score": 1}],
- },
- ]
-
- self.query_results = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- "children": [
- {
- "id": "id:covid-19:doc::40216",
- "relevance": 10,
- "source": "content",
- "fields": {
- "vespa_id_field": "ghi",
- "sddocname": "doc",
- "body_text": "this is a body 2",
- "title": "this is a title 2",
- "rankfeatures": {"a": 3, "b": 4},
- },
- },
- {
- "id": "id:covid-19:doc::40217",
- "relevance": 8,
- "source": "content",
- "fields": {
- "vespa_id_field": "def",
- "sddocname": "doc",
- "body_text": "this is a body 3",
- "title": "this is a title 3",
- "rankfeatures": {"a": 5, "b": 6},
- },
- },
- ],
- }
- }
-
- def test_match_ratio(self):
- metric = MatchRatio()
-
- evaluation = metric.evaluate_query(
- query_results=VespaResult(self.query_results),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
-
- self.assertDictEqual(
- evaluation,
- {
- "match_ratio_retrieved_docs": 1083,
- "match_ratio_docs_available": 62529,
- "match_ratio_value": 1083 / 62529,
- },
- )
-
- evaluation = metric.evaluate_query(
- query_results=VespaResult(
- {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- }
- }
- ),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
-
- self.assertDictEqual(
- evaluation,
- {
- "match_ratio_retrieved_docs": 0,
- "match_ratio_docs_available": 62529,
- "match_ratio_value": 0 / 62529,
- },
- )
-
- evaluation = metric.evaluate_query(
- query_results=VespaResult(
- {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- }
- }
- ),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
-
- self.assertDictEqual(
- evaluation,
- {
- "match_ratio_retrieved_docs": 1083,
- "match_ratio_docs_available": 0,
- "match_ratio_value": 0,
- },
- )
-
- def test_recall(self):
- metric = Recall(at=2)
- evaluation = metric.evaluate_query(
- query_results=VespaResult(self.query_results),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
- self.assertDictEqual(
- evaluation, {"recall_2_value": 0.5,},
- )
-
- metric = Recall(at=1)
- evaluation = metric.evaluate_query(
- query_results=VespaResult(self.query_results),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
- self.assertDictEqual(
- evaluation, {"recall_1_value": 0.0,},
- )
-
- def test_reciprocal_rank(self):
- metric = ReciprocalRank(at=2)
- evaluation = metric.evaluate_query(
- query_results=VespaResult(self.query_results),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
- self.assertDictEqual(
- evaluation, {"reciprocal_rank_2_value": 0.5,},
- )
-
- metric = ReciprocalRank(at=1)
- evaluation = metric.evaluate_query(
- query_results=VespaResult(self.query_results),
- relevant_docs=self.labelled_data[0]["relevant_docs"],
- id_field="vespa_id_field",
- default_score=0,
- )
- self.assertDictEqual(
- evaluation, {"reciprocal_rank_1_value": 0.0,},
- )
diff --git a/python/vespa/vespa/test_package.py b/python/vespa/vespa/test_package.py
deleted file mode 100644
index 1dca8bbf014..00000000000
--- a/python/vespa/vespa/test_package.py
+++ /dev/null
@@ -1,243 +0,0 @@
-import unittest
-
-from vespa.package import (
- Field,
- Document,
- FieldSet,
- RankProfile,
- Schema,
- ApplicationPackage,
-)
-
-
-class TestField(unittest.TestCase):
- def test_field_name_type(self):
- field = Field(name="test_name", type="string")
- self.assertEqual(field.name, "test_name")
- self.assertEqual(field.type, "string")
- self.assertEqual(field.to_dict, {"name": "test_name", "type": "string"})
- self.assertEqual(field, Field(name="test_name", type="string"))
- self.assertEqual(field, Field.from_dict(field.to_dict))
- self.assertIsNone(field.indexing_to_text)
-
- def test_field_name_type_indexing_index(self):
- field = Field(
- name="body",
- type="string",
- indexing=["index", "summary"],
- index="enable-bm25",
- )
- self.assertEqual(field.name, "body")
- self.assertEqual(field.type, "string")
- self.assertEqual(field.indexing, ["index", "summary"])
- self.assertEqual(field.index, "enable-bm25")
- self.assertEqual(
- field.to_dict,
- {
- "name": "body",
- "type": "string",
- "indexing": ["index", "summary"],
- "index": "enable-bm25",
- },
- )
- self.assertEqual(
- field,
- Field(
- name="body",
- type="string",
- indexing=["index", "summary"],
- index="enable-bm25",
- ),
- )
- self.assertEqual(field, Field.from_dict(field.to_dict))
- self.assertEqual(field.indexing_to_text, "index | summary")
-
-
-class TestDocument(unittest.TestCase):
- def test_empty_document(self):
- document = Document()
- self.assertEqual(document.fields, [])
- self.assertEqual(document.to_dict, {"fields": []})
- self.assertEqual(document, Document.from_dict(document.to_dict))
-
- def test_document_one_field(self):
- document = Document()
- field = Field(name="test_name", type="string")
- document.add_fields(field)
- self.assertEqual(document.fields, [field])
- self.assertEqual(document, Document.from_dict(document.to_dict))
- self.assertEqual(document, Document([field]))
-
- def test_document_two_fields(self):
- document = Document()
- field_1 = Field(name="test_name", type="string")
- field_2 = Field(
- name="body",
- type="string",
- indexing=["index", "summary"],
- index="enable-bm25",
- )
- document.add_fields(field_1, field_2)
- self.assertEqual(document.fields, [field_1, field_2])
- self.assertEqual(document, Document.from_dict(document.to_dict))
- self.assertEqual(document, Document([field_1, field_2]))
-
-
-class TestFieldSet(unittest.TestCase):
- def test_fieldset(self):
- field_set = FieldSet(name="default", fields=["title", "body"])
- self.assertEqual(field_set.name, "default")
- self.assertEqual(field_set.fields, ["title", "body"])
- self.assertEqual(field_set, FieldSet.from_dict(field_set.to_dict))
- self.assertEqual(field_set.fields_to_text, "title, body")
-
-
-class TestRankProfile(unittest.TestCase):
- def test_rank_profile(self):
- rank_profile = RankProfile(name="bm25", first_phase="bm25(title) + bm25(body)")
- self.assertEqual(rank_profile.name, "bm25")
- self.assertEqual(rank_profile.first_phase, "bm25(title) + bm25(body)")
- self.assertEqual(rank_profile, RankProfile.from_dict(rank_profile.to_dict))
-
- def test_rank_profile_inherits(self):
- rank_profile = RankProfile(
- name="bm25", first_phase="bm25(title) + bm25(body)", inherits="default"
- )
- self.assertEqual(rank_profile.name, "bm25")
- self.assertEqual(rank_profile.first_phase, "bm25(title) + bm25(body)")
- self.assertEqual(rank_profile, RankProfile.from_dict(rank_profile.to_dict))
-
-
-class TestSchema(unittest.TestCase):
- def test_schema(self):
- schema = Schema(
- name="test_schema",
- document=Document(fields=[Field(name="test_name", type="string")]),
- fieldsets=[FieldSet(name="default", fields=["title", "body"])],
- rank_profiles=[
- RankProfile(name="bm25", first_phase="bm25(title) + bm25(body)")
- ],
- )
- self.assertEqual(schema, Schema.from_dict(schema.to_dict))
- self.assertDictEqual(
- schema.rank_profiles,
- {"bm25": RankProfile(name="bm25", first_phase="bm25(title) + bm25(body)")},
- )
- schema.add_rank_profile(
- RankProfile(name="default", first_phase="NativeRank(title)")
- )
- self.assertDictEqual(
- schema.rank_profiles,
- {
- "bm25": RankProfile(
- name="bm25", first_phase="bm25(title) + bm25(body)"
- ),
- "default": RankProfile(name="default", first_phase="NativeRank(title)"),
- },
- )
-
-
-class TestApplicationPackage(unittest.TestCase):
- def setUp(self) -> None:
- test_schema = Schema(
- name="msmarco",
- document=Document(
- fields=[
- Field(name="id", type="string", indexing=["attribute", "summary"]),
- Field(
- name="title",
- type="string",
- indexing=["index", "summary"],
- index="enable-bm25",
- ),
- Field(
- name="body",
- type="string",
- indexing=["index", "summary"],
- index="enable-bm25",
- ),
- ]
- ),
- fieldsets=[FieldSet(name="default", fields=["title", "body"])],
- rank_profiles=[
- RankProfile(name="default", first_phase="nativeRank(title, body)"),
- RankProfile(
- name="bm25",
- first_phase="bm25(title) + bm25(body)",
- inherits="default",
- ),
- ],
- )
- self.app_package = ApplicationPackage(name="test_app", schema=test_schema)
-
- def test_application_package(self):
- self.assertEqual(
- self.app_package, ApplicationPackage.from_dict(self.app_package.to_dict)
- )
-
- def test_schema_to_text(self):
- expected_result = (
- "schema msmarco {\n"
- " document msmarco {\n"
- " field id type string {\n"
- " indexing: attribute | summary\n"
- " }\n"
- " field title type string {\n"
- " indexing: index | summary\n"
- " index: enable-bm25\n"
- " }\n"
- " field body type string {\n"
- " indexing: index | summary\n"
- " index: enable-bm25\n"
- " }\n"
- " }\n"
- " fieldset default {\n"
- " fields: title, body\n"
- " }\n"
- " rank-profile default {\n"
- " first-phase {\n"
- " expression: nativeRank(title, body)\n"
- " }\n"
- " }\n"
- " rank-profile bm25 inherits default {\n"
- " first-phase {\n"
- " expression: bm25(title) + bm25(body)\n"
- " }\n"
- " }\n"
- "}"
- )
- self.assertEqual(self.app_package.schema_to_text, expected_result)
-
- def test_hosts_to_text(self):
- expected_result = (
- '<?xml version="1.0" encoding="utf-8" ?>\n'
- "<!-- Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->\n"
- "<hosts>\n"
- ' <host name="localhost">\n'
- " <alias>node1</alias>\n"
- " </host>\n"
- "</hosts>"
- )
- self.assertEqual(self.app_package.hosts_to_text, expected_result)
-
- def test_services_to_text(self):
- expected_result = (
- '<?xml version="1.0" encoding="UTF-8"?>\n'
- '<services version="1.0">\n'
- ' <container id="test_app_container" version="1.0">\n'
- " <search></search>\n"
- " <document-api></document-api>\n"
- " </container>\n"
- ' <content id="test_app_content" version="1.0">\n'
- ' <redundancy reply-after="1">1</redundancy>\n'
- " <documents>\n"
- ' <document type="msmarco" mode="index"></document>\n'
- " </documents>\n"
- " <nodes>\n"
- ' <node distribution-key="0" hostalias="node1"></node>\n'
- " </nodes>\n"
- " </content>\n"
- "</services>"
- )
-
- self.assertEqual(self.app_package.services_to_text, expected_result)
diff --git a/python/vespa/vespa/test_query.py b/python/vespa/vespa/test_query.py
deleted file mode 100644
index 1e933f25c7d..00000000000
--- a/python/vespa/vespa/test_query.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import unittest
-
-from vespa.query import Query, OR, AND, WeakAnd, ANN, Union, RankProfile, VespaResult
-
-
-class TestMatchFilter(unittest.TestCase):
- def setUp(self) -> None:
- self.query = "this is a test"
-
- def test_and(self):
- match_filter = AND()
- self.assertEqual(
- match_filter.create_match_filter(query=self.query),
- '(userInput("this is a test"))',
- )
- self.assertDictEqual(match_filter.get_query_properties(query=self.query), {})
-
- def test_or(self):
- match_filter = OR()
- self.assertEqual(
- match_filter.create_match_filter(query=self.query),
- '([{"grammar": "any"}]userInput("this is a test"))',
- )
- self.assertDictEqual(match_filter.get_query_properties(query=self.query), {})
-
- def test_weak_and(self):
- match_filter = WeakAnd(hits=10, field="field_name")
- self.assertEqual(
- match_filter.create_match_filter(query=self.query),
- '([{"targetNumHits": 10}]weakAnd(field_name contains "this", field_name contains "is", field_name contains "", '
- 'field_name contains "a", field_name contains "test"))',
- )
- self.assertDictEqual(match_filter.get_query_properties(query=self.query), {})
-
- def test_ann(self):
- match_filter = ANN(
- doc_vector="doc_vector",
- query_vector="query_vector",
- embedding_model=lambda x: [1, 2, 3],
- hits=10,
- label="label",
- )
- self.assertEqual(
- match_filter.create_match_filter(query=self.query),
- '([{"targetNumHits": 10, "label": "label"}]nearestNeighbor(doc_vector, query_vector))',
- )
- self.assertDictEqual(
- match_filter.get_query_properties(query=self.query),
- {"ranking.features.query(query_vector)": "[1, 2, 3]"},
- )
-
- def test_union(self):
- match_filter = Union(
- WeakAnd(hits=10, field="field_name"),
- ANN(
- doc_vector="doc_vector",
- query_vector="query_vector",
- embedding_model=lambda x: [1, 2, 3],
- hits=10,
- label="label",
- ),
- )
- self.assertEqual(
- match_filter.create_match_filter(query=self.query),
- '([{"targetNumHits": 10}]weakAnd(field_name contains "this", field_name contains "is", '
- 'field_name contains "", '
- 'field_name contains "a", field_name contains "test")) or '
- '([{"targetNumHits": 10, "label": "label"}]nearestNeighbor(doc_vector, query_vector))',
- )
- self.assertDictEqual(
- match_filter.get_query_properties(query=self.query),
- {"ranking.features.query(query_vector)": "[1, 2, 3]"},
- )
-
-
-class TestRankProfile(unittest.TestCase):
- def test_rank_profile(self):
- rank_profile = RankProfile(name="rank_profile", list_features=True)
- self.assertEqual(rank_profile.name, "rank_profile")
- self.assertEqual(rank_profile.list_features, "true")
-
-
-class TestQuery(unittest.TestCase):
- def setUp(self) -> None:
- self.query = "this is a test"
-
- def test_default(self):
- query = Query()
- self.assertDictEqual(
- query.create_body(query=self.query),
- {
- "yql": 'select * from sources * where (userInput("this is a test"));',
- "ranking": {"profile": "default", "listFeatures": "false"},
- },
- )
-
- def test_match_and_rank(self):
- query = Query(
- match_phase=ANN(
- doc_vector="doc_vector",
- query_vector="query_vector",
- embedding_model=lambda x: [1, 2, 3],
- hits=10,
- label="label",
- ),
- rank_profile=RankProfile(name="bm25", list_features=True),
- )
- self.assertDictEqual(
- query.create_body(query=self.query),
- {
- "yql": 'select * from sources * where ([{"targetNumHits": 10, "label": "label"}]nearestNeighbor(doc_vector, query_vector));',
- "ranking": {"profile": "bm25", "listFeatures": "true"},
- "ranking.features.query(query_vector)": "[1, 2, 3]",
- },
- )
-
-
-class TestVespaResult(unittest.TestCase):
- def setUp(self) -> None:
- self.raw_vespa_result_empty_hits = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 0},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- }
- }
-
- self.raw_vespa_result = {
- "root": {
- "id": "toplevel",
- "relevance": 1.0,
- "fields": {"totalCount": 1083},
- "coverage": {
- "coverage": 100,
- "documents": 62529,
- "full": True,
- "nodes": 2,
- "results": 1,
- "resultsFull": 1,
- },
- "children": [
- {
- "id": "id:covid-19:doc::40215",
- "relevance": 30.368213170494712,
- "source": "content",
- "fields": {
- "sddocname": "doc",
- "body_text": "this is a body",
- "title": "this is a title",
- },
- }
- ],
- }
- }
-
- def test_json(self):
- vespa_result = VespaResult(vespa_result=self.raw_vespa_result)
- self.assertDictEqual(vespa_result.json, self.raw_vespa_result)
-
- def test_hits(self):
- empty_hits_vespa_result = VespaResult(
- vespa_result=self.raw_vespa_result_empty_hits
- )
- self.assertEqual(empty_hits_vespa_result.hits, [])
- vespa_result = VespaResult(vespa_result=self.raw_vespa_result)
- self.assertEqual(
- vespa_result.hits,
- [
- {
- "id": "id:covid-19:doc::40215",
- "relevance": 30.368213170494712,
- "source": "content",
- "fields": {
- "sddocname": "doc",
- "body_text": "this is a body",
- "title": "this is a title",
- },
- }
- ],
- )